Vince's CSV Parser
Loading...
Searching...
No Matches
basic_csv_parser_guessing.cpp
2#include "string_view_stream.hpp"
3
4#include <unordered_map>
5
6namespace csv {
7 namespace internals {
8 CSV_INLINE GuessScore calculate_score(csv::string_view head, const CSVFormat& format) {
9 // Frequency counter of row length
10 std::unordered_map<size_t, size_t> row_tally = { { 0, 0 } };
11
12 // Map row lengths to row num where they first occurred
13 std::unordered_map<size_t, size_t> row_when = { { 0, 0 } };
14
15 // Parse the CSV using the low-level constructor that takes pre-built flag
16 // tables — bypasses format resolution entirely and avoids recursion back
17 // into guess_format.
18 internals::StringViewStream source(head);
19 RowCollection rows;
20
21 const auto parse_flags = format.is_quoting_enabled()
22 ? internals::make_parse_flags(format.get_delim(), format.get_quote_char())
23 : internals::make_parse_flags(format.get_delim());
24 const auto ws_flags = internals::make_ws_flags(format.get_trim_chars());
25 StreamParser<internals::StringViewStream> parser(source, parse_flags, ws_flags);
26 parser.set_output(rows);
27 parser.next();
28
29 for (size_t i = 0; i < rows.size(); i++) {
30 auto& row = rows[i];
31
32 // Ignore zero-length rows
33 if (row.size() > 0) {
34 if (row_tally.find(row.size()) != row_tally.end()) {
35 row_tally[row.size()]++;
36 }
37 else {
38 row_tally[row.size()] = 1;
39 row_when[row.size()] = i;
40 }
41 }
42 }
43
44 double final_score = 0;
45 size_t header_row = 0;
46 size_t mode_row_length = 0;
47
48 // Final score is equal to the largest row size times rows of that size.
49 for (auto& pair : row_tally) {
50 const size_t row_size = pair.first;
51 const size_t row_count = pair.second;
52 const double score = (double)(row_size * row_count);
53 if (score > final_score) {
54 final_score = score;
55 mode_row_length = row_size;
56 header_row = row_when[row_size];
57 }
58 }
59
60 // Heuristic: If first row has >= columns than mode, use it as header.
61 size_t first_row_length = rows.size() > 0 ? rows[0].size() : 0;
62 if (first_row_length >= mode_row_length && first_row_length > 0) {
63 header_row = 0;
64 }
65
66 return { header_row, mode_row_length, final_score };
67 }
68
69 CSV_INLINE CSVGuessResult guess_format(csv::string_view head, const std::vector<char>& delims) {
76 CSVFormat format;
77 size_t max_score = 0;
78 size_t header = 0;
79 size_t n_cols = 0;
80 char current_delim = delims[0];
81
82 for (char cand_delim : delims) {
83 auto result = calculate_score(head, format.delimiter(cand_delim));
84
85 if ((size_t)result.score > max_score) {
86 max_score = (size_t)result.score;
87 current_delim = cand_delim;
88 header = result.header;
89 n_cols = result.mode_row_length;
90 }
91 }
92
93 return { current_delim, (int)header, n_cols };
94 }
95 }
96}
Contains the main CSV parsing algorithm and various utility functions.
CSV_CONST CONSTEXPR_17 ParseFlagMap make_parse_flags(char delimiter)
Create a vector v where each index i corresponds to the ASCII number for a character and,...
CSVGuessResult guess_format(csv::string_view head, const std::vector< char > &delims={ ',', '|', '\t', ';', '^', '~' })
Guess the delimiter used by a delimiter-separated values file.
Stores information about how to parse a CSV file.
CSVFormat & delimiter(char delim)
Sets the delimiter of the CSV file.
#define CSV_INLINE
Helper macro which should be #defined as "inline" in the single header version.
Definition common.hpp:26
The all-encompassing namespace.
internals::ThreadSafeDeque< CSVRow > RowCollection
Standard type for storing collection of rows.
nonstd::string_view string_view
The string_view class used by this library.
Definition common.hpp:135
Stores the inferred format of a CSV file.