Vince's CSV Parser
Loading...
Searching...
No Matches
basic_csv_parser_simd.hpp
Go to the documentation of this file.
1#pragma once
17#include "common.hpp"
18
19#if (defined(__AVX2__) || defined(__SSE2__)) && !defined(CSV_NO_SIMD)
20#include <immintrin.h>
21// _tzcnt_u32 in GCC/Clang headers is __attribute__(__target__("bmi")), which
22// requires -mbmi at the call site. __builtin_ctz has no such restriction and
23// emits BSF/TZCNT as the optimizer sees fit. MSVC's _tzcnt_u32 has no
24// equivalent restriction, so keep it there.
25# ifdef _MSC_VER
26# define CSV_TZCNT32(x) _tzcnt_u32(x)
27# else
28# define CSV_TZCNT32(x) static_cast<unsigned>(__builtin_ctz(x))
29# endif
30#endif
31
32namespace csv {
33 namespace internals {
// Precomputed SIMD broadcast vectors for the four CSV sentinel bytes:
// the delimiter, the quote character, '\n' and '\r'.
//
// Each member holds one sentinel byte replicated across a full SIMD
// register (32 bytes with AVX2, 16 bytes with SSE2). The struct is meant
// to be constructed once per parser instance and passed by const-ref
// into find_next_non_special, so the broadcast cost is amortized across
// every field scan -- meaningful for CSVs with many short fields.
//
// When no_quote mode is active, pass quote_char = delimiter so that
// quote bytes are not mistakenly treated as sentinels (they are
// NOT_SPECIAL in that mode and must not cause SIMD to stop early).
//
// When neither AVX2 nor SSE2 is available, or CSV_NO_SIMD is defined,
// the struct has no data members and both constructors are no-ops.
struct SentinelVecs {
    // Default configuration: ',' as delimiter and '"' as quote character.
    SentinelVecs() noexcept : SentinelVecs(',', '"') {}

    // Broadcasts `delimiter`, `quote_char`, '\n' and '\r' into the
    // widest vector type enabled at compile time.
    SentinelVecs(char delimiter, char quote_char) noexcept {
#if defined(__AVX2__) && !defined(CSV_NO_SIMD)
        v_delim = _mm256_set1_epi8(delimiter);
        v_quote = _mm256_set1_epi8(quote_char);
        v_lf    = _mm256_set1_epi8('\n');
        v_cr    = _mm256_set1_epi8('\r');
#elif defined(__SSE2__) && !defined(CSV_NO_SIMD)
        v_delim = _mm_set1_epi8(delimiter);
        v_quote = _mm_set1_epi8(quote_char);
        v_lf    = _mm_set1_epi8('\n');
        v_cr    = _mm_set1_epi8('\r');
#else
        // No SIMD path compiled in: silence unused-parameter warnings.
        (void)delimiter;
        (void)quote_char;
#endif
    }

#if defined(__AVX2__) && !defined(CSV_NO_SIMD)
    __m256i v_delim, v_quote, v_lf, v_cr;
#elif defined(__SSE2__) && !defined(CSV_NO_SIMD)
    __m128i v_delim, v_quote, v_lf, v_cr;
#endif
};
67
68 // Free function — easy to unit test independently of IBasicCSVParser.
69 //
70 // SIMD-only fast-forward: skips pos forward past any bytes that are
71 // definitely not one of the four CSV sentinel characters. Stops as
72 // soon as a sentinel byte is found OR fewer bytes remain than one
73 // SIMD lane. The caller is responsible for the scalar tail loop using
74 // compound_parse_flag, which correctly handles quote_escape state.
75 //
76 // State-agnostic by design: stops conservatively at any sentinel byte
77 // regardless of quote_escape. Inside a quoted field, delimiter and
78 // newline bytes are NOT_SPECIAL under compound_parse_flag, so the
79 // outer DFA loop re-enters parse_field immediately at zero cost.
80 inline size_t find_next_non_special(
82 size_t pos,
83 const SentinelVecs& sentinels
84 ) noexcept
85 {
86#if defined(__AVX2__) && !defined(CSV_NO_SIMD)
87 while (pos + 32 <= data.size()) {
88 __m256i bytes = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(data.data() + pos));
89 __m256i special = _mm256_cmpeq_epi8(bytes, sentinels.v_delim);
90 special = _mm256_or_si256(special, _mm256_cmpeq_epi8(bytes, sentinels.v_quote));
91 special = _mm256_or_si256(special, _mm256_cmpeq_epi8(bytes, sentinels.v_lf));
92 special = _mm256_or_si256(special, _mm256_cmpeq_epi8(bytes, sentinels.v_cr));
93 int mask = _mm256_movemask_epi8(special);
94
95 if (mask != 0)
96 return pos + CSV_TZCNT32(static_cast<unsigned>(mask));
97 pos += 32;
98 }
99#elif defined(__SSE2__) && !defined(CSV_NO_SIMD)
100 while (pos + 16 <= data.size()) {
101 __m128i bytes = _mm_loadu_si128(reinterpret_cast<const __m128i*>(data.data() + pos));
102 __m128i special = _mm_cmpeq_epi8(bytes, sentinels.v_delim);
103 special = _mm_or_si128(special, _mm_cmpeq_epi8(bytes, sentinels.v_quote));
104 special = _mm_or_si128(special, _mm_cmpeq_epi8(bytes, sentinels.v_lf));
105 special = _mm_or_si128(special, _mm_cmpeq_epi8(bytes, sentinels.v_cr));
106 int mask = _mm_movemask_epi8(special);
107
108 if (mask != 0)
109 return pos + CSV_TZCNT32(static_cast<unsigned>(mask));
110 pos += 16;
111 }
112#else
113 (void)data; (void)sentinels;
114#endif
115 return pos;
116 }
117 }
118}
A standalone header file containing shared code.
The all encompassing namespace.
nonstd::string_view string_view
The string_view class used by this library.
Definition common.hpp:135