25 #ifndef CRYPTO3_STREAM_CHACHA_SSE2_IMPL_HPP
26 #define CRYPTO3_STREAM_CHACHA_SSE2_IMPL_HPP
28 #include <nil/crypto3/detail/config.hpp>
32 #include <emmintrin.h>
38 template<std::
size_t Round, std::
size_t IVSize, std::
size_t KeyBits>
55 inline static void chacha_x8(
const std::array<std::uint8_t, block_size * 8> &
block,
57 chacha_x4(
block, schedule);
65 const __m128i *input_mm =
reinterpret_cast<const __m128i *
>(schedule);
66 __m128i *output_mm =
reinterpret_cast<__m128i *
>(
block);
68 __m128i input0 = _mm_loadu_si128(input_mm);
69 __m128i input1 = _mm_loadu_si128(input_mm + 1);
70 __m128i input2 = _mm_loadu_si128(input_mm + 2);
71 __m128i input3 = _mm_loadu_si128(input_mm + 3);
// Rotates every 32-bit lane of the 128-bit integer vector `r` left by `n` bits:
// the lanes shifted left by `n` are OR-ed with the same lanes shifted right
// (logically) by `32 - n`.
// NOTE(review): both `r` and `n` are expanded twice, so pass side-effect-free
// expressions only.
75 #define mm_rotl(r, n) _mm_or_si128(_mm_slli_epi32(r, n), _mm_srli_epi32(r, 32 - (n)))
77 __m128i r0_0 = input0;
78 __m128i r0_1 = input1;
79 __m128i r0_2 = input2;
80 __m128i r0_3 = input3;
82 __m128i r1_0 = input0;
83 __m128i r1_1 = input1;
84 __m128i r1_2 = input2;
85 __m128i r1_3 = _mm_add_epi64(r0_3, _mm_set_epi32(0, 0, 0, 1));
87 __m128i r2_0 = input0;
88 __m128i r2_1 = input1;
89 __m128i r2_2 = input2;
90 __m128i r2_3 = _mm_add_epi64(r0_3, _mm_set_epi32(0, 0, 0, 2));
92 __m128i r3_0 = input0;
93 __m128i r3_1 = input1;
94 __m128i r3_2 = input2;
95 __m128i r3_3 = _mm_add_epi64(r0_3, _mm_set_epi32(0, 0, 0, 3));
97 for (
size_t r = 0; r !=
rounds / 2; ++r) {
98 r0_0 = _mm_add_epi32(r0_0, r0_1);
99 r1_0 = _mm_add_epi32(r1_0, r1_1);
100 r2_0 = _mm_add_epi32(r2_0, r2_1);
101 r3_0 = _mm_add_epi32(r3_0, r3_1);
103 r0_3 = _mm_xor_si128(r0_3, r0_0);
104 r1_3 = _mm_xor_si128(r1_3, r1_0);
105 r2_3 = _mm_xor_si128(r2_3, r2_0);
106 r3_3 = _mm_xor_si128(r3_3, r3_0);
113 r0_2 = _mm_add_epi32(r0_2, r0_3);
114 r1_2 = _mm_add_epi32(r1_2, r1_3);
115 r2_2 = _mm_add_epi32(r2_2, r2_3);
116 r3_2 = _mm_add_epi32(r3_2, r3_3);
118 r0_1 = _mm_xor_si128(r0_1, r0_2);
119 r1_1 = _mm_xor_si128(r1_1, r1_2);
120 r2_1 = _mm_xor_si128(r2_1, r2_2);
121 r3_1 = _mm_xor_si128(r3_1, r3_2);
128 r0_0 = _mm_add_epi32(r0_0, r0_1);
129 r1_0 = _mm_add_epi32(r1_0, r1_1);
130 r2_0 = _mm_add_epi32(r2_0, r2_1);
131 r3_0 = _mm_add_epi32(r3_0, r3_1);
133 r0_3 = _mm_xor_si128(r0_3, r0_0);
134 r1_3 = _mm_xor_si128(r1_3, r1_0);
135 r2_3 = _mm_xor_si128(r2_3, r2_0);
136 r3_3 = _mm_xor_si128(r3_3, r3_0);
143 r0_2 = _mm_add_epi32(r0_2, r0_3);
144 r1_2 = _mm_add_epi32(r1_2, r1_3);
145 r2_2 = _mm_add_epi32(r2_2, r2_3);
146 r3_2 = _mm_add_epi32(r3_2, r3_3);
148 r0_1 = _mm_xor_si128(r0_1, r0_2);
149 r1_1 = _mm_xor_si128(r1_1, r1_2);
150 r2_1 = _mm_xor_si128(r2_1, r2_2);
151 r3_1 = _mm_xor_si128(r3_1, r3_2);
158 r0_1 = _mm_shuffle_epi32(r0_1, _MM_SHUFFLE(0, 3, 2, 1));
159 r0_2 = _mm_shuffle_epi32(r0_2, _MM_SHUFFLE(1, 0, 3, 2));
160 r0_3 = _mm_shuffle_epi32(r0_3, _MM_SHUFFLE(2, 1, 0, 3));
162 r1_1 = _mm_shuffle_epi32(r1_1, _MM_SHUFFLE(0, 3, 2, 1));
163 r1_2 = _mm_shuffle_epi32(r1_2, _MM_SHUFFLE(1, 0, 3, 2));
164 r1_3 = _mm_shuffle_epi32(r1_3, _MM_SHUFFLE(2, 1, 0, 3));
166 r2_1 = _mm_shuffle_epi32(r2_1, _MM_SHUFFLE(0, 3, 2, 1));
167 r2_2 = _mm_shuffle_epi32(r2_2, _MM_SHUFFLE(1, 0, 3, 2));
168 r2_3 = _mm_shuffle_epi32(r2_3, _MM_SHUFFLE(2, 1, 0, 3));
170 r3_1 = _mm_shuffle_epi32(r3_1, _MM_SHUFFLE(0, 3, 2, 1));
171 r3_2 = _mm_shuffle_epi32(r3_2, _MM_SHUFFLE(1, 0, 3, 2));
172 r3_3 = _mm_shuffle_epi32(r3_3, _MM_SHUFFLE(2, 1, 0, 3));
174 r0_0 = _mm_add_epi32(r0_0, r0_1);
175 r1_0 = _mm_add_epi32(r1_0, r1_1);
176 r2_0 = _mm_add_epi32(r2_0, r2_1);
177 r3_0 = _mm_add_epi32(r3_0, r3_1);
179 r0_3 = _mm_xor_si128(r0_3, r0_0);
180 r1_3 = _mm_xor_si128(r1_3, r1_0);
181 r2_3 = _mm_xor_si128(r2_3, r2_0);
182 r3_3 = _mm_xor_si128(r3_3, r3_0);
189 r0_2 = _mm_add_epi32(r0_2, r0_3);
190 r1_2 = _mm_add_epi32(r1_2, r1_3);
191 r2_2 = _mm_add_epi32(r2_2, r2_3);
192 r3_2 = _mm_add_epi32(r3_2, r3_3);
194 r0_1 = _mm_xor_si128(r0_1, r0_2);
195 r1_1 = _mm_xor_si128(r1_1, r1_2);
196 r2_1 = _mm_xor_si128(r2_1, r2_2);
197 r3_1 = _mm_xor_si128(r3_1, r3_2);
204 r0_0 = _mm_add_epi32(r0_0, r0_1);
205 r1_0 = _mm_add_epi32(r1_0, r1_1);
206 r2_0 = _mm_add_epi32(r2_0, r2_1);
207 r3_0 = _mm_add_epi32(r3_0, r3_1);
209 r0_3 = _mm_xor_si128(r0_3, r0_0);
210 r1_3 = _mm_xor_si128(r1_3, r1_0);
211 r2_3 = _mm_xor_si128(r2_3, r2_0);
212 r3_3 = _mm_xor_si128(r3_3, r3_0);
219 r0_2 = _mm_add_epi32(r0_2, r0_3);
220 r1_2 = _mm_add_epi32(r1_2, r1_3);
221 r2_2 = _mm_add_epi32(r2_2, r2_3);
222 r3_2 = _mm_add_epi32(r3_2, r3_3);
224 r0_1 = _mm_xor_si128(r0_1, r0_2);
225 r1_1 = _mm_xor_si128(r1_1, r1_2);
226 r2_1 = _mm_xor_si128(r2_1, r2_2);
227 r3_1 = _mm_xor_si128(r3_1, r3_2);
234 r0_1 = _mm_shuffle_epi32(r0_1, _MM_SHUFFLE(2, 1, 0, 3));
235 r0_2 = _mm_shuffle_epi32(r0_2, _MM_SHUFFLE(1, 0, 3, 2));
236 r0_3 = _mm_shuffle_epi32(r0_3, _MM_SHUFFLE(0, 3, 2, 1));
238 r1_1 = _mm_shuffle_epi32(r1_1, _MM_SHUFFLE(2, 1, 0, 3));
239 r1_2 = _mm_shuffle_epi32(r1_2, _MM_SHUFFLE(1, 0, 3, 2));
240 r1_3 = _mm_shuffle_epi32(r1_3, _MM_SHUFFLE(0, 3, 2, 1));
242 r2_1 = _mm_shuffle_epi32(r2_1, _MM_SHUFFLE(2, 1, 0, 3));
243 r2_2 = _mm_shuffle_epi32(r2_2, _MM_SHUFFLE(1, 0, 3, 2));
244 r2_3 = _mm_shuffle_epi32(r2_3, _MM_SHUFFLE(0, 3, 2, 1));
246 r3_1 = _mm_shuffle_epi32(r3_1, _MM_SHUFFLE(2, 1, 0, 3));
247 r3_2 = _mm_shuffle_epi32(r3_2, _MM_SHUFFLE(1, 0, 3, 2));
248 r3_3 = _mm_shuffle_epi32(r3_3, _MM_SHUFFLE(0, 3, 2, 1));
251 r0_0 = _mm_add_epi32(r0_0, input0);
252 r0_1 = _mm_add_epi32(r0_1, input1);
253 r0_2 = _mm_add_epi32(r0_2, input2);
254 r0_3 = _mm_add_epi32(r0_3, input3);
256 r1_0 = _mm_add_epi32(r1_0, input0);
257 r1_1 = _mm_add_epi32(r1_1, input1);
258 r1_2 = _mm_add_epi32(r1_2, input2);
259 r1_3 = _mm_add_epi32(r1_3, input3);
260 r1_3 = _mm_add_epi64(r1_3, _mm_set_epi32(0, 0, 0, 1));
262 r2_0 = _mm_add_epi32(r2_0, input0);
263 r2_1 = _mm_add_epi32(r2_1, input1);
264 r2_2 = _mm_add_epi32(r2_2, input2);
265 r2_3 = _mm_add_epi32(r2_3, input3);
266 r2_3 = _mm_add_epi64(r2_3, _mm_set_epi32(0, 0, 0, 2));
268 r3_0 = _mm_add_epi32(r3_0, input0);
269 r3_1 = _mm_add_epi32(r3_1, input1);
270 r3_2 = _mm_add_epi32(r3_2, input2);
271 r3_3 = _mm_add_epi32(r3_3, input3);
272 r3_3 = _mm_add_epi64(r3_3, _mm_set_epi32(0, 0, 0, 3));
274 _mm_storeu_si128(output_mm + 0, r0_0);
275 _mm_storeu_si128(output_mm + 1, r0_1);
276 _mm_storeu_si128(output_mm + 2, r0_2);
277 _mm_storeu_si128(output_mm + 3, r0_3);
279 _mm_storeu_si128(output_mm + 4, r1_0);
280 _mm_storeu_si128(output_mm + 5, r1_1);
281 _mm_storeu_si128(output_mm + 6, r1_2);
282 _mm_storeu_si128(output_mm + 7, r1_3);
284 _mm_storeu_si128(output_mm + 8, r2_0);
285 _mm_storeu_si128(output_mm + 9, r2_1);
286 _mm_storeu_si128(output_mm + 10, r2_2);
287 _mm_storeu_si128(output_mm + 11, r2_3);
289 _mm_storeu_si128(output_mm + 12, r3_0);
290 _mm_storeu_si128(output_mm + 13, r3_1);
291 _mm_storeu_si128(output_mm + 14, r3_2);
292 _mm_storeu_si128(output_mm + 15, r3_3);
297 if (schedule[12] < 4) {
constexpr static const std::size_t block_size
Definition: chacha_policy.hpp:53
constexpr static const std::size_t key_schedule_bits
Definition: chacha_policy.hpp:66
constexpr static const std::size_t rounds
Definition: chacha_policy.hpp:47
constexpr static const std::size_t word_bits
Definition: chacha_policy.hpp:44
basic_functions< 32 >::word_type word_type
Definition: chacha_policy.hpp:45
constexpr static const std::size_t block_bits
Definition: chacha_policy.hpp:54
std::array< word_type, key_schedule_size > key_schedule_type
Definition: chacha_policy.hpp:67
std::array< byte_type, block_size > block_type
Definition: chacha_policy.hpp:55
constexpr static const std::size_t key_schedule_size
Definition: chacha_policy.hpp:65
Definition: chacha_sse2_impl.hpp:39
constexpr static const std::size_t block_size
Definition: chacha_sse2_impl.hpp:52
constexpr static const std::size_t min_key_schedule_size
Definition: chacha_sse2_impl.hpp:48
static void chacha_x8(const std::array< std::uint8_t, block_size *8 > &block, key_schedule_type &schedule)
Definition: chacha_sse2_impl.hpp:55
constexpr static const std::size_t block_bits
Definition: chacha_sse2_impl.hpp:51
policy_type::block_type block_type
Definition: chacha_sse2_impl.hpp:53
chacha_policy< Round, IVSize, KeyBits > policy_type
Definition: chacha_sse2_impl.hpp:40
static BOOST_ATTRIBUTE_TARGET("sse2") void chacha_x4(const std
Definition: chacha_sse2_impl.hpp:62
policy_type::key_schedule_type key_schedule_type
Definition: chacha_sse2_impl.hpp:49
constexpr static const std::size_t min_key_schedule_bits
Definition: chacha_sse2_impl.hpp:47
policy_type::word_type word_type
Definition: chacha_sse2_impl.hpp:43
constexpr static const std::size_t rounds
Definition: chacha_sse2_impl.hpp:45
constexpr static const std::size_t word_bits
Definition: chacha_sse2_impl.hpp:42