25 #ifndef CRYPTO3_STREAM_CHACHA_AVX2_IMPL_HPP
26 #define CRYPTO3_STREAM_CHACHA_AVX2_IMPL_HPP
28 #include <nil/crypto3/detail/config.hpp>
32 #include <immintrin.h>
38 template<std::
size_t Round, std::
size_t IVSize, std::
size_t KeyBits>
// ---------------------------------------------------------------------------
// Visible part of the chacha_x8 body. The function signature and a number of
// statements between the numbered lines below are not shown in this listing.
//
// The visible code keeps schedule word i broadcast across the eight 32-bit
// lanes of register Rii, applies the rotations of `rounds / 2` loop
// iterations, adds the initial state back in, and stores 16 * 32 bytes to
// `block` (presumably one 64-byte ChaCha block per lane - confirm against the
// caller).
//
// All lane additions use _mm256_add_epi32. In GCC and Clang `__m256i` is a
// vector of 64-bit integers, so the built-in `+` / `+=` operators add 64-bit
// elements: a carry out of an even 32-bit lane spills into the odd lane above
// it. MSVC does not provide those operators for `__m256i` at all.
// NOTE(review): any additions in the statements not shown in this listing
// should be checked for the same problem.
// ---------------------------------------------------------------------------

// Per-lane offsets for schedule[12]: lane 0 gets +0, ..., lane 7 gets +7.
60 const __m256i CTR0 = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
// C is the distance from schedule[12] to 0xFFFFFFFF. Lane i of CTR1 is 1
// exactly when C < i, i.e. when schedule[12] + i does not fit in 32 bits; it is
// added to schedule[13] as the carry out of word 12.
// (Assumes word_type is a 32-bit unsigned type - TODO confirm in chacha_policy.)
62 const word_type C = 0xFFFFFFFF - schedule[12];
63 const __m256i CTR1 = _mm256_set_epi32(C < 7, C < 6, C < 5, C < 4, C < 3, C < 2, C < 1, 0);
// Broadcast each schedule word to all eight lanes. Words 12 and 13 additionally
// receive the per-lane offset / carry computed above.
65 __m256i R00 = _mm256_set1_epi32(schedule[0]);
66 __m256i R01 = _mm256_set1_epi32(schedule[1]);
67 __m256i R02 = _mm256_set1_epi32(schedule[2]);
68 __m256i R03 = _mm256_set1_epi32(schedule[3]);
69 __m256i R04 = _mm256_set1_epi32(schedule[4]);
70 __m256i R05 = _mm256_set1_epi32(schedule[5]);
71 __m256i R06 = _mm256_set1_epi32(schedule[6]);
72 __m256i R07 = _mm256_set1_epi32(schedule[7]);
73 __m256i R08 = _mm256_set1_epi32(schedule[8]);
74 __m256i R09 = _mm256_set1_epi32(schedule[9]);
75 __m256i R10 = _mm256_set1_epi32(schedule[10]);
76 __m256i R11 = _mm256_set1_epi32(schedule[11]);
77 __m256i R12 = _mm256_add_epi32(_mm256_set1_epi32(schedule[12]), CTR0);
78 __m256i R13 = _mm256_add_epi32(_mm256_set1_epi32(schedule[13]), CTR1);
79 __m256i R14 = _mm256_set1_epi32(schedule[14]);
80 __m256i R15 = _mm256_set1_epi32(schedule[15]);
// Round loop: rounds / 2 iterations. Each iteration shows two groups of
// rotations by 16, 12, 8 and 7 bits: the first group lists the registers in
// index order, the second lists them as R15,R12,R13,R14 / R05,R06,R07,R04.
// NOTE(review): this looks like a ChaCha column round followed by a diagonal
// round; the add/xor statements between the rotations are not visible here.
82 for (size_t r = 0; r != rounds / 2; ++r) {
// Byte shuffle that rotates every 32-bit lane left by 16 bits; the same
// 16-byte pattern is given for both 128-bit halves because
// _mm256_shuffle_epi8 shuffles within each half independently.
93 const __m256i shuf_rotl_16 =
94 _mm256_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2, 13, 12, 15, 14, 9,
95 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2);
97 R12 = _mm256_shuffle_epi8(R12, shuf_rotl_16);
98 R13 = _mm256_shuffle_epi8(R13, shuf_rotl_16);
99 R14 = _mm256_shuffle_epi8(R14, shuf_rotl_16);
100 R15 = _mm256_shuffle_epi8(R15, shuf_rotl_16);
// Rotate left by 12: not a whole number of bytes, so use shift-left | shift-right.
112 R04 = _mm256_or_si256(_mm256_slli_epi32(R04, 12), _mm256_srli_epi32(R04, 32 - 12));
113 R05 = _mm256_or_si256(_mm256_slli_epi32(R05, 12), _mm256_srli_epi32(R05, 32 - 12));
114 R06 = _mm256_or_si256(_mm256_slli_epi32(R06, 12), _mm256_srli_epi32(R06, 32 - 12));
115 R07 = _mm256_or_si256(_mm256_slli_epi32(R07, 12), _mm256_srli_epi32(R07, 32 - 12));
// Byte shuffle that rotates every 32-bit lane left by 8 bits.
127 const __m256i shuf_rotl_8 =
128 _mm256_set_epi8(14, 13, 12, 15, 10, 9, 8, 11, 6, 5, 4, 7, 2, 1, 0, 3, 14, 13, 12, 15,
129 10, 9, 8, 11, 6, 5, 4, 7, 2, 1, 0, 3);
131 R12 = _mm256_shuffle_epi8(R12, shuf_rotl_8);
132 R13 = _mm256_shuffle_epi8(R13, shuf_rotl_8);
133 R14 = _mm256_shuffle_epi8(R14, shuf_rotl_8);
134 R15 = _mm256_shuffle_epi8(R15, shuf_rotl_8);
// Rotate left by 7.
146 R04 = _mm256_or_si256(_mm256_slli_epi32(R04, 7), _mm256_srli_epi32(R04, 32 - 7));
147 R05 = _mm256_or_si256(_mm256_slli_epi32(R05, 7), _mm256_srli_epi32(R05, 32 - 7));
148 R06 = _mm256_or_si256(_mm256_slli_epi32(R06, 7), _mm256_srli_epi32(R06, 32 - 7));
149 R07 = _mm256_or_si256(_mm256_slli_epi32(R07, 7), _mm256_srli_epi32(R07, 32 - 7));
// Second group: same rotation amounts, registers listed in rotated order.
161 R15 = _mm256_shuffle_epi8(R15, shuf_rotl_16);
162 R12 = _mm256_shuffle_epi8(R12, shuf_rotl_16);
163 R13 = _mm256_shuffle_epi8(R13, shuf_rotl_16);
164 R14 = _mm256_shuffle_epi8(R14, shuf_rotl_16);
176 R05 = _mm256_or_si256(_mm256_slli_epi32(R05, 12), _mm256_srli_epi32(R05, 32 - 12));
177 R06 = _mm256_or_si256(_mm256_slli_epi32(R06, 12), _mm256_srli_epi32(R06, 32 - 12));
178 R07 = _mm256_or_si256(_mm256_slli_epi32(R07, 12), _mm256_srli_epi32(R07, 32 - 12));
179 R04 = _mm256_or_si256(_mm256_slli_epi32(R04, 12), _mm256_srli_epi32(R04, 32 - 12));
191 R15 = _mm256_shuffle_epi8(R15, shuf_rotl_8);
192 R12 = _mm256_shuffle_epi8(R12, shuf_rotl_8);
193 R13 = _mm256_shuffle_epi8(R13, shuf_rotl_8);
194 R14 = _mm256_shuffle_epi8(R14, shuf_rotl_8);
206 R05 = _mm256_or_si256(_mm256_slli_epi32(R05, 7), _mm256_srli_epi32(R05, 32 - 7));
207 R06 = _mm256_or_si256(_mm256_slli_epi32(R06, 7), _mm256_srli_epi32(R06, 32 - 7));
208 R07 = _mm256_or_si256(_mm256_slli_epi32(R07, 7), _mm256_srli_epi32(R07, 32 - 7));
209 R04 = _mm256_or_si256(_mm256_slli_epi32(R04, 7), _mm256_srli_epi32(R04, 32 - 7));
// Add the initial state back into the working state, lane by lane modulo 2^32.
// The operands are rebuilt exactly as in the initialisation above, including
// the per-lane offsets on words 12 and 13.
212 R00 = _mm256_add_epi32(R00, _mm256_set1_epi32(schedule[0]));
213 R01 = _mm256_add_epi32(R01, _mm256_set1_epi32(schedule[1]));
214 R02 = _mm256_add_epi32(R02, _mm256_set1_epi32(schedule[2]));
215 R03 = _mm256_add_epi32(R03, _mm256_set1_epi32(schedule[3]));
216 R04 = _mm256_add_epi32(R04, _mm256_set1_epi32(schedule[4]));
217 R05 = _mm256_add_epi32(R05, _mm256_set1_epi32(schedule[5]));
218 R06 = _mm256_add_epi32(R06, _mm256_set1_epi32(schedule[6]));
219 R07 = _mm256_add_epi32(R07, _mm256_set1_epi32(schedule[7]));
220 R08 = _mm256_add_epi32(R08, _mm256_set1_epi32(schedule[8]));
221 R09 = _mm256_add_epi32(R09, _mm256_set1_epi32(schedule[9]));
222 R10 = _mm256_add_epi32(R10, _mm256_set1_epi32(schedule[10]));
223 R11 = _mm256_add_epi32(R11, _mm256_set1_epi32(schedule[11]));
224 R12 = _mm256_add_epi32(R12, _mm256_add_epi32(_mm256_set1_epi32(schedule[12]), CTR0));
225 R13 = _mm256_add_epi32(R13, _mm256_add_epi32(_mm256_set1_epi32(schedule[13]), CTR1));
226 R14 = _mm256_add_epi32(R14, _mm256_set1_epi32(schedule[14]));
227 R15 = _mm256_add_epi32(R15, _mm256_set1_epi32(schedule[15]));
// Transpose each group of four registers as a 4x4 matrix of 32-bit elements,
// independently in the low and high 128-bit halves. Afterwards the low half of
// R00 holds words 0..3 of lane 0 and its high half words 0..3 of lane 4;
// R01/R02/R03 hold lanes 1/5, 2/6 and 3/7 in the same way.
229 __m256i T0 = _mm256_unpacklo_epi32(R00, R01);
230 __m256i T1 = _mm256_unpacklo_epi32(R02, R03);
231 __m256i T2 = _mm256_unpackhi_epi32(R00, R01);
232 __m256i T3 = _mm256_unpackhi_epi32(R02, R03);
234 R00 = _mm256_unpacklo_epi64(T0, T1);
235 R01 = _mm256_unpackhi_epi64(T0, T1);
236 R02 = _mm256_unpacklo_epi64(T2, T3);
237 R03 = _mm256_unpackhi_epi64(T2, T3);
// Same transpose for words 4..7.
239 T0 = _mm256_unpacklo_epi32(R04, R05);
240 T1 = _mm256_unpacklo_epi32(R06, R07);
241 T2 = _mm256_unpackhi_epi32(R04, R05);
242 T3 = _mm256_unpackhi_epi32(R06, R07);
244 R04 = _mm256_unpacklo_epi64(T0, T1);
245 R05 = _mm256_unpackhi_epi64(T0, T1);
246 R06 = _mm256_unpacklo_epi64(T2, T3);
247 R07 = _mm256_unpackhi_epi64(T2, T3);
// Same transpose for words 8..11.
249 T0 = _mm256_unpacklo_epi32(R08, R09);
250 T1 = _mm256_unpacklo_epi32(R10, R11);
251 T2 = _mm256_unpackhi_epi32(R08, R09);
252 T3 = _mm256_unpackhi_epi32(R10, R11);
254 R08 = _mm256_unpacklo_epi64(T0, T1);
255 R09 = _mm256_unpackhi_epi64(T0, T1);
256 R10 = _mm256_unpacklo_epi64(T2, T3);
257 R11 = _mm256_unpackhi_epi64(T2, T3);
// Same transpose for words 12..15.
259 T0 = _mm256_unpacklo_epi32(R12, R13);
260 T1 = _mm256_unpacklo_epi32(R14, R15);
261 T2 = _mm256_unpackhi_epi32(R12, R13);
262 T3 = _mm256_unpackhi_epi32(R14, R15);
264 R12 = _mm256_unpacklo_epi64(T0, T1);
265 R13 = _mm256_unpackhi_epi64(T0, T1);
266 R14 = _mm256_unpacklo_epi64(T2, T3);
267 R15 = _mm256_unpackhi_epi64(T2, T3);
// Unaligned 256-bit stores into the output buffer.
269 __m256i *output_mm = reinterpret_cast<__m256i *>(block.data());
// Selector 0 + (2 << 4) joins the low 128-bit halves of both operands, so each
// pair of stores below writes the 16 words of one lane: lanes 0, 1, 2, 3.
271 _mm256_storeu_si256(output_mm, _mm256_permute2x128_si256(R00, R04, 0 + (2 << 4)));
272 _mm256_storeu_si256(output_mm + 1, _mm256_permute2x128_si256(R08, R12, 0 + (2 << 4)));
273 _mm256_storeu_si256(output_mm + 2, _mm256_permute2x128_si256(R01, R05, 0 + (2 << 4)));
274 _mm256_storeu_si256(output_mm + 3, _mm256_permute2x128_si256(R09, R13, 0 + (2 << 4)));
275 _mm256_storeu_si256(output_mm + 4, _mm256_permute2x128_si256(R02, R06, 0 + (2 << 4)));
276 _mm256_storeu_si256(output_mm + 5, _mm256_permute2x128_si256(R10, R14, 0 + (2 << 4)));
277 _mm256_storeu_si256(output_mm + 6, _mm256_permute2x128_si256(R03, R07, 0 + (2 << 4)));
278 _mm256_storeu_si256(output_mm + 7, _mm256_permute2x128_si256(R11, R15, 0 + (2 << 4)));
// Selector 1 + (3 << 4) joins the high 128-bit halves: lanes 4, 5, 6, 7.
280 _mm256_storeu_si256(output_mm + 8, _mm256_permute2x128_si256(R00, R04, 1 + (3 << 4)));
281 _mm256_storeu_si256(output_mm + 9, _mm256_permute2x128_si256(R08, R12, 1 + (3 << 4)));
282 _mm256_storeu_si256(output_mm + 10, _mm256_permute2x128_si256(R01, R05, 1 + (3 << 4)));
283 _mm256_storeu_si256(output_mm + 11, _mm256_permute2x128_si256(R09, R13, 1 + (3 << 4)));
284 _mm256_storeu_si256(output_mm + 12, _mm256_permute2x128_si256(R02, R06, 1 + (3 << 4)));
285 _mm256_storeu_si256(output_mm + 13, _mm256_permute2x128_si256(R10, R14, 1 + (3 << 4)));
286 _mm256_storeu_si256(output_mm + 14, _mm256_permute2x128_si256(R03, R07, 1 + (3 << 4)));
287 _mm256_storeu_si256(output_mm + 15, _mm256_permute2x128_si256(R11, R15, 1 + (3 << 4)));
// The statement controlled by this `if` is not visible in the listing.
// NOTE(review): presumably schedule[12] was advanced by 8 just before this
// test and a value below 8 means it wrapped, so the carry goes to
// schedule[13] - confirm against the full source.
292 if (schedule[12] < 8)
Definition: chacha_avx2_impl.hpp:39
constexpr static const std::size_t rounds
Definition: chacha_avx2_impl.hpp:45
policy_type::word_type word_type
Definition: chacha_avx2_impl.hpp:43
constexpr static const std::size_t block_bits
Definition: chacha_avx2_impl.hpp:51
constexpr static const std::size_t block_size
Definition: chacha_avx2_impl.hpp:52
policy_type::block_type block_type
Definition: chacha_avx2_impl.hpp:53
constexpr static const std::size_t min_key_schedule_size
Definition: chacha_avx2_impl.hpp:48
constexpr static const std::size_t min_key_schedule_bits
Definition: chacha_avx2_impl.hpp:47
policy_type::key_schedule_type key_schedule_type
Definition: chacha_avx2_impl.hpp:49
constexpr static const std::size_t word_bits
Definition: chacha_avx2_impl.hpp:42
chacha_policy< Round, IVSize, KeyBits > policy_type
Definition: chacha_avx2_impl.hpp:40
static BOOST_ATTRIBUTE_TARGET("avx2") void chacha_x8(const std
Definition: chacha_avx2_impl.hpp:55
constexpr static const std::size_t block_size
Definition: chacha_policy.hpp:53
constexpr static const std::size_t key_schedule_bits
Definition: chacha_policy.hpp:66
constexpr static const std::size_t rounds
Definition: chacha_policy.hpp:47
constexpr static const std::size_t word_bits
Definition: chacha_policy.hpp:44
basic_functions< 32 >::word_type word_type
Definition: chacha_policy.hpp:45
constexpr static const std::size_t block_bits
Definition: chacha_policy.hpp:54
std::array< word_type, key_schedule_size > key_schedule_type
Definition: chacha_policy.hpp:67
std::array< byte_type, block_size > block_type
Definition: chacha_policy.hpp:55
constexpr static const std::size_t key_schedule_size
Definition: chacha_policy.hpp:65