chacha_avx2_impl.hpp
Go to the documentation of this file.
1 //---------------------------------------------------------------------------//
2 // Copyright (c) 2018-2020 Mikhail Komarov <nemo@nil.foundation>
3 //
4 // MIT License
5 //
6 // Permission is hereby granted, free of charge, to any person obtaining a copy
7 // of this software and associated documentation files (the "Software"), to deal
8 // in the Software without restriction, including without limitation the rights
9 // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 // copies of the Software, and to permit persons to whom the Software is
11 // furnished to do so, subject to the following conditions:
12 //
13 // The above copyright notice and this permission notice shall be included in all
14 // copies or substantial portions of the Software.
15 //
16 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 // SOFTWARE.
23 //---------------------------------------------------------------------------//
24 
25 #ifndef CRYPTO3_STREAM_CHACHA_AVX2_IMPL_HPP
26 #define CRYPTO3_STREAM_CHACHA_AVX2_IMPL_HPP
27 
28 #include <nil/crypto3/detail/config.hpp>
29 
31 
32 #include <immintrin.h>
33 
34 namespace nil {
35  namespace crypto3 {
36  namespace stream {
37  namespace detail {
38  template<std::size_t Round, std::size_t IVSize, std::size_t KeyBits>
41 
42  constexpr static const std::size_t word_bits = policy_type::word_bits;
44 
45  constexpr static const std::size_t rounds = policy_type::rounds;
46 
47  constexpr static const std::size_t min_key_schedule_bits = policy_type::key_schedule_bits;
48  constexpr static const std::size_t min_key_schedule_size = policy_type::key_schedule_size;
50 
51  constexpr static const std::size_t block_bits = policy_type::block_bits;
52  constexpr static const std::size_t block_size = policy_type::block_size;
54 
55  static BOOST_ATTRIBUTE_TARGET("avx2") void chacha_x8(
56  const std::array<std::uint8_t, block_size * 8> &block,
57  key_schedule_type &schedule) {
58  _mm256_zeroupper();
59 
60  const __m256i CTR0 = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
61 
62  const word_type C = 0xFFFFFFFF - schedule[12];
63  const __m256i CTR1 = _mm256_set_epi32(C < 7, C < 6, C < 5, C < 4, C < 3, C < 2, C < 1, 0);
64 
65  __m256i R00 = _mm256_set1_epi32(schedule[0]);
66  __m256i R01 = _mm256_set1_epi32(schedule[1]);
67  __m256i R02 = _mm256_set1_epi32(schedule[2]);
68  __m256i R03 = _mm256_set1_epi32(schedule[3]);
69  __m256i R04 = _mm256_set1_epi32(schedule[4]);
70  __m256i R05 = _mm256_set1_epi32(schedule[5]);
71  __m256i R06 = _mm256_set1_epi32(schedule[6]);
72  __m256i R07 = _mm256_set1_epi32(schedule[7]);
73  __m256i R08 = _mm256_set1_epi32(schedule[8]);
74  __m256i R09 = _mm256_set1_epi32(schedule[9]);
75  __m256i R10 = _mm256_set1_epi32(schedule[10]);
76  __m256i R11 = _mm256_set1_epi32(schedule[11]);
77  __m256i R12 = _mm256_set1_epi32(schedule[12]) + CTR0;
78  __m256i R13 = _mm256_set1_epi32(schedule[13]) + CTR1;
79  __m256i R14 = _mm256_set1_epi32(schedule[14]);
80  __m256i R15 = _mm256_set1_epi32(schedule[15]);
81 
82  for (size_t r = 0; r != rounds / 2; ++r) {
83  R00 += R04;
84  R01 += R05;
85  R02 += R06;
86  R03 += R07;
87 
88  R12 ^= R00;
89  R13 ^= R01;
90  R14 ^= R02;
91  R15 ^= R03;
92 
93  const __m256i shuf_rotl_16 =
94  _mm256_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2, 13, 12, 15, 14, 9,
95  8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2);
96 
97  R12 = _mm256_shuffle_epi8(R12, shuf_rotl_16);
98  R13 = _mm256_shuffle_epi8(R13, shuf_rotl_16);
99  R14 = _mm256_shuffle_epi8(R14, shuf_rotl_16);
100  R15 = _mm256_shuffle_epi8(R15, shuf_rotl_16);
101 
102  R08 += R12;
103  R09 += R13;
104  R10 += R14;
105  R11 += R15;
106 
107  R04 ^= R08;
108  R05 ^= R09;
109  R06 ^= R10;
110  R07 ^= R11;
111 
112  R04 = _mm256_or_si256(_mm256_slli_epi32(R04, 12), _mm256_srli_epi32(R04, 32 - 12));
113  R05 = _mm256_or_si256(_mm256_slli_epi32(R05, 12), _mm256_srli_epi32(R05, 32 - 12));
114  R06 = _mm256_or_si256(_mm256_slli_epi32(R06, 12), _mm256_srli_epi32(R06, 32 - 12));
115  R07 = _mm256_or_si256(_mm256_slli_epi32(R07, 12), _mm256_srli_epi32(R07, 32 - 12));
116 
117  R00 += R04;
118  R01 += R05;
119  R02 += R06;
120  R03 += R07;
121 
122  R12 ^= R00;
123  R13 ^= R01;
124  R14 ^= R02;
125  R15 ^= R03;
126 
127  const __m256i shuf_rotl_8 =
128  _mm256_set_epi8(14, 13, 12, 15, 10, 9, 8, 11, 6, 5, 4, 7, 2, 1, 0, 3, 14, 13, 12, 15,
129  10, 9, 8, 11, 6, 5, 4, 7, 2, 1, 0, 3);
130 
131  R12 = _mm256_shuffle_epi8(R12, shuf_rotl_8);
132  R13 = _mm256_shuffle_epi8(R13, shuf_rotl_8);
133  R14 = _mm256_shuffle_epi8(R14, shuf_rotl_8);
134  R15 = _mm256_shuffle_epi8(R15, shuf_rotl_8);
135 
136  R08 += R12;
137  R09 += R13;
138  R10 += R14;
139  R11 += R15;
140 
141  R04 ^= R08;
142  R05 ^= R09;
143  R06 ^= R10;
144  R07 ^= R11;
145 
146  R04 = _mm256_or_si256(_mm256_slli_epi32(R04, 7), _mm256_srli_epi32(R04, 32 - 7));
147  R05 = _mm256_or_si256(_mm256_slli_epi32(R05, 7), _mm256_srli_epi32(R05, 32 - 7));
148  R06 = _mm256_or_si256(_mm256_slli_epi32(R06, 7), _mm256_srli_epi32(R06, 32 - 7));
149  R07 = _mm256_or_si256(_mm256_slli_epi32(R07, 7), _mm256_srli_epi32(R07, 32 - 7));
150 
151  R00 += R05;
152  R01 += R06;
153  R02 += R07;
154  R03 += R04;
155 
156  R15 ^= R00;
157  R12 ^= R01;
158  R13 ^= R02;
159  R14 ^= R03;
160 
161  R15 = _mm256_shuffle_epi8(R15, shuf_rotl_16);
162  R12 = _mm256_shuffle_epi8(R12, shuf_rotl_16);
163  R13 = _mm256_shuffle_epi8(R13, shuf_rotl_16);
164  R14 = _mm256_shuffle_epi8(R14, shuf_rotl_16);
165 
166  R10 += R15;
167  R11 += R12;
168  R08 += R13;
169  R09 += R14;
170 
171  R05 ^= R10;
172  R06 ^= R11;
173  R07 ^= R08;
174  R04 ^= R09;
175 
176  R05 = _mm256_or_si256(_mm256_slli_epi32(R05, 12), _mm256_srli_epi32(R05, 32 - 12));
177  R06 = _mm256_or_si256(_mm256_slli_epi32(R06, 12), _mm256_srli_epi32(R06, 32 - 12));
178  R07 = _mm256_or_si256(_mm256_slli_epi32(R07, 12), _mm256_srli_epi32(R07, 32 - 12));
179  R04 = _mm256_or_si256(_mm256_slli_epi32(R04, 12), _mm256_srli_epi32(R04, 32 - 12));
180 
181  R00 += R05;
182  R01 += R06;
183  R02 += R07;
184  R03 += R04;
185 
186  R15 ^= R00;
187  R12 ^= R01;
188  R13 ^= R02;
189  R14 ^= R03;
190 
191  R15 = _mm256_shuffle_epi8(R15, shuf_rotl_8);
192  R12 = _mm256_shuffle_epi8(R12, shuf_rotl_8);
193  R13 = _mm256_shuffle_epi8(R13, shuf_rotl_8);
194  R14 = _mm256_shuffle_epi8(R14, shuf_rotl_8);
195 
196  R10 += R15;
197  R11 += R12;
198  R08 += R13;
199  R09 += R14;
200 
201  R05 ^= R10;
202  R06 ^= R11;
203  R07 ^= R08;
204  R04 ^= R09;
205 
206  R05 = _mm256_or_si256(_mm256_slli_epi32(R05, 7), _mm256_srli_epi32(R05, 32 - 7));
207  R06 = _mm256_or_si256(_mm256_slli_epi32(R06, 7), _mm256_srli_epi32(R06, 32 - 7));
208  R07 = _mm256_or_si256(_mm256_slli_epi32(R07, 7), _mm256_srli_epi32(R07, 32 - 7));
209  R04 = _mm256_or_si256(_mm256_slli_epi32(R04, 7), _mm256_srli_epi32(R04, 32 - 7));
210  }
211 
212  R00 += _mm256_set1_epi32(schedule[0]);
213  R01 += _mm256_set1_epi32(schedule[1]);
214  R02 += _mm256_set1_epi32(schedule[2]);
215  R03 += _mm256_set1_epi32(schedule[3]);
216  R04 += _mm256_set1_epi32(schedule[4]);
217  R05 += _mm256_set1_epi32(schedule[5]);
218  R06 += _mm256_set1_epi32(schedule[6]);
219  R07 += _mm256_set1_epi32(schedule[7]);
220  R08 += _mm256_set1_epi32(schedule[8]);
221  R09 += _mm256_set1_epi32(schedule[9]);
222  R10 += _mm256_set1_epi32(schedule[10]);
223  R11 += _mm256_set1_epi32(schedule[11]);
224  R12 += _mm256_set1_epi32(schedule[12]) + CTR0;
225  R13 += _mm256_set1_epi32(schedule[13]) + CTR1;
226  R14 += _mm256_set1_epi32(schedule[14]);
227  R15 += _mm256_set1_epi32(schedule[15]);
228 
229  __m256i T0 = _mm256_unpacklo_epi32(R00, R01);
230  __m256i T1 = _mm256_unpacklo_epi32(R02, R03);
231  __m256i T2 = _mm256_unpackhi_epi32(R00, R01);
232  __m256i T3 = _mm256_unpackhi_epi32(R02, R03);
233 
234  R00 = _mm256_unpacklo_epi64(T0, T1);
235  R01 = _mm256_unpackhi_epi64(T0, T1);
236  R02 = _mm256_unpacklo_epi64(T2, T3);
237  R03 = _mm256_unpackhi_epi64(T2, T3);
238 
239  T0 = _mm256_unpacklo_epi32(R04, R05);
240  T1 = _mm256_unpacklo_epi32(R06, R07);
241  T2 = _mm256_unpackhi_epi32(R04, R05);
242  T3 = _mm256_unpackhi_epi32(R06, R07);
243 
244  R04 = _mm256_unpacklo_epi64(T0, T1);
245  R05 = _mm256_unpackhi_epi64(T0, T1);
246  R06 = _mm256_unpacklo_epi64(T2, T3);
247  R07 = _mm256_unpackhi_epi64(T2, T3);
248 
249  T0 = _mm256_unpacklo_epi32(R08, R09);
250  T1 = _mm256_unpacklo_epi32(R10, R11);
251  T2 = _mm256_unpackhi_epi32(R08, R09);
252  T3 = _mm256_unpackhi_epi32(R10, R11);
253 
254  R08 = _mm256_unpacklo_epi64(T0, T1);
255  R09 = _mm256_unpackhi_epi64(T0, T1);
256  R10 = _mm256_unpacklo_epi64(T2, T3);
257  R11 = _mm256_unpackhi_epi64(T2, T3);
258 
259  T0 = _mm256_unpacklo_epi32(R12, R13);
260  T1 = _mm256_unpacklo_epi32(R14, R15);
261  T2 = _mm256_unpackhi_epi32(R12, R13);
262  T3 = _mm256_unpackhi_epi32(R14, R15);
263 
264  R12 = _mm256_unpacklo_epi64(T0, T1);
265  R13 = _mm256_unpackhi_epi64(T0, T1);
266  R14 = _mm256_unpacklo_epi64(T2, T3);
267  R15 = _mm256_unpackhi_epi64(T2, T3);
268 
269  __m256i *output_mm = reinterpret_cast<__m256i *>(block.data());
270 
271  _mm256_storeu_si256(output_mm, _mm256_permute2x128_si256(R00, R04, 0 + (2 << 4)));
272  _mm256_storeu_si256(output_mm + 1, _mm256_permute2x128_si256(R08, R12, 0 + (2 << 4)));
273  _mm256_storeu_si256(output_mm + 2, _mm256_permute2x128_si256(R01, R05, 0 + (2 << 4)));
274  _mm256_storeu_si256(output_mm + 3, _mm256_permute2x128_si256(R09, R13, 0 + (2 << 4)));
275  _mm256_storeu_si256(output_mm + 4, _mm256_permute2x128_si256(R02, R06, 0 + (2 << 4)));
276  _mm256_storeu_si256(output_mm + 5, _mm256_permute2x128_si256(R10, R14, 0 + (2 << 4)));
277  _mm256_storeu_si256(output_mm + 6, _mm256_permute2x128_si256(R03, R07, 0 + (2 << 4)));
278  _mm256_storeu_si256(output_mm + 7, _mm256_permute2x128_si256(R11, R15, 0 + (2 << 4)));
279 
280  _mm256_storeu_si256(output_mm + 8, _mm256_permute2x128_si256(R00, R04, 1 + (3 << 4)));
281  _mm256_storeu_si256(output_mm + 9, _mm256_permute2x128_si256(R08, R12, 1 + (3 << 4)));
282  _mm256_storeu_si256(output_mm + 10, _mm256_permute2x128_si256(R01, R05, 1 + (3 << 4)));
283  _mm256_storeu_si256(output_mm + 11, _mm256_permute2x128_si256(R09, R13, 1 + (3 << 4)));
284  _mm256_storeu_si256(output_mm + 12, _mm256_permute2x128_si256(R02, R06, 1 + (3 << 4)));
285  _mm256_storeu_si256(output_mm + 13, _mm256_permute2x128_si256(R10, R14, 1 + (3 << 4)));
286  _mm256_storeu_si256(output_mm + 14, _mm256_permute2x128_si256(R03, R07, 1 + (3 << 4)));
287  _mm256_storeu_si256(output_mm + 15, _mm256_permute2x128_si256(R11, R15, 1 + (3 << 4)));
288 
289  _mm256_zeroall();
290 
291  schedule[12] += 8;
292  if (schedule[12] < 8)
293  schedule[13]++;
294  }
295  };
296  } // namespace detail
297  } // namespace stream
298  } // namespace crypto3
299 } // namespace nil
300 
301 #endif // CRYPTO3_STREAM_CHACHA_AVX2_IMPL_HPP
boost::mpl::apply< AccumulatorSet, tag::block< Mode > >::type::result_type block(const AccumulatorSet &acc)
Definition: accumulators/block.hpp:259
boost::mpl::apply< AccumulatorSet, tag::stream< Mode > >::type::result_type stream(const AccumulatorSet &acc)
Definition: accumulators/stream.hpp:175
Definition: pair.hpp:31
Definition: chacha_avx2_impl.hpp:39
constexpr static const std::size_t rounds
Definition: chacha_avx2_impl.hpp:45
policy_type::word_type word_type
Definition: chacha_avx2_impl.hpp:43
constexpr static const std::size_t block_bits
Definition: chacha_avx2_impl.hpp:51
constexpr static const std::size_t block_size
Definition: chacha_avx2_impl.hpp:52
policy_type::block_type block_type
Definition: chacha_avx2_impl.hpp:53
constexpr static const std::size_t min_key_schedule_size
Definition: chacha_avx2_impl.hpp:48
constexpr static const std::size_t min_key_schedule_bits
Definition: chacha_avx2_impl.hpp:47
policy_type::key_schedule_type key_schedule_type
Definition: chacha_avx2_impl.hpp:49
constexpr static const std::size_t word_bits
Definition: chacha_avx2_impl.hpp:42
chacha_policy< Round, IVSize, KeyBits > policy_type
Definition: chacha_avx2_impl.hpp:40
static BOOST_ATTRIBUTE_TARGET("avx2") void chacha_x8(const std
Definition: chacha_avx2_impl.hpp:55
constexpr static const std::size_t block_size
Definition: chacha_policy.hpp:53
constexpr static const std::size_t key_schedule_bits
Definition: chacha_policy.hpp:66
constexpr static const std::size_t rounds
Definition: chacha_policy.hpp:47
constexpr static const std::size_t word_bits
Definition: chacha_policy.hpp:44
basic_functions< 32 >::word_type word_type
Definition: chacha_policy.hpp:45
constexpr static const std::size_t block_bits
Definition: chacha_policy.hpp:54
std::array< word_type, key_schedule_size > key_schedule_type
Definition: chacha_policy.hpp:67
std::array< byte_type, block_size > block_type
Definition: chacha_policy.hpp:55
constexpr static const std::size_t key_schedule_size
Definition: chacha_policy.hpp:65