chacha_sse2_impl.hpp
Go to the documentation of this file.
1 //---------------------------------------------------------------------------//
2 // Copyright (c) 2019 Mikhail Komarov <nemo@nil.foundation>
3 //
4 // MIT License
5 //
6 // Permission is hereby granted, free of charge, to any person obtaining a copy
7 // of this software and associated documentation files (the "Software"), to deal
8 // in the Software without restriction, including without limitation the rights
9 // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 // copies of the Software, and to permit persons to whom the Software is
11 // furnished to do so, subject to the following conditions:
12 //
13 // The above copyright notice and this permission notice shall be included in all
14 // copies or substantial portions of the Software.
15 //
16 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 // SOFTWARE.
23 //---------------------------------------------------------------------------//
24 
25 #ifndef CRYPTO3_STREAM_CHACHA_SSE2_IMPL_HPP
26 #define CRYPTO3_STREAM_CHACHA_SSE2_IMPL_HPP
27 
28 #include <nil/crypto3/detail/config.hpp>
29 
31 
32 #include <emmintrin.h>
33 
34 namespace nil {
35  namespace crypto3 {
36  namespace stream {
37  namespace detail {
// Template head of the SSE2 ChaCha implementation type. The three parameters
// are forwarded to chacha_policy<Round, IVSize, KeyBits>, which this listing's
// cross-reference section names as policy_type.
// NOTE(review): listing lines 39-40 (the struct head and the policy_type alias)
// are missing from this extract - confirm against the original header.
38  template<std::size_t Round, std::size_t IVSize, std::size_t KeyBits>
41 
// Word width in bits, re-exported from the policy.
42  constexpr static const std::size_t word_bits = policy_type::word_bits;
44 
// Round count from the policy; chacha_x4 iterates rounds / 2 times, performing
// one column round and one diagonal round per iteration.
45  constexpr static const std::size_t rounds = policy_type::rounds;
46 
// Key schedule dimensions, re-exported from the policy under min_* names.
47  constexpr static const std::size_t min_key_schedule_bits = policy_type::key_schedule_bits;
48  constexpr static const std::size_t min_key_schedule_size = policy_type::key_schedule_size;
50 
// Size of a single keystream block in bits and in bytes; chacha_x4 and
// chacha_x8 take buffers of block_size * 4 and block_size * 8 bytes.
51  constexpr static const std::size_t block_bits = policy_type::block_bits;
52  constexpr static const std::size_t block_size = policy_type::block_size;
54 
55  inline static void chacha_x8(const std::array<std::uint8_t, block_size * 8> &block,
56  key_schedule_type &schedule) {
57  chacha_x4(block, schedule);
58  chacha_x4(std::array<std::uint8_t, block_size * 4>(block.begin() + block_size * 4, block.end()),
59  schedule);
60  }
61 
62  static BOOST_ATTRIBUTE_TARGET("sse2") void chacha_x4(
63  const std::array<std::uint8_t, block_size * 4> &block,
64  key_schedule_type &schedule) {
65  const __m128i *input_mm = reinterpret_cast<const __m128i *>(schedule);
66  __m128i *output_mm = reinterpret_cast<__m128i *>(block);
67 
68  __m128i input0 = _mm_loadu_si128(input_mm);
69  __m128i input1 = _mm_loadu_si128(input_mm + 1);
70  __m128i input2 = _mm_loadu_si128(input_mm + 2);
71  __m128i input3 = _mm_loadu_si128(input_mm + 3);
72 
73  // TODO: try transposing, which would avoid the permutations each round
74 
75 #define mm_rotl(r, n) _mm_or_si128(_mm_slli_epi32(r, n), _mm_srli_epi32(r, 32 - (n)))
76 
77  __m128i r0_0 = input0;
78  __m128i r0_1 = input1;
79  __m128i r0_2 = input2;
80  __m128i r0_3 = input3;
81 
82  __m128i r1_0 = input0;
83  __m128i r1_1 = input1;
84  __m128i r1_2 = input2;
85  __m128i r1_3 = _mm_add_epi64(r0_3, _mm_set_epi32(0, 0, 0, 1));
86 
87  __m128i r2_0 = input0;
88  __m128i r2_1 = input1;
89  __m128i r2_2 = input2;
90  __m128i r2_3 = _mm_add_epi64(r0_3, _mm_set_epi32(0, 0, 0, 2));
91 
92  __m128i r3_0 = input0;
93  __m128i r3_1 = input1;
94  __m128i r3_2 = input2;
95  __m128i r3_3 = _mm_add_epi64(r0_3, _mm_set_epi32(0, 0, 0, 3));
96 
97  for (size_t r = 0; r != rounds / 2; ++r) {
98  r0_0 = _mm_add_epi32(r0_0, r0_1);
99  r1_0 = _mm_add_epi32(r1_0, r1_1);
100  r2_0 = _mm_add_epi32(r2_0, r2_1);
101  r3_0 = _mm_add_epi32(r3_0, r3_1);
102 
103  r0_3 = _mm_xor_si128(r0_3, r0_0);
104  r1_3 = _mm_xor_si128(r1_3, r1_0);
105  r2_3 = _mm_xor_si128(r2_3, r2_0);
106  r3_3 = _mm_xor_si128(r3_3, r3_0);
107 
108  r0_3 = mm_rotl(r0_3, 16);
109  r1_3 = mm_rotl(r1_3, 16);
110  r2_3 = mm_rotl(r2_3, 16);
111  r3_3 = mm_rotl(r3_3, 16);
112 
113  r0_2 = _mm_add_epi32(r0_2, r0_3);
114  r1_2 = _mm_add_epi32(r1_2, r1_3);
115  r2_2 = _mm_add_epi32(r2_2, r2_3);
116  r3_2 = _mm_add_epi32(r3_2, r3_3);
117 
118  r0_1 = _mm_xor_si128(r0_1, r0_2);
119  r1_1 = _mm_xor_si128(r1_1, r1_2);
120  r2_1 = _mm_xor_si128(r2_1, r2_2);
121  r3_1 = _mm_xor_si128(r3_1, r3_2);
122 
123  r0_1 = mm_rotl(r0_1, 12);
124  r1_1 = mm_rotl(r1_1, 12);
125  r2_1 = mm_rotl(r2_1, 12);
126  r3_1 = mm_rotl(r3_1, 12);
127 
128  r0_0 = _mm_add_epi32(r0_0, r0_1);
129  r1_0 = _mm_add_epi32(r1_0, r1_1);
130  r2_0 = _mm_add_epi32(r2_0, r2_1);
131  r3_0 = _mm_add_epi32(r3_0, r3_1);
132 
133  r0_3 = _mm_xor_si128(r0_3, r0_0);
134  r1_3 = _mm_xor_si128(r1_3, r1_0);
135  r2_3 = _mm_xor_si128(r2_3, r2_0);
136  r3_3 = _mm_xor_si128(r3_3, r3_0);
137 
138  r0_3 = mm_rotl(r0_3, 8);
139  r1_3 = mm_rotl(r1_3, 8);
140  r2_3 = mm_rotl(r2_3, 8);
141  r3_3 = mm_rotl(r3_3, 8);
142 
143  r0_2 = _mm_add_epi32(r0_2, r0_3);
144  r1_2 = _mm_add_epi32(r1_2, r1_3);
145  r2_2 = _mm_add_epi32(r2_2, r2_3);
146  r3_2 = _mm_add_epi32(r3_2, r3_3);
147 
148  r0_1 = _mm_xor_si128(r0_1, r0_2);
149  r1_1 = _mm_xor_si128(r1_1, r1_2);
150  r2_1 = _mm_xor_si128(r2_1, r2_2);
151  r3_1 = _mm_xor_si128(r3_1, r3_2);
152 
153  r0_1 = mm_rotl(r0_1, 7);
154  r1_1 = mm_rotl(r1_1, 7);
155  r2_1 = mm_rotl(r2_1, 7);
156  r3_1 = mm_rotl(r3_1, 7);
157 
158  r0_1 = _mm_shuffle_epi32(r0_1, _MM_SHUFFLE(0, 3, 2, 1));
159  r0_2 = _mm_shuffle_epi32(r0_2, _MM_SHUFFLE(1, 0, 3, 2));
160  r0_3 = _mm_shuffle_epi32(r0_3, _MM_SHUFFLE(2, 1, 0, 3));
161 
162  r1_1 = _mm_shuffle_epi32(r1_1, _MM_SHUFFLE(0, 3, 2, 1));
163  r1_2 = _mm_shuffle_epi32(r1_2, _MM_SHUFFLE(1, 0, 3, 2));
164  r1_3 = _mm_shuffle_epi32(r1_3, _MM_SHUFFLE(2, 1, 0, 3));
165 
166  r2_1 = _mm_shuffle_epi32(r2_1, _MM_SHUFFLE(0, 3, 2, 1));
167  r2_2 = _mm_shuffle_epi32(r2_2, _MM_SHUFFLE(1, 0, 3, 2));
168  r2_3 = _mm_shuffle_epi32(r2_3, _MM_SHUFFLE(2, 1, 0, 3));
169 
170  r3_1 = _mm_shuffle_epi32(r3_1, _MM_SHUFFLE(0, 3, 2, 1));
171  r3_2 = _mm_shuffle_epi32(r3_2, _MM_SHUFFLE(1, 0, 3, 2));
172  r3_3 = _mm_shuffle_epi32(r3_3, _MM_SHUFFLE(2, 1, 0, 3));
173 
174  r0_0 = _mm_add_epi32(r0_0, r0_1);
175  r1_0 = _mm_add_epi32(r1_0, r1_1);
176  r2_0 = _mm_add_epi32(r2_0, r2_1);
177  r3_0 = _mm_add_epi32(r3_0, r3_1);
178 
179  r0_3 = _mm_xor_si128(r0_3, r0_0);
180  r1_3 = _mm_xor_si128(r1_3, r1_0);
181  r2_3 = _mm_xor_si128(r2_3, r2_0);
182  r3_3 = _mm_xor_si128(r3_3, r3_0);
183 
184  r0_3 = mm_rotl(r0_3, 16);
185  r1_3 = mm_rotl(r1_3, 16);
186  r2_3 = mm_rotl(r2_3, 16);
187  r3_3 = mm_rotl(r3_3, 16);
188 
189  r0_2 = _mm_add_epi32(r0_2, r0_3);
190  r1_2 = _mm_add_epi32(r1_2, r1_3);
191  r2_2 = _mm_add_epi32(r2_2, r2_3);
192  r3_2 = _mm_add_epi32(r3_2, r3_3);
193 
194  r0_1 = _mm_xor_si128(r0_1, r0_2);
195  r1_1 = _mm_xor_si128(r1_1, r1_2);
196  r2_1 = _mm_xor_si128(r2_1, r2_2);
197  r3_1 = _mm_xor_si128(r3_1, r3_2);
198 
199  r0_1 = mm_rotl(r0_1, 12);
200  r1_1 = mm_rotl(r1_1, 12);
201  r2_1 = mm_rotl(r2_1, 12);
202  r3_1 = mm_rotl(r3_1, 12);
203 
204  r0_0 = _mm_add_epi32(r0_0, r0_1);
205  r1_0 = _mm_add_epi32(r1_0, r1_1);
206  r2_0 = _mm_add_epi32(r2_0, r2_1);
207  r3_0 = _mm_add_epi32(r3_0, r3_1);
208 
209  r0_3 = _mm_xor_si128(r0_3, r0_0);
210  r1_3 = _mm_xor_si128(r1_3, r1_0);
211  r2_3 = _mm_xor_si128(r2_3, r2_0);
212  r3_3 = _mm_xor_si128(r3_3, r3_0);
213 
214  r0_3 = mm_rotl(r0_3, 8);
215  r1_3 = mm_rotl(r1_3, 8);
216  r2_3 = mm_rotl(r2_3, 8);
217  r3_3 = mm_rotl(r3_3, 8);
218 
219  r0_2 = _mm_add_epi32(r0_2, r0_3);
220  r1_2 = _mm_add_epi32(r1_2, r1_3);
221  r2_2 = _mm_add_epi32(r2_2, r2_3);
222  r3_2 = _mm_add_epi32(r3_2, r3_3);
223 
224  r0_1 = _mm_xor_si128(r0_1, r0_2);
225  r1_1 = _mm_xor_si128(r1_1, r1_2);
226  r2_1 = _mm_xor_si128(r2_1, r2_2);
227  r3_1 = _mm_xor_si128(r3_1, r3_2);
228 
229  r0_1 = mm_rotl(r0_1, 7);
230  r1_1 = mm_rotl(r1_1, 7);
231  r2_1 = mm_rotl(r2_1, 7);
232  r3_1 = mm_rotl(r3_1, 7);
233 
234  r0_1 = _mm_shuffle_epi32(r0_1, _MM_SHUFFLE(2, 1, 0, 3));
235  r0_2 = _mm_shuffle_epi32(r0_2, _MM_SHUFFLE(1, 0, 3, 2));
236  r0_3 = _mm_shuffle_epi32(r0_3, _MM_SHUFFLE(0, 3, 2, 1));
237 
238  r1_1 = _mm_shuffle_epi32(r1_1, _MM_SHUFFLE(2, 1, 0, 3));
239  r1_2 = _mm_shuffle_epi32(r1_2, _MM_SHUFFLE(1, 0, 3, 2));
240  r1_3 = _mm_shuffle_epi32(r1_3, _MM_SHUFFLE(0, 3, 2, 1));
241 
242  r2_1 = _mm_shuffle_epi32(r2_1, _MM_SHUFFLE(2, 1, 0, 3));
243  r2_2 = _mm_shuffle_epi32(r2_2, _MM_SHUFFLE(1, 0, 3, 2));
244  r2_3 = _mm_shuffle_epi32(r2_3, _MM_SHUFFLE(0, 3, 2, 1));
245 
246  r3_1 = _mm_shuffle_epi32(r3_1, _MM_SHUFFLE(2, 1, 0, 3));
247  r3_2 = _mm_shuffle_epi32(r3_2, _MM_SHUFFLE(1, 0, 3, 2));
248  r3_3 = _mm_shuffle_epi32(r3_3, _MM_SHUFFLE(0, 3, 2, 1));
249  }
250 
251  r0_0 = _mm_add_epi32(r0_0, input0);
252  r0_1 = _mm_add_epi32(r0_1, input1);
253  r0_2 = _mm_add_epi32(r0_2, input2);
254  r0_3 = _mm_add_epi32(r0_3, input3);
255 
256  r1_0 = _mm_add_epi32(r1_0, input0);
257  r1_1 = _mm_add_epi32(r1_1, input1);
258  r1_2 = _mm_add_epi32(r1_2, input2);
259  r1_3 = _mm_add_epi32(r1_3, input3);
260  r1_3 = _mm_add_epi64(r1_3, _mm_set_epi32(0, 0, 0, 1));
261 
262  r2_0 = _mm_add_epi32(r2_0, input0);
263  r2_1 = _mm_add_epi32(r2_1, input1);
264  r2_2 = _mm_add_epi32(r2_2, input2);
265  r2_3 = _mm_add_epi32(r2_3, input3);
266  r2_3 = _mm_add_epi64(r2_3, _mm_set_epi32(0, 0, 0, 2));
267 
268  r3_0 = _mm_add_epi32(r3_0, input0);
269  r3_1 = _mm_add_epi32(r3_1, input1);
270  r3_2 = _mm_add_epi32(r3_2, input2);
271  r3_3 = _mm_add_epi32(r3_3, input3);
272  r3_3 = _mm_add_epi64(r3_3, _mm_set_epi32(0, 0, 0, 3));
273 
274  _mm_storeu_si128(output_mm + 0, r0_0);
275  _mm_storeu_si128(output_mm + 1, r0_1);
276  _mm_storeu_si128(output_mm + 2, r0_2);
277  _mm_storeu_si128(output_mm + 3, r0_3);
278 
279  _mm_storeu_si128(output_mm + 4, r1_0);
280  _mm_storeu_si128(output_mm + 5, r1_1);
281  _mm_storeu_si128(output_mm + 6, r1_2);
282  _mm_storeu_si128(output_mm + 7, r1_3);
283 
284  _mm_storeu_si128(output_mm + 8, r2_0);
285  _mm_storeu_si128(output_mm + 9, r2_1);
286  _mm_storeu_si128(output_mm + 10, r2_2);
287  _mm_storeu_si128(output_mm + 11, r2_3);
288 
289  _mm_storeu_si128(output_mm + 12, r3_0);
290  _mm_storeu_si128(output_mm + 13, r3_1);
291  _mm_storeu_si128(output_mm + 14, r3_2);
292  _mm_storeu_si128(output_mm + 15, r3_3);
293 
294 #undef mm_rotl
295 
296  schedule[12] += 4;
297  if (schedule[12] < 4) {
298  schedule[13]++;
299  }
300  }
301  };
302  } // namespace detail
303  } // namespace stream
304  } // namespace crypto3
305 } // namespace nil
306 
307 #endif // CRYPTO3_STREAM_CHACHA_SSE2_IMPL_HPP
#define mm_rotl(r, n)
boost::mpl::apply< AccumulatorSet, tag::block< Mode > >::type::result_type block(const AccumulatorSet &acc)
Definition: accumulators/block.hpp:259
boost::mpl::apply< AccumulatorSet, tag::stream< Mode > >::type::result_type stream(const AccumulatorSet &acc)
Definition: accumulators/stream.hpp:175
Definition: pair.hpp:31
constexpr static const std::size_t block_size
Definition: chacha_policy.hpp:53
constexpr static const std::size_t key_schedule_bits
Definition: chacha_policy.hpp:66
constexpr static const std::size_t rounds
Definition: chacha_policy.hpp:47
constexpr static const std::size_t word_bits
Definition: chacha_policy.hpp:44
basic_functions< 32 >::word_type word_type
Definition: chacha_policy.hpp:45
constexpr static const std::size_t block_bits
Definition: chacha_policy.hpp:54
std::array< word_type, key_schedule_size > key_schedule_type
Definition: chacha_policy.hpp:67
std::array< byte_type, block_size > block_type
Definition: chacha_policy.hpp:55
constexpr static const std::size_t key_schedule_size
Definition: chacha_policy.hpp:65
Definition: chacha_sse2_impl.hpp:39
constexpr static const std::size_t block_size
Definition: chacha_sse2_impl.hpp:52
constexpr static const std::size_t min_key_schedule_size
Definition: chacha_sse2_impl.hpp:48
static void chacha_x8(const std::array< std::uint8_t, block_size *8 > &block, key_schedule_type &schedule)
Definition: chacha_sse2_impl.hpp:55
constexpr static const std::size_t block_bits
Definition: chacha_sse2_impl.hpp:51
policy_type::block_type block_type
Definition: chacha_sse2_impl.hpp:53
chacha_policy< Round, IVSize, KeyBits > policy_type
Definition: chacha_sse2_impl.hpp:40
static BOOST_ATTRIBUTE_TARGET("sse2") void chacha_x4(const std
Definition: chacha_sse2_impl.hpp:62
policy_type::key_schedule_type key_schedule_type
Definition: chacha_sse2_impl.hpp:49
constexpr static const std::size_t min_key_schedule_bits
Definition: chacha_sse2_impl.hpp:47
policy_type::word_type word_type
Definition: chacha_sse2_impl.hpp:43
constexpr static const std::size_t rounds
Definition: chacha_sse2_impl.hpp:45
constexpr static const std::size_t word_bits
Definition: chacha_sse2_impl.hpp:42