rijndael_ssse3_impl.hpp
Go to the documentation of this file.
1 //---------------------------------------------------------------------------//
2 // Copyright (c) 2018-2020 Mikhail Komarov <nemo@nil.foundation>
3 //
4 // MIT License
5 //
6 // Permission is hereby granted, free of charge, to any person obtaining a copy
7 // of this software and associated documentation files (the "Software"), to deal
8 // in the Software without restriction, including without limitation the rights
9 // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 // copies of the Software, and to permit persons to whom the Software is
11 // furnished to do so, subject to the following conditions:
12 //
13 // The above copyright notice and this permission notice shall be included in all
14 // copies or substantial portions of the Software.
15 //
16 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 // SOFTWARE.
23 //---------------------------------------------------------------------------//
24 // @file AES using SSSE3
25 //
26 // @brief This is more or less a direct translation of public domain x86-64
27 // assembly written by Mike Hamburg, described in "Accelerating AES
28 // with Vector Permute Instructions" (CHES 2009). His original code is
29 // available at https://crypto.stanford.edu/vpaes/
30 //---------------------------------------------------------------------------//
31 
32 #ifndef CRYPTO3_SSSE3_RIJNDAEL_IMPL_HPP
33 #define CRYPTO3_SSSE3_RIJNDAEL_IMPL_HPP
34 
#include <cstddef>
#include <cstdint>

#include <tmmintrin.h>

#include <boost/static_assert.hpp>

#include <nil/crypto3/detail/config.hpp>
42 
43 namespace nil {
44  namespace crypto3 {
45  namespace block {
46  namespace detail {
/*!
 * @brief Shared lookup tables and helpers for the vector-permute AES routines below.
 *
 * All tables are 16-byte vectors meant to be indexed per byte with PSHUFB
 * (_mm_shuffle_epi8). _mm_set_epi32 takes its arguments from the most significant
 * 32-bit lane down to the least significant one.
 */

// 0x0F in every byte: selects the low nibble of each byte (and, negated, the high nibble)
const __m128i low_nibs = _mm_set1_epi8(0x0F);

// Table pair applied to raw input blocks / key material on entry
// (low-nibble table and high-nibble table, see aes_schedule_transform)
const __m128i k_ipt1 = _mm_set_epi32(0xCABAE090, 0x52227808, 0xC2B2E898, 0x5A2A7000);
const __m128i k_ipt2 = _mm_set_epi32(0xCD80B1FC, 0xB0FDCC81, 0x4C01307D, 0x317C4D00);

// Nibble tables used by the shared round core of the cipher and of the key schedule
const __m128i k_inv1 = _mm_set_epi32(0x04070309, 0x0A0B0C02, 0x0E05060F, 0x0D080180);
const __m128i k_inv2 = _mm_set_epi32(0x030D0E0C, 0x02050809, 0x01040A06, 0x0F0B0780);

// Output table pair of the round core (used by encryption rounds and aes_schedule_round)
const __m128i sb1u = _mm_set_epi32(0xA5DF7A6E, 0x142AF544, 0xB19BE18F, 0xCB503E00);
const __m128i sb1t = _mm_set_epi32(0x3BF7CCC1, 0x0D2ED9EF, 0x3618D415, 0xFAE22300);

// Byte-shuffle masks. Entry 0 rotates the four bytes inside every 32-bit lane by one
// position; every following entry is the previous one rotated by one 32-bit lane.
const __m128i mc_forward[4] = {
    _mm_set_epi32(0x0C0F0E0D, 0x080B0A09, 0x04070605, 0x00030201),
    _mm_set_epi32(0x00030201, 0x0C0F0E0D, 0x080B0A09, 0x04070605),
    _mm_set_epi32(0x04070605, 0x00030201, 0x0C0F0E0D, 0x080B0A09),
    _mm_set_epi32(0x080B0A09, 0x04070605, 0x00030201, 0x0C0F0E0D),
};

// Byte-shuffle masks selected by (round number % 4); entry 0 is the identity permutation
const __m128i sr[4] = {
    _mm_set_epi32(0x0F0E0D0C, 0x0B0A0908, 0x07060504, 0x03020100),
    _mm_set_epi32(0x0B06010C, 0x07020D08, 0x030E0904, 0x0F0A0500),
    _mm_set_epi32(0x070E050C, 0x030A0108, 0x0F060D04, 0x0B020900),
    _mm_set_epi32(0x0306090C, 0x0F020508, 0x0B0E0104, 0x070A0D00),
};

// Three-way XOR of 128-bit vectors
#define mm_xor3(x, y, z) _mm_xor_si128(_mm_xor_si128(x, y), z)
75 
76  BOOST_ATTRIBUTE_TARGET("ssse3")
77  __m128i aes_schedule_transform(__m128i input, __m128i table_1, __m128i table_2) {
78  __m128i i_1 = _mm_and_si128(low_nibs, input);
79  __m128i i_2 = _mm_srli_epi32(_mm_andnot_si128(low_nibs, input), 4);
80 
81  return _mm_xor_si128(_mm_shuffle_epi8(table_1, i_1), _mm_shuffle_epi8(table_2, i_2));
82  }
83 
84  BOOST_ATTRIBUTE_TARGET("ssse3") __m128i aes_schedule_mangle(__m128i k, uint8_t round_no) {
85  __m128i t = _mm_shuffle_epi8(_mm_xor_si128(k, _mm_set1_epi8(0x5B)), mc_forward[0]);
86 
87  __m128i t2 = t;
88 
89  t = _mm_shuffle_epi8(t, mc_forward[0]);
90 
91  t2 = mm_xor3(t2, t, _mm_shuffle_epi8(t, mc_forward[0]));
92 
93  return _mm_shuffle_epi8(t2, sr[round_no % 4]);
94  }
95 
96  BOOST_ATTRIBUTE_TARGET("ssse3") __m128i aes_schedule_192_smear(__m128i x, __m128i y) {
97  return mm_xor3(y, _mm_shuffle_epi32(x, 0xFE), _mm_shuffle_epi32(y, 0x80));
98  }
99 
100  BOOST_ATTRIBUTE_TARGET("ssse3") __m128i aes_schedule_mangle_dec(__m128i k, uint8_t round_no) {
101  const __m128i dsk[8] = {_mm_set_epi32(0x4AED9334, 0x82255BFC, 0xB6116FC8, 0x7ED9A700),
102  _mm_set_epi32(0x8BB89FAC, 0xE9DAFDCE, 0x45765162, 0x27143300),
103  _mm_set_epi32(0x4622EE8A, 0xADC90561, 0x27438FEB, 0xCCA86400),
104  _mm_set_epi32(0x73AEE13C, 0xBD602FF2, 0x815C13CE, 0x4F92DD00),
105  _mm_set_epi32(0xF83F3EF9, 0xFA3D3CFB, 0x03C4C502, 0x01C6C700),
106  _mm_set_epi32(0xA5526A9D, 0x7384BC4B, 0xEE1921D6, 0x38CFF700),
107  _mm_set_epi32(0xA080D3F3, 0x10306343, 0xE3C390B0, 0x53732000),
108  _mm_set_epi32(0x2F45AEC4, 0x8CE60D67, 0xA0CA214B, 0x036982E8)};
109 
110  __m128i t = aes_schedule_transform(k, dsk[0], dsk[1]);
111  __m128i output = _mm_shuffle_epi8(t, mc_forward[0]);
112 
113  t = aes_schedule_transform(t, dsk[2], dsk[3]);
114  output = _mm_shuffle_epi8(_mm_xor_si128(t, output), mc_forward[0]);
115 
116  t = aes_schedule_transform(t, dsk[4], dsk[5]);
117  output = _mm_shuffle_epi8(_mm_xor_si128(t, output), mc_forward[0]);
118 
119  t = aes_schedule_transform(t, dsk[6], dsk[7]);
120  output = _mm_shuffle_epi8(_mm_xor_si128(t, output), mc_forward[0]);
121 
122  return _mm_shuffle_epi8(output, sr[round_no % 4]);
123  }
124 
125  BOOST_ATTRIBUTE_TARGET("ssse3") __m128i aes_schedule_mangle_last(__m128i k, uint8_t round_no) {
126  const __m128i out_tr1 = _mm_set_epi32(0xF7974121, 0xDEBE6808, 0xFF9F4929, 0xD6B66000);
127  const __m128i out_tr2 = _mm_set_epi32(0xE10D5DB1, 0xB05C0CE0, 0x01EDBD51, 0x50BCEC00);
128 
129  k = _mm_shuffle_epi8(k, sr[round_no % 4]);
130  k = _mm_xor_si128(k, _mm_set1_epi8(0x5B));
131  return aes_schedule_transform(k, out_tr1, out_tr2);
132  }
133 
134  BOOST_ATTRIBUTE_TARGET("ssse3") __m128i aes_schedule_mangle_last_dec(__m128i k) {
135  const __m128i deskew1 = _mm_set_epi32(0x1DFEB95A, 0x5DBEF91A, 0x07E4A340, 0x47A4E300);
136  const __m128i deskew2 = _mm_set_epi32(0x2841C2AB, 0xF49D1E77, 0x5F36B5DC, 0x83EA6900);
137 
138  k = _mm_xor_si128(k, _mm_set1_epi8(0x5B));
139  return aes_schedule_transform(k, deskew1, deskew2);
140  }
141 
142  BOOST_ATTRIBUTE_TARGET("ssse3") __m128i aes_schedule_round(__m128i *rcon, __m128i input1, __m128i input2) {
143  if (rcon) {
144  input2 = _mm_xor_si128(_mm_alignr_epi8(_mm_setzero_si128(), *rcon, 15), input2);
145 
146  *rcon = _mm_alignr_epi8(*rcon, *rcon, 15); // next rcon
147 
148  input1 = _mm_shuffle_epi32(input1, 0xFF); // rotate
149  input1 = _mm_alignr_epi8(input1, input1, 1);
150  }
151 
152  __m128i smeared = _mm_xor_si128(input2, _mm_slli_si128(input2, 4));
153  smeared = mm_xor3(smeared, _mm_slli_si128(smeared, 8), _mm_set1_epi8(0x5B));
154 
155  __m128i t = _mm_srli_epi32(_mm_andnot_si128(low_nibs, input1), 4);
156 
157  input1 = _mm_and_si128(low_nibs, input1);
158 
159  __m128i t2 = _mm_shuffle_epi8(k_inv2, input1);
160 
161  input1 = _mm_xor_si128(input1, t);
162 
163  __m128i t3 = _mm_xor_si128(t2, _mm_shuffle_epi8(k_inv1, t));
164  __m128i t4 = _mm_xor_si128(t2, _mm_shuffle_epi8(k_inv1, input1));
165 
166  __m128i t5 = _mm_xor_si128(input1, _mm_shuffle_epi8(k_inv1, t3));
167  __m128i t6 = _mm_xor_si128(t, _mm_shuffle_epi8(k_inv1, t4));
168 
169  return mm_xor3(_mm_shuffle_epi8(sb1u, t5), _mm_shuffle_epi8(sb1t, t6), smeared);
170  }
171 
172  BOOST_ATTRIBUTE_TARGET("ssse3") __m128i aes_ssse3_encrypt(__m128i B, const __m128i *keys, size_t rounds) {
173  const __m128i sb2u = _mm_set_epi32(0x5EB7E955, 0xBC982FCD, 0xE27A93C6, 0x0B712400);
174  const __m128i sb2t = _mm_set_epi32(0xC2A163C8, 0xAB82234A, 0x69EB8840, 0x0AE12900);
175 
176  const __m128i sbou = _mm_set_epi32(0x15AABF7A, 0xC502A878, 0xD0D26D17, 0x6FBDC700);
177  const __m128i sbot = _mm_set_epi32(0x8E1E90D1, 0x412B35FA, 0xCFE474A5, 0x5FBB6A00);
178 
179  const __m128i mc_backward[4] = {
180  _mm_set_epi32(0x0E0D0C0F, 0x0A09080B, 0x06050407, 0x02010003),
181  _mm_set_epi32(0x0A09080B, 0x06050407, 0x02010003, 0x0E0D0C0F),
182  _mm_set_epi32(0x06050407, 0x02010003, 0x0E0D0C0F, 0x0A09080B),
183  _mm_set_epi32(0x02010003, 0x0E0D0C0F, 0x0A09080B, 0x06050407),
184  };
185 
186  B = mm_xor3(_mm_shuffle_epi8(k_ipt1, _mm_and_si128(low_nibs, B)),
187  _mm_shuffle_epi8(k_ipt2, _mm_srli_epi32(_mm_andnot_si128(low_nibs, B), 4)),
188  _mm_loadu_si128(keys));
189 
190  for (size_t r = 1;; ++r) {
191  const __m128i K = _mm_loadu_si128(keys + r);
192 
193  __m128i t = _mm_srli_epi32(_mm_andnot_si128(low_nibs, B), 4);
194 
195  B = _mm_and_si128(low_nibs, B);
196 
197  __m128i t2 = _mm_shuffle_epi8(k_inv2, B);
198 
199  B = _mm_xor_si128(B, t);
200 
201  __m128i t3 = _mm_xor_si128(t2, _mm_shuffle_epi8(k_inv1, t));
202  __m128i t4 = _mm_xor_si128(t2, _mm_shuffle_epi8(k_inv1, B));
203 
204  __m128i t5 = _mm_xor_si128(B, _mm_shuffle_epi8(k_inv1, t3));
205  __m128i t6 = _mm_xor_si128(t, _mm_shuffle_epi8(k_inv1, t4));
206 
207  if (r == rounds) {
208  B = _mm_shuffle_epi8(mm_xor3(_mm_shuffle_epi8(sbou, t5), _mm_shuffle_epi8(sbot, t6), K),
209  sr[r % 4]);
210 
211  return B;
212  }
213 
214  __m128i t7 = mm_xor3(_mm_shuffle_epi8(sb1t, t6), _mm_shuffle_epi8(sb1u, t5), K);
215 
216  __m128i t8 = mm_xor3(_mm_shuffle_epi8(sb2t, t6), _mm_shuffle_epi8(sb2u, t5),
217  _mm_shuffle_epi8(t7, mc_forward[r % 4]));
218 
219  B = mm_xor3(_mm_shuffle_epi8(t8, mc_forward[r % 4]), _mm_shuffle_epi8(t7, mc_backward[r % 4]),
220  t8);
221  }
222  }
223 
224  BOOST_ATTRIBUTE_TARGET("ssse3") __m128i aes_ssse3_decrypt(__m128i B, const __m128i *keys, size_t rounds) {
225  const __m128i k_dipt1 = _mm_set_epi32(0x154A411E, 0x114E451A, 0x0F505B04, 0x0B545F00);
226  const __m128i k_dipt2 = _mm_set_epi32(0x12771772, 0xF491F194, 0x86E383E6, 0x60056500);
227 
228  const __m128i sb9u = _mm_set_epi32(0xCAD51F50, 0x4F994CC9, 0x851C0353, 0x9A86D600);
229  const __m128i sb9t = _mm_set_epi32(0x725E2C9E, 0xB2FBA565, 0xC03B1789, 0xECD74900);
230 
231  const __m128i sbeu = _mm_set_epi32(0x22426004, 0x64B4F6B0, 0x46F29296, 0x26D4D000);
232  const __m128i sbet = _mm_set_epi32(0x9467F36B, 0x98593E32, 0x0C55A6CD, 0xFFAAC100);
233 
234  const __m128i sbdu = _mm_set_epi32(0xF56E9B13, 0x882A4439, 0x7D57CCDF, 0xE6B1A200);
235  const __m128i sbdt = _mm_set_epi32(0x2931180D, 0x15DEEFD3, 0x3CE2FAF7, 0x24C6CB00);
236 
237  const __m128i sbbu = _mm_set_epi32(0x602646F6, 0xB0F2D404, 0xD0226492, 0x96B44200);
238  const __m128i sbbt = _mm_set_epi32(0xF3FF0C3E, 0x3255AA6B, 0xC19498A6, 0xCD596700);
239 
240  __m128i mc = mc_forward[3];
241 
242  __m128i t = _mm_shuffle_epi8(k_dipt2, _mm_srli_epi32(_mm_andnot_si128(low_nibs, B), 4));
243 
244  B = mm_xor3(t, _mm_loadu_si128(keys), _mm_shuffle_epi8(k_dipt1, _mm_and_si128(B, low_nibs)));
245 
246  for (size_t r = 1;; ++r) {
247  const __m128i K = _mm_loadu_si128(keys + r);
248 
249  t = _mm_srli_epi32(_mm_andnot_si128(low_nibs, B), 4);
250 
251  B = _mm_and_si128(low_nibs, B);
252 
253  __m128i t2 = _mm_shuffle_epi8(k_inv2, B);
254 
255  B = _mm_xor_si128(B, t);
256 
257  __m128i t3 = _mm_xor_si128(t2, _mm_shuffle_epi8(k_inv1, t));
258  __m128i t4 = _mm_xor_si128(t2, _mm_shuffle_epi8(k_inv1, B));
259  __m128i t5 = _mm_xor_si128(B, _mm_shuffle_epi8(k_inv1, t3));
260  __m128i t6 = _mm_xor_si128(t, _mm_shuffle_epi8(k_inv1, t4));
261 
262  if (r == rounds) {
263  const __m128i sbou = _mm_set_epi32(0xC7AA6DB9, 0xD4943E2D, 0x1387EA53, 0x7EF94000);
264  const __m128i sbot = _mm_set_epi32(0xCA4B8159, 0xD8C58E9C, 0x12D7560F, 0x93441D00);
265 
266  __m128i x = _mm_shuffle_epi8(sbou, t5);
267  __m128i y = _mm_shuffle_epi8(sbot, t6);
268  x = _mm_xor_si128(x, K);
269  x = _mm_xor_si128(x, y);
270 
271  const uint32_t which_sr = ((((rounds - 1) << 4) ^ 48) & 48) / 16;
272  return _mm_shuffle_epi8(x, sr[which_sr]);
273  }
274 
275  __m128i t8 =
276  _mm_xor_si128(_mm_shuffle_epi8(sb9t, t6), _mm_xor_si128(_mm_shuffle_epi8(sb9u, t5), K));
277 
278  __m128i t9 =
279  mm_xor3(_mm_shuffle_epi8(t8, mc), _mm_shuffle_epi8(sbdu, t5), _mm_shuffle_epi8(sbdt, t6));
280 
281  __m128i t12 = _mm_xor_si128(_mm_xor_si128(_mm_shuffle_epi8(t9, mc), _mm_shuffle_epi8(sbbu, t5)),
282  _mm_shuffle_epi8(sbbt, t6));
283 
284  B = _mm_xor_si128(_mm_xor_si128(_mm_shuffle_epi8(t12, mc), _mm_shuffle_epi8(sbeu, t5)),
285  _mm_shuffle_epi8(sbet, t6));
286 
287  mc = _mm_alignr_epi8(mc, mc, 12);
288  }
289  }
290 
291  template<std::size_t KeyBitsImpl, std::size_t BlockBitsImpl, typename PolicyType>
292  class basic_rijndael_ssse3_impl {
293  BOOST_STATIC_ASSERT(BlockBitsImpl == 128);
294  };
295 
296  template<std::size_t KeyBitsImpl, std::size_t BlockBitsImpl, typename PolicyType>
297  class rijndael_ssse3_impl : public basic_rijndael_ssse3_impl<KeyBitsImpl, BlockBitsImpl, PolicyType> {
298  BOOST_STATIC_ASSERT(BlockBitsImpl == 128);
299  };
300 
301  template<std::size_t KeyBitsImpl, typename PolicyType>
302  class basic_rijndael_ssse3_impl<KeyBitsImpl, 128, PolicyType> {
303  protected:
304  typedef PolicyType policy_type;
305  typedef typename policy_type::block_type block_type;
306  typedef typename policy_type::key_schedule_type key_schedule_type;
307 
308  BOOST_STATIC_ASSERT(PolicyType::key_bits == KeyBitsImpl);
309  BOOST_STATIC_ASSERT(PolicyType::block_bits == 128);
310 
311  public:
312  static block_type encrypt_block(const block_type &plaintext,
313  const key_schedule_type &encryption_key) {
314  block_type out = {0};
315 
316  const __m128i *in_mm = reinterpret_cast<const __m128i *>(plaintext.data());
317  __m128i *out_mm = reinterpret_cast<__m128i *>(out.data());
318 
319  const __m128i *keys = reinterpret_cast<const __m128i *>(encryption_key.data());
320 
321  using namespace nil::crypto3::detail;
322 // poison(plaintext.data(), policy_type::block_bytes);
323 
324  __m128i B = _mm_loadu_si128(in_mm);
325  _mm_storeu_si128(out_mm, detail::aes_ssse3_encrypt(B, keys, policy_type::rounds));
326 
327 // unpoison(plaintext.data(), policy_type::block_bytes);
328 // unpoison(out.data(), policy_type::block_bytes);
329 
330  return out;
331  }
332 
333  static block_type decrypt_block(const block_type &plaintext,
334  const key_schedule_type &decryption_key) {
335  block_type out = {0};
336 
337  const __m128i *in_mm = reinterpret_cast<const __m128i *>(plaintext.data());
338  __m128i *out_mm = reinterpret_cast<__m128i *>(out.data());
339 
340  const __m128i *keys = reinterpret_cast<const __m128i *>(decryption_key.data());
341 
342  using namespace nil::crypto3::detail;
343 // poison(plaintext.data(), policy_type::block_bytes);
344 
345  __m128i B = _mm_loadu_si128(in_mm);
346  _mm_storeu_si128(out_mm, detail::aes_ssse3_decrypt(B, keys, policy_type::rounds));
347 
348 // unpoison(plaintext.data(), policy_type::block_bytes);
349 // unpoison(out.data(), policy_type::block_bytes);
350 
351  return out;
352  }
353  };
354 
355  template<typename PolicyType>
356  class rijndael_ssse3_impl<128, 128, PolicyType>
357  : public basic_rijndael_ssse3_impl<128, 128, PolicyType> {
358  protected:
359  typedef typename basic_rijndael_ssse3_impl<128, 128, PolicyType>::policy_type policy_type;
360 
361  typedef typename policy_type::block_type block_type;
362  typedef typename policy_type::key_type key_type;
363  typedef typename policy_type::key_schedule_type key_schedule_type;
364 
365  public:
366  static void schedule_key(const key_type &input_key, key_schedule_type &encryption_key,
367  key_schedule_type &decryption_key) {
368  __m128i rcon = _mm_set_epi32(0x702A9808, 0x4D7C7D81, 0x1F8391B9, 0xAF9DEEB6);
369 
370  __m128i key = _mm_loadu_si128(reinterpret_cast<const __m128i *>(input_key.data()));
371 
372  __m128i *encryption_key_mm = reinterpret_cast<__m128i *>(encryption_key.data());
373  __m128i *decryption_key_mm = reinterpret_cast<__m128i *>(decryption_key.data());
374 
375  _mm_storeu_si128(decryption_key_mm + policy_type::rounds, _mm_shuffle_epi8(key, detail::sr[2]));
376 
377  key = detail::aes_schedule_transform(key, detail::k_ipt1, detail::k_ipt2);
378 
379  _mm_storeu_si128(encryption_key_mm, key);
380 
381 
382  for (size_t i = 1; i != policy_type::rounds; ++i) {
383  key = detail::aes_schedule_round(&rcon, key, key);
384 
385  _mm_storeu_si128(encryption_key_mm + i, detail::aes_schedule_mangle(key, (12 - i) % 4));
386 
387  _mm_storeu_si128(decryption_key_mm + (policy_type::rounds - i),
388  detail::aes_schedule_mangle_dec(key, (10 - i) % 4));
389  }
390 
391  key = detail::aes_schedule_round(&rcon, key, key);
392  _mm_storeu_si128(encryption_key_mm + policy_type::rounds,
393  detail::aes_schedule_mangle_last(key, 2));
394  _mm_storeu_si128(decryption_key_mm, detail::aes_schedule_mangle_last_dec(key));
395  }
396  };
397 
398  template<typename PolicyType>
399  class rijndael_ssse3_impl<192, 128, PolicyType>
400  : public basic_rijndael_ssse3_impl<192, 128, PolicyType> {
401  protected:
402  typedef typename basic_rijndael_ssse3_impl<192, 128, PolicyType>::policy_type policy_type;
403 
404  typedef typename policy_type::block_type block_type;
405  typedef typename policy_type::key_type key_type;
406  typedef typename policy_type::key_schedule_type key_schedule_type;
407 
408  public:
409  static void schedule_key(const key_type &input_key, key_schedule_type &encryption_key,
410  key_schedule_type &decryption_key) {
411  __m128i rcon = _mm_set_epi32(0x702A9808, 0x4D7C7D81, 0x1F8391B9, 0xAF9DEEB6);
412 
413  __m128i *encryption_key_mm = reinterpret_cast<__m128i *>(encryption_key.data());
414  __m128i *decryption_key_mm = reinterpret_cast<__m128i *>(decryption_key.data());
415 
416  __m128i key1 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(input_key.data()));
417  __m128i key2 = _mm_loadu_si128(reinterpret_cast<const __m128i *>((input_key.data() + 8)));
418 
419  _mm_storeu_si128(decryption_key_mm + policy_type::rounds,
420  _mm_shuffle_epi8(key1, detail::sr[0]));
421 
422  key1 = detail::aes_schedule_transform(key1, detail::k_ipt1, detail::k_ipt2);
423  key2 = detail::aes_schedule_transform(key2, detail::k_ipt1, detail::k_ipt2);
424 
425  _mm_storeu_si128(encryption_key_mm + 0, key1);
426 
427  // key2 with 8 high bytes masked off
428  __m128i t = _mm_slli_si128(_mm_srli_si128(key2, 8), 8);
429 
430 
431  for (size_t i = 0; i != 4; ++i) {
432  key2 = detail::aes_schedule_round(&rcon, key2, key1);
433 
434  _mm_storeu_si128(encryption_key_mm + 3 * i + 1,
435  detail::aes_schedule_mangle(_mm_alignr_epi8(key2, t, 8), (i + 3) % 4));
436  _mm_storeu_si128(decryption_key_mm + 11 - 3 * i,
437  detail::aes_schedule_mangle_dec(_mm_alignr_epi8(key2, t, 8), (i + 3) % 4));
438 
439  t = detail::aes_schedule_192_smear(key2, t);
440 
441  _mm_storeu_si128(encryption_key_mm + 3 * i + 2,
442  detail::aes_schedule_mangle(t, (i + 2) % 4));
443  _mm_storeu_si128(decryption_key_mm + 10 - 3 * i,
444  detail::aes_schedule_mangle_dec(t, (i + 2) % 4));
445 
446  key2 = detail::aes_schedule_round(&rcon, t, key2);
447 
448  if (i == 3) {
449  _mm_storeu_si128(encryption_key_mm + 3 * i + 3,
450  detail::aes_schedule_mangle_last(key2, (i + 1) % 4));
451  _mm_storeu_si128(decryption_key_mm + 9 - 3 * i,
452  detail::aes_schedule_mangle_last_dec(key2));
453  } else {
454  _mm_storeu_si128(encryption_key_mm + 3 * i + 3,
455  detail::aes_schedule_mangle(key2, (i + 1) % 4));
456  _mm_storeu_si128(decryption_key_mm + 9 - 3 * i,
457  detail::aes_schedule_mangle_dec(key2, (i + 1) % 4));
458  }
459 
460  key1 = key2;
461  key2 = detail::aes_schedule_192_smear(key2, _mm_slli_si128(_mm_srli_si128(t, 8), 8));
462  t = _mm_slli_si128(_mm_srli_si128(key2, 8), 8);
463  }
464  }
465  };
466 
467  template<typename PolicyType>
468  class rijndael_ssse3_impl<256, 128, PolicyType>
469  : public basic_rijndael_ssse3_impl<256, 128, PolicyType> {
470  protected:
471  typedef typename basic_rijndael_ssse3_impl<256, 128, PolicyType>::policy_type policy_type;
472 
473  typedef typename policy_type::block_type block_type;
474  typedef typename policy_type::key_type key_type;
475  typedef typename policy_type::key_schedule_type key_schedule_type;
476 
477  public:
478  static void schedule_key(const key_type &input_key, key_schedule_type &encryption_key,
479  key_schedule_type &decryption_key) {
480  __m128i rcon = _mm_set_epi32(0x702A9808, 0x4D7C7D81, 0x1F8391B9, 0xAF9DEEB6);
481 
482  __m128i *encryption_key_mm = reinterpret_cast<__m128i *>(encryption_key.data());
483  __m128i *decryption_key_mm = reinterpret_cast<__m128i *>(decryption_key.data());
484 
485  __m128i key1 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(input_key.data()));
486  __m128i key2 = _mm_loadu_si128(reinterpret_cast<const __m128i *>((input_key.data() + 16)));
487 
488  _mm_storeu_si128(decryption_key_mm + policy_type::rounds,
489  _mm_shuffle_epi8(key1, detail::sr[2]));
490 
491  key1 = detail::aes_schedule_transform(key1, detail::k_ipt1, detail::k_ipt2);
492  key2 = detail::aes_schedule_transform(key2, detail::k_ipt1, detail::k_ipt2);
493 
494  _mm_storeu_si128(encryption_key_mm + 0, key1);
495  _mm_storeu_si128(encryption_key_mm + 1, detail::aes_schedule_mangle(key2, 3));
496 
497  _mm_storeu_si128(decryption_key_mm + 13, detail::aes_schedule_mangle_dec(key2, 1));
498 
499 
500  for (size_t i = 2; i != 14; i += 2) {
501  __m128i k_t = key2;
502  key1 = key2 = detail::aes_schedule_round(&rcon, key2, key1);
503 
504  _mm_storeu_si128(encryption_key_mm + i, detail::aes_schedule_mangle(key2, i % 4));
505  _mm_storeu_si128(decryption_key_mm + (14 - i),
506  detail::aes_schedule_mangle_dec(key2, (i + 2) % 4));
507 
508  key2 = detail::aes_schedule_round(nullptr, _mm_shuffle_epi32(key2, 0xFF), k_t);
509  _mm_storeu_si128(encryption_key_mm + i + 1, detail::aes_schedule_mangle(key2, (i - 1) % 4));
510  _mm_storeu_si128(decryption_key_mm + (13 - i),
511  detail::aes_schedule_mangle_dec(key2, (i + 1) % 4));
512  }
513 
514  key2 = detail::aes_schedule_round(&rcon, key2, key1);
515 
516  _mm_storeu_si128(encryption_key_mm + 14, detail::aes_schedule_mangle_last(key2, 2));
517  _mm_storeu_si128(decryption_key_mm + 0, detail::aes_schedule_mangle_last_dec(key2));
518  }
519  };
520 
524  } // namespace detail
525  } // namespace block
526  } // namespace crypto3
527 } // namespace nil
528 
529 #endif // CRYPTO3_SSSE3_RIJNDAEL_IMPL_HPP
boost::mpl::apply< AccumulatorSet, tag::block< Mode > >::type::result_type block(const AccumulatorSet &acc)
Definition: accumulators/block.hpp:259
Definition: algebra/include/nil/crypto3/detail/make_array.hpp:33
Definition: pair.hpp:31