Replace all mdrun rngs with cycle based rng
[alexxy/gromacs.git] / src / external / Random123-1.08 / include / Random123 / threefry.h
1 /*
2 Copyright 2010-2011, D. E. Shaw Research.
3 All rights reserved.
4
5 Redistribution and use in source and binary forms, with or without
6 modification, are permitted provided that the following conditions are
7 met:
8
9 * Redistributions of source code must retain the above copyright
10   notice, this list of conditions, and the following disclaimer.
11
12 * Redistributions in binary form must reproduce the above copyright
13   notice, this list of conditions, and the following disclaimer in the
14   documentation and/or other materials provided with the distribution.
15
16 * Neither the name of D. E. Shaw Research nor the names of its
17   contributors may be used to endorse or promote products derived from
18   this software without specific prior written permission.
19
20 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
21 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
22 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
23 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
24 OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
25 SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
26 LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
27 DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
28 THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
29 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
30 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31 */
32 #ifndef _threefry_dot_h_
33 #define _threefry_dot_h_
34 #include "features/compilerfeatures.h"
35 #include "array.h"
36
37 /** \cond HIDDEN_FROM_DOXYGEN */
38 /* Significant parts of this file were copied
39    from:
40       Skein_FinalRnd/ReferenceImplementation/skein.h
41       Skein_FinalRnd/ReferenceImplementation/skein_block.c
42
43    in http://csrc.nist.gov/groups/ST/hash/sha-3/Round3/documents/Skein_FinalRnd.zip
44
45    This file has been modified so that it may no longer perform its originally
46    intended function.  If you're looking for a Skein or Threefish source code,
47    please consult the original file.
48
49    The original file had the following header:
50 **************************************************************************
51 **
52 ** Interface declarations and internal definitions for Skein hashing.
53 **
54 ** Source code author: Doug Whiting, 2008.
55 **
56 ** This algorithm and source code is released to the public domain.
57 **
58 ***************************************************************************
59
60 */
61
62 /* See comment at the top of philox.h for the macro pre-process
63    strategy. */
64
65 /* Rotation constants: */
/* Rotation amounts for the four-word, 64-bit rounds.  These are the R_256
   constants from the Threefish reference sources, renamed R_64x4_<d>_<j>:
   <d> is the round number modulo 8 and <j> selects the first (0) or second
   (1) mix operation performed in that round. */
enum r123_enum_threefry64x4 {
    R_64x4_0_0 = 14,
    R_64x4_0_1 = 16,
    R_64x4_1_0 = 52,
    R_64x4_1_1 = 57,
    R_64x4_2_0 = 23,
    R_64x4_2_1 = 40,
    R_64x4_3_0 =  5,
    R_64x4_3_1 = 37,
    R_64x4_4_0 = 25,
    R_64x4_4_1 = 33,
    R_64x4_5_0 = 46,
    R_64x4_5_1 = 12,
    R_64x4_6_0 = 58,
    R_64x4_6_1 = 22,
    R_64x4_7_0 = 32,
    R_64x4_7_1 = 32
};
78
/* Rotation amounts for the two-word, 64-bit rounds, named R_64x2_<d>_0
   where <d> is the round number modulo 8.

   Provenance, as reported by skein_rot_search (srs64_B64-X1000):
     Random seed = 1. BlockSize = 128 bits. sampleCnt =  1024. rounds =  8, minHW_or=57
     Start: Tue Mar  1 10:07:48 2011
     rMin = 0.136. #0325[*15] [CRC=455A682F. hw_OR=64. cnt=16384. blkSize= 128].format

   minHW reported by the search for each round count:
      4 rounds: minHW =  4  [  4  4  4  4 ]
      5 rounds: minHW =  8  [  8  8  8  8 ]
      6 rounds: minHW = 16  [ 16 16 16 16 ]
      7 rounds: minHW = 32  [ 32 32 32 32 ]
      8 rounds: minHW = 64  [ 64 64 64 64 ]
      9 rounds: minHW = 64  [ 64 64 64 64 ]
     10 rounds: minHW = 64  [ 64 64 64 64 ]
     11 rounds: minHW = 64  [ 64 64 64 64 ] */
enum r123_enum_threefry64x2 {
    R_64x2_0_0 = 16,
    R_64x2_1_0 = 42,
    R_64x2_2_0 = 12,
    R_64x2_3_0 = 31,
    R_64x2_4_0 = 16,
    R_64x2_5_0 = 32,
    R_64x2_6_0 = 24,
    R_64x2_7_0 = 21
};
103
/* Rotation amounts for the four-word, 32-bit rounds, named R_32x4_<d>_<j>:
   <d> is the round number modulo 8 and <j> selects the first (0) or second
   (1) mix operation performed in that round.

   Provenance, as reported by skein_rot_search (srs-B128-X5000.out):
     Random seed = 1. BlockSize = 64 bits. sampleCnt =  1024. rounds =  8, minHW_or=28
     Start: Mon Aug 24 22:41:36 2009
     ...
     rMin = 0.472. #0A4B[*33] [CRC=DD1ECE0F. hw_OR=31. cnt=16384. blkSize= 128].format

   minHW reported by the search for each round count:
      4 rounds: minHW =  3  [  3  3  3  3 ]
      5 rounds: minHW =  7  [  7  7  7  7 ]
      6 rounds: minHW = 12  [ 13 12 13 12 ]
      7 rounds: minHW = 22  [ 22 23 22 23 ]
      8 rounds: minHW = 31  [ 31 31 31 31 ]
      9 rounds: minHW = 32  [ 32 32 32 32 ]
     10 rounds: minHW = 32  [ 32 32 32 32 ]
     11 rounds: minHW = 32  [ 32 32 32 32 ] */
enum r123_enum_threefry32x4 {
    R_32x4_0_0 = 10,
    R_32x4_0_1 = 26,
    R_32x4_1_0 = 11,
    R_32x4_1_1 = 21,
    R_32x4_2_0 = 13,
    R_32x4_2_1 = 27,
    R_32x4_3_0 = 23,
    R_32x4_3_1 =  5,
    R_32x4_4_0 =  6,
    R_32x4_4_1 = 20,
    R_32x4_5_0 = 17,
    R_32x4_5_1 = 11,
    R_32x4_6_0 = 25,
    R_32x4_6_1 = 10,
    R_32x4_7_0 = 18,
    R_32x4_7_1 = 20
};
129
/* Rotation amounts for the two-word, 32-bit rounds, named R_32x2_<d>_0
   where <d> is the round number modulo 8.

   Provenance, as reported by skein_rot_search (srs32x2-X5000.out):
     Random seed = 1. BlockSize = 64 bits. sampleCnt =  1024. rounds =  8, minHW_or=28
     Start: Tue Jul 12 11:11:33 2011
     rMin = 0.334. #0206[*07] [CRC=1D9765C0. hw_OR=32. cnt=16384. blkSize=  64].format

   minHW reported by the search for each round count:
      4 rounds: minHW =  4  [  4  4  4  4 ]
      5 rounds: minHW =  6  [  6  8  6  8 ]
      6 rounds: minHW =  9  [  9 12  9 12 ]
      7 rounds: minHW = 16  [ 16 24 16 24 ]
      8 rounds: minHW = 32  [ 32 32 32 32 ]
      9 rounds: minHW = 32  [ 32 32 32 32 ]
     10 rounds: minHW = 32  [ 32 32 32 32 ]
     11 rounds: minHW = 32  [ 32 32 32 32 ] */
enum r123_enum_threefry32x2 {
    R_32x2_0_0 = 13,
    R_32x2_1_0 = 15,
    R_32x2_2_0 = 26,
    R_32x2_3_0 =  6,
    R_32x2_4_0 = 17,
    R_32x2_5_0 = 29,
    R_32x2_6_0 = 16,
    R_32x2_7_0 = 24
};
153
/* Named word counts 2 and 4.  The key-injection comments in the 4-word
   template refer to the last state word as X.v[WCNT4-1]. */
enum r123_enum_threefry_wcnt {
    WCNT2 = 2,
    WCNT4 = 4
};
158 R123_CUDA_DEVICE R123_STATIC_INLINE R123_FORCE_INLINE(uint64_t RotL_64(uint64_t x, unsigned int N));
159 R123_CUDA_DEVICE R123_STATIC_INLINE uint64_t RotL_64(uint64_t x, unsigned int N)
160 {
161     return (x << (N & 63)) | (x >> ((64-N) & 63));
162 }
163     
164 R123_CUDA_DEVICE R123_STATIC_INLINE R123_FORCE_INLINE(uint32_t RotL_32(uint32_t x, unsigned int N));
165 R123_CUDA_DEVICE R123_STATIC_INLINE uint32_t RotL_32(uint32_t x, unsigned int N)
166 {
167     return (x << (N & 31)) | (x >> ((32-N) & 31));
168 }
169
/* Assemble a 64-bit value from its high and low 32-bit halves. */
#define SKEIN_MK_64(hi32,lo32)  ((((uint64_t) (hi32)) << 32) + (lo32))
/* Parity constants: the starting value of the extra key-schedule word,
   into which every key word is then xored (64-bit and 32-bit variants). */
#define SKEIN_KS_PARITY64         SKEIN_MK_64(0x1BD11BDA,0xA9FC1A22)
#define SKEIN_KS_PARITY32         0x1BD11BDA
173
/* Round counts used by the fixed-round entry points threefryNxW().  Each
   one may be overridden by defining the macro before this point. */
#if !defined(THREEFRY2x32_DEFAULT_ROUNDS)
#  define THREEFRY2x32_DEFAULT_ROUNDS 20
#endif

#if !defined(THREEFRY2x64_DEFAULT_ROUNDS)
#  define THREEFRY2x64_DEFAULT_ROUNDS 20
#endif

#if !defined(THREEFRY4x32_DEFAULT_ROUNDS)
#  define THREEFRY4x32_DEFAULT_ROUNDS 20
#endif

#if !defined(THREEFRY4x64_DEFAULT_ROUNDS)
#  define THREEFRY4x64_DEFAULT_ROUNDS 20
#endif
189
/* Helper for _threefry2x_tpl: round number r (counted from 0) of the
   two-word cipher, executed only when more than r rounds were requested.
   Adds X.v[1] into X.v[0], rotates X.v[1] left by the rotation constant
   R_<W>x2_<d>_0 and xors the updated X.v[0] into it.  Uses the locals X and
   Nrounds of the function generated by _threefry2x_tpl, so it is only
   meaningful inside that template. */
#define R123_THREEFRY2X_ROUND(W, r, d)                                  \
    if(Nrounds>(r)){  X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_##d##_0); X.v[1] ^= X.v[0]; }

/* Helper for _threefry2x_tpl: key injection number s, performed right
   after round r when more than r rounds were requested.  Adds key-schedule
   words ks[i0] and ks[i1] to the two state words and then adds the
   injection number s to the last word.  Uses the locals X, ks and Nrounds
   of the function generated by _threefry2x_tpl. */
#define R123_THREEFRY2X_INJECT(r, i0, i1, s)                            \
    if(Nrounds>(r)){  X.v[0] += ks[i0]; X.v[1] += ks[i1]; X.v[1] += (s); }

/* _threefry2x_tpl(W) generates the two-word Threefry API for word width W:

     threefry2x<W>_ctr_t, _key_t, _ukey_t   all typedefs of struct r123array2x<W>
     threefry2x<W>keyinit(uk)               returns uk unchanged
     threefry2x<W>_R(Nrounds, in, k)        up to 32 rounds (asserted) applied
                                            to counter 'in' under key 'k'
     threefry2x<W>(in, k)                   threefry2x<W>_R with
                                            THREEFRY2x<W>_DEFAULT_ROUNDS rounds

   The key schedule has three words: the two key words plus ks[2], which is
   SKEIN_KS_PARITY<W> xored with both key words.  ks[0] and ks[1] are added
   to the state before round 0.  Round r uses rotation constant
   R_<W>x2_<r mod 8>_0, and after every fourth round a key injection adds the
   next rotation of the key schedule plus the injection number.  Every round
   is guarded by its own test of Nrounds so that a compile-time constant
   round count leaves straight-line code. */
#define _threefry2x_tpl(W)                                              \
typedef struct r123array2x##W threefry2x##W##_ctr_t;                    \
typedef struct r123array2x##W threefry2x##W##_key_t;                    \
typedef struct r123array2x##W threefry2x##W##_ukey_t;                   \
R123_CUDA_DEVICE R123_STATIC_INLINE threefry2x##W##_key_t threefry2x##W##keyinit(threefry2x##W##_ukey_t uk) { return uk; } \
R123_CUDA_DEVICE R123_STATIC_INLINE R123_FORCE_INLINE(threefry2x##W##_ctr_t threefry2x##W##_R(unsigned int Nrounds, threefry2x##W##_ctr_t in, threefry2x##W##_key_t k)); \
R123_CUDA_DEVICE R123_STATIC_INLINE                                     \
threefry2x##W##_ctr_t threefry2x##W##_R(unsigned int Nrounds, threefry2x##W##_ctr_t in, threefry2x##W##_key_t k){ \
    threefry2x##W##_ctr_t X;                                            \
    uint##W##_t ks[2+1];                                                \
    int  i; /* avoid size_t to avoid need for stddef.h */               \
    R123_ASSERT(Nrounds<=32);                                           \
    /* ks[2] accumulates the parity constant xored with each key word */ \
    ks[2] =  SKEIN_KS_PARITY##W;                                        \
    for (i=0;i < 2; i++)                                                \
        {                                                               \
            ks[i] = k.v[i];                                             \
            X.v[i]  = in.v[i];                                          \
            ks[2] ^= k.v[i];                                            \
        }                                                               \
                                                                        \
    /* Insert initial key before round 0 */                             \
    X.v[0] += ks[0]; X.v[1] += ks[1];                                   \
                                                                        \
    /* Rounds 0-3, then InjectKey(r=1) */                               \
    R123_THREEFRY2X_ROUND(W,  0, 0)                                     \
    R123_THREEFRY2X_ROUND(W,  1, 1)                                     \
    R123_THREEFRY2X_ROUND(W,  2, 2)                                     \
    R123_THREEFRY2X_ROUND(W,  3, 3)                                     \
    R123_THREEFRY2X_INJECT(3,  1, 2,  1)                                \
    /* Rounds 4-7, then InjectKey(r=2) */                               \
    R123_THREEFRY2X_ROUND(W,  4, 4)                                     \
    R123_THREEFRY2X_ROUND(W,  5, 5)                                     \
    R123_THREEFRY2X_ROUND(W,  6, 6)                                     \
    R123_THREEFRY2X_ROUND(W,  7, 7)                                     \
    R123_THREEFRY2X_INJECT(7,  2, 0,  2)                                \
    /* Rounds 8-11, then InjectKey(r=3) */                              \
    R123_THREEFRY2X_ROUND(W,  8, 0)                                     \
    R123_THREEFRY2X_ROUND(W,  9, 1)                                     \
    R123_THREEFRY2X_ROUND(W, 10, 2)                                     \
    R123_THREEFRY2X_ROUND(W, 11, 3)                                     \
    R123_THREEFRY2X_INJECT(11, 0, 1,  3)                                \
    /* Rounds 12-15, then InjectKey(r=4) */                             \
    R123_THREEFRY2X_ROUND(W, 12, 4)                                     \
    R123_THREEFRY2X_ROUND(W, 13, 5)                                     \
    R123_THREEFRY2X_ROUND(W, 14, 6)                                     \
    R123_THREEFRY2X_ROUND(W, 15, 7)                                     \
    R123_THREEFRY2X_INJECT(15, 1, 2,  4)                                \
    /* Rounds 16-19, then InjectKey(r=5) */                             \
    R123_THREEFRY2X_ROUND(W, 16, 0)                                     \
    R123_THREEFRY2X_ROUND(W, 17, 1)                                     \
    R123_THREEFRY2X_ROUND(W, 18, 2)                                     \
    R123_THREEFRY2X_ROUND(W, 19, 3)                                     \
    R123_THREEFRY2X_INJECT(19, 2, 0,  5)                                \
    /* Rounds 20-23, then InjectKey(r=6) */                             \
    R123_THREEFRY2X_ROUND(W, 20, 4)                                     \
    R123_THREEFRY2X_ROUND(W, 21, 5)                                     \
    R123_THREEFRY2X_ROUND(W, 22, 6)                                     \
    R123_THREEFRY2X_ROUND(W, 23, 7)                                     \
    R123_THREEFRY2X_INJECT(23, 0, 1,  6)                                \
    /* Rounds 24-27, then InjectKey(r=7) */                             \
    R123_THREEFRY2X_ROUND(W, 24, 0)                                     \
    R123_THREEFRY2X_ROUND(W, 25, 1)                                     \
    R123_THREEFRY2X_ROUND(W, 26, 2)                                     \
    R123_THREEFRY2X_ROUND(W, 27, 3)                                     \
    R123_THREEFRY2X_INJECT(27, 1, 2,  7)                                \
    /* Rounds 28-31, then InjectKey(r=8) */                             \
    R123_THREEFRY2X_ROUND(W, 28, 4)                                     \
    R123_THREEFRY2X_ROUND(W, 29, 5)                                     \
    R123_THREEFRY2X_ROUND(W, 30, 6)                                     \
    R123_THREEFRY2X_ROUND(W, 31, 7)                                     \
    R123_THREEFRY2X_INJECT(31, 2, 0,  8)                                \
    return X;                                                           \
}                                                                       \
 /** @ingroup ThreefryNxW */                                            \
enum r123_enum_threefry2x##W { threefry2x##W##_rounds = THREEFRY2x##W##_DEFAULT_ROUNDS };       \
R123_CUDA_DEVICE R123_STATIC_INLINE R123_FORCE_INLINE(threefry2x##W##_ctr_t threefry2x##W(threefry2x##W##_ctr_t in, threefry2x##W##_key_t k)); \
R123_CUDA_DEVICE R123_STATIC_INLINE                                     \
threefry2x##W##_ctr_t threefry2x##W(threefry2x##W##_ctr_t in, threefry2x##W##_key_t k){ \
    return threefry2x##W##_R(threefry2x##W##_rounds, in, k);            \
}
294
295
296 #define _threefry4x_tpl(W)                                              \
297 typedef struct r123array4x##W threefry4x##W##_ctr_t;                        \
298 typedef struct r123array4x##W threefry4x##W##_key_t;                        \
299 typedef struct r123array4x##W threefry4x##W##_ukey_t;                        \
300 R123_CUDA_DEVICE R123_STATIC_INLINE threefry4x##W##_key_t threefry4x##W##keyinit(threefry4x##W##_ukey_t uk) { return uk; } \
301 R123_CUDA_DEVICE R123_STATIC_INLINE R123_FORCE_INLINE(threefry4x##W##_ctr_t threefry4x##W##_R(unsigned int Nrounds, threefry4x##W##_ctr_t in, threefry4x##W##_key_t k)); \
302 R123_CUDA_DEVICE R123_STATIC_INLINE                                          \
303 threefry4x##W##_ctr_t threefry4x##W##_R(unsigned int Nrounds, threefry4x##W##_ctr_t in, threefry4x##W##_key_t k){ \
304     threefry4x##W##_ctr_t X;                                            \
305     uint##W##_t ks[4+1];                                            \
306     int  i; /* avoid size_t to avoid need for stddef.h */                   \
307     R123_ASSERT(Nrounds<=72);                                           \
308     ks[4] =  SKEIN_KS_PARITY##W;                                    \
309     for (i=0;i < 4; i++)                                            \
310         {                                                               \
311             ks[i] = k.v[i];                                             \
312             X.v[i]  = in.v[i];                                          \
313             ks[4] ^= k.v[i];                                        \
314         }                                                               \
315                                                                         \
316     /* Insert initial key before round 0 */                             \
317     X.v[0] += ks[0]; X.v[1] += ks[1]; X.v[2] += ks[2]; X.v[3] += ks[3]; \
318                                                                         \
319     if(Nrounds>0){                                                      \
320         X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_0_0); X.v[1] ^= X.v[0]; \
321         X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_0_1); X.v[3] ^= X.v[2]; \
322     }                                                                   \
323     if(Nrounds>1){                                                      \
324         X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_1_0); X.v[3] ^= X.v[0]; \
325         X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_1_1); X.v[1] ^= X.v[2]; \
326     }                                                                   \
327     if(Nrounds>2){                                                      \
328         X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_2_0); X.v[1] ^= X.v[0]; \
329         X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_2_1); X.v[3] ^= X.v[2]; \
330     }                                                                   \
331     if(Nrounds>3){                                                      \
332         X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_3_0); X.v[3] ^= X.v[0]; \
333         X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_3_1); X.v[1] ^= X.v[2]; \
334     }                                                                   \
335     if(Nrounds>3){                                                      \
336         /* InjectKey(r=1) */                                            \
337         X.v[0] += ks[1]; X.v[1] += ks[2]; X.v[2] += ks[3]; X.v[3] += ks[4]; \
338         X.v[4-1] += 1;     /* X.v[WCNT4-1] += r  */                 \
339     }                                                                   \
340                                                                         \
341     if(Nrounds>4){                                                      \
342         X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_4_0); X.v[1] ^= X.v[0]; \
343         X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_4_1); X.v[3] ^= X.v[2]; \
344     }                                                                   \
345     if(Nrounds>5){                                                      \
346         X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_5_0); X.v[3] ^= X.v[0]; \
347         X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_5_1); X.v[1] ^= X.v[2]; \
348     }                                                                   \
349     if(Nrounds>6){                                                      \
350         X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_6_0); X.v[1] ^= X.v[0]; \
351         X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_6_1); X.v[3] ^= X.v[2]; \
352     }                                                                   \
353     if(Nrounds>7){                                                      \
354         X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_7_0); X.v[3] ^= X.v[0]; \
355         X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_7_1); X.v[1] ^= X.v[2]; \
356     }                                                                   \
357     if(Nrounds>7){                                                      \
358         /* InjectKey(r=2) */                                            \
359         X.v[0] += ks[2]; X.v[1] += ks[3]; X.v[2] += ks[4]; X.v[3] += ks[0]; \
360         X.v[4-1] += 2;     /* X.v[WCNT4-1] += r  */                 \
361     }                                                                   \
362                                                                         \
363     if(Nrounds>8){                                                      \
364         X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_0_0); X.v[1] ^= X.v[0]; \
365         X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_0_1); X.v[3] ^= X.v[2]; \
366     }                                                                   \
367     if(Nrounds>9){                                                      \
368         X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_1_0); X.v[3] ^= X.v[0]; \
369         X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_1_1); X.v[1] ^= X.v[2]; \
370     }                                                                   \
371     if(Nrounds>10){                                                     \
372         X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_2_0); X.v[1] ^= X.v[0]; \
373         X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_2_1); X.v[3] ^= X.v[2]; \
374     }                                                                   \
375     if(Nrounds>11){                                                     \
376         X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_3_0); X.v[3] ^= X.v[0]; \
377         X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_3_1); X.v[1] ^= X.v[2]; \
378     }                                                                   \
379     if(Nrounds>11){                                                     \
380         /* InjectKey(r=3) */                                            \
381         X.v[0] += ks[3]; X.v[1] += ks[4]; X.v[2] += ks[0]; X.v[3] += ks[1]; \
382         X.v[4-1] += 3;     /* X.v[WCNT4-1] += r  */                 \
383     }                                                                   \
384                                                                         \
385     if(Nrounds>12){                                                     \
386         X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_4_0); X.v[1] ^= X.v[0]; \
387         X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_4_1); X.v[3] ^= X.v[2]; \
388     }                                                                   \
389     if(Nrounds>13){                                                     \
390         X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_5_0); X.v[3] ^= X.v[0]; \
391         X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_5_1); X.v[1] ^= X.v[2]; \
392     }                                                                   \
393     if(Nrounds>14){                                                     \
394         X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_6_0); X.v[1] ^= X.v[0]; \
395         X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_6_1); X.v[3] ^= X.v[2]; \
396     }                                                                   \
397     if(Nrounds>15){                                                     \
398         X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_7_0); X.v[3] ^= X.v[0]; \
399         X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_7_1); X.v[1] ^= X.v[2]; \
400     }                                                                   \
401     if(Nrounds>15){                                                     \
402         /* InjectKey(r=4) */                                            \
403         X.v[0] += ks[4]; X.v[1] += ks[0]; X.v[2] += ks[1]; X.v[3] += ks[2]; \
404         X.v[4-1] += 4;     /* X.v[WCNT4-1] += r  */                 \
405     }                                                                   \
406                                                                         \
407     if(Nrounds>16){                                                     \
408         X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_0_0); X.v[1] ^= X.v[0]; \
409         X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_0_1); X.v[3] ^= X.v[2]; \
410     }                                                                   \
411     if(Nrounds>17){                                                     \
412         X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_1_0); X.v[3] ^= X.v[0]; \
413         X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_1_1); X.v[1] ^= X.v[2]; \
414     }                                                                   \
415     if(Nrounds>18){                                                     \
416         X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_2_0); X.v[1] ^= X.v[0]; \
417         X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_2_1); X.v[3] ^= X.v[2]; \
418     }                                                                   \
419     if(Nrounds>19){                                                     \
420         X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_3_0); X.v[3] ^= X.v[0]; \
421         X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_3_1); X.v[1] ^= X.v[2]; \
422     }                                                                   \
423     if(Nrounds>19){                                                     \
424         /* InjectKey(r=5) */                                            \
425         X.v[0] += ks[0]; X.v[1] += ks[1]; X.v[2] += ks[2]; X.v[3] += ks[3]; \
426         X.v[4-1] += 5;     /* X.v[WCNT4-1] += r  */                 \
427     }                                                                   \
428                                                                         \
429     if(Nrounds>20){                                                     \
430         X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_4_0); X.v[1] ^= X.v[0]; \
431         X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_4_1); X.v[3] ^= X.v[2]; \
432     }                                                                   \
433     if(Nrounds>21){                                                     \
434         X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_5_0); X.v[3] ^= X.v[0]; \
435         X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_5_1); X.v[1] ^= X.v[2]; \
436     }                                                                   \
437     if(Nrounds>22){                                                     \
438         X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_6_0); X.v[1] ^= X.v[0]; \
439         X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_6_1); X.v[3] ^= X.v[2]; \
440     }                                                                   \
441     if(Nrounds>23){                                                     \
442         X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_7_0); X.v[3] ^= X.v[0]; \
443         X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_7_1); X.v[1] ^= X.v[2]; \
444     }                                                                   \
445     if(Nrounds>23){                                                     \
446         /* InjectKey(r=6) */                                            \
447         X.v[0] += ks[1]; X.v[1] += ks[2]; X.v[2] += ks[3]; X.v[3] += ks[4]; \
448         X.v[4-1] += 6;     /* X.v[WCNT4-1] += r  */                 \
449     }                                                                   \
450                                                                         \
451     if(Nrounds>24){                                                     \
452         X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_0_0); X.v[1] ^= X.v[0]; \
453         X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_0_1); X.v[3] ^= X.v[2]; \
454     }                                                                   \
455     if(Nrounds>25){                                                     \
456         X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_1_0); X.v[3] ^= X.v[0]; \
457         X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_1_1); X.v[1] ^= X.v[2]; \
458     }                                                                   \
459     if(Nrounds>26){                                                     \
460         X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_2_0); X.v[1] ^= X.v[0]; \
461         X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_2_1); X.v[3] ^= X.v[2]; \
462     }                                                                   \
463     if(Nrounds>27){                                                     \
464         X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_3_0); X.v[3] ^= X.v[0]; \
465         X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_3_1); X.v[1] ^= X.v[2]; \
466     }                                                                   \
467     if(Nrounds>27){                                                     \
468         /* InjectKey(r=7) */                                            \
469         X.v[0] += ks[2]; X.v[1] += ks[3]; X.v[2] += ks[4]; X.v[3] += ks[0]; \
470         X.v[4-1] += 7;     /* X.v[WCNT4-1] += r  */                 \
471     }                                                                   \
472                                                                         \
473     if(Nrounds>28){                                                     \
474         X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_4_0); X.v[1] ^= X.v[0]; \
475         X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_4_1); X.v[3] ^= X.v[2]; \
476     }                                                                   \
477     if(Nrounds>29){                                                     \
478         X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_5_0); X.v[3] ^= X.v[0]; \
479         X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_5_1); X.v[1] ^= X.v[2]; \
480     }                                                                   \
481     if(Nrounds>30){                                                     \
482         X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_6_0); X.v[1] ^= X.v[0]; \
483         X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_6_1); X.v[3] ^= X.v[2]; \
484     }                                                                   \
485     if(Nrounds>31){                                                     \
486         X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_7_0); X.v[3] ^= X.v[0]; \
487         X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_7_1); X.v[1] ^= X.v[2]; \
488     }                                                                   \
489     if(Nrounds>31){                                                     \
490         /* InjectKey(r=1) */                                            \
491         X.v[0] += ks[3]; X.v[1] += ks[4]; X.v[2] += ks[0]; X.v[3] += ks[1]; \
492         X.v[4-1] += 8;     /* X.v[WCNT4-1] += r  */                 \
493     }                                                                   \
494                                                                         \
495     if(Nrounds>32){                                                     \
496         X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_0_0); X.v[1] ^= X.v[0]; \
497         X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_0_1); X.v[3] ^= X.v[2]; \
498     }                                                                   \
499     if(Nrounds>33){                                                     \
500         X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_1_0); X.v[3] ^= X.v[0]; \
501         X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_1_1); X.v[1] ^= X.v[2]; \
502     }                                                                   \
503     if(Nrounds>34){                                                     \
504         X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_2_0); X.v[1] ^= X.v[0]; \
505         X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_2_1); X.v[3] ^= X.v[2]; \
506     }                                                                   \
507     if(Nrounds>35){                                                     \
508         X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_3_0); X.v[3] ^= X.v[0]; \
509         X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_3_1); X.v[1] ^= X.v[2]; \
510     }                                                                   \
511     if(Nrounds>35){                                                     \
512         /* InjectKey(r=1) */                                            \
513         X.v[0] += ks[4]; X.v[1] += ks[0]; X.v[2] += ks[1]; X.v[3] += ks[2]; \
514         X.v[4-1] += 9;     /* X.v[WCNT4-1] += r  */                 \
515     }                                                                   \
516                                                                         \
517     if(Nrounds>36){                                                     \
518         X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_4_0); X.v[1] ^= X.v[0]; \
519         X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_4_1); X.v[3] ^= X.v[2]; \
520     }                                                                   \
521     if(Nrounds>37){                                                     \
522         X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_5_0); X.v[3] ^= X.v[0]; \
523         X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_5_1); X.v[1] ^= X.v[2]; \
524     }                                                                   \
525     if(Nrounds>38){                                                     \
526         X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_6_0); X.v[1] ^= X.v[0]; \
527         X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_6_1); X.v[3] ^= X.v[2]; \
528     }                                                                   \
529     if(Nrounds>39){                                                     \
530         X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_7_0); X.v[3] ^= X.v[0]; \
531         X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_7_1); X.v[1] ^= X.v[2]; \
532     }                                                                   \
533     if(Nrounds>39){                                                     \
534         /* InjectKey(r=1) */                                            \
535         X.v[0] += ks[0]; X.v[1] += ks[1]; X.v[2] += ks[2]; X.v[3] += ks[3]; \
536         X.v[4-1] += 10;     /* X.v[WCNT4-1] += r  */                 \
537     }                                                                   \
538                                                                         \
539     if(Nrounds>40){                                                     \
540         X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_0_0); X.v[1] ^= X.v[0]; \
541         X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_0_1); X.v[3] ^= X.v[2]; \
542     }                                                                   \
543     if(Nrounds>41){                                                     \
544         X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_1_0); X.v[3] ^= X.v[0]; \
545         X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_1_1); X.v[1] ^= X.v[2]; \
546     }                                                                   \
547     if(Nrounds>42){                                                     \
548         X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_2_0); X.v[1] ^= X.v[0]; \
549         X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_2_1); X.v[3] ^= X.v[2]; \
550     }                                                                   \
551     if(Nrounds>43){                                                     \
552         X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_3_0); X.v[3] ^= X.v[0]; \
553         X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_3_1); X.v[1] ^= X.v[2]; \
554     }                                                                   \
555     if(Nrounds>43){                                                     \
556         /* InjectKey(r=1) */                                            \
557         X.v[0] += ks[1]; X.v[1] += ks[2]; X.v[2] += ks[3]; X.v[3] += ks[4]; \
558         X.v[4-1] += 11;     /* X.v[WCNT4-1] += r  */                \
559     }                                                                   \
560                                                                         \
561     if(Nrounds>44){                                                     \
562         X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_4_0); X.v[1] ^= X.v[0]; \
563         X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_4_1); X.v[3] ^= X.v[2]; \
564     }                                                                   \
565     if(Nrounds>45){                                                     \
566         X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_5_0); X.v[3] ^= X.v[0]; \
567         X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_5_1); X.v[1] ^= X.v[2]; \
568     }                                                                   \
569     if(Nrounds>46){                                                     \
570         X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_6_0); X.v[1] ^= X.v[0]; \
571         X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_6_1); X.v[3] ^= X.v[2]; \
572     }                                                                   \
573     if(Nrounds>47){                                                     \
574         X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_7_0); X.v[3] ^= X.v[0]; \
575         X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_7_1); X.v[1] ^= X.v[2]; \
576     }                                                                   \
577     if(Nrounds>47){                                                     \
578         /* InjectKey(r=1) */                                            \
579         X.v[0] += ks[2]; X.v[1] += ks[3]; X.v[2] += ks[4]; X.v[3] += ks[0]; \
580         X.v[4-1] += 12;     /* X.v[WCNT4-1] += r  */                 \
581     }                                                                   \
582                                                                         \
583     if(Nrounds>48){                                                     \
584         X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_0_0); X.v[1] ^= X.v[0]; \
585         X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_0_1); X.v[3] ^= X.v[2]; \
586     }                                                                   \
587     if(Nrounds>49){                                                     \
588         X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_1_0); X.v[3] ^= X.v[0]; \
589         X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_1_1); X.v[1] ^= X.v[2]; \
590     }                                                                   \
591     if(Nrounds>50){                                                     \
592         X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_2_0); X.v[1] ^= X.v[0]; \
593         X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_2_1); X.v[3] ^= X.v[2]; \
594     }                                                                   \
595     if(Nrounds>51){                                                     \
596         X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_3_0); X.v[3] ^= X.v[0]; \
597         X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_3_1); X.v[1] ^= X.v[2]; \
598     }                                                                   \
599     if(Nrounds>51){                                                     \
600         /* InjectKey(r=1) */                                            \
601         X.v[0] += ks[3]; X.v[1] += ks[4]; X.v[2] += ks[0]; X.v[3] += ks[1]; \
602         X.v[4-1] += 13;     /* X.v[WCNT4-1] += r  */                 \
603     }                                                                   \
604                                                                         \
605     if(Nrounds>52){                                                     \
606         X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_4_0); X.v[1] ^= X.v[0]; \
607         X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_4_1); X.v[3] ^= X.v[2]; \
608     }                                                                   \
609     if(Nrounds>53){                                                     \
610         X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_5_0); X.v[3] ^= X.v[0]; \
611         X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_5_1); X.v[1] ^= X.v[2]; \
612     }                                                                   \
613     if(Nrounds>54){                                                     \
614         X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_6_0); X.v[1] ^= X.v[0]; \
615         X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_6_1); X.v[3] ^= X.v[2]; \
616     }                                                                   \
617     if(Nrounds>55){                                                     \
618         X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_7_0); X.v[3] ^= X.v[0]; \
619         X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_7_1); X.v[1] ^= X.v[2]; \
620     }                                                                   \
621     if(Nrounds>55){                                                     \
622         /* InjectKey(r=1) */                                            \
623         X.v[0] += ks[4]; X.v[1] += ks[0]; X.v[2] += ks[1]; X.v[3] += ks[2]; \
624         X.v[4-1] += 14;     /* X.v[WCNT4-1] += r  */                 \
625     }                                                                   \
626                                                                         \
627     if(Nrounds>56){                                                     \
628         X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_0_0); X.v[1] ^= X.v[0]; \
629         X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_0_1); X.v[3] ^= X.v[2]; \
630     }                                                                   \
631     if(Nrounds>57){                                                     \
632         X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_1_0); X.v[3] ^= X.v[0]; \
633         X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_1_1); X.v[1] ^= X.v[2]; \
634     }                                                                   \
635     if(Nrounds>58){                                                     \
636         X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_2_0); X.v[1] ^= X.v[0]; \
637         X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_2_1); X.v[3] ^= X.v[2]; \
638     }                                                                   \
639     if(Nrounds>59){                                                     \
640         X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_3_0); X.v[3] ^= X.v[0]; \
641         X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_3_1); X.v[1] ^= X.v[2]; \
642     }                                                                   \
643     if(Nrounds>59){                                                     \
644         /* InjectKey(r=1) */                                            \
645         X.v[0] += ks[0]; X.v[1] += ks[1]; X.v[2] += ks[2]; X.v[3] += ks[3]; \
646         X.v[4-1] += 15;     /* X.v[WCNT4-1] += r  */                 \
647     }                                                                   \
648                                                                         \
649     if(Nrounds>60){                                                     \
650         X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_4_0); X.v[1] ^= X.v[0]; \
651         X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_4_1); X.v[3] ^= X.v[2]; \
652     }                                                                   \
653     if(Nrounds>61){                                                     \
654         X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_5_0); X.v[3] ^= X.v[0]; \
655         X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_5_1); X.v[1] ^= X.v[2]; \
656     }                                                                   \
657     if(Nrounds>62){                                                     \
658         X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_6_0); X.v[1] ^= X.v[0]; \
659         X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_6_1); X.v[3] ^= X.v[2]; \
660     }                                                                   \
661     if(Nrounds>63){                                                     \
662         X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_7_0); X.v[3] ^= X.v[0]; \
663         X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_7_1); X.v[1] ^= X.v[2]; \
664     }                                                                   \
665     if(Nrounds>63){                                                     \
666         /* InjectKey(r=1) */                                            \
667         X.v[0] += ks[1]; X.v[1] += ks[2]; X.v[2] += ks[3]; X.v[3] += ks[4]; \
668         X.v[4-1] += 16;     /* X.v[WCNT4-1] += r  */                 \
669     }                                                                   \
670                                                                         \
671     if(Nrounds>64){                                                     \
672         X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_0_0); X.v[1] ^= X.v[0]; \
673         X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_0_1); X.v[3] ^= X.v[2]; \
674     }                                                                   \
675     if(Nrounds>65){                                                     \
676         X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_1_0); X.v[3] ^= X.v[0]; \
677         X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_1_1); X.v[1] ^= X.v[2]; \
678     }                                                                   \
679     if(Nrounds>66){                                                     \
680         X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_2_0); X.v[1] ^= X.v[0]; \
681         X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_2_1); X.v[3] ^= X.v[2]; \
682     }                                                                   \
683     if(Nrounds>67){                                                     \
684         X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_3_0); X.v[3] ^= X.v[0]; \
685         X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_3_1); X.v[1] ^= X.v[2]; \
686     }                                                                   \
687     if(Nrounds>67){                                                     \
688         /* InjectKey(r=1) */                                            \
689         X.v[0] += ks[2]; X.v[1] += ks[3]; X.v[2] += ks[4]; X.v[3] += ks[0]; \
690         X.v[4-1] += 17;     /* X.v[WCNT4-1] += r  */                 \
691     }                                                                   \
692                                                                         \
693     if(Nrounds>68){                                                     \
694         X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_4_0); X.v[1] ^= X.v[0]; \
695         X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_4_1); X.v[3] ^= X.v[2]; \
696     }                                                                   \
697     if(Nrounds>69){                                                     \
698         X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_5_0); X.v[3] ^= X.v[0]; \
699         X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_5_1); X.v[1] ^= X.v[2]; \
700     }                                                                   \
701     if(Nrounds>70){                                                     \
702         X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_6_0); X.v[1] ^= X.v[0]; \
703         X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_6_1); X.v[3] ^= X.v[2]; \
704     }                                                                   \
705     if(Nrounds>71){                                                     \
706         X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_7_0); X.v[3] ^= X.v[0]; \
707         X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_7_1); X.v[1] ^= X.v[2]; \
708     }                                                                   \
709     if(Nrounds>71){                                                     \
710         /* InjectKey(r=1) */                                            \
711         X.v[0] += ks[3]; X.v[1] += ks[4]; X.v[2] += ks[0]; X.v[3] += ks[1]; \
712         X.v[4-1] += 18;     /* X.v[WCNT4-1] += r  */                 \
713     }                                                                   \
714                                                                         \
715     return X;                                                           \
716 }                                                                       \
717  /** @ingroup ThreefryNxW */                                            \
718 enum r123_enum_threefry4x##W { threefry4x##W##_rounds = THREEFRY4x##W##_DEFAULT_ROUNDS };       \
719 R123_CUDA_DEVICE R123_STATIC_INLINE R123_FORCE_INLINE(threefry4x##W##_ctr_t threefry4x##W(threefry4x##W##_ctr_t in, threefry4x##W##_key_t k)); \
720 R123_CUDA_DEVICE R123_STATIC_INLINE                                     \
721 threefry4x##W##_ctr_t threefry4x##W(threefry4x##W##_ctr_t in, threefry4x##W##_key_t k){ \
722     return threefry4x##W##_R(threefry4x##W##_rounds, in, k);            \
723 }
724 /** \endcond */
725
/* Expand the generator macros once per supported W (presumably the word
   width in bits).  As the end of _threefry4x_tpl above shows, each 4x
   expansion provides threefry4xW_R(), the enum constant threefry4xW_rounds
   (set to THREEFRY4xW_DEFAULT_ROUNDS) and a threefry4xW(in, k) wrapper that
   calls threefry4xW_R() with that constant.  _threefry2x_tpl is defined
   earlier in this file and is presumably the two-word analogue. */
726 _threefry2x_tpl(64)
727 _threefry2x_tpl(32)
728 _threefry4x_tpl(64)
729 _threefry4x_tpl(32)
730
/* Function-like macros with the same names as the threefryNxW() inline
   wrappers generated above (e.g. threefry4x##W in _threefry4x_tpl).  From
   this point on, a call written as threefryNxW(c, k) expands directly to
   threefryNxW_R(threefryNxW_rounds, c, k) instead of going through the
   wrapper function.  The wrapper itself is still reachable by suppressing
   macro expansion, e.g. (threefry4x32)(c, k). */
731 /* gcc4.5 and 4.6 seem to optimize a macro-ized threefryNxW better
732    than a static inline function.  Why?  */
733 #define threefry2x32(c,k) threefry2x32_R(threefry2x32_rounds, c, k)
734 #define threefry4x32(c,k) threefry4x32_R(threefry4x32_rounds, c, k)
735 #define threefry2x64(c,k) threefry2x64_R(threefry2x64_rounds, c, k)
736 #define threefry4x64(c,k) threefry4x64_R(threefry4x64_rounds, c, k)
737
738 #ifdef __cplusplus
739 /** \cond HIDDEN_FROM_DOXYGEN */
740 #define _threefryNxWclass_tpl(NxW)                                      \
741 namespace r123{                                                     \
742 template<unsigned int R>                                                  \
743  struct Threefry##NxW##_R{                                              \
744     typedef threefry##NxW##_ctr_t ctr_type;                             \
745     typedef threefry##NxW##_key_t key_type;                             \
746     typedef threefry##NxW##_key_t ukey_type;                            \
747     static const unsigned int rounds=R;                                 \
748    inline R123_CUDA_DEVICE R123_FORCE_INLINE(ctr_type operator()(ctr_type ctr, key_type key)){ \
749         R123_STATIC_ASSERT(R<=72, "threefry is only unrolled up to 72 rounds\n"); \
750         return threefry##NxW##_R(R, ctr, key);                              \
751     }                                                                   \
752 };                                                                      \
753  typedef Threefry##NxW##_R<threefry##NxW##_rounds> Threefry##NxW;       \
754 } // namespace r123
755
756 /** \endcond */
757
/* Generate the r123::Threefry{2x32,4x32,2x64,4x64}_R class templates and the
   matching r123::Threefry{2x32,4x32,2x64,4x64} default-round typedefs. */
758 _threefryNxWclass_tpl(2x32)
759 _threefryNxWclass_tpl(4x32)
760 _threefryNxWclass_tpl(2x64)
761 _threefryNxWclass_tpl(4x64)
762
763 /* The _tpl macros don't quite work to do string-pasting inside comments,
764    so we just write out the boilerplate documentation four times... */
765
766 /** 
767 @defgroup ThreefryNxW Threefry Classes and Typedefs
768
769 The ThreefryNxW classes export the member functions, typedefs and
770 operator overloads required by a @ref CBRNG "CBRNG" class.
771
772 As described in  
773 <a href="http://dl.acm.org/citation.cfm?doid=2063405"><i>Parallel Random Numbers:  As Easy as 1, 2, 3</i> </a>, 
774 the Threefry family is closely related to the Threefish block cipher from the
775 <a href="http://www.skein-hash.info/"> Skein Hash Function</a>.  
776 Threefry is \b not suitable for cryptographic use.
777
778 Threefry uses integer addition, bitwise rotation, xor and permutation of words to randomize its output.
779
780 @class r123::Threefry2x32_R 
781 @ingroup ThreefryNxW
782
783 exports the member functions, typedefs and operator overloads required by a @ref CBRNG "CBRNG" class.
784
785 The template argument, ROUNDS, is the number of times the Threefry round
786 function will be applied.
787
788 As of September 2011, the authors know of no statistical flaws with
789 ROUNDS=13 or more for Threefry2x32.
790
791 @typedef r123::Threefry2x32
792 @ingroup ThreefryNxW
793   Threefry2x32 is equivalent to Threefry2x32_R<20>.    With 20 rounds,
794   Threefry2x32 has a considerable safety margin over the minimum number
795   of rounds with no known statistical flaws, but still has excellent
796    performance. 
797
798 @class r123::Threefry2x64_R 
799 @ingroup ThreefryNxW
800
801 exports the member functions, typedefs and operator overloads required by a @ref CBRNG "CBRNG" class.
802
803 The template argument, ROUNDS, is the number of times the Threefry round
804 function will be applied.
805
806 In November 2011, the authors discovered that 13 rounds of
807 Threefry2x64 sequenced by strided, interleaved key and counter
808 increments failed a very long (longer than the default BigCrush
809 length) WeightDistrib test.  At the same time, it was confirmed that
810 14 rounds passes much longer tests (up to 5x10^12 samples) of a
811 similar nature.  The authors know of no statistical flaws with
812 ROUNDS=14 or more for Threefry2x64.
813
814 @typedef r123::Threefry2x64
815 @ingroup ThreefryNxW
816   Threefry2x64 is equivalent to Threefry2x64_R<20>.    With 20 rounds,
817   Threefry2x64 has a considerable safety margin over the minimum number
818   of rounds with no known statistical flaws, but still has excellent
819    performance. 
820
821
822
823 @class r123::Threefry4x32_R 
824 @ingroup ThreefryNxW
825
826 exports the member functions, typedefs and operator overloads required by a @ref CBRNG "CBRNG" class.
827
828 The template argument, ROUNDS, is the number of times the Threefry round
829 function will be applied.
830
831 As of September 2011, the authors know of no statistical flaws with
832 ROUNDS=12 or more for Threefry4x32.
833
834 @typedef r123::Threefry4x32
835 @ingroup ThreefryNxW
836   Threefry4x32 is equivalent to Threefry4x32_R<20>.    With 20 rounds,
837   Threefry4x32 has a considerable safety margin over the minimum number
838   of rounds with no known statistical flaws, but still has excellent
839    performance. 
840
841
842
843 @class r123::Threefry4x64_R 
844 @ingroup ThreefryNxW
845
846 exports the member functions, typedefs and operator overloads required by a @ref CBRNG "CBRNG" class.
847
848 The template argument, ROUNDS, is the number of times the Threefry round
849 function will be applied.
850
851 As of September 2011, the authors know of no statistical flaws with
852 ROUNDS=12 or more for Threefry4x64.
853
854 @typedef r123::Threefry4x64
855 @ingroup ThreefryNxW
856   Threefry4x64 is equivalent to Threefry4x64_R<20>.    With 20 rounds,
857   Threefry4x64 has a considerable safety margin over the minimum number
858   of rounds with no known statistical flaws, but still has excellent
859    performance. 
860 */
861
862 #endif
863
864 #endif