src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_double/kernelutil_x86_avx_256_double.h

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35 #ifndef _kernelutil_x86_avx_256_double_h_
  36 #define _kernelutil_x86_avx_256_double_h_
  37
  38 #define gmx_mm_castsi128_ps(a) _mm_castsi128_ps(a)
  39
  40 #define _GMX_MM_BLEND256D(b3, b2, b1, b0) (((b3) << 3) | ((b2) << 2) | ((b1) << 1) | ((b0)))
  41 #define _GMX_MM_PERMUTE(fp3, fp2, fp1, fp0) (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | ((fp0)))
  42 #define _GMX_MM_PERMUTE128D(fp1, fp0)         (((fp1) << 1) | ((fp0)))
  43 #define _GMX_MM_PERMUTE256D(fp3, fp2, fp1, fp0) (((fp3) << 3) | ((fp2) << 2) | ((fp1) << 1) | ((fp0)))
  44 #define GMX_MM256_FULLTRANSPOSE4_PD(row0, row1, row2, row3) \
  45     {                                                        \
  46         __m256d _t0, _t1, _t2, _t3;                          \
  47         _t0  = _mm256_unpacklo_pd((row0), (row1));           \
  48         _t1  = _mm256_unpackhi_pd((row0), (row1));           \
  49         _t2  = _mm256_unpacklo_pd((row2), (row3));           \
  50         _t3  = _mm256_unpackhi_pd((row2), (row3));           \
  51         row0 = _mm256_permute2f128_pd(_t0, _t2, 0x20);       \
  52         row1 = _mm256_permute2f128_pd(_t1, _t3, 0x20);       \
  53         row2 = _mm256_permute2f128_pd(_t0, _t2, 0x31);       \
  54         row3 = _mm256_permute2f128_pd(_t1, _t3, 0x31);       \
  55     }
  56
  57 #define gmx_mm_extract_epi32(x, imm) _mm_extract_epi32((x), (imm))
  58
  59 static gmx_inline __m256d gmx_simdcall
  60 gmx_mm256_unpack128lo_pd(__m256d xmm1, __m256d xmm2)
  61 {
  62     return _mm256_permute2f128_pd(xmm1, xmm2, 0x20);
  63 }
  64
  65 static gmx_inline __m256d gmx_simdcall
  66 gmx_mm256_unpack128hi_pd(__m256d xmm1, __m256d xmm2)
  67 {
  68     return _mm256_permute2f128_pd(xmm1, xmm2, 0x31);
  69 }
  70
  71 static gmx_inline __m256d gmx_simdcall
  72 gmx_mm256_set_m128d(__m128d hi, __m128d lo)
  73 {
  74     return _mm256_insertf128_pd(_mm256_castpd128_pd256(lo), hi, 0x1);
  75 }
  76
  77 static gmx_inline __m256 gmx_simdcall
  78 gmx_mm256_set_m128(__m128 hi, __m128 lo)
  79 {
  80     return _mm256_insertf128_ps(_mm256_castps128_ps256(lo), hi, 0x1);
  81 }
  82
  83 static gmx_inline int gmx_simdcall
  84 gmx_mm256_any_lt(__m256d a, __m256d b)
  85 {
  86     return _mm256_movemask_pd(_mm256_cmp_pd(a, b, _CMP_LT_OQ));
  87 }
  88
  89 static gmx_inline __m256d gmx_simdcall
  90 gmx_mm256_calc_rsq_pd(__m256d dx, __m256d dy, __m256d dz)
  91 {
  92     return _mm256_add_pd( _mm256_add_pd( _mm256_mul_pd(dx, dx), _mm256_mul_pd(dy, dy) ), _mm256_mul_pd(dz, dz) );
  93 }
  94
  95 /* Normal sum of four ymm registers */
  96 #define gmx_mm256_sum4_pd(t0, t1, t2, t3)  _mm256_add_pd(_mm256_add_pd(t0, t1), _mm256_add_pd(t2, t3))
  97
  98
  99 /* Load a single value from 1-4 places, merge into xmm register */
 100 static gmx_inline __m256d gmx_simdcall
 101 gmx_mm256_load_1real_pd(const double * gmx_restrict ptrA)
 102 {
 103     return _mm256_castpd128_pd256(_mm_load_sd(ptrA));
 104 }
 105
 106 static gmx_inline __m256d gmx_simdcall
 107 gmx_mm256_load_2real_swizzle_pd(const double * gmx_restrict ptrA,
 108                                 const double * gmx_restrict ptrB)
 109 {
 110     __m128d tA, tB;
 111
 112     tA = _mm_load_sd(ptrA);
 113     tB = _mm_load_sd(ptrB);
 114
 115     return _mm256_castpd128_pd256(_mm_unpacklo_pd(tA, tB));
 116 }
 117
 118
 119 static gmx_inline __m256d gmx_simdcall
 120 gmx_mm256_load_4real_swizzle_pd(const double * gmx_restrict ptrA, const double * gmx_restrict ptrB,
 121                                 const double * gmx_restrict ptrC, const double * gmx_restrict ptrD)
 122 {
 123     __m128d t1, t2;
 124
 125     t1 = _mm_unpacklo_pd(_mm_load_sd(ptrA), _mm_load_sd(ptrB));
 126     t2 = _mm_unpacklo_pd(_mm_load_sd(ptrC), _mm_load_sd(ptrD));
 127     return gmx_mm256_set_m128d(t2, t1);
 128 }
 129
 130
 131
 132 static gmx_inline void gmx_simdcall
 133 gmx_mm256_store_1real_pd(double * gmx_restrict ptrA, __m256d xmm1)
 134 {
 135     _mm_store_sd(ptrA, _mm256_castpd256_pd128(xmm1));
 136 }
 137
 138
 139 static gmx_inline void gmx_simdcall
 140 gmx_mm256_store_2real_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB, __m256d xmm1)
 141 {
 142     __m256d t2;
 143
 144     t2       = _mm256_permute_pd(xmm1, _GMX_MM_PERMUTE256D(1, 1, 1, 1));
 145     _mm_store_sd(ptrA, _mm256_castpd256_pd128(xmm1));
 146     _mm_store_sd(ptrB, _mm256_castpd256_pd128(t2));
 147 }
 148
 149
 150
 151
 152 static gmx_inline void gmx_simdcall
 153 gmx_mm256_store_4real_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
 154                                  double * gmx_restrict ptrC, double * gmx_restrict ptrD, __m256d xmm1)
 155 {
 156     __m256d t2;
 157     __m128d t3, t4;
 158
 159     t2       = _mm256_permute_pd(xmm1, _GMX_MM_PERMUTE256D(1, 1, 1, 1));
 160     t3       = _mm256_extractf128_pd(xmm1, 0x1);
 161     t4       = _mm_permute_pd(t3, _GMX_MM_PERMUTE128D(1, 1));
 162     _mm_store_sd(ptrA, _mm256_castpd256_pd128(xmm1));
 163     _mm_store_sd(ptrB, _mm256_castpd256_pd128(t2));
 164     _mm_store_sd(ptrC, t3);
 165     _mm_store_sd(ptrD, t4);
 166 }
 167
 168
 169
 170
 171 static gmx_inline void gmx_simdcall
 172 gmx_mm256_increment_1real_pd(double * gmx_restrict ptrA, __m256d xmm1)
 173 {
 174     __m128d t1;
 175
 176     t1   = _mm256_castpd256_pd128(xmm1);
 177     t1   = _mm_add_sd(t1, _mm_load_sd(ptrA));
 178
 179     _mm_store_sd(ptrA, t1);
 180 }
 181
 182
 183 static gmx_inline void gmx_simdcall
 184 gmx_mm256_increment_2real_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB, __m256d xmm1)
 185 {
 186     __m128d t1, t2;
 187
 188     t1   = _mm256_castpd256_pd128(xmm1);
 189     t2   = _mm_permute_pd(t1, _GMX_MM_PERMUTE128D(1, 1));
 190
 191     t1   = _mm_add_sd(t1, _mm_load_sd(ptrA));
 192     t2   = _mm_add_sd(t2, _mm_load_sd(ptrB));
 193
 194     _mm_store_sd(ptrA, t1);
 195     _mm_store_sd(ptrB, t2);
 196 }
 197
 198
 199 static gmx_inline void gmx_simdcall
 200 gmx_mm256_increment_4real_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
 201                                      double * gmx_restrict ptrC, double * gmx_restrict ptrD, __m256d xmm1)
 202 {
 203     __m128d t1, t2, t3, t4;
 204
 205     t1   = _mm256_castpd256_pd128(xmm1);
 206     t2   = _mm_permute_pd(t1, _GMX_MM_PERMUTE128D(1, 1));
 207     t3   = _mm256_extractf128_pd(xmm1, 0x1);
 208     t4   = _mm_permute_pd(t3, _GMX_MM_PERMUTE128D(1, 1));
 209
 210     t1   = _mm_add_sd(t1, _mm_load_sd(ptrA));
 211     t2   = _mm_add_sd(t2, _mm_load_sd(ptrB));
 212     t3   = _mm_add_sd(t3, _mm_load_sd(ptrC));
 213     t4   = _mm_add_sd(t4, _mm_load_sd(ptrD));
 214
 215     _mm_store_sd(ptrA, t1);
 216     _mm_store_sd(ptrB, t2);
 217     _mm_store_sd(ptrC, t3);
 218     _mm_store_sd(ptrD, t4);
 219 }
 220
 221
 222
 223 static gmx_inline void gmx_simdcall
 224 gmx_mm256_load_1pair_swizzle_pd(const double * gmx_restrict p1, __m256d *c6, __m256d *c12)
 225 {
 226     *c6     = _mm256_castpd128_pd256(_mm_load_sd(p1));
 227     *c12    = _mm256_castpd128_pd256(_mm_load_sd(p1+1));
 228 }
 229
 230
 231 static gmx_inline void gmx_simdcall
 232 gmx_mm256_load_2pair_swizzle_pd(const double * gmx_restrict p1, const double * gmx_restrict p2, __m256d *c6, __m256d *c12)
 233 {
 234     __m128d t1, t2, t3;
 235
 236     t1   = _mm_loadu_pd(p1);
 237     t2   = _mm_loadu_pd(p2);
 238     *c6  = _mm256_castpd128_pd256(_mm_unpacklo_pd(t1, t2));
 239     *c12 = _mm256_castpd128_pd256(_mm_unpackhi_pd(t1, t2));
 240 }
 241
 242
 243
 244 static gmx_inline void gmx_simdcall
 245 gmx_mm256_load_4pair_swizzle_pd(const double * gmx_restrict p1, const double * gmx_restrict p2,
 246                                 const double * gmx_restrict p3, const double * gmx_restrict p4,
 247                                 __m256d * gmx_restrict c6, __m256d * gmx_restrict c12)
 248 {
 249     __m256d t1, t2;
 250
 251     t1   = gmx_mm256_set_m128d(_mm_loadu_pd(p3), _mm_loadu_pd(p1)); /* c12c  c6c | c12a  c6a */
 252     t2   = gmx_mm256_set_m128d(_mm_loadu_pd(p4), _mm_loadu_pd(p2)); /* c12d  c6d | c12b  c6b */
 253
 254     *c6  = _mm256_unpacklo_pd(t1, t2);                              /* c6d c6c | c6b c6a */
 255     *c12 = _mm256_unpackhi_pd(t1, t2);                              /* c12d c12c | c12b c12a */
 256 }
 257
 258
 259 static gmx_inline void gmx_simdcall
 260 gmx_mm256_load_shift_and_1rvec_broadcast_pd(const double * gmx_restrict xyz_shift,
 261                                             const double * gmx_restrict xyz,
 262                                             __m256d * gmx_restrict      x1,
 263                                             __m256d * gmx_restrict      y1,
 264                                             __m256d * gmx_restrict      z1)
 265 {
 266     __m128d mem_xy, mem_z, mem_sxy, mem_sz, tx, ty, tz;
 267
 268     mem_xy  = _mm_loadu_pd(xyz);
 269     mem_z   = _mm_load_sd(xyz+2);
 270     mem_sxy = _mm_loadu_pd(xyz_shift);
 271     mem_sz  = _mm_load_sd(xyz_shift+2);
 272
 273     mem_xy  = _mm_add_pd(mem_xy, mem_sxy);
 274     mem_z   = _mm_add_pd(mem_z, mem_sz);
 275
 276     tx  = _mm_shuffle_pd(mem_xy, mem_xy, _MM_SHUFFLE2(0, 0));
 277     ty  = _mm_shuffle_pd(mem_xy, mem_xy, _MM_SHUFFLE2(1, 1));
 278     tz  = _mm_shuffle_pd(mem_z, mem_z, _MM_SHUFFLE2(0, 0));
 279
 280     *x1 = gmx_mm256_set_m128d(tx, tx);
 281     *y1 = gmx_mm256_set_m128d(ty, ty);
 282     *z1 = gmx_mm256_set_m128d(tz, tz);
 283 }
 284
 285
 286 static gmx_inline void gmx_simdcall
 287 gmx_mm256_load_shift_and_3rvec_broadcast_pd(const double * gmx_restrict xyz_shift,
 288                                             const double * gmx_restrict xyz,
 289                                             __m256d * gmx_restrict x1, __m256d * gmx_restrict y1, __m256d * gmx_restrict z1,
 290                                             __m256d * gmx_restrict x2, __m256d * gmx_restrict y2, __m256d * gmx_restrict z2,
 291                                             __m256d * gmx_restrict x3, __m256d * gmx_restrict y3, __m256d * gmx_restrict z3)
 292 {
 293     __m128d t1, t2, t3, t4, t5, sxy, sz, szx, syz, tx, ty, tz;
 294
 295     t1  = _mm_loadu_pd(xyz);
 296     t2  = _mm_loadu_pd(xyz+2);
 297     t3  = _mm_loadu_pd(xyz+4);
 298     t4  = _mm_loadu_pd(xyz+6);
 299     t5  = _mm_load_sd(xyz+8);
 300
 301     sxy = _mm_loadu_pd(xyz_shift);
 302     sz  = _mm_load_sd(xyz_shift+2);
 303     szx = _mm_shuffle_pd(sz, sxy, _MM_SHUFFLE2(0, 0));
 304     syz = _mm_shuffle_pd(sxy, sz, _MM_SHUFFLE2(0, 1));
 305
 306     t1  = _mm_add_pd(t1, sxy);
 307     t2  = _mm_add_pd(t2, szx);
 308     t3  = _mm_add_pd(t3, syz);
 309     t4  = _mm_add_pd(t4, sxy);
 310     t5  = _mm_add_sd(t5, sz);
 311
 312     tx   = _mm_shuffle_pd(t1, t1, _MM_SHUFFLE2(0, 0));
 313     ty   = _mm_shuffle_pd(t1, t1, _MM_SHUFFLE2(1, 1));
 314     tz   = _mm_shuffle_pd(t2, t2, _MM_SHUFFLE2(0, 0));
 315     *x1  = gmx_mm256_set_m128d(tx, tx);
 316     *y1  = gmx_mm256_set_m128d(ty, ty);
 317     *z1  = gmx_mm256_set_m128d(tz, tz);
 318     tx   = _mm_shuffle_pd(t2, t2, _MM_SHUFFLE2(1, 1));
 319     ty   = _mm_shuffle_pd(t3, t3, _MM_SHUFFLE2(0, 0));
 320     tz   = _mm_shuffle_pd(t3, t3, _MM_SHUFFLE2(1, 1));
 321     *x2  = gmx_mm256_set_m128d(tx, tx);
 322     *y2  = gmx_mm256_set_m128d(ty, ty);
 323     *z2  = gmx_mm256_set_m128d(tz, tz);
 324     tx   = _mm_shuffle_pd(t4, t4, _MM_SHUFFLE2(0, 0));
 325     ty   = _mm_shuffle_pd(t4, t4, _MM_SHUFFLE2(1, 1));
 326     tz   = _mm_shuffle_pd(t5, t5, _MM_SHUFFLE2(0, 0));
 327     *x3  = gmx_mm256_set_m128d(tx, tx);
 328     *y3  = gmx_mm256_set_m128d(ty, ty);
 329     *z3  = gmx_mm256_set_m128d(tz, tz);
 330 }
 331
 332
 333 static gmx_inline void gmx_simdcall
 334 gmx_mm256_load_shift_and_4rvec_broadcast_pd(const double * gmx_restrict xyz_shift,
 335                                             const double * gmx_restrict xyz,
 336                                             __m256d * gmx_restrict x1, __m256d * gmx_restrict y1, __m256d * gmx_restrict z1,
 337                                             __m256d * gmx_restrict x2, __m256d * gmx_restrict y2, __m256d * gmx_restrict z2,
 338                                             __m256d * gmx_restrict x3, __m256d * gmx_restrict y3, __m256d * gmx_restrict z3,
 339                                             __m256d * gmx_restrict x4, __m256d * gmx_restrict y4, __m256d * gmx_restrict z4)
 340 {
 341     __m128d t1, t2, t3, t4, t5, t6, sxy, sz, szx, syz, tx, ty, tz;
 342
 343     t1  = _mm_loadu_pd(xyz);
 344     t2  = _mm_loadu_pd(xyz+2);
 345     t3  = _mm_loadu_pd(xyz+4);
 346     t4  = _mm_loadu_pd(xyz+6);
 347     t5  = _mm_loadu_pd(xyz+8);
 348     t6  = _mm_loadu_pd(xyz+10);
 349
 350     sxy = _mm_loadu_pd(xyz_shift);
 351     sz  = _mm_load_sd(xyz_shift+2);
 352     szx = _mm_shuffle_pd(sz, sxy, _MM_SHUFFLE2(0, 0));
 353     syz = _mm_shuffle_pd(sxy, sz, _MM_SHUFFLE2(0, 1));
 354
 355     t1  = _mm_add_pd(t1, sxy);
 356     t2  = _mm_add_pd(t2, szx);
 357     t3  = _mm_add_pd(t3, syz);
 358     t4  = _mm_add_pd(t4, sxy);
 359     t5  = _mm_add_pd(t5, szx);
 360     t6  = _mm_add_pd(t6, syz);
 361
 362     tx   = _mm_shuffle_pd(t1, t1, _MM_SHUFFLE2(0, 0));
 363     ty   = _mm_shuffle_pd(t1, t1, _MM_SHUFFLE2(1, 1));
 364     tz   = _mm_shuffle_pd(t2, t2, _MM_SHUFFLE2(0, 0));
 365     *x1  = gmx_mm256_set_m128d(tx, tx);
 366     *y1  = gmx_mm256_set_m128d(ty, ty);
 367     *z1  = gmx_mm256_set_m128d(tz, tz);
 368     tx   = _mm_shuffle_pd(t2, t2, _MM_SHUFFLE2(1, 1));
 369     ty   = _mm_shuffle_pd(t3, t3, _MM_SHUFFLE2(0, 0));
 370     tz   = _mm_shuffle_pd(t3, t3, _MM_SHUFFLE2(1, 1));
 371     *x2  = gmx_mm256_set_m128d(tx, tx);
 372     *y2  = gmx_mm256_set_m128d(ty, ty);
 373     *z2  = gmx_mm256_set_m128d(tz, tz);
 374     tx   = _mm_shuffle_pd(t4, t4, _MM_SHUFFLE2(0, 0));
 375     ty   = _mm_shuffle_pd(t4, t4, _MM_SHUFFLE2(1, 1));
 376     tz   = _mm_shuffle_pd(t5, t5, _MM_SHUFFLE2(0, 0));
 377     *x3  = gmx_mm256_set_m128d(tx, tx);
 378     *y3  = gmx_mm256_set_m128d(ty, ty);
 379     *z3  = gmx_mm256_set_m128d(tz, tz);
 380     tx   = _mm_shuffle_pd(t5, t5, _MM_SHUFFLE2(1, 1));
 381     ty   = _mm_shuffle_pd(t6, t6, _MM_SHUFFLE2(0, 0));
 382     tz   = _mm_shuffle_pd(t6, t6, _MM_SHUFFLE2(1, 1));
 383     *x4  = gmx_mm256_set_m128d(tx, tx);
 384     *y4  = gmx_mm256_set_m128d(ty, ty);
 385     *z4  = gmx_mm256_set_m128d(tz, tz);
 386 }
 387
 388
 389 static gmx_inline void gmx_simdcall
 390 gmx_mm256_load_1rvec_1ptr_swizzle_pd(const double * gmx_restrict p1,
 391                                      __m256d * gmx_restrict x, __m256d * gmx_restrict y, __m256d * gmx_restrict z)
 392 {
 393     __m256d t1;
 394
 395     t1            = _mm256_loadu_pd(p1);
 396     *x            = t1;
 397     *y            = _mm256_permute_pd(t1, _GMX_MM_PERMUTE256D(0, 1, 0, 1));
 398     *z            = _mm256_castpd128_pd256(_mm256_extractf128_pd(t1, 0x1));
 399 }
 400
 401
 402 static gmx_inline void gmx_simdcall
 403 gmx_mm256_load_3rvec_1ptr_swizzle_pd(const double * gmx_restrict p1,
 404                                      __m256d * gmx_restrict x1, __m256d * gmx_restrict y1, __m256d * gmx_restrict z1,
 405                                      __m256d * gmx_restrict x2, __m256d * gmx_restrict y2, __m256d * gmx_restrict z2,
 406                                      __m256d * gmx_restrict x3, __m256d * gmx_restrict y3, __m256d * gmx_restrict z3)
 407 {
 408     __m256d t1, t2, t3, t4;
 409
 410     t1            = _mm256_loadu_pd(p1);
 411     t3            = _mm256_loadu_pd(p1+4);
 412     *x1           = t1;
 413     *y2           = t3;
 414     t2            = gmx_mm256_unpack128hi_pd(t1, t1);
 415     t4            = gmx_mm256_unpack128hi_pd(t3, t3);
 416     *z1           = t2;
 417     *x3           = t4;
 418     *y1           = _mm256_permute_pd(t1, _GMX_MM_PERMUTE256D(0, 1, 0, 1));
 419     *z2           = _mm256_permute_pd(t3, _GMX_MM_PERMUTE256D(0, 1, 0, 1));
 420     *x2           = _mm256_permute_pd(t2, _GMX_MM_PERMUTE256D(0, 1, 0, 1));
 421     *y3           = _mm256_permute_pd(t4, _GMX_MM_PERMUTE256D(0, 1, 0, 1));
 422     *z3           = _mm256_castpd128_pd256(_mm_load_sd(p1+8));
 423 }
 424
 425 static gmx_inline void gmx_simdcall
 426 gmx_mm256_load_4rvec_1ptr_swizzle_pd(const double * gmx_restrict p1,
 427                                      __m256d * gmx_restrict x1, __m256d * gmx_restrict y1, __m256d * gmx_restrict z1,
 428                                      __m256d * gmx_restrict x2, __m256d * gmx_restrict y2, __m256d * gmx_restrict z2,
 429                                      __m256d * gmx_restrict x3, __m256d * gmx_restrict y3, __m256d * gmx_restrict z3,
 430                                      __m256d * gmx_restrict x4, __m256d * gmx_restrict y4, __m256d * gmx_restrict z4)
 431 {
 432     __m256d t1, t2, t3, t4, t5, t6;
 433
 434     t1            = _mm256_loadu_pd(p1);
 435     t2            = _mm256_loadu_pd(p1+4);
 436     t3            = _mm256_loadu_pd(p1+8);
 437
 438     t4            = _mm256_castpd128_pd256(_mm256_extractf128_pd(t1, 0x1));
 439     t5            = _mm256_castpd128_pd256(_mm256_extractf128_pd(t2, 0x1));
 440     t6            = _mm256_castpd128_pd256(_mm256_extractf128_pd(t3, 0x1));
 441
 442     *x1           = t1;
 443     *y2           = t2;
 444     *z3           = t3;
 445     *z1           = t4;
 446     *x3           = t5;
 447     *y4           = t6;
 448
 449     *y1           = _mm256_permute_pd(t1, _GMX_MM_PERMUTE256D(0, 1, 0, 1));
 450     *z2           = _mm256_permute_pd(t2, _GMX_MM_PERMUTE256D(0, 1, 0, 1));
 451     *x4           = _mm256_permute_pd(t3, _GMX_MM_PERMUTE256D(0, 1, 0, 1));
 452     *x2           = _mm256_permute_pd(t4, _GMX_MM_PERMUTE256D(0, 1, 0, 1));
 453     *y3           = _mm256_permute_pd(t5, _GMX_MM_PERMUTE256D(0, 1, 0, 1));
 454     *z4           = _mm256_permute_pd(t6, _GMX_MM_PERMUTE256D(0, 1, 0, 1));
 455 }
 456
 457
 458 static gmx_inline void gmx_simdcall
 459 gmx_mm256_load_1rvec_4ptr_swizzle_pd(const double * gmx_restrict ptrA, const double * gmx_restrict ptrB,
 460                                      const double * gmx_restrict ptrC, const double * gmx_restrict ptrD,
 461                                      __m256d * gmx_restrict x1, __m256d * gmx_restrict y1, __m256d * gmx_restrict z1)
 462 {
 463     __m256d t1, t2, t3, t4, t5, t6;
 464
 465     t1           = _mm256_loadu_pd(ptrA);        /*   -  z1a | y1a x1a */
 466     t2           = _mm256_loadu_pd(ptrB);        /*   -  z1b | y1b x1b */
 467     t3           = _mm256_loadu_pd(ptrC);        /*   -  z1c | y1c x1c */
 468     t4           = _mm256_loadu_pd(ptrD);        /*   -  z1d | y1d x1d */
 469
 470     t5           = _mm256_unpacklo_pd(t1, t2);   /*  z1b z1a | x1b x1a */
 471     t6           = _mm256_unpackhi_pd(t1, t2);   /*   -   -  | y1b y1a */
 472     t1           = _mm256_unpacklo_pd(t3, t4);   /*  z1c z1c | x1d x1c */
 473     t2           = _mm256_unpackhi_pd(t3, t4);   /*   -   -  | y1d y1c */
 474
 475     *x1          = gmx_mm256_unpack128lo_pd(t5, t1);
 476     *y1          = gmx_mm256_unpack128lo_pd(t6, t2);
 477     *z1          = gmx_mm256_unpack128hi_pd(t5, t1);
 478 }
 479
 480
 481
 482 static gmx_inline void gmx_simdcall
 483 gmx_mm256_load_3rvec_4ptr_swizzle_pd(const double * gmx_restrict ptrA, const double * gmx_restrict ptrB,
 484                                      const double * gmx_restrict ptrC, const double * gmx_restrict ptrD,
 485                                      __m256d * gmx_restrict x1, __m256d * gmx_restrict y1, __m256d * gmx_restrict z1,
 486                                      __m256d * gmx_restrict x2, __m256d * gmx_restrict y2, __m256d * gmx_restrict z2,
 487                                      __m256d * gmx_restrict x3, __m256d * gmx_restrict y3, __m256d * gmx_restrict z3)
 488 {
 489     __m256d t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14;
 490
 491     t1           = _mm256_loadu_pd(ptrA);                       /*  x2a z1a | y1a x1a */
 492     t2           = _mm256_loadu_pd(ptrB);                       /*  x2b z1b | y1b x1b */
 493     t3           = _mm256_loadu_pd(ptrC);                       /*  x2c z1c | y1c x1c */
 494     t4           = _mm256_loadu_pd(ptrD);                       /*  x2d z1d | y1d x1d */
 495     t5           = _mm256_loadu_pd(ptrA+4);                     /*  y3a x3a | z2a y2a */
 496     t6           = _mm256_loadu_pd(ptrB+4);                     /*  y3b x3b | z2b y2b */
 497     t7           = _mm256_loadu_pd(ptrC+4);                     /*  y3c x3c | z2c y2c */
 498     t8           = _mm256_loadu_pd(ptrD+4);                     /*  y3d x3d | z2d y2d */
 499     t9           = _mm256_castpd128_pd256(_mm_load_sd(ptrA+8)); /*   -   -  |  -  z3a */
 500     t10          = _mm256_castpd128_pd256(_mm_load_sd(ptrB+8)); /*   -   -  |  -  z3b */
 501     t11          = _mm256_castpd128_pd256(_mm_load_sd(ptrC+8)); /*   -   -  |  -  z3c */
 502     t12          = _mm256_castpd128_pd256(_mm_load_sd(ptrD+8)); /*   -   -  |  -  z3d */
 503
 504     t13          = _mm256_unpacklo_pd(t1, t2);                  /*  z1b z1a | x1b x1a */
 505     t14          = _mm256_unpackhi_pd(t1, t2);                  /*  x2b x2a | y1b y1a */
 506     t1           = _mm256_unpacklo_pd(t3, t4);                  /*  z1d z1c | x1d x1c */
 507     t2           = _mm256_unpackhi_pd(t3, t4);                  /*  x2d x2c | y1d y1c */
 508
 509     t3           = _mm256_unpacklo_pd(t5, t6);                  /*  x3b x3a | y2b y2a */
 510     t4           = _mm256_unpackhi_pd(t5, t6);                  /*  y3b y3a | z2b z2a */
 511     t5           = _mm256_unpacklo_pd(t7, t8);                  /*  x3d x3c | y2d y2c */
 512     t6           = _mm256_unpackhi_pd(t7, t8);                  /*  y3d y3c | z2d z2c */
 513
 514     t9           = _mm256_unpacklo_pd(t9, t10);                 /*   -   -  | z3b z3a */
 515     t11          = _mm256_unpacklo_pd(t11, t12);                /*   -   -  | z3d z3c */
 516
 517     *x1          = gmx_mm256_unpack128lo_pd(t13, t1);
 518     *y1          = gmx_mm256_unpack128lo_pd(t14, t2);
 519     *z1          = gmx_mm256_unpack128hi_pd(t13, t1);
 520     *x2          = gmx_mm256_unpack128hi_pd(t14, t2);
 521     *y2          = gmx_mm256_unpack128lo_pd(t3, t5);
 522     *z2          = gmx_mm256_unpack128lo_pd(t4, t6);
 523     *x3          = gmx_mm256_unpack128hi_pd(t3, t5);
 524     *y3          = gmx_mm256_unpack128hi_pd(t4, t6);
 525     *z3          = gmx_mm256_unpack128lo_pd(t9, t11);
 526 }
 527
 528
 529
 530 static gmx_inline void gmx_simdcall
 531 gmx_mm256_load_4rvec_4ptr_swizzle_pd(const double * gmx_restrict ptrA, const double * gmx_restrict ptrB,
 532                                      const double * gmx_restrict ptrC, const double * gmx_restrict ptrD,
 533                                      __m256d * gmx_restrict x1, __m256d * gmx_restrict y1, __m256d * gmx_restrict z1,
 534                                      __m256d * gmx_restrict x2, __m256d * gmx_restrict y2, __m256d * gmx_restrict z2,
 535                                      __m256d * gmx_restrict x3, __m256d * gmx_restrict y3, __m256d * gmx_restrict z3,
 536                                      __m256d * gmx_restrict x4, __m256d * gmx_restrict y4, __m256d * gmx_restrict z4)
 537 {
 538     __m256d t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14;
 539
 540     t1           = _mm256_loadu_pd(ptrA);        /*  x2a z1a | y1a x1a */
 541     t2           = _mm256_loadu_pd(ptrB);        /*  x2b z1b | y1b x1b */
 542     t3           = _mm256_loadu_pd(ptrC);        /*  x2c z1c | y1c x1c */
 543     t4           = _mm256_loadu_pd(ptrD);        /*  x2d z1d | y1d x1d */
 544     t5           = _mm256_loadu_pd(ptrA+4);      /*  y3a x3a | z2a y2a */
 545     t6           = _mm256_loadu_pd(ptrB+4);      /*  y3b x3b | z2b y2b */
 546     t7           = _mm256_loadu_pd(ptrC+4);      /*  y3c x3c | z2c y2c */
 547     t8           = _mm256_loadu_pd(ptrD+4);      /*  y3d x3d | z2d y2d */
 548     t9           = _mm256_loadu_pd(ptrA+8);      /*  z4a y4a | x4a z3a */
 549     t10          = _mm256_loadu_pd(ptrB+8);      /*  z4b y4b | x4b z3b */
 550     t11          = _mm256_loadu_pd(ptrC+8);      /*  z4c y4c | x4c z3c */
 551     t12          = _mm256_loadu_pd(ptrD+8);      /*  z4d y4d | x4d z3d */
 552
 553     t13          = _mm256_unpacklo_pd(t1, t2);   /*  z1b z1a | x1b x1a */
 554     t14          = _mm256_unpackhi_pd(t1, t2);   /*  x2b x2a | y1b y1a */
 555     t1           = _mm256_unpacklo_pd(t3, t4);   /*  z1d z1c | x1d x1c */
 556     t2           = _mm256_unpackhi_pd(t3, t4);   /*  x2d x2c | y1d y1c */
 557
 558     t3           = _mm256_unpacklo_pd(t5, t6);   /*  x3b x3a | y2b y2a */
 559     t4           = _mm256_unpackhi_pd(t5, t6);   /*  y3b y3a | z2b z2a */
 560     t5           = _mm256_unpacklo_pd(t7, t8);   /*  x3d x3c | y2d y2c */
 561     t6           = _mm256_unpackhi_pd(t7, t8);   /*  y3d y3c | z2d z2c */
 562
 563     t7           = _mm256_unpacklo_pd(t9, t10);  /*  y4b y4a | z3b z3a */
 564     t8           = _mm256_unpackhi_pd(t9, t10);  /*  z4b z4a | x4b x4a */
 565     t9           = _mm256_unpacklo_pd(t11, t12); /*  y4d y4c | z3d z3c */
 566     t10          = _mm256_unpackhi_pd(t11, t12); /*  z4d z4c | x4d x4c */
 567
 568     *x1          = gmx_mm256_unpack128lo_pd(t13, t1);
 569     *y1          = gmx_mm256_unpack128lo_pd(t14, t2);
 570     *z1          = gmx_mm256_unpack128hi_pd(t13, t1);
 571     *x2          = gmx_mm256_unpack128hi_pd(t14, t2);
 572     *y2          = gmx_mm256_unpack128lo_pd(t3, t5);
 573     *z2          = gmx_mm256_unpack128lo_pd(t4, t6);
 574     *x3          = gmx_mm256_unpack128hi_pd(t3, t5);
 575     *y3          = gmx_mm256_unpack128hi_pd(t4, t6);
 576     *z3          = gmx_mm256_unpack128lo_pd(t7, t9);
 577     *x4          = gmx_mm256_unpack128lo_pd(t8, t10);
 578     *y4          = gmx_mm256_unpack128hi_pd(t7, t9);
 579     *z4          = gmx_mm256_unpack128hi_pd(t8, t10);
 580 }
 581
 582
 583
 584 static gmx_inline void gmx_simdcall
 585 gmx_mm256_decrement_1rvec_4ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
 586                                           double * gmx_restrict ptrC, double * gmx_restrict ptrD,
 587                                           __m256d x1, __m256d y1, __m256d z1)
 588 {
 589     __m256d t1, t2, tA, tB, tC, tD;
 590     __m256i mask;
 591
 592     t1          = _mm256_unpacklo_pd(x1, y1);       /*  y1c x1c | y1a x1a */
 593     t2          = _mm256_unpackhi_pd(x1, y1);       /*  y1d x1d | y1b x1b */
 594     x1          = gmx_mm256_unpack128lo_pd(t1, z1); /*  -  z1a | y1a x1a */
 595     y1          = gmx_mm256_unpack128hi_pd(t1, z1); /*  -  z1c | y1c x1c */
 596     z1          = _mm256_permute_pd(z1, _GMX_MM_PERMUTE256D(0, 1, 0, 1));
 597     t1          = gmx_mm256_unpack128lo_pd(t2, z1); /*  -  z1b | y1b x1b */
 598     z1          = gmx_mm256_unpack128hi_pd(t2, z1); /*  -  z1d | y1d x1d */
 599
 600     /* Construct a mask without executing any data loads */
 601     mask        = _mm256_castpd_si256(_mm256_blend_pd(_mm256_setzero_pd(),
 602                                                       _mm256_cmp_pd(_mm256_setzero_pd(), _mm256_setzero_pd(), _CMP_EQ_OQ), 0x7));
 603
 604     tA          = _mm256_loadu_pd(ptrA);
 605     tB          = _mm256_loadu_pd(ptrB);
 606     tC          = _mm256_loadu_pd(ptrC);
 607     tD          = _mm256_loadu_pd(ptrD);
 608
 609     tA          = _mm256_sub_pd(tA, x1);
 610     tB          = _mm256_sub_pd(tB, t1);
 611     tC          = _mm256_sub_pd(tC, y1);
 612     tD          = _mm256_sub_pd(tD, z1);
 613
 614     _mm256_maskstore_pd(ptrA, mask, tA);
 615     _mm256_maskstore_pd(ptrB, mask, tB);
 616     _mm256_maskstore_pd(ptrC, mask, tC);
 617     _mm256_maskstore_pd(ptrD, mask, tD);
 618 }
 619
 620
 621
 622
 623 static gmx_inline void gmx_simdcall
 624 gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
 625                                           double * gmx_restrict ptrC, double * gmx_restrict ptrD,
 626                                           __m256d x1, __m256d y1, __m256d z1,
 627                                           __m256d x2, __m256d y2, __m256d z2,
 628                                           __m256d x3, __m256d y3, __m256d z3)
 629 {
 630     __m256d t1, t2, t3, t4, t5, t6, t7, t8, t9, t10;
 631     __m128d tA, tB, tC, tD, tE;
 632
 633     t1          = _mm256_loadu_pd(ptrA);
 634     t2          = _mm256_loadu_pd(ptrB);
 635     t3          = _mm256_loadu_pd(ptrC);
 636     t4          = _mm256_loadu_pd(ptrD);
 637     t5          = _mm256_loadu_pd(ptrA+4);
 638     t6          = _mm256_loadu_pd(ptrB+4);
 639     t7          = _mm256_loadu_pd(ptrC+4);
 640     t8          = _mm256_loadu_pd(ptrD+4);
 641     tA          = _mm_load_sd(ptrA+8);
 642     tB          = _mm_load_sd(ptrB+8);
 643     tC          = _mm_load_sd(ptrC+8);
 644     tD          = _mm_load_sd(ptrD+8);
 645
 646     t9          = _mm256_unpacklo_pd(x1, y1);       /* y1c x1c | y1a x1a */
 647     x1          = _mm256_unpackhi_pd(x1, y1);       /* y1d x1d | y1b x1b */
 648
 649     y1          = _mm256_unpacklo_pd(z1, x2);       /* x2c z1c | x2a z1a */
 650     z1          = _mm256_unpackhi_pd(z1, x2);       /* x2d z1d | x2b z1b */
 651
 652     x2          = _mm256_unpacklo_pd(y2, z2);       /* z2c y2c | z2a y2a */
 653     y2          = _mm256_unpackhi_pd(y2, z2);       /* z2d y2d | z2b y2b */
 654
 655     z2          = _mm256_unpacklo_pd(x3, y3);       /* y3c x3c | y3a x3a */
 656     x3          = _mm256_unpackhi_pd(x3, y3);       /* y3d x3d | y3b x3b */
 657
 658     t10         = gmx_mm256_unpack128lo_pd(t9, y1); /* x2a z1a | y1a x1a */
 659     y3          = gmx_mm256_unpack128hi_pd(t9, y1); /* x2c z1c | y1c x1c */
 660
 661     t9          = gmx_mm256_unpack128lo_pd(x1, z1); /* x2b z1b | y1b x1b */
 662     y1          = gmx_mm256_unpack128hi_pd(x1, z1); /* x2d z1d | y1d x1d */
 663
 664     x1          = gmx_mm256_unpack128lo_pd(x2, z2); /* y3a x3a | z2a y2a */
 665     z1          = gmx_mm256_unpack128hi_pd(x2, z2); /* y3c x3c | z2c y2c */
 666
 667     x2          = gmx_mm256_unpack128lo_pd(y2, x3); /* y3b x3b | z2b y2b */
 668     z2          = gmx_mm256_unpack128hi_pd(y2, x3); /* y3d x3d | z2d y2d */
 669
 670     t1          = _mm256_sub_pd(t1, t10);
 671     t2          = _mm256_sub_pd(t2, t9);
 672     t3          = _mm256_sub_pd(t3, y3);
 673     t4          = _mm256_sub_pd(t4, y1);
 674     t5          = _mm256_sub_pd(t5, x1);
 675     t6          = _mm256_sub_pd(t6, x2);
 676     t7          = _mm256_sub_pd(t7, z1);
 677     t8          = _mm256_sub_pd(t8, z2);
 678
 679     tA          = _mm_sub_sd(tA, _mm256_castpd256_pd128(z3));
 680     tB          = _mm_sub_sd(tB, _mm_permute_pd(_mm256_castpd256_pd128(z3), _GMX_MM_PERMUTE128D(1, 1)));
 681     tE          = _mm256_extractf128_pd(z3, 0x1);
 682     tC          = _mm_sub_sd(tC, tE);
 683     tD          = _mm_sub_sd(tD, _mm_permute_pd(tE, _GMX_MM_PERMUTE128D(1, 1)));
 684
 685     /* Here we store a full 256-bit value and a separate 64-bit one; no overlap can happen */
 686     _mm256_storeu_pd(ptrA, t1);
 687     _mm256_storeu_pd(ptrB, t2);
 688     _mm256_storeu_pd(ptrC, t3);
 689     _mm256_storeu_pd(ptrD, t4);
 690     _mm256_storeu_pd(ptrA+4, t5);
 691     _mm256_storeu_pd(ptrB+4, t6);
 692     _mm256_storeu_pd(ptrC+4, t7);
 693     _mm256_storeu_pd(ptrD+4, t8);
 694     _mm_store_sd(ptrA+8, tA);
 695     _mm_store_sd(ptrB+8, tB);
 696     _mm_store_sd(ptrC+8, tC);
 697     _mm_store_sd(ptrD+8, tD);
 698 }
 699
 700
 701 static gmx_inline void gmx_simdcall
 702 gmx_mm256_decrement_4rvec_4ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
 703                                           double * gmx_restrict ptrC, double * gmx_restrict ptrD,
 704                                           __m256d x1, __m256d y1, __m256d z1,
 705                                           __m256d x2, __m256d y2, __m256d z2,
 706                                           __m256d x3, __m256d y3, __m256d z3,
 707                                           __m256d x4, __m256d y4, __m256d z4)
 708 {
 709     __m256d t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14;
 710     __m128d tA, tB, tC, tD, tE;
 711
 712     t1          = _mm256_loadu_pd(ptrA);
 713     t2          = _mm256_loadu_pd(ptrB);
 714     t3          = _mm256_loadu_pd(ptrC);
 715     t4          = _mm256_loadu_pd(ptrD);
 716     t5          = _mm256_loadu_pd(ptrA+4);
 717     t6          = _mm256_loadu_pd(ptrB+4);
 718     t7          = _mm256_loadu_pd(ptrC+4);
 719     t8          = _mm256_loadu_pd(ptrD+4);
 720     t9          = _mm256_loadu_pd(ptrA+8);
 721     t10         = _mm256_loadu_pd(ptrB+8);
 722     t11         = _mm256_loadu_pd(ptrC+8);
 723     t12         = _mm256_loadu_pd(ptrD+8);
 724
 725     t13         = _mm256_unpacklo_pd(x1, y1);        /* y1c x1c | y1a x1a */
 726     x1          = _mm256_unpackhi_pd(x1, y1);        /* y1d x1d | y1b x1b */
 727     y1          = _mm256_unpacklo_pd(z1, x2);        /* x2c z1c | x2a z1a */
 728     z1          = _mm256_unpackhi_pd(z1, x2);        /* x2d z1d | x2b z1b */
 729     x2          = _mm256_unpacklo_pd(y2, z2);        /* z2c y2c | z2a y2a */
 730     y2          = _mm256_unpackhi_pd(y2, z2);        /* z2d y2d | z2b y2b */
 731     z2          = _mm256_unpacklo_pd(x3, y3);        /* y3c x3c | y3a x3a */
 732     x3          = _mm256_unpackhi_pd(x3, y3);        /* y3d x3d | y3b x3b */
 733     y3          = _mm256_unpacklo_pd(z3, x4);        /* x4c z3c | x4a z3a */
 734     z3          = _mm256_unpackhi_pd(z3, x4);        /* x4d z3d | x4b z3b */
 735     x4          = _mm256_unpacklo_pd(y4, z4);        /* z4c y4c | z4a y4a */
 736     y4          = _mm256_unpackhi_pd(y4, z4);        /* z4d y4d | z4b y4b */
 737
 738     z4          = gmx_mm256_unpack128lo_pd(t13, y1); /* x2a z1a | y1a x1a */
 739     t13         = gmx_mm256_unpack128hi_pd(t13, y1); /* x2c z1c | y1c x1c */
 740     y1          = gmx_mm256_unpack128lo_pd(x1, z1);  /* x2b z1b | y1b x1b */
 741     x1          = gmx_mm256_unpack128hi_pd(x1, z1);  /* x2d z1d | y1d x1d */
 742     z1          = gmx_mm256_unpack128lo_pd(x2, z2);  /* y3a x3a | z2a y2a */
 743     x2          = gmx_mm256_unpack128hi_pd(x2, z2);  /* y3c x3c | z2c y2c */
 744     z2          = gmx_mm256_unpack128lo_pd(y2, x3);  /* y3b x3b | z2b y2b */
 745     y2          = gmx_mm256_unpack128hi_pd(y2, x3);  /* y3d x3d | z2d y2d */
 746     x3          = gmx_mm256_unpack128lo_pd(y3, x4);  /* z4a y4a | x4a z3a */
 747     y3          = gmx_mm256_unpack128hi_pd(y3, x4);  /* z4c y4c | x4c z3c */
 748     x4          = gmx_mm256_unpack128lo_pd(z3, y4);  /* z4b y4b | x4b z3b */
 749     z3          = gmx_mm256_unpack128hi_pd(z3, y4);  /* z4d y4d | x4d z3d */
 750
 751     t1          = _mm256_sub_pd(t1, z4);
 752     t2          = _mm256_sub_pd(t2, y1);
 753     t3          = _mm256_sub_pd(t3, t13);
 754     t4          = _mm256_sub_pd(t4, x1);
 755     t5          = _mm256_sub_pd(t5, z1);
 756     t6          = _mm256_sub_pd(t6, z2);
 757     t7          = _mm256_sub_pd(t7, x2);
 758     t8          = _mm256_sub_pd(t8, y2);
 759     t9          = _mm256_sub_pd(t9, x3);
 760     t10         = _mm256_sub_pd(t10, x4);
 761     t11         = _mm256_sub_pd(t11, y3);
 762     t12         = _mm256_sub_pd(t12, z3);
 763
 764     /* Here we store a full 256-bit value and a separate 128-bit one; no overlap can happen */
 765     _mm256_storeu_pd(ptrA, t1);
 766     _mm256_storeu_pd(ptrB, t2);
 767     _mm256_storeu_pd(ptrC, t3);
 768     _mm256_storeu_pd(ptrD, t4);
 769     _mm256_storeu_pd(ptrA+4, t5);
 770     _mm256_storeu_pd(ptrB+4, t6);
 771     _mm256_storeu_pd(ptrC+4, t7);
 772     _mm256_storeu_pd(ptrD+4, t8);
 773     _mm256_storeu_pd(ptrA+8, t9);
 774     _mm256_storeu_pd(ptrB+8, t10);
 775     _mm256_storeu_pd(ptrC+8, t11);
 776     _mm256_storeu_pd(ptrD+8, t12);
 777 }
 778
 779
 780
 781 static gmx_inline void gmx_simdcall
 782 gmx_mm256_update_iforce_1atom_swizzle_pd(__m256d fix1, __m256d fiy1, __m256d fiz1,
 783                                          double * gmx_restrict fptr,
 784                                          double * gmx_restrict fshiftptr)
 785 {
 786     __m256d t1, t2;
 787     __m128d tA, tB;
 788     fix1 = _mm256_hadd_pd(fix1, fiy1);
 789     fiz1 = _mm256_hadd_pd(fiz1, _mm256_setzero_pd());
 790
 791     /* Add across the two lanes */
 792     tA   = _mm_add_pd(_mm256_castpd256_pd128(fix1), _mm256_extractf128_pd(fix1, 0x1));
 793     tB   = _mm_add_pd(_mm256_castpd256_pd128(fiz1), _mm256_extractf128_pd(fiz1, 0x1));
 794
 795     fix1 = gmx_mm256_set_m128d(tB, tA); /* 0 fiz fiy fix */
 796
 797     t1   = _mm256_loadu_pd(fptr);
 798     t2   = _mm256_loadu_pd(fshiftptr);
 799
 800     t1   = _mm256_add_pd(t1, fix1);
 801     t2   = _mm256_add_pd(t2, fix1);
 802
 803     _mm256_storeu_pd(fptr, t1);
 804     _mm256_storeu_pd(fshiftptr, t2);
 805 }
 806
 807
 808
 809
 810 static gmx_inline void gmx_simdcall
 811 gmx_mm256_update_iforce_3atom_swizzle_pd(__m256d fix1, __m256d fiy1, __m256d fiz1,
 812                                          __m256d fix2, __m256d fiy2, __m256d fiz2,
 813                                          __m256d fix3, __m256d fiy3, __m256d fiz3,
 814                                          double * gmx_restrict fptr,
 815                                          double * gmx_restrict fshiftptr)
 816 {
 817     __m256d t1, t2, t3, t4;
 818     __m128d tz3, tA, tB, tC, tD;
 819
 820     fix1 = _mm256_hadd_pd(fix1, fiy1);                /*  Y1c-d X1c-d | Y1a-b X1a-b */
 821     fiz1 = _mm256_hadd_pd(fiz1, fix2);                /*  X2c-d Z1c-d | X2a-b Z1a-b */
 822     fiy2 = _mm256_hadd_pd(fiy2, fiz2);                /*  Z2c-d Y2c-d | Z2a-b Y2a-b */
 823     fix3 = _mm256_hadd_pd(fix3, fiy3);                /*  Y3c-d X3c-d | Y3a-b X3a-b */
 824     fiz3 = _mm256_hadd_pd(fiz3, _mm256_setzero_pd()); /*  0     Z3c-d | 0     Z3a-b */
 825
 826     /* Add across the two lanes by swapping and adding back */
 827     t1   = gmx_mm256_unpack128lo_pd(fix1, fiz1);                                       /* X2a-b Z1a-b | Y1a-b X1a-b */
 828     t2   = gmx_mm256_unpack128hi_pd(fix1, fiz1);                                       /* X2c-d Z1c-d | Y1c-d X1c-d */
 829     t1   = _mm256_add_pd(t1, t2);                                                      /* x2 z1 | y1 x1 */
 830
 831     t3   = gmx_mm256_unpack128lo_pd(fiy2, fix3);                                       /* Y3a-b X3a-b | Z2a-b Y2a-b */
 832     t4   = gmx_mm256_unpack128hi_pd(fiy2, fix3);                                       /* Y3c-d X3c-d | Z2c-d Y2c-d */
 833     t3   = _mm256_add_pd(t3, t4);                                                      /* y3 x3 | z2 y2 */
 834
 835     tz3  = _mm_add_pd(_mm256_castpd256_pd128(fiz3), _mm256_extractf128_pd(fiz3, 0x1)); /* 0 z3 */
 836
 837     t2   = _mm256_loadu_pd(fptr);
 838     t4   = _mm256_loadu_pd(fptr+4);
 839     tA   = _mm_load_sd(fptr+8);
 840
 841     t2   = _mm256_add_pd(t2, t1);
 842     t4   = _mm256_add_pd(t4, t3);
 843     tA   = _mm_add_sd(tA, tz3);
 844
 845     _mm256_storeu_pd(fptr, t2);
 846     _mm256_storeu_pd(fptr+4, t4);
 847     _mm_store_sd(fptr+8, tA);
 848
 849     /* Add up shift force */
 850     /* t1:   x2 z1 | y1 x1 */
 851     /* t3:   y3 x3 | z2 y2 */
 852     /* tz3:           0 z3 */
 853
 854     /* z component */
 855     tB   = _mm256_extractf128_pd(t1, 0x1);                                     /* x2 z1 */
 856     tC   = _mm256_extractf128_pd(t3, 0x1);                                     /* y3 x3 */
 857     tz3  = _mm_add_sd(tz3, tB);                                                /* 0  z1+z3 */
 858     tD   = _mm_permute_pd(_mm256_castpd256_pd128(t3), _GMX_MM_PERMUTE128D(1, 1));
 859     tz3  = _mm_add_sd(tz3, tD);                                                /* - z */
 860
 861     tC   = _mm_add_pd(tC, _mm256_castpd256_pd128(t1));                         /* y1+y3 x1+x3 */
 862
 863     tD   = _mm_shuffle_pd(tB, _mm256_castpd256_pd128(t3), _MM_SHUFFLE2(0, 1)); /* y2 x2 */
 864     tC   = _mm_add_pd(tC, tD);                                                 /* y x */
 865
 866     tA   = _mm_loadu_pd(fshiftptr);
 867     tB   = _mm_load_sd(fshiftptr+2);
 868     tA   = _mm_add_pd(tA, tC);
 869     tB   = _mm_add_sd(tB, tz3);
 870     _mm_storeu_pd(fshiftptr, tA);
 871     _mm_store_sd(fshiftptr+2, tB);
 872 }
 873
 874
 875 static gmx_inline void gmx_simdcall
 876 gmx_mm256_update_iforce_4atom_swizzle_pd(__m256d fix1, __m256d fiy1, __m256d fiz1,
 877                                          __m256d fix2, __m256d fiy2, __m256d fiz2,
 878                                          __m256d fix3, __m256d fiy3, __m256d fiz3,
 879                                          __m256d fix4, __m256d fiy4, __m256d fiz4,
 880                                          double * gmx_restrict fptr,
 881                                          double * gmx_restrict fshiftptr)
 882 {
 883     __m256d t1, t2, t3, t4, t5, t6;
 884     __m128d tA, tB, tC, tD;
 885
 886     fix1 = _mm256_hadd_pd(fix1, fiy1);                /*  Y1c-d X1c-d | Y1a-b X1a-b */
 887     fiz1 = _mm256_hadd_pd(fiz1, fix2);                /*  X2c-d Z1c-d | X2a-b Z1a-b */
 888     fiy2 = _mm256_hadd_pd(fiy2, fiz2);                /*  Z2c-d Y2c-d | Z2a-b Y2a-b */
 889     fix3 = _mm256_hadd_pd(fix3, fiy3);                /*  Y3c-d X3c-d | Y3a-b X3a-b */
 890     fiz3 = _mm256_hadd_pd(fiz3, fix4);                /*  X4c-d Z3c-d | X4a-b Z3a-b */
 891     fiy4 = _mm256_hadd_pd(fiy4, fiz4);                /*  Z4c-d Y4c-d | Z4a-b Y4a-b */
 892
 893     /* Add across the two lanes by swapping and adding back */
 894     t1   = gmx_mm256_unpack128lo_pd(fix1, fiz1); /* X2a-b Z1a-b | Y1a-b X1a-b */
 895     t2   = gmx_mm256_unpack128hi_pd(fix1, fiz1); /* X2c-d Z1c-d | Y1c-d X1c-d */
 896     t1   = _mm256_add_pd(t1, t2);                /* x2 z1 | y1 x1 */
 897
 898     t3   = gmx_mm256_unpack128lo_pd(fiy2, fix3); /* Y3a-b X3a-b | Z2a-b Y2a-b */
 899     t4   = gmx_mm256_unpack128hi_pd(fiy2, fix3); /* Y3c-d X3c-d | Z2c-d Y2c-d */
 900     t3   = _mm256_add_pd(t3, t4);                /* y3 x3 | z2 y2 */
 901
 902     t5   = gmx_mm256_unpack128lo_pd(fiz3, fiy4); /* Z4a-b Y4a-b | X4a-b Z3a-b */
 903     t6   = gmx_mm256_unpack128hi_pd(fiz3, fiy4); /* Z4c-d Y4c-d | X4c-d Z3c-d */
 904     t5   = _mm256_add_pd(t5, t6);                /* z4 y4 | x4 z3 */
 905
 906     t2   = _mm256_loadu_pd(fptr);
 907     t4   = _mm256_loadu_pd(fptr+4);
 908     t6   = _mm256_loadu_pd(fptr+8);
 909
 910     t2   = _mm256_add_pd(t2, t1);
 911     t4   = _mm256_add_pd(t4, t3);
 912     t6   = _mm256_add_pd(t6, t5);
 913
 914     _mm256_storeu_pd(fptr, t2);
 915     _mm256_storeu_pd(fptr+4, t4);
 916     _mm256_storeu_pd(fptr+8, t6);
 917
 918     /* Add up shift force  */
 919     /* t1:   x2. z1. | y1. x1. */
 920     /* t3:   y3. x3. | z2 y2 */
 921     /* t5:   z4 y4 | x4. z3. */
 922
 923     /* z component */
 924     tA   = _mm256_extractf128_pd(t1, 0x1);                /* x2 z1 */
 925     tB   = _mm256_extractf128_pd(t3, 0x1);                /* y3 x3 */
 926     tC   = _mm256_extractf128_pd(t5, 0x1);                /* z4 y4 */
 927
 928     tB   = _mm_add_pd(tB, _mm256_castpd256_pd128(t1));    /*  y1+y3  x1+x3 */
 929     tA   = _mm_add_pd(tA, _mm256_castpd256_pd128(t5));    /*  x2+x4  z1+z3 */
 930     tC   = _mm_add_pd(tC, _mm256_castpd256_pd128(t3));    /*  z4+z2  y4+y2 */
 931
 932     tD   = _mm_shuffle_pd(tA, tC, _MM_SHUFFLE2(0, 1));    /* y4+y2 x2+x4 */
 933     tB   = _mm_add_pd(tB, tD);                            /* y x */
 934     tC   = _mm_permute_pd(tC, _GMX_MM_PERMUTE128D(1, 1)); /*    - z4+z2 */
 935     tC   = _mm_add_sd(tC, tA);                            /* - z */
 936
 937     tA   = _mm_loadu_pd(fshiftptr);
 938     tD   = _mm_load_sd(fshiftptr+2);
 939     tA   = _mm_add_pd(tA, tB);
 940     tD   = _mm_add_sd(tD, tC);
 941     _mm_storeu_pd(fshiftptr, tA);
 942     _mm_store_sd(fshiftptr+2, tD);
 943 }
 944
 945
 946 static gmx_inline void gmx_simdcall
 947 gmx_mm256_update_1pot_pd(__m256d pot1, double * gmx_restrict ptrA)
 948 {
 949     __m128d t1;
 950
 951     pot1 = _mm256_hadd_pd(pot1, pot1);
 952
 953     t1   = _mm_add_pd(_mm256_castpd256_pd128(pot1), _mm256_extractf128_pd(pot1, 0x1));
 954
 955     _mm_store_sd(ptrA, _mm_add_sd(_mm_load_sd(ptrA), t1));
 956 }
 957
 958 static gmx_inline void gmx_simdcall
 959 gmx_mm256_update_2pot_pd(__m256d pot1, double * gmx_restrict ptrA,
 960                          __m256d pot2, double * gmx_restrict ptrB)
 961 {
 962     __m128d t1, t2;
 963
 964     pot1 = _mm256_hadd_pd(pot1, pot2);
 965
 966     t1   = _mm_add_pd(_mm256_castpd256_pd128(pot1), _mm256_extractf128_pd(pot1, 0x1));
 967
 968     t2   = _mm_permute_pd(t1, _GMX_MM_PERMUTE128D(1, 1));
 969     _mm_store_sd(ptrA, _mm_add_sd(_mm_load_sd(ptrA), t1));
 970     _mm_store_sd(ptrB, _mm_add_sd(_mm_load_sd(ptrB), t2));
 971 }
 972
 973
 974 #endif /* _kernelutil_x86_avx_256_double_h_ */