src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_double/kernelutil_x86_avx_256_double.h

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35 #ifndef _kernelutil_x86_avx_256_double_h_
  36 #define _kernelutil_x86_avx_256_double_h_
  37
  38 #define gmx_mm_castsi128_ps(a) _mm_castsi128_ps(a)
  39
  40 #define _GMX_MM_BLEND256D(b3, b2, b1, b0) (((b3) << 3) | ((b2) << 2) | ((b1) << 1) | ((b0)))
  41 #define _GMX_MM_PERMUTE(fp3, fp2, fp1, fp0) (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | ((fp0)))
  42 #define _GMX_MM_PERMUTE128D(fp1, fp0)         (((fp1) << 1) | ((fp0)))
  43 #define _GMX_MM_PERMUTE256D(fp3, fp2, fp1, fp0) (((fp3) << 3) | ((fp2) << 2) | ((fp1) << 1) | ((fp0)))
  44 #define GMX_MM256_FULLTRANSPOSE4_PD(row0, row1, row2, row3) \
  45     {                                                        \
  46         __m256d _t0, _t1, _t2, _t3;                          \
  47         _t0  = _mm256_unpacklo_pd((row0), (row1));           \
  48         _t1  = _mm256_unpackhi_pd((row0), (row1));           \
  49         _t2  = _mm256_unpacklo_pd((row2), (row3));           \
  50         _t3  = _mm256_unpackhi_pd((row2), (row3));           \
  51         row0 = _mm256_permute2f128_pd(_t0, _t2, 0x20);       \
  52         row1 = _mm256_permute2f128_pd(_t1, _t3, 0x20);       \
  53         row2 = _mm256_permute2f128_pd(_t0, _t2, 0x31);       \
  54         row3 = _mm256_permute2f128_pd(_t1, _t3, 0x31);       \
  55     }
  56
  57 #define gmx_mm_extract_epi32(x, imm) _mm_extract_epi32((x), (imm))
  58
  59 static __m256d
  60 gmx_mm256_unpack128lo_pd(__m256d xmm1, __m256d xmm2)
  61 {
  62     return _mm256_permute2f128_pd(xmm1, xmm2, 0x20);
  63 }
  64
  65 static __m256d
  66 gmx_mm256_unpack128hi_pd(__m256d xmm1, __m256d xmm2)
  67 {
  68     return _mm256_permute2f128_pd(xmm1, xmm2, 0x31);
  69 }
  70
  71 static __m256d
  72 gmx_mm256_set_m128d(__m128d hi, __m128d lo)
  73 {
  74     return _mm256_insertf128_pd(_mm256_castpd128_pd256(lo), hi, 0x1);
  75 }
  76
  77 static gmx_inline __m256
  78 gmx_mm256_set_m128(__m128 hi, __m128 lo)
  79 {
  80     return _mm256_insertf128_ps(_mm256_castps128_ps256(lo), hi, 0x1);
  81 }
  82
  83 static int
  84 gmx_mm256_any_lt(__m256d a, __m256d b)
  85 {
  86     return _mm256_movemask_pd(_mm256_cmp_pd(a, b, _CMP_LT_OQ));
  87 }
  88
  89 static gmx_inline __m256d
  90 gmx_mm256_calc_rsq_pd(__m256d dx, __m256d dy, __m256d dz)
  91 {
  92     return _mm256_add_pd( _mm256_add_pd( _mm256_mul_pd(dx, dx), _mm256_mul_pd(dy, dy) ), _mm256_mul_pd(dz, dz) );
  93 }
  94
  95 /* Normal sum of four ymm registers */
  96 #define gmx_mm256_sum4_pd(t0, t1, t2, t3)  _mm256_add_pd(_mm256_add_pd(t0, t1), _mm256_add_pd(t2, t3))
  97
  98
  99 /* Load a single value from 1-4 places, merge into xmm register */
 100 static __m256d
 101 gmx_mm256_load_1real_pd(const double * gmx_restrict ptrA)
 102 {
 103     return _mm256_castpd128_pd256(_mm_load_sd(ptrA));
 104 }
 105
 106 static __m256d
 107 gmx_mm256_load_2real_swizzle_pd(const double * gmx_restrict ptrA,
 108                                 const double * gmx_restrict ptrB)
 109 {
 110     __m128d tA, tB;
 111
 112     tA = _mm_load_sd(ptrA);
 113     tB = _mm_load_sd(ptrB);
 114
 115     return _mm256_castpd128_pd256(_mm_unpacklo_pd(tA, tB));
 116 }
 117
 118
 119 static __m256d
 120 gmx_mm256_load_4real_swizzle_pd(const double * gmx_restrict ptrA, const double * gmx_restrict ptrB,
 121                                 const double * gmx_restrict ptrC, const double * gmx_restrict ptrD)
 122 {
 123     __m128d t1, t2;
 124
 125     t1 = _mm_unpacklo_pd(_mm_load_sd(ptrA), _mm_load_sd(ptrB));
 126     t2 = _mm_unpacklo_pd(_mm_load_sd(ptrC), _mm_load_sd(ptrD));
 127     return gmx_mm256_set_m128d(t2, t1);
 128 }
 129
 130
 131
 132 static void
 133 gmx_mm256_store_1real_pd(double * gmx_restrict ptrA, __m256d xmm1)
 134 {
 135     _mm_store_sd(ptrA, _mm256_castpd256_pd128(xmm1));
 136 }
 137
 138
 139 static void
 140 gmx_mm256_store_2real_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB, __m256d xmm1)
 141 {
 142     __m256d t2;
 143
 144     t2       = _mm256_permute_pd(xmm1, _GMX_MM_PERMUTE256D(1, 1, 1, 1));
 145     _mm_store_sd(ptrA, _mm256_castpd256_pd128(xmm1));
 146     _mm_store_sd(ptrB, _mm256_castpd256_pd128(t2));
 147 }
 148
 149
 150
 151
 152 static void
 153 gmx_mm256_store_4real_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
 154                                  double * gmx_restrict ptrC, double * gmx_restrict ptrD, __m256d xmm1)
 155 {
 156     __m256d t2;
 157     __m128d t3, t4;
 158
 159     t2       = _mm256_permute_pd(xmm1, _GMX_MM_PERMUTE256D(1, 1, 1, 1));
 160     t3       = _mm256_extractf128_pd(xmm1, 0x1);
 161     t4       = _mm_permute_pd(t3, _GMX_MM_PERMUTE128D(1, 1));
 162     _mm_store_sd(ptrA, _mm256_castpd256_pd128(xmm1));
 163     _mm_store_sd(ptrB, _mm256_castpd256_pd128(t2));
 164     _mm_store_sd(ptrC, t3);
 165     _mm_store_sd(ptrD, t4);
 166 }
 167
 168
 169
 170
 171 static void
 172 gmx_mm256_increment_1real_pd(double * gmx_restrict ptrA, __m256d xmm1)
 173 {
 174     __m128d t1;
 175
 176     t1   = _mm256_castpd256_pd128(xmm1);
 177     t1   = _mm_add_sd(t1, _mm_load_sd(ptrA));
 178
 179     _mm_store_sd(ptrA, t1);
 180 }
 181
 182
 183 static void
 184 gmx_mm256_increment_2real_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB, __m256d xmm1)
 185 {
 186     __m128d t1, t2;
 187
 188     t1   = _mm256_castpd256_pd128(xmm1);
 189     t2   = _mm_permute_pd(t1, _GMX_MM_PERMUTE128D(1, 1));
 190
 191     t1   = _mm_add_sd(t1, _mm_load_sd(ptrA));
 192     t2   = _mm_add_sd(t2, _mm_load_sd(ptrB));
 193
 194     _mm_store_sd(ptrA, t1);
 195     _mm_store_sd(ptrB, t2);
 196 }
 197
 198
 199 static void
 200 gmx_mm256_increment_4real_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
 201                                      double * gmx_restrict ptrC, double * gmx_restrict ptrD, __m256d xmm1)
 202 {
 203     __m128d t1, t2, t3, t4;
 204
 205     t1   = _mm256_castpd256_pd128(xmm1);
 206     t2   = _mm_permute_pd(t1, _GMX_MM_PERMUTE128D(1, 1));
 207     t3   = _mm256_extractf128_pd(xmm1, 0x1);
 208     t4   = _mm_permute_pd(t3, _GMX_MM_PERMUTE128D(1, 1));
 209
 210     t1   = _mm_add_sd(t1, _mm_load_sd(ptrA));
 211     t2   = _mm_add_sd(t2, _mm_load_sd(ptrB));
 212     t3   = _mm_add_sd(t3, _mm_load_sd(ptrC));
 213     t4   = _mm_add_sd(t4, _mm_load_sd(ptrD));
 214
 215     _mm_store_sd(ptrA, t1);
 216     _mm_store_sd(ptrB, t2);
 217     _mm_store_sd(ptrC, t3);
 218     _mm_store_sd(ptrD, t4);
 219 }
 220
 221
 222
 223 static void
 224 gmx_mm256_load_1pair_swizzle_pd(const double * gmx_restrict p1, __m256d *c6, __m256d *c12)
 225 {
 226     *c6     = _mm256_castpd128_pd256(_mm_load_sd(p1));
 227     *c12    = _mm256_castpd128_pd256(_mm_load_sd(p1+1));
 228 }
 229
 230
 231 static void
 232 gmx_mm256_load_2pair_swizzle_pd(const double * gmx_restrict p1, const double * gmx_restrict p2, __m256d *c6, __m256d *c12)
 233 {
 234     __m128d t1, t2, t3;
 235
 236     t1   = _mm_loadu_pd(p1);
 237     t2   = _mm_loadu_pd(p2);
 238     *c6  = _mm256_castpd128_pd256(_mm_unpacklo_pd(t1, t2));
 239     *c12 = _mm256_castpd128_pd256(_mm_unpackhi_pd(t1, t2));
 240 }
 241
 242
 243
 244 static void
 245 gmx_mm256_load_4pair_swizzle_pd(const double * gmx_restrict p1, const double * gmx_restrict p2,
 246                                 const double * gmx_restrict p3, const double * gmx_restrict p4,
 247                                 __m256d * gmx_restrict c6, __m256d * gmx_restrict c12)
 248 {
 249     __m256d t1, t2;
 250
 251     t1   = gmx_mm256_set_m128d(_mm_loadu_pd(p3), _mm_loadu_pd(p1)); /* c12c  c6c | c12a  c6a */
 252     t2   = gmx_mm256_set_m128d(_mm_loadu_pd(p4), _mm_loadu_pd(p2)); /* c12d  c6d | c12b  c6b */
 253
 254     *c6  = _mm256_unpacklo_pd(t1, t2);                              /* c6d c6c | c6b c6a */
 255     *c12 = _mm256_unpackhi_pd(t1, t2);                              /* c12d c12c | c12b c12a */
 256 }
 257
 258
 259 static gmx_inline void
 260 gmx_mm256_load_shift_and_1rvec_broadcast_pd(const double * gmx_restrict xyz_shift,
 261                                             const double * gmx_restrict xyz,
 262                                             __m256d * gmx_restrict      x1,
 263                                             __m256d * gmx_restrict      y1,
 264                                             __m256d * gmx_restrict      z1)
 265 {
 266     __m128d mem_xy, mem_z, mem_sxy, mem_sz, tx, ty, tz;
 267
 268     mem_xy  = _mm_loadu_pd(xyz);
 269     mem_z   = _mm_load_sd(xyz+2);
 270     mem_sxy = _mm_loadu_pd(xyz_shift);
 271     mem_sz  = _mm_load_sd(xyz_shift+2);
 272
 273     mem_xy  = _mm_add_pd(mem_xy, mem_sxy);
 274     mem_z   = _mm_add_pd(mem_z, mem_sz);
 275
 276     tx  = _mm_shuffle_pd(mem_xy, mem_xy, _MM_SHUFFLE2(0, 0));
 277     ty  = _mm_shuffle_pd(mem_xy, mem_xy, _MM_SHUFFLE2(1, 1));
 278     tz  = _mm_shuffle_pd(mem_z, mem_z, _MM_SHUFFLE2(0, 0));
 279
 280     *x1 = gmx_mm256_set_m128d(tx, tx);
 281     *y1 = gmx_mm256_set_m128d(ty, ty);
 282     *z1 = gmx_mm256_set_m128d(tz, tz);
 283 }
 284
 285
 286 static gmx_inline void
 287 gmx_mm256_load_shift_and_3rvec_broadcast_pd(const double * gmx_restrict xyz_shift,
 288                                             const double * gmx_restrict xyz,
 289                                             __m256d * gmx_restrict x1, __m256d * gmx_restrict y1, __m256d * gmx_restrict z1,
 290                                             __m256d * gmx_restrict x2, __m256d * gmx_restrict y2, __m256d * gmx_restrict z2,
 291                                             __m256d * gmx_restrict x3, __m256d * gmx_restrict y3, __m256d * gmx_restrict z3)
 292 {
 293     __m128d t1, t2, t3, t4, t5, sxy, sz, szx, syz, tx, ty, tz;
 294
 295     t1  = _mm_loadu_pd(xyz);
 296     t2  = _mm_loadu_pd(xyz+2);
 297     t3  = _mm_loadu_pd(xyz+4);
 298     t4  = _mm_loadu_pd(xyz+6);
 299     t5  = _mm_load_sd(xyz+8);
 300
 301     sxy = _mm_loadu_pd(xyz_shift);
 302     sz  = _mm_load_sd(xyz_shift+2);
 303     szx = _mm_shuffle_pd(sz, sxy, _MM_SHUFFLE2(0, 0));
 304     syz = _mm_shuffle_pd(sxy, sz, _MM_SHUFFLE2(0, 1));
 305
 306     t1  = _mm_add_pd(t1, sxy);
 307     t2  = _mm_add_pd(t2, szx);
 308     t3  = _mm_add_pd(t3, syz);
 309     t4  = _mm_add_pd(t4, sxy);
 310     t5  = _mm_add_sd(t5, sz);
 311
 312     tx   = _mm_shuffle_pd(t1, t1, _MM_SHUFFLE2(0, 0));
 313     ty   = _mm_shuffle_pd(t1, t1, _MM_SHUFFLE2(1, 1));
 314     tz   = _mm_shuffle_pd(t2, t2, _MM_SHUFFLE2(0, 0));
 315     *x1  = gmx_mm256_set_m128d(tx, tx);
 316     *y1  = gmx_mm256_set_m128d(ty, ty);
 317     *z1  = gmx_mm256_set_m128d(tz, tz);
 318     tx   = _mm_shuffle_pd(t2, t2, _MM_SHUFFLE2(1, 1));
 319     ty   = _mm_shuffle_pd(t3, t3, _MM_SHUFFLE2(0, 0));
 320     tz   = _mm_shuffle_pd(t3, t3, _MM_SHUFFLE2(1, 1));
 321     *x2  = gmx_mm256_set_m128d(tx, tx);
 322     *y2  = gmx_mm256_set_m128d(ty, ty);
 323     *z2  = gmx_mm256_set_m128d(tz, tz);
 324     tx   = _mm_shuffle_pd(t4, t4, _MM_SHUFFLE2(0, 0));
 325     ty   = _mm_shuffle_pd(t4, t4, _MM_SHUFFLE2(1, 1));
 326     tz   = _mm_shuffle_pd(t5, t5, _MM_SHUFFLE2(0, 0));
 327     *x3  = gmx_mm256_set_m128d(tx, tx);
 328     *y3  = gmx_mm256_set_m128d(ty, ty);
 329     *z3  = gmx_mm256_set_m128d(tz, tz);
 330 }
 331
 332
 333 static gmx_inline void
 334 gmx_mm256_load_shift_and_4rvec_broadcast_pd(const double * gmx_restrict xyz_shift,
 335                                             const double * gmx_restrict xyz,
 336                                             __m256d * gmx_restrict x1, __m256d * gmx_restrict y1, __m256d * gmx_restrict z1,
 337                                             __m256d * gmx_restrict x2, __m256d * gmx_restrict y2, __m256d * gmx_restrict z2,
 338                                             __m256d * gmx_restrict x3, __m256d * gmx_restrict y3, __m256d * gmx_restrict z3,
 339                                             __m256d * gmx_restrict x4, __m256d * gmx_restrict y4, __m256d * gmx_restrict z4)
 340 {
 341     __m128d t1, t2, t3, t4, t5, t6, sxy, sz, szx, syz, tx, ty, tz;
 342
 343     t1  = _mm_loadu_pd(xyz);
 344     t2  = _mm_loadu_pd(xyz+2);
 345     t3  = _mm_loadu_pd(xyz+4);
 346     t4  = _mm_loadu_pd(xyz+6);
 347     t5  = _mm_loadu_pd(xyz+8);
 348     t6  = _mm_loadu_pd(xyz+10);
 349
 350     sxy = _mm_loadu_pd(xyz_shift);
 351     sz  = _mm_load_sd(xyz_shift+2);
 352     szx = _mm_shuffle_pd(sz, sxy, _MM_SHUFFLE2(0, 0));
 353     syz = _mm_shuffle_pd(sxy, sz, _MM_SHUFFLE2(0, 1));
 354
 355     t1  = _mm_add_pd(t1, sxy);
 356     t2  = _mm_add_pd(t2, szx);
 357     t3  = _mm_add_pd(t3, syz);
 358     t4  = _mm_add_pd(t4, sxy);
 359     t5  = _mm_add_pd(t5, szx);
 360     t6  = _mm_add_pd(t6, syz);
 361
 362     tx   = _mm_shuffle_pd(t1, t1, _MM_SHUFFLE2(0, 0));
 363     ty   = _mm_shuffle_pd(t1, t1, _MM_SHUFFLE2(1, 1));
 364     tz   = _mm_shuffle_pd(t2, t2, _MM_SHUFFLE2(0, 0));
 365     *x1  = gmx_mm256_set_m128d(tx, tx);
 366     *y1  = gmx_mm256_set_m128d(ty, ty);
 367     *z1  = gmx_mm256_set_m128d(tz, tz);
 368     tx   = _mm_shuffle_pd(t2, t2, _MM_SHUFFLE2(1, 1));
 369     ty   = _mm_shuffle_pd(t3, t3, _MM_SHUFFLE2(0, 0));
 370     tz   = _mm_shuffle_pd(t3, t3, _MM_SHUFFLE2(1, 1));
 371     *x2  = gmx_mm256_set_m128d(tx, tx);
 372     *y2  = gmx_mm256_set_m128d(ty, ty);
 373     *z2  = gmx_mm256_set_m128d(tz, tz);
 374     tx   = _mm_shuffle_pd(t4, t4, _MM_SHUFFLE2(0, 0));
 375     ty   = _mm_shuffle_pd(t4, t4, _MM_SHUFFLE2(1, 1));
 376     tz   = _mm_shuffle_pd(t5, t5, _MM_SHUFFLE2(0, 0));
 377     *x3  = gmx_mm256_set_m128d(tx, tx);
 378     *y3  = gmx_mm256_set_m128d(ty, ty);
 379     *z3  = gmx_mm256_set_m128d(tz, tz);
 380     tx   = _mm_shuffle_pd(t5, t5, _MM_SHUFFLE2(1, 1));
 381     ty   = _mm_shuffle_pd(t6, t6, _MM_SHUFFLE2(0, 0));
 382     tz   = _mm_shuffle_pd(t6, t6, _MM_SHUFFLE2(1, 1));
 383     *x4  = gmx_mm256_set_m128d(tx, tx);
 384     *y4  = gmx_mm256_set_m128d(ty, ty);
 385     *z4  = gmx_mm256_set_m128d(tz, tz);
 386 }
 387
 388
 389 static void
 390 gmx_mm256_load_1rvec_1ptr_swizzle_pd(const double * gmx_restrict p1,
 391                                      __m256d * gmx_restrict x, __m256d * gmx_restrict y, __m256d * gmx_restrict z)
 392 {
 393     __m256d t1;
 394
 395     t1            = _mm256_loadu_pd(p1);
 396     *x            = t1;
 397     *y            = _mm256_permute_pd(t1, _GMX_MM_PERMUTE256D(0, 1, 0, 1));
 398     *z            = _mm256_castpd128_pd256(_mm256_extractf128_pd(t1, 0x1));
 399 }
 400
 401
 402 static void
 403 gmx_mm256_load_3rvec_1ptr_swizzle_pd(const double * gmx_restrict p1,
 404                                      __m256d * gmx_restrict x1, __m256d * gmx_restrict y1, __m256d * gmx_restrict z1,
 405                                      __m256d * gmx_restrict x2, __m256d * gmx_restrict y2, __m256d * gmx_restrict z2,
 406                                      __m256d * gmx_restrict x3, __m256d * gmx_restrict y3, __m256d * gmx_restrict z3)
 407 {
 408     __m256d t1, t2, t3, t4;
 409
 410     t1            = _mm256_loadu_pd(p1);
 411     t3            = _mm256_loadu_pd(p1+4);
 412     *x1           = t1;
 413     *y2           = t3;
 414     t2            = gmx_mm256_unpack128hi_pd(t1, t1);
 415     t4            = gmx_mm256_unpack128hi_pd(t3, t3);
 416     *z1           = t2;
 417     *x3           = t4;
 418     *y1           = _mm256_permute_pd(t1, _GMX_MM_PERMUTE256D(0, 1, 0, 1));
 419     *z2           = _mm256_permute_pd(t3, _GMX_MM_PERMUTE256D(0, 1, 0, 1));
 420     *x2           = _mm256_permute_pd(t2, _GMX_MM_PERMUTE256D(0, 1, 0, 1));
 421     *y3           = _mm256_permute_pd(t4, _GMX_MM_PERMUTE256D(0, 1, 0, 1));
 422     *z3           = _mm256_castpd128_pd256(_mm_load_sd(p1+8));
 423 }
 424
 425 static void
 426 gmx_mm256_load_4rvec_1ptr_swizzle_pd(const double * gmx_restrict p1,
 427                                      __m256d * gmx_restrict x1, __m256d * gmx_restrict y1, __m256d * gmx_restrict z1,
 428                                      __m256d * gmx_restrict x2, __m256d * gmx_restrict y2, __m256d * gmx_restrict z2,
 429                                      __m256d * gmx_restrict x3, __m256d * gmx_restrict y3, __m256d * gmx_restrict z3,
 430                                      __m256d * gmx_restrict x4, __m256d * gmx_restrict y4, __m256d * gmx_restrict z4)
 431 {
 432     __m256d t1, t2, t3, t4, t5, t6;
 433
 434     t1            = _mm256_loadu_pd(p1);
 435     t2            = _mm256_loadu_pd(p1+4);
 436     t3            = _mm256_loadu_pd(p1+8);
 437
 438     t4            = _mm256_castpd128_pd256(_mm256_extractf128_pd(t1, 0x1));
 439     t5            = _mm256_castpd128_pd256(_mm256_extractf128_pd(t2, 0x1));
 440     t6            = _mm256_castpd128_pd256(_mm256_extractf128_pd(t3, 0x1));
 441
 442     *x1           = t1;
 443     *y2           = t2;
 444     *z3           = t3;
 445     *z1           = t4;
 446     *x3           = t5;
 447     *y4           = t6;
 448
 449     *y1           = _mm256_permute_pd(t1, _GMX_MM_PERMUTE256D(0, 1, 0, 1));
 450     *z2           = _mm256_permute_pd(t2, _GMX_MM_PERMUTE256D(0, 1, 0, 1));
 451     *x4           = _mm256_permute_pd(t3, _GMX_MM_PERMUTE256D(0, 1, 0, 1));
 452     *x2           = _mm256_permute_pd(t4, _GMX_MM_PERMUTE256D(0, 1, 0, 1));
 453     *y3           = _mm256_permute_pd(t5, _GMX_MM_PERMUTE256D(0, 1, 0, 1));
 454     *z4           = _mm256_permute_pd(t6, _GMX_MM_PERMUTE256D(0, 1, 0, 1));
 455 }
 456
 457
 458 static void
 459 gmx_mm256_load_1rvec_4ptr_swizzle_pd(const double * gmx_restrict ptrA, const double * gmx_restrict ptrB,
 460                                      const double * gmx_restrict ptrC, const double * gmx_restrict ptrD,
 461                                      __m256d * gmx_restrict x1, __m256d * gmx_restrict y1, __m256d * gmx_restrict z1)
 462 {
 463     __m256d t1, t2, t3, t4, t5, t6;
 464
 465     t1           = _mm256_loadu_pd(ptrA);        /*   -  z1a | y1a x1a */
 466     t2           = _mm256_loadu_pd(ptrB);        /*   -  z1b | y1b x1b */
 467     t3           = _mm256_loadu_pd(ptrC);        /*   -  z1c | y1c x1c */
 468     t4           = _mm256_loadu_pd(ptrD);        /*   -  z1d | y1d x1d */
 469
 470     t5           = _mm256_unpacklo_pd(t1, t2);   /*  z1b z1a | x1b x1a */
 471     t6           = _mm256_unpackhi_pd(t1, t2);   /*   -   -  | y1b y1a */
 472     t1           = _mm256_unpacklo_pd(t3, t4);   /*  z1c z1c | x1d x1c */
 473     t2           = _mm256_unpackhi_pd(t3, t4);   /*   -   -  | y1d y1c */
 474
 475     *x1          = gmx_mm256_unpack128lo_pd(t5, t1);
 476     *y1          = gmx_mm256_unpack128lo_pd(t6, t2);
 477     *z1          = gmx_mm256_unpack128hi_pd(t5, t1);
 478 }
 479
 480
 481
 482 static void
 483 gmx_mm256_load_3rvec_4ptr_swizzle_pd(const double * gmx_restrict ptrA, const double * gmx_restrict ptrB,
 484                                      const double * gmx_restrict ptrC, const double * gmx_restrict ptrD,
 485                                      __m256d * gmx_restrict x1, __m256d * gmx_restrict y1, __m256d * gmx_restrict z1,
 486                                      __m256d * gmx_restrict x2, __m256d * gmx_restrict y2, __m256d * gmx_restrict z2,
 487                                      __m256d * gmx_restrict x3, __m256d * gmx_restrict y3, __m256d * gmx_restrict z3)
 488 {
 489     __m256d t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14;
 490
 491     t1           = _mm256_loadu_pd(ptrA);                       /*  x2a z1a | y1a x1a */
 492     t2           = _mm256_loadu_pd(ptrB);                       /*  x2b z1b | y1b x1b */
 493     t3           = _mm256_loadu_pd(ptrC);                       /*  x2c z1c | y1c x1c */
 494     t4           = _mm256_loadu_pd(ptrD);                       /*  x2d z1d | y1d x1d */
 495     t5           = _mm256_loadu_pd(ptrA+4);                     /*  y3a x3a | z2a y2a */
 496     t6           = _mm256_loadu_pd(ptrB+4);                     /*  y3b x3b | z2b y2b */
 497     t7           = _mm256_loadu_pd(ptrC+4);                     /*  y3c x3c | z2c y2c */
 498     t8           = _mm256_loadu_pd(ptrD+4);                     /*  y3d x3d | z2d y2d */
 499     t9           = _mm256_castpd128_pd256(_mm_load_sd(ptrA+8)); /*   -   -  |  -  z3a */
 500     t10          = _mm256_castpd128_pd256(_mm_load_sd(ptrB+8)); /*   -   -  |  -  z3b */
 501     t11          = _mm256_castpd128_pd256(_mm_load_sd(ptrC+8)); /*   -   -  |  -  z3c */
 502     t12          = _mm256_castpd128_pd256(_mm_load_sd(ptrD+8)); /*   -   -  |  -  z3d */
 503
 504     t13          = _mm256_unpacklo_pd(t1, t2);                  /*  z1b z1a | x1b x1a */
 505     t14          = _mm256_unpackhi_pd(t1, t2);                  /*  x2b x2a | y1b y1a */
 506     t1           = _mm256_unpacklo_pd(t3, t4);                  /*  z1d z1c | x1d x1c */
 507     t2           = _mm256_unpackhi_pd(t3, t4);                  /*  x2d x2c | y1d y1c */
 508
 509     t3           = _mm256_unpacklo_pd(t5, t6);                  /*  x3b x3a | y2b y2a */
 510     t4           = _mm256_unpackhi_pd(t5, t6);                  /*  y3b y3a | z2b z2a */
 511     t5           = _mm256_unpacklo_pd(t7, t8);                  /*  x3d x3c | y2d y2c */
 512     t6           = _mm256_unpackhi_pd(t7, t8);                  /*  y3d y3c | z2d z2c */
 513
 514     t9           = _mm256_unpacklo_pd(t9, t10);                 /*   -   -  | z3b z3a */
 515     t11          = _mm256_unpacklo_pd(t11, t12);                /*   -   -  | z3d z3c */
 516
 517     *x1          = gmx_mm256_unpack128lo_pd(t13, t1);
 518     *y1          = gmx_mm256_unpack128lo_pd(t14, t2);
 519     *z1          = gmx_mm256_unpack128hi_pd(t13, t1);
 520     *x2          = gmx_mm256_unpack128hi_pd(t14, t2);
 521     *y2          = gmx_mm256_unpack128lo_pd(t3, t5);
 522     *z2          = gmx_mm256_unpack128lo_pd(t4, t6);
 523     *x3          = gmx_mm256_unpack128hi_pd(t3, t5);
 524     *y3          = gmx_mm256_unpack128hi_pd(t4, t6);
 525     *z3          = gmx_mm256_unpack128lo_pd(t9, t11);
 526 }
 527
 528
 529
 530 static void
 531 gmx_mm256_load_4rvec_4ptr_swizzle_pd(const double * gmx_restrict ptrA, const double * gmx_restrict ptrB,
 532                                      const double * gmx_restrict ptrC, const double * gmx_restrict ptrD,
 533                                      __m256d * gmx_restrict x1, __m256d * gmx_restrict y1, __m256d * gmx_restrict z1,
 534                                      __m256d * gmx_restrict x2, __m256d * gmx_restrict y2, __m256d * gmx_restrict z2,
 535                                      __m256d * gmx_restrict x3, __m256d * gmx_restrict y3, __m256d * gmx_restrict z3,
 536                                      __m256d * gmx_restrict x4, __m256d * gmx_restrict y4, __m256d * gmx_restrict z4)
 537 {
 538     __m256d t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14;
 539
 540     t1           = _mm256_loadu_pd(ptrA);        /*  x2a z1a | y1a x1a */
 541     t2           = _mm256_loadu_pd(ptrB);        /*  x2b z1b | y1b x1b */
 542     t3           = _mm256_loadu_pd(ptrC);        /*  x2c z1c | y1c x1c */
 543     t4           = _mm256_loadu_pd(ptrD);        /*  x2d z1d | y1d x1d */
 544     t5           = _mm256_loadu_pd(ptrA+4);      /*  y3a x3a | z2a y2a */
 545     t6           = _mm256_loadu_pd(ptrB+4);      /*  y3b x3b | z2b y2b */
 546     t7           = _mm256_loadu_pd(ptrC+4);      /*  y3c x3c | z2c y2c */
 547     t8           = _mm256_loadu_pd(ptrD+4);      /*  y3d x3d | z2d y2d */
 548     t9           = _mm256_loadu_pd(ptrA+8);      /*  z4a y4a | x4a z3a */
 549     t10          = _mm256_loadu_pd(ptrB+8);      /*  z4b y4b | x4b z3b */
 550     t11          = _mm256_loadu_pd(ptrC+8);      /*  z4c y4c | x4c z3c */
 551     t12          = _mm256_loadu_pd(ptrD+8);      /*  z4d y4d | x4d z3d */
 552
 553     t13          = _mm256_unpacklo_pd(t1, t2);   /*  z1b z1a | x1b x1a */
 554     t14          = _mm256_unpackhi_pd(t1, t2);   /*  x2b x2a | y1b y1a */
 555     t1           = _mm256_unpacklo_pd(t3, t4);   /*  z1d z1c | x1d x1c */
 556     t2           = _mm256_unpackhi_pd(t3, t4);   /*  x2d x2c | y1d y1c */
 557
 558     t3           = _mm256_unpacklo_pd(t5, t6);   /*  x3b x3a | y2b y2a */
 559     t4           = _mm256_unpackhi_pd(t5, t6);   /*  y3b y3a | z2b z2a */
 560     t5           = _mm256_unpacklo_pd(t7, t8);   /*  x3d x3c | y2d y2c */
 561     t6           = _mm256_unpackhi_pd(t7, t8);   /*  y3d y3c | z2d z2c */
 562
 563     t7           = _mm256_unpacklo_pd(t9, t10);  /*  y4b y4a | z3b z3a */
 564     t8           = _mm256_unpackhi_pd(t9, t10);  /*  z4b z4a | x4b x4a */
 565     t9           = _mm256_unpacklo_pd(t11, t12); /*  y4d y4c | z3d z3c */
 566     t10          = _mm256_unpackhi_pd(t11, t12); /*  z4d z4c | x4d x4c */
 567
 568     *x1          = gmx_mm256_unpack128lo_pd(t13, t1);
 569     *y1          = gmx_mm256_unpack128lo_pd(t14, t2);
 570     *z1          = gmx_mm256_unpack128hi_pd(t13, t1);
 571     *x2          = gmx_mm256_unpack128hi_pd(t14, t2);
 572     *y2          = gmx_mm256_unpack128lo_pd(t3, t5);
 573     *z2          = gmx_mm256_unpack128lo_pd(t4, t6);
 574     *x3          = gmx_mm256_unpack128hi_pd(t3, t5);
 575     *y3          = gmx_mm256_unpack128hi_pd(t4, t6);
 576     *z3          = gmx_mm256_unpack128lo_pd(t7, t9);
 577     *x4          = gmx_mm256_unpack128lo_pd(t8, t10);
 578     *y4          = gmx_mm256_unpack128hi_pd(t7, t9);
 579     *z4          = gmx_mm256_unpack128hi_pd(t8, t10);
 580 }
 581
 582
 583
 584 static void
 585 gmx_mm256_decrement_1rvec_4ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
 586                                           double * gmx_restrict ptrC, double * gmx_restrict ptrD,
 587                                           __m256d x1, __m256d y1, __m256d z1)
 588 {
 589     __m256d t1, t2, tA, tB, tC, tD;
 590     __m256i mask;
 591
 592     t1          = _mm256_unpacklo_pd(x1, y1);       /*  y1c x1c | y1a x1a */
 593     t2          = _mm256_unpackhi_pd(x1, y1);       /*  y1d x1d | y1b x1b */
 594     x1          = gmx_mm256_unpack128lo_pd(t1, z1); /*  -  z1a | y1a x1a */
 595     y1          = gmx_mm256_unpack128hi_pd(t1, z1); /*  -  z1c | y1c x1c */
 596     z1          = _mm256_permute_pd(z1, _GMX_MM_PERMUTE256D(0, 1, 0, 1));
 597     t1          = gmx_mm256_unpack128lo_pd(t2, z1); /*  -  z1b | y1b x1b */
 598     z1          = gmx_mm256_unpack128hi_pd(t2, z1); /*  -  z1d | y1d x1d */
 599
 600     /* Construct a mask without executing any data loads */
 601     mask        = _mm256_castpd_si256(_mm256_blend_pd(_mm256_setzero_pd(),
 602                                                       _mm256_cmp_pd(_mm256_setzero_pd(), _mm256_setzero_pd(), _CMP_EQ_OQ), 0x7));
 603
 604     tA          = _mm256_loadu_pd(ptrA);
 605     tB          = _mm256_loadu_pd(ptrB);
 606     tC          = _mm256_loadu_pd(ptrC);
 607     tD          = _mm256_loadu_pd(ptrD);
 608
 609     tA          = _mm256_sub_pd(tA, x1);
 610     tB          = _mm256_sub_pd(tB, t1);
 611     tC          = _mm256_sub_pd(tC, y1);
 612     tD          = _mm256_sub_pd(tD, z1);
 613
 614     _mm256_maskstore_pd(ptrA, mask, tA);
 615     _mm256_maskstore_pd(ptrB, mask, tB);
 616     _mm256_maskstore_pd(ptrC, mask, tC);
 617     _mm256_maskstore_pd(ptrD, mask, tD);
 618 }
 619
 620
 621
 622 #if defined (_MSC_VER) && defined(_M_IX86)
 623 /* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
 624 #define gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(ptrA, ptrB, ptrC, ptrD, \
 625                                                   _x1, _y1, _z1, _x2, _y2, _z2, _x3, _y3, _z3) \
 626     { \
 627         __m256d _t1, _t2, _t3, _t4, _t5, _t6, _t7, _t8, _t9, _t10; \
 628         __m128d _tA, _tB, _tC, _tD, _tE; \
 629         _t1          = _mm256_loadu_pd(ptrA); \
 630         _t2          = _mm256_loadu_pd(ptrB); \
 631         _t3          = _mm256_loadu_pd(ptrC); \
 632         _t4          = _mm256_loadu_pd(ptrD); \
 633         _t5          = _mm256_loadu_pd(ptrA+4); \
 634         _t6          = _mm256_loadu_pd(ptrB+4); \
 635         _t7          = _mm256_loadu_pd(ptrC+4); \
 636         _t8          = _mm256_loadu_pd(ptrD+4); \
 637         _tA          = _mm_load_sd(ptrA+8); \
 638         _tB          = _mm_load_sd(ptrB+8); \
 639         _tC          = _mm_load_sd(ptrC+8); \
 640         _tD          = _mm_load_sd(ptrD+8); \
 641         _t9          = _mm256_unpacklo_pd(_x1, _y1); \
 642         _x1          = _mm256_unpackhi_pd(_x1, _y1); \
 643         _y1          = _mm256_unpacklo_pd(_z1, _x2); \
 644         _z1          = _mm256_unpackhi_pd(_z1, _x2); \
 645         _x2          = _mm256_unpacklo_pd(_y2, _z2); \
 646         _y2          = _mm256_unpackhi_pd(_y2, _z2); \
 647         _z2          = _mm256_unpacklo_pd(_x3, _y3); \
 648         _x3          = _mm256_unpackhi_pd(_x3, _y3); \
 649         _t10         = gmx_mm256_unpack128lo_pd(_t9, _y1); \
 650         _y3          = gmx_mm256_unpack128hi_pd(_t9, _y1); \
 651         _t9          = gmx_mm256_unpack128lo_pd(_x1, _z1); \
 652         _y1          = gmx_mm256_unpack128hi_pd(_x1, _z1); \
 653         _x1          = gmx_mm256_unpack128lo_pd(_x2, _z2); \
 654         _z1          = gmx_mm256_unpack128hi_pd(_x2, _z2); \
 655         _x2          = gmx_mm256_unpack128lo_pd(_y2, _x3); \
 656         _z2          = gmx_mm256_unpack128hi_pd(_y2, _x3); \
 657         _t1          = _mm256_sub_pd(_t1, _t10); \
 658         _t2          = _mm256_sub_pd(_t2, _t9); \
 659         _t3          = _mm256_sub_pd(_t3, _y3); \
 660         _t4          = _mm256_sub_pd(_t4, _y1); \
 661         _t5          = _mm256_sub_pd(_t5, _x1); \
 662         _t6          = _mm256_sub_pd(_t6, _x2); \
 663         _t7          = _mm256_sub_pd(_t7, _z1); \
 664         _t8          = _mm256_sub_pd(_t8, _z2); \
 665         _tA          = _mm_sub_sd(_tA, _mm256_castpd256_pd128(_z3)); \
 666         _tB          = _mm_sub_sd(_tB, _mm_permute_pd(_mm256_castpd256_pd128(_z3), _GMX_MM_PERMUTE128D(1, 1))); \
 667         _tE          = _mm256_extractf128_pd(_z3, 0x1); \
 668         _tC          = _mm_sub_sd(_tC, _tE); \
 669         _tD          = _mm_sub_sd(_tD, _mm_permute_pd(_tE, _GMX_MM_PERMUTE128D(1, 1))); \
 670         _mm256_storeu_pd(ptrA, _t1); \
 671         _mm256_storeu_pd(ptrB, _t2); \
 672         _mm256_storeu_pd(ptrC, _t3); \
 673         _mm256_storeu_pd(ptrD, _t4); \
 674         _mm256_storeu_pd(ptrA+4, _t5); \
 675         _mm256_storeu_pd(ptrB+4, _t6); \
 676         _mm256_storeu_pd(ptrC+4, _t7); \
 677         _mm256_storeu_pd(ptrD+4, _t8); \
 678         _mm_store_sd(ptrA+8, _tA); \
 679         _mm_store_sd(ptrB+8, _tB); \
 680         _mm_store_sd(ptrC+8, _tC); \
 681         _mm_store_sd(ptrD+8, _tD); \
 682     }
 683 #else
 684 /* Real function for sane compilers */
 685 static void
 686 gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
 687                                           double * gmx_restrict ptrC, double * gmx_restrict ptrD,
 688                                           __m256d x1, __m256d y1, __m256d z1,
 689                                           __m256d x2, __m256d y2, __m256d z2,
 690                                           __m256d x3, __m256d y3, __m256d z3)
 691 {
 692     __m256d t1, t2, t3, t4, t5, t6, t7, t8, t9, t10;
 693     __m128d tA, tB, tC, tD, tE;
 694
 695     t1          = _mm256_loadu_pd(ptrA);
 696     t2          = _mm256_loadu_pd(ptrB);
 697     t3          = _mm256_loadu_pd(ptrC);
 698     t4          = _mm256_loadu_pd(ptrD);
 699     t5          = _mm256_loadu_pd(ptrA+4);
 700     t6          = _mm256_loadu_pd(ptrB+4);
 701     t7          = _mm256_loadu_pd(ptrC+4);
 702     t8          = _mm256_loadu_pd(ptrD+4);
 703     tA          = _mm_load_sd(ptrA+8);
 704     tB          = _mm_load_sd(ptrB+8);
 705     tC          = _mm_load_sd(ptrC+8);
 706     tD          = _mm_load_sd(ptrD+8);
 707
 708     t9          = _mm256_unpacklo_pd(x1, y1);       /* y1c x1c | y1a x1a */
 709     x1          = _mm256_unpackhi_pd(x1, y1);       /* y1d x1d | y1b x1b */
 710
 711     y1          = _mm256_unpacklo_pd(z1, x2);       /* x2c z1c | x2a z1a */
 712     z1          = _mm256_unpackhi_pd(z1, x2);       /* x2d z1d | x2b z1b */
 713
 714     x2          = _mm256_unpacklo_pd(y2, z2);       /* z2c y2c | z2a y2a */
 715     y2          = _mm256_unpackhi_pd(y2, z2);       /* z2d y2d | z2b y2b */
 716
 717     z2          = _mm256_unpacklo_pd(x3, y3);       /* y3c x3c | y3a x3a */
 718     x3          = _mm256_unpackhi_pd(x3, y3);       /* y3d x3d | y3b x3b */
 719
 720     t10         = gmx_mm256_unpack128lo_pd(t9, y1); /* x2a z1a | y1a x1a */
 721     y3          = gmx_mm256_unpack128hi_pd(t9, y1); /* x2c z1c | y1c x1c */
 722
 723     t9          = gmx_mm256_unpack128lo_pd(x1, z1); /* x2b z1b | y1b x1b */
 724     y1          = gmx_mm256_unpack128hi_pd(x1, z1); /* x2d z1d | y1d x1d */
 725
 726     x1          = gmx_mm256_unpack128lo_pd(x2, z2); /* y3a x3a | z2a y2a */
 727     z1          = gmx_mm256_unpack128hi_pd(x2, z2); /* y3c x3c | z2c y2c */
 728
 729     x2          = gmx_mm256_unpack128lo_pd(y2, x3); /* y3b x3b | z2b y2b */
 730     z2          = gmx_mm256_unpack128hi_pd(y2, x3); /* y3d x3d | z2d y2d */
 731
 732     t1          = _mm256_sub_pd(t1, t10);
 733     t2          = _mm256_sub_pd(t2, t9);
 734     t3          = _mm256_sub_pd(t3, y3);
 735     t4          = _mm256_sub_pd(t4, y1);
 736     t5          = _mm256_sub_pd(t5, x1);
 737     t6          = _mm256_sub_pd(t6, x2);
 738     t7          = _mm256_sub_pd(t7, z1);
 739     t8          = _mm256_sub_pd(t8, z2);
 740
 741     tA          = _mm_sub_sd(tA, _mm256_castpd256_pd128(z3));
 742     tB          = _mm_sub_sd(tB, _mm_permute_pd(_mm256_castpd256_pd128(z3), _GMX_MM_PERMUTE128D(1, 1)));
 743     tE          = _mm256_extractf128_pd(z3, 0x1);
 744     tC          = _mm_sub_sd(tC, tE);
 745     tD          = _mm_sub_sd(tD, _mm_permute_pd(tE, _GMX_MM_PERMUTE128D(1, 1)));
 746
 747     /* Here we store a full 256-bit value and a separate 64-bit one; no overlap can happen */
 748     _mm256_storeu_pd(ptrA, t1);
 749     _mm256_storeu_pd(ptrB, t2);
 750     _mm256_storeu_pd(ptrC, t3);
 751     _mm256_storeu_pd(ptrD, t4);
 752     _mm256_storeu_pd(ptrA+4, t5);
 753     _mm256_storeu_pd(ptrB+4, t6);
 754     _mm256_storeu_pd(ptrC+4, t7);
 755     _mm256_storeu_pd(ptrD+4, t8);
 756     _mm_store_sd(ptrA+8, tA);
 757     _mm_store_sd(ptrB+8, tB);
 758     _mm_store_sd(ptrC+8, tC);
 759     _mm_store_sd(ptrD+8, tD);
 760 }
 761 #endif
 762
 763 #if defined (_MSC_VER) && defined(_M_IX86)
 764 /* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
 765 #define gmx_mm256_decrement_4rvec_4ptr_swizzle_pd(ptrA, ptrB, ptrC, ptrD, \
 766                                                   _x1, _y1, _z1, _x2, _y2, _z2, _x3, _y3, _z3, _x4, _y4, _z4) \
 767     { \
 768         __m256d _t1, _t2, _t3, _t4, _t5, _t6, _t7, _t8, _t9, _t10, _t11, _t12, _t13, _t14; \
 769         __m128d _tA, _tB, _tC, _tD, _tE; \
 770         _t1          = _mm256_loadu_pd(ptrA); \
 771         _t2          = _mm256_loadu_pd(ptrB); \
 772         _t3          = _mm256_loadu_pd(ptrC); \
 773         _t4          = _mm256_loadu_pd(ptrD); \
 774         _t5          = _mm256_loadu_pd(ptrA+4); \
 775         _t6          = _mm256_loadu_pd(ptrB+4); \
 776         _t7          = _mm256_loadu_pd(ptrC+4); \
 777         _t8          = _mm256_loadu_pd(ptrD+4); \
 778         _t9          = _mm256_loadu_pd(ptrA+8); \
 779         _t10         = _mm256_loadu_pd(ptrB+8); \
 780         _t11         = _mm256_loadu_pd(ptrC+8); \
 781         _t12         = _mm256_loadu_pd(ptrD+8); \
 782         _t13         = _mm256_unpacklo_pd(_x1, _y1); \
 783         _x1          = _mm256_unpackhi_pd(_x1, _y1); \
 784         _y1          = _mm256_unpacklo_pd(_z1, _x2); \
 785         _z1          = _mm256_unpackhi_pd(_z1, _x2); \
 786         _x2          = _mm256_unpacklo_pd(_y2, _z2); \
 787         _y2          = _mm256_unpackhi_pd(_y2, _z2); \
 788         _z2          = _mm256_unpacklo_pd(_x3, _y3); \
 789         _x3          = _mm256_unpackhi_pd(_x3, _y3); \
 790         _y3          = _mm256_unpacklo_pd(_z3, _x4); \
 791         _z3          = _mm256_unpackhi_pd(_z3, _x4); \
 792         _x4          = _mm256_unpacklo_pd(_y4, _z4); \
 793         _y4          = _mm256_unpackhi_pd(_y4, _z4); \
 794         _z4          = gmx_mm256_unpack128lo_pd(_t13, _y1); \
 795         _t13         = gmx_mm256_unpack128hi_pd(_t13, _y1); \
 796         _y1          = gmx_mm256_unpack128lo_pd(_x1, _z1); \
 797         _x1          = gmx_mm256_unpack128hi_pd(_x1, _z1); \
 798         _z1          = gmx_mm256_unpack128lo_pd(_x2, _z2); \
 799         _x2          = gmx_mm256_unpack128hi_pd(_x2, _z2); \
 800         _z2          = gmx_mm256_unpack128lo_pd(_y2, _x3); \
 801         _y2          = gmx_mm256_unpack128hi_pd(_y2, _x3); \
 802         _x3          = gmx_mm256_unpack128lo_pd(_y3, _x4); \
 803         _y3          = gmx_mm256_unpack128hi_pd(_y3, _x4); \
 804         _x4          = gmx_mm256_unpack128lo_pd(_z3, _y4); \
 805         _z3          = gmx_mm256_unpack128hi_pd(_z3, _y4); \
 806         _t1          = _mm256_sub_pd(_t1, _z4); \
 807         _t2          = _mm256_sub_pd(_t2, _y1); \
 808         _t3          = _mm256_sub_pd(_t3, _t13); \
 809         _t4          = _mm256_sub_pd(_t4, _x1); \
 810         _t5          = _mm256_sub_pd(_t5, _z1); \
 811         _t6          = _mm256_sub_pd(_t6, _z2); \
 812         _t7          = _mm256_sub_pd(_t7, _x2); \
 813         _t8          = _mm256_sub_pd(_t8, _y2); \
 814         _t9          = _mm256_sub_pd(_t9, _x3); \
 815         _t10         = _mm256_sub_pd(_t10, _x4); \
 816         _t11         = _mm256_sub_pd(_t11, _y3); \
 817         _t12         = _mm256_sub_pd(_t12, _z3); \
 818         _mm256_storeu_pd(ptrA, _t1); \
 819         _mm256_storeu_pd(ptrB, _t2); \
 820         _mm256_storeu_pd(ptrC, _t3); \
 821         _mm256_storeu_pd(ptrD, _t4); \
 822         _mm256_storeu_pd(ptrA+4, _t5); \
 823         _mm256_storeu_pd(ptrB+4, _t6); \
 824         _mm256_storeu_pd(ptrC+4, _t7); \
 825         _mm256_storeu_pd(ptrD+4, _t8); \
 826         _mm256_storeu_pd(ptrA+8, _t9); \
 827         _mm256_storeu_pd(ptrB+8, _t10); \
 828         _mm256_storeu_pd(ptrC+8, _t11); \
 829         _mm256_storeu_pd(ptrD+8, _t12); \
 830     }
 831 #else
 832 /* Real function for sane compilers */
 833 static void
 834 gmx_mm256_decrement_4rvec_4ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
 835                                           double * gmx_restrict ptrC, double * gmx_restrict ptrD,
 836                                           __m256d x1, __m256d y1, __m256d z1,
 837                                           __m256d x2, __m256d y2, __m256d z2,
 838                                           __m256d x3, __m256d y3, __m256d z3,
 839                                           __m256d x4, __m256d y4, __m256d z4)
 840 {
 841     __m256d t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14;
 842     __m128d tA, tB, tC, tD, tE;
 843
 844     t1          = _mm256_loadu_pd(ptrA);
 845     t2          = _mm256_loadu_pd(ptrB);
 846     t3          = _mm256_loadu_pd(ptrC);
 847     t4          = _mm256_loadu_pd(ptrD);
 848     t5          = _mm256_loadu_pd(ptrA+4);
 849     t6          = _mm256_loadu_pd(ptrB+4);
 850     t7          = _mm256_loadu_pd(ptrC+4);
 851     t8          = _mm256_loadu_pd(ptrD+4);
 852     t9          = _mm256_loadu_pd(ptrA+8);
 853     t10         = _mm256_loadu_pd(ptrB+8);
 854     t11         = _mm256_loadu_pd(ptrC+8);
 855     t12         = _mm256_loadu_pd(ptrD+8);
 856
 857     t13         = _mm256_unpacklo_pd(x1, y1);        /* y1c x1c | y1a x1a */
 858     x1          = _mm256_unpackhi_pd(x1, y1);        /* y1d x1d | y1b x1b */
 859     y1          = _mm256_unpacklo_pd(z1, x2);        /* x2c z1c | x2a z1a */
 860     z1          = _mm256_unpackhi_pd(z1, x2);        /* x2d z1d | x2b z1b */
 861     x2          = _mm256_unpacklo_pd(y2, z2);        /* z2c y2c | z2a y2a */
 862     y2          = _mm256_unpackhi_pd(y2, z2);        /* z2d y2d | z2b y2b */
 863     z2          = _mm256_unpacklo_pd(x3, y3);        /* y3c x3c | y3a x3a */
 864     x3          = _mm256_unpackhi_pd(x3, y3);        /* y3d x3d | y3b x3b */
 865     y3          = _mm256_unpacklo_pd(z3, x4);        /* x4c z3c | x4a z3a */
 866     z3          = _mm256_unpackhi_pd(z3, x4);        /* x4d z3d | x4b z3b */
 867     x4          = _mm256_unpacklo_pd(y4, z4);        /* z4c y4c | z4a y4a */
 868     y4          = _mm256_unpackhi_pd(y4, z4);        /* z4d y4d | z4b y4b */
 869
 870     z4          = gmx_mm256_unpack128lo_pd(t13, y1); /* x2a z1a | y1a x1a */
 871     t13         = gmx_mm256_unpack128hi_pd(t13, y1); /* x2c z1c | y1c x1c */
 872     y1          = gmx_mm256_unpack128lo_pd(x1, z1);  /* x2b z1b | y1b x1b */
 873     x1          = gmx_mm256_unpack128hi_pd(x1, z1);  /* x2d z1d | y1d x1d */
 874     z1          = gmx_mm256_unpack128lo_pd(x2, z2);  /* y3a x3a | z2a y2a */
 875     x2          = gmx_mm256_unpack128hi_pd(x2, z2);  /* y3c x3c | z2c y2c */
 876     z2          = gmx_mm256_unpack128lo_pd(y2, x3);  /* y3b x3b | z2b y2b */
 877     y2          = gmx_mm256_unpack128hi_pd(y2, x3);  /* y3d x3d | z2d y2d */
 878     x3          = gmx_mm256_unpack128lo_pd(y3, x4);  /* z4a y4a | x4a z3a */
 879     y3          = gmx_mm256_unpack128hi_pd(y3, x4);  /* z4c y4c | x4c z3c */
 880     x4          = gmx_mm256_unpack128lo_pd(z3, y4);  /* z4b y4b | x4b z3b */
 881     z3          = gmx_mm256_unpack128hi_pd(z3, y4);  /* z4d y4d | x4d z3d */
 882
 883     t1          = _mm256_sub_pd(t1, z4);
 884     t2          = _mm256_sub_pd(t2, y1);
 885     t3          = _mm256_sub_pd(t3, t13);
 886     t4          = _mm256_sub_pd(t4, x1);
 887     t5          = _mm256_sub_pd(t5, z1);
 888     t6          = _mm256_sub_pd(t6, z2);
 889     t7          = _mm256_sub_pd(t7, x2);
 890     t8          = _mm256_sub_pd(t8, y2);
 891     t9          = _mm256_sub_pd(t9, x3);
 892     t10         = _mm256_sub_pd(t10, x4);
 893     t11         = _mm256_sub_pd(t11, y3);
 894     t12         = _mm256_sub_pd(t12, z3);
 895
 896     /* Here we store a full 256-bit value and a separate 128-bit one; no overlap can happen */
 897     _mm256_storeu_pd(ptrA, t1);
 898     _mm256_storeu_pd(ptrB, t2);
 899     _mm256_storeu_pd(ptrC, t3);
 900     _mm256_storeu_pd(ptrD, t4);
 901     _mm256_storeu_pd(ptrA+4, t5);
 902     _mm256_storeu_pd(ptrB+4, t6);
 903     _mm256_storeu_pd(ptrC+4, t7);
 904     _mm256_storeu_pd(ptrD+4, t8);
 905     _mm256_storeu_pd(ptrA+8, t9);
 906     _mm256_storeu_pd(ptrB+8, t10);
 907     _mm256_storeu_pd(ptrC+8, t11);
 908     _mm256_storeu_pd(ptrD+8, t12);
 909 }
 910 #endif
 911
 912
 913
 914
 915
 916 static gmx_inline void
 917 gmx_mm256_update_iforce_1atom_swizzle_pd(__m256d fix1, __m256d fiy1, __m256d fiz1,
 918                                          double * gmx_restrict fptr,
 919                                          double * gmx_restrict fshiftptr)
 920 {
 921     __m256d t1, t2;
 922     __m128d tA, tB;
 923     fix1 = _mm256_hadd_pd(fix1, fiy1);
 924     fiz1 = _mm256_hadd_pd(fiz1, _mm256_setzero_pd());
 925
 926     /* Add across the two lanes */
 927     tA   = _mm_add_pd(_mm256_castpd256_pd128(fix1), _mm256_extractf128_pd(fix1, 0x1));
 928     tB   = _mm_add_pd(_mm256_castpd256_pd128(fiz1), _mm256_extractf128_pd(fiz1, 0x1));
 929
 930     fix1 = gmx_mm256_set_m128d(tB, tA); /* 0 fiz fiy fix */
 931
 932     t1   = _mm256_loadu_pd(fptr);
 933     t2   = _mm256_loadu_pd(fshiftptr);
 934
 935     t1   = _mm256_add_pd(t1, fix1);
 936     t2   = _mm256_add_pd(t2, fix1);
 937
 938     _mm256_storeu_pd(fptr, t1);
 939     _mm256_storeu_pd(fshiftptr, t2);
 940 }
 941
 942
 943
 944 #if defined (_MSC_VER) && defined(_M_IX86)
 945 /* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
 946 #define gmx_mm256_update_iforce_3atom_swizzle_pd(fix1, fiy1, fiz1, fix2, fiy2, fiz2, fix3, fiy3, fiz3, \
 947                                                  fptr, fshiftptr) \
 948     { \
 949         __m256d _t1, _t2, _t3, _t4; \
 950         __m128d _tz3, _tA, _tB, _tC, _tD; \
 951         fix1  = _mm256_hadd_pd(fix1, fiy1); \
 952         fiz1  = _mm256_hadd_pd(fiz1, fix2); \
 953         fiy2  = _mm256_hadd_pd(fiy2, fiz2); \
 954         fix3  = _mm256_hadd_pd(fix3, fiy3); \
 955         fiz3  = _mm256_hadd_pd(fiz3, _mm256_setzero_pd()); \
 956         _t1   = gmx_mm256_unpack128lo_pd(fix1, fiz1); \
 957         _t2   = gmx_mm256_unpack128hi_pd(fix1, fiz1); \
 958         _t1   = _mm256_add_pd(_t1, _t2); \
 959         _t3   = gmx_mm256_unpack128lo_pd(fiy2, fix3); \
 960         _t4   = gmx_mm256_unpack128hi_pd(fiy2, fix3); \
 961         _t3   = _mm256_add_pd(_t3, _t4); \
 962         _tz3  = _mm_add_pd(_mm256_castpd256_pd128(fiz3), _mm256_extractf128_pd(fiz3, 0x1)); \
 963         _t2   = _mm256_loadu_pd(fptr); \
 964         _t4   = _mm256_loadu_pd(fptr+4); \
 965         _tA   = _mm_load_sd(fptr+8); \
 966         _t2   = _mm256_add_pd(_t2, _t1); \
 967         _t4   = _mm256_add_pd(_t4, _t3); \
 968         _tA   = _mm_add_sd(_tA, _tz3); \
 969         _mm256_storeu_pd(fptr, _t2); \
 970         _mm256_storeu_pd(fptr+4, _t4); \
 971         _mm_store_sd(fptr+8, _tA); \
 972         _tB   = _mm256_extractf128_pd(_t1, 0x1); \
 973         _tC   = _mm256_extractf128_pd(_t3, 0x1); \
 974         _tz3  = _mm_add_sd(_tz3, _tB); \
 975         _tD   = _mm_permute_pd(_mm256_castpd256_pd128(_t3), _GMX_MM_PERMUTE128D(1, 1)); \
 976         _tz3  = _mm_add_sd(_tz3, _tD); \
 977         _tC   = _mm_add_pd(_tC, _mm256_castpd256_pd128(_t1)); \
 978         _tD   = _mm_shuffle_pd(_tB, _mm256_castpd256_pd128(_t3), _MM_SHUFFLE2(0, 1)); \
 979         _tC   = _mm_add_pd(_tC, _tD); \
 980         _tA   = _mm_loadu_pd(fshiftptr); \
 981         _tB   = _mm_load_sd(fshiftptr+2); \
 982         _tA   = _mm_add_pd(_tA, _tC); \
 983         _tB   = _mm_add_sd(_tB, _tz3); \
 984         _mm_storeu_pd(fshiftptr, _tA); \
 985         _mm_store_sd(fshiftptr+2, _tB); \
 986     }
 987 #else
 988 /* Real function for sane compilers */
 989 static gmx_inline void
 990 gmx_mm256_update_iforce_3atom_swizzle_pd(__m256d fix1, __m256d fiy1, __m256d fiz1,
 991                                          __m256d fix2, __m256d fiy2, __m256d fiz2,
 992                                          __m256d fix3, __m256d fiy3, __m256d fiz3,
 993                                          double * gmx_restrict fptr,
 994                                          double * gmx_restrict fshiftptr)
 995 {
 996     __m256d t1, t2, t3, t4;
 997     __m128d tz3, tA, tB, tC, tD;
 998
 999     fix1 = _mm256_hadd_pd(fix1, fiy1);                /*  Y1c-d X1c-d | Y1a-b X1a-b */
1000     fiz1 = _mm256_hadd_pd(fiz1, fix2);                /*  X2c-d Z1c-d | X2a-b Z1a-b */
1001     fiy2 = _mm256_hadd_pd(fiy2, fiz2);                /*  Z2c-d Y2c-d | Z2a-b Y2a-b */
1002     fix3 = _mm256_hadd_pd(fix3, fiy3);                /*  Y3c-d X3c-d | Y3a-b X3a-b */
1003     fiz3 = _mm256_hadd_pd(fiz3, _mm256_setzero_pd()); /*  0     Z3c-d | 0     Z3a-b */
1004
1005     /* Add across the two lanes by swapping and adding back */
1006     t1   = gmx_mm256_unpack128lo_pd(fix1, fiz1);                                       /* X2a-b Z1a-b | Y1a-b X1a-b */
1007     t2   = gmx_mm256_unpack128hi_pd(fix1, fiz1);                                       /* X2c-d Z1c-d | Y1c-d X1c-d */
1008     t1   = _mm256_add_pd(t1, t2);                                                      /* x2 z1 | y1 x1 */
1009
1010     t3   = gmx_mm256_unpack128lo_pd(fiy2, fix3);                                       /* Y3a-b X3a-b | Z2a-b Y2a-b */
1011     t4   = gmx_mm256_unpack128hi_pd(fiy2, fix3);                                       /* Y3c-d X3c-d | Z2c-d Y2c-d */
1012     t3   = _mm256_add_pd(t3, t4);                                                      /* y3 x3 | z2 y2 */
1013
1014     tz3  = _mm_add_pd(_mm256_castpd256_pd128(fiz3), _mm256_extractf128_pd(fiz3, 0x1)); /* 0 z3 */
1015
1016     t2   = _mm256_loadu_pd(fptr);
1017     t4   = _mm256_loadu_pd(fptr+4);
1018     tA   = _mm_load_sd(fptr+8);
1019
1020     t2   = _mm256_add_pd(t2, t1);
1021     t4   = _mm256_add_pd(t4, t3);
1022     tA   = _mm_add_sd(tA, tz3);
1023
1024     _mm256_storeu_pd(fptr, t2);
1025     _mm256_storeu_pd(fptr+4, t4);
1026     _mm_store_sd(fptr+8, tA);
1027
1028     /* Add up shift force */
1029     /* t1:   x2 z1 | y1 x1 */
1030     /* t3:   y3 x3 | z2 y2 */
1031     /* tz3:           0 z3 */
1032
1033     /* z component */
1034     tB   = _mm256_extractf128_pd(t1, 0x1);                                     /* x2 z1 */
1035     tC   = _mm256_extractf128_pd(t3, 0x1);                                     /* y3 x3 */
1036     tz3  = _mm_add_sd(tz3, tB);                                                /* 0  z1+z3 */
1037     tD   = _mm_permute_pd(_mm256_castpd256_pd128(t3), _GMX_MM_PERMUTE128D(1, 1));
1038     tz3  = _mm_add_sd(tz3, tD);                                                /* - z */
1039
1040     tC   = _mm_add_pd(tC, _mm256_castpd256_pd128(t1));                         /* y1+y3 x1+x3 */
1041
1042     tD   = _mm_shuffle_pd(tB, _mm256_castpd256_pd128(t3), _MM_SHUFFLE2(0, 1)); /* y2 x2 */
1043     tC   = _mm_add_pd(tC, tD);                                                 /* y x */
1044
1045     tA   = _mm_loadu_pd(fshiftptr);
1046     tB   = _mm_load_sd(fshiftptr+2);
1047     tA   = _mm_add_pd(tA, tC);
1048     tB   = _mm_add_sd(tB, tz3);
1049     _mm_storeu_pd(fshiftptr, tA);
1050     _mm_store_sd(fshiftptr+2, tB);
1051 }
1052 #endif
1053
1054
1055 #if defined (_MSC_VER) && defined(_M_IX86)
1056 /* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
1057 #define gmx_mm256_update_iforce_4atom_swizzle_pd(fix1, fiy1, fiz1, fix2, fiy2, fiz2, fix3, fiy3, fiz3, fix4, fiy4, fiz4, \
1058                                                  fptr, fshiftptr) \
1059     { \
1060         __m256d _t1, _t2, _t3, _t4, _t5, _t6; \
1061         __m128d _tA, _tB, _tC, _tD; \
1062         fix1  = _mm256_hadd_pd(fix1, fiy1); \
1063         fiz1  = _mm256_hadd_pd(fiz1, fix2); \
1064         fiy2  = _mm256_hadd_pd(fiy2, fiz2); \
1065         fix3  = _mm256_hadd_pd(fix3, fiy3); \
1066         fiz3  = _mm256_hadd_pd(fiz3, fix4); \
1067         fiy4  = _mm256_hadd_pd(fiy4, fiz4); \
1068         _t1   = gmx_mm256_unpack128lo_pd(fix1, fiz1); \
1069         _t2   = gmx_mm256_unpack128hi_pd(fix1, fiz1); \
1070         _t1   = _mm256_add_pd(_t1, _t2); \
1071         _t3   = gmx_mm256_unpack128lo_pd(fiy2, fix3); \
1072         _t4   = gmx_mm256_unpack128hi_pd(fiy2, fix3); \
1073         _t3   = _mm256_add_pd(_t3, _t4); \
1074         _t5   = gmx_mm256_unpack128lo_pd(fiz3, fiy4); \
1075         _t6   = gmx_mm256_unpack128hi_pd(fiz3, fiy4); \
1076         _t5   = _mm256_add_pd(_t5, _t6); \
1077         _t2   = _mm256_loadu_pd(fptr); \
1078         _t4   = _mm256_loadu_pd(fptr+4); \
1079         _t6   = _mm256_loadu_pd(fptr+8); \
1080         _t2   = _mm256_add_pd(_t2, _t1); \
1081         _t4   = _mm256_add_pd(_t4, _t3); \
1082         _t6   = _mm256_add_pd(_t6, _t5); \
1083         _mm256_storeu_pd(fptr, _t2); \
1084         _mm256_storeu_pd(fptr+4, _t4); \
1085         _mm256_storeu_pd(fptr+8, _t6); \
1086         _tA   = _mm256_extractf128_pd(_t1, 0x1); \
1087         _tB   = _mm256_extractf128_pd(_t3, 0x1); \
1088         _tC   = _mm256_extractf128_pd(_t5, 0x1); \
1089         _tB   = _mm_add_pd(_tB, _mm256_castpd256_pd128(_t1)); \
1090         _tA   = _mm_add_pd(_tA, _mm256_castpd256_pd128(_t5)); \
1091         _tC   = _mm_add_pd(_tC, _mm256_castpd256_pd128(_t3)); \
1092         _tD   = _mm_shuffle_pd(_tA, _tC, _MM_SHUFFLE2(0, 1)); \
1093         _tB   = _mm_add_pd(_tB, _tD); \
1094         _tC   = _mm_permute_pd(_tC, _GMX_MM_PERMUTE128D(1, 1)); \
1095         _tC   = _mm_add_sd(_tC, _tA); \
1096         _tA   = _mm_loadu_pd(fshiftptr); \
1097         _tD   = _mm_load_sd(fshiftptr+2); \
1098         _tA   = _mm_add_pd(_tA, _tB); \
1099         _tD   = _mm_add_sd(_tD, _tC); \
1100         _mm_storeu_pd(fshiftptr, _tA); \
1101         _mm_store_sd(fshiftptr+2, _tD); \
1102     }
1103 #else
1104 /* Real function for sane compilers */
1105 static gmx_inline void
1106 gmx_mm256_update_iforce_4atom_swizzle_pd(__m256d fix1, __m256d fiy1, __m256d fiz1,
1107                                          __m256d fix2, __m256d fiy2, __m256d fiz2,
1108                                          __m256d fix3, __m256d fiy3, __m256d fiz3,
1109                                          __m256d fix4, __m256d fiy4, __m256d fiz4,
1110                                          double * gmx_restrict fptr,
1111                                          double * gmx_restrict fshiftptr)
1112 {
1113     __m256d t1, t2, t3, t4, t5, t6;
1114     __m128d tA, tB, tC, tD;
1115
1116     fix1 = _mm256_hadd_pd(fix1, fiy1);                /*  Y1c-d X1c-d | Y1a-b X1a-b */
1117     fiz1 = _mm256_hadd_pd(fiz1, fix2);                /*  X2c-d Z1c-d | X2a-b Z1a-b */
1118     fiy2 = _mm256_hadd_pd(fiy2, fiz2);                /*  Z2c-d Y2c-d | Z2a-b Y2a-b */
1119     fix3 = _mm256_hadd_pd(fix3, fiy3);                /*  Y3c-d X3c-d | Y3a-b X3a-b */
1120     fiz3 = _mm256_hadd_pd(fiz3, fix4);                /*  X4c-d Z3c-d | X4a-b Z3a-b */
1121     fiy4 = _mm256_hadd_pd(fiy4, fiz4);                /*  Z4c-d Y4c-d | Z4a-b Y4a-b */
1122
1123     /* Add across the two lanes by swapping and adding back */
1124     t1   = gmx_mm256_unpack128lo_pd(fix1, fiz1); /* X2a-b Z1a-b | Y1a-b X1a-b */
1125     t2   = gmx_mm256_unpack128hi_pd(fix1, fiz1); /* X2c-d Z1c-d | Y1c-d X1c-d */
1126     t1   = _mm256_add_pd(t1, t2);                /* x2 z1 | y1 x1 */
1127
1128     t3   = gmx_mm256_unpack128lo_pd(fiy2, fix3); /* Y3a-b X3a-b | Z2a-b Y2a-b */
1129     t4   = gmx_mm256_unpack128hi_pd(fiy2, fix3); /* Y3c-d X3c-d | Z2c-d Y2c-d */
1130     t3   = _mm256_add_pd(t3, t4);                /* y3 x3 | z2 y2 */
1131
1132     t5   = gmx_mm256_unpack128lo_pd(fiz3, fiy4); /* Z4a-b Y4a-b | X4a-b Z3a-b */
1133     t6   = gmx_mm256_unpack128hi_pd(fiz3, fiy4); /* Z4c-d Y4c-d | X4c-d Z3c-d */
1134     t5   = _mm256_add_pd(t5, t6);                /* z4 y4 | x4 z3 */
1135
1136     t2   = _mm256_loadu_pd(fptr);
1137     t4   = _mm256_loadu_pd(fptr+4);
1138     t6   = _mm256_loadu_pd(fptr+8);
1139
1140     t2   = _mm256_add_pd(t2, t1);
1141     t4   = _mm256_add_pd(t4, t3);
1142     t6   = _mm256_add_pd(t6, t5);
1143
1144     _mm256_storeu_pd(fptr, t2);
1145     _mm256_storeu_pd(fptr+4, t4);
1146     _mm256_storeu_pd(fptr+8, t6);
1147
1148     /* Add up shift force  */
1149     /* t1:   x2. z1. | y1. x1. */
1150     /* t3:   y3. x3. | z2 y2 */
1151     /* t5:   z4 y4 | x4. z3. */
1152
1153     /* z component */
1154     tA   = _mm256_extractf128_pd(t1, 0x1);                /* x2 z1 */
1155     tB   = _mm256_extractf128_pd(t3, 0x1);                /* y3 x3 */
1156     tC   = _mm256_extractf128_pd(t5, 0x1);                /* z4 y4 */
1157
1158     tB   = _mm_add_pd(tB, _mm256_castpd256_pd128(t1));    /*  y1+y3  x1+x3 */
1159     tA   = _mm_add_pd(tA, _mm256_castpd256_pd128(t5));    /*  x2+x4  z1+z3 */
1160     tC   = _mm_add_pd(tC, _mm256_castpd256_pd128(t3));    /*  z4+z2  y4+y2 */
1161
1162     tD   = _mm_shuffle_pd(tA, tC, _MM_SHUFFLE2(0, 1));    /* y4+y2 x2+x4 */
1163     tB   = _mm_add_pd(tB, tD);                            /* y x */
1164     tC   = _mm_permute_pd(tC, _GMX_MM_PERMUTE128D(1, 1)); /*    - z4+z2 */
1165     tC   = _mm_add_sd(tC, tA);                            /* - z */
1166
1167     tA   = _mm_loadu_pd(fshiftptr);
1168     tD   = _mm_load_sd(fshiftptr+2);
1169     tA   = _mm_add_pd(tA, tB);
1170     tD   = _mm_add_sd(tD, tC);
1171     _mm_storeu_pd(fshiftptr, tA);
1172     _mm_store_sd(fshiftptr+2, tD);
1173 }
1174 #endif
1175
1176
1177
1178 static void
1179 gmx_mm256_update_1pot_pd(__m256d pot1, double * gmx_restrict ptrA)
1180 {
1181     __m128d t1;
1182
1183     pot1 = _mm256_hadd_pd(pot1, pot1);
1184
1185     t1   = _mm_add_pd(_mm256_castpd256_pd128(pot1), _mm256_extractf128_pd(pot1, 0x1));
1186
1187     _mm_store_sd(ptrA, _mm_add_sd(_mm_load_sd(ptrA), t1));
1188 }
1189
1190 static void
1191 gmx_mm256_update_2pot_pd(__m256d pot1, double * gmx_restrict ptrA,
1192                          __m256d pot2, double * gmx_restrict ptrB)
1193 {
1194     __m128d t1, t2;
1195
1196     pot1 = _mm256_hadd_pd(pot1, pot2);
1197
1198     t1   = _mm_add_pd(_mm256_castpd256_pd128(pot1), _mm256_extractf128_pd(pot1, 0x1));
1199
1200     t2   = _mm_permute_pd(t1, _GMX_MM_PERMUTE128D(1, 1));
1201     _mm_store_sd(ptrA, _mm_add_sd(_mm_load_sd(ptrA), t1));
1202     _mm_store_sd(ptrB, _mm_add_sd(_mm_load_sd(ptrB), t2));
1203 }
1204
1205
1206 #endif /* _kernelutil_x86_avx_256_double_h_ */