src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_single/kernelutil_x86_avx_256_single.h

   1 /*
   2  *                This source code is part of
   3  *
   4  *                 G   R   O   M   A   C   S
   5  *
   6  * Copyright (c) 2011-2012, The GROMACS Development Team
   7  *
   8  * Gromacs is a library for molecular simulation and trajectory analysis,
   9  * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
  10  * a full list of developers and information, check out http://www.gromacs.org
  11  *
  12  * This program is free software; you can redistribute it and/or modify it under
  13  * the terms of the GNU Lesser General Public License as published by the Free
  14  * Software Foundation; either version 2 of the License, or (at your option) any
  15  * later version.
  16  * As a special exception, you may use this file as part of a free software
  17  * library without restriction.  Specifically, if other files instantiate
  18  * templates or use macros or inline functions from this file, or you compile
  19  * this file and link it with other files to produce an executable, this
  20  * file does not by itself cause the resulting executable to be covered by
  21  * the GNU Lesser General Public License.
  22  *
  23  * In plain-speak: do not worry about classes/macros/templates either - only
  24  * changes to the library have to be LGPL, not an application linking with it.
  25  *
  26  * To help fund GROMACS development, we humbly ask that you cite
  27  * the papers people have written on it - you can find them on the website!
  28  */
  29 #ifndef _kernelutil_x86_avx_256_single_h_
  30 #define _kernelutil_x86_avx_256_single_h_
  31
  32 #include "gmx_x86_avx_256.h"
  33
  34 /* Transpose lower/upper half of 256-bit registers separately */
  35 #define GMX_MM256_HALFTRANSPOSE4_PS(ymm0, ymm1, ymm2, ymm3) {            \
  36         __m256 __tmp0, __tmp1, __tmp2, __tmp3;                               \
  37                                                                       \
  38         __tmp0   = _mm256_unpacklo_ps((ymm0), (ymm1));                     \
  39         __tmp1   = _mm256_unpacklo_ps((ymm2), (ymm3));                     \
  40         __tmp2   = _mm256_unpackhi_ps((ymm0), (ymm1));                     \
  41         __tmp3   = _mm256_unpackhi_ps((ymm2), (ymm3));                     \
  42         ymm0     = _mm256_shuffle_ps(__tmp0, __tmp1, _MM_SHUFFLE(1, 0, 1, 0)); \
  43         ymm1     = _mm256_shuffle_ps(__tmp0, __tmp1, _MM_SHUFFLE(3, 2, 3, 2)); \
  44         ymm2     = _mm256_shuffle_ps(__tmp2, __tmp3, _MM_SHUFFLE(1, 0, 1, 0)); \
  45         ymm3     = _mm256_shuffle_ps(__tmp2, __tmp3, _MM_SHUFFLE(3, 2, 3, 2)); \
  46 }
  47
  48
  49 static gmx_inline __m256
  50 gmx_mm256_calc_rsq_ps(__m256 dx, __m256 dy, __m256 dz)
  51 {
  52     return _mm256_add_ps( _mm256_add_ps( _mm256_mul_ps(dx, dx), _mm256_mul_ps(dy, dy) ), _mm256_mul_ps(dz, dz) );
  53 }
  54
  55 /* Normal sum of four ymm registers */
  56 #define gmx_mm256_sum4_ps(t0, t1, t2, t3)  _mm256_add_ps(_mm256_add_ps(t0, t1), _mm256_add_ps(t2, t3))
  57
  58
  59 static gmx_inline int
  60 gmx_mm256_any_lt(__m256 a, __m256 b)
  61 {
  62     return _mm256_movemask_ps(_mm256_cmp_ps(a, b, _CMP_LT_OQ));
  63 }
  64
  65
  66 static gmx_inline __m256
  67 gmx_mm256_load_4real_swizzle_ps(const float * gmx_restrict ptrA, const float * gmx_restrict ptrB,
  68                                 const float * gmx_restrict ptrC, const float * gmx_restrict ptrD)
  69 {
  70     __m128 t1, t2;
  71
  72     t1 = _mm_unpacklo_ps(_mm_load_ss(ptrA), _mm_load_ss(ptrC));
  73     t2 = _mm_unpacklo_ps(_mm_load_ss(ptrB), _mm_load_ss(ptrD));
  74     return _mm256_castps128_ps256(_mm_unpacklo_ps(t1, t2));
  75 }
  76
  77
  78 static gmx_inline __m256
  79 gmx_mm256_load_8real_swizzle_ps(const float * gmx_restrict ptrA, const float * gmx_restrict ptrB,
  80                                 const float * gmx_restrict ptrC, const float * gmx_restrict ptrD,
  81                                 const float * gmx_restrict ptrE, const float * gmx_restrict ptrF,
  82                                 const float * gmx_restrict ptrG, const float * gmx_restrict ptrH)
  83 {
  84     __m256 t1, t2;
  85
  86     t1 = gmx_mm256_load_4real_swizzle_ps(ptrA, ptrB, ptrC, ptrD);
  87     t2 = gmx_mm256_load_4real_swizzle_ps(ptrE, ptrF, ptrG, ptrH);
  88
  89     return _mm256_permute2f128_ps(t1, t2, 0x20);
  90 }
  91
  92
  93
  94 static gmx_inline void
  95 gmx_mm256_store_4real_swizzle_ps(float * gmx_restrict ptrA, float * gmx_restrict ptrB,
  96                                  float * gmx_restrict ptrC, float * gmx_restrict ptrD, __m256 xmm1)
  97 {
  98     __m256 t2, t3, t4;
  99
 100     t2       = _mm256_permute_ps(xmm1, _MM_SHUFFLE(1, 1, 1, 1));
 101     t3       = _mm256_permute_ps(xmm1, _MM_SHUFFLE(2, 2, 2, 2));
 102     t4       = _mm256_permute_ps(xmm1, _MM_SHUFFLE(3, 3, 3, 3));
 103     _mm_store_ss(ptrA, _mm256_castps256_ps128(xmm1));
 104     _mm_store_ss(ptrB, _mm256_castps256_ps128(t2));
 105     _mm_store_ss(ptrC, _mm256_castps256_ps128(t3));
 106     _mm_store_ss(ptrD, _mm256_castps256_ps128(t4));
 107 }
 108
 109
 110 static gmx_inline void
 111 gmx_mm256_store_8real_swizzle_ps(float * gmx_restrict ptrA, float * gmx_restrict ptrB,
 112                                  float * gmx_restrict ptrC, float * gmx_restrict ptrD,
 113                                  float * gmx_restrict ptrE, float * gmx_restrict ptrF,
 114                                  float * gmx_restrict ptrG, float * gmx_restrict ptrH, __m256 xmm1)
 115 {
 116     __m256 t1;
 117
 118     t1 = _mm256_permute2f128_ps(xmm1, xmm1, 0x11);
 119
 120     gmx_mm256_store_4real_swizzle_ps(ptrA, ptrB, ptrC, ptrD, xmm1);
 121     gmx_mm256_store_4real_swizzle_ps(ptrE, ptrF, ptrG, ptrH, t1);
 122 }
 123
 124
 125 static gmx_inline void
 126 gmx_mm256_increment_4real_swizzle_ps(float * gmx_restrict ptrA, float * gmx_restrict ptrB,
 127                                      float * gmx_restrict ptrC, float * gmx_restrict ptrD,
 128                                      __m256 xmm1)
 129 {
 130     __m128 t1, t2, t3, t4;
 131
 132     t1   = _mm256_castps256_ps128(xmm1);
 133     t2   = _mm_permute_ps(t1, _MM_SHUFFLE(1, 1, 1, 1));
 134     t3   = _mm_permute_ps(t1, _MM_SHUFFLE(2, 2, 2, 2));
 135     t4   = _mm_permute_ps(t1, _MM_SHUFFLE(3, 3, 3, 3));
 136
 137     t1   = _mm_add_ss(t1, _mm_load_ss(ptrA));
 138     t2   = _mm_add_ss(t2, _mm_load_ss(ptrB));
 139     t3   = _mm_add_ss(t3, _mm_load_ss(ptrC));
 140     t4   = _mm_add_ss(t4, _mm_load_ss(ptrD));
 141
 142     _mm_store_ss(ptrA, t1);
 143     _mm_store_ss(ptrB, t2);
 144     _mm_store_ss(ptrC, t3);
 145     _mm_store_ss(ptrD, t4);
 146 }
 147
 148 static gmx_inline void
 149 gmx_mm256_increment_8real_swizzle_ps(float * gmx_restrict ptrA, float * gmx_restrict ptrB,
 150                                      float * gmx_restrict ptrC, float * gmx_restrict ptrD,
 151                                      float * gmx_restrict ptrE, float * gmx_restrict ptrF,
 152                                      float * gmx_restrict ptrG, float * gmx_restrict ptrH,
 153                                      __m256 xmm1)
 154 {
 155     __m256 t1;
 156
 157     t1 = _mm256_permute2f128_ps(xmm1, xmm1, 0x11);
 158
 159     gmx_mm256_increment_4real_swizzle_ps(ptrA, ptrB, ptrC, ptrD, xmm1);
 160     gmx_mm256_increment_4real_swizzle_ps(ptrE, ptrF, ptrG, ptrH, t1);
 161 }
 162
 163
 164 static gmx_inline void
 165 gmx_mm256_load_4pair_swizzle_ps(const float * gmx_restrict p1, const float * gmx_restrict p2,
 166                                 const float * gmx_restrict p3, const float * gmx_restrict p4,
 167                                 __m256 * gmx_restrict c6, __m256 * gmx_restrict c12)
 168 {
 169     __m128 t1, t2, t3, t4;
 170
 171     t1   = _mm_loadl_pi(_mm_setzero_ps(), (__m64 *)p1); /* - - c12a  c6a */
 172     t2   = _mm_loadl_pi(_mm_setzero_ps(), (__m64 *)p2); /* - - c12b  c6b */
 173     t3   = _mm_loadl_pi(_mm_setzero_ps(), (__m64 *)p3); /* - - c12c  c6c */
 174     t4   = _mm_loadl_pi(_mm_setzero_ps(), (__m64 *)p4); /* - - c12d  c6d */
 175
 176     t1   = _mm_unpacklo_ps(t1, t2);                     /* c12b c12a c6b c6a */
 177     t3   = _mm_unpacklo_ps(t3, t4);                     /* c12d c12c c6d c6c */
 178
 179     *c6  = _mm256_castps128_ps256(_mm_shuffle_ps(t1, t3, _MM_SHUFFLE(1, 0, 1, 0)));
 180     *c12 = _mm256_castps128_ps256(_mm_shuffle_ps(t1, t3, _MM_SHUFFLE(3, 2, 3, 2)));
 181 }
 182
 183 static gmx_inline void
 184 gmx_mm256_load_8pair_swizzle_ps(const float * gmx_restrict p1, const float * gmx_restrict p2,
 185                                 const float * gmx_restrict p3, const float * gmx_restrict p4,
 186                                 const float * gmx_restrict p5, const float * gmx_restrict p6,
 187                                 const float * gmx_restrict p7, const float * gmx_restrict p8,
 188                                 __m256 * gmx_restrict c6, __m256 * gmx_restrict c12)
 189 {
 190     __m256 c6l, c6h, c12l, c12h;
 191
 192     gmx_mm256_load_4pair_swizzle_ps(p1, p2, p3, p4, &c6l, &c12l);
 193     gmx_mm256_load_4pair_swizzle_ps(p5, p6, p7, p8, &c6h, &c12h);
 194
 195     *c6  = _mm256_permute2f128_ps(c6l, c6h, 0x20);
 196     *c12 = _mm256_permute2f128_ps(c12l, c12h, 0x20);
 197 }
 198
 199
 200 static gmx_inline void
 201 gmx_mm256_load_shift_and_1rvec_broadcast_ps(const float * gmx_restrict xyz_shift,
 202                                             const float * gmx_restrict xyz,
 203                                             __m256 * gmx_restrict      x1,
 204                                             __m256 * gmx_restrict      y1,
 205                                             __m256 * gmx_restrict      z1)
 206 {
 207     __m128 t1, t2, t3, t4;
 208
 209     t1   = _mm_loadl_pi(_mm_setzero_ps(), (__m64 *)xyz_shift);
 210     t2   = _mm_loadl_pi(_mm_setzero_ps(), (__m64 *)xyz);
 211     t3   = _mm_load_ss(xyz_shift+2);
 212     t4   = _mm_load_ss(xyz+2);
 213     t1   = _mm_add_ps(t1, t2);
 214     t3   = _mm_add_ss(t3, t4);
 215
 216     t2   = _mm_permute_ps(t1, _MM_SHUFFLE(1, 1, 1, 1));
 217     t1   = _mm_permute_ps(t1, _MM_SHUFFLE(0, 0, 0, 0));
 218     t3   = _mm_permute_ps(t3, _MM_SHUFFLE(0, 0, 0, 0));
 219
 220     *x1  = gmx_mm256_set_m128(t1, t1);
 221     *y1  = gmx_mm256_set_m128(t2, t2);
 222     *z1  = gmx_mm256_set_m128(t3, t3);
 223 }
 224
 225
 226 static gmx_inline void
 227 gmx_mm256_load_shift_and_3rvec_broadcast_ps(const float * gmx_restrict xyz_shift,
 228                                             const float * gmx_restrict xyz,
 229                                             __m256 * gmx_restrict x1, __m256 * gmx_restrict y1, __m256 * gmx_restrict z1,
 230                                             __m256 * gmx_restrict x2, __m256 * gmx_restrict y2, __m256 * gmx_restrict z2,
 231                                             __m256 * gmx_restrict x3, __m256 * gmx_restrict y3, __m256 * gmx_restrict z3)
 232 {
 233     __m128 tA, tB;
 234     __m128 t1, t2, t3, t4, t5, t6, t7, t8, t9;
 235
 236     tA   = _mm_loadl_pi(_mm_setzero_ps(), (__m64 *)xyz_shift);
 237     tB   = _mm_load_ss(xyz_shift+2);
 238
 239     t1   = _mm_loadu_ps(xyz);
 240     t2   = _mm_loadu_ps(xyz+4);
 241     t3   = _mm_load_ss(xyz+8);
 242
 243     tA   = _mm_movelh_ps(tA, tB);
 244     t4   = _mm_permute_ps(tA, _MM_SHUFFLE(0, 2, 1, 0));
 245     t5   = _mm_permute_ps(tA, _MM_SHUFFLE(1, 0, 2, 1));
 246     t6   = _mm_permute_ps(tA, _MM_SHUFFLE(2, 1, 0, 2));
 247
 248     t1   = _mm_add_ps(t1, t4);
 249     t2   = _mm_add_ps(t2, t5);
 250     t3   = _mm_add_ss(t3, t6);
 251
 252     t9   = _mm_permute_ps(t3, _MM_SHUFFLE(0, 0, 0, 0));
 253     t8   = _mm_permute_ps(t2, _MM_SHUFFLE(3, 3, 3, 3));
 254     t7   = _mm_permute_ps(t2, _MM_SHUFFLE(2, 2, 2, 2));
 255     t6   = _mm_permute_ps(t2, _MM_SHUFFLE(1, 1, 1, 1));
 256     t5   = _mm_permute_ps(t2, _MM_SHUFFLE(0, 0, 0, 0));
 257     t4   = _mm_permute_ps(t1, _MM_SHUFFLE(3, 3, 3, 3));
 258     t3   = _mm_permute_ps(t1, _MM_SHUFFLE(2, 2, 2, 2));
 259     t2   = _mm_permute_ps(t1, _MM_SHUFFLE(1, 1, 1, 1));
 260     t1   = _mm_permute_ps(t1, _MM_SHUFFLE(0, 0, 0, 0));
 261
 262     *x1  = gmx_mm256_set_m128(t1, t1);
 263     *y1  = gmx_mm256_set_m128(t2, t2);
 264     *z1  = gmx_mm256_set_m128(t3, t3);
 265     *x2  = gmx_mm256_set_m128(t4, t4);
 266     *y2  = gmx_mm256_set_m128(t5, t5);
 267     *z2  = gmx_mm256_set_m128(t6, t6);
 268     *x3  = gmx_mm256_set_m128(t7, t7);
 269     *y3  = gmx_mm256_set_m128(t8, t8);
 270     *z3  = gmx_mm256_set_m128(t9, t9);
 271 }
 272
 273
 274 static gmx_inline void
 275 gmx_mm256_load_shift_and_4rvec_broadcast_ps(const float * gmx_restrict xyz_shift,
 276                                             const float * gmx_restrict xyz,
 277                                             __m256 * gmx_restrict x1, __m256 * gmx_restrict y1, __m256 * gmx_restrict z1,
 278                                             __m256 * gmx_restrict x2, __m256 * gmx_restrict y2, __m256 * gmx_restrict z2,
 279                                             __m256 * gmx_restrict x3, __m256 * gmx_restrict y3, __m256 * gmx_restrict z3,
 280                                             __m256 * gmx_restrict x4, __m256 * gmx_restrict y4, __m256 * gmx_restrict z4)
 281 {
 282     __m128 tA, tB;
 283     __m128 t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12;
 284
 285     tA   = _mm_loadl_pi(_mm_setzero_ps(), (__m64 *)xyz_shift);
 286     tB   = _mm_load_ss(xyz_shift+2);
 287
 288     t1   = _mm_loadu_ps(xyz);
 289     t2   = _mm_loadu_ps(xyz+4);
 290     t3   = _mm_loadu_ps(xyz+8);
 291
 292     tA   = _mm_movelh_ps(tA, tB);
 293     t4   = _mm_permute_ps(tA, _MM_SHUFFLE(0, 2, 1, 0));
 294     t5   = _mm_permute_ps(tA, _MM_SHUFFLE(1, 0, 2, 1));
 295     t6   = _mm_permute_ps(tA, _MM_SHUFFLE(2, 1, 0, 2));
 296
 297     t1   = _mm_add_ps(t1, t4);
 298     t2   = _mm_add_ps(t2, t5);
 299     t3   = _mm_add_ps(t3, t6);
 300
 301     t12  = _mm_permute_ps(t3, _MM_SHUFFLE(3, 3, 3, 3));
 302     t11  = _mm_permute_ps(t3, _MM_SHUFFLE(2, 2, 2, 2));
 303     t10  = _mm_permute_ps(t3, _MM_SHUFFLE(1, 1, 1, 1));
 304     t9   = _mm_permute_ps(t3, _MM_SHUFFLE(0, 0, 0, 0));
 305     t8   = _mm_permute_ps(t2, _MM_SHUFFLE(3, 3, 3, 3));
 306     t7   = _mm_permute_ps(t2, _MM_SHUFFLE(2, 2, 2, 2));
 307     t6   = _mm_permute_ps(t2, _MM_SHUFFLE(1, 1, 1, 1));
 308     t5   = _mm_permute_ps(t2, _MM_SHUFFLE(0, 0, 0, 0));
 309     t4   = _mm_permute_ps(t1, _MM_SHUFFLE(3, 3, 3, 3));
 310     t3   = _mm_permute_ps(t1, _MM_SHUFFLE(2, 2, 2, 2));
 311     t2   = _mm_permute_ps(t1, _MM_SHUFFLE(1, 1, 1, 1));
 312     t1   = _mm_permute_ps(t1, _MM_SHUFFLE(0, 0, 0, 0));
 313
 314     *x1  = gmx_mm256_set_m128(t1, t1);
 315     *y1  = gmx_mm256_set_m128(t2, t2);
 316     *z1  = gmx_mm256_set_m128(t3, t3);
 317     *x2  = gmx_mm256_set_m128(t4, t4);
 318     *y2  = gmx_mm256_set_m128(t5, t5);
 319     *z2  = gmx_mm256_set_m128(t6, t6);
 320     *x3  = gmx_mm256_set_m128(t7, t7);
 321     *y3  = gmx_mm256_set_m128(t8, t8);
 322     *z3  = gmx_mm256_set_m128(t9, t9);
 323     *x4  = gmx_mm256_set_m128(t10, t10);
 324     *y4  = gmx_mm256_set_m128(t11, t11);
 325     *z4  = gmx_mm256_set_m128(t12, t12);
 326 }
 327
 328
 329
 330 static gmx_inline void
 331 gmx_mm256_load_1rvec_4ptr_swizzle_ps(const float * gmx_restrict ptrA, const float * gmx_restrict ptrB,
 332                                      const float * gmx_restrict ptrC, const float * gmx_restrict ptrD,
 333                                      __m256 * gmx_restrict x1, __m256 * gmx_restrict y1, __m256 * gmx_restrict z1)
 334 {
 335     __m128  t1, t2, t3, t4;
 336     __m128i mask = _mm_set_epi32(0, -1, -1, -1);
 337     t1             = gmx_mm_maskload_ps(ptrA, mask);
 338     t2             = gmx_mm_maskload_ps(ptrB, mask);
 339     t3             = gmx_mm_maskload_ps(ptrC, mask);
 340     t4             = gmx_mm_maskload_ps(ptrD, mask);
 341     _MM_TRANSPOSE4_PS(t1, t2, t3, t4);
 342     *x1           = _mm256_castps128_ps256(t1);
 343     *y1           = _mm256_castps128_ps256(t2);
 344     *z1           = _mm256_castps128_ps256(t3);
 345 }
 346
 347
 348 static gmx_inline void
 349 gmx_mm256_load_3rvec_4ptr_swizzle_ps(const float * gmx_restrict ptrA, const float * gmx_restrict ptrB,
 350                                      const float * gmx_restrict ptrC, const float * gmx_restrict ptrD,
 351                                      __m256 * gmx_restrict x1, __m256 * gmx_restrict y1, __m256 * gmx_restrict z1,
 352                                      __m256 * gmx_restrict x2, __m256 * gmx_restrict y2, __m256 * gmx_restrict z2,
 353                                      __m256 * gmx_restrict x3, __m256 * gmx_restrict y3, __m256 * gmx_restrict z3)
 354 {
 355     __m128 t1, t2, t3, t4;
 356     t1            = _mm_loadu_ps(ptrA);
 357     t2            = _mm_loadu_ps(ptrB);
 358     t3            = _mm_loadu_ps(ptrC);
 359     t4            = _mm_loadu_ps(ptrD);
 360     _MM_TRANSPOSE4_PS(t1, t2, t3, t4);
 361     *x1           = _mm256_castps128_ps256(t1);
 362     *y1           = _mm256_castps128_ps256(t2);
 363     *z1           = _mm256_castps128_ps256(t3);
 364     *x2           = _mm256_castps128_ps256(t4);
 365     t1            = _mm_loadu_ps(ptrA+4);
 366     t2            = _mm_loadu_ps(ptrB+4);
 367     t3            = _mm_loadu_ps(ptrC+4);
 368     t4            = _mm_loadu_ps(ptrD+4);
 369     _MM_TRANSPOSE4_PS(t1, t2, t3, t4);
 370     *y2           = _mm256_castps128_ps256(t1);
 371     *z2           = _mm256_castps128_ps256(t2);
 372     *x3           = _mm256_castps128_ps256(t3);
 373     *y3           = _mm256_castps128_ps256(t4);
 374     t1            = _mm_load_ss(ptrA+8);
 375     t2            = _mm_load_ss(ptrB+8);
 376     t3            = _mm_load_ss(ptrC+8);
 377     t4            = _mm_load_ss(ptrD+8);
 378     t1            = _mm_unpacklo_ps(t1, t3);
 379     t3            = _mm_unpacklo_ps(t2, t4);
 380     *z3           = _mm256_castps128_ps256(_mm_unpacklo_ps(t1, t3));
 381 }
 382
 383
 384
 385 static gmx_inline void
 386 gmx_mm256_load_4rvec_4ptr_swizzle_ps(const float * gmx_restrict ptrA, const float * gmx_restrict ptrB,
 387                                      const float * gmx_restrict ptrC, const float * gmx_restrict ptrD,
 388                                      __m256 * gmx_restrict x1, __m256 * gmx_restrict y1, __m256 * gmx_restrict z1,
 389                                      __m256 * gmx_restrict x2, __m256 * gmx_restrict y2, __m256 * gmx_restrict z2,
 390                                      __m256 * gmx_restrict x3, __m256 * gmx_restrict y3, __m256 * gmx_restrict z3,
 391                                      __m256 * gmx_restrict x4, __m256 * gmx_restrict y4, __m256 * gmx_restrict z4)
 392 {
 393     __m128 t1, t2, t3, t4;
 394     t1            = _mm_loadu_ps(ptrA);
 395     t2            = _mm_loadu_ps(ptrB);
 396     t3            = _mm_loadu_ps(ptrC);
 397     t4            = _mm_loadu_ps(ptrD);
 398     _MM_TRANSPOSE4_PS(t1, t2, t3, t4);
 399     *x1           = _mm256_castps128_ps256(t1);
 400     *y1           = _mm256_castps128_ps256(t2);
 401     *z1           = _mm256_castps128_ps256(t3);
 402     *x2           = _mm256_castps128_ps256(t4);
 403     t1            = _mm_loadu_ps(ptrA+4);
 404     t2            = _mm_loadu_ps(ptrB+4);
 405     t3            = _mm_loadu_ps(ptrC+4);
 406     t4            = _mm_loadu_ps(ptrD+4);
 407     _MM_TRANSPOSE4_PS(t1, t2, t3, t4);
 408     *y2           = _mm256_castps128_ps256(t1);
 409     *z2           = _mm256_castps128_ps256(t2);
 410     *x3           = _mm256_castps128_ps256(t3);
 411     *y3           = _mm256_castps128_ps256(t4);
 412     t1            = _mm_loadu_ps(ptrA+8);
 413     t2            = _mm_loadu_ps(ptrB+8);
 414     t3            = _mm_loadu_ps(ptrC+8);
 415     t4            = _mm_loadu_ps(ptrD+8);
 416     _MM_TRANSPOSE4_PS(t1, t2, t3, t4);
 417     *z3           = _mm256_castps128_ps256(t1);
 418     *x4           = _mm256_castps128_ps256(t2);
 419     *y4           = _mm256_castps128_ps256(t3);
 420     *z4           = _mm256_castps128_ps256(t4);
 421 }
 422
 423
 424 static gmx_inline void
 425 gmx_mm256_load_1rvec_8ptr_swizzle_ps(const float * gmx_restrict ptrA, const float * gmx_restrict ptrB,
 426                                      const float * gmx_restrict ptrC, const float * gmx_restrict ptrD,
 427                                      const float * gmx_restrict ptrE, const float * gmx_restrict ptrF,
 428                                      const float * gmx_restrict ptrG, const float * gmx_restrict ptrH,
 429                                      __m256 * gmx_restrict x1, __m256 * gmx_restrict y1, __m256 * gmx_restrict z1)
 430 {
 431     __m256  t1, t2, t3, t4, t5, t6, t7, t8;
 432     __m128i mask = _mm_set_epi32(0, -1, -1, -1);
 433
 434     t1             = gmx_mm256_set_m128(gmx_mm_maskload_ps(ptrE, mask), gmx_mm_maskload_ps(ptrA, mask)); /*  - zE yE xE |  - zA yA xA */
 435     t2             = gmx_mm256_set_m128(gmx_mm_maskload_ps(ptrF, mask), gmx_mm_maskload_ps(ptrB, mask)); /*  - zF yF xF |  - zB yB xB */
 436     t3             = gmx_mm256_set_m128(gmx_mm_maskload_ps(ptrG, mask), gmx_mm_maskload_ps(ptrC, mask)); /*  - zG yG xG |  - zC yC xC */
 437     t4             = gmx_mm256_set_m128(gmx_mm_maskload_ps(ptrH, mask), gmx_mm_maskload_ps(ptrD, mask)); /*  - zH yH xH |  - zD yD xD */
 438
 439     t5            = _mm256_unpacklo_ps(t1, t2);                                                          /* yF yE xF xE | yB yA xB xA */
 440     t6            = _mm256_unpacklo_ps(t3, t4);                                                          /* yH yG xH xG | yD yC xD xC */
 441     t7            = _mm256_unpackhi_ps(t1, t2);                                                          /*  -  - zF zE |  -  - zB zA */
 442     t8            = _mm256_unpackhi_ps(t3, t4);                                                          /*  -  - zH zG |  -  - zD zC */
 443
 444     *x1           = _mm256_shuffle_ps(t5, t6, _MM_SHUFFLE(1, 0, 1, 0));
 445     *y1           = _mm256_shuffle_ps(t5, t6, _MM_SHUFFLE(3, 2, 3, 2));
 446     *z1           = _mm256_shuffle_ps(t7, t8, _MM_SHUFFLE(1, 0, 1, 0));
 447 }
 448
 449
 450 static gmx_inline void
 451 gmx_mm256_load_3rvec_8ptr_swizzle_ps(const float * gmx_restrict ptrA, const float * gmx_restrict ptrB,
 452                                      const float * gmx_restrict ptrC, const float * gmx_restrict ptrD,
 453                                      const float * gmx_restrict ptrE, const float * gmx_restrict ptrF,
 454                                      const float * gmx_restrict ptrG, const float * gmx_restrict ptrH,
 455                                      __m256 * gmx_restrict x1, __m256 * gmx_restrict y1, __m256 * gmx_restrict z1,
 456                                      __m256 * gmx_restrict x2, __m256 * gmx_restrict y2, __m256 * gmx_restrict z2,
 457                                      __m256 * gmx_restrict x3, __m256 * gmx_restrict y3, __m256 * gmx_restrict z3)
 458 {
 459     __m256 t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12;
 460
 461     t1           = _mm256_loadu_ps(ptrA);                                /* y3a x3a z2a y2a | x2a z1a y1a x1a */
 462     t2           = _mm256_loadu_ps(ptrB);                                /* y3b x3b z2b y2b | x2b z1b y1b x1b */
 463     t3           = _mm256_loadu_ps(ptrC);                                /* y3c x3c z2c y2c | x2c z1c y1c x1c */
 464     t4           = _mm256_loadu_ps(ptrD);                                /* y3d x3d z2d y2d | x2d z1d y1d x1d */
 465     t5           = _mm256_loadu_ps(ptrE);                                /* y3e x3e z2e y2e | x2e z1e y1e x1e */
 466     t6           = _mm256_loadu_ps(ptrF);                                /* y3f x3f z2f y2f | x2f z1f y1f x1f */
 467     t7           = _mm256_loadu_ps(ptrG);                                /* y3g x3g z2g y2g | x2g z1g y1g x1g */
 468     t8           = _mm256_loadu_ps(ptrH);                                /* y3h x3h z2h y2h | x2h z1h y1h x1h */
 469
 470     t9           = _mm256_unpacklo_ps(t1, t2);                           /* z2b z2a y2b y2a | y1b y1a x1b x1a */
 471     t10          = _mm256_unpackhi_ps(t1, t2);                           /* y3b y3a x3b x3a | x2b x2a z1b z1a */
 472     t11          = _mm256_unpacklo_ps(t3, t4);                           /* z2d z2c y2d y2c | y1d y1c x1d x1c */
 473     t12          = _mm256_unpackhi_ps(t3, t4);                           /* y3d y3c x3d x3c | x2d x2c z1d z1c */
 474     t1           = _mm256_unpacklo_ps(t5, t6);                           /* z2f z2e y2f y2e | y1f y1e x1f x1e */
 475     t2           = _mm256_unpackhi_ps(t5, t6);                           /* y3f y3e x3f x3e | x2f x2e z1f z1e */
 476     t3           = _mm256_unpacklo_ps(t7, t8);                           /* z2h z2g y2h y2g | y1h y1g x1h x1g */
 477     t4           = _mm256_unpackhi_ps(t7, t8);                           /* y3h y3g x3h x3g | x2h x2g z1h z1g */
 478
 479     t5           = _mm256_shuffle_ps(t9, t11, _MM_SHUFFLE(1, 0, 1, 0));  /* y2d y2c y2b y2a | x1d x1c x1b x1a */
 480     t6           = _mm256_shuffle_ps(t9, t11, _MM_SHUFFLE(3, 2, 3, 2));  /* z2d z2c z2b z2a | y1d y1c y1b y1a */
 481     t7           = _mm256_shuffle_ps(t10, t12, _MM_SHUFFLE(1, 0, 1, 0)); /* x3d x3c x3b x3a | z1d z1c z1b z1a */
 482     t8           = _mm256_shuffle_ps(t10, t12, _MM_SHUFFLE(3, 2, 3, 2)); /* y3d y3c y3b y3a | x2d x2c x2b x2a */
 483
 484     t9           = _mm256_shuffle_ps(t1, t3, _MM_SHUFFLE(1, 0, 1, 0));   /* y2h y2g y2f y2e | x1h x1g x1f x1e */
 485     t10          = _mm256_shuffle_ps(t1, t3, _MM_SHUFFLE(3, 2, 3, 2));   /* z2h z2g z2f z2e | y1h y1g y1f y1e */
 486     t11          = _mm256_shuffle_ps(t2, t4, _MM_SHUFFLE(1, 0, 1, 0));   /* x3h x3g x3f x3e | z1h z1g z1f z1e */
 487     t12          = _mm256_shuffle_ps(t2, t4, _MM_SHUFFLE(3, 2, 3, 2));   /* y3h y3g y3f y3e | x2h x2g x2f x2e */
 488
 489     *x1          = _mm256_permute2f128_ps(t5, t9,  0x20);
 490     *y1          = _mm256_permute2f128_ps(t6, t10, 0x20);
 491     *z1          = _mm256_permute2f128_ps(t7, t11, 0x20);
 492     *x2          = _mm256_permute2f128_ps(t8, t12, 0x20);
 493
 494     *y2          = _mm256_permute2f128_ps(t5, t9,  0x31);
 495     *z2          = _mm256_permute2f128_ps(t6, t10, 0x31);
 496     *x3          = _mm256_permute2f128_ps(t7, t11, 0x31);
 497     *y3          = _mm256_permute2f128_ps(t8, t12, 0x31);
 498
 499     t1           = gmx_mm256_set_m128(_mm_load_ss(ptrE+8), _mm_load_ss(ptrA+8));
 500     t2           = gmx_mm256_set_m128(_mm_load_ss(ptrF+8), _mm_load_ss(ptrB+8));
 501     t3           = gmx_mm256_set_m128(_mm_load_ss(ptrG+8), _mm_load_ss(ptrC+8));
 502     t4           = gmx_mm256_set_m128(_mm_load_ss(ptrH+8), _mm_load_ss(ptrD+8));
 503
 504     t1           = _mm256_unpacklo_ps(t1, t3);  /*  -   -  z3g z3e |  -   -  z3c z3a */
 505     t2           = _mm256_unpacklo_ps(t2, t4);  /*  -   -  z3h z3f |  -   -  z3d z3b */
 506
 507     *z3          = _mm256_unpacklo_ps(t1, t2);
 508 }
 509
 510
 511
 512 static gmx_inline void
 513 gmx_mm256_load_4rvec_8ptr_swizzle_ps(const float * gmx_restrict ptrA, const float * gmx_restrict ptrB,
 514                                      const float * gmx_restrict ptrC, const float * gmx_restrict ptrD,
 515                                      const float * gmx_restrict ptrE, const float * gmx_restrict ptrF,
 516                                      const float * gmx_restrict ptrG, const float * gmx_restrict ptrH,
 517                                      __m256 * gmx_restrict x1, __m256 * gmx_restrict y1, __m256 * gmx_restrict z1,
 518                                      __m256 * gmx_restrict x2, __m256 * gmx_restrict y2, __m256 * gmx_restrict z2,
 519                                      __m256 * gmx_restrict x3, __m256 * gmx_restrict y3, __m256 * gmx_restrict z3,
 520                                      __m256 * gmx_restrict x4, __m256 * gmx_restrict y4, __m256 * gmx_restrict z4)
 521 {
 522     __m256 t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12;
 523
 524     t1           = _mm256_loadu_ps(ptrA);                                /* y3a x3a z2a y2a | x2a z1a y1a x1a */
 525     t2           = _mm256_loadu_ps(ptrB);                                /* y3b x3b z2b y2b | x2b z1b y1b x1b */
 526     t3           = _mm256_loadu_ps(ptrC);                                /* y3c x3c z2c y2c | x2c z1c y1c x1c */
 527     t4           = _mm256_loadu_ps(ptrD);                                /* y3d x3d z2d y2d | x2d z1d y1d x1d */
 528     t5           = _mm256_loadu_ps(ptrE);                                /* y3e x3e z2e y2e | x2e z1e y1e x1e */
 529     t6           = _mm256_loadu_ps(ptrF);                                /* y3f x3f z2f y2f | x2f z1f y1f x1f */
 530     t7           = _mm256_loadu_ps(ptrG);                                /* y3g x3g z2g y2g | x2g z1g y1g x1g */
 531     t8           = _mm256_loadu_ps(ptrH);                                /* y3h x3h z2h y2h | x2h z1h y1h x1h */
 532
 533     t9           = _mm256_unpacklo_ps(t1, t2);                           /* z2b z2a y2b y2a | y1b y1a x1b x1a */
 534     t10          = _mm256_unpackhi_ps(t1, t2);                           /* y3b y3a x3b x3a | x2b x2a z1b z1a */
 535     t11          = _mm256_unpacklo_ps(t3, t4);                           /* z2d z2c y2d y2c | y1d y1c x1d x1c */
 536     t12          = _mm256_unpackhi_ps(t3, t4);                           /* y3d y3c x3d x3c | x2d x2c z1d z1c */
 537     t1           = _mm256_unpacklo_ps(t5, t6);                           /* z2f z2e y2f y2e | y1f y1e x1f x1e */
 538     t2           = _mm256_unpackhi_ps(t5, t6);                           /* y3f y3e x3f x3e | x2f x2e z1f z1e */
 539     t3           = _mm256_unpacklo_ps(t7, t8);                           /* z2h z2g y2h y2g | y1h y1g x1h x1g */
 540     t4           = _mm256_unpackhi_ps(t7, t8);                           /* y3h y3g x3h x3g | x2h x2g z1h z1g */
 541
 542     t5           = _mm256_shuffle_ps(t9, t11, _MM_SHUFFLE(1, 0, 1, 0));  /* y2d y2c y2b y2a | x1d x1c x1b x1a */
 543     t6           = _mm256_shuffle_ps(t9, t11, _MM_SHUFFLE(3, 2, 3, 2));  /* z2d z2c z2b z2a | y1d y1c y1b y1a */
 544     t7           = _mm256_shuffle_ps(t10, t12, _MM_SHUFFLE(1, 0, 1, 0)); /* x3d x3c x3b x3a | z1d z1c z1b z1a */
 545     t8           = _mm256_shuffle_ps(t10, t12, _MM_SHUFFLE(3, 2, 3, 2)); /* y3d y3c y3b y3a | x2d x2c x2b x2a */
 546     t9           = _mm256_shuffle_ps(t1, t3, _MM_SHUFFLE(1, 0, 1, 0));   /* y2h y2g y2f y2e | x1h x1g x1f x1e */
 547     t10          = _mm256_shuffle_ps(t1, t3, _MM_SHUFFLE(3, 2, 3, 2));   /* z2h z2g z2f z2e | y1h y1g y1f y1e */
 548     t11          = _mm256_shuffle_ps(t2, t4, _MM_SHUFFLE(1, 0, 1, 0));   /* x3h x3g x3f x3e | z1h z1g z1f z1e */
 549     t12          = _mm256_shuffle_ps(t2, t4, _MM_SHUFFLE(3, 2, 3, 2));   /* y3h y3g y3f y3e | x2h x2g x2f x2e */
 550
 551     *x1          = _mm256_permute2f128_ps(t5, t9,  0x20);
 552     *y1          = _mm256_permute2f128_ps(t6, t10, 0x20);
 553     *z1          = _mm256_permute2f128_ps(t7, t11, 0x20);
 554     *x2          = _mm256_permute2f128_ps(t8, t12, 0x20);
 555
 556     *y2          = _mm256_permute2f128_ps(t5, t9,  0x31);
 557     *z2          = _mm256_permute2f128_ps(t6, t10, 0x31);
 558     *x3          = _mm256_permute2f128_ps(t7, t11, 0x31);
 559     *y3          = _mm256_permute2f128_ps(t8, t12, 0x31);
 560
 561     t1           = gmx_mm256_set_m128(_mm_loadu_ps(ptrE+8), _mm_loadu_ps(ptrA+8)); /* z4e y4e x4e z3e | z4a y4a x4a z3a */
 562     t2           = gmx_mm256_set_m128(_mm_loadu_ps(ptrF+8), _mm_loadu_ps(ptrB+8)); /* z4f y4f x4f z3f | z4b y4b x4b z3b */
 563     t3           = gmx_mm256_set_m128(_mm_loadu_ps(ptrG+8), _mm_loadu_ps(ptrC+8)); /* z4g y4g x4g z3g | z4c y4c x4c z3c */
 564     t4           = gmx_mm256_set_m128(_mm_loadu_ps(ptrH+8), _mm_loadu_ps(ptrD+8)); /* z4h y4h x4h z3h | z4d y4d x4d z3d */
 565
 566     t5           = _mm256_unpacklo_ps(t1, t2);                                     /* x4f x4e z3f z3e | x4b x4a z3b z3a */
 567     t6           = _mm256_unpackhi_ps(t1, t2);                                     /* z4f z4e y4f y4e | z4b z4a y4b y4a */
 568     t7           = _mm256_unpacklo_ps(t3, t4);                                     /* x4h x4g z3h z3g | x4d x4c z3d z3c */
 569     t8           = _mm256_unpackhi_ps(t3, t4);                                     /* z4h z4g y4h y4g | z4d z4c y4d y4c */
 570
 571     *z3          = _mm256_shuffle_ps(t5, t7, _MM_SHUFFLE(1, 0, 1, 0));             /* z3h z3g z3f z3e | z3d z3c z3b z3a */
 572     *x4          = _mm256_shuffle_ps(t5, t7, _MM_SHUFFLE(3, 2, 3, 2));             /* x4h x4g x4f x4e | x4d x4c x4b x4a */
 573     *y4          = _mm256_shuffle_ps(t6, t8, _MM_SHUFFLE(1, 0, 1, 0));             /* y4h y4g y4f y4e | y4d y4c y4b y4a */
 574     *z4          = _mm256_shuffle_ps(t6, t8, _MM_SHUFFLE(3, 2, 3, 2));             /* z4h z4g z4f z4e | z4d z4c z4b z4a */
 575 }
 576
 577
 578 static gmx_inline void
 579 gmx_mm256_decrement_1rvec_4ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx_restrict ptrB,
 580                                           float * gmx_restrict ptrC, float * gmx_restrict ptrD,
 581                                           __m256 x1, __m256 y1, __m256 z1)
 582 {
         /* Decrement one xyz triplet in memory at each of four addresses.
          *
          * Only the lower 128 bits of x1/y1/z1 are used (elements a..d).
          * (x1a, y1a, z1a) is subtracted from the three floats at ptrA,
          * (x1b, y1b, z1b) from those at ptrB, and likewise c -> ptrC, d -> ptrD.
          *
          * Loads and stores go through gmx_mm_maskload_ps/gmx_mm_maskstore_ps with
          * a three-element mask. Those helpers are defined outside this section;
          * presumably they follow the AVX maskload/maskstore semantics, in which
          * case the fourth float after each pointer is neither read nor written.
          *
          * All four loads are issued before the first store, so the destinations
          * are expected not to overlap (the pointers are declared gmx_restrict).
          */
 583     __m128  t1, t2, t3, t4, t5, t6, t7, t8;
 584     __m128i mask;
 585
 586     /* Construct a mask without executing any data loads.
          * The cmpeq of zero with zero yields all ones; blend immediate 0x3F takes
          * the six lowest 16-bit words from that and the top two from zero, i.e.
          * the three lowest 32-bit elements are set and the fourth is clear.
          */
 587     mask        = _mm_blend_epi16(_mm_setzero_si128(), _mm_cmpeq_epi16(_mm_setzero_si128(), _mm_setzero_si128()), 0x3F);
 588
         /* Transpose: build one (x, y, z, don't-care) vector per destination */
 589     t3          = _mm_unpacklo_ps(_mm256_castps256_ps128(x1), _mm256_castps256_ps128(y1)); /* y1b x1b y1a x1a */
 590     t4          = _mm_unpackhi_ps(_mm256_castps256_ps128(x1), _mm256_castps256_ps128(y1)); /* y1d x1d y1c x1c */
 591
 592     t1          = _mm_shuffle_ps(t3, _mm256_castps256_ps128(z1), _MM_SHUFFLE(0, 0, 1, 0)); /*  -  z1a y1a x1a */
 593     t2          = _mm_shuffle_ps(t3, _mm256_castps256_ps128(z1), _MM_SHUFFLE(0, 1, 3, 2)); /*  -  z1b y1b x1b */
 594     t3          = _mm_shuffle_ps(t4, _mm256_castps256_ps128(z1), _MM_SHUFFLE(0, 2, 1, 0)); /*  -  z1c y1c x1c */
 595     t4          = _mm_shuffle_ps(t4, _mm256_castps256_ps128(z1), _MM_SHUFFLE(0, 3, 3, 2)); /*  -  z1d y1d x1d */
 596
         /* Masked read-modify-write of the three floats at each pointer */
 597     t5          = gmx_mm_maskload_ps(ptrA, mask);
 598     t6          = gmx_mm_maskload_ps(ptrB, mask);
 599     t7          = gmx_mm_maskload_ps(ptrC, mask);
 600     t8          = gmx_mm_maskload_ps(ptrD, mask);
 601
 602     t5          = _mm_sub_ps(t5, t1);
 603     t6          = _mm_sub_ps(t6, t2);
 604     t7          = _mm_sub_ps(t7, t3);
 605     t8          = _mm_sub_ps(t8, t4);
 606
 607     gmx_mm_maskstore_ps(ptrA, mask, t5);
 608     gmx_mm_maskstore_ps(ptrB, mask, t6);
 609     gmx_mm_maskstore_ps(ptrC, mask, t7);
 610     gmx_mm_maskstore_ps(ptrD, mask, t8);
 611 }
 612
 613 #if defined (_MSC_VER) && defined(_M_IX86)
 614 /* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters.
      *
      * For each of the four addresses ptrA..ptrD, subtracts three xyz triplets
      * in place (nine consecutive floats: x1 y1 z1 x2 y2 z2 x3 y3 z3). Only the
      * lower 128 bits of the register arguments are used: element a is applied
      * at ptrA, b at ptrB, c at ptrC and d at ptrD.
      *
      * Caveats that follow from this being a macro:
      *  - x1, y1, z1, x2, y2, z2 and x3 are assigned to and used as scratch, so
      *    they must be modifiable lvalues and no longer hold their original
      *    values afterwards (y3 and z3 are only read).
      *  - Each pointer argument is expanded several times and appears as
      *    "ptr+8" without parentheses, so pass simple, side-effect-free
      *    pointer expressions.
      *  - The expansion is a bare { } block, not an expression.
      */
 615 #define gmx_mm256_decrement_3rvec_4ptr_swizzle_ps(ptrA, ptrB, ptrC, ptrD, \
 616                                                   x1, y1, z1, x2, y2, z2, x3, y3, z3) \
 617     { \
 618         __m256 _t1, _t2, _t3, _t4, _t5, _t6; \
 619         __m128 _tA, _tB, _tC, _tD; \
 620 \
 621         _t1         = _mm256_loadu_ps(ptrA); /* floats 0-7 at each address */ \
 622         _t2         = _mm256_loadu_ps(ptrB); \
 623         _t3         = _mm256_loadu_ps(ptrC); \
 624         _t4         = _mm256_loadu_ps(ptrD); \
 625         _tA         = _mm_load_ss(ptrA+8); /* float 8 (z3) is handled as a scalar */ \
 626         _tB         = _mm_load_ss(ptrB+8); \
 627         _tC         = _mm_load_ss(ptrC+8); \
 628         _tD         = _mm_load_ss(ptrD+8); \
 629         _t5         = _mm256_unpacklo_ps(x1, y1); /* - - - - | y1b x1b y1a x1a */ \
 630         x1          = _mm256_unpackhi_ps(x1, y1); /* - - - - | y1d x1d y1c x1c */ \
 631         y1          = _mm256_unpacklo_ps(z1, x2); /* - - - - | x2b z1b x2a z1a */ \
 632         z1          = _mm256_unpackhi_ps(z1, x2); /* - - - - | x2d z1d x2c z1c */ \
 633         x2          = _mm256_unpacklo_ps(y2, z2); /* - - - - | z2b y2b z2a y2a */ \
 634         y2          = _mm256_unpackhi_ps(y2, z2); /* - - - - | z2d y2d z2c y2c */ \
 635         _t6         = _mm256_unpacklo_ps(x3, y3); /* - - - - | y3b x3b y3a x3a */ \
 636         x3          = _mm256_unpackhi_ps(x3, y3); /* - - - - | y3d x3d y3c x3c */ \
 637         _t5         = _mm256_insertf128_ps(_t5, _mm256_castps256_ps128(x2), 0x1); \
 638         x1          = _mm256_insertf128_ps(x1, _mm256_castps256_ps128(y2), 0x1); \
 639         y1          = _mm256_insertf128_ps(y1, _mm256_castps256_ps128(_t6), 0x1); \
 640         z1          = _mm256_insertf128_ps(z1, _mm256_castps256_ps128(x3), 0x1); \
 641         z2          = _mm256_shuffle_ps(_t5, y1, _MM_SHUFFLE(1, 0, 1, 0)); /* y3a x3a z2a y2a | x2a z1a y1a x1a */ \
 642         _t5         = _mm256_shuffle_ps(_t5, y1, _MM_SHUFFLE(3, 2, 3, 2)); /* y3b x3b z2b y2b | x2b z1b y1b x1b */ \
 643         y1          = _mm256_shuffle_ps(x1, z1, _MM_SHUFFLE(1, 0, 1, 0)); /* y3c x3c z2c y2c | x2c z1c y1c x1c */ \
 644         x1          = _mm256_shuffle_ps(x1, z1, _MM_SHUFFLE(3, 2, 3, 2)); /* y3d x3d z2d y2d | x2d z1d y1d x1d */ \
 645         _t1         = _mm256_sub_ps(_t1, z2); \
 646         _t2         = _mm256_sub_ps(_t2, _t5); \
 647         _t3         = _mm256_sub_ps(_t3, y1); \
 648         _t4         = _mm256_sub_ps(_t4, x1); \
 649         _tA         = _mm_sub_ss(_tA, _mm256_castps256_ps128(z3)); \
 650         _tB         = _mm_sub_ss(_tB, _mm_permute_ps(_mm256_castps256_ps128(z3), _MM_SHUFFLE(1, 1, 1, 1))); \
 651         _tC         = _mm_sub_ss(_tC, _mm_permute_ps(_mm256_castps256_ps128(z3), _MM_SHUFFLE(2, 2, 2, 2))); \
 652         _tD         = _mm_sub_ss(_tD, _mm_permute_ps(_mm256_castps256_ps128(z3), _MM_SHUFFLE(3, 3, 3, 3))); \
 653         _mm256_storeu_ps(ptrA, _t1); \
 654         _mm256_storeu_ps(ptrB, _t2); \
 655         _mm256_storeu_ps(ptrC, _t3); \
 656         _mm256_storeu_ps(ptrD, _t4); \
 657         _mm_store_ss(ptrA+8, _tA); \
 658         _mm_store_ss(ptrB+8, _tB); \
 659         _mm_store_ss(ptrC+8, _tC); \
 660         _mm_store_ss(ptrD+8, _tD); \
 661     }
 662 #else
 663 /* Real function for sane compilers.
      *
      * For each of the four addresses ptrA..ptrD, subtracts three xyz triplets
      * in place (nine consecutive floats: x1 y1 z1 x2 y2 z2 x3 y3 z3). Only the
      * lower 128 bits of the register arguments are used: element a is applied
      * at ptrA, b at ptrB, c at ptrC and d at ptrD.
      *
      * Floats 0-7 at each address are updated with one unaligned 256-bit
      * load/subtract/store; float 8 (z3) is updated separately as a scalar.
      *
      * All loads are issued before the first store, so the four destinations
      * are expected not to overlap (the pointers are declared gmx_restrict).
      */
 664 static gmx_inline void
 665 gmx_mm256_decrement_3rvec_4ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx_restrict ptrB,
 666                                           float * gmx_restrict ptrC, float * gmx_restrict ptrD,
 667                                           __m256 x1, __m256 y1, __m256 z1,
 668                                           __m256 x2, __m256 y2, __m256 z2,
 669                                           __m256 x3, __m256 y3, __m256 z3)
 670 {
 671     __m256 t1, t2, t3, t4, t5, t6;
 672     __m128 tA, tB, tC, tD;
 673
         /* Current memory contents: floats 0-7 as 256 bits, float 8 as a scalar */
 674     t1          = _mm256_loadu_ps(ptrA);
 675     t2          = _mm256_loadu_ps(ptrB);
 676     t3          = _mm256_loadu_ps(ptrC);
 677     t4          = _mm256_loadu_ps(ptrD);
 678     tA          = _mm_load_ss(ptrA+8);
 679     tB          = _mm_load_ss(ptrB+8);
 680     tC          = _mm_load_ss(ptrC+8);
 681     tD          = _mm_load_ss(ptrD+8);
 682
         /* The by-value parameters are reused as scratch registers from here on.
          * Upper halves are marked "-" because only elements a..d are consumed.
          */
 683     t5          = _mm256_unpacklo_ps(x1, y1);                                /* - - - - | y1b x1b y1a x1a */
 684     x1          = _mm256_unpackhi_ps(x1, y1);                                /* - - - - | y1d x1d y1c x1c */
 685     y1          = _mm256_unpacklo_ps(z1, x2);                                /* - - - - | x2b z1b x2a z1a */
 686     z1          = _mm256_unpackhi_ps(z1, x2);                                /* - - - - | x2d z1d x2c z1c */
 687
 688     x2          = _mm256_unpacklo_ps(y2, z2);                                /* - - - - | z2b y2b z2a y2a */
 689     y2          = _mm256_unpackhi_ps(y2, z2);                                /* - - - - | z2d y2d z2c y2c */
 690     t6          = _mm256_unpacklo_ps(x3, y3);                                /* - - - - | y3b x3b y3a x3a */
 691     x3          = _mm256_unpackhi_ps(x3, y3);                                /* - - - - | y3d x3d y3c x3c */
 692
         /* Pack two 128-bit partial results into each 256-bit register */
 693     t5          = _mm256_insertf128_ps(t5, _mm256_castps256_ps128(x2), 0x1); /* z2b y2b z2a y2a | y1b x1b y1a x1a */
 694     x1          = _mm256_insertf128_ps(x1, _mm256_castps256_ps128(y2), 0x1); /* z2d y2d z2c y2c | y1d x1d y1c x1c */
 695
 696     y1          = _mm256_insertf128_ps(y1, _mm256_castps256_ps128(t6), 0x1); /* y3b x3b y3a x3a | x2b z1b x2a z1a */
 697     z1          = _mm256_insertf128_ps(z1, _mm256_castps256_ps128(x3), 0x1); /* y3d x3d y3c x3c | x2d z1d x2c z1c */
 698
         /* One register per destination, in memory order (floats 0-7) */
 699     z2          = _mm256_shuffle_ps(t5, y1, _MM_SHUFFLE(1, 0, 1, 0));        /* y3a x3a z2a y2a | x2a z1a y1a x1a */
 700     t5          = _mm256_shuffle_ps(t5, y1, _MM_SHUFFLE(3, 2, 3, 2));        /* y3b x3b z2b y2b | x2b z1b y1b x1b */
 701     y1          = _mm256_shuffle_ps(x1, z1, _MM_SHUFFLE(1, 0, 1, 0));        /* y3c x3c z2c y2c | x2c z1c y1c x1c */
 702     x1          = _mm256_shuffle_ps(x1, z1, _MM_SHUFFLE(3, 2, 3, 2));        /* y3d x3d z2d y2d | x2d z1d y1d x1d */
 703
 704     t1          = _mm256_sub_ps(t1, z2);
 705     t2          = _mm256_sub_ps(t2, t5);
 706     t3          = _mm256_sub_ps(t3, y1);
 707     t4          = _mm256_sub_ps(t4, x1);
 708
         /* z3 for entries a..d: element k of z3 is moved to position 0 before the scalar subtract */
 709     tA          = _mm_sub_ss(tA, _mm256_castps256_ps128(z3));
 710     tB          = _mm_sub_ss(tB, _mm_permute_ps(_mm256_castps256_ps128(z3), _MM_SHUFFLE(1, 1, 1, 1)));
 711     tC          = _mm_sub_ss(tC, _mm_permute_ps(_mm256_castps256_ps128(z3), _MM_SHUFFLE(2, 2, 2, 2)));
 712     tD          = _mm_sub_ss(tD, _mm_permute_ps(_mm256_castps256_ps128(z3), _MM_SHUFFLE(3, 3, 3, 3)));
 713
 714     /* Here we store a full 256-bit value and a separate 32-bit one; no overlap can happen */
 715     _mm256_storeu_ps(ptrA, t1);
 716     _mm256_storeu_ps(ptrB, t2);
 717     _mm256_storeu_ps(ptrC, t3);
 718     _mm256_storeu_ps(ptrD, t4);
 719     _mm_store_ss(ptrA+8, tA);
 720     _mm_store_ss(ptrB+8, tB);
 721     _mm_store_ss(ptrC+8, tC);
 722     _mm_store_ss(ptrD+8, tD);
 723 }
 724 #endif
 725
 726
 727
 728 #if defined (_MSC_VER) && defined(_M_IX86)
 729 /* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters.
      *
      * For each of the four addresses ptrA..ptrD, subtracts four xyz triplets
      * in place (twelve consecutive floats: x1 y1 z1 ... x4 y4 z4). Only the
      * lower 128 bits of the register arguments are used: element a is applied
      * at ptrA, b at ptrB, c at ptrC and d at ptrD.
      *
      * Caveats that follow from this being a macro:
      *  - Every register argument except z4 is assigned to and used as scratch,
      *    so those must be modifiable lvalues and no longer hold their original
      *    values afterwards.
      *  - Each pointer argument is expanded several times and appears as
      *    "ptr+8" without parentheses, so pass simple, side-effect-free
      *    pointer expressions.
      *  - The expansion is a bare { } block, not an expression.
      */
 730 #define gmx_mm256_decrement_4rvec_4ptr_swizzle_ps(ptrA, ptrB, ptrC, ptrD, \
 731                                                   x1, y1, z1, x2, y2, z2, x3, y3, z3, x4, y4, z4) \
 732     { \
 733         __m256 _t1, _t2, _t3, _t4, _t5; \
 734         __m128 _tA, _tB, _tC, _tD, _tE, _tF, _tG, _tH; \
 735 \
 736         _t1         = _mm256_loadu_ps(ptrA); /* floats 0-7 at each address */ \
 737         _t2         = _mm256_loadu_ps(ptrB); \
 738         _t3         = _mm256_loadu_ps(ptrC); \
 739         _t4         = _mm256_loadu_ps(ptrD); \
 740         _tA         = _mm_loadu_ps(ptrA+8); /* floats 8-11 at each address */ \
 741         _tB         = _mm_loadu_ps(ptrB+8); \
 742         _tC         = _mm_loadu_ps(ptrC+8); \
 743         _tD         = _mm_loadu_ps(ptrD+8); \
 744         _t5         = _mm256_unpacklo_ps(x1, y1); /* - - - - | y1b x1b y1a x1a */ \
 745         x1          = _mm256_unpackhi_ps(x1, y1); /* - - - - | y1d x1d y1c x1c */ \
 746         y1          = _mm256_unpacklo_ps(z1, x2); /* - - - - | x2b z1b x2a z1a */ \
 747         z1          = _mm256_unpackhi_ps(z1, x2); /* - - - - | x2d z1d x2c z1c */ \
 748         x2          = _mm256_unpacklo_ps(y2, z2); /* - - - - | z2b y2b z2a y2a */ \
 749         y2          = _mm256_unpackhi_ps(y2, z2); /* - - - - | z2d y2d z2c y2c */ \
 750         z2          = _mm256_unpacklo_ps(x3, y3); /* - - - - | y3b x3b y3a x3a */ \
 751         x3          = _mm256_unpackhi_ps(x3, y3); /* - - - - | y3d x3d y3c x3c */ \
 752         y3          = _mm256_unpacklo_ps(z3, x4); /* - - - - | x4b z3b x4a z3a */ \
 753         z3          = _mm256_unpackhi_ps(z3, x4); /* - - - - | x4d z3d x4c z3c */ \
 754         x4          = _mm256_unpacklo_ps(y4, z4); /* - - - - | z4b y4b z4a y4a */ \
 755         y4          = _mm256_unpackhi_ps(y4, z4); /* - - - - | z4d y4d z4c y4c */ \
 756         x2          = _mm256_insertf128_ps(_t5, _mm256_castps256_ps128(x2), 0x1); \
 757         x1          = _mm256_insertf128_ps(x1, _mm256_castps256_ps128(y2), 0x1); \
 758         y1          = _mm256_insertf128_ps(y1, _mm256_castps256_ps128(z2), 0x1); \
 759         z1          = _mm256_insertf128_ps(z1, _mm256_castps256_ps128(x3), 0x1); \
 760         z2          = _mm256_shuffle_ps(x2, y1, _MM_SHUFFLE(1, 0, 1, 0)); /* y3a x3a z2a y2a | x2a z1a y1a x1a */ \
 761         _t5         = _mm256_shuffle_ps(x2, y1, _MM_SHUFFLE(3, 2, 3, 2)); /* y3b x3b z2b y2b | x2b z1b y1b x1b */ \
 762         y1          = _mm256_shuffle_ps(x1, z1, _MM_SHUFFLE(1, 0, 1, 0)); /* y3c x3c z2c y2c | x2c z1c y1c x1c */ \
 763         x1          = _mm256_shuffle_ps(x1, z1, _MM_SHUFFLE(3, 2, 3, 2)); /* y3d x3d z2d y2d | x2d z1d y1d x1d */ \
 764         _tE         = _mm_shuffle_ps(_mm256_castps256_ps128(y3), _mm256_castps256_ps128(x4), _MM_SHUFFLE(1, 0, 1, 0)); /* z4a y4a x4a z3a */ \
 765         _tF         = _mm_shuffle_ps(_mm256_castps256_ps128(y3), _mm256_castps256_ps128(x4), _MM_SHUFFLE(3, 2, 3, 2)); /* z4b y4b x4b z3b */ \
 766         _tG         = _mm_shuffle_ps(_mm256_castps256_ps128(z3), _mm256_castps256_ps128(y4), _MM_SHUFFLE(1, 0, 1, 0)); /* z4c y4c x4c z3c */ \
 767         _tH         = _mm_shuffle_ps(_mm256_castps256_ps128(z3), _mm256_castps256_ps128(y4), _MM_SHUFFLE(3, 2, 3, 2)); /* z4d y4d x4d z3d */ \
 768         _t1         = _mm256_sub_ps(_t1, z2); \
 769         _t2         = _mm256_sub_ps(_t2, _t5); \
 770         _t3         = _mm256_sub_ps(_t3, y1); \
 771         _t4         = _mm256_sub_ps(_t4, x1); \
 772         _tA         = _mm_sub_ps(_tA, _tE); \
 773         _tB         = _mm_sub_ps(_tB, _tF); \
 774         _tC         = _mm_sub_ps(_tC, _tG); \
 775         _tD         = _mm_sub_ps(_tD, _tH); \
 776         _mm256_storeu_ps(ptrA, _t1); \
 777         _mm256_storeu_ps(ptrB, _t2); \
 778         _mm256_storeu_ps(ptrC, _t3); \
 779         _mm256_storeu_ps(ptrD, _t4); \
 780         _mm_storeu_ps(ptrA+8, _tA); \
 781         _mm_storeu_ps(ptrB+8, _tB); \
 782         _mm_storeu_ps(ptrC+8, _tC); \
 783         _mm_storeu_ps(ptrD+8, _tD); \
 784     }
 785 #else
 786 /* Real function for sane compilers.
      *
      * For each of the four addresses ptrA..ptrD, subtracts four xyz triplets
      * in place (twelve consecutive floats: x1 y1 z1 ... x4 y4 z4). Only the
      * lower 128 bits of the register arguments are used: element a is applied
      * at ptrA, b at ptrB, c at ptrC and d at ptrD.
      *
      * Floats 0-7 at each address are updated with one unaligned 256-bit
      * load/subtract/store and floats 8-11 with an unaligned 128-bit one.
      *
      * All loads are issued before the first store, so the four destinations
      * are expected not to overlap (the pointers are declared gmx_restrict).
      */
 787 static gmx_inline void
 788 gmx_mm256_decrement_4rvec_4ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx_restrict ptrB,
 789                                           float * gmx_restrict ptrC, float * gmx_restrict ptrD,
 790                                           __m256 x1, __m256 y1, __m256 z1,
 791                                           __m256 x2, __m256 y2, __m256 z2,
 792                                           __m256 x3, __m256 y3, __m256 z3,
 793                                           __m256 x4, __m256 y4, __m256 z4)
 794 {
 795     __m256 t1, t2, t3, t4, t5;
 796     __m128 tA, tB, tC, tD, tE, tF, tG, tH;
 797
         /* Current memory contents: floats 0-7 as 256 bits, floats 8-11 as 128 bits */
 798     t1          = _mm256_loadu_ps(ptrA);
 799     t2          = _mm256_loadu_ps(ptrB);
 800     t3          = _mm256_loadu_ps(ptrC);
 801     t4          = _mm256_loadu_ps(ptrD);
 802     tA          = _mm_loadu_ps(ptrA+8);
 803     tB          = _mm_loadu_ps(ptrB+8);
 804     tC          = _mm_loadu_ps(ptrC+8);
 805     tD          = _mm_loadu_ps(ptrD+8);
 806
         /* The by-value parameters are reused as scratch registers from here on.
          * Upper halves are marked "-" because only elements a..d are consumed.
          */
 807     t5          = _mm256_unpacklo_ps(x1, y1);                                                                      /* - - - - | y1b x1b y1a x1a */
 808     x1          = _mm256_unpackhi_ps(x1, y1);                                                                      /* - - - - | y1d x1d y1c x1c */
 809     y1          = _mm256_unpacklo_ps(z1, x2);                                                                      /* - - - - | x2b z1b x2a z1a */
 810     z1          = _mm256_unpackhi_ps(z1, x2);                                                                      /* - - - - | x2d z1d x2c z1c */
 811
 812     x2          = _mm256_unpacklo_ps(y2, z2);                                                                      /* - - - - | z2b y2b z2a y2a */
 813     y2          = _mm256_unpackhi_ps(y2, z2);                                                                      /* - - - - | z2d y2d z2c y2c */
 814     z2          = _mm256_unpacklo_ps(x3, y3);                                                                      /* - - - - | y3b x3b y3a x3a */
 815     x3          = _mm256_unpackhi_ps(x3, y3);                                                                      /* - - - - | y3d x3d y3c x3c */
 816
 817     y3          = _mm256_unpacklo_ps(z3, x4);                                                                      /* - - - - | x4b z3b x4a z3a */
 818     z3          = _mm256_unpackhi_ps(z3, x4);                                                                      /* - - - - | x4d z3d x4c z3c */
 819     x4          = _mm256_unpacklo_ps(y4, z4);                                                                      /* - - - - | z4b y4b z4a y4a */
 820     y4          = _mm256_unpackhi_ps(y4, z4);                                                                      /* - - - - | z4d y4d z4c y4c */
 821
         /* Pack two 128-bit partial results into each 256-bit register */
 822     x2          = _mm256_insertf128_ps(t5, _mm256_castps256_ps128(x2), 0x1);                                       /* z2b y2b z2a y2a | y1b x1b y1a x1a */
 823     x1          = _mm256_insertf128_ps(x1, _mm256_castps256_ps128(y2), 0x1);                                       /* z2d y2d z2c y2c | y1d x1d y1c x1c */
 824     y1          = _mm256_insertf128_ps(y1, _mm256_castps256_ps128(z2), 0x1);                                       /* y3b x3b y3a x3a | x2b z1b x2a z1a */
 825     z1          = _mm256_insertf128_ps(z1, _mm256_castps256_ps128(x3), 0x1);                                       /* y3d x3d y3c x3c | x2d z1d x2c z1c */
 826
         /* One 256-bit register per destination, in memory order (floats 0-7) */
 827     z2          = _mm256_shuffle_ps(x2, y1, _MM_SHUFFLE(1, 0, 1, 0));                                              /* y3a x3a z2a y2a | x2a z1a y1a x1a */
 828     t5          = _mm256_shuffle_ps(x2, y1, _MM_SHUFFLE(3, 2, 3, 2));                                              /* y3b x3b z2b y2b | x2b z1b y1b x1b */
 829     y1          = _mm256_shuffle_ps(x1, z1, _MM_SHUFFLE(1, 0, 1, 0));                                              /* y3c x3c z2c y2c | x2c z1c y1c x1c */
 830     x1          = _mm256_shuffle_ps(x1, z1, _MM_SHUFFLE(3, 2, 3, 2));                                              /* y3d x3d z2d y2d | x2d z1d y1d x1d */
 831
         /* One 128-bit register per destination for floats 8-11 */
 832     tE          = _mm_shuffle_ps(_mm256_castps256_ps128(y3), _mm256_castps256_ps128(x4), _MM_SHUFFLE(1, 0, 1, 0)); /* z4a y4a x4a z3a */
 833     tF          = _mm_shuffle_ps(_mm256_castps256_ps128(y3), _mm256_castps256_ps128(x4), _MM_SHUFFLE(3, 2, 3, 2)); /* z4b y4b x4b z3b */
 834
 835     tG          = _mm_shuffle_ps(_mm256_castps256_ps128(z3), _mm256_castps256_ps128(y4), _MM_SHUFFLE(1, 0, 1, 0)); /* z4c y4c x4c z3c */
 836     tH          = _mm_shuffle_ps(_mm256_castps256_ps128(z3), _mm256_castps256_ps128(y4), _MM_SHUFFLE(3, 2, 3, 2)); /* z4d y4d x4d z3d */
 837
 838     t1          = _mm256_sub_ps(t1, z2);
 839     t2          = _mm256_sub_ps(t2, t5);
 840     t3          = _mm256_sub_ps(t3, y1);
 841     t4          = _mm256_sub_ps(t4, x1);
 842
 843     tA          = _mm_sub_ps(tA, tE);
 844     tB          = _mm_sub_ps(tB, tF);
 845     tC          = _mm_sub_ps(tC, tG);
 846     tD          = _mm_sub_ps(tD, tH);
 847
 848     /* Here we store a full 256-bit value and a separate 128-bit one; no overlap can happen */
 849     _mm256_storeu_ps(ptrA, t1);
 850     _mm256_storeu_ps(ptrB, t2);
 851     _mm256_storeu_ps(ptrC, t3);
 852     _mm256_storeu_ps(ptrD, t4);
 853     _mm_storeu_ps(ptrA+8, tA);
 854     _mm_storeu_ps(ptrB+8, tB);
 855     _mm_storeu_ps(ptrC+8, tC);
 856     _mm_storeu_ps(ptrD+8, tD);
 857 }
 858 #endif
 859
 860
 861 static gmx_inline void
 862 gmx_mm256_decrement_1rvec_8ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx_restrict ptrB,
 863                                           float * gmx_restrict ptrC, float * gmx_restrict ptrD,
 864                                           float * gmx_restrict ptrE, float * gmx_restrict ptrF,
 865                                           float * gmx_restrict ptrG, float * gmx_restrict ptrH,
 866                                           __m256 x1, __m256 y1, __m256 z1)
 867 {
         /* Decrement one xyz triplet in memory at each of eight addresses.
          *
          * All eight elements of x1/y1/z1 are used: elements a..d (lower 128 bits)
          * are applied at ptrA..ptrD and elements e..h (upper 128 bits) at
          * ptrE..ptrH.
          *
          * Loads and stores go through gmx_mm_maskload_ps/gmx_mm_maskstore_ps with
          * a three-element mask. Those helpers are defined outside this section;
          * presumably they follow the AVX maskload/maskstore semantics, in which
          * case the fourth float after each pointer is neither read nor written.
          *
          * All loads are issued before the first store, so the destinations are
          * expected not to overlap (the pointers are declared gmx_restrict).
          */
 868     __m256  t1, t2, t3, t4, t5, t6;
 869     __m256  tA, tB, tC, tD;
 870     __m128i mask;
 871
 872     /* Construct a mask without executing any data loads.
          * The cmpeq of zero with zero yields all ones; blend immediate 0x3F takes
          * the six lowest 16-bit words from that and the top two from zero, i.e.
          * the three lowest 32-bit elements are set and the fourth is clear.
          */
 873     mask        = _mm_blend_epi16(_mm_setzero_si128(), _mm_cmpeq_epi16(_mm_setzero_si128(), _mm_setzero_si128()), 0x3F);
 874
         /* Pair the memory for A/E, B/F, C/G and D/H in one register each.
          * gmx_mm256_set_m128 is defined elsewhere; judging by the stores at the
          * end of this function its first argument lands in the upper 128 bits.
          */
 875     tA          = gmx_mm256_set_m128(gmx_mm_maskload_ps(ptrE, mask), gmx_mm_maskload_ps(ptrA, mask));
 876     tB          = gmx_mm256_set_m128(gmx_mm_maskload_ps(ptrF, mask), gmx_mm_maskload_ps(ptrB, mask));
 877     tC          = gmx_mm256_set_m128(gmx_mm_maskload_ps(ptrG, mask), gmx_mm_maskload_ps(ptrC, mask));
 878     tD          = gmx_mm256_set_m128(gmx_mm_maskload_ps(ptrH, mask), gmx_mm_maskload_ps(ptrD, mask));
 879     t1          = _mm256_unpacklo_ps(x1, y1);                         /* y1f x1f y1e x1e | y1b x1b y1a x1a */
 880     t2          = _mm256_unpackhi_ps(x1, y1);                         /* y1h x1h y1g x1g | y1d x1d y1c x1c */
 881
 882     t3          = _mm256_shuffle_ps(t1, z1, _MM_SHUFFLE(0, 0, 1, 0)); /*  -  z1e y1e x1e |  - z1a y1a x1a */
 883     t4          = _mm256_shuffle_ps(t1, z1, _MM_SHUFFLE(0, 1, 3, 2)); /*  -  z1f y1f x1f |  - z1b y1b x1b */
 884     t5          = _mm256_shuffle_ps(t2, z1, _MM_SHUFFLE(0, 2, 1, 0)); /*  -  z1g y1g x1g |  - z1c y1c x1c */
 885     t6          = _mm256_shuffle_ps(t2, z1, _MM_SHUFFLE(0, 3, 3, 2)); /*  -  z1h y1h x1h |  - z1d y1d x1d */
 886
 887     tA          = _mm256_sub_ps(tA, t3);
 888     tB          = _mm256_sub_ps(tB, t4);
 889     tC          = _mm256_sub_ps(tC, t5);
 890     tD          = _mm256_sub_ps(tD, t6);
 891
         /* Lower halves go back to A..D, upper halves to E..H */
 892     gmx_mm_maskstore_ps(ptrA, mask, _mm256_castps256_ps128(tA));
 893     gmx_mm_maskstore_ps(ptrB, mask, _mm256_castps256_ps128(tB));
 894     gmx_mm_maskstore_ps(ptrC, mask, _mm256_castps256_ps128(tC));
 895     gmx_mm_maskstore_ps(ptrD, mask, _mm256_castps256_ps128(tD));
 896     gmx_mm_maskstore_ps(ptrE, mask, _mm256_extractf128_ps(tA, 0x1));
 897     gmx_mm_maskstore_ps(ptrF, mask, _mm256_extractf128_ps(tB, 0x1));
 898     gmx_mm_maskstore_ps(ptrG, mask, _mm256_extractf128_ps(tC, 0x1));
 899     gmx_mm_maskstore_ps(ptrH, mask, _mm256_extractf128_ps(tD, 0x1));
 900 }
 901
 902
 903
 904 #if defined (_MSC_VER) && defined(_M_IX86)
 905 /* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters.
      *
      * For each of the eight addresses ptrA..ptrH, subtracts three xyz triplets
      * in place (nine consecutive floats: x1 y1 z1 x2 y2 z2 x3 y3 z3). All eight
      * elements of the register arguments are used: elements a..d (lower 128
      * bits) are applied at ptrA..ptrD, elements e..h (upper 128 bits) at
      * ptrE..ptrH.
      *
      * Floats 0-7 at each address are updated first with 256-bit operations;
      * float 8 (z3) of all eight addresses is then gathered into one register,
      * decremented and scattered back with scalar stores.
      *
      * Caveats that follow from this being a macro:
      *  - The register arguments are only read, never assigned.
      *  - Each pointer argument is expanded several times and appears as
      *    "ptr+8" without parentheses, so pass simple, side-effect-free
      *    pointer expressions.
      *  - The expansion is a bare { } block, not an expression.
      */
 906 #define gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(ptrA, ptrB, ptrC, ptrD, ptrE, ptrF, ptrG, ptrH, _x1, _y1, _z1, _x2, _y2, _z2, _x3, _y3, _z3) \
 907     { \
 908         __m256 _t1, _t2, _t3, _t4, _t5, _t6, _t7, _t8, _t9, _t10, _t11, _t12; \
 909         __m256 _tA, _tB, _tC, _tD, _tE, _tF, _tG, _tH, _tI, _tJ, _tK, _tL; \
 910 \
 911         _tA         = _mm256_loadu_ps(ptrA); /* floats 0-7 at each address */ \
 912         _tB         = _mm256_loadu_ps(ptrB); \
 913         _tC         = _mm256_loadu_ps(ptrC); \
 914         _tD         = _mm256_loadu_ps(ptrD); \
 915         _tE         = _mm256_loadu_ps(ptrE); \
 916         _tF         = _mm256_loadu_ps(ptrF); \
 917         _tG         = _mm256_loadu_ps(ptrG); \
 918         _tH         = _mm256_loadu_ps(ptrH); \
 919         _t1         = _mm256_unpacklo_ps(_x1, _y1); /* y1f x1f y1e x1e | y1b x1b y1a x1a */ \
 920         _t2         = _mm256_unpackhi_ps(_x1, _y1); /* y1h x1h y1g x1g | y1d x1d y1c x1c */ \
 921         _t3         = _mm256_unpacklo_ps(_z1, _x2); /* x2f z1f x2e z1e | x2b z1b x2a z1a */ \
 922         _t4         = _mm256_unpackhi_ps(_z1, _x2); /* x2h z1h x2g z1g | x2d z1d x2c z1c */ \
 923         _t5         = _mm256_unpacklo_ps(_y2, _z2); /* z2f y2f z2e y2e | z2b y2b z2a y2a */ \
 924         _t6         = _mm256_unpackhi_ps(_y2, _z2); /* z2h y2h z2g y2g | z2d y2d z2c y2c */ \
 925         _t7         = _mm256_unpacklo_ps(_x3, _y3); /* y3f x3f y3e x3e | y3b x3b y3a x3a */ \
 926         _t8         = _mm256_unpackhi_ps(_x3, _y3); /* y3h x3h y3g x3g | y3d x3d y3c x3c */ \
 927         _t9         = _mm256_shuffle_ps(_t1, _t3, _MM_SHUFFLE(1, 0, 1, 0)); /* x2e z1e y1e x1e | x2a z1a y1a x1a */ \
 928         _t10        = _mm256_shuffle_ps(_t1, _t3, _MM_SHUFFLE(3, 2, 3, 2)); /* x2f z1f y1f x1f | x2b z1b y1b x1b */ \
 929         _t11        = _mm256_shuffle_ps(_t2, _t4, _MM_SHUFFLE(1, 0, 1, 0)); /* x2g z1g y1g x1g | x2c z1c y1c x1c */ \
 930         _t12        = _mm256_shuffle_ps(_t2, _t4, _MM_SHUFFLE(3, 2, 3, 2)); /* x2h z1h y1h x1h | x2d z1d y1d x1d */ \
 931         _t1         = _mm256_shuffle_ps(_t5, _t7, _MM_SHUFFLE(1, 0, 1, 0)); /* y3e x3e z2e y2e | y3a x3a z2a y2a */ \
 932         _t2         = _mm256_shuffle_ps(_t5, _t7, _MM_SHUFFLE(3, 2, 3, 2)); /* y3f x3f z2f y2f | y3b x3b z2b y2b */ \
 933         _t3         = _mm256_shuffle_ps(_t6, _t8, _MM_SHUFFLE(1, 0, 1, 0)); /* y3g x3g z2g y2g | y3c x3c z2c y2c */ \
 934         _t4         = _mm256_shuffle_ps(_t6, _t8, _MM_SHUFFLE(3, 2, 3, 2)); /* y3h x3h z2h y2h | y3d x3d z2d y2d */ \
 935         _t5         = gmx_mm256_unpack128lo_ps(_t9, _t1); /* entry a, floats 0-7 */ \
 936         _t6         = gmx_mm256_unpack128hi_ps(_t9, _t1); /* entry e */ \
 937         _t7         = gmx_mm256_unpack128lo_ps(_t10, _t2); /* entry b */ \
 938         _t8         = gmx_mm256_unpack128hi_ps(_t10, _t2); /* entry f */ \
 939         _t1         = gmx_mm256_unpack128lo_ps(_t11, _t3); /* entry c */ \
 940         _t2         = gmx_mm256_unpack128hi_ps(_t11, _t3); /* entry g */ \
 941         _t9         = gmx_mm256_unpack128lo_ps(_t12, _t4); /* entry d */ \
 942         _t10        = gmx_mm256_unpack128hi_ps(_t12, _t4); /* entry h */ \
 943         _tA         = _mm256_sub_ps(_tA, _t5); \
 944         _tB         = _mm256_sub_ps(_tB, _t7); \
 945         _tC         = _mm256_sub_ps(_tC, _t1); \
 946         _tD         = _mm256_sub_ps(_tD, _t9); \
 947         _tE         = _mm256_sub_ps(_tE, _t6); \
 948         _tF         = _mm256_sub_ps(_tF, _t8); \
 949         _tG         = _mm256_sub_ps(_tG, _t2); \
 950         _tH         = _mm256_sub_ps(_tH, _t10); \
 951         _mm256_storeu_ps(ptrA, _tA); \
 952         _mm256_storeu_ps(ptrB, _tB); \
 953         _mm256_storeu_ps(ptrC, _tC); \
 954         _mm256_storeu_ps(ptrD, _tD); \
 955         _mm256_storeu_ps(ptrE, _tE); \
 956         _mm256_storeu_ps(ptrF, _tF); \
 957         _mm256_storeu_ps(ptrG, _tG); \
 958         _mm256_storeu_ps(ptrH, _tH); \
 959         _tI         = gmx_mm256_set_m128(_mm_load_ss(ptrE+8), _mm_load_ss(ptrA+8)); /* float 8 (z3) of E | A */ \
 960         _tJ         = gmx_mm256_set_m128(_mm_load_ss(ptrF+8), _mm_load_ss(ptrB+8)); \
 961         _tK         = gmx_mm256_set_m128(_mm_load_ss(ptrG+8), _mm_load_ss(ptrC+8)); \
 962         _tL         = gmx_mm256_set_m128(_mm_load_ss(ptrH+8), _mm_load_ss(ptrD+8)); \
 963         _tI         = _mm256_unpacklo_ps(_tI, _tK); /*  -  - zG zE |  -  - zC zA */ \
 964         _tJ         = _mm256_unpacklo_ps(_tJ, _tL); /*  -  - zH zF |  -  - zD zB */ \
 965         _tI         = _mm256_unpacklo_ps(_tI, _tJ); /* zH zG zF zE | zD zC zB zA */ \
 966         _tI         = _mm256_sub_ps(_tI, _z3); \
 967         _tJ         = _mm256_permute_ps(_tI, _MM_SHUFFLE(1, 1, 1, 1)); \
 968         _tK         = _mm256_permute_ps(_tI, _MM_SHUFFLE(2, 2, 2, 2)); \
 969         _tL         = _mm256_permute_ps(_tI, _MM_SHUFFLE(3, 3, 3, 3)); \
 970         _mm_store_ss(ptrA+8, _mm256_castps256_ps128(_tI)); \
 971         _mm_store_ss(ptrB+8, _mm256_castps256_ps128(_tJ)); \
 972         _mm_store_ss(ptrC+8, _mm256_castps256_ps128(_tK)); \
 973         _mm_store_ss(ptrD+8, _mm256_castps256_ps128(_tL)); \
 974         _mm_store_ss(ptrE+8, _mm256_extractf128_ps(_tI, 0x1)); \
 975         _mm_store_ss(ptrF+8, _mm256_extractf128_ps(_tJ, 0x1)); \
 976         _mm_store_ss(ptrG+8, _mm256_extractf128_ps(_tK, 0x1)); \
 977         _mm_store_ss(ptrH+8, _mm256_extractf128_ps(_tL, 0x1)); \
 978     }
 979 #else
 980 /* Real function for sane compilers.
      *
      * For each of the eight addresses ptrA..ptrH, subtracts three xyz triplets
      * in place (nine consecutive floats: x1 y1 z1 x2 y2 z2 x3 y3 z3). All eight
      * elements of the register arguments are used: elements a..d (lower 128
      * bits) are applied at ptrA..ptrD, elements e..h (upper 128 bits) at
      * ptrE..ptrH.
      *
      * Floats 0-7 at each address are updated first with 256-bit operations;
      * float 8 (z3) of all eight addresses is then gathered into one register,
      * decremented and scattered back with scalar stores.
      *
      * All eight 256-bit loads are issued before the first store, so the
      * destinations are expected not to overlap (the pointers are declared
      * gmx_restrict).
      */
 981 static gmx_inline void
 982 gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx_restrict ptrB,
 983                                           float * gmx_restrict ptrC, float * gmx_restrict ptrD,
 984                                           float * gmx_restrict ptrE, float * gmx_restrict ptrF,
 985                                           float * gmx_restrict ptrG, float * gmx_restrict ptrH,
 986                                           __m256 x1, __m256 y1, __m256 z1,
 987                                           __m256 x2, __m256 y2, __m256 z2,
 988                                           __m256 x3, __m256 y3, __m256 z3)
 989 {
 990     __m256 t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12;
 991     __m256 tA, tB, tC, tD, tE, tF, tG, tH;
 992     __m256 tI, tJ, tK, tL;
 993
         /* Current memory contents, floats 0-7 at each address */
 994     tA          = _mm256_loadu_ps(ptrA);
 995     tB          = _mm256_loadu_ps(ptrB);
 996     tC          = _mm256_loadu_ps(ptrC);
 997     tD          = _mm256_loadu_ps(ptrD);
 998     tE          = _mm256_loadu_ps(ptrE);
 999     tF          = _mm256_loadu_ps(ptrF);
1000     tG          = _mm256_loadu_ps(ptrG);
1001     tH          = _mm256_loadu_ps(ptrH);
1002
         /* Transpose the first eight components (x1..y3); the lower and upper
          * 128-bit halves are processed independently by unpack/shuffle.
          */
1003     t1          = _mm256_unpacklo_ps(x1, y1);                         /* y1f x1f y1e x1e | y1b x1b y1a x1a */
1004     t2          = _mm256_unpackhi_ps(x1, y1);                         /* y1h x1h y1g x1g | y1d x1d y1c x1c */
1005     t3          = _mm256_unpacklo_ps(z1, x2);                         /* x2f z1f x2e z1e | x2b z1b x2a z1a */
1006     t4          = _mm256_unpackhi_ps(z1, x2);                         /* x2h z1h x2g z1g | x2d z1d x2c z1c */
1007
1008     t5          = _mm256_unpacklo_ps(y2, z2);                         /* z2f y2f z2e y2e | z2b y2b z2a y2a */
1009     t6          = _mm256_unpackhi_ps(y2, z2);                         /* z2h y2h z2g y2g | z2d y2d z2c y2c */
1010     t7          = _mm256_unpacklo_ps(x3, y3);                         /* y3f x3f y3e x3e | y3b x3b y3a x3a */
1011     t8          = _mm256_unpackhi_ps(x3, y3);                         /* y3h x3h y3g x3g | y3d x3d y3c x3c */
1012
1013     t9          = _mm256_shuffle_ps(t1, t3, _MM_SHUFFLE(1, 0, 1, 0)); /* x2e z1e y1e x1e | x2a z1a y1a x1a */
1014     t10         = _mm256_shuffle_ps(t1, t3, _MM_SHUFFLE(3, 2, 3, 2)); /* x2f z1f y1f x1f | x2b z1b y1b x1b */
1015     t11         = _mm256_shuffle_ps(t2, t4, _MM_SHUFFLE(1, 0, 1, 0)); /* x2g z1g y1g x1g | x2c z1c y1c x1c */
1016     t12         = _mm256_shuffle_ps(t2, t4, _MM_SHUFFLE(3, 2, 3, 2)); /* x2h z1h y1h x1h | x2d z1d y1d x1d */
1017
1018     t1          = _mm256_shuffle_ps(t5, t7, _MM_SHUFFLE(1, 0, 1, 0)); /* y3e x3e z2e y2e | y3a x3a z2a y2a */
1019     t2          = _mm256_shuffle_ps(t5, t7, _MM_SHUFFLE(3, 2, 3, 2)); /* y3f x3f z2f y2f | y3b x3b z2b y2b */
1020     t3          = _mm256_shuffle_ps(t6, t8, _MM_SHUFFLE(1, 0, 1, 0)); /* y3g x3g z2g y2g | y3c x3c z2c y2c */
1021     t4          = _mm256_shuffle_ps(t6, t8, _MM_SHUFFLE(3, 2, 3, 2)); /* y3h x3h z2h y2h | y3d x3d z2d y2d */
1022
         /* Combine matching 128-bit halves so each register holds floats 0-7 of
          * one entry (gmx_mm256_unpack128lo/hi_ps are defined elsewhere; the lane
          * comments show the expected result).
          */
1023     t5          = gmx_mm256_unpack128lo_ps(t9, t1);                   /* y3a x3a z2a y2a | x2a z1a y1a x1a */
1024     t6          = gmx_mm256_unpack128hi_ps(t9, t1);                   /* y3e x3e z2e y2e | x2e z1e y1e x1e */
1025     t7          = gmx_mm256_unpack128lo_ps(t10, t2);                  /* y3b x3b z2b y2b | x2b z1b y1b x1b */
1026     t8          = gmx_mm256_unpack128hi_ps(t10, t2);                  /* y3f x3f z2f y2f | x2f z1f y1f x1f */
1027     t1          = gmx_mm256_unpack128lo_ps(t11, t3);                  /* y3c x3c z2c y2c | x2c z1c y1c x1c */
1028     t2          = gmx_mm256_unpack128hi_ps(t11, t3);                  /* y3g x3g z2g y2g | x2g z1g y1g x1g */
1029     t9          = gmx_mm256_unpack128lo_ps(t12, t4);                  /* y3d x3d z2d y2d | x2d z1d y1d x1d */
1030     t10         = gmx_mm256_unpack128hi_ps(t12, t4);                  /* y3h x3h z2h y2h | x2h z1h y1h x1h */
1031
1032     tA          = _mm256_sub_ps(tA, t5);
1033     tB          = _mm256_sub_ps(tB, t7);
1034     tC          = _mm256_sub_ps(tC, t1);
1035     tD          = _mm256_sub_ps(tD, t9);
1036     tE          = _mm256_sub_ps(tE, t6);
1037     tF          = _mm256_sub_ps(tF, t8);
1038     tG          = _mm256_sub_ps(tG, t2);
1039     tH          = _mm256_sub_ps(tH, t10);
1040
1041     _mm256_storeu_ps(ptrA, tA);
1042     _mm256_storeu_ps(ptrB, tB);
1043     _mm256_storeu_ps(ptrC, tC);
1044     _mm256_storeu_ps(ptrD, tD);
1045     _mm256_storeu_ps(ptrE, tE);
1046     _mm256_storeu_ps(ptrF, tF);
1047     _mm256_storeu_ps(ptrG, tG);
1048     _mm256_storeu_ps(ptrH, tH);
1049
         /* Ninth float (z3): gather the current value from all eight addresses.
          * Index 8 lies outside the floats 0-7 stored above, so these loads do
          * not depend on those stores.
          */
1050     tI          = gmx_mm256_set_m128(_mm_load_ss(ptrE+8), _mm_load_ss(ptrA+8));
1051     tJ          = gmx_mm256_set_m128(_mm_load_ss(ptrF+8), _mm_load_ss(ptrB+8));
1052     tK          = gmx_mm256_set_m128(_mm_load_ss(ptrG+8), _mm_load_ss(ptrC+8));
1053     tL          = gmx_mm256_set_m128(_mm_load_ss(ptrH+8), _mm_load_ss(ptrD+8));
1054
1055     tI          = _mm256_unpacklo_ps(tI, tK);  /*  -  - zG zE |  -  - zC zA */
1056     tJ          = _mm256_unpacklo_ps(tJ, tL);  /*  -  - zH zF |  -  - zD zB */
1057     tI          = _mm256_unpacklo_ps(tI, tJ);  /* zH zG zF zE | zD zC zB zA */
1058
         /* Decrement all eight z3 values at once, then broadcast elements 1..3
          * within each half so every result can be written with a scalar store.
          */
1059     tI          = _mm256_sub_ps(tI, z3);
1060     tJ          = _mm256_permute_ps(tI, _MM_SHUFFLE(1, 1, 1, 1));
1061     tK          = _mm256_permute_ps(tI, _MM_SHUFFLE(2, 2, 2, 2));
1062     tL          = _mm256_permute_ps(tI, _MM_SHUFFLE(3, 3, 3, 3));
1063
1064     _mm_store_ss(ptrA+8, _mm256_castps256_ps128(tI));
1065     _mm_store_ss(ptrB+8, _mm256_castps256_ps128(tJ));
1066     _mm_store_ss(ptrC+8, _mm256_castps256_ps128(tK));
1067     _mm_store_ss(ptrD+8, _mm256_castps256_ps128(tL));
1068     _mm_store_ss(ptrE+8, _mm256_extractf128_ps(tI, 0x1));
1069     _mm_store_ss(ptrF+8, _mm256_extractf128_ps(tJ, 0x1));
1070     _mm_store_ss(ptrG+8, _mm256_extractf128_ps(tK, 0x1));
1071     _mm_store_ss(ptrH+8, _mm256_extractf128_ps(tL, 0x1));
1072 }
1073 #endif
1074
1075
1076
1077 #if defined (_MSC_VER) && defined(_M_IX86)
1078 /* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters.
 *
 * Subtracts the twelve __m256 values _x1.._z4 from the 12 consecutive floats
 * stored at each of the eight pointers ptrA..ptrH. Going by the lane-annotated
 * comments on the function version in the #else branch, SIMD element 0 of every
 * input is applied to ptrA, element 1 to ptrB, and so on up to element 7 and
 * ptrH; in memory order the floats are decremented by
 * x1 y1 z1 x2 y2 z2 x3 y3 z3 x4 y4 z4.
 *
 * Each pointer is accessed as an unaligned 8-float load/store at ptr followed
 * by an unaligned 4-float load/store at ptr+8.
 *
 * NOTE(review): this expands to a bare { } block, and every argument is pasted
 * unparenthesized, most of them more than once. Pass plain variables without
 * side effects; a pointer argument built from an operator that binds weaker
 * than '+' would be mis-grouped in the "ptr+8" accesses.
 */
1079 #define gmx_mm256_decrement_4rvec_8ptr_swizzle_ps(ptrA, ptrB, ptrC, ptrD, ptrE, ptrF, ptrG, ptrH, \
1080                                                   _x1, _y1, _z1, _x2, _y2, _z2, _x3, _y3, _z3, _x4, _y4, _z4) \
1081     { \
1082         __m256 _t1, _t2, _t3, _t4, _t5, _t6, _t7, _t8, _t9, _t10, _t11, _t12; \
1083         __m256 _tA, _tB, _tC, _tD, _tE, _tF, _tG, _tH, _tI, _tJ, _tK, _tL; \
1084 \
1085         _tA         = _mm256_loadu_ps(ptrA); \
1086         _tB         = _mm256_loadu_ps(ptrB); \
1087         _tC         = _mm256_loadu_ps(ptrC); \
1088         _tD         = _mm256_loadu_ps(ptrD); \
1089         _tE         = _mm256_loadu_ps(ptrE); \
1090         _tF         = _mm256_loadu_ps(ptrF); \
1091         _tG         = _mm256_loadu_ps(ptrG); \
1092         _tH         = _mm256_loadu_ps(ptrH); \
1093         _t1         = _mm256_unpacklo_ps(_x1, _y1); \
1094         _t2         = _mm256_unpackhi_ps(_x1, _y1); \
1095         _t3         = _mm256_unpacklo_ps(_z1, _x2); \
1096         _t4         = _mm256_unpackhi_ps(_z1, _x2); \
1097         _t5         = _mm256_unpacklo_ps(_y2, _z2); \
1098         _t6         = _mm256_unpackhi_ps(_y2, _z2); \
1099         _t7         = _mm256_unpacklo_ps(_x3, _y3); \
1100         _t8         = _mm256_unpackhi_ps(_x3, _y3); \
1101         _t9         = _mm256_shuffle_ps(_t1, _t3, _MM_SHUFFLE(1, 0, 1, 0)); \
1102         _t10        = _mm256_shuffle_ps(_t1, _t3, _MM_SHUFFLE(3, 2, 3, 2)); \
1103         _t11        = _mm256_shuffle_ps(_t2, _t4, _MM_SHUFFLE(1, 0, 1, 0)); \
1104         _t12        = _mm256_shuffle_ps(_t2, _t4, _MM_SHUFFLE(3, 2, 3, 2)); \
1105         _t1         = _mm256_shuffle_ps(_t5, _t7, _MM_SHUFFLE(1, 0, 1, 0)); \
1106         _t2         = _mm256_shuffle_ps(_t5, _t7, _MM_SHUFFLE(3, 2, 3, 2)); \
1107         _t3         = _mm256_shuffle_ps(_t6, _t8, _MM_SHUFFLE(1, 0, 1, 0)); \
1108         _t4         = _mm256_shuffle_ps(_t6, _t8, _MM_SHUFFLE(3, 2, 3, 2)); \
1109         _t5         = gmx_mm256_unpack128lo_ps(_t9, _t1); \
1110         _t6         = gmx_mm256_unpack128hi_ps(_t9, _t1); \
1111         _t7         = gmx_mm256_unpack128lo_ps(_t10, _t2); \
1112         _t8         = gmx_mm256_unpack128hi_ps(_t10, _t2); \
1113         _t1         = gmx_mm256_unpack128lo_ps(_t11, _t3); \
1114         _t2         = gmx_mm256_unpack128hi_ps(_t11, _t3); \
1115         _t9         = gmx_mm256_unpack128lo_ps(_t12, _t4); \
1116         _t10        = gmx_mm256_unpack128hi_ps(_t12, _t4); \
1117         _tA         = _mm256_sub_ps(_tA, _t5); \
1118         _tB         = _mm256_sub_ps(_tB, _t7); \
1119         _tC         = _mm256_sub_ps(_tC, _t1); \
1120         _tD         = _mm256_sub_ps(_tD, _t9); \
1121         _tE         = _mm256_sub_ps(_tE, _t6); \
1122         _tF         = _mm256_sub_ps(_tF, _t8); \
1123         _tG         = _mm256_sub_ps(_tG, _t2); \
1124         _tH         = _mm256_sub_ps(_tH, _t10); \
1125         _mm256_storeu_ps(ptrA, _tA); \
1126         _mm256_storeu_ps(ptrB, _tB); \
1127         _mm256_storeu_ps(ptrC, _tC); \
1128         _mm256_storeu_ps(ptrD, _tD); \
1129         _mm256_storeu_ps(ptrE, _tE); \
1130         _mm256_storeu_ps(ptrF, _tF); \
1131         _mm256_storeu_ps(ptrG, _tG); \
1132         _mm256_storeu_ps(ptrH, _tH); \
1133         _tI         = gmx_mm256_set_m128(_mm_loadu_ps(ptrE+8), _mm_loadu_ps(ptrA+8)); \
1134         _tJ         = gmx_mm256_set_m128(_mm_loadu_ps(ptrF+8), _mm_loadu_ps(ptrB+8)); \
1135         _tK         = gmx_mm256_set_m128(_mm_loadu_ps(ptrG+8), _mm_loadu_ps(ptrC+8)); \
1136         _tL         = gmx_mm256_set_m128(_mm_loadu_ps(ptrH+8), _mm_loadu_ps(ptrD+8)); \
1137         _t1         = _mm256_unpacklo_ps(_z3, _x4); \
1138         _t2         = _mm256_unpackhi_ps(_z3, _x4); \
1139         _t3         = _mm256_unpacklo_ps(_y4, _z4); \
1140         _t4         = _mm256_unpackhi_ps(_y4, _z4); \
1141         _t5         = _mm256_shuffle_ps(_t1, _t3, _MM_SHUFFLE(1, 0, 1, 0)); \
1142         _t6         = _mm256_shuffle_ps(_t1, _t3, _MM_SHUFFLE(3, 2, 3, 2)); \
1143         _t7         = _mm256_shuffle_ps(_t2, _t4, _MM_SHUFFLE(1, 0, 1, 0)); \
1144         _t8         = _mm256_shuffle_ps(_t2, _t4, _MM_SHUFFLE(3, 2, 3, 2)); \
1145         _tI         = _mm256_sub_ps(_tI, _t5); \
1146         _tJ         = _mm256_sub_ps(_tJ, _t6); \
1147         _tK         = _mm256_sub_ps(_tK, _t7); \
1148         _tL         = _mm256_sub_ps(_tL, _t8); \
1149         _mm_storeu_ps(ptrA+8, _mm256_castps256_ps128(_tI)); \
1150         _mm_storeu_ps(ptrB+8, _mm256_castps256_ps128(_tJ)); \
1151         _mm_storeu_ps(ptrC+8, _mm256_castps256_ps128(_tK)); \
1152         _mm_storeu_ps(ptrD+8, _mm256_castps256_ps128(_tL)); \
1153         _mm_storeu_ps(ptrE+8, _mm256_extractf128_ps(_tI, 0x1)); \
1154         _mm_storeu_ps(ptrF+8, _mm256_extractf128_ps(_tJ, 0x1)); \
1155         _mm_storeu_ps(ptrG+8, _mm256_extractf128_ps(_tK, 0x1)); \
1156         _mm_storeu_ps(ptrH+8, _mm256_extractf128_ps(_tL, 0x1)); \
1157     }
1158 #else
1159 /* Real function for sane compilers.
 *
 * Decrements 12 consecutive floats at each of the eight addresses ptrA..ptrH
 * by the matching SIMD element of x1..z4: element 0 (lane "a" in the comments
 * below) is applied to ptrA, element 1 ("b") to ptrB, ... and element 7 ("h")
 * to ptrH. In memory order the 12 floats at each pointer are decremented by
 * x1 y1 z1 x2 y2 z2 x3 y3 z3 x4 y4 z4.
 *
 * The update is done in two pieces per pointer: an unaligned 8-float
 * load/subtract/store at ptr (x1..y3), then an unaligned 4-float
 * load/subtract/store at ptr+8 (z3..z4).
 *
 * Lane layouts in the comments list the highest element first, with '|'
 * separating the upper and lower 128-bit halves. They presume that
 * gmx_mm256_unpack128lo_ps/hi_ps(a, b) combine the lower/upper halves of a and
 * b with a's half in the low position, and that gmx_mm256_set_m128() takes the
 * upper half as its first argument; both helpers are defined outside this view.
 */
1160 static gmx_inline void
1161 gmx_mm256_decrement_4rvec_8ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx_restrict ptrB,
1162                                           float * gmx_restrict ptrC, float * gmx_restrict ptrD,
1163                                           float * gmx_restrict ptrE, float * gmx_restrict ptrF,
1164                                           float * gmx_restrict ptrG, float * gmx_restrict ptrH,
1165                                           __m256 x1, __m256 y1, __m256 z1,
1166                                           __m256 x2, __m256 y2, __m256 z2,
1167                                           __m256 x3, __m256 y3, __m256 z3,
1168                                           __m256 x4, __m256 y4, __m256 z4)
1169 {
1170     __m256 t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12;
1171     __m256 tA, tB, tC, tD, tE, tF, tG, tH;
1172     __m256 tI, tJ, tK, tL;
1173
    /* Current values of the first 8 floats behind each pointer */
1174     tA          = _mm256_loadu_ps(ptrA);
1175     tB          = _mm256_loadu_ps(ptrB);
1176     tC          = _mm256_loadu_ps(ptrC);
1177     tD          = _mm256_loadu_ps(ptrD);
1178     tE          = _mm256_loadu_ps(ptrE);
1179     tF          = _mm256_loadu_ps(ptrF);
1180     tG          = _mm256_loadu_ps(ptrG);
1181     tH          = _mm256_loadu_ps(ptrH);
1182
    /* Transpose x1..y3: interleave pairs, gather 4-element groups, then join 128-bit halves */
1183     t1          = _mm256_unpacklo_ps(x1, y1);                         /* y1f x1f y1e x1e | y1b x1b y1a x1a */
1184     t2          = _mm256_unpackhi_ps(x1, y1);                         /* y1h x1h y1g x1g | y1d x1d y1c x1c */
1185     t3          = _mm256_unpacklo_ps(z1, x2);                         /* x2f z1f x2e z1e | x2b z1b x2a z1a */
1186     t4          = _mm256_unpackhi_ps(z1, x2);                         /* x2h z1h x2g z1g | x2d z1d x2c z1c */
1187
1188     t5          = _mm256_unpacklo_ps(y2, z2);                         /* z2f y2f z2e y2e | z2b y2b z2a y2a */
1189     t6          = _mm256_unpackhi_ps(y2, z2);                         /* z2h y2h z2g y2g | z2d y2d z2c y2c */
1190     t7          = _mm256_unpacklo_ps(x3, y3);                         /* y3f x3f y3e x3e | y3b x3b y3a x3a */
1191     t8          = _mm256_unpackhi_ps(x3, y3);                         /* y3h x3h y3g x3g | y3d x3d y3c x3c */
1192
1193     t9          = _mm256_shuffle_ps(t1, t3, _MM_SHUFFLE(1, 0, 1, 0)); /* x2e z1e y1e x1e | x2a z1a y1a x1a */
1194     t10         = _mm256_shuffle_ps(t1, t3, _MM_SHUFFLE(3, 2, 3, 2)); /* x2f z1f y1f x1f | x2b z1b y1b x1b */
1195     t11         = _mm256_shuffle_ps(t2, t4, _MM_SHUFFLE(1, 0, 1, 0)); /* x2g z1g y1g x1g | x2c z1c y1c x1c */
1196     t12         = _mm256_shuffle_ps(t2, t4, _MM_SHUFFLE(3, 2, 3, 2)); /* x2h z1h y1h x1h | x2d z1d y1d x1d */
1197
1198     t1          = _mm256_shuffle_ps(t5, t7, _MM_SHUFFLE(1, 0, 1, 0)); /* y3e x3e z2e y2e | y3a x3a z2a y2a */
1199     t2          = _mm256_shuffle_ps(t5, t7, _MM_SHUFFLE(3, 2, 3, 2)); /* y3f x3f z2f y2f | y3b x3b z2b y2b */
1200     t3          = _mm256_shuffle_ps(t6, t8, _MM_SHUFFLE(1, 0, 1, 0)); /* y3g x3g z2g y2g | y3c x3c z2c y2c */
1201     t4          = _mm256_shuffle_ps(t6, t8, _MM_SHUFFLE(3, 2, 3, 2)); /* y3h x3h z2h y2h | y3d x3d z2d y2d */
1202
1203     t5          = gmx_mm256_unpack128lo_ps(t9, t1);                   /* y3a x3a z2a y2a | x2a z1a y1a x1a */
1204     t6          = gmx_mm256_unpack128hi_ps(t9, t1);                   /* y3e x3e z2e y2e | x2e z1e y1e x1e */
1205     t7          = gmx_mm256_unpack128lo_ps(t10, t2);                  /* y3b x3b z2b y2b | x2b z1b y1b x1b */
1206     t8          = gmx_mm256_unpack128hi_ps(t10, t2);                  /* y3f x3f z2f y2f | x2f z1f y1f x1f */
1207     t1          = gmx_mm256_unpack128lo_ps(t11, t3);                  /* y3c x3c z2c y2c | x2c z1c y1c x1c */
1208     t2          = gmx_mm256_unpack128hi_ps(t11, t3);                  /* y3g x3g z2g y2g | x2g z1g y1g x1g */
1209     t9          = gmx_mm256_unpack128lo_ps(t12, t4);                  /* y3d x3d z2d y2d | x2d z1d y1d x1d */
1210     t10         = gmx_mm256_unpack128hi_ps(t12, t4);                  /* y3h x3h z2h y2h | x2h z1h y1h x1h */
1211
    /* Note the register order: lanes a..h ended up in t5 t7 t1 t9 t6 t8 t2 t10 */
1212     tA          = _mm256_sub_ps(tA, t5);
1213     tB          = _mm256_sub_ps(tB, t7);
1214     tC          = _mm256_sub_ps(tC, t1);
1215     tD          = _mm256_sub_ps(tD, t9);
1216     tE          = _mm256_sub_ps(tE, t6);
1217     tF          = _mm256_sub_ps(tF, t8);
1218     tG          = _mm256_sub_ps(tG, t2);
1219     tH          = _mm256_sub_ps(tH, t10);
1220
1221     _mm256_storeu_ps(ptrA, tA);
1222     _mm256_storeu_ps(ptrB, tB);
1223     _mm256_storeu_ps(ptrC, tC);
1224     _mm256_storeu_ps(ptrD, tD);
1225     _mm256_storeu_ps(ptrE, tE);
1226     _mm256_storeu_ps(ptrF, tF);
1227     _mm256_storeu_ps(ptrG, tG);
1228     _mm256_storeu_ps(ptrH, tH);
1229
    /* Remaining 4 floats at ptr+8: two pointers share one register (A/E, B/F, C/G, D/H) */
1230     tI          = gmx_mm256_set_m128(_mm_loadu_ps(ptrE+8), _mm_loadu_ps(ptrA+8));
1231     tJ          = gmx_mm256_set_m128(_mm_loadu_ps(ptrF+8), _mm_loadu_ps(ptrB+8));
1232     tK          = gmx_mm256_set_m128(_mm_loadu_ps(ptrG+8), _mm_loadu_ps(ptrC+8));
1233     tL          = gmx_mm256_set_m128(_mm_loadu_ps(ptrH+8), _mm_loadu_ps(ptrD+8));
1234
1235     t1          = _mm256_unpacklo_ps(z3, x4);                         /* x4f z3f x4e z3e | x4b z3b x4a z3a */
1236     t2          = _mm256_unpackhi_ps(z3, x4);                         /* x4h z3h x4g z3g | x4d z3d x4c z3c */
1237     t3          = _mm256_unpacklo_ps(y4, z4);                         /* z4f y4f z4e y4e | z4b y4b z4a y4a */
1238     t4          = _mm256_unpackhi_ps(y4, z4);                         /* z4h y4h z4g y4g | z4d y4d z4c y4c */
1239
1240     t5          = _mm256_shuffle_ps(t1, t3, _MM_SHUFFLE(1, 0, 1, 0)); /* z4e y4e x4e z3e | z4a y4a x4a z3a */
1241     t6          = _mm256_shuffle_ps(t1, t3, _MM_SHUFFLE(3, 2, 3, 2)); /* z4f y4f x4f z3f | z4b y4b x4b z3b */
1242     t7          = _mm256_shuffle_ps(t2, t4, _MM_SHUFFLE(1, 0, 1, 0)); /* z4g y4g x4g z3g | z4c y4c x4c z3c */
1243     t8          = _mm256_shuffle_ps(t2, t4, _MM_SHUFFLE(3, 2, 3, 2)); /* z4h y4h x4h z3h | z4d y4d x4d z3d */
1244
1245     tI          = _mm256_sub_ps(tI, t5);
1246     tJ          = _mm256_sub_ps(tJ, t6);
1247     tK          = _mm256_sub_ps(tK, t7);
1248     tL          = _mm256_sub_ps(tL, t8);
1249
    /* Lower half of each register goes back to A..D, upper half to E..H */
1250     _mm_storeu_ps(ptrA+8, _mm256_castps256_ps128(tI));
1251     _mm_storeu_ps(ptrB+8, _mm256_castps256_ps128(tJ));
1252     _mm_storeu_ps(ptrC+8, _mm256_castps256_ps128(tK));
1253     _mm_storeu_ps(ptrD+8, _mm256_castps256_ps128(tL));
1254     _mm_storeu_ps(ptrE+8, _mm256_extractf128_ps(tI, 0x1));
1255     _mm_storeu_ps(ptrF+8, _mm256_extractf128_ps(tJ, 0x1));
1256     _mm_storeu_ps(ptrG+8, _mm256_extractf128_ps(tK, 0x1));
1257     _mm_storeu_ps(ptrH+8, _mm256_extractf128_ps(tL, 0x1));
1258 }
1259 #endif
1260
1261
1262 static gmx_inline void
1263 gmx_mm256_update_iforce_1atom_swizzle_ps(__m256 fix1, __m256 fiy1, __m256 fiz1,
1264                                          float * gmx_restrict fptr,
1265                                          float * gmx_restrict fshiftptr)
1266 {
1267     __m128 t1, t2, t3;
1268
1269     fix1 = _mm256_hadd_ps(fix1, fix1);
1270     fiy1 = _mm256_hadd_ps(fiy1, fiz1);
1271     fix1 = _mm256_hadd_ps(fix1, fiy1); /* fiz1 fiy1 fix1 fix1 (in both lanes) */
1272
1273     /* Add across the two lanes */
1274     t1   = _mm_add_ps(_mm256_castps256_ps128(fix1), _mm256_extractf128_ps(fix1, 0x1));
1275
1276     t2 = _mm_load_ss(fptr);
1277     t2 = _mm_loadh_pi(t2, (__m64 *)(fptr+1));
1278     t3 = _mm_load_ss(fshiftptr);
1279     t3 = _mm_loadh_pi(t3, (__m64 *)(fshiftptr+1));
1280
1281     t2 = _mm_add_ps(t2, t1);
1282     t3 = _mm_add_ps(t3, t1);
1283
1284     _mm_store_ss(fptr, t2);
1285     _mm_storeh_pi((__m64 *)(fptr+1), t2);
1286     _mm_store_ss(fshiftptr, t3);
1287     _mm_storeh_pi((__m64 *)(fshiftptr+1), t3);
1288 }
1289
1290 #if defined (_MSC_VER) && defined(_M_IX86)
1291 /* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters.
 *
 * Reduces the nine force accumulators (x/y/z of three atoms) over their eight
 * SIMD elements, adds the nine sums to fptr[0..8] and adds the per-component
 * totals over the three atoms to fshiftptr[0..2]. fshiftptr is accessed with a
 * 4-float unaligned load/store, so fshiftptr[3] is also read and written back
 * (with 0 added).
 *
 * NOTE(review): unlike the function version in the #else branch, this macro
 * assigns to its arguments fix1, fiz1, fiy2, fix3 and fiz3, so they must be
 * modifiable variables and hold intermediate sums afterwards. It expands to a
 * bare { } block, and fptr is pasted unparenthesized in "fptr+8".
 */
1292 #define gmx_mm256_update_iforce_3atom_swizzle_ps(fix1, fiy1, fiz1, fix2, fiy2, fiz2, fix3, fiy3, fiz3, \
1293                                                  fptr, fshiftptr) \
1294     { \
1295         __m256 _t1, _t2, _t3; \
1296         __m128 _tA, _tB, _tC; \
1297 \
1298         fix1 = _mm256_hadd_ps(fix1, fiy1); \
1299         fiz1 = _mm256_hadd_ps(fiz1, fix2); \
1300         fiy2 = _mm256_hadd_ps(fiy2, fiz2); \
1301         fix3 = _mm256_hadd_ps(fix3, fiy3); \
1302         fiz3 = _mm256_hadd_ps(fiz3, _mm256_setzero_ps()); \
1303         fix1 = _mm256_hadd_ps(fix1, fiz1); \
1304         fiy2 = _mm256_hadd_ps(fiy2, fix3); \
1305         fiz3 = _mm256_hadd_ps(fiz3, _mm256_setzero_ps()); \
1306 \
1307         _t1  = gmx_mm256_unpack128lo_ps(fix1, fiy2); \
1308         _t2  = gmx_mm256_unpack128hi_ps(fix1, fiy2); \
1309         _t1  = _mm256_add_ps(_t1, _t2); \
1310         _tA  = _mm_add_ps(_mm256_castps256_ps128(fiz3), _mm256_extractf128_ps(fiz3, 0x1)); \
1311         _t3  = _mm256_loadu_ps(fptr); \
1312         _t3  = _mm256_add_ps(_t3, _t1); \
1313         _mm256_storeu_ps(fptr, _t3); \
1314         _tB  = _mm_load_ss(fptr+8); \
1315         _tB  = _mm_add_ss(_tB, _tA); \
1316         _mm_store_ss(fptr+8, _tB); \
1317 \
1318         _tB  = _mm256_extractf128_ps(_t1, 0x1); \
1319         _tC  = _mm_shuffle_ps(_mm256_castps256_ps128(_t1), _tB, _MM_SHUFFLE(1, 0, 3, 3)); \
1320         _tB  = _mm_shuffle_ps(_tB, _tA, _MM_SHUFFLE(1, 0, 3, 2)); \
1321         _tC  = _mm_permute_ps(_tC, _MM_SHUFFLE(3, 3, 2, 0)); \
1322         _tB  = _mm_add_ps(_tB, _mm256_castps256_ps128(_t1)); \
1323         _tA  = _mm_add_ps(_tB, _tC); \
1324         _tA  = _mm_blend_ps(_mm_setzero_ps(), _tA, 0x7); \
1325         _tC  = _mm_loadu_ps(fshiftptr); \
1326         _tC  = _mm_add_ps(_tC, _tA); \
1327         _mm_storeu_ps(fshiftptr, _tC); \
1328     }
1329 #else
1330 /* Real function for sane compilers.
 *
 * Reduces nine i-atom force accumulators (x/y/z of three atoms) over their
 * eight SIMD elements and adds the results to memory:
 *   fptr[0..8]      += x1 y1 z1 x2 y2 z2 x3 y3 z3
 *                      (8-float unaligned load/store, then a single-float
 *                      update at fptr+8)
 *   fshiftptr[0..2] += x1+x2+x3, y1+y2+y3, z1+z2+z3
 * fshiftptr is updated through a 4-float unaligned load/store, so the float at
 * fshiftptr[3] is also read and written back (with 0 added).
 *
 * Layout comments list the highest element first. They presume that
 * gmx_mm256_unpack128lo_ps/hi_ps(a, b) pair the lower/upper 128-bit halves of
 * a and b with a's half in the low position (helpers defined outside this view).
 */
1331 static gmx_inline void
1332 gmx_mm256_update_iforce_3atom_swizzle_ps(__m256 fix1, __m256 fiy1, __m256 fiz1,
1333                                          __m256 fix2, __m256 fiy2, __m256 fiz2,
1334                                          __m256 fix3, __m256 fiy3, __m256 fiz3,
1335                                          float * gmx_restrict fptr,
1336                                          float * gmx_restrict fshiftptr)
1337 {
1338     __m256 t1, t2, t3;
1339     __m128 tA, tB, tC;
1340
    /* First round of horizontal adds: pairwise sums, two accumulators per register */
1341     fix1 = _mm256_hadd_ps(fix1, fiy1);                /*  Y1g+Y1h Y1e+Y1f X1g+X1h X1e+X1f | Y1c+Y1d Y1a+Y1b X1c+X1d X1a+X1b */
1342     fiz1 = _mm256_hadd_ps(fiz1, fix2);                /*  X2g+X2h X2e+X2f Z1g+Z1h Z1e+Z1f | X2c+X2d X2a+X2b Z1c+Z1d Z1a+Z1b */
1343     fiy2 = _mm256_hadd_ps(fiy2, fiz2);                /*  Z2g+Z2h Z2e+Z2f Y2g+Y2h Y2e+Y2f | Z2c+Z2d Z2a+Z2b Y2c+Y2d Y2a+Y2b */
1344     fix3 = _mm256_hadd_ps(fix3, fiy3);                /*  Y3g+Y3h Y3e+Y3f X3g+X3h X3e+X3f | Y3c+Y3d Y3a+Y3b X3c+X3d X3a+X3b */
1345     fiz3 = _mm256_hadd_ps(fiz3, _mm256_setzero_ps()); /*  0       0       Z3g+Z3h Z3e+Z3f | 0       0       Z3c+Z3d Z3a+Z3b */
1346
    /* Second round: each element is now the sum over one 128-bit half */
1347     fix1 = _mm256_hadd_ps(fix1, fiz1);                /*  X2e-h   Z1e-h   Y1e-h   X1e-h   | X2a-d   Z1a-d   Y1a-d   X1a-d   */
1348     fiy2 = _mm256_hadd_ps(fiy2, fix3);                /*  Y3e-h   X3e-h   Z2e-h   Y2e-h   | Y3a-d   X3a-d   Z2a-d   Y2a-d   */
1349     fiz3 = _mm256_hadd_ps(fiz3, _mm256_setzero_ps()); /*  0       0       0       Z3e-h   | 0       0       0       Z3a-d   */
1350
1351     /* Add across the two lanes by swapping and adding back */
1352     t1   = gmx_mm256_unpack128lo_ps(fix1, fiy2);                                       /*  Y3a-d   X3a-d   Z2a-d   Y2a-d | X2a-d   Z1a-d   Y1a-d   X1a-d */
1353     t2   = gmx_mm256_unpack128hi_ps(fix1, fiy2);                                       /*  Y3e-h   X3e-h   Z2e-h   Y2e-h | X2e-h   Z1e-h   Y1e-h   X1e-h */
1354     t1   = _mm256_add_ps(t1, t2);                                                      /* y3 x3 z2 y2 | x2 z1 y1 x1 */
1355
1356     tA   = _mm_add_ps(_mm256_castps256_ps128(fiz3), _mm256_extractf128_ps(fiz3, 0x1)); /* 0 0 0 z3 */
1357
    /* Accumulate into the force array: 8 floats at once, z3 separately */
1358     t3   = _mm256_loadu_ps(fptr);
1359     t3   = _mm256_add_ps(t3, t1);
1360     _mm256_storeu_ps(fptr, t3);
1361     tB   = _mm_load_ss(fptr+8);
1362     tB   = _mm_add_ss(tB, tA);
1363     _mm_store_ss(fptr+8, tB);
1364
1365     /* Add up shift force */
1366     tB   = _mm256_extractf128_ps(t1, 0x1);                                          /* y3 x3 z2 y2 */
1367     tC   = _mm_shuffle_ps(_mm256_castps256_ps128(t1), tB, _MM_SHUFFLE(1, 0, 3, 3)); /* z2 y2 x2 x2 */
1368     tB   = _mm_shuffle_ps(tB, tA, _MM_SHUFFLE(1, 0, 3, 2));                         /* 0 z3 y3 x3 */
1369     tC   = _mm_permute_ps(tC, _MM_SHUFFLE(3, 3, 2, 0));                             /*  - z2 y2 x2 */
1370
1371     tB   = _mm_add_ps(tB, _mm256_castps256_ps128(t1)); /*  - z1+z3 y1+y3 x1+x3 */
1372     tA   = _mm_add_ps(tB, tC);                      /*  - z y x */
1373
1374     tA   = _mm_blend_ps(_mm_setzero_ps(), tA, 0x7); /* 0 z y x */
1375
1376     tC   = _mm_loadu_ps(fshiftptr);
1377     tC   = _mm_add_ps(tC, tA);
1378     _mm_storeu_ps(fshiftptr, tC);
1379 }
1380 #endif
1381
1382
1383 #if defined (_MSC_VER) && defined(_M_IX86)
1384 /* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters.
 *
 * Reduces the twelve force accumulators (x/y/z of four atoms) over their eight
 * SIMD elements, adds the twelve sums to fptr[0..11] and adds the per-component
 * totals over the four atoms to fshiftptr[0..2]. fshiftptr is accessed with a
 * 4-float unaligned load/store, so fshiftptr[3] is also read and written back
 * (with 0 added).
 *
 * NOTE(review): unlike the function version in the #else branch, this macro
 * assigns to its arguments fix1, fiz1, fiy2, fix3, fiz3 and fiy4, so they must
 * be modifiable variables and hold intermediate sums afterwards. It expands to
 * a bare { } block, and fptr is pasted unparenthesized in "fptr+8".
 */
1385 #define gmx_mm256_update_iforce_4atom_swizzle_ps(fix1, fiy1, fiz1, fix2, fiy2, fiz2, fix3, fiy3, fiz3, fix4, fiy4, fiz4, \
1386                                                  fptr, fshiftptr) \
1387     { \
1388         __m256 _t1, _t2, _t3; \
1389         __m128 _tA, _tB, _tC; \
1390 \
1391         fix1 = _mm256_hadd_ps(fix1, fiy1); \
1392         fiz1 = _mm256_hadd_ps(fiz1, fix2); \
1393         fiy2 = _mm256_hadd_ps(fiy2, fiz2); \
1394         fix3 = _mm256_hadd_ps(fix3, fiy3); \
1395         fiz3 = _mm256_hadd_ps(fiz3, fix4); \
1396         fiy4 = _mm256_hadd_ps(fiy4, fiz4); \
1397 \
1398         fix1 = _mm256_hadd_ps(fix1, fiz1); \
1399         fiy2 = _mm256_hadd_ps(fiy2, fix3); \
1400         fiz3 = _mm256_hadd_ps(fiz3, fiy4); \
1401 \
1402         _t1  = gmx_mm256_unpack128lo_ps(fix1, fiy2); \
1403         _t2  = gmx_mm256_unpack128hi_ps(fix1, fiy2); \
1404         _t1  = _mm256_add_ps(_t1, _t2); \
1405         _tA  = _mm_add_ps(_mm256_castps256_ps128(fiz3), _mm256_extractf128_ps(fiz3, 0x1)); \
1406         _t3  = _mm256_loadu_ps(fptr); \
1407         _t3  = _mm256_add_ps(_t3, _t1); \
1408         _mm256_storeu_ps(fptr, _t3); \
1409         _tB  = _mm_loadu_ps(fptr+8); \
1410         _tB  = _mm_add_ps(_tB, _tA); \
1411         _mm_storeu_ps(fptr+8, _tB); \
1412 \
1413         _tB  = _mm256_extractf128_ps(_t1, 0x1); \
1414         _tC  = _mm_shuffle_ps(_mm256_castps256_ps128(_t1), _tB, _MM_SHUFFLE(1, 0, 3, 3)); \
1415         _tB  = _mm_shuffle_ps(_tB, _tA, _MM_SHUFFLE(1, 0, 3, 2)); \
1416         _tC  = _mm_permute_ps(_tC, _MM_SHUFFLE(3, 3, 2, 0)); \
1417         _tA  = _mm_permute_ps(_tA, _MM_SHUFFLE(0, 3, 2, 1)); \
1418         _tB  = _mm_add_ps(_tB, _mm256_castps256_ps128(_t1)); \
1419         _tA  = _mm_add_ps(_tA, _tC); \
1420         _tA  = _mm_add_ps(_tA, _tB); \
1421         _tA  = _mm_blend_ps(_mm_setzero_ps(), _tA, 0x7); \
1422         _tC  = _mm_loadu_ps(fshiftptr); \
1423         _tC  = _mm_add_ps(_tC, _tA); \
1424         _mm_storeu_ps(fshiftptr, _tC); \
1425     }
1426 #else
1427 /* Real function for sane compilers.
 *
 * Reduces twelve i-atom force accumulators (x/y/z of four atoms) over their
 * eight SIMD elements and adds the results to memory:
 *   fptr[0..11]     += x1 y1 z1 x2 y2 z2 x3 y3 z3 x4 y4 z4
 *                      (8-float unaligned load/store, then a 4-float one
 *                      at fptr+8)
 *   fshiftptr[0..2] += x1+x2+x3+x4, y1+y2+y3+y4, z1+z2+z3+z4
 * fshiftptr is updated through a 4-float unaligned load/store, so the float at
 * fshiftptr[3] is also read and written back (with 0 added).
 *
 * Layout comments list the highest element first; '-' marks an element whose
 * value is not used. They presume that gmx_mm256_unpack128lo_ps/hi_ps(a, b)
 * pair the lower/upper 128-bit halves of a and b with a's half in the low
 * position (helpers defined outside this view).
 */
1428 static gmx_inline void
1429 gmx_mm256_update_iforce_4atom_swizzle_ps(__m256 fix1, __m256 fiy1, __m256 fiz1,
1430                                          __m256 fix2, __m256 fiy2, __m256 fiz2,
1431                                          __m256 fix3, __m256 fiy3, __m256 fiz3,
1432                                          __m256 fix4, __m256 fiy4, __m256 fiz4,
1433                                          float * gmx_restrict fptr,
1434                                          float * gmx_restrict fshiftptr)
1435 {
1436     __m256 t1, t2, t3;
1437     __m128 tA, tB, tC;
1438
    /* First round of horizontal adds: pairwise sums, two accumulators per register */
1439     fix1 = _mm256_hadd_ps(fix1, fiy1);                /*  Y1g+Y1h Y1e+Y1f X1g+X1h X1e+X1f | Y1c+Y1d Y1a+Y1b X1c+X1d X1a+X1b */
1440     fiz1 = _mm256_hadd_ps(fiz1, fix2);                /*  X2g+X2h X2e+X2f Z1g+Z1h Z1e+Z1f | X2c+X2d X2a+X2b Z1c+Z1d Z1a+Z1b */
1441     fiy2 = _mm256_hadd_ps(fiy2, fiz2);                /*  Z2g+Z2h Z2e+Z2f Y2g+Y2h Y2e+Y2f | Z2c+Z2d Z2a+Z2b Y2c+Y2d Y2a+Y2b */
1442     fix3 = _mm256_hadd_ps(fix3, fiy3);                /*  Y3g+Y3h Y3e+Y3f X3g+X3h X3e+X3f | Y3c+Y3d Y3a+Y3b X3c+X3d X3a+X3b */
1443     fiz3 = _mm256_hadd_ps(fiz3, fix4);                /*  X4g+X4h X4e+X4f Z3g+Z3h Z3e+Z3f | X4c+X4d X4a+X4b Z3c+Z3d Z3a+Z3b */
1444     fiy4 = _mm256_hadd_ps(fiy4, fiz4);                /*  Z4g+Z4h Z4e+Z4f Y4g+Y4h Y4e+Y4f | Z4c+Z4d Z4a+Z4b Y4c+Y4d Y4a+Y4b */
1445
    /* Second round: each element is now the sum over one 128-bit half */
1446     fix1 = _mm256_hadd_ps(fix1, fiz1);                /*  X2e-h   Z1e-h   Y1e-h   X1e-h   | X2a-d   Z1a-d   Y1a-d   X1a-d   */
1447     fiy2 = _mm256_hadd_ps(fiy2, fix3);                /*  Y3e-h   X3e-h   Z2e-h   Y2e-h   | Y3a-d   X3a-d   Z2a-d   Y2a-d   */
1448     fiz3 = _mm256_hadd_ps(fiz3, fiy4);                /*  Z4e-h   Y4e-h   X4e-h   Z3e-h   | Z4a-d   Y4a-d   X4a-d   Z3a-d   */
1449
1450     /* Add across the two lanes by swapping and adding back */
1451     t1   = gmx_mm256_unpack128lo_ps(fix1, fiy2);                                       /*  Y3a-d   X3a-d   Z2a-d   Y2a-d | X2a-d   Z1a-d   Y1a-d   X1a-d */
1452     t2   = gmx_mm256_unpack128hi_ps(fix1, fiy2);                                       /*  Y3e-h   X3e-h   Z2e-h   Y2e-h | X2e-h   Z1e-h   Y1e-h   X1e-h */
1453     t1   = _mm256_add_ps(t1, t2);                                                      /* y3 x3 z2 y2 | x2 z1 y1 x1 */
1454
1455     tA   = _mm_add_ps(_mm256_castps256_ps128(fiz3), _mm256_extractf128_ps(fiz3, 0x1)); /* z4 y4 x4 z3 */
1456
    /* Accumulate into the force array: floats 0..7, then floats 8..11 */
1457     t3   = _mm256_loadu_ps(fptr);
1458     t3   = _mm256_add_ps(t3, t1);
1459     _mm256_storeu_ps(fptr, t3);
1460
1461     tB   = _mm_loadu_ps(fptr+8);
1462     tB   = _mm_add_ps(tB, tA);
1463     _mm_storeu_ps(fptr+8, tB);
1464
1465     /* Add up shift force */
1466     tB   = _mm256_extractf128_ps(t1, 0x1);                                          /* y3 x3 z2 y2 */
1467     tC   = _mm_shuffle_ps(_mm256_castps256_ps128(t1), tB, _MM_SHUFFLE(1, 0, 3, 3)); /* z2 y2 x2 x2 */
1468     tB   = _mm_shuffle_ps(tB, tA, _MM_SHUFFLE(1, 0, 3, 2));                         /* x4 z3 y3 x3 (top element unused) */
1469     tC   = _mm_permute_ps(tC, _MM_SHUFFLE(3, 3, 2, 0));                             /*  - z2 y2 x2 */
1470     tA   = _mm_permute_ps(tA, _MM_SHUFFLE(0, 3, 2, 1));                             /* - z4 y4 x4 */
1471
1472     tB   = _mm_add_ps(tB, _mm256_castps256_ps128(t1)); /* - z1+z3 y1+y3 x1+x3 */
1473     tA   = _mm_add_ps(tA, tC);                         /* - z2+z4 y2+y4 x2+x4 */
1474     tA   = _mm_add_ps(tA, tB);                         /* - z y x */
1475
1476     tA   = _mm_blend_ps(_mm_setzero_ps(), tA, 0x7); /* 0 z y x */
1477
1478     tC   = _mm_loadu_ps(fshiftptr);
1479     tC   = _mm_add_ps(tC, tA);
1480     _mm_storeu_ps(fshiftptr, tC);
1481 }
1482 #endif
1483
1484
1485
1486 static gmx_inline void
1487 gmx_mm256_update_1pot_ps(__m256 pot1, float * gmx_restrict ptrA)
1488 {
1489     __m128 t1;
1490
1491     pot1 = _mm256_hadd_ps(pot1, pot1);
1492     pot1 = _mm256_hadd_ps(pot1, pot1);
1493
1494     t1   = _mm_add_ps(_mm256_castps256_ps128(pot1), _mm256_extractf128_ps(pot1, 0x1));
1495
1496     _mm_store_ss(ptrA, _mm_add_ss(_mm_load_ss(ptrA), t1));
1497 }
1498
1499 static gmx_inline void
1500 gmx_mm256_update_2pot_ps(__m256 pot1, float * gmx_restrict ptrA,
1501                          __m256 pot2, float * gmx_restrict ptrB)
1502 {
1503     __m128 t1, t2;
1504
1505     pot1 = _mm256_hadd_ps(pot1, pot2);
1506     pot1 = _mm256_hadd_ps(pot1, pot1);
1507
1508     t1   = _mm_add_ps(_mm256_castps256_ps128(pot1), _mm256_extractf128_ps(pot1, 0x1));
1509
1510     t2   = _mm_permute_ps(t1, _MM_SHUFFLE(1, 1, 1, 1));
1511     _mm_store_ss(ptrA, _mm_add_ss(_mm_load_ss(ptrA), t1));
1512     _mm_store_ss(ptrB, _mm_add_ss(_mm_load_ss(ptrB), t2));
1513 }
1514
1515
1516 #endif /* _kernelutil_x86_avx_256_single_h_ */