src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_single/kernelutil_x86_avx_256_single.h

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2012,2013, by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35 #ifndef _kernelutil_x86_avx_256_single_h_
  36 #define _kernelutil_x86_avx_256_single_h_
  37
  38 #include "gmx_x86_avx_256.h"
  39
  40 /* Transpose lower/upper half of 256-bit registers separately */
  41 #define GMX_MM256_HALFTRANSPOSE4_PS(ymm0, ymm1, ymm2, ymm3) {            \
  42         __m256 __tmp0, __tmp1, __tmp2, __tmp3;                               \
  43                                                                       \
  44         __tmp0   = _mm256_unpacklo_ps((ymm0), (ymm1));                     \
  45         __tmp1   = _mm256_unpacklo_ps((ymm2), (ymm3));                     \
  46         __tmp2   = _mm256_unpackhi_ps((ymm0), (ymm1));                     \
  47         __tmp3   = _mm256_unpackhi_ps((ymm2), (ymm3));                     \
  48         ymm0     = _mm256_shuffle_ps(__tmp0, __tmp1, _MM_SHUFFLE(1, 0, 1, 0)); \
  49         ymm1     = _mm256_shuffle_ps(__tmp0, __tmp1, _MM_SHUFFLE(3, 2, 3, 2)); \
  50         ymm2     = _mm256_shuffle_ps(__tmp2, __tmp3, _MM_SHUFFLE(1, 0, 1, 0)); \
  51         ymm3     = _mm256_shuffle_ps(__tmp2, __tmp3, _MM_SHUFFLE(3, 2, 3, 2)); \
  52 }
  53
  54
  55 static gmx_inline __m256
  56 gmx_mm256_calc_rsq_ps(__m256 dx, __m256 dy, __m256 dz)
  57 {
  58     return _mm256_add_ps( _mm256_add_ps( _mm256_mul_ps(dx, dx), _mm256_mul_ps(dy, dy) ), _mm256_mul_ps(dz, dz) );
  59 }
  60
  61 /* Normal sum of four ymm registers */
  62 #define gmx_mm256_sum4_ps(t0, t1, t2, t3)  _mm256_add_ps(_mm256_add_ps(t0, t1), _mm256_add_ps(t2, t3))
  63
  64
  65 static gmx_inline int
  66 gmx_mm256_any_lt(__m256 a, __m256 b)
  67 {
  68     return _mm256_movemask_ps(_mm256_cmp_ps(a, b, _CMP_LT_OQ));
  69 }
  70
  71
  72 static gmx_inline __m256
  73 gmx_mm256_load_4real_swizzle_ps(const float * gmx_restrict ptrA, const float * gmx_restrict ptrB,
  74                                 const float * gmx_restrict ptrC, const float * gmx_restrict ptrD)
  75 {
  76     __m128 t1, t2;
  77
  78     t1 = _mm_unpacklo_ps(_mm_load_ss(ptrA), _mm_load_ss(ptrC));
  79     t2 = _mm_unpacklo_ps(_mm_load_ss(ptrB), _mm_load_ss(ptrD));
  80     return _mm256_castps128_ps256(_mm_unpacklo_ps(t1, t2));
  81 }
  82
  83
  84 static gmx_inline __m256
  85 gmx_mm256_load_8real_swizzle_ps(const float * gmx_restrict ptrA, const float * gmx_restrict ptrB,
  86                                 const float * gmx_restrict ptrC, const float * gmx_restrict ptrD,
  87                                 const float * gmx_restrict ptrE, const float * gmx_restrict ptrF,
  88                                 const float * gmx_restrict ptrG, const float * gmx_restrict ptrH)
  89 {
  90     __m256 t1, t2;
  91
  92     t1 = gmx_mm256_load_4real_swizzle_ps(ptrA, ptrB, ptrC, ptrD);
  93     t2 = gmx_mm256_load_4real_swizzle_ps(ptrE, ptrF, ptrG, ptrH);
  94
  95     return _mm256_permute2f128_ps(t1, t2, 0x20);
  96 }
  97
  98
  99
 100 static gmx_inline void
 101 gmx_mm256_store_4real_swizzle_ps(float * gmx_restrict ptrA, float * gmx_restrict ptrB,
 102                                  float * gmx_restrict ptrC, float * gmx_restrict ptrD, __m256 xmm1)
 103 {
 104     __m256 t2, t3, t4;
 105
 106     t2       = _mm256_permute_ps(xmm1, _MM_SHUFFLE(1, 1, 1, 1));
 107     t3       = _mm256_permute_ps(xmm1, _MM_SHUFFLE(2, 2, 2, 2));
 108     t4       = _mm256_permute_ps(xmm1, _MM_SHUFFLE(3, 3, 3, 3));
 109     _mm_store_ss(ptrA, _mm256_castps256_ps128(xmm1));
 110     _mm_store_ss(ptrB, _mm256_castps256_ps128(t2));
 111     _mm_store_ss(ptrC, _mm256_castps256_ps128(t3));
 112     _mm_store_ss(ptrD, _mm256_castps256_ps128(t4));
 113 }
 114
 115
 116 static gmx_inline void
 117 gmx_mm256_store_8real_swizzle_ps(float * gmx_restrict ptrA, float * gmx_restrict ptrB,
 118                                  float * gmx_restrict ptrC, float * gmx_restrict ptrD,
 119                                  float * gmx_restrict ptrE, float * gmx_restrict ptrF,
 120                                  float * gmx_restrict ptrG, float * gmx_restrict ptrH, __m256 xmm1)
 121 {
 122     __m256 t1;
 123
 124     t1 = _mm256_permute2f128_ps(xmm1, xmm1, 0x11);
 125
 126     gmx_mm256_store_4real_swizzle_ps(ptrA, ptrB, ptrC, ptrD, xmm1);
 127     gmx_mm256_store_4real_swizzle_ps(ptrE, ptrF, ptrG, ptrH, t1);
 128 }
 129
 130
 131 static gmx_inline void
 132 gmx_mm256_increment_4real_swizzle_ps(float * gmx_restrict ptrA, float * gmx_restrict ptrB,
 133                                      float * gmx_restrict ptrC, float * gmx_restrict ptrD,
 134                                      __m256 xmm1)
 135 {
 136     __m128 t1, t2, t3, t4;
 137
 138     t1   = _mm256_castps256_ps128(xmm1);
 139     t2   = _mm_permute_ps(t1, _MM_SHUFFLE(1, 1, 1, 1));
 140     t3   = _mm_permute_ps(t1, _MM_SHUFFLE(2, 2, 2, 2));
 141     t4   = _mm_permute_ps(t1, _MM_SHUFFLE(3, 3, 3, 3));
 142
 143     t1   = _mm_add_ss(t1, _mm_load_ss(ptrA));
 144     t2   = _mm_add_ss(t2, _mm_load_ss(ptrB));
 145     t3   = _mm_add_ss(t3, _mm_load_ss(ptrC));
 146     t4   = _mm_add_ss(t4, _mm_load_ss(ptrD));
 147
 148     _mm_store_ss(ptrA, t1);
 149     _mm_store_ss(ptrB, t2);
 150     _mm_store_ss(ptrC, t3);
 151     _mm_store_ss(ptrD, t4);
 152 }
 153
 154 static gmx_inline void
 155 gmx_mm256_increment_8real_swizzle_ps(float * gmx_restrict ptrA, float * gmx_restrict ptrB,
 156                                      float * gmx_restrict ptrC, float * gmx_restrict ptrD,
 157                                      float * gmx_restrict ptrE, float * gmx_restrict ptrF,
 158                                      float * gmx_restrict ptrG, float * gmx_restrict ptrH,
 159                                      __m256 xmm1)
 160 {
 161     __m256 t1;
 162
 163     t1 = _mm256_permute2f128_ps(xmm1, xmm1, 0x11);
 164
 165     gmx_mm256_increment_4real_swizzle_ps(ptrA, ptrB, ptrC, ptrD, xmm1);
 166     gmx_mm256_increment_4real_swizzle_ps(ptrE, ptrF, ptrG, ptrH, t1);
 167 }
 168
 169
 170 static gmx_inline void
 171 gmx_mm256_load_4pair_swizzle_ps(const float * gmx_restrict p1, const float * gmx_restrict p2,
 172                                 const float * gmx_restrict p3, const float * gmx_restrict p4,
 173                                 __m256 * gmx_restrict c6, __m256 * gmx_restrict c12)
 174 {
 175     __m128 t1, t2, t3, t4;
 176
 177     t1   = _mm_loadl_pi(_mm_setzero_ps(), (__m64 *)p1); /* - - c12a  c6a */
 178     t2   = _mm_loadl_pi(_mm_setzero_ps(), (__m64 *)p2); /* - - c12b  c6b */
 179     t3   = _mm_loadl_pi(_mm_setzero_ps(), (__m64 *)p3); /* - - c12c  c6c */
 180     t4   = _mm_loadl_pi(_mm_setzero_ps(), (__m64 *)p4); /* - - c12d  c6d */
 181
 182     t1   = _mm_unpacklo_ps(t1, t2);                     /* c12b c12a c6b c6a */
 183     t3   = _mm_unpacklo_ps(t3, t4);                     /* c12d c12c c6d c6c */
 184
 185     *c6  = _mm256_castps128_ps256(_mm_shuffle_ps(t1, t3, _MM_SHUFFLE(1, 0, 1, 0)));
 186     *c12 = _mm256_castps128_ps256(_mm_shuffle_ps(t1, t3, _MM_SHUFFLE(3, 2, 3, 2)));
 187 }
 188
 189 static gmx_inline void
 190 gmx_mm256_load_8pair_swizzle_ps(const float * gmx_restrict p1, const float * gmx_restrict p2,
 191                                 const float * gmx_restrict p3, const float * gmx_restrict p4,
 192                                 const float * gmx_restrict p5, const float * gmx_restrict p6,
 193                                 const float * gmx_restrict p7, const float * gmx_restrict p8,
 194                                 __m256 * gmx_restrict c6, __m256 * gmx_restrict c12)
 195 {
 196     __m256 c6l, c6h, c12l, c12h;
 197
 198     gmx_mm256_load_4pair_swizzle_ps(p1, p2, p3, p4, &c6l, &c12l);
 199     gmx_mm256_load_4pair_swizzle_ps(p5, p6, p7, p8, &c6h, &c12h);
 200
 201     *c6  = _mm256_permute2f128_ps(c6l, c6h, 0x20);
 202     *c12 = _mm256_permute2f128_ps(c12l, c12h, 0x20);
 203 }
 204
 205
 206 static gmx_inline void
 207 gmx_mm256_load_shift_and_1rvec_broadcast_ps(const float * gmx_restrict xyz_shift,
 208                                             const float * gmx_restrict xyz,
 209                                             __m256 * gmx_restrict      x1,
 210                                             __m256 * gmx_restrict      y1,
 211                                             __m256 * gmx_restrict      z1)
 212 {
 213     __m128 t1, t2, t3, t4;
 214
 215     t1   = _mm_loadl_pi(_mm_setzero_ps(), (__m64 *)xyz_shift);
 216     t2   = _mm_loadl_pi(_mm_setzero_ps(), (__m64 *)xyz);
 217     t3   = _mm_load_ss(xyz_shift+2);
 218     t4   = _mm_load_ss(xyz+2);
 219     t1   = _mm_add_ps(t1, t2);
 220     t3   = _mm_add_ss(t3, t4);
 221
 222     t2   = _mm_permute_ps(t1, _MM_SHUFFLE(1, 1, 1, 1));
 223     t1   = _mm_permute_ps(t1, _MM_SHUFFLE(0, 0, 0, 0));
 224     t3   = _mm_permute_ps(t3, _MM_SHUFFLE(0, 0, 0, 0));
 225
 226     *x1  = gmx_mm256_set_m128(t1, t1);
 227     *y1  = gmx_mm256_set_m128(t2, t2);
 228     *z1  = gmx_mm256_set_m128(t3, t3);
 229 }
 230
 231
 232 static gmx_inline void
 233 gmx_mm256_load_shift_and_3rvec_broadcast_ps(const float * gmx_restrict xyz_shift,
 234                                             const float * gmx_restrict xyz,
 235                                             __m256 * gmx_restrict x1, __m256 * gmx_restrict y1, __m256 * gmx_restrict z1,
 236                                             __m256 * gmx_restrict x2, __m256 * gmx_restrict y2, __m256 * gmx_restrict z2,
 237                                             __m256 * gmx_restrict x3, __m256 * gmx_restrict y3, __m256 * gmx_restrict z3)
 238 {
 239     __m128 tA, tB;
 240     __m128 t1, t2, t3, t4, t5, t6, t7, t8, t9;
 241
 242     tA   = _mm_loadl_pi(_mm_setzero_ps(), (__m64 *)xyz_shift);
 243     tB   = _mm_load_ss(xyz_shift+2);
 244
 245     t1   = _mm_loadu_ps(xyz);
 246     t2   = _mm_loadu_ps(xyz+4);
 247     t3   = _mm_load_ss(xyz+8);
 248
 249     tA   = _mm_movelh_ps(tA, tB);
 250     t4   = _mm_permute_ps(tA, _MM_SHUFFLE(0, 2, 1, 0));
 251     t5   = _mm_permute_ps(tA, _MM_SHUFFLE(1, 0, 2, 1));
 252     t6   = _mm_permute_ps(tA, _MM_SHUFFLE(2, 1, 0, 2));
 253
 254     t1   = _mm_add_ps(t1, t4);
 255     t2   = _mm_add_ps(t2, t5);
 256     t3   = _mm_add_ss(t3, t6);
 257
 258     t9   = _mm_permute_ps(t3, _MM_SHUFFLE(0, 0, 0, 0));
 259     t8   = _mm_permute_ps(t2, _MM_SHUFFLE(3, 3, 3, 3));
 260     t7   = _mm_permute_ps(t2, _MM_SHUFFLE(2, 2, 2, 2));
 261     t6   = _mm_permute_ps(t2, _MM_SHUFFLE(1, 1, 1, 1));
 262     t5   = _mm_permute_ps(t2, _MM_SHUFFLE(0, 0, 0, 0));
 263     t4   = _mm_permute_ps(t1, _MM_SHUFFLE(3, 3, 3, 3));
 264     t3   = _mm_permute_ps(t1, _MM_SHUFFLE(2, 2, 2, 2));
 265     t2   = _mm_permute_ps(t1, _MM_SHUFFLE(1, 1, 1, 1));
 266     t1   = _mm_permute_ps(t1, _MM_SHUFFLE(0, 0, 0, 0));
 267
 268     *x1  = gmx_mm256_set_m128(t1, t1);
 269     *y1  = gmx_mm256_set_m128(t2, t2);
 270     *z1  = gmx_mm256_set_m128(t3, t3);
 271     *x2  = gmx_mm256_set_m128(t4, t4);
 272     *y2  = gmx_mm256_set_m128(t5, t5);
 273     *z2  = gmx_mm256_set_m128(t6, t6);
 274     *x3  = gmx_mm256_set_m128(t7, t7);
 275     *y3  = gmx_mm256_set_m128(t8, t8);
 276     *z3  = gmx_mm256_set_m128(t9, t9);
 277 }
 278
 279
 280 static gmx_inline void
 281 gmx_mm256_load_shift_and_4rvec_broadcast_ps(const float * gmx_restrict xyz_shift,
 282                                             const float * gmx_restrict xyz,
 283                                             __m256 * gmx_restrict x1, __m256 * gmx_restrict y1, __m256 * gmx_restrict z1,
 284                                             __m256 * gmx_restrict x2, __m256 * gmx_restrict y2, __m256 * gmx_restrict z2,
 285                                             __m256 * gmx_restrict x3, __m256 * gmx_restrict y3, __m256 * gmx_restrict z3,
 286                                             __m256 * gmx_restrict x4, __m256 * gmx_restrict y4, __m256 * gmx_restrict z4)
 287 {
 288     __m128 tA, tB;
 289     __m128 t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12;
 290
 291     tA   = _mm_loadl_pi(_mm_setzero_ps(), (__m64 *)xyz_shift);
 292     tB   = _mm_load_ss(xyz_shift+2);
 293
 294     t1   = _mm_loadu_ps(xyz);
 295     t2   = _mm_loadu_ps(xyz+4);
 296     t3   = _mm_loadu_ps(xyz+8);
 297
 298     tA   = _mm_movelh_ps(tA, tB);
 299     t4   = _mm_permute_ps(tA, _MM_SHUFFLE(0, 2, 1, 0));
 300     t5   = _mm_permute_ps(tA, _MM_SHUFFLE(1, 0, 2, 1));
 301     t6   = _mm_permute_ps(tA, _MM_SHUFFLE(2, 1, 0, 2));
 302
 303     t1   = _mm_add_ps(t1, t4);
 304     t2   = _mm_add_ps(t2, t5);
 305     t3   = _mm_add_ps(t3, t6);
 306
 307     t12  = _mm_permute_ps(t3, _MM_SHUFFLE(3, 3, 3, 3));
 308     t11  = _mm_permute_ps(t3, _MM_SHUFFLE(2, 2, 2, 2));
 309     t10  = _mm_permute_ps(t3, _MM_SHUFFLE(1, 1, 1, 1));
 310     t9   = _mm_permute_ps(t3, _MM_SHUFFLE(0, 0, 0, 0));
 311     t8   = _mm_permute_ps(t2, _MM_SHUFFLE(3, 3, 3, 3));
 312     t7   = _mm_permute_ps(t2, _MM_SHUFFLE(2, 2, 2, 2));
 313     t6   = _mm_permute_ps(t2, _MM_SHUFFLE(1, 1, 1, 1));
 314     t5   = _mm_permute_ps(t2, _MM_SHUFFLE(0, 0, 0, 0));
 315     t4   = _mm_permute_ps(t1, _MM_SHUFFLE(3, 3, 3, 3));
 316     t3   = _mm_permute_ps(t1, _MM_SHUFFLE(2, 2, 2, 2));
 317     t2   = _mm_permute_ps(t1, _MM_SHUFFLE(1, 1, 1, 1));
 318     t1   = _mm_permute_ps(t1, _MM_SHUFFLE(0, 0, 0, 0));
 319
 320     *x1  = gmx_mm256_set_m128(t1, t1);
 321     *y1  = gmx_mm256_set_m128(t2, t2);
 322     *z1  = gmx_mm256_set_m128(t3, t3);
 323     *x2  = gmx_mm256_set_m128(t4, t4);
 324     *y2  = gmx_mm256_set_m128(t5, t5);
 325     *z2  = gmx_mm256_set_m128(t6, t6);
 326     *x3  = gmx_mm256_set_m128(t7, t7);
 327     *y3  = gmx_mm256_set_m128(t8, t8);
 328     *z3  = gmx_mm256_set_m128(t9, t9);
 329     *x4  = gmx_mm256_set_m128(t10, t10);
 330     *y4  = gmx_mm256_set_m128(t11, t11);
 331     *z4  = gmx_mm256_set_m128(t12, t12);
 332 }
 333
 334
 335
 336 static gmx_inline void
 337 gmx_mm256_load_1rvec_4ptr_swizzle_ps(const float * gmx_restrict ptrA, const float * gmx_restrict ptrB,
 338                                      const float * gmx_restrict ptrC, const float * gmx_restrict ptrD,
 339                                      __m256 * gmx_restrict x1, __m256 * gmx_restrict y1, __m256 * gmx_restrict z1)
 340 {
 341     __m128  t1, t2, t3, t4;
 342     __m128i mask = _mm_set_epi32(0, -1, -1, -1);
 343     t1             = gmx_mm_maskload_ps(ptrA, mask);
 344     t2             = gmx_mm_maskload_ps(ptrB, mask);
 345     t3             = gmx_mm_maskload_ps(ptrC, mask);
 346     t4             = gmx_mm_maskload_ps(ptrD, mask);
 347     _MM_TRANSPOSE4_PS(t1, t2, t3, t4);
 348     *x1           = _mm256_castps128_ps256(t1);
 349     *y1           = _mm256_castps128_ps256(t2);
 350     *z1           = _mm256_castps128_ps256(t3);
 351 }
 352
 353
 354 static gmx_inline void
 355 gmx_mm256_load_3rvec_4ptr_swizzle_ps(const float * gmx_restrict ptrA, const float * gmx_restrict ptrB,
 356                                      const float * gmx_restrict ptrC, const float * gmx_restrict ptrD,
 357                                      __m256 * gmx_restrict x1, __m256 * gmx_restrict y1, __m256 * gmx_restrict z1,
 358                                      __m256 * gmx_restrict x2, __m256 * gmx_restrict y2, __m256 * gmx_restrict z2,
 359                                      __m256 * gmx_restrict x3, __m256 * gmx_restrict y3, __m256 * gmx_restrict z3)
 360 {
 361     __m128 t1, t2, t3, t4;
 362     t1            = _mm_loadu_ps(ptrA);
 363     t2            = _mm_loadu_ps(ptrB);
 364     t3            = _mm_loadu_ps(ptrC);
 365     t4            = _mm_loadu_ps(ptrD);
 366     _MM_TRANSPOSE4_PS(t1, t2, t3, t4);
 367     *x1           = _mm256_castps128_ps256(t1);
 368     *y1           = _mm256_castps128_ps256(t2);
 369     *z1           = _mm256_castps128_ps256(t3);
 370     *x2           = _mm256_castps128_ps256(t4);
 371     t1            = _mm_loadu_ps(ptrA+4);
 372     t2            = _mm_loadu_ps(ptrB+4);
 373     t3            = _mm_loadu_ps(ptrC+4);
 374     t4            = _mm_loadu_ps(ptrD+4);
 375     _MM_TRANSPOSE4_PS(t1, t2, t3, t4);
 376     *y2           = _mm256_castps128_ps256(t1);
 377     *z2           = _mm256_castps128_ps256(t2);
 378     *x3           = _mm256_castps128_ps256(t3);
 379     *y3           = _mm256_castps128_ps256(t4);
 380     t1            = _mm_load_ss(ptrA+8);
 381     t2            = _mm_load_ss(ptrB+8);
 382     t3            = _mm_load_ss(ptrC+8);
 383     t4            = _mm_load_ss(ptrD+8);
 384     t1            = _mm_unpacklo_ps(t1, t3);
 385     t3            = _mm_unpacklo_ps(t2, t4);
 386     *z3           = _mm256_castps128_ps256(_mm_unpacklo_ps(t1, t3));
 387 }
 388
 389
 390
 391 static gmx_inline void
 392 gmx_mm256_load_4rvec_4ptr_swizzle_ps(const float * gmx_restrict ptrA, const float * gmx_restrict ptrB,
 393                                      const float * gmx_restrict ptrC, const float * gmx_restrict ptrD,
 394                                      __m256 * gmx_restrict x1, __m256 * gmx_restrict y1, __m256 * gmx_restrict z1,
 395                                      __m256 * gmx_restrict x2, __m256 * gmx_restrict y2, __m256 * gmx_restrict z2,
 396                                      __m256 * gmx_restrict x3, __m256 * gmx_restrict y3, __m256 * gmx_restrict z3,
 397                                      __m256 * gmx_restrict x4, __m256 * gmx_restrict y4, __m256 * gmx_restrict z4)
 398 {
 399     __m128 t1, t2, t3, t4;
 400     t1            = _mm_loadu_ps(ptrA);
 401     t2            = _mm_loadu_ps(ptrB);
 402     t3            = _mm_loadu_ps(ptrC);
 403     t4            = _mm_loadu_ps(ptrD);
 404     _MM_TRANSPOSE4_PS(t1, t2, t3, t4);
 405     *x1           = _mm256_castps128_ps256(t1);
 406     *y1           = _mm256_castps128_ps256(t2);
 407     *z1           = _mm256_castps128_ps256(t3);
 408     *x2           = _mm256_castps128_ps256(t4);
 409     t1            = _mm_loadu_ps(ptrA+4);
 410     t2            = _mm_loadu_ps(ptrB+4);
 411     t3            = _mm_loadu_ps(ptrC+4);
 412     t4            = _mm_loadu_ps(ptrD+4);
 413     _MM_TRANSPOSE4_PS(t1, t2, t3, t4);
 414     *y2           = _mm256_castps128_ps256(t1);
 415     *z2           = _mm256_castps128_ps256(t2);
 416     *x3           = _mm256_castps128_ps256(t3);
 417     *y3           = _mm256_castps128_ps256(t4);
 418     t1            = _mm_loadu_ps(ptrA+8);
 419     t2            = _mm_loadu_ps(ptrB+8);
 420     t3            = _mm_loadu_ps(ptrC+8);
 421     t4            = _mm_loadu_ps(ptrD+8);
 422     _MM_TRANSPOSE4_PS(t1, t2, t3, t4);
 423     *z3           = _mm256_castps128_ps256(t1);
 424     *x4           = _mm256_castps128_ps256(t2);
 425     *y4           = _mm256_castps128_ps256(t3);
 426     *z4           = _mm256_castps128_ps256(t4);
 427 }
 428
 429
 430 static gmx_inline void
 431 gmx_mm256_load_1rvec_8ptr_swizzle_ps(const float * gmx_restrict ptrA, const float * gmx_restrict ptrB,
 432                                      const float * gmx_restrict ptrC, const float * gmx_restrict ptrD,
 433                                      const float * gmx_restrict ptrE, const float * gmx_restrict ptrF,
 434                                      const float * gmx_restrict ptrG, const float * gmx_restrict ptrH,
 435                                      __m256 * gmx_restrict x1, __m256 * gmx_restrict y1, __m256 * gmx_restrict z1)
 436 {
 437     __m256  t1, t2, t3, t4, t5, t6, t7, t8;
 438     __m128i mask = _mm_set_epi32(0, -1, -1, -1);
 439
 440     t1             = gmx_mm256_set_m128(gmx_mm_maskload_ps(ptrE, mask), gmx_mm_maskload_ps(ptrA, mask)); /*  - zE yE xE |  - zA yA xA */
 441     t2             = gmx_mm256_set_m128(gmx_mm_maskload_ps(ptrF, mask), gmx_mm_maskload_ps(ptrB, mask)); /*  - zF yF xF |  - zB yB xB */
 442     t3             = gmx_mm256_set_m128(gmx_mm_maskload_ps(ptrG, mask), gmx_mm_maskload_ps(ptrC, mask)); /*  - zG yG xG |  - zC yC xC */
 443     t4             = gmx_mm256_set_m128(gmx_mm_maskload_ps(ptrH, mask), gmx_mm_maskload_ps(ptrD, mask)); /*  - zH yH xH |  - zD yD xD */
 444
 445     t5            = _mm256_unpacklo_ps(t1, t2);                                                          /* yF yE xF xE | yB yA xB xA */
 446     t6            = _mm256_unpacklo_ps(t3, t4);                                                          /* yH yG xH xG | yD yC xD xC */
 447     t7            = _mm256_unpackhi_ps(t1, t2);                                                          /*  -  - zF zE |  -  - zB zA */
 448     t8            = _mm256_unpackhi_ps(t3, t4);                                                          /*  -  - zH zG |  -  - zD zC */
 449
 450     *x1           = _mm256_shuffle_ps(t5, t6, _MM_SHUFFLE(1, 0, 1, 0));
 451     *y1           = _mm256_shuffle_ps(t5, t6, _MM_SHUFFLE(3, 2, 3, 2));
 452     *z1           = _mm256_shuffle_ps(t7, t8, _MM_SHUFFLE(1, 0, 1, 0));
 453 }
 454
 455
 456 static gmx_inline void
 457 gmx_mm256_load_3rvec_8ptr_swizzle_ps(const float * gmx_restrict ptrA, const float * gmx_restrict ptrB,
 458                                      const float * gmx_restrict ptrC, const float * gmx_restrict ptrD,
 459                                      const float * gmx_restrict ptrE, const float * gmx_restrict ptrF,
 460                                      const float * gmx_restrict ptrG, const float * gmx_restrict ptrH,
 461                                      __m256 * gmx_restrict x1, __m256 * gmx_restrict y1, __m256 * gmx_restrict z1,
 462                                      __m256 * gmx_restrict x2, __m256 * gmx_restrict y2, __m256 * gmx_restrict z2,
 463                                      __m256 * gmx_restrict x3, __m256 * gmx_restrict y3, __m256 * gmx_restrict z3)
 464 {
 465     __m256 t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12;
 466
 467     t1           = _mm256_loadu_ps(ptrA);                                /* y3a x3a z2a y2a | x2a z1a y1a x1a */
 468     t2           = _mm256_loadu_ps(ptrB);                                /* y3b x3b z2b y2b | x2b z1b y1b x1b */
 469     t3           = _mm256_loadu_ps(ptrC);                                /* y3c x3c z2c y2c | x2c z1c y1c x1c */
 470     t4           = _mm256_loadu_ps(ptrD);                                /* y3d x3d z2d y2d | x2d z1d y1d x1d */
 471     t5           = _mm256_loadu_ps(ptrE);                                /* y3e x3e z2e y2e | x2e z1e y1e x1e */
 472     t6           = _mm256_loadu_ps(ptrF);                                /* y3f x3f z2f y2f | x2f z1f y1f x1f */
 473     t7           = _mm256_loadu_ps(ptrG);                                /* y3g x3g z2g y2g | x2g z1g y1g x1g */
 474     t8           = _mm256_loadu_ps(ptrH);                                /* y3h x3h z2h y2h | x2h z1h y1h x1h */
 475
 476     t9           = _mm256_unpacklo_ps(t1, t2);                           /* z2b z2a y2b y2a | y1b y1a x1b x1a */
 477     t10          = _mm256_unpackhi_ps(t1, t2);                           /* y3b y3a x3b x3a | x2b x2a z1b z1a */
 478     t11          = _mm256_unpacklo_ps(t3, t4);                           /* z2d z2c y2d y2c | y1d y1c x1d x1c */
 479     t12          = _mm256_unpackhi_ps(t3, t4);                           /* y3d y3c x3d x3c | x2d x2c z1d z1c */
 480     t1           = _mm256_unpacklo_ps(t5, t6);                           /* z2f z2e y2f y2e | y1f y1e x1f x1e */
 481     t2           = _mm256_unpackhi_ps(t5, t6);                           /* y3f y3e x3f x3e | x2f x2e z1f z1e */
 482     t3           = _mm256_unpacklo_ps(t7, t8);                           /* z2h z2g y2h y2g | y1h y1g x1h x1g */
 483     t4           = _mm256_unpackhi_ps(t7, t8);                           /* y3h y3g x3h x3g | x2h x2g z1h z1g */
 484
 485     t5           = _mm256_shuffle_ps(t9, t11, _MM_SHUFFLE(1, 0, 1, 0));  /* y2d y2c y2b y2a | x1d x1c x1b x1a */
 486     t6           = _mm256_shuffle_ps(t9, t11, _MM_SHUFFLE(3, 2, 3, 2));  /* z2d z2c z2b z2a | y1d y1c y1b y1a */
 487     t7           = _mm256_shuffle_ps(t10, t12, _MM_SHUFFLE(1, 0, 1, 0)); /* x3d x3c x3b x3a | z1d z1c z1b z1a */
 488     t8           = _mm256_shuffle_ps(t10, t12, _MM_SHUFFLE(3, 2, 3, 2)); /* y3d y3c y3b y3a | x2d x2c x2b x2a */
 489
 490     t9           = _mm256_shuffle_ps(t1, t3, _MM_SHUFFLE(1, 0, 1, 0));   /* y2h y2g y2f y2e | x1h x1g x1f x1e */
 491     t10          = _mm256_shuffle_ps(t1, t3, _MM_SHUFFLE(3, 2, 3, 2));   /* z2h z2g z2f z2e | y1h y1g y1f y1e */
 492     t11          = _mm256_shuffle_ps(t2, t4, _MM_SHUFFLE(1, 0, 1, 0));   /* x3h x3g x3f x3e | z1h z1g z1f z1e */
 493     t12          = _mm256_shuffle_ps(t2, t4, _MM_SHUFFLE(3, 2, 3, 2));   /* y3h y3g y3f y3e | x2h x2g x2f x2e */
 494
 495     *x1          = _mm256_permute2f128_ps(t5, t9,  0x20);
 496     *y1          = _mm256_permute2f128_ps(t6, t10, 0x20);
 497     *z1          = _mm256_permute2f128_ps(t7, t11, 0x20);
 498     *x2          = _mm256_permute2f128_ps(t8, t12, 0x20);
 499
 500     *y2          = _mm256_permute2f128_ps(t5, t9,  0x31);
 501     *z2          = _mm256_permute2f128_ps(t6, t10, 0x31);
 502     *x3          = _mm256_permute2f128_ps(t7, t11, 0x31);
 503     *y3          = _mm256_permute2f128_ps(t8, t12, 0x31);
 504
 505     t1           = gmx_mm256_set_m128(_mm_load_ss(ptrE+8), _mm_load_ss(ptrA+8));
 506     t2           = gmx_mm256_set_m128(_mm_load_ss(ptrF+8), _mm_load_ss(ptrB+8));
 507     t3           = gmx_mm256_set_m128(_mm_load_ss(ptrG+8), _mm_load_ss(ptrC+8));
 508     t4           = gmx_mm256_set_m128(_mm_load_ss(ptrH+8), _mm_load_ss(ptrD+8));
 509
 510     t1           = _mm256_unpacklo_ps(t1, t3);  /*  -   -  z3g z3e |  -   -  z3c z3a */
 511     t2           = _mm256_unpacklo_ps(t2, t4);  /*  -   -  z3h z3f |  -   -  z3d z3b */
 512
 513     *z3          = _mm256_unpacklo_ps(t1, t2);
 514 }
 515
 516
 517
 518 static gmx_inline void
 519 gmx_mm256_load_4rvec_8ptr_swizzle_ps(const float * gmx_restrict ptrA, const float * gmx_restrict ptrB,
 520                                      const float * gmx_restrict ptrC, const float * gmx_restrict ptrD,
 521                                      const float * gmx_restrict ptrE, const float * gmx_restrict ptrF,
 522                                      const float * gmx_restrict ptrG, const float * gmx_restrict ptrH,
 523                                      __m256 * gmx_restrict x1, __m256 * gmx_restrict y1, __m256 * gmx_restrict z1,
 524                                      __m256 * gmx_restrict x2, __m256 * gmx_restrict y2, __m256 * gmx_restrict z2,
 525                                      __m256 * gmx_restrict x3, __m256 * gmx_restrict y3, __m256 * gmx_restrict z3,
 526                                      __m256 * gmx_restrict x4, __m256 * gmx_restrict y4, __m256 * gmx_restrict z4)
 527 {
 528     __m256 t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12;
 529
 530     t1           = _mm256_loadu_ps(ptrA);                                /* y3a x3a z2a y2a | x2a z1a y1a x1a */
 531     t2           = _mm256_loadu_ps(ptrB);                                /* y3b x3b z2b y2b | x2b z1b y1b x1b */
 532     t3           = _mm256_loadu_ps(ptrC);                                /* y3c x3c z2c y2c | x2c z1c y1c x1c */
 533     t4           = _mm256_loadu_ps(ptrD);                                /* y3d x3d z2d y2d | x2d z1d y1d x1d */
 534     t5           = _mm256_loadu_ps(ptrE);                                /* y3e x3e z2e y2e | x2e z1e y1e x1e */
 535     t6           = _mm256_loadu_ps(ptrF);                                /* y3f x3f z2f y2f | x2f z1f y1f x1f */
 536     t7           = _mm256_loadu_ps(ptrG);                                /* y3g x3g z2g y2g | x2g z1g y1g x1g */
 537     t8           = _mm256_loadu_ps(ptrH);                                /* y3h x3h z2h y2h | x2h z1h y1h x1h */
 538
 539     t9           = _mm256_unpacklo_ps(t1, t2);                           /* z2b z2a y2b y2a | y1b y1a x1b x1a */
 540     t10          = _mm256_unpackhi_ps(t1, t2);                           /* y3b y3a x3b x3a | x2b x2a z1b z1a */
 541     t11          = _mm256_unpacklo_ps(t3, t4);                           /* z2d z2c y2d y2c | y1d y1c x1d x1c */
 542     t12          = _mm256_unpackhi_ps(t3, t4);                           /* y3d y3c x3d x3c | x2d x2c z1d z1c */
 543     t1           = _mm256_unpacklo_ps(t5, t6);                           /* z2f z2e y2f y2e | y1f y1e x1f x1e */
 544     t2           = _mm256_unpackhi_ps(t5, t6);                           /* y3f y3e x3f x3e | x2f x2e z1f z1e */
 545     t3           = _mm256_unpacklo_ps(t7, t8);                           /* z2h z2g y2h y2g | y1h y1g x1h x1g */
 546     t4           = _mm256_unpackhi_ps(t7, t8);                           /* y3h y3g x3h x3g | x2h x2g z1h z1g */
 547
 548     t5           = _mm256_shuffle_ps(t9, t11, _MM_SHUFFLE(1, 0, 1, 0));  /* y2d y2c y2b y2a | x1d x1c x1b x1a */
 549     t6           = _mm256_shuffle_ps(t9, t11, _MM_SHUFFLE(3, 2, 3, 2));  /* z2d z2c z2b z2a | y1d y1c y1b y1a */
 550     t7           = _mm256_shuffle_ps(t10, t12, _MM_SHUFFLE(1, 0, 1, 0)); /* x3d x3c x3b x3a | z1d z1c z1b z1a */
 551     t8           = _mm256_shuffle_ps(t10, t12, _MM_SHUFFLE(3, 2, 3, 2)); /* y3d y3c y3b y3a | x2d x2c x2b x2a */
 552     t9           = _mm256_shuffle_ps(t1, t3, _MM_SHUFFLE(1, 0, 1, 0));   /* y2h y2g y2f y2e | x1h x1g x1f x1e */
 553     t10          = _mm256_shuffle_ps(t1, t3, _MM_SHUFFLE(3, 2, 3, 2));   /* z2h z2g z2f z2e | y1h y1g y1f y1e */
 554     t11          = _mm256_shuffle_ps(t2, t4, _MM_SHUFFLE(1, 0, 1, 0));   /* x3h x3g x3f x3e | z1h z1g z1f z1e */
 555     t12          = _mm256_shuffle_ps(t2, t4, _MM_SHUFFLE(3, 2, 3, 2));   /* y3h y3g y3f y3e | x2h x2g x2f x2e */
 556
 557     *x1          = _mm256_permute2f128_ps(t5, t9,  0x20);
 558     *y1          = _mm256_permute2f128_ps(t6, t10, 0x20);
 559     *z1          = _mm256_permute2f128_ps(t7, t11, 0x20);
 560     *x2          = _mm256_permute2f128_ps(t8, t12, 0x20);
 561
 562     *y2          = _mm256_permute2f128_ps(t5, t9,  0x31);
 563     *z2          = _mm256_permute2f128_ps(t6, t10, 0x31);
 564     *x3          = _mm256_permute2f128_ps(t7, t11, 0x31);
 565     *y3          = _mm256_permute2f128_ps(t8, t12, 0x31);
 566
 567     t1           = gmx_mm256_set_m128(_mm_loadu_ps(ptrE+8), _mm_loadu_ps(ptrA+8)); /* z4e y4e x4e z3e | z4a y4a x4a z3a */
 568     t2           = gmx_mm256_set_m128(_mm_loadu_ps(ptrF+8), _mm_loadu_ps(ptrB+8)); /* z4f y4f x4f z3f | z4b y4b x4b z3b */
 569     t3           = gmx_mm256_set_m128(_mm_loadu_ps(ptrG+8), _mm_loadu_ps(ptrC+8)); /* z4g y4g x4g z3g | z4c y4c x4c z3c */
 570     t4           = gmx_mm256_set_m128(_mm_loadu_ps(ptrH+8), _mm_loadu_ps(ptrD+8)); /* z4h y4h x4h z3h | z4d y4d x4d z3d */
 571
 572     t5           = _mm256_unpacklo_ps(t1, t2);                                     /* x4f x4e z3f z3e | x4b x4a z3b z3a */
 573     t6           = _mm256_unpackhi_ps(t1, t2);                                     /* z4f z4e y4f y4e | z4b z4a y4b y4a */
 574     t7           = _mm256_unpacklo_ps(t3, t4);                                     /* x4h x4g z3h z3g | x4d x4c z3d z3c */
 575     t8           = _mm256_unpackhi_ps(t3, t4);                                     /* z4h z4g y4h y4g | z4d z4c y4d y4c */
 576
 577     *z3          = _mm256_shuffle_ps(t5, t7, _MM_SHUFFLE(1, 0, 1, 0));             /* z3h z3g z3f z3e | z3d z3c z3b z3a */
 578     *x4          = _mm256_shuffle_ps(t5, t7, _MM_SHUFFLE(3, 2, 3, 2));             /* x4h x4g x4f x4e | x4d x4c x4b x4a */
 579     *y4          = _mm256_shuffle_ps(t6, t8, _MM_SHUFFLE(1, 0, 1, 0));             /* y4h y4g y4f y4e | y4d y4c y4b y4a */
 580     *z4          = _mm256_shuffle_ps(t6, t8, _MM_SHUFFLE(3, 2, 3, 2));             /* z4h z4g z4f z4e | z4d z4c z4b z4a */
 581 }
 582
 583
 584 static gmx_inline void
 585 gmx_mm256_decrement_1rvec_4ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx_restrict ptrB,
 586                                           float * gmx_restrict ptrC, float * gmx_restrict ptrD,
 587                                           __m256 x1, __m256 y1, __m256 z1)
 588 {
 589     __m128  t1, t2, t3, t4, t5, t6, t7, t8;
 590     __m128i mask;
 591
 592     /* Construct a mask without executing any data loads */
 593     mask        = _mm_blend_epi16(_mm_setzero_si128(), _mm_cmpeq_epi16(_mm_setzero_si128(), _mm_setzero_si128()), 0x3F);
 594
 595     t3          = _mm_unpacklo_ps(_mm256_castps256_ps128(x1), _mm256_castps256_ps128(y1)); /* y1b x1b y1a x1a */
 596     t4          = _mm_unpackhi_ps(_mm256_castps256_ps128(x1), _mm256_castps256_ps128(y1)); /* y1d x1d y1c x1c */
 597
 598     t1          = _mm_shuffle_ps(t3, _mm256_castps256_ps128(z1), _MM_SHUFFLE(0, 0, 1, 0)); /*  -  z1a y1a x1a */
 599     t2          = _mm_shuffle_ps(t3, _mm256_castps256_ps128(z1), _MM_SHUFFLE(0, 1, 3, 2)); /*  -  z1b y1b x1b */
 600     t3          = _mm_shuffle_ps(t4, _mm256_castps256_ps128(z1), _MM_SHUFFLE(0, 2, 1, 0)); /*  -  z1c y1c x1c */
 601     t4          = _mm_shuffle_ps(t4, _mm256_castps256_ps128(z1), _MM_SHUFFLE(0, 3, 3, 2)); /*  -  z1d y1d x1d */
 602
 603     t5          = gmx_mm_maskload_ps(ptrA, mask);
 604     t6          = gmx_mm_maskload_ps(ptrB, mask);
 605     t7          = gmx_mm_maskload_ps(ptrC, mask);
 606     t8          = gmx_mm_maskload_ps(ptrD, mask);
 607
 608     t5          = _mm_sub_ps(t5, t1);
 609     t6          = _mm_sub_ps(t6, t2);
 610     t7          = _mm_sub_ps(t7, t3);
 611     t8          = _mm_sub_ps(t8, t4);
 612
 613     gmx_mm_maskstore_ps(ptrA, mask, t5);
 614     gmx_mm_maskstore_ps(ptrB, mask, t6);
 615     gmx_mm_maskstore_ps(ptrC, mask, t7);
 616     gmx_mm_maskstore_ps(ptrD, mask, t8);
 617 }
 618
 619 #if defined (_MSC_VER) && defined(_M_IX86)
 620 /* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
 621 #define gmx_mm256_decrement_3rvec_4ptr_swizzle_ps(ptrA, ptrB, ptrC, ptrD, \
 622                                                   x1, y1, z1, x2, y2, z2, x3, y3, z3) \
 623     { \
 624         __m256 _t1, _t2, _t3, _t4, _t5, _t6; \
 625         __m128 _tA, _tB, _tC, _tD; \
 626 \
 627         _t1         = _mm256_loadu_ps(ptrA); \
 628         _t2         = _mm256_loadu_ps(ptrB); \
 629         _t3         = _mm256_loadu_ps(ptrC); \
 630         _t4         = _mm256_loadu_ps(ptrD); \
 631         _tA         = _mm_load_ss(ptrA+8); \
 632         _tB         = _mm_load_ss(ptrB+8); \
 633         _tC         = _mm_load_ss(ptrC+8); \
 634         _tD         = _mm_load_ss(ptrD+8); \
 635         _t5         = _mm256_unpacklo_ps(x1, y1); \
 636         x1          = _mm256_unpackhi_ps(x1, y1); \
 637         y1          = _mm256_unpacklo_ps(z1, x2); \
 638         z1          = _mm256_unpackhi_ps(z1, x2); \
 639         x2          = _mm256_unpacklo_ps(y2, z2); \
 640         y2          = _mm256_unpackhi_ps(y2, z2); \
 641         _t6         = _mm256_unpacklo_ps(x3, y3); \
 642         x3          = _mm256_unpackhi_ps(x3, y3); \
 643         _t5         = _mm256_insertf128_ps(_t5, _mm256_castps256_ps128(x2), 0x1); \
 644         x1          = _mm256_insertf128_ps(x1, _mm256_castps256_ps128(y2), 0x1); \
 645         y1          = _mm256_insertf128_ps(y1, _mm256_castps256_ps128(_t6), 0x1); \
 646         z1          = _mm256_insertf128_ps(z1, _mm256_castps256_ps128(x3), 0x1); \
 647         z2          = _mm256_shuffle_ps(_t5, y1, _MM_SHUFFLE(1, 0, 1, 0)); \
 648         _t5         = _mm256_shuffle_ps(_t5, y1, _MM_SHUFFLE(3, 2, 3, 2)); \
 649         y1          = _mm256_shuffle_ps(x1, z1, _MM_SHUFFLE(1, 0, 1, 0)); \
 650         x1          = _mm256_shuffle_ps(x1, z1, _MM_SHUFFLE(3, 2, 3, 2)); \
 651         _t1         = _mm256_sub_ps(_t1, z2); \
 652         _t2         = _mm256_sub_ps(_t2, _t5); \
 653         _t3         = _mm256_sub_ps(_t3, y1); \
 654         _t4         = _mm256_sub_ps(_t4, x1); \
 655         _tA         = _mm_sub_ss(_tA, _mm256_castps256_ps128(z3)); \
 656         _tB         = _mm_sub_ss(_tB, _mm_permute_ps(_mm256_castps256_ps128(z3), _MM_SHUFFLE(1, 1, 1, 1))); \
 657         _tC         = _mm_sub_ss(_tC, _mm_permute_ps(_mm256_castps256_ps128(z3), _MM_SHUFFLE(2, 2, 2, 2))); \
 658         _tD         = _mm_sub_ss(_tD, _mm_permute_ps(_mm256_castps256_ps128(z3), _MM_SHUFFLE(3, 3, 3, 3))); \
 659         _mm256_storeu_ps(ptrA, _t1); \
 660         _mm256_storeu_ps(ptrB, _t2); \
 661         _mm256_storeu_ps(ptrC, _t3); \
 662         _mm256_storeu_ps(ptrD, _t4); \
 663         _mm_store_ss(ptrA+8, _tA); \
 664         _mm_store_ss(ptrB+8, _tB); \
 665         _mm_store_ss(ptrC+8, _tC); \
 666         _mm_store_ss(ptrD+8, _tD); \
 667     }
 668 #else
 669 /* Real function for sane compilers */
 670 static gmx_inline void
 671 gmx_mm256_decrement_3rvec_4ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx_restrict ptrB,
 672                                           float * gmx_restrict ptrC, float * gmx_restrict ptrD,
 673                                           __m256 x1, __m256 y1, __m256 z1,
 674                                           __m256 x2, __m256 y2, __m256 z2,
 675                                           __m256 x3, __m256 y3, __m256 z3)
 676 {
 677     __m256 t1, t2, t3, t4, t5, t6;
 678     __m128 tA, tB, tC, tD;
 679
 680     t1          = _mm256_loadu_ps(ptrA);
 681     t2          = _mm256_loadu_ps(ptrB);
 682     t3          = _mm256_loadu_ps(ptrC);
 683     t4          = _mm256_loadu_ps(ptrD);
 684     tA          = _mm_load_ss(ptrA+8);
 685     tB          = _mm_load_ss(ptrB+8);
 686     tC          = _mm_load_ss(ptrC+8);
 687     tD          = _mm_load_ss(ptrD+8);
 688
 689     t5          = _mm256_unpacklo_ps(x1, y1);                                /* - - - - | y1b x1b y1a x1a */
 690     x1          = _mm256_unpackhi_ps(x1, y1);                                /* - - - - | y1d x1d y1c x1c */
 691     y1          = _mm256_unpacklo_ps(z1, x2);                                /* - - - - | x2b z1b x2a z1a */
 692     z1          = _mm256_unpackhi_ps(z1, x2);                                /* - - - - | x2d z1d x2c z1c */
 693
 694     x2          = _mm256_unpacklo_ps(y2, z2);                                /* - - - - | z2b y2b z2a y2a */
 695     y2          = _mm256_unpackhi_ps(y2, z2);                                /* - - - - | z2d y2d z2c y2c */
 696     t6          = _mm256_unpacklo_ps(x3, y3);                                /* - - - - | y3b x3b y3a x3a */
 697     x3          = _mm256_unpackhi_ps(x3, y3);                                /* - - - - | y3d x3d y3c x3c */
 698
 699     t5          = _mm256_insertf128_ps(t5, _mm256_castps256_ps128(x2), 0x1); /* z2b y2b z2a y2a | y1b x1b y1a x1a */
 700     x1          = _mm256_insertf128_ps(x1, _mm256_castps256_ps128(y2), 0x1); /* z2d y2d z2c y2c | y1d x1d y1c x1c */
 701
 702     y1          = _mm256_insertf128_ps(y1, _mm256_castps256_ps128(t6), 0x1); /* y3b x3b y3a x3a | x2b z1b x2a z1a */
 703     z1          = _mm256_insertf128_ps(z1, _mm256_castps256_ps128(x3), 0x1); /* y3d x3d y3c x3c | x2d z1d x2c z1c */
 704
 705     z2          = _mm256_shuffle_ps(t5, y1, _MM_SHUFFLE(1, 0, 1, 0));        /* y3a x3a z2a y2a | x2a z1a y1a x1a */
 706     t5          = _mm256_shuffle_ps(t5, y1, _MM_SHUFFLE(3, 2, 3, 2));        /* y3b x3b z2b y2b | x2b z1b y1b x1b */
 707     y1          = _mm256_shuffle_ps(x1, z1, _MM_SHUFFLE(1, 0, 1, 0));        /* y3c x3c z2c y2c | x2c z1c y1c x1c */
 708     x1          = _mm256_shuffle_ps(x1, z1, _MM_SHUFFLE(3, 2, 3, 2));        /* y3d x3d z2d y2d | x2d z1d y1d x1d */
 709
 710     t1          = _mm256_sub_ps(t1, z2);
 711     t2          = _mm256_sub_ps(t2, t5);
 712     t3          = _mm256_sub_ps(t3, y1);
 713     t4          = _mm256_sub_ps(t4, x1);
 714
 715     tA          = _mm_sub_ss(tA, _mm256_castps256_ps128(z3));
 716     tB          = _mm_sub_ss(tB, _mm_permute_ps(_mm256_castps256_ps128(z3), _MM_SHUFFLE(1, 1, 1, 1)));
 717     tC          = _mm_sub_ss(tC, _mm_permute_ps(_mm256_castps256_ps128(z3), _MM_SHUFFLE(2, 2, 2, 2)));
 718     tD          = _mm_sub_ss(tD, _mm_permute_ps(_mm256_castps256_ps128(z3), _MM_SHUFFLE(3, 3, 3, 3)));
 719
 720     /* Here we store a full 256-bit value and a separate 32-bit one; no overlap can happen */
 721     _mm256_storeu_ps(ptrA, t1);
 722     _mm256_storeu_ps(ptrB, t2);
 723     _mm256_storeu_ps(ptrC, t3);
 724     _mm256_storeu_ps(ptrD, t4);
 725     _mm_store_ss(ptrA+8, tA);
 726     _mm_store_ss(ptrB+8, tB);
 727     _mm_store_ss(ptrC+8, tC);
 728     _mm_store_ss(ptrD+8, tD);
 729 }
 730 #endif
 731
 732
 733
 734 #if defined (_MSC_VER) && defined(_M_IX86)
 735 /* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
 736 #define gmx_mm256_decrement_4rvec_4ptr_swizzle_ps(ptrA, ptrB, ptrC, ptrD, \
 737                                                   x1, y1, z1, x2, y2, z2, x3, y3, z3, x4, y4, z4) \
 738     { \
 739         __m256 _t1, _t2, _t3, _t4, _t5; \
 740         __m128 _tA, _tB, _tC, _tD, _tE, _tF, _tG, _tH; \
 741 \
 742         _t1         = _mm256_loadu_ps(ptrA); \
 743         _t2         = _mm256_loadu_ps(ptrB); \
 744         _t3         = _mm256_loadu_ps(ptrC); \
 745         _t4         = _mm256_loadu_ps(ptrD); \
 746         _tA         = _mm_loadu_ps(ptrA+8); \
 747         _tB         = _mm_loadu_ps(ptrB+8); \
 748         _tC         = _mm_loadu_ps(ptrC+8); \
 749         _tD         = _mm_loadu_ps(ptrD+8); \
 750         _t5         = _mm256_unpacklo_ps(x1, y1); \
 751         x1          = _mm256_unpackhi_ps(x1, y1); \
 752         y1          = _mm256_unpacklo_ps(z1, x2); \
 753         z1          = _mm256_unpackhi_ps(z1, x2); \
 754         x2          = _mm256_unpacklo_ps(y2, z2); \
 755         y2          = _mm256_unpackhi_ps(y2, z2); \
 756         z2          = _mm256_unpacklo_ps(x3, y3); \
 757         x3          = _mm256_unpackhi_ps(x3, y3); \
 758         y3          = _mm256_unpacklo_ps(z3, x4); \
 759         z3          = _mm256_unpackhi_ps(z3, x4); \
 760         x4          = _mm256_unpacklo_ps(y4, z4); \
 761         y4          = _mm256_unpackhi_ps(y4, z4); \
 762         x2          = _mm256_insertf128_ps(_t5, _mm256_castps256_ps128(x2), 0x1); \
 763         x1          = _mm256_insertf128_ps(x1, _mm256_castps256_ps128(y2), 0x1); \
 764         y1          = _mm256_insertf128_ps(y1, _mm256_castps256_ps128(z2), 0x1); \
 765         z1          = _mm256_insertf128_ps(z1, _mm256_castps256_ps128(x3), 0x1); \
 766         z2          = _mm256_shuffle_ps(x2, y1, _MM_SHUFFLE(1, 0, 1, 0)); \
 767         _t5         = _mm256_shuffle_ps(x2, y1, _MM_SHUFFLE(3, 2, 3, 2)); \
 768         y1          = _mm256_shuffle_ps(x1, z1, _MM_SHUFFLE(1, 0, 1, 0)); \
 769         x1          = _mm256_shuffle_ps(x1, z1, _MM_SHUFFLE(3, 2, 3, 2)); \
 770         _tE         = _mm_shuffle_ps(_mm256_castps256_ps128(y3), _mm256_castps256_ps128(x4), _MM_SHUFFLE(1, 0, 1, 0)); \
 771         _tF         = _mm_shuffle_ps(_mm256_castps256_ps128(y3), _mm256_castps256_ps128(x4), _MM_SHUFFLE(3, 2, 3, 2)); \
 772         _tG         = _mm_shuffle_ps(_mm256_castps256_ps128(z3), _mm256_castps256_ps128(y4), _MM_SHUFFLE(1, 0, 1, 0)); \
 773         _tH         = _mm_shuffle_ps(_mm256_castps256_ps128(z3), _mm256_castps256_ps128(y4), _MM_SHUFFLE(3, 2, 3, 2)); \
 774         _t1         = _mm256_sub_ps(_t1, z2); \
 775         _t2         = _mm256_sub_ps(_t2, _t5); \
 776         _t3         = _mm256_sub_ps(_t3, y1); \
 777         _t4         = _mm256_sub_ps(_t4, x1); \
 778         _tA         = _mm_sub_ps(_tA, _tE); \
 779         _tB         = _mm_sub_ps(_tB, _tF); \
 780         _tC         = _mm_sub_ps(_tC, _tG); \
 781         _tD         = _mm_sub_ps(_tD, _tH); \
 782         _mm256_storeu_ps(ptrA, _t1); \
 783         _mm256_storeu_ps(ptrB, _t2); \
 784         _mm256_storeu_ps(ptrC, _t3); \
 785         _mm256_storeu_ps(ptrD, _t4); \
 786         _mm_storeu_ps(ptrA+8, _tA); \
 787         _mm_storeu_ps(ptrB+8, _tB); \
 788         _mm_storeu_ps(ptrC+8, _tC); \
 789         _mm_storeu_ps(ptrD+8, _tD); \
 790     }
 791 #else
 792 /* Real function for sane compilers */
 793 static gmx_inline void
 794 gmx_mm256_decrement_4rvec_4ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx_restrict ptrB,
 795                                           float * gmx_restrict ptrC, float * gmx_restrict ptrD,
 796                                           __m256 x1, __m256 y1, __m256 z1,
 797                                           __m256 x2, __m256 y2, __m256 z2,
 798                                           __m256 x3, __m256 y3, __m256 z3,
 799                                           __m256 x4, __m256 y4, __m256 z4)
 800 {
 801     __m256 t1, t2, t3, t4, t5;
 802     __m128 tA, tB, tC, tD, tE, tF, tG, tH;
 803
 804     t1          = _mm256_loadu_ps(ptrA);
 805     t2          = _mm256_loadu_ps(ptrB);
 806     t3          = _mm256_loadu_ps(ptrC);
 807     t4          = _mm256_loadu_ps(ptrD);
 808     tA          = _mm_loadu_ps(ptrA+8);
 809     tB          = _mm_loadu_ps(ptrB+8);
 810     tC          = _mm_loadu_ps(ptrC+8);
 811     tD          = _mm_loadu_ps(ptrD+8);
 812
 813     t5          = _mm256_unpacklo_ps(x1, y1);                                                                      /* - - - - | y1b x1b y1a x1a */
 814     x1          = _mm256_unpackhi_ps(x1, y1);                                                                      /* - - - - | y1d x1d y1c x1c */
 815     y1          = _mm256_unpacklo_ps(z1, x2);                                                                      /* - - - - | x2b z1b x2a z1a */
 816     z1          = _mm256_unpackhi_ps(z1, x2);                                                                      /* - - - - | x2d z1d x2c z1c */
 817
 818     x2          = _mm256_unpacklo_ps(y2, z2);                                                                      /* - - - - | z2b y2b z2a y2a */
 819     y2          = _mm256_unpackhi_ps(y2, z2);                                                                      /* - - - - | z2d y2d z2c y2c */
 820     z2          = _mm256_unpacklo_ps(x3, y3);                                                                      /* - - - - | y3b x3b y3a x3a */
 821     x3          = _mm256_unpackhi_ps(x3, y3);                                                                      /* - - - - | y3d x3d y3c x3c */
 822
 823     y3          = _mm256_unpacklo_ps(z3, x4);                                                                      /* - - - - | x4b z3b x4a z3a */
 824     z3          = _mm256_unpackhi_ps(z3, x4);                                                                      /* - - - - | x4d z3d x4c z3c */
 825     x4          = _mm256_unpacklo_ps(y4, z4);                                                                      /* - - - - | z4b y4b z4a y4a */
 826     y4          = _mm256_unpackhi_ps(y4, z4);                                                                      /* - - - - | z4d y4d z4c y4c */
 827
 828     x2          = _mm256_insertf128_ps(t5, _mm256_castps256_ps128(x2), 0x1);                                       /* z2b y2b z2a y2a | y1b x1b y1a x1a */
 829     x1          = _mm256_insertf128_ps(x1, _mm256_castps256_ps128(y2), 0x1);                                       /* z2d y2d z2c y2c | y1d x1d y1c x1c */
 830     y1          = _mm256_insertf128_ps(y1, _mm256_castps256_ps128(z2), 0x1);                                       /* y3b x3b y3a x3a | x2b z1b x2a z1a */
 831     z1          = _mm256_insertf128_ps(z1, _mm256_castps256_ps128(x3), 0x1);                                       /* y3d x3d y3c x3c | x2d z1d x2c z1c */
 832
 833     z2          = _mm256_shuffle_ps(x2, y1, _MM_SHUFFLE(1, 0, 1, 0));                                              /* y3a x3a z2a y2a | x2a z1a y1a x1a */
 834     t5          = _mm256_shuffle_ps(x2, y1, _MM_SHUFFLE(3, 2, 3, 2));                                              /* y3b x3b z2b y2b | x2b z1b y1b x1b */
 835     y1          = _mm256_shuffle_ps(x1, z1, _MM_SHUFFLE(1, 0, 1, 0));                                              /* y3c x3c z2c y2c | x2c z1c y1c x1c */
 836     x1          = _mm256_shuffle_ps(x1, z1, _MM_SHUFFLE(3, 2, 3, 2));                                              /* y3d x3d z2d y2d | x2d z1d y1d x1d */
 837
 838     tE          = _mm_shuffle_ps(_mm256_castps256_ps128(y3), _mm256_castps256_ps128(x4), _MM_SHUFFLE(1, 0, 1, 0)); /* z4a y4a x4a z3a */
 839     tF          = _mm_shuffle_ps(_mm256_castps256_ps128(y3), _mm256_castps256_ps128(x4), _MM_SHUFFLE(3, 2, 3, 2)); /* z4b y4b x4b z3b */
 840
 841     tG          = _mm_shuffle_ps(_mm256_castps256_ps128(z3), _mm256_castps256_ps128(y4), _MM_SHUFFLE(1, 0, 1, 0)); /* z4c y4c x4c z3c */
 842     tH          = _mm_shuffle_ps(_mm256_castps256_ps128(z3), _mm256_castps256_ps128(y4), _MM_SHUFFLE(3, 2, 3, 2)); /* z4d y4d x4d z3d */
 843
 844     t1          = _mm256_sub_ps(t1, z2);
 845     t2          = _mm256_sub_ps(t2, t5);
 846     t3          = _mm256_sub_ps(t3, y1);
 847     t4          = _mm256_sub_ps(t4, x1);
 848
 849     tA          = _mm_sub_ps(tA, tE);
 850     tB          = _mm_sub_ps(tB, tF);
 851     tC          = _mm_sub_ps(tC, tG);
 852     tD          = _mm_sub_ps(tD, tH);
 853
 854     /* Here we store a full 256-bit value and a separate 128-bit one; no overlap can happen */
 855     _mm256_storeu_ps(ptrA, t1);
 856     _mm256_storeu_ps(ptrB, t2);
 857     _mm256_storeu_ps(ptrC, t3);
 858     _mm256_storeu_ps(ptrD, t4);
 859     _mm_storeu_ps(ptrA+8, tA);
 860     _mm_storeu_ps(ptrB+8, tB);
 861     _mm_storeu_ps(ptrC+8, tC);
 862     _mm_storeu_ps(ptrD+8, tD);
 863 }
 864 #endif
 865
 866
 867 static gmx_inline void
 868 gmx_mm256_decrement_1rvec_8ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx_restrict ptrB,
 869                                           float * gmx_restrict ptrC, float * gmx_restrict ptrD,
 870                                           float * gmx_restrict ptrE, float * gmx_restrict ptrF,
 871                                           float * gmx_restrict ptrG, float * gmx_restrict ptrH,
 872                                           __m256 x1, __m256 y1, __m256 z1)
 873 {
 874     __m256  t1, t2, t3, t4, t5, t6;
 875     __m256  tA, tB, tC, tD;
 876     __m128i mask;
 877
 878     /* Construct a mask without executing any data loads */
 879     mask        = _mm_blend_epi16(_mm_setzero_si128(), _mm_cmpeq_epi16(_mm_setzero_si128(), _mm_setzero_si128()), 0x3F);
 880
 881     tA          = gmx_mm256_set_m128(gmx_mm_maskload_ps(ptrE, mask), gmx_mm_maskload_ps(ptrA, mask));
 882     tB          = gmx_mm256_set_m128(gmx_mm_maskload_ps(ptrF, mask), gmx_mm_maskload_ps(ptrB, mask));
 883     tC          = gmx_mm256_set_m128(gmx_mm_maskload_ps(ptrG, mask), gmx_mm_maskload_ps(ptrC, mask));
 884     tD          = gmx_mm256_set_m128(gmx_mm_maskload_ps(ptrH, mask), gmx_mm_maskload_ps(ptrD, mask));
 885     t1          = _mm256_unpacklo_ps(x1, y1);                         /* y1f x1f y1e x1e | y1b x1b y1a x1a */
 886     t2          = _mm256_unpackhi_ps(x1, y1);                         /* y1h x1h y1g x1g | y1d x1d y1c x1c */
 887
 888     t3          = _mm256_shuffle_ps(t1, z1, _MM_SHUFFLE(0, 0, 1, 0)); /*  -  z1e y1e x1e |  - z1a y1a x1a */
 889     t4          = _mm256_shuffle_ps(t1, z1, _MM_SHUFFLE(0, 1, 3, 2)); /*  -  z1f y1f x1f |  - z1b y1b x1b */
 890     t5          = _mm256_shuffle_ps(t2, z1, _MM_SHUFFLE(0, 2, 1, 0)); /*  -  z1g y1g x1g |  - z1c y1c x1c */
 891     t6          = _mm256_shuffle_ps(t2, z1, _MM_SHUFFLE(0, 3, 3, 2)); /*  -  z1h y1h x1h |  - z1d y1d x1d */
 892
 893     tA          = _mm256_sub_ps(tA, t3);
 894     tB          = _mm256_sub_ps(tB, t4);
 895     tC          = _mm256_sub_ps(tC, t5);
 896     tD          = _mm256_sub_ps(tD, t6);
 897
 898     gmx_mm_maskstore_ps(ptrA, mask, _mm256_castps256_ps128(tA));
 899     gmx_mm_maskstore_ps(ptrB, mask, _mm256_castps256_ps128(tB));
 900     gmx_mm_maskstore_ps(ptrC, mask, _mm256_castps256_ps128(tC));
 901     gmx_mm_maskstore_ps(ptrD, mask, _mm256_castps256_ps128(tD));
 902     gmx_mm_maskstore_ps(ptrE, mask, _mm256_extractf128_ps(tA, 0x1));
 903     gmx_mm_maskstore_ps(ptrF, mask, _mm256_extractf128_ps(tB, 0x1));
 904     gmx_mm_maskstore_ps(ptrG, mask, _mm256_extractf128_ps(tC, 0x1));
 905     gmx_mm_maskstore_ps(ptrH, mask, _mm256_extractf128_ps(tD, 0x1));
 906 }
 907
 908
 909
 910 #if defined (_MSC_VER) && defined(_M_IX86)
 911 /* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
 912 #define gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(ptrA, ptrB, ptrC, ptrD, ptrE, ptrF, ptrG, ptrH, _x1, _y1, _z1, _x2, _y2, _z2, _x3, _y3, _z3) \
 913     { \
 914         __m256 _t1, _t2, _t3, _t4, _t5, _t6, _t7, _t8, _t9, _t10, _t11, _t12; \
 915         __m256 _tA, _tB, _tC, _tD, _tE, _tF, _tG, _tH, _tI, _tJ, _tK, _tL; \
 916 \
 917         _tA         = _mm256_loadu_ps(ptrA); \
 918         _tB         = _mm256_loadu_ps(ptrB); \
 919         _tC         = _mm256_loadu_ps(ptrC); \
 920         _tD         = _mm256_loadu_ps(ptrD); \
 921         _tE         = _mm256_loadu_ps(ptrE); \
 922         _tF         = _mm256_loadu_ps(ptrF); \
 923         _tG         = _mm256_loadu_ps(ptrG); \
 924         _tH         = _mm256_loadu_ps(ptrH); \
 925         _t1         = _mm256_unpacklo_ps(_x1, _y1); \
 926         _t2         = _mm256_unpackhi_ps(_x1, _y1); \
 927         _t3         = _mm256_unpacklo_ps(_z1, _x2); \
 928         _t4         = _mm256_unpackhi_ps(_z1, _x2); \
 929         _t5         = _mm256_unpacklo_ps(_y2, _z2); \
 930         _t6         = _mm256_unpackhi_ps(_y2, _z2); \
 931         _t7         = _mm256_unpacklo_ps(_x3, _y3); \
 932         _t8         = _mm256_unpackhi_ps(_x3, _y3); \
 933         _t9         = _mm256_shuffle_ps(_t1, _t3, _MM_SHUFFLE(1, 0, 1, 0)); \
 934         _t10        = _mm256_shuffle_ps(_t1, _t3, _MM_SHUFFLE(3, 2, 3, 2)); \
 935         _t11        = _mm256_shuffle_ps(_t2, _t4, _MM_SHUFFLE(1, 0, 1, 0)); \
 936         _t12        = _mm256_shuffle_ps(_t2, _t4, _MM_SHUFFLE(3, 2, 3, 2)); \
 937         _t1         = _mm256_shuffle_ps(_t5, _t7, _MM_SHUFFLE(1, 0, 1, 0)); \
 938         _t2         = _mm256_shuffle_ps(_t5, _t7, _MM_SHUFFLE(3, 2, 3, 2)); \
 939         _t3         = _mm256_shuffle_ps(_t6, _t8, _MM_SHUFFLE(1, 0, 1, 0)); \
 940         _t4         = _mm256_shuffle_ps(_t6, _t8, _MM_SHUFFLE(3, 2, 3, 2)); \
 941         _t5         = gmx_mm256_unpack128lo_ps(_t9, _t1); \
 942         _t6         = gmx_mm256_unpack128hi_ps(_t9, _t1); \
 943         _t7         = gmx_mm256_unpack128lo_ps(_t10, _t2); \
 944         _t8         = gmx_mm256_unpack128hi_ps(_t10, _t2); \
 945         _t1         = gmx_mm256_unpack128lo_ps(_t11, _t3); \
 946         _t2         = gmx_mm256_unpack128hi_ps(_t11, _t3); \
 947         _t9         = gmx_mm256_unpack128lo_ps(_t12, _t4); \
 948         _t10        = gmx_mm256_unpack128hi_ps(_t12, _t4); \
 949         _tA         = _mm256_sub_ps(_tA, _t5); \
 950         _tB         = _mm256_sub_ps(_tB, _t7); \
 951         _tC         = _mm256_sub_ps(_tC, _t1); \
 952         _tD         = _mm256_sub_ps(_tD, _t9); \
 953         _tE         = _mm256_sub_ps(_tE, _t6); \
 954         _tF         = _mm256_sub_ps(_tF, _t8); \
 955         _tG         = _mm256_sub_ps(_tG, _t2); \
 956         _tH         = _mm256_sub_ps(_tH, _t10); \
 957         _mm256_storeu_ps(ptrA, _tA); \
 958         _mm256_storeu_ps(ptrB, _tB); \
 959         _mm256_storeu_ps(ptrC, _tC); \
 960         _mm256_storeu_ps(ptrD, _tD); \
 961         _mm256_storeu_ps(ptrE, _tE); \
 962         _mm256_storeu_ps(ptrF, _tF); \
 963         _mm256_storeu_ps(ptrG, _tG); \
 964         _mm256_storeu_ps(ptrH, _tH); \
 965         _tI         = gmx_mm256_set_m128(_mm_load_ss(ptrE+8), _mm_load_ss(ptrA+8)); \
 966         _tJ         = gmx_mm256_set_m128(_mm_load_ss(ptrF+8), _mm_load_ss(ptrB+8)); \
 967         _tK         = gmx_mm256_set_m128(_mm_load_ss(ptrG+8), _mm_load_ss(ptrC+8)); \
 968         _tL         = gmx_mm256_set_m128(_mm_load_ss(ptrH+8), _mm_load_ss(ptrD+8)); \
 969         _tI         = _mm256_unpacklo_ps(_tI, _tK); \
 970         _tJ         = _mm256_unpacklo_ps(_tJ, _tL); \
 971         _tI         = _mm256_unpacklo_ps(_tI, _tJ); \
 972         _tI         = _mm256_sub_ps(_tI, _z3); \
 973         _tJ         = _mm256_permute_ps(_tI, _MM_SHUFFLE(1, 1, 1, 1)); \
 974         _tK         = _mm256_permute_ps(_tI, _MM_SHUFFLE(2, 2, 2, 2)); \
 975         _tL         = _mm256_permute_ps(_tI, _MM_SHUFFLE(3, 3, 3, 3)); \
 976         _mm_store_ss(ptrA+8, _mm256_castps256_ps128(_tI)); \
 977         _mm_store_ss(ptrB+8, _mm256_castps256_ps128(_tJ)); \
 978         _mm_store_ss(ptrC+8, _mm256_castps256_ps128(_tK)); \
 979         _mm_store_ss(ptrD+8, _mm256_castps256_ps128(_tL)); \
 980         _mm_store_ss(ptrE+8, _mm256_extractf128_ps(_tI, 0x1)); \
 981         _mm_store_ss(ptrF+8, _mm256_extractf128_ps(_tJ, 0x1)); \
 982         _mm_store_ss(ptrG+8, _mm256_extractf128_ps(_tK, 0x1)); \
 983         _mm_store_ss(ptrH+8, _mm256_extractf128_ps(_tL, 0x1)); \
 984     }
 985 #else
 986 /* Real function for sane compilers */
 987 static gmx_inline void
 988 gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx_restrict ptrB,
 989                                           float * gmx_restrict ptrC, float * gmx_restrict ptrD,
 990                                           float * gmx_restrict ptrE, float * gmx_restrict ptrF,
 991                                           float * gmx_restrict ptrG, float * gmx_restrict ptrH,
 992                                           __m256 x1, __m256 y1, __m256 z1,
 993                                           __m256 x2, __m256 y2, __m256 z2,
 994                                           __m256 x3, __m256 y3, __m256 z3)
 995 {
 996     __m256 t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12;
 997     __m256 tA, tB, tC, tD, tE, tF, tG, tH;
 998     __m256 tI, tJ, tK, tL;
 999
1000     tA          = _mm256_loadu_ps(ptrA);
1001     tB          = _mm256_loadu_ps(ptrB);
1002     tC          = _mm256_loadu_ps(ptrC);
1003     tD          = _mm256_loadu_ps(ptrD);
1004     tE          = _mm256_loadu_ps(ptrE);
1005     tF          = _mm256_loadu_ps(ptrF);
1006     tG          = _mm256_loadu_ps(ptrG);
1007     tH          = _mm256_loadu_ps(ptrH);
1008
1009     t1          = _mm256_unpacklo_ps(x1, y1);                         /* y1f x1f y1e x1e | y1b x1b y1a x1a */
1010     t2          = _mm256_unpackhi_ps(x1, y1);                         /* y1h x1h y1g x1g | y1d x1d y1c x1c */
1011     t3          = _mm256_unpacklo_ps(z1, x2);                         /* x2f z1f x2e z1e | x2b z1b x2a z1a */
1012     t4          = _mm256_unpackhi_ps(z1, x2);                         /* x2h z1h x2g z1g | x2d z1d x2c z1c */
1013
1014     t5          = _mm256_unpacklo_ps(y2, z2);                         /* z2f y2f z2e y2e | z2b y2b z2a y2a */
1015     t6          = _mm256_unpackhi_ps(y2, z2);                         /* z2h y2h z2g y2g | z2d y2d z2c y2c */
1016     t7          = _mm256_unpacklo_ps(x3, y3);                         /* y3f x3f y3e x3e | y3b x3b y3a x3a */
1017     t8          = _mm256_unpackhi_ps(x3, y3);                         /* y3h x3h y3g x3g | y3d x3d y3c x3c */
1018
1019     t9          = _mm256_shuffle_ps(t1, t3, _MM_SHUFFLE(1, 0, 1, 0)); /* x2e z1e y1e x1e | x2a z1a y1a x1a */
1020     t10         = _mm256_shuffle_ps(t1, t3, _MM_SHUFFLE(3, 2, 3, 2)); /* x2f z1f y1f x1f | x2b z1b y1b x1b */
1021     t11         = _mm256_shuffle_ps(t2, t4, _MM_SHUFFLE(1, 0, 1, 0)); /* x2g z1g y1g x1g | x2c z1c y1c x1c */
1022     t12         = _mm256_shuffle_ps(t2, t4, _MM_SHUFFLE(3, 2, 3, 2)); /* x2h z1h y1h x1h | x2d z1d y1d x1d */
1023
1024     t1          = _mm256_shuffle_ps(t5, t7, _MM_SHUFFLE(1, 0, 1, 0)); /* y3e x3e z2e y2e | y3a x3a z2a y2a */
1025     t2          = _mm256_shuffle_ps(t5, t7, _MM_SHUFFLE(3, 2, 3, 2)); /* y3f x3f z2f y2f | y3b x3b z2b y2b */
1026     t3          = _mm256_shuffle_ps(t6, t8, _MM_SHUFFLE(1, 0, 1, 0)); /* y3g x3g z2g y2g | y3c x3c z2c y2c */
1027     t4          = _mm256_shuffle_ps(t6, t8, _MM_SHUFFLE(3, 2, 3, 2)); /* y3h x3h z2h y2h | y3d x3d z2d y2d */
1028
1029     t5          = gmx_mm256_unpack128lo_ps(t9, t1);                   /* y3a x3a z2a y2a | x2a z1a y1a x1a */
1030     t6          = gmx_mm256_unpack128hi_ps(t9, t1);                   /* y3e x3e z2e y2e | x2e z1e y1e x1e */
1031     t7          = gmx_mm256_unpack128lo_ps(t10, t2);                  /* y3b x3b z2b y2b | x2b z1b y1b x1b */
1032     t8          = gmx_mm256_unpack128hi_ps(t10, t2);                  /* y3f x3f z2f y2f | x2f z1f y1f x1f */
1033     t1          = gmx_mm256_unpack128lo_ps(t11, t3);                  /* y3c x3c z2c y2c | x2c z1c y1c x1c */
1034     t2          = gmx_mm256_unpack128hi_ps(t11, t3);                  /* y3g x3g z2g y2g | x2g z1g y1g x1g */
1035     t9          = gmx_mm256_unpack128lo_ps(t12, t4);                  /* y3d x3d z2d y2d | x2d z1d y1d x1d */
1036     t10         = gmx_mm256_unpack128hi_ps(t12, t4);                  /* y3h x3h z2h y2h | x2h z1h y1h x1h */
1037
1038     tA          = _mm256_sub_ps(tA, t5);
1039     tB          = _mm256_sub_ps(tB, t7);
1040     tC          = _mm256_sub_ps(tC, t1);
1041     tD          = _mm256_sub_ps(tD, t9);
1042     tE          = _mm256_sub_ps(tE, t6);
1043     tF          = _mm256_sub_ps(tF, t8);
1044     tG          = _mm256_sub_ps(tG, t2);
1045     tH          = _mm256_sub_ps(tH, t10);
1046
1047     _mm256_storeu_ps(ptrA, tA);
1048     _mm256_storeu_ps(ptrB, tB);
1049     _mm256_storeu_ps(ptrC, tC);
1050     _mm256_storeu_ps(ptrD, tD);
1051     _mm256_storeu_ps(ptrE, tE);
1052     _mm256_storeu_ps(ptrF, tF);
1053     _mm256_storeu_ps(ptrG, tG);
1054     _mm256_storeu_ps(ptrH, tH);
1055
1056     tI          = gmx_mm256_set_m128(_mm_load_ss(ptrE+8), _mm_load_ss(ptrA+8));
1057     tJ          = gmx_mm256_set_m128(_mm_load_ss(ptrF+8), _mm_load_ss(ptrB+8));
1058     tK          = gmx_mm256_set_m128(_mm_load_ss(ptrG+8), _mm_load_ss(ptrC+8));
1059     tL          = gmx_mm256_set_m128(_mm_load_ss(ptrH+8), _mm_load_ss(ptrD+8));
1060
1061     tI          = _mm256_unpacklo_ps(tI, tK);  /*  -  - zG zE |  -  - zC zA */
1062     tJ          = _mm256_unpacklo_ps(tJ, tL);  /*  -  - zH zF |  -  - zD zB */
1063     tI          = _mm256_unpacklo_ps(tI, tJ);  /* zH zG zF zE | zD zC zB zA */
1064
1065     tI          = _mm256_sub_ps(tI, z3);
1066     tJ          = _mm256_permute_ps(tI, _MM_SHUFFLE(1, 1, 1, 1));
1067     tK          = _mm256_permute_ps(tI, _MM_SHUFFLE(2, 2, 2, 2));
1068     tL          = _mm256_permute_ps(tI, _MM_SHUFFLE(3, 3, 3, 3));
1069
1070     _mm_store_ss(ptrA+8, _mm256_castps256_ps128(tI));
1071     _mm_store_ss(ptrB+8, _mm256_castps256_ps128(tJ));
1072     _mm_store_ss(ptrC+8, _mm256_castps256_ps128(tK));
1073     _mm_store_ss(ptrD+8, _mm256_castps256_ps128(tL));
1074     _mm_store_ss(ptrE+8, _mm256_extractf128_ps(tI, 0x1));
1075     _mm_store_ss(ptrF+8, _mm256_extractf128_ps(tJ, 0x1));
1076     _mm_store_ss(ptrG+8, _mm256_extractf128_ps(tK, 0x1));
1077     _mm_store_ss(ptrH+8, _mm256_extractf128_ps(tL, 0x1));
1078 }
1079 #endif
1080
1081
1082
1083 #if defined (_MSC_VER) && defined(_M_IX86)
/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters.
 *
 * Decrements the 12 consecutive floats stored at each of the eight pointers
 * ptrA..ptrH by the transposed contents of the twelve registers _x1.._z4:
 * SIMD element 0 of every register is subtracted at ptrA, element 1 at ptrB,
 * and so on up to element 7 at ptrH.  The memory order behind each pointer is
 * x1 y1 z1 x2 y2 z2 x3 y3 z3 x4 y4 z4.
 *
 * The body is wrapped in do { } while (0) so the macro acts as one statement
 * (also inside an unbraced if/else), just like the real function used by
 * other compilers, and pointer arguments are parenthesized before the +8
 * offset is applied.  Every pointer argument is evaluated several times, so
 * it must be free of side effects.
 */
#define gmx_mm256_decrement_4rvec_8ptr_swizzle_ps(ptrA, ptrB, ptrC, ptrD, ptrE, ptrF, ptrG, ptrH, \
                                                  _x1, _y1, _z1, _x2, _y2, _z2, _x3, _y3, _z3, _x4, _y4, _z4) \
    do { \
        __m256 _t1, _t2, _t3, _t4, _t5, _t6, _t7, _t8, _t9, _t10, _t11, _t12; \
        __m256 _tA, _tB, _tC, _tD, _tE, _tF, _tG, _tH, _tI, _tJ, _tK, _tL; \
\
        /* Load the first eight floats (x1 .. y3) behind every pointer */ \
        _tA         = _mm256_loadu_ps(ptrA); \
        _tB         = _mm256_loadu_ps(ptrB); \
        _tC         = _mm256_loadu_ps(ptrC); \
        _tD         = _mm256_loadu_ps(ptrD); \
        _tE         = _mm256_loadu_ps(ptrE); \
        _tF         = _mm256_loadu_ps(ptrF); \
        _tG         = _mm256_loadu_ps(ptrG); \
        _tH         = _mm256_loadu_ps(ptrH); \
        /* Transpose x1 .. y3 so that each register matches one pointer */ \
        _t1         = _mm256_unpacklo_ps(_x1, _y1); \
        _t2         = _mm256_unpackhi_ps(_x1, _y1); \
        _t3         = _mm256_unpacklo_ps(_z1, _x2); \
        _t4         = _mm256_unpackhi_ps(_z1, _x2); \
        _t5         = _mm256_unpacklo_ps(_y2, _z2); \
        _t6         = _mm256_unpackhi_ps(_y2, _z2); \
        _t7         = _mm256_unpacklo_ps(_x3, _y3); \
        _t8         = _mm256_unpackhi_ps(_x3, _y3); \
        _t9         = _mm256_shuffle_ps(_t1, _t3, _MM_SHUFFLE(1, 0, 1, 0)); \
        _t10        = _mm256_shuffle_ps(_t1, _t3, _MM_SHUFFLE(3, 2, 3, 2)); \
        _t11        = _mm256_shuffle_ps(_t2, _t4, _MM_SHUFFLE(1, 0, 1, 0)); \
        _t12        = _mm256_shuffle_ps(_t2, _t4, _MM_SHUFFLE(3, 2, 3, 2)); \
        _t1         = _mm256_shuffle_ps(_t5, _t7, _MM_SHUFFLE(1, 0, 1, 0)); \
        _t2         = _mm256_shuffle_ps(_t5, _t7, _MM_SHUFFLE(3, 2, 3, 2)); \
        _t3         = _mm256_shuffle_ps(_t6, _t8, _MM_SHUFFLE(1, 0, 1, 0)); \
        _t4         = _mm256_shuffle_ps(_t6, _t8, _MM_SHUFFLE(3, 2, 3, 2)); \
        _t5         = gmx_mm256_unpack128lo_ps(_t9, _t1); \
        _t6         = gmx_mm256_unpack128hi_ps(_t9, _t1); \
        _t7         = gmx_mm256_unpack128lo_ps(_t10, _t2); \
        _t8         = gmx_mm256_unpack128hi_ps(_t10, _t2); \
        _t1         = gmx_mm256_unpack128lo_ps(_t11, _t3); \
        _t2         = gmx_mm256_unpack128hi_ps(_t11, _t3); \
        _t9         = gmx_mm256_unpack128lo_ps(_t12, _t4); \
        _t10        = gmx_mm256_unpack128hi_ps(_t12, _t4); \
        _tA         = _mm256_sub_ps(_tA, _t5); \
        _tB         = _mm256_sub_ps(_tB, _t7); \
        _tC         = _mm256_sub_ps(_tC, _t1); \
        _tD         = _mm256_sub_ps(_tD, _t9); \
        _tE         = _mm256_sub_ps(_tE, _t6); \
        _tF         = _mm256_sub_ps(_tF, _t8); \
        _tG         = _mm256_sub_ps(_tG, _t2); \
        _tH         = _mm256_sub_ps(_tH, _t10); \
        _mm256_storeu_ps(ptrA, _tA); \
        _mm256_storeu_ps(ptrB, _tB); \
        _mm256_storeu_ps(ptrC, _tC); \
        _mm256_storeu_ps(ptrD, _tD); \
        _mm256_storeu_ps(ptrE, _tE); \
        _mm256_storeu_ps(ptrF, _tF); \
        _mm256_storeu_ps(ptrG, _tG); \
        _mm256_storeu_ps(ptrH, _tH); \
        /* Remaining four floats (z3 x4 y4 z4), two pointers per register */ \
        _tI         = gmx_mm256_set_m128(_mm_loadu_ps((ptrE)+8), _mm_loadu_ps((ptrA)+8)); \
        _tJ         = gmx_mm256_set_m128(_mm_loadu_ps((ptrF)+8), _mm_loadu_ps((ptrB)+8)); \
        _tK         = gmx_mm256_set_m128(_mm_loadu_ps((ptrG)+8), _mm_loadu_ps((ptrC)+8)); \
        _tL         = gmx_mm256_set_m128(_mm_loadu_ps((ptrH)+8), _mm_loadu_ps((ptrD)+8)); \
        _t1         = _mm256_unpacklo_ps(_z3, _x4); \
        _t2         = _mm256_unpackhi_ps(_z3, _x4); \
        _t3         = _mm256_unpacklo_ps(_y4, _z4); \
        _t4         = _mm256_unpackhi_ps(_y4, _z4); \
        _t5         = _mm256_shuffle_ps(_t1, _t3, _MM_SHUFFLE(1, 0, 1, 0)); \
        _t6         = _mm256_shuffle_ps(_t1, _t3, _MM_SHUFFLE(3, 2, 3, 2)); \
        _t7         = _mm256_shuffle_ps(_t2, _t4, _MM_SHUFFLE(1, 0, 1, 0)); \
        _t8         = _mm256_shuffle_ps(_t2, _t4, _MM_SHUFFLE(3, 2, 3, 2)); \
        _tI         = _mm256_sub_ps(_tI, _t5); \
        _tJ         = _mm256_sub_ps(_tJ, _t6); \
        _tK         = _mm256_sub_ps(_tK, _t7); \
        _tL         = _mm256_sub_ps(_tL, _t8); \
        _mm_storeu_ps((ptrA)+8, _mm256_castps256_ps128(_tI)); \
        _mm_storeu_ps((ptrB)+8, _mm256_castps256_ps128(_tJ)); \
        _mm_storeu_ps((ptrC)+8, _mm256_castps256_ps128(_tK)); \
        _mm_storeu_ps((ptrD)+8, _mm256_castps256_ps128(_tL)); \
        _mm_storeu_ps((ptrE)+8, _mm256_extractf128_ps(_tI, 0x1)); \
        _mm_storeu_ps((ptrF)+8, _mm256_extractf128_ps(_tJ, 0x1)); \
        _mm_storeu_ps((ptrG)+8, _mm256_extractf128_ps(_tK, 0x1)); \
        _mm_storeu_ps((ptrH)+8, _mm256_extractf128_ps(_tL, 0x1)); \
    } while (0)
1164 #else
1165 /* Real function for sane compilers */
1166 static gmx_inline void
1167 gmx_mm256_decrement_4rvec_8ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx_restrict ptrB,
1168                                           float * gmx_restrict ptrC, float * gmx_restrict ptrD,
1169                                           float * gmx_restrict ptrE, float * gmx_restrict ptrF,
1170                                           float * gmx_restrict ptrG, float * gmx_restrict ptrH,
1171                                           __m256 x1, __m256 y1, __m256 z1,
1172                                           __m256 x2, __m256 y2, __m256 z2,
1173                                           __m256 x3, __m256 y3, __m256 z3,
1174                                           __m256 x4, __m256 y4, __m256 z4)
1175 {
1176     __m256 t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12;
1177     __m256 tA, tB, tC, tD, tE, tF, tG, tH;
1178     __m256 tI, tJ, tK, tL;
1179
1180     tA          = _mm256_loadu_ps(ptrA);
1181     tB          = _mm256_loadu_ps(ptrB);
1182     tC          = _mm256_loadu_ps(ptrC);
1183     tD          = _mm256_loadu_ps(ptrD);
1184     tE          = _mm256_loadu_ps(ptrE);
1185     tF          = _mm256_loadu_ps(ptrF);
1186     tG          = _mm256_loadu_ps(ptrG);
1187     tH          = _mm256_loadu_ps(ptrH);
1188
1189     t1          = _mm256_unpacklo_ps(x1, y1);                         /* y1f x1f y1e x1e | y1b x1b y1a x1a */
1190     t2          = _mm256_unpackhi_ps(x1, y1);                         /* y1h x1h y1g x1g | y1d x1d y1c x1c */
1191     t3          = _mm256_unpacklo_ps(z1, x2);                         /* x2f z1f x2e z1e | x2b z1b x2a z1a */
1192     t4          = _mm256_unpackhi_ps(z1, x2);                         /* x2h z1h x2g z1g | x2d z1d x2c z1c */
1193
1194     t5          = _mm256_unpacklo_ps(y2, z2);                         /* z2f y2f z2e y2e | z2b y2b z2a y2a */
1195     t6          = _mm256_unpackhi_ps(y2, z2);                         /* z2h y2h z2g y2g | z2d y2d z2c y2c */
1196     t7          = _mm256_unpacklo_ps(x3, y3);                         /* y3f x3f y3e x3e | y3b x3b y3a x3a */
1197     t8          = _mm256_unpackhi_ps(x3, y3);                         /* y3h x3h y3g x3g | y3d x3d y3c x3c */
1198
1199     t9          = _mm256_shuffle_ps(t1, t3, _MM_SHUFFLE(1, 0, 1, 0)); /* x2e z1e y1e x1e | x2a z1a y1a x1a */
1200     t10         = _mm256_shuffle_ps(t1, t3, _MM_SHUFFLE(3, 2, 3, 2)); /* x2f z1f y1f x1f | x2b z1b y1b x1b */
1201     t11         = _mm256_shuffle_ps(t2, t4, _MM_SHUFFLE(1, 0, 1, 0)); /* x2g z1g y1g x1g | x2c z1c y1c x1c */
1202     t12         = _mm256_shuffle_ps(t2, t4, _MM_SHUFFLE(3, 2, 3, 2)); /* x2h z1h y1h x1h | x2d z1d y1d x1d */
1203
1204     t1          = _mm256_shuffle_ps(t5, t7, _MM_SHUFFLE(1, 0, 1, 0)); /* y3e x3e z2e y2e | y3a x3a z2a y2a */
1205     t2          = _mm256_shuffle_ps(t5, t7, _MM_SHUFFLE(3, 2, 3, 2)); /* y3f x3f z2f y2f | y3b x3b z2b y2b */
1206     t3          = _mm256_shuffle_ps(t6, t8, _MM_SHUFFLE(1, 0, 1, 0)); /* y3g x3g z2g y2g | y3c x3c z2c y2c */
1207     t4          = _mm256_shuffle_ps(t6, t8, _MM_SHUFFLE(3, 2, 3, 2)); /* y3h x3h z2h y2h | y3d x3d z2d y2d */
1208
1209     t5          = gmx_mm256_unpack128lo_ps(t9, t1);                   /* y3a x3a z2a y2a | x2a z1a y1a x1a */
1210     t6          = gmx_mm256_unpack128hi_ps(t9, t1);                   /* y3e x3e z2e y2e | x2e z1e y1e x1e */
1211     t7          = gmx_mm256_unpack128lo_ps(t10, t2);                  /* y3b x3b z2b y2b | x2b z1b y1b x1b */
1212     t8          = gmx_mm256_unpack128hi_ps(t10, t2);                  /* y3f x3f z2f y2f | x2f z1f y1f x1f */
1213     t1          = gmx_mm256_unpack128lo_ps(t11, t3);                  /* y3c x3c z2c y2c | x2c z1c y1c x1c */
1214     t2          = gmx_mm256_unpack128hi_ps(t11, t3);                  /* y3g x3g z2g y2g | x2g z1g y1g x1g */
1215     t9          = gmx_mm256_unpack128lo_ps(t12, t4);                  /* y3d x3d z2d y2d | x2d z1d y1d x1d */
1216     t10         = gmx_mm256_unpack128hi_ps(t12, t4);                  /* y3h x3h z2h y2h | x2h z1h y1h x1h */
1217
1218     tA          = _mm256_sub_ps(tA, t5);
1219     tB          = _mm256_sub_ps(tB, t7);
1220     tC          = _mm256_sub_ps(tC, t1);
1221     tD          = _mm256_sub_ps(tD, t9);
1222     tE          = _mm256_sub_ps(tE, t6);
1223     tF          = _mm256_sub_ps(tF, t8);
1224     tG          = _mm256_sub_ps(tG, t2);
1225     tH          = _mm256_sub_ps(tH, t10);
1226
1227     _mm256_storeu_ps(ptrA, tA);
1228     _mm256_storeu_ps(ptrB, tB);
1229     _mm256_storeu_ps(ptrC, tC);
1230     _mm256_storeu_ps(ptrD, tD);
1231     _mm256_storeu_ps(ptrE, tE);
1232     _mm256_storeu_ps(ptrF, tF);
1233     _mm256_storeu_ps(ptrG, tG);
1234     _mm256_storeu_ps(ptrH, tH);
1235
1236     tI          = gmx_mm256_set_m128(_mm_loadu_ps(ptrE+8), _mm_loadu_ps(ptrA+8));
1237     tJ          = gmx_mm256_set_m128(_mm_loadu_ps(ptrF+8), _mm_loadu_ps(ptrB+8));
1238     tK          = gmx_mm256_set_m128(_mm_loadu_ps(ptrG+8), _mm_loadu_ps(ptrC+8));
1239     tL          = gmx_mm256_set_m128(_mm_loadu_ps(ptrH+8), _mm_loadu_ps(ptrD+8));
1240
1241     t1          = _mm256_unpacklo_ps(z3, x4);                         /* x4f z3f x4e z3e | x4b z3b x4a z3a */
1242     t2          = _mm256_unpackhi_ps(z3, x4);                         /* x4h z3h x4g z3g | x4d z3d x4c z3c */
1243     t3          = _mm256_unpacklo_ps(y4, z4);                         /* z4f y4f z4e y4e | z4b y4b z4a y4a */
1244     t4          = _mm256_unpackhi_ps(y4, z4);                         /* z4h y4h z4g y4g | z4d y4d z4c y4c */
1245
1246     t5          = _mm256_shuffle_ps(t1, t3, _MM_SHUFFLE(1, 0, 1, 0)); /* z4e y4e x4e z3e | z4a y4a x4a z3a */
1247     t6          = _mm256_shuffle_ps(t1, t3, _MM_SHUFFLE(3, 2, 3, 2)); /* z4f y4f x4f z3f | z4b y4b x4b z3b */
1248     t7          = _mm256_shuffle_ps(t2, t4, _MM_SHUFFLE(1, 0, 1, 0)); /* z4g y4g x4g z3g | z4c y4c x4c z3c */
1249     t8          = _mm256_shuffle_ps(t2, t4, _MM_SHUFFLE(3, 2, 3, 2)); /* z4h y4h x4h z3h | z4d y4d x4d z3d */
1250
1251     tI          = _mm256_sub_ps(tI, t5);
1252     tJ          = _mm256_sub_ps(tJ, t6);
1253     tK          = _mm256_sub_ps(tK, t7);
1254     tL          = _mm256_sub_ps(tL, t8);
1255
1256     _mm_storeu_ps(ptrA+8, _mm256_castps256_ps128(tI));
1257     _mm_storeu_ps(ptrB+8, _mm256_castps256_ps128(tJ));
1258     _mm_storeu_ps(ptrC+8, _mm256_castps256_ps128(tK));
1259     _mm_storeu_ps(ptrD+8, _mm256_castps256_ps128(tL));
1260     _mm_storeu_ps(ptrE+8, _mm256_extractf128_ps(tI, 0x1));
1261     _mm_storeu_ps(ptrF+8, _mm256_extractf128_ps(tJ, 0x1));
1262     _mm_storeu_ps(ptrG+8, _mm256_extractf128_ps(tK, 0x1));
1263     _mm_storeu_ps(ptrH+8, _mm256_extractf128_ps(tL, 0x1));
1264 }
1265 #endif
1266
1267
1268 static gmx_inline void
1269 gmx_mm256_update_iforce_1atom_swizzle_ps(__m256 fix1, __m256 fiy1, __m256 fiz1,
1270                                          float * gmx_restrict fptr,
1271                                          float * gmx_restrict fshiftptr)
1272 {
1273     __m128 t1, t2, t3;
1274
1275     fix1 = _mm256_hadd_ps(fix1, fix1);
1276     fiy1 = _mm256_hadd_ps(fiy1, fiz1);
1277     fix1 = _mm256_hadd_ps(fix1, fiy1); /* fiz1 fiy1 fix1 fix1 (in both lanes) */
1278
1279     /* Add across the two lanes */
1280     t1   = _mm_add_ps(_mm256_castps256_ps128(fix1), _mm256_extractf128_ps(fix1, 0x1));
1281
1282     t2 = _mm_load_ss(fptr);
1283     t2 = _mm_loadh_pi(t2, (__m64 *)(fptr+1));
1284     t3 = _mm_load_ss(fshiftptr);
1285     t3 = _mm_loadh_pi(t3, (__m64 *)(fshiftptr+1));
1286
1287     t2 = _mm_add_ps(t2, t1);
1288     t3 = _mm_add_ps(t3, t1);
1289
1290     _mm_store_ss(fptr, t2);
1291     _mm_storeh_pi((__m64 *)(fptr+1), t2);
1292     _mm_store_ss(fshiftptr, t3);
1293     _mm_storeh_pi((__m64 *)(fshiftptr+1), t3);
1294 }
1295
1296 #if defined (_MSC_VER) && defined(_M_IX86)
/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters.
 *
 * Reduces the eight SIMD elements of each of the nine force registers to a
 * single sum, adds x1 y1 z1 x2 y2 z2 x3 y3 z3 to the nine floats at fptr,
 * and adds the total over the three atoms to the x/y/z floats at fshiftptr.
 * Four floats are loaded from and stored to fshiftptr; the fourth one only
 * has zero added to it.
 *
 * The reduction works on macro-local registers, so the force arguments are
 * left unmodified (as with the by-value function used on other compilers)
 * and need not be lvalues.  The body is wrapped in do { } while (0) so the
 * macro acts as one statement, and fptr is parenthesized before the +8
 * offset is applied.  fptr and fshiftptr are evaluated more than once, so
 * they must be free of side effects.
 */
#define gmx_mm256_update_iforce_3atom_swizzle_ps(fix1, fiy1, fiz1, fix2, fiy2, fiz2, fix3, fiy3, fiz3, \
                                                 fptr, fshiftptr) \
    do { \
        __m256 _s1, _s2, _s3, _s4, _s5; \
        __m256 _t1, _t2, _t3; \
        __m128 _tA, _tB, _tC; \
\
        /* Horizontal adds within each 128-bit lane, kept in local registers */ \
        _s1  = _mm256_hadd_ps(fix1, fiy1); \
        _s2  = _mm256_hadd_ps(fiz1, fix2); \
        _s3  = _mm256_hadd_ps(fiy2, fiz2); \
        _s4  = _mm256_hadd_ps(fix3, fiy3); \
        _s5  = _mm256_hadd_ps(fiz3, _mm256_setzero_ps()); \
        _s1  = _mm256_hadd_ps(_s1, _s2); \
        _s3  = _mm256_hadd_ps(_s3, _s4); \
        _s5  = _mm256_hadd_ps(_s5, _mm256_setzero_ps()); \
\
        /* Add across the two lanes and update the forces */ \
        _t1  = gmx_mm256_unpack128lo_ps(_s1, _s3); \
        _t2  = gmx_mm256_unpack128hi_ps(_s1, _s3); \
        _t1  = _mm256_add_ps(_t1, _t2); \
        _tA  = _mm_add_ps(_mm256_castps256_ps128(_s5), _mm256_extractf128_ps(_s5, 0x1)); \
        _t3  = _mm256_loadu_ps(fptr); \
        _t3  = _mm256_add_ps(_t3, _t1); \
        _mm256_storeu_ps(fptr, _t3); \
        _tB  = _mm_load_ss((fptr)+8); \
        _tB  = _mm_add_ss(_tB, _tA); \
        _mm_store_ss((fptr)+8, _tB); \
\
        /* Add up the shift force */ \
        _tB  = _mm256_extractf128_ps(_t1, 0x1); \
        _tC  = _mm_shuffle_ps(_mm256_castps256_ps128(_t1), _tB, _MM_SHUFFLE(1, 0, 3, 3)); \
        _tB  = _mm_shuffle_ps(_tB, _tA, _MM_SHUFFLE(1, 0, 3, 2)); \
        _tC  = _mm_permute_ps(_tC, _MM_SHUFFLE(3, 3, 2, 0)); \
        _tB  = _mm_add_ps(_tB, _mm256_castps256_ps128(_t1)); \
        _tA  = _mm_add_ps(_tB, _tC); \
        _tA  = _mm_blend_ps(_mm_setzero_ps(), _tA, 0x7); \
        _tC  = _mm_loadu_ps(fshiftptr); \
        _tC  = _mm_add_ps(_tC, _tA); \
        _mm_storeu_ps(fshiftptr, _tC); \
    } while (0)
1335 #else
1336 /* Real function for sane compilers */
1337 static gmx_inline void
1338 gmx_mm256_update_iforce_3atom_swizzle_ps(__m256 fix1, __m256 fiy1, __m256 fiz1,
1339                                          __m256 fix2, __m256 fiy2, __m256 fiz2,
1340                                          __m256 fix3, __m256 fiy3, __m256 fiz3,
1341                                          float * gmx_restrict fptr,
1342                                          float * gmx_restrict fshiftptr)
1343 {
1344     __m256 t1, t2, t3;
1345     __m128 tA, tB, tC;
1346
1347     fix1 = _mm256_hadd_ps(fix1, fiy1);                /*  Y1g+Y1h Y1e+Y1f X1g+X1h X1e+X1f | Y1c+Y1d Y1a+Y1b X1c+X1d X1a+X1b */
1348     fiz1 = _mm256_hadd_ps(fiz1, fix2);                /*  X2g+X2h X2e+X2f Z1g+Z1h Z1e+Z1f | X2c+X2d X2a+X2b Z1c+Z1d Z1a+Z1b */
1349     fiy2 = _mm256_hadd_ps(fiy2, fiz2);                /*  Z2g+Z2h Z2e+Z2f Y2g+Y2h Y2e+Y2f | Z2c+Z2d Z2a+Z2b Y2c+Y2d Y2a+Y2b */
1350     fix3 = _mm256_hadd_ps(fix3, fiy3);                /*  Y3g+Y3h Y3e+Y3f X3g+X3h X3e+X3f | Y3c+Y3d Y3a+Y3b X3c+X3d X3a+X3b */
1351     fiz3 = _mm256_hadd_ps(fiz3, _mm256_setzero_ps()); /*  0       0       Z3g+Z3h Z3e+Z3f | 0       0       Z3c+Z3d Z3a+Z3b */
1352
1353     fix1 = _mm256_hadd_ps(fix1, fiz1);                /*  X2e-h   Z1e-h   Y1e-h   X1e-h   | X2a-d   Z1a-d   Y1a-d   X1a-d   */
1354     fiy2 = _mm256_hadd_ps(fiy2, fix3);                /*  Y3e-h   X3e-h   Z2e-h   Y2e-h   | Y3a-d   X3a-d   Z2a-d   Y2a-d   */
1355     fiz3 = _mm256_hadd_ps(fiz3, _mm256_setzero_ps()); /*  0       0       0       Z3e-h   | 0       0       0       Z3a-d   */
1356
1357     /* Add across the two lanes by swapping and adding back */
1358     t1   = gmx_mm256_unpack128lo_ps(fix1, fiy2);                                       /*  Y3a-d   X3a-d   Z2a-d   Y2a-d | X2a-d   Z1a-d   Y1a-d   X1a-d */
1359     t2   = gmx_mm256_unpack128hi_ps(fix1, fiy2);                                       /*  Y3e-h   X3e-h   Z2e-h   Y2e-h | X2e-h   Z1e-h   Y1e-h   X1e-h */
1360     t1   = _mm256_add_ps(t1, t2);                                                      /* y3 x3 z2 y2 | x2 z1 y1 x1 */
1361
1362     tA   = _mm_add_ps(_mm256_castps256_ps128(fiz3), _mm256_extractf128_ps(fiz3, 0x1)); /* 0 0 0 z3 */
1363
1364     t3   = _mm256_loadu_ps(fptr);
1365     t3   = _mm256_add_ps(t3, t1);
1366     _mm256_storeu_ps(fptr, t3);
1367     tB   = _mm_load_ss(fptr+8);
1368     tB   = _mm_add_ss(tB, tA);
1369     _mm_store_ss(fptr+8, tB);
1370
1371     /* Add up shift force */
1372     tB   = _mm256_extractf128_ps(t1, 0x1);                                          /* y3 x3 z2 y2 */
1373     tC   = _mm_shuffle_ps(_mm256_castps256_ps128(t1), tB, _MM_SHUFFLE(1, 0, 3, 3)); /* z2 y2 x2 x2 */
1374     tB   = _mm_shuffle_ps(tB, tA, _MM_SHUFFLE(1, 0, 3, 2));                         /* 0 z3 y3 x3 */
1375     tC   = _mm_permute_ps(tC, _MM_SHUFFLE(3, 3, 2, 0));                             /*  - z2 y2 x2 */
1376
1377     tB   = _mm_add_ps(tB, _mm256_castps256_ps128(t1));
1378     tA   = _mm_add_ps(tB, tC);                      /*  - z y x */
1379
1380     tA   = _mm_blend_ps(_mm_setzero_ps(), tA, 0x7); /* 0 z y x */
1381
1382     tC   = _mm_loadu_ps(fshiftptr);
1383     tC   = _mm_add_ps(tC, tA);
1384     _mm_storeu_ps(fshiftptr, tC);
1385 }
1386 #endif
1387
1388
1389 #if defined (_MSC_VER) && defined(_M_IX86)
/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters.
 *
 * Reduces the eight SIMD elements of each of the twelve force registers to a
 * single sum, adds x1 y1 z1 ... x4 y4 z4 to the twelve floats at fptr, and
 * adds the total over the four atoms to the x/y/z floats at fshiftptr.
 * Four floats are loaded from and stored to fshiftptr; the fourth one only
 * has zero added to it.
 *
 * The reduction works on macro-local registers, so the force arguments are
 * left unmodified (as with the by-value function used on other compilers)
 * and need not be lvalues.  The body is wrapped in do { } while (0) so the
 * macro acts as one statement, and fptr is parenthesized before the +8
 * offset is applied.  fptr and fshiftptr are evaluated more than once, so
 * they must be free of side effects.
 */
#define gmx_mm256_update_iforce_4atom_swizzle_ps(fix1, fiy1, fiz1, fix2, fiy2, fiz2, fix3, fiy3, fiz3, fix4, fiy4, fiz4, \
                                                 fptr, fshiftptr) \
    do { \
        __m256 _s1, _s2, _s3, _s4, _s5, _s6; \
        __m256 _t1, _t2, _t3; \
        __m128 _tA, _tB, _tC; \
\
        /* Horizontal adds within each 128-bit lane, kept in local registers */ \
        _s1  = _mm256_hadd_ps(fix1, fiy1); \
        _s2  = _mm256_hadd_ps(fiz1, fix2); \
        _s3  = _mm256_hadd_ps(fiy2, fiz2); \
        _s4  = _mm256_hadd_ps(fix3, fiy3); \
        _s5  = _mm256_hadd_ps(fiz3, fix4); \
        _s6  = _mm256_hadd_ps(fiy4, fiz4); \
\
        _s1  = _mm256_hadd_ps(_s1, _s2); \
        _s3  = _mm256_hadd_ps(_s3, _s4); \
        _s5  = _mm256_hadd_ps(_s5, _s6); \
\
        /* Add across the two lanes and update the forces */ \
        _t1  = gmx_mm256_unpack128lo_ps(_s1, _s3); \
        _t2  = gmx_mm256_unpack128hi_ps(_s1, _s3); \
        _t1  = _mm256_add_ps(_t1, _t2); \
        _tA  = _mm_add_ps(_mm256_castps256_ps128(_s5), _mm256_extractf128_ps(_s5, 0x1)); \
        _t3  = _mm256_loadu_ps(fptr); \
        _t3  = _mm256_add_ps(_t3, _t1); \
        _mm256_storeu_ps(fptr, _t3); \
        _tB  = _mm_loadu_ps((fptr)+8); \
        _tB  = _mm_add_ps(_tB, _tA); \
        _mm_storeu_ps((fptr)+8, _tB); \
\
        /* Add up the shift force */ \
        _tB  = _mm256_extractf128_ps(_t1, 0x1); \
        _tC  = _mm_shuffle_ps(_mm256_castps256_ps128(_t1), _tB, _MM_SHUFFLE(1, 0, 3, 3)); \
        _tB  = _mm_shuffle_ps(_tB, _tA, _MM_SHUFFLE(1, 0, 3, 2)); \
        _tC  = _mm_permute_ps(_tC, _MM_SHUFFLE(3, 3, 2, 0)); \
        _tA  = _mm_permute_ps(_tA, _MM_SHUFFLE(0, 3, 2, 1)); \
        _tB  = _mm_add_ps(_tB, _mm256_castps256_ps128(_t1)); \
        _tA  = _mm_add_ps(_tA, _tC); \
        _tA  = _mm_add_ps(_tA, _tB); \
        _tA  = _mm_blend_ps(_mm_setzero_ps(), _tA, 0x7); \
        _tC  = _mm_loadu_ps(fshiftptr); \
        _tC  = _mm_add_ps(_tC, _tA); \
        _mm_storeu_ps(fshiftptr, _tC); \
    } while (0)
1432 #else
1433 /* Real function for sane compilers */
1434 static gmx_inline void
1435 gmx_mm256_update_iforce_4atom_swizzle_ps(__m256 fix1, __m256 fiy1, __m256 fiz1,
1436                                          __m256 fix2, __m256 fiy2, __m256 fiz2,
1437                                          __m256 fix3, __m256 fiy3, __m256 fiz3,
1438                                          __m256 fix4, __m256 fiy4, __m256 fiz4,
1439                                          float * gmx_restrict fptr,
1440                                          float * gmx_restrict fshiftptr)
1441 {
1442     __m256 t1, t2, t3;
1443     __m128 tA, tB, tC;
1444
1445     fix1 = _mm256_hadd_ps(fix1, fiy1);                /*  Y1g+Y1h Y1e+Y1f X1g+X1h X1e+X1f | Y1c+Y1d Y1a+Y1b X1c+X1d X1a+X1b */
1446     fiz1 = _mm256_hadd_ps(fiz1, fix2);                /*  X2g+X2h X2e+X2f Z1g+Z1h Z1e+Z1f | X2c+X2d X2a+X2b Z1c+Z1d Z1a+Z1b */
1447     fiy2 = _mm256_hadd_ps(fiy2, fiz2);                /*  Z2g+Z2h Z2e+Z2f Y2g+Y2h Y2e+Y2f | Z2c+Z2d Z2a+Z2b Y2c+Y2d Y2a+Y2b */
1448     fix3 = _mm256_hadd_ps(fix3, fiy3);                /*  Y3g+Y3h Y3e+Y3f X3g+X3h X3e+X3f | Y3c+Y3d Y3a+Y3b X3c+X3d X3a+X3b */
1449     fiz3 = _mm256_hadd_ps(fiz3, fix4);                /*  X4g+X4h X4e+X4f Z3g+Z3h Z3e+Z3f | X4c+X4d X4a+X4b Z3c+Z3d Z3a+Z3b */
1450     fiy4 = _mm256_hadd_ps(fiy4, fiz4);                /*  Z4g+Z4h Z4e+Z4f Y4g+Y4h Y4e+Y4f | Z4c+Z4d Z4a+Z4b Y4c+Y4d Y4a+Y4b */
1451
1452     fix1 = _mm256_hadd_ps(fix1, fiz1);                /*  X2e-h   Z1e-h   Y1e-h   X1e-h   | X2a-d   Z1a-d   Y1a-d   X1a-d   */
1453     fiy2 = _mm256_hadd_ps(fiy2, fix3);                /*  Y3e-h   X3e-h   Z2e-h   Y2e-h   | Y3a-d   X3a-d   Z2a-d   Y2a-d   */
1454     fiz3 = _mm256_hadd_ps(fiz3, fiy4);                /*  Z4e-h   Y4e-h   X4e-h   Z3e-h   | Z4a-d   Y4a-d   X4a-d   Z3a-d   */
1455
1456     /* Add across the two lanes by swapping and adding back */
1457     t1   = gmx_mm256_unpack128lo_ps(fix1, fiy2);                                       /*  Y3a-d   X3a-d   Z2a-d   Y2a-d | X2a-d   Z1a-d   Y1a-d   X1a-d */
1458     t2   = gmx_mm256_unpack128hi_ps(fix1, fiy2);                                       /*  Y3e-h   X3e-h   Z2e-h   Y2e-h | X2e-h   Z1e-h   Y1e-h   X1e-h */
1459     t1   = _mm256_add_ps(t1, t2);                                                      /* y3 x3 z2 y2 | x2 z1 y1 x1 */
1460
1461     tA   = _mm_add_ps(_mm256_castps256_ps128(fiz3), _mm256_extractf128_ps(fiz3, 0x1)); /* z4 y4 x4 z3 */
1462
1463     t3   = _mm256_loadu_ps(fptr);
1464     t3   = _mm256_add_ps(t3, t1);
1465     _mm256_storeu_ps(fptr, t3);
1466
1467     tB   = _mm_loadu_ps(fptr+8);
1468     tB   = _mm_add_ps(tB, tA);
1469     _mm_storeu_ps(fptr+8, tB);
1470
1471     /* Add up shift force */
1472     tB   = _mm256_extractf128_ps(t1, 0x1);                                          /* y3 x3 z2 y2 */
1473     tC   = _mm_shuffle_ps(_mm256_castps256_ps128(t1), tB, _MM_SHUFFLE(1, 0, 3, 3)); /* z2 y2 x2 x2 */
1474     tB   = _mm_shuffle_ps(tB, tA, _MM_SHUFFLE(1, 0, 3, 2));                         /* 0 z3 y3 x3 */
1475     tC   = _mm_permute_ps(tC, _MM_SHUFFLE(3, 3, 2, 0));                             /*  - z2 y2 x2 */
1476     tA   = _mm_permute_ps(tA, _MM_SHUFFLE(0, 3, 2, 1));                             /* - z4 y4 x4 */
1477
1478     tB   = _mm_add_ps(tB, _mm256_castps256_ps128(t1));
1479     tA   = _mm_add_ps(tA, tC);
1480     tA   = _mm_add_ps(tA, tB);
1481
1482     tA   = _mm_blend_ps(_mm_setzero_ps(), tA, 0x7); /* 0 z y x */
1483
1484     tC   = _mm_loadu_ps(fshiftptr);
1485     tC   = _mm_add_ps(tC, tA);
1486     _mm_storeu_ps(fshiftptr, tC);
1487 }
1488 #endif
1489
1490
1491
1492 static gmx_inline void
1493 gmx_mm256_update_1pot_ps(__m256 pot1, float * gmx_restrict ptrA)
1494 {
1495     __m128 t1;
1496
1497     pot1 = _mm256_hadd_ps(pot1, pot1);
1498     pot1 = _mm256_hadd_ps(pot1, pot1);
1499
1500     t1   = _mm_add_ps(_mm256_castps256_ps128(pot1), _mm256_extractf128_ps(pot1, 0x1));
1501
1502     _mm_store_ss(ptrA, _mm_add_ss(_mm_load_ss(ptrA), t1));
1503 }
1504
1505 static gmx_inline void
1506 gmx_mm256_update_2pot_ps(__m256 pot1, float * gmx_restrict ptrA,
1507                          __m256 pot2, float * gmx_restrict ptrB)
1508 {
1509     __m128 t1, t2;
1510
1511     pot1 = _mm256_hadd_ps(pot1, pot2);
1512     pot1 = _mm256_hadd_ps(pot1, pot1);
1513
1514     t1   = _mm_add_ps(_mm256_castps256_ps128(pot1), _mm256_extractf128_ps(pot1, 0x1));
1515
1516     t2   = _mm_permute_ps(t1, _MM_SHUFFLE(1, 1, 1, 1));
1517     _mm_store_ss(ptrA, _mm_add_ss(_mm_load_ss(ptrA), t1));
1518     _mm_store_ss(ptrB, _mm_add_ss(_mm_load_ss(ptrB), t2));
1519 }
1520
1521
1522 #endif /* _kernelutil_x86_avx_256_single_h_ */