src/gmxlib/nonbonded/nb_kernel_avx_256_single/kernelutil_x86_avx_256_single.h

   1 /*
   2  *                This source code is part of
   3  *
   4  *                 G   R   O   M   A   C   S
   5  *
   6  * Copyright (c) 2011-2012, The GROMACS Development Team
   7  *
   8  * Gromacs is a library for molecular simulation and trajectory analysis,
   9  * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
  10  * a full list of developers and information, check out http://www.gromacs.org
  11  *
  12  * This program is free software; you can redistribute it and/or modify it under
  13  * the terms of the GNU Lesser General Public License as published by the Free
  14  * Software Foundation; either version 2 of the License, or (at your option) any
  15  * later version.
  16  * As a special exception, you may use this file as part of a free software
  17  * library without restriction.  Specifically, if other files instantiate
  18  * templates or use macros or inline functions from this file, or you compile
  19  * this file and link it with other files to produce an executable, this
  20  * file does not by itself cause the resulting executable to be covered by
  21  * the GNU Lesser General Public License.
  22  *
  23  * In plain-speak: do not worry about classes/macros/templates either - only
  24  * changes to the library have to be LGPL, not an application linking with it.
  25  *
  26  * To help fund GROMACS development, we humbly ask that you cite
  27  * the papers people have written on it - you can find them on the website!
  28  */
  29 #ifndef _kernelutil_x86_avx_256_single_h_
  30 #define _kernelutil_x86_avx_256_single_h_
  31
  32 #include "gmx_x86_avx_256.h"
  33
  34 /* Transpose lower/upper half of 256-bit registers separately */
  35 #define GMX_MM256_HALFTRANSPOSE4_PS(ymm0,ymm1,ymm2,ymm3) {            \
  36     __m256 __tmp0,__tmp1,__tmp2,__tmp3;                               \
  37                                                                       \
  38     __tmp0   = _mm256_unpacklo_ps((ymm0),(ymm1));                     \
  39     __tmp1   = _mm256_unpacklo_ps((ymm2),(ymm3));                     \
  40     __tmp2   = _mm256_unpackhi_ps((ymm0),(ymm1));                     \
  41     __tmp3   = _mm256_unpackhi_ps((ymm2),(ymm3));                     \
  42     ymm0     = _mm256_shuffle_ps(__tmp0,__tmp1,_MM_SHUFFLE(1,0,1,0)); \
  43     ymm1     = _mm256_shuffle_ps(__tmp0,__tmp1,_MM_SHUFFLE(3,2,3,2)); \
  44     ymm2     = _mm256_shuffle_ps(__tmp2,__tmp3,_MM_SHUFFLE(1,0,1,0)); \
  45     ymm3     = _mm256_shuffle_ps(__tmp2,__tmp3,_MM_SHUFFLE(3,2,3,2)); \
  46 }
  47
  48
  49 static gmx_inline __m256
  50 gmx_mm256_calc_rsq_ps(__m256 dx, __m256 dy, __m256 dz)
  51 {
  52     return _mm256_add_ps( _mm256_add_ps( _mm256_mul_ps(dx,dx), _mm256_mul_ps(dy,dy) ), _mm256_mul_ps(dz,dz) );
  53 }
  54
  55 /* Normal sum of four ymm registers */
  56 #define gmx_mm256_sum4_ps(t0,t1,t2,t3)  _mm256_add_ps(_mm256_add_ps(t0,t1),_mm256_add_ps(t2,t3))
  57
  58
  59 static gmx_inline int
  60 gmx_mm256_any_lt(__m256 a, __m256 b)
  61 {
  62     return _mm256_movemask_ps(_mm256_cmp_ps(a,b,_CMP_LT_OQ));
  63 }
  64
  65
  66 static gmx_inline __m256
  67 gmx_mm256_load_4real_swizzle_ps(const float * gmx_restrict ptrA, const float * gmx_restrict ptrB,
  68                                 const float * gmx_restrict ptrC, const float * gmx_restrict ptrD)
  69 {
  70     __m128 t1,t2;
  71
  72     t1 = _mm_unpacklo_ps(_mm_load_ss(ptrA),_mm_load_ss(ptrC));
  73     t2 = _mm_unpacklo_ps(_mm_load_ss(ptrB),_mm_load_ss(ptrD));
  74     return _mm256_castps128_ps256(_mm_unpacklo_ps(t1,t2));
  75 }
  76
  77
  78 static gmx_inline __m256
  79 gmx_mm256_load_8real_swizzle_ps(const float * gmx_restrict ptrA, const float * gmx_restrict ptrB,
  80                                 const float * gmx_restrict ptrC, const float * gmx_restrict ptrD,
  81                                 const float * gmx_restrict ptrE, const float * gmx_restrict ptrF,
  82                                 const float * gmx_restrict ptrG, const float * gmx_restrict ptrH)
  83 {
  84     __m256 t1,t2;
  85
  86     t1 = gmx_mm256_load_4real_swizzle_ps(ptrA,ptrB,ptrC,ptrD);
  87     t2 = gmx_mm256_load_4real_swizzle_ps(ptrE,ptrF,ptrG,ptrH);
  88
  89     return _mm256_permute2f128_ps(t1,t2,0x20);
  90 }
  91
  92
  93
  94 static gmx_inline void
  95 gmx_mm256_store_4real_swizzle_ps(float * gmx_restrict ptrA, float * gmx_restrict ptrB,
  96                                  float * gmx_restrict ptrC, float * gmx_restrict ptrD, __m256 xmm1)
  97 {
  98     __m256 t2,t3,t4;
  99
 100     t2       = _mm256_permute_ps(xmm1,_MM_SHUFFLE(1,1,1,1));
 101     t3       = _mm256_permute_ps(xmm1,_MM_SHUFFLE(2,2,2,2));
 102     t4       = _mm256_permute_ps(xmm1,_MM_SHUFFLE(3,3,3,3));
 103     _mm_store_ss(ptrA,_mm256_castps256_ps128(xmm1));
 104     _mm_store_ss(ptrB,_mm256_castps256_ps128(t2));
 105     _mm_store_ss(ptrC,_mm256_castps256_ps128(t3));
 106     _mm_store_ss(ptrD,_mm256_castps256_ps128(t4));
 107 }
 108
 109
 110 static gmx_inline void
 111 gmx_mm256_store_8real_swizzle_ps(float * gmx_restrict ptrA, float * gmx_restrict ptrB,
 112                                  float * gmx_restrict ptrC, float * gmx_restrict ptrD,
 113                                  float * gmx_restrict ptrE, float * gmx_restrict ptrF,
 114                                  float * gmx_restrict ptrG, float * gmx_restrict ptrH, __m256 xmm1)
 115 {
 116     __m256 t1;
 117
 118     t1 = _mm256_permute2f128_ps(xmm1,xmm1,0x11);
 119
 120     gmx_mm256_store_4real_swizzle_ps(ptrA,ptrB,ptrC,ptrD,xmm1);
 121     gmx_mm256_store_4real_swizzle_ps(ptrE,ptrF,ptrG,ptrH,t1);
 122 }
 123
 124
 125 static gmx_inline void
 126 gmx_mm256_increment_4real_swizzle_ps(float * gmx_restrict ptrA, float * gmx_restrict ptrB,
 127                                      float * gmx_restrict ptrC, float * gmx_restrict ptrD,
 128                                      __m256 xmm1)
 129 {
 130     __m128 t1,t2,t3,t4;
 131
 132     t1   = _mm256_castps256_ps128(xmm1);
 133     t2   = _mm_permute_ps(t1,_MM_SHUFFLE(1,1,1,1));
 134     t3   = _mm_permute_ps(t1,_MM_SHUFFLE(2,2,2,2));
 135     t4   = _mm_permute_ps(t1,_MM_SHUFFLE(3,3,3,3));
 136
 137     t1   = _mm_add_ss(t1,_mm_load_ss(ptrA));
 138     t2   = _mm_add_ss(t2,_mm_load_ss(ptrB));
 139     t3   = _mm_add_ss(t3,_mm_load_ss(ptrC));
 140     t4   = _mm_add_ss(t4,_mm_load_ss(ptrD));
 141
 142     _mm_store_ss(ptrA,t1);
 143     _mm_store_ss(ptrB,t2);
 144     _mm_store_ss(ptrC,t3);
 145     _mm_store_ss(ptrD,t4);
 146 }
 147
 148 static gmx_inline void
 149 gmx_mm256_increment_8real_swizzle_ps(float * gmx_restrict ptrA, float * gmx_restrict ptrB,
 150                                      float * gmx_restrict ptrC, float * gmx_restrict ptrD,
 151                                      float * gmx_restrict ptrE, float * gmx_restrict ptrF,
 152                                      float * gmx_restrict ptrG, float * gmx_restrict ptrH,
 153                                      __m256 xmm1)
 154 {
 155     __m256 t1;
 156
 157     t1 = _mm256_permute2f128_ps(xmm1,xmm1,0x11);
 158
 159     gmx_mm256_increment_4real_swizzle_ps(ptrA,ptrB,ptrC,ptrD,xmm1);
 160     gmx_mm256_increment_4real_swizzle_ps(ptrE,ptrF,ptrG,ptrH,t1);
 161 }
 162
 163
 164 static gmx_inline void
 165 gmx_mm256_load_4pair_swizzle_ps(const float * gmx_restrict p1, const float * gmx_restrict p2,
 166                                 const float * gmx_restrict p3, const float * gmx_restrict p4,
 167                                 __m256 * gmx_restrict c6, __m256 * gmx_restrict c12)
 168 {
 169     __m128 t1,t2,t3,t4;
 170
 171     t1   = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)p1);   /* - - c12a  c6a */
 172     t2   = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)p2);   /* - - c12b  c6b */
 173     t3   = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)p3);   /* - - c12c  c6c */
 174     t4   = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)p4);   /* - - c12d  c6d */
 175
 176     t1   = _mm_unpacklo_ps(t1,t2); /* c12b c12a c6b c6a */
 177     t3   = _mm_unpacklo_ps(t3,t4); /* c12d c12c c6d c6c */
 178
 179     *c6  = _mm256_castps128_ps256(_mm_shuffle_ps(t1,t3,_MM_SHUFFLE(1,0,1,0)));
 180     *c12 = _mm256_castps128_ps256(_mm_shuffle_ps(t1,t3,_MM_SHUFFLE(3,2,3,2)));
 181 }
 182
 183 static gmx_inline void
 184 gmx_mm256_load_8pair_swizzle_ps(const float * gmx_restrict p1, const float * gmx_restrict p2,
 185                                 const float * gmx_restrict p3, const float * gmx_restrict p4,
 186                                 const float * gmx_restrict p5, const float * gmx_restrict p6,
 187                                 const float * gmx_restrict p7, const float * gmx_restrict p8,
 188                                 __m256 * gmx_restrict c6, __m256 * gmx_restrict c12)
 189 {
 190     __m256 c6l,c6h,c12l,c12h;
 191
 192     gmx_mm256_load_4pair_swizzle_ps(p1,p2,p3,p4,&c6l,&c12l);
 193     gmx_mm256_load_4pair_swizzle_ps(p5,p6,p7,p8,&c6h,&c12h);
 194
 195     *c6  = _mm256_permute2f128_ps(c6l,c6h,0x20);
 196     *c12 = _mm256_permute2f128_ps(c12l,c12h,0x20);
 197 }
 198
 199
 200 static gmx_inline void
 201 gmx_mm256_load_shift_and_1rvec_broadcast_ps(const float * gmx_restrict xyz_shift,
 202                                             const float * gmx_restrict xyz,
 203                                             __m256 * gmx_restrict x1,
 204                                             __m256 * gmx_restrict y1,
 205                                             __m256 * gmx_restrict z1)
 206 {
 207     __m128 t1,t2,t3,t4;
 208
 209     t1   = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)xyz_shift);
 210     t2   = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)xyz);
 211     t3   = _mm_load_ss(xyz_shift+2);
 212     t4   = _mm_load_ss(xyz+2);
 213     t1   = _mm_add_ps(t1,t2);
 214     t3   = _mm_add_ss(t3,t4);
 215
 216     t2   = _mm_permute_ps(t1,_MM_SHUFFLE(1,1,1,1));
 217     t1   = _mm_permute_ps(t1,_MM_SHUFFLE(0,0,0,0));
 218     t3   = _mm_permute_ps(t3,_MM_SHUFFLE(0,0,0,0));
 219
 220     *x1  = gmx_mm256_set_m128(t1,t1);
 221     *y1  = gmx_mm256_set_m128(t2,t2);
 222     *z1  = gmx_mm256_set_m128(t3,t3);
 223 }
 224
 225
 226 static gmx_inline void
 227 gmx_mm256_load_shift_and_3rvec_broadcast_ps(const float * gmx_restrict xyz_shift,
 228                                             const float * gmx_restrict xyz,
 229                                             __m256 * gmx_restrict x1, __m256 * gmx_restrict y1, __m256 * gmx_restrict z1,
 230                                             __m256 * gmx_restrict x2, __m256 * gmx_restrict y2, __m256 * gmx_restrict z2,
 231                                             __m256 * gmx_restrict x3, __m256 * gmx_restrict y3, __m256 * gmx_restrict z3)
 232 {
 233     __m128 tA,tB;
 234     __m128 t1,t2,t3,t4,t5,t6,t7,t8,t9;
 235
 236     tA   = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)xyz_shift);
 237     tB   = _mm_load_ss(xyz_shift+2);
 238
 239     t1   = _mm_loadu_ps(xyz);
 240     t2   = _mm_loadu_ps(xyz+4);
 241     t3   = _mm_load_ss(xyz+8);
 242
 243     tA   = _mm_movelh_ps(tA,tB);
 244     t4   = _mm_permute_ps(tA,_MM_SHUFFLE(0,2,1,0));
 245     t5   = _mm_permute_ps(tA,_MM_SHUFFLE(1,0,2,1));
 246     t6   = _mm_permute_ps(tA,_MM_SHUFFLE(2,1,0,2));
 247
 248     t1   = _mm_add_ps(t1,t4);
 249     t2   = _mm_add_ps(t2,t5);
 250     t3   = _mm_add_ss(t3,t6);
 251
 252     t9   = _mm_permute_ps(t3,_MM_SHUFFLE(0,0,0,0));
 253     t8   = _mm_permute_ps(t2,_MM_SHUFFLE(3,3,3,3));
 254     t7   = _mm_permute_ps(t2,_MM_SHUFFLE(2,2,2,2));
 255     t6   = _mm_permute_ps(t2,_MM_SHUFFLE(1,1,1,1));
 256     t5   = _mm_permute_ps(t2,_MM_SHUFFLE(0,0,0,0));
 257     t4   = _mm_permute_ps(t1,_MM_SHUFFLE(3,3,3,3));
 258     t3   = _mm_permute_ps(t1,_MM_SHUFFLE(2,2,2,2));
 259     t2   = _mm_permute_ps(t1,_MM_SHUFFLE(1,1,1,1));
 260     t1   = _mm_permute_ps(t1,_MM_SHUFFLE(0,0,0,0));
 261
 262     *x1  = gmx_mm256_set_m128(t1,t1);
 263     *y1  = gmx_mm256_set_m128(t2,t2);
 264     *z1  = gmx_mm256_set_m128(t3,t3);
 265     *x2  = gmx_mm256_set_m128(t4,t4);
 266     *y2  = gmx_mm256_set_m128(t5,t5);
 267     *z2  = gmx_mm256_set_m128(t6,t6);
 268     *x3  = gmx_mm256_set_m128(t7,t7);
 269     *y3  = gmx_mm256_set_m128(t8,t8);
 270     *z3  = gmx_mm256_set_m128(t9,t9);
 271 }
 272
 273
 274 static gmx_inline void
 275 gmx_mm256_load_shift_and_4rvec_broadcast_ps(const float * gmx_restrict xyz_shift,
 276                                             const float * gmx_restrict xyz,
 277                                             __m256 * gmx_restrict x1, __m256 * gmx_restrict y1, __m256 * gmx_restrict z1,
 278                                             __m256 * gmx_restrict x2, __m256 * gmx_restrict y2, __m256 * gmx_restrict z2,
 279                                             __m256 * gmx_restrict x3, __m256 * gmx_restrict y3, __m256 * gmx_restrict z3,
 280                                             __m256 * gmx_restrict x4, __m256 * gmx_restrict y4, __m256 * gmx_restrict z4)
 281 {
 282     __m128 tA,tB;
 283     __m128 t1,t2,t3,t4,t5,t6,t7,t8,t9,t10,t11,t12;
 284
 285     tA   = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)xyz_shift);
 286     tB   = _mm_load_ss(xyz_shift+2);
 287
 288     t1   = _mm_loadu_ps(xyz);
 289     t2   = _mm_loadu_ps(xyz+4);
 290     t3   = _mm_loadu_ps(xyz+8);
 291
 292     tA   = _mm_movelh_ps(tA,tB);
 293     t4   = _mm_permute_ps(tA,_MM_SHUFFLE(0,2,1,0));
 294     t5   = _mm_permute_ps(tA,_MM_SHUFFLE(1,0,2,1));
 295     t6   = _mm_permute_ps(tA,_MM_SHUFFLE(2,1,0,2));
 296
 297     t1   = _mm_add_ps(t1,t4);
 298     t2   = _mm_add_ps(t2,t5);
 299     t3   = _mm_add_ps(t3,t6);
 300
 301     t12  = _mm_permute_ps(t3,_MM_SHUFFLE(3,3,3,3));
 302     t11  = _mm_permute_ps(t3,_MM_SHUFFLE(2,2,2,2));
 303     t10  = _mm_permute_ps(t3,_MM_SHUFFLE(1,1,1,1));
 304     t9   = _mm_permute_ps(t3,_MM_SHUFFLE(0,0,0,0));
 305     t8   = _mm_permute_ps(t2,_MM_SHUFFLE(3,3,3,3));
 306     t7   = _mm_permute_ps(t2,_MM_SHUFFLE(2,2,2,2));
 307     t6   = _mm_permute_ps(t2,_MM_SHUFFLE(1,1,1,1));
 308     t5   = _mm_permute_ps(t2,_MM_SHUFFLE(0,0,0,0));
 309     t4   = _mm_permute_ps(t1,_MM_SHUFFLE(3,3,3,3));
 310     t3   = _mm_permute_ps(t1,_MM_SHUFFLE(2,2,2,2));
 311     t2   = _mm_permute_ps(t1,_MM_SHUFFLE(1,1,1,1));
 312     t1   = _mm_permute_ps(t1,_MM_SHUFFLE(0,0,0,0));
 313
 314     *x1  = gmx_mm256_set_m128(t1,t1);
 315     *y1  = gmx_mm256_set_m128(t2,t2);
 316     *z1  = gmx_mm256_set_m128(t3,t3);
 317     *x2  = gmx_mm256_set_m128(t4,t4);
 318     *y2  = gmx_mm256_set_m128(t5,t5);
 319     *z2  = gmx_mm256_set_m128(t6,t6);
 320     *x3  = gmx_mm256_set_m128(t7,t7);
 321     *y3  = gmx_mm256_set_m128(t8,t8);
 322     *z3  = gmx_mm256_set_m128(t9,t9);
 323     *x4  = gmx_mm256_set_m128(t10,t10);
 324     *y4  = gmx_mm256_set_m128(t11,t11);
 325     *z4  = gmx_mm256_set_m128(t12,t12);
 326 }
 327
 328
 329
 330 static gmx_inline void
 331 gmx_mm256_load_1rvec_4ptr_swizzle_ps(const float * gmx_restrict ptrA, const float * gmx_restrict ptrB,
 332                                      const float * gmx_restrict ptrC, const float * gmx_restrict ptrD,
 333                                      __m256 * gmx_restrict x1, __m256 * gmx_restrict y1, __m256 * gmx_restrict z1)
 334 {
 335     __m128 t1,t2,t3,t4;
 336     __m128i mask = _mm_set_epi32(0,-1,-1,-1);
 337     t1             = gmx_mm_maskload_ps(ptrA,mask);
 338     t2             = gmx_mm_maskload_ps(ptrB,mask);
 339     t3             = gmx_mm_maskload_ps(ptrC,mask);
 340     t4             = gmx_mm_maskload_ps(ptrD,mask);
 341     _MM_TRANSPOSE4_PS(t1,t2,t3,t4);
 342     *x1           = _mm256_castps128_ps256(t1);
 343     *y1           = _mm256_castps128_ps256(t2);
 344     *z1           = _mm256_castps128_ps256(t3);
 345 }
 346
 347
 348 static gmx_inline void
 349 gmx_mm256_load_3rvec_4ptr_swizzle_ps(const float * gmx_restrict ptrA, const float * gmx_restrict ptrB,
 350                                      const float * gmx_restrict ptrC, const float * gmx_restrict ptrD,
 351                                      __m256 * gmx_restrict x1, __m256 * gmx_restrict y1, __m256 * gmx_restrict z1,
 352                                      __m256 * gmx_restrict x2, __m256 * gmx_restrict y2, __m256 * gmx_restrict z2,
 353                                      __m256 * gmx_restrict x3, __m256 * gmx_restrict y3, __m256 * gmx_restrict z3)
 354 {
 355     __m128 t1,t2,t3,t4;
 356     t1            = _mm_loadu_ps(ptrA);
 357     t2            = _mm_loadu_ps(ptrB);
 358     t3            = _mm_loadu_ps(ptrC);
 359     t4            = _mm_loadu_ps(ptrD);
 360     _MM_TRANSPOSE4_PS(t1,t2,t3,t4);
 361     *x1           = _mm256_castps128_ps256(t1);
 362     *y1           = _mm256_castps128_ps256(t2);
 363     *z1           = _mm256_castps128_ps256(t3);
 364     *x2           = _mm256_castps128_ps256(t4);
 365     t1            = _mm_loadu_ps(ptrA+4);
 366     t2            = _mm_loadu_ps(ptrB+4);
 367     t3            = _mm_loadu_ps(ptrC+4);
 368     t4            = _mm_loadu_ps(ptrD+4);
 369     _MM_TRANSPOSE4_PS(t1,t2,t3,t4);
 370     *y2           = _mm256_castps128_ps256(t1);
 371     *z2           = _mm256_castps128_ps256(t2);
 372     *x3           = _mm256_castps128_ps256(t3);
 373     *y3           = _mm256_castps128_ps256(t4);
 374     t1            = _mm_load_ss(ptrA+8);
 375     t2            = _mm_load_ss(ptrB+8);
 376     t3            = _mm_load_ss(ptrC+8);
 377     t4            = _mm_load_ss(ptrD+8);
 378     t1            = _mm_unpacklo_ps(t1,t3);
 379     t3            = _mm_unpacklo_ps(t2,t4);
 380     *z3           = _mm256_castps128_ps256(_mm_unpacklo_ps(t1,t3));
 381 }
 382
 383
 384
 385 static gmx_inline void
 386 gmx_mm256_load_4rvec_4ptr_swizzle_ps(const float * gmx_restrict ptrA, const float * gmx_restrict ptrB,
 387                                      const float * gmx_restrict ptrC, const float * gmx_restrict ptrD,
 388                                      __m256 * gmx_restrict x1, __m256 * gmx_restrict y1, __m256 * gmx_restrict z1,
 389                                      __m256 * gmx_restrict x2, __m256 * gmx_restrict y2, __m256 * gmx_restrict z2,
 390                                      __m256 * gmx_restrict x3, __m256 * gmx_restrict y3, __m256 * gmx_restrict z3,
 391                                      __m256 * gmx_restrict x4, __m256 * gmx_restrict y4, __m256 * gmx_restrict z4)
 392 {
 393     __m128 t1,t2,t3,t4;
 394     t1            = _mm_loadu_ps(ptrA);
 395     t2            = _mm_loadu_ps(ptrB);
 396     t3            = _mm_loadu_ps(ptrC);
 397     t4            = _mm_loadu_ps(ptrD);
 398     _MM_TRANSPOSE4_PS(t1,t2,t3,t4);
 399     *x1           = _mm256_castps128_ps256(t1);
 400     *y1           = _mm256_castps128_ps256(t2);
 401     *z1           = _mm256_castps128_ps256(t3);
 402     *x2           = _mm256_castps128_ps256(t4);
 403     t1            = _mm_loadu_ps(ptrA+4);
 404     t2            = _mm_loadu_ps(ptrB+4);
 405     t3            = _mm_loadu_ps(ptrC+4);
 406     t4            = _mm_loadu_ps(ptrD+4);
 407     _MM_TRANSPOSE4_PS(t1,t2,t3,t4);
 408     *y2           = _mm256_castps128_ps256(t1);
 409     *z2           = _mm256_castps128_ps256(t2);
 410     *x3           = _mm256_castps128_ps256(t3);
 411     *y3           = _mm256_castps128_ps256(t4);
 412     t1            = _mm_loadu_ps(ptrA+8);
 413     t2            = _mm_loadu_ps(ptrB+8);
 414     t3            = _mm_loadu_ps(ptrC+8);
 415     t4            = _mm_loadu_ps(ptrD+8);
 416     _MM_TRANSPOSE4_PS(t1,t2,t3,t4);
 417     *z3           = _mm256_castps128_ps256(t1);
 418     *x4           = _mm256_castps128_ps256(t2);
 419     *y4           = _mm256_castps128_ps256(t3);
 420     *z4           = _mm256_castps128_ps256(t4);
 421 }
 422
 423
 424 static gmx_inline void
 425 gmx_mm256_load_1rvec_8ptr_swizzle_ps(const float * gmx_restrict ptrA, const float * gmx_restrict ptrB,
 426                                      const float * gmx_restrict ptrC, const float * gmx_restrict ptrD,
 427                                      const float * gmx_restrict ptrE, const float * gmx_restrict ptrF,
 428                                      const float * gmx_restrict ptrG, const float * gmx_restrict ptrH,
 429                                      __m256 * gmx_restrict x1, __m256 * gmx_restrict y1, __m256 * gmx_restrict z1)
 430 {
 431     __m256 t1,t2,t3,t4,t5,t6,t7,t8;
 432     __m128i mask = _mm_set_epi32(0,-1,-1,-1);
 433
 434     t1             = gmx_mm256_set_m128(gmx_mm_maskload_ps(ptrE,mask),gmx_mm_maskload_ps(ptrA,mask)); /*  - zE yE xE |  - zA yA xA */
 435     t2             = gmx_mm256_set_m128(gmx_mm_maskload_ps(ptrF,mask),gmx_mm_maskload_ps(ptrB,mask)); /*  - zF yF xF |  - zB yB xB */
 436     t3             = gmx_mm256_set_m128(gmx_mm_maskload_ps(ptrG,mask),gmx_mm_maskload_ps(ptrC,mask)); /*  - zG yG xG |  - zC yC xC */
 437     t4             = gmx_mm256_set_m128(gmx_mm_maskload_ps(ptrH,mask),gmx_mm_maskload_ps(ptrD,mask)); /*  - zH yH xH |  - zD yD xD */
 438
 439     t5            = _mm256_unpacklo_ps(t1,t2); /* yF yE xF xE | yB yA xB xA */
 440     t6            = _mm256_unpacklo_ps(t3,t4); /* yH yG xH xG | yD yC xD xC */
 441     t7            = _mm256_unpackhi_ps(t1,t2); /*  -  - zF zE |  -  - zB zA */
 442     t8            = _mm256_unpackhi_ps(t3,t4); /*  -  - zH zG |  -  - zD zC */
 443
 444     *x1           = _mm256_shuffle_ps(t5,t6,_MM_SHUFFLE(1,0,1,0));
 445     *y1           = _mm256_shuffle_ps(t5,t6,_MM_SHUFFLE(3,2,3,2));
 446     *z1           = _mm256_shuffle_ps(t7,t8,_MM_SHUFFLE(1,0,1,0));
 447 }
 448
 449
 450 static gmx_inline void
 451 gmx_mm256_load_3rvec_8ptr_swizzle_ps(const float * gmx_restrict ptrA, const float * gmx_restrict ptrB,
 452                                      const float * gmx_restrict ptrC, const float * gmx_restrict ptrD,
 453                                      const float * gmx_restrict ptrE, const float * gmx_restrict ptrF,
 454                                      const float * gmx_restrict ptrG, const float * gmx_restrict ptrH,
 455                                      __m256 * gmx_restrict x1, __m256 * gmx_restrict y1, __m256 * gmx_restrict z1,
 456                                      __m256 * gmx_restrict x2, __m256 * gmx_restrict y2, __m256 * gmx_restrict z2,
 457                                      __m256 * gmx_restrict x3, __m256 * gmx_restrict y3, __m256 * gmx_restrict z3)
 458 {
 459     __m256 t1,t2,t3,t4,t5,t6,t7,t8,t9,t10,t11,t12;
 460
 461     t1           = _mm256_loadu_ps(ptrA);        /* y3a x3a z2a y2a | x2a z1a y1a x1a */
 462     t2           = _mm256_loadu_ps(ptrB);        /* y3b x3b z2b y2b | x2b z1b y1b x1b */
 463     t3           = _mm256_loadu_ps(ptrC);        /* y3c x3c z2c y2c | x2c z1c y1c x1c */
 464     t4           = _mm256_loadu_ps(ptrD);        /* y3d x3d z2d y2d | x2d z1d y1d x1d */
 465     t5           = _mm256_loadu_ps(ptrE);        /* y3e x3e z2e y2e | x2e z1e y1e x1e */
 466     t6           = _mm256_loadu_ps(ptrF);        /* y3f x3f z2f y2f | x2f z1f y1f x1f */
 467     t7           = _mm256_loadu_ps(ptrG);        /* y3g x3g z2g y2g | x2g z1g y1g x1g */
 468     t8           = _mm256_loadu_ps(ptrH);        /* y3h x3h z2h y2h | x2h z1h y1h x1h */
 469
 470     t9           = _mm256_unpacklo_ps(t1,t2);    /* z2b z2a y2b y2a | y1b y1a x1b x1a */
 471     t10          = _mm256_unpackhi_ps(t1,t2);    /* y3b y3a x3b x3a | x2b x2a z1b z1a */
 472     t11          = _mm256_unpacklo_ps(t3,t4);    /* z2d z2c y2d y2c | y1d y1c x1d x1c */
 473     t12          = _mm256_unpackhi_ps(t3,t4);    /* y3d y3c x3d x3c | x2d x2c z1d z1c */
 474     t1           = _mm256_unpacklo_ps(t5,t6);    /* z2f z2e y2f y2e | y1f y1e x1f x1e */
 475     t2           = _mm256_unpackhi_ps(t5,t6);    /* y3f y3e x3f x3e | x2f x2e z1f z1e */
 476     t3           = _mm256_unpacklo_ps(t7,t8);    /* z2h z2g y2h y2g | y1h y1g x1h x1g */
 477     t4           = _mm256_unpackhi_ps(t7,t8);    /* y3h y3g x3h x3g | x2h x2g z1h z1g */
 478
 479     t5           = _mm256_shuffle_ps(t9,t11,_MM_SHUFFLE(1,0,1,0));   /* y2d y2c y2b y2a | x1d x1c x1b x1a */
 480     t6           = _mm256_shuffle_ps(t9,t11,_MM_SHUFFLE(3,2,3,2));   /* z2d z2c z2b z2a | y1d y1c y1b y1a */
 481     t7           = _mm256_shuffle_ps(t10,t12,_MM_SHUFFLE(1,0,1,0));  /* x3d x3c x3b x3a | z1d z1c z1b z1a */
 482     t8           = _mm256_shuffle_ps(t10,t12,_MM_SHUFFLE(3,2,3,2));  /* y3d y3c y3b y3a | x2d x2c x2b x2a */
 483
 484     t9           = _mm256_shuffle_ps(t1,t3,_MM_SHUFFLE(1,0,1,0));    /* y2h y2g y2f y2e | x1h x1g x1f x1e */
 485     t10          = _mm256_shuffle_ps(t1,t3,_MM_SHUFFLE(3,2,3,2));    /* z2h z2g z2f z2e | y1h y1g y1f y1e */
 486     t11          = _mm256_shuffle_ps(t2,t4,_MM_SHUFFLE(1,0,1,0));    /* x3h x3g x3f x3e | z1h z1g z1f z1e */
 487     t12          = _mm256_shuffle_ps(t2,t4,_MM_SHUFFLE(3,2,3,2));    /* y3h y3g y3f y3e | x2h x2g x2f x2e */
 488
 489     *x1          = _mm256_permute2f128_ps(t5, t9,  0x20);
 490     *y1          = _mm256_permute2f128_ps(t6, t10, 0x20);
 491     *z1          = _mm256_permute2f128_ps(t7, t11, 0x20);
 492     *x2          = _mm256_permute2f128_ps(t8, t12, 0x20);
 493
 494     *y2          = _mm256_permute2f128_ps(t5, t9,  0x31);
 495     *z2          = _mm256_permute2f128_ps(t6, t10, 0x31);
 496     *x3          = _mm256_permute2f128_ps(t7, t11, 0x31);
 497     *y3          = _mm256_permute2f128_ps(t8, t12, 0x31);
 498
 499     t1           = gmx_mm256_set_m128(_mm_load_ss(ptrE+8),_mm_load_ss(ptrA+8));
 500     t2           = gmx_mm256_set_m128(_mm_load_ss(ptrF+8),_mm_load_ss(ptrB+8));
 501     t3           = gmx_mm256_set_m128(_mm_load_ss(ptrG+8),_mm_load_ss(ptrC+8));
 502     t4           = gmx_mm256_set_m128(_mm_load_ss(ptrH+8),_mm_load_ss(ptrD+8));
 503
 504     t1           = _mm256_unpacklo_ps(t1,t3);  /*  -   -  z3g z3e |  -   -  z3c z3a */
 505     t2           = _mm256_unpacklo_ps(t2,t4);  /*  -   -  z3h z3f |  -   -  z3d z3b */
 506
 507     *z3          = _mm256_unpacklo_ps(t1,t2);
 508 }
 509
 510
 511
 512 static gmx_inline void
 513 gmx_mm256_load_4rvec_8ptr_swizzle_ps(const float * gmx_restrict ptrA, const float * gmx_restrict ptrB,
 514                                      const float * gmx_restrict ptrC, const float * gmx_restrict ptrD,
 515                                      const float * gmx_restrict ptrE, const float * gmx_restrict ptrF,
 516                                      const float * gmx_restrict ptrG, const float * gmx_restrict ptrH,
 517                                      __m256 * gmx_restrict x1, __m256 * gmx_restrict y1, __m256 * gmx_restrict z1,
 518                                      __m256 * gmx_restrict x2, __m256 * gmx_restrict y2, __m256 * gmx_restrict z2,
 519                                      __m256 * gmx_restrict x3, __m256 * gmx_restrict y3, __m256 * gmx_restrict z3,
 520                                      __m256 * gmx_restrict x4, __m256 * gmx_restrict y4, __m256 * gmx_restrict z4)
 521 {
 522     __m256 t1,t2,t3,t4,t5,t6,t7,t8,t9,t10,t11,t12;
 523
 524     t1           = _mm256_loadu_ps(ptrA);        /* y3a x3a z2a y2a | x2a z1a y1a x1a */
 525     t2           = _mm256_loadu_ps(ptrB);        /* y3b x3b z2b y2b | x2b z1b y1b x1b */
 526     t3           = _mm256_loadu_ps(ptrC);        /* y3c x3c z2c y2c | x2c z1c y1c x1c */
 527     t4           = _mm256_loadu_ps(ptrD);        /* y3d x3d z2d y2d | x2d z1d y1d x1d */
 528     t5           = _mm256_loadu_ps(ptrE);        /* y3e x3e z2e y2e | x2e z1e y1e x1e */
 529     t6           = _mm256_loadu_ps(ptrF);        /* y3f x3f z2f y2f | x2f z1f y1f x1f */
 530     t7           = _mm256_loadu_ps(ptrG);        /* y3g x3g z2g y2g | x2g z1g y1g x1g */
 531     t8           = _mm256_loadu_ps(ptrH);        /* y3h x3h z2h y2h | x2h z1h y1h x1h */
 532
 533     t9           = _mm256_unpacklo_ps(t1,t2);    /* z2b z2a y2b y2a | y1b y1a x1b x1a */
 534     t10          = _mm256_unpackhi_ps(t1,t2);    /* y3b y3a x3b x3a | x2b x2a z1b z1a */
 535     t11          = _mm256_unpacklo_ps(t3,t4);    /* z2d z2c y2d y2c | y1d y1c x1d x1c */
 536     t12          = _mm256_unpackhi_ps(t3,t4);    /* y3d y3c x3d x3c | x2d x2c z1d z1c */
 537     t1           = _mm256_unpacklo_ps(t5,t6);    /* z2f z2e y2f y2e | y1f y1e x1f x1e */
 538     t2           = _mm256_unpackhi_ps(t5,t6);    /* y3f y3e x3f x3e | x2f x2e z1f z1e */
 539     t3           = _mm256_unpacklo_ps(t7,t8);    /* z2h z2g y2h y2g | y1h y1g x1h x1g */
 540     t4           = _mm256_unpackhi_ps(t7,t8);    /* y3h y3g x3h x3g | x2h x2g z1h z1g */
 541
 542     t5           = _mm256_shuffle_ps(t9,t11,_MM_SHUFFLE(1,0,1,0));   /* y2d y2c y2b y2a | x1d x1c x1b x1a */
 543     t6           = _mm256_shuffle_ps(t9,t11,_MM_SHUFFLE(3,2,3,2));   /* z2d z2c z2b z2a | y1d y1c y1b y1a */
 544     t7           = _mm256_shuffle_ps(t10,t12,_MM_SHUFFLE(1,0,1,0));  /* x3d x3c x3b x3a | z1d z1c z1b z1a */
 545     t8           = _mm256_shuffle_ps(t10,t12,_MM_SHUFFLE(3,2,3,2));  /* y3d y3c y3b y3a | x2d x2c x2b x2a */
 546     t9           = _mm256_shuffle_ps(t1,t3,_MM_SHUFFLE(1,0,1,0));    /* y2h y2g y2f y2e | x1h x1g x1f x1e */
 547     t10          = _mm256_shuffle_ps(t1,t3,_MM_SHUFFLE(3,2,3,2));    /* z2h z2g z2f z2e | y1h y1g y1f y1e */
 548     t11          = _mm256_shuffle_ps(t2,t4,_MM_SHUFFLE(1,0,1,0));    /* x3h x3g x3f x3e | z1h z1g z1f z1e */
 549     t12          = _mm256_shuffle_ps(t2,t4,_MM_SHUFFLE(3,2,3,2));    /* y3h y3g y3f y3e | x2h x2g x2f x2e */
 550
 551     *x1          = _mm256_permute2f128_ps(t5, t9,  0x20);
 552     *y1          = _mm256_permute2f128_ps(t6, t10, 0x20);
 553     *z1          = _mm256_permute2f128_ps(t7, t11, 0x20);
 554     *x2          = _mm256_permute2f128_ps(t8, t12, 0x20);
 555
 556     *y2          = _mm256_permute2f128_ps(t5, t9,  0x31);
 557     *z2          = _mm256_permute2f128_ps(t6, t10, 0x31);
 558     *x3          = _mm256_permute2f128_ps(t7, t11, 0x31);
 559     *y3          = _mm256_permute2f128_ps(t8, t12, 0x31);
 560
 561     t1           = gmx_mm256_set_m128(_mm_loadu_ps(ptrE+8),_mm_loadu_ps(ptrA+8)); /* z4e y4e x4e z3e | z4a y4a x4a z3a */
 562     t2           = gmx_mm256_set_m128(_mm_loadu_ps(ptrF+8),_mm_loadu_ps(ptrB+8)); /* z4f y4f x4f z3f | z4b y4b x4b z3b */
 563     t3           = gmx_mm256_set_m128(_mm_loadu_ps(ptrG+8),_mm_loadu_ps(ptrC+8)); /* z4g y4g x4g z3g | z4c y4c x4c z3c */
 564     t4           = gmx_mm256_set_m128(_mm_loadu_ps(ptrH+8),_mm_loadu_ps(ptrD+8)); /* z4h y4h x4h z3h | z4d y4d x4d z3d */
 565
 566     t5           = _mm256_unpacklo_ps(t1,t2); /* x4f x4e z3f z3e | x4b x4a z3b z3a */
 567     t6           = _mm256_unpackhi_ps(t1,t2); /* z4f z4e y4f y4e | z4b z4a y4b y4a */
 568     t7           = _mm256_unpacklo_ps(t3,t4); /* x4h x4g z3h z3g | x4d x4c z3d z3c */
 569     t8           = _mm256_unpackhi_ps(t3,t4); /* z4h z4g y4h y4g | z4d z4c y4d y4c */
 570
 571     *z3          = _mm256_shuffle_ps(t5,t7,_MM_SHUFFLE(1,0,1,0)); /* z3h z3g z3f z3e | z3d z3c z3b z3a */
 572     *x4          = _mm256_shuffle_ps(t5,t7,_MM_SHUFFLE(3,2,3,2)); /* x4h x4g x4f x4e | x4d x4c x4b x4a */
 573     *y4          = _mm256_shuffle_ps(t6,t8,_MM_SHUFFLE(1,0,1,0)); /* y4h y4g y4f y4e | y4d y4c y4b y4a */
 574     *z4          = _mm256_shuffle_ps(t6,t8,_MM_SHUFFLE(3,2,3,2)); /* z4h z4g z4f z4e | z4d z4c z4b z4a */
 575 }
 576
 577
 578 static gmx_inline void
 579 gmx_mm256_decrement_1rvec_4ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx_restrict ptrB,
 580                                           float * gmx_restrict ptrC,float * gmx_restrict ptrD,
 581                                           __m256 x1, __m256 y1, __m256 z1)
 582 {
 583     __m128 t1,t2,t3,t4,t5,t6,t7,t8;
 584     __m128i mask;
 585
 586     /* Construct a mask without executing any data loads */
 587     mask        = _mm_blend_epi16(_mm_setzero_si128(),_mm_cmpeq_epi16(_mm_setzero_si128(),_mm_setzero_si128()),0x3F);
 588
 589     t3          = _mm_unpacklo_ps(_mm256_castps256_ps128(x1),_mm256_castps256_ps128(y1)); /* y1b x1b y1a x1a */
 590     t4          = _mm_unpackhi_ps(_mm256_castps256_ps128(x1),_mm256_castps256_ps128(y1)); /* y1d x1d y1c x1c */
 591
 592     t1          = _mm_shuffle_ps(t3,_mm256_castps256_ps128(z1),_MM_SHUFFLE(0,0,1,0)); /*  -  z1a y1a x1a */
 593     t2          = _mm_shuffle_ps(t3,_mm256_castps256_ps128(z1),_MM_SHUFFLE(0,1,3,2)); /*  -  z1b y1b x1b */
 594     t3          = _mm_shuffle_ps(t4,_mm256_castps256_ps128(z1),_MM_SHUFFLE(0,2,1,0)); /*  -  z1c y1c x1c */
 595     t4          = _mm_shuffle_ps(t4,_mm256_castps256_ps128(z1),_MM_SHUFFLE(0,3,3,2)); /*  -  z1d y1d x1d */
 596
 597     t5          = gmx_mm_maskload_ps(ptrA,mask);
 598     t6          = gmx_mm_maskload_ps(ptrB,mask);
 599     t7          = gmx_mm_maskload_ps(ptrC,mask);
 600     t8          = gmx_mm_maskload_ps(ptrD,mask);
 601
 602     t5          = _mm_sub_ps(t5,t1);
 603     t6          = _mm_sub_ps(t6,t2);
 604     t7          = _mm_sub_ps(t7,t3);
 605     t8          = _mm_sub_ps(t8,t4);
 606
 607     gmx_mm_maskstore_ps(ptrA,mask,t5);
 608     gmx_mm_maskstore_ps(ptrB,mask,t6);
 609     gmx_mm_maskstore_ps(ptrC,mask,t7);
 610     gmx_mm_maskstore_ps(ptrD,mask,t8);
 611 }
 612
 613
 614
 615 static gmx_inline void
 616 gmx_mm256_decrement_3rvec_4ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx_restrict ptrB,
 617                                           float * gmx_restrict ptrC, float * gmx_restrict ptrD,
 618                                           __m256 x1, __m256 y1, __m256 z1,
 619                                           __m256 x2, __m256 y2, __m256 z2,
 620                                           __m256 x3, __m256 y3, __m256 z3)
 621 {
 622     __m256 t1,t2,t3,t4,t5,t6;
 623     __m128 tA,tB,tC,tD;
 624
 625     t1          = _mm256_loadu_ps(ptrA);
 626     t2          = _mm256_loadu_ps(ptrB);
 627     t3          = _mm256_loadu_ps(ptrC);
 628     t4          = _mm256_loadu_ps(ptrD);
 629     tA          = _mm_load_ss(ptrA+8);
 630     tB          = _mm_load_ss(ptrB+8);
 631     tC          = _mm_load_ss(ptrC+8);
 632     tD          = _mm_load_ss(ptrD+8);
 633
 634     t5          = _mm256_unpacklo_ps(x1,y1); /* - - - - | y1b x1b y1a x1a */
 635     x1          = _mm256_unpackhi_ps(x1,y1); /* - - - - | y1d x1d y1c x1c */
 636     y1          = _mm256_unpacklo_ps(z1,x2); /* - - - - | x2b z1b x2a z1a */
 637     z1          = _mm256_unpackhi_ps(z1,x2); /* - - - - | x2d z1d x2c z1c */
 638
 639     x2          = _mm256_unpacklo_ps(y2,z2); /* - - - - | z2b y2b z2a y2a */
 640     y2          = _mm256_unpackhi_ps(y2,z2); /* - - - - | z2d y2d z2c y2c */
 641     t6          = _mm256_unpacklo_ps(x3,y3); /* - - - - | y3b x3b y3a x3a */
 642     x3          = _mm256_unpackhi_ps(x3,y3); /* - - - - | y3d x3d y3c x3c */
 643
 644     t5          = _mm256_insertf128_ps(t5, _mm256_castps256_ps128(x2), 0x1); /* z2b y2b z2a y2a | y1b x1b y1a x1a */
 645     x1          = _mm256_insertf128_ps(x1, _mm256_castps256_ps128(y2), 0x1); /* z2d y2d z2c y2c | y1d x1d y1c x1c */
 646
 647     y1          = _mm256_insertf128_ps(y1, _mm256_castps256_ps128(t6), 0x1); /* y3b x3b y3a x3a | x2b z1b x2a z1a */
 648     z1          = _mm256_insertf128_ps(z1, _mm256_castps256_ps128(x3), 0x1); /* y3d x3d y3c x3c | x2d z1d x2c z1c */
 649
 650     z2          = _mm256_shuffle_ps(t5,y1,_MM_SHUFFLE(1,0,1,0)); /* y3a x3a z2a y2a | x2a z1a y1a x1a */
 651     t5          = _mm256_shuffle_ps(t5,y1,_MM_SHUFFLE(3,2,3,2)); /* y3b x3b z2b y2b | x2b z1b y1b x1b */
 652     y1          = _mm256_shuffle_ps(x1,z1,_MM_SHUFFLE(1,0,1,0)); /* y3c x3c z2c y2c | x2c z1c y1c x1c */
 653     x1          = _mm256_shuffle_ps(x1,z1,_MM_SHUFFLE(3,2,3,2)); /* y3d x3d z2d y2d | x2d z1d y1d x1d */
 654
 655     t1          = _mm256_sub_ps(t1,z2);
 656     t2          = _mm256_sub_ps(t2,t5);
 657     t3          = _mm256_sub_ps(t3,y1);
 658     t4          = _mm256_sub_ps(t4,x1);
 659
 660     tA          = _mm_sub_ss(tA, _mm256_castps256_ps128(z3));
 661     tB          = _mm_sub_ss(tB, _mm_permute_ps(_mm256_castps256_ps128(z3),_MM_SHUFFLE(1,1,1,1)));
 662     tC          = _mm_sub_ss(tC, _mm_permute_ps(_mm256_castps256_ps128(z3),_MM_SHUFFLE(2,2,2,2)));
 663     tD          = _mm_sub_ss(tD, _mm_permute_ps(_mm256_castps256_ps128(z3),_MM_SHUFFLE(3,3,3,3)));
 664
 665     /* Here we store a full 256-bit value and a separate 32-bit one; no overlap can happen */
 666     _mm256_storeu_ps(ptrA,t1);
 667     _mm256_storeu_ps(ptrB,t2);
 668     _mm256_storeu_ps(ptrC,t3);
 669     _mm256_storeu_ps(ptrD,t4);
 670     _mm_store_ss(ptrA+8,tA);
 671     _mm_store_ss(ptrB+8,tB);
 672     _mm_store_ss(ptrC+8,tC);
 673     _mm_store_ss(ptrD+8,tD);
 674 }
 675
 676
 677 static gmx_inline void
 678 gmx_mm256_decrement_4rvec_4ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx_restrict ptrB,
 679                                           float * gmx_restrict ptrC, float * gmx_restrict ptrD,
 680                                           __m256 x1, __m256 y1, __m256 z1,
 681                                           __m256 x2, __m256 y2, __m256 z2,
 682                                           __m256 x3, __m256 y3, __m256 z3,
 683                                           __m256 x4, __m256 y4, __m256 z4)
 684 {
 685     __m256 t1,t2,t3,t4,t5;
 686     __m128 tA,tB,tC,tD,tE,tF,tG,tH;
 687
 688     t1          = _mm256_loadu_ps(ptrA);
 689     t2          = _mm256_loadu_ps(ptrB);
 690     t3          = _mm256_loadu_ps(ptrC);
 691     t4          = _mm256_loadu_ps(ptrD);
 692     tA          = _mm_loadu_ps(ptrA+8);
 693     tB          = _mm_loadu_ps(ptrB+8);
 694     tC          = _mm_loadu_ps(ptrC+8);
 695     tD          = _mm_loadu_ps(ptrD+8);
 696
 697     t5          = _mm256_unpacklo_ps(x1,y1); /* - - - - | y1b x1b y1a x1a */
 698     x1          = _mm256_unpackhi_ps(x1,y1); /* - - - - | y1d x1d y1c x1c */
 699     y1          = _mm256_unpacklo_ps(z1,x2); /* - - - - | x2b z1b x2a z1a */
 700     z1          = _mm256_unpackhi_ps(z1,x2); /* - - - - | x2d z1d x2c z1c */
 701
 702     x2          = _mm256_unpacklo_ps(y2,z2); /* - - - - | z2b y2b z2a y2a */
 703     y2          = _mm256_unpackhi_ps(y2,z2); /* - - - - | z2d y2d z2c y2c */
 704     z2          = _mm256_unpacklo_ps(x3,y3); /* - - - - | y3b x3b y3a x3a */
 705     x3          = _mm256_unpackhi_ps(x3,y3); /* - - - - | y3d x3d y3c x3c */
 706
 707     y3          = _mm256_unpacklo_ps(z3,x4); /* - - - - | x4b z3b x4a z3a */
 708     z3          = _mm256_unpackhi_ps(z3,x4); /* - - - - | x4d z3d x4c z3c */
 709     x4          = _mm256_unpacklo_ps(y4,z4); /* - - - - | z4b y4b z4a y4a */
 710     y4          = _mm256_unpackhi_ps(y4,z4); /* - - - - | z4d y4d z4c y4c */
 711
 712     x2          = _mm256_insertf128_ps(t5, _mm256_castps256_ps128(x2), 0x1); /* z2b y2b z2a y2a | y1b x1b y1a x1a */
 713     x1          = _mm256_insertf128_ps(x1, _mm256_castps256_ps128(y2), 0x1); /* z2d y2d z2c y2c | y1d x1d y1c x1c */
 714     y1          = _mm256_insertf128_ps(y1, _mm256_castps256_ps128(z2), 0x1); /* y3b x3b y3a x3a | x2b z1b x2a z1a */
 715     z1          = _mm256_insertf128_ps(z1, _mm256_castps256_ps128(x3), 0x1); /* y3d x3d y3c x3c | x2d z1d x2c z1c */
 716
 717     z2          = _mm256_shuffle_ps(x2,y1,_MM_SHUFFLE(1,0,1,0)); /* y3a x3a z2a y2a | x2a z1a y1a x1a */
 718     t5          = _mm256_shuffle_ps(x2,y1,_MM_SHUFFLE(3,2,3,2)); /* y3b x3b z2b y2b | x2b z1b y1b x1b */
 719     y1          = _mm256_shuffle_ps(x1,z1,_MM_SHUFFLE(1,0,1,0)); /* y3c x3c z2c y2c | x2c z1c y1c x1c */
 720     x1          = _mm256_shuffle_ps(x1,z1,_MM_SHUFFLE(3,2,3,2)); /* y3d x3d z2d y2d | x2d z1d y1d x1d */
 721
 722     tE          = _mm_shuffle_ps(_mm256_castps256_ps128(y3),_mm256_castps256_ps128(x4),_MM_SHUFFLE(1,0,1,0)); /* z4a y4a x4a z3a */
 723     tF          = _mm_shuffle_ps(_mm256_castps256_ps128(y3),_mm256_castps256_ps128(x4),_MM_SHUFFLE(3,2,3,2)); /* z4b y4b x4b z3b */
 724
 725     tG          = _mm_shuffle_ps(_mm256_castps256_ps128(z3),_mm256_castps256_ps128(y4),_MM_SHUFFLE(1,0,1,0)); /* z4c y4c x4c z3c */
 726     tH          = _mm_shuffle_ps(_mm256_castps256_ps128(z3),_mm256_castps256_ps128(y4),_MM_SHUFFLE(3,2,3,2)); /* z4d y4d x4d z3d */
 727
 728     t1          = _mm256_sub_ps(t1,z2);
 729     t2          = _mm256_sub_ps(t2,t5);
 730     t3          = _mm256_sub_ps(t3,y1);
 731     t4          = _mm256_sub_ps(t4,x1);
 732
 733     tA          = _mm_sub_ps(tA,tE);
 734     tB          = _mm_sub_ps(tB,tF);
 735     tC          = _mm_sub_ps(tC,tG);
 736     tD          = _mm_sub_ps(tD,tH);
 737
 738     /* Here we store a full 256-bit value and a separate 128-bit one; no overlap can happen */
 739     _mm256_storeu_ps(ptrA,t1);
 740     _mm256_storeu_ps(ptrB,t2);
 741     _mm256_storeu_ps(ptrC,t3);
 742     _mm256_storeu_ps(ptrD,t4);
 743     _mm_storeu_ps(ptrA+8,tA);
 744     _mm_storeu_ps(ptrB+8,tB);
 745     _mm_storeu_ps(ptrC+8,tC);
 746     _mm_storeu_ps(ptrD+8,tD);
 747 }
 748
 749
 750
 751 static gmx_inline void
 752 gmx_mm256_decrement_1rvec_8ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx_restrict ptrB,
 753                                           float * gmx_restrict ptrC, float * gmx_restrict ptrD,
 754                                           float * gmx_restrict ptrE, float * gmx_restrict ptrF,
 755                                           float * gmx_restrict ptrG, float * gmx_restrict ptrH,
 756                                           __m256 x1, __m256 y1, __m256 z1)
 757 {
 758     __m256 t1,t2,t3,t4,t5,t6;
 759     __m256 tA,tB,tC,tD;
 760     __m128i mask;
 761
 762     /* Construct a mask without executing any data loads */
 763     mask        = _mm_blend_epi16(_mm_setzero_si128(),_mm_cmpeq_epi16(_mm_setzero_si128(),_mm_setzero_si128()),0x3F);
 764
 765     tA          = gmx_mm256_set_m128(gmx_mm_maskload_ps(ptrE,mask),gmx_mm_maskload_ps(ptrA,mask));
 766     tB          = gmx_mm256_set_m128(gmx_mm_maskload_ps(ptrF,mask),gmx_mm_maskload_ps(ptrB,mask));
 767     tC          = gmx_mm256_set_m128(gmx_mm_maskload_ps(ptrG,mask),gmx_mm_maskload_ps(ptrC,mask));
 768     tD          = gmx_mm256_set_m128(gmx_mm_maskload_ps(ptrH,mask),gmx_mm_maskload_ps(ptrD,mask));
 769     t1          = _mm256_unpacklo_ps(x1,y1); /* y1f x1f y1e x1e | y1b x1b y1a x1a */
 770     t2          = _mm256_unpackhi_ps(x1,y1); /* y1h x1h y1g x1g | y1d x1d y1c x1c */
 771
 772     t3          = _mm256_shuffle_ps(t1,z1,_MM_SHUFFLE(0,0,1,0)); /*  -  z1e y1e x1e |  - z1a y1a x1a */
 773     t4          = _mm256_shuffle_ps(t1,z1,_MM_SHUFFLE(0,1,3,2)); /*  -  z1f y1f x1f |  - z1b y1b x1b */
 774     t5          = _mm256_shuffle_ps(t2,z1,_MM_SHUFFLE(0,2,1,0)); /*  -  z1g y1g x1g |  - z1c y1c x1c */
 775     t6          = _mm256_shuffle_ps(t2,z1,_MM_SHUFFLE(0,3,3,2)); /*  -  z1h y1h x1h |  - z1d y1d x1d */
 776
 777     tA          = _mm256_sub_ps(tA,t3);
 778     tB          = _mm256_sub_ps(tB,t4);
 779     tC          = _mm256_sub_ps(tC,t5);
 780     tD          = _mm256_sub_ps(tD,t6);
 781
 782     gmx_mm_maskstore_ps(ptrA,mask,_mm256_castps256_ps128(tA));
 783     gmx_mm_maskstore_ps(ptrB,mask,_mm256_castps256_ps128(tB));
 784     gmx_mm_maskstore_ps(ptrC,mask,_mm256_castps256_ps128(tC));
 785     gmx_mm_maskstore_ps(ptrD,mask,_mm256_castps256_ps128(tD));
 786     gmx_mm_maskstore_ps(ptrE,mask,_mm256_extractf128_ps(tA,0x1));
 787     gmx_mm_maskstore_ps(ptrF,mask,_mm256_extractf128_ps(tB,0x1));
 788     gmx_mm_maskstore_ps(ptrG,mask,_mm256_extractf128_ps(tC,0x1));
 789     gmx_mm_maskstore_ps(ptrH,mask,_mm256_extractf128_ps(tD,0x1));
 790 }
 791
 792
 793
 794 static gmx_inline void
 795 gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx_restrict ptrB,
 796                                           float * gmx_restrict ptrC, float * gmx_restrict ptrD,
 797                                           float * gmx_restrict ptrE, float * gmx_restrict ptrF,
 798                                           float * gmx_restrict ptrG, float * gmx_restrict ptrH,
 799                                           __m256 x1, __m256 y1, __m256 z1,
 800                                           __m256 x2, __m256 y2, __m256 z2,
 801                                           __m256 x3, __m256 y3, __m256 z3)
 802 {
 803     __m256 t1,t2,t3,t4,t5,t6,t7,t8,t9,t10,t11,t12;
 804     __m256 tA,tB,tC,tD,tE,tF,tG,tH;
 805     __m256 tI,tJ,tK,tL;
 806
 807     tA          = _mm256_loadu_ps(ptrA);
 808     tB          = _mm256_loadu_ps(ptrB);
 809     tC          = _mm256_loadu_ps(ptrC);
 810     tD          = _mm256_loadu_ps(ptrD);
 811     tE          = _mm256_loadu_ps(ptrE);
 812     tF          = _mm256_loadu_ps(ptrF);
 813     tG          = _mm256_loadu_ps(ptrG);
 814     tH          = _mm256_loadu_ps(ptrH);
 815
 816     t1          = _mm256_unpacklo_ps(x1,y1); /* y1f x1f y1e x1e | y1b x1b y1a x1a */
 817     t2          = _mm256_unpackhi_ps(x1,y1); /* y1h x1h y1g x1g | y1d x1d y1c x1c */
 818     t3          = _mm256_unpacklo_ps(z1,x2); /* x2f z1f x2e z1e | x2b z1b x2a z1a */
 819     t4          = _mm256_unpackhi_ps(z1,x2); /* x2h z1h x2g z1g | x2d z1d x2c z1c */
 820
 821     t5          = _mm256_unpacklo_ps(y2,z2); /* z2f y2f z2e y2e | z2b y2b z2a y2a */
 822     t6          = _mm256_unpackhi_ps(y2,z2); /* z2h y2h z2g y2g | z2d y2d z2c y2c */
 823     t7          = _mm256_unpacklo_ps(x3,y3); /* y3f x3f y3e x3e | y3b x3b y3a x3a */
 824     t8          = _mm256_unpackhi_ps(x3,y3); /* y3h x3h y3g x3g | y3d x3d y3c x3c */
 825
 826     t9          = _mm256_shuffle_ps(t1,t3,_MM_SHUFFLE(1,0,1,0)); /* x2e z1e y1e x1e | x2a z1a y1a x1a */
 827     t10         = _mm256_shuffle_ps(t1,t3,_MM_SHUFFLE(3,2,3,2)); /* x2f z1f y1f x1f | x2b z1b y1b x1b */
 828     t11         = _mm256_shuffle_ps(t2,t4,_MM_SHUFFLE(1,0,1,0)); /* x2g z1g y1g x1g | x2c z1c y1c x1c */
 829     t12         = _mm256_shuffle_ps(t2,t4,_MM_SHUFFLE(3,2,3,2)); /* x2h z1h y1h x1h | x2d z1d y1d x1d */
 830
 831     t1          = _mm256_shuffle_ps(t5,t7,_MM_SHUFFLE(1,0,1,0)); /* y3e x3e z2e y2e | y3a x3a z2a y2a */
 832     t2          = _mm256_shuffle_ps(t5,t7,_MM_SHUFFLE(3,2,3,2)); /* y3f x3f z2f y2f | y3b x3b z2b y2b */
 833     t3          = _mm256_shuffle_ps(t6,t8,_MM_SHUFFLE(1,0,1,0)); /* y3g x3g z2g y2g | y3c x3c z2c y2c */
 834     t4          = _mm256_shuffle_ps(t6,t8,_MM_SHUFFLE(3,2,3,2)); /* y3h x3h z2h y2h | y3d x3d z2d y2d */
 835
 836     t5          = gmx_mm256_unpack128lo_ps(t9,t1);  /* y3a x3a z2a y2a | x2a z1a y1a x1a */
 837     t6          = gmx_mm256_unpack128hi_ps(t9,t1);  /* y3e x3e z2e y2e | x2e z1e y1e x1e */
 838     t7          = gmx_mm256_unpack128lo_ps(t10,t2); /* y3b x3b z2b y2b | x2b z1b y1b x1b */
 839     t8          = gmx_mm256_unpack128hi_ps(t10,t2); /* y3f x3f z2f y2f | x2f z1f y1f x1f */
 840     t1          = gmx_mm256_unpack128lo_ps(t11,t3); /* y3c x3c z2c y2c | x2c z1c y1c x1c */
 841     t2          = gmx_mm256_unpack128hi_ps(t11,t3); /* y3g x3g z2g y2g | x2g z1g y1g x1g */
 842     t9          = gmx_mm256_unpack128lo_ps(t12,t4); /* y3d x3d z2d y2d | x2d z1d y1d x1d */
 843     t10         = gmx_mm256_unpack128hi_ps(t12,t4); /* y3h x3h z2h y2h | x2h z1h y1h x1h */
 844
 845     tA          = _mm256_sub_ps(tA,t5);
 846     tB          = _mm256_sub_ps(tB,t7);
 847     tC          = _mm256_sub_ps(tC,t1);
 848     tD          = _mm256_sub_ps(tD,t9);
 849     tE          = _mm256_sub_ps(tE,t6);
 850     tF          = _mm256_sub_ps(tF,t8);
 851     tG          = _mm256_sub_ps(tG,t2);
 852     tH          = _mm256_sub_ps(tH,t10);
 853
 854     _mm256_storeu_ps(ptrA,tA);
 855     _mm256_storeu_ps(ptrB,tB);
 856     _mm256_storeu_ps(ptrC,tC);
 857     _mm256_storeu_ps(ptrD,tD);
 858     _mm256_storeu_ps(ptrE,tE);
 859     _mm256_storeu_ps(ptrF,tF);
 860     _mm256_storeu_ps(ptrG,tG);
 861     _mm256_storeu_ps(ptrH,tH);
 862
 863     tI          = gmx_mm256_set_m128(_mm_load_ss(ptrE+8),_mm_load_ss(ptrA+8));
 864     tJ          = gmx_mm256_set_m128(_mm_load_ss(ptrF+8),_mm_load_ss(ptrB+8));
 865     tK          = gmx_mm256_set_m128(_mm_load_ss(ptrG+8),_mm_load_ss(ptrC+8));
 866     tL          = gmx_mm256_set_m128(_mm_load_ss(ptrH+8),_mm_load_ss(ptrD+8));
 867
 868     tI          = _mm256_unpacklo_ps(tI,tK);  /*  -  - zG zE |  -  - zC zA */
 869     tJ          = _mm256_unpacklo_ps(tJ,tL);  /*  -  - zH zF |  -  - zD zB */
 870     tI          = _mm256_unpacklo_ps(tI,tJ);  /* zH zG zF zE | zD zC zB zA */
 871
 872     tI          = _mm256_sub_ps(tI,z3);
 873     tJ          = _mm256_permute_ps(tI,_MM_SHUFFLE(1,1,1,1));
 874     tK          = _mm256_permute_ps(tI,_MM_SHUFFLE(2,2,2,2));
 875     tL          = _mm256_permute_ps(tI,_MM_SHUFFLE(3,3,3,3));
 876
 877     _mm_store_ss(ptrA+8,_mm256_castps256_ps128(tI));
 878     _mm_store_ss(ptrB+8,_mm256_castps256_ps128(tJ));
 879     _mm_store_ss(ptrC+8,_mm256_castps256_ps128(tK));
 880     _mm_store_ss(ptrD+8,_mm256_castps256_ps128(tL));
 881     _mm_store_ss(ptrE+8,_mm256_extractf128_ps(tI,0x1));
 882     _mm_store_ss(ptrF+8,_mm256_extractf128_ps(tJ,0x1));
 883     _mm_store_ss(ptrG+8,_mm256_extractf128_ps(tK,0x1));
 884     _mm_store_ss(ptrH+8,_mm256_extractf128_ps(tL,0x1));
 885 }
 886
 887
 888 static gmx_inline void
 889 gmx_mm256_decrement_4rvec_8ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx_restrict ptrB,
 890                                           float * gmx_restrict ptrC, float * gmx_restrict ptrD,
 891                                           float * gmx_restrict ptrE, float * gmx_restrict ptrF,
 892                                           float * gmx_restrict ptrG, float * gmx_restrict ptrH,
 893                                           __m256 x1, __m256 y1, __m256 z1,
 894                                           __m256 x2, __m256 y2, __m256 z2,
 895                                           __m256 x3, __m256 y3, __m256 z3,
 896                                           __m256 x4, __m256 y4, __m256 z4)
 897 {
 898     __m256 t1,t2,t3,t4,t5,t6,t7,t8,t9,t10,t11,t12;
 899     __m256 tA,tB,tC,tD,tE,tF,tG,tH;
 900     __m256 tI,tJ,tK,tL;
 901
 902     tA          = _mm256_loadu_ps(ptrA);
 903     tB          = _mm256_loadu_ps(ptrB);
 904     tC          = _mm256_loadu_ps(ptrC);
 905     tD          = _mm256_loadu_ps(ptrD);
 906     tE          = _mm256_loadu_ps(ptrE);
 907     tF          = _mm256_loadu_ps(ptrF);
 908     tG          = _mm256_loadu_ps(ptrG);
 909     tH          = _mm256_loadu_ps(ptrH);
 910
 911     t1          = _mm256_unpacklo_ps(x1,y1); /* y1f x1f y1e x1e | y1b x1b y1a x1a */
 912     t2          = _mm256_unpackhi_ps(x1,y1); /* y1h x1h y1g x1g | y1d x1d y1c x1c */
 913     t3          = _mm256_unpacklo_ps(z1,x2); /* x2f z1f x2e z1e | x2b z1b x2a z1a */
 914     t4          = _mm256_unpackhi_ps(z1,x2); /* x2h z1h x2g z1g | x2d z1d x2c z1c */
 915
 916     t5          = _mm256_unpacklo_ps(y2,z2); /* z2f y2f z2e y2e | z2b y2b z2a y2a */
 917     t6          = _mm256_unpackhi_ps(y2,z2); /* z2h y2h z2g y2g | z2d y2d z2c y2c */
 918     t7          = _mm256_unpacklo_ps(x3,y3); /* y3f x3f y3e x3e | y3b x3b y3a x3a */
 919     t8          = _mm256_unpackhi_ps(x3,y3); /* y3h x3h y3g x3g | y3d x3d y3c x3c */
 920
 921     t9          = _mm256_shuffle_ps(t1,t3,_MM_SHUFFLE(1,0,1,0)); /* x2e z1e y1e x1e | x2a z1a y1a x1a */
 922     t10         = _mm256_shuffle_ps(t1,t3,_MM_SHUFFLE(3,2,3,2)); /* x2f z1f y1f x1f | x2b z1b y1b x1b */
 923     t11         = _mm256_shuffle_ps(t2,t4,_MM_SHUFFLE(1,0,1,0)); /* x2g z1g y1g x1g | x2c z1c y1c x1c */
 924     t12         = _mm256_shuffle_ps(t2,t4,_MM_SHUFFLE(3,2,3,2)); /* x2h z1h y1h x1h | x2d z1d y1d x1d */
 925
 926     t1          = _mm256_shuffle_ps(t5,t7,_MM_SHUFFLE(1,0,1,0)); /* y3e x3e z2e y2e | y3a x3a z2a y2a */
 927     t2          = _mm256_shuffle_ps(t5,t7,_MM_SHUFFLE(3,2,3,2)); /* y3f x3f z2f y2f | y3b x3b z2b y2b */
 928     t3          = _mm256_shuffle_ps(t6,t8,_MM_SHUFFLE(1,0,1,0)); /* y3g x3g z2g y2g | y3c x3c z2c y2c */
 929     t4          = _mm256_shuffle_ps(t6,t8,_MM_SHUFFLE(3,2,3,2)); /* y3h x3h z2h y2h | y3d x3d z2d y2d */
 930
 931     t5          = gmx_mm256_unpack128lo_ps(t9,t1);  /* y3a x3a z2a y2a | x2a z1a y1a x1a */
 932     t6          = gmx_mm256_unpack128hi_ps(t9,t1);  /* y3e x3e z2e y2e | x2e z1e y1e x1e */
 933     t7          = gmx_mm256_unpack128lo_ps(t10,t2); /* y3b x3b z2b y2b | x2b z1b y1b x1b */
 934     t8          = gmx_mm256_unpack128hi_ps(t10,t2); /* y3f x3f z2f y2f | x2f z1f y1f x1f */
 935     t1          = gmx_mm256_unpack128lo_ps(t11,t3); /* y3c x3c z2c y2c | x2c z1c y1c x1c */
 936     t2          = gmx_mm256_unpack128hi_ps(t11,t3); /* y3g x3g z2g y2g | x2g z1g y1g x1g */
 937     t9          = gmx_mm256_unpack128lo_ps(t12,t4); /* y3d x3d z2d y2d | x2d z1d y1d x1d */
 938     t10         = gmx_mm256_unpack128hi_ps(t12,t4); /* y3h x3h z2h y2h | x2h z1h y1h x1h */
 939
 940     tA          = _mm256_sub_ps(tA,t5);
 941     tB          = _mm256_sub_ps(tB,t7);
 942     tC          = _mm256_sub_ps(tC,t1);
 943     tD          = _mm256_sub_ps(tD,t9);
 944     tE          = _mm256_sub_ps(tE,t6);
 945     tF          = _mm256_sub_ps(tF,t8);
 946     tG          = _mm256_sub_ps(tG,t2);
 947     tH          = _mm256_sub_ps(tH,t10);
 948
 949     _mm256_storeu_ps(ptrA,tA);
 950     _mm256_storeu_ps(ptrB,tB);
 951     _mm256_storeu_ps(ptrC,tC);
 952     _mm256_storeu_ps(ptrD,tD);
 953     _mm256_storeu_ps(ptrE,tE);
 954     _mm256_storeu_ps(ptrF,tF);
 955     _mm256_storeu_ps(ptrG,tG);
 956     _mm256_storeu_ps(ptrH,tH);
 957
 958     tI          = gmx_mm256_set_m128(_mm_loadu_ps(ptrE+8),_mm_loadu_ps(ptrA+8));
 959     tJ          = gmx_mm256_set_m128(_mm_loadu_ps(ptrF+8),_mm_loadu_ps(ptrB+8));
 960     tK          = gmx_mm256_set_m128(_mm_loadu_ps(ptrG+8),_mm_loadu_ps(ptrC+8));
 961     tL          = gmx_mm256_set_m128(_mm_loadu_ps(ptrH+8),_mm_loadu_ps(ptrD+8));
 962
 963     t1          = _mm256_unpacklo_ps(z3,x4); /* x4f z3f x4e z3e | x4b z3b x4a z3a */
 964     t2          = _mm256_unpackhi_ps(z3,x4); /* x4h z3h x4g z3g | x4d z3d x4c z3c */
 965     t3          = _mm256_unpacklo_ps(y4,z4); /* z4f y4f z4e y4e | z4b y4b z4a y4a */
 966     t4          = _mm256_unpackhi_ps(y4,z4); /* z4h y4h z4g y4g | z4d y4d z4c y4c */
 967
 968     t5          = _mm256_shuffle_ps(t1,t3,_MM_SHUFFLE(1,0,1,0)); /* z4e y4e x4e z3e | z4a y4a x4a z3a */
 969     t6          = _mm256_shuffle_ps(t1,t3,_MM_SHUFFLE(3,2,3,2)); /* z4f y4f x4f z3f | z4b y4b x4b z3b */
 970     t7          = _mm256_shuffle_ps(t2,t4,_MM_SHUFFLE(1,0,1,0)); /* z4g y4g x4g z3g | z4c y4c x4c z3c */
 971     t8          = _mm256_shuffle_ps(t2,t4,_MM_SHUFFLE(3,2,3,2)); /* z4h y4h x4h z3h | z4d y4d x4d z3d */
 972
 973     tI          = _mm256_sub_ps(tI,t5);
 974     tJ          = _mm256_sub_ps(tJ,t6);
 975     tK          = _mm256_sub_ps(tK,t7);
 976     tL          = _mm256_sub_ps(tL,t8);
 977
 978     _mm_storeu_ps(ptrA+8,_mm256_castps256_ps128(tI));
 979     _mm_storeu_ps(ptrB+8,_mm256_castps256_ps128(tJ));
 980     _mm_storeu_ps(ptrC+8,_mm256_castps256_ps128(tK));
 981     _mm_storeu_ps(ptrD+8,_mm256_castps256_ps128(tL));
 982     _mm_storeu_ps(ptrE+8,_mm256_extractf128_ps(tI,0x1));
 983     _mm_storeu_ps(ptrF+8,_mm256_extractf128_ps(tJ,0x1));
 984     _mm_storeu_ps(ptrG+8,_mm256_extractf128_ps(tK,0x1));
 985     _mm_storeu_ps(ptrH+8,_mm256_extractf128_ps(tL,0x1));
 986 }
 987
 988
 989
 990 static gmx_inline void
 991 gmx_mm256_update_iforce_1atom_swizzle_ps(__m256 fix1, __m256 fiy1, __m256 fiz1,
 992                                       float * gmx_restrict fptr,
 993                                       float * gmx_restrict fshiftptr)
 994 {
 995     __m128 t1,t2,t3;
 996
 997     fix1 = _mm256_hadd_ps(fix1,fix1);
 998     fiy1 = _mm256_hadd_ps(fiy1,fiz1);
 999     fix1 = _mm256_hadd_ps(fix1,fiy1); /* fiz1 fiy1 fix1 fix1 (in both lanes) */
1000
1001     /* Add across the two lanes */
1002     t1   = _mm_add_ps(_mm256_castps256_ps128(fix1),_mm256_extractf128_ps(fix1,0x1));
1003
1004     t2 = _mm_load_ss(fptr);
1005     t2 = _mm_loadh_pi(t2,(__m64 *)(fptr+1));
1006     t3 = _mm_load_ss(fshiftptr);
1007     t3 = _mm_loadh_pi(t3,(__m64 *)(fshiftptr+1));
1008
1009     t2 = _mm_add_ps(t2,t1);
1010     t3 = _mm_add_ps(t3,t1);
1011
1012     _mm_store_ss(fptr,t2);
1013     _mm_storeh_pi((__m64 *)(fptr+1),t2);
1014     _mm_store_ss(fshiftptr,t3);
1015     _mm_storeh_pi((__m64 *)(fshiftptr+1),t3);
1016 }
1017
1018 static gmx_inline void
1019 gmx_mm256_update_iforce_3atom_swizzle_ps(__m256 fix1, __m256 fiy1, __m256 fiz1,
1020                                          __m256 fix2, __m256 fiy2, __m256 fiz2,
1021                                          __m256 fix3, __m256 fiy3, __m256 fiz3,
1022                                          float * gmx_restrict fptr,
1023                                          float * gmx_restrict fshiftptr)
1024 {
1025     __m256 t1,t2,t3;
1026     __m128 tA,tB,tC;
1027
1028     fix1 = _mm256_hadd_ps(fix1,fiy1);                /*  Y1g+Y1h Y1e+Y1f X1g+X1h X1e+X1f | Y1c+Y1d Y1a+Y1b X1c+X1d X1a+X1b */
1029     fiz1 = _mm256_hadd_ps(fiz1,fix2);                /*  X2g+X2h X2e+X2f Z1g+Z1h Z1e+Z1f | X2c+X2d X2a+X2b Z1c+Z1d Z1a+Z1b */
1030     fiy2 = _mm256_hadd_ps(fiy2,fiz2);                /*  Z2g+Z2h Z2e+Z2f Y2g+Y2h Y2e+Y2f | Z2c+Z2d Z2a+Z2b Y2c+Y2d Y2a+Y2b */
1031     fix3 = _mm256_hadd_ps(fix3,fiy3);                /*  Y3g+Y3h Y3e+Y3f X3g+X3h X3e+X3f | Y3c+Y3d Y3a+Y3b X3c+X3d X3a+X3b */
1032     fiz3 = _mm256_hadd_ps(fiz3,_mm256_setzero_ps()); /*  0       0       Z3g+Z3h Z3e+Z3f | 0       0       Z3c+Z3d Z3a+Z3b */
1033
1034     fix1 = _mm256_hadd_ps(fix1,fiz1);                /*  X2e-h   Z1e-h   Y1e-h   X1e-h   | X2a-d   Z1a-d   Y1a-d   X1a-d   */
1035     fiy2 = _mm256_hadd_ps(fiy2,fix3);                /*  Y3e-h   X3e-h   Z2e-h   Y2e-h   | Y3a-d   X3a-d   Z2a-d   Y2a-d   */
1036     fiz3 = _mm256_hadd_ps(fiz3,_mm256_setzero_ps()); /*  0       0       0       Z3e-h   | 0       0       0       Z3a-d   */
1037
1038     /* Add across the two lanes by swapping and adding back */
1039     t1   = gmx_mm256_unpack128lo_ps(fix1,fiy2); /*  Y3a-d   X3a-d   Z2a-d   Y2a-d | X2a-d   Z1a-d   Y1a-d   X1a-d */
1040     t2   = gmx_mm256_unpack128hi_ps(fix1,fiy2); /*  Y3e-h   X3e-h   Z2e-h   Y2e-h | X2e-h   Z1e-h   Y1e-h   X1e-h */
1041     t1   = _mm256_add_ps(t1,t2); /* y3 x3 z2 y2 | x2 z1 y1 x1 */
1042
1043     tA   = _mm_add_ps(_mm256_castps256_ps128(fiz3),_mm256_extractf128_ps(fiz3,0x1));  /* 0 0 0 z3 */
1044
1045     t3   = _mm256_loadu_ps(fptr);
1046     t3   = _mm256_add_ps(t3,t1);
1047     _mm256_storeu_ps(fptr,t3);
1048     tB   = _mm_load_ss(fptr+8);
1049     tB   = _mm_add_ss(tB,tA);
1050     _mm_store_ss(fptr+8,tB);
1051
1052     /* Add up shift force */
1053     tB   = _mm256_extractf128_ps(t1,0x1); /* y3 x3 z2 y2 */
1054     tC   = _mm_shuffle_ps(_mm256_castps256_ps128(t1),tB,_MM_SHUFFLE(1,0,3,3)); /* z2 y2 x2 x2 */
1055     tB   = _mm_shuffle_ps(tB,tA,_MM_SHUFFLE(1,0,3,2)); /* 0 z3 y3 x3 */
1056     tC   = _mm_permute_ps(tC,_MM_SHUFFLE(3,3,2,0)); /*  - z2 y2 x2 */
1057
1058     tB   = _mm_add_ps(tB,_mm256_castps256_ps128(t1));
1059     tA   = _mm_add_ps(tB,tC); /*  - z y x */
1060
1061     tA   = _mm_blend_ps(_mm_setzero_ps(),tA,0x7); /* 0 z y x */
1062
1063     tC   = _mm_loadu_ps(fshiftptr);
1064     tC   = _mm_add_ps(tC,tA);
1065     _mm_storeu_ps(fshiftptr,tC);
1066 }
1067
1068
1069 static gmx_inline void
1070 gmx_mm256_update_iforce_4atom_swizzle_ps(__m256 fix1, __m256 fiy1, __m256 fiz1,
1071                                          __m256 fix2, __m256 fiy2, __m256 fiz2,
1072                                          __m256 fix3, __m256 fiy3, __m256 fiz3,
1073                                          __m256 fix4, __m256 fiy4, __m256 fiz4,
1074                                          float * gmx_restrict fptr,
1075                                          float * gmx_restrict fshiftptr)
1076 {
1077     __m256 t1,t2,t3;
1078     __m128 tA,tB,tC;
1079
1080     fix1 = _mm256_hadd_ps(fix1,fiy1);                /*  Y1g+Y1h Y1e+Y1f X1g+X1h X1e+X1f | Y1c+Y1d Y1a+Y1b X1c+X1d X1a+X1b */
1081     fiz1 = _mm256_hadd_ps(fiz1,fix2);                /*  X2g+X2h X2e+X2f Z1g+Z1h Z1e+Z1f | X2c+X2d X2a+X2b Z1c+Z1d Z1a+Z1b */
1082     fiy2 = _mm256_hadd_ps(fiy2,fiz2);                /*  Z2g+Z2h Z2e+Z2f Y2g+Y2h Y2e+Y2f | Z2c+Z2d Z2a+Z2b Y2c+Y2d Y2a+Y2b */
1083     fix3 = _mm256_hadd_ps(fix3,fiy3);                /*  Y3g+Y3h Y3e+Y3f X3g+X3h X3e+X3f | Y3c+Y3d Y3a+Y3b X3c+X3d X3a+X3b */
1084     fiz3 = _mm256_hadd_ps(fiz3,fix4);                /*  X4g+X4h X4e+X4f Z3g+Z3h Z3e+Z3f | X4c+X4d X4a+X4b Z3c+Z3d Z3a+Z3b */
1085     fiy4 = _mm256_hadd_ps(fiy4,fiz4);                /*  Z4g+Z4h Z4e+Z4f Y4g+Y4h Y4e+Y4f | Z4c+Z4d Z4a+Z4b Y4c+Y4d Y4a+Y4b */
1086
1087     fix1 = _mm256_hadd_ps(fix1,fiz1);                /*  X2e-h   Z1e-h   Y1e-h   X1e-h   | X2a-d   Z1a-d   Y1a-d   X1a-d   */
1088     fiy2 = _mm256_hadd_ps(fiy2,fix3);                /*  Y3e-h   X3e-h   Z2e-h   Y2e-h   | Y3a-d   X3a-d   Z2a-d   Y2a-d   */
1089     fiz3 = _mm256_hadd_ps(fiz3,fiy4);                /*  Z4e-h   Y4e-h   X4e-h   Z3e-h   | Z4a-d   Y4a-d   X4a-d   Z3a-d   */
1090
1091     /* Add across the two lanes by swapping and adding back */
1092     t1   = gmx_mm256_unpack128lo_ps(fix1,fiy2); /*  Y3a-d   X3a-d   Z2a-d   Y2a-d | X2a-d   Z1a-d   Y1a-d   X1a-d */
1093     t2   = gmx_mm256_unpack128hi_ps(fix1,fiy2); /*  Y3e-h   X3e-h   Z2e-h   Y2e-h | X2e-h   Z1e-h   Y1e-h   X1e-h */
1094     t1   = _mm256_add_ps(t1,t2); /* y3 x3 z2 y2 | x2 z1 y1 x1 */
1095
1096     tA   = _mm_add_ps(_mm256_castps256_ps128(fiz3),_mm256_extractf128_ps(fiz3,0x1));  /* z4 y4 x4 z3 */
1097
1098     t3   = _mm256_loadu_ps(fptr);
1099     t3   = _mm256_add_ps(t3,t1);
1100     _mm256_storeu_ps(fptr,t3);
1101
1102     tB   = _mm_loadu_ps(fptr+8);
1103     tB   = _mm_add_ps(tB,tA);
1104     _mm_storeu_ps(fptr+8,tB);
1105
1106     /* Add up shift force */
1107     tB   = _mm256_extractf128_ps(t1,0x1); /* y3 x3 z2 y2 */
1108     tC   = _mm_shuffle_ps(_mm256_castps256_ps128(t1),tB,_MM_SHUFFLE(1,0,3,3)); /* z2 y2 x2 x2 */
1109     tB   = _mm_shuffle_ps(tB,tA,_MM_SHUFFLE(1,0,3,2)); /* 0 z3 y3 x3 */
1110     tC   = _mm_permute_ps(tC,_MM_SHUFFLE(3,3,2,0)); /*  - z2 y2 x2 */
1111     tA   = _mm_permute_ps(tA,_MM_SHUFFLE(0,3,2,1)); /* - z4 y4 x4 */
1112
1113     tB   = _mm_add_ps(tB,_mm256_castps256_ps128(t1));
1114     tA   = _mm_add_ps(tA,tC);
1115     tA   = _mm_add_ps(tA,tB);
1116
1117     tA   = _mm_blend_ps(_mm_setzero_ps(),tA,0x7); /* 0 z y x */
1118
1119     tC   = _mm_loadu_ps(fshiftptr);
1120     tC   = _mm_add_ps(tC,tA);
1121     _mm_storeu_ps(fshiftptr,tC);
1122 }
1123
1124
1125
1126 static gmx_inline void
1127 gmx_mm256_update_1pot_ps(__m256 pot1, float * gmx_restrict ptrA)
1128 {
1129     __m128 t1;
1130
1131     pot1 = _mm256_hadd_ps(pot1,pot1);
1132     pot1 = _mm256_hadd_ps(pot1,pot1);
1133
1134     t1   = _mm_add_ps(_mm256_castps256_ps128(pot1),_mm256_extractf128_ps(pot1,0x1));
1135
1136     _mm_store_ss(ptrA,_mm_add_ss(_mm_load_ss(ptrA),t1));
1137 }
1138
1139 static gmx_inline void
1140 gmx_mm256_update_2pot_ps(__m256 pot1, float * gmx_restrict ptrA,
1141                          __m256 pot2, float * gmx_restrict ptrB)
1142 {
1143     __m128 t1,t2;
1144
1145     pot1 = _mm256_hadd_ps(pot1,pot2);
1146     pot1 = _mm256_hadd_ps(pot1,pot1);
1147
1148     t1   = _mm_add_ps(_mm256_castps256_ps128(pot1),_mm256_extractf128_ps(pot1,0x1));
1149
1150     t2   = _mm_permute_ps(t1,_MM_SHUFFLE(1,1,1,1));
1151     _mm_store_ss(ptrA,_mm_add_ss(_mm_load_ss(ptrA),t1));
1152     _mm_store_ss(ptrB,_mm_add_ss(_mm_load_ss(ptrB),t2));
1153 }
1154
1155
1156 static gmx_inline void
1157 gmx_mm256_update_4pot_ps(__m256 pot1, float * gmx_restrict ptrA,
1158                          __m256 pot2, float * gmx_restrict ptrB,
1159                          __m256 pot3, float * gmx_restrict ptrC,
1160                          __m256 pot4, float * gmx_restrict ptrD)
1161 {
1162     __m128 t1,t2,t3,t4;
1163
1164     pot1 = _mm256_hadd_ps(pot1,pot2);
1165     pot3 = _mm256_hadd_ps(pot3,pot4);
1166     pot1 = _mm256_hadd_ps(pot1,pot3);
1167     t1   = _mm_add_ps(_mm256_castps256_ps128(pot1),_mm256_extractf128_ps(pot1,0x1));
1168     t2   = _mm_permute_ps(t1,_MM_SHUFFLE(1,1,1,1));
1169     t3   = _mm_permute_ps(t1,_MM_SHUFFLE(2,2,2,2));
1170     t4   = _mm_permute_ps(t1,_MM_SHUFFLE(3,3,3,3));
1171     _mm_store_ss(ptrA,_mm_add_ss(_mm_load_ss(ptrA),t1));
1172     _mm_store_ss(ptrB,_mm_add_ss(_mm_load_ss(ptrB),t2));
1173     _mm_store_ss(ptrC,_mm_add_ss(_mm_load_ss(ptrC),t3));
1174     _mm_store_ss(ptrD,_mm_add_ss(_mm_load_ss(ptrD),t4));
1175 }
1176
1177
1178 #endif /* _kernelutil_x86_avx_256_single_h_ */