src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_double/kernelutil_x86_avx_256_double.h

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2012,2013, by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35 #ifndef _kernelutil_x86_avx_256_double_h_
  36 #define _kernelutil_x86_avx_256_double_h_
  37
  38
  39 #include "gromacs/simd/general_x86_avx_256.h"
  40
  41
  42 static int
  43 gmx_mm256_any_lt(__m256d a, __m256d b)
  44 {
  45     return _mm256_movemask_pd(_mm256_cmp_pd(a, b, _CMP_LT_OQ));
  46 }
  47
  48 static gmx_inline __m256d
  49 gmx_mm256_calc_rsq_pd(__m256d dx, __m256d dy, __m256d dz)
  50 {
  51     return _mm256_add_pd( _mm256_add_pd( _mm256_mul_pd(dx, dx), _mm256_mul_pd(dy, dy) ), _mm256_mul_pd(dz, dz) );
  52 }
  53
  54 /* Normal sum of four ymm registers */
  55 #define gmx_mm256_sum4_pd(t0, t1, t2, t3)  _mm256_add_pd(_mm256_add_pd(t0, t1), _mm256_add_pd(t2, t3))
  56
  57
  58 /* Load a single value from 1-4 places, merge into xmm register */
  59 static __m256d
  60 gmx_mm256_load_1real_pd(const double * gmx_restrict ptrA)
  61 {
  62     return _mm256_castpd128_pd256(_mm_load_sd(ptrA));
  63 }
  64
  65 static __m256d
  66 gmx_mm256_load_2real_swizzle_pd(const double * gmx_restrict ptrA,
  67                                 const double * gmx_restrict ptrB)
  68 {
  69     __m128d tA, tB;
  70
  71     tA = _mm_load_sd(ptrA);
  72     tB = _mm_load_sd(ptrB);
  73
  74     return _mm256_castpd128_pd256(_mm_unpacklo_pd(tA, tB));
  75 }
  76
  77
  78 static __m256d
  79 gmx_mm256_load_4real_swizzle_pd(const double * gmx_restrict ptrA, const double * gmx_restrict ptrB,
  80                                 const double * gmx_restrict ptrC, const double * gmx_restrict ptrD)
  81 {
  82     __m128d t1, t2;
  83
  84     t1 = _mm_unpacklo_pd(_mm_load_sd(ptrA), _mm_load_sd(ptrB));
  85     t2 = _mm_unpacklo_pd(_mm_load_sd(ptrC), _mm_load_sd(ptrD));
  86     return gmx_mm256_set_m128d(t2, t1);
  87 }
  88
  89
  90
  91 static void
  92 gmx_mm256_store_1real_pd(double * gmx_restrict ptrA, __m256d xmm1)
  93 {
  94     _mm_store_sd(ptrA, _mm256_castpd256_pd128(xmm1));
  95 }
  96
  97
  98 static void
  99 gmx_mm256_store_2real_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB, __m256d xmm1)
 100 {
 101     __m256d t2;
 102
 103     t2       = _mm256_permute_pd(xmm1, _GMX_MM_PERMUTE256D(1, 1, 1, 1));
 104     _mm_store_sd(ptrA, _mm256_castpd256_pd128(xmm1));
 105     _mm_store_sd(ptrB, _mm256_castpd256_pd128(t2));
 106 }
 107
 108
 109
 110
 111 static void
 112 gmx_mm256_store_4real_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
 113                                  double * gmx_restrict ptrC, double * gmx_restrict ptrD, __m256d xmm1)
 114 {
 115     __m256d t2;
 116     __m128d t3, t4;
 117
 118     t2       = _mm256_permute_pd(xmm1, _GMX_MM_PERMUTE256D(1, 1, 1, 1));
 119     t3       = _mm256_extractf128_pd(xmm1, 0x1);
 120     t4       = _mm_permute_pd(t3, _GMX_MM_PERMUTE128D(1, 1));
 121     _mm_store_sd(ptrA, _mm256_castpd256_pd128(xmm1));
 122     _mm_store_sd(ptrB, _mm256_castpd256_pd128(t2));
 123     _mm_store_sd(ptrC, t3);
 124     _mm_store_sd(ptrD, t4);
 125 }
 126
 127
 128
 129
 130 static void
 131 gmx_mm256_increment_1real_pd(double * gmx_restrict ptrA, __m256d xmm1)
 132 {
 133     __m128d t1;
 134
 135     t1   = _mm256_castpd256_pd128(xmm1);
 136     t1   = _mm_add_sd(t1, _mm_load_sd(ptrA));
 137
 138     _mm_store_sd(ptrA, t1);
 139 }
 140
 141
 142 static void
 143 gmx_mm256_increment_2real_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB, __m256d xmm1)
 144 {
 145     __m128d t1, t2;
 146
 147     t1   = _mm256_castpd256_pd128(xmm1);
 148     t2   = _mm_permute_pd(t1, _GMX_MM_PERMUTE128D(1, 1));
 149
 150     t1   = _mm_add_sd(t1, _mm_load_sd(ptrA));
 151     t2   = _mm_add_sd(t2, _mm_load_sd(ptrB));
 152
 153     _mm_store_sd(ptrA, t1);
 154     _mm_store_sd(ptrB, t2);
 155 }
 156
 157
 158 static void
 159 gmx_mm256_increment_4real_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
 160                                      double * gmx_restrict ptrC, double * gmx_restrict ptrD, __m256d xmm1)
 161 {
 162     __m128d t1, t2, t3, t4;
 163
 164     t1   = _mm256_castpd256_pd128(xmm1);
 165     t2   = _mm_permute_pd(t1, _GMX_MM_PERMUTE128D(1, 1));
 166     t3   = _mm256_extractf128_pd(xmm1, 0x1);
 167     t4   = _mm_permute_pd(t3, _GMX_MM_PERMUTE128D(1, 1));
 168
 169     t1   = _mm_add_sd(t1, _mm_load_sd(ptrA));
 170     t2   = _mm_add_sd(t2, _mm_load_sd(ptrB));
 171     t3   = _mm_add_sd(t3, _mm_load_sd(ptrC));
 172     t4   = _mm_add_sd(t4, _mm_load_sd(ptrD));
 173
 174     _mm_store_sd(ptrA, t1);
 175     _mm_store_sd(ptrB, t2);
 176     _mm_store_sd(ptrC, t3);
 177     _mm_store_sd(ptrD, t4);
 178 }
 179
 180
 181
 182 static void
 183 gmx_mm256_load_1pair_swizzle_pd(const double * gmx_restrict p1, __m256d *c6, __m256d *c12)
 184 {
 185     *c6     = _mm256_castpd128_pd256(_mm_load_sd(p1));
 186     *c12    = _mm256_castpd128_pd256(_mm_load_sd(p1+1));
 187 }
 188
 189
 190 static void
 191 gmx_mm256_load_2pair_swizzle_pd(const double * gmx_restrict p1, const double * gmx_restrict p2, __m256d *c6, __m256d *c12)
 192 {
 193     __m128d t1, t2, t3;
 194
 195     t1   = _mm_loadu_pd(p1);
 196     t2   = _mm_loadu_pd(p2);
 197     *c6  = _mm256_castpd128_pd256(_mm_unpacklo_pd(t1, t2));
 198     *c12 = _mm256_castpd128_pd256(_mm_unpackhi_pd(t1, t2));
 199 }
 200
 201
 202
 203 static void
 204 gmx_mm256_load_4pair_swizzle_pd(const double * gmx_restrict p1, const double * gmx_restrict p2,
 205                                 const double * gmx_restrict p3, const double * gmx_restrict p4,
 206                                 __m256d * gmx_restrict c6, __m256d * gmx_restrict c12)
 207 {
 208     __m256d t1, t2;
 209
 210     t1   = gmx_mm256_set_m128d(_mm_loadu_pd(p3), _mm_loadu_pd(p1)); /* c12c  c6c | c12a  c6a */
 211     t2   = gmx_mm256_set_m128d(_mm_loadu_pd(p4), _mm_loadu_pd(p2)); /* c12d  c6d | c12b  c6b */
 212
 213     *c6  = _mm256_unpacklo_pd(t1, t2);                              /* c6d c6c | c6b c6a */
 214     *c12 = _mm256_unpackhi_pd(t1, t2);                              /* c12d c12c | c12b c12a */
 215 }
 216
 217
 218 static gmx_inline void
 219 gmx_mm256_load_shift_and_1rvec_broadcast_pd(const double * gmx_restrict xyz_shift,
 220                                             const double * gmx_restrict xyz,
 221                                             __m256d * gmx_restrict      x1,
 222                                             __m256d * gmx_restrict      y1,
 223                                             __m256d * gmx_restrict      z1)
 224 {
 225     __m128d mem_xy, mem_z, mem_sxy, mem_sz, tx, ty, tz;
 226
 227     mem_xy  = _mm_loadu_pd(xyz);
 228     mem_z   = _mm_load_sd(xyz+2);
 229     mem_sxy = _mm_loadu_pd(xyz_shift);
 230     mem_sz  = _mm_load_sd(xyz_shift+2);
 231
 232     mem_xy  = _mm_add_pd(mem_xy, mem_sxy);
 233     mem_z   = _mm_add_pd(mem_z, mem_sz);
 234
 235     tx  = _mm_shuffle_pd(mem_xy, mem_xy, _MM_SHUFFLE2(0, 0));
 236     ty  = _mm_shuffle_pd(mem_xy, mem_xy, _MM_SHUFFLE2(1, 1));
 237     tz  = _mm_shuffle_pd(mem_z, mem_z, _MM_SHUFFLE2(0, 0));
 238
 239     *x1 = gmx_mm256_set_m128d(tx, tx);
 240     *y1 = gmx_mm256_set_m128d(ty, ty);
 241     *z1 = gmx_mm256_set_m128d(tz, tz);
 242 }
 243
 244
 245 static gmx_inline void
 246 gmx_mm256_load_shift_and_3rvec_broadcast_pd(const double * gmx_restrict xyz_shift,
 247                                             const double * gmx_restrict xyz,
 248                                             __m256d * gmx_restrict x1, __m256d * gmx_restrict y1, __m256d * gmx_restrict z1,
 249                                             __m256d * gmx_restrict x2, __m256d * gmx_restrict y2, __m256d * gmx_restrict z2,
 250                                             __m256d * gmx_restrict x3, __m256d * gmx_restrict y3, __m256d * gmx_restrict z3)
 251 {
 252     __m128d t1, t2, t3, t4, t5, sxy, sz, szx, syz, tx, ty, tz;
 253
 254     t1  = _mm_loadu_pd(xyz);
 255     t2  = _mm_loadu_pd(xyz+2);
 256     t3  = _mm_loadu_pd(xyz+4);
 257     t4  = _mm_loadu_pd(xyz+6);
 258     t5  = _mm_load_sd(xyz+8);
 259
 260     sxy = _mm_loadu_pd(xyz_shift);
 261     sz  = _mm_load_sd(xyz_shift+2);
 262     szx = _mm_shuffle_pd(sz, sxy, _MM_SHUFFLE2(0, 0));
 263     syz = _mm_shuffle_pd(sxy, sz, _MM_SHUFFLE2(0, 1));
 264
 265     t1  = _mm_add_pd(t1, sxy);
 266     t2  = _mm_add_pd(t2, szx);
 267     t3  = _mm_add_pd(t3, syz);
 268     t4  = _mm_add_pd(t4, sxy);
 269     t5  = _mm_add_sd(t5, sz);
 270
 271     tx   = _mm_shuffle_pd(t1, t1, _MM_SHUFFLE2(0, 0));
 272     ty   = _mm_shuffle_pd(t1, t1, _MM_SHUFFLE2(1, 1));
 273     tz   = _mm_shuffle_pd(t2, t2, _MM_SHUFFLE2(0, 0));
 274     *x1  = gmx_mm256_set_m128d(tx, tx);
 275     *y1  = gmx_mm256_set_m128d(ty, ty);
 276     *z1  = gmx_mm256_set_m128d(tz, tz);
 277     tx   = _mm_shuffle_pd(t2, t2, _MM_SHUFFLE2(1, 1));
 278     ty   = _mm_shuffle_pd(t3, t3, _MM_SHUFFLE2(0, 0));
 279     tz   = _mm_shuffle_pd(t3, t3, _MM_SHUFFLE2(1, 1));
 280     *x2  = gmx_mm256_set_m128d(tx, tx);
 281     *y2  = gmx_mm256_set_m128d(ty, ty);
 282     *z2  = gmx_mm256_set_m128d(tz, tz);
 283     tx   = _mm_shuffle_pd(t4, t4, _MM_SHUFFLE2(0, 0));
 284     ty   = _mm_shuffle_pd(t4, t4, _MM_SHUFFLE2(1, 1));
 285     tz   = _mm_shuffle_pd(t5, t5, _MM_SHUFFLE2(0, 0));
 286     *x3  = gmx_mm256_set_m128d(tx, tx);
 287     *y3  = gmx_mm256_set_m128d(ty, ty);
 288     *z3  = gmx_mm256_set_m128d(tz, tz);
 289 }
 290
 291
 292 static gmx_inline void
 293 gmx_mm256_load_shift_and_4rvec_broadcast_pd(const double * gmx_restrict xyz_shift,
 294                                             const double * gmx_restrict xyz,
 295                                             __m256d * gmx_restrict x1, __m256d * gmx_restrict y1, __m256d * gmx_restrict z1,
 296                                             __m256d * gmx_restrict x2, __m256d * gmx_restrict y2, __m256d * gmx_restrict z2,
 297                                             __m256d * gmx_restrict x3, __m256d * gmx_restrict y3, __m256d * gmx_restrict z3,
 298                                             __m256d * gmx_restrict x4, __m256d * gmx_restrict y4, __m256d * gmx_restrict z4)
 299 {
 300     __m128d t1, t2, t3, t4, t5, t6, sxy, sz, szx, syz, tx, ty, tz;
 301
 302     t1  = _mm_loadu_pd(xyz);
 303     t2  = _mm_loadu_pd(xyz+2);
 304     t3  = _mm_loadu_pd(xyz+4);
 305     t4  = _mm_loadu_pd(xyz+6);
 306     t5  = _mm_loadu_pd(xyz+8);
 307     t6  = _mm_loadu_pd(xyz+10);
 308
 309     sxy = _mm_loadu_pd(xyz_shift);
 310     sz  = _mm_load_sd(xyz_shift+2);
 311     szx = _mm_shuffle_pd(sz, sxy, _MM_SHUFFLE2(0, 0));
 312     syz = _mm_shuffle_pd(sxy, sz, _MM_SHUFFLE2(0, 1));
 313
 314     t1  = _mm_add_pd(t1, sxy);
 315     t2  = _mm_add_pd(t2, szx);
 316     t3  = _mm_add_pd(t3, syz);
 317     t4  = _mm_add_pd(t4, sxy);
 318     t5  = _mm_add_pd(t5, szx);
 319     t6  = _mm_add_pd(t6, syz);
 320
 321     tx   = _mm_shuffle_pd(t1, t1, _MM_SHUFFLE2(0, 0));
 322     ty   = _mm_shuffle_pd(t1, t1, _MM_SHUFFLE2(1, 1));
 323     tz   = _mm_shuffle_pd(t2, t2, _MM_SHUFFLE2(0, 0));
 324     *x1  = gmx_mm256_set_m128d(tx, tx);
 325     *y1  = gmx_mm256_set_m128d(ty, ty);
 326     *z1  = gmx_mm256_set_m128d(tz, tz);
 327     tx   = _mm_shuffle_pd(t2, t2, _MM_SHUFFLE2(1, 1));
 328     ty   = _mm_shuffle_pd(t3, t3, _MM_SHUFFLE2(0, 0));
 329     tz   = _mm_shuffle_pd(t3, t3, _MM_SHUFFLE2(1, 1));
 330     *x2  = gmx_mm256_set_m128d(tx, tx);
 331     *y2  = gmx_mm256_set_m128d(ty, ty);
 332     *z2  = gmx_mm256_set_m128d(tz, tz);
 333     tx   = _mm_shuffle_pd(t4, t4, _MM_SHUFFLE2(0, 0));
 334     ty   = _mm_shuffle_pd(t4, t4, _MM_SHUFFLE2(1, 1));
 335     tz   = _mm_shuffle_pd(t5, t5, _MM_SHUFFLE2(0, 0));
 336     *x3  = gmx_mm256_set_m128d(tx, tx);
 337     *y3  = gmx_mm256_set_m128d(ty, ty);
 338     *z3  = gmx_mm256_set_m128d(tz, tz);
 339     tx   = _mm_shuffle_pd(t5, t5, _MM_SHUFFLE2(1, 1));
 340     ty   = _mm_shuffle_pd(t6, t6, _MM_SHUFFLE2(0, 0));
 341     tz   = _mm_shuffle_pd(t6, t6, _MM_SHUFFLE2(1, 1));
 342     *x4  = gmx_mm256_set_m128d(tx, tx);
 343     *y4  = gmx_mm256_set_m128d(ty, ty);
 344     *z4  = gmx_mm256_set_m128d(tz, tz);
 345 }
 346
 347
 348 static void
 349 gmx_mm256_load_1rvec_1ptr_swizzle_pd(const double * gmx_restrict p1,
 350                                      __m256d * gmx_restrict x, __m256d * gmx_restrict y, __m256d * gmx_restrict z)
 351 {
 352     __m256d t1;
 353
 354     t1            = _mm256_loadu_pd(p1);
 355     *x            = t1;
 356     *y            = _mm256_permute_pd(t1, _GMX_MM_PERMUTE256D(0, 1, 0, 1));
 357     *z            = _mm256_castpd128_pd256(_mm256_extractf128_pd(t1, 0x1));
 358 }
 359
 360
 361 static void
 362 gmx_mm256_load_3rvec_1ptr_swizzle_pd(const double * gmx_restrict p1,
 363                                      __m256d * gmx_restrict x1, __m256d * gmx_restrict y1, __m256d * gmx_restrict z1,
 364                                      __m256d * gmx_restrict x2, __m256d * gmx_restrict y2, __m256d * gmx_restrict z2,
 365                                      __m256d * gmx_restrict x3, __m256d * gmx_restrict y3, __m256d * gmx_restrict z3)
 366 {
 367     __m256d t1, t2, t3, t4;
 368
 369     t1            = _mm256_loadu_pd(p1);
 370     t3            = _mm256_loadu_pd(p1+4);
 371     *x1           = t1;
 372     *y2           = t3;
 373     t2            = gmx_mm256_unpack128hi_pd(t1, t1);
 374     t4            = gmx_mm256_unpack128hi_pd(t3, t3);
 375     *z1           = t2;
 376     *x3           = t4;
 377     *y1           = _mm256_permute_pd(t1, _GMX_MM_PERMUTE256D(0, 1, 0, 1));
 378     *z2           = _mm256_permute_pd(t3, _GMX_MM_PERMUTE256D(0, 1, 0, 1));
 379     *x2           = _mm256_permute_pd(t2, _GMX_MM_PERMUTE256D(0, 1, 0, 1));
 380     *y3           = _mm256_permute_pd(t4, _GMX_MM_PERMUTE256D(0, 1, 0, 1));
 381     *z3           = _mm256_castpd128_pd256(_mm_load_sd(p1+8));
 382 }
 383
 384 static void
 385 gmx_mm256_load_4rvec_1ptr_swizzle_pd(const double * gmx_restrict p1,
 386                                      __m256d * gmx_restrict x1, __m256d * gmx_restrict y1, __m256d * gmx_restrict z1,
 387                                      __m256d * gmx_restrict x2, __m256d * gmx_restrict y2, __m256d * gmx_restrict z2,
 388                                      __m256d * gmx_restrict x3, __m256d * gmx_restrict y3, __m256d * gmx_restrict z3,
 389                                      __m256d * gmx_restrict x4, __m256d * gmx_restrict y4, __m256d * gmx_restrict z4)
 390 {
 391     __m256d t1, t2, t3, t4, t5, t6;
 392
 393     t1            = _mm256_loadu_pd(p1);
 394     t2            = _mm256_loadu_pd(p1+4);
 395     t3            = _mm256_loadu_pd(p1+8);
 396
 397     t4            = _mm256_castpd128_pd256(_mm256_extractf128_pd(t1, 0x1));
 398     t5            = _mm256_castpd128_pd256(_mm256_extractf128_pd(t2, 0x1));
 399     t6            = _mm256_castpd128_pd256(_mm256_extractf128_pd(t3, 0x1));
 400
 401     *x1           = t1;
 402     *y2           = t2;
 403     *z3           = t3;
 404     *z1           = t4;
 405     *x3           = t5;
 406     *y4           = t6;
 407
 408     *y1           = _mm256_permute_pd(t1, _GMX_MM_PERMUTE256D(0, 1, 0, 1));
 409     *z2           = _mm256_permute_pd(t2, _GMX_MM_PERMUTE256D(0, 1, 0, 1));
 410     *x4           = _mm256_permute_pd(t3, _GMX_MM_PERMUTE256D(0, 1, 0, 1));
 411     *x2           = _mm256_permute_pd(t4, _GMX_MM_PERMUTE256D(0, 1, 0, 1));
 412     *y3           = _mm256_permute_pd(t5, _GMX_MM_PERMUTE256D(0, 1, 0, 1));
 413     *z4           = _mm256_permute_pd(t6, _GMX_MM_PERMUTE256D(0, 1, 0, 1));
 414 }
 415
 416
 417 static void
 418 gmx_mm256_load_1rvec_4ptr_swizzle_pd(const double * gmx_restrict ptrA, const double * gmx_restrict ptrB,
 419                                      const double * gmx_restrict ptrC, const double * gmx_restrict ptrD,
 420                                      __m256d * gmx_restrict x1, __m256d * gmx_restrict y1, __m256d * gmx_restrict z1)
 421 {
 422     __m256d t1, t2, t3, t4, t5, t6;
 423
 424     t1           = _mm256_loadu_pd(ptrA);        /*   -  z1a | y1a x1a */
 425     t2           = _mm256_loadu_pd(ptrB);        /*   -  z1b | y1b x1b */
 426     t3           = _mm256_loadu_pd(ptrC);        /*   -  z1c | y1c x1c */
 427     t4           = _mm256_loadu_pd(ptrD);        /*   -  z1d | y1d x1d */
 428
 429     t5           = _mm256_unpacklo_pd(t1, t2);   /*  z1b z1a | x1b x1a */
 430     t6           = _mm256_unpackhi_pd(t1, t2);   /*   -   -  | y1b y1a */
 431     t1           = _mm256_unpacklo_pd(t3, t4);   /*  z1c z1c | x1d x1c */
 432     t2           = _mm256_unpackhi_pd(t3, t4);   /*   -   -  | y1d y1c */
 433
 434     *x1          = gmx_mm256_unpack128lo_pd(t5, t1);
 435     *y1          = gmx_mm256_unpack128lo_pd(t6, t2);
 436     *z1          = gmx_mm256_unpack128hi_pd(t5, t1);
 437 }
 438
 439
 440
 441 static void
 442 gmx_mm256_load_3rvec_4ptr_swizzle_pd(const double * gmx_restrict ptrA, const double * gmx_restrict ptrB,
 443                                      const double * gmx_restrict ptrC, const double * gmx_restrict ptrD,
 444                                      __m256d * gmx_restrict x1, __m256d * gmx_restrict y1, __m256d * gmx_restrict z1,
 445                                      __m256d * gmx_restrict x2, __m256d * gmx_restrict y2, __m256d * gmx_restrict z2,
 446                                      __m256d * gmx_restrict x3, __m256d * gmx_restrict y3, __m256d * gmx_restrict z3)
 447 {
 448     __m256d t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14;
 449
 450     t1           = _mm256_loadu_pd(ptrA);                       /*  x2a z1a | y1a x1a */
 451     t2           = _mm256_loadu_pd(ptrB);                       /*  x2b z1b | y1b x1b */
 452     t3           = _mm256_loadu_pd(ptrC);                       /*  x2c z1c | y1c x1c */
 453     t4           = _mm256_loadu_pd(ptrD);                       /*  x2d z1d | y1d x1d */
 454     t5           = _mm256_loadu_pd(ptrA+4);                     /*  y3a x3a | z2a y2a */
 455     t6           = _mm256_loadu_pd(ptrB+4);                     /*  y3b x3b | z2b y2b */
 456     t7           = _mm256_loadu_pd(ptrC+4);                     /*  y3c x3c | z2c y2c */
 457     t8           = _mm256_loadu_pd(ptrD+4);                     /*  y3d x3d | z2d y2d */
 458     t9           = _mm256_castpd128_pd256(_mm_load_sd(ptrA+8)); /*   -   -  |  -  z3a */
 459     t10          = _mm256_castpd128_pd256(_mm_load_sd(ptrB+8)); /*   -   -  |  -  z3b */
 460     t11          = _mm256_castpd128_pd256(_mm_load_sd(ptrC+8)); /*   -   -  |  -  z3c */
 461     t12          = _mm256_castpd128_pd256(_mm_load_sd(ptrD+8)); /*   -   -  |  -  z3d */
 462
 463     t13          = _mm256_unpacklo_pd(t1, t2);                  /*  z1b z1a | x1b x1a */
 464     t14          = _mm256_unpackhi_pd(t1, t2);                  /*  x2b x2a | y1b y1a */
 465     t1           = _mm256_unpacklo_pd(t3, t4);                  /*  z1d z1c | x1d x1c */
 466     t2           = _mm256_unpackhi_pd(t3, t4);                  /*  x2d x2c | y1d y1c */
 467
 468     t3           = _mm256_unpacklo_pd(t5, t6);                  /*  x3b x3a | y2b y2a */
 469     t4           = _mm256_unpackhi_pd(t5, t6);                  /*  y3b y3a | z2b z2a */
 470     t5           = _mm256_unpacklo_pd(t7, t8);                  /*  x3d x3c | y2d y2c */
 471     t6           = _mm256_unpackhi_pd(t7, t8);                  /*  y3d y3c | z2d z2c */
 472
 473     t9           = _mm256_unpacklo_pd(t9, t10);                 /*   -   -  | z3b z3a */
 474     t11          = _mm256_unpacklo_pd(t11, t12);                /*   -   -  | z3d z3c */
 475
 476     *x1          = gmx_mm256_unpack128lo_pd(t13, t1);
 477     *y1          = gmx_mm256_unpack128lo_pd(t14, t2);
 478     *z1          = gmx_mm256_unpack128hi_pd(t13, t1);
 479     *x2          = gmx_mm256_unpack128hi_pd(t14, t2);
 480     *y2          = gmx_mm256_unpack128lo_pd(t3, t5);
 481     *z2          = gmx_mm256_unpack128lo_pd(t4, t6);
 482     *x3          = gmx_mm256_unpack128hi_pd(t3, t5);
 483     *y3          = gmx_mm256_unpack128hi_pd(t4, t6);
 484     *z3          = gmx_mm256_unpack128lo_pd(t9, t11);
 485 }
 486
 487
 488
 489 static void
 490 gmx_mm256_load_4rvec_4ptr_swizzle_pd(const double * gmx_restrict ptrA, const double * gmx_restrict ptrB,
 491                                      const double * gmx_restrict ptrC, const double * gmx_restrict ptrD,
 492                                      __m256d * gmx_restrict x1, __m256d * gmx_restrict y1, __m256d * gmx_restrict z1,
 493                                      __m256d * gmx_restrict x2, __m256d * gmx_restrict y2, __m256d * gmx_restrict z2,
 494                                      __m256d * gmx_restrict x3, __m256d * gmx_restrict y3, __m256d * gmx_restrict z3,
 495                                      __m256d * gmx_restrict x4, __m256d * gmx_restrict y4, __m256d * gmx_restrict z4)
 496 {
 497     __m256d t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14;
 498
 499     t1           = _mm256_loadu_pd(ptrA);        /*  x2a z1a | y1a x1a */
 500     t2           = _mm256_loadu_pd(ptrB);        /*  x2b z1b | y1b x1b */
 501     t3           = _mm256_loadu_pd(ptrC);        /*  x2c z1c | y1c x1c */
 502     t4           = _mm256_loadu_pd(ptrD);        /*  x2d z1d | y1d x1d */
 503     t5           = _mm256_loadu_pd(ptrA+4);      /*  y3a x3a | z2a y2a */
 504     t6           = _mm256_loadu_pd(ptrB+4);      /*  y3b x3b | z2b y2b */
 505     t7           = _mm256_loadu_pd(ptrC+4);      /*  y3c x3c | z2c y2c */
 506     t8           = _mm256_loadu_pd(ptrD+4);      /*  y3d x3d | z2d y2d */
 507     t9           = _mm256_loadu_pd(ptrA+8);      /*  z4a y4a | x4a z3a */
 508     t10          = _mm256_loadu_pd(ptrB+8);      /*  z4b y4b | x4b z3b */
 509     t11          = _mm256_loadu_pd(ptrC+8);      /*  z4c y4c | x4c z3c */
 510     t12          = _mm256_loadu_pd(ptrD+8);      /*  z4d y4d | x4d z3d */
 511
 512     t13          = _mm256_unpacklo_pd(t1, t2);   /*  z1b z1a | x1b x1a */
 513     t14          = _mm256_unpackhi_pd(t1, t2);   /*  x2b x2a | y1b y1a */
 514     t1           = _mm256_unpacklo_pd(t3, t4);   /*  z1d z1c | x1d x1c */
 515     t2           = _mm256_unpackhi_pd(t3, t4);   /*  x2d x2c | y1d y1c */
 516
 517     t3           = _mm256_unpacklo_pd(t5, t6);   /*  x3b x3a | y2b y2a */
 518     t4           = _mm256_unpackhi_pd(t5, t6);   /*  y3b y3a | z2b z2a */
 519     t5           = _mm256_unpacklo_pd(t7, t8);   /*  x3d x3c | y2d y2c */
 520     t6           = _mm256_unpackhi_pd(t7, t8);   /*  y3d y3c | z2d z2c */
 521
 522     t7           = _mm256_unpacklo_pd(t9, t10);  /*  y4b y4a | z3b z3a */
 523     t8           = _mm256_unpackhi_pd(t9, t10);  /*  z4b z4a | x4b x4a */
 524     t9           = _mm256_unpacklo_pd(t11, t12); /*  y4d y4c | z3d z3c */
 525     t10          = _mm256_unpackhi_pd(t11, t12); /*  z4d z4c | x4d x4c */
 526
 527     *x1          = gmx_mm256_unpack128lo_pd(t13, t1);
 528     *y1          = gmx_mm256_unpack128lo_pd(t14, t2);
 529     *z1          = gmx_mm256_unpack128hi_pd(t13, t1);
 530     *x2          = gmx_mm256_unpack128hi_pd(t14, t2);
 531     *y2          = gmx_mm256_unpack128lo_pd(t3, t5);
 532     *z2          = gmx_mm256_unpack128lo_pd(t4, t6);
 533     *x3          = gmx_mm256_unpack128hi_pd(t3, t5);
 534     *y3          = gmx_mm256_unpack128hi_pd(t4, t6);
 535     *z3          = gmx_mm256_unpack128lo_pd(t7, t9);
 536     *x4          = gmx_mm256_unpack128lo_pd(t8, t10);
 537     *y4          = gmx_mm256_unpack128hi_pd(t7, t9);
 538     *z4          = gmx_mm256_unpack128hi_pd(t8, t10);
 539 }
 540
 541
 542
 543 static void
 544 gmx_mm256_decrement_1rvec_4ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
 545                                           double * gmx_restrict ptrC, double * gmx_restrict ptrD,
 546                                           __m256d x1, __m256d y1, __m256d z1)
 547 {
 548     __m256d t1, t2, tA, tB, tC, tD;
 549     __m256i mask;
 550
 551     t1          = _mm256_unpacklo_pd(x1, y1);       /*  y1c x1c | y1a x1a */
 552     t2          = _mm256_unpackhi_pd(x1, y1);       /*  y1d x1d | y1b x1b */
 553     x1          = gmx_mm256_unpack128lo_pd(t1, z1); /*  -  z1a | y1a x1a */
 554     y1          = gmx_mm256_unpack128hi_pd(t1, z1); /*  -  z1c | y1c x1c */
 555     z1          = _mm256_permute_pd(z1, _GMX_MM_PERMUTE256D(0, 1, 0, 1));
 556     t1          = gmx_mm256_unpack128lo_pd(t2, z1); /*  -  z1b | y1b x1b */
 557     z1          = gmx_mm256_unpack128hi_pd(t2, z1); /*  -  z1d | y1d x1d */
 558
 559     /* Construct a mask without executing any data loads */
 560     mask        = _mm256_castpd_si256(_mm256_blend_pd(_mm256_setzero_pd(),
 561                                                       _mm256_cmp_pd(_mm256_setzero_pd(), _mm256_setzero_pd(), _CMP_EQ_OQ), 0x7));
 562
 563     tA          = _mm256_loadu_pd(ptrA);
 564     tB          = _mm256_loadu_pd(ptrB);
 565     tC          = _mm256_loadu_pd(ptrC);
 566     tD          = _mm256_loadu_pd(ptrD);
 567
 568     tA          = _mm256_sub_pd(tA, x1);
 569     tB          = _mm256_sub_pd(tB, t1);
 570     tC          = _mm256_sub_pd(tC, y1);
 571     tD          = _mm256_sub_pd(tD, z1);
 572
 573     _mm256_maskstore_pd(ptrA, mask, tA);
 574     _mm256_maskstore_pd(ptrB, mask, tB);
 575     _mm256_maskstore_pd(ptrC, mask, tC);
 576     _mm256_maskstore_pd(ptrD, mask, tD);
 577 }
 578
 579
 580
 581 #if defined (_MSC_VER) && defined(_M_IX86)
 582 /* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
 583 #define gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(ptrA, ptrB, ptrC, ptrD, \
 584                                                   _x1, _y1, _z1, _x2, _y2, _z2, _x3, _y3, _z3) \
 585     { \
 586         __m256d _t1, _t2, _t3, _t4, _t5, _t6, _t7, _t8, _t9, _t10; \
 587         __m128d _tA, _tB, _tC, _tD, _tE; \
 588         _t1          = _mm256_loadu_pd(ptrA); \
 589         _t2          = _mm256_loadu_pd(ptrB); \
 590         _t3          = _mm256_loadu_pd(ptrC); \
 591         _t4          = _mm256_loadu_pd(ptrD); \
 592         _t5          = _mm256_loadu_pd(ptrA+4); \
 593         _t6          = _mm256_loadu_pd(ptrB+4); \
 594         _t7          = _mm256_loadu_pd(ptrC+4); \
 595         _t8          = _mm256_loadu_pd(ptrD+4); \
 596         _tA          = _mm_load_sd(ptrA+8); \
 597         _tB          = _mm_load_sd(ptrB+8); \
 598         _tC          = _mm_load_sd(ptrC+8); \
 599         _tD          = _mm_load_sd(ptrD+8); \
 600         _t9          = _mm256_unpacklo_pd(_x1, _y1); \
 601         _x1          = _mm256_unpackhi_pd(_x1, _y1); \
 602         _y1          = _mm256_unpacklo_pd(_z1, _x2); \
 603         _z1          = _mm256_unpackhi_pd(_z1, _x2); \
 604         _x2          = _mm256_unpacklo_pd(_y2, _z2); \
 605         _y2          = _mm256_unpackhi_pd(_y2, _z2); \
 606         _z2          = _mm256_unpacklo_pd(_x3, _y3); \
 607         _x3          = _mm256_unpackhi_pd(_x3, _y3); \
 608         _t10         = gmx_mm256_unpack128lo_pd(_t9, _y1); \
 609         _y3          = gmx_mm256_unpack128hi_pd(_t9, _y1); \
 610         _t9          = gmx_mm256_unpack128lo_pd(_x1, _z1); \
 611         _y1          = gmx_mm256_unpack128hi_pd(_x1, _z1); \
 612         _x1          = gmx_mm256_unpack128lo_pd(_x2, _z2); \
 613         _z1          = gmx_mm256_unpack128hi_pd(_x2, _z2); \
 614         _x2          = gmx_mm256_unpack128lo_pd(_y2, _x3); \
 615         _z2          = gmx_mm256_unpack128hi_pd(_y2, _x3); \
 616         _t1          = _mm256_sub_pd(_t1, _t10); \
 617         _t2          = _mm256_sub_pd(_t2, _t9); \
 618         _t3          = _mm256_sub_pd(_t3, _y3); \
 619         _t4          = _mm256_sub_pd(_t4, _y1); \
 620         _t5          = _mm256_sub_pd(_t5, _x1); \
 621         _t6          = _mm256_sub_pd(_t6, _x2); \
 622         _t7          = _mm256_sub_pd(_t7, _z1); \
 623         _t8          = _mm256_sub_pd(_t8, _z2); \
 624         _tA          = _mm_sub_sd(_tA, _mm256_castpd256_pd128(_z3)); \
 625         _tB          = _mm_sub_sd(_tB, _mm_permute_pd(_mm256_castpd256_pd128(_z3), _GMX_MM_PERMUTE128D(1, 1))); \
 626         _tE          = _mm256_extractf128_pd(_z3, 0x1); \
 627         _tC          = _mm_sub_sd(_tC, _tE); \
 628         _tD          = _mm_sub_sd(_tD, _mm_permute_pd(_tE, _GMX_MM_PERMUTE128D(1, 1))); \
 629         _mm256_storeu_pd(ptrA, _t1); \
 630         _mm256_storeu_pd(ptrB, _t2); \
 631         _mm256_storeu_pd(ptrC, _t3); \
 632         _mm256_storeu_pd(ptrD, _t4); \
 633         _mm256_storeu_pd(ptrA+4, _t5); \
 634         _mm256_storeu_pd(ptrB+4, _t6); \
 635         _mm256_storeu_pd(ptrC+4, _t7); \
 636         _mm256_storeu_pd(ptrD+4, _t8); \
 637         _mm_store_sd(ptrA+8, _tA); \
 638         _mm_store_sd(ptrB+8, _tB); \
 639         _mm_store_sd(ptrC+8, _tC); \
 640         _mm_store_sd(ptrD+8, _tD); \
 641     }
 642 #else
 643 /* Real function for sane compilers */
 644 static void
 645 gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
 646                                           double * gmx_restrict ptrC, double * gmx_restrict ptrD,
 647                                           __m256d x1, __m256d y1, __m256d z1,
 648                                           __m256d x2, __m256d y2, __m256d z2,
 649                                           __m256d x3, __m256d y3, __m256d z3)
 650 {
 651     __m256d t1, t2, t3, t4, t5, t6, t7, t8, t9, t10;
 652     __m128d tA, tB, tC, tD, tE;
 653
 654     t1          = _mm256_loadu_pd(ptrA);
 655     t2          = _mm256_loadu_pd(ptrB);
 656     t3          = _mm256_loadu_pd(ptrC);
 657     t4          = _mm256_loadu_pd(ptrD);
 658     t5          = _mm256_loadu_pd(ptrA+4);
 659     t6          = _mm256_loadu_pd(ptrB+4);
 660     t7          = _mm256_loadu_pd(ptrC+4);
 661     t8          = _mm256_loadu_pd(ptrD+4);
 662     tA          = _mm_load_sd(ptrA+8);
 663     tB          = _mm_load_sd(ptrB+8);
 664     tC          = _mm_load_sd(ptrC+8);
 665     tD          = _mm_load_sd(ptrD+8);
 666
 667     t9          = _mm256_unpacklo_pd(x1, y1);       /* y1c x1c | y1a x1a */
 668     x1          = _mm256_unpackhi_pd(x1, y1);       /* y1d x1d | y1b x1b */
 669
 670     y1          = _mm256_unpacklo_pd(z1, x2);       /* x2c z1c | x2a z1a */
 671     z1          = _mm256_unpackhi_pd(z1, x2);       /* x2d z1d | x2b z1b */
 672
 673     x2          = _mm256_unpacklo_pd(y2, z2);       /* z2c y2c | z2a y2a */
 674     y2          = _mm256_unpackhi_pd(y2, z2);       /* z2d y2d | z2b y2b */
 675
 676     z2          = _mm256_unpacklo_pd(x3, y3);       /* y3c x3c | y3a x3a */
 677     x3          = _mm256_unpackhi_pd(x3, y3);       /* y3d x3d | y3b x3b */
 678
 679     t10         = gmx_mm256_unpack128lo_pd(t9, y1); /* x2a z1a | y1a x1a */
 680     y3          = gmx_mm256_unpack128hi_pd(t9, y1); /* x2c z1c | y1c x1c */
 681
 682     t9          = gmx_mm256_unpack128lo_pd(x1, z1); /* x2b z1b | y1b x1b */
 683     y1          = gmx_mm256_unpack128hi_pd(x1, z1); /* x2d z1d | y1d x1d */
 684
 685     x1          = gmx_mm256_unpack128lo_pd(x2, z2); /* y3a x3a | z2a y2a */
 686     z1          = gmx_mm256_unpack128hi_pd(x2, z2); /* y3c x3c | z2c y2c */
 687
 688     x2          = gmx_mm256_unpack128lo_pd(y2, x3); /* y3b x3b | z2b y2b */
 689     z2          = gmx_mm256_unpack128hi_pd(y2, x3); /* y3d x3d | z2d y2d */
 690
 691     t1          = _mm256_sub_pd(t1, t10);
 692     t2          = _mm256_sub_pd(t2, t9);
 693     t3          = _mm256_sub_pd(t3, y3);
 694     t4          = _mm256_sub_pd(t4, y1);
 695     t5          = _mm256_sub_pd(t5, x1);
 696     t6          = _mm256_sub_pd(t6, x2);
 697     t7          = _mm256_sub_pd(t7, z1);
 698     t8          = _mm256_sub_pd(t8, z2);
 699
 700     tA          = _mm_sub_sd(tA, _mm256_castpd256_pd128(z3));
 701     tB          = _mm_sub_sd(tB, _mm_permute_pd(_mm256_castpd256_pd128(z3), _GMX_MM_PERMUTE128D(1, 1)));
 702     tE          = _mm256_extractf128_pd(z3, 0x1);
 703     tC          = _mm_sub_sd(tC, tE);
 704     tD          = _mm_sub_sd(tD, _mm_permute_pd(tE, _GMX_MM_PERMUTE128D(1, 1)));
 705
 706     /* Here we store a full 256-bit value and a separate 64-bit one; no overlap can happen */
 707     _mm256_storeu_pd(ptrA, t1);
 708     _mm256_storeu_pd(ptrB, t2);
 709     _mm256_storeu_pd(ptrC, t3);
 710     _mm256_storeu_pd(ptrD, t4);
 711     _mm256_storeu_pd(ptrA+4, t5);
 712     _mm256_storeu_pd(ptrB+4, t6);
 713     _mm256_storeu_pd(ptrC+4, t7);
 714     _mm256_storeu_pd(ptrD+4, t8);
 715     _mm_store_sd(ptrA+8, tA);
 716     _mm_store_sd(ptrB+8, tB);
 717     _mm_store_sd(ptrC+8, tC);
 718     _mm_store_sd(ptrD+8, tD);
 719 }
 720 #endif
 721
 722 #if defined (_MSC_VER) && defined(_M_IX86)
 723 /* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
 724 #define gmx_mm256_decrement_4rvec_4ptr_swizzle_pd(ptrA, ptrB, ptrC, ptrD, \
 725                                                   _x1, _y1, _z1, _x2, _y2, _z2, _x3, _y3, _z3, _x4, _y4, _z4) \
 726     { \
 727         __m256d _t1, _t2, _t3, _t4, _t5, _t6, _t7, _t8, _t9, _t10, _t11, _t12, _t13, _t14; \
 728         __m128d _tA, _tB, _tC, _tD, _tE; \
 729         _t1          = _mm256_loadu_pd(ptrA); \
 730         _t2          = _mm256_loadu_pd(ptrB); \
 731         _t3          = _mm256_loadu_pd(ptrC); \
 732         _t4          = _mm256_loadu_pd(ptrD); \
 733         _t5          = _mm256_loadu_pd(ptrA+4); \
 734         _t6          = _mm256_loadu_pd(ptrB+4); \
 735         _t7          = _mm256_loadu_pd(ptrC+4); \
 736         _t8          = _mm256_loadu_pd(ptrD+4); \
 737         _t9          = _mm256_loadu_pd(ptrA+8); \
 738         _t10         = _mm256_loadu_pd(ptrB+8); \
 739         _t11         = _mm256_loadu_pd(ptrC+8); \
 740         _t12         = _mm256_loadu_pd(ptrD+8); \
 741         _t13         = _mm256_unpacklo_pd(_x1, _y1); \
 742         _x1          = _mm256_unpackhi_pd(_x1, _y1); \
 743         _y1          = _mm256_unpacklo_pd(_z1, _x2); \
 744         _z1          = _mm256_unpackhi_pd(_z1, _x2); \
 745         _x2          = _mm256_unpacklo_pd(_y2, _z2); \
 746         _y2          = _mm256_unpackhi_pd(_y2, _z2); \
 747         _z2          = _mm256_unpacklo_pd(_x3, _y3); \
 748         _x3          = _mm256_unpackhi_pd(_x3, _y3); \
 749         _y3          = _mm256_unpacklo_pd(_z3, _x4); \
 750         _z3          = _mm256_unpackhi_pd(_z3, _x4); \
 751         _x4          = _mm256_unpacklo_pd(_y4, _z4); \
 752         _y4          = _mm256_unpackhi_pd(_y4, _z4); \
 753         _z4          = gmx_mm256_unpack128lo_pd(_t13, _y1); \
 754         _t13         = gmx_mm256_unpack128hi_pd(_t13, _y1); \
 755         _y1          = gmx_mm256_unpack128lo_pd(_x1, _z1); \
 756         _x1          = gmx_mm256_unpack128hi_pd(_x1, _z1); \
 757         _z1          = gmx_mm256_unpack128lo_pd(_x2, _z2); \
 758         _x2          = gmx_mm256_unpack128hi_pd(_x2, _z2); \
 759         _z2          = gmx_mm256_unpack128lo_pd(_y2, _x3); \
 760         _y2          = gmx_mm256_unpack128hi_pd(_y2, _x3); \
 761         _x3          = gmx_mm256_unpack128lo_pd(_y3, _x4); \
 762         _y3          = gmx_mm256_unpack128hi_pd(_y3, _x4); \
 763         _x4          = gmx_mm256_unpack128lo_pd(_z3, _y4); \
 764         _z3          = gmx_mm256_unpack128hi_pd(_z3, _y4); \
 765         _t1          = _mm256_sub_pd(_t1, _z4); \
 766         _t2          = _mm256_sub_pd(_t2, _y1); \
 767         _t3          = _mm256_sub_pd(_t3, _t13); \
 768         _t4          = _mm256_sub_pd(_t4, _x1); \
 769         _t5          = _mm256_sub_pd(_t5, _z1); \
 770         _t6          = _mm256_sub_pd(_t6, _z2); \
 771         _t7          = _mm256_sub_pd(_t7, _x2); \
 772         _t8          = _mm256_sub_pd(_t8, _y2); \
 773         _t9          = _mm256_sub_pd(_t9, _x3); \
 774         _t10         = _mm256_sub_pd(_t10, _x4); \
 775         _t11         = _mm256_sub_pd(_t11, _y3); \
 776         _t12         = _mm256_sub_pd(_t12, _z3); \
 777         _mm256_storeu_pd(ptrA, _t1); \
 778         _mm256_storeu_pd(ptrB, _t2); \
 779         _mm256_storeu_pd(ptrC, _t3); \
 780         _mm256_storeu_pd(ptrD, _t4); \
 781         _mm256_storeu_pd(ptrA+4, _t5); \
 782         _mm256_storeu_pd(ptrB+4, _t6); \
 783         _mm256_storeu_pd(ptrC+4, _t7); \
 784         _mm256_storeu_pd(ptrD+4, _t8); \
 785         _mm256_storeu_pd(ptrA+8, _t9); \
 786         _mm256_storeu_pd(ptrB+8, _t10); \
 787         _mm256_storeu_pd(ptrC+8, _t11); \
 788         _mm256_storeu_pd(ptrD+8, _t12); \
 789     }
 790 #else
 791 /* Real function for sane compilers */
 792 static void
 793 gmx_mm256_decrement_4rvec_4ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
 794                                           double * gmx_restrict ptrC, double * gmx_restrict ptrD,
 795                                           __m256d x1, __m256d y1, __m256d z1,
 796                                           __m256d x2, __m256d y2, __m256d z2,
 797                                           __m256d x3, __m256d y3, __m256d z3,
 798                                           __m256d x4, __m256d y4, __m256d z4)
 799 {
 800     __m256d t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14;
 801     __m128d tA, tB, tC, tD, tE;
 802
 803     t1          = _mm256_loadu_pd(ptrA);
 804     t2          = _mm256_loadu_pd(ptrB);
 805     t3          = _mm256_loadu_pd(ptrC);
 806     t4          = _mm256_loadu_pd(ptrD);
 807     t5          = _mm256_loadu_pd(ptrA+4);
 808     t6          = _mm256_loadu_pd(ptrB+4);
 809     t7          = _mm256_loadu_pd(ptrC+4);
 810     t8          = _mm256_loadu_pd(ptrD+4);
 811     t9          = _mm256_loadu_pd(ptrA+8);
 812     t10         = _mm256_loadu_pd(ptrB+8);
 813     t11         = _mm256_loadu_pd(ptrC+8);
 814     t12         = _mm256_loadu_pd(ptrD+8);
 815
 816     t13         = _mm256_unpacklo_pd(x1, y1);        /* y1c x1c | y1a x1a */
 817     x1          = _mm256_unpackhi_pd(x1, y1);        /* y1d x1d | y1b x1b */
 818     y1          = _mm256_unpacklo_pd(z1, x2);        /* x2c z1c | x2a z1a */
 819     z1          = _mm256_unpackhi_pd(z1, x2);        /* x2d z1d | x2b z1b */
 820     x2          = _mm256_unpacklo_pd(y2, z2);        /* z2c y2c | z2a y2a */
 821     y2          = _mm256_unpackhi_pd(y2, z2);        /* z2d y2d | z2b y2b */
 822     z2          = _mm256_unpacklo_pd(x3, y3);        /* y3c x3c | y3a x3a */
 823     x3          = _mm256_unpackhi_pd(x3, y3);        /* y3d x3d | y3b x3b */
 824     y3          = _mm256_unpacklo_pd(z3, x4);        /* x4c z3c | x4a z3a */
 825     z3          = _mm256_unpackhi_pd(z3, x4);        /* x4d z3d | x4b z3b */
 826     x4          = _mm256_unpacklo_pd(y4, z4);        /* z4c y4c | z4a y4a */
 827     y4          = _mm256_unpackhi_pd(y4, z4);        /* z4d y4d | z4b y4b */
 828
 829     z4          = gmx_mm256_unpack128lo_pd(t13, y1); /* x2a z1a | y1a x1a */
 830     t13         = gmx_mm256_unpack128hi_pd(t13, y1); /* x2c z1c | y1c x1c */
 831     y1          = gmx_mm256_unpack128lo_pd(x1, z1);  /* x2b z1b | y1b x1b */
 832     x1          = gmx_mm256_unpack128hi_pd(x1, z1);  /* x2d z1d | y1d x1d */
 833     z1          = gmx_mm256_unpack128lo_pd(x2, z2);  /* y3a x3a | z2a y2a */
 834     x2          = gmx_mm256_unpack128hi_pd(x2, z2);  /* y3c x3c | z2c y2c */
 835     z2          = gmx_mm256_unpack128lo_pd(y2, x3);  /* y3b x3b | z2b y2b */
 836     y2          = gmx_mm256_unpack128hi_pd(y2, x3);  /* y3d x3d | z2d y2d */
 837     x3          = gmx_mm256_unpack128lo_pd(y3, x4);  /* z4a y4a | x4a z3a */
 838     y3          = gmx_mm256_unpack128hi_pd(y3, x4);  /* z4c y4c | x4c z3c */
 839     x4          = gmx_mm256_unpack128lo_pd(z3, y4);  /* z4b y4b | x4b z3b */
 840     z3          = gmx_mm256_unpack128hi_pd(z3, y4);  /* z4d y4d | x4d z3d */
 841
 842     t1          = _mm256_sub_pd(t1, z4);
 843     t2          = _mm256_sub_pd(t2, y1);
 844     t3          = _mm256_sub_pd(t3, t13);
 845     t4          = _mm256_sub_pd(t4, x1);
 846     t5          = _mm256_sub_pd(t5, z1);
 847     t6          = _mm256_sub_pd(t6, z2);
 848     t7          = _mm256_sub_pd(t7, x2);
 849     t8          = _mm256_sub_pd(t8, y2);
 850     t9          = _mm256_sub_pd(t9, x3);
 851     t10         = _mm256_sub_pd(t10, x4);
 852     t11         = _mm256_sub_pd(t11, y3);
 853     t12         = _mm256_sub_pd(t12, z3);
 854
 855     /* Here we store a full 256-bit value and a separate 128-bit one; no overlap can happen */
 856     _mm256_storeu_pd(ptrA, t1);
 857     _mm256_storeu_pd(ptrB, t2);
 858     _mm256_storeu_pd(ptrC, t3);
 859     _mm256_storeu_pd(ptrD, t4);
 860     _mm256_storeu_pd(ptrA+4, t5);
 861     _mm256_storeu_pd(ptrB+4, t6);
 862     _mm256_storeu_pd(ptrC+4, t7);
 863     _mm256_storeu_pd(ptrD+4, t8);
 864     _mm256_storeu_pd(ptrA+8, t9);
 865     _mm256_storeu_pd(ptrB+8, t10);
 866     _mm256_storeu_pd(ptrC+8, t11);
 867     _mm256_storeu_pd(ptrD+8, t12);
 868 }
 869 #endif
 870
 871
 872
 873
 874
 875 static gmx_inline void
 876 gmx_mm256_update_iforce_1atom_swizzle_pd(__m256d fix1, __m256d fiy1, __m256d fiz1,
 877                                          double * gmx_restrict fptr,
 878                                          double * gmx_restrict fshiftptr)
 879 {
 880     __m256d t1, t2;
 881     __m128d tA, tB;
 882     fix1 = _mm256_hadd_pd(fix1, fiy1);
 883     fiz1 = _mm256_hadd_pd(fiz1, _mm256_setzero_pd());
 884
 885     /* Add across the two lanes */
 886     tA   = _mm_add_pd(_mm256_castpd256_pd128(fix1), _mm256_extractf128_pd(fix1, 0x1));
 887     tB   = _mm_add_pd(_mm256_castpd256_pd128(fiz1), _mm256_extractf128_pd(fiz1, 0x1));
 888
 889     fix1 = gmx_mm256_set_m128d(tB, tA); /* 0 fiz fiy fix */
 890
 891     t1   = _mm256_loadu_pd(fptr);
 892     t2   = _mm256_loadu_pd(fshiftptr);
 893
 894     t1   = _mm256_add_pd(t1, fix1);
 895     t2   = _mm256_add_pd(t2, fix1);
 896
 897     _mm256_storeu_pd(fptr, t1);
 898     _mm256_storeu_pd(fshiftptr, t2);
 899 }
 900
 901
 902
 903 #if defined (_MSC_VER) && defined(_M_IX86)
 904 /* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
 905 #define gmx_mm256_update_iforce_3atom_swizzle_pd(fix1, fiy1, fiz1, fix2, fiy2, fiz2, fix3, fiy3, fiz3, \
 906                                                  fptr, fshiftptr) \
 907     { \
 908         __m256d _t1, _t2, _t3, _t4; \
 909         __m128d _tz3, _tA, _tB, _tC, _tD; \
 910         fix1  = _mm256_hadd_pd(fix1, fiy1); \
 911         fiz1  = _mm256_hadd_pd(fiz1, fix2); \
 912         fiy2  = _mm256_hadd_pd(fiy2, fiz2); \
 913         fix3  = _mm256_hadd_pd(fix3, fiy3); \
 914         fiz3  = _mm256_hadd_pd(fiz3, _mm256_setzero_pd()); \
 915         _t1   = gmx_mm256_unpack128lo_pd(fix1, fiz1); \
 916         _t2   = gmx_mm256_unpack128hi_pd(fix1, fiz1); \
 917         _t1   = _mm256_add_pd(_t1, _t2); \
 918         _t3   = gmx_mm256_unpack128lo_pd(fiy2, fix3); \
 919         _t4   = gmx_mm256_unpack128hi_pd(fiy2, fix3); \
 920         _t3   = _mm256_add_pd(_t3, _t4); \
 921         _tz3  = _mm_add_pd(_mm256_castpd256_pd128(fiz3), _mm256_extractf128_pd(fiz3, 0x1)); \
 922         _t2   = _mm256_loadu_pd(fptr); \
 923         _t4   = _mm256_loadu_pd(fptr+4); \
 924         _tA   = _mm_load_sd(fptr+8); \
 925         _t2   = _mm256_add_pd(_t2, _t1); \
 926         _t4   = _mm256_add_pd(_t4, _t3); \
 927         _tA   = _mm_add_sd(_tA, _tz3); \
 928         _mm256_storeu_pd(fptr, _t2); \
 929         _mm256_storeu_pd(fptr+4, _t4); \
 930         _mm_store_sd(fptr+8, _tA); \
 931         _tB   = _mm256_extractf128_pd(_t1, 0x1); \
 932         _tC   = _mm256_extractf128_pd(_t3, 0x1); \
 933         _tz3  = _mm_add_sd(_tz3, _tB); \
 934         _tD   = _mm_permute_pd(_mm256_castpd256_pd128(_t3), _GMX_MM_PERMUTE128D(1, 1)); \
 935         _tz3  = _mm_add_sd(_tz3, _tD); \
 936         _tC   = _mm_add_pd(_tC, _mm256_castpd256_pd128(_t1)); \
 937         _tD   = _mm_shuffle_pd(_tB, _mm256_castpd256_pd128(_t3), _MM_SHUFFLE2(0, 1)); \
 938         _tC   = _mm_add_pd(_tC, _tD); \
 939         _tA   = _mm_loadu_pd(fshiftptr); \
 940         _tB   = _mm_load_sd(fshiftptr+2); \
 941         _tA   = _mm_add_pd(_tA, _tC); \
 942         _tB   = _mm_add_sd(_tB, _tz3); \
 943         _mm_storeu_pd(fshiftptr, _tA); \
 944         _mm_store_sd(fshiftptr+2, _tB); \
 945     }
 946 #else
 947 /* Real function for sane compilers */
 948 static gmx_inline void
 949 gmx_mm256_update_iforce_3atom_swizzle_pd(__m256d fix1, __m256d fiy1, __m256d fiz1,
 950                                          __m256d fix2, __m256d fiy2, __m256d fiz2,
 951                                          __m256d fix3, __m256d fiy3, __m256d fiz3,
 952                                          double * gmx_restrict fptr,
 953                                          double * gmx_restrict fshiftptr)
 954 {
 955     __m256d t1, t2, t3, t4;
 956     __m128d tz3, tA, tB, tC, tD;
 957
 958     fix1 = _mm256_hadd_pd(fix1, fiy1);                /*  Y1c-d X1c-d | Y1a-b X1a-b */
 959     fiz1 = _mm256_hadd_pd(fiz1, fix2);                /*  X2c-d Z1c-d | X2a-b Z1a-b */
 960     fiy2 = _mm256_hadd_pd(fiy2, fiz2);                /*  Z2c-d Y2c-d | Z2a-b Y2a-b */
 961     fix3 = _mm256_hadd_pd(fix3, fiy3);                /*  Y3c-d X3c-d | Y3a-b X3a-b */
 962     fiz3 = _mm256_hadd_pd(fiz3, _mm256_setzero_pd()); /*  0     Z3c-d | 0     Z3a-b */
 963
 964     /* Add across the two lanes by swapping and adding back */
 965     t1   = gmx_mm256_unpack128lo_pd(fix1, fiz1);                                       /* X2a-b Z1a-b | Y1a-b X1a-b */
 966     t2   = gmx_mm256_unpack128hi_pd(fix1, fiz1);                                       /* X2c-d Z1c-d | Y1c-d X1c-d */
 967     t1   = _mm256_add_pd(t1, t2);                                                      /* x2 z1 | y1 x1 */
 968
 969     t3   = gmx_mm256_unpack128lo_pd(fiy2, fix3);                                       /* Y3a-b X3a-b | Z2a-b Y2a-b */
 970     t4   = gmx_mm256_unpack128hi_pd(fiy2, fix3);                                       /* Y3c-d X3c-d | Z2c-d Y2c-d */
 971     t3   = _mm256_add_pd(t3, t4);                                                      /* y3 x3 | z2 y2 */
 972
 973     tz3  = _mm_add_pd(_mm256_castpd256_pd128(fiz3), _mm256_extractf128_pd(fiz3, 0x1)); /* 0 z3 */
 974
 975     t2   = _mm256_loadu_pd(fptr);
 976     t4   = _mm256_loadu_pd(fptr+4);
 977     tA   = _mm_load_sd(fptr+8);
 978
 979     t2   = _mm256_add_pd(t2, t1);
 980     t4   = _mm256_add_pd(t4, t3);
 981     tA   = _mm_add_sd(tA, tz3);
 982
 983     _mm256_storeu_pd(fptr, t2);
 984     _mm256_storeu_pd(fptr+4, t4);
 985     _mm_store_sd(fptr+8, tA);
 986
 987     /* Add up shift force */
 988     /* t1:   x2 z1 | y1 x1 */
 989     /* t3:   y3 x3 | z2 y2 */
 990     /* tz3:           0 z3 */
 991
 992     /* z component */
 993     tB   = _mm256_extractf128_pd(t1, 0x1);                                     /* x2 z1 */
 994     tC   = _mm256_extractf128_pd(t3, 0x1);                                     /* y3 x3 */
 995     tz3  = _mm_add_sd(tz3, tB);                                                /* 0  z1+z3 */
 996     tD   = _mm_permute_pd(_mm256_castpd256_pd128(t3), _GMX_MM_PERMUTE128D(1, 1));
 997     tz3  = _mm_add_sd(tz3, tD);                                                /* - z */
 998
 999     tC   = _mm_add_pd(tC, _mm256_castpd256_pd128(t1));                         /* y1+y3 x1+x3 */
1000
1001     tD   = _mm_shuffle_pd(tB, _mm256_castpd256_pd128(t3), _MM_SHUFFLE2(0, 1)); /* y2 x2 */
1002     tC   = _mm_add_pd(tC, tD);                                                 /* y x */
1003
1004     tA   = _mm_loadu_pd(fshiftptr);
1005     tB   = _mm_load_sd(fshiftptr+2);
1006     tA   = _mm_add_pd(tA, tC);
1007     tB   = _mm_add_sd(tB, tz3);
1008     _mm_storeu_pd(fshiftptr, tA);
1009     _mm_store_sd(fshiftptr+2, tB);
1010 }
1011 #endif
1012
1013
1014 #if defined (_MSC_VER) && defined(_M_IX86)
1015 /* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
1016 #define gmx_mm256_update_iforce_4atom_swizzle_pd(fix1, fiy1, fiz1, fix2, fiy2, fiz2, fix3, fiy3, fiz3, fix4, fiy4, fiz4, \
1017                                                  fptr, fshiftptr) \
1018     { \
1019         __m256d _t1, _t2, _t3, _t4, _t5, _t6; \
1020         __m128d _tA, _tB, _tC, _tD; \
1021         fix1  = _mm256_hadd_pd(fix1, fiy1); \
1022         fiz1  = _mm256_hadd_pd(fiz1, fix2); \
1023         fiy2  = _mm256_hadd_pd(fiy2, fiz2); \
1024         fix3  = _mm256_hadd_pd(fix3, fiy3); \
1025         fiz3  = _mm256_hadd_pd(fiz3, fix4); \
1026         fiy4  = _mm256_hadd_pd(fiy4, fiz4); \
1027         _t1   = gmx_mm256_unpack128lo_pd(fix1, fiz1); \
1028         _t2   = gmx_mm256_unpack128hi_pd(fix1, fiz1); \
1029         _t1   = _mm256_add_pd(_t1, _t2); \
1030         _t3   = gmx_mm256_unpack128lo_pd(fiy2, fix3); \
1031         _t4   = gmx_mm256_unpack128hi_pd(fiy2, fix3); \
1032         _t3   = _mm256_add_pd(_t3, _t4); \
1033         _t5   = gmx_mm256_unpack128lo_pd(fiz3, fiy4); \
1034         _t6   = gmx_mm256_unpack128hi_pd(fiz3, fiy4); \
1035         _t5   = _mm256_add_pd(_t5, _t6); \
1036         _t2   = _mm256_loadu_pd(fptr); \
1037         _t4   = _mm256_loadu_pd(fptr+4); \
1038         _t6   = _mm256_loadu_pd(fptr+8); \
1039         _t2   = _mm256_add_pd(_t2, _t1); \
1040         _t4   = _mm256_add_pd(_t4, _t3); \
1041         _t6   = _mm256_add_pd(_t6, _t5); \
1042         _mm256_storeu_pd(fptr, _t2); \
1043         _mm256_storeu_pd(fptr+4, _t4); \
1044         _mm256_storeu_pd(fptr+8, _t6); \
1045         _tA   = _mm256_extractf128_pd(_t1, 0x1); \
1046         _tB   = _mm256_extractf128_pd(_t3, 0x1); \
1047         _tC   = _mm256_extractf128_pd(_t5, 0x1); \
1048         _tB   = _mm_add_pd(_tB, _mm256_castpd256_pd128(_t1)); \
1049         _tA   = _mm_add_pd(_tA, _mm256_castpd256_pd128(_t5)); \
1050         _tC   = _mm_add_pd(_tC, _mm256_castpd256_pd128(_t3)); \
1051         _tD   = _mm_shuffle_pd(_tA, _tC, _MM_SHUFFLE2(0, 1)); \
1052         _tB   = _mm_add_pd(_tB, _tD); \
1053         _tC   = _mm_permute_pd(_tC, _GMX_MM_PERMUTE128D(1, 1)); \
1054         _tC   = _mm_add_sd(_tC, _tA); \
1055         _tA   = _mm_loadu_pd(fshiftptr); \
1056         _tD   = _mm_load_sd(fshiftptr+2); \
1057         _tA   = _mm_add_pd(_tA, _tB); \
1058         _tD   = _mm_add_sd(_tD, _tC); \
1059         _mm_storeu_pd(fshiftptr, _tA); \
1060         _mm_store_sd(fshiftptr+2, _tD); \
1061     }
1062 #else
1063 /* Real function for sane compilers */
1064 static gmx_inline void
1065 gmx_mm256_update_iforce_4atom_swizzle_pd(__m256d fix1, __m256d fiy1, __m256d fiz1,
1066                                          __m256d fix2, __m256d fiy2, __m256d fiz2,
1067                                          __m256d fix3, __m256d fiy3, __m256d fiz3,
1068                                          __m256d fix4, __m256d fiy4, __m256d fiz4,
1069                                          double * gmx_restrict fptr,
1070                                          double * gmx_restrict fshiftptr)
1071 {
1072     __m256d t1, t2, t3, t4, t5, t6;
1073     __m128d tA, tB, tC, tD;
1074
1075     fix1 = _mm256_hadd_pd(fix1, fiy1);                /*  Y1c-d X1c-d | Y1a-b X1a-b */
1076     fiz1 = _mm256_hadd_pd(fiz1, fix2);                /*  X2c-d Z1c-d | X2a-b Z1a-b */
1077     fiy2 = _mm256_hadd_pd(fiy2, fiz2);                /*  Z2c-d Y2c-d | Z2a-b Y2a-b */
1078     fix3 = _mm256_hadd_pd(fix3, fiy3);                /*  Y3c-d X3c-d | Y3a-b X3a-b */
1079     fiz3 = _mm256_hadd_pd(fiz3, fix4);                /*  X4c-d Z3c-d | X4a-b Z3a-b */
1080     fiy4 = _mm256_hadd_pd(fiy4, fiz4);                /*  Z4c-d Y4c-d | Z4a-b Y4a-b */
1081
1082     /* Add across the two lanes by swapping and adding back */
1083     t1   = gmx_mm256_unpack128lo_pd(fix1, fiz1); /* X2a-b Z1a-b | Y1a-b X1a-b */
1084     t2   = gmx_mm256_unpack128hi_pd(fix1, fiz1); /* X2c-d Z1c-d | Y1c-d X1c-d */
1085     t1   = _mm256_add_pd(t1, t2);                /* x2 z1 | y1 x1 */
1086
1087     t3   = gmx_mm256_unpack128lo_pd(fiy2, fix3); /* Y3a-b X3a-b | Z2a-b Y2a-b */
1088     t4   = gmx_mm256_unpack128hi_pd(fiy2, fix3); /* Y3c-d X3c-d | Z2c-d Y2c-d */
1089     t3   = _mm256_add_pd(t3, t4);                /* y3 x3 | z2 y2 */
1090
1091     t5   = gmx_mm256_unpack128lo_pd(fiz3, fiy4); /* Z4a-b Y4a-b | X4a-b Z3a-b */
1092     t6   = gmx_mm256_unpack128hi_pd(fiz3, fiy4); /* Z4c-d Y4c-d | X4c-d Z3c-d */
1093     t5   = _mm256_add_pd(t5, t6);                /* z4 y4 | x4 z3 */
1094
1095     t2   = _mm256_loadu_pd(fptr);
1096     t4   = _mm256_loadu_pd(fptr+4);
1097     t6   = _mm256_loadu_pd(fptr+8);
1098
1099     t2   = _mm256_add_pd(t2, t1);
1100     t4   = _mm256_add_pd(t4, t3);
1101     t6   = _mm256_add_pd(t6, t5);
1102
1103     _mm256_storeu_pd(fptr, t2);
1104     _mm256_storeu_pd(fptr+4, t4);
1105     _mm256_storeu_pd(fptr+8, t6);
1106
1107     /* Add up shift force  */
1108     /* t1:   x2. z1. | y1. x1. */
1109     /* t3:   y3. x3. | z2 y2 */
1110     /* t5:   z4 y4 | x4. z3. */
1111
1112     /* z component */
1113     tA   = _mm256_extractf128_pd(t1, 0x1);                /* x2 z1 */
1114     tB   = _mm256_extractf128_pd(t3, 0x1);                /* y3 x3 */
1115     tC   = _mm256_extractf128_pd(t5, 0x1);                /* z4 y4 */
1116
1117     tB   = _mm_add_pd(tB, _mm256_castpd256_pd128(t1));    /*  y1+y3  x1+x3 */
1118     tA   = _mm_add_pd(tA, _mm256_castpd256_pd128(t5));    /*  x2+x4  z1+z3 */
1119     tC   = _mm_add_pd(tC, _mm256_castpd256_pd128(t3));    /*  z4+z2  y4+y2 */
1120
1121     tD   = _mm_shuffle_pd(tA, tC, _MM_SHUFFLE2(0, 1));    /* y4+y2 x2+x4 */
1122     tB   = _mm_add_pd(tB, tD);                            /* y x */
1123     tC   = _mm_permute_pd(tC, _GMX_MM_PERMUTE128D(1, 1)); /*    - z4+z2 */
1124     tC   = _mm_add_sd(tC, tA);                            /* - z */
1125
1126     tA   = _mm_loadu_pd(fshiftptr);
1127     tD   = _mm_load_sd(fshiftptr+2);
1128     tA   = _mm_add_pd(tA, tB);
1129     tD   = _mm_add_sd(tD, tC);
1130     _mm_storeu_pd(fshiftptr, tA);
1131     _mm_store_sd(fshiftptr+2, tD);
1132 }
1133 #endif
1134
1135
1136
1137 static void
1138 gmx_mm256_update_1pot_pd(__m256d pot1, double * gmx_restrict ptrA)
1139 {
1140     __m128d t1;
1141
1142     pot1 = _mm256_hadd_pd(pot1, pot1);
1143
1144     t1   = _mm_add_pd(_mm256_castpd256_pd128(pot1), _mm256_extractf128_pd(pot1, 0x1));
1145
1146     _mm_store_sd(ptrA, _mm_add_sd(_mm_load_sd(ptrA), t1));
1147 }
1148
1149 static void
1150 gmx_mm256_update_2pot_pd(__m256d pot1, double * gmx_restrict ptrA,
1151                          __m256d pot2, double * gmx_restrict ptrB)
1152 {
1153     __m128d t1, t2;
1154
1155     pot1 = _mm256_hadd_pd(pot1, pot2);
1156
1157     t1   = _mm_add_pd(_mm256_castpd256_pd128(pot1), _mm256_extractf128_pd(pot1, 0x1));
1158
1159     t2   = _mm_permute_pd(t1, _GMX_MM_PERMUTE128D(1, 1));
1160     _mm_store_sd(ptrA, _mm_add_sd(_mm_load_sd(ptrA), t1));
1161     _mm_store_sd(ptrB, _mm_add_sd(_mm_load_sd(ptrB), t2));
1162 }
1163
1164
1165 #endif /* _kernelutil_x86_avx_256_double_h_ */