src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_double/kernelutil_x86_avx_256_double.h

   1 /*
   2  *                This source code is part of
   3  *
   4  *                 G   R   O   M   A   C   S
   5  *
   6  * Copyright (c) 2011-2012, The GROMACS Development Team
   7  *
   8  * Gromacs is a library for molecular simulation and trajectory analysis,
   9  * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
  10  * a full list of developers and information, check out http://www.gromacs.org
  11  *
  12  * This program is free software; you can redistribute it and/or modify it under
  13  * the terms of the GNU Lesser General Public License as published by the Free
  14  * Software Foundation; either version 2 of the License, or (at your option) any
  15  * later version.
  16  * As a special exception, you may use this file as part of a free software
  17  * library without restriction.  Specifically, if other files instantiate
  18  * templates or use macros or inline functions from this file, or you compile
  19  * this file and link it with other files to produce an executable, this
  20  * file does not by itself cause the resulting executable to be covered by
  21  * the GNU Lesser General Public License.
  22  *
  23  * In plain-speak: do not worry about classes/macros/templates either - only
  24  * changes to the library have to be LGPL, not an application linking with it.
  25  *
  26  * To help fund GROMACS development, we humbly ask that you cite
  27  * the papers people have written on it - you can find them on the website!
  28  */
  29 #ifndef _kernelutil_x86_avx_256_double_h_
  30 #define _kernelutil_x86_avx_256_double_h_
  31
  32
  33 #include "gmx_x86_avx_256.h"
  34
  35
  36 static int
  37 gmx_mm256_any_lt(__m256d a, __m256d b)
  38 {
  39     return _mm256_movemask_pd(_mm256_cmp_pd(a, b, _CMP_LT_OQ));
  40 }
  41
  42 static gmx_inline __m256d
  43 gmx_mm256_calc_rsq_pd(__m256d dx, __m256d dy, __m256d dz)
  44 {
  45     return _mm256_add_pd( _mm256_add_pd( _mm256_mul_pd(dx, dx), _mm256_mul_pd(dy, dy) ), _mm256_mul_pd(dz, dz) );
  46 }
  47
  48 /* Normal sum of four ymm registers */
  49 #define gmx_mm256_sum4_pd(t0, t1, t2, t3)  _mm256_add_pd(_mm256_add_pd(t0, t1), _mm256_add_pd(t2, t3))
  50
  51
/* Load a single value from each of 1-4 places, merge them into one ymm register */
  53 static __m256d
  54 gmx_mm256_load_1real_pd(const double * gmx_restrict ptrA)
  55 {
  56     return _mm256_castpd128_pd256(_mm_load_sd(ptrA));
  57 }
  58
  59 static __m256d
  60 gmx_mm256_load_2real_swizzle_pd(const double * gmx_restrict ptrA,
  61                                 const double * gmx_restrict ptrB)
  62 {
  63     __m128d tA, tB;
  64
  65     tA = _mm_load_sd(ptrA);
  66     tB = _mm_load_sd(ptrB);
  67
  68     return _mm256_castpd128_pd256(_mm_unpacklo_pd(tA, tB));
  69 }
  70
  71
  72 static __m256d
  73 gmx_mm256_load_4real_swizzle_pd(const double * gmx_restrict ptrA, const double * gmx_restrict ptrB,
  74                                 const double * gmx_restrict ptrC, const double * gmx_restrict ptrD)
  75 {
  76     __m128d t1, t2;
  77
  78     t1 = _mm_unpacklo_pd(_mm_load_sd(ptrA), _mm_load_sd(ptrB));
  79     t2 = _mm_unpacklo_pd(_mm_load_sd(ptrC), _mm_load_sd(ptrD));
  80     return gmx_mm256_set_m128d(t2, t1);
  81 }
  82
  83
  84
  85 static void
  86 gmx_mm256_store_1real_pd(double * gmx_restrict ptrA, __m256d xmm1)
  87 {
  88     _mm_store_sd(ptrA, _mm256_castpd256_pd128(xmm1));
  89 }
  90
  91
  92 static void
  93 gmx_mm256_store_2real_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB, __m256d xmm1)
  94 {
  95     __m256d t2;
  96
  97     t2       = _mm256_permute_pd(xmm1, _GMX_MM_PERMUTE256D(1, 1, 1, 1));
  98     _mm_store_sd(ptrA, _mm256_castpd256_pd128(xmm1));
  99     _mm_store_sd(ptrB, _mm256_castpd256_pd128(t2));
 100 }
 101
 102
 103
 104
 105 static void
 106 gmx_mm256_store_4real_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
 107                                  double * gmx_restrict ptrC, double * gmx_restrict ptrD, __m256d xmm1)
 108 {
 109     __m256d t2;
 110     __m128d t3, t4;
 111
 112     t2       = _mm256_permute_pd(xmm1, _GMX_MM_PERMUTE256D(1, 1, 1, 1));
 113     t3       = _mm256_extractf128_pd(xmm1, 0x1);
 114     t4       = _mm_permute_pd(t3, _GMX_MM_PERMUTE128D(1, 1));
 115     _mm_store_sd(ptrA, _mm256_castpd256_pd128(xmm1));
 116     _mm_store_sd(ptrB, _mm256_castpd256_pd128(t2));
 117     _mm_store_sd(ptrC, t3);
 118     _mm_store_sd(ptrD, t4);
 119 }
 120
 121
 122
 123
 124 static void
 125 gmx_mm256_increment_1real_pd(double * gmx_restrict ptrA, __m256d xmm1)
 126 {
 127     __m128d t1;
 128
 129     t1   = _mm256_castpd256_pd128(xmm1);
 130     t1   = _mm_add_sd(t1, _mm_load_sd(ptrA));
 131
 132     _mm_store_sd(ptrA, t1);
 133 }
 134
 135
 136 static void
 137 gmx_mm256_increment_2real_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB, __m256d xmm1)
 138 {
 139     __m128d t1, t2;
 140
 141     t1   = _mm256_castpd256_pd128(xmm1);
 142     t2   = _mm_permute_pd(t1, _GMX_MM_PERMUTE128D(1, 1));
 143
 144     t1   = _mm_add_sd(t1, _mm_load_sd(ptrA));
 145     t2   = _mm_add_sd(t2, _mm_load_sd(ptrB));
 146
 147     _mm_store_sd(ptrA, t1);
 148     _mm_store_sd(ptrB, t2);
 149 }
 150
 151
 152 static void
 153 gmx_mm256_increment_4real_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
 154                                      double * gmx_restrict ptrC, double * gmx_restrict ptrD, __m256d xmm1)
 155 {
 156     __m128d t1, t2, t3, t4;
 157
 158     t1   = _mm256_castpd256_pd128(xmm1);
 159     t2   = _mm_permute_pd(t1, _GMX_MM_PERMUTE128D(1, 1));
 160     t3   = _mm256_extractf128_pd(xmm1, 0x1);
 161     t4   = _mm_permute_pd(t3, _GMX_MM_PERMUTE128D(1, 1));
 162
 163     t1   = _mm_add_sd(t1, _mm_load_sd(ptrA));
 164     t2   = _mm_add_sd(t2, _mm_load_sd(ptrB));
 165     t3   = _mm_add_sd(t3, _mm_load_sd(ptrC));
 166     t4   = _mm_add_sd(t4, _mm_load_sd(ptrD));
 167
 168     _mm_store_sd(ptrA, t1);
 169     _mm_store_sd(ptrB, t2);
 170     _mm_store_sd(ptrC, t3);
 171     _mm_store_sd(ptrD, t4);
 172 }
 173
 174
 175
 176 static void
 177 gmx_mm256_load_1pair_swizzle_pd(const double * gmx_restrict p1, __m256d *c6, __m256d *c12)
 178 {
 179     *c6     = _mm256_castpd128_pd256(_mm_load_sd(p1));
 180     *c12    = _mm256_castpd128_pd256(_mm_load_sd(p1+1));
 181 }
 182
 183
 184 static void
 185 gmx_mm256_load_2pair_swizzle_pd(const double * gmx_restrict p1, const double * gmx_restrict p2, __m256d *c6, __m256d *c12)
 186 {
 187     __m128d t1, t2, t3;
 188
 189     t1   = _mm_loadu_pd(p1);
 190     t2   = _mm_loadu_pd(p2);
 191     *c6  = _mm256_castpd128_pd256(_mm_unpacklo_pd(t1, t2));
 192     *c12 = _mm256_castpd128_pd256(_mm_unpackhi_pd(t1, t2));
 193 }
 194
 195
 196
 197 static void
 198 gmx_mm256_load_4pair_swizzle_pd(const double * gmx_restrict p1, const double * gmx_restrict p2,
 199                                 const double * gmx_restrict p3, const double * gmx_restrict p4,
 200                                 __m256d * gmx_restrict c6, __m256d * gmx_restrict c12)
 201 {
 202     __m256d t1, t2;
 203
 204     t1   = gmx_mm256_set_m128d(_mm_loadu_pd(p3), _mm_loadu_pd(p1)); /* c12c  c6c | c12a  c6a */
 205     t2   = gmx_mm256_set_m128d(_mm_loadu_pd(p4), _mm_loadu_pd(p2)); /* c12d  c6d | c12b  c6b */
 206
 207     *c6  = _mm256_unpacklo_pd(t1, t2);                              /* c6d c6c | c6b c6a */
 208     *c12 = _mm256_unpackhi_pd(t1, t2);                              /* c12d c12c | c12b c12a */
 209 }
 210
 211
 212 static gmx_inline void
 213 gmx_mm256_load_shift_and_1rvec_broadcast_pd(const double * gmx_restrict xyz_shift,
 214                                             const double * gmx_restrict xyz,
 215                                             __m256d * gmx_restrict      x1,
 216                                             __m256d * gmx_restrict      y1,
 217                                             __m256d * gmx_restrict      z1)
 218 {
 219     __m128d mem_xy, mem_z, mem_sxy, mem_sz, tx, ty, tz;
 220
 221     mem_xy  = _mm_loadu_pd(xyz);
 222     mem_z   = _mm_load_sd(xyz+2);
 223     mem_sxy = _mm_loadu_pd(xyz_shift);
 224     mem_sz  = _mm_load_sd(xyz_shift+2);
 225
 226     mem_xy  = _mm_add_pd(mem_xy, mem_sxy);
 227     mem_z   = _mm_add_pd(mem_z, mem_sz);
 228
 229     tx  = _mm_shuffle_pd(mem_xy, mem_xy, _MM_SHUFFLE2(0, 0));
 230     ty  = _mm_shuffle_pd(mem_xy, mem_xy, _MM_SHUFFLE2(1, 1));
 231     tz  = _mm_shuffle_pd(mem_z, mem_z, _MM_SHUFFLE2(0, 0));
 232
 233     *x1 = gmx_mm256_set_m128d(tx, tx);
 234     *y1 = gmx_mm256_set_m128d(ty, ty);
 235     *z1 = gmx_mm256_set_m128d(tz, tz);
 236 }
 237
 238
 239 static gmx_inline void
 240 gmx_mm256_load_shift_and_3rvec_broadcast_pd(const double * gmx_restrict xyz_shift,
 241                                             const double * gmx_restrict xyz,
 242                                             __m256d * gmx_restrict x1, __m256d * gmx_restrict y1, __m256d * gmx_restrict z1,
 243                                             __m256d * gmx_restrict x2, __m256d * gmx_restrict y2, __m256d * gmx_restrict z2,
 244                                             __m256d * gmx_restrict x3, __m256d * gmx_restrict y3, __m256d * gmx_restrict z3)
 245 {
 246     __m128d t1, t2, t3, t4, t5, sxy, sz, szx, syz, tx, ty, tz;
 247
 248     t1  = _mm_loadu_pd(xyz);
 249     t2  = _mm_loadu_pd(xyz+2);
 250     t3  = _mm_loadu_pd(xyz+4);
 251     t4  = _mm_loadu_pd(xyz+6);
 252     t5  = _mm_load_sd(xyz+8);
 253
 254     sxy = _mm_loadu_pd(xyz_shift);
 255     sz  = _mm_load_sd(xyz_shift+2);
 256     szx = _mm_shuffle_pd(sz, sxy, _MM_SHUFFLE2(0, 0));
 257     syz = _mm_shuffle_pd(sxy, sz, _MM_SHUFFLE2(0, 1));
 258
 259     t1  = _mm_add_pd(t1, sxy);
 260     t2  = _mm_add_pd(t2, szx);
 261     t3  = _mm_add_pd(t3, syz);
 262     t4  = _mm_add_pd(t4, sxy);
 263     t5  = _mm_add_sd(t5, sz);
 264
 265     tx   = _mm_shuffle_pd(t1, t1, _MM_SHUFFLE2(0, 0));
 266     ty   = _mm_shuffle_pd(t1, t1, _MM_SHUFFLE2(1, 1));
 267     tz   = _mm_shuffle_pd(t2, t2, _MM_SHUFFLE2(0, 0));
 268     *x1  = gmx_mm256_set_m128d(tx, tx);
 269     *y1  = gmx_mm256_set_m128d(ty, ty);
 270     *z1  = gmx_mm256_set_m128d(tz, tz);
 271     tx   = _mm_shuffle_pd(t2, t2, _MM_SHUFFLE2(1, 1));
 272     ty   = _mm_shuffle_pd(t3, t3, _MM_SHUFFLE2(0, 0));
 273     tz   = _mm_shuffle_pd(t3, t3, _MM_SHUFFLE2(1, 1));
 274     *x2  = gmx_mm256_set_m128d(tx, tx);
 275     *y2  = gmx_mm256_set_m128d(ty, ty);
 276     *z2  = gmx_mm256_set_m128d(tz, tz);
 277     tx   = _mm_shuffle_pd(t4, t4, _MM_SHUFFLE2(0, 0));
 278     ty   = _mm_shuffle_pd(t4, t4, _MM_SHUFFLE2(1, 1));
 279     tz   = _mm_shuffle_pd(t5, t5, _MM_SHUFFLE2(0, 0));
 280     *x3  = gmx_mm256_set_m128d(tx, tx);
 281     *y3  = gmx_mm256_set_m128d(ty, ty);
 282     *z3  = gmx_mm256_set_m128d(tz, tz);
 283 }
 284
 285
 286 static gmx_inline void
 287 gmx_mm256_load_shift_and_4rvec_broadcast_pd(const double * gmx_restrict xyz_shift,
 288                                             const double * gmx_restrict xyz,
 289                                             __m256d * gmx_restrict x1, __m256d * gmx_restrict y1, __m256d * gmx_restrict z1,
 290                                             __m256d * gmx_restrict x2, __m256d * gmx_restrict y2, __m256d * gmx_restrict z2,
 291                                             __m256d * gmx_restrict x3, __m256d * gmx_restrict y3, __m256d * gmx_restrict z3,
 292                                             __m256d * gmx_restrict x4, __m256d * gmx_restrict y4, __m256d * gmx_restrict z4)
 293 {
 294     __m128d t1, t2, t3, t4, t5, t6, sxy, sz, szx, syz, tx, ty, tz;
 295
 296     t1  = _mm_loadu_pd(xyz);
 297     t2  = _mm_loadu_pd(xyz+2);
 298     t3  = _mm_loadu_pd(xyz+4);
 299     t4  = _mm_loadu_pd(xyz+6);
 300     t5  = _mm_loadu_pd(xyz+8);
 301     t6  = _mm_loadu_pd(xyz+10);
 302
 303     sxy = _mm_loadu_pd(xyz_shift);
 304     sz  = _mm_load_sd(xyz_shift+2);
 305     szx = _mm_shuffle_pd(sz, sxy, _MM_SHUFFLE2(0, 0));
 306     syz = _mm_shuffle_pd(sxy, sz, _MM_SHUFFLE2(0, 1));
 307
 308     t1  = _mm_add_pd(t1, sxy);
 309     t2  = _mm_add_pd(t2, szx);
 310     t3  = _mm_add_pd(t3, syz);
 311     t4  = _mm_add_pd(t4, sxy);
 312     t5  = _mm_add_pd(t5, szx);
 313     t6  = _mm_add_pd(t6, syz);
 314
 315     tx   = _mm_shuffle_pd(t1, t1, _MM_SHUFFLE2(0, 0));
 316     ty   = _mm_shuffle_pd(t1, t1, _MM_SHUFFLE2(1, 1));
 317     tz   = _mm_shuffle_pd(t2, t2, _MM_SHUFFLE2(0, 0));
 318     *x1  = gmx_mm256_set_m128d(tx, tx);
 319     *y1  = gmx_mm256_set_m128d(ty, ty);
 320     *z1  = gmx_mm256_set_m128d(tz, tz);
 321     tx   = _mm_shuffle_pd(t2, t2, _MM_SHUFFLE2(1, 1));
 322     ty   = _mm_shuffle_pd(t3, t3, _MM_SHUFFLE2(0, 0));
 323     tz   = _mm_shuffle_pd(t3, t3, _MM_SHUFFLE2(1, 1));
 324     *x2  = gmx_mm256_set_m128d(tx, tx);
 325     *y2  = gmx_mm256_set_m128d(ty, ty);
 326     *z2  = gmx_mm256_set_m128d(tz, tz);
 327     tx   = _mm_shuffle_pd(t4, t4, _MM_SHUFFLE2(0, 0));
 328     ty   = _mm_shuffle_pd(t4, t4, _MM_SHUFFLE2(1, 1));
 329     tz   = _mm_shuffle_pd(t5, t5, _MM_SHUFFLE2(0, 0));
 330     *x3  = gmx_mm256_set_m128d(tx, tx);
 331     *y3  = gmx_mm256_set_m128d(ty, ty);
 332     *z3  = gmx_mm256_set_m128d(tz, tz);
 333     tx   = _mm_shuffle_pd(t5, t5, _MM_SHUFFLE2(1, 1));
 334     ty   = _mm_shuffle_pd(t6, t6, _MM_SHUFFLE2(0, 0));
 335     tz   = _mm_shuffle_pd(t6, t6, _MM_SHUFFLE2(1, 1));
 336     *x4  = gmx_mm256_set_m128d(tx, tx);
 337     *y4  = gmx_mm256_set_m128d(ty, ty);
 338     *z4  = gmx_mm256_set_m128d(tz, tz);
 339 }
 340
 341
 342 static void
 343 gmx_mm256_load_1rvec_1ptr_swizzle_pd(const double * gmx_restrict p1,
 344                                      __m256d * gmx_restrict x, __m256d * gmx_restrict y, __m256d * gmx_restrict z)
 345 {
 346     __m256d t1;
 347
 348     t1            = _mm256_loadu_pd(p1);
 349     *x            = t1;
 350     *y            = _mm256_permute_pd(t1, _GMX_MM_PERMUTE256D(0, 1, 0, 1));
 351     *z            = _mm256_castpd128_pd256(_mm256_extractf128_pd(t1, 0x1));
 352 }
 353
 354
 355 static void
 356 gmx_mm256_load_3rvec_1ptr_swizzle_pd(const double * gmx_restrict p1,
 357                                      __m256d * gmx_restrict x1, __m256d * gmx_restrict y1, __m256d * gmx_restrict z1,
 358                                      __m256d * gmx_restrict x2, __m256d * gmx_restrict y2, __m256d * gmx_restrict z2,
 359                                      __m256d * gmx_restrict x3, __m256d * gmx_restrict y3, __m256d * gmx_restrict z3)
 360 {
 361     __m256d t1, t2, t3, t4;
 362
 363     t1            = _mm256_loadu_pd(p1);
 364     t3            = _mm256_loadu_pd(p1+4);
 365     *x1           = t1;
 366     *y2           = t3;
 367     t2            = gmx_mm256_unpack128hi_pd(t1, t1);
 368     t4            = gmx_mm256_unpack128hi_pd(t3, t3);
 369     *z1           = t2;
 370     *x3           = t4;
 371     *y1           = _mm256_permute_pd(t1, _GMX_MM_PERMUTE256D(0, 1, 0, 1));
 372     *z2           = _mm256_permute_pd(t3, _GMX_MM_PERMUTE256D(0, 1, 0, 1));
 373     *x2           = _mm256_permute_pd(t2, _GMX_MM_PERMUTE256D(0, 1, 0, 1));
 374     *y3           = _mm256_permute_pd(t4, _GMX_MM_PERMUTE256D(0, 1, 0, 1));
 375     *z3           = _mm256_castpd128_pd256(_mm_load_sd(p1+8));
 376 }
 377
 378 static void
 379 gmx_mm256_load_4rvec_1ptr_swizzle_pd(const double * gmx_restrict p1,
 380                                      __m256d * gmx_restrict x1, __m256d * gmx_restrict y1, __m256d * gmx_restrict z1,
 381                                      __m256d * gmx_restrict x2, __m256d * gmx_restrict y2, __m256d * gmx_restrict z2,
 382                                      __m256d * gmx_restrict x3, __m256d * gmx_restrict y3, __m256d * gmx_restrict z3,
 383                                      __m256d * gmx_restrict x4, __m256d * gmx_restrict y4, __m256d * gmx_restrict z4)
 384 {
 385     __m256d t1, t2, t3, t4, t5, t6;
 386
 387     t1            = _mm256_loadu_pd(p1);
 388     t2            = _mm256_loadu_pd(p1+4);
 389     t3            = _mm256_loadu_pd(p1+8);
 390
 391     t4            = _mm256_castpd128_pd256(_mm256_extractf128_pd(t1, 0x1));
 392     t5            = _mm256_castpd128_pd256(_mm256_extractf128_pd(t2, 0x1));
 393     t6            = _mm256_castpd128_pd256(_mm256_extractf128_pd(t3, 0x1));
 394
 395     *x1           = t1;
 396     *y2           = t2;
 397     *z3           = t3;
 398     *z1           = t4;
 399     *x3           = t5;
 400     *y4           = t6;
 401
 402     *y1           = _mm256_permute_pd(t1, _GMX_MM_PERMUTE256D(0, 1, 0, 1));
 403     *z2           = _mm256_permute_pd(t2, _GMX_MM_PERMUTE256D(0, 1, 0, 1));
 404     *x4           = _mm256_permute_pd(t3, _GMX_MM_PERMUTE256D(0, 1, 0, 1));
 405     *x2           = _mm256_permute_pd(t4, _GMX_MM_PERMUTE256D(0, 1, 0, 1));
 406     *y3           = _mm256_permute_pd(t5, _GMX_MM_PERMUTE256D(0, 1, 0, 1));
 407     *z4           = _mm256_permute_pd(t6, _GMX_MM_PERMUTE256D(0, 1, 0, 1));
 408 }
 409
 410
 411 static void
 412 gmx_mm256_load_1rvec_4ptr_swizzle_pd(const double * gmx_restrict ptrA, const double * gmx_restrict ptrB,
 413                                      const double * gmx_restrict ptrC, const double * gmx_restrict ptrD,
 414                                      __m256d * gmx_restrict x1, __m256d * gmx_restrict y1, __m256d * gmx_restrict z1)
 415 {
 416     __m256d t1, t2, t3, t4, t5, t6;
 417
 418     t1           = _mm256_loadu_pd(ptrA);        /*   -  z1a | y1a x1a */
 419     t2           = _mm256_loadu_pd(ptrB);        /*   -  z1b | y1b x1b */
 420     t3           = _mm256_loadu_pd(ptrC);        /*   -  z1c | y1c x1c */
 421     t4           = _mm256_loadu_pd(ptrD);        /*   -  z1d | y1d x1d */
 422
 423     t5           = _mm256_unpacklo_pd(t1, t2);   /*  z1b z1a | x1b x1a */
 424     t6           = _mm256_unpackhi_pd(t1, t2);   /*   -   -  | y1b y1a */
 425     t1           = _mm256_unpacklo_pd(t3, t4);   /*  z1c z1c | x1d x1c */
 426     t2           = _mm256_unpackhi_pd(t3, t4);   /*   -   -  | y1d y1c */
 427
 428     *x1          = gmx_mm256_unpack128lo_pd(t5, t1);
 429     *y1          = gmx_mm256_unpack128lo_pd(t6, t2);
 430     *z1          = gmx_mm256_unpack128hi_pd(t5, t1);
 431 }
 432
 433
 434
 435 static void
 436 gmx_mm256_load_3rvec_4ptr_swizzle_pd(const double * gmx_restrict ptrA, const double * gmx_restrict ptrB,
 437                                      const double * gmx_restrict ptrC, const double * gmx_restrict ptrD,
 438                                      __m256d * gmx_restrict x1, __m256d * gmx_restrict y1, __m256d * gmx_restrict z1,
 439                                      __m256d * gmx_restrict x2, __m256d * gmx_restrict y2, __m256d * gmx_restrict z2,
 440                                      __m256d * gmx_restrict x3, __m256d * gmx_restrict y3, __m256d * gmx_restrict z3)
 441 {
 442     __m256d t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14;
 443
 444     t1           = _mm256_loadu_pd(ptrA);                       /*  x2a z1a | y1a x1a */
 445     t2           = _mm256_loadu_pd(ptrB);                       /*  x2b z1b | y1b x1b */
 446     t3           = _mm256_loadu_pd(ptrC);                       /*  x2c z1c | y1c x1c */
 447     t4           = _mm256_loadu_pd(ptrD);                       /*  x2d z1d | y1d x1d */
 448     t5           = _mm256_loadu_pd(ptrA+4);                     /*  y3a x3a | z2a y2a */
 449     t6           = _mm256_loadu_pd(ptrB+4);                     /*  y3b x3b | z2b y2b */
 450     t7           = _mm256_loadu_pd(ptrC+4);                     /*  y3c x3c | z2c y2c */
 451     t8           = _mm256_loadu_pd(ptrD+4);                     /*  y3d x3d | z2d y2d */
 452     t9           = _mm256_castpd128_pd256(_mm_load_sd(ptrA+8)); /*   -   -  |  -  z3a */
 453     t10          = _mm256_castpd128_pd256(_mm_load_sd(ptrB+8)); /*   -   -  |  -  z3b */
 454     t11          = _mm256_castpd128_pd256(_mm_load_sd(ptrC+8)); /*   -   -  |  -  z3c */
 455     t12          = _mm256_castpd128_pd256(_mm_load_sd(ptrD+8)); /*   -   -  |  -  z3d */
 456
 457     t13          = _mm256_unpacklo_pd(t1, t2);                  /*  z1b z1a | x1b x1a */
 458     t14          = _mm256_unpackhi_pd(t1, t2);                  /*  x2b x2a | y1b y1a */
 459     t1           = _mm256_unpacklo_pd(t3, t4);                  /*  z1d z1c | x1d x1c */
 460     t2           = _mm256_unpackhi_pd(t3, t4);                  /*  x2d x2c | y1d y1c */
 461
 462     t3           = _mm256_unpacklo_pd(t5, t6);                  /*  x3b x3a | y2b y2a */
 463     t4           = _mm256_unpackhi_pd(t5, t6);                  /*  y3b y3a | z2b z2a */
 464     t5           = _mm256_unpacklo_pd(t7, t8);                  /*  x3d x3c | y2d y2c */
 465     t6           = _mm256_unpackhi_pd(t7, t8);                  /*  y3d y3c | z2d z2c */
 466
 467     t9           = _mm256_unpacklo_pd(t9, t10);                 /*   -   -  | z3b z3a */
 468     t11          = _mm256_unpacklo_pd(t11, t12);                /*   -   -  | z3d z3c */
 469
 470     *x1          = gmx_mm256_unpack128lo_pd(t13, t1);
 471     *y1          = gmx_mm256_unpack128lo_pd(t14, t2);
 472     *z1          = gmx_mm256_unpack128hi_pd(t13, t1);
 473     *x2          = gmx_mm256_unpack128hi_pd(t14, t2);
 474     *y2          = gmx_mm256_unpack128lo_pd(t3, t5);
 475     *z2          = gmx_mm256_unpack128lo_pd(t4, t6);
 476     *x3          = gmx_mm256_unpack128hi_pd(t3, t5);
 477     *y3          = gmx_mm256_unpack128hi_pd(t4, t6);
 478     *z3          = gmx_mm256_unpack128lo_pd(t9, t11);
 479 }
 480
 481
 482
 483 static void
 484 gmx_mm256_load_4rvec_4ptr_swizzle_pd(const double * gmx_restrict ptrA, const double * gmx_restrict ptrB,
 485                                      const double * gmx_restrict ptrC, const double * gmx_restrict ptrD,
 486                                      __m256d * gmx_restrict x1, __m256d * gmx_restrict y1, __m256d * gmx_restrict z1,
 487                                      __m256d * gmx_restrict x2, __m256d * gmx_restrict y2, __m256d * gmx_restrict z2,
 488                                      __m256d * gmx_restrict x3, __m256d * gmx_restrict y3, __m256d * gmx_restrict z3,
 489                                      __m256d * gmx_restrict x4, __m256d * gmx_restrict y4, __m256d * gmx_restrict z4)
 490 {
 491     __m256d t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14;
 492
 493     t1           = _mm256_loadu_pd(ptrA);        /*  x2a z1a | y1a x1a */
 494     t2           = _mm256_loadu_pd(ptrB);        /*  x2b z1b | y1b x1b */
 495     t3           = _mm256_loadu_pd(ptrC);        /*  x2c z1c | y1c x1c */
 496     t4           = _mm256_loadu_pd(ptrD);        /*  x2d z1d | y1d x1d */
 497     t5           = _mm256_loadu_pd(ptrA+4);      /*  y3a x3a | z2a y2a */
 498     t6           = _mm256_loadu_pd(ptrB+4);      /*  y3b x3b | z2b y2b */
 499     t7           = _mm256_loadu_pd(ptrC+4);      /*  y3c x3c | z2c y2c */
 500     t8           = _mm256_loadu_pd(ptrD+4);      /*  y3d x3d | z2d y2d */
 501     t9           = _mm256_loadu_pd(ptrA+8);      /*  z4a y4a | x4a z3a */
 502     t10          = _mm256_loadu_pd(ptrB+8);      /*  z4b y4b | x4b z3b */
 503     t11          = _mm256_loadu_pd(ptrC+8);      /*  z4c y4c | x4c z3c */
 504     t12          = _mm256_loadu_pd(ptrD+8);      /*  z4d y4d | x4d z3d */
 505
 506     t13          = _mm256_unpacklo_pd(t1, t2);   /*  z1b z1a | x1b x1a */
 507     t14          = _mm256_unpackhi_pd(t1, t2);   /*  x2b x2a | y1b y1a */
 508     t1           = _mm256_unpacklo_pd(t3, t4);   /*  z1d z1c | x1d x1c */
 509     t2           = _mm256_unpackhi_pd(t3, t4);   /*  x2d x2c | y1d y1c */
 510
 511     t3           = _mm256_unpacklo_pd(t5, t6);   /*  x3b x3a | y2b y2a */
 512     t4           = _mm256_unpackhi_pd(t5, t6);   /*  y3b y3a | z2b z2a */
 513     t5           = _mm256_unpacklo_pd(t7, t8);   /*  x3d x3c | y2d y2c */
 514     t6           = _mm256_unpackhi_pd(t7, t8);   /*  y3d y3c | z2d z2c */
 515
 516     t7           = _mm256_unpacklo_pd(t9, t10);  /*  y4b y4a | z3b z3a */
 517     t8           = _mm256_unpackhi_pd(t9, t10);  /*  z4b z4a | x4b x4a */
 518     t9           = _mm256_unpacklo_pd(t11, t12); /*  y4d y4c | z3d z3c */
 519     t10          = _mm256_unpackhi_pd(t11, t12); /*  z4d z4c | x4d x4c */
 520
 521     *x1          = gmx_mm256_unpack128lo_pd(t13, t1);
 522     *y1          = gmx_mm256_unpack128lo_pd(t14, t2);
 523     *z1          = gmx_mm256_unpack128hi_pd(t13, t1);
 524     *x2          = gmx_mm256_unpack128hi_pd(t14, t2);
 525     *y2          = gmx_mm256_unpack128lo_pd(t3, t5);
 526     *z2          = gmx_mm256_unpack128lo_pd(t4, t6);
 527     *x3          = gmx_mm256_unpack128hi_pd(t3, t5);
 528     *y3          = gmx_mm256_unpack128hi_pd(t4, t6);
 529     *z3          = gmx_mm256_unpack128lo_pd(t7, t9);
 530     *x4          = gmx_mm256_unpack128lo_pd(t8, t10);
 531     *y4          = gmx_mm256_unpack128hi_pd(t7, t9);
 532     *z4          = gmx_mm256_unpack128hi_pd(t8, t10);
 533 }
 534
 535
 536
 537 static void
 538 gmx_mm256_decrement_1rvec_4ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
 539                                           double * gmx_restrict ptrC, double * gmx_restrict ptrD,
 540                                           __m256d x1, __m256d y1, __m256d z1)
 541 {
 542     __m256d t1, t2, tA, tB, tC, tD;
 543     __m256i mask;
 544
 545     t1          = _mm256_unpacklo_pd(x1, y1);       /*  y1c x1c | y1a x1a */
 546     t2          = _mm256_unpackhi_pd(x1, y1);       /*  y1d x1d | y1b x1b */
 547     x1          = gmx_mm256_unpack128lo_pd(t1, z1); /*  -  z1a | y1a x1a */
 548     y1          = gmx_mm256_unpack128hi_pd(t1, z1); /*  -  z1c | y1c x1c */
 549     z1          = _mm256_permute_pd(z1, _GMX_MM_PERMUTE256D(0, 1, 0, 1));
 550     t1          = gmx_mm256_unpack128lo_pd(t2, z1); /*  -  z1b | y1b x1b */
 551     z1          = gmx_mm256_unpack128hi_pd(t2, z1); /*  -  z1d | y1d x1d */
 552
 553     /* Construct a mask without executing any data loads */
 554     mask        = _mm256_castpd_si256(_mm256_blend_pd(_mm256_setzero_pd(),
 555                                                       _mm256_cmp_pd(_mm256_setzero_pd(), _mm256_setzero_pd(), _CMP_EQ_OQ), 0x7));
 556
 557     tA          = _mm256_loadu_pd(ptrA);
 558     tB          = _mm256_loadu_pd(ptrB);
 559     tC          = _mm256_loadu_pd(ptrC);
 560     tD          = _mm256_loadu_pd(ptrD);
 561
 562     tA          = _mm256_sub_pd(tA, x1);
 563     tB          = _mm256_sub_pd(tB, t1);
 564     tC          = _mm256_sub_pd(tC, y1);
 565     tD          = _mm256_sub_pd(tD, z1);
 566
 567     _mm256_maskstore_pd(ptrA, mask, tA);
 568     _mm256_maskstore_pd(ptrB, mask, tB);
 569     _mm256_maskstore_pd(ptrC, mask, tC);
 570     _mm256_maskstore_pd(ptrD, mask, tD);
 571 }
 572
 573
 574
 575 #if defined (_MSC_VER) && defined(_M_IX86)
 576 /* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
 577 #define gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(ptrA, ptrB, ptrC, ptrD, \
 578                                                   _x1, _y1, _z1, _x2, _y2, _z2, _x3, _y3, _z3) \
 579     { \
 580         __m256d _t1, _t2, _t3, _t4, _t5, _t6, _t7, _t8, _t9, _t10; \
 581         __m128d _tA, _tB, _tC, _tD, _tE; \
 582         _t1          = _mm256_loadu_pd(ptrA); \
 583         _t2          = _mm256_loadu_pd(ptrB); \
 584         _t3          = _mm256_loadu_pd(ptrC); \
 585         _t4          = _mm256_loadu_pd(ptrD); \
 586         _t5          = _mm256_loadu_pd(ptrA+4); \
 587         _t6          = _mm256_loadu_pd(ptrB+4); \
 588         _t7          = _mm256_loadu_pd(ptrC+4); \
 589         _t8          = _mm256_loadu_pd(ptrD+4); \
 590         _tA          = _mm_load_sd(ptrA+8); \
 591         _tB          = _mm_load_sd(ptrB+8); \
 592         _tC          = _mm_load_sd(ptrC+8); \
 593         _tD          = _mm_load_sd(ptrD+8); \
 594         _t9          = _mm256_unpacklo_pd(_x1, _y1); \
 595         _x1          = _mm256_unpackhi_pd(_x1, _y1); \
 596         _y1          = _mm256_unpacklo_pd(_z1, _x2); \
 597         _z1          = _mm256_unpackhi_pd(_z1, _x2); \
 598         _x2          = _mm256_unpacklo_pd(_y2, _z2); \
 599         _y2          = _mm256_unpackhi_pd(_y2, _z2); \
 600         _z2          = _mm256_unpacklo_pd(_x3, _y3); \
 601         _x3          = _mm256_unpackhi_pd(_x3, _y3); \
 602         _t10         = gmx_mm256_unpack128lo_pd(_t9, _y1); \
 603         _y3          = gmx_mm256_unpack128hi_pd(_t9, _y1); \
 604         _t9          = gmx_mm256_unpack128lo_pd(_x1, _z1); \
 605         _y1          = gmx_mm256_unpack128hi_pd(_x1, _z1); \
 606         _x1          = gmx_mm256_unpack128lo_pd(_x2, _z2); \
 607         _z1          = gmx_mm256_unpack128hi_pd(_x2, _z2); \
 608         _x2          = gmx_mm256_unpack128lo_pd(_y2, _x3); \
 609         _z2          = gmx_mm256_unpack128hi_pd(_y2, _x3); \
 610         _t1          = _mm256_sub_pd(_t1, _t10); \
 611         _t2          = _mm256_sub_pd(_t2, _t9); \
 612         _t3          = _mm256_sub_pd(_t3, _y3); \
 613         _t4          = _mm256_sub_pd(_t4, _y1); \
 614         _t5          = _mm256_sub_pd(_t5, _x1); \
 615         _t6          = _mm256_sub_pd(_t6, _x2); \
 616         _t7          = _mm256_sub_pd(_t7, _z1); \
 617         _t8          = _mm256_sub_pd(_t8, _z2); \
 618         _tA          = _mm_sub_sd(_tA, _mm256_castpd256_pd128(_z3)); \
 619         _tB          = _mm_sub_sd(_tB, _mm_permute_pd(_mm256_castpd256_pd128(_z3), _GMX_MM_PERMUTE128D(1, 1))); \
 620         _tE          = _mm256_extractf128_pd(_z3, 0x1); \
 621         _tC          = _mm_sub_sd(_tC, _tE); \
 622         _tD          = _mm_sub_sd(_tD, _mm_permute_pd(_tE, _GMX_MM_PERMUTE128D(1, 1))); \
 623         _mm256_storeu_pd(ptrA, _t1); \
 624         _mm256_storeu_pd(ptrB, _t2); \
 625         _mm256_storeu_pd(ptrC, _t3); \
 626         _mm256_storeu_pd(ptrD, _t4); \
 627         _mm256_storeu_pd(ptrA+4, _t5); \
 628         _mm256_storeu_pd(ptrB+4, _t6); \
 629         _mm256_storeu_pd(ptrC+4, _t7); \
 630         _mm256_storeu_pd(ptrD+4, _t8); \
 631         _mm_store_sd(ptrA+8, _tA); \
 632         _mm_store_sd(ptrB+8, _tB); \
 633         _mm_store_sd(ptrC+8, _tC); \
 634         _mm_store_sd(ptrD+8, _tD); \
 635     }
 636 #else
 637 /* Real function for sane compilers */
 638 static void
 639 gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
 640                                           double * gmx_restrict ptrC, double * gmx_restrict ptrD,
 641                                           __m256d x1, __m256d y1, __m256d z1,
 642                                           __m256d x2, __m256d y2, __m256d z2,
 643                                           __m256d x3, __m256d y3, __m256d z3)
 644 {
 645     __m256d t1, t2, t3, t4, t5, t6, t7, t8, t9, t10;
 646     __m128d tA, tB, tC, tD, tE;
 647
 648     t1          = _mm256_loadu_pd(ptrA);
 649     t2          = _mm256_loadu_pd(ptrB);
 650     t3          = _mm256_loadu_pd(ptrC);
 651     t4          = _mm256_loadu_pd(ptrD);
 652     t5          = _mm256_loadu_pd(ptrA+4);
 653     t6          = _mm256_loadu_pd(ptrB+4);
 654     t7          = _mm256_loadu_pd(ptrC+4);
 655     t8          = _mm256_loadu_pd(ptrD+4);
 656     tA          = _mm_load_sd(ptrA+8);
 657     tB          = _mm_load_sd(ptrB+8);
 658     tC          = _mm_load_sd(ptrC+8);
 659     tD          = _mm_load_sd(ptrD+8);
 660
 661     t9          = _mm256_unpacklo_pd(x1, y1);       /* y1c x1c | y1a x1a */
 662     x1          = _mm256_unpackhi_pd(x1, y1);       /* y1d x1d | y1b x1b */
 663
 664     y1          = _mm256_unpacklo_pd(z1, x2);       /* x2c z1c | x2a z1a */
 665     z1          = _mm256_unpackhi_pd(z1, x2);       /* x2d z1d | x2b z1b */
 666
 667     x2          = _mm256_unpacklo_pd(y2, z2);       /* z2c y2c | z2a y2a */
 668     y2          = _mm256_unpackhi_pd(y2, z2);       /* z2d y2d | z2b y2b */
 669
 670     z2          = _mm256_unpacklo_pd(x3, y3);       /* y3c x3c | y3a x3a */
 671     x3          = _mm256_unpackhi_pd(x3, y3);       /* y3d x3d | y3b x3b */
 672
 673     t10         = gmx_mm256_unpack128lo_pd(t9, y1); /* x2a z1a | y1a x1a */
 674     y3          = gmx_mm256_unpack128hi_pd(t9, y1); /* x2c z1c | y1c x1c */
 675
 676     t9          = gmx_mm256_unpack128lo_pd(x1, z1); /* x2b z1b | y1b x1b */
 677     y1          = gmx_mm256_unpack128hi_pd(x1, z1); /* x2d z1d | y1d x1d */
 678
 679     x1          = gmx_mm256_unpack128lo_pd(x2, z2); /* y3a x3a | z2a y2a */
 680     z1          = gmx_mm256_unpack128hi_pd(x2, z2); /* y3c x3c | z2c y2c */
 681
 682     x2          = gmx_mm256_unpack128lo_pd(y2, x3); /* y3b x3b | z2b y2b */
 683     z2          = gmx_mm256_unpack128hi_pd(y2, x3); /* y3d x3d | z2d y2d */
 684
 685     t1          = _mm256_sub_pd(t1, t10);
 686     t2          = _mm256_sub_pd(t2, t9);
 687     t3          = _mm256_sub_pd(t3, y3);
 688     t4          = _mm256_sub_pd(t4, y1);
 689     t5          = _mm256_sub_pd(t5, x1);
 690     t6          = _mm256_sub_pd(t6, x2);
 691     t7          = _mm256_sub_pd(t7, z1);
 692     t8          = _mm256_sub_pd(t8, z2);
 693
 694     tA          = _mm_sub_sd(tA, _mm256_castpd256_pd128(z3));
 695     tB          = _mm_sub_sd(tB, _mm_permute_pd(_mm256_castpd256_pd128(z3), _GMX_MM_PERMUTE128D(1, 1)));
 696     tE          = _mm256_extractf128_pd(z3, 0x1);
 697     tC          = _mm_sub_sd(tC, tE);
 698     tD          = _mm_sub_sd(tD, _mm_permute_pd(tE, _GMX_MM_PERMUTE128D(1, 1)));
 699
 700     /* Here we store a full 256-bit value and a separate 64-bit one; no overlap can happen */
 701     _mm256_storeu_pd(ptrA, t1);
 702     _mm256_storeu_pd(ptrB, t2);
 703     _mm256_storeu_pd(ptrC, t3);
 704     _mm256_storeu_pd(ptrD, t4);
 705     _mm256_storeu_pd(ptrA+4, t5);
 706     _mm256_storeu_pd(ptrB+4, t6);
 707     _mm256_storeu_pd(ptrC+4, t7);
 708     _mm256_storeu_pd(ptrD+4, t8);
 709     _mm_store_sd(ptrA+8, tA);
 710     _mm_store_sd(ptrB+8, tB);
 711     _mm_store_sd(ptrC+8, tC);
 712     _mm_store_sd(ptrD+8, tD);
 713 }
 714 #endif
 715
 716 #if defined (_MSC_VER) && defined(_M_IX86)
 717 /* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
 718 #define gmx_mm256_decrement_4rvec_4ptr_swizzle_pd(ptrA, ptrB, ptrC, ptrD, \
 719                                                   _x1, _y1, _z1, _x2, _y2, _z2, _x3, _y3, _z3, _x4, _y4, _z4) \
 720     { \
 721         __m256d _t1, _t2, _t3, _t4, _t5, _t6, _t7, _t8, _t9, _t10, _t11, _t12, _t13, _t14; \
 722         __m128d _tA, _tB, _tC, _tD, _tE; \
 723         _t1          = _mm256_loadu_pd(ptrA); \
 724         _t2          = _mm256_loadu_pd(ptrB); \
 725         _t3          = _mm256_loadu_pd(ptrC); \
 726         _t4          = _mm256_loadu_pd(ptrD); \
 727         _t5          = _mm256_loadu_pd(ptrA+4); \
 728         _t6          = _mm256_loadu_pd(ptrB+4); \
 729         _t7          = _mm256_loadu_pd(ptrC+4); \
 730         _t8          = _mm256_loadu_pd(ptrD+4); \
 731         _t9          = _mm256_loadu_pd(ptrA+8); \
 732         _t10         = _mm256_loadu_pd(ptrB+8); \
 733         _t11         = _mm256_loadu_pd(ptrC+8); \
 734         _t12         = _mm256_loadu_pd(ptrD+8); \
 735         _t13         = _mm256_unpacklo_pd(_x1, _y1); \
 736         _x1          = _mm256_unpackhi_pd(_x1, _y1); \
 737         _y1          = _mm256_unpacklo_pd(_z1, _x2); \
 738         _z1          = _mm256_unpackhi_pd(_z1, _x2); \
 739         _x2          = _mm256_unpacklo_pd(_y2, _z2); \
 740         _y2          = _mm256_unpackhi_pd(_y2, _z2); \
 741         _z2          = _mm256_unpacklo_pd(_x3, _y3); \
 742         _x3          = _mm256_unpackhi_pd(_x3, _y3); \
 743         _y3          = _mm256_unpacklo_pd(_z3, _x4); \
 744         _z3          = _mm256_unpackhi_pd(_z3, _x4); \
 745         _x4          = _mm256_unpacklo_pd(_y4, _z4); \
 746         _y4          = _mm256_unpackhi_pd(_y4, _z4); \
 747         _z4          = gmx_mm256_unpack128lo_pd(_t13, _y1); \
 748         _t13         = gmx_mm256_unpack128hi_pd(_t13, _y1); \
 749         _y1          = gmx_mm256_unpack128lo_pd(_x1, _z1); \
 750         _x1          = gmx_mm256_unpack128hi_pd(_x1, _z1); \
 751         _z1          = gmx_mm256_unpack128lo_pd(_x2, _z2); \
 752         _x2          = gmx_mm256_unpack128hi_pd(_x2, _z2); \
 753         _z2          = gmx_mm256_unpack128lo_pd(_y2, _x3); \
 754         _y2          = gmx_mm256_unpack128hi_pd(_y2, _x3); \
 755         _x3          = gmx_mm256_unpack128lo_pd(_y3, _x4); \
 756         _y3          = gmx_mm256_unpack128hi_pd(_y3, _x4); \
 757         _x4          = gmx_mm256_unpack128lo_pd(_z3, _y4); \
 758         _z3          = gmx_mm256_unpack128hi_pd(_z3, _y4); \
 759         _t1          = _mm256_sub_pd(_t1, _z4); \
 760         _t2          = _mm256_sub_pd(_t2, _y1); \
 761         _t3          = _mm256_sub_pd(_t3, _t13); \
 762         _t4          = _mm256_sub_pd(_t4, _x1); \
 763         _t5          = _mm256_sub_pd(_t5, _z1); \
 764         _t6          = _mm256_sub_pd(_t6, _z2); \
 765         _t7          = _mm256_sub_pd(_t7, _x2); \
 766         _t8          = _mm256_sub_pd(_t8, _y2); \
 767         _t9          = _mm256_sub_pd(_t9, _x3); \
 768         _t10         = _mm256_sub_pd(_t10, _x4); \
 769         _t11         = _mm256_sub_pd(_t11, _y3); \
 770         _t12         = _mm256_sub_pd(_t12, _z3); \
 771         _mm256_storeu_pd(ptrA, _t1); \
 772         _mm256_storeu_pd(ptrB, _t2); \
 773         _mm256_storeu_pd(ptrC, _t3); \
 774         _mm256_storeu_pd(ptrD, _t4); \
 775         _mm256_storeu_pd(ptrA+4, _t5); \
 776         _mm256_storeu_pd(ptrB+4, _t6); \
 777         _mm256_storeu_pd(ptrC+4, _t7); \
 778         _mm256_storeu_pd(ptrD+4, _t8); \
 779         _mm256_storeu_pd(ptrA+8, _t9); \
 780         _mm256_storeu_pd(ptrB+8, _t10); \
 781         _mm256_storeu_pd(ptrC+8, _t11); \
 782         _mm256_storeu_pd(ptrD+8, _t12); \
 783     }
 784 #else
 785 /* Real function for sane compilers */
 786 static void
 787 gmx_mm256_decrement_4rvec_4ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
 788                                           double * gmx_restrict ptrC, double * gmx_restrict ptrD,
 789                                           __m256d x1, __m256d y1, __m256d z1,
 790                                           __m256d x2, __m256d y2, __m256d z2,
 791                                           __m256d x3, __m256d y3, __m256d z3,
 792                                           __m256d x4, __m256d y4, __m256d z4)
 793 {
 794     __m256d t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14;
 795     __m128d tA, tB, tC, tD, tE;
 796
 797     t1          = _mm256_loadu_pd(ptrA);
 798     t2          = _mm256_loadu_pd(ptrB);
 799     t3          = _mm256_loadu_pd(ptrC);
 800     t4          = _mm256_loadu_pd(ptrD);
 801     t5          = _mm256_loadu_pd(ptrA+4);
 802     t6          = _mm256_loadu_pd(ptrB+4);
 803     t7          = _mm256_loadu_pd(ptrC+4);
 804     t8          = _mm256_loadu_pd(ptrD+4);
 805     t9          = _mm256_loadu_pd(ptrA+8);
 806     t10         = _mm256_loadu_pd(ptrB+8);
 807     t11         = _mm256_loadu_pd(ptrC+8);
 808     t12         = _mm256_loadu_pd(ptrD+8);
 809
 810     t13         = _mm256_unpacklo_pd(x1, y1);        /* y1c x1c | y1a x1a */
 811     x1          = _mm256_unpackhi_pd(x1, y1);        /* y1d x1d | y1b x1b */
 812     y1          = _mm256_unpacklo_pd(z1, x2);        /* x2c z1c | x2a z1a */
 813     z1          = _mm256_unpackhi_pd(z1, x2);        /* x2d z1d | x2b z1b */
 814     x2          = _mm256_unpacklo_pd(y2, z2);        /* z2c y2c | z2a y2a */
 815     y2          = _mm256_unpackhi_pd(y2, z2);        /* z2d y2d | z2b y2b */
 816     z2          = _mm256_unpacklo_pd(x3, y3);        /* y3c x3c | y3a x3a */
 817     x3          = _mm256_unpackhi_pd(x3, y3);        /* y3d x3d | y3b x3b */
 818     y3          = _mm256_unpacklo_pd(z3, x4);        /* x4c z3c | x4a z3a */
 819     z3          = _mm256_unpackhi_pd(z3, x4);        /* x4d z3d | x4b z3b */
 820     x4          = _mm256_unpacklo_pd(y4, z4);        /* z4c y4c | z4a y4a */
 821     y4          = _mm256_unpackhi_pd(y4, z4);        /* z4d y4d | z4b y4b */
 822
 823     z4          = gmx_mm256_unpack128lo_pd(t13, y1); /* x2a z1a | y1a x1a */
 824     t13         = gmx_mm256_unpack128hi_pd(t13, y1); /* x2c z1c | y1c x1c */
 825     y1          = gmx_mm256_unpack128lo_pd(x1, z1);  /* x2b z1b | y1b x1b */
 826     x1          = gmx_mm256_unpack128hi_pd(x1, z1);  /* x2d z1d | y1d x1d */
 827     z1          = gmx_mm256_unpack128lo_pd(x2, z2);  /* y3a x3a | z2a y2a */
 828     x2          = gmx_mm256_unpack128hi_pd(x2, z2);  /* y3c x3c | z2c y2c */
 829     z2          = gmx_mm256_unpack128lo_pd(y2, x3);  /* y3b x3b | z2b y2b */
 830     y2          = gmx_mm256_unpack128hi_pd(y2, x3);  /* y3d x3d | z2d y2d */
 831     x3          = gmx_mm256_unpack128lo_pd(y3, x4);  /* z4a y4a | x4a z3a */
 832     y3          = gmx_mm256_unpack128hi_pd(y3, x4);  /* z4c y4c | x4c z3c */
 833     x4          = gmx_mm256_unpack128lo_pd(z3, y4);  /* z4b y4b | x4b z3b */
 834     z3          = gmx_mm256_unpack128hi_pd(z3, y4);  /* z4d y4d | x4d z3d */
 835
 836     t1          = _mm256_sub_pd(t1, z4);
 837     t2          = _mm256_sub_pd(t2, y1);
 838     t3          = _mm256_sub_pd(t3, t13);
 839     t4          = _mm256_sub_pd(t4, x1);
 840     t5          = _mm256_sub_pd(t5, z1);
 841     t6          = _mm256_sub_pd(t6, z2);
 842     t7          = _mm256_sub_pd(t7, x2);
 843     t8          = _mm256_sub_pd(t8, y2);
 844     t9          = _mm256_sub_pd(t9, x3);
 845     t10         = _mm256_sub_pd(t10, x4);
 846     t11         = _mm256_sub_pd(t11, y3);
 847     t12         = _mm256_sub_pd(t12, z3);
 848
 849     /* Here we store a full 256-bit value and a separate 128-bit one; no overlap can happen */
 850     _mm256_storeu_pd(ptrA, t1);
 851     _mm256_storeu_pd(ptrB, t2);
 852     _mm256_storeu_pd(ptrC, t3);
 853     _mm256_storeu_pd(ptrD, t4);
 854     _mm256_storeu_pd(ptrA+4, t5);
 855     _mm256_storeu_pd(ptrB+4, t6);
 856     _mm256_storeu_pd(ptrC+4, t7);
 857     _mm256_storeu_pd(ptrD+4, t8);
 858     _mm256_storeu_pd(ptrA+8, t9);
 859     _mm256_storeu_pd(ptrB+8, t10);
 860     _mm256_storeu_pd(ptrC+8, t11);
 861     _mm256_storeu_pd(ptrD+8, t12);
 862 }
 863 #endif
 864
 865
 866
 867
 868
 869 static gmx_inline void
 870 gmx_mm256_update_iforce_1atom_swizzle_pd(__m256d fix1, __m256d fiy1, __m256d fiz1,
 871                                          double * gmx_restrict fptr,
 872                                          double * gmx_restrict fshiftptr)
 873 {
 874     __m256d t1, t2;
 875     __m128d tA, tB;
 876     fix1 = _mm256_hadd_pd(fix1, fiy1);
 877     fiz1 = _mm256_hadd_pd(fiz1, _mm256_setzero_pd());
 878
 879     /* Add across the two lanes */
 880     tA   = _mm_add_pd(_mm256_castpd256_pd128(fix1), _mm256_extractf128_pd(fix1, 0x1));
 881     tB   = _mm_add_pd(_mm256_castpd256_pd128(fiz1), _mm256_extractf128_pd(fiz1, 0x1));
 882
 883     fix1 = gmx_mm256_set_m128d(tB, tA); /* 0 fiz fiy fix */
 884
 885     t1   = _mm256_loadu_pd(fptr);
 886     t2   = _mm256_loadu_pd(fshiftptr);
 887
 888     t1   = _mm256_add_pd(t1, fix1);
 889     t2   = _mm256_add_pd(t2, fix1);
 890
 891     _mm256_storeu_pd(fptr, t1);
 892     _mm256_storeu_pd(fshiftptr, t2);
 893 }
 894
 895
 896
 897 #if defined (_MSC_VER) && defined(_M_IX86)
 898 /* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
 899 #define gmx_mm256_update_iforce_3atom_swizzle_pd(fix1, fiy1, fiz1, fix2, fiy2, fiz2, fix3, fiy3, fiz3, \
 900                                                  fptr, fshiftptr) \
 901     { \
 902         __m256d _t1, _t2, _t3, _t4; \
 903         __m128d _tz3, _tA, _tB, _tC, _tD; \
 904         fix1  = _mm256_hadd_pd(fix1, fiy1); \
 905         fiz1  = _mm256_hadd_pd(fiz1, fix2); \
 906         fiy2  = _mm256_hadd_pd(fiy2, fiz2); \
 907         fix3  = _mm256_hadd_pd(fix3, fiy3); \
 908         fiz3  = _mm256_hadd_pd(fiz3, _mm256_setzero_pd()); \
 909         _t1   = gmx_mm256_unpack128lo_pd(fix1, fiz1); \
 910         _t2   = gmx_mm256_unpack128hi_pd(fix1, fiz1); \
 911         _t1   = _mm256_add_pd(_t1, _t2); \
 912         _t3   = gmx_mm256_unpack128lo_pd(fiy2, fix3); \
 913         _t4   = gmx_mm256_unpack128hi_pd(fiy2, fix3); \
 914         _t3   = _mm256_add_pd(_t3, _t4); \
 915         _tz3  = _mm_add_pd(_mm256_castpd256_pd128(fiz3), _mm256_extractf128_pd(fiz3, 0x1)); \
 916         _t2   = _mm256_loadu_pd(fptr); \
 917         _t4   = _mm256_loadu_pd(fptr+4); \
 918         _tA   = _mm_load_sd(fptr+8); \
 919         _t2   = _mm256_add_pd(_t2, _t1); \
 920         _t4   = _mm256_add_pd(_t4, _t3); \
 921         _tA   = _mm_add_sd(_tA, _tz3); \
 922         _mm256_storeu_pd(fptr, _t2); \
 923         _mm256_storeu_pd(fptr+4, _t4); \
 924         _mm_store_sd(fptr+8, _tA); \
 925         _tB   = _mm256_extractf128_pd(_t1, 0x1); \
 926         _tC   = _mm256_extractf128_pd(_t3, 0x1); \
 927         _tz3  = _mm_add_sd(_tz3, _tB); \
 928         _tD   = _mm_permute_pd(_mm256_castpd256_pd128(_t3), _GMX_MM_PERMUTE128D(1, 1)); \
 929         _tz3  = _mm_add_sd(_tz3, _tD); \
 930         _tC   = _mm_add_pd(_tC, _mm256_castpd256_pd128(_t1)); \
 931         _tD   = _mm_shuffle_pd(_tB, _mm256_castpd256_pd128(_t3), _MM_SHUFFLE2(0, 1)); \
 932         _tC   = _mm_add_pd(_tC, _tD); \
 933         _tA   = _mm_loadu_pd(fshiftptr); \
 934         _tB   = _mm_load_sd(fshiftptr+2); \
 935         _tA   = _mm_add_pd(_tA, _tC); \
 936         _tB   = _mm_add_sd(_tB, _tz3); \
 937         _mm_storeu_pd(fshiftptr, _tA); \
 938         _mm_store_sd(fshiftptr+2, _tB); \
 939     }
 940 #else
 941 /* Real function for sane compilers */
 942 static gmx_inline void
 943 gmx_mm256_update_iforce_3atom_swizzle_pd(__m256d fix1, __m256d fiy1, __m256d fiz1,
 944                                          __m256d fix2, __m256d fiy2, __m256d fiz2,
 945                                          __m256d fix3, __m256d fiy3, __m256d fiz3,
 946                                          double * gmx_restrict fptr,
 947                                          double * gmx_restrict fshiftptr)
 948 {
 949     __m256d t1, t2, t3, t4;
 950     __m128d tz3, tA, tB, tC, tD;
 951
 952     fix1 = _mm256_hadd_pd(fix1, fiy1);                /*  Y1c-d X1c-d | Y1a-b X1a-b */
 953     fiz1 = _mm256_hadd_pd(fiz1, fix2);                /*  X2c-d Z1c-d | X2a-b Z1a-b */
 954     fiy2 = _mm256_hadd_pd(fiy2, fiz2);                /*  Z2c-d Y2c-d | Z2a-b Y2a-b */
 955     fix3 = _mm256_hadd_pd(fix3, fiy3);                /*  Y3c-d X3c-d | Y3a-b X3a-b */
 956     fiz3 = _mm256_hadd_pd(fiz3, _mm256_setzero_pd()); /*  0     Z3c-d | 0     Z3a-b */
 957
 958     /* Add across the two lanes by swapping and adding back */
 959     t1   = gmx_mm256_unpack128lo_pd(fix1, fiz1);                                       /* X2a-b Z1a-b | Y1a-b X1a-b */
 960     t2   = gmx_mm256_unpack128hi_pd(fix1, fiz1);                                       /* X2c-d Z1c-d | Y1c-d X1c-d */
 961     t1   = _mm256_add_pd(t1, t2);                                                      /* x2 z1 | y1 x1 */
 962
 963     t3   = gmx_mm256_unpack128lo_pd(fiy2, fix3);                                       /* Y3a-b X3a-b | Z2a-b Y2a-b */
 964     t4   = gmx_mm256_unpack128hi_pd(fiy2, fix3);                                       /* Y3c-d X3c-d | Z2c-d Y2c-d */
 965     t3   = _mm256_add_pd(t3, t4);                                                      /* y3 x3 | z2 y2 */
 966
 967     tz3  = _mm_add_pd(_mm256_castpd256_pd128(fiz3), _mm256_extractf128_pd(fiz3, 0x1)); /* 0 z3 */
 968
 969     t2   = _mm256_loadu_pd(fptr);
 970     t4   = _mm256_loadu_pd(fptr+4);
 971     tA   = _mm_load_sd(fptr+8);
 972
 973     t2   = _mm256_add_pd(t2, t1);
 974     t4   = _mm256_add_pd(t4, t3);
 975     tA   = _mm_add_sd(tA, tz3);
 976
 977     _mm256_storeu_pd(fptr, t2);
 978     _mm256_storeu_pd(fptr+4, t4);
 979     _mm_store_sd(fptr+8, tA);
 980
 981     /* Add up shift force */
 982     /* t1:   x2 z1 | y1 x1 */
 983     /* t3:   y3 x3 | z2 y2 */
 984     /* tz3:           0 z3 */
 985
 986     /* z component */
 987     tB   = _mm256_extractf128_pd(t1, 0x1);                                     /* x2 z1 */
 988     tC   = _mm256_extractf128_pd(t3, 0x1);                                     /* y3 x3 */
 989     tz3  = _mm_add_sd(tz3, tB);                                                /* 0  z1+z3 */
 990     tD   = _mm_permute_pd(_mm256_castpd256_pd128(t3), _GMX_MM_PERMUTE128D(1, 1));
 991     tz3  = _mm_add_sd(tz3, tD);                                                /* - z */
 992
 993     tC   = _mm_add_pd(tC, _mm256_castpd256_pd128(t1));                         /* y1+y3 x1+x3 */
 994
 995     tD   = _mm_shuffle_pd(tB, _mm256_castpd256_pd128(t3), _MM_SHUFFLE2(0, 1)); /* y2 x2 */
 996     tC   = _mm_add_pd(tC, tD);                                                 /* y x */
 997
 998     tA   = _mm_loadu_pd(fshiftptr);
 999     tB   = _mm_load_sd(fshiftptr+2);
1000     tA   = _mm_add_pd(tA, tC);
1001     tB   = _mm_add_sd(tB, tz3);
1002     _mm_storeu_pd(fshiftptr, tA);
1003     _mm_store_sd(fshiftptr+2, tB);
1004 }
1005 #endif
1006
1007
1008 #if defined (_MSC_VER) && defined(_M_IX86)
/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters.
 *
 * Reduces the twelve SIMD force accumulators of four atoms to scalars, adds
 * them to the twelve consecutive doubles at fptr, and adds the per-component
 * sum over the four atoms to the three doubles at fshiftptr.
 *
 * The force arguments must be modifiable lvalues of type __m256d; fix1, fiz1,
 * fiy2, fix3, fiz3 and fiy4 are overwritten by the macro.
 * Use it like a function call, followed by a semicolon.
 */
#define gmx_mm256_update_iforce_4atom_swizzle_pd(fix1, fiy1, fiz1, fix2, fiy2, fiz2, fix3, fiy3, fiz3, fix4, fiy4, fiz4, \
                                                 fptr, fshiftptr) \
    do { \
        __m256d _t1, _t2, _t3, _t4, _t5, _t6; \
        __m128d _tA, _tB, _tC, _tD; \
        fix1  = _mm256_hadd_pd(fix1, fiy1); \
        fiz1  = _mm256_hadd_pd(fiz1, fix2); \
        fiy2  = _mm256_hadd_pd(fiy2, fiz2); \
        fix3  = _mm256_hadd_pd(fix3, fiy3); \
        fiz3  = _mm256_hadd_pd(fiz3, fix4); \
        fiy4  = _mm256_hadd_pd(fiy4, fiz4); \
        _t1   = gmx_mm256_unpack128lo_pd(fix1, fiz1); \
        _t2   = gmx_mm256_unpack128hi_pd(fix1, fiz1); \
        _t1   = _mm256_add_pd(_t1, _t2); \
        _t3   = gmx_mm256_unpack128lo_pd(fiy2, fix3); \
        _t4   = gmx_mm256_unpack128hi_pd(fiy2, fix3); \
        _t3   = _mm256_add_pd(_t3, _t4); \
        _t5   = gmx_mm256_unpack128lo_pd(fiz3, fiy4); \
        _t6   = gmx_mm256_unpack128hi_pd(fiz3, fiy4); \
        _t5   = _mm256_add_pd(_t5, _t6); \
        _t2   = _mm256_loadu_pd(fptr); \
        _t4   = _mm256_loadu_pd((fptr)+4); \
        _t6   = _mm256_loadu_pd((fptr)+8); \
        _t2   = _mm256_add_pd(_t2, _t1); \
        _t4   = _mm256_add_pd(_t4, _t3); \
        _t6   = _mm256_add_pd(_t6, _t5); \
        _mm256_storeu_pd(fptr, _t2); \
        _mm256_storeu_pd((fptr)+4, _t4); \
        _mm256_storeu_pd((fptr)+8, _t6); \
        _tA   = _mm256_extractf128_pd(_t1, 0x1); \
        _tB   = _mm256_extractf128_pd(_t3, 0x1); \
        _tC   = _mm256_extractf128_pd(_t5, 0x1); \
        _tB   = _mm_add_pd(_tB, _mm256_castpd256_pd128(_t1)); \
        _tA   = _mm_add_pd(_tA, _mm256_castpd256_pd128(_t5)); \
        _tC   = _mm_add_pd(_tC, _mm256_castpd256_pd128(_t3)); \
        _tD   = _mm_shuffle_pd(_tA, _tC, _MM_SHUFFLE2(0, 1)); \
        _tB   = _mm_add_pd(_tB, _tD); \
        _tC   = _mm_permute_pd(_tC, _GMX_MM_PERMUTE128D(1, 1)); \
        _tC   = _mm_add_sd(_tC, _tA); \
        _tA   = _mm_loadu_pd(fshiftptr); \
        _tD   = _mm_load_sd((fshiftptr)+2); \
        _tA   = _mm_add_pd(_tA, _tB); \
        _tD   = _mm_add_sd(_tD, _tC); \
        _mm_storeu_pd(fshiftptr, _tA); \
        _mm_store_sd((fshiftptr)+2, _tD); \
    } while (0)
1056 #else
1057 /* Real function for sane compilers */
1058 static gmx_inline void
1059 gmx_mm256_update_iforce_4atom_swizzle_pd(__m256d fix1, __m256d fiy1, __m256d fiz1,
1060                                          __m256d fix2, __m256d fiy2, __m256d fiz2,
1061                                          __m256d fix3, __m256d fiy3, __m256d fiz3,
1062                                          __m256d fix4, __m256d fiy4, __m256d fiz4,
1063                                          double * gmx_restrict fptr,
1064                                          double * gmx_restrict fshiftptr)
1065 {
1066     __m256d t1, t2, t3, t4, t5, t6;
1067     __m128d tA, tB, tC, tD;
1068
1069     fix1 = _mm256_hadd_pd(fix1, fiy1);                /*  Y1c-d X1c-d | Y1a-b X1a-b */
1070     fiz1 = _mm256_hadd_pd(fiz1, fix2);                /*  X2c-d Z1c-d | X2a-b Z1a-b */
1071     fiy2 = _mm256_hadd_pd(fiy2, fiz2);                /*  Z2c-d Y2c-d | Z2a-b Y2a-b */
1072     fix3 = _mm256_hadd_pd(fix3, fiy3);                /*  Y3c-d X3c-d | Y3a-b X3a-b */
1073     fiz3 = _mm256_hadd_pd(fiz3, fix4);                /*  X4c-d Z3c-d | X4a-b Z3a-b */
1074     fiy4 = _mm256_hadd_pd(fiy4, fiz4);                /*  Z4c-d Y4c-d | Z4a-b Y4a-b */
1075
1076     /* Add across the two lanes by swapping and adding back */
1077     t1   = gmx_mm256_unpack128lo_pd(fix1, fiz1); /* X2a-b Z1a-b | Y1a-b X1a-b */
1078     t2   = gmx_mm256_unpack128hi_pd(fix1, fiz1); /* X2c-d Z1c-d | Y1c-d X1c-d */
1079     t1   = _mm256_add_pd(t1, t2);                /* x2 z1 | y1 x1 */
1080
1081     t3   = gmx_mm256_unpack128lo_pd(fiy2, fix3); /* Y3a-b X3a-b | Z2a-b Y2a-b */
1082     t4   = gmx_mm256_unpack128hi_pd(fiy2, fix3); /* Y3c-d X3c-d | Z2c-d Y2c-d */
1083     t3   = _mm256_add_pd(t3, t4);                /* y3 x3 | z2 y2 */
1084
1085     t5   = gmx_mm256_unpack128lo_pd(fiz3, fiy4); /* Z4a-b Y4a-b | X4a-b Z3a-b */
1086     t6   = gmx_mm256_unpack128hi_pd(fiz3, fiy4); /* Z4c-d Y4c-d | X4c-d Z3c-d */
1087     t5   = _mm256_add_pd(t5, t6);                /* z4 y4 | x4 z3 */
1088
1089     t2   = _mm256_loadu_pd(fptr);
1090     t4   = _mm256_loadu_pd(fptr+4);
1091     t6   = _mm256_loadu_pd(fptr+8);
1092
1093     t2   = _mm256_add_pd(t2, t1);
1094     t4   = _mm256_add_pd(t4, t3);
1095     t6   = _mm256_add_pd(t6, t5);
1096
1097     _mm256_storeu_pd(fptr, t2);
1098     _mm256_storeu_pd(fptr+4, t4);
1099     _mm256_storeu_pd(fptr+8, t6);
1100
1101     /* Add up shift force  */
1102     /* t1:   x2. z1. | y1. x1. */
1103     /* t3:   y3. x3. | z2 y2 */
1104     /* t5:   z4 y4 | x4. z3. */
1105
1106     /* z component */
1107     tA   = _mm256_extractf128_pd(t1, 0x1);                /* x2 z1 */
1108     tB   = _mm256_extractf128_pd(t3, 0x1);                /* y3 x3 */
1109     tC   = _mm256_extractf128_pd(t5, 0x1);                /* z4 y4 */
1110
1111     tB   = _mm_add_pd(tB, _mm256_castpd256_pd128(t1));    /*  y1+y3  x1+x3 */
1112     tA   = _mm_add_pd(tA, _mm256_castpd256_pd128(t5));    /*  x2+x4  z1+z3 */
1113     tC   = _mm_add_pd(tC, _mm256_castpd256_pd128(t3));    /*  z4+z2  y4+y2 */
1114
1115     tD   = _mm_shuffle_pd(tA, tC, _MM_SHUFFLE2(0, 1));    /* y4+y2 x2+x4 */
1116     tB   = _mm_add_pd(tB, tD);                            /* y x */
1117     tC   = _mm_permute_pd(tC, _GMX_MM_PERMUTE128D(1, 1)); /*    - z4+z2 */
1118     tC   = _mm_add_sd(tC, tA);                            /* - z */
1119
1120     tA   = _mm_loadu_pd(fshiftptr);
1121     tD   = _mm_load_sd(fshiftptr+2);
1122     tA   = _mm_add_pd(tA, tB);
1123     tD   = _mm_add_sd(tD, tC);
1124     _mm_storeu_pd(fshiftptr, tA);
1125     _mm_store_sd(fshiftptr+2, tD);
1126 }
1127 #endif
1128
1129
1130
1131 static void
1132 gmx_mm256_update_1pot_pd(__m256d pot1, double * gmx_restrict ptrA)
1133 {
1134     __m128d t1;
1135
1136     pot1 = _mm256_hadd_pd(pot1, pot1);
1137
1138     t1   = _mm_add_pd(_mm256_castpd256_pd128(pot1), _mm256_extractf128_pd(pot1, 0x1));
1139
1140     _mm_store_sd(ptrA, _mm_add_sd(_mm_load_sd(ptrA), t1));
1141 }
1142
1143 static void
1144 gmx_mm256_update_2pot_pd(__m256d pot1, double * gmx_restrict ptrA,
1145                          __m256d pot2, double * gmx_restrict ptrB)
1146 {
1147     __m128d t1, t2;
1148
1149     pot1 = _mm256_hadd_pd(pot1, pot2);
1150
1151     t1   = _mm_add_pd(_mm256_castpd256_pd128(pot1), _mm256_extractf128_pd(pot1, 0x1));
1152
1153     t2   = _mm_permute_pd(t1, _GMX_MM_PERMUTE128D(1, 1));
1154     _mm_store_sd(ptrA, _mm_add_sd(_mm_load_sd(ptrA), t1));
1155     _mm_store_sd(ptrB, _mm_add_sd(_mm_load_sd(ptrB), t2));
1156 }
1157
1158
1159 #endif /* _kernelutil_x86_avx_256_double_h_ */