src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_double/kernelutil_x86_avx_256_double.h

   1 /*
   2  *                This source code is part of
   3  *
   4  *                 G   R   O   M   A   C   S
   5  *
   6  * Copyright (c) 2011-2012, The GROMACS Development Team
   7  *
   8  * Gromacs is a library for molecular simulation and trajectory analysis,
   9  * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
  10  * a full list of developers and information, check out http://www.gromacs.org
  11  *
  12  * This program is free software; you can redistribute it and/or modify it under
  13  * the terms of the GNU Lesser General Public License as published by the Free
  14  * Software Foundation; either version 2 of the License, or (at your option) any
  15  * later version.
  16  * As a special exception, you may use this file as part of a free software
  17  * library without restriction.  Specifically, if other files instantiate
  18  * templates or use macros or inline functions from this file, or you compile
  19  * this file and link it with other files to produce an executable, this
  20  * file does not by itself cause the resulting executable to be covered by
  21  * the GNU Lesser General Public License.
  22  *
  23  * In plain-speak: do not worry about classes/macros/templates either - only
  24  * changes to the library have to be LGPL, not an application linking with it.
  25  *
  26  * To help fund GROMACS development, we humbly ask that you cite
  27  * the papers people have written on it - you can find them on the website!
  28  */
  29 #ifndef _kernelutil_x86_avx_256_double_h_
  30 #define _kernelutil_x86_avx_256_double_h_
  31
  32
  33 #include "gmx_x86_avx_256.h"
  34
  35
  36 static int
  37 gmx_mm256_any_lt(__m256d a, __m256d b)
  38 {
  39     return _mm256_movemask_pd(_mm256_cmp_pd(a,b,_CMP_LT_OQ));
  40 }
  41
  42 static gmx_inline __m256d
  43 gmx_mm256_calc_rsq_pd(__m256d dx, __m256d dy, __m256d dz)
  44 {
  45     return _mm256_add_pd( _mm256_add_pd( _mm256_mul_pd(dx,dx), _mm256_mul_pd(dy,dy) ), _mm256_mul_pd(dz,dz) );
  46 }
  47
  48 /* Normal sum of four ymm registers */
  49 #define gmx_mm256_sum4_pd(t0,t1,t2,t3)  _mm256_add_pd(_mm256_add_pd(t0,t1),_mm256_add_pd(t2,t3))
  50
  51
  52 /* Load a single value from 1-4 places, merge into xmm register */
  53 static __m256d
  54 gmx_mm256_load_1real_pd(const double * gmx_restrict ptrA)
  55 {
  56     return _mm256_castpd128_pd256(_mm_load_sd(ptrA));
  57 }
  58
  59 static __m256d
  60 gmx_mm256_load_2real_swizzle_pd(const double * gmx_restrict ptrA,
  61                                 const double * gmx_restrict ptrB)
  62 {
  63     __m128d tA,tB;
  64
  65     tA = _mm_load_sd(ptrA);
  66     tB = _mm_load_sd(ptrB);
  67
  68     return _mm256_castpd128_pd256(_mm_unpacklo_pd(tA,tB));
  69 }
  70
  71
  72 static __m256d
  73 gmx_mm256_load_4real_swizzle_pd(const double * gmx_restrict ptrA, const double * gmx_restrict ptrB,
  74                                 const double * gmx_restrict ptrC, const double * gmx_restrict ptrD)
  75 {
  76     __m128d t1,t2;
  77
  78     t1 = _mm_unpacklo_pd(_mm_load_sd(ptrA),_mm_load_sd(ptrB));
  79     t2 = _mm_unpacklo_pd(_mm_load_sd(ptrC),_mm_load_sd(ptrD));
  80     return gmx_mm256_set_m128d(t2,t1);
  81 }
  82
  83
  84
  85 static void
  86 gmx_mm256_store_1real_pd(double * gmx_restrict ptrA, __m256d xmm1)
  87 {
  88     _mm_store_sd(ptrA,_mm256_castpd256_pd128(xmm1));
  89 }
  90
  91
  92 static void
  93 gmx_mm256_store_2real_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB, __m256d xmm1)
  94 {
  95     __m256d t2;
  96
  97     t2       = _mm256_permute_pd(xmm1,_GMX_MM_PERMUTE256D(1,1,1,1));
  98     _mm_store_sd(ptrA,_mm256_castpd256_pd128(xmm1));
  99     _mm_store_sd(ptrB,_mm256_castpd256_pd128(t2));
 100 }
 101
 102
 103
 104
 105 static void
 106 gmx_mm256_store_4real_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
 107                                  double * gmx_restrict ptrC, double * gmx_restrict ptrD, __m256d xmm1)
 108 {
 109     __m256d t2;
 110     __m128d t3,t4;
 111
 112     t2       = _mm256_permute_pd(xmm1,_GMX_MM_PERMUTE256D(1,1,1,1));
 113     t3       = _mm256_extractf128_pd(xmm1,0x1);
 114     t4       = _mm_permute_pd(t3,_GMX_MM_PERMUTE128D(1,1));
 115     _mm_store_sd(ptrA,_mm256_castpd256_pd128(xmm1));
 116     _mm_store_sd(ptrB,_mm256_castpd256_pd128(t2));
 117     _mm_store_sd(ptrC,t3);
 118     _mm_store_sd(ptrD,t4);
 119 }
 120
 121
 122
 123
 124 static void
 125 gmx_mm256_increment_1real_pd(double * gmx_restrict ptrA, __m256d xmm1)
 126 {
 127     __m128d t1;
 128
 129     t1   = _mm256_castpd256_pd128(xmm1);
 130     t1   = _mm_add_sd(t1,_mm_load_sd(ptrA));
 131
 132     _mm_store_sd(ptrA,t1);
 133 }
 134
 135
 136 static void
 137 gmx_mm256_increment_2real_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB, __m256d xmm1)
 138 {
 139     __m128d t1,t2;
 140
 141     t1   = _mm256_castpd256_pd128(xmm1);
 142     t2   = _mm_permute_pd(t1,_GMX_MM_PERMUTE128D(1,1));
 143
 144     t1   = _mm_add_sd(t1,_mm_load_sd(ptrA));
 145     t2   = _mm_add_sd(t2,_mm_load_sd(ptrB));
 146
 147     _mm_store_sd(ptrA,t1);
 148     _mm_store_sd(ptrB,t2);
 149 }
 150
 151
 152 static void
 153 gmx_mm256_increment_4real_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
 154                                      double * gmx_restrict ptrC, double * gmx_restrict ptrD, __m256d xmm1)
 155 {
 156     __m128d t1,t2,t3,t4;
 157
 158     t1   = _mm256_castpd256_pd128(xmm1);
 159     t2   = _mm_permute_pd(t1,_GMX_MM_PERMUTE128D(1,1));
 160     t3   = _mm256_extractf128_pd(xmm1,0x1);
 161     t4   = _mm_permute_pd(t3,_GMX_MM_PERMUTE128D(1,1));
 162
 163     t1   = _mm_add_sd(t1,_mm_load_sd(ptrA));
 164     t2   = _mm_add_sd(t2,_mm_load_sd(ptrB));
 165     t3   = _mm_add_sd(t3,_mm_load_sd(ptrC));
 166     t4   = _mm_add_sd(t4,_mm_load_sd(ptrD));
 167
 168     _mm_store_sd(ptrA,t1);
 169     _mm_store_sd(ptrB,t2);
 170     _mm_store_sd(ptrC,t3);
 171     _mm_store_sd(ptrD,t4);
 172 }
 173
 174
 175
 176 static void
 177 gmx_mm256_load_1pair_swizzle_pd(const double * gmx_restrict p1, __m256d *c6, __m256d *c12)
 178 {
 179     *c6     = _mm256_castpd128_pd256(_mm_load_sd(p1));
 180     *c12    = _mm256_castpd128_pd256(_mm_load_sd(p1+1));
 181 }
 182
 183
 184 static void
 185 gmx_mm256_load_2pair_swizzle_pd(const double * gmx_restrict p1, const double * gmx_restrict p2, __m256d *c6, __m256d *c12)
 186 {
 187     __m128d t1,t2,t3;
 188
 189     t1   = _mm_loadu_pd(p1);
 190     t2   = _mm_loadu_pd(p2);
 191     *c6  = _mm256_castpd128_pd256(_mm_unpacklo_pd(t1,t2));
 192     *c12 = _mm256_castpd128_pd256(_mm_unpackhi_pd(t1,t2));
 193 }
 194
 195
 196
 197 static void
 198 gmx_mm256_load_4pair_swizzle_pd(const double * gmx_restrict p1, const double * gmx_restrict p2,
 199                                 const double * gmx_restrict p3, const double * gmx_restrict p4,
 200                                 __m256d * gmx_restrict c6, __m256d * gmx_restrict c12)
 201 {
 202     __m256d t1,t2;
 203
 204     t1   = gmx_mm256_set_m128d(_mm_loadu_pd(p3),_mm_loadu_pd(p1)); /* c12c  c6c | c12a  c6a */
 205     t2   = gmx_mm256_set_m128d(_mm_loadu_pd(p4),_mm_loadu_pd(p2)); /* c12d  c6d | c12b  c6b */
 206
 207     *c6  = _mm256_unpacklo_pd(t1,t2); /* c6d c6c | c6b c6a */
 208     *c12 = _mm256_unpackhi_pd(t1,t2); /* c12d c12c | c12b c12a */
 209 }
 210
 211
 212 static gmx_inline void
 213 gmx_mm256_load_shift_and_1rvec_broadcast_pd(const double * gmx_restrict xyz_shift,
 214         const double * gmx_restrict xyz,
 215         __m256d * gmx_restrict x1,
 216         __m256d * gmx_restrict y1,
 217         __m256d * gmx_restrict z1)
 218 {
 219     __m128d mem_xy,mem_z,mem_sxy,mem_sz,tx,ty,tz;
 220
 221     mem_xy  = _mm_loadu_pd(xyz);
 222     mem_z   = _mm_load_sd(xyz+2);
 223     mem_sxy = _mm_loadu_pd(xyz_shift);
 224     mem_sz  = _mm_load_sd(xyz_shift+2);
 225
 226     mem_xy  = _mm_add_pd(mem_xy,mem_sxy);
 227     mem_z   = _mm_add_pd(mem_z,mem_sz);
 228
 229     tx  = _mm_shuffle_pd(mem_xy,mem_xy,_MM_SHUFFLE2(0,0));
 230     ty  = _mm_shuffle_pd(mem_xy,mem_xy,_MM_SHUFFLE2(1,1));
 231     tz  = _mm_shuffle_pd(mem_z,mem_z,_MM_SHUFFLE2(0,0));
 232
 233     *x1 = gmx_mm256_set_m128d(tx,tx);
 234     *y1 = gmx_mm256_set_m128d(ty,ty);
 235     *z1 = gmx_mm256_set_m128d(tz,tz);
 236 }
 237
 238
 239 static gmx_inline void
 240 gmx_mm256_load_shift_and_3rvec_broadcast_pd(const double * gmx_restrict xyz_shift,
 241         const double * gmx_restrict xyz,
 242         __m256d * gmx_restrict x1, __m256d * gmx_restrict y1, __m256d * gmx_restrict z1,
 243         __m256d * gmx_restrict x2, __m256d * gmx_restrict y2, __m256d * gmx_restrict z2,
 244         __m256d * gmx_restrict x3, __m256d * gmx_restrict y3, __m256d * gmx_restrict z3)
 245 {
 246     __m128d t1,t2,t3,t4,t5,sxy,sz,szx,syz,tx,ty,tz;
 247
 248     t1  = _mm_loadu_pd(xyz);
 249     t2  = _mm_loadu_pd(xyz+2);
 250     t3  = _mm_loadu_pd(xyz+4);
 251     t4  = _mm_loadu_pd(xyz+6);
 252     t5  = _mm_load_sd(xyz+8);
 253
 254     sxy = _mm_loadu_pd(xyz_shift);
 255     sz  = _mm_load_sd(xyz_shift+2);
 256     szx = _mm_shuffle_pd(sz,sxy,_MM_SHUFFLE2(0,0));
 257     syz = _mm_shuffle_pd(sxy,sz,_MM_SHUFFLE2(0,1));
 258
 259     t1  = _mm_add_pd(t1,sxy);
 260     t2  = _mm_add_pd(t2,szx);
 261     t3  = _mm_add_pd(t3,syz);
 262     t4  = _mm_add_pd(t4,sxy);
 263     t5  = _mm_add_sd(t5,sz);
 264
 265     tx   = _mm_shuffle_pd(t1,t1,_MM_SHUFFLE2(0,0));
 266     ty   = _mm_shuffle_pd(t1,t1,_MM_SHUFFLE2(1,1));
 267     tz   = _mm_shuffle_pd(t2,t2,_MM_SHUFFLE2(0,0));
 268     *x1 = gmx_mm256_set_m128d(tx,tx);
 269     *y1 = gmx_mm256_set_m128d(ty,ty);
 270     *z1 = gmx_mm256_set_m128d(tz,tz);
 271     tx   = _mm_shuffle_pd(t2,t2,_MM_SHUFFLE2(1,1));
 272     ty   = _mm_shuffle_pd(t3,t3,_MM_SHUFFLE2(0,0));
 273     tz   = _mm_shuffle_pd(t3,t3,_MM_SHUFFLE2(1,1));
 274     *x2 = gmx_mm256_set_m128d(tx,tx);
 275     *y2 = gmx_mm256_set_m128d(ty,ty);
 276     *z2 = gmx_mm256_set_m128d(tz,tz);
 277     tx   = _mm_shuffle_pd(t4,t4,_MM_SHUFFLE2(0,0));
 278     ty   = _mm_shuffle_pd(t4,t4,_MM_SHUFFLE2(1,1));
 279     tz   = _mm_shuffle_pd(t5,t5,_MM_SHUFFLE2(0,0));
 280     *x3 = gmx_mm256_set_m128d(tx,tx);
 281     *y3 = gmx_mm256_set_m128d(ty,ty);
 282     *z3 = gmx_mm256_set_m128d(tz,tz);
 283 }
 284
 285
 286 static gmx_inline void
 287 gmx_mm256_load_shift_and_4rvec_broadcast_pd(const double * gmx_restrict xyz_shift,
 288         const double * gmx_restrict xyz,
 289         __m256d * gmx_restrict x1, __m256d * gmx_restrict y1, __m256d * gmx_restrict z1,
 290         __m256d * gmx_restrict x2, __m256d * gmx_restrict y2, __m256d * gmx_restrict z2,
 291         __m256d * gmx_restrict x3, __m256d * gmx_restrict y3, __m256d * gmx_restrict z3,
 292         __m256d * gmx_restrict x4, __m256d * gmx_restrict y4, __m256d * gmx_restrict z4)
 293 {
 294     __m128d t1,t2,t3,t4,t5,t6,sxy,sz,szx,syz,tx,ty,tz;
 295
 296     t1  = _mm_loadu_pd(xyz);
 297     t2  = _mm_loadu_pd(xyz+2);
 298     t3  = _mm_loadu_pd(xyz+4);
 299     t4  = _mm_loadu_pd(xyz+6);
 300     t5  = _mm_loadu_pd(xyz+8);
 301     t6  = _mm_loadu_pd(xyz+10);
 302
 303     sxy = _mm_loadu_pd(xyz_shift);
 304     sz  = _mm_load_sd(xyz_shift+2);
 305     szx = _mm_shuffle_pd(sz,sxy,_MM_SHUFFLE2(0,0));
 306     syz = _mm_shuffle_pd(sxy,sz,_MM_SHUFFLE2(0,1));
 307
 308     t1  = _mm_add_pd(t1,sxy);
 309     t2  = _mm_add_pd(t2,szx);
 310     t3  = _mm_add_pd(t3,syz);
 311     t4  = _mm_add_pd(t4,sxy);
 312     t5  = _mm_add_pd(t5,szx);
 313     t6  = _mm_add_pd(t6,syz);
 314
 315     tx   = _mm_shuffle_pd(t1,t1,_MM_SHUFFLE2(0,0));
 316     ty   = _mm_shuffle_pd(t1,t1,_MM_SHUFFLE2(1,1));
 317     tz   = _mm_shuffle_pd(t2,t2,_MM_SHUFFLE2(0,0));
 318     *x1 = gmx_mm256_set_m128d(tx,tx);
 319     *y1 = gmx_mm256_set_m128d(ty,ty);
 320     *z1 = gmx_mm256_set_m128d(tz,tz);
 321     tx   = _mm_shuffle_pd(t2,t2,_MM_SHUFFLE2(1,1));
 322     ty   = _mm_shuffle_pd(t3,t3,_MM_SHUFFLE2(0,0));
 323     tz   = _mm_shuffle_pd(t3,t3,_MM_SHUFFLE2(1,1));
 324     *x2 = gmx_mm256_set_m128d(tx,tx);
 325     *y2 = gmx_mm256_set_m128d(ty,ty);
 326     *z2 = gmx_mm256_set_m128d(tz,tz);
 327     tx   = _mm_shuffle_pd(t4,t4,_MM_SHUFFLE2(0,0));
 328     ty   = _mm_shuffle_pd(t4,t4,_MM_SHUFFLE2(1,1));
 329     tz   = _mm_shuffle_pd(t5,t5,_MM_SHUFFLE2(0,0));
 330     *x3 = gmx_mm256_set_m128d(tx,tx);
 331     *y3 = gmx_mm256_set_m128d(ty,ty);
 332     *z3 = gmx_mm256_set_m128d(tz,tz);
 333     tx   = _mm_shuffle_pd(t5,t5,_MM_SHUFFLE2(1,1));
 334     ty   = _mm_shuffle_pd(t6,t6,_MM_SHUFFLE2(0,0));
 335     tz   = _mm_shuffle_pd(t6,t6,_MM_SHUFFLE2(1,1));
 336     *x4 = gmx_mm256_set_m128d(tx,tx);
 337     *y4 = gmx_mm256_set_m128d(ty,ty);
 338     *z4 = gmx_mm256_set_m128d(tz,tz);
 339 }
 340
 341
 342 static void
 343 gmx_mm256_load_1rvec_1ptr_swizzle_pd(const double * gmx_restrict p1,
 344                                      __m256d * gmx_restrict x, __m256d * gmx_restrict y, __m256d * gmx_restrict z)
 345 {
 346     __m256d t1;
 347
 348     t1            = _mm256_loadu_pd(p1);
 349     *x            = t1;
 350     *y            = _mm256_permute_pd(t1,_GMX_MM_PERMUTE256D(0,1,0,1));
 351     *z            = _mm256_castpd128_pd256(_mm256_extractf128_pd(t1,0x1));
 352 }
 353
 354
 355 static void
 356 gmx_mm256_load_3rvec_1ptr_swizzle_pd(const double * gmx_restrict p1,
 357                                      __m256d * gmx_restrict x1, __m256d * gmx_restrict y1, __m256d * gmx_restrict z1,
 358                                      __m256d * gmx_restrict x2, __m256d * gmx_restrict y2, __m256d * gmx_restrict z2,
 359                                      __m256d * gmx_restrict x3, __m256d * gmx_restrict y3, __m256d * gmx_restrict z3)
 360 {
 361     __m256d t1,t2,t3,t4;
 362
 363     t1            = _mm256_loadu_pd(p1);
 364     t3            = _mm256_loadu_pd(p1+4);
 365     *x1           = t1;
 366     *y2           = t3;
 367     t2            = gmx_mm256_unpack128hi_pd(t1,t1);
 368     t4            = gmx_mm256_unpack128hi_pd(t3,t3);
 369     *z1           = t2;
 370     *x3           = t4;
 371     *y1           = _mm256_permute_pd(t1,_GMX_MM_PERMUTE256D(0,1,0,1));
 372     *z2           = _mm256_permute_pd(t3,_GMX_MM_PERMUTE256D(0,1,0,1));
 373     *x2           = _mm256_permute_pd(t2,_GMX_MM_PERMUTE256D(0,1,0,1));
 374     *y3           = _mm256_permute_pd(t4,_GMX_MM_PERMUTE256D(0,1,0,1));
 375     *z3           = _mm256_castpd128_pd256(_mm_load_sd(p1+8));
 376 }
 377
 378 static void
 379 gmx_mm256_load_4rvec_1ptr_swizzle_pd(const double * gmx_restrict p1,
 380                                      __m256d * gmx_restrict x1, __m256d * gmx_restrict y1, __m256d * gmx_restrict z1,
 381                                      __m256d * gmx_restrict x2, __m256d * gmx_restrict y2, __m256d * gmx_restrict z2,
 382                                      __m256d * gmx_restrict x3, __m256d * gmx_restrict y3, __m256d * gmx_restrict z3,
 383                                      __m256d * gmx_restrict x4, __m256d * gmx_restrict y4, __m256d * gmx_restrict z4)
 384 {
 385     __m256d t1,t2,t3,t4,t5,t6;
 386
 387     t1            = _mm256_loadu_pd(p1);
 388     t2            = _mm256_loadu_pd(p1+4);
 389     t3            = _mm256_loadu_pd(p1+8);
 390
 391     t4            = _mm256_castpd128_pd256(_mm256_extractf128_pd(t1,0x1));
 392     t5            = _mm256_castpd128_pd256(_mm256_extractf128_pd(t2,0x1));
 393     t6            = _mm256_castpd128_pd256(_mm256_extractf128_pd(t3,0x1));
 394
 395     *x1           = t1;
 396     *y2           = t2;
 397     *z3           = t3;
 398     *z1           = t4;
 399     *x3           = t5;
 400     *y4           = t6;
 401
 402     *y1           = _mm256_permute_pd(t1,_GMX_MM_PERMUTE256D(0,1,0,1));
 403     *z2           = _mm256_permute_pd(t2,_GMX_MM_PERMUTE256D(0,1,0,1));
 404     *x4           = _mm256_permute_pd(t3,_GMX_MM_PERMUTE256D(0,1,0,1));
 405     *x2           = _mm256_permute_pd(t4,_GMX_MM_PERMUTE256D(0,1,0,1));
 406     *y3           = _mm256_permute_pd(t5,_GMX_MM_PERMUTE256D(0,1,0,1));
 407     *z4           = _mm256_permute_pd(t6,_GMX_MM_PERMUTE256D(0,1,0,1));
 408 }
 409
 410
 411 static void
 412 gmx_mm256_load_1rvec_4ptr_swizzle_pd(const double * gmx_restrict ptrA, const double * gmx_restrict ptrB,
 413                                      const double * gmx_restrict ptrC, const double * gmx_restrict ptrD,
 414                                      __m256d * gmx_restrict x1, __m256d * gmx_restrict y1, __m256d * gmx_restrict z1)
 415 {
 416     __m256d t1,t2,t3,t4,t5,t6;
 417
 418     t1           = _mm256_loadu_pd(ptrA);        /*   -  z1a | y1a x1a */
 419     t2           = _mm256_loadu_pd(ptrB);        /*   -  z1b | y1b x1b */
 420     t3           = _mm256_loadu_pd(ptrC);        /*   -  z1c | y1c x1c */
 421     t4           = _mm256_loadu_pd(ptrD);        /*   -  z1d | y1d x1d */
 422
 423     t5           = _mm256_unpacklo_pd(t1,t2);      /*  z1b z1a | x1b x1a */
 424     t6           = _mm256_unpackhi_pd(t1,t2);      /*   -   -  | y1b y1a */
 425     t1           = _mm256_unpacklo_pd(t3,t4);      /*  z1c z1c | x1d x1c */
 426     t2           = _mm256_unpackhi_pd(t3,t4);      /*   -   -  | y1d y1c */
 427
 428     *x1          = gmx_mm256_unpack128lo_pd(t5,t1);
 429     *y1          = gmx_mm256_unpack128lo_pd(t6,t2);
 430     *z1          = gmx_mm256_unpack128hi_pd(t5,t1);
 431 }
 432
 433
 434
 435 static void
 436 gmx_mm256_load_3rvec_4ptr_swizzle_pd(const double * gmx_restrict ptrA, const double * gmx_restrict ptrB,
 437                                      const double * gmx_restrict ptrC, const double * gmx_restrict ptrD,
 438                                      __m256d * gmx_restrict x1, __m256d * gmx_restrict y1, __m256d * gmx_restrict z1,
 439                                      __m256d * gmx_restrict x2, __m256d * gmx_restrict y2, __m256d * gmx_restrict z2,
 440                                      __m256d * gmx_restrict x3, __m256d * gmx_restrict y3, __m256d * gmx_restrict z3)
 441 {
 442     __m256d t1,t2,t3,t4,t5,t6,t7,t8,t9,t10,t11,t12,t13,t14;
 443
 444     t1           = _mm256_loadu_pd(ptrA);        /*  x2a z1a | y1a x1a */
 445     t2           = _mm256_loadu_pd(ptrB);        /*  x2b z1b | y1b x1b */
 446     t3           = _mm256_loadu_pd(ptrC);        /*  x2c z1c | y1c x1c */
 447     t4           = _mm256_loadu_pd(ptrD);        /*  x2d z1d | y1d x1d */
 448     t5           = _mm256_loadu_pd(ptrA+4);        /*  y3a x3a | z2a y2a */
 449     t6           = _mm256_loadu_pd(ptrB+4);        /*  y3b x3b | z2b y2b */
 450     t7           = _mm256_loadu_pd(ptrC+4);        /*  y3c x3c | z2c y2c */
 451     t8           = _mm256_loadu_pd(ptrD+4);        /*  y3d x3d | z2d y2d */
 452     t9           = _mm256_castpd128_pd256(_mm_load_sd(ptrA+8));        /*   -   -  |  -  z3a */
 453     t10          = _mm256_castpd128_pd256(_mm_load_sd(ptrB+8));        /*   -   -  |  -  z3b */
 454     t11          = _mm256_castpd128_pd256(_mm_load_sd(ptrC+8));        /*   -   -  |  -  z3c */
 455     t12          = _mm256_castpd128_pd256(_mm_load_sd(ptrD+8));        /*   -   -  |  -  z3d */
 456
 457     t13          = _mm256_unpacklo_pd(t1,t2);      /*  z1b z1a | x1b x1a */
 458     t14          = _mm256_unpackhi_pd(t1,t2);      /*  x2b x2a | y1b y1a */
 459     t1           = _mm256_unpacklo_pd(t3,t4);      /*  z1d z1c | x1d x1c */
 460     t2           = _mm256_unpackhi_pd(t3,t4);      /*  x2d x2c | y1d y1c */
 461
 462     t3           = _mm256_unpacklo_pd(t5,t6);      /*  x3b x3a | y2b y2a */
 463     t4           = _mm256_unpackhi_pd(t5,t6);      /*  y3b y3a | z2b z2a */
 464     t5           = _mm256_unpacklo_pd(t7,t8);      /*  x3d x3c | y2d y2c */
 465     t6           = _mm256_unpackhi_pd(t7,t8);      /*  y3d y3c | z2d z2c */
 466
 467     t9           = _mm256_unpacklo_pd(t9,t10);     /*   -   -  | z3b z3a */
 468     t11          = _mm256_unpacklo_pd(t11,t12);    /*   -   -  | z3d z3c */
 469
 470     *x1          = gmx_mm256_unpack128lo_pd(t13,t1);
 471     *y1          = gmx_mm256_unpack128lo_pd(t14,t2);
 472     *z1          = gmx_mm256_unpack128hi_pd(t13,t1);
 473     *x2          = gmx_mm256_unpack128hi_pd(t14,t2);
 474     *y2          = gmx_mm256_unpack128lo_pd(t3,t5);
 475     *z2          = gmx_mm256_unpack128lo_pd(t4,t6);
 476     *x3          = gmx_mm256_unpack128hi_pd(t3,t5);
 477     *y3          = gmx_mm256_unpack128hi_pd(t4,t6);
 478     *z3          = gmx_mm256_unpack128lo_pd(t9,t11);
 479 }
 480
 481
 482
 483 static void
 484 gmx_mm256_load_4rvec_4ptr_swizzle_pd(const double * gmx_restrict ptrA, const double * gmx_restrict ptrB,
 485                                      const double * gmx_restrict ptrC, const double * gmx_restrict ptrD,
 486                                      __m256d * gmx_restrict x1, __m256d * gmx_restrict y1, __m256d * gmx_restrict z1,
 487                                      __m256d * gmx_restrict x2, __m256d * gmx_restrict y2, __m256d * gmx_restrict z2,
 488                                      __m256d * gmx_restrict x3, __m256d * gmx_restrict y3, __m256d * gmx_restrict z3,
 489                                      __m256d * gmx_restrict x4, __m256d * gmx_restrict y4, __m256d * gmx_restrict z4)
 490 {
 491     __m256d t1,t2,t3,t4,t5,t6,t7,t8,t9,t10,t11,t12,t13,t14;
 492
 493     t1           = _mm256_loadu_pd(ptrA);        /*  x2a z1a | y1a x1a */
 494     t2           = _mm256_loadu_pd(ptrB);        /*  x2b z1b | y1b x1b */
 495     t3           = _mm256_loadu_pd(ptrC);        /*  x2c z1c | y1c x1c */
 496     t4           = _mm256_loadu_pd(ptrD);        /*  x2d z1d | y1d x1d */
 497     t5           = _mm256_loadu_pd(ptrA+4);        /*  y3a x3a | z2a y2a */
 498     t6           = _mm256_loadu_pd(ptrB+4);        /*  y3b x3b | z2b y2b */
 499     t7           = _mm256_loadu_pd(ptrC+4);        /*  y3c x3c | z2c y2c */
 500     t8           = _mm256_loadu_pd(ptrD+4);        /*  y3d x3d | z2d y2d */
 501     t9           = _mm256_loadu_pd(ptrA+8);        /*  z4a y4a | x4a z3a */
 502     t10          = _mm256_loadu_pd(ptrB+8);        /*  z4b y4b | x4b z3b */
 503     t11          = _mm256_loadu_pd(ptrC+8);        /*  z4c y4c | x4c z3c */
 504     t12          = _mm256_loadu_pd(ptrD+8);        /*  z4d y4d | x4d z3d */
 505
 506     t13          = _mm256_unpacklo_pd(t1,t2);      /*  z1b z1a | x1b x1a */
 507     t14          = _mm256_unpackhi_pd(t1,t2);      /*  x2b x2a | y1b y1a */
 508     t1           = _mm256_unpacklo_pd(t3,t4);      /*  z1d z1c | x1d x1c */
 509     t2           = _mm256_unpackhi_pd(t3,t4);      /*  x2d x2c | y1d y1c */
 510
 511     t3           = _mm256_unpacklo_pd(t5,t6);      /*  x3b x3a | y2b y2a */
 512     t4           = _mm256_unpackhi_pd(t5,t6);      /*  y3b y3a | z2b z2a */
 513     t5           = _mm256_unpacklo_pd(t7,t8);      /*  x3d x3c | y2d y2c */
 514     t6           = _mm256_unpackhi_pd(t7,t8);      /*  y3d y3c | z2d z2c */
 515
 516     t7           = _mm256_unpacklo_pd(t9,t10);     /*  y4b y4a | z3b z3a */
 517     t8           = _mm256_unpackhi_pd(t9,t10);     /*  z4b z4a | x4b x4a */
 518     t9           = _mm256_unpacklo_pd(t11,t12);    /*  y4d y4c | z3d z3c */
 519     t10          = _mm256_unpackhi_pd(t11,t12);    /*  z4d z4c | x4d x4c */
 520
 521     *x1          = gmx_mm256_unpack128lo_pd(t13,t1);
 522     *y1          = gmx_mm256_unpack128lo_pd(t14,t2);
 523     *z1          = gmx_mm256_unpack128hi_pd(t13,t1);
 524     *x2          = gmx_mm256_unpack128hi_pd(t14,t2);
 525     *y2          = gmx_mm256_unpack128lo_pd(t3,t5);
 526     *z2          = gmx_mm256_unpack128lo_pd(t4,t6);
 527     *x3          = gmx_mm256_unpack128hi_pd(t3,t5);
 528     *y3          = gmx_mm256_unpack128hi_pd(t4,t6);
 529     *z3          = gmx_mm256_unpack128lo_pd(t7,t9);
 530     *x4          = gmx_mm256_unpack128lo_pd(t8,t10);
 531     *y4          = gmx_mm256_unpack128hi_pd(t7,t9);
 532     *z4          = gmx_mm256_unpack128hi_pd(t8,t10);
 533 }
 534
 535
 536
 537 static void
 538 gmx_mm256_decrement_1rvec_4ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
 539         double * gmx_restrict ptrC, double * gmx_restrict ptrD,
 540         __m256d x1, __m256d y1, __m256d z1)
 541 {
 542     __m256d t1,t2,tA,tB,tC,tD;
 543     __m256i mask;
 544
 545     t1          = _mm256_unpacklo_pd(x1,y1); /*  y1c x1c | y1a x1a */
 546     t2          = _mm256_unpackhi_pd(x1,y1); /*  y1d x1d | y1b x1b */
 547     x1          = gmx_mm256_unpack128lo_pd(t1,z1); /*  -  z1a | y1a x1a */
 548     y1          = gmx_mm256_unpack128hi_pd(t1,z1); /*  -  z1c | y1c x1c */
 549     z1          = _mm256_permute_pd(z1,_GMX_MM_PERMUTE256D(0,1,0,1));
 550     t1          = gmx_mm256_unpack128lo_pd(t2,z1); /*  -  z1b | y1b x1b */
 551     z1          = gmx_mm256_unpack128hi_pd(t2,z1); /*  -  z1d | y1d x1d */
 552
 553     /* Construct a mask without executing any data loads */
 554     mask        = _mm256_castpd_si256(_mm256_blend_pd(_mm256_setzero_pd(),
 555                                       _mm256_cmp_pd(_mm256_setzero_pd(),_mm256_setzero_pd(),_CMP_EQ_OQ),0x7));
 556
 557     tA          = _mm256_loadu_pd(ptrA);
 558     tB          = _mm256_loadu_pd(ptrB);
 559     tC          = _mm256_loadu_pd(ptrC);
 560     tD          = _mm256_loadu_pd(ptrD);
 561
 562     tA          = _mm256_sub_pd(tA,x1);
 563     tB          = _mm256_sub_pd(tB,t1);
 564     tC          = _mm256_sub_pd(tC,y1);
 565     tD          = _mm256_sub_pd(tD,z1);
 566
 567     _mm256_maskstore_pd(ptrA,mask,tA);
 568     _mm256_maskstore_pd(ptrB,mask,tB);
 569     _mm256_maskstore_pd(ptrC,mask,tC);
 570     _mm256_maskstore_pd(ptrD,mask,tD);
 571 }
 572
 573
 574
 575 #if defined (_MSC_VER) && defined(_M_IX86)
 576 /* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
 577 #define gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(ptrA,ptrB,ptrC,ptrD, \
 578                                                   _x1,_y1,_z1,_x2,_y2,_z2,_x3,_y3,_z3) \
 579 { \
 580     __m256d _t1,_t2,_t3,_t4,_t5,_t6,_t7,_t8,_t9,_t10;\
 581     __m128d _tA,_tB,_tC,_tD,_tE;\
 582     _t1          = _mm256_loadu_pd(ptrA);\
 583     _t2          = _mm256_loadu_pd(ptrB);\
 584     _t3          = _mm256_loadu_pd(ptrC);\
 585     _t4          = _mm256_loadu_pd(ptrD);\
 586     _t5          = _mm256_loadu_pd(ptrA+4);\
 587     _t6          = _mm256_loadu_pd(ptrB+4);\
 588     _t7          = _mm256_loadu_pd(ptrC+4);\
 589     _t8          = _mm256_loadu_pd(ptrD+4);\
 590     _tA          = _mm_load_sd(ptrA+8);\
 591     _tB          = _mm_load_sd(ptrB+8);\
 592     _tC          = _mm_load_sd(ptrC+8);\
 593     _tD          = _mm_load_sd(ptrD+8);\
 594     _t9          = _mm256_unpacklo_pd(_x1,_y1);\
 595     _x1          = _mm256_unpackhi_pd(_x1,_y1);\
 596     _y1          = _mm256_unpacklo_pd(_z1,_x2);\
 597     _z1          = _mm256_unpackhi_pd(_z1,_x2);\
 598     _x2          = _mm256_unpacklo_pd(_y2,_z2);\
 599     _y2          = _mm256_unpackhi_pd(_y2,_z2);\
 600     _z2          = _mm256_unpacklo_pd(_x3,_y3);\
 601     _x3          = _mm256_unpackhi_pd(_x3,_y3);\
 602     _t10         = gmx_mm256_unpack128lo_pd(_t9,_y1);\
 603     _y3          = gmx_mm256_unpack128hi_pd(_t9,_y1);\
 604     _t9          = gmx_mm256_unpack128lo_pd(_x1,_z1);\
 605     _y1          = gmx_mm256_unpack128hi_pd(_x1,_z1);\
 606     _x1          = gmx_mm256_unpack128lo_pd(_x2,_z2);\
 607     _z1          = gmx_mm256_unpack128hi_pd(_x2,_z2);\
 608     _x2          = gmx_mm256_unpack128lo_pd(_y2,_x3);\
 609     _z2          = gmx_mm256_unpack128hi_pd(_y2,_x3);\
 610     _t1          = _mm256_sub_pd(_t1,_t10);\
 611     _t2          = _mm256_sub_pd(_t2,_t9);\
 612     _t3          = _mm256_sub_pd(_t3,_y3);\
 613     _t4          = _mm256_sub_pd(_t4,_y1);\
 614     _t5          = _mm256_sub_pd(_t5,_x1);\
 615     _t6          = _mm256_sub_pd(_t6,_x2);\
 616     _t7          = _mm256_sub_pd(_t7,_z1);\
 617     _t8          = _mm256_sub_pd(_t8,_z2);\
 618     _tA          = _mm_sub_sd(_tA, _mm256_castpd256_pd128(_z3));\
 619     _tB          = _mm_sub_sd(_tB, _mm_permute_pd(_mm256_castpd256_pd128(_z3),_GMX_MM_PERMUTE128D(1,1)));\
 620     _tE          = _mm256_extractf128_pd(_z3,0x1);\
 621     _tC          = _mm_sub_sd(_tC, _tE);\
 622     _tD          = _mm_sub_sd(_tD, _mm_permute_pd(_tE,_GMX_MM_PERMUTE128D(1,1)));\
 623     _mm256_storeu_pd(ptrA,_t1);\
 624     _mm256_storeu_pd(ptrB,_t2);\
 625     _mm256_storeu_pd(ptrC,_t3);\
 626     _mm256_storeu_pd(ptrD,_t4);\
 627     _mm256_storeu_pd(ptrA+4,_t5);\
 628     _mm256_storeu_pd(ptrB+4,_t6);\
 629     _mm256_storeu_pd(ptrC+4,_t7);\
 630     _mm256_storeu_pd(ptrD+4,_t8);\
 631     _mm_store_sd(ptrA+8,_tA);\
 632     _mm_store_sd(ptrB+8,_tB);\
 633     _mm_store_sd(ptrC+8,_tC);\
 634     _mm_store_sd(ptrD+8,_tD);\
 635 }
 636 #else
 637 /* Real function for sane compilers */
 638 static void
 639 gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
 640         double * gmx_restrict ptrC, double * gmx_restrict ptrD,
 641         __m256d x1, __m256d y1, __m256d z1,
 642         __m256d x2, __m256d y2, __m256d z2,
 643         __m256d x3, __m256d y3, __m256d z3)
 644 {
 645     __m256d t1,t2,t3,t4,t5,t6,t7,t8,t9,t10;
 646     __m128d tA,tB,tC,tD,tE;
 647
 648     t1          = _mm256_loadu_pd(ptrA);
 649     t2          = _mm256_loadu_pd(ptrB);
 650     t3          = _mm256_loadu_pd(ptrC);
 651     t4          = _mm256_loadu_pd(ptrD);
 652     t5          = _mm256_loadu_pd(ptrA+4);
 653     t6          = _mm256_loadu_pd(ptrB+4);
 654     t7          = _mm256_loadu_pd(ptrC+4);
 655     t8          = _mm256_loadu_pd(ptrD+4);
 656     tA          = _mm_load_sd(ptrA+8);
 657     tB          = _mm_load_sd(ptrB+8);
 658     tC          = _mm_load_sd(ptrC+8);
 659     tD          = _mm_load_sd(ptrD+8);
 660
 661     t9          = _mm256_unpacklo_pd(x1,y1); /* y1c x1c | y1a x1a */
 662     x1          = _mm256_unpackhi_pd(x1,y1); /* y1d x1d | y1b x1b */
 663
 664     y1          = _mm256_unpacklo_pd(z1,x2); /* x2c z1c | x2a z1a */
 665     z1          = _mm256_unpackhi_pd(z1,x2); /* x2d z1d | x2b z1b */
 666
 667     x2          = _mm256_unpacklo_pd(y2,z2); /* z2c y2c | z2a y2a */
 668     y2          = _mm256_unpackhi_pd(y2,z2); /* z2d y2d | z2b y2b */
 669
 670     z2          = _mm256_unpacklo_pd(x3,y3); /* y3c x3c | y3a x3a */
 671     x3          = _mm256_unpackhi_pd(x3,y3); /* y3d x3d | y3b x3b */
 672
 673     t10         = gmx_mm256_unpack128lo_pd(t9,y1);  /* x2a z1a | y1a x1a */
 674     y3          = gmx_mm256_unpack128hi_pd(t9,y1);  /* x2c z1c | y1c x1c */
 675
 676     t9          = gmx_mm256_unpack128lo_pd(x1,z1);  /* x2b z1b | y1b x1b */
 677     y1          = gmx_mm256_unpack128hi_pd(x1,z1);  /* x2d z1d | y1d x1d */
 678
 679     x1          = gmx_mm256_unpack128lo_pd(x2,z2);  /* y3a x3a | z2a y2a */
 680     z1          = gmx_mm256_unpack128hi_pd(x2,z2);  /* y3c x3c | z2c y2c */
 681
 682     x2          = gmx_mm256_unpack128lo_pd(y2,x3);  /* y3b x3b | z2b y2b */
 683     z2          = gmx_mm256_unpack128hi_pd(y2,x3);  /* y3d x3d | z2d y2d */
 684
 685     t1          = _mm256_sub_pd(t1,t10);
 686     t2          = _mm256_sub_pd(t2,t9);
 687     t3          = _mm256_sub_pd(t3,y3);
 688     t4          = _mm256_sub_pd(t4,y1);
 689     t5          = _mm256_sub_pd(t5,x1);
 690     t6          = _mm256_sub_pd(t6,x2);
 691     t7          = _mm256_sub_pd(t7,z1);
 692     t8          = _mm256_sub_pd(t8,z2);
 693
 694     tA          = _mm_sub_sd(tA, _mm256_castpd256_pd128(z3));
 695     tB          = _mm_sub_sd(tB, _mm_permute_pd(_mm256_castpd256_pd128(z3),_GMX_MM_PERMUTE128D(1,1)));
 696     tE          = _mm256_extractf128_pd(z3,0x1);
 697     tC          = _mm_sub_sd(tC, tE);
 698     tD          = _mm_sub_sd(tD, _mm_permute_pd(tE,_GMX_MM_PERMUTE128D(1,1)));
 699
 700     /* Here we store a full 256-bit value and a separate 64-bit one; no overlap can happen */
 701     _mm256_storeu_pd(ptrA,t1);
 702     _mm256_storeu_pd(ptrB,t2);
 703     _mm256_storeu_pd(ptrC,t3);
 704     _mm256_storeu_pd(ptrD,t4);
 705     _mm256_storeu_pd(ptrA+4,t5);
 706     _mm256_storeu_pd(ptrB+4,t6);
 707     _mm256_storeu_pd(ptrC+4,t7);
 708     _mm256_storeu_pd(ptrD+4,t8);
 709     _mm_store_sd(ptrA+8,tA);
 710     _mm_store_sd(ptrB+8,tB);
 711     _mm_store_sd(ptrC+8,tC);
 712     _mm_store_sd(ptrD+8,tD);
 713 }
 714 #endif
 715
 716 #if defined (_MSC_VER) && defined(_M_IX86)
 717 /* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
 718 #define gmx_mm256_decrement_4rvec_4ptr_swizzle_pd(ptrA,ptrB,ptrC,ptrD, \
 719                                                   _x1,_y1,_z1,_x2,_y2,_z2,_x3,_y3,_z3,_x4,_y4,_z4) \
 720 { \
 721     __m256d _t1,_t2,_t3,_t4,_t5,_t6,_t7,_t8,_t9,_t10,_t11,_t12,_t13,_t14;\
 722     __m128d _tA,_tB,_tC,_tD,_tE;\
 723     _t1          = _mm256_loadu_pd(ptrA);\
 724     _t2          = _mm256_loadu_pd(ptrB);\
 725     _t3          = _mm256_loadu_pd(ptrC);\
 726     _t4          = _mm256_loadu_pd(ptrD);\
 727     _t5          = _mm256_loadu_pd(ptrA+4);\
 728     _t6          = _mm256_loadu_pd(ptrB+4);\
 729     _t7          = _mm256_loadu_pd(ptrC+4);\
 730     _t8          = _mm256_loadu_pd(ptrD+4);\
 731     _t9          = _mm256_loadu_pd(ptrA+8);\
 732     _t10         = _mm256_loadu_pd(ptrB+8);\
 733     _t11         = _mm256_loadu_pd(ptrC+8);\
 734     _t12         = _mm256_loadu_pd(ptrD+8);\
 735     _t13         = _mm256_unpacklo_pd(_x1,_y1);\
 736     _x1          = _mm256_unpackhi_pd(_x1,_y1);\
 737     _y1          = _mm256_unpacklo_pd(_z1,_x2);\
 738     _z1          = _mm256_unpackhi_pd(_z1,_x2);\
 739     _x2          = _mm256_unpacklo_pd(_y2,_z2);\
 740     _y2          = _mm256_unpackhi_pd(_y2,_z2);\
 741     _z2          = _mm256_unpacklo_pd(_x3,_y3);\
 742     _x3          = _mm256_unpackhi_pd(_x3,_y3);\
 743     _y3          = _mm256_unpacklo_pd(_z3,_x4);\
 744     _z3          = _mm256_unpackhi_pd(_z3,_x4);\
 745     _x4          = _mm256_unpacklo_pd(_y4,_z4);\
 746     _y4          = _mm256_unpackhi_pd(_y4,_z4);\
 747     _z4          = gmx_mm256_unpack128lo_pd(_t13,_y1);\
 748     _t13         = gmx_mm256_unpack128hi_pd(_t13,_y1);\
 749     _y1          = gmx_mm256_unpack128lo_pd(_x1,_z1);\
 750     _x1          = gmx_mm256_unpack128hi_pd(_x1,_z1);\
 751     _z1          = gmx_mm256_unpack128lo_pd(_x2,_z2);\
 752     _x2          = gmx_mm256_unpack128hi_pd(_x2,_z2);\
 753     _z2          = gmx_mm256_unpack128lo_pd(_y2,_x3);\
 754     _y2          = gmx_mm256_unpack128hi_pd(_y2,_x3);\
 755     _x3          = gmx_mm256_unpack128lo_pd(_y3,_x4);\
 756     _y3          = gmx_mm256_unpack128hi_pd(_y3,_x4);\
 757     _x4          = gmx_mm256_unpack128lo_pd(_z3,_y4);\
 758     _z3          = gmx_mm256_unpack128hi_pd(_z3,_y4);\
 759     _t1          = _mm256_sub_pd(_t1,_z4);\
 760     _t2          = _mm256_sub_pd(_t2,_y1);\
 761     _t3          = _mm256_sub_pd(_t3,_t13);\
 762     _t4          = _mm256_sub_pd(_t4,_x1);\
 763     _t5          = _mm256_sub_pd(_t5,_z1);\
 764     _t6          = _mm256_sub_pd(_t6,_z2);\
 765     _t7          = _mm256_sub_pd(_t7,_x2);\
 766     _t8          = _mm256_sub_pd(_t8,_y2);\
 767     _t9          = _mm256_sub_pd(_t9,_x3);\
 768     _t10         = _mm256_sub_pd(_t10,_x4);\
 769     _t11         = _mm256_sub_pd(_t11,_y3);\
 770     _t12         = _mm256_sub_pd(_t12,_z3);\
 771     _mm256_storeu_pd(ptrA,_t1);\
 772     _mm256_storeu_pd(ptrB,_t2);\
 773     _mm256_storeu_pd(ptrC,_t3);\
 774     _mm256_storeu_pd(ptrD,_t4);\
 775     _mm256_storeu_pd(ptrA+4,_t5);\
 776     _mm256_storeu_pd(ptrB+4,_t6);\
 777     _mm256_storeu_pd(ptrC+4,_t7);\
 778     _mm256_storeu_pd(ptrD+4,_t8);\
 779     _mm256_storeu_pd(ptrA+8,_t9);\
 780     _mm256_storeu_pd(ptrB+8,_t10);\
 781     _mm256_storeu_pd(ptrC+8,_t11);\
 782     _mm256_storeu_pd(ptrD+8,_t12);\
 783 }
 784 #else
 785 /* Real function for sane compilers */
 786 static void
 787 gmx_mm256_decrement_4rvec_4ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
 788         double * gmx_restrict ptrC, double * gmx_restrict ptrD,
 789         __m256d x1, __m256d y1, __m256d z1,
 790         __m256d x2, __m256d y2, __m256d z2,
 791         __m256d x3, __m256d y3, __m256d z3,
 792         __m256d x4, __m256d y4, __m256d z4)
 793 {
 794     __m256d t1,t2,t3,t4,t5,t6,t7,t8,t9,t10,t11,t12,t13,t14;
 795     __m128d tA,tB,tC,tD,tE;
 796
 797     t1          = _mm256_loadu_pd(ptrA);
 798     t2          = _mm256_loadu_pd(ptrB);
 799     t3          = _mm256_loadu_pd(ptrC);
 800     t4          = _mm256_loadu_pd(ptrD);
 801     t5          = _mm256_loadu_pd(ptrA+4);
 802     t6          = _mm256_loadu_pd(ptrB+4);
 803     t7          = _mm256_loadu_pd(ptrC+4);
 804     t8          = _mm256_loadu_pd(ptrD+4);
 805     t9          = _mm256_loadu_pd(ptrA+8);
 806     t10         = _mm256_loadu_pd(ptrB+8);
 807     t11         = _mm256_loadu_pd(ptrC+8);
 808     t12         = _mm256_loadu_pd(ptrD+8);
 809
 810     t13         = _mm256_unpacklo_pd(x1,y1); /* y1c x1c | y1a x1a */
 811     x1          = _mm256_unpackhi_pd(x1,y1); /* y1d x1d | y1b x1b */
 812     y1          = _mm256_unpacklo_pd(z1,x2); /* x2c z1c | x2a z1a */
 813     z1          = _mm256_unpackhi_pd(z1,x2); /* x2d z1d | x2b z1b */
 814     x2          = _mm256_unpacklo_pd(y2,z2); /* z2c y2c | z2a y2a */
 815     y2          = _mm256_unpackhi_pd(y2,z2); /* z2d y2d | z2b y2b */
 816     z2          = _mm256_unpacklo_pd(x3,y3); /* y3c x3c | y3a x3a */
 817     x3          = _mm256_unpackhi_pd(x3,y3); /* y3d x3d | y3b x3b */
 818     y3          = _mm256_unpacklo_pd(z3,x4); /* x4c z3c | x4a z3a */
 819     z3          = _mm256_unpackhi_pd(z3,x4); /* x4d z3d | x4b z3b */
 820     x4          = _mm256_unpacklo_pd(y4,z4); /* z4c y4c | z4a y4a */
 821     y4          = _mm256_unpackhi_pd(y4,z4); /* z4d y4d | z4b y4b */
 822
 823     z4          = gmx_mm256_unpack128lo_pd(t13,y1); /* x2a z1a | y1a x1a */
 824     t13         = gmx_mm256_unpack128hi_pd(t13,y1); /* x2c z1c | y1c x1c */
 825     y1          = gmx_mm256_unpack128lo_pd(x1,z1);  /* x2b z1b | y1b x1b */
 826     x1          = gmx_mm256_unpack128hi_pd(x1,z1);  /* x2d z1d | y1d x1d */
 827     z1          = gmx_mm256_unpack128lo_pd(x2,z2);  /* y3a x3a | z2a y2a */
 828     x2          = gmx_mm256_unpack128hi_pd(x2,z2);  /* y3c x3c | z2c y2c */
 829     z2          = gmx_mm256_unpack128lo_pd(y2,x3);  /* y3b x3b | z2b y2b */
 830     y2          = gmx_mm256_unpack128hi_pd(y2,x3);  /* y3d x3d | z2d y2d */
 831     x3          = gmx_mm256_unpack128lo_pd(y3,x4);  /* z4a y4a | x4a z3a */
 832     y3          = gmx_mm256_unpack128hi_pd(y3,x4);  /* z4c y4c | x4c z3c */
 833     x4          = gmx_mm256_unpack128lo_pd(z3,y4);  /* z4b y4b | x4b z3b */
 834     z3          = gmx_mm256_unpack128hi_pd(z3,y4);  /* z4d y4d | x4d z3d */
 835
 836     t1          = _mm256_sub_pd(t1,z4);
 837     t2          = _mm256_sub_pd(t2,y1);
 838     t3          = _mm256_sub_pd(t3,t13);
 839     t4          = _mm256_sub_pd(t4,x1);
 840     t5          = _mm256_sub_pd(t5,z1);
 841     t6          = _mm256_sub_pd(t6,z2);
 842     t7          = _mm256_sub_pd(t7,x2);
 843     t8          = _mm256_sub_pd(t8,y2);
 844     t9          = _mm256_sub_pd(t9,x3);
 845     t10         = _mm256_sub_pd(t10,x4);
 846     t11         = _mm256_sub_pd(t11,y3);
 847     t12         = _mm256_sub_pd(t12,z3);
 848
 849     /* Here we store a full 256-bit value and a separate 128-bit one; no overlap can happen */
 850     _mm256_storeu_pd(ptrA,t1);
 851     _mm256_storeu_pd(ptrB,t2);
 852     _mm256_storeu_pd(ptrC,t3);
 853     _mm256_storeu_pd(ptrD,t4);
 854     _mm256_storeu_pd(ptrA+4,t5);
 855     _mm256_storeu_pd(ptrB+4,t6);
 856     _mm256_storeu_pd(ptrC+4,t7);
 857     _mm256_storeu_pd(ptrD+4,t8);
 858     _mm256_storeu_pd(ptrA+8,t9);
 859     _mm256_storeu_pd(ptrB+8,t10);
 860     _mm256_storeu_pd(ptrC+8,t11);
 861     _mm256_storeu_pd(ptrD+8,t12);
 862 }
 863 #endif
 864
 865
 866
 867
 868
 869 static gmx_inline void
 870 gmx_mm256_update_iforce_1atom_swizzle_pd(__m256d fix1, __m256d fiy1, __m256d fiz1,
 871         double * gmx_restrict fptr,
 872         double * gmx_restrict fshiftptr)
 873 {
 874     __m256d t1,t2;
 875     __m128d tA,tB;
 876     fix1 = _mm256_hadd_pd(fix1,fiy1);
 877     fiz1 = _mm256_hadd_pd(fiz1,_mm256_setzero_pd());
 878
 879     /* Add across the two lanes */
 880     tA   = _mm_add_pd(_mm256_castpd256_pd128(fix1),_mm256_extractf128_pd(fix1,0x1));
 881     tB   = _mm_add_pd(_mm256_castpd256_pd128(fiz1),_mm256_extractf128_pd(fiz1,0x1));
 882
 883     fix1 = gmx_mm256_set_m128d(tB,tA); /* 0 fiz fiy fix */
 884
 885     t1   = _mm256_loadu_pd(fptr);
 886     t2   = _mm256_loadu_pd(fshiftptr);
 887
 888     t1   = _mm256_add_pd(t1,fix1);
 889     t2   = _mm256_add_pd(t2,fix1);
 890
 891     _mm256_storeu_pd(fptr,t1);
 892     _mm256_storeu_pd(fshiftptr,t2);
 893 }
 894
 895
 896
 897 #if defined (_MSC_VER) && defined(_M_IX86)
 898 /* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
 899 #define gmx_mm256_update_iforce_3atom_swizzle_pd(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3, \
 900                                               fptr,fshiftptr) \
 901 { \
 902     __m256d _t1,_t2,_t3,_t4;\
 903     __m128d _tz3,_tA,_tB,_tC,_tD;\
 904     fix1 = _mm256_hadd_pd(fix1,fiy1);\
 905     fiz1 = _mm256_hadd_pd(fiz1,fix2);\
 906     fiy2 = _mm256_hadd_pd(fiy2,fiz2);\
 907     fix3 = _mm256_hadd_pd(fix3,fiy3);\
 908     fiz3 = _mm256_hadd_pd(fiz3,_mm256_setzero_pd());\
 909     _t1   = gmx_mm256_unpack128lo_pd(fix1,fiz1);\
 910     _t2   = gmx_mm256_unpack128hi_pd(fix1,fiz1);\
 911     _t1   = _mm256_add_pd(_t1,_t2);\
 912     _t3   = gmx_mm256_unpack128lo_pd(fiy2,fix3);\
 913     _t4   = gmx_mm256_unpack128hi_pd(fiy2,fix3);\
 914     _t3   = _mm256_add_pd(_t3,_t4);\
 915     _tz3  = _mm_add_pd(_mm256_castpd256_pd128(fiz3),_mm256_extractf128_pd(fiz3,0x1));\
 916     _t2   = _mm256_loadu_pd(fptr);\
 917     _t4   = _mm256_loadu_pd(fptr+4);\
 918     _tA   = _mm_load_sd(fptr+8);\
 919     _t2   = _mm256_add_pd(_t2,_t1);\
 920     _t4   = _mm256_add_pd(_t4,_t3);\
 921     _tA   = _mm_add_sd(_tA,_tz3);\
 922     _mm256_storeu_pd(fptr,_t2);\
 923     _mm256_storeu_pd(fptr+4,_t4);\
 924     _mm_store_sd(fptr+8,_tA);\
 925     _tB   = _mm256_extractf128_pd(_t1,0x1);\
 926     _tC   = _mm256_extractf128_pd(_t3,0x1);\
 927     _tz3  = _mm_add_sd(_tz3,_tB);\
 928     _tD   = _mm_permute_pd(_mm256_castpd256_pd128(_t3),_GMX_MM_PERMUTE128D(1,1));\
 929     _tz3  = _mm_add_sd(_tz3,_tD);\
 930     _tC   = _mm_add_pd(_tC,_mm256_castpd256_pd128(_t1));\
 931     _tD   = _mm_shuffle_pd(_tB,_mm256_castpd256_pd128(_t3),_MM_SHUFFLE2(0,1));\
 932     _tC   = _mm_add_pd(_tC,_tD);\
 933     _tA   = _mm_loadu_pd(fshiftptr);\
 934     _tB   = _mm_load_sd(fshiftptr+2);\
 935     _tA   = _mm_add_pd(_tA,_tC);\
 936     _tB   = _mm_add_sd(_tB,_tz3);\
 937     _mm_storeu_pd(fshiftptr,_tA);\
 938     _mm_store_sd(fshiftptr+2,_tB);\
 939 }
 940 #else
 941 /* Real function for sane compilers */
 942 static gmx_inline void
 943 gmx_mm256_update_iforce_3atom_swizzle_pd(__m256d fix1, __m256d fiy1, __m256d fiz1,
 944         __m256d fix2, __m256d fiy2, __m256d fiz2,
 945         __m256d fix3, __m256d fiy3, __m256d fiz3,
 946         double * gmx_restrict fptr,
 947         double * gmx_restrict fshiftptr)
 948 {
 949     __m256d t1,t2,t3,t4;
 950     __m128d tz3,tA,tB,tC,tD;
 951
 952     fix1 = _mm256_hadd_pd(fix1,fiy1);                /*  Y1c-d X1c-d | Y1a-b X1a-b */
 953     fiz1 = _mm256_hadd_pd(fiz1,fix2);                /*  X2c-d Z1c-d | X2a-b Z1a-b */
 954     fiy2 = _mm256_hadd_pd(fiy2,fiz2);                /*  Z2c-d Y2c-d | Z2a-b Y2a-b */
 955     fix3 = _mm256_hadd_pd(fix3,fiy3);                /*  Y3c-d X3c-d | Y3a-b X3a-b */
 956     fiz3 = _mm256_hadd_pd(fiz3,_mm256_setzero_pd()); /*  0     Z3c-d | 0     Z3a-b */
 957
 958     /* Add across the two lanes by swapping and adding back */
 959     t1   = gmx_mm256_unpack128lo_pd(fix1,fiz1); /* X2a-b Z1a-b | Y1a-b X1a-b */
 960     t2   = gmx_mm256_unpack128hi_pd(fix1,fiz1); /* X2c-d Z1c-d | Y1c-d X1c-d */
 961     t1   = _mm256_add_pd(t1,t2); /* x2 z1 | y1 x1 */
 962
 963     t3   = gmx_mm256_unpack128lo_pd(fiy2,fix3); /* Y3a-b X3a-b | Z2a-b Y2a-b */
 964     t4   = gmx_mm256_unpack128hi_pd(fiy2,fix3); /* Y3c-d X3c-d | Z2c-d Y2c-d */
 965     t3   = _mm256_add_pd(t3,t4); /* y3 x3 | z2 y2 */
 966
 967     tz3  = _mm_add_pd(_mm256_castpd256_pd128(fiz3),_mm256_extractf128_pd(fiz3,0x1));  /* 0 z3 */
 968
 969     t2   = _mm256_loadu_pd(fptr);
 970     t4   = _mm256_loadu_pd(fptr+4);
 971     tA   = _mm_load_sd(fptr+8);
 972
 973     t2   = _mm256_add_pd(t2,t1);
 974     t4   = _mm256_add_pd(t4,t3);
 975     tA   = _mm_add_sd(tA,tz3);
 976
 977     _mm256_storeu_pd(fptr,t2);
 978     _mm256_storeu_pd(fptr+4,t4);
 979     _mm_store_sd(fptr+8,tA);
 980
 981     /* Add up shift force */
 982     /* t1:   x2 z1 | y1 x1 */
 983     /* t3:   y3 x3 | z2 y2 */
 984     /* tz3:           0 z3 */
 985
 986     /* z component */
 987     tB   = _mm256_extractf128_pd(t1,0x1);  /* x2 z1 */
 988     tC   = _mm256_extractf128_pd(t3,0x1);  /* y3 x3 */
 989     tz3  = _mm_add_sd(tz3,tB);             /* 0  z1+z3 */
 990     tD   = _mm_permute_pd(_mm256_castpd256_pd128(t3),_GMX_MM_PERMUTE128D(1,1));
 991     tz3  = _mm_add_sd(tz3,tD); /* - z */
 992
 993     tC   = _mm_add_pd(tC,_mm256_castpd256_pd128(t1));  /* y1+y3 x1+x3 */
 994
 995     tD   = _mm_shuffle_pd(tB,_mm256_castpd256_pd128(t3),_MM_SHUFFLE2(0,1)); /* y2 x2 */
 996     tC   = _mm_add_pd(tC,tD); /* y x */
 997
 998     tA   = _mm_loadu_pd(fshiftptr);
 999     tB   = _mm_load_sd(fshiftptr+2);
1000     tA   = _mm_add_pd(tA,tC);
1001     tB   = _mm_add_sd(tB,tz3);
1002     _mm_storeu_pd(fshiftptr,tA);
1003     _mm_store_sd(fshiftptr+2,tB);
1004 }
1005 #endif
1006
1007
1008 #if defined (_MSC_VER) && defined(_M_IX86)
1009 /* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
1010 #define gmx_mm256_update_iforce_4atom_swizzle_pd(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,fix4,fiy4,fiz4, \
1011                                               fptr,fshiftptr) \
1012 {\
1013     __m256d _t1,_t2,_t3,_t4,_t5,_t6;\
1014     __m128d _tA,_tB,_tC,_tD;\
1015     fix1 = _mm256_hadd_pd(fix1,fiy1);\
1016     fiz1 = _mm256_hadd_pd(fiz1,fix2);\
1017     fiy2 = _mm256_hadd_pd(fiy2,fiz2);\
1018     fix3 = _mm256_hadd_pd(fix3,fiy3);\
1019     fiz3 = _mm256_hadd_pd(fiz3,fix4);\
1020     fiy4 = _mm256_hadd_pd(fiy4,fiz4);\
1021     _t1   = gmx_mm256_unpack128lo_pd(fix1,fiz1);\
1022     _t2   = gmx_mm256_unpack128hi_pd(fix1,fiz1);\
1023     _t1   = _mm256_add_pd(_t1,_t2);\
1024     _t3   = gmx_mm256_unpack128lo_pd(fiy2,fix3);\
1025     _t4   = gmx_mm256_unpack128hi_pd(fiy2,fix3);\
1026     _t3   = _mm256_add_pd(_t3,_t4);\
1027     _t5   = gmx_mm256_unpack128lo_pd(fiz3,fiy4);\
1028     _t6   = gmx_mm256_unpack128hi_pd(fiz3,fiy4);\
1029     _t5   = _mm256_add_pd(_t5,_t6);\
1030     _t2   = _mm256_loadu_pd(fptr);\
1031     _t4   = _mm256_loadu_pd(fptr+4);\
1032     _t6   = _mm256_loadu_pd(fptr+8);\
1033     _t2   = _mm256_add_pd(_t2,_t1);\
1034     _t4   = _mm256_add_pd(_t4,_t3);\
1035     _t6   = _mm256_add_pd(_t6,_t5);\
1036     _mm256_storeu_pd(fptr,_t2);\
1037     _mm256_storeu_pd(fptr+4,_t4);\
1038     _mm256_storeu_pd(fptr+8,_t6);\
1039     _tA   = _mm256_extractf128_pd(_t1,0x1);\
1040     _tB   = _mm256_extractf128_pd(_t3,0x1);\
1041     _tC   = _mm256_extractf128_pd(_t5,0x1);\
1042     _tB   = _mm_add_pd(_tB,_mm256_castpd256_pd128(_t1));\
1043     _tA   = _mm_add_pd(_tA,_mm256_castpd256_pd128(_t5));\
1044     _tC   = _mm_add_pd(_tC,_mm256_castpd256_pd128(_t3));\
1045     _tD   = _mm_shuffle_pd(_tA,_tC,_MM_SHUFFLE2(0,1));\
1046     _tB   = _mm_add_pd(_tB,_tD);\
1047     _tC   = _mm_permute_pd(_tC,_GMX_MM_PERMUTE128D(1,1));\
1048     _tC   = _mm_add_sd(_tC,_tA);\
1049     _tA   = _mm_loadu_pd(fshiftptr);\
1050     _tD   = _mm_load_sd(fshiftptr+2);\
1051     _tA   = _mm_add_pd(_tA,_tB);\
1052     _tD   = _mm_add_sd(_tD,_tC);\
1053     _mm_storeu_pd(fshiftptr,_tA);\
1054     _mm_store_sd(fshiftptr+2,_tD);\
1055 }
1056 #else
1057 /* Real function for sane compilers */
1058 static gmx_inline void
1059 gmx_mm256_update_iforce_4atom_swizzle_pd(__m256d fix1, __m256d fiy1, __m256d fiz1,
1060         __m256d fix2, __m256d fiy2, __m256d fiz2,
1061         __m256d fix3, __m256d fiy3, __m256d fiz3,
1062         __m256d fix4, __m256d fiy4, __m256d fiz4,
1063         double * gmx_restrict fptr,
1064         double * gmx_restrict fshiftptr)
1065 {
1066     __m256d t1,t2,t3,t4,t5,t6;
1067     __m128d tA,tB,tC,tD;
1068
1069     fix1 = _mm256_hadd_pd(fix1,fiy1);                /*  Y1c-d X1c-d | Y1a-b X1a-b */
1070     fiz1 = _mm256_hadd_pd(fiz1,fix2);                /*  X2c-d Z1c-d | X2a-b Z1a-b */
1071     fiy2 = _mm256_hadd_pd(fiy2,fiz2);                /*  Z2c-d Y2c-d | Z2a-b Y2a-b */
1072     fix3 = _mm256_hadd_pd(fix3,fiy3);                /*  Y3c-d X3c-d | Y3a-b X3a-b */
1073     fiz3 = _mm256_hadd_pd(fiz3,fix4);                /*  X4c-d Z3c-d | X4a-b Z3a-b */
1074     fiy4 = _mm256_hadd_pd(fiy4,fiz4);                /*  Z4c-d Y4c-d | Z4a-b Y4a-b */
1075
1076     /* Add across the two lanes by swapping and adding back */
1077     t1   = gmx_mm256_unpack128lo_pd(fix1,fiz1); /* X2a-b Z1a-b | Y1a-b X1a-b */
1078     t2   = gmx_mm256_unpack128hi_pd(fix1,fiz1); /* X2c-d Z1c-d | Y1c-d X1c-d */
1079     t1   = _mm256_add_pd(t1,t2); /* x2 z1 | y1 x1 */
1080
1081     t3   = gmx_mm256_unpack128lo_pd(fiy2,fix3); /* Y3a-b X3a-b | Z2a-b Y2a-b */
1082     t4   = gmx_mm256_unpack128hi_pd(fiy2,fix3); /* Y3c-d X3c-d | Z2c-d Y2c-d */
1083     t3   = _mm256_add_pd(t3,t4); /* y3 x3 | z2 y2 */
1084
1085     t5   = gmx_mm256_unpack128lo_pd(fiz3,fiy4); /* Z4a-b Y4a-b | X4a-b Z3a-b */
1086     t6   = gmx_mm256_unpack128hi_pd(fiz3,fiy4); /* Z4c-d Y4c-d | X4c-d Z3c-d */
1087     t5   = _mm256_add_pd(t5,t6); /* z4 y4 | x4 z3 */
1088
1089     t2   = _mm256_loadu_pd(fptr);
1090     t4   = _mm256_loadu_pd(fptr+4);
1091     t6   = _mm256_loadu_pd(fptr+8);
1092
1093     t2   = _mm256_add_pd(t2,t1);
1094     t4   = _mm256_add_pd(t4,t3);
1095     t6   = _mm256_add_pd(t6,t5);
1096
1097     _mm256_storeu_pd(fptr,t2);
1098     _mm256_storeu_pd(fptr+4,t4);
1099     _mm256_storeu_pd(fptr+8,t6);
1100
1101     /* Add up shift force  */
1102     /* t1:   x2. z1. | y1. x1. */
1103     /* t3:   y3. x3. | z2 y2 */
1104     /* t5:   z4 y4 | x4. z3. */
1105
1106     /* z component */
1107     tA   = _mm256_extractf128_pd(t1,0x1);  /* x2 z1 */
1108     tB   = _mm256_extractf128_pd(t3,0x1);  /* y3 x3 */
1109     tC   = _mm256_extractf128_pd(t5,0x1);  /* z4 y4 */
1110
1111     tB   = _mm_add_pd(tB,_mm256_castpd256_pd128(t1));  /*  y1+y3  x1+x3 */
1112     tA   = _mm_add_pd(tA,_mm256_castpd256_pd128(t5));  /*  x2+x4  z1+z3 */
1113     tC   = _mm_add_pd(tC,_mm256_castpd256_pd128(t3));  /*  z4+z2  y4+y2 */
1114
1115     tD   = _mm_shuffle_pd(tA,tC,_MM_SHUFFLE2(0,1)); /* y4+y2 x2+x4 */
1116     tB   = _mm_add_pd(tB,tD); /* y x */
1117     tC   = _mm_permute_pd(tC,_GMX_MM_PERMUTE128D(1,1)); /*    - z4+z2 */
1118     tC   = _mm_add_sd(tC,tA); /* - z */
1119
1120     tA   = _mm_loadu_pd(fshiftptr);
1121     tD   = _mm_load_sd(fshiftptr+2);
1122     tA   = _mm_add_pd(tA,tB);
1123     tD   = _mm_add_sd(tD,tC);
1124     _mm_storeu_pd(fshiftptr,tA);
1125     _mm_store_sd(fshiftptr+2,tD);
1126 }
1127 #endif
1128
1129
1130
1131 static void
1132 gmx_mm256_update_1pot_pd(__m256d pot1, double * gmx_restrict ptrA)
1133 {
1134     __m128d t1;
1135
1136     pot1 = _mm256_hadd_pd(pot1,pot1);
1137
1138     t1   = _mm_add_pd(_mm256_castpd256_pd128(pot1),_mm256_extractf128_pd(pot1,0x1));
1139
1140     _mm_store_sd(ptrA,_mm_add_sd(_mm_load_sd(ptrA),t1));
1141 }
1142
1143 static void
1144 gmx_mm256_update_2pot_pd(__m256d pot1, double * gmx_restrict ptrA,
1145                          __m256d pot2, double * gmx_restrict ptrB)
1146 {
1147     __m128d t1,t2;
1148
1149     pot1 = _mm256_hadd_pd(pot1,pot2);
1150
1151     t1   = _mm_add_pd(_mm256_castpd256_pd128(pot1),_mm256_extractf128_pd(pot1,0x1));
1152
1153     t2   = _mm_permute_pd(t1,_GMX_MM_PERMUTE128D(1,1));
1154     _mm_store_sd(ptrA,_mm_add_sd(_mm_load_sd(ptrA),t1));
1155     _mm_store_sd(ptrB,_mm_add_sd(_mm_load_sd(ptrB),t2));
1156 }
1157
1158
1159 #endif /* _kernelutil_x86_avx_256_double_h_ */