src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_double/kernelutil_x86_avx_256_double.h

   1 /*
   2  *                This source code is part of
   3  *
   4  *                 G   R   O   M   A   C   S
   5  *
   6  * Copyright (c) 2011-2012, The GROMACS Development Team
   7  *
   8  * Gromacs is a library for molecular simulation and trajectory analysis,
   9  * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
  10  * a full list of developers and information, check out http://www.gromacs.org
  11  *
  12  * This program is free software; you can redistribute it and/or modify it under
  13  * the terms of the GNU Lesser General Public License as published by the Free
  14  * Software Foundation; either version 2 of the License, or (at your option) any
  15  * later version.
  16  * As a special exception, you may use this file as part of a free software
  17  * library without restriction.  Specifically, if other files instantiate
  18  * templates or use macros or inline functions from this file, or you compile
  19  * this file and link it with other files to produce an executable, this
  20  * file does not by itself cause the resulting executable to be covered by
  21  * the GNU Lesser General Public License.
  22  *
  23  * In plain-speak: do not worry about classes/macros/templates either - only
  24  * changes to the library have to be LGPL, not an application linking with it.
  25  *
  26  * To help fund GROMACS development, we humbly ask that you cite
  27  * the papers people have written on it - you can find them on the website!
  28  */
  29 #ifndef _kernelutil_x86_avx_256_double_h_
  30 #define _kernelutil_x86_avx_256_double_h_
  31
  32
  33 #include "gmx_x86_avx_256.h"
  34
  35
  36 static int
  37 gmx_mm256_any_lt(__m256d a, __m256d b)
  38 {
  39     return _mm256_movemask_pd(_mm256_cmp_pd(a,b,_CMP_LT_OQ));
  40 }
  41
  42 static gmx_inline __m256d
  43 gmx_mm256_calc_rsq_pd(__m256d dx, __m256d dy, __m256d dz)
  44 {
  45     return _mm256_add_pd( _mm256_add_pd( _mm256_mul_pd(dx,dx), _mm256_mul_pd(dy,dy) ), _mm256_mul_pd(dz,dz) );
  46 }
  47
  48 /* Normal sum of four ymm registers */
  49 #define gmx_mm256_sum4_pd(t0,t1,t2,t3)  _mm256_add_pd(_mm256_add_pd(t0,t1),_mm256_add_pd(t2,t3))
  50
  51
  52 /* Load a single value from 1-4 places, merge into xmm register */
  53 static __m256d
  54 gmx_mm256_load_1real_pd(const double * gmx_restrict ptrA)
  55 {
  56     return _mm256_castpd128_pd256(_mm_load_sd(ptrA));
  57 }
  58
  59 static __m256d
  60 gmx_mm256_load_2real_swizzle_pd(const double * gmx_restrict ptrA,
  61                                 const double * gmx_restrict ptrB)
  62 {
  63     __m128d tA,tB;
  64
  65     tA = _mm_load_sd(ptrA);
  66     tB = _mm_load_sd(ptrB);
  67
  68     return _mm256_castpd128_pd256(_mm_unpacklo_pd(tA,tB));
  69 }
  70
  71
  72 static __m256d
  73 gmx_mm256_load_4real_swizzle_pd(const double * gmx_restrict ptrA, const double * gmx_restrict ptrB,
  74                                 const double * gmx_restrict ptrC, const double * gmx_restrict ptrD)
  75 {
  76     __m128d t1,t2;
  77
  78     t1 = _mm_unpacklo_pd(_mm_load_sd(ptrA),_mm_load_sd(ptrB));
  79     t2 = _mm_unpacklo_pd(_mm_load_sd(ptrC),_mm_load_sd(ptrD));
  80     return gmx_mm256_set_m128(t2,t1);
  81 }
  82
  83
  84
  85 static void
  86 gmx_mm256_store_1real_pd(double * gmx_restrict ptrA, __m256d xmm1)
  87 {
  88     _mm_store_sd(ptrA,_mm256_castpd256_pd128(xmm1));
  89 }
  90
  91
  92 static void
  93 gmx_mm256_store_2real_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB, __m256d xmm1)
  94 {
  95     __m256d t2;
  96
  97     t2       = _mm256_permute_pd(xmm1,_GMX_MM_PERMUTE256D(1,1,1,1));
  98     _mm_store_sd(ptrA,_mm256_castpd256_pd128(xmm1));
  99     _mm_store_sd(ptrB,_mm256_castpd256_pd128(t2));
 100 }
 101
 102
 103
 104
 105 static void
 106 gmx_mm256_store_4real_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
 107                                  double * gmx_restrict ptrC, double * gmx_restrict ptrD, __m256d xmm1)
 108 {
 109     __m256d t2;
 110     __m128d t3,t4;
 111
 112     t2       = _mm256_permute_pd(xmm1,_GMX_MM_PERMUTE256D(1,1,1,1));
 113     t3       = _mm256_extractf128_pd(xmm1,0x1);
 114     t4       = _mm_permute_pd(t3,_GMX_MM_PERMUTE128D(1,1));
 115     _mm_store_sd(ptrA,_mm256_castpd256_pd128(xmm1));
 116     _mm_store_sd(ptrB,_mm256_castpd256_pd128(t2));
 117     _mm_store_sd(ptrC,t3);
 118     _mm_store_sd(ptrD,t4);
 119 }
 120
 121
 122
 123
 124 static void
 125 gmx_mm256_increment_1real_pd(double * gmx_restrict ptrA, __m256d xmm1)
 126 {
 127     __m128d t1;
 128
 129     t1   = _mm256_castpd256_pd128(xmm1);
 130     t1   = _mm_add_sd(t1,_mm_load_sd(ptrA));
 131
 132     _mm_store_sd(ptrA,t1);
 133 }
 134
 135
 136 static void
 137 gmx_mm256_increment_2real_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB, __m256d xmm1)
 138 {
 139     __m128d t1,t2;
 140
 141     t1   = _mm256_castpd256_pd128(xmm1);
 142     t2   = _mm_permute_pd(t1,_GMX_MM_PERMUTE128D(1,1));
 143
 144     t1   = _mm_add_sd(t1,_mm_load_sd(ptrA));
 145     t2   = _mm_add_sd(t2,_mm_load_sd(ptrB));
 146
 147     _mm_store_sd(ptrA,t1);
 148     _mm_store_sd(ptrB,t2);
 149 }
 150
 151
 152 static void
 153 gmx_mm256_increment_4real_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
 154                                      double * gmx_restrict ptrC, double * gmx_restrict ptrD, __m256d xmm1)
 155 {
 156     __m128d t1,t2,t3,t4;
 157
 158     t1   = _mm256_castpd256_pd128(xmm1);
 159     t2   = _mm_permute_pd(t1,_GMX_MM_PERMUTE128D(1,1));
 160     t3   = _mm256_extractf128_pd(xmm1,0x1);
 161     t4   = _mm_permute_pd(t3,_GMX_MM_PERMUTE128D(1,1));
 162
 163     t1   = _mm_add_sd(t1,_mm_load_sd(ptrA));
 164     t2   = _mm_add_sd(t2,_mm_load_sd(ptrB));
 165     t3   = _mm_add_sd(t3,_mm_load_sd(ptrC));
 166     t4   = _mm_add_sd(t4,_mm_load_sd(ptrD));
 167
 168     _mm_store_sd(ptrA,t1);
 169     _mm_store_sd(ptrB,t2);
 170     _mm_store_sd(ptrC,t3);
 171     _mm_store_sd(ptrD,t4);
 172 }
 173
 174
 175
 176 static void
 177 gmx_mm256_load_1pair_swizzle_pd(const double * gmx_restrict p1, __m256d *c6, __m256d *c12)
 178 {
 179     *c6     = _mm256_castpd128_pd256(_mm_load_sd(p1));
 180     *c12    = _mm256_castpd128_pd256(_mm_load_sd(p1+1));
 181 }
 182
 183
 184 static void
 185 gmx_mm256_load_2pair_swizzle_pd(const double * gmx_restrict p1, const double * gmx_restrict p2, __m256d *c6, __m256d *c12)
 186 {
 187     __m128d t1,t2,t3;
 188
 189     t1   = _mm_loadu_pd(p1);
 190     t2   = _mm_loadu_pd(p2);
 191     *c6  = _mm256_castpd128_pd256(_mm_unpacklo_pd(t1,t2));
 192     *c12 = _mm256_castpd128_pd256(_mm_unpackhi_pd(t1,t2));
 193 }
 194
 195
 196
 197 static void
 198 gmx_mm256_load_4pair_swizzle_pd(const double * gmx_restrict p1, const double * gmx_restrict p2,
 199                                 const double * gmx_restrict p3, const double * gmx_restrict p4,
 200                                 __m256d * gmx_restrict c6, __m256d * gmx_restrict c12)
 201 {
 202     __m256d t1,t2;
 203
 204     t1   = gmx_mm256_set_m128(_mm_loadu_pd(p3),_mm_loadu_pd(p1)); /* c12c  c6c | c12a  c6a */
 205     t2   = gmx_mm256_set_m128(_mm_loadu_pd(p4),_mm_loadu_pd(p2)); /* c12d  c6d | c12b  c6b */
 206
 207     *c6  = _mm256_unpacklo_pd(t1,t2); /* c6d c6c | c6b c6a */
 208     *c12 = _mm256_unpackhi_pd(t1,t2); /* c12d c12c | c12b c12a */
 209 }
 210
 211
 212 static gmx_inline void
 213 gmx_mm256_load_shift_and_1rvec_broadcast_pd(const double * gmx_restrict xyz_shift,
 214                                             const double * gmx_restrict xyz,
 215                                             __m256d * gmx_restrict x1,
 216                                             __m256d * gmx_restrict y1,
 217                                             __m256d * gmx_restrict z1)
 218 {
 219     __m128d mem_xy,mem_z,mem_sxy,mem_sz,tx,ty,tz;
 220
 221     mem_xy  = _mm_loadu_pd(xyz);
 222     mem_z   = _mm_load_sd(xyz+2);
 223     mem_sxy = _mm_loadu_pd(xyz_shift);
 224     mem_sz  = _mm_load_sd(xyz_shift+2);
 225
 226     mem_xy  = _mm_add_pd(mem_xy,mem_sxy);
 227     mem_z   = _mm_add_pd(mem_z,mem_sz);
 228
 229     tx  = _mm_shuffle_pd(mem_xy,mem_xy,_MM_SHUFFLE2(0,0));
 230     ty  = _mm_shuffle_pd(mem_xy,mem_xy,_MM_SHUFFLE2(1,1));
 231     tz  = _mm_shuffle_pd(mem_z,mem_z,_MM_SHUFFLE2(0,0));
 232
 233     *x1 = gmx_mm256_set_m128(tx,tx);
 234     *y1 = gmx_mm256_set_m128(ty,ty);
 235     *z1 = gmx_mm256_set_m128(tz,tz);
 236 }
 237
 238
 239 static gmx_inline void
 240 gmx_mm256_load_shift_and_3rvec_broadcast_pd(const double * gmx_restrict xyz_shift,
 241                                             const double * gmx_restrict xyz,
 242                                             __m256d * gmx_restrict x1, __m256d * gmx_restrict y1, __m256d * gmx_restrict z1,
 243                                             __m256d * gmx_restrict x2, __m256d * gmx_restrict y2, __m256d * gmx_restrict z2,
 244                                             __m256d * gmx_restrict x3, __m256d * gmx_restrict y3, __m256d * gmx_restrict z3)
 245 {
 246     __m128d t1,t2,t3,t4,t5,sxy,sz,szx,syz,tx,ty,tz;
 247
 248     t1  = _mm_loadu_pd(xyz);
 249     t2  = _mm_loadu_pd(xyz+2);
 250     t3  = _mm_loadu_pd(xyz+4);
 251     t4  = _mm_loadu_pd(xyz+6);
 252     t5  = _mm_load_sd(xyz+8);
 253
 254     sxy = _mm_loadu_pd(xyz_shift);
 255     sz  = _mm_load_sd(xyz_shift+2);
 256     szx = _mm_shuffle_pd(sz,sxy,_MM_SHUFFLE2(0,0));
 257     syz = _mm_shuffle_pd(sxy,sz,_MM_SHUFFLE2(0,1));
 258
 259     t1  = _mm_add_pd(t1,sxy);
 260     t2  = _mm_add_pd(t2,szx);
 261     t3  = _mm_add_pd(t3,syz);
 262     t4  = _mm_add_pd(t4,sxy);
 263     t5  = _mm_add_sd(t5,sz);
 264
 265     tx   = _mm_shuffle_pd(t1,t1,_MM_SHUFFLE2(0,0));
 266     ty   = _mm_shuffle_pd(t1,t1,_MM_SHUFFLE2(1,1));
 267     tz   = _mm_shuffle_pd(t2,t2,_MM_SHUFFLE2(0,0));
 268     *x1 = gmx_mm256_set_m128(tx,tx);
 269     *y1 = gmx_mm256_set_m128(ty,ty);
 270     *z1 = gmx_mm256_set_m128(tz,tz);
 271     tx   = _mm_shuffle_pd(t2,t2,_MM_SHUFFLE2(1,1));
 272     ty   = _mm_shuffle_pd(t3,t3,_MM_SHUFFLE2(0,0));
 273     tz   = _mm_shuffle_pd(t3,t3,_MM_SHUFFLE2(1,1));
 274     *x2 = gmx_mm256_set_m128(tx,tx);
 275     *y2 = gmx_mm256_set_m128(ty,ty);
 276     *z2 = gmx_mm256_set_m128(tz,tz);
 277     tx   = _mm_shuffle_pd(t4,t4,_MM_SHUFFLE2(0,0));
 278     ty   = _mm_shuffle_pd(t4,t4,_MM_SHUFFLE2(1,1));
 279     tz   = _mm_shuffle_pd(t5,t5,_MM_SHUFFLE2(0,0));
 280     *x3 = gmx_mm256_set_m128(tx,tx);
 281     *y3 = gmx_mm256_set_m128(ty,ty);
 282     *z3 = gmx_mm256_set_m128(tz,tz);
 283 }
 284
 285
 286 static gmx_inline void
 287 gmx_mm256_load_shift_and_4rvec_broadcast_pd(const double * gmx_restrict xyz_shift,
 288                                             const double * gmx_restrict xyz,
 289                                             __m256d * gmx_restrict x1, __m256d * gmx_restrict y1, __m256d * gmx_restrict z1,
 290                                             __m256d * gmx_restrict x2, __m256d * gmx_restrict y2, __m256d * gmx_restrict z2,
 291                                             __m256d * gmx_restrict x3, __m256d * gmx_restrict y3, __m256d * gmx_restrict z3,
 292                                             __m256d * gmx_restrict x4, __m256d * gmx_restrict y4, __m256d * gmx_restrict z4)
 293 {
 294     __m128d t1,t2,t3,t4,t5,t6,sxy,sz,szx,syz,tx,ty,tz;
 295
 296     t1  = _mm_loadu_pd(xyz);
 297     t2  = _mm_loadu_pd(xyz+2);
 298     t3  = _mm_loadu_pd(xyz+4);
 299     t4  = _mm_loadu_pd(xyz+6);
 300     t5  = _mm_loadu_pd(xyz+8);
 301     t6  = _mm_loadu_pd(xyz+10);
 302
 303     sxy = _mm_loadu_pd(xyz_shift);
 304     sz  = _mm_load_sd(xyz_shift+2);
 305     szx = _mm_shuffle_pd(sz,sxy,_MM_SHUFFLE2(0,0));
 306     syz = _mm_shuffle_pd(sxy,sz,_MM_SHUFFLE2(0,1));
 307
 308     t1  = _mm_add_pd(t1,sxy);
 309     t2  = _mm_add_pd(t2,szx);
 310     t3  = _mm_add_pd(t3,syz);
 311     t4  = _mm_add_pd(t4,sxy);
 312     t5  = _mm_add_pd(t5,szx);
 313     t6  = _mm_add_pd(t6,syz);
 314
 315     tx   = _mm_shuffle_pd(t1,t1,_MM_SHUFFLE2(0,0));
 316     ty   = _mm_shuffle_pd(t1,t1,_MM_SHUFFLE2(1,1));
 317     tz   = _mm_shuffle_pd(t2,t2,_MM_SHUFFLE2(0,0));
 318     *x1 = gmx_mm256_set_m128(tx,tx);
 319     *y1 = gmx_mm256_set_m128(ty,ty);
 320     *z1 = gmx_mm256_set_m128(tz,tz);
 321     tx   = _mm_shuffle_pd(t2,t2,_MM_SHUFFLE2(1,1));
 322     ty   = _mm_shuffle_pd(t3,t3,_MM_SHUFFLE2(0,0));
 323     tz   = _mm_shuffle_pd(t3,t3,_MM_SHUFFLE2(1,1));
 324     *x2 = gmx_mm256_set_m128(tx,tx);
 325     *y2 = gmx_mm256_set_m128(ty,ty);
 326     *z2 = gmx_mm256_set_m128(tz,tz);
 327     tx   = _mm_shuffle_pd(t4,t4,_MM_SHUFFLE2(0,0));
 328     ty   = _mm_shuffle_pd(t4,t4,_MM_SHUFFLE2(1,1));
 329     tz   = _mm_shuffle_pd(t5,t5,_MM_SHUFFLE2(0,0));
 330     *x3 = gmx_mm256_set_m128(tx,tx);
 331     *y3 = gmx_mm256_set_m128(ty,ty);
 332     *z3 = gmx_mm256_set_m128(tz,tz);
 333     tx   = _mm_shuffle_pd(t5,t5,_MM_SHUFFLE2(1,1));
 334     ty   = _mm_shuffle_pd(t6,t6,_MM_SHUFFLE2(0,0));
 335     tz   = _mm_shuffle_pd(t6,t6,_MM_SHUFFLE2(1,1));
 336     *x4 = gmx_mm256_set_m128(tx,tx);
 337     *y4 = gmx_mm256_set_m128(ty,ty);
 338     *z4 = gmx_mm256_set_m128(tz,tz);
 339 }
 340
 341
 342 static void
 343 gmx_mm256_load_1rvec_1ptr_swizzle_pd(const double * gmx_restrict p1,
 344                                      __m256d * gmx_restrict x, __m256d * gmx_restrict y, __m256d * gmx_restrict z)
 345 {
 346     __m256d t1;
 347
 348     t1            = _mm256_loadu_pd(p1);
 349     *x            = t1;
 350     *y            = _mm256_permute_pd(t1,_GMX_MM_PERMUTE256D(0,1,0,1));
 351     *z            = _mm256_castpd128_pd256(_mm256_extractf128_pd(t1,0x1));
 352 }
 353
 354
 355 static void
 356 gmx_mm256_load_2rvec_1ptr_swizzle_pd(const double * gmx_restrict p1,
 357                                      __m256d * gmx_restrict x1, __m256d * gmx_restrict y1, __m256d * gmx_restrict z1,
 358                                      __m256d * gmx_restrict x2, __m256d * gmx_restrict y2, __m256d * gmx_restrict z2)
 359 {
 360     __m256d t1,t2,t3;
 361
 362     t1            = _mm256_loadu_pd(p1);                         /* x2 z1 | y1 x1 */
 363     t2            = _mm256_castpd128_pd256(_mm_loadu_pd(p1+4));  /*  -  - | z2 y2 */
 364
 365     *x1           = t1;
 366     *y2           = t2;
 367
 368     t3            = gmx_mm256_unpack128hi_pd(t1,t1);
 369
 370     *z1           = t3;
 371     *y1           = _mm256_permute_pd(t1,_GMX_MM_PERMUTE256D(0,1,0,1));
 372     *z2           = _mm256_permute_pd(t2,_GMX_MM_PERMUTE256D(0,1,0,1));
 373     *x2           = _mm256_permute_pd(t3,_GMX_MM_PERMUTE256D(0,1,0,1));
 374 }
 375
 376 static void
 377 gmx_mm256_load_3rvec_1ptr_swizzle_pd(const double * gmx_restrict p1,
 378                                      __m256d * gmx_restrict x1, __m256d * gmx_restrict y1, __m256d * gmx_restrict z1,
 379                                      __m256d * gmx_restrict x2, __m256d * gmx_restrict y2, __m256d * gmx_restrict z2,
 380                                      __m256d * gmx_restrict x3, __m256d * gmx_restrict y3, __m256d * gmx_restrict z3)
 381 {
 382     __m256d t1,t2,t3,t4;
 383
 384     t1            = _mm256_loadu_pd(p1);
 385     t3            = _mm256_loadu_pd(p1+4);
 386     *x1           = t1;
 387     *y2           = t3;
 388     t2            = gmx_mm256_unpack128hi_pd(t1,t1);
 389     t4            = gmx_mm256_unpack128hi_pd(t3,t3);
 390     *z1           = t2;
 391     *x3           = t4;
 392     *y1           = _mm256_permute_pd(t1,_GMX_MM_PERMUTE256D(0,1,0,1));
 393     *z2           = _mm256_permute_pd(t3,_GMX_MM_PERMUTE256D(0,1,0,1));
 394     *x2           = _mm256_permute_pd(t2,_GMX_MM_PERMUTE256D(0,1,0,1));
 395     *y3           = _mm256_permute_pd(t4,_GMX_MM_PERMUTE256D(0,1,0,1));
 396     *z3           = _mm256_castpd128_pd256(_mm_load_sd(p1+8));
 397 }
 398
 399 static void
 400 gmx_mm256_load_4rvec_1ptr_swizzle_pd(const double * gmx_restrict p1,
 401                                      __m256d * gmx_restrict x1, __m256d * gmx_restrict y1, __m256d * gmx_restrict z1,
 402                                      __m256d * gmx_restrict x2, __m256d * gmx_restrict y2, __m256d * gmx_restrict z2,
 403                                      __m256d * gmx_restrict x3, __m256d * gmx_restrict y3, __m256d * gmx_restrict z3,
 404                                      __m256d * gmx_restrict x4, __m256d * gmx_restrict y4, __m256d * gmx_restrict z4)
 405 {
 406     __m256d t1,t2,t3,t4,t5,t6;
 407
 408     t1            = _mm256_loadu_pd(p1);
 409     t2            = _mm256_loadu_pd(p1+4);
 410     t3            = _mm256_loadu_pd(p1+8);
 411
 412     t4            = _mm256_castpd128_pd256(_mm256_extractf128_pd(t1,0x1));
 413     t5            = _mm256_castpd128_pd256(_mm256_extractf128_pd(t2,0x1));
 414     t6            = _mm256_castpd128_pd256(_mm256_extractf128_pd(t3,0x1));
 415
 416     *x1           = t1;
 417     *y2           = t2;
 418     *z3           = t3;
 419     *z1           = t4;
 420     *x3           = t5;
 421     *y4           = t6;
 422
 423     *y1           = _mm256_permute_pd(t1,_GMX_MM_PERMUTE256D(0,1,0,1));
 424     *z2           = _mm256_permute_pd(t2,_GMX_MM_PERMUTE256D(0,1,0,1));
 425     *x4           = _mm256_permute_pd(t3,_GMX_MM_PERMUTE256D(0,1,0,1));
 426     *x2           = _mm256_permute_pd(t4,_GMX_MM_PERMUTE256D(0,1,0,1));
 427     *y3           = _mm256_permute_pd(t5,_GMX_MM_PERMUTE256D(0,1,0,1));
 428     *z4           = _mm256_permute_pd(t6,_GMX_MM_PERMUTE256D(0,1,0,1));
 429 }
 430
 431
 432 static void
 433 gmx_mm256_load_1rvec_2ptr_swizzle_pd(const double * gmx_restrict ptrA, const double * gmx_restrict ptrB,
 434                                      __m256d * gmx_restrict x1, __m256d * gmx_restrict y1, __m256d * gmx_restrict z1)
 435 {
 436     __m256d tA,tB,tC;
 437
 438     tA           = _mm256_loadu_pd(ptrA); /*  - z1 | y1 x1 */
 439     tB           = _mm256_loadu_pd(ptrB); /*  - z2 | y2 x2 */
 440
 441     tC           = _mm256_unpacklo_pd(tA,tB);  /* z2 z1 | x2 x1 */
 442
 443     *x1          = tC;
 444     *y1          = _mm256_unpackhi_pd(tA,tB);
 445     *z1          = _mm256_castpd128_pd256(_mm256_extractf128_pd(tC,0x1));
 446 }
 447
 448
 449 static void
 450 gmx_mm256_load_2rvec_2ptr_swizzle_pd(const double * gmx_restrict ptrA, const double * gmx_restrict ptrB,
 451                                      __m256d * gmx_restrict x1, __m256d * gmx_restrict y1, __m256d * gmx_restrict z1,
 452                                      __m256d * gmx_restrict x2, __m256d * gmx_restrict y2, __m256d * gmx_restrict z2)
 453 {
 454     __m256d t1,t2,t3,t4,t5;
 455
 456     t1           = _mm256_loadu_pd(ptrA);          /*  x2a z1a | y1a x1a */
 457     t2           = _mm256_loadu_pd(ptrB);          /*  x2b z1b | y1b x1b */
 458     t3           = _mm256_castpd128_pd256(_mm_loadu_pd(ptrA+4));        /*   -   -  | z2a y2a */
 459     t4           = _mm256_castpd128_pd256(_mm_loadu_pd(ptrB+4));        /*   -   -  | z2b y2b */
 460
 461     t5           = _mm256_unpacklo_pd(t1,t2);      /*  z1b z1a | x1b x1a */
 462     t1           = _mm256_unpackhi_pd(t1,t2);      /*  x2b x2a | y1b y1a */
 463     *y2          = _mm256_unpacklo_pd(t3,t4);      /*   -   -  | y2b y2a */
 464     *z2          = _mm256_unpackhi_pd(t3,t4);      /*   -   -  | z2b z2a */
 465     *x1          = t5;
 466     *y1          = t1;
 467     *z1          = _mm256_castpd128_pd256(_mm256_extractf128_pd(t5,0x1));;
 468     *x2          = _mm256_castpd128_pd256(_mm256_extractf128_pd(t1,0x1));
 469 }
 470
 471
 472 static void
 473 gmx_mm256_load_3rvec_2ptr_swizzle_pd(const double * gmx_restrict ptrA, const double * gmx_restrict ptrB,
 474                                      __m256d * gmx_restrict x1, __m256d * gmx_restrict y1, __m256d * gmx_restrict z1,
 475                                      __m256d * gmx_restrict x2, __m256d * gmx_restrict y2, __m256d * gmx_restrict z2,
 476                                      __m256d * gmx_restrict x3, __m256d * gmx_restrict y3, __m256d * gmx_restrict z3)
 477 {
 478     __m256d t1,t2,t3,t4,t5,t6,t7;
 479
 480     t1           = _mm256_loadu_pd(ptrA);          /*  x2a z1a | y1a x1a */
 481     t2           = _mm256_loadu_pd(ptrB);          /*  x2b z1b | y1b x1b */
 482     t3           = _mm256_loadu_pd(ptrA+4);        /*  y3a x3a | z2a y2a */
 483     t4           = _mm256_loadu_pd(ptrB+4);        /*  y3b x3b | z2b y2b */
 484     t5           = _mm256_castpd128_pd256(_mm_load_sd(ptrA+8));        /*   -   -  |  -  z3a */
 485     t6           = _mm256_castpd128_pd256(_mm_load_sd(ptrB+8));        /*   -   -  |  -  z3b */
 486
 487     t7           = _mm256_unpacklo_pd(t1,t2);      /*  z1b z1a | x1b x1a */
 488     t1           = _mm256_unpackhi_pd(t1,t2);      /*  x2b x2a | y1b y1a */
 489
 490     t2           = _mm256_unpacklo_pd(t3,t4);      /*  x3b x3a | y2b y2a */
 491     t3           = _mm256_unpackhi_pd(t3,t4);      /*  y3b y3a | z2b z2a */
 492
 493     *z3          = _mm256_unpacklo_pd(t5,t6);      /*   -   -  | z3b z3a */
 494
 495     *x1          = t7;
 496     *y1          = t1;
 497     *y2          = t2;
 498     *z2          = t3;
 499     *z1          = _mm256_castpd128_pd256(_mm256_extractf128_pd(t7,0x1));;
 500     *x2          = _mm256_castpd128_pd256(_mm256_extractf128_pd(t1,0x1));
 501     *x3          = _mm256_castpd128_pd256(_mm256_extractf128_pd(t2,0x1));;
 502     *y3          = _mm256_castpd128_pd256(_mm256_extractf128_pd(t3,0x1));
 503 }
 504
 505
 506 static void
 507 gmx_mm256_load_4rvec_2ptr_swizzle_pd(const double * gmx_restrict ptrA, const double * gmx_restrict ptrB,
 508                                      __m256d * gmx_restrict x1, __m256d * gmx_restrict y1, __m256d * gmx_restrict z1,
 509                                      __m256d * gmx_restrict x2, __m256d * gmx_restrict y2, __m256d * gmx_restrict z2,
 510                                      __m256d * gmx_restrict x3, __m256d * gmx_restrict y3, __m256d * gmx_restrict z3,
 511                                      __m256d * gmx_restrict x4, __m256d * gmx_restrict y4, __m256d * gmx_restrict z4)
 512 {
 513     __m256d t1,t2,t3,t4,t5,t6,t7;
 514
 515     t1           = _mm256_loadu_pd(ptrA);          /*  x2a z1a | y1a x1a */
 516     t2           = _mm256_loadu_pd(ptrB);          /*  x2b z1b | y1b x1b */
 517     t3           = _mm256_loadu_pd(ptrA+4);        /*  y3a x3a | z2a y2a */
 518     t4           = _mm256_loadu_pd(ptrB+4);        /*  y3b x3b | z2b y2b */
 519     t5           = _mm256_loadu_pd(ptrA+8);        /*  z4a y4a | x4a z3a */
 520     t6           = _mm256_loadu_pd(ptrB+8);        /*  z4b y4b | x4b z3b */
 521
 522     t7           = _mm256_unpacklo_pd(t1,t2);      /*  z1b z1a | x1b x1a */
 523     t1           = _mm256_unpackhi_pd(t1,t2);      /*  x2b x2a | y1b y1a */
 524
 525     t2           = _mm256_unpacklo_pd(t3,t4);      /*  x3b x3a | y2b y2a */
 526     t3           = _mm256_unpackhi_pd(t3,t4);      /*  y3b y3a | z2b z2a */
 527
 528     t4           = _mm256_unpacklo_pd(t5,t6);      /*  y4b y4a | z3b z3a */
 529     t5           = _mm256_unpackhi_pd(t5,t6);      /*  z4b z4a | x4b x4a */
 530
 531     *x1          = t7;
 532     *y1          = t1;
 533     *y2          = t2;
 534     *z2          = t3;
 535     *z3          = t4;
 536     *x4          = t5;
 537
 538     *z1          = _mm256_castpd128_pd256(_mm256_extractf128_pd(t7,0x1));;
 539     *x2          = _mm256_castpd128_pd256(_mm256_extractf128_pd(t1,0x1));
 540     *x3          = _mm256_castpd128_pd256(_mm256_extractf128_pd(t2,0x1));;
 541     *y3          = _mm256_castpd128_pd256(_mm256_extractf128_pd(t3,0x1));
 542     *y4          = _mm256_castpd128_pd256(_mm256_extractf128_pd(t4,0x1));;
 543     *z4          = _mm256_castpd128_pd256(_mm256_extractf128_pd(t5,0x1));
 544 }
 545
 546
 547
 548 static void
 549 gmx_mm256_load_1rvec_4ptr_swizzle_pd(const double * gmx_restrict ptrA, const double * gmx_restrict ptrB,
 550                                      const double * gmx_restrict ptrC, const double * gmx_restrict ptrD,
 551                                      __m256d * gmx_restrict x1, __m256d * gmx_restrict y1, __m256d * gmx_restrict z1)
 552 {
 553      __m256d t1,t2,t3,t4,t5,t6;
 554
 555     t1           = _mm256_loadu_pd(ptrA);        /*   -  z1a | y1a x1a */
 556     t2           = _mm256_loadu_pd(ptrB);        /*   -  z1b | y1b x1b */
 557     t3           = _mm256_loadu_pd(ptrC);        /*   -  z1c | y1c x1c */
 558     t4           = _mm256_loadu_pd(ptrD);        /*   -  z1d | y1d x1d */
 559
 560     t5           = _mm256_unpacklo_pd(t1,t2);      /*  z1b z1a | x1b x1a */
 561     t6           = _mm256_unpackhi_pd(t1,t2);      /*   -   -  | y1b y1a */
 562     t1           = _mm256_unpacklo_pd(t3,t4);      /*  z1c z1c | x1d x1c */
 563     t2           = _mm256_unpackhi_pd(t3,t4);      /*   -   -  | y1d y1c */
 564
 565     *x1          = gmx_mm256_unpack128lo_pd(t5,t1);
 566     *y1          = gmx_mm256_unpack128lo_pd(t6,t2);
 567     *z1          = gmx_mm256_unpack128hi_pd(t5,t1);
 568 }
 569
 570 static void
 571 gmx_mm256_load_2rvec_4ptr_swizzle_pd(const double * gmx_restrict ptrA, const double * gmx_restrict ptrB,
 572                                      const double * gmx_restrict ptrC, const double * gmx_restrict ptrD,
 573                                      __m256d * gmx_restrict x1, __m256d * gmx_restrict y1, __m256d * gmx_restrict z1,
 574                                      __m256d * gmx_restrict x2, __m256d * gmx_restrict y2, __m256d * gmx_restrict z2)
 575 {
 576     __m256d t1,t2,t3,t4,t5,t6,t7,t8,t9,t10;
 577
 578     t1           = _mm256_loadu_pd(ptrA);        /*  x2a z1a | y1a x1a */
 579     t2           = _mm256_loadu_pd(ptrB);        /*  x2b z1b | y1b x1b */
 580     t3           = _mm256_loadu_pd(ptrC);        /*  x2c z1c | y1c x1c */
 581     t4           = _mm256_loadu_pd(ptrD);        /*  x2d z1d | y1d x1d */
 582     t5           = _mm256_castpd128_pd256(_mm_loadu_pd(ptrA+4));      /*   -   -  | z2a y2a */
 583     t6           = _mm256_castpd128_pd256(_mm_loadu_pd(ptrB+4));      /*   -   -  | z2b y2b */
 584     t7           = _mm256_castpd128_pd256(_mm_loadu_pd(ptrC+4));      /*   -   -  | z2c y2c */
 585     t8           = _mm256_castpd128_pd256(_mm_loadu_pd(ptrD+4));      /*   -   -  | z2d y2d */
 586
 587     t9           = _mm256_unpacklo_pd(t1,t2);      /*  z1b z1a | x1b x1a */
 588     t10          = _mm256_unpackhi_pd(t1,t2);      /*  x2b x2a | y1b y1a */
 589     t1           = _mm256_unpacklo_pd(t3,t4);      /*  z1d z1c | x1d x1c */
 590     t2           = _mm256_unpackhi_pd(t3,t4);      /*  x2d x2c | y1d y1c */
 591     t3           = _mm256_unpacklo_pd(t5,t6);      /*   -   -  | y2b y2a */
 592     t4           = _mm256_unpackhi_pd(t5,t6);      /*   -   -  | z2b z2a */
 593     t5           = _mm256_unpacklo_pd(t7,t8);      /*   -   -  | y2d y2c */
 594     t6           = _mm256_unpackhi_pd(t7,t8);      /*   -   -  | z2d z2c */
 595
 596     *x1          = gmx_mm256_unpack128lo_pd(t9,t1);
 597     *y1          = gmx_mm256_unpack128lo_pd(t10,t2);
 598     *z1          = gmx_mm256_unpack128hi_pd(t9,t1);
 599
 600     *x2          = gmx_mm256_unpack128hi_pd(t10,t2);
 601     *y2          = gmx_mm256_unpack128lo_pd(t3,t5);
 602     *z2          = gmx_mm256_unpack128lo_pd(t4,t6);
 603 }
 604
 605
 606 static void
 607 gmx_mm256_load_3rvec_4ptr_swizzle_pd(const double * gmx_restrict ptrA, const double * gmx_restrict ptrB,
 608                                      const double * gmx_restrict ptrC, const double * gmx_restrict ptrD,
 609                                      __m256d * gmx_restrict x1, __m256d * gmx_restrict y1, __m256d * gmx_restrict z1,
 610                                      __m256d * gmx_restrict x2, __m256d * gmx_restrict y2, __m256d * gmx_restrict z2,
 611                                      __m256d * gmx_restrict x3, __m256d * gmx_restrict y3, __m256d * gmx_restrict z3)
 612 {
 613     __m256d t1,t2,t3,t4,t5,t6,t7,t8,t9,t10,t11,t12,t13,t14;
 614
 615     t1           = _mm256_loadu_pd(ptrA);        /*  x2a z1a | y1a x1a */
 616     t2           = _mm256_loadu_pd(ptrB);        /*  x2b z1b | y1b x1b */
 617     t3           = _mm256_loadu_pd(ptrC);        /*  x2c z1c | y1c x1c */
 618     t4           = _mm256_loadu_pd(ptrD);        /*  x2d z1d | y1d x1d */
 619     t5           = _mm256_loadu_pd(ptrA+4);        /*  y3a x3a | z2a y2a */
 620     t6           = _mm256_loadu_pd(ptrB+4);        /*  y3b x3b | z2b y2b */
 621     t7           = _mm256_loadu_pd(ptrC+4);        /*  y3c x3c | z2c y2c */
 622     t8           = _mm256_loadu_pd(ptrD+4);        /*  y3d x3d | z2d y2d */
 623     t9           = _mm256_castpd128_pd256(_mm_load_sd(ptrA+8));        /*   -   -  |  -  z3a */
 624     t10          = _mm256_castpd128_pd256(_mm_load_sd(ptrB+8));        /*   -   -  |  -  z3b */
 625     t11          = _mm256_castpd128_pd256(_mm_load_sd(ptrC+8));        /*   -   -  |  -  z3c */
 626     t12          = _mm256_castpd128_pd256(_mm_load_sd(ptrD+8));        /*   -   -  |  -  z3d */
 627
 628     t13          = _mm256_unpacklo_pd(t1,t2);      /*  z1b z1a | x1b x1a */
 629     t14          = _mm256_unpackhi_pd(t1,t2);      /*  x2b x2a | y1b y1a */
 630     t1           = _mm256_unpacklo_pd(t3,t4);      /*  z1d z1c | x1d x1c */
 631     t2           = _mm256_unpackhi_pd(t3,t4);      /*  x2d x2c | y1d y1c */
 632
 633     t3           = _mm256_unpacklo_pd(t5,t6);      /*  x3b x3a | y2b y2a */
 634     t4           = _mm256_unpackhi_pd(t5,t6);      /*  y3b y3a | z2b z2a */
 635     t5           = _mm256_unpacklo_pd(t7,t8);      /*  x3d x3c | y2d y2c */
 636     t6           = _mm256_unpackhi_pd(t7,t8);      /*  y3d y3c | z2d z2c */
 637
 638     t9           = _mm256_unpacklo_pd(t9,t10);     /*   -   -  | z3b z3a */
 639     t11          = _mm256_unpacklo_pd(t11,t12);    /*   -   -  | z3d z3c */
 640
 641     *x1          = gmx_mm256_unpack128lo_pd(t13,t1);
 642     *y1          = gmx_mm256_unpack128lo_pd(t14,t2);
 643     *z1          = gmx_mm256_unpack128hi_pd(t13,t1);
 644     *x2          = gmx_mm256_unpack128hi_pd(t14,t2);
 645     *y2          = gmx_mm256_unpack128lo_pd(t3,t5);
 646     *z2          = gmx_mm256_unpack128lo_pd(t4,t6);
 647     *x3          = gmx_mm256_unpack128hi_pd(t3,t5);
 648     *y3          = gmx_mm256_unpack128hi_pd(t4,t6);
 649     *z3          = gmx_mm256_unpack128lo_pd(t9,t11);
 650 }
 651
 652
 653
 654 static void
 655 gmx_mm256_load_4rvec_4ptr_swizzle_pd(const double * gmx_restrict ptrA, const double * gmx_restrict ptrB,
 656                                      const double * gmx_restrict ptrC, const double * gmx_restrict ptrD,
 657                                      __m256d * gmx_restrict x1, __m256d * gmx_restrict y1, __m256d * gmx_restrict z1,
 658                                      __m256d * gmx_restrict x2, __m256d * gmx_restrict y2, __m256d * gmx_restrict z2,
 659                                      __m256d * gmx_restrict x3, __m256d * gmx_restrict y3, __m256d * gmx_restrict z3,
 660                                      __m256d * gmx_restrict x4, __m256d * gmx_restrict y4, __m256d * gmx_restrict z4)
 661 {
 662     __m256d t1,t2,t3,t4,t5,t6,t7,t8,t9,t10,t11,t12,t13,t14;
 663
 664     t1           = _mm256_loadu_pd(ptrA);        /*  x2a z1a | y1a x1a */
 665     t2           = _mm256_loadu_pd(ptrB);        /*  x2b z1b | y1b x1b */
 666     t3           = _mm256_loadu_pd(ptrC);        /*  x2c z1c | y1c x1c */
 667     t4           = _mm256_loadu_pd(ptrD);        /*  x2d z1d | y1d x1d */
 668     t5           = _mm256_loadu_pd(ptrA+4);        /*  y3a x3a | z2a y2a */
 669     t6           = _mm256_loadu_pd(ptrB+4);        /*  y3b x3b | z2b y2b */
 670     t7           = _mm256_loadu_pd(ptrC+4);        /*  y3c x3c | z2c y2c */
 671     t8           = _mm256_loadu_pd(ptrD+4);        /*  y3d x3d | z2d y2d */
 672     t9           = _mm256_loadu_pd(ptrA+8);        /*  z4a y4a | x4a z3a */
 673     t10          = _mm256_loadu_pd(ptrB+8);        /*  z4b y4b | x4b z3b */
 674     t11          = _mm256_loadu_pd(ptrC+8);        /*  z4c y4c | x4c z3c */
 675     t12          = _mm256_loadu_pd(ptrD+8);        /*  z4d y4d | x4d z3d */
 676
 677     t13          = _mm256_unpacklo_pd(t1,t2);      /*  z1b z1a | x1b x1a */
 678     t14          = _mm256_unpackhi_pd(t1,t2);      /*  x2b x2a | y1b y1a */
 679     t1           = _mm256_unpacklo_pd(t3,t4);      /*  z1d z1c | x1d x1c */
 680     t2           = _mm256_unpackhi_pd(t3,t4);      /*  x2d x2c | y1d y1c */
 681
 682     t3           = _mm256_unpacklo_pd(t5,t6);      /*  x3b x3a | y2b y2a */
 683     t4           = _mm256_unpackhi_pd(t5,t6);      /*  y3b y3a | z2b z2a */
 684     t5           = _mm256_unpacklo_pd(t7,t8);      /*  x3d x3c | y2d y2c */
 685     t6           = _mm256_unpackhi_pd(t7,t8);      /*  y3d y3c | z2d z2c */
 686
 687     t7           = _mm256_unpacklo_pd(t9,t10);     /*  y4b y4a | z3b z3a */
 688     t8           = _mm256_unpackhi_pd(t9,t10);     /*  z4b z4a | x4b x4a */
 689     t9           = _mm256_unpacklo_pd(t11,t12);    /*  y4d y4c | z3d z3c */
 690     t10          = _mm256_unpackhi_pd(t11,t12);    /*  z4d z4c | x4d x4c */
 691
 692     *x1          = gmx_mm256_unpack128lo_pd(t13,t1);
 693     *y1          = gmx_mm256_unpack128lo_pd(t14,t2);
 694     *z1          = gmx_mm256_unpack128hi_pd(t13,t1);
 695     *x2          = gmx_mm256_unpack128hi_pd(t14,t2);
 696     *y2          = gmx_mm256_unpack128lo_pd(t3,t5);
 697     *z2          = gmx_mm256_unpack128lo_pd(t4,t6);
 698     *x3          = gmx_mm256_unpack128hi_pd(t3,t5);
 699     *y3          = gmx_mm256_unpack128hi_pd(t4,t6);
 700     *z3          = gmx_mm256_unpack128lo_pd(t7,t9);
 701     *x4          = gmx_mm256_unpack128lo_pd(t8,t10);
 702     *y4          = gmx_mm256_unpack128hi_pd(t7,t9);
 703     *z4          = gmx_mm256_unpack128hi_pd(t8,t10);
 704 }
 705
 706
 707
 708 /* Routines to decrement rvec in memory, typically use for j particle force updates */
 709 static void
 710 gmx_mm256_decrement_1rvec_1ptr_noswizzle_pd(double * gmx_restrict ptrA, __m256d xyz)
 711 {
 712     __m256d t1,t2;
 713
 714     t1  = _mm256_loadu_pd(ptrA);
 715     t2  = _mm256_blend_pd(_mm256_setzero_pd(),xyz,0x7);
 716     t1  = _mm256_sub_pd(t1,t2);
 717     /* OK to add zeros and store more values here, since we only do a single store that cannot overlap */
 718     _mm256_storeu_pd(ptrA,t1);
 719 }
 720
 721
 722
 723 static void
 724 gmx_mm256_decrement_3rvec_1ptr_noswizzle_pd(double * gmx_restrict ptrA,
 725                                             __m256d xyz1, __m256d xyz2, __m256d xyz3)
 726 {
 727     __m256d t1,t2;
 728     __m256d tA,tB;
 729     __m128d tC;
 730
 731     tA   = _mm256_loadu_pd(ptrA);
 732     tB   = _mm256_loadu_pd(ptrA+4);
 733     tC   = _mm_load_sd(ptrA+8);
 734
 735     /* xyz1:  -  z1 | y1 x1 */
 736     /* xyz2:  -  z2 | y2 x2 */
 737     /* xyz3:  -  z3 | y3 x3 */
 738
 739     xyz2 = _mm256_permute_pd(xyz2,_GMX_MM_PERMUTE256D(0,1,0,1)); /*  z2 -  | x2 y2 */
 740     t1   = _mm256_permute2f128_pd(xyz2,xyz2,0x21);   /* x2 y2 | z2 -  | */
 741     xyz1 = _mm256_blend_pd(xyz1,t1,_GMX_MM_BLEND256D(1,0,0,0)); /* x2 z1 | y1 x1 */
 742     xyz2 = _mm256_blend_pd(xyz2,t1,_GMX_MM_BLEND256D(0,0,1,0)); /*  -  - | z2 y2 */
 743     t2   = _mm256_permute2f128_pd(xyz3,xyz3,0x21);   /* y3 x3 |  -  z3 | */
 744     xyz2 = _mm256_blend_pd(xyz2,t2,_GMX_MM_BLEND256D(1,1,0,0)); /*  y3 x3 | z2 y2 */
 745
 746     tA   = _mm256_sub_pd(tA,xyz1);
 747     tB   = _mm256_sub_pd(tB,xyz2);
 748     tC   = _mm_sub_sd(tC, _mm256_castpd256_pd128(t2));
 749
 750     _mm256_storeu_pd(ptrA,tA);
 751     _mm256_storeu_pd(ptrA+4,tB);
 752     _mm_store_sd(ptrA+8,tC);
 753 }
 754
 755 static void
 756 gmx_mm256_decrement_4rvec_1ptr_noswizzle_pd(double * gmx_restrict ptrA,
 757                                             __m256d xyz1, __m256d xyz2, __m256d xyz3, __m256d xyz4)
 758 {
 759     __m256d t1,t2,t3;
 760     __m256d tA,tB,tC;
 761
 762     tA   = _mm256_loadu_pd(ptrA);
 763     tB   = _mm256_loadu_pd(ptrA+4);
 764     tC   = _mm256_loadu_pd(ptrA+8);
 765
 766     /* xyz1:  -  z1 | y1 x1 */
 767     /* xyz2:  -  z2 | y2 x2 */
 768     /* xyz3:  -  z3 | y3 x3 */
 769     /* xyz4:  -  z4 | y4 x4 */
 770
 771     xyz2 = _mm256_permute_pd(xyz2,_GMX_MM_PERMUTE256D(0,1,0,1)); /*  z2 -  | x2 y2 */
 772     t1   = _mm256_permute2f128_pd(xyz2,xyz2,0x21);   /* x2 y2 | z2 -  | */
 773     xyz1 = _mm256_blend_pd(xyz1,t1,_GMX_MM_BLEND256D(1,0,0,0)); /* x2 z1 | y1 x1 */
 774     xyz2 = _mm256_blend_pd(xyz2,t1,_GMX_MM_BLEND256D(0,0,1,0)); /*  -  - | z2 y2 */
 775     t2   = _mm256_permute2f128_pd(xyz3,xyz3,0x21);   /* y3 x3 |  -  z3 | */
 776     xyz2 = _mm256_blend_pd(xyz2,t2,_GMX_MM_BLEND256D(1,1,0,0)); /*  y3 x3 | z2 y2 */
 777     xyz4 = _mm256_permute_pd(xyz4,_GMX_MM_PERMUTE256D(0,1,0,1));  /*  z4 -  | x4 y4 */
 778     t3   = _mm256_permute2f128_pd(xyz4,xyz4,0x21);    /*  x4 y4 | z4 - */
 779     t3   = _mm256_blend_pd(t3,xyz4,_GMX_MM_BLEND256D(1,0,1,0)); /* z4 y4| x4 - */
 780     xyz4 = _mm256_blend_pd(t3,t2,_GMX_MM_BLEND256D(0,0,0,1)); /*  xz y4 | x4 z3 */
 781
 782     tA   = _mm256_sub_pd(tA,xyz1);
 783     tB   = _mm256_sub_pd(tB,xyz2);
 784     tC   = _mm256_sub_pd(tC,xyz4);
 785
 786     _mm256_storeu_pd(ptrA,tA);
 787     _mm256_storeu_pd(ptrA+4,tB);
 788     _mm256_storeu_pd(ptrA+8,tC);
 789 }
 790
 791
 792
 793 static void
 794 gmx_mm256_decrement_1rvec_1ptr_swizzle_pd(double * gmx_restrict ptrA,
 795                                           __m256d x1, __m256d y1, __m256d z1)
 796 {
 797     __m128d t1,t2,t3;
 798
 799     t1           = _mm_sub_sd(_mm256_castpd256_pd128(x1),_mm_load_sd(ptrA));
 800     t2           = _mm_sub_sd(_mm256_castpd256_pd128(y1),_mm_load_sd(ptrA+1));
 801     t3           = _mm_sub_sd(_mm256_castpd256_pd128(z1),_mm_load_sd(ptrA+2));
 802     _mm_store_sd(ptrA,t1);
 803     _mm_store_sd(ptrA+1,t2);
 804     _mm_store_sd(ptrA+2,t3);
 805 }
 806
 807
 808 static void
 809 gmx_mm256_decrement_2rvec_1ptr_swizzle_pd(double * gmx_restrict ptrA,
 810                                           __m256d x1, __m256d y1, __m256d z1,
 811                                           __m256d x2, __m256d y2, __m256d z2)
 812 {
 813     __m256d t1;
 814     __m128d tA;
 815     t1          = _mm256_loadu_pd(ptrA);
 816     tA          = _mm_loadu_pd(ptrA+4);
 817
 818     x1          = _mm256_unpacklo_pd(x1,y1); /*  -   -  | y1a x1a */
 819     z1          = _mm256_unpacklo_pd(z1,x2); /*  -   -  | x2a z1a */
 820     y2          = _mm256_unpacklo_pd(y2,z2); /*  -   -  | z2a y2a */
 821
 822     x1          = gmx_mm256_unpack128lo_pd(x1,z1);  /* x2a z1a | y1a x1a */
 823
 824     t1          = _mm256_sub_pd(x1,t1);
 825     tA          = _mm_sub_pd(tA,_mm256_castpd256_pd128(y2));
 826
 827     _mm256_storeu_pd(ptrA,t1);
 828     _mm_storeu_pd(ptrA+4,tA);
 829 }
 830
 831
 832 static void
 833 gmx_mm256_decrement_3rvec_1ptr_swizzle_pd(double * gmx_restrict ptrA,
 834                                           __m256d x1, __m256d y1, __m256d z1,
 835                                           __m256d x2, __m256d y2, __m256d z2,
 836                                           __m256d x3, __m256d y3, __m256d z3)
 837 {
 838     __m256d t1,t2;
 839     __m128d tA;
 840
 841     t1          = _mm256_loadu_pd(ptrA);
 842     t2          = _mm256_loadu_pd(ptrA+4);
 843     tA          = _mm_load_sd(ptrA+8);
 844
 845     x1          = _mm256_unpacklo_pd(x1,y1); /*  -   -  | y1a x1a */
 846     z1          = _mm256_unpacklo_pd(z1,x2); /*  -   -  | x2a z1a */
 847     y2          = _mm256_unpacklo_pd(y2,z2); /*  -   -  | z2a y2a */
 848     x3          = _mm256_unpacklo_pd(x3,y3); /*  -   -  | y3a x3a */
 849
 850     x1          = gmx_mm256_unpack128lo_pd(x1,z1); /* x2a z1a | y1a x1a */
 851     y2          = gmx_mm256_unpack128lo_pd(y2,x3); /* y3a x3a | z2a y2a */
 852     t1          = _mm256_sub_pd(t1,x1);
 853     t2          = _mm256_sub_pd(t2,y2);
 854     tA          = _mm_sub_sd(tA,_mm256_castpd256_pd128(z3));
 855
 856     _mm256_storeu_pd(ptrA,t1);
 857     _mm256_storeu_pd(ptrA+4,t2);
 858     _mm_store_sd(ptrA+8,tA);
 859 }
 860
 861
 862 static void
 863 gmx_mm256_decrement_4rvec_1ptr_swizzle_pd(double * gmx_restrict ptrA,
 864                                           __m256d x1, __m256d y1, __m256d z1,
 865                                           __m256d x2, __m256d y2, __m256d z2,
 866                                           __m256d x3, __m256d y3, __m256d z3,
 867                                           __m256d x4, __m256d y4, __m256d z4)
 868 {
 869     __m256d t1,t2,t3;
 870
 871     t1          = _mm256_loadu_pd(ptrA);
 872     t2          = _mm256_loadu_pd(ptrA+4);
 873     t3          = _mm256_loadu_pd(ptrA+8);
 874
 875     x1          = _mm256_unpacklo_pd(x1,y1); /*  -   -  | y1a x1a */
 876     z1          = _mm256_unpacklo_pd(z1,x2); /*  -   -  | x2a z1a */
 877     y2          = _mm256_unpacklo_pd(y2,z2); /*  -   -  | z2a y2a */
 878     x3          = _mm256_unpacklo_pd(x3,y3); /*  -   -  | y3a x3a */
 879     z3          = _mm256_unpacklo_pd(z3,x4); /*  -   -  | x4a z3a */
 880     y4          = _mm256_unpacklo_pd(y4,z4); /*  -   -  | z4a y4a */
 881
 882     x1          = gmx_mm256_unpack128lo_pd(x1,z1); /* x2a z1a | y1a x1a */
 883     y2          = gmx_mm256_unpack128lo_pd(y2,x3); /* y3a x3a | z2a y2a */
 884     z3          = gmx_mm256_unpack128lo_pd(z3,y4); /* z4a y4a | x4a z3a */
 885
 886     t1          = _mm256_sub_pd(t1,x1);
 887     t2          = _mm256_sub_pd(t2,y2);
 888     t3          = _mm256_sub_pd(t3,z3);
 889
 890     _mm256_storeu_pd(ptrA,t1);
 891     _mm256_storeu_pd(ptrA+4,t2);
 892     _mm256_storeu_pd(ptrA+8,t3);
 893 }
 894
 895 static void
 896 gmx_mm256_decrement_1rvec_2ptr_swizzle_pd(double * gmx_restrict ptrA,
 897                                           double * gmx_restrict ptrB,
 898                                           __m256d x1, __m256d y1, __m256d z1)
 899 {
 900     __m256d t1,t2,t3,t4;
 901     __m256i mask;
 902
 903     t3          = _mm256_loadu_pd(ptrA);
 904     t4          = _mm256_loadu_pd(ptrB);
 905
 906     t1          = _mm256_unpacklo_pd(x1,y1);   /*  -  - | y1a x1a */
 907     t2          = _mm256_unpackhi_pd(x1,y1);   /*  -  - | y1b x1b */
 908
 909     t1          = gmx_mm256_unpack128lo_pd(t1,z1); /*  -  z1a | y1a x1a */
 910     z1          = _mm256_permute_pd(z1,_GMX_MM_PERMUTE256D(1,1,1,1));
 911     t2          = gmx_mm256_unpack128lo_pd(t2,z1); /* z1b z1a | y1b x1b */
 912
 913     /* Construct a mask without executing any data loads */
 914     mask        = _mm256_castpd_si256(_mm256_blend_pd(_mm256_setzero_pd(),
 915                                                       _mm256_cmp_pd(_mm256_setzero_pd(),_mm256_setzero_pd(),_CMP_EQ_OQ),0x7));
 916
 917     t3          = _mm256_sub_pd(t3,t1);
 918     t4          = _mm256_sub_pd(t4,t2);
 919
 920     /* Careful with potentially overlapping stores, need to be masked */
 921     _mm256_maskstore_pd(ptrA,mask,t3);
 922     _mm256_maskstore_pd(ptrB,mask,t4);
 923 }
 924
 925 static void
 926 gmx_mm256_decrement_2rvec_2ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
 927                                           __m256d x1, __m256d y1, __m256d z1,
 928                                           __m256d x2, __m256d y2, __m256d z2)
 929 {
 930     __m256d t1,t2,t5;
 931     __m128d t3,t4;
 932
 933     t1          = _mm256_loadu_pd(ptrA);
 934     t2          = _mm256_loadu_pd(ptrB);
 935     t3          = _mm_loadu_pd(ptrA+4);
 936     t4          = _mm_loadu_pd(ptrB+4);
 937
 938     t5          = _mm256_unpacklo_pd(x1,y1); /*  -   -  | y1a x1a */
 939     x1          = _mm256_unpackhi_pd(x1,y1); /*  -   -  | y1b x1b */
 940
 941     y1          = _mm256_unpacklo_pd(z1,x2); /*  -   -  | x2a z1a */
 942     z1          = _mm256_unpackhi_pd(z1,x2); /*  -   -  | x2b z1b */
 943
 944     x2          = _mm256_unpacklo_pd(y2,z2); /*  -   -  | z2a y2a */
 945     y2          = _mm256_unpackhi_pd(y2,z2); /*  -   -  | z2b y2b */
 946
 947     z2          = gmx_mm256_unpack128lo_pd(t5,y1); /* x2a z1a | y1a x1a */
 948     y1          = gmx_mm256_unpack128lo_pd(x1,z1); /* x2b z1b | y1b x1b */
 949
 950     t1          = _mm256_sub_pd(t1,z2);
 951     t2          = _mm256_sub_pd(t2,y1);
 952     t3          = _mm_sub_pd(t3,_mm256_castpd256_pd128(x2));
 953     t4          = _mm_sub_pd(t4,_mm256_castpd256_pd128(y2));
 954
 955     /* Careful with potentially overlapping stores, need to be masked */
 956     _mm256_storeu_pd(ptrA,t1);
 957     _mm256_storeu_pd(ptrB,t2);
 958     _mm_storeu_pd(ptrA+4,t3);
 959     _mm_storeu_pd(ptrB+4,t4);
 960 }
 961
 962 static void
 963 gmx_mm256_decrement_3rvec_2ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
 964                                           __m256d x1, __m256d y1, __m256d z1,
 965                                           __m256d x2, __m256d y2, __m256d z2,
 966                                           __m256d x3, __m256d y3, __m256d z3)
 967 {
 968     __m256d t1,t2,t3,t4,t5,t6;
 969     __m128d tA,tB;
 970
 971     t1          = _mm256_loadu_pd(ptrA);
 972     t2          = _mm256_loadu_pd(ptrB);
 973     t3          = _mm256_loadu_pd(ptrA+4);
 974     t4          = _mm256_loadu_pd(ptrB+4);
 975     tA          = _mm_load_sd(ptrA+8);
 976     tB          = _mm_load_sd(ptrB+8);
 977
 978     t5          = _mm256_unpacklo_pd(x1,y1); /*  -   -  | y1a x1a */
 979     x1          = _mm256_unpackhi_pd(x1,y1); /*  -   -  | y1b x1b */
 980
 981     y1          = _mm256_unpacklo_pd(z1,x2); /*  -   -  | x2a z1a */
 982     z1          = _mm256_unpackhi_pd(z1,x2); /*  -   -  | x2b z1b */
 983
 984     x2          = _mm256_unpacklo_pd(y2,z2); /*  -   -  | z2a y2a */
 985     y2          = _mm256_unpackhi_pd(y2,z2); /*  -   -  | z2b y2b */
 986
 987     z2          = _mm256_unpacklo_pd(x3,y3); /*  -   -  | y3a x3a */
 988     x3          = _mm256_unpackhi_pd(x3,y3); /*  -   -  | y3b x3b */
 989
 990     t6          = _mm256_permute_pd(z3,_GMX_MM_PERMUTE256D(1,1,1,1)); /* - - | - z3b */
 991
 992     y3          = gmx_mm256_unpack128lo_pd(t5,y1); /* x2a z1a | y1a x1a */
 993     y1          = gmx_mm256_unpack128lo_pd(x1,z1); /* x2b z1b | y1b x1b */
 994
 995     t5          = gmx_mm256_unpack128lo_pd(x2,z2); /* y3a x3a | z2a y2a */
 996     x1          = gmx_mm256_unpack128lo_pd(y2,x3); /* y3b x3b | z2b y2b */
 997
 998     t1          = _mm256_sub_pd(t1,y3);
 999     t2          = _mm256_sub_pd(t2,y1);
1000     t3          = _mm256_sub_pd(t3,t5);
1001     t4          = _mm256_sub_pd(t4,x1);
1002     tA          = _mm_sub_pd(tA,_mm256_castpd256_pd128(z3));
1003     tB          = _mm_sub_pd(tB,_mm256_castpd256_pd128(t6));
1004
1005     _mm256_storeu_pd(ptrA,t1);
1006     _mm256_storeu_pd(ptrB,t2);
1007     _mm256_storeu_pd(ptrA+4,t3);
1008     _mm256_storeu_pd(ptrB+4,t4);
1009     _mm_store_sd(ptrA+8,tA);
1010     _mm_store_sd(ptrB+8,tB);
1011 }
1012
1013
1014 static void
1015 gmx_mm256_decrement_4rvec_2ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
1016                                           __m256d x1, __m256d y1, __m256d z1,
1017                                           __m256d x2, __m256d y2, __m256d z2,
1018                                           __m256d x3, __m256d y3, __m256d z3,
1019                                           __m256d x4, __m256d y4, __m256d z4)
1020 {
1021     __m256d t1,t2,t3,t4,t5,t6,t7;
1022
1023     t1          = _mm256_loadu_pd(ptrA);
1024     t2          = _mm256_loadu_pd(ptrB);
1025     t3          = _mm256_loadu_pd(ptrA+4);
1026     t4          = _mm256_loadu_pd(ptrB+4);
1027     t5          = _mm256_loadu_pd(ptrA+8);
1028     t6          = _mm256_loadu_pd(ptrB+8);
1029
1030     t7          = _mm256_unpacklo_pd(x1,y1); /*  -   -  | y1a x1a */
1031     x1          = _mm256_unpackhi_pd(x1,y1); /*  -   -  | y1b x1b */
1032
1033     y1          = _mm256_unpacklo_pd(z1,x2); /*  -   -  | x2a z1a */
1034     z1          = _mm256_unpackhi_pd(z1,x2); /*  -   -  | x2b z1b */
1035
1036     x2          = _mm256_unpacklo_pd(y2,z2); /*  -   -  | z2a y2a */
1037     y2          = _mm256_unpackhi_pd(y2,z2); /*  -   -  | z2b y2b */
1038
1039     z2          = _mm256_unpacklo_pd(x3,y3); /*  -   -  | y3a x3a */
1040     x3          = _mm256_unpackhi_pd(x3,y3); /*  -   -  | y3b x3b */
1041
1042     y3          = _mm256_unpacklo_pd(z3,x4); /*  -   -  | x4a z3a */
1043     z3          = _mm256_unpackhi_pd(z3,x4); /*  -   -  | x4b z3b */
1044     x4          = _mm256_unpacklo_pd(y4,z4); /*  -   -  | z4a y4a */
1045     y4          = _mm256_unpackhi_pd(y4,z4); /*  -   -  | z4b y4b */
1046
1047     z4          = gmx_mm256_unpack128lo_pd(t7,y1); /* x2a z1a | y1a x1a */
1048     y1          = gmx_mm256_unpack128lo_pd(x1,z1); /* x2b z1b | y1b x1b */
1049
1050     t7          = gmx_mm256_unpack128lo_pd(x2,z2); /* y3a x3a | z2a y2a */
1051     x1          = gmx_mm256_unpack128lo_pd(y2,x3); /* y3b x3b | z2b y2b */
1052
1053     x2          = gmx_mm256_unpack128lo_pd(y3,x4); /* z4a y4a | x4a z3a */
1054     y2          = gmx_mm256_unpack128lo_pd(z3,y4); /* z4b y4b | x4b z3b */
1055
1056     t1          = _mm256_sub_pd(t1,z4);
1057     t2          = _mm256_sub_pd(t2,y1);
1058     t3          = _mm256_sub_pd(t3,t7);
1059     t4          = _mm256_sub_pd(t4,x1);
1060     t5          = _mm256_sub_pd(t5,x2);
1061     t6          = _mm256_sub_pd(t6,y2);
1062
1063     _mm256_storeu_pd(ptrA,t1);
1064     _mm256_storeu_pd(ptrB,t2);
1065     _mm256_storeu_pd(ptrA+4,t3);
1066     _mm256_storeu_pd(ptrB+4,t4);
1067     _mm256_storeu_pd(ptrA+8,t5);
1068     _mm256_storeu_pd(ptrB+8,t6);
1069 }
1070
1071
1072
1073 static void
1074 gmx_mm256_decrement_1rvec_4ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
1075                                           double * gmx_restrict ptrC, double * gmx_restrict ptrD,
1076                                           __m256d x1, __m256d y1, __m256d z1)
1077 {
1078     __m256d t1,t2,tA,tB,tC,tD;
1079     __m256i mask;
1080
1081     t1          = _mm256_unpacklo_pd(x1,y1); /*  y1c x1c | y1a x1a */
1082     t2          = _mm256_unpackhi_pd(x1,y1); /*  y1d x1d | y1b x1b */
1083     x1          = gmx_mm256_unpack128lo_pd(t1,z1); /*  -  z1a | y1a x1a */
1084     y1          = gmx_mm256_unpack128hi_pd(t1,z1); /*  -  z1c | y1c x1c */
1085     z1          = _mm256_permute_pd(z1,_GMX_MM_PERMUTE256D(0,1,0,1));
1086     t1          = gmx_mm256_unpack128lo_pd(t2,z1); /*  -  z1b | y1b x1b */
1087     z1          = gmx_mm256_unpack128hi_pd(t2,z1); /*  -  z1d | y1d x1d */
1088
1089     /* Construct a mask without executing any data loads */
1090     mask        = _mm256_castpd_si256(_mm256_blend_pd(_mm256_setzero_pd(),
1091                                                       _mm256_cmp_pd(_mm256_setzero_pd(),_mm256_setzero_pd(),_CMP_EQ_OQ),0x7));
1092
1093     tA          = _mm256_loadu_pd(ptrA);
1094     tB          = _mm256_loadu_pd(ptrB);
1095     tC          = _mm256_loadu_pd(ptrC);
1096     tD          = _mm256_loadu_pd(ptrD);
1097
1098     tA          = _mm256_sub_pd(tA,x1);
1099     tB          = _mm256_sub_pd(tB,t1);
1100     tC          = _mm256_sub_pd(tC,y1);
1101     tD          = _mm256_sub_pd(tD,z1);
1102
1103     _mm256_maskstore_pd(ptrA,mask,tA);
1104     _mm256_maskstore_pd(ptrB,mask,tB);
1105     _mm256_maskstore_pd(ptrC,mask,tC);
1106     _mm256_maskstore_pd(ptrD,mask,tD);
1107 }
1108
1109 static void
1110 gmx_mm256_decrement_2rvec_4ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
1111                                           double * gmx_restrict ptrC, double * gmx_restrict ptrD,
1112                                           __m256d x1, __m256d y1, __m256d z1,
1113                                           __m256d x2, __m256d y2, __m256d z2)
1114 {
1115     __m256d t1,t2,t3,t4,t5,t6;
1116     __m128d tA,tB,tC,tD,tE,tF;
1117
1118     t1          = _mm256_loadu_pd(ptrA);
1119     t2          = _mm256_loadu_pd(ptrB);
1120     t3          = _mm256_loadu_pd(ptrC);
1121     t4          = _mm256_loadu_pd(ptrD);
1122     tA          = _mm_loadu_pd(ptrA+4);
1123     tB          = _mm_loadu_pd(ptrB+4);
1124     tC          = _mm_loadu_pd(ptrC+4);
1125     tD          = _mm_loadu_pd(ptrD+4);
1126
1127     t5          = _mm256_unpacklo_pd(x1,y1); /* y1c x1c | y1a x1a */
1128     x1          = _mm256_unpackhi_pd(x1,y1); /* y1d x1d | y1b x1b */
1129     y1          = _mm256_unpacklo_pd(z1,x2); /* x2c z1c | x2a z1a */
1130     z1          = _mm256_unpackhi_pd(z1,x2); /* x2d z1d | x2b z1b */
1131     x2          = _mm256_unpacklo_pd(y2,z2); /* z2c y2c | z2a y2a */
1132     y2          = _mm256_unpackhi_pd(y2,z2); /* z2d y2d | z2b y2b */
1133
1134     t6          = gmx_mm256_unpack128lo_pd(t5,y1); /* x2a z1a | y1a x1a */
1135     z2          = gmx_mm256_unpack128hi_pd(t5,y1); /* x2c z1c | y1c x1c */
1136     t5          = gmx_mm256_unpack128lo_pd(x1,z1); /* x2b z1b | y1b x1b */
1137     y1          = gmx_mm256_unpack128hi_pd(x1,z1); /* x2d z1d | y1d x1d */
1138
1139     tE          = _mm256_extractf128_pd(x2,0x1); /* z2c y2c */
1140     tF          = _mm256_extractf128_pd(y2,0x1); /* z2d y2d */
1141
1142     t1          = _mm256_sub_pd(t1,t6);
1143     t2          = _mm256_sub_pd(t2,t5);
1144     t3          = _mm256_sub_pd(t3,z2);
1145     t4          = _mm256_sub_pd(t4,y1);
1146     tA          = _mm_sub_pd(tA,_mm256_castpd256_pd128(x2));
1147     tB          = _mm_sub_pd(tB,_mm256_castpd256_pd128(y2));
1148     tC          = _mm_sub_pd(tC,tE);
1149     tD          = _mm_sub_pd(tD,tF);
1150
1151     _mm256_storeu_pd(ptrA,t1);
1152     _mm256_storeu_pd(ptrB,t2);
1153     _mm256_storeu_pd(ptrC,t3);
1154     _mm256_storeu_pd(ptrD,t4);
1155     _mm_storeu_pd(ptrA+4,tA);
1156     _mm_storeu_pd(ptrB+4,tB);
1157     _mm_storeu_pd(ptrC+4,tC);
1158     _mm_storeu_pd(ptrD+4,tD);
1159 }
1160
1161
1162 static void
1163 gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
1164                                           double * gmx_restrict ptrC, double * gmx_restrict ptrD,
1165                                           __m256d x1, __m256d y1, __m256d z1,
1166                                           __m256d x2, __m256d y2, __m256d z2,
1167                                           __m256d x3, __m256d y3, __m256d z3)
1168 {
1169     __m256d t1,t2,t3,t4,t5,t6,t7,t8,t9,t10;
1170     __m128d tA,tB,tC,tD,tE;
1171
1172     t1          = _mm256_loadu_pd(ptrA);
1173     t2          = _mm256_loadu_pd(ptrB);
1174     t3          = _mm256_loadu_pd(ptrC);
1175     t4          = _mm256_loadu_pd(ptrD);
1176     t5          = _mm256_loadu_pd(ptrA+4);
1177     t6          = _mm256_loadu_pd(ptrB+4);
1178     t7          = _mm256_loadu_pd(ptrC+4);
1179     t8          = _mm256_loadu_pd(ptrD+4);
1180     tA          = _mm_load_sd(ptrA+8);
1181     tB          = _mm_load_sd(ptrB+8);
1182     tC          = _mm_load_sd(ptrC+8);
1183     tD          = _mm_load_sd(ptrD+8);
1184
1185     t9          = _mm256_unpacklo_pd(x1,y1); /* y1c x1c | y1a x1a */
1186     x1          = _mm256_unpackhi_pd(x1,y1); /* y1d x1d | y1b x1b */
1187
1188     y1          = _mm256_unpacklo_pd(z1,x2); /* x2c z1c | x2a z1a */
1189     z1          = _mm256_unpackhi_pd(z1,x2); /* x2d z1d | x2b z1b */
1190
1191     x2          = _mm256_unpacklo_pd(y2,z2); /* z2c y2c | z2a y2a */
1192     y2          = _mm256_unpackhi_pd(y2,z2); /* z2d y2d | z2b y2b */
1193
1194     z2          = _mm256_unpacklo_pd(x3,y3); /* y3c x3c | y3a x3a */
1195     x3          = _mm256_unpackhi_pd(x3,y3); /* y3d x3d | y3b x3b */
1196
1197     t10         = gmx_mm256_unpack128lo_pd(t9,y1);  /* x2a z1a | y1a x1a */
1198     y3          = gmx_mm256_unpack128hi_pd(t9,y1);  /* x2c z1c | y1c x1c */
1199
1200     t9          = gmx_mm256_unpack128lo_pd(x1,z1);  /* x2b z1b | y1b x1b */
1201     y1          = gmx_mm256_unpack128hi_pd(x1,z1);  /* x2d z1d | y1d x1d */
1202
1203     x1          = gmx_mm256_unpack128lo_pd(x2,z2);  /* y3a x3a | z2a y2a */
1204     z1          = gmx_mm256_unpack128hi_pd(x2,z2);  /* y3c x3c | z2c y2c */
1205
1206     x2          = gmx_mm256_unpack128lo_pd(y2,x3);  /* y3b x3b | z2b y2b */
1207     z2          = gmx_mm256_unpack128hi_pd(y2,x3);  /* y3d x3d | z2d y2d */
1208
1209     t1          = _mm256_sub_pd(t1,t10);
1210     t2          = _mm256_sub_pd(t2,t9);
1211     t3          = _mm256_sub_pd(t3,y3);
1212     t4          = _mm256_sub_pd(t4,y1);
1213     t5          = _mm256_sub_pd(t5,x1);
1214     t6          = _mm256_sub_pd(t6,x2);
1215     t7          = _mm256_sub_pd(t7,z1);
1216     t8          = _mm256_sub_pd(t8,z2);
1217
1218     tA          = _mm_sub_sd(tA, _mm256_castpd256_pd128(z3));
1219     tB          = _mm_sub_sd(tB, _mm_permute_pd(_mm256_castpd256_pd128(z3),_GMX_MM_PERMUTE128D(1,1)));
1220     tE          = _mm256_extractf128_pd(z3,0x1);
1221     tC          = _mm_sub_sd(tC, tE);
1222     tD          = _mm_sub_sd(tD, _mm_permute_pd(tE,_GMX_MM_PERMUTE128D(1,1)));
1223
1224     /* Here we store a full 256-bit value and a separate 64-bit one; no overlap can happen */
1225     _mm256_storeu_pd(ptrA,t1);
1226     _mm256_storeu_pd(ptrB,t2);
1227     _mm256_storeu_pd(ptrC,t3);
1228     _mm256_storeu_pd(ptrD,t4);
1229     _mm256_storeu_pd(ptrA+4,t5);
1230     _mm256_storeu_pd(ptrB+4,t6);
1231     _mm256_storeu_pd(ptrC+4,t7);
1232     _mm256_storeu_pd(ptrD+4,t8);
1233     _mm_store_sd(ptrA+8,tA);
1234     _mm_store_sd(ptrB+8,tB);
1235     _mm_store_sd(ptrC+8,tC);
1236     _mm_store_sd(ptrD+8,tD);
1237 }
1238
1239
1240 static void
1241 gmx_mm256_decrement_4rvec_4ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
1242                                           double * gmx_restrict ptrC, double * gmx_restrict ptrD,
1243                                           __m256d x1, __m256d y1, __m256d z1,
1244                                           __m256d x2, __m256d y2, __m256d z2,
1245                                           __m256d x3, __m256d y3, __m256d z3,
1246                                           __m256d x4, __m256d y4, __m256d z4)
1247 {
1248     __m256d t1,t2,t3,t4,t5,t6,t7,t8,t9,t10,t11,t12,t13,t14;
1249     __m128d tA,tB,tC,tD,tE;
1250
1251     t1          = _mm256_loadu_pd(ptrA);
1252     t2          = _mm256_loadu_pd(ptrB);
1253     t3          = _mm256_loadu_pd(ptrC);
1254     t4          = _mm256_loadu_pd(ptrD);
1255     t5          = _mm256_loadu_pd(ptrA+4);
1256     t6          = _mm256_loadu_pd(ptrB+4);
1257     t7          = _mm256_loadu_pd(ptrC+4);
1258     t8          = _mm256_loadu_pd(ptrD+4);
1259     t9          = _mm256_loadu_pd(ptrA+8);
1260     t10         = _mm256_loadu_pd(ptrB+8);
1261     t11         = _mm256_loadu_pd(ptrC+8);
1262     t12         = _mm256_loadu_pd(ptrD+8);
1263
1264     t13         = _mm256_unpacklo_pd(x1,y1); /* y1c x1c | y1a x1a */
1265     x1          = _mm256_unpackhi_pd(x1,y1); /* y1d x1d | y1b x1b */
1266     y1          = _mm256_unpacklo_pd(z1,x2); /* x2c z1c | x2a z1a */
1267     z1          = _mm256_unpackhi_pd(z1,x2); /* x2d z1d | x2b z1b */
1268     x2          = _mm256_unpacklo_pd(y2,z2); /* z2c y2c | z2a y2a */
1269     y2          = _mm256_unpackhi_pd(y2,z2); /* z2d y2d | z2b y2b */
1270     z2          = _mm256_unpacklo_pd(x3,y3); /* y3c x3c | y3a x3a */
1271     x3          = _mm256_unpackhi_pd(x3,y3); /* y3d x3d | y3b x3b */
1272     y3          = _mm256_unpacklo_pd(z3,x4); /* x4c z3c | x4a z3a */
1273     z3          = _mm256_unpackhi_pd(z3,x4); /* x4d z3d | x4b z3b */
1274     x4          = _mm256_unpacklo_pd(y4,z4); /* z4c y4c | z4a y4a */
1275     y4          = _mm256_unpackhi_pd(y4,z4); /* z4d y4d | z4b y4b */
1276
1277     z4          = gmx_mm256_unpack128lo_pd(t13,y1); /* x2a z1a | y1a x1a */
1278     t13         = gmx_mm256_unpack128hi_pd(t13,y1); /* x2c z1c | y1c x1c */
1279     y1          = gmx_mm256_unpack128lo_pd(x1,z1);  /* x2b z1b | y1b x1b */
1280     x1          = gmx_mm256_unpack128hi_pd(x1,z1);  /* x2d z1d | y1d x1d */
1281     z1          = gmx_mm256_unpack128lo_pd(x2,z2);  /* y3a x3a | z2a y2a */
1282     x2          = gmx_mm256_unpack128hi_pd(x2,z2);  /* y3c x3c | z2c y2c */
1283     z2          = gmx_mm256_unpack128lo_pd(y2,x3);  /* y3b x3b | z2b y2b */
1284     y2          = gmx_mm256_unpack128hi_pd(y2,x3);  /* y3d x3d | z2d y2d */
1285     x3          = gmx_mm256_unpack128lo_pd(y3,x4);  /* z4a y4a | x4a z3a */
1286     y3          = gmx_mm256_unpack128hi_pd(y3,x4);  /* z4c y4c | x4c z3c */
1287     x4          = gmx_mm256_unpack128lo_pd(z3,y4);  /* z4b y4b | x4b z3b */
1288     z3          = gmx_mm256_unpack128hi_pd(z3,y4);  /* z4d y4d | x4d z3d */
1289
1290     t1          = _mm256_sub_pd(t1,z4);
1291     t2          = _mm256_sub_pd(t2,y1);
1292     t3          = _mm256_sub_pd(t3,t13);
1293     t4          = _mm256_sub_pd(t4,x1);
1294     t5          = _mm256_sub_pd(t5,z1);
1295     t6          = _mm256_sub_pd(t6,z2);
1296     t7          = _mm256_sub_pd(t7,x2);
1297     t8          = _mm256_sub_pd(t8,y2);
1298     t9          = _mm256_sub_pd(t9,x3);
1299     t10         = _mm256_sub_pd(t10,x4);
1300     t11         = _mm256_sub_pd(t11,y3);
1301     t12         = _mm256_sub_pd(t12,z3);
1302
1303     /* Here we store a full 256-bit value and a separate 128-bit one; no overlap can happen */
1304     _mm256_storeu_pd(ptrA,t1);
1305     _mm256_storeu_pd(ptrB,t2);
1306     _mm256_storeu_pd(ptrC,t3);
1307     _mm256_storeu_pd(ptrD,t4);
1308     _mm256_storeu_pd(ptrA+4,t5);
1309     _mm256_storeu_pd(ptrB+4,t6);
1310     _mm256_storeu_pd(ptrC+4,t7);
1311     _mm256_storeu_pd(ptrD+4,t8);
1312     _mm256_storeu_pd(ptrA+8,t9);
1313     _mm256_storeu_pd(ptrB+8,t10);
1314     _mm256_storeu_pd(ptrC+8,t11);
1315     _mm256_storeu_pd(ptrD+8,t12);
1316 }
1317
1318
1319
1320
1321
1322 static gmx_inline void
1323 gmx_mm256_update_iforce_1atom_swizzle_pd(__m256d fix1, __m256d fiy1, __m256d fiz1,
1324                                          double * gmx_restrict fptr,
1325                                          double * gmx_restrict fshiftptr)
1326 {
1327     __m256d t1,t2;
1328     __m128d tA,tB;
1329     fix1 = _mm256_hadd_pd(fix1,fiy1);
1330     fiz1 = _mm256_hadd_pd(fiz1,_mm256_setzero_pd());
1331
1332     /* Add across the two lanes */
1333     tA   = _mm_add_pd(_mm256_castpd256_pd128(fix1),_mm256_extractf128_pd(fix1,0x1));
1334     tB   = _mm_add_pd(_mm256_castpd256_pd128(fiz1),_mm256_extractf128_pd(fiz1,0x1));
1335
1336     fix1 = gmx_mm256_set_m128(tB,tA); /* 0 fiz fiy fix */
1337
1338     t1   = _mm256_loadu_pd(fptr);
1339     t2   = _mm256_loadu_pd(fshiftptr);
1340
1341     t1   = _mm256_add_pd(t1,fix1);
1342     t2   = _mm256_add_pd(t2,fix1);
1343
1344     _mm256_storeu_pd(fptr,t1);
1345     _mm256_storeu_pd(fshiftptr,t2);
1346 }
1347
1348 static gmx_inline void
1349 gmx_mm256_update_iforce_2atom_swizzle_pd(__m256d fix1, __m256d fiy1, __m256d fiz1,
1350                                          __m256d fix2, __m256d fiy2, __m256d fiz2,
1351                                          double * gmx_restrict fptr,
1352                                          double * gmx_restrict fshiftptr)
1353 {
1354     __m256d t1,t2,t3;
1355     __m128d tA,tB,tC,tD,tE;
1356
1357     fix1 = _mm256_hadd_pd(fix1,fiy1);
1358     fiz1 = _mm256_hadd_pd(fiz1,fix2);
1359     fiy2 = _mm256_hadd_pd(fiy2,fiz2);
1360
1361     /* Add across the two lanes by swapping and adding back */
1362     tA   = _mm_add_pd(_mm256_castpd256_pd128(fix1),_mm256_extractf128_pd(fix1,0x1)); /* fiy1 fix1 */
1363     tB   = _mm_add_pd(_mm256_castpd256_pd128(fiz1),_mm256_extractf128_pd(fiz1,0x1)); /* fix2 fiz1 */
1364     tC   = _mm_add_pd(_mm256_castpd256_pd128(fiy2),_mm256_extractf128_pd(fiy2,0x1)); /* fiz2 fiy2 */
1365
1366     t1   = gmx_mm256_set_m128(tB,tA); /* fix2 fiz1 | fiy1 fix1 */
1367
1368     t2   = _mm256_loadu_pd(fptr);
1369     tD   = _mm_loadu_pd(fptr+4);
1370
1371     t2   = _mm256_add_pd(t2,t1);
1372     tD   = _mm_add_pd(tD,tC);
1373     _mm256_storeu_pd(fptr,t2);
1374     _mm_storeu_pd(fptr+4,tD);
1375
1376     /* Add up shift force */
1377     /* t1:  fix2 fiz1 | fiy1 fix1 */
1378     /* tC:              fiz2 fiy2 */
1379
1380     tA   = _mm256_extractf128_pd(t1,0x1); /* fix2 fiz1 */
1381     tB   = _mm_shuffle_pd(tA,tC,_MM_SHUFFLE2(0,1));   /* fiy2 fix2 */
1382     tC   = _mm_permute_pd(tC,_GMX_MM_PERMUTE128D(1,1));      /*  -   fiz2 */
1383
1384     tB   = _mm_add_pd(tB,_mm256_castpd256_pd128(t1));
1385     tC   = _mm_add_sd(tC,tA);
1386
1387     tD   = _mm_loadu_pd(fshiftptr);
1388     tE   = _mm_load_sd(fshiftptr+2);
1389
1390     tD   = _mm_add_pd(tD,tB);
1391     tE   = _mm_add_pd(tE,tC);
1392
1393     _mm_storeu_pd(fshiftptr,tD);
1394     _mm_store_sd(fshiftptr+2,tE);
1395 }
1396
1397
1398
1399 static gmx_inline void
1400 gmx_mm256_update_iforce_3atom_swizzle_pd(__m256d fix1, __m256d fiy1, __m256d fiz1,
1401                                          __m256d fix2, __m256d fiy2, __m256d fiz2,
1402                                          __m256d fix3, __m256d fiy3, __m256d fiz3,
1403                                          double * gmx_restrict fptr,
1404                                          double * gmx_restrict fshiftptr)
1405 {
1406     __m256d t1,t2,t3,t4;
1407     __m128d tz3,tA,tB,tC,tD;
1408
1409     fix1 = _mm256_hadd_pd(fix1,fiy1);                /*  Y1c-d X1c-d | Y1a-b X1a-b */
1410     fiz1 = _mm256_hadd_pd(fiz1,fix2);                /*  X2c-d Z1c-d | X2a-b Z1a-b */
1411     fiy2 = _mm256_hadd_pd(fiy2,fiz2);                /*  Z2c-d Y2c-d | Z2a-b Y2a-b */
1412     fix3 = _mm256_hadd_pd(fix3,fiy3);                /*  Y3c-d X3c-d | Y3a-b X3a-b */
1413     fiz3 = _mm256_hadd_pd(fiz3,_mm256_setzero_pd()); /*  0     Z3c-d | 0     Z3a-b */
1414
1415     /* Add across the two lanes by swapping and adding back */
1416     t1   = gmx_mm256_unpack128lo_pd(fix1,fiz1); /* X2a-b Z1a-b | Y1a-b X1a-b */
1417     t2   = gmx_mm256_unpack128hi_pd(fix1,fiz1); /* X2c-d Z1c-d | Y1c-d X1c-d */
1418     t1   = _mm256_add_pd(t1,t2); /* x2 z1 | y1 x1 */
1419
1420     t3   = gmx_mm256_unpack128lo_pd(fiy2,fix3); /* Y3a-b X3a-b | Z2a-b Y2a-b */
1421     t4   = gmx_mm256_unpack128hi_pd(fiy2,fix3); /* Y3c-d X3c-d | Z2c-d Y2c-d */
1422     t3   = _mm256_add_pd(t3,t4); /* y3 x3 | z2 y2 */
1423
1424     tz3  = _mm_add_pd(_mm256_castpd256_pd128(fiz3),_mm256_extractf128_pd(fiz3,0x1));  /* 0 z3 */
1425
1426     t2   = _mm256_loadu_pd(fptr);
1427     t4   = _mm256_loadu_pd(fptr+4);
1428     tA   = _mm_load_sd(fptr+8);
1429
1430     t2   = _mm256_add_pd(t2,t1);
1431     t4   = _mm256_add_pd(t4,t3);
1432     tA   = _mm_add_sd(tA,tz3);
1433
1434     _mm256_storeu_pd(fptr,t2);
1435     _mm256_storeu_pd(fptr+4,t4);
1436     _mm_store_sd(fptr+8,tA);
1437
1438     /* Add up shift force */
1439     /* t1:   x2 z1 | y1 x1 */
1440     /* t3:   y3 x3 | z2 y2 */
1441     /* tz3:           0 z3 */
1442
1443     /* z component */
1444     tB   = _mm256_extractf128_pd(t1,0x1);  /* x2 z1 */
1445     tC   = _mm256_extractf128_pd(t3,0x1);  /* y3 x3 */
1446     tz3  = _mm_add_sd(tz3,tB);             /* 0  z1+z3 */
1447     tD   = _mm_permute_pd(_mm256_castpd256_pd128(t3),_GMX_MM_PERMUTE128D(1,1));
1448     tz3  = _mm_add_sd(tz3,tD); /* - z */
1449
1450     tC   = _mm_add_pd(tC,_mm256_castpd256_pd128(t1));  /* y1+y3 x1+x3 */
1451
1452     tD   = _mm_shuffle_pd(tB,_mm256_castpd256_pd128(t3),_MM_SHUFFLE2(0,1)); /* y2 x2 */
1453     tC   = _mm_add_pd(tC,tD); /* y x */
1454
1455     tA   = _mm_loadu_pd(fshiftptr);
1456     tB   = _mm_load_sd(fshiftptr+2);
1457     tA   = _mm_add_pd(tA,tC);
1458     tB   = _mm_add_sd(tB,tz3);
1459     _mm_storeu_pd(fshiftptr,tA);
1460     _mm_store_sd(fshiftptr+2,tB);
1461 }
1462
1463
1464 static gmx_inline void
1465 gmx_mm256_update_iforce_4atom_swizzle_pd(__m256d fix1, __m256d fiy1, __m256d fiz1,
1466                                          __m256d fix2, __m256d fiy2, __m256d fiz2,
1467                                          __m256d fix3, __m256d fiy3, __m256d fiz3,
1468                                          __m256d fix4, __m256d fiy4, __m256d fiz4,
1469                                          double * gmx_restrict fptr,
1470                                          double * gmx_restrict fshiftptr)
1471 {
1472     __m256d t1,t2,t3,t4,t5,t6;
1473     __m128d tA,tB,tC,tD;
1474
1475     fix1 = _mm256_hadd_pd(fix1,fiy1);                /*  Y1c-d X1c-d | Y1a-b X1a-b */
1476     fiz1 = _mm256_hadd_pd(fiz1,fix2);                /*  X2c-d Z1c-d | X2a-b Z1a-b */
1477     fiy2 = _mm256_hadd_pd(fiy2,fiz2);                /*  Z2c-d Y2c-d | Z2a-b Y2a-b */
1478     fix3 = _mm256_hadd_pd(fix3,fiy3);                /*  Y3c-d X3c-d | Y3a-b X3a-b */
1479     fiz3 = _mm256_hadd_pd(fiz3,fix4);                /*  X4c-d Z3c-d | X4a-b Z3a-b */
1480     fiy4 = _mm256_hadd_pd(fiy4,fiz4);                /*  Z4c-d Y4c-d | Z4a-b Y4a-b */
1481
1482     /* Add across the two lanes by swapping and adding back */
1483     t1   = gmx_mm256_unpack128lo_pd(fix1,fiz1); /* X2a-b Z1a-b | Y1a-b X1a-b */
1484     t2   = gmx_mm256_unpack128hi_pd(fix1,fiz1); /* X2c-d Z1c-d | Y1c-d X1c-d */
1485     t1   = _mm256_add_pd(t1,t2); /* x2 z1 | y1 x1 */
1486
1487     t3   = gmx_mm256_unpack128lo_pd(fiy2,fix3); /* Y3a-b X3a-b | Z2a-b Y2a-b */
1488     t4   = gmx_mm256_unpack128hi_pd(fiy2,fix3); /* Y3c-d X3c-d | Z2c-d Y2c-d */
1489     t3   = _mm256_add_pd(t3,t4); /* y3 x3 | z2 y2 */
1490
1491     t5   = gmx_mm256_unpack128lo_pd(fiz3,fiy4); /* Z4a-b Y4a-b | X4a-b Z3a-b */
1492     t6   = gmx_mm256_unpack128hi_pd(fiz3,fiy4); /* Z4c-d Y4c-d | X4c-d Z3c-d */
1493     t5   = _mm256_add_pd(t5,t6); /* z4 y4 | x4 z3 */
1494
1495     t2   = _mm256_loadu_pd(fptr);
1496     t4   = _mm256_loadu_pd(fptr+4);
1497     t6   = _mm256_loadu_pd(fptr+8);
1498
1499     t2   = _mm256_add_pd(t2,t1);
1500     t4   = _mm256_add_pd(t4,t3);
1501     t6   = _mm256_add_pd(t6,t5);
1502
1503     _mm256_storeu_pd(fptr,t2);
1504     _mm256_storeu_pd(fptr+4,t4);
1505     _mm256_storeu_pd(fptr+8,t6);
1506
1507     /* Add up shift force  */
1508     /* t1:   x2. z1. | y1. x1. */
1509     /* t3:   y3. x3. | z2 y2 */
1510     /* t5:   z4 y4 | x4. z3. */
1511
1512     /* z component */
1513     tA   = _mm256_extractf128_pd(t1,0x1);  /* x2 z1 */
1514     tB   = _mm256_extractf128_pd(t3,0x1);  /* y3 x3 */
1515     tC   = _mm256_extractf128_pd(t5,0x1);  /* z4 y4 */
1516
1517     tB   = _mm_add_pd(tB,_mm256_castpd256_pd128(t1));  /*  y1+y3  x1+x3 */
1518     tA   = _mm_add_pd(tA,_mm256_castpd256_pd128(t5));  /*  x2+x4  z1+z3 */
1519     tC   = _mm_add_pd(tC,_mm256_castpd256_pd128(t3));  /*  z4+z2  y4+y2 */
1520
1521     tD   = _mm_shuffle_pd(tA,tC,_MM_SHUFFLE2(0,1)); /* y4+y2 x2+x4 */
1522     tB   = _mm_add_pd(tB,tD); /* y x */
1523     tC   = _mm_permute_pd(tC,_GMX_MM_PERMUTE128D(1,1)); /*    - z4+z2 */
1524     tC   = _mm_add_sd(tC,tA); /* - z */
1525
1526     tA   = _mm_loadu_pd(fshiftptr);
1527     tD   = _mm_load_sd(fshiftptr+2);
1528     tA   = _mm_add_pd(tA,tB);
1529     tD   = _mm_add_sd(tD,tC);
1530     _mm_storeu_pd(fshiftptr,tA);
1531     _mm_store_sd(fshiftptr+2,tD);
1532 }
1533
1534
1535
1536 static void
1537 gmx_mm256_update_1pot_pd(__m256d pot1, double * gmx_restrict ptrA)
1538 {
1539     __m128d t1;
1540
1541     pot1 = _mm256_hadd_pd(pot1,pot1);
1542
1543     t1   = _mm_add_pd(_mm256_castpd256_pd128(pot1),_mm256_extractf128_pd(pot1,0x1));
1544
1545     _mm_store_sd(ptrA,_mm_add_sd(_mm_load_sd(ptrA),t1));
1546 }
1547
1548 static void
1549 gmx_mm256_update_2pot_pd(__m256d pot1, double * gmx_restrict ptrA,
1550                       __m256d pot2, double * gmx_restrict ptrB)
1551 {
1552     __m128d t1,t2;
1553
1554     pot1 = _mm256_hadd_pd(pot1,pot2);
1555
1556     t1   = _mm_add_pd(_mm256_castpd256_pd128(pot1),_mm256_extractf128_pd(pot1,0x1));
1557
1558     t2   = _mm_permute_pd(t1,_GMX_MM_PERMUTE128D(1,1));
1559     _mm_store_sd(ptrA,_mm_add_sd(_mm_load_sd(ptrA),t1));
1560     _mm_store_sd(ptrB,_mm_add_sd(_mm_load_sd(ptrB),t2));
1561 }
1562
1563
1564 static void
1565 gmx_mm256_update_4pot_pd(__m256d pot1, double * gmx_restrict ptrA,
1566                          __m256d pot2, double * gmx_restrict ptrB,
1567                          __m256d pot3, double * gmx_restrict ptrC,
1568                          __m256d pot4, double * gmx_restrict ptrD)
1569 {
1570     __m256d t1,t2,t3,t4;
1571     __m128d tA,tB,tC,tD,tE,tF,tG,tH;
1572
1573     tA   = _mm_load_sd(ptrA);
1574     tB   = _mm_load_sd(ptrB);
1575     tC   = _mm_load_sd(ptrC);
1576     tD   = _mm_load_sd(ptrD);
1577
1578     /* do a transpose */
1579     t1   = _mm256_unpacklo_pd(pot1, pot2);   /* p2c p1c | p2a p1a */
1580     t2   = _mm256_unpackhi_pd(pot1, pot2);   /* p2d p1d | p2b p1b */
1581     t3   = _mm256_unpacklo_pd(pot3, pot4);   /* p4c p3c | p4a p3a */
1582     t4   = _mm256_unpackhi_pd(pot3, pot4);   /* p4d p3d | p4b p3b */
1583     pot1 = _mm256_permute2f128_pd(t1, t3, 0x20);   /* p4a p3a | p2a p1a */
1584     pot2 = _mm256_permute2f128_pd(t2, t4, 0x20);   /* p4b p3b | p2b p1b */
1585     pot3 = _mm256_permute2f128_pd(t1, t3, 0x31);   /* p4c p3c | p2c p1c */
1586     pot4 = _mm256_permute2f128_pd(t2, t4, 0x31);   /* p4d p3d | p2d p1d */
1587
1588     pot1 = _mm256_add_pd(pot1,pot2);
1589     pot3 = _mm256_add_pd(pot3,pot4);
1590     pot1 = _mm256_add_pd(pot1,pot3);  /* Sum in the four elements */
1591
1592     tE   = _mm256_castpd256_pd128(pot1);
1593     tF   = _mm_permute_pd(tE,_GMX_MM_PERMUTE128D(1,1));
1594     tG   = _mm256_extractf128_pd(pot1,0x1);
1595     tH   = _mm_permute_pd(tG,_GMX_MM_PERMUTE128D(1,1));
1596
1597     tA   = _mm_add_sd(tA,tE);
1598     tB   = _mm_add_sd(tB,tF);
1599     tC   = _mm_add_sd(tC,tG);
1600     tD   = _mm_add_sd(tD,tH);
1601
1602         _mm_store_sd(ptrA,tA);
1603         _mm_store_sd(ptrB,tB);
1604         _mm_store_sd(ptrC,tC);
1605         _mm_store_sd(ptrD,tD);
1606 }
1607
1608
1609 #endif /* _kernelutil_x86_avx_256_double_h_ */