src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_double/kernelutil_x86_avx_256_double.h

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35 #ifndef _kernelutil_x86_avx_256_double_h_
  36 #define _kernelutil_x86_avx_256_double_h_
  37
  38 #include "config.h"
  39
  40 #define gmx_mm_castsi128_ps(a) _mm_castsi128_ps(a)
  41
  42 #define _GMX_MM_BLEND256D(b3, b2, b1, b0) (((b3) << 3) | ((b2) << 2) | ((b1) << 1) | ((b0)))
  43 #define _GMX_MM_PERMUTE(fp3, fp2, fp1, fp0) (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | ((fp0)))
  44 #define _GMX_MM_PERMUTE128D(fp1, fp0)         (((fp1) << 1) | ((fp0)))
  45 #define _GMX_MM_PERMUTE256D(fp3, fp2, fp1, fp0) (((fp3) << 3) | ((fp2) << 2) | ((fp1) << 1) | ((fp0)))
  46 #define GMX_MM256_FULLTRANSPOSE4_PD(row0, row1, row2, row3) \
  47     {                                                        \
  48         __m256d _t0, _t1, _t2, _t3;                          \
  49         _t0  = _mm256_unpacklo_pd((row0), (row1));           \
  50         _t1  = _mm256_unpackhi_pd((row0), (row1));           \
  51         _t2  = _mm256_unpacklo_pd((row2), (row3));           \
  52         _t3  = _mm256_unpackhi_pd((row2), (row3));           \
  53         row0 = _mm256_permute2f128_pd(_t0, _t2, 0x20);       \
  54         row1 = _mm256_permute2f128_pd(_t1, _t3, 0x20);       \
  55         row2 = _mm256_permute2f128_pd(_t0, _t2, 0x31);       \
  56         row3 = _mm256_permute2f128_pd(_t1, _t3, 0x31);       \
  57     }
  58
  59 #define gmx_mm_extract_epi32(x, imm) _mm_extract_epi32((x), (imm))
  60
  61 static gmx_inline __m256d gmx_simdcall
  62 gmx_mm256_unpack128lo_pd(__m256d xmm1, __m256d xmm2)
  63 {
  64     return _mm256_permute2f128_pd(xmm1, xmm2, 0x20);
  65 }
  66
  67 static gmx_inline __m256d gmx_simdcall
  68 gmx_mm256_unpack128hi_pd(__m256d xmm1, __m256d xmm2)
  69 {
  70     return _mm256_permute2f128_pd(xmm1, xmm2, 0x31);
  71 }
  72
  73 static gmx_inline __m256d gmx_simdcall
  74 gmx_mm256_set_m128d(__m128d hi, __m128d lo)
  75 {
  76     return _mm256_insertf128_pd(_mm256_castpd128_pd256(lo), hi, 0x1);
  77 }
  78
  79 static gmx_inline __m256 gmx_simdcall
  80 gmx_mm256_set_m128(__m128 hi, __m128 lo)
  81 {
  82     return _mm256_insertf128_ps(_mm256_castps128_ps256(lo), hi, 0x1);
  83 }
  84
  85 static gmx_inline int gmx_simdcall
  86 gmx_mm256_any_lt(__m256d a, __m256d b)
  87 {
  88     return _mm256_movemask_pd(_mm256_cmp_pd(a, b, _CMP_LT_OQ));
  89 }
  90
  91 static gmx_inline __m256d gmx_simdcall
  92 gmx_mm256_calc_rsq_pd(__m256d dx, __m256d dy, __m256d dz)
  93 {
  94     return _mm256_add_pd( _mm256_add_pd( _mm256_mul_pd(dx, dx), _mm256_mul_pd(dy, dy) ), _mm256_mul_pd(dz, dz) );
  95 }
  96
  97 /* Normal sum of four ymm registers */
  98 #define gmx_mm256_sum4_pd(t0, t1, t2, t3)  _mm256_add_pd(_mm256_add_pd(t0, t1), _mm256_add_pd(t2, t3))
  99
 100
/* Load a single value from each of 1-4 places and merge them into one ymm register */
 102 static gmx_inline __m256d gmx_simdcall
 103 gmx_mm256_load_1real_pd(const double * gmx_restrict ptrA)
 104 {
 105     return _mm256_castpd128_pd256(_mm_load_sd(ptrA));
 106 }
 107
 108 static gmx_inline __m256d gmx_simdcall
 109 gmx_mm256_load_2real_swizzle_pd(const double * gmx_restrict ptrA,
 110                                 const double * gmx_restrict ptrB)
 111 {
 112     __m128d tA, tB;
 113
 114     tA = _mm_load_sd(ptrA);
 115     tB = _mm_load_sd(ptrB);
 116
 117     return _mm256_castpd128_pd256(_mm_unpacklo_pd(tA, tB));
 118 }
 119
 120
 121 static gmx_inline __m256d gmx_simdcall
 122 gmx_mm256_load_4real_swizzle_pd(const double * gmx_restrict ptrA, const double * gmx_restrict ptrB,
 123                                 const double * gmx_restrict ptrC, const double * gmx_restrict ptrD)
 124 {
 125     __m128d t1, t2;
 126
 127     t1 = _mm_unpacklo_pd(_mm_load_sd(ptrA), _mm_load_sd(ptrB));
 128     t2 = _mm_unpacklo_pd(_mm_load_sd(ptrC), _mm_load_sd(ptrD));
 129     return gmx_mm256_set_m128d(t2, t1);
 130 }
 131
 132
 133
 134 static gmx_inline void gmx_simdcall
 135 gmx_mm256_store_1real_pd(double * gmx_restrict ptrA, __m256d xmm1)
 136 {
 137     _mm_store_sd(ptrA, _mm256_castpd256_pd128(xmm1));
 138 }
 139
 140
 141 static gmx_inline void gmx_simdcall
 142 gmx_mm256_store_2real_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB, __m256d xmm1)
 143 {
 144     __m256d t2;
 145
 146     t2       = _mm256_permute_pd(xmm1, _GMX_MM_PERMUTE256D(1, 1, 1, 1));
 147     _mm_store_sd(ptrA, _mm256_castpd256_pd128(xmm1));
 148     _mm_store_sd(ptrB, _mm256_castpd256_pd128(t2));
 149 }
 150
 151
 152
 153
 154 static gmx_inline void gmx_simdcall
 155 gmx_mm256_store_4real_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
 156                                  double * gmx_restrict ptrC, double * gmx_restrict ptrD, __m256d xmm1)
 157 {
 158     __m256d t2;
 159     __m128d t3, t4;
 160
 161     t2       = _mm256_permute_pd(xmm1, _GMX_MM_PERMUTE256D(1, 1, 1, 1));
 162     t3       = _mm256_extractf128_pd(xmm1, 0x1);
 163     t4       = _mm_permute_pd(t3, _GMX_MM_PERMUTE128D(1, 1));
 164     _mm_store_sd(ptrA, _mm256_castpd256_pd128(xmm1));
 165     _mm_store_sd(ptrB, _mm256_castpd256_pd128(t2));
 166     _mm_store_sd(ptrC, t3);
 167     _mm_store_sd(ptrD, t4);
 168 }
 169
 170
 171
 172
 173 static gmx_inline void gmx_simdcall
 174 gmx_mm256_increment_1real_pd(double * gmx_restrict ptrA, __m256d xmm1)
 175 {
 176     __m128d t1;
 177
 178     t1   = _mm256_castpd256_pd128(xmm1);
 179     t1   = _mm_add_sd(t1, _mm_load_sd(ptrA));
 180
 181     _mm_store_sd(ptrA, t1);
 182 }
 183
 184
 185 static gmx_inline void gmx_simdcall
 186 gmx_mm256_increment_2real_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB, __m256d xmm1)
 187 {
 188     __m128d t1, t2;
 189
 190     t1   = _mm256_castpd256_pd128(xmm1);
 191     t2   = _mm_permute_pd(t1, _GMX_MM_PERMUTE128D(1, 1));
 192
 193     t1   = _mm_add_sd(t1, _mm_load_sd(ptrA));
 194     t2   = _mm_add_sd(t2, _mm_load_sd(ptrB));
 195
 196     _mm_store_sd(ptrA, t1);
 197     _mm_store_sd(ptrB, t2);
 198 }
 199
 200
 201 static gmx_inline void gmx_simdcall
 202 gmx_mm256_increment_4real_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
 203                                      double * gmx_restrict ptrC, double * gmx_restrict ptrD, __m256d xmm1)
 204 {
 205     __m128d t1, t2, t3, t4;
 206
 207     t1   = _mm256_castpd256_pd128(xmm1);
 208     t2   = _mm_permute_pd(t1, _GMX_MM_PERMUTE128D(1, 1));
 209     t3   = _mm256_extractf128_pd(xmm1, 0x1);
 210     t4   = _mm_permute_pd(t3, _GMX_MM_PERMUTE128D(1, 1));
 211
 212     t1   = _mm_add_sd(t1, _mm_load_sd(ptrA));
 213     t2   = _mm_add_sd(t2, _mm_load_sd(ptrB));
 214     t3   = _mm_add_sd(t3, _mm_load_sd(ptrC));
 215     t4   = _mm_add_sd(t4, _mm_load_sd(ptrD));
 216
 217     _mm_store_sd(ptrA, t1);
 218     _mm_store_sd(ptrB, t2);
 219     _mm_store_sd(ptrC, t3);
 220     _mm_store_sd(ptrD, t4);
 221 }
 222
 223
 224
 225 static gmx_inline void gmx_simdcall
 226 gmx_mm256_load_1pair_swizzle_pd(const double * gmx_restrict p1, __m256d *c6, __m256d *c12)
 227 {
 228     *c6     = _mm256_castpd128_pd256(_mm_load_sd(p1));
 229     *c12    = _mm256_castpd128_pd256(_mm_load_sd(p1+1));
 230 }
 231
 232
 233 static gmx_inline void gmx_simdcall
 234 gmx_mm256_load_2pair_swizzle_pd(const double * gmx_restrict p1, const double * gmx_restrict p2, __m256d *c6, __m256d *c12)
 235 {
 236     __m128d t1, t2, t3;
 237
 238     t1   = _mm_loadu_pd(p1);
 239     t2   = _mm_loadu_pd(p2);
 240     *c6  = _mm256_castpd128_pd256(_mm_unpacklo_pd(t1, t2));
 241     *c12 = _mm256_castpd128_pd256(_mm_unpackhi_pd(t1, t2));
 242 }
 243
 244
 245
 246 static gmx_inline void gmx_simdcall
 247 gmx_mm256_load_4pair_swizzle_pd(const double * gmx_restrict p1, const double * gmx_restrict p2,
 248                                 const double * gmx_restrict p3, const double * gmx_restrict p4,
 249                                 __m256d * gmx_restrict c6, __m256d * gmx_restrict c12)
 250 {
 251     __m256d t1, t2;
 252
 253     t1   = gmx_mm256_set_m128d(_mm_loadu_pd(p3), _mm_loadu_pd(p1)); /* c12c  c6c | c12a  c6a */
 254     t2   = gmx_mm256_set_m128d(_mm_loadu_pd(p4), _mm_loadu_pd(p2)); /* c12d  c6d | c12b  c6b */
 255
 256     *c6  = _mm256_unpacklo_pd(t1, t2);                              /* c6d c6c | c6b c6a */
 257     *c12 = _mm256_unpackhi_pd(t1, t2);                              /* c12d c12c | c12b c12a */
 258 }
 259
 260
 261 static gmx_inline void gmx_simdcall
 262 gmx_mm256_load_shift_and_1rvec_broadcast_pd(const double * gmx_restrict xyz_shift,
 263                                             const double * gmx_restrict xyz,
 264                                             __m256d * gmx_restrict      x1,
 265                                             __m256d * gmx_restrict      y1,
 266                                             __m256d * gmx_restrict      z1)
 267 {
 268     __m128d mem_xy, mem_z, mem_sxy, mem_sz, tx, ty, tz;
 269
 270     mem_xy  = _mm_loadu_pd(xyz);
 271     mem_z   = _mm_load_sd(xyz+2);
 272     mem_sxy = _mm_loadu_pd(xyz_shift);
 273     mem_sz  = _mm_load_sd(xyz_shift+2);
 274
 275     mem_xy  = _mm_add_pd(mem_xy, mem_sxy);
 276     mem_z   = _mm_add_pd(mem_z, mem_sz);
 277
 278     tx  = _mm_shuffle_pd(mem_xy, mem_xy, _MM_SHUFFLE2(0, 0));
 279     ty  = _mm_shuffle_pd(mem_xy, mem_xy, _MM_SHUFFLE2(1, 1));
 280     tz  = _mm_shuffle_pd(mem_z, mem_z, _MM_SHUFFLE2(0, 0));
 281
 282     *x1 = gmx_mm256_set_m128d(tx, tx);
 283     *y1 = gmx_mm256_set_m128d(ty, ty);
 284     *z1 = gmx_mm256_set_m128d(tz, tz);
 285 }
 286
 287
 288 static gmx_inline void gmx_simdcall
 289 gmx_mm256_load_shift_and_3rvec_broadcast_pd(const double * gmx_restrict xyz_shift,
 290                                             const double * gmx_restrict xyz,
 291                                             __m256d * gmx_restrict x1, __m256d * gmx_restrict y1, __m256d * gmx_restrict z1,
 292                                             __m256d * gmx_restrict x2, __m256d * gmx_restrict y2, __m256d * gmx_restrict z2,
 293                                             __m256d * gmx_restrict x3, __m256d * gmx_restrict y3, __m256d * gmx_restrict z3)
 294 {
 295     __m128d t1, t2, t3, t4, t5, sxy, sz, szx, syz, tx, ty, tz;
 296
 297     t1  = _mm_loadu_pd(xyz);
 298     t2  = _mm_loadu_pd(xyz+2);
 299     t3  = _mm_loadu_pd(xyz+4);
 300     t4  = _mm_loadu_pd(xyz+6);
 301     t5  = _mm_load_sd(xyz+8);
 302
 303     sxy = _mm_loadu_pd(xyz_shift);
 304     sz  = _mm_load_sd(xyz_shift+2);
 305     szx = _mm_shuffle_pd(sz, sxy, _MM_SHUFFLE2(0, 0));
 306     syz = _mm_shuffle_pd(sxy, sz, _MM_SHUFFLE2(0, 1));
 307
 308     t1  = _mm_add_pd(t1, sxy);
 309     t2  = _mm_add_pd(t2, szx);
 310     t3  = _mm_add_pd(t3, syz);
 311     t4  = _mm_add_pd(t4, sxy);
 312     t5  = _mm_add_sd(t5, sz);
 313
 314     tx   = _mm_shuffle_pd(t1, t1, _MM_SHUFFLE2(0, 0));
 315     ty   = _mm_shuffle_pd(t1, t1, _MM_SHUFFLE2(1, 1));
 316     tz   = _mm_shuffle_pd(t2, t2, _MM_SHUFFLE2(0, 0));
 317     *x1  = gmx_mm256_set_m128d(tx, tx);
 318     *y1  = gmx_mm256_set_m128d(ty, ty);
 319     *z1  = gmx_mm256_set_m128d(tz, tz);
 320     tx   = _mm_shuffle_pd(t2, t2, _MM_SHUFFLE2(1, 1));
 321     ty   = _mm_shuffle_pd(t3, t3, _MM_SHUFFLE2(0, 0));
 322     tz   = _mm_shuffle_pd(t3, t3, _MM_SHUFFLE2(1, 1));
 323     *x2  = gmx_mm256_set_m128d(tx, tx);
 324     *y2  = gmx_mm256_set_m128d(ty, ty);
 325     *z2  = gmx_mm256_set_m128d(tz, tz);
 326     tx   = _mm_shuffle_pd(t4, t4, _MM_SHUFFLE2(0, 0));
 327     ty   = _mm_shuffle_pd(t4, t4, _MM_SHUFFLE2(1, 1));
 328     tz   = _mm_shuffle_pd(t5, t5, _MM_SHUFFLE2(0, 0));
 329     *x3  = gmx_mm256_set_m128d(tx, tx);
 330     *y3  = gmx_mm256_set_m128d(ty, ty);
 331     *z3  = gmx_mm256_set_m128d(tz, tz);
 332 }
 333
 334
 335 static gmx_inline void gmx_simdcall
 336 gmx_mm256_load_shift_and_4rvec_broadcast_pd(const double * gmx_restrict xyz_shift,
 337                                             const double * gmx_restrict xyz,
 338                                             __m256d * gmx_restrict x1, __m256d * gmx_restrict y1, __m256d * gmx_restrict z1,
 339                                             __m256d * gmx_restrict x2, __m256d * gmx_restrict y2, __m256d * gmx_restrict z2,
 340                                             __m256d * gmx_restrict x3, __m256d * gmx_restrict y3, __m256d * gmx_restrict z3,
 341                                             __m256d * gmx_restrict x4, __m256d * gmx_restrict y4, __m256d * gmx_restrict z4)
 342 {
 343     __m128d t1, t2, t3, t4, t5, t6, sxy, sz, szx, syz, tx, ty, tz;
 344
 345     t1  = _mm_loadu_pd(xyz);
 346     t2  = _mm_loadu_pd(xyz+2);
 347     t3  = _mm_loadu_pd(xyz+4);
 348     t4  = _mm_loadu_pd(xyz+6);
 349     t5  = _mm_loadu_pd(xyz+8);
 350     t6  = _mm_loadu_pd(xyz+10);
 351
 352     sxy = _mm_loadu_pd(xyz_shift);
 353     sz  = _mm_load_sd(xyz_shift+2);
 354     szx = _mm_shuffle_pd(sz, sxy, _MM_SHUFFLE2(0, 0));
 355     syz = _mm_shuffle_pd(sxy, sz, _MM_SHUFFLE2(0, 1));
 356
 357     t1  = _mm_add_pd(t1, sxy);
 358     t2  = _mm_add_pd(t2, szx);
 359     t3  = _mm_add_pd(t3, syz);
 360     t4  = _mm_add_pd(t4, sxy);
 361     t5  = _mm_add_pd(t5, szx);
 362     t6  = _mm_add_pd(t6, syz);
 363
 364     tx   = _mm_shuffle_pd(t1, t1, _MM_SHUFFLE2(0, 0));
 365     ty   = _mm_shuffle_pd(t1, t1, _MM_SHUFFLE2(1, 1));
 366     tz   = _mm_shuffle_pd(t2, t2, _MM_SHUFFLE2(0, 0));
 367     *x1  = gmx_mm256_set_m128d(tx, tx);
 368     *y1  = gmx_mm256_set_m128d(ty, ty);
 369     *z1  = gmx_mm256_set_m128d(tz, tz);
 370     tx   = _mm_shuffle_pd(t2, t2, _MM_SHUFFLE2(1, 1));
 371     ty   = _mm_shuffle_pd(t3, t3, _MM_SHUFFLE2(0, 0));
 372     tz   = _mm_shuffle_pd(t3, t3, _MM_SHUFFLE2(1, 1));
 373     *x2  = gmx_mm256_set_m128d(tx, tx);
 374     *y2  = gmx_mm256_set_m128d(ty, ty);
 375     *z2  = gmx_mm256_set_m128d(tz, tz);
 376     tx   = _mm_shuffle_pd(t4, t4, _MM_SHUFFLE2(0, 0));
 377     ty   = _mm_shuffle_pd(t4, t4, _MM_SHUFFLE2(1, 1));
 378     tz   = _mm_shuffle_pd(t5, t5, _MM_SHUFFLE2(0, 0));
 379     *x3  = gmx_mm256_set_m128d(tx, tx);
 380     *y3  = gmx_mm256_set_m128d(ty, ty);
 381     *z3  = gmx_mm256_set_m128d(tz, tz);
 382     tx   = _mm_shuffle_pd(t5, t5, _MM_SHUFFLE2(1, 1));
 383     ty   = _mm_shuffle_pd(t6, t6, _MM_SHUFFLE2(0, 0));
 384     tz   = _mm_shuffle_pd(t6, t6, _MM_SHUFFLE2(1, 1));
 385     *x4  = gmx_mm256_set_m128d(tx, tx);
 386     *y4  = gmx_mm256_set_m128d(ty, ty);
 387     *z4  = gmx_mm256_set_m128d(tz, tz);
 388 }
 389
 390
 391 static gmx_inline void gmx_simdcall
 392 gmx_mm256_load_1rvec_1ptr_swizzle_pd(const double * gmx_restrict p1,
 393                                      __m256d * gmx_restrict x, __m256d * gmx_restrict y, __m256d * gmx_restrict z)
 394 {
 395     __m256d t1;
 396
 397     t1            = _mm256_loadu_pd(p1);
 398     *x            = t1;
 399     *y            = _mm256_permute_pd(t1, _GMX_MM_PERMUTE256D(0, 1, 0, 1));
 400     *z            = _mm256_castpd128_pd256(_mm256_extractf128_pd(t1, 0x1));
 401 }
 402
 403
 404 static gmx_inline void gmx_simdcall
 405 gmx_mm256_load_3rvec_1ptr_swizzle_pd(const double * gmx_restrict p1,
 406                                      __m256d * gmx_restrict x1, __m256d * gmx_restrict y1, __m256d * gmx_restrict z1,
 407                                      __m256d * gmx_restrict x2, __m256d * gmx_restrict y2, __m256d * gmx_restrict z2,
 408                                      __m256d * gmx_restrict x3, __m256d * gmx_restrict y3, __m256d * gmx_restrict z3)
 409 {
 410     __m256d t1, t2, t3, t4;
 411
 412     t1            = _mm256_loadu_pd(p1);
 413     t3            = _mm256_loadu_pd(p1+4);
 414     *x1           = t1;
 415     *y2           = t3;
 416     t2            = gmx_mm256_unpack128hi_pd(t1, t1);
 417     t4            = gmx_mm256_unpack128hi_pd(t3, t3);
 418     *z1           = t2;
 419     *x3           = t4;
 420     *y1           = _mm256_permute_pd(t1, _GMX_MM_PERMUTE256D(0, 1, 0, 1));
 421     *z2           = _mm256_permute_pd(t3, _GMX_MM_PERMUTE256D(0, 1, 0, 1));
 422     *x2           = _mm256_permute_pd(t2, _GMX_MM_PERMUTE256D(0, 1, 0, 1));
 423     *y3           = _mm256_permute_pd(t4, _GMX_MM_PERMUTE256D(0, 1, 0, 1));
 424     *z3           = _mm256_castpd128_pd256(_mm_load_sd(p1+8));
 425 }
 426
 427 static gmx_inline void gmx_simdcall
 428 gmx_mm256_load_4rvec_1ptr_swizzle_pd(const double * gmx_restrict p1,
 429                                      __m256d * gmx_restrict x1, __m256d * gmx_restrict y1, __m256d * gmx_restrict z1,
 430                                      __m256d * gmx_restrict x2, __m256d * gmx_restrict y2, __m256d * gmx_restrict z2,
 431                                      __m256d * gmx_restrict x3, __m256d * gmx_restrict y3, __m256d * gmx_restrict z3,
 432                                      __m256d * gmx_restrict x4, __m256d * gmx_restrict y4, __m256d * gmx_restrict z4)
 433 {
 434     __m256d t1, t2, t3, t4, t5, t6;
 435
 436     t1            = _mm256_loadu_pd(p1);
 437     t2            = _mm256_loadu_pd(p1+4);
 438     t3            = _mm256_loadu_pd(p1+8);
 439
 440     t4            = _mm256_castpd128_pd256(_mm256_extractf128_pd(t1, 0x1));
 441     t5            = _mm256_castpd128_pd256(_mm256_extractf128_pd(t2, 0x1));
 442     t6            = _mm256_castpd128_pd256(_mm256_extractf128_pd(t3, 0x1));
 443
 444     *x1           = t1;
 445     *y2           = t2;
 446     *z3           = t3;
 447     *z1           = t4;
 448     *x3           = t5;
 449     *y4           = t6;
 450
 451     *y1           = _mm256_permute_pd(t1, _GMX_MM_PERMUTE256D(0, 1, 0, 1));
 452     *z2           = _mm256_permute_pd(t2, _GMX_MM_PERMUTE256D(0, 1, 0, 1));
 453     *x4           = _mm256_permute_pd(t3, _GMX_MM_PERMUTE256D(0, 1, 0, 1));
 454     *x2           = _mm256_permute_pd(t4, _GMX_MM_PERMUTE256D(0, 1, 0, 1));
 455     *y3           = _mm256_permute_pd(t5, _GMX_MM_PERMUTE256D(0, 1, 0, 1));
 456     *z4           = _mm256_permute_pd(t6, _GMX_MM_PERMUTE256D(0, 1, 0, 1));
 457 }
 458
 459
 460 static gmx_inline void gmx_simdcall
 461 gmx_mm256_load_1rvec_4ptr_swizzle_pd(const double * gmx_restrict ptrA, const double * gmx_restrict ptrB,
 462                                      const double * gmx_restrict ptrC, const double * gmx_restrict ptrD,
 463                                      __m256d * gmx_restrict x1, __m256d * gmx_restrict y1, __m256d * gmx_restrict z1)
 464 {
 465     __m256d t1, t2, t3, t4, t5, t6;
 466
 467     t1           = _mm256_loadu_pd(ptrA);        /*   -  z1a | y1a x1a */
 468     t2           = _mm256_loadu_pd(ptrB);        /*   -  z1b | y1b x1b */
 469     t3           = _mm256_loadu_pd(ptrC);        /*   -  z1c | y1c x1c */
 470     t4           = _mm256_loadu_pd(ptrD);        /*   -  z1d | y1d x1d */
 471
 472     t5           = _mm256_unpacklo_pd(t1, t2);   /*  z1b z1a | x1b x1a */
 473     t6           = _mm256_unpackhi_pd(t1, t2);   /*   -   -  | y1b y1a */
 474     t1           = _mm256_unpacklo_pd(t3, t4);   /*  z1c z1c | x1d x1c */
 475     t2           = _mm256_unpackhi_pd(t3, t4);   /*   -   -  | y1d y1c */
 476
 477     *x1          = gmx_mm256_unpack128lo_pd(t5, t1);
 478     *y1          = gmx_mm256_unpack128lo_pd(t6, t2);
 479     *z1          = gmx_mm256_unpack128hi_pd(t5, t1);
 480 }
 481
 482
 483
 484 static gmx_inline void gmx_simdcall
 485 gmx_mm256_load_3rvec_4ptr_swizzle_pd(const double * gmx_restrict ptrA, const double * gmx_restrict ptrB,
 486                                      const double * gmx_restrict ptrC, const double * gmx_restrict ptrD,
 487                                      __m256d * gmx_restrict x1, __m256d * gmx_restrict y1, __m256d * gmx_restrict z1,
 488                                      __m256d * gmx_restrict x2, __m256d * gmx_restrict y2, __m256d * gmx_restrict z2,
 489                                      __m256d * gmx_restrict x3, __m256d * gmx_restrict y3, __m256d * gmx_restrict z3)
 490 {
 491     __m256d t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14;
 492
 493     t1           = _mm256_loadu_pd(ptrA);                       /*  x2a z1a | y1a x1a */
 494     t2           = _mm256_loadu_pd(ptrB);                       /*  x2b z1b | y1b x1b */
 495     t3           = _mm256_loadu_pd(ptrC);                       /*  x2c z1c | y1c x1c */
 496     t4           = _mm256_loadu_pd(ptrD);                       /*  x2d z1d | y1d x1d */
 497     t5           = _mm256_loadu_pd(ptrA+4);                     /*  y3a x3a | z2a y2a */
 498     t6           = _mm256_loadu_pd(ptrB+4);                     /*  y3b x3b | z2b y2b */
 499     t7           = _mm256_loadu_pd(ptrC+4);                     /*  y3c x3c | z2c y2c */
 500     t8           = _mm256_loadu_pd(ptrD+4);                     /*  y3d x3d | z2d y2d */
 501     t9           = _mm256_castpd128_pd256(_mm_load_sd(ptrA+8)); /*   -   -  |  -  z3a */
 502     t10          = _mm256_castpd128_pd256(_mm_load_sd(ptrB+8)); /*   -   -  |  -  z3b */
 503     t11          = _mm256_castpd128_pd256(_mm_load_sd(ptrC+8)); /*   -   -  |  -  z3c */
 504     t12          = _mm256_castpd128_pd256(_mm_load_sd(ptrD+8)); /*   -   -  |  -  z3d */
 505
 506     t13          = _mm256_unpacklo_pd(t1, t2);                  /*  z1b z1a | x1b x1a */
 507     t14          = _mm256_unpackhi_pd(t1, t2);                  /*  x2b x2a | y1b y1a */
 508     t1           = _mm256_unpacklo_pd(t3, t4);                  /*  z1d z1c | x1d x1c */
 509     t2           = _mm256_unpackhi_pd(t3, t4);                  /*  x2d x2c | y1d y1c */
 510
 511     t3           = _mm256_unpacklo_pd(t5, t6);                  /*  x3b x3a | y2b y2a */
 512     t4           = _mm256_unpackhi_pd(t5, t6);                  /*  y3b y3a | z2b z2a */
 513     t5           = _mm256_unpacklo_pd(t7, t8);                  /*  x3d x3c | y2d y2c */
 514     t6           = _mm256_unpackhi_pd(t7, t8);                  /*  y3d y3c | z2d z2c */
 515
 516     t9           = _mm256_unpacklo_pd(t9, t10);                 /*   -   -  | z3b z3a */
 517     t11          = _mm256_unpacklo_pd(t11, t12);                /*   -   -  | z3d z3c */
 518
 519     *x1          = gmx_mm256_unpack128lo_pd(t13, t1);
 520     *y1          = gmx_mm256_unpack128lo_pd(t14, t2);
 521     *z1          = gmx_mm256_unpack128hi_pd(t13, t1);
 522     *x2          = gmx_mm256_unpack128hi_pd(t14, t2);
 523     *y2          = gmx_mm256_unpack128lo_pd(t3, t5);
 524     *z2          = gmx_mm256_unpack128lo_pd(t4, t6);
 525     *x3          = gmx_mm256_unpack128hi_pd(t3, t5);
 526     *y3          = gmx_mm256_unpack128hi_pd(t4, t6);
 527     *z3          = gmx_mm256_unpack128lo_pd(t9, t11);
 528 }
 529
 530
 531
 532 static gmx_inline void gmx_simdcall
 533 gmx_mm256_load_4rvec_4ptr_swizzle_pd(const double * gmx_restrict ptrA, const double * gmx_restrict ptrB,
 534                                      const double * gmx_restrict ptrC, const double * gmx_restrict ptrD,
 535                                      __m256d * gmx_restrict x1, __m256d * gmx_restrict y1, __m256d * gmx_restrict z1,
 536                                      __m256d * gmx_restrict x2, __m256d * gmx_restrict y2, __m256d * gmx_restrict z2,
 537                                      __m256d * gmx_restrict x3, __m256d * gmx_restrict y3, __m256d * gmx_restrict z3,
 538                                      __m256d * gmx_restrict x4, __m256d * gmx_restrict y4, __m256d * gmx_restrict z4)
 539 {
 540     __m256d t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14;
 541
 542     t1           = _mm256_loadu_pd(ptrA);        /*  x2a z1a | y1a x1a */
 543     t2           = _mm256_loadu_pd(ptrB);        /*  x2b z1b | y1b x1b */
 544     t3           = _mm256_loadu_pd(ptrC);        /*  x2c z1c | y1c x1c */
 545     t4           = _mm256_loadu_pd(ptrD);        /*  x2d z1d | y1d x1d */
 546     t5           = _mm256_loadu_pd(ptrA+4);      /*  y3a x3a | z2a y2a */
 547     t6           = _mm256_loadu_pd(ptrB+4);      /*  y3b x3b | z2b y2b */
 548     t7           = _mm256_loadu_pd(ptrC+4);      /*  y3c x3c | z2c y2c */
 549     t8           = _mm256_loadu_pd(ptrD+4);      /*  y3d x3d | z2d y2d */
 550     t9           = _mm256_loadu_pd(ptrA+8);      /*  z4a y4a | x4a z3a */
 551     t10          = _mm256_loadu_pd(ptrB+8);      /*  z4b y4b | x4b z3b */
 552     t11          = _mm256_loadu_pd(ptrC+8);      /*  z4c y4c | x4c z3c */
 553     t12          = _mm256_loadu_pd(ptrD+8);      /*  z4d y4d | x4d z3d */
 554
 555     t13          = _mm256_unpacklo_pd(t1, t2);   /*  z1b z1a | x1b x1a */
 556     t14          = _mm256_unpackhi_pd(t1, t2);   /*  x2b x2a | y1b y1a */
 557     t1           = _mm256_unpacklo_pd(t3, t4);   /*  z1d z1c | x1d x1c */
 558     t2           = _mm256_unpackhi_pd(t3, t4);   /*  x2d x2c | y1d y1c */
 559
 560     t3           = _mm256_unpacklo_pd(t5, t6);   /*  x3b x3a | y2b y2a */
 561     t4           = _mm256_unpackhi_pd(t5, t6);   /*  y3b y3a | z2b z2a */
 562     t5           = _mm256_unpacklo_pd(t7, t8);   /*  x3d x3c | y2d y2c */
 563     t6           = _mm256_unpackhi_pd(t7, t8);   /*  y3d y3c | z2d z2c */
 564
 565     t7           = _mm256_unpacklo_pd(t9, t10);  /*  y4b y4a | z3b z3a */
 566     t8           = _mm256_unpackhi_pd(t9, t10);  /*  z4b z4a | x4b x4a */
 567     t9           = _mm256_unpacklo_pd(t11, t12); /*  y4d y4c | z3d z3c */
 568     t10          = _mm256_unpackhi_pd(t11, t12); /*  z4d z4c | x4d x4c */
 569
 570     *x1          = gmx_mm256_unpack128lo_pd(t13, t1);
 571     *y1          = gmx_mm256_unpack128lo_pd(t14, t2);
 572     *z1          = gmx_mm256_unpack128hi_pd(t13, t1);
 573     *x2          = gmx_mm256_unpack128hi_pd(t14, t2);
 574     *y2          = gmx_mm256_unpack128lo_pd(t3, t5);
 575     *z2          = gmx_mm256_unpack128lo_pd(t4, t6);
 576     *x3          = gmx_mm256_unpack128hi_pd(t3, t5);
 577     *y3          = gmx_mm256_unpack128hi_pd(t4, t6);
 578     *z3          = gmx_mm256_unpack128lo_pd(t7, t9);
 579     *x4          = gmx_mm256_unpack128lo_pd(t8, t10);
 580     *y4          = gmx_mm256_unpack128hi_pd(t7, t9);
 581     *z4          = gmx_mm256_unpack128hi_pd(t8, t10);
 582 }
 583
 584
 585
 586 static gmx_inline void gmx_simdcall
 587 gmx_mm256_decrement_1rvec_4ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
 588                                           double * gmx_restrict ptrC, double * gmx_restrict ptrD,
 589                                           __m256d x1, __m256d y1, __m256d z1)
 590 {
 591     __m256d t1, t2, tA, tB, tC, tD;
 592     __m256i mask;
 593
 594     t1          = _mm256_unpacklo_pd(x1, y1);       /*  y1c x1c | y1a x1a */
 595     t2          = _mm256_unpackhi_pd(x1, y1);       /*  y1d x1d | y1b x1b */
 596     x1          = gmx_mm256_unpack128lo_pd(t1, z1); /*  -  z1a | y1a x1a */
 597     y1          = gmx_mm256_unpack128hi_pd(t1, z1); /*  -  z1c | y1c x1c */
 598     z1          = _mm256_permute_pd(z1, _GMX_MM_PERMUTE256D(0, 1, 0, 1));
 599     t1          = gmx_mm256_unpack128lo_pd(t2, z1); /*  -  z1b | y1b x1b */
 600     z1          = gmx_mm256_unpack128hi_pd(t2, z1); /*  -  z1d | y1d x1d */
 601
 602     /* Construct a mask without executing any data loads */
 603     mask        = _mm256_castpd_si256(_mm256_blend_pd(_mm256_setzero_pd(),
 604                                                       _mm256_cmp_pd(_mm256_setzero_pd(), _mm256_setzero_pd(), _CMP_EQ_OQ), 0x7));
 605
 606     tA          = _mm256_loadu_pd(ptrA);
 607     tB          = _mm256_loadu_pd(ptrB);
 608     tC          = _mm256_loadu_pd(ptrC);
 609     tD          = _mm256_loadu_pd(ptrD);
 610
 611     tA          = _mm256_sub_pd(tA, x1);
 612     tB          = _mm256_sub_pd(tB, t1);
 613     tC          = _mm256_sub_pd(tC, y1);
 614     tD          = _mm256_sub_pd(tD, z1);
 615
 616     _mm256_maskstore_pd(ptrA, mask, tA);
 617     _mm256_maskstore_pd(ptrB, mask, tB);
 618     _mm256_maskstore_pd(ptrC, mask, tC);
 619     _mm256_maskstore_pd(ptrD, mask, tD);
 620 }
 621
 622
 623
 624
 625 static gmx_inline void gmx_simdcall
 626 gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
 627                                           double * gmx_restrict ptrC, double * gmx_restrict ptrD,
 628                                           __m256d x1, __m256d y1, __m256d z1,
 629                                           __m256d x2, __m256d y2, __m256d z2,
 630                                           __m256d x3, __m256d y3, __m256d z3)
 631 {
 632     __m256d t1, t2, t3, t4, t5, t6, t7, t8, t9, t10;
 633     __m128d tA, tB, tC, tD, tE;
 634
 635     t1          = _mm256_loadu_pd(ptrA);
 636     t2          = _mm256_loadu_pd(ptrB);
 637     t3          = _mm256_loadu_pd(ptrC);
 638     t4          = _mm256_loadu_pd(ptrD);
 639     t5          = _mm256_loadu_pd(ptrA+4);
 640     t6          = _mm256_loadu_pd(ptrB+4);
 641     t7          = _mm256_loadu_pd(ptrC+4);
 642     t8          = _mm256_loadu_pd(ptrD+4);
 643     tA          = _mm_load_sd(ptrA+8);
 644     tB          = _mm_load_sd(ptrB+8);
 645     tC          = _mm_load_sd(ptrC+8);
 646     tD          = _mm_load_sd(ptrD+8);
 647
 648     t9          = _mm256_unpacklo_pd(x1, y1);       /* y1c x1c | y1a x1a */
 649     x1          = _mm256_unpackhi_pd(x1, y1);       /* y1d x1d | y1b x1b */
 650
 651     y1          = _mm256_unpacklo_pd(z1, x2);       /* x2c z1c | x2a z1a */
 652     z1          = _mm256_unpackhi_pd(z1, x2);       /* x2d z1d | x2b z1b */
 653
 654     x2          = _mm256_unpacklo_pd(y2, z2);       /* z2c y2c | z2a y2a */
 655     y2          = _mm256_unpackhi_pd(y2, z2);       /* z2d y2d | z2b y2b */
 656
 657     z2          = _mm256_unpacklo_pd(x3, y3);       /* y3c x3c | y3a x3a */
 658     x3          = _mm256_unpackhi_pd(x3, y3);       /* y3d x3d | y3b x3b */
 659
 660     t10         = gmx_mm256_unpack128lo_pd(t9, y1); /* x2a z1a | y1a x1a */
 661     y3          = gmx_mm256_unpack128hi_pd(t9, y1); /* x2c z1c | y1c x1c */
 662
 663     t9          = gmx_mm256_unpack128lo_pd(x1, z1); /* x2b z1b | y1b x1b */
 664     y1          = gmx_mm256_unpack128hi_pd(x1, z1); /* x2d z1d | y1d x1d */
 665
 666     x1          = gmx_mm256_unpack128lo_pd(x2, z2); /* y3a x3a | z2a y2a */
 667     z1          = gmx_mm256_unpack128hi_pd(x2, z2); /* y3c x3c | z2c y2c */
 668
 669     x2          = gmx_mm256_unpack128lo_pd(y2, x3); /* y3b x3b | z2b y2b */
 670     z2          = gmx_mm256_unpack128hi_pd(y2, x3); /* y3d x3d | z2d y2d */
 671
 672     t1          = _mm256_sub_pd(t1, t10);
 673     t2          = _mm256_sub_pd(t2, t9);
 674     t3          = _mm256_sub_pd(t3, y3);
 675     t4          = _mm256_sub_pd(t4, y1);
 676     t5          = _mm256_sub_pd(t5, x1);
 677     t6          = _mm256_sub_pd(t6, x2);
 678     t7          = _mm256_sub_pd(t7, z1);
 679     t8          = _mm256_sub_pd(t8, z2);
 680
 681     tA          = _mm_sub_sd(tA, _mm256_castpd256_pd128(z3));
 682     tB          = _mm_sub_sd(tB, _mm_permute_pd(_mm256_castpd256_pd128(z3), _GMX_MM_PERMUTE128D(1, 1)));
 683     tE          = _mm256_extractf128_pd(z3, 0x1);
 684     tC          = _mm_sub_sd(tC, tE);
 685     tD          = _mm_sub_sd(tD, _mm_permute_pd(tE, _GMX_MM_PERMUTE128D(1, 1)));
 686
 687     /* Here we store a full 256-bit value and a separate 64-bit one; no overlap can happen */
 688     _mm256_storeu_pd(ptrA, t1);
 689     _mm256_storeu_pd(ptrB, t2);
 690     _mm256_storeu_pd(ptrC, t3);
 691     _mm256_storeu_pd(ptrD, t4);
 692     _mm256_storeu_pd(ptrA+4, t5);
 693     _mm256_storeu_pd(ptrB+4, t6);
 694     _mm256_storeu_pd(ptrC+4, t7);
 695     _mm256_storeu_pd(ptrD+4, t8);
 696     _mm_store_sd(ptrA+8, tA);
 697     _mm_store_sd(ptrB+8, tB);
 698     _mm_store_sd(ptrC+8, tC);
 699     _mm_store_sd(ptrD+8, tD);
 700 }
 701
 702
 703 static gmx_inline void gmx_simdcall
 704 gmx_mm256_decrement_4rvec_4ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
 705                                           double * gmx_restrict ptrC, double * gmx_restrict ptrD,
 706                                           __m256d x1, __m256d y1, __m256d z1,
 707                                           __m256d x2, __m256d y2, __m256d z2,
 708                                           __m256d x3, __m256d y3, __m256d z3,
 709                                           __m256d x4, __m256d y4, __m256d z4)
 710 {
 711     __m256d t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14;
 712     __m128d tA, tB, tC, tD, tE;
 713
 714     t1          = _mm256_loadu_pd(ptrA);
 715     t2          = _mm256_loadu_pd(ptrB);
 716     t3          = _mm256_loadu_pd(ptrC);
 717     t4          = _mm256_loadu_pd(ptrD);
 718     t5          = _mm256_loadu_pd(ptrA+4);
 719     t6          = _mm256_loadu_pd(ptrB+4);
 720     t7          = _mm256_loadu_pd(ptrC+4);
 721     t8          = _mm256_loadu_pd(ptrD+4);
 722     t9          = _mm256_loadu_pd(ptrA+8);
 723     t10         = _mm256_loadu_pd(ptrB+8);
 724     t11         = _mm256_loadu_pd(ptrC+8);
 725     t12         = _mm256_loadu_pd(ptrD+8);
 726
 727     t13         = _mm256_unpacklo_pd(x1, y1);        /* y1c x1c | y1a x1a */
 728     x1          = _mm256_unpackhi_pd(x1, y1);        /* y1d x1d | y1b x1b */
 729     y1          = _mm256_unpacklo_pd(z1, x2);        /* x2c z1c | x2a z1a */
 730     z1          = _mm256_unpackhi_pd(z1, x2);        /* x2d z1d | x2b z1b */
 731     x2          = _mm256_unpacklo_pd(y2, z2);        /* z2c y2c | z2a y2a */
 732     y2          = _mm256_unpackhi_pd(y2, z2);        /* z2d y2d | z2b y2b */
 733     z2          = _mm256_unpacklo_pd(x3, y3);        /* y3c x3c | y3a x3a */
 734     x3          = _mm256_unpackhi_pd(x3, y3);        /* y3d x3d | y3b x3b */
 735     y3          = _mm256_unpacklo_pd(z3, x4);        /* x4c z3c | x4a z3a */
 736     z3          = _mm256_unpackhi_pd(z3, x4);        /* x4d z3d | x4b z3b */
 737     x4          = _mm256_unpacklo_pd(y4, z4);        /* z4c y4c | z4a y4a */
 738     y4          = _mm256_unpackhi_pd(y4, z4);        /* z4d y4d | z4b y4b */
 739
 740     z4          = gmx_mm256_unpack128lo_pd(t13, y1); /* x2a z1a | y1a x1a */
 741     t13         = gmx_mm256_unpack128hi_pd(t13, y1); /* x2c z1c | y1c x1c */
 742     y1          = gmx_mm256_unpack128lo_pd(x1, z1);  /* x2b z1b | y1b x1b */
 743     x1          = gmx_mm256_unpack128hi_pd(x1, z1);  /* x2d z1d | y1d x1d */
 744     z1          = gmx_mm256_unpack128lo_pd(x2, z2);  /* y3a x3a | z2a y2a */
 745     x2          = gmx_mm256_unpack128hi_pd(x2, z2);  /* y3c x3c | z2c y2c */
 746     z2          = gmx_mm256_unpack128lo_pd(y2, x3);  /* y3b x3b | z2b y2b */
 747     y2          = gmx_mm256_unpack128hi_pd(y2, x3);  /* y3d x3d | z2d y2d */
 748     x3          = gmx_mm256_unpack128lo_pd(y3, x4);  /* z4a y4a | x4a z3a */
 749     y3          = gmx_mm256_unpack128hi_pd(y3, x4);  /* z4c y4c | x4c z3c */
 750     x4          = gmx_mm256_unpack128lo_pd(z3, y4);  /* z4b y4b | x4b z3b */
 751     z3          = gmx_mm256_unpack128hi_pd(z3, y4);  /* z4d y4d | x4d z3d */
 752
 753     t1          = _mm256_sub_pd(t1, z4);
 754     t2          = _mm256_sub_pd(t2, y1);
 755     t3          = _mm256_sub_pd(t3, t13);
 756     t4          = _mm256_sub_pd(t4, x1);
 757     t5          = _mm256_sub_pd(t5, z1);
 758     t6          = _mm256_sub_pd(t6, z2);
 759     t7          = _mm256_sub_pd(t7, x2);
 760     t8          = _mm256_sub_pd(t8, y2);
 761     t9          = _mm256_sub_pd(t9, x3);
 762     t10         = _mm256_sub_pd(t10, x4);
 763     t11         = _mm256_sub_pd(t11, y3);
 764     t12         = _mm256_sub_pd(t12, z3);
 765
 766     /* Here we store a full 256-bit value and a separate 128-bit one; no overlap can happen */
 767     _mm256_storeu_pd(ptrA, t1);
 768     _mm256_storeu_pd(ptrB, t2);
 769     _mm256_storeu_pd(ptrC, t3);
 770     _mm256_storeu_pd(ptrD, t4);
 771     _mm256_storeu_pd(ptrA+4, t5);
 772     _mm256_storeu_pd(ptrB+4, t6);
 773     _mm256_storeu_pd(ptrC+4, t7);
 774     _mm256_storeu_pd(ptrD+4, t8);
 775     _mm256_storeu_pd(ptrA+8, t9);
 776     _mm256_storeu_pd(ptrB+8, t10);
 777     _mm256_storeu_pd(ptrC+8, t11);
 778     _mm256_storeu_pd(ptrD+8, t12);
 779 }
 780
 781
 782
 783 static gmx_inline void gmx_simdcall
 784 gmx_mm256_update_iforce_1atom_swizzle_pd(__m256d fix1, __m256d fiy1, __m256d fiz1,
 785                                          double * gmx_restrict fptr,
 786                                          double * gmx_restrict fshiftptr)
 787 {
 788     __m256d t1, t2;
 789     __m128d tA, tB;
 790     fix1 = _mm256_hadd_pd(fix1, fiy1);
 791     fiz1 = _mm256_hadd_pd(fiz1, _mm256_setzero_pd());
 792
 793     /* Add across the two lanes */
 794     tA   = _mm_add_pd(_mm256_castpd256_pd128(fix1), _mm256_extractf128_pd(fix1, 0x1));
 795     tB   = _mm_add_pd(_mm256_castpd256_pd128(fiz1), _mm256_extractf128_pd(fiz1, 0x1));
 796
 797     fix1 = gmx_mm256_set_m128d(tB, tA); /* 0 fiz fiy fix */
 798
 799     t1   = _mm256_loadu_pd(fptr);
 800     t2   = _mm256_loadu_pd(fshiftptr);
 801
 802     t1   = _mm256_add_pd(t1, fix1);
 803     t2   = _mm256_add_pd(t2, fix1);
 804
 805     _mm256_storeu_pd(fptr, t1);
 806     _mm256_storeu_pd(fshiftptr, t2);
 807 }
 808
 809
 810
 811
 812 static gmx_inline void gmx_simdcall
 813 gmx_mm256_update_iforce_3atom_swizzle_pd(__m256d fix1, __m256d fiy1, __m256d fiz1,
 814                                          __m256d fix2, __m256d fiy2, __m256d fiz2,
 815                                          __m256d fix3, __m256d fiy3, __m256d fiz3,
 816                                          double * gmx_restrict fptr,
 817                                          double * gmx_restrict fshiftptr)
 818 {
 819     __m256d t1, t2, t3, t4;
 820     __m128d tz3, tA, tB, tC, tD;
 821
 822     fix1 = _mm256_hadd_pd(fix1, fiy1);                /*  Y1c-d X1c-d | Y1a-b X1a-b */
 823     fiz1 = _mm256_hadd_pd(fiz1, fix2);                /*  X2c-d Z1c-d | X2a-b Z1a-b */
 824     fiy2 = _mm256_hadd_pd(fiy2, fiz2);                /*  Z2c-d Y2c-d | Z2a-b Y2a-b */
 825     fix3 = _mm256_hadd_pd(fix3, fiy3);                /*  Y3c-d X3c-d | Y3a-b X3a-b */
 826     fiz3 = _mm256_hadd_pd(fiz3, _mm256_setzero_pd()); /*  0     Z3c-d | 0     Z3a-b */
 827
 828     /* Add across the two lanes by swapping and adding back */
 829     t1   = gmx_mm256_unpack128lo_pd(fix1, fiz1);                                       /* X2a-b Z1a-b | Y1a-b X1a-b */
 830     t2   = gmx_mm256_unpack128hi_pd(fix1, fiz1);                                       /* X2c-d Z1c-d | Y1c-d X1c-d */
 831     t1   = _mm256_add_pd(t1, t2);                                                      /* x2 z1 | y1 x1 */
 832
 833     t3   = gmx_mm256_unpack128lo_pd(fiy2, fix3);                                       /* Y3a-b X3a-b | Z2a-b Y2a-b */
 834     t4   = gmx_mm256_unpack128hi_pd(fiy2, fix3);                                       /* Y3c-d X3c-d | Z2c-d Y2c-d */
 835     t3   = _mm256_add_pd(t3, t4);                                                      /* y3 x3 | z2 y2 */
 836
 837     tz3  = _mm_add_pd(_mm256_castpd256_pd128(fiz3), _mm256_extractf128_pd(fiz3, 0x1)); /* 0 z3 */
 838
 839     t2   = _mm256_loadu_pd(fptr);
 840     t4   = _mm256_loadu_pd(fptr+4);
 841     tA   = _mm_load_sd(fptr+8);
 842
 843     t2   = _mm256_add_pd(t2, t1);
 844     t4   = _mm256_add_pd(t4, t3);
 845     tA   = _mm_add_sd(tA, tz3);
 846
 847     _mm256_storeu_pd(fptr, t2);
 848     _mm256_storeu_pd(fptr+4, t4);
 849     _mm_store_sd(fptr+8, tA);
 850
 851     /* Add up shift force */
 852     /* t1:   x2 z1 | y1 x1 */
 853     /* t3:   y3 x3 | z2 y2 */
 854     /* tz3:           0 z3 */
 855
 856     /* z component */
 857     tB   = _mm256_extractf128_pd(t1, 0x1);                                     /* x2 z1 */
 858     tC   = _mm256_extractf128_pd(t3, 0x1);                                     /* y3 x3 */
 859     tz3  = _mm_add_sd(tz3, tB);                                                /* 0  z1+z3 */
 860     tD   = _mm_permute_pd(_mm256_castpd256_pd128(t3), _GMX_MM_PERMUTE128D(1, 1));
 861     tz3  = _mm_add_sd(tz3, tD);                                                /* - z */
 862
 863     tC   = _mm_add_pd(tC, _mm256_castpd256_pd128(t1));                         /* y1+y3 x1+x3 */
 864
 865     tD   = _mm_shuffle_pd(tB, _mm256_castpd256_pd128(t3), _MM_SHUFFLE2(0, 1)); /* y2 x2 */
 866     tC   = _mm_add_pd(tC, tD);                                                 /* y x */
 867
 868     tA   = _mm_loadu_pd(fshiftptr);
 869     tB   = _mm_load_sd(fshiftptr+2);
 870     tA   = _mm_add_pd(tA, tC);
 871     tB   = _mm_add_sd(tB, tz3);
 872     _mm_storeu_pd(fshiftptr, tA);
 873     _mm_store_sd(fshiftptr+2, tB);
 874 }
 875
 876
 877 static gmx_inline void gmx_simdcall
 878 gmx_mm256_update_iforce_4atom_swizzle_pd(__m256d fix1, __m256d fiy1, __m256d fiz1,
 879                                          __m256d fix2, __m256d fiy2, __m256d fiz2,
 880                                          __m256d fix3, __m256d fiy3, __m256d fiz3,
 881                                          __m256d fix4, __m256d fiy4, __m256d fiz4,
 882                                          double * gmx_restrict fptr,
 883                                          double * gmx_restrict fshiftptr)
 884 {
 885     __m256d t1, t2, t3, t4, t5, t6;
 886     __m128d tA, tB, tC, tD;
 887
 888     fix1 = _mm256_hadd_pd(fix1, fiy1);                /*  Y1c-d X1c-d | Y1a-b X1a-b */
 889     fiz1 = _mm256_hadd_pd(fiz1, fix2);                /*  X2c-d Z1c-d | X2a-b Z1a-b */
 890     fiy2 = _mm256_hadd_pd(fiy2, fiz2);                /*  Z2c-d Y2c-d | Z2a-b Y2a-b */
 891     fix3 = _mm256_hadd_pd(fix3, fiy3);                /*  Y3c-d X3c-d | Y3a-b X3a-b */
 892     fiz3 = _mm256_hadd_pd(fiz3, fix4);                /*  X4c-d Z3c-d | X4a-b Z3a-b */
 893     fiy4 = _mm256_hadd_pd(fiy4, fiz4);                /*  Z4c-d Y4c-d | Z4a-b Y4a-b */
 894
 895     /* Add across the two lanes by swapping and adding back */
 896     t1   = gmx_mm256_unpack128lo_pd(fix1, fiz1); /* X2a-b Z1a-b | Y1a-b X1a-b */
 897     t2   = gmx_mm256_unpack128hi_pd(fix1, fiz1); /* X2c-d Z1c-d | Y1c-d X1c-d */
 898     t1   = _mm256_add_pd(t1, t2);                /* x2 z1 | y1 x1 */
 899
 900     t3   = gmx_mm256_unpack128lo_pd(fiy2, fix3); /* Y3a-b X3a-b | Z2a-b Y2a-b */
 901     t4   = gmx_mm256_unpack128hi_pd(fiy2, fix3); /* Y3c-d X3c-d | Z2c-d Y2c-d */
 902     t3   = _mm256_add_pd(t3, t4);                /* y3 x3 | z2 y2 */
 903
 904     t5   = gmx_mm256_unpack128lo_pd(fiz3, fiy4); /* Z4a-b Y4a-b | X4a-b Z3a-b */
 905     t6   = gmx_mm256_unpack128hi_pd(fiz3, fiy4); /* Z4c-d Y4c-d | X4c-d Z3c-d */
 906     t5   = _mm256_add_pd(t5, t6);                /* z4 y4 | x4 z3 */
 907
 908     t2   = _mm256_loadu_pd(fptr);
 909     t4   = _mm256_loadu_pd(fptr+4);
 910     t6   = _mm256_loadu_pd(fptr+8);
 911
 912     t2   = _mm256_add_pd(t2, t1);
 913     t4   = _mm256_add_pd(t4, t3);
 914     t6   = _mm256_add_pd(t6, t5);
 915
 916     _mm256_storeu_pd(fptr, t2);
 917     _mm256_storeu_pd(fptr+4, t4);
 918     _mm256_storeu_pd(fptr+8, t6);
 919
 920     /* Add up shift force  */
 921     /* t1:   x2. z1. | y1. x1. */
 922     /* t3:   y3. x3. | z2 y2 */
 923     /* t5:   z4 y4 | x4. z3. */
 924
 925     /* z component */
 926     tA   = _mm256_extractf128_pd(t1, 0x1);                /* x2 z1 */
 927     tB   = _mm256_extractf128_pd(t3, 0x1);                /* y3 x3 */
 928     tC   = _mm256_extractf128_pd(t5, 0x1);                /* z4 y4 */
 929
 930     tB   = _mm_add_pd(tB, _mm256_castpd256_pd128(t1));    /*  y1+y3  x1+x3 */
 931     tA   = _mm_add_pd(tA, _mm256_castpd256_pd128(t5));    /*  x2+x4  z1+z3 */
 932     tC   = _mm_add_pd(tC, _mm256_castpd256_pd128(t3));    /*  z4+z2  y4+y2 */
 933
 934     tD   = _mm_shuffle_pd(tA, tC, _MM_SHUFFLE2(0, 1));    /* y4+y2 x2+x4 */
 935     tB   = _mm_add_pd(tB, tD);                            /* y x */
 936     tC   = _mm_permute_pd(tC, _GMX_MM_PERMUTE128D(1, 1)); /*    - z4+z2 */
 937     tC   = _mm_add_sd(tC, tA);                            /* - z */
 938
 939     tA   = _mm_loadu_pd(fshiftptr);
 940     tD   = _mm_load_sd(fshiftptr+2);
 941     tA   = _mm_add_pd(tA, tB);
 942     tD   = _mm_add_sd(tD, tC);
 943     _mm_storeu_pd(fshiftptr, tA);
 944     _mm_store_sd(fshiftptr+2, tD);
 945 }
 946
 947
 948 static gmx_inline void gmx_simdcall
 949 gmx_mm256_update_1pot_pd(__m256d pot1, double * gmx_restrict ptrA)
 950 {
 951     __m128d t1;
 952
 953     pot1 = _mm256_hadd_pd(pot1, pot1);
 954
 955     t1   = _mm_add_pd(_mm256_castpd256_pd128(pot1), _mm256_extractf128_pd(pot1, 0x1));
 956
 957     _mm_store_sd(ptrA, _mm_add_sd(_mm_load_sd(ptrA), t1));
 958 }
 959
 960 static gmx_inline void gmx_simdcall
 961 gmx_mm256_update_2pot_pd(__m256d pot1, double * gmx_restrict ptrA,
 962                          __m256d pot2, double * gmx_restrict ptrB)
 963 {
 964     __m128d t1, t2;
 965
 966     pot1 = _mm256_hadd_pd(pot1, pot2);
 967
 968     t1   = _mm_add_pd(_mm256_castpd256_pd128(pot1), _mm256_extractf128_pd(pot1, 0x1));
 969
 970     t2   = _mm_permute_pd(t1, _GMX_MM_PERMUTE128D(1, 1));
 971     _mm_store_sd(ptrA, _mm_add_sd(_mm_load_sd(ptrA), t1));
 972     _mm_store_sd(ptrB, _mm_add_sd(_mm_load_sd(ptrB), t2));
 973 }
 974
 975
 976 #endif /* _kernelutil_x86_avx_256_double_h_ */