/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
#ifndef _kernelutil_x86_avx_128_fma_double_h_
#define _kernelutil_x86_avx_128_fma_double_h_

#include <math.h>

#include <immintrin.h>
#ifdef _MSC_VER
#    include <intrin.h>
#else
#    include <x86intrin.h>
#endif

#include "config.h"

#define gmx_mm_castsi128_pd   _mm_castsi128_pd
#define gmx_mm_extract_epi32  _mm_extract_epi32

#define GMX_MM_TRANSPOSE2_PD(row0, row1) {                 \
        __m128d __gmx_t1 = row0;                           \
        row0           = _mm_unpacklo_pd(row0, row1);      \
        row1           = _mm_unpackhi_pd(__gmx_t1, row1);  \
}

static gmx_inline int gmx_simdcall
gmx_mm_any_lt(__m128d a, __m128d b)
{
    return _mm_movemask_pd(_mm_cmplt_pd(a, b));
}


static gmx_inline __m128d gmx_simdcall
gmx_mm_calc_rsq_pd(__m128d dx, __m128d dy, __m128d dz)
{
    return _mm_macc_pd(dx, dx, _mm_macc_pd(dy, dy, _mm_mul_pd(dz, dz)));
}

/* Normal sum of four xmm registers */
#define gmx_mm_sum4_pd(t0, t1, t2, t3)  _mm_add_pd(_mm_add_pd(t0, t1), _mm_add_pd(t2, t3))
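
/*
 * Illustrative sketch (not part of the kernel API, kept out of compilation):
 * how gmx_mm_calc_rsq_pd and gmx_mm_sum4_pd are typically combined inside a
 * kernel. The variable names (dx00 ... r2_3) are hypothetical and only show
 * the data flow; each __m128d carries values for two j particles.
 */
#if 0
static gmx_inline __m128d
example_rsq_accumulate(__m128d dx00, __m128d dy00, __m128d dz00,
                       __m128d dx01, __m128d dy01, __m128d dz01,
                       __m128d dx02, __m128d dy02, __m128d dz02,
                       __m128d dx03, __m128d dy03, __m128d dz03)
{
    /* Squared distance for each of four i-j interactions */
    __m128d r2_0 = gmx_mm_calc_rsq_pd(dx00, dy00, dz00);
    __m128d r2_1 = gmx_mm_calc_rsq_pd(dx01, dy01, dz01);
    __m128d r2_2 = gmx_mm_calc_rsq_pd(dx02, dy02, dz02);
    __m128d r2_3 = gmx_mm_calc_rsq_pd(dx03, dy03, dz03);

    /* Plain sum of the four partial results */
    return gmx_mm_sum4_pd(r2_0, r2_1, r2_2, r2_3);
}
#endif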



/* Load a double value from 1-2 places, merge into xmm register */


static gmx_inline __m128d gmx_simdcall
gmx_mm_load_2real_swizzle_pd(const double * gmx_restrict ptrA,
                             const double * gmx_restrict ptrB)
{
    return _mm_unpacklo_pd(_mm_load_sd(ptrA), _mm_load_sd(ptrB));
}

static gmx_inline __m128d gmx_simdcall
gmx_mm_load_1real_pd(const double * gmx_restrict ptrA)
{
    return _mm_load_sd(ptrA);
}


static gmx_inline void gmx_simdcall
gmx_mm_store_2real_swizzle_pd(double * gmx_restrict ptrA,
                              double * gmx_restrict ptrB,
                              __m128d               xmm1)
{
    __m128d t2;

    t2       = _mm_unpackhi_pd(xmm1, xmm1);
    _mm_store_sd(ptrA, xmm1);
    _mm_store_sd(ptrB, t2);
}

static gmx_inline void gmx_simdcall
gmx_mm_store_1real_pd(double * gmx_restrict ptrA, __m128d xmm1)
{
    _mm_store_sd(ptrA, xmm1);
}


/* Similar to store, but increments value in memory */
static gmx_inline void gmx_simdcall
gmx_mm_increment_2real_swizzle_pd(double * gmx_restrict ptrA,
                                  double * gmx_restrict ptrB, __m128d xmm1)
{
    __m128d t1;

    t1   = _mm_unpackhi_pd(xmm1, xmm1);
    xmm1 = _mm_add_sd(xmm1, _mm_load_sd(ptrA));
    t1   = _mm_add_sd(t1, _mm_load_sd(ptrB));
    _mm_store_sd(ptrA, xmm1);
    _mm_store_sd(ptrB, t1);
}

static gmx_inline void gmx_simdcall
gmx_mm_increment_1real_pd(double * gmx_restrict ptrA, __m128d xmm1)
{
    __m128d tmp;

    tmp = gmx_mm_load_1real_pd(ptrA);
    tmp = _mm_add_sd(tmp, xmm1);
    gmx_mm_store_1real_pd(ptrA, tmp);
}
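
/*
 * Illustrative sketch (hypothetical, kept out of compilation): accumulating a
 * pair of per-atom scalars back to memory with the swizzled load/store
 * helpers above. ptrA/ptrB would point at two different atoms; the register
 * holds the contribution for ptrA in its low element and ptrB in its high one.
 */
#if 0
static gmx_inline void
example_accumulate_2real(double * ptrA, double * ptrB, __m128d contribution)
{
    /* Read-modify-write explicitly ... */
    __m128d current = gmx_mm_load_2real_swizzle_pd(ptrA, ptrB);
    current         = _mm_add_pd(current, contribution);
    gmx_mm_store_2real_swizzle_pd(ptrA, ptrB, current);

    /* ... or, equivalently, use the increment helper directly:
     * gmx_mm_increment_2real_swizzle_pd(ptrA, ptrB, contribution);
     */
}
#endif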



static gmx_inline void gmx_simdcall
gmx_mm_load_2pair_swizzle_pd(const double * gmx_restrict p1,
                             const double * gmx_restrict p2,
                             __m128d * gmx_restrict      c6,
                             __m128d * gmx_restrict      c12)
{
    __m128d t1, t2;

    /* Each pointer refers to a contiguous c6/c12 pair; unaligned loads keep this safe */
    t1   = _mm_loadu_pd(p1);
    t2   = _mm_loadu_pd(p2);
    *c6  = _mm_unpacklo_pd(t1, t2);
    *c12 = _mm_unpackhi_pd(t1, t2);
}

static gmx_inline void gmx_simdcall
gmx_mm_load_1pair_swizzle_pd(const double * gmx_restrict p1,
                             __m128d * gmx_restrict      c6,
                             __m128d * gmx_restrict      c12)
{
    *c6     = _mm_load_sd(p1);
    *c12    = _mm_load_sd(p1+1);
}
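
/*
 * Illustrative sketch (hypothetical, kept out of compilation): fetching the
 * c6/c12 Lennard-Jones parameters for two j atoms from a parameter table.
 * "vdwparam" and the 2*type indexing are placeholders for whatever layout
 * the calling kernel actually uses.
 */
#if 0
static gmx_inline void
example_load_lj_params(const double * vdwparam, int typeA, int typeB,
                       __m128d * c6, __m128d * c12)
{
    /* Assumes each atom type stores its (c6, c12) pair back to back */
    gmx_mm_load_2pair_swizzle_pd(vdwparam + 2*typeA,
                                 vdwparam + 2*typeB,
                                 c6, c12);
}
#endif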


static gmx_inline void gmx_simdcall
gmx_mm_load_shift_and_1rvec_broadcast_pd(const double * gmx_restrict xyz_shift,
                                         const double * gmx_restrict xyz,
                                         __m128d * gmx_restrict      x1,
                                         __m128d * gmx_restrict      y1,
                                         __m128d * gmx_restrict      z1)
{
    __m128d mem_xy, mem_z, mem_sxy, mem_sz;

    mem_xy  = _mm_loadu_pd(xyz);
    mem_z   = _mm_load_sd(xyz+2);
    mem_sxy = _mm_loadu_pd(xyz_shift);
    mem_sz  = _mm_load_sd(xyz_shift+2);

    mem_xy  = _mm_add_pd(mem_xy, mem_sxy);
    mem_z   = _mm_add_pd(mem_z, mem_sz);

    *x1  = _mm_shuffle_pd(mem_xy, mem_xy, _MM_SHUFFLE2(0, 0));
    *y1  = _mm_shuffle_pd(mem_xy, mem_xy, _MM_SHUFFLE2(1, 1));
    *z1  = _mm_shuffle_pd(mem_z, mem_z, _MM_SHUFFLE2(0, 0));
}


static gmx_inline void gmx_simdcall
gmx_mm_load_shift_and_3rvec_broadcast_pd(const double * gmx_restrict xyz_shift,
                                         const double * gmx_restrict xyz,
                                         __m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1,
                                         __m128d * gmx_restrict x2, __m128d * gmx_restrict y2, __m128d * gmx_restrict z2,
                                         __m128d * gmx_restrict x3, __m128d * gmx_restrict y3, __m128d * gmx_restrict z3)
{
    __m128d t1, t2, t3, t4, t5, sxy, sz, szx, syz;

    t1  = _mm_loadu_pd(xyz);
    t2  = _mm_loadu_pd(xyz+2);
    t3  = _mm_loadu_pd(xyz+4);
    t4  = _mm_loadu_pd(xyz+6);
    t5  = _mm_load_sd(xyz+8);

    sxy = _mm_loadu_pd(xyz_shift);
    sz  = _mm_load_sd(xyz_shift+2);
    szx = _mm_shuffle_pd(sz, sxy, _MM_SHUFFLE2(0, 0));
    syz = _mm_shuffle_pd(sxy, sz, _MM_SHUFFLE2(0, 1));

    t1  = _mm_add_pd(t1, sxy);
    t2  = _mm_add_pd(t2, szx);
    t3  = _mm_add_pd(t3, syz);
    t4  = _mm_add_pd(t4, sxy);
    t5  = _mm_add_sd(t5, sz);

    *x1  = _mm_shuffle_pd(t1, t1, _MM_SHUFFLE2(0, 0));
    *y1  = _mm_shuffle_pd(t1, t1, _MM_SHUFFLE2(1, 1));
    *z1  = _mm_shuffle_pd(t2, t2, _MM_SHUFFLE2(0, 0));
    *x2  = _mm_shuffle_pd(t2, t2, _MM_SHUFFLE2(1, 1));
    *y2  = _mm_shuffle_pd(t3, t3, _MM_SHUFFLE2(0, 0));
    *z2  = _mm_shuffle_pd(t3, t3, _MM_SHUFFLE2(1, 1));
    *x3  = _mm_shuffle_pd(t4, t4, _MM_SHUFFLE2(0, 0));
    *y3  = _mm_shuffle_pd(t4, t4, _MM_SHUFFLE2(1, 1));
    *z3  = _mm_shuffle_pd(t5, t5, _MM_SHUFFLE2(0, 0));
}


static gmx_inline void gmx_simdcall
gmx_mm_load_shift_and_4rvec_broadcast_pd(const double * gmx_restrict xyz_shift,
                                         const double * gmx_restrict xyz,
                                         __m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1,
                                         __m128d * gmx_restrict x2, __m128d * gmx_restrict y2, __m128d * gmx_restrict z2,
                                         __m128d * gmx_restrict x3, __m128d * gmx_restrict y3, __m128d * gmx_restrict z3,
                                         __m128d * gmx_restrict x4, __m128d * gmx_restrict y4, __m128d * gmx_restrict z4)
{
    __m128d t1, t2, t3, t4, t5, t6, sxy, sz, szx, syz;

    t1  = _mm_loadu_pd(xyz);
    t2  = _mm_loadu_pd(xyz+2);
    t3  = _mm_loadu_pd(xyz+4);
    t4  = _mm_loadu_pd(xyz+6);
    t5  = _mm_loadu_pd(xyz+8);
    t6  = _mm_loadu_pd(xyz+10);

    sxy = _mm_loadu_pd(xyz_shift);
    sz  = _mm_load_sd(xyz_shift+2);
    szx = _mm_shuffle_pd(sz, sxy, _MM_SHUFFLE2(0, 0));
    syz = _mm_shuffle_pd(sxy, sz, _MM_SHUFFLE2(0, 1));

    t1  = _mm_add_pd(t1, sxy);
    t2  = _mm_add_pd(t2, szx);
    t3  = _mm_add_pd(t3, syz);
    t4  = _mm_add_pd(t4, sxy);
    t5  = _mm_add_pd(t5, szx);
    t6  = _mm_add_pd(t6, syz);

    *x1  = _mm_shuffle_pd(t1, t1, _MM_SHUFFLE2(0, 0));
    *y1  = _mm_shuffle_pd(t1, t1, _MM_SHUFFLE2(1, 1));
    *z1  = _mm_shuffle_pd(t2, t2, _MM_SHUFFLE2(0, 0));
    *x2  = _mm_shuffle_pd(t2, t2, _MM_SHUFFLE2(1, 1));
    *y2  = _mm_shuffle_pd(t3, t3, _MM_SHUFFLE2(0, 0));
    *z2  = _mm_shuffle_pd(t3, t3, _MM_SHUFFLE2(1, 1));
    *x3  = _mm_shuffle_pd(t4, t4, _MM_SHUFFLE2(0, 0));
    *y3  = _mm_shuffle_pd(t4, t4, _MM_SHUFFLE2(1, 1));
    *z3  = _mm_shuffle_pd(t5, t5, _MM_SHUFFLE2(0, 0));
    *x4  = _mm_shuffle_pd(t5, t5, _MM_SHUFFLE2(1, 1));
    *y4  = _mm_shuffle_pd(t6, t6, _MM_SHUFFLE2(0, 0));
    *z4  = _mm_shuffle_pd(t6, t6, _MM_SHUFFLE2(1, 1));
}



static gmx_inline void gmx_simdcall
gmx_mm_load_1rvec_1ptr_swizzle_pd(const double * gmx_restrict p1,
                                  __m128d * gmx_restrict x, __m128d * gmx_restrict y, __m128d * gmx_restrict z)
{
    *x            = _mm_load_sd(p1);
    *y            = _mm_load_sd(p1+1);
    *z            = _mm_load_sd(p1+2);
}

static gmx_inline void gmx_simdcall
gmx_mm_load_3rvec_1ptr_swizzle_pd(const double * gmx_restrict p1,
                                  __m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1,
                                  __m128d * gmx_restrict x2, __m128d * gmx_restrict y2, __m128d * gmx_restrict z2,
                                  __m128d * gmx_restrict x3, __m128d * gmx_restrict y3, __m128d * gmx_restrict z3)
{
    *x1            = _mm_load_sd(p1);
    *y1            = _mm_load_sd(p1+1);
    *z1            = _mm_load_sd(p1+2);
    *x2            = _mm_load_sd(p1+3);
    *y2            = _mm_load_sd(p1+4);
    *z2            = _mm_load_sd(p1+5);
    *x3            = _mm_load_sd(p1+6);
    *y3            = _mm_load_sd(p1+7);
    *z3            = _mm_load_sd(p1+8);
}

static gmx_inline void gmx_simdcall
gmx_mm_load_4rvec_1ptr_swizzle_pd(const double * gmx_restrict p1,
                                  __m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1,
                                  __m128d * gmx_restrict x2, __m128d * gmx_restrict y2, __m128d * gmx_restrict z2,
                                  __m128d * gmx_restrict x3, __m128d * gmx_restrict y3, __m128d * gmx_restrict z3,
                                  __m128d * gmx_restrict x4, __m128d * gmx_restrict y4, __m128d * gmx_restrict z4)
{
    *x1            = _mm_load_sd(p1);
    *y1            = _mm_load_sd(p1+1);
    *z1            = _mm_load_sd(p1+2);
    *x2            = _mm_load_sd(p1+3);
    *y2            = _mm_load_sd(p1+4);
    *z2            = _mm_load_sd(p1+5);
    *x3            = _mm_load_sd(p1+6);
    *y3            = _mm_load_sd(p1+7);
    *z3            = _mm_load_sd(p1+8);
    *x4            = _mm_load_sd(p1+9);
    *y4            = _mm_load_sd(p1+10);
    *z4            = _mm_load_sd(p1+11);
}


static gmx_inline void gmx_simdcall
gmx_mm_load_1rvec_2ptr_swizzle_pd(const double * gmx_restrict ptrA,
                                  const double * gmx_restrict ptrB,
                                  __m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1)
{
    __m128d t1, t2, t3, t4;
    t1           = _mm_loadu_pd(ptrA);
    t2           = _mm_loadu_pd(ptrB);
    t3           = _mm_load_sd(ptrA+2);
    t4           = _mm_load_sd(ptrB+2);
    GMX_MM_TRANSPOSE2_PD(t1, t2);
    *x1          = t1;
    *y1          = t2;
    *z1          = _mm_unpacklo_pd(t3, t4);
}

static gmx_inline void gmx_simdcall
gmx_mm_load_3rvec_2ptr_swizzle_pd(const double * gmx_restrict ptrA, const double * gmx_restrict ptrB,
                                  __m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1,
                                  __m128d * gmx_restrict x2, __m128d * gmx_restrict y2, __m128d * gmx_restrict z2,
                                  __m128d * gmx_restrict x3, __m128d * gmx_restrict y3, __m128d * gmx_restrict z3)
{
    __m128d t1, t2, t3, t4, t5, t6, t7, t8, t9, t10;
    t1           = _mm_loadu_pd(ptrA);
    t2           = _mm_loadu_pd(ptrB);
    t3           = _mm_loadu_pd(ptrA+2);
    t4           = _mm_loadu_pd(ptrB+2);
    t5           = _mm_loadu_pd(ptrA+4);
    t6           = _mm_loadu_pd(ptrB+4);
    t7           = _mm_loadu_pd(ptrA+6);
    t8           = _mm_loadu_pd(ptrB+6);
    t9           = _mm_load_sd(ptrA+8);
    t10          = _mm_load_sd(ptrB+8);
    GMX_MM_TRANSPOSE2_PD(t1, t2);
    GMX_MM_TRANSPOSE2_PD(t3, t4);
    GMX_MM_TRANSPOSE2_PD(t5, t6);
    GMX_MM_TRANSPOSE2_PD(t7, t8);
    *x1          = t1;
    *y1          = t2;
    *z1          = t3;
    *x2          = t4;
    *y2          = t5;
    *z2          = t6;
    *x3          = t7;
    *y3          = t8;
    *z3          = _mm_unpacklo_pd(t9, t10);
}


static gmx_inline void gmx_simdcall
gmx_mm_load_4rvec_2ptr_swizzle_pd(const double * gmx_restrict ptrA, const double * gmx_restrict ptrB,
                                  __m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1,
                                  __m128d * gmx_restrict x2, __m128d * gmx_restrict y2, __m128d * gmx_restrict z2,
                                  __m128d * gmx_restrict x3, __m128d * gmx_restrict y3, __m128d * gmx_restrict z3,
                                  __m128d * gmx_restrict x4, __m128d * gmx_restrict y4, __m128d * gmx_restrict z4)
{
    __m128d t1, t2, t3, t4, t5, t6;
    t1           = _mm_loadu_pd(ptrA);
    t2           = _mm_loadu_pd(ptrB);
    t3           = _mm_loadu_pd(ptrA+2);
    t4           = _mm_loadu_pd(ptrB+2);
    t5           = _mm_loadu_pd(ptrA+4);
    t6           = _mm_loadu_pd(ptrB+4);
    GMX_MM_TRANSPOSE2_PD(t1, t2);
    GMX_MM_TRANSPOSE2_PD(t3, t4);
    GMX_MM_TRANSPOSE2_PD(t5, t6);
    *x1          = t1;
    *y1          = t2;
    *z1          = t3;
    *x2          = t4;
    *y2          = t5;
    *z2          = t6;
    t1           = _mm_loadu_pd(ptrA+6);
    t2           = _mm_loadu_pd(ptrB+6);
    t3           = _mm_loadu_pd(ptrA+8);
    t4           = _mm_loadu_pd(ptrB+8);
    t5           = _mm_loadu_pd(ptrA+10);
    t6           = _mm_loadu_pd(ptrB+10);
    GMX_MM_TRANSPOSE2_PD(t1, t2);
    GMX_MM_TRANSPOSE2_PD(t3, t4);
    GMX_MM_TRANSPOSE2_PD(t5, t6);
    *x3          = t1;
    *y3          = t2;
    *z3          = t3;
    *x4          = t4;
    *y4          = t5;
    *z4          = t6;
}

/* Routines to decrement rvec in memory, typically used for j particle force updates */
static gmx_inline void gmx_simdcall
gmx_mm_decrement_1rvec_1ptr_swizzle_pd(double * gmx_restrict ptrA,
                                       __m128d x1, __m128d y1, __m128d z1)
{
    __m128d t1, t2, t3;

    t1           = _mm_load_sd(ptrA);
    t2           = _mm_load_sd(ptrA+1);
    t3           = _mm_load_sd(ptrA+2);

    t1           = _mm_sub_sd(t1, x1);
    t2           = _mm_sub_sd(t2, y1);
    t3           = _mm_sub_sd(t3, z1);
    _mm_store_sd(ptrA, t1);
    _mm_store_sd(ptrA+1, t2);
    _mm_store_sd(ptrA+2, t3);
}
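
/*
 * Illustrative sketch (hypothetical names, kept out of compilation): a typical
 * Newton's-third-law update, where the pair force just added to the i atom is
 * subtracted from the j atom's entry in the force array f. "jidx" stands for
 * 3*j, the start of that atom's xyz triplet.
 */
#if 0
static gmx_inline void
example_update_j_force(double * f, int jidx,
                       __m128d fjx, __m128d fjy, __m128d fjz)
{
    /* Subtract the pair force (low element of each register) from atom j */
    gmx_mm_decrement_1rvec_1ptr_swizzle_pd(f + jidx, fjx, fjy, fjz);
}
#endif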



static gmx_inline void gmx_simdcall
gmx_mm_decrement_3rvec_1ptr_swizzle_pd(double * gmx_restrict ptrA,
                                       __m128d x1, __m128d y1, __m128d z1,
                                       __m128d x2, __m128d y2, __m128d z2,
                                       __m128d x3, __m128d y3, __m128d z3)
{
    __m128d t1, t2, t3, t4, t5;

    t1          = _mm_loadu_pd(ptrA);
    t2          = _mm_loadu_pd(ptrA+2);
    t3          = _mm_loadu_pd(ptrA+4);
    t4          = _mm_loadu_pd(ptrA+6);
    t5          = _mm_load_sd(ptrA+8);

    x1          = _mm_unpacklo_pd(x1, y1);
    z1          = _mm_unpacklo_pd(z1, x2);
    y2          = _mm_unpacklo_pd(y2, z2);
    x3          = _mm_unpacklo_pd(x3, y3);
    /* nothing to be done for z3 */

    t1          = _mm_sub_pd(t1, x1);
    t2          = _mm_sub_pd(t2, z1);
    t3          = _mm_sub_pd(t3, y2);
    t4          = _mm_sub_pd(t4, x3);
    t5          = _mm_sub_sd(t5, z3);
    _mm_storeu_pd(ptrA, t1);
    _mm_storeu_pd(ptrA+2, t2);
    _mm_storeu_pd(ptrA+4, t3);
    _mm_storeu_pd(ptrA+6, t4);
    _mm_store_sd(ptrA+8, t5);
}


static gmx_inline void gmx_simdcall
gmx_mm_decrement_4rvec_1ptr_swizzle_pd(double * gmx_restrict ptrA,
                                       __m128d x1, __m128d y1, __m128d z1,
                                       __m128d x2, __m128d y2, __m128d z2,
                                       __m128d x3, __m128d y3, __m128d z3,
                                       __m128d x4, __m128d y4, __m128d z4)
{
    __m128d t1, t2, t3, t4, t5, t6;

    t1          = _mm_loadu_pd(ptrA);
    t2          = _mm_loadu_pd(ptrA+2);
    t3          = _mm_loadu_pd(ptrA+4);
    t4          = _mm_loadu_pd(ptrA+6);
    t5          = _mm_loadu_pd(ptrA+8);
    t6          = _mm_loadu_pd(ptrA+10);

    x1          = _mm_unpacklo_pd(x1, y1);
    z1          = _mm_unpacklo_pd(z1, x2);
    y2          = _mm_unpacklo_pd(y2, z2);
    x3          = _mm_unpacklo_pd(x3, y3);
    z3          = _mm_unpacklo_pd(z3, x4);
    y4          = _mm_unpacklo_pd(y4, z4);

    _mm_storeu_pd(ptrA,    _mm_sub_pd( t1, x1 ));
    _mm_storeu_pd(ptrA+2,  _mm_sub_pd( t2, z1 ));
    _mm_storeu_pd(ptrA+4,  _mm_sub_pd( t3, y2 ));
    _mm_storeu_pd(ptrA+6,  _mm_sub_pd( t4, x3 ));
    _mm_storeu_pd(ptrA+8,  _mm_sub_pd( t5, z3 ));
    _mm_storeu_pd(ptrA+10, _mm_sub_pd( t6, y4 ));
}


static gmx_inline void gmx_simdcall
gmx_mm_decrement_1rvec_2ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
                                       __m128d x1, __m128d y1, __m128d z1)
{
    __m128d t1, t2, t3, t4, t5, t6, t7;

    t1          = _mm_loadu_pd(ptrA);
    t2          = _mm_load_sd(ptrA+2);
    t3          = _mm_loadu_pd(ptrB);
    t4          = _mm_load_sd(ptrB+2);

    t5          = _mm_unpacklo_pd(x1, y1);
    t6          = _mm_unpackhi_pd(x1, y1);
    t7          = _mm_unpackhi_pd(z1, z1);

    t1          = _mm_sub_pd(t1, t5);
    t2          = _mm_sub_sd(t2, z1);

    t3          = _mm_sub_pd(t3, t6);
    t4          = _mm_sub_sd(t4, t7);

    _mm_storeu_pd(ptrA, t1);
    _mm_store_sd(ptrA+2, t2);
    _mm_storeu_pd(ptrB, t3);
    _mm_store_sd(ptrB+2, t4);
}



static gmx_inline void gmx_simdcall
gmx_mm_decrement_3rvec_2ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
                                       __m128d x1, __m128d y1, __m128d z1,
                                       __m128d x2, __m128d y2, __m128d z2,
                                       __m128d x3, __m128d y3, __m128d z3)
{
    __m128d t1, t2, t3, t4, t5, t6, t7, t8, t9, t10;
    __m128d tA, tB, tC, tD, tE, tF, tG, tH, tI;

    t1          = _mm_loadu_pd(ptrA);
    t2          = _mm_loadu_pd(ptrA+2);
    t3          = _mm_loadu_pd(ptrA+4);
    t4          = _mm_loadu_pd(ptrA+6);
    t5          = _mm_load_sd(ptrA+8);
    t6          = _mm_loadu_pd(ptrB);
    t7          = _mm_loadu_pd(ptrB+2);
    t8          = _mm_loadu_pd(ptrB+4);
    t9          = _mm_loadu_pd(ptrB+6);
    t10         = _mm_load_sd(ptrB+8);

    tA          = _mm_unpacklo_pd(x1, y1);
    tB          = _mm_unpackhi_pd(x1, y1);
    tC          = _mm_unpacklo_pd(z1, x2);
    tD          = _mm_unpackhi_pd(z1, x2);
    tE          = _mm_unpacklo_pd(y2, z2);
    tF          = _mm_unpackhi_pd(y2, z2);
    tG          = _mm_unpacklo_pd(x3, y3);
    tH          = _mm_unpackhi_pd(x3, y3);
    tI          = _mm_unpackhi_pd(z3, z3);

    t1          = _mm_sub_pd(t1, tA);
    t2          = _mm_sub_pd(t2, tC);
    t3          = _mm_sub_pd(t3, tE);
    t4          = _mm_sub_pd(t4, tG);
    t5          = _mm_sub_sd(t5, z3);

    t6          = _mm_sub_pd(t6, tB);
    t7          = _mm_sub_pd(t7, tD);
    t8          = _mm_sub_pd(t8, tF);
    t9          = _mm_sub_pd(t9, tH);
    t10         = _mm_sub_sd(t10, tI);

    _mm_storeu_pd(ptrA, t1);
    _mm_storeu_pd(ptrA+2, t2);
    _mm_storeu_pd(ptrA+4, t3);
    _mm_storeu_pd(ptrA+6, t4);
    _mm_store_sd(ptrA+8, t5);
    _mm_storeu_pd(ptrB, t6);
    _mm_storeu_pd(ptrB+2, t7);
    _mm_storeu_pd(ptrB+4, t8);
    _mm_storeu_pd(ptrB+6, t9);
    _mm_store_sd(ptrB+8, t10);
}


static gmx_inline void gmx_simdcall
gmx_mm_decrement_4rvec_2ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
                                       __m128d x1, __m128d y1, __m128d z1,
                                       __m128d x2, __m128d y2, __m128d z2,
                                       __m128d x3, __m128d y3, __m128d z3,
                                       __m128d x4, __m128d y4, __m128d z4)
{
    __m128d t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12;
    __m128d tA, tB, tC, tD, tE, tF, tG, tH, tI, tJ, tK, tL;

    t1          = _mm_loadu_pd(ptrA);
    t2          = _mm_loadu_pd(ptrA+2);
    t3          = _mm_loadu_pd(ptrA+4);
    t4          = _mm_loadu_pd(ptrA+6);
    t5          = _mm_loadu_pd(ptrA+8);
    t6          = _mm_loadu_pd(ptrA+10);
    t7          = _mm_loadu_pd(ptrB);
    t8          = _mm_loadu_pd(ptrB+2);
    t9          = _mm_loadu_pd(ptrB+4);
    t10         = _mm_loadu_pd(ptrB+6);
    t11         = _mm_loadu_pd(ptrB+8);
    t12         = _mm_loadu_pd(ptrB+10);

    tA          = _mm_unpacklo_pd(x1, y1);
    tB          = _mm_unpackhi_pd(x1, y1);
    tC          = _mm_unpacklo_pd(z1, x2);
    tD          = _mm_unpackhi_pd(z1, x2);
    tE          = _mm_unpacklo_pd(y2, z2);
    tF          = _mm_unpackhi_pd(y2, z2);
    tG          = _mm_unpacklo_pd(x3, y3);
    tH          = _mm_unpackhi_pd(x3, y3);
    tI          = _mm_unpacklo_pd(z3, x4);
    tJ          = _mm_unpackhi_pd(z3, x4);
    tK          = _mm_unpacklo_pd(y4, z4);
    tL          = _mm_unpackhi_pd(y4, z4);

    t1          = _mm_sub_pd(t1, tA);
    t2          = _mm_sub_pd(t2, tC);
    t3          = _mm_sub_pd(t3, tE);
    t4          = _mm_sub_pd(t4, tG);
    t5          = _mm_sub_pd(t5, tI);
    t6          = _mm_sub_pd(t6, tK);

    t7          = _mm_sub_pd(t7, tB);
    t8          = _mm_sub_pd(t8, tD);
    t9          = _mm_sub_pd(t9, tF);
    t10         = _mm_sub_pd(t10, tH);
    t11         = _mm_sub_pd(t11, tJ);
    t12         = _mm_sub_pd(t12, tL);

    _mm_storeu_pd(ptrA,    t1);
    _mm_storeu_pd(ptrA+2,  t2);
    _mm_storeu_pd(ptrA+4,  t3);
    _mm_storeu_pd(ptrA+6,  t4);
    _mm_storeu_pd(ptrA+8,  t5);
    _mm_storeu_pd(ptrA+10, t6);
    _mm_storeu_pd(ptrB,    t7);
    _mm_storeu_pd(ptrB+2,  t8);
    _mm_storeu_pd(ptrB+4,  t9);
    _mm_storeu_pd(ptrB+6,  t10);
    _mm_storeu_pd(ptrB+8,  t11);
    _mm_storeu_pd(ptrB+10, t12);
}


static gmx_inline void gmx_simdcall
gmx_mm_update_iforce_1atom_swizzle_pd(__m128d fix1, __m128d fiy1, __m128d fiz1,
                                      double * gmx_restrict fptr,
                                      double * gmx_restrict fshiftptr)
{
    fix1 = _mm_hadd_pd(fix1, fiy1);
    fiz1 = _mm_hadd_pd(fiz1, fiz1);

    _mm_storeu_pd( fptr, _mm_add_pd( _mm_loadu_pd(fptr), fix1 ));
    _mm_store_sd( fptr+2, _mm_add_sd( _mm_load_sd(fptr+2), fiz1 ));

    _mm_storeu_pd( fshiftptr, _mm_add_pd( _mm_loadu_pd(fshiftptr), fix1 ));
    _mm_store_sd( fshiftptr+2, _mm_add_sd( _mm_load_sd(fshiftptr+2), fiz1 ));
}


static gmx_inline void gmx_simdcall
gmx_mm_update_iforce_3atom_swizzle_pd(__m128d fix1, __m128d fiy1, __m128d fiz1,
                                      __m128d fix2, __m128d fiy2, __m128d fiz2,
                                      __m128d fix3, __m128d fiy3, __m128d fiz3,
                                      double * gmx_restrict fptr,
                                      double * gmx_restrict fshiftptr)
{
    __m128d t1, t2;

    fix1 = _mm_hadd_pd(fix1, fiy1);
    fiz1 = _mm_hadd_pd(fiz1, fix2);
    fiy2 = _mm_hadd_pd(fiy2, fiz2);
    fix3 = _mm_hadd_pd(fix3, fiy3);
    fiz3 = _mm_hadd_pd(fiz3, fiz3);

    _mm_storeu_pd( fptr, _mm_add_pd( _mm_loadu_pd(fptr), fix1 ));
    _mm_storeu_pd( fptr+2, _mm_add_pd( _mm_loadu_pd(fptr+2), fiz1 ));
    _mm_storeu_pd( fptr+4, _mm_add_pd( _mm_loadu_pd(fptr+4), fiy2 ));
    _mm_storeu_pd( fptr+6, _mm_add_pd( _mm_loadu_pd(fptr+6), fix3 ));
    _mm_store_sd( fptr+8, _mm_add_sd( _mm_load_sd(fptr+8), fiz3 ));

    fix1 = _mm_add_pd(fix1, fix3);
    t1   = _mm_shuffle_pd(fiz1, fiy2, _MM_SHUFFLE2(0, 1));
    fix1 = _mm_add_pd(fix1, t1); /* x and y sums */

    t2   = _mm_shuffle_pd(fiy2, fiy2, _MM_SHUFFLE2(1, 1));
    fiz1 = _mm_add_sd(fiz1, fiz3);
    fiz1 = _mm_add_sd(fiz1, t2); /* z sum */

    _mm_storeu_pd( fshiftptr, _mm_add_pd( _mm_loadu_pd(fshiftptr), fix1 ));
    _mm_store_sd( fshiftptr+2, _mm_add_sd( _mm_load_sd(fshiftptr+2), fiz1 ));
}


static gmx_inline void gmx_simdcall
gmx_mm_update_iforce_4atom_swizzle_pd(__m128d fix1, __m128d fiy1, __m128d fiz1,
                                      __m128d fix2, __m128d fiy2, __m128d fiz2,
                                      __m128d fix3, __m128d fiy3, __m128d fiz3,
                                      __m128d fix4, __m128d fiy4, __m128d fiz4,
                                      double * gmx_restrict fptr,
                                      double * gmx_restrict fshiftptr)
{
    __m128d t1, t2;

    fix1 = _mm_hadd_pd(fix1, fiy1);
    fiz1 = _mm_hadd_pd(fiz1, fix2);
    fiy2 = _mm_hadd_pd(fiy2, fiz2);
    fix3 = _mm_hadd_pd(fix3, fiy3);
    fiz3 = _mm_hadd_pd(fiz3, fix4);
    fiy4 = _mm_hadd_pd(fiy4, fiz4);

    _mm_storeu_pd( fptr, _mm_add_pd( _mm_loadu_pd(fptr),       fix1 ));
    _mm_storeu_pd( fptr+2, _mm_add_pd( _mm_loadu_pd(fptr+2),   fiz1 ));
    _mm_storeu_pd( fptr+4, _mm_add_pd( _mm_loadu_pd(fptr+4),   fiy2 ));
    _mm_storeu_pd( fptr+6, _mm_add_pd( _mm_loadu_pd(fptr+6),   fix3 ));
    _mm_storeu_pd( fptr+8, _mm_add_pd( _mm_loadu_pd(fptr+8),   fiz3 ));
    _mm_storeu_pd( fptr+10, _mm_add_pd( _mm_loadu_pd(fptr+10), fiy4 ));

    t1   = _mm_shuffle_pd(fiz1, fiy2, _MM_SHUFFLE2(0, 1));
    fix1 = _mm_add_pd(fix1, t1);
    t2   = _mm_shuffle_pd(fiz3, fiy4, _MM_SHUFFLE2(0, 1));
    fix3 = _mm_add_pd(fix3, t2);
    fix1 = _mm_add_pd(fix1, fix3); /* x and y sums */

    fiz1 = _mm_add_sd(fiz1, _mm_unpackhi_pd(fiy2, fiy2));
    fiz3 = _mm_add_sd(fiz3, _mm_unpackhi_pd(fiy4, fiy4));
    fiz1 = _mm_add_sd(fiz1, fiz3); /* z sum */

    _mm_storeu_pd( fshiftptr, _mm_add_pd( _mm_loadu_pd(fshiftptr), fix1 ));
    _mm_store_sd( fshiftptr+2, _mm_add_sd( _mm_load_sd(fshiftptr+2), fiz1 ));
}


static gmx_inline void gmx_simdcall
gmx_mm_update_1pot_pd(__m128d pot1, double * gmx_restrict ptrA)
{
    pot1 = _mm_hadd_pd(pot1, pot1);
    _mm_store_sd(ptrA, _mm_add_sd(pot1, _mm_load_sd(ptrA)));
}

static gmx_inline void gmx_simdcall
gmx_mm_update_2pot_pd(__m128d pot1, double * gmx_restrict ptrA,
                      __m128d pot2, double * gmx_restrict ptrB)
{
    pot1 = _mm_hadd_pd(pot1, pot2);
    pot2 = _mm_unpackhi_pd(pot1, pot1);

    _mm_store_sd(ptrA, _mm_add_sd(pot1, _mm_load_sd(ptrA)));
    _mm_store_sd(ptrB, _mm_add_sd(pot2, _mm_load_sd(ptrB)));
}


#endif /* _kernelutil_x86_avx_128_fma_double_h_ */