/*
 *                This source code is part of
 *
 *                 G   R   O   M   A   C   S
 *
 * Copyright (c) 2011-2012, The GROMACS Development Team
 *
 * Gromacs is a library for molecular simulation and trajectory analysis,
 * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
 * a full list of developers and information, check out http://www.gromacs.org
 *
 * This program is free software; you can redistribute it and/or modify it under
 * the terms of the GNU Lesser General Public License as published by the Free
 * Software Foundation; either version 2 of the License, or (at your option) any
 * later version.
 * As a special exception, you may use this file as part of a free software
 * library without restriction.  Specifically, if other files instantiate
 * templates or use macros or inline functions from this file, or you compile
 * this file and link it with other files to produce an executable, this
 * file does not by itself cause the resulting executable to be covered by
 * the GNU Lesser General Public License.
 *
 * In plain-speak: do not worry about classes/macros/templates either - only
 * changes to the library have to be LGPL, not an application linking with it.
 *
 * To help fund GROMACS development, we humbly ask that you cite
 * the papers people have written on it - you can find them on the website!
 */
#ifndef _kernelutil_x86_avx_128_fma_single_h_
#define _kernelutil_x86_avx_128_fma_single_h_


#include <math.h>

#include "gmx_x86_avx_128_fma.h"

/* Normal sum of four xmm registers */
#define gmx_mm_sum4_ps(t0, t1, t2, t3)  _mm_add_ps(_mm_add_ps(t0, t1), _mm_add_ps(t2, t3))

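/* Return nonzero if any element of a is smaller than the corresponding element of b */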
static gmx_inline int
gmx_mm_any_lt(__m128 a, __m128 b)
{
    return _mm_movemask_ps(_mm_cmplt_ps(a, b));
}

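/* Compute dx*dx+dy*dy+dz*dz for four interactions at once, using fused multiply-add */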
static gmx_inline __m128
gmx_mm_calc_rsq_ps(__m128 dx, __m128 dy, __m128 dz)
{
    return _mm_macc_ps(dx, dx, _mm_macc_ps(dy, dy, _mm_mul_ps(dz, dz)));
}

/* Load a single value from 1-4 places, merge into xmm register */

static gmx_inline __m128
gmx_mm_load_4real_swizzle_ps(const float * gmx_restrict ptrA,
                             const float * gmx_restrict ptrB,
                             const float * gmx_restrict ptrC,
                             const float * gmx_restrict ptrD)
{
    __m128 t1, t2;

    t1 = _mm_unpacklo_ps(_mm_load_ss(ptrA), _mm_load_ss(ptrC));
    t2 = _mm_unpacklo_ps(_mm_load_ss(ptrB), _mm_load_ss(ptrD));
    return _mm_unpacklo_ps(t1, t2);
}

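/* Store the four elements of xmm1 to four separate single-precision addresses */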
static gmx_inline void
gmx_mm_store_4real_swizzle_ps(float * gmx_restrict ptrA,
                              float * gmx_restrict ptrB,
                              float * gmx_restrict ptrC,
                              float * gmx_restrict ptrD, __m128 xmm1)
{
    __m128 t2, t3, t4;

    t2       = _mm_permute_ps(xmm1, _MM_SHUFFLE(1, 1, 1, 1));
    t3       = _mm_permute_ps(xmm1, _MM_SHUFFLE(2, 2, 2, 2));
    t4       = _mm_permute_ps(xmm1, _MM_SHUFFLE(3, 3, 3, 3));
    _mm_store_ss(ptrA, xmm1);
    _mm_store_ss(ptrB, t2);
    _mm_store_ss(ptrC, t3);
    _mm_store_ss(ptrD, t4);
}

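/* Add the four elements of xmm1 to the single-precision values at ptrA-ptrD */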
static gmx_inline void
gmx_mm_increment_4real_swizzle_ps(float * gmx_restrict ptrA,
                                  float * gmx_restrict ptrB,
                                  float * gmx_restrict ptrC,
                                  float * gmx_restrict ptrD, __m128 xmm1)
{
    __m128 tmp;

    tmp = gmx_mm_load_4real_swizzle_ps(ptrA, ptrB, ptrC, ptrD);
    tmp = _mm_add_ps(tmp, xmm1);
    gmx_mm_store_4real_swizzle_ps(ptrA, ptrB, ptrC, ptrD, tmp);
}

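/* Load c6/c12 parameter pairs from four addresses and transpose them into one c6 and one c12 register */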
static gmx_inline void
gmx_mm_load_4pair_swizzle_ps(const float * gmx_restrict p1,
                             const float * gmx_restrict p2,
                             const float * gmx_restrict p3,
                             const float * gmx_restrict p4,
                             __m128 * gmx_restrict c6, __m128 * gmx_restrict c12)
{
    __m128 t1, t2, t3, t4;
    t1   = _mm_loadl_pi(_mm_setzero_ps(), (__m64 *)p1);
    t2   = _mm_loadl_pi(_mm_setzero_ps(), (__m64 *)p2);
    t3   = _mm_loadl_pi(_mm_setzero_ps(), (__m64 *)p3);
    t4   = _mm_loadl_pi(_mm_setzero_ps(), (__m64 *)p4);
    t1   = _mm_unpacklo_ps(t1, t3);
    t2   = _mm_unpacklo_ps(t2, t4);
    *c6  = _mm_unpacklo_ps(t1, t2);
    *c12 = _mm_unpackhi_ps(t1, t2);
}


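/* Load one xyz coordinate, add the shift vector, and broadcast each component to a full register */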
static gmx_inline void
gmx_mm_load_shift_and_1rvec_broadcast_ps(const float * gmx_restrict xyz_shift,
                                         const float * gmx_restrict xyz,
                                         __m128 * gmx_restrict      x1,
                                         __m128 * gmx_restrict      y1,
                                         __m128 * gmx_restrict      z1)
{
    __m128 t1, t2, t3, t4;

    t1   = _mm_loadl_pi(_mm_setzero_ps(), (__m64 *)xyz_shift);
    t2   = _mm_loadl_pi(_mm_setzero_ps(), (__m64 *)xyz);
    t3   = _mm_load_ss(xyz_shift+2);
    t4   = _mm_load_ss(xyz+2);
    t1   = _mm_add_ps(t1, t2);
    t3   = _mm_add_ss(t3, t4);

    *x1  = _mm_permute_ps(t1, _MM_SHUFFLE(0, 0, 0, 0));
    *y1  = _mm_permute_ps(t1, _MM_SHUFFLE(1, 1, 1, 1));
    *z1  = _mm_permute_ps(t3, _MM_SHUFFLE(0, 0, 0, 0));
}

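/* Load three consecutive xyz coordinates, add the shift vector, and broadcast each
 * component to a full register. The four-atom variant below works the same way.
 */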
static gmx_inline void
gmx_mm_load_shift_and_3rvec_broadcast_ps(const float * gmx_restrict xyz_shift,
                                         const float * gmx_restrict xyz,
                                         __m128 * gmx_restrict x1, __m128 * gmx_restrict y1, __m128 * gmx_restrict z1,
                                         __m128 * gmx_restrict x2, __m128 * gmx_restrict y2, __m128 * gmx_restrict z2,
                                         __m128 * gmx_restrict x3, __m128 * gmx_restrict y3, __m128 * gmx_restrict z3)
{
    __m128 tA, tB;
    __m128 t1, t2, t3, t4, t5, t6;

    tA   = _mm_loadl_pi(_mm_setzero_ps(), (__m64 *)xyz_shift);
    tB   = _mm_load_ss(xyz_shift+2);

    t1   = _mm_loadu_ps(xyz);
    t2   = _mm_loadu_ps(xyz+4);
    t3   = _mm_load_ss(xyz+8);

    tA   = _mm_movelh_ps(tA, tB);
    t4   = _mm_permute_ps(tA, _MM_SHUFFLE(0, 2, 1, 0));
    t5   = _mm_permute_ps(tA, _MM_SHUFFLE(1, 0, 2, 1));
    t6   = _mm_permute_ps(tA, _MM_SHUFFLE(2, 1, 0, 2));

    t1   = _mm_add_ps(t1, t4);
    t2   = _mm_add_ps(t2, t5);
    t3   = _mm_add_ss(t3, t6);

    *x1  = _mm_permute_ps(t1, _MM_SHUFFLE(0, 0, 0, 0));
    *y1  = _mm_permute_ps(t1, _MM_SHUFFLE(1, 1, 1, 1));
    *z1  = _mm_permute_ps(t1, _MM_SHUFFLE(2, 2, 2, 2));
    *x2  = _mm_permute_ps(t1, _MM_SHUFFLE(3, 3, 3, 3));
    *y2  = _mm_permute_ps(t2, _MM_SHUFFLE(0, 0, 0, 0));
    *z2  = _mm_permute_ps(t2, _MM_SHUFFLE(1, 1, 1, 1));
    *x3  = _mm_permute_ps(t2, _MM_SHUFFLE(2, 2, 2, 2));
    *y3  = _mm_permute_ps(t2, _MM_SHUFFLE(3, 3, 3, 3));
    *z3  = _mm_permute_ps(t3, _MM_SHUFFLE(0, 0, 0, 0));
}


static gmx_inline void
gmx_mm_load_shift_and_4rvec_broadcast_ps(const float * gmx_restrict xyz_shift,
                                         const float * gmx_restrict xyz,
                                         __m128 * gmx_restrict x1, __m128 * gmx_restrict y1, __m128 * gmx_restrict z1,
                                         __m128 * gmx_restrict x2, __m128 * gmx_restrict y2, __m128 * gmx_restrict z2,
                                         __m128 * gmx_restrict x3, __m128 * gmx_restrict y3, __m128 * gmx_restrict z3,
                                         __m128 * gmx_restrict x4, __m128 * gmx_restrict y4, __m128 * gmx_restrict z4)
{
    __m128 tA, tB;
    __m128 t1, t2, t3, t4, t5, t6;

    tA   = _mm_loadl_pi(_mm_setzero_ps(), (__m64 *)xyz_shift);
    tB   = _mm_load_ss(xyz_shift+2);

    t1   = _mm_loadu_ps(xyz);
    t2   = _mm_loadu_ps(xyz+4);
    t3   = _mm_loadu_ps(xyz+8);

    tA   = _mm_movelh_ps(tA, tB);
    t4   = _mm_permute_ps(tA, _MM_SHUFFLE(0, 2, 1, 0));
    t5   = _mm_permute_ps(tA, _MM_SHUFFLE(1, 0, 2, 1));
    t6   = _mm_permute_ps(tA, _MM_SHUFFLE(2, 1, 0, 2));

    t1   = _mm_add_ps(t1, t4);
    t2   = _mm_add_ps(t2, t5);
    t3   = _mm_add_ps(t3, t6);

    *x1  = _mm_permute_ps(t1, _MM_SHUFFLE(0, 0, 0, 0));
    *y1  = _mm_permute_ps(t1, _MM_SHUFFLE(1, 1, 1, 1));
    *z1  = _mm_permute_ps(t1, _MM_SHUFFLE(2, 2, 2, 2));
    *x2  = _mm_permute_ps(t1, _MM_SHUFFLE(3, 3, 3, 3));
    *y2  = _mm_permute_ps(t2, _MM_SHUFFLE(0, 0, 0, 0));
    *z2  = _mm_permute_ps(t2, _MM_SHUFFLE(1, 1, 1, 1));
    *x3  = _mm_permute_ps(t2, _MM_SHUFFLE(2, 2, 2, 2));
    *y3  = _mm_permute_ps(t2, _MM_SHUFFLE(3, 3, 3, 3));
    *z3  = _mm_permute_ps(t3, _MM_SHUFFLE(0, 0, 0, 0));
    *x4  = _mm_permute_ps(t3, _MM_SHUFFLE(1, 1, 1, 1));
    *y4  = _mm_permute_ps(t3, _MM_SHUFFLE(2, 2, 2, 2));
    *z4  = _mm_permute_ps(t3, _MM_SHUFFLE(3, 3, 3, 3));
}

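/* Gather a single xyz coordinate from each of four pointers and transpose into separate x/y/z registers */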
static gmx_inline void
gmx_mm_load_1rvec_4ptr_swizzle_ps(const float * gmx_restrict ptrA, const float * gmx_restrict ptrB,
                                  const float * gmx_restrict ptrC, const float * gmx_restrict ptrD,
                                  __m128 * gmx_restrict x1, __m128 * gmx_restrict y1, __m128 * gmx_restrict z1)
{
    __m128  t1, t2, t3, t4;
    __m128i mask = _mm_set_epi32(0, -1, -1, -1);
    t1             = gmx_mm_maskload_ps(ptrA, mask);
    t2             = gmx_mm_maskload_ps(ptrB, mask);
    t3             = gmx_mm_maskload_ps(ptrC, mask);
    t4             = gmx_mm_maskload_ps(ptrD, mask);
    _MM_TRANSPOSE4_PS(t1, t2, t3, t4);
    *x1           = t1;
    *y1           = t2;
    *z1           = t3;
}


static gmx_inline void
gmx_mm_load_3rvec_4ptr_swizzle_ps(const float * gmx_restrict ptrA, const float * gmx_restrict ptrB,
                                  const float * gmx_restrict ptrC, const float * gmx_restrict ptrD,
                                  __m128 * gmx_restrict x1, __m128 * gmx_restrict y1, __m128 * gmx_restrict z1,
                                  __m128 * gmx_restrict x2, __m128 * gmx_restrict y2, __m128 * gmx_restrict z2,
                                  __m128 * gmx_restrict x3, __m128 * gmx_restrict y3, __m128 * gmx_restrict z3)
{
    __m128 t1, t2, t3, t4;
    t1            = _mm_loadu_ps(ptrA);
    t2            = _mm_loadu_ps(ptrB);
    t3            = _mm_loadu_ps(ptrC);
    t4            = _mm_loadu_ps(ptrD);
    _MM_TRANSPOSE4_PS(t1, t2, t3, t4);
    *x1           = t1;
    *y1           = t2;
    *z1           = t3;
    *x2           = t4;
    t1            = _mm_loadu_ps(ptrA+4);
    t2            = _mm_loadu_ps(ptrB+4);
    t3            = _mm_loadu_ps(ptrC+4);
    t4            = _mm_loadu_ps(ptrD+4);
    _MM_TRANSPOSE4_PS(t1, t2, t3, t4);
    *y2           = t1;
    *z2           = t2;
    *x3           = t3;
    *y3           = t4;
    t1            = _mm_load_ss(ptrA+8);
    t2            = _mm_load_ss(ptrB+8);
    t3            = _mm_load_ss(ptrC+8);
    t4            = _mm_load_ss(ptrD+8);
    t1            = _mm_unpacklo_ps(t1, t3);
    t3            = _mm_unpacklo_ps(t2, t4);
    *z3           = _mm_unpacklo_ps(t1, t3);
}


static gmx_inline void
gmx_mm_load_4rvec_4ptr_swizzle_ps(const float * gmx_restrict ptrA, const float * gmx_restrict ptrB,
                                  const float * gmx_restrict ptrC, const float * gmx_restrict ptrD,
                                  __m128 * gmx_restrict x1, __m128 * gmx_restrict y1, __m128 * gmx_restrict z1,
                                  __m128 * gmx_restrict x2, __m128 * gmx_restrict y2, __m128 * gmx_restrict z2,
                                  __m128 * gmx_restrict x3, __m128 * gmx_restrict y3, __m128 * gmx_restrict z3,
                                  __m128 * gmx_restrict x4, __m128 * gmx_restrict y4, __m128 * gmx_restrict z4)
{
    __m128 t1, t2, t3, t4;
    t1            = _mm_loadu_ps(ptrA);
    t2            = _mm_loadu_ps(ptrB);
    t3            = _mm_loadu_ps(ptrC);
    t4            = _mm_loadu_ps(ptrD);
    _MM_TRANSPOSE4_PS(t1, t2, t3, t4);
    *x1           = t1;
    *y1           = t2;
    *z1           = t3;
    *x2           = t4;
    t1            = _mm_loadu_ps(ptrA+4);
    t2            = _mm_loadu_ps(ptrB+4);
    t3            = _mm_loadu_ps(ptrC+4);
    t4            = _mm_loadu_ps(ptrD+4);
    _MM_TRANSPOSE4_PS(t1, t2, t3, t4);
    *y2           = t1;
    *z2           = t2;
    *x3           = t3;
    *y3           = t4;
    t1            = _mm_loadu_ps(ptrA+8);
    t2            = _mm_loadu_ps(ptrB+8);
    t3            = _mm_loadu_ps(ptrC+8);
    t4            = _mm_loadu_ps(ptrD+8);
    _MM_TRANSPOSE4_PS(t1, t2, t3, t4);
    *z3           = t1;
    *x4           = t2;
    *y4           = t3;
    *z4           = t4;
}

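/* Subtract one xyz force per pointer (one SIMD lane each) from the memory at ptrA-ptrD */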
static gmx_inline void
gmx_mm_decrement_1rvec_4ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx_restrict ptrB,
                                       float * gmx_restrict ptrC, float * gmx_restrict ptrD,
                                       __m128 x1, __m128 y1, __m128 z1)
{
    __m128 t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12;
    t5          = _mm_unpacklo_ps(y1, z1);
    t6          = _mm_unpackhi_ps(y1, z1);
    t7          = _mm_shuffle_ps(x1, t5, _MM_SHUFFLE(1, 0, 0, 0));
    t8          = _mm_shuffle_ps(x1, t5, _MM_SHUFFLE(3, 2, 0, 1));
    t9          = _mm_shuffle_ps(x1, t6, _MM_SHUFFLE(1, 0, 0, 2));
    t10         = _mm_shuffle_ps(x1, t6, _MM_SHUFFLE(3, 2, 0, 3));
    t1          = _mm_load_ss(ptrA);
    t1          = _mm_loadh_pi(t1, (__m64 *)(ptrA+1));
    t1          = _mm_sub_ps(t1, t7);
    _mm_store_ss(ptrA, t1);
    _mm_storeh_pi((__m64 *)(ptrA+1), t1);
    t2          = _mm_load_ss(ptrB);
    t2          = _mm_loadh_pi(t2, (__m64 *)(ptrB+1));
    t2          = _mm_sub_ps(t2, t8);
    _mm_store_ss(ptrB, t2);
    _mm_storeh_pi((__m64 *)(ptrB+1), t2);
    t3          = _mm_load_ss(ptrC);
    t3          = _mm_loadh_pi(t3, (__m64 *)(ptrC+1));
    t3          = _mm_sub_ps(t3, t9);
    _mm_store_ss(ptrC, t3);
    _mm_storeh_pi((__m64 *)(ptrC+1), t3);
    t4          = _mm_load_ss(ptrD);
    t4          = _mm_loadh_pi(t4, (__m64 *)(ptrD+1));
    t4          = _mm_sub_ps(t4, t10);
    _mm_store_ss(ptrD, t4);
    _mm_storeh_pi((__m64 *)(ptrD+1), t4);
}

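/* The decrement routines below subtract three (and, further down, four) xyz force
 * triplets per pointer. For 32-bit MSVC they are provided as macros, since that
 * compiler cannot handle more than three xmm/ymm parameters.
 */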
#if defined (_MSC_VER) && defined(_M_IX86)
/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
#define gmx_mm_decrement_3rvec_4ptr_swizzle_ps(ptrA, ptrB, ptrC, ptrD, \
                                               _x1, _y1, _z1, _x2, _y2, _z2, _x3, _y3, _z3) \
    { \
        __m128 _t1, _t2, _t3, _t4, _t5, _t6, _t7, _t8, _t9, _t10; \
        __m128 _t11, _t12, _t13, _t14, _t15, _t16, _t17, _t18, _t19; \
        __m128 _t20, _t21, _t22, _t23, _t24, _t25; \
        _t13         = _mm_unpackhi_ps(_x1, _y1); \
        _x1          = _mm_unpacklo_ps(_x1, _y1); \
        _t14         = _mm_unpackhi_ps(_z1, _x2); \
        _z1          = _mm_unpacklo_ps(_z1, _x2); \
        _t15         = _mm_unpackhi_ps(_y2, _z2); \
        _y2          = _mm_unpacklo_ps(_y2, _z2); \
        _t16         = _mm_unpackhi_ps(_x3, _y3); \
        _x3          = _mm_unpacklo_ps(_x3, _y3); \
        _t17         = _mm_permute_ps(_z3, _MM_SHUFFLE(0, 0, 0, 1)); \
        _t18         = _mm_movehl_ps(_z3, _z3); \
        _t19         = _mm_permute_ps(_t18, _MM_SHUFFLE(0, 0, 0, 1)); \
        _t20         = _mm_movelh_ps(_x1, _z1); \
        _t21         = _mm_movehl_ps(_z1, _x1); \
        _t22         = _mm_movelh_ps(_t13, _t14); \
        _t14         = _mm_movehl_ps(_t14, _t13); \
        _t23         = _mm_movelh_ps(_y2, _x3); \
        _t24         = _mm_movehl_ps(_x3, _y2); \
        _t25         = _mm_movelh_ps(_t15, _t16); \
        _t16         = _mm_movehl_ps(_t16, _t15); \
        _t1          = _mm_loadu_ps(ptrA); \
        _t2          = _mm_loadu_ps(ptrA+4); \
        _t3          = _mm_load_ss(ptrA+8); \
        _t1          = _mm_sub_ps(_t1, _t20); \
        _t2          = _mm_sub_ps(_t2, _t23); \
        _t3          = _mm_sub_ss(_t3, _z3); \
        _mm_storeu_ps(ptrA, _t1); \
        _mm_storeu_ps(ptrA+4, _t2); \
        _mm_store_ss(ptrA+8, _t3); \
        _t4          = _mm_loadu_ps(ptrB); \
        _t5          = _mm_loadu_ps(ptrB+4); \
        _t6          = _mm_load_ss(ptrB+8); \
        _t4          = _mm_sub_ps(_t4, _t21); \
        _t5          = _mm_sub_ps(_t5, _t24); \
        _t6          = _mm_sub_ss(_t6, _t17); \
        _mm_storeu_ps(ptrB, _t4); \
        _mm_storeu_ps(ptrB+4, _t5); \
        _mm_store_ss(ptrB+8, _t6); \
        _t7          = _mm_loadu_ps(ptrC); \
        _t8          = _mm_loadu_ps(ptrC+4); \
        _t9          = _mm_load_ss(ptrC+8); \
        _t7          = _mm_sub_ps(_t7, _t22); \
        _t8          = _mm_sub_ps(_t8, _t25); \
        _t9          = _mm_sub_ss(_t9, _t18); \
        _mm_storeu_ps(ptrC, _t7); \
        _mm_storeu_ps(ptrC+4, _t8); \
        _mm_store_ss(ptrC+8, _t9); \
        _t10         = _mm_loadu_ps(ptrD); \
        _t11         = _mm_loadu_ps(ptrD+4); \
        _t12         = _mm_load_ss(ptrD+8); \
        _t10         = _mm_sub_ps(_t10, _t14); \
        _t11         = _mm_sub_ps(_t11, _t16); \
        _t12         = _mm_sub_ss(_t12, _t19); \
        _mm_storeu_ps(ptrD, _t10); \
        _mm_storeu_ps(ptrD+4, _t11); \
        _mm_store_ss(ptrD+8, _t12); \
    }
#else
/* Real function for sane compilers */
static gmx_inline void
gmx_mm_decrement_3rvec_4ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx_restrict ptrB,
                                       float * gmx_restrict ptrC, float * gmx_restrict ptrD,
                                       __m128 x1, __m128 y1, __m128 z1,
                                       __m128 x2, __m128 y2, __m128 z2,
                                       __m128 x3, __m128 y3, __m128 z3)
{
    __m128 t1, t2, t3, t4, t5, t6, t7, t8, t9, t10;
    __m128 t11, t12, t13, t14, t15, t16, t17, t18, t19;
    __m128 t20, t21, t22, t23, t24, t25;
    t13         = _mm_unpackhi_ps(x1, y1);
    x1          = _mm_unpacklo_ps(x1, y1);
    t14         = _mm_unpackhi_ps(z1, x2);
    z1          = _mm_unpacklo_ps(z1, x2);
    t15         = _mm_unpackhi_ps(y2, z2);
    y2          = _mm_unpacklo_ps(y2, z2);
    t16         = _mm_unpackhi_ps(x3, y3);
    x3          = _mm_unpacklo_ps(x3, y3);
    t17         = _mm_permute_ps(z3, _MM_SHUFFLE(0, 0, 0, 1));
    t18         = _mm_movehl_ps(z3, z3);
    t19         = _mm_permute_ps(t18, _MM_SHUFFLE(0, 0, 0, 1));
    t20         = _mm_movelh_ps(x1, z1);
    t21         = _mm_movehl_ps(z1, x1);
    t22         = _mm_movelh_ps(t13, t14);
    t14         = _mm_movehl_ps(t14, t13);
    t23         = _mm_movelh_ps(y2, x3);
    t24         = _mm_movehl_ps(x3, y2);
    t25         = _mm_movelh_ps(t15, t16);
    t16         = _mm_movehl_ps(t16, t15);
    t1          = _mm_loadu_ps(ptrA);
    t2          = _mm_loadu_ps(ptrA+4);
    t3          = _mm_load_ss(ptrA+8);
    t1          = _mm_sub_ps(t1, t20);
    t2          = _mm_sub_ps(t2, t23);
    t3          = _mm_sub_ss(t3, z3);
    _mm_storeu_ps(ptrA, t1);
    _mm_storeu_ps(ptrA+4, t2);
    _mm_store_ss(ptrA+8, t3);
    t4          = _mm_loadu_ps(ptrB);
    t5          = _mm_loadu_ps(ptrB+4);
    t6          = _mm_load_ss(ptrB+8);
    t4          = _mm_sub_ps(t4, t21);
    t5          = _mm_sub_ps(t5, t24);
    t6          = _mm_sub_ss(t6, t17);
    _mm_storeu_ps(ptrB, t4);
    _mm_storeu_ps(ptrB+4, t5);
    _mm_store_ss(ptrB+8, t6);
    t7          = _mm_loadu_ps(ptrC);
    t8          = _mm_loadu_ps(ptrC+4);
    t9          = _mm_load_ss(ptrC+8);
    t7          = _mm_sub_ps(t7, t22);
    t8          = _mm_sub_ps(t8, t25);
    t9          = _mm_sub_ss(t9, t18);
    _mm_storeu_ps(ptrC, t7);
    _mm_storeu_ps(ptrC+4, t8);
    _mm_store_ss(ptrC+8, t9);
    t10         = _mm_loadu_ps(ptrD);
    t11         = _mm_loadu_ps(ptrD+4);
    t12         = _mm_load_ss(ptrD+8);
    t10         = _mm_sub_ps(t10, t14);
    t11         = _mm_sub_ps(t11, t16);
    t12         = _mm_sub_ss(t12, t19);
    _mm_storeu_ps(ptrD, t10);
    _mm_storeu_ps(ptrD+4, t11);
    _mm_store_ss(ptrD+8, t12);
}
#endif

#if defined (_MSC_VER) && defined(_M_IX86)
/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
#define gmx_mm_decrement_4rvec_4ptr_swizzle_ps(ptrA, ptrB, ptrC, ptrD, \
                                               _x1, _y1, _z1, _x2, _y2, _z2, _x3, _y3, _z3, _x4, _y4, _z4) \
    { \
        __m128 _t1, _t2, _t3, _t4, _t5, _t6, _t7, _t8, _t9, _t10, _t11; \
        __m128 _t12, _t13, _t14, _t15, _t16, _t17, _t18, _t19, _t20, _t21, _t22; \
        __m128 _t23, _t24; \
        _t13         = _mm_unpackhi_ps(_x1, _y1); \
        _x1          = _mm_unpacklo_ps(_x1, _y1); \
        _t14         = _mm_unpackhi_ps(_z1, _x2); \
        _z1          = _mm_unpacklo_ps(_z1, _x2); \
        _t15         = _mm_unpackhi_ps(_y2, _z2); \
        _y2          = _mm_unpacklo_ps(_y2, _z2); \
        _t16         = _mm_unpackhi_ps(_x3, _y3); \
        _x3          = _mm_unpacklo_ps(_x3, _y3); \
        _t17         = _mm_unpackhi_ps(_z3, _x4); \
        _z3          = _mm_unpacklo_ps(_z3, _x4); \
        _t18         = _mm_unpackhi_ps(_y4, _z4); \
        _y4          = _mm_unpacklo_ps(_y4, _z4); \
        _t19         = _mm_movelh_ps(_x1, _z1); \
        _z1          = _mm_movehl_ps(_z1, _x1); \
        _t20         = _mm_movelh_ps(_t13, _t14); \
        _t14         = _mm_movehl_ps(_t14, _t13); \
        _t21         = _mm_movelh_ps(_y2, _x3); \
        _x3          = _mm_movehl_ps(_x3, _y2); \
        _t22         = _mm_movelh_ps(_t15, _t16); \
        _t16         = _mm_movehl_ps(_t16, _t15); \
        _t23         = _mm_movelh_ps(_z3, _y4); \
        _y4          = _mm_movehl_ps(_y4, _z3); \
        _t24         = _mm_movelh_ps(_t17, _t18); \
        _t18         = _mm_movehl_ps(_t18, _t17); \
        _t1          = _mm_loadu_ps(ptrA); \
        _t2          = _mm_loadu_ps(ptrA+4); \
        _t3          = _mm_loadu_ps(ptrA+8); \
        _t1          = _mm_sub_ps(_t1, _t19); \
        _t2          = _mm_sub_ps(_t2, _t21); \
        _t3          = _mm_sub_ps(_t3, _t23); \
        _mm_storeu_ps(ptrA, _t1); \
        _mm_storeu_ps(ptrA+4, _t2); \
        _mm_storeu_ps(ptrA+8, _t3); \
        _t4          = _mm_loadu_ps(ptrB); \
        _t5          = _mm_loadu_ps(ptrB+4); \
        _t6          = _mm_loadu_ps(ptrB+8); \
        _t4          = _mm_sub_ps(_t4, _z1); \
        _t5          = _mm_sub_ps(_t5, _x3); \
        _t6          = _mm_sub_ps(_t6, _y4); \
        _mm_storeu_ps(ptrB, _t4); \
        _mm_storeu_ps(ptrB+4, _t5); \
        _mm_storeu_ps(ptrB+8, _t6); \
        _t7          = _mm_loadu_ps(ptrC); \
        _t8          = _mm_loadu_ps(ptrC+4); \
        _t9          = _mm_loadu_ps(ptrC+8); \
        _t7          = _mm_sub_ps(_t7, _t20); \
        _t8          = _mm_sub_ps(_t8, _t22); \
        _t9          = _mm_sub_ps(_t9, _t24); \
        _mm_storeu_ps(ptrC, _t7); \
        _mm_storeu_ps(ptrC+4, _t8); \
        _mm_storeu_ps(ptrC+8, _t9); \
        _t10         = _mm_loadu_ps(ptrD); \
        _t11         = _mm_loadu_ps(ptrD+4); \
        _t12         = _mm_loadu_ps(ptrD+8); \
        _t10         = _mm_sub_ps(_t10, _t14); \
        _t11         = _mm_sub_ps(_t11, _t16); \
        _t12         = _mm_sub_ps(_t12, _t18); \
        _mm_storeu_ps(ptrD, _t10); \
        _mm_storeu_ps(ptrD+4, _t11); \
        _mm_storeu_ps(ptrD+8, _t12); \
    }
#else
/* Real function for sane compilers */
static gmx_inline void
gmx_mm_decrement_4rvec_4ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx_restrict ptrB,
                                       float * gmx_restrict ptrC, float * gmx_restrict ptrD,
                                       __m128 x1, __m128 y1, __m128 z1,
                                       __m128 x2, __m128 y2, __m128 z2,
                                       __m128 x3, __m128 y3, __m128 z3,
                                       __m128 x4, __m128 y4, __m128 z4)
{
    __m128 t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11;
    __m128 t12, t13, t14, t15, t16, t17, t18, t19, t20, t21, t22;
    __m128 t23, t24;
    t13         = _mm_unpackhi_ps(x1, y1);
    x1          = _mm_unpacklo_ps(x1, y1);
    t14         = _mm_unpackhi_ps(z1, x2);
    z1          = _mm_unpacklo_ps(z1, x2);
    t15         = _mm_unpackhi_ps(y2, z2);
    y2          = _mm_unpacklo_ps(y2, z2);
    t16         = _mm_unpackhi_ps(x3, y3);
    x3          = _mm_unpacklo_ps(x3, y3);
    t17         = _mm_unpackhi_ps(z3, x4);
    z3          = _mm_unpacklo_ps(z3, x4);
    t18         = _mm_unpackhi_ps(y4, z4);
    y4          = _mm_unpacklo_ps(y4, z4);
    t19         = _mm_movelh_ps(x1, z1);
    z1          = _mm_movehl_ps(z1, x1);
    t20         = _mm_movelh_ps(t13, t14);
    t14         = _mm_movehl_ps(t14, t13);
    t21         = _mm_movelh_ps(y2, x3);
    x3          = _mm_movehl_ps(x3, y2);
    t22         = _mm_movelh_ps(t15, t16);
    t16         = _mm_movehl_ps(t16, t15);
    t23         = _mm_movelh_ps(z3, y4);
    y4          = _mm_movehl_ps(y4, z3);
    t24         = _mm_movelh_ps(t17, t18);
    t18         = _mm_movehl_ps(t18, t17);
    t1          = _mm_loadu_ps(ptrA);
    t2          = _mm_loadu_ps(ptrA+4);
    t3          = _mm_loadu_ps(ptrA+8);
    t1          = _mm_sub_ps(t1, t19);
    t2          = _mm_sub_ps(t2, t21);
    t3          = _mm_sub_ps(t3, t23);
    _mm_storeu_ps(ptrA, t1);
    _mm_storeu_ps(ptrA+4, t2);
    _mm_storeu_ps(ptrA+8, t3);
    t4          = _mm_loadu_ps(ptrB);
    t5          = _mm_loadu_ps(ptrB+4);
    t6          = _mm_loadu_ps(ptrB+8);
    t4          = _mm_sub_ps(t4, z1);
    t5          = _mm_sub_ps(t5, x3);
    t6          = _mm_sub_ps(t6, y4);
    _mm_storeu_ps(ptrB, t4);
    _mm_storeu_ps(ptrB+4, t5);
    _mm_storeu_ps(ptrB+8, t6);
    t7          = _mm_loadu_ps(ptrC);
    t8          = _mm_loadu_ps(ptrC+4);
    t9          = _mm_loadu_ps(ptrC+8);
    t7          = _mm_sub_ps(t7, t20);
    t8          = _mm_sub_ps(t8, t22);
    t9          = _mm_sub_ps(t9, t24);
    _mm_storeu_ps(ptrC, t7);
    _mm_storeu_ps(ptrC+4, t8);
    _mm_storeu_ps(ptrC+8, t9);
    t10         = _mm_loadu_ps(ptrD);
    t11         = _mm_loadu_ps(ptrD+4);
    t12         = _mm_loadu_ps(ptrD+8);
    t10         = _mm_sub_ps(t10, t14);
    t11         = _mm_sub_ps(t11, t16);
    t12         = _mm_sub_ps(t12, t18);
    _mm_storeu_ps(ptrD, t10);
    _mm_storeu_ps(ptrD+4, t11);
    _mm_storeu_ps(ptrD+8, t12);
}
#endif

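/* Reduce the four partial forces for a single i atom and add the result to both
 * the force array fptr and the shift force fshiftptr. The three- and four-atom
 * versions below reduce a whole group of i atoms in one call.
 */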
static gmx_inline void
gmx_mm_update_iforce_1atom_swizzle_ps(__m128 fix1, __m128 fiy1, __m128 fiz1,
                                      float * gmx_restrict fptr,
                                      float * gmx_restrict fshiftptr)
{
    __m128 t2, t3;

    fix1 = _mm_hadd_ps(fix1, fix1);
    fiy1 = _mm_hadd_ps(fiy1, fiz1);

    fix1 = _mm_hadd_ps(fix1, fiy1); /* fiz1 fiy1 fix1 fix1 */

    t2 = _mm_load_ss(fptr);
    t2 = _mm_loadh_pi(t2, (__m64 *)(fptr+1));
    t3 = _mm_load_ss(fshiftptr);
    t3 = _mm_loadh_pi(t3, (__m64 *)(fshiftptr+1));

    t2 = _mm_add_ps(t2, fix1);
    t3 = _mm_add_ps(t3, fix1);

    _mm_store_ss(fptr, t2);
    _mm_storeh_pi((__m64 *)(fptr+1), t2);
    _mm_store_ss(fshiftptr, t3);
    _mm_storeh_pi((__m64 *)(fshiftptr+1), t3);
}

#if defined (_MSC_VER) && defined(_M_IX86)
/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
#define gmx_mm_update_iforce_3atom_swizzle_ps(fix1, fiy1, fiz1, fix2, fiy2, fiz2, fix3, fiy3, fiz3, \
                                              fptr, fshiftptr) \
    { \
        __m128 _t1, _t2, _t3, _t4; \
\
        fix1 = _mm_hadd_ps(fix1, fiy1); \
        fiz1 = _mm_hadd_ps(fiz1, fix2); \
        fiy2 = _mm_hadd_ps(fiy2, fiz2); \
        fix3 = _mm_hadd_ps(fix3, fiy3); \
        fiz3 = _mm_hadd_ps(fiz3, fiz3); \
        fix1 = _mm_hadd_ps(fix1, fiz1); \
        fiy2 = _mm_hadd_ps(fiy2, fix3); \
        fiz3 = _mm_hadd_ps(fiz3, fiz3); \
        _mm_storeu_ps(fptr,  _mm_add_ps(fix1, _mm_loadu_ps(fptr)  )); \
        _mm_storeu_ps(fptr+4, _mm_add_ps(fiy2, _mm_loadu_ps(fptr+4))); \
        _mm_store_ss (fptr+8, _mm_add_ss(fiz3, _mm_load_ss(fptr+8) )); \
        _t4 = _mm_load_ss(fshiftptr+2); \
        _t4 = _mm_loadh_pi(_t4, (__m64 *)(fshiftptr)); \
        _t1 = _mm_shuffle_ps(fiz3, fix1, _MM_SHUFFLE(1, 0, 0, 0)); \
        _t2 = _mm_shuffle_ps(fix1, fiy2, _MM_SHUFFLE(3, 2, 2, 2)); \
        _t3 = _mm_shuffle_ps(fiy2, fix1, _MM_SHUFFLE(3, 3, 0, 1)); \
        _t3 = _mm_permute_ps(_t3, _MM_SHUFFLE(1, 2, 0, 0)); \
        _t1 = _mm_add_ps(_t1, _t2); \
        _t3 = _mm_add_ps(_t3, _t4); \
        _t1 = _mm_add_ps(_t1, _t3); \
        _mm_store_ss(fshiftptr+2, _t1); \
        _mm_storeh_pi((__m64 *)(fshiftptr), _t1); \
    }
#else
/* Real function for sane compilers */
static gmx_inline void
gmx_mm_update_iforce_3atom_swizzle_ps(__m128 fix1, __m128 fiy1, __m128 fiz1,
                                      __m128 fix2, __m128 fiy2, __m128 fiz2,
                                      __m128 fix3, __m128 fiy3, __m128 fiz3,
                                      float * gmx_restrict fptr,
                                      float * gmx_restrict fshiftptr)
{
    __m128 t1, t2, t3, t4;

    fix1 = _mm_hadd_ps(fix1, fiy1);
    fiz1 = _mm_hadd_ps(fiz1, fix2);
    fiy2 = _mm_hadd_ps(fiy2, fiz2);
    fix3 = _mm_hadd_ps(fix3, fiy3);
    fiz3 = _mm_hadd_ps(fiz3, fiz3);

    fix1 = _mm_hadd_ps(fix1, fiz1); /* fix2 fiz1 fiy1 fix1 */
    fiy2 = _mm_hadd_ps(fiy2, fix3); /* fiy3 fix3 fiz2 fiy2 */
    fiz3 = _mm_hadd_ps(fiz3, fiz3); /*  -    -    -   fiz3 */

    _mm_storeu_ps(fptr,  _mm_add_ps(fix1, _mm_loadu_ps(fptr)  ));
    _mm_storeu_ps(fptr+4, _mm_add_ps(fiy2, _mm_loadu_ps(fptr+4)));
    _mm_store_ss (fptr+8, _mm_add_ss(fiz3, _mm_load_ss(fptr+8) ));

    t4 = _mm_load_ss(fshiftptr+2);
    t4 = _mm_loadh_pi(t4, (__m64 *)(fshiftptr));

    t1 = _mm_shuffle_ps(fiz3, fix1, _MM_SHUFFLE(1, 0, 0, 0)); /* fiy1 fix1  -   fiz3 */
    t2 = _mm_shuffle_ps(fix1, fiy2, _MM_SHUFFLE(3, 2, 2, 2)); /* fiy3 fix3  -   fiz1 */
    t3 = _mm_shuffle_ps(fiy2, fix1, _MM_SHUFFLE(3, 3, 0, 1)); /* fix2 fix2 fiy2 fiz2 */
    t3 = _mm_permute_ps(t3, _MM_SHUFFLE(1, 2, 0, 0));         /* fiy2 fix2  -   fiz2 */

    t1 = _mm_add_ps(t1, t2);
    t3 = _mm_add_ps(t3, t4);
    t1 = _mm_add_ps(t1, t3); /* y x - z */

    _mm_store_ss(fshiftptr+2, t1);
    _mm_storeh_pi((__m64 *)(fshiftptr), t1);
}
#endif

#if defined (_MSC_VER) && defined(_M_IX86)
/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
#define gmx_mm_update_iforce_4atom_swizzle_ps(fix1, fiy1, fiz1, fix2, fiy2, fiz2, fix3, fiy3, fiz3, fix4, fiy4, fiz4, \
                                              fptr, fshiftptr) \
    { \
        __m128 _t1, _t2, _t3, _t4, _t5; \
\
        fix1 = _mm_hadd_ps(fix1, fiy1); \
        fiz1 = _mm_hadd_ps(fiz1, fix2); \
        fiy2 = _mm_hadd_ps(fiy2, fiz2); \
        fix3 = _mm_hadd_ps(fix3, fiy3); \
        fiz3 = _mm_hadd_ps(fiz3, fix4); \
        fiy4 = _mm_hadd_ps(fiy4, fiz4); \
        fix1 = _mm_hadd_ps(fix1, fiz1); \
        fiy2 = _mm_hadd_ps(fiy2, fix3); \
        fiz3 = _mm_hadd_ps(fiz3, fiy4); \
        _mm_storeu_ps(fptr,  _mm_add_ps(fix1, _mm_loadu_ps(fptr)  )); \
        _mm_storeu_ps(fptr+4, _mm_add_ps(fiy2, _mm_loadu_ps(fptr+4))); \
        _mm_storeu_ps(fptr+8, _mm_add_ps(fiz3, _mm_loadu_ps(fptr+8))); \
        _t5 = _mm_load_ss(fshiftptr+2); \
        _t5 = _mm_loadh_pi(_t5, (__m64 *)(fshiftptr)); \
        _t1 = _mm_permute_ps(fix1, _MM_SHUFFLE(1, 0, 2, 2)); \
        _t2 = _mm_permute_ps(fiy2, _MM_SHUFFLE(3, 2, 1, 1)); \
        _t3 = _mm_permute_ps(fiz3, _MM_SHUFFLE(2, 1, 0, 0)); \
        _t4 = _mm_shuffle_ps(fix1, fiy2, _MM_SHUFFLE(0, 0, 3, 3)); \
        _t4 = _mm_shuffle_ps(fiz3, _t4, _MM_SHUFFLE(2, 0, 3, 3)); \
        _t1 = _mm_add_ps(_t1, _t2); \
        _t3 = _mm_add_ps(_t3, _t4); \
        _t1 = _mm_add_ps(_t1, _t3); \
        _t5 = _mm_add_ps(_t5, _t1); \
        _mm_store_ss(fshiftptr+2, _t5); \
        _mm_storeh_pi((__m64 *)(fshiftptr), _t5); \
    }
#else
/* Real function for sane compilers */
static gmx_inline void
gmx_mm_update_iforce_4atom_swizzle_ps(__m128 fix1, __m128 fiy1, __m128 fiz1,
                                      __m128 fix2, __m128 fiy2, __m128 fiz2,
                                      __m128 fix3, __m128 fiy3, __m128 fiz3,
                                      __m128 fix4, __m128 fiy4, __m128 fiz4,
                                      float * gmx_restrict fptr,
                                      float * gmx_restrict fshiftptr)
{
    __m128 t1, t2, t3, t4, t5;

    fix1 = _mm_hadd_ps(fix1, fiy1);
    fiz1 = _mm_hadd_ps(fiz1, fix2);
    fiy2 = _mm_hadd_ps(fiy2, fiz2);
    fix3 = _mm_hadd_ps(fix3, fiy3);
    fiz3 = _mm_hadd_ps(fiz3, fix4);
    fiy4 = _mm_hadd_ps(fiy4, fiz4);

    fix1 = _mm_hadd_ps(fix1, fiz1); /* fix2 fiz1 fiy1 fix1 */
    fiy2 = _mm_hadd_ps(fiy2, fix3); /* fiy3 fix3 fiz2 fiy2 */
    fiz3 = _mm_hadd_ps(fiz3, fiy4); /* fiz4 fiy4 fix4 fiz3 */

    _mm_storeu_ps(fptr,  _mm_add_ps(fix1, _mm_loadu_ps(fptr)  ));
    _mm_storeu_ps(fptr+4, _mm_add_ps(fiy2, _mm_loadu_ps(fptr+4)));
    _mm_storeu_ps(fptr+8, _mm_add_ps(fiz3, _mm_loadu_ps(fptr+8)));

    t5 = _mm_load_ss(fshiftptr+2);
    t5 = _mm_loadh_pi(t5, (__m64 *)(fshiftptr));

    t1 = _mm_permute_ps(fix1, _MM_SHUFFLE(1, 0, 2, 2));
    t2 = _mm_permute_ps(fiy2, _MM_SHUFFLE(3, 2, 1, 1));
    t3 = _mm_permute_ps(fiz3, _MM_SHUFFLE(2, 1, 0, 0));
    t4 = _mm_shuffle_ps(fix1, fiy2, _MM_SHUFFLE(0, 0, 3, 3));
    t4 = _mm_shuffle_ps(fiz3, t4, _MM_SHUFFLE(2, 0, 3, 3));

    t1 = _mm_add_ps(t1, t2);
    t3 = _mm_add_ps(t3, t4);
    t1 = _mm_add_ps(t1, t3);
    t5 = _mm_add_ps(t5, t1);

    _mm_store_ss(fshiftptr+2, t5);
    _mm_storeh_pi((__m64 *)(fshiftptr), t5);
}
#endif

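/* Reduce one or two potential accumulators over all SIMD lanes and add the totals to the scalars in memory */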
static gmx_inline void
gmx_mm_update_1pot_ps(__m128 pot1, float * gmx_restrict ptrA)
{
    pot1 = _mm_hadd_ps(pot1, pot1);
    pot1 = _mm_hadd_ps(pot1, pot1);
    _mm_store_ss(ptrA, _mm_add_ss(pot1, _mm_load_ss(ptrA)));
}

static gmx_inline void
gmx_mm_update_2pot_ps(__m128 pot1, float * gmx_restrict ptrA,
                      __m128 pot2, float * gmx_restrict ptrB)
{
    pot1 = _mm_hadd_ps(pot1, pot2);
    pot1 = _mm_hadd_ps(pot1, pot1);
    pot2 = _mm_permute_ps(pot1, _MM_SHUFFLE(0, 0, 0, 1));
    _mm_store_ss(ptrA, _mm_add_ss(pot1, _mm_load_ss(ptrA)));
    _mm_store_ss(ptrB, _mm_add_ss(pot2, _mm_load_ss(ptrB)));
}


#endif /* _kernelutil_x86_avx_128_fma_single_h_ */