/*
 *                This source code is part of
 *
 *                 G   R   O   M   A   C   S
 *
 * Copyright (c) 2011-2012, The GROMACS Development Team
 *
 * Gromacs is a library for molecular simulation and trajectory analysis,
 * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
 * a full list of developers and information, check out http://www.gromacs.org
 *
 * This program is free software; you can redistribute it and/or modify it under
 * the terms of the GNU Lesser General Public License as published by the Free
 * Software Foundation; either version 2 of the License, or (at your option) any
 * later version.
 * As a special exception, you may use this file as part of a free software
 * library without restriction.  Specifically, if other files instantiate
 * templates or use macros or inline functions from this file, or you compile
 * this file and link it with other files to produce an executable, this
 * file does not by itself cause the resulting executable to be covered by
 * the GNU Lesser General Public License.
 *
 * In plain-speak: do not worry about classes/macros/templates either - only
 * changes to the library have to be LGPL, not an application linking with it.
 *
 * To help fund GROMACS development, we humbly ask that you cite
 * the papers people have written on it - you can find them on the website!
 */
#ifndef _kernelutil_x86_avx_128_fma_single_h_
#define _kernelutil_x86_avx_128_fma_single_h_


#include <math.h>

#include "gmx_x86_avx_128_fma.h"

/* Normal sum of four xmm registers */
#define gmx_mm_sum4_ps(t0,t1,t2,t3)  _mm_add_ps(_mm_add_ps(t0,t1),_mm_add_ps(t2,t3))

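/* Return nonzero if any element of a is less than the corresponding element of b */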
static gmx_inline int
gmx_mm_any_lt(__m128 a, __m128 b)
{
    return _mm_movemask_ps(_mm_cmplt_ps(a,b));
}

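/* Compute dx*dx+dy*dy+dz*dz for four interactions at once, using fused multiply-add */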
static gmx_inline __m128
gmx_mm_calc_rsq_ps(__m128 dx, __m128 dy, __m128 dz)
{
    return _mm_macc_ps(dx,dx,_mm_macc_ps(dy,dy,_mm_mul_ps(dz,dz)));
}

/* Load a single value from 1-4 places, merge into xmm register */

static gmx_inline __m128
gmx_mm_load_4real_swizzle_ps(const float * gmx_restrict ptrA,
                             const float * gmx_restrict ptrB,
                             const float * gmx_restrict ptrC,
                             const float * gmx_restrict ptrD)
{
    __m128 t1,t2;

    t1 = _mm_unpacklo_ps(_mm_load_ss(ptrA),_mm_load_ss(ptrC));
    t2 = _mm_unpacklo_ps(_mm_load_ss(ptrB),_mm_load_ss(ptrD));
    return _mm_unpacklo_ps(t1,t2);
}

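/* Store the four elements of xmm1 to the single floats at ptrA, ptrB, ptrC and ptrD */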
static gmx_inline void
gmx_mm_store_4real_swizzle_ps(float * gmx_restrict ptrA,
                              float * gmx_restrict ptrB,
                              float * gmx_restrict ptrC,
                              float * gmx_restrict ptrD, __m128 xmm1)
{
    __m128 t2,t3,t4;

    t2       = _mm_permute_ps(xmm1,_MM_SHUFFLE(1,1,1,1));
    t3       = _mm_permute_ps(xmm1,_MM_SHUFFLE(2,2,2,2));
    t4       = _mm_permute_ps(xmm1,_MM_SHUFFLE(3,3,3,3));
    _mm_store_ss(ptrA,xmm1);
    _mm_store_ss(ptrB,t2);
    _mm_store_ss(ptrC,t3);
    _mm_store_ss(ptrD,t4);
}

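/* Add the four elements of xmm1 to the single floats at ptrA, ptrB, ptrC and ptrD */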
static gmx_inline void
gmx_mm_increment_4real_swizzle_ps(float * gmx_restrict ptrA,
                                  float * gmx_restrict ptrB,
                                  float * gmx_restrict ptrC,
                                  float * gmx_restrict ptrD, __m128 xmm1)
{
    __m128 tmp;

    tmp = gmx_mm_load_4real_swizzle_ps(ptrA,ptrB,ptrC,ptrD);
    tmp = _mm_add_ps(tmp,xmm1);
    gmx_mm_store_4real_swizzle_ps(ptrA,ptrB,ptrC,ptrD,tmp);
}

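/* Load two adjacent floats (c6,c12) from each of p1-p4 and transpose, so that
 * *c6 gathers the first value from each pointer and *c12 the second.
 */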
static gmx_inline void
gmx_mm_load_4pair_swizzle_ps(const float * gmx_restrict p1,
                             const float * gmx_restrict p2,
                             const float * gmx_restrict p3,
                             const float * gmx_restrict p4,
                             __m128 * gmx_restrict c6, __m128 * gmx_restrict c12)
{
    __m128 t1,t2,t3,t4;
    t1   = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)p1);
    t2   = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)p2);
    t3   = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)p3);
    t4   = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)p4);
    t1   = _mm_unpacklo_ps(t1,t3);
    t2   = _mm_unpacklo_ps(t2,t4);
    *c6  = _mm_unpacklo_ps(t1,t2);
    *c12 = _mm_unpackhi_ps(t1,t2);
}


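/* Load a three-component shift vector and one coordinate triplet, add them, and
 * broadcast the shifted x, y and z to all elements of *x1, *y1 and *z1.
 */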
static gmx_inline void
gmx_mm_load_shift_and_1rvec_broadcast_ps(const float * gmx_restrict xyz_shift,
        const float * gmx_restrict xyz,
        __m128 * gmx_restrict x1,
        __m128 * gmx_restrict y1,
        __m128 * gmx_restrict z1)
{
    __m128 t1,t2,t3,t4;

    t1   = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)xyz_shift);
    t2   = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)xyz);
    t3   = _mm_load_ss(xyz_shift+2);
    t4   = _mm_load_ss(xyz+2);
    t1   = _mm_add_ps(t1,t2);
    t3   = _mm_add_ss(t3,t4);

    *x1  = _mm_permute_ps(t1,_MM_SHUFFLE(0,0,0,0));
    *y1  = _mm_permute_ps(t1,_MM_SHUFFLE(1,1,1,1));
    *z1  = _mm_permute_ps(t3,_MM_SHUFFLE(0,0,0,0));
}

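/* As above, but for three coordinate triplets stored contiguously (9 floats):
 * add the shift to each site and broadcast every shifted component.
 */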
static gmx_inline void
gmx_mm_load_shift_and_3rvec_broadcast_ps(const float * gmx_restrict xyz_shift,
        const float * gmx_restrict xyz,
        __m128 * gmx_restrict x1, __m128 * gmx_restrict y1, __m128 * gmx_restrict z1,
        __m128 * gmx_restrict x2, __m128 * gmx_restrict y2, __m128 * gmx_restrict z2,
        __m128 * gmx_restrict x3, __m128 * gmx_restrict y3, __m128 * gmx_restrict z3)
{
    __m128 tA,tB;
    __m128 t1,t2,t3,t4,t5,t6;

    tA   = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)xyz_shift);
    tB   = _mm_load_ss(xyz_shift+2);

    t1   = _mm_loadu_ps(xyz);
    t2   = _mm_loadu_ps(xyz+4);
    t3   = _mm_load_ss(xyz+8);

    tA   = _mm_movelh_ps(tA,tB);
    t4   = _mm_permute_ps(tA,_MM_SHUFFLE(0,2,1,0));
    t5   = _mm_permute_ps(tA,_MM_SHUFFLE(1,0,2,1));
    t6   = _mm_permute_ps(tA,_MM_SHUFFLE(2,1,0,2));

    t1   = _mm_add_ps(t1,t4);
    t2   = _mm_add_ps(t2,t5);
    t3   = _mm_add_ss(t3,t6);

    *x1  = _mm_permute_ps(t1,_MM_SHUFFLE(0,0,0,0));
    *y1  = _mm_permute_ps(t1,_MM_SHUFFLE(1,1,1,1));
    *z1  = _mm_permute_ps(t1,_MM_SHUFFLE(2,2,2,2));
    *x2  = _mm_permute_ps(t1,_MM_SHUFFLE(3,3,3,3));
    *y2  = _mm_permute_ps(t2,_MM_SHUFFLE(0,0,0,0));
    *z2  = _mm_permute_ps(t2,_MM_SHUFFLE(1,1,1,1));
    *x3  = _mm_permute_ps(t2,_MM_SHUFFLE(2,2,2,2));
    *y3  = _mm_permute_ps(t2,_MM_SHUFFLE(3,3,3,3));
    *z3  = _mm_permute_ps(t3,_MM_SHUFFLE(0,0,0,0));
}

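/* As above, but for four contiguous coordinate triplets (12 floats) */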
static gmx_inline void
gmx_mm_load_shift_and_4rvec_broadcast_ps(const float * gmx_restrict xyz_shift,
        const float * gmx_restrict xyz,
        __m128 * gmx_restrict x1, __m128 * gmx_restrict y1, __m128 * gmx_restrict z1,
        __m128 * gmx_restrict x2, __m128 * gmx_restrict y2, __m128 * gmx_restrict z2,
        __m128 * gmx_restrict x3, __m128 * gmx_restrict y3, __m128 * gmx_restrict z3,
        __m128 * gmx_restrict x4, __m128 * gmx_restrict y4, __m128 * gmx_restrict z4)
{
    __m128 tA,tB;
    __m128 t1,t2,t3,t4,t5,t6;

    tA   = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)xyz_shift);
    tB   = _mm_load_ss(xyz_shift+2);

    t1   = _mm_loadu_ps(xyz);
    t2   = _mm_loadu_ps(xyz+4);
    t3   = _mm_loadu_ps(xyz+8);

    tA   = _mm_movelh_ps(tA,tB);
    t4   = _mm_permute_ps(tA,_MM_SHUFFLE(0,2,1,0));
    t5   = _mm_permute_ps(tA,_MM_SHUFFLE(1,0,2,1));
    t6   = _mm_permute_ps(tA,_MM_SHUFFLE(2,1,0,2));

    t1   = _mm_add_ps(t1,t4);
    t2   = _mm_add_ps(t2,t5);
    t3   = _mm_add_ps(t3,t6);

    *x1  = _mm_permute_ps(t1,_MM_SHUFFLE(0,0,0,0));
    *y1  = _mm_permute_ps(t1,_MM_SHUFFLE(1,1,1,1));
    *z1  = _mm_permute_ps(t1,_MM_SHUFFLE(2,2,2,2));
    *x2  = _mm_permute_ps(t1,_MM_SHUFFLE(3,3,3,3));
    *y2  = _mm_permute_ps(t2,_MM_SHUFFLE(0,0,0,0));
    *z2  = _mm_permute_ps(t2,_MM_SHUFFLE(1,1,1,1));
    *x3  = _mm_permute_ps(t2,_MM_SHUFFLE(2,2,2,2));
    *y3  = _mm_permute_ps(t2,_MM_SHUFFLE(3,3,3,3));
    *z3  = _mm_permute_ps(t3,_MM_SHUFFLE(0,0,0,0));
    *x4  = _mm_permute_ps(t3,_MM_SHUFFLE(1,1,1,1));
    *y4  = _mm_permute_ps(t3,_MM_SHUFFLE(2,2,2,2));
    *z4  = _mm_permute_ps(t3,_MM_SHUFFLE(3,3,3,3));
}

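/* Load one coordinate triplet from each of four pointers and transpose, so that
 * *x1, *y1 and *z1 each hold that component for all four particles.
 */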
static gmx_inline void
gmx_mm_load_1rvec_4ptr_swizzle_ps(const float * gmx_restrict ptrA, const float * gmx_restrict ptrB,
                                  const float * gmx_restrict ptrC, const float * gmx_restrict ptrD,
                                  __m128 * gmx_restrict x1, __m128 * gmx_restrict y1, __m128 * gmx_restrict z1)
{
    __m128 t1,t2,t3,t4;
    __m128i mask = _mm_set_epi32(0,-1,-1,-1);
    t1             = gmx_mm_maskload_ps(ptrA,mask);
    t2             = gmx_mm_maskload_ps(ptrB,mask);
    t3             = gmx_mm_maskload_ps(ptrC,mask);
    t4             = gmx_mm_maskload_ps(ptrD,mask);
    _MM_TRANSPOSE4_PS(t1,t2,t3,t4);
    *x1           = t1;
    *y1           = t2;
    *z1           = t3;
}

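/* Load three coordinate triplets (9 floats) from each of four pointers and
 * transpose them into the per-component registers x1 through z3.
 */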
static gmx_inline void
gmx_mm_load_3rvec_4ptr_swizzle_ps(const float * gmx_restrict ptrA, const float * gmx_restrict ptrB,
                                  const float * gmx_restrict ptrC, const float * gmx_restrict ptrD,
                                  __m128 * gmx_restrict x1, __m128 * gmx_restrict y1, __m128 * gmx_restrict z1,
                                  __m128 * gmx_restrict x2, __m128 * gmx_restrict y2, __m128 * gmx_restrict z2,
                                  __m128 * gmx_restrict x3, __m128 * gmx_restrict y3, __m128 * gmx_restrict z3)
{
    __m128 t1,t2,t3,t4;
    t1            = _mm_loadu_ps(ptrA);
    t2            = _mm_loadu_ps(ptrB);
    t3            = _mm_loadu_ps(ptrC);
    t4            = _mm_loadu_ps(ptrD);
    _MM_TRANSPOSE4_PS(t1,t2,t3,t4);
    *x1           = t1;
    *y1           = t2;
    *z1           = t3;
    *x2           = t4;
    t1            = _mm_loadu_ps(ptrA+4);
    t2            = _mm_loadu_ps(ptrB+4);
    t3            = _mm_loadu_ps(ptrC+4);
    t4            = _mm_loadu_ps(ptrD+4);
    _MM_TRANSPOSE4_PS(t1,t2,t3,t4);
    *y2           = t1;
    *z2           = t2;
    *x3           = t3;
    *y3           = t4;
    t1            = _mm_load_ss(ptrA+8);
    t2            = _mm_load_ss(ptrB+8);
    t3            = _mm_load_ss(ptrC+8);
    t4            = _mm_load_ss(ptrD+8);
    t1            = _mm_unpacklo_ps(t1,t3);
    t3            = _mm_unpacklo_ps(t2,t4);
    *z3           = _mm_unpacklo_ps(t1,t3);
}

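/* Load four coordinate triplets (12 floats) from each of four pointers and
 * transpose them into the per-component registers x1 through z4.
 */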
static gmx_inline void
gmx_mm_load_4rvec_4ptr_swizzle_ps(const float * gmx_restrict ptrA, const float * gmx_restrict ptrB,
                                  const float * gmx_restrict ptrC, const float * gmx_restrict ptrD,
                                  __m128 * gmx_restrict x1, __m128 * gmx_restrict y1, __m128 * gmx_restrict z1,
                                  __m128 * gmx_restrict x2, __m128 * gmx_restrict y2, __m128 * gmx_restrict z2,
                                  __m128 * gmx_restrict x3, __m128 * gmx_restrict y3, __m128 * gmx_restrict z3,
                                  __m128 * gmx_restrict x4, __m128 * gmx_restrict y4, __m128 * gmx_restrict z4)
{
    __m128 t1,t2,t3,t4;
    t1            = _mm_loadu_ps(ptrA);
    t2            = _mm_loadu_ps(ptrB);
    t3            = _mm_loadu_ps(ptrC);
    t4            = _mm_loadu_ps(ptrD);
    _MM_TRANSPOSE4_PS(t1,t2,t3,t4);
    *x1           = t1;
    *y1           = t2;
    *z1           = t3;
    *x2           = t4;
    t1            = _mm_loadu_ps(ptrA+4);
    t2            = _mm_loadu_ps(ptrB+4);
    t3            = _mm_loadu_ps(ptrC+4);
    t4            = _mm_loadu_ps(ptrD+4);
    _MM_TRANSPOSE4_PS(t1,t2,t3,t4);
    *y2           = t1;
    *z2           = t2;
    *x3           = t3;
    *y3           = t4;
    t1            = _mm_loadu_ps(ptrA+8);
    t2            = _mm_loadu_ps(ptrB+8);
    t3            = _mm_loadu_ps(ptrC+8);
    t4            = _mm_loadu_ps(ptrD+8);
    _MM_TRANSPOSE4_PS(t1,t2,t3,t4);
    *z3           = t1;
    *x4           = t2;
    *y4           = t3;
    *z4           = t4;
}

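/* Subtract the per-particle x/y/z values in x1/y1/z1 (one particle per lane)
 * from the three floats stored at each of ptrA, ptrB, ptrC and ptrD.
 */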
static gmx_inline void
gmx_mm_decrement_1rvec_4ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx_restrict ptrB,
                                       float * gmx_restrict ptrC, float * gmx_restrict ptrD,
                                       __m128 x1, __m128 y1, __m128 z1)
{
    __m128 t1,t2,t3,t4,t5,t6,t7,t8,t9,t10;
    t5          = _mm_unpacklo_ps(y1,z1);
    t6          = _mm_unpackhi_ps(y1,z1);
    t7          = _mm_shuffle_ps(x1,t5,_MM_SHUFFLE(1,0,0,0));
    t8          = _mm_shuffle_ps(x1,t5,_MM_SHUFFLE(3,2,0,1));
    t9          = _mm_shuffle_ps(x1,t6,_MM_SHUFFLE(1,0,0,2));
    t10         = _mm_shuffle_ps(x1,t6,_MM_SHUFFLE(3,2,0,3));
    t1          = _mm_load_ss(ptrA);
    t1          = _mm_loadh_pi(t1,(__m64 *)(ptrA+1));
    t1          = _mm_sub_ps(t1,t7);
    _mm_store_ss(ptrA,t1);
    _mm_storeh_pi((__m64 *)(ptrA+1),t1);
    t2          = _mm_load_ss(ptrB);
    t2          = _mm_loadh_pi(t2,(__m64 *)(ptrB+1));
    t2          = _mm_sub_ps(t2,t8);
    _mm_store_ss(ptrB,t2);
    _mm_storeh_pi((__m64 *)(ptrB+1),t2);
    t3          = _mm_load_ss(ptrC);
    t3          = _mm_loadh_pi(t3,(__m64 *)(ptrC+1));
    t3          = _mm_sub_ps(t3,t9);
    _mm_store_ss(ptrC,t3);
    _mm_storeh_pi((__m64 *)(ptrC+1),t3);
    t4          = _mm_load_ss(ptrD);
    t4          = _mm_loadh_pi(t4,(__m64 *)(ptrD+1));
    t4          = _mm_sub_ps(t4,t10);
    _mm_store_ss(ptrD,t4);
    _mm_storeh_pi((__m64 *)(ptrD+1),t4);
}

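/* Subtract the contributions for three interaction sites (x1..z3, one particle
 * per lane) from the nine floats stored at each of ptrA, ptrB, ptrC and ptrD.
 */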
#if defined (_MSC_VER) && defined(_M_IX86)
/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
#define gmx_mm_decrement_3rvec_4ptr_swizzle_ps(ptrA,ptrB,ptrC,ptrD, \
                                               _x1,_y1,_z1,_x2,_y2,_z2,_x3,_y3,_z3) \
{\
    __m128 _t1,_t2,_t3,_t4,_t5,_t6,_t7,_t8,_t9,_t10;\
    __m128 _t11,_t12,_t13,_t14,_t15,_t16,_t17,_t18,_t19;\
    __m128 _t20,_t21,_t22,_t23,_t24,_t25;\
    _t13         = _mm_unpackhi_ps(_x1,_y1);\
    _x1          = _mm_unpacklo_ps(_x1,_y1);\
    _t14         = _mm_unpackhi_ps(_z1,_x2);\
    _z1          = _mm_unpacklo_ps(_z1,_x2);\
    _t15         = _mm_unpackhi_ps(_y2,_z2);\
    _y2          = _mm_unpacklo_ps(_y2,_z2);\
    _t16         = _mm_unpackhi_ps(_x3,_y3);\
    _x3          = _mm_unpacklo_ps(_x3,_y3);\
    _t17         = _mm_permute_ps(_z3,_MM_SHUFFLE(0,0,0,1));\
    _t18         = _mm_movehl_ps(_z3,_z3);\
    _t19         = _mm_permute_ps(_t18,_MM_SHUFFLE(0,0,0,1));\
    _t20         = _mm_movelh_ps(_x1,_z1);\
    _t21         = _mm_movehl_ps(_z1,_x1);\
    _t22         = _mm_movelh_ps(_t13,_t14);\
    _t14         = _mm_movehl_ps(_t14,_t13);\
    _t23         = _mm_movelh_ps(_y2,_x3);\
    _t24         = _mm_movehl_ps(_x3,_y2);\
    _t25         = _mm_movelh_ps(_t15,_t16);\
    _t16         = _mm_movehl_ps(_t16,_t15);\
    _t1          = _mm_loadu_ps(ptrA);\
    _t2          = _mm_loadu_ps(ptrA+4);\
    _t3          = _mm_load_ss(ptrA+8);\
    _t1          = _mm_sub_ps(_t1,_t20);\
    _t2          = _mm_sub_ps(_t2,_t23);\
    _t3          = _mm_sub_ss(_t3,_z3);\
    _mm_storeu_ps(ptrA,_t1);\
    _mm_storeu_ps(ptrA+4,_t2);\
    _mm_store_ss(ptrA+8,_t3);\
    _t4          = _mm_loadu_ps(ptrB);\
    _t5          = _mm_loadu_ps(ptrB+4);\
    _t6          = _mm_load_ss(ptrB+8);\
    _t4          = _mm_sub_ps(_t4,_t21);\
    _t5          = _mm_sub_ps(_t5,_t24);\
    _t6          = _mm_sub_ss(_t6,_t17);\
    _mm_storeu_ps(ptrB,_t4);\
    _mm_storeu_ps(ptrB+4,_t5);\
    _mm_store_ss(ptrB+8,_t6);\
    _t7          = _mm_loadu_ps(ptrC);\
    _t8          = _mm_loadu_ps(ptrC+4);\
    _t9          = _mm_load_ss(ptrC+8);\
    _t7          = _mm_sub_ps(_t7,_t22);\
    _t8          = _mm_sub_ps(_t8,_t25);\
    _t9          = _mm_sub_ss(_t9,_t18);\
    _mm_storeu_ps(ptrC,_t7);\
    _mm_storeu_ps(ptrC+4,_t8);\
    _mm_store_ss(ptrC+8,_t9);\
    _t10         = _mm_loadu_ps(ptrD);\
    _t11         = _mm_loadu_ps(ptrD+4);\
    _t12         = _mm_load_ss(ptrD+8);\
    _t10         = _mm_sub_ps(_t10,_t14);\
    _t11         = _mm_sub_ps(_t11,_t16);\
    _t12         = _mm_sub_ss(_t12,_t19);\
    _mm_storeu_ps(ptrD,_t10);\
    _mm_storeu_ps(ptrD+4,_t11);\
    _mm_store_ss(ptrD+8,_t12);\
}
#else
/* Real function for sane compilers */
static gmx_inline void
gmx_mm_decrement_3rvec_4ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx_restrict ptrB,
                                       float * gmx_restrict ptrC, float * gmx_restrict ptrD,
                                       __m128 x1, __m128 y1, __m128 z1,
                                       __m128 x2, __m128 y2, __m128 z2,
                                       __m128 x3, __m128 y3, __m128 z3)
{
    __m128 t1,t2,t3,t4,t5,t6,t7,t8,t9,t10;
    __m128 t11,t12,t13,t14,t15,t16,t17,t18,t19;
    __m128 t20,t21,t22,t23,t24,t25;
    t13         = _mm_unpackhi_ps(x1,y1);
    x1          = _mm_unpacklo_ps(x1,y1);
    t14         = _mm_unpackhi_ps(z1,x2);
    z1          = _mm_unpacklo_ps(z1,x2);
    t15         = _mm_unpackhi_ps(y2,z2);
    y2          = _mm_unpacklo_ps(y2,z2);
    t16         = _mm_unpackhi_ps(x3,y3);
    x3          = _mm_unpacklo_ps(x3,y3);
    t17         = _mm_permute_ps(z3,_MM_SHUFFLE(0,0,0,1));
    t18         = _mm_movehl_ps(z3,z3);
    t19         = _mm_permute_ps(t18,_MM_SHUFFLE(0,0,0,1));
    t20         = _mm_movelh_ps(x1,z1);
    t21         = _mm_movehl_ps(z1,x1);
    t22         = _mm_movelh_ps(t13,t14);
    t14         = _mm_movehl_ps(t14,t13);
    t23         = _mm_movelh_ps(y2,x3);
    t24         = _mm_movehl_ps(x3,y2);
    t25         = _mm_movelh_ps(t15,t16);
    t16         = _mm_movehl_ps(t16,t15);
    t1          = _mm_loadu_ps(ptrA);
    t2          = _mm_loadu_ps(ptrA+4);
    t3          = _mm_load_ss(ptrA+8);
    t1          = _mm_sub_ps(t1,t20);
    t2          = _mm_sub_ps(t2,t23);
    t3          = _mm_sub_ss(t3,z3);
    _mm_storeu_ps(ptrA,t1);
    _mm_storeu_ps(ptrA+4,t2);
    _mm_store_ss(ptrA+8,t3);
    t4          = _mm_loadu_ps(ptrB);
    t5          = _mm_loadu_ps(ptrB+4);
    t6          = _mm_load_ss(ptrB+8);
    t4          = _mm_sub_ps(t4,t21);
    t5          = _mm_sub_ps(t5,t24);
    t6          = _mm_sub_ss(t6,t17);
    _mm_storeu_ps(ptrB,t4);
    _mm_storeu_ps(ptrB+4,t5);
    _mm_store_ss(ptrB+8,t6);
    t7          = _mm_loadu_ps(ptrC);
    t8          = _mm_loadu_ps(ptrC+4);
    t9          = _mm_load_ss(ptrC+8);
    t7          = _mm_sub_ps(t7,t22);
    t8          = _mm_sub_ps(t8,t25);
    t9          = _mm_sub_ss(t9,t18);
    _mm_storeu_ps(ptrC,t7);
    _mm_storeu_ps(ptrC+4,t8);
    _mm_store_ss(ptrC+8,t9);
    t10         = _mm_loadu_ps(ptrD);
    t11         = _mm_loadu_ps(ptrD+4);
    t12         = _mm_load_ss(ptrD+8);
    t10         = _mm_sub_ps(t10,t14);
    t11         = _mm_sub_ps(t11,t16);
    t12         = _mm_sub_ss(t12,t19);
    _mm_storeu_ps(ptrD,t10);
    _mm_storeu_ps(ptrD+4,t11);
    _mm_store_ss(ptrD+8,t12);
}
#endif

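/* Subtract the contributions for four interaction sites (x1..z4, one particle
 * per lane) from the twelve floats stored at each of ptrA, ptrB, ptrC and ptrD.
 */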
#if defined (_MSC_VER) && defined(_M_IX86)
/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
#define gmx_mm_decrement_4rvec_4ptr_swizzle_ps(ptrA,ptrB,ptrC,ptrD, \
                                               _x1,_y1,_z1,_x2,_y2,_z2,_x3,_y3,_z3,_x4,_y4,_z4) \
{\
    __m128 _t1,_t2,_t3,_t4,_t5,_t6,_t7,_t8,_t9,_t10,_t11;\
    __m128 _t12,_t13,_t14,_t15,_t16,_t17,_t18,_t19,_t20,_t21,_t22;\
    __m128 _t23,_t24;\
    _t13         = _mm_unpackhi_ps(_x1,_y1);\
    _x1          = _mm_unpacklo_ps(_x1,_y1);\
    _t14         = _mm_unpackhi_ps(_z1,_x2);\
    _z1          = _mm_unpacklo_ps(_z1,_x2);\
    _t15         = _mm_unpackhi_ps(_y2,_z2);\
    _y2          = _mm_unpacklo_ps(_y2,_z2);\
    _t16         = _mm_unpackhi_ps(_x3,_y3);\
    _x3          = _mm_unpacklo_ps(_x3,_y3);\
    _t17         = _mm_unpackhi_ps(_z3,_x4);\
    _z3          = _mm_unpacklo_ps(_z3,_x4);\
    _t18         = _mm_unpackhi_ps(_y4,_z4);\
    _y4          = _mm_unpacklo_ps(_y4,_z4);\
    _t19         = _mm_movelh_ps(_x1,_z1);\
    _z1          = _mm_movehl_ps(_z1,_x1);\
    _t20         = _mm_movelh_ps(_t13,_t14);\
    _t14         = _mm_movehl_ps(_t14,_t13);\
    _t21         = _mm_movelh_ps(_y2,_x3);\
    _x3          = _mm_movehl_ps(_x3,_y2);\
    _t22         = _mm_movelh_ps(_t15,_t16);\
    _t16         = _mm_movehl_ps(_t16,_t15);\
    _t23         = _mm_movelh_ps(_z3,_y4);\
    _y4          = _mm_movehl_ps(_y4,_z3);\
    _t24         = _mm_movelh_ps(_t17,_t18);\
    _t18         = _mm_movehl_ps(_t18,_t17);\
    _t1          = _mm_loadu_ps(ptrA);\
    _t2          = _mm_loadu_ps(ptrA+4);\
    _t3          = _mm_loadu_ps(ptrA+8);\
    _t1          = _mm_sub_ps(_t1,_t19);\
    _t2          = _mm_sub_ps(_t2,_t21);\
    _t3          = _mm_sub_ps(_t3,_t23);\
    _mm_storeu_ps(ptrA,_t1);\
    _mm_storeu_ps(ptrA+4,_t2);\
    _mm_storeu_ps(ptrA+8,_t3);\
    _t4          = _mm_loadu_ps(ptrB);\
    _t5          = _mm_loadu_ps(ptrB+4);\
    _t6          = _mm_loadu_ps(ptrB+8);\
    _t4          = _mm_sub_ps(_t4,_z1);\
    _t5          = _mm_sub_ps(_t5,_x3);\
    _t6          = _mm_sub_ps(_t6,_y4);\
    _mm_storeu_ps(ptrB,_t4);\
    _mm_storeu_ps(ptrB+4,_t5);\
    _mm_storeu_ps(ptrB+8,_t6);\
    _t7          = _mm_loadu_ps(ptrC);\
    _t8          = _mm_loadu_ps(ptrC+4);\
    _t9          = _mm_loadu_ps(ptrC+8);\
    _t7          = _mm_sub_ps(_t7,_t20);\
    _t8          = _mm_sub_ps(_t8,_t22);\
    _t9          = _mm_sub_ps(_t9,_t24);\
    _mm_storeu_ps(ptrC,_t7);\
    _mm_storeu_ps(ptrC+4,_t8);\
    _mm_storeu_ps(ptrC+8,_t9);\
    _t10         = _mm_loadu_ps(ptrD);\
    _t11         = _mm_loadu_ps(ptrD+4);\
    _t12         = _mm_loadu_ps(ptrD+8);\
    _t10         = _mm_sub_ps(_t10,_t14);\
    _t11         = _mm_sub_ps(_t11,_t16);\
    _t12         = _mm_sub_ps(_t12,_t18);\
    _mm_storeu_ps(ptrD,_t10);\
    _mm_storeu_ps(ptrD+4,_t11);\
    _mm_storeu_ps(ptrD+8,_t12);\
}
#else
/* Real function for sane compilers */
static gmx_inline void
gmx_mm_decrement_4rvec_4ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx_restrict ptrB,
                                       float * gmx_restrict ptrC, float * gmx_restrict ptrD,
                                       __m128 x1, __m128 y1, __m128 z1,
                                       __m128 x2, __m128 y2, __m128 z2,
                                       __m128 x3, __m128 y3, __m128 z3,
                                       __m128 x4, __m128 y4, __m128 z4)
{
    __m128 t1,t2,t3,t4,t5,t6,t7,t8,t9,t10,t11;
    __m128 t12,t13,t14,t15,t16,t17,t18,t19,t20,t21,t22;
    __m128 t23,t24;
    t13         = _mm_unpackhi_ps(x1,y1);
    x1          = _mm_unpacklo_ps(x1,y1);
    t14         = _mm_unpackhi_ps(z1,x2);
    z1          = _mm_unpacklo_ps(z1,x2);
    t15         = _mm_unpackhi_ps(y2,z2);
    y2          = _mm_unpacklo_ps(y2,z2);
    t16         = _mm_unpackhi_ps(x3,y3);
    x3          = _mm_unpacklo_ps(x3,y3);
    t17         = _mm_unpackhi_ps(z3,x4);
    z3          = _mm_unpacklo_ps(z3,x4);
    t18         = _mm_unpackhi_ps(y4,z4);
    y4          = _mm_unpacklo_ps(y4,z4);
    t19         = _mm_movelh_ps(x1,z1);
    z1          = _mm_movehl_ps(z1,x1);
    t20         = _mm_movelh_ps(t13,t14);
    t14         = _mm_movehl_ps(t14,t13);
    t21         = _mm_movelh_ps(y2,x3);
    x3          = _mm_movehl_ps(x3,y2);
    t22         = _mm_movelh_ps(t15,t16);
    t16         = _mm_movehl_ps(t16,t15);
    t23         = _mm_movelh_ps(z3,y4);
    y4          = _mm_movehl_ps(y4,z3);
    t24         = _mm_movelh_ps(t17,t18);
    t18         = _mm_movehl_ps(t18,t17);
    t1          = _mm_loadu_ps(ptrA);
    t2          = _mm_loadu_ps(ptrA+4);
    t3          = _mm_loadu_ps(ptrA+8);
    t1          = _mm_sub_ps(t1,t19);
    t2          = _mm_sub_ps(t2,t21);
    t3          = _mm_sub_ps(t3,t23);
    _mm_storeu_ps(ptrA,t1);
    _mm_storeu_ps(ptrA+4,t2);
    _mm_storeu_ps(ptrA+8,t3);
    t4          = _mm_loadu_ps(ptrB);
    t5          = _mm_loadu_ps(ptrB+4);
    t6          = _mm_loadu_ps(ptrB+8);
    t4          = _mm_sub_ps(t4,z1);
    t5          = _mm_sub_ps(t5,x3);
    t6          = _mm_sub_ps(t6,y4);
    _mm_storeu_ps(ptrB,t4);
    _mm_storeu_ps(ptrB+4,t5);
    _mm_storeu_ps(ptrB+8,t6);
    t7          = _mm_loadu_ps(ptrC);
    t8          = _mm_loadu_ps(ptrC+4);
    t9          = _mm_loadu_ps(ptrC+8);
    t7          = _mm_sub_ps(t7,t20);
    t8          = _mm_sub_ps(t8,t22);
    t9          = _mm_sub_ps(t9,t24);
    _mm_storeu_ps(ptrC,t7);
    _mm_storeu_ps(ptrC+4,t8);
    _mm_storeu_ps(ptrC+8,t9);
    t10         = _mm_loadu_ps(ptrD);
    t11         = _mm_loadu_ps(ptrD+4);
    t12         = _mm_loadu_ps(ptrD+8);
    t10         = _mm_sub_ps(t10,t14);
    t11         = _mm_sub_ps(t11,t16);
    t12         = _mm_sub_ps(t12,t18);
    _mm_storeu_ps(ptrD,t10);
    _mm_storeu_ps(ptrD+4,t11);
    _mm_storeu_ps(ptrD+8,t12);
}
#endif

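/* Reduce the partial forces in fix1/fiy1/fiz1 across lanes and add the sums to
 * the force at fptr and to the shift force at fshiftptr.
 */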
static gmx_inline void
gmx_mm_update_iforce_1atom_swizzle_ps(__m128 fix1, __m128 fiy1, __m128 fiz1,
                                      float * gmx_restrict fptr,
                                      float * gmx_restrict fshiftptr)
{
    __m128 t2,t3;

    fix1 = _mm_hadd_ps(fix1,fix1);
    fiy1 = _mm_hadd_ps(fiy1,fiz1);

    fix1 = _mm_hadd_ps(fix1,fiy1); /* fiz1 fiy1 fix1 fix1 */

    t2 = _mm_load_ss(fptr);
    t2 = _mm_loadh_pi(t2,(__m64 *)(fptr+1));
    t3 = _mm_load_ss(fshiftptr);
    t3 = _mm_loadh_pi(t3,(__m64 *)(fshiftptr+1));

    t2 = _mm_add_ps(t2,fix1);
    t3 = _mm_add_ps(t3,fix1);

    _mm_store_ss(fptr,t2);
    _mm_storeh_pi((__m64 *)(fptr+1),t2);
    _mm_store_ss(fshiftptr,t3);
    _mm_storeh_pi((__m64 *)(fshiftptr+1),t3);
}

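/* Reduce the partial forces for three i-atoms across lanes, add them to the nine
 * floats at fptr, and accumulate the summed force into the shift force at fshiftptr.
 */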
#if defined (_MSC_VER) && defined(_M_IX86)
/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
#define gmx_mm_update_iforce_3atom_swizzle_ps(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3, \
                                              fptr,fshiftptr) \
{\
    __m128 _t1,_t2,_t3,_t4;\
\
    fix1 = _mm_hadd_ps(fix1,fiy1);\
    fiz1 = _mm_hadd_ps(fiz1,fix2);\
    fiy2 = _mm_hadd_ps(fiy2,fiz2);\
    fix3 = _mm_hadd_ps(fix3,fiy3);\
    fiz3 = _mm_hadd_ps(fiz3,fiz3);\
    fix1 = _mm_hadd_ps(fix1,fiz1);\
    fiy2 = _mm_hadd_ps(fiy2,fix3);\
    fiz3 = _mm_hadd_ps(fiz3,fiz3);\
    _mm_storeu_ps(fptr,  _mm_add_ps(fix1,_mm_loadu_ps(fptr)  ));\
    _mm_storeu_ps(fptr+4,_mm_add_ps(fiy2,_mm_loadu_ps(fptr+4)));\
    _mm_store_ss (fptr+8,_mm_add_ss(fiz3,_mm_load_ss(fptr+8) ));\
    _t4 = _mm_load_ss(fshiftptr+2);\
    _t4 = _mm_loadh_pi(_t4,(__m64 *)(fshiftptr));\
    _t1 = _mm_shuffle_ps(fiz3,fix1,_MM_SHUFFLE(1,0,0,0));\
    _t2 = _mm_shuffle_ps(fix1,fiy2,_MM_SHUFFLE(3,2,2,2));\
    _t3 = _mm_shuffle_ps(fiy2,fix1,_MM_SHUFFLE(3,3,0,1));\
    _t3 = _mm_permute_ps(_t3  ,_MM_SHUFFLE(1,2,0,0));\
    _t1 = _mm_add_ps(_t1,_t2);\
    _t3 = _mm_add_ps(_t3,_t4);\
    _t1 = _mm_add_ps(_t1,_t3);\
    _mm_store_ss(fshiftptr+2,_t1);\
    _mm_storeh_pi((__m64 *)(fshiftptr),_t1);\
}
#else
/* Real function for sane compilers */
static gmx_inline void
gmx_mm_update_iforce_3atom_swizzle_ps(__m128 fix1, __m128 fiy1, __m128 fiz1,
                                      __m128 fix2, __m128 fiy2, __m128 fiz2,
                                      __m128 fix3, __m128 fiy3, __m128 fiz3,
                                      float * gmx_restrict fptr,
                                      float * gmx_restrict fshiftptr)
{
    __m128 t1,t2,t3,t4;

    fix1 = _mm_hadd_ps(fix1,fiy1);
    fiz1 = _mm_hadd_ps(fiz1,fix2);
    fiy2 = _mm_hadd_ps(fiy2,fiz2);
    fix3 = _mm_hadd_ps(fix3,fiy3);
    fiz3 = _mm_hadd_ps(fiz3,fiz3);

    fix1 = _mm_hadd_ps(fix1,fiz1); /* fix2 fiz1 fiy1 fix1 */
    fiy2 = _mm_hadd_ps(fiy2,fix3); /* fiy3 fix3 fiz2 fiy2 */
    fiz3 = _mm_hadd_ps(fiz3,fiz3); /*  -    -    -   fiz3 */

    _mm_storeu_ps(fptr,  _mm_add_ps(fix1,_mm_loadu_ps(fptr)  ));
    _mm_storeu_ps(fptr+4,_mm_add_ps(fiy2,_mm_loadu_ps(fptr+4)));
    _mm_store_ss (fptr+8,_mm_add_ss(fiz3,_mm_load_ss(fptr+8) ));

    t4 = _mm_load_ss(fshiftptr+2);
    t4 = _mm_loadh_pi(t4,(__m64 *)(fshiftptr));

    t1 = _mm_shuffle_ps(fiz3,fix1,_MM_SHUFFLE(1,0,0,0));   /* fiy1 fix1  -   fiz3 */
    t2 = _mm_shuffle_ps(fix1,fiy2,_MM_SHUFFLE(3,2,2,2));   /* fiy3 fix3  -   fiz1 */
    t3 = _mm_shuffle_ps(fiy2,fix1,_MM_SHUFFLE(3,3,0,1));   /* fix2 fix2 fiy2 fiz2 */
    t3 = _mm_permute_ps(t3  ,_MM_SHUFFLE(1,2,0,0));        /* fiy2 fix2  -   fiz2 */

    t1 = _mm_add_ps(t1,t2);
    t3 = _mm_add_ps(t3,t4);
    t1 = _mm_add_ps(t1,t3); /* y x - z */

    _mm_store_ss(fshiftptr+2,t1);
    _mm_storeh_pi((__m64 *)(fshiftptr),t1);
}
#endif

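/* Reduce the partial forces for four i-atoms across lanes, add them to the twelve
 * floats at fptr, and accumulate the summed force into the shift force at fshiftptr.
 */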
#if defined (_MSC_VER) && defined(_M_IX86)
/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
#define gmx_mm_update_iforce_4atom_swizzle_ps(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,fix4,fiy4,fiz4, \
                                              fptr,fshiftptr) \
{\
    __m128 _t1,_t2,_t3,_t4,_t5;\
\
    fix1 = _mm_hadd_ps(fix1,fiy1);\
    fiz1 = _mm_hadd_ps(fiz1,fix2);\
    fiy2 = _mm_hadd_ps(fiy2,fiz2);\
    fix3 = _mm_hadd_ps(fix3,fiy3);\
    fiz3 = _mm_hadd_ps(fiz3,fix4);\
    fiy4 = _mm_hadd_ps(fiy4,fiz4);\
    fix1 = _mm_hadd_ps(fix1,fiz1);\
    fiy2 = _mm_hadd_ps(fiy2,fix3);\
    fiz3 = _mm_hadd_ps(fiz3,fiy4);\
    _mm_storeu_ps(fptr,  _mm_add_ps(fix1,_mm_loadu_ps(fptr)  ));\
    _mm_storeu_ps(fptr+4,_mm_add_ps(fiy2,_mm_loadu_ps(fptr+4)));\
    _mm_storeu_ps(fptr+8,_mm_add_ps(fiz3,_mm_loadu_ps(fptr+8)));\
    _t5 = _mm_load_ss(fshiftptr+2);\
    _t5 = _mm_loadh_pi(_t5,(__m64 *)(fshiftptr));\
    _t1 = _mm_permute_ps(fix1,_MM_SHUFFLE(1,0,2,2));\
    _t2 = _mm_permute_ps(fiy2,_MM_SHUFFLE(3,2,1,1));\
    _t3 = _mm_permute_ps(fiz3,_MM_SHUFFLE(2,1,0,0));\
    _t4 = _mm_shuffle_ps(fix1,fiy2,_MM_SHUFFLE(0,0,3,3));\
    _t4 = _mm_shuffle_ps(fiz3,_t4  ,_MM_SHUFFLE(2,0,3,3));\
    _t1 = _mm_add_ps(_t1,_t2);\
    _t3 = _mm_add_ps(_t3,_t4);\
    _t1 = _mm_add_ps(_t1,_t3);\
    _t5 = _mm_add_ps(_t5,_t1);\
    _mm_store_ss(fshiftptr+2,_t5);\
    _mm_storeh_pi((__m64 *)(fshiftptr),_t5);\
}
#else
/* Real function for sane compilers */
static gmx_inline void
gmx_mm_update_iforce_4atom_swizzle_ps(__m128 fix1, __m128 fiy1, __m128 fiz1,
                                      __m128 fix2, __m128 fiy2, __m128 fiz2,
                                      __m128 fix3, __m128 fiy3, __m128 fiz3,
                                      __m128 fix4, __m128 fiy4, __m128 fiz4,
                                      float * gmx_restrict fptr,
                                      float * gmx_restrict fshiftptr)
{
    __m128 t1,t2,t3,t4,t5;

    fix1 = _mm_hadd_ps(fix1,fiy1);
    fiz1 = _mm_hadd_ps(fiz1,fix2);
    fiy2 = _mm_hadd_ps(fiy2,fiz2);
    fix3 = _mm_hadd_ps(fix3,fiy3);
    fiz3 = _mm_hadd_ps(fiz3,fix4);
    fiy4 = _mm_hadd_ps(fiy4,fiz4);

    fix1 = _mm_hadd_ps(fix1,fiz1); /* fix2 fiz1 fiy1 fix1 */
    fiy2 = _mm_hadd_ps(fiy2,fix3); /* fiy3 fix3 fiz2 fiy2 */
    fiz3 = _mm_hadd_ps(fiz3,fiy4); /* fiz4 fiy4 fix4 fiz3 */

    _mm_storeu_ps(fptr,  _mm_add_ps(fix1,_mm_loadu_ps(fptr)  ));
    _mm_storeu_ps(fptr+4,_mm_add_ps(fiy2,_mm_loadu_ps(fptr+4)));
    _mm_storeu_ps(fptr+8,_mm_add_ps(fiz3,_mm_loadu_ps(fptr+8)));

    t5 = _mm_load_ss(fshiftptr+2);
    t5 = _mm_loadh_pi(t5,(__m64 *)(fshiftptr));

    t1 = _mm_permute_ps(fix1,_MM_SHUFFLE(1,0,2,2));
    t2 = _mm_permute_ps(fiy2,_MM_SHUFFLE(3,2,1,1));
    t3 = _mm_permute_ps(fiz3,_MM_SHUFFLE(2,1,0,0));
    t4 = _mm_shuffle_ps(fix1,fiy2,_MM_SHUFFLE(0,0,3,3));
    t4 = _mm_shuffle_ps(fiz3,t4  ,_MM_SHUFFLE(2,0,3,3));

    t1 = _mm_add_ps(t1,t2);
    t3 = _mm_add_ps(t3,t4);
    t1 = _mm_add_ps(t1,t3);
    t5 = _mm_add_ps(t5,t1);

    _mm_store_ss(fshiftptr+2,t5);
    _mm_storeh_pi((__m64 *)(fshiftptr),t5);
}
#endif

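/* Reduce the four partial values in pot1 and add the sum to the float at ptrA */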
static gmx_inline void
gmx_mm_update_1pot_ps(__m128 pot1, float * gmx_restrict ptrA)
{
    pot1 = _mm_hadd_ps(pot1,pot1);
    pot1 = _mm_hadd_ps(pot1,pot1);
    _mm_store_ss(ptrA,_mm_add_ss(pot1,_mm_load_ss(ptrA)));
}

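/* Reduce the partial values in pot1 and pot2 and add the sums to the floats at
 * ptrA and ptrB, respectively.
 */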
static gmx_inline void
gmx_mm_update_2pot_ps(__m128 pot1, float * gmx_restrict ptrA,
                      __m128 pot2, float * gmx_restrict ptrB)
{
    pot1 = _mm_hadd_ps(pot1,pot2);
    pot1 = _mm_hadd_ps(pot1,pot1);
    pot2 = _mm_permute_ps(pot1,_MM_SHUFFLE(0,0,0,1));
    _mm_store_ss(ptrA,_mm_add_ss(pot1,_mm_load_ss(ptrA)));
    _mm_store_ss(ptrB,_mm_add_ss(pot2,_mm_load_ss(ptrB)));
}


#endif /* _kernelutil_x86_avx_128_fma_single_h_ */