#include <xmmintrin.h>
#include <emmintrin.h>
+#include <gmx_sse2_single.h>
/* get gmx_gbdata_t */
#include "../nb_kerneltype.h"
/*
 * Refined packed single-precision reciprocal square root.
 *
 * For each of the four float lanes of x, takes the _mm_rsqrt_ps()
 * estimate t1 and returns 0.5 * t1 * (3 - x * t1 * t1), i.e. one
 * Newton-Raphson step towards 1/sqrt(x).
 *
 * NOTE(review): this block is shown in unified-diff form. The '-' lines
 * are the old code, which used explicit (__m128) / (const __m128) casts;
 * the '+' lines replace them with plain brace initializers and
 * gmx_mm_castps_ps128(), presumably a cast helper provided by
 * gmx_sse2_single.h -- confirm against that header.
 */
static inline __m128
my_invrsq_ps(__m128 x)
{
- const __m128 three = (const __m128) {3.0f, 3.0f, 3.0f, 3.0f};
- const __m128 half = (const __m128) {0.5f, 0.5f, 0.5f, 0.5f};
+ const __m128 three = {3.0f, 3.0f, 3.0f, 3.0f};
+ const __m128 half = {0.5f, 0.5f, 0.5f, 0.5f};
/* t1 is the low-precision hardware estimate of 1/sqrt(x); it is refined below. */
__m128 t1 = _mm_rsqrt_ps(x);
- return (__m128) _mm_mul_ps(half,_mm_mul_ps(t1,_mm_sub_ps(three,_mm_mul_ps(x,_mm_mul_ps(t1,t1)))));
+ return gmx_mm_castps_ps128(_mm_mul_ps(half,_mm_mul_ps(t1,_mm_sub_ps(three,_mm_mul_ps(x,_mm_mul_ps(t1,t1))))));
}
void nb_kernel430_sse2_single(int * p_nri,
mask = _mm_set_epi32(0,0xffffffff,0xffffffff,0xffffffff);
}
- jx = _mm_and_ps( (__m128) mask, xmm6);
- jy = _mm_and_ps( (__m128) mask, xmm4);
- jz = _mm_and_ps( (__m128) mask, xmm5);
+ jx = _mm_and_ps( gmx_mm_castsi128_ps(mask), xmm6);
+ jy = _mm_and_ps( gmx_mm_castsi128_ps(mask), xmm4);
+ jz = _mm_and_ps( gmx_mm_castsi128_ps(mask), xmm5);
- c6 = _mm_and_ps( (__m128) mask, c6);
- c12 = _mm_and_ps( (__m128) mask, c12);
- dvdaj = _mm_and_ps( (__m128) mask, dvdaj);
- isaj = _mm_and_ps( (__m128) mask, isaj);
- q = _mm_and_ps( (__m128) mask, q);
+ c6 = _mm_and_ps( gmx_mm_castsi128_ps(mask), c6);
+ c12 = _mm_and_ps( gmx_mm_castsi128_ps(mask), c12);
+ dvdaj = _mm_and_ps( gmx_mm_castsi128_ps(mask), dvdaj);
+ isaj = _mm_and_ps( gmx_mm_castsi128_ps(mask), isaj);
+ q = _mm_and_ps( gmx_mm_castsi128_ps(mask), q);
dx1 = _mm_sub_ps(ix,jx);
dy1 = _mm_sub_ps(iy,jy);
xmm1 = _mm_mul_ps(xmm1,isaj);
dvdaj = _mm_add_ps(dvdaj,xmm1);
- vcoul = _mm_and_ps( (__m128) mask, vcoul);
- vgb = _mm_and_ps( (__m128) mask, vgb);
+ vcoul = _mm_and_ps( gmx_mm_castsi128_ps(mask), vcoul);
+ vgb = _mm_and_ps( gmx_mm_castsi128_ps(mask), vgb);
vctot = _mm_add_ps(vctot,vcoul);
vgbtot = _mm_add_ps(vgbtot,vgb);
_mm_store_ss(faction+j33+2,xmm7);
}
- t1 = _mm_and_ps( (__m128) mask, t1);
- t2 = _mm_and_ps( (__m128) mask, t2);
- t3 = _mm_and_ps( (__m128) mask, t3);
+ t1 = _mm_and_ps( gmx_mm_castsi128_ps(mask), t1);
+ t2 = _mm_and_ps( gmx_mm_castsi128_ps(mask), t2);
+ t3 = _mm_and_ps( gmx_mm_castsi128_ps(mask), t3);
fix = _mm_add_ps(fix,t1);
fiy = _mm_add_ps(fiy,t2);
xmm2 = _mm_unpacklo_ps(fix,fiy); /* fx, fy, - - */
xmm2 = _mm_movelh_ps(xmm2,fiz);
- xmm2 = _mm_and_ps( (__m128) maski, xmm2);
+ xmm2 = _mm_and_ps( gmx_mm_castsi128_ps(maski), xmm2);
/* load i force from memory */
xmm4 = _mm_loadl_pi(xmm4, (__m64 *) (faction+ii3));