Fixes for Windows/ICC--typecasting
author Peter Kasson <kasson@Macintosh-175.local>
Sat, 17 Apr 2010 23:52:20 +0000 (16:52 -0700)
committer Peter Kasson <kasson@Macintosh-175.local>
Sat, 17 Apr 2010 23:52:20 +0000 (16:52 -0700)
include/gmx_sse2_single.h
src/gmxlib/nonbonded/nb_kernel_ia32_sse/nb_kernel430_ia32_sse.c

index 0f1d1b10147d6eee5806fbd3177d32095defcf14..88dbd95b62350c58002b2bff5495fe29546d5a19 100644 (file)
 #if (defined (_MSC_VER) || defined(__INTEL_COMPILER))
 #  define gmx_mm_castsi128_ps(a) _mm_castsi128_ps(a)
 #  define gmx_mm_castps_si128(a) _mm_castps_si128(a)
+#  define gmx_mm_castps_ps128(a) (a)
 #elif defined(__GNUC__)
 #  define gmx_mm_castsi128_ps(a) ((__m128)(a))
 #  define gmx_mm_castps_si128(a) ((__m128i)(a))
+#  define gmx_mm_castps_ps128(a) ((__m128)(a))
 #else
 static __m128  gmx_mm_castsi128_ps(__m128i a) { return *(__m128 *) &a;  } 
 static __m128i gmx_mm_castps_si128(__m128 a)  { return *(__m128i *) &a; } 
+static __m128  gmx_mm_castps_ps128(__m128 a) { return *(__m128 *) &a;  } 
 #endif
 
 
index 23ee89e755ec62c012379201a9439d6b88e1acfc..511c5a7c1415e7c6265328a35dae4e1c14c07430 100644 (file)
@@ -21,7 +21,7 @@
 
 #include <xmmintrin.h>
 #include <emmintrin.h>
-
+#include <gmx_sse2_single.h>
 
 /* get gmx_gbdata_t */
 #include "../nb_kerneltype.h"
@@ -35,12 +35,12 @@ _mm_cvtsi128_si32(_mm_srli_si128((x), 4 * (imm)))
 static inline __m128
 my_invrsq_ps(__m128 x)
 {
-       const __m128 three = (const __m128) {3.0f, 3.0f, 3.0f, 3.0f};
-       const __m128 half  = (const __m128) {0.5f, 0.5f, 0.5f, 0.5f};
+       const __m128 three = {3.0f, 3.0f, 3.0f, 3.0f};
+       const __m128 half  = {0.5f, 0.5f, 0.5f, 0.5f};
        
        __m128 t1 = _mm_rsqrt_ps(x);
        
-       return (__m128) _mm_mul_ps(half,_mm_mul_ps(t1,_mm_sub_ps(three,_mm_mul_ps(x,_mm_mul_ps(t1,t1)))));
+       return gmx_mm_castps_ps128(_mm_mul_ps(half,_mm_mul_ps(t1,_mm_sub_ps(three,_mm_mul_ps(x,_mm_mul_ps(t1,t1))))));
 }
 
 void nb_kernel430_ia32_sse(int *           p_nri,
@@ -624,15 +624,15 @@ void nb_kernel430_ia32_sse(int *           p_nri,
                                mask  = _mm_set_epi32(0,0xffffffff,0xffffffff,0xffffffff);
                        }
                        
-                       jx      = _mm_and_ps( (__m128) mask, xmm6);
-                       jy      = _mm_and_ps( (__m128) mask, xmm4);
-                       jz      = _mm_and_ps( (__m128) mask, xmm5);
+                       jx      = _mm_and_ps( gmx_mm_castsi128_ps(mask), xmm6);
+                       jy      = _mm_and_ps( gmx_mm_castsi128_ps(mask), xmm4);
+                       jz      = _mm_and_ps( gmx_mm_castsi128_ps(mask), xmm5);
                        
-                       c6      = _mm_and_ps( (__m128) mask, c6);
-                       c12     = _mm_and_ps( (__m128) mask, c12);
-                       dvdaj   = _mm_and_ps( (__m128) mask, dvdaj);
-                       isaj    = _mm_and_ps( (__m128) mask, isaj);                     
-                       q       = _mm_and_ps( (__m128) mask, q);
+                       c6      = _mm_and_ps( gmx_mm_castsi128_ps(mask), c6);
+                       c12     = _mm_and_ps( gmx_mm_castsi128_ps(mask), c12);
+                       dvdaj   = _mm_and_ps( gmx_mm_castsi128_ps(mask), dvdaj);
+                       isaj    = _mm_and_ps( gmx_mm_castsi128_ps(mask), isaj);                 
+                       q       = _mm_and_ps( gmx_mm_castsi128_ps(mask), q);
                        
                        dx1     = _mm_sub_ps(ix,jx);
                        dy1     = _mm_sub_ps(iy,jy);
@@ -710,8 +710,8 @@ void nb_kernel430_ia32_sse(int *           p_nri,
                        xmm1    = _mm_mul_ps(xmm1,isaj);
                        dvdaj   = _mm_add_ps(dvdaj,xmm1);
                        
-                       vcoul   = _mm_and_ps( (__m128) mask, vcoul);
-                       vgb     = _mm_and_ps( (__m128) mask, vgb);
+                       vcoul   = _mm_and_ps( gmx_mm_castsi128_ps(mask), vcoul);
+                       vgb     = _mm_and_ps( gmx_mm_castsi128_ps(mask), vgb);
                        
                        vctot   = _mm_add_ps(vctot,vcoul);
                        vgbtot  = _mm_add_ps(vgbtot,vgb);
@@ -897,9 +897,9 @@ void nb_kernel430_ia32_sse(int *           p_nri,
                                _mm_store_ss(faction+j33+2,xmm7); 
                        }
                        
-                       t1 = _mm_and_ps( (__m128) mask, t1);
-                       t2 = _mm_and_ps( (__m128) mask, t2);
-                       t3 = _mm_and_ps( (__m128) mask, t3);
+                       t1 = _mm_and_ps( gmx_mm_castsi128_ps(mask), t1);
+                       t2 = _mm_and_ps( gmx_mm_castsi128_ps(mask), t2);
+                       t3 = _mm_and_ps( gmx_mm_castsi128_ps(mask), t3);
                        
                        fix = _mm_add_ps(fix,t1);
                        fiy = _mm_add_ps(fiy,t2);
@@ -924,7 +924,7 @@ void nb_kernel430_ia32_sse(int *           p_nri,
                
                xmm2    = _mm_unpacklo_ps(fix,fiy); /* fx, fy, - - */
                xmm2    = _mm_movelh_ps(xmm2,fiz); 
-               xmm2    = _mm_and_ps( (__m128) maski, xmm2);
+               xmm2    = _mm_and_ps( gmx_mm_castsi128_ps(maski), xmm2);
                
                /* load, add and store i forces */
                xmm4    = _mm_loadl_pi(xmm4, (__m64 *) (faction+ii3));