Merge release-4-6 into master

[alexxy/gromacs.git] / src / gromacs / gmxlib / nonbonded / nb_kernel_sse2_single / kernelutil_x86_sse2_single.h
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_sse2_single/kernelutil_x86_sse2_single.h b/src/gromacs/gmxlib/nonbonded/nb_kernel_sse2_single/kernelutil_x86_sse2_single.h

index 656538c11339ac641b438eed6c3b01194f3aad19..e0b324f582de5960fbdd5e4d624954808e9edde2 100644 (file)
--- a/src/gromacs/gmxlib/nonbonded/nb_kernel_sse2_single/kernelutil_x86_sse2_single.h
+++ b/src/gromacs/gmxlib/nonbonded/nb_kernel_sse2_single/kernelutil_x86_sse2_single.h
@@ -109,10 +109,10 @@ gmx_mm_load_4pair_swizzle_ps(const float * gmx_restrict p1,
  {
      __m128 t1,t2,t3,t4;
  
-    t1   = _mm_castpd_ps(_mm_load_sd((const double *)p1));
-    t2   = _mm_castpd_ps(_mm_load_sd((const double *)p2));
-    t3   = _mm_castpd_ps(_mm_load_sd((const double *)p3));
-    t4   = _mm_castpd_ps(_mm_load_sd((const double *)p4));
+    t1   = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)p1);   /* - - c12a  c6a */
+    t2   = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)p2);   /* - - c12b  c6b */
+    t3   = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)p3);   /* - - c12c  c6c */
+    t4   = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)p4);   /* - - c12d  c6d */
      t1   = _mm_unpacklo_ps(t1,t2);
      t2   = _mm_unpacklo_ps(t3,t4);
      *c6  = _mm_movelh_ps(t1,t2);
@@ -135,8 +135,8 @@ gmx_mm_load_shift_and_1rvec_broadcast_ps(const float * gmx_restrict xyz_shift,
  {
      __m128 t1,t2,t3,t4;
      
-    t1   = _mm_castpd_ps(_mm_load_sd((const double *)xyz_shift));
-    t2   = _mm_castpd_ps(_mm_load_sd((const double *)xyz));
+    t1   = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)xyz_shift);
+    t2   = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)xyz);
      t3   = _mm_load_ss(xyz_shift+2);
      t4   = _mm_load_ss(xyz+2);
      t1   = _mm_add_ps(t1,t2);
@@ -158,7 +158,7 @@ gmx_mm_load_shift_and_3rvec_broadcast_ps(const float * gmx_restrict xyz_shift,
      __m128 tA,tB;
      __m128 t1,t2,t3,t4,t5,t6;
      
-    tA   = _mm_castpd_ps(_mm_load_sd((const double *)xyz_shift));
+    tA   = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)xyz_shift);
      tB   = _mm_load_ss(xyz_shift+2);
      
      t1   = _mm_loadu_ps(xyz);
@@ -543,44 +543,6 @@ gmx_mm_update_iforce_1atom_swizzle_ps(__m128 fix1, __m128 fiy1, __m128 fiz1,
      _mm_storeh_pi((__m64 *)(fshiftptr+1),t3);
  }
  
-static gmx_inline void
-gmx_mm_update_iforce_2atom_swizzle_ps(__m128 fix1, __m128 fiy1, __m128 fiz1,
-                                      __m128 fix2, __m128 fiy2, __m128 fiz2,
-                                      float * gmx_restrict fptr,
-                                      float * gmx_restrict fshiftptr)
-{
-    __m128 t1,t2,t4;
-
-    /* transpose data */
-    _MM_TRANSPOSE4_PS(fix1,fiy1,fiz1,fix2);
-    t1 = _mm_unpacklo_ps(fiy2,fiz2);
-    t2 = _mm_unpackhi_ps(fiy2,fiz2);
-
-    fix1 = _mm_add_ps(_mm_add_ps(fix1,fiy1), _mm_add_ps(fiz1,fix2));
-    t1   = _mm_add_ps(t1,t2);
-    t2   = _mm_movehl_ps(t2,t1);
-    fiy2 = _mm_add_ps(t1,t2);
-
-    _mm_storeu_ps(fptr,   _mm_add_ps(fix1,_mm_loadu_ps(fptr)  ));
-    t1 = _mm_loadl_pi(t1,(__m64 *)(fptr+4));
-    _mm_storel_pi((__m64 *)(fptr+4), _mm_add_ps(fiy2,t1));
-
-    t4 = _mm_load_ss(fshiftptr+2);
-    t4 = _mm_loadh_pi(t4,(__m64 *)(fshiftptr));
-
-    t1 = _mm_shuffle_ps(fix1,fiy2,_MM_SHUFFLE(0,0,3,2));   /* fiy2  -   fix2 fiz1 */
-    t1 = _mm_shuffle_ps(t1,t1,_MM_SHUFFLE(3,1,0,0));       /* fiy2 fix2  -   fiz1 */
-    t2 = _mm_shuffle_ps(fiy2,fix1,_MM_SHUFFLE(1,0,0,1));   /* fiy1 fix1  -   fiz2 */
-
-    t1 = _mm_add_ps(t1,t2);
-    t1 = _mm_add_ps(t1,t4); /* y x - z */
-
-    _mm_store_ss(fshiftptr+2,t1);
-    _mm_storeh_pi((__m64 *)(fshiftptr),t1);
-}
-
-
-
  static gmx_inline void
  gmx_mm_update_iforce_3atom_swizzle_ps(__m128 fix1, __m128 fiy1, __m128 fiz1,
                                        __m128 fix2, __m128 fiy2, __m128 fiz2,