Redesigned SIMD module and unit tests.

[alexxy/gromacs.git] / src / gromacs / mdlib / nbnxn_kernels / simd_4xn / nbnxn_kernel_simd_4xn_inner.h
diff --git a/src/gromacs/mdlib/nbnxn_kernels/simd_4xn/nbnxn_kernel_simd_4xn_inner.h b/src/gromacs/mdlib/nbnxn_kernels/simd_4xn/nbnxn_kernel_simd_4xn_inner.h

index 758f8cd4f8d7ea5721a43bca75b37623d09d8934..4b7de1678fae416dfdf61dc275fa3f199342375f 100644 (file)
--- a/src/gromacs/mdlib/nbnxn_kernels/simd_4xn/nbnxn_kernel_simd_4xn_inner.h
+++ b/src/gromacs/mdlib/nbnxn_kernels/simd_4xn/nbnxn_kernel_simd_4xn_inner.h
@@ -420,10 +420,10 @@
  
  #ifdef CHECK_EXCLS
      /* For excluded pairs add a small number to avoid r^-6 = NaN */
-    rsq_S0      = gmx_masknot_add_pr(interact_S0, rsq_S0, avoid_sing_S);
-    rsq_S1      = gmx_masknot_add_pr(interact_S1, rsq_S1, avoid_sing_S);
-    rsq_S2      = gmx_masknot_add_pr(interact_S2, rsq_S2, avoid_sing_S);
-    rsq_S3      = gmx_masknot_add_pr(interact_S3, rsq_S3, avoid_sing_S);
+    rsq_S0      = gmx_simd_add_r(rsq_S0, gmx_simd_blendv_r(avoid_sing_S, gmx_simd_setzero_r(), interact_S0));
+    rsq_S1      = gmx_simd_add_r(rsq_S1, gmx_simd_blendv_r(avoid_sing_S, gmx_simd_setzero_r(), interact_S1));
+    rsq_S2      = gmx_simd_add_r(rsq_S2, gmx_simd_blendv_r(avoid_sing_S, gmx_simd_setzero_r(), interact_S2));
+    rsq_S3      = gmx_simd_add_r(rsq_S3, gmx_simd_blendv_r(avoid_sing_S, gmx_simd_setzero_r(), interact_S3));
  #endif
  
      /* Calculate 1/r */
@@ -433,8 +433,8 @@
      rinv_S2     = gmx_simd_invsqrt_r(rsq_S2);
      rinv_S3     = gmx_simd_invsqrt_r(rsq_S3);
  #else
-    gmx_mm_invsqrt2_pd(rsq_S0, rsq_S1, &rinv_S0, &rinv_S1);
-    gmx_mm_invsqrt2_pd(rsq_S2, rsq_S3, &rinv_S2, &rinv_S3);
+    gmx_simd_invsqrt_pair_r(rsq_S0, rsq_S1, &rinv_S0, &rinv_S1);
+    gmx_simd_invsqrt_pair_r(rsq_S2, rsq_S3, &rinv_S2, &rinv_S3);
  #endif
  
  #ifdef CALC_COULOMB
@@ -596,12 +596,12 @@
      ti_S1       = gmx_simd_cvtt_r2i(rs_S1);
      ti_S2       = gmx_simd_cvtt_r2i(rs_S2);
      ti_S3       = gmx_simd_cvtt_r2i(rs_S3);
-#ifdef GMX_SIMD_HAVE_FLOOR
-    /* SSE4.1 floor is faster than gmx_cvtepi32_ps int->float cast */
-    rf_S0       = gmx_simd_floor_r(rs_S0);
-    rf_S1       = gmx_simd_floor_r(rs_S1);
-    rf_S2       = gmx_simd_floor_r(rs_S2);
-    rf_S3       = gmx_simd_floor_r(rs_S3);
+#ifdef GMX_SIMD_HAVE_TRUNC
+    /* SSE4.1 trunc is faster than gmx_cvtepi32_ps int->float cast */
+    rf_S0       = gmx_simd_trunc_r(rs_S0);
+    rf_S1       = gmx_simd_trunc_r(rs_S1);
+    rf_S2       = gmx_simd_trunc_r(rs_S2);
+    rf_S3       = gmx_simd_trunc_r(rs_S3);
  #else
      rf_S0       = gmx_simd_cvt_i2r(ti_S0);
      rf_S1       = gmx_simd_cvt_i2r(ti_S1);