introduced general 4-wide SIMD support
[alexxy/gromacs.git] / src / mdlib / nbnxn_search_simd_4xn.h
index a7a19579bb85e5f90c6874d6a22e685d4bd719a5..9d5c638c438e036938f30b8ccb6f7e2b91ec0c9b 100644 (file)
@@ -58,46 +58,20 @@ icell_set_x_simd_4xn(int ci,
 
     ia = X_IND_CI_SIMD_4XN(ci);
 
-    x_ci->ix_SSE0 = gmx_set1_pr(x[ia + 0*STRIDE_S    ] + shx);
-    x_ci->iy_SSE0 = gmx_set1_pr(x[ia + 1*STRIDE_S    ] + shy);
-    x_ci->iz_SSE0 = gmx_set1_pr(x[ia + 2*STRIDE_S    ] + shz);
-    x_ci->ix_SSE1 = gmx_set1_pr(x[ia + 0*STRIDE_S + 1] + shx);
-    x_ci->iy_SSE1 = gmx_set1_pr(x[ia + 1*STRIDE_S + 1] + shy);
-    x_ci->iz_SSE1 = gmx_set1_pr(x[ia + 2*STRIDE_S + 1] + shz);
-    x_ci->ix_SSE2 = gmx_set1_pr(x[ia + 0*STRIDE_S + 2] + shx);
-    x_ci->iy_SSE2 = gmx_set1_pr(x[ia + 1*STRIDE_S + 2] + shy);
-    x_ci->iz_SSE2 = gmx_set1_pr(x[ia + 2*STRIDE_S + 2] + shz);
-    x_ci->ix_SSE3 = gmx_set1_pr(x[ia + 0*STRIDE_S + 3] + shx);
-    x_ci->iy_SSE3 = gmx_set1_pr(x[ia + 1*STRIDE_S + 3] + shy);
-    x_ci->iz_SSE3 = gmx_set1_pr(x[ia + 2*STRIDE_S + 3] + shz);
+    x_ci->ix_S0 = gmx_set1_pr(x[ia + 0*STRIDE_S    ] + shx);
+    x_ci->iy_S0 = gmx_set1_pr(x[ia + 1*STRIDE_S    ] + shy);
+    x_ci->iz_S0 = gmx_set1_pr(x[ia + 2*STRIDE_S    ] + shz);
+    x_ci->ix_S1 = gmx_set1_pr(x[ia + 0*STRIDE_S + 1] + shx);
+    x_ci->iy_S1 = gmx_set1_pr(x[ia + 1*STRIDE_S + 1] + shy);
+    x_ci->iz_S1 = gmx_set1_pr(x[ia + 2*STRIDE_S + 1] + shz);
+    x_ci->ix_S2 = gmx_set1_pr(x[ia + 0*STRIDE_S + 2] + shx);
+    x_ci->iy_S2 = gmx_set1_pr(x[ia + 1*STRIDE_S + 2] + shy);
+    x_ci->iz_S2 = gmx_set1_pr(x[ia + 2*STRIDE_S + 2] + shz);
+    x_ci->ix_S3 = gmx_set1_pr(x[ia + 0*STRIDE_S + 3] + shx);
+    x_ci->iy_S3 = gmx_set1_pr(x[ia + 1*STRIDE_S + 3] + shy);
+    x_ci->iz_S3 = gmx_set1_pr(x[ia + 2*STRIDE_S + 3] + shz);
 }
 
-#ifndef GMX_SIMD_HAVE_ANYTRUE
-/* Fallback function in case gmx_anytrue_pr is not present */
-static gmx_inline gmx_bool
-gmx_anytrue_4xn_pb(gmx_mm_pb bool_S)
-{
-    real     bools_array[2*GMX_SIMD_WIDTH_HERE], *bools;
-    gmx_bool any;
-    int      s;
-
-    bools = gmx_simd_align_real(bools_array);
-
-    gmx_store_pb(bools, bool_S);
-
-    any = FALSE;
-    for (s = 0; s < GMX_SIMD_WIDTH_HERE; s++)
-    {
-        if (GMX_SIMD_IS_TRUE(bools[s]))
-        {
-            any = TRUE;
-        }
-    }
-
-    return any;
-}
-#endif
-
 /* SIMD code for making a pair list of cell ci vs cell cjf-cjl
  * for coordinates in packed format.
  * Checks bounding box distances and possibly atom pair distances.
@@ -115,25 +89,25 @@ make_cluster_list_simd_4xn(const nbnxn_grid_t *gridj,
     const nbnxn_x_ci_simd_4xn_t *work;
     const nbnxn_bb_t            *bb_ci;
 
-    gmx_mm_pr                    jx_SSE, jy_SSE, jz_SSE;
+    gmx_mm_pr                    jx_S, jy_S, jz_S;
 
-    gmx_mm_pr                    dx_SSE0, dy_SSE0, dz_SSE0;
-    gmx_mm_pr                    dx_SSE1, dy_SSE1, dz_SSE1;
-    gmx_mm_pr                    dx_SSE2, dy_SSE2, dz_SSE2;
-    gmx_mm_pr                    dx_SSE3, dy_SSE3, dz_SSE3;
+    gmx_mm_pr                    dx_S0, dy_S0, dz_S0;
+    gmx_mm_pr                    dx_S1, dy_S1, dz_S1;
+    gmx_mm_pr                    dx_S2, dy_S2, dz_S2;
+    gmx_mm_pr                    dx_S3, dy_S3, dz_S3;
 
-    gmx_mm_pr                    rsq_SSE0;
-    gmx_mm_pr                    rsq_SSE1;
-    gmx_mm_pr                    rsq_SSE2;
-    gmx_mm_pr                    rsq_SSE3;
+    gmx_mm_pr                    rsq_S0;
+    gmx_mm_pr                    rsq_S1;
+    gmx_mm_pr                    rsq_S2;
+    gmx_mm_pr                    rsq_S3;
 
-    gmx_mm_pb                    wco_SSE0;
-    gmx_mm_pb                    wco_SSE1;
-    gmx_mm_pb                    wco_SSE2;
-    gmx_mm_pb                    wco_SSE3;
-    gmx_mm_pb                    wco_any_SSE01, wco_any_SSE23, wco_any_SSE;
+    gmx_mm_pb                    wco_S0;
+    gmx_mm_pb                    wco_S1;
+    gmx_mm_pb                    wco_S2;
+    gmx_mm_pb                    wco_S3;
+    gmx_mm_pb                    wco_any_S01, wco_any_S23, wco_any_S;
 
-    gmx_mm_pr                    rc2_SSE;
+    gmx_mm_pr                    rc2_S;
 
     gmx_bool                     InRange;
     float                        d2;
@@ -146,13 +120,13 @@ make_cluster_list_simd_4xn(const nbnxn_grid_t *gridj,
 
     bb_ci = nbl->work->bb_ci;
 
-    rc2_SSE   = gmx_set1_pr(rl2);
+    rc2_S   = gmx_set1_pr(rl2);
 
     InRange = FALSE;
     while (!InRange && cjf <= cjl)
     {
-#ifdef NBNXN_SEARCH_BB_SSE
-        d2 = subc_bb_dist2_sse(0, bb_ci, cjf, gridj->bbj);
+#ifdef NBNXN_SEARCH_BB_SIMD4
+        d2 = subc_bb_dist2_simd4(0, bb_ci, cjf, gridj->bbj);
 #else
         d2 = subc_bb_dist2(0, bb_ci, cjf, gridj->bbj);
 #endif
@@ -171,45 +145,41 @@ make_cluster_list_simd_4xn(const nbnxn_grid_t *gridj,
         {
             xind_f  = X_IND_CJ_SIMD_4XN(CI_TO_CJ_SIMD_4XN(gridj->cell0) + cjf);
 
-            jx_SSE  = gmx_load_pr(x_j+xind_f+0*STRIDE_S);
-            jy_SSE  = gmx_load_pr(x_j+xind_f+1*STRIDE_S);
-            jz_SSE  = gmx_load_pr(x_j+xind_f+2*STRIDE_S);
+            jx_S  = gmx_load_pr(x_j+xind_f+0*STRIDE_S);
+            jy_S  = gmx_load_pr(x_j+xind_f+1*STRIDE_S);
+            jz_S  = gmx_load_pr(x_j+xind_f+2*STRIDE_S);
 
 
             /* Calculate distance */
-            dx_SSE0            = gmx_sub_pr(work->ix_SSE0, jx_SSE);
-            dy_SSE0            = gmx_sub_pr(work->iy_SSE0, jy_SSE);
-            dz_SSE0            = gmx_sub_pr(work->iz_SSE0, jz_SSE);
-            dx_SSE1            = gmx_sub_pr(work->ix_SSE1, jx_SSE);
-            dy_SSE1            = gmx_sub_pr(work->iy_SSE1, jy_SSE);
-            dz_SSE1            = gmx_sub_pr(work->iz_SSE1, jz_SSE);
-            dx_SSE2            = gmx_sub_pr(work->ix_SSE2, jx_SSE);
-            dy_SSE2            = gmx_sub_pr(work->iy_SSE2, jy_SSE);
-            dz_SSE2            = gmx_sub_pr(work->iz_SSE2, jz_SSE);
-            dx_SSE3            = gmx_sub_pr(work->ix_SSE3, jx_SSE);
-            dy_SSE3            = gmx_sub_pr(work->iy_SSE3, jy_SSE);
-            dz_SSE3            = gmx_sub_pr(work->iz_SSE3, jz_SSE);
+            dx_S0            = gmx_sub_pr(work->ix_S0, jx_S);
+            dy_S0            = gmx_sub_pr(work->iy_S0, jy_S);
+            dz_S0            = gmx_sub_pr(work->iz_S0, jz_S);
+            dx_S1            = gmx_sub_pr(work->ix_S1, jx_S);
+            dy_S1            = gmx_sub_pr(work->iy_S1, jy_S);
+            dz_S1            = gmx_sub_pr(work->iz_S1, jz_S);
+            dx_S2            = gmx_sub_pr(work->ix_S2, jx_S);
+            dy_S2            = gmx_sub_pr(work->iy_S2, jy_S);
+            dz_S2            = gmx_sub_pr(work->iz_S2, jz_S);
+            dx_S3            = gmx_sub_pr(work->ix_S3, jx_S);
+            dy_S3            = gmx_sub_pr(work->iy_S3, jy_S);
+            dz_S3            = gmx_sub_pr(work->iz_S3, jz_S);
 
             /* rsq = dx*dx+dy*dy+dz*dz */
-            rsq_SSE0           = gmx_calc_rsq_pr(dx_SSE0, dy_SSE0, dz_SSE0);
-            rsq_SSE1           = gmx_calc_rsq_pr(dx_SSE1, dy_SSE1, dz_SSE1);
-            rsq_SSE2           = gmx_calc_rsq_pr(dx_SSE2, dy_SSE2, dz_SSE2);
-            rsq_SSE3           = gmx_calc_rsq_pr(dx_SSE3, dy_SSE3, dz_SSE3);
-
-            wco_SSE0           = gmx_cmplt_pr(rsq_SSE0, rc2_SSE);
-            wco_SSE1           = gmx_cmplt_pr(rsq_SSE1, rc2_SSE);
-            wco_SSE2           = gmx_cmplt_pr(rsq_SSE2, rc2_SSE);
-            wco_SSE3           = gmx_cmplt_pr(rsq_SSE3, rc2_SSE);
-
-            wco_any_SSE01      = gmx_or_pb(wco_SSE0, wco_SSE1);
-            wco_any_SSE23      = gmx_or_pb(wco_SSE2, wco_SSE3);
-            wco_any_SSE        = gmx_or_pb(wco_any_SSE01, wco_any_SSE23);
-
-#ifdef GMX_SIMD_HAVE_ANYTRUE
-            InRange            = gmx_anytrue_pb(wco_any_SSE);
-#else
-            InRange            = gmx_anytrue_4xn_pb(wco_any_SSE);
-#endif
+            rsq_S0           = gmx_calc_rsq_pr(dx_S0, dy_S0, dz_S0);
+            rsq_S1           = gmx_calc_rsq_pr(dx_S1, dy_S1, dz_S1);
+            rsq_S2           = gmx_calc_rsq_pr(dx_S2, dy_S2, dz_S2);
+            rsq_S3           = gmx_calc_rsq_pr(dx_S3, dy_S3, dz_S3);
+
+            wco_S0           = gmx_cmplt_pr(rsq_S0, rc2_S);
+            wco_S1           = gmx_cmplt_pr(rsq_S1, rc2_S);
+            wco_S2           = gmx_cmplt_pr(rsq_S2, rc2_S);
+            wco_S3           = gmx_cmplt_pr(rsq_S3, rc2_S);
+
+            wco_any_S01      = gmx_or_pb(wco_S0, wco_S1);
+            wco_any_S23      = gmx_or_pb(wco_S2, wco_S3);
+            wco_any_S        = gmx_or_pb(wco_any_S01, wco_any_S23);
+
+            InRange          = gmx_anytrue_pb(wco_any_S);
 
             *ndistc += 4*GMX_SIMD_WIDTH_HERE;
         }
@@ -226,8 +196,8 @@ make_cluster_list_simd_4xn(const nbnxn_grid_t *gridj,
     InRange = FALSE;
     while (!InRange && cjl > cjf)
     {
-#ifdef NBNXN_SEARCH_BB_SSE
-        d2 = subc_bb_dist2_sse(0, bb_ci, cjl, gridj->bbj);
+#ifdef NBNXN_SEARCH_BB_SIMD4
+        d2 = subc_bb_dist2_simd4(0, bb_ci, cjl, gridj->bbj);
 #else
         d2 = subc_bb_dist2(0, bb_ci, cjl, gridj->bbj);
 #endif
@@ -246,44 +216,40 @@ make_cluster_list_simd_4xn(const nbnxn_grid_t *gridj,
         {
             xind_l  = X_IND_CJ_SIMD_4XN(CI_TO_CJ_SIMD_4XN(gridj->cell0) + cjl);
 
-            jx_SSE  = gmx_load_pr(x_j+xind_l+0*STRIDE_S);
-            jy_SSE  = gmx_load_pr(x_j+xind_l+1*STRIDE_S);
-            jz_SSE  = gmx_load_pr(x_j+xind_l+2*STRIDE_S);
+            jx_S  = gmx_load_pr(x_j+xind_l+0*STRIDE_S);
+            jy_S  = gmx_load_pr(x_j+xind_l+1*STRIDE_S);
+            jz_S  = gmx_load_pr(x_j+xind_l+2*STRIDE_S);
 
             /* Calculate distance */
-            dx_SSE0            = gmx_sub_pr(work->ix_SSE0, jx_SSE);
-            dy_SSE0            = gmx_sub_pr(work->iy_SSE0, jy_SSE);
-            dz_SSE0            = gmx_sub_pr(work->iz_SSE0, jz_SSE);
-            dx_SSE1            = gmx_sub_pr(work->ix_SSE1, jx_SSE);
-            dy_SSE1            = gmx_sub_pr(work->iy_SSE1, jy_SSE);
-            dz_SSE1            = gmx_sub_pr(work->iz_SSE1, jz_SSE);
-            dx_SSE2            = gmx_sub_pr(work->ix_SSE2, jx_SSE);
-            dy_SSE2            = gmx_sub_pr(work->iy_SSE2, jy_SSE);
-            dz_SSE2            = gmx_sub_pr(work->iz_SSE2, jz_SSE);
-            dx_SSE3            = gmx_sub_pr(work->ix_SSE3, jx_SSE);
-            dy_SSE3            = gmx_sub_pr(work->iy_SSE3, jy_SSE);
-            dz_SSE3            = gmx_sub_pr(work->iz_SSE3, jz_SSE);
+            dx_S0            = gmx_sub_pr(work->ix_S0, jx_S);
+            dy_S0            = gmx_sub_pr(work->iy_S0, jy_S);
+            dz_S0            = gmx_sub_pr(work->iz_S0, jz_S);
+            dx_S1            = gmx_sub_pr(work->ix_S1, jx_S);
+            dy_S1            = gmx_sub_pr(work->iy_S1, jy_S);
+            dz_S1            = gmx_sub_pr(work->iz_S1, jz_S);
+            dx_S2            = gmx_sub_pr(work->ix_S2, jx_S);
+            dy_S2            = gmx_sub_pr(work->iy_S2, jy_S);
+            dz_S2            = gmx_sub_pr(work->iz_S2, jz_S);
+            dx_S3            = gmx_sub_pr(work->ix_S3, jx_S);
+            dy_S3            = gmx_sub_pr(work->iy_S3, jy_S);
+            dz_S3            = gmx_sub_pr(work->iz_S3, jz_S);
 
             /* rsq = dx*dx+dy*dy+dz*dz */
-            rsq_SSE0           = gmx_calc_rsq_pr(dx_SSE0, dy_SSE0, dz_SSE0);
-            rsq_SSE1           = gmx_calc_rsq_pr(dx_SSE1, dy_SSE1, dz_SSE1);
-            rsq_SSE2           = gmx_calc_rsq_pr(dx_SSE2, dy_SSE2, dz_SSE2);
-            rsq_SSE3           = gmx_calc_rsq_pr(dx_SSE3, dy_SSE3, dz_SSE3);
-
-            wco_SSE0           = gmx_cmplt_pr(rsq_SSE0, rc2_SSE);
-            wco_SSE1           = gmx_cmplt_pr(rsq_SSE1, rc2_SSE);
-            wco_SSE2           = gmx_cmplt_pr(rsq_SSE2, rc2_SSE);
-            wco_SSE3           = gmx_cmplt_pr(rsq_SSE3, rc2_SSE);
-
-            wco_any_SSE01      = gmx_or_pb(wco_SSE0, wco_SSE1);
-            wco_any_SSE23      = gmx_or_pb(wco_SSE2, wco_SSE3);
-            wco_any_SSE        = gmx_or_pb(wco_any_SSE01, wco_any_SSE23);
-
-#ifdef GMX_SIMD_HAVE_ANYTRUE
-            InRange            = gmx_anytrue_pb(wco_any_SSE);
-#else
-            InRange            = gmx_anytrue_4xn_pb(wco_any_SSE);
-#endif
+            rsq_S0           = gmx_calc_rsq_pr(dx_S0, dy_S0, dz_S0);
+            rsq_S1           = gmx_calc_rsq_pr(dx_S1, dy_S1, dz_S1);
+            rsq_S2           = gmx_calc_rsq_pr(dx_S2, dy_S2, dz_S2);
+            rsq_S3           = gmx_calc_rsq_pr(dx_S3, dy_S3, dz_S3);
+
+            wco_S0           = gmx_cmplt_pr(rsq_S0, rc2_S);
+            wco_S1           = gmx_cmplt_pr(rsq_S1, rc2_S);
+            wco_S2           = gmx_cmplt_pr(rsq_S2, rc2_S);
+            wco_S3           = gmx_cmplt_pr(rsq_S3, rc2_S);
+
+            wco_any_S01      = gmx_or_pb(wco_S0, wco_S1);
+            wco_any_S23      = gmx_or_pb(wco_S2, wco_S3);
+            wco_any_S        = gmx_or_pb(wco_any_S01, wco_any_S23);
+
+            InRange          = gmx_anytrue_pb(wco_any_S);
 
             *ndistc += 4*GMX_SIMD_WIDTH_HERE;
         }