Fix overflow in LJ-PME nbnxn kernels
author Christian Wennberg <christian.wennberg@scilifelab.se>
Mon, 28 Jul 2014 16:06:36 +0000 (18:06 +0200)
committer Berk Hess <hess@kth.se>
Mon, 28 Jul 2014 20:26:16 +0000 (22:26 +0200)
The SIMD exp function in the LJ-PME nbnxn kernels could overflow
for pair distances far beyond the cut-off. Added a mask to avoid this.

Fixes #1552

Change-Id: Id87710f3815b341f53a69df0a2990d0bb4edfa74

src/gromacs/mdlib/nbnxn_kernels/simd_2xnn/nbnxn_kernel_simd_2xnn_inner.h
src/gromacs/mdlib/nbnxn_kernels/simd_4xn/nbnxn_kernel_simd_4xn_inner.h
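
The fix is the same in both kernels: zero r^2 with the vdW cut-off mask before forming c^2*r^2, so the argument passed to gmx_simd_exp_r stays bounded even for the very distant pairs mentioned above. Below is a minimal scalar sketch of that pattern, not the kernel code itself; the helper name is hypothetical and the scalar variables stand in for the SIMD registers named in the diffs (rsq_S*, rc2_S, lje_c2_S, wco_vdw_S*).

/* Minimal scalar sketch of the fix; the helper name and arguments are
 * illustrative, not the kernel code.  rsq ~ rsq_S*, rc2 ~ rc2_S,
 * lje_c2 ~ lje_c2_S, and within_cutoff plays the role of wco_vdw_S*. */
#include <math.h>

static double lj_pme_grid_exp(double rsq, double rc2, double lje_c2)
{
    /* Zero r^2 for pairs beyond the cut-off, mirroring
     * gmx_simd_blendzero_r(rsq_S*, wco_vdw_S*), so the argument of
     * exp() stays bounded even when rsq is very large. */
    double rsq_masked = (rsq < rc2) ? rsq : 0.0;
    double cr2        = lje_c2 * rsq_masked;   /* cr2_S*     */

    return exp(-cr2);                          /* expmcr2_S* */
}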

src/gromacs/mdlib/nbnxn_kernels/simd_2xnn/nbnxn_kernel_simd_2xnn_inner.h
index 5663260589ca2f8af2c509f58cbdc615503d8b1e..99385ff3cac17b1fe9e1fc6cb3210b588c870f0f 100644 (file)
@@ -65,7 +65,7 @@
 /* Without exclusions and energies we only need to mask the cut-off,
  * this can be faster with blendv.
  */
-#if !(defined CHECK_EXCLS || defined CALC_ENERGIES) && defined GMX_SIMD_HAVE_BLENDV && !defined COUNT_PAIRS
+#if !(defined CHECK_EXCLS || defined CALC_ENERGIES || defined LJ_EWALD_GEOM) && defined GMX_SIMD_HAVE_BLENDV
 /* With RF and tabulated Coulomb we replace cmp+and with sub+blendv.
  * With gcc this is slower, except for RF on Sandy Bridge.
  * Tested with gcc 4.6.2, 4.6.3 and 4.7.1.
         tmp = gmx_simd_align_r(tmpa);
         for (i = 0; i < UNROLLI; i += 2)
         {
-            gmx_simd_store_r(tmp, i == 0 ? wco_S0 : wco_S2);
+            gmx_simd_store_r(tmp, gmx_simd_sub_r(rc2_S, i == 0 ? rsq_S0 : rsq_S2));
             for (j = 0; j < 2*UNROLLJ; j++)
             {
-                if (!(tmp[j] == 0))
+                if (tmp[j] >= 0)
                 {
                     npair++;
                 }
 #endif
 #endif
 
-        cr2_S0        = gmx_simd_mul_r(lje_c2_S, rsq_S0);
+        /* Mask for the cut-off to avoid overflow in gmx_simd_exp_r */
+        cr2_S0        = gmx_simd_mul_r(lje_c2_S, gmx_simd_blendzero_r(rsq_S0, wco_vdw_S0));
 #ifndef HALF_LJ
-        cr2_S2        = gmx_simd_mul_r(lje_c2_S, rsq_S2);
+        cr2_S2        = gmx_simd_mul_r(lje_c2_S, gmx_simd_blendzero_r(rsq_S2, wco_vdw_S2));
 #endif
         expmcr2_S0    = gmx_simd_exp_r(gmx_simd_mul_r(mone_S, cr2_S0));
 #ifndef HALF_LJ
src/gromacs/mdlib/nbnxn_kernels/simd_4xn/nbnxn_kernel_simd_4xn_inner.h
index 4b7de1678fae416dfdf61dc275fa3f199342375f..c567589ed5319c66148843e9195a12b913ef6fea 100644 (file)
@@ -53,7 +53,7 @@
  * this can be faster when we have defined gmx_simd_blendv_r, i.e. an instruction
  * that selects from two SIMD registers based on the contents of a third.
  */
-#if !(defined CHECK_EXCLS || defined CALC_ENERGIES) && defined GMX_SIMD_HAVE_BLENDV
+#if !(defined CHECK_EXCLS || defined CALC_ENERGIES || defined LJ_EWALD_GEOM) && defined GMX_SIMD_HAVE_BLENDV
 /* With RF and tabulated Coulomb we replace cmp+and with sub+blendv.
  * With gcc this is slower, except for RF on Sandy Bridge.
  * Tested with gcc 4.6.2, 4.6.3 and 4.7.1.
 #endif
 #endif
 
-        cr2_S0        = gmx_simd_mul_r(lje_c2_S, rsq_S0);
-        cr2_S1        = gmx_simd_mul_r(lje_c2_S, rsq_S1);
+        /* Mask for the cut-off to avoid overflow in gmx_simd_exp_r */
+        cr2_S0        = gmx_simd_mul_r(lje_c2_S, gmx_simd_blendzero_r(rsq_S0, wco_vdw_S0));
+        cr2_S1        = gmx_simd_mul_r(lje_c2_S, gmx_simd_blendzero_r(rsq_S1, wco_vdw_S1));
 #ifndef HALF_LJ
-        cr2_S2        = gmx_simd_mul_r(lje_c2_S, rsq_S2);
-        cr2_S3        = gmx_simd_mul_r(lje_c2_S, rsq_S3);
+        cr2_S2        = gmx_simd_mul_r(lje_c2_S, gmx_simd_blendzero_r(rsq_S2, wco_vdw_S2));
+        cr2_S3        = gmx_simd_mul_r(lje_c2_S, gmx_simd_blendzero_r(rsq_S3, wco_vdw_S3));
 #endif
         expmcr2_S0    = gmx_simd_exp_r(gmx_simd_mul_r(mone_S, cr2_S0));
         expmcr2_S1    = gmx_simd_exp_r(gmx_simd_mul_r(mone_S, cr2_S1));
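
For context, an inference from the diffs rather than something stated in the commit message: on the blendv path the cut-off check is carried as the signed difference rc2 - rsq instead of an all-ones/all-zeros mask, which would explain both why LJ_EWALD_GEOM is now excluded from that path (gmx_simd_blendzero_r needs a real boolean mask) and why the COUNT_PAIRS debug counter stores rc2_S - rsq_S* and tests >= 0 instead of != 0. A scalar sketch of that counting logic, with a hypothetical helper name:

/* Scalar analogue of the updated COUNT_PAIRS check; the function and
 * variable names are illustrative.  On the blendv path the "mask" is the
 * signed difference rc2 - rsq, so a pair is inside the cut-off when that
 * difference is non-negative (cf. the tmp[j] >= 0 test above). */
static int count_pairs_within_cutoff(const double *rsq, int n, double rc2)
{
    int npair = 0;

    for (int j = 0; j < n; j++)
    {
        if (rc2 - rsq[j] >= 0)
        {
            npair++;
        }
    }
    return npair;
}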