Fixed GB interactions for release-4-6
author Erik Lindahl <erik@kth.se>
Sun, 13 Jan 2013 19:21:05 +0000 (11:21 -0800)
committer Mark Abraham <mark.j.abraham@gmail.com>
Wed, 16 Jan 2013 01:16:02 +0000 (02:16 +0100)
Release-4-6 was not using the correct GB kernels, but
rather the non-GB electrostatics cutoff kernel. In addition,
the accelerated kernels unrolled by more than a factor of 2
had a bug that caused a small error in the forces. Both of
these issues have been fixed, and the GB results now match
release-4.5.
Note: The GB _radius_ calculations are still only done
in C, but the accelerated options will be re-enabled in
a pending patch (separated for clarity).
Fixes #1096.

Change-Id: I40151b9a9f1920006bfe9a39a8719698a824bfac

fix

Change-Id: I45630252d6a9c05a7f3695a7e6d07dcd13e0a25c

35 files changed:
src/gmxlib/nonbonded/nb_kernel_avx_128_fma_double/nb_kernel_ElecGB_VdwCSTab_GeomP1P1_avx_128_fma_double.c
src/gmxlib/nonbonded/nb_kernel_avx_128_fma_double/nb_kernel_ElecGB_VdwLJ_GeomP1P1_avx_128_fma_double.c
src/gmxlib/nonbonded/nb_kernel_avx_128_fma_double/nb_kernel_ElecGB_VdwNone_GeomP1P1_avx_128_fma_double.c
src/gmxlib/nonbonded/nb_kernel_avx_128_fma_double/nb_kernel_template_avx_128_fma_double.pre
src/gmxlib/nonbonded/nb_kernel_avx_128_fma_single/nb_kernel_ElecGB_VdwCSTab_GeomP1P1_avx_128_fma_single.c
src/gmxlib/nonbonded/nb_kernel_avx_128_fma_single/nb_kernel_ElecGB_VdwLJ_GeomP1P1_avx_128_fma_single.c
src/gmxlib/nonbonded/nb_kernel_avx_128_fma_single/nb_kernel_ElecGB_VdwNone_GeomP1P1_avx_128_fma_single.c
src/gmxlib/nonbonded/nb_kernel_avx_128_fma_single/nb_kernel_template_avx_128_fma_single.pre
src/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecGB_VdwCSTab_GeomP1P1_avx_256_double.c
src/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecGB_VdwLJ_GeomP1P1_avx_256_double.c
src/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecGB_VdwNone_GeomP1P1_avx_256_double.c
src/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_template_avx_256_double.pre
src/gmxlib/nonbonded/nb_kernel_avx_256_single/nb_kernel_ElecGB_VdwCSTab_GeomP1P1_avx_256_single.c
src/gmxlib/nonbonded/nb_kernel_avx_256_single/nb_kernel_ElecGB_VdwLJ_GeomP1P1_avx_256_single.c
src/gmxlib/nonbonded/nb_kernel_avx_256_single/nb_kernel_ElecGB_VdwNone_GeomP1P1_avx_256_single.c
src/gmxlib/nonbonded/nb_kernel_avx_256_single/nb_kernel_template_avx_256_single.pre
src/gmxlib/nonbonded/nb_kernel_c/nb_kernel_ElecGB_VdwLJ_GeomP1P1_c.c
src/gmxlib/nonbonded/nb_kernel_sse2_double/nb_kernel_ElecGB_VdwCSTab_GeomP1P1_sse2_double.c
src/gmxlib/nonbonded/nb_kernel_sse2_double/nb_kernel_ElecGB_VdwLJ_GeomP1P1_sse2_double.c
src/gmxlib/nonbonded/nb_kernel_sse2_double/nb_kernel_ElecGB_VdwNone_GeomP1P1_sse2_double.c
src/gmxlib/nonbonded/nb_kernel_sse2_double/nb_kernel_template_sse2_double.pre
src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecGB_VdwCSTab_GeomP1P1_sse2_single.c
src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecGB_VdwLJ_GeomP1P1_sse2_single.c
src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecGB_VdwNone_GeomP1P1_sse2_single.c
src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_template_sse2_single.pre
src/gmxlib/nonbonded/nb_kernel_sse4_1_double/nb_kernel_ElecGB_VdwCSTab_GeomP1P1_sse4_1_double.c
src/gmxlib/nonbonded/nb_kernel_sse4_1_double/nb_kernel_ElecGB_VdwLJ_GeomP1P1_sse4_1_double.c
src/gmxlib/nonbonded/nb_kernel_sse4_1_double/nb_kernel_ElecGB_VdwNone_GeomP1P1_sse4_1_double.c
src/gmxlib/nonbonded/nb_kernel_sse4_1_double/nb_kernel_template_sse4_1_double.pre
src/gmxlib/nonbonded/nb_kernel_sse4_1_single/nb_kernel_ElecGB_VdwCSTab_GeomP1P1_sse4_1_single.c
src/gmxlib/nonbonded/nb_kernel_sse4_1_single/nb_kernel_ElecGB_VdwLJ_GeomP1P1_sse4_1_single.c
src/gmxlib/nonbonded/nb_kernel_sse4_1_single/nb_kernel_ElecGB_VdwNone_GeomP1P1_sse4_1_single.c
src/gmxlib/nonbonded/nb_kernel_sse4_1_single/nb_kernel_template_sse4_1_single.pre
src/mdlib/forcerec.c
src/mdlib/genborn.c

index 1590af00f266bd960a84521a47d0c81561f6f5bf..ded395087ddc7e8cf961634fad8410e7115a42e2 100644 (file)
@@ -384,6 +384,7 @@ nb_kernel_ElecGB_VdwCSTab_GeomP1P1_VF_avx_128_fma_double
             FF               = _mm_macc_pd(_mm_macc_pd(twogbeps,H,G),gbeps,Fp);
             fgb              = _mm_mul_pd(gbqqfactor,_mm_mul_pd(FF,gbscale));
             dvdatmp          = _mm_mul_pd(minushalf,_mm_macc_pd(fgb,r00,vgb));
+            dvdatmp          = _mm_unpacklo_pd(dvdatmp,_mm_setzero_pd());
             dvdasum          = _mm_add_pd(dvdasum,dvdatmp);
             gmx_mm_increment_1real_pd(dvda+jnrA,_mm_mul_pd(dvdatmp,_mm_mul_pd(isaj0,isaj0)));
             velec            = _mm_mul_pd(qq00,rinv00);
@@ -790,6 +791,7 @@ nb_kernel_ElecGB_VdwCSTab_GeomP1P1_F_avx_128_fma_double
             FF               = _mm_macc_pd(_mm_macc_pd(twogbeps,H,G),gbeps,Fp);
             fgb              = _mm_mul_pd(gbqqfactor,_mm_mul_pd(FF,gbscale));
             dvdatmp          = _mm_mul_pd(minushalf,_mm_macc_pd(fgb,r00,vgb));
+            dvdatmp          = _mm_unpacklo_pd(dvdatmp,_mm_setzero_pd());
             dvdasum          = _mm_add_pd(dvdasum,dvdatmp);
             gmx_mm_increment_1real_pd(dvda+jnrA,_mm_mul_pd(dvdatmp,_mm_mul_pd(isaj0,isaj0)));
             velec            = _mm_mul_pd(qq00,rinv00);
index 1180dd8ae2b23ddbc918914e1f8f59b7c2fb4c09..0436e20856525f2cbf560a9646156d1eb8c7325f 100644 (file)
@@ -342,6 +342,7 @@ nb_kernel_ElecGB_VdwLJ_GeomP1P1_VF_avx_128_fma_double
             FF               = _mm_macc_pd(_mm_macc_pd(twogbeps,H,G),gbeps,Fp);
             fgb              = _mm_mul_pd(gbqqfactor,_mm_mul_pd(FF,gbscale));
             dvdatmp          = _mm_mul_pd(minushalf,_mm_macc_pd(fgb,r00,vgb));
+            dvdatmp          = _mm_unpacklo_pd(dvdatmp,_mm_setzero_pd());
             dvdasum          = _mm_add_pd(dvdasum,dvdatmp);
             gmx_mm_increment_1real_pd(dvda+jnrA,_mm_mul_pd(dvdatmp,_mm_mul_pd(isaj0,isaj0)));
             velec            = _mm_mul_pd(qq00,rinv00);
@@ -687,6 +688,7 @@ nb_kernel_ElecGB_VdwLJ_GeomP1P1_F_avx_128_fma_double
             FF               = _mm_macc_pd(_mm_macc_pd(twogbeps,H,G),gbeps,Fp);
             fgb              = _mm_mul_pd(gbqqfactor,_mm_mul_pd(FF,gbscale));
             dvdatmp          = _mm_mul_pd(minushalf,_mm_macc_pd(fgb,r00,vgb));
+            dvdatmp          = _mm_unpacklo_pd(dvdatmp,_mm_setzero_pd());
             dvdasum          = _mm_add_pd(dvdasum,dvdatmp);
             gmx_mm_increment_1real_pd(dvda+jnrA,_mm_mul_pd(dvdatmp,_mm_mul_pd(isaj0,isaj0)));
             velec            = _mm_mul_pd(qq00,rinv00);
index 7cea363fd61fd663a2c0fa72ec95dc779d78b256..82f53a225bc9dda9858626aa557b010ba02edbb6 100644 (file)
@@ -312,6 +312,7 @@ nb_kernel_ElecGB_VdwNone_GeomP1P1_VF_avx_128_fma_double
             FF               = _mm_macc_pd(_mm_macc_pd(twogbeps,H,G),gbeps,Fp);
             fgb              = _mm_mul_pd(gbqqfactor,_mm_mul_pd(FF,gbscale));
             dvdatmp          = _mm_mul_pd(minushalf,_mm_macc_pd(fgb,r00,vgb));
+            dvdatmp          = _mm_unpacklo_pd(dvdatmp,_mm_setzero_pd());
             dvdasum          = _mm_add_pd(dvdasum,dvdatmp);
             gmx_mm_increment_1real_pd(dvda+jnrA,_mm_mul_pd(dvdatmp,_mm_mul_pd(isaj0,isaj0)));
             velec            = _mm_mul_pd(qq00,rinv00);
@@ -621,6 +622,7 @@ nb_kernel_ElecGB_VdwNone_GeomP1P1_F_avx_128_fma_double
             FF               = _mm_macc_pd(_mm_macc_pd(twogbeps,H,G),gbeps,Fp);
             fgb              = _mm_mul_pd(gbqqfactor,_mm_mul_pd(FF,gbscale));
             dvdatmp          = _mm_mul_pd(minushalf,_mm_macc_pd(fgb,r00,vgb));
+            dvdatmp          = _mm_unpacklo_pd(dvdatmp,_mm_setzero_pd());
             dvdasum          = _mm_add_pd(dvdasum,dvdatmp);
             gmx_mm_increment_1real_pd(dvda+jnrA,_mm_mul_pd(dvdatmp,_mm_mul_pd(isaj0,isaj0)));
             velec            = _mm_mul_pd(qq00,rinv00);
index 79543b4a3a5034c09e177a29b91f34ed37cd3de0..715156551cfd002047311387c1791e7ae7dbfd94 100644 (file)
@@ -626,6 +626,9 @@ void
             FF               = _mm_macc_pd(_mm_macc_pd(twogbeps,H,G),gbeps,Fp);
             fgb              = _mm_mul_pd(gbqqfactor,_mm_mul_pd(FF,gbscale));
             dvdatmp          = _mm_mul_pd(minushalf,_mm_macc_pd(fgb,r{I}{J},vgb));
+            /*                 #if ROUND == 'Epilogue' */
+            dvdatmp          = _mm_unpacklo_pd(dvdatmp,_mm_setzero_pd());
+            /*                 #endif */
             dvdasum          = _mm_add_pd(dvdasum,dvdatmp);
             /*             #if ROUND == 'Loop' */
             gmx_mm_increment_2real_swizzle_pd(dvda+jnrA,dvda+jnrB,_mm_mul_pd(dvdatmp,_mm_mul_pd(isaj{J},isaj{J})));
index 647bb39abdceab628b5c2d324f29305b7839d65c..2a4b32539c70122bbaf231b02bbc7c87f099dc79 100644 (file)
@@ -437,6 +437,7 @@ nb_kernel_ElecGB_VdwCSTab_GeomP1P1_VF_avx_128_fma_single
             FF               = _mm_macc_ps(_mm_macc_ps(twogbeps,H,G),gbeps,Fp);
             fgb              = _mm_mul_ps(gbqqfactor,_mm_mul_ps(FF,gbscale));
             dvdatmp          = _mm_mul_ps(minushalf,_mm_macc_ps(fgb,r00,vgb));
+            dvdatmp          = _mm_andnot_ps(dummy_mask,dvdatmp);
             dvdasum          = _mm_add_ps(dvdasum,dvdatmp);
             /* The pointers to scratch make sure that this code with compilers that take gmx_restrict seriously (e.g. icc 13) really can't screw things up. */
             fjptrA             = (jnrlistA>=0) ? dvda+jnrA : scratch;
@@ -903,6 +904,7 @@ nb_kernel_ElecGB_VdwCSTab_GeomP1P1_F_avx_128_fma_single
             FF               = _mm_macc_ps(_mm_macc_ps(twogbeps,H,G),gbeps,Fp);
             fgb              = _mm_mul_ps(gbqqfactor,_mm_mul_ps(FF,gbscale));
             dvdatmp          = _mm_mul_ps(minushalf,_mm_macc_ps(fgb,r00,vgb));
+            dvdatmp          = _mm_andnot_ps(dummy_mask,dvdatmp);
             dvdasum          = _mm_add_ps(dvdasum,dvdatmp);
             /* The pointers to scratch make sure that this code with compilers that take gmx_restrict seriously (e.g. icc 13) really can't screw things up. */
             fjptrA             = (jnrlistA>=0) ? dvda+jnrA : scratch;
index 1d116d13c7e8e9a2e74c12151599fb383027e4ba..20a1dc49fbc10505784f613775c91d1e5eb077c4 100644 (file)
@@ -397,6 +397,7 @@ nb_kernel_ElecGB_VdwLJ_GeomP1P1_VF_avx_128_fma_single
             FF               = _mm_macc_ps(_mm_macc_ps(twogbeps,H,G),gbeps,Fp);
             fgb              = _mm_mul_ps(gbqqfactor,_mm_mul_ps(FF,gbscale));
             dvdatmp          = _mm_mul_ps(minushalf,_mm_macc_ps(fgb,r00,vgb));
+            dvdatmp          = _mm_andnot_ps(dummy_mask,dvdatmp);
             dvdasum          = _mm_add_ps(dvdasum,dvdatmp);
             /* The pointers to scratch make sure that this code with compilers that take gmx_restrict seriously (e.g. icc 13) really can't screw things up. */
             fjptrA             = (jnrlistA>=0) ? dvda+jnrA : scratch;
@@ -806,6 +807,7 @@ nb_kernel_ElecGB_VdwLJ_GeomP1P1_F_avx_128_fma_single
             FF               = _mm_macc_ps(_mm_macc_ps(twogbeps,H,G),gbeps,Fp);
             fgb              = _mm_mul_ps(gbqqfactor,_mm_mul_ps(FF,gbscale));
             dvdatmp          = _mm_mul_ps(minushalf,_mm_macc_ps(fgb,r00,vgb));
+            dvdatmp          = _mm_andnot_ps(dummy_mask,dvdatmp);
             dvdasum          = _mm_add_ps(dvdasum,dvdatmp);
             /* The pointers to scratch make sure that this code with compilers that take gmx_restrict seriously (e.g. icc 13) really can't screw things up. */
             fjptrA             = (jnrlistA>=0) ? dvda+jnrA : scratch;
index 7a1739a7d5672ef26039892cb3b51c91138deebe..29a57fce779dc19ec97fb660a2750e424df4f9e6 100644 (file)
@@ -355,6 +355,7 @@ nb_kernel_ElecGB_VdwNone_GeomP1P1_VF_avx_128_fma_single
             FF               = _mm_macc_ps(_mm_macc_ps(twogbeps,H,G),gbeps,Fp);
             fgb              = _mm_mul_ps(gbqqfactor,_mm_mul_ps(FF,gbscale));
             dvdatmp          = _mm_mul_ps(minushalf,_mm_macc_ps(fgb,r00,vgb));
+            dvdatmp          = _mm_andnot_ps(dummy_mask,dvdatmp);
             dvdasum          = _mm_add_ps(dvdasum,dvdatmp);
             /* The pointers to scratch make sure that this code with compilers that take gmx_restrict seriously (e.g. icc 13) really can't screw things up. */
             fjptrA             = (jnrlistA>=0) ? dvda+jnrA : scratch;
@@ -716,6 +717,7 @@ nb_kernel_ElecGB_VdwNone_GeomP1P1_F_avx_128_fma_single
             FF               = _mm_macc_ps(_mm_macc_ps(twogbeps,H,G),gbeps,Fp);
             fgb              = _mm_mul_ps(gbqqfactor,_mm_mul_ps(FF,gbscale));
             dvdatmp          = _mm_mul_ps(minushalf,_mm_macc_ps(fgb,r00,vgb));
+            dvdatmp          = _mm_andnot_ps(dummy_mask,dvdatmp);
             dvdasum          = _mm_add_ps(dvdasum,dvdatmp);
             /* The pointers to scratch make sure that this code with compilers that take gmx_restrict seriously (e.g. icc 13) really can't screw things up. */
             fjptrA             = (jnrlistA>=0) ? dvda+jnrA : scratch;
index 9b422fc672694d13e1424760e9284f9596803722..c818d5b368ec20c7f11fb9b7b7c92e4f5971fb0f 100644 (file)
@@ -630,6 +630,9 @@ void
             FF               = _mm_macc_ps(_mm_macc_ps(twogbeps,H,G),gbeps,Fp);
             fgb              = _mm_mul_ps(gbqqfactor,_mm_mul_ps(FF,gbscale));
             dvdatmp          = _mm_mul_ps(minushalf,_mm_macc_ps(fgb,r{I}{J},vgb));
+            /*                 #if ROUND == 'Epilogue' */
+            dvdatmp          = _mm_andnot_ps(dummy_mask,dvdatmp);
+            /*                 #endif */
             dvdasum          = _mm_add_ps(dvdasum,dvdatmp);
             /*                 #if ROUND == 'Loop' */
             fjptrA           = dvda+jnrA;
index 93dddf903f5196b121cc8b5741e7473d1dd8d239..842ae1ed36a876869c46f7fd9b9170cb5b3be716 100644 (file)
@@ -429,6 +429,7 @@ nb_kernel_ElecGB_VdwCSTab_GeomP1P1_VF_avx_256_double
             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(gbeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
             fgb              = _mm256_mul_pd(gbqqfactor,_mm256_mul_pd(FF,gbscale));
             dvdatmp          = _mm256_mul_pd(minushalf,_mm256_add_pd(vgb,_mm256_mul_pd(fgb,r00)));
+            dvdatmp          = _mm256_andnot_ps(dummy_mask,dvdatmp);
             dvdasum          = _mm256_add_pd(dvdasum,dvdatmp);
             /* The pointers to scratch make sure that this code with compilers that take gmx_restrict seriously (e.g. icc 13) really can't screw things up. */
             fjptrA             = (jnrlistA>=0) ? dvda+jnrA : scratch;
@@ -892,6 +893,7 @@ nb_kernel_ElecGB_VdwCSTab_GeomP1P1_F_avx_256_double
             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(gbeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
             fgb              = _mm256_mul_pd(gbqqfactor,_mm256_mul_pd(FF,gbscale));
             dvdatmp          = _mm256_mul_pd(minushalf,_mm256_add_pd(vgb,_mm256_mul_pd(fgb,r00)));
+            dvdatmp          = _mm256_andnot_ps(dummy_mask,dvdatmp);
             dvdasum          = _mm256_add_pd(dvdasum,dvdatmp);
             /* The pointers to scratch make sure that this code with compilers that take gmx_restrict seriously (e.g. icc 13) really can't screw things up. */
             fjptrA             = (jnrlistA>=0) ? dvda+jnrA : scratch;
index ceac3dcc4f6f8b3bae5afd71033211796cbe0a5b..eb117a2e90c490cd608a7bae22abf05a97f2a5a9 100644 (file)
@@ -397,6 +397,7 @@ nb_kernel_ElecGB_VdwLJ_GeomP1P1_VF_avx_256_double
             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(gbeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
             fgb              = _mm256_mul_pd(gbqqfactor,_mm256_mul_pd(FF,gbscale));
             dvdatmp          = _mm256_mul_pd(minushalf,_mm256_add_pd(vgb,_mm256_mul_pd(fgb,r00)));
+            dvdatmp          = _mm256_andnot_ps(dummy_mask,dvdatmp);
             dvdasum          = _mm256_add_pd(dvdasum,dvdatmp);
             /* The pointers to scratch make sure that this code with compilers that take gmx_restrict seriously (e.g. icc 13) really can't screw things up. */
             fjptrA             = (jnrlistA>=0) ? dvda+jnrA : scratch;
@@ -809,6 +810,7 @@ nb_kernel_ElecGB_VdwLJ_GeomP1P1_F_avx_256_double
             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(gbeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
             fgb              = _mm256_mul_pd(gbqqfactor,_mm256_mul_pd(FF,gbscale));
             dvdatmp          = _mm256_mul_pd(minushalf,_mm256_add_pd(vgb,_mm256_mul_pd(fgb,r00)));
+            dvdatmp          = _mm256_andnot_ps(dummy_mask,dvdatmp);
             dvdasum          = _mm256_add_pd(dvdasum,dvdatmp);
             /* The pointers to scratch make sure that this code with compilers that take gmx_restrict seriously (e.g. icc 13) really can't screw things up. */
             fjptrA             = (jnrlistA>=0) ? dvda+jnrA : scratch;
index d240932c4eb824932d4b25954ca6655130729614..907f9efc89f90b0836b0d049ff50ddb6844e9f54 100644 (file)
@@ -355,6 +355,7 @@ nb_kernel_ElecGB_VdwNone_GeomP1P1_VF_avx_256_double
             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(gbeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
             fgb              = _mm256_mul_pd(gbqqfactor,_mm256_mul_pd(FF,gbscale));
             dvdatmp          = _mm256_mul_pd(minushalf,_mm256_add_pd(vgb,_mm256_mul_pd(fgb,r00)));
+            dvdatmp          = _mm256_andnot_ps(dummy_mask,dvdatmp);
             dvdasum          = _mm256_add_pd(dvdasum,dvdatmp);
             /* The pointers to scratch make sure that this code with compilers that take gmx_restrict seriously (e.g. icc 13) really can't screw things up. */
             fjptrA             = (jnrlistA>=0) ? dvda+jnrA : scratch;
@@ -719,6 +720,7 @@ nb_kernel_ElecGB_VdwNone_GeomP1P1_F_avx_256_double
             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(gbeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
             fgb              = _mm256_mul_pd(gbqqfactor,_mm256_mul_pd(FF,gbscale));
             dvdatmp          = _mm256_mul_pd(minushalf,_mm256_add_pd(vgb,_mm256_mul_pd(fgb,r00)));
+            dvdatmp          = _mm256_andnot_ps(dummy_mask,dvdatmp);
             dvdasum          = _mm256_add_pd(dvdasum,dvdatmp);
             /* The pointers to scratch make sure that this code with compilers that take gmx_restrict seriously (e.g. icc 13) really can't screw things up. */
             fjptrA             = (jnrlistA>=0) ? dvda+jnrA : scratch;
index 81f76e0f160386180aff7f4d3ba34c3092d82e7b..2d1922cface15a4259c813cd10257bcf066cb8ec 100644 (file)
@@ -628,6 +628,9 @@ void
             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(gbeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
             fgb              = _mm256_mul_pd(gbqqfactor,_mm256_mul_pd(FF,gbscale));
             dvdatmp          = _mm256_mul_pd(minushalf,_mm256_add_pd(vgb,_mm256_mul_pd(fgb,r{I}{J})));
+            /*                 #if ROUND == 'Epilogue' */
+            dvdatmp          = _mm256_andnot_ps(dummy_mask,dvdatmp);
+            /*                 #endif */
             dvdasum          = _mm256_add_pd(dvdasum,dvdatmp);
             /*                 #if ROUND == 'Loop' */
             fjptrA           = dvda+jnrA;
index 828620dde292d978e33b98a6782b47be07716003..1ac827fc3e43297d4be22fdbe835680930a685f5 100644 (file)
@@ -522,6 +522,7 @@ nb_kernel_ElecGB_VdwCSTab_GeomP1P1_VF_avx_256_single
             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(gbeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
             fgb              = _mm256_mul_ps(gbqqfactor,_mm256_mul_ps(FF,gbscale));
             dvdatmp          = _mm256_mul_ps(minushalf,_mm256_add_ps(vgb,_mm256_mul_ps(fgb,r00)));
+            dvdatmp          = _mm256_andnot_ps(dummy_mask,dvdatmp);
             dvdasum          = _mm256_add_ps(dvdasum,dvdatmp);
             /* The pointers to scratch make sure that this code with compilers that take gmx_restrict seriously (e.g. icc 13) really can't screw things up. */
             fjptrA             = (jnrlistA>=0) ? dvda+jnrA : scratch;
@@ -1095,6 +1096,7 @@ nb_kernel_ElecGB_VdwCSTab_GeomP1P1_F_avx_256_single
             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(gbeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
             fgb              = _mm256_mul_ps(gbqqfactor,_mm256_mul_ps(FF,gbscale));
             dvdatmp          = _mm256_mul_ps(minushalf,_mm256_add_ps(vgb,_mm256_mul_ps(fgb,r00)));
+            dvdatmp          = _mm256_andnot_ps(dummy_mask,dvdatmp);
             dvdasum          = _mm256_add_ps(dvdasum,dvdatmp);
             /* The pointers to scratch make sure that this code with compilers that take gmx_restrict seriously (e.g. icc 13) really can't screw things up. */
             fjptrA             = (jnrlistA>=0) ? dvda+jnrA : scratch;
index 963e6755a3a32e40eb20e87e39674093522428a2..858595f791c3412cc80c0451c06e8f76a1092bcf 100644 (file)
@@ -473,6 +473,7 @@ nb_kernel_ElecGB_VdwLJ_GeomP1P1_VF_avx_256_single
             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(gbeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
             fgb              = _mm256_mul_ps(gbqqfactor,_mm256_mul_ps(FF,gbscale));
             dvdatmp          = _mm256_mul_ps(minushalf,_mm256_add_ps(vgb,_mm256_mul_ps(fgb,r00)));
+            dvdatmp          = _mm256_andnot_ps(dummy_mask,dvdatmp);
             dvdasum          = _mm256_add_ps(dvdasum,dvdatmp);
             /* The pointers to scratch make sure that this code with compilers that take gmx_restrict seriously (e.g. icc 13) really can't screw things up. */
             fjptrA             = (jnrlistA>=0) ? dvda+jnrA : scratch;
@@ -969,6 +970,7 @@ nb_kernel_ElecGB_VdwLJ_GeomP1P1_F_avx_256_single
             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(gbeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
             fgb              = _mm256_mul_ps(gbqqfactor,_mm256_mul_ps(FF,gbscale));
             dvdatmp          = _mm256_mul_ps(minushalf,_mm256_add_ps(vgb,_mm256_mul_ps(fgb,r00)));
+            dvdatmp          = _mm256_andnot_ps(dummy_mask,dvdatmp);
             dvdasum          = _mm256_add_ps(dvdasum,dvdatmp);
             /* The pointers to scratch make sure that this code with compilers that take gmx_restrict seriously (e.g. icc 13) really can't screw things up. */
             fjptrA             = (jnrlistA>=0) ? dvda+jnrA : scratch;
index c78677a444cef44106e0ebbb4da2429f78c494d2..7a7829a65f9e49c8d5f1f9f3bb9fb408135f2c24 100644 (file)
@@ -415,6 +415,7 @@ nb_kernel_ElecGB_VdwNone_GeomP1P1_VF_avx_256_single
             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(gbeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
             fgb              = _mm256_mul_ps(gbqqfactor,_mm256_mul_ps(FF,gbscale));
             dvdatmp          = _mm256_mul_ps(minushalf,_mm256_add_ps(vgb,_mm256_mul_ps(fgb,r00)));
+            dvdatmp          = _mm256_andnot_ps(dummy_mask,dvdatmp);
             dvdasum          = _mm256_add_ps(dvdasum,dvdatmp);
             /* The pointers to scratch make sure that this code with compilers that take gmx_restrict seriously (e.g. icc 13) really can't screw things up. */
             fjptrA             = (jnrlistA>=0) ? dvda+jnrA : scratch;
@@ -847,6 +848,7 @@ nb_kernel_ElecGB_VdwNone_GeomP1P1_F_avx_256_single
             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(gbeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
             fgb              = _mm256_mul_ps(gbqqfactor,_mm256_mul_ps(FF,gbscale));
             dvdatmp          = _mm256_mul_ps(minushalf,_mm256_add_ps(vgb,_mm256_mul_ps(fgb,r00)));
+            dvdatmp          = _mm256_andnot_ps(dummy_mask,dvdatmp);
             dvdasum          = _mm256_add_ps(dvdasum,dvdatmp);
             /* The pointers to scratch make sure that this code with compilers that take gmx_restrict seriously (e.g. icc 13) really can't screw things up. */
             fjptrA             = (jnrlistA>=0) ? dvda+jnrA : scratch;
index 43b20cf23a515917e1bc2fa2f9db147fb77a09a6..5b320ce4d081b5eb23d7d2983b66f5be8aab8ab3 100644 (file)
@@ -683,6 +683,9 @@ void
             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(gbeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
             fgb              = _mm256_mul_ps(gbqqfactor,_mm256_mul_ps(FF,gbscale));
             dvdatmp          = _mm256_mul_ps(minushalf,_mm256_add_ps(vgb,_mm256_mul_ps(fgb,r{I}{J})));
+            /*                 #if ROUND == 'Epilogue' */
+            dvdatmp          = _mm256_andnot_ps(dummy_mask,dvdatmp);
+            /*                 #endif */
             dvdasum          = _mm256_add_ps(dvdasum,dvdatmp);
             /*                 #if ROUND == 'Loop' */
             fjptrA           = dvda+jnrA;
index 553824cb7e627232e1c7d229c425283637808d67..5d41717b3272d3efa7fbd88486d1cbe74653d5cd 100644 (file)
@@ -149,7 +149,7 @@ nb_kernel_ElecGB_VdwLJ_GeomP1P1_VF_c
         vgbsum           = 0.0;
         vvdwsum          = 0.0;
         dvdasum          = 0.0;
-
+        printf("inr=%d\n",inr);
         /* Start inner kernel loop */
         for(jidx=j_index_start; jidx<j_index_end; jidx++)
         {
@@ -213,9 +213,12 @@ nb_kernel_ElecGB_VdwLJ_GeomP1P1_VF_c
 
             FF               = Fp+Geps+2.0*Heps2;
             fgb              = gbqqfactor*FF*gbscale;
+            printf("  jnr=%d  fgb=%g\n",jnr,fgb);
             dvdatmp          = -0.5*(vgb+fgb*r00);
             dvdasum          = dvdasum + dvdatmp;
+            printf("  dvdatmp=%g\n",dvdatmp);
             dvda[jnr]        = dvdaj+dvdatmp*isaj0*isaj0;
+            printf("  dvda, jcontrib=%g\n",dvdatmp*isaj0*isaj0);
             velec            = qq00*rinv00;
             felec            = (velec*rinv00-fgb)*rinv00;
 
index 551f205a343be545773b4234999d27d79c7d8267..70bbd680c4db84f5e47979e1031ac1c23af36dec 100644 (file)
@@ -370,6 +370,7 @@ nb_kernel_ElecGB_VdwCSTab_GeomP1P1_VF_sse2_double
             FF               = _mm_add_pd(Fp,_mm_mul_pd(gbeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
             fgb              = _mm_mul_pd(gbqqfactor,_mm_mul_pd(FF,gbscale));
             dvdatmp          = _mm_mul_pd(minushalf,_mm_add_pd(vgb,_mm_mul_pd(fgb,r00)));
+            dvdatmp          = _mm_unpacklo_pd(dvdatmp,_mm_setzero_pd());
             dvdasum          = _mm_add_pd(dvdasum,dvdatmp);
             gmx_mm_increment_1real_pd(dvda+jnrA,_mm_mul_pd(dvdatmp,_mm_mul_pd(isaj0,isaj0)));
             velec            = _mm_mul_pd(qq00,rinv00);
@@ -766,6 +767,7 @@ nb_kernel_ElecGB_VdwCSTab_GeomP1P1_F_sse2_double
             FF               = _mm_add_pd(Fp,_mm_mul_pd(gbeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
             fgb              = _mm_mul_pd(gbqqfactor,_mm_mul_pd(FF,gbscale));
             dvdatmp          = _mm_mul_pd(minushalf,_mm_add_pd(vgb,_mm_mul_pd(fgb,r00)));
+            dvdatmp          = _mm_unpacklo_pd(dvdatmp,_mm_setzero_pd());
             dvdasum          = _mm_add_pd(dvdasum,dvdatmp);
             gmx_mm_increment_1real_pd(dvda+jnrA,_mm_mul_pd(dvdatmp,_mm_mul_pd(isaj0,isaj0)));
             velec            = _mm_mul_pd(qq00,rinv00);
index 32ef729f1a743f0f743d4ee81fb8b00045f09c61..17c4726821b208b9ffddedd2142909c41fd9884b 100644 (file)
@@ -336,6 +336,7 @@ nb_kernel_ElecGB_VdwLJ_GeomP1P1_VF_sse2_double
             FF               = _mm_add_pd(Fp,_mm_mul_pd(gbeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
             fgb              = _mm_mul_pd(gbqqfactor,_mm_mul_pd(FF,gbscale));
             dvdatmp          = _mm_mul_pd(minushalf,_mm_add_pd(vgb,_mm_mul_pd(fgb,r00)));
+            dvdatmp          = _mm_unpacklo_pd(dvdatmp,_mm_setzero_pd());
             dvdasum          = _mm_add_pd(dvdasum,dvdatmp);
             gmx_mm_increment_1real_pd(dvda+jnrA,_mm_mul_pd(dvdatmp,_mm_mul_pd(isaj0,isaj0)));
             velec            = _mm_mul_pd(qq00,rinv00);
@@ -677,6 +678,7 @@ nb_kernel_ElecGB_VdwLJ_GeomP1P1_F_sse2_double
             FF               = _mm_add_pd(Fp,_mm_mul_pd(gbeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
             fgb              = _mm_mul_pd(gbqqfactor,_mm_mul_pd(FF,gbscale));
             dvdatmp          = _mm_mul_pd(minushalf,_mm_add_pd(vgb,_mm_mul_pd(fgb,r00)));
+            dvdatmp          = _mm_unpacklo_pd(dvdatmp,_mm_setzero_pd());
             dvdasum          = _mm_add_pd(dvdasum,dvdatmp);
             gmx_mm_increment_1real_pd(dvda+jnrA,_mm_mul_pd(dvdatmp,_mm_mul_pd(isaj0,isaj0)));
             velec            = _mm_mul_pd(qq00,rinv00);
index 6adfa8ed872eb4afb42598d2dc5a3fb674561162..79660d0d0f563dcd0938bc94e6f5254166b6d486 100644 (file)
@@ -306,6 +306,7 @@ nb_kernel_ElecGB_VdwNone_GeomP1P1_VF_sse2_double
             FF               = _mm_add_pd(Fp,_mm_mul_pd(gbeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
             fgb              = _mm_mul_pd(gbqqfactor,_mm_mul_pd(FF,gbscale));
             dvdatmp          = _mm_mul_pd(minushalf,_mm_add_pd(vgb,_mm_mul_pd(fgb,r00)));
+            dvdatmp          = _mm_unpacklo_pd(dvdatmp,_mm_setzero_pd());
             dvdasum          = _mm_add_pd(dvdasum,dvdatmp);
             gmx_mm_increment_1real_pd(dvda+jnrA,_mm_mul_pd(dvdatmp,_mm_mul_pd(isaj0,isaj0)));
             velec            = _mm_mul_pd(qq00,rinv00);
@@ -611,6 +612,7 @@ nb_kernel_ElecGB_VdwNone_GeomP1P1_F_sse2_double
             FF               = _mm_add_pd(Fp,_mm_mul_pd(gbeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
             fgb              = _mm_mul_pd(gbqqfactor,_mm_mul_pd(FF,gbscale));
             dvdatmp          = _mm_mul_pd(minushalf,_mm_add_pd(vgb,_mm_mul_pd(fgb,r00)));
+            dvdatmp          = _mm_unpacklo_pd(dvdatmp,_mm_setzero_pd());
             dvdasum          = _mm_add_pd(dvdasum,dvdatmp);
             gmx_mm_increment_1real_pd(dvda+jnrA,_mm_mul_pd(dvdatmp,_mm_mul_pd(isaj0,isaj0)));
             velec            = _mm_mul_pd(qq00,rinv00);
index 968cf64f0085f27b9a77c0f2cc2533a93bd80855..b98c5965f607c7496af16aea51d7a6983a2d508e 100644 (file)
@@ -617,6 +617,9 @@ void
             FF               = _mm_add_pd(Fp,_mm_mul_pd(gbeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
             fgb              = _mm_mul_pd(gbqqfactor,_mm_mul_pd(FF,gbscale));
             dvdatmp          = _mm_mul_pd(minushalf,_mm_add_pd(vgb,_mm_mul_pd(fgb,r{I}{J})));
+            /*                 #if ROUND == 'Epilogue' */
+            dvdatmp          = _mm_unpacklo_pd(dvdatmp,_mm_setzero_pd());
+            /*                 #endif */
             dvdasum          = _mm_add_pd(dvdasum,dvdatmp);
             /*             #if ROUND == 'Loop' */
             gmx_mm_increment_2real_swizzle_pd(dvda+jnrA,dvda+jnrB,_mm_mul_pd(dvdatmp,_mm_mul_pd(isaj{J},isaj{J})));
index 0b271c51fb2dc1e958805b40eec616b7b5254891..fd4ceeda2f973d899d6887bf9fdd4b06eb6b25be 100644 (file)
@@ -423,6 +423,7 @@ nb_kernel_ElecGB_VdwCSTab_GeomP1P1_VF_sse2_single
             FF               = _mm_add_ps(Fp,_mm_mul_ps(gbeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
             fgb              = _mm_mul_ps(gbqqfactor,_mm_mul_ps(FF,gbscale));
             dvdatmp          = _mm_mul_ps(minushalf,_mm_add_ps(vgb,_mm_mul_ps(fgb,r00)));
+            dvdatmp          = _mm_andnot_ps(dummy_mask,dvdatmp);
             dvdasum          = _mm_add_ps(dvdasum,dvdatmp);
             /* The pointers to scratch make sure that this code with compilers that take gmx_restrict seriously (e.g. icc 13) really can't screw things up. */
             fjptrA             = (jnrlistA>=0) ? dvda+jnrA : scratch;
@@ -879,6 +880,7 @@ nb_kernel_ElecGB_VdwCSTab_GeomP1P1_F_sse2_single
             FF               = _mm_add_ps(Fp,_mm_mul_ps(gbeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
             fgb              = _mm_mul_ps(gbqqfactor,_mm_mul_ps(FF,gbscale));
             dvdatmp          = _mm_mul_ps(minushalf,_mm_add_ps(vgb,_mm_mul_ps(fgb,r00)));
+            dvdatmp          = _mm_andnot_ps(dummy_mask,dvdatmp);
             dvdasum          = _mm_add_ps(dvdasum,dvdatmp);
             /* The pointers to scratch make sure that this code with compilers that take gmx_restrict seriously (e.g. icc 13) really can't screw things up. */
             fjptrA             = (jnrlistA>=0) ? dvda+jnrA : scratch;
index 1f559cea4536a54d7cf17aaf85d060d11814d760..fa59ea9b03a0b77d604dda152fe952d475600644 100644 (file)
@@ -391,6 +391,7 @@ nb_kernel_ElecGB_VdwLJ_GeomP1P1_VF_sse2_single
             FF               = _mm_add_ps(Fp,_mm_mul_ps(gbeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
             fgb              = _mm_mul_ps(gbqqfactor,_mm_mul_ps(FF,gbscale));
             dvdatmp          = _mm_mul_ps(minushalf,_mm_add_ps(vgb,_mm_mul_ps(fgb,r00)));
+            dvdatmp          = _mm_andnot_ps(dummy_mask,dvdatmp);
             dvdasum          = _mm_add_ps(dvdasum,dvdatmp);
             /* The pointers to scratch make sure that this code with compilers that take gmx_restrict seriously (e.g. icc 13) really can't screw things up. */
             fjptrA             = (jnrlistA>=0) ? dvda+jnrA : scratch;
@@ -796,6 +797,7 @@ nb_kernel_ElecGB_VdwLJ_GeomP1P1_F_sse2_single
             FF               = _mm_add_ps(Fp,_mm_mul_ps(gbeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
             fgb              = _mm_mul_ps(gbqqfactor,_mm_mul_ps(FF,gbscale));
             dvdatmp          = _mm_mul_ps(minushalf,_mm_add_ps(vgb,_mm_mul_ps(fgb,r00)));
+            dvdatmp          = _mm_andnot_ps(dummy_mask,dvdatmp);
             dvdasum          = _mm_add_ps(dvdasum,dvdatmp);
             /* The pointers to scratch make sure that this code with compilers that take gmx_restrict seriously (e.g. icc 13) really can't screw things up. */
             fjptrA             = (jnrlistA>=0) ? dvda+jnrA : scratch;
index 2ba293f393b308d7c82b4d3d5c464362e1311d18..76118bc5032c82dd89e92888b3c1d0d5c13a581a 100644 (file)
@@ -349,6 +349,7 @@ nb_kernel_ElecGB_VdwNone_GeomP1P1_VF_sse2_single
             FF               = _mm_add_ps(Fp,_mm_mul_ps(gbeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
             fgb              = _mm_mul_ps(gbqqfactor,_mm_mul_ps(FF,gbscale));
             dvdatmp          = _mm_mul_ps(minushalf,_mm_add_ps(vgb,_mm_mul_ps(fgb,r00)));
+            dvdatmp          = _mm_andnot_ps(dummy_mask,dvdatmp);
             dvdasum          = _mm_add_ps(dvdasum,dvdatmp);
             /* The pointers to scratch make sure that this code with compilers that take gmx_restrict seriously (e.g. icc 13) really can't screw things up. */
             fjptrA             = (jnrlistA>=0) ? dvda+jnrA : scratch;
@@ -706,6 +707,7 @@ nb_kernel_ElecGB_VdwNone_GeomP1P1_F_sse2_single
             FF               = _mm_add_ps(Fp,_mm_mul_ps(gbeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
             fgb              = _mm_mul_ps(gbqqfactor,_mm_mul_ps(FF,gbscale));
             dvdatmp          = _mm_mul_ps(minushalf,_mm_add_ps(vgb,_mm_mul_ps(fgb,r00)));
+            dvdatmp          = _mm_andnot_ps(dummy_mask,dvdatmp);
             dvdasum          = _mm_add_ps(dvdasum,dvdatmp);
             /* The pointers to scratch make sure that this code with compilers that take gmx_restrict seriously (e.g. icc 13) really can't screw things up. */
             fjptrA             = (jnrlistA>=0) ? dvda+jnrA : scratch;
index 572b896d988d1b27fffb43270c328cd70598502f..8e5d4146e62fb784c6f58fe60bf699491a8a9ff6 100644 (file)
@@ -617,6 +617,9 @@ void
             FF               = _mm_add_ps(Fp,_mm_mul_ps(gbeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
             fgb              = _mm_mul_ps(gbqqfactor,_mm_mul_ps(FF,gbscale));
             dvdatmp          = _mm_mul_ps(minushalf,_mm_add_ps(vgb,_mm_mul_ps(fgb,r{I}{J})));
+            /*                 #if ROUND == 'Epilogue' */
+            dvdatmp          = _mm_andnot_ps(dummy_mask,dvdatmp);
+            /*                 #endif */
             dvdasum          = _mm_add_ps(dvdasum,dvdatmp);
             /*                 #if ROUND == 'Loop' */
             fjptrA           = dvda+jnrA;
index ac3ee765dfa852b500f0fec31576deaddd5e07b3..bcc54baf060848aedb23238b225ee6258fff8c41 100644 (file)
@@ -370,6 +370,7 @@ nb_kernel_ElecGB_VdwCSTab_GeomP1P1_VF_sse4_1_double
             FF               = _mm_add_pd(Fp,_mm_mul_pd(gbeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
             fgb              = _mm_mul_pd(gbqqfactor,_mm_mul_pd(FF,gbscale));
             dvdatmp          = _mm_mul_pd(minushalf,_mm_add_pd(vgb,_mm_mul_pd(fgb,r00)));
+            dvdatmp          = _mm_unpacklo_pd(dvdatmp,_mm_setzero_pd());
             dvdasum          = _mm_add_pd(dvdasum,dvdatmp);
             gmx_mm_increment_1real_pd(dvda+jnrA,_mm_mul_pd(dvdatmp,_mm_mul_pd(isaj0,isaj0)));
             velec            = _mm_mul_pd(qq00,rinv00);
@@ -766,6 +767,7 @@ nb_kernel_ElecGB_VdwCSTab_GeomP1P1_F_sse4_1_double
             FF               = _mm_add_pd(Fp,_mm_mul_pd(gbeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
             fgb              = _mm_mul_pd(gbqqfactor,_mm_mul_pd(FF,gbscale));
             dvdatmp          = _mm_mul_pd(minushalf,_mm_add_pd(vgb,_mm_mul_pd(fgb,r00)));
+            dvdatmp          = _mm_unpacklo_pd(dvdatmp,_mm_setzero_pd());
             dvdasum          = _mm_add_pd(dvdasum,dvdatmp);
             gmx_mm_increment_1real_pd(dvda+jnrA,_mm_mul_pd(dvdatmp,_mm_mul_pd(isaj0,isaj0)));
             velec            = _mm_mul_pd(qq00,rinv00);
index 48251a8ea484280320161397e1f66967e9d8b5d6..75a8a9a8d24e699d9e3c5832964d5f73e34022f4 100644 (file)
@@ -336,6 +336,7 @@ nb_kernel_ElecGB_VdwLJ_GeomP1P1_VF_sse4_1_double
             FF               = _mm_add_pd(Fp,_mm_mul_pd(gbeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
             fgb              = _mm_mul_pd(gbqqfactor,_mm_mul_pd(FF,gbscale));
             dvdatmp          = _mm_mul_pd(minushalf,_mm_add_pd(vgb,_mm_mul_pd(fgb,r00)));
+            dvdatmp          = _mm_unpacklo_pd(dvdatmp,_mm_setzero_pd());
             dvdasum          = _mm_add_pd(dvdasum,dvdatmp);
             gmx_mm_increment_1real_pd(dvda+jnrA,_mm_mul_pd(dvdatmp,_mm_mul_pd(isaj0,isaj0)));
             velec            = _mm_mul_pd(qq00,rinv00);
@@ -677,6 +678,7 @@ nb_kernel_ElecGB_VdwLJ_GeomP1P1_F_sse4_1_double
             FF               = _mm_add_pd(Fp,_mm_mul_pd(gbeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
             fgb              = _mm_mul_pd(gbqqfactor,_mm_mul_pd(FF,gbscale));
             dvdatmp          = _mm_mul_pd(minushalf,_mm_add_pd(vgb,_mm_mul_pd(fgb,r00)));
+            dvdatmp          = _mm_unpacklo_pd(dvdatmp,_mm_setzero_pd());
             dvdasum          = _mm_add_pd(dvdasum,dvdatmp);
             gmx_mm_increment_1real_pd(dvda+jnrA,_mm_mul_pd(dvdatmp,_mm_mul_pd(isaj0,isaj0)));
             velec            = _mm_mul_pd(qq00,rinv00);
index 932e7669c4b87f0a56ba3f37c96cb415800aca76..988834b9fbcd76da59de0a6581640a16035de601 100644 (file)
@@ -306,6 +306,7 @@ nb_kernel_ElecGB_VdwNone_GeomP1P1_VF_sse4_1_double
             FF               = _mm_add_pd(Fp,_mm_mul_pd(gbeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
             fgb              = _mm_mul_pd(gbqqfactor,_mm_mul_pd(FF,gbscale));
             dvdatmp          = _mm_mul_pd(minushalf,_mm_add_pd(vgb,_mm_mul_pd(fgb,r00)));
+            dvdatmp          = _mm_unpacklo_pd(dvdatmp,_mm_setzero_pd());
             dvdasum          = _mm_add_pd(dvdasum,dvdatmp);
             gmx_mm_increment_1real_pd(dvda+jnrA,_mm_mul_pd(dvdatmp,_mm_mul_pd(isaj0,isaj0)));
             velec            = _mm_mul_pd(qq00,rinv00);
@@ -611,6 +612,7 @@ nb_kernel_ElecGB_VdwNone_GeomP1P1_F_sse4_1_double
             FF               = _mm_add_pd(Fp,_mm_mul_pd(gbeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
             fgb              = _mm_mul_pd(gbqqfactor,_mm_mul_pd(FF,gbscale));
             dvdatmp          = _mm_mul_pd(minushalf,_mm_add_pd(vgb,_mm_mul_pd(fgb,r00)));
+            dvdatmp          = _mm_unpacklo_pd(dvdatmp,_mm_setzero_pd());
             dvdasum          = _mm_add_pd(dvdasum,dvdatmp);
             gmx_mm_increment_1real_pd(dvda+jnrA,_mm_mul_pd(dvdatmp,_mm_mul_pd(isaj0,isaj0)));
             velec            = _mm_mul_pd(qq00,rinv00);
index 3548d76cb638a8bcdb078ee079265d1f6571ebb3..9c84a4233f9892369d0a0a93f8158716e5b38c04 100644 (file)
@@ -617,6 +617,9 @@ void
             FF               = _mm_add_pd(Fp,_mm_mul_pd(gbeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
             fgb              = _mm_mul_pd(gbqqfactor,_mm_mul_pd(FF,gbscale));
             dvdatmp          = _mm_mul_pd(minushalf,_mm_add_pd(vgb,_mm_mul_pd(fgb,r{I}{J})));
+            /*                 #if ROUND == 'Epilogue' */
+            dvdatmp          = _mm_unpacklo_pd(dvdatmp,_mm_setzero_pd());
+            /*                 #endif */
             dvdasum          = _mm_add_pd(dvdasum,dvdatmp);
             /*             #if ROUND == 'Loop' */
             gmx_mm_increment_2real_swizzle_pd(dvda+jnrA,dvda+jnrB,_mm_mul_pd(dvdatmp,_mm_mul_pd(isaj{J},isaj{J})));
index ab5c1b5ab2d832a9e0cf0772e4735a430d08f9e1..63810cd6a8a816d4b12d6fbf8bd5ef40a14cab76 100644 (file)
@@ -421,6 +421,7 @@ nb_kernel_ElecGB_VdwCSTab_GeomP1P1_VF_sse4_1_single
             FF               = _mm_add_ps(Fp,_mm_mul_ps(gbeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
             fgb              = _mm_mul_ps(gbqqfactor,_mm_mul_ps(FF,gbscale));
             dvdatmp          = _mm_mul_ps(minushalf,_mm_add_ps(vgb,_mm_mul_ps(fgb,r00)));
+            dvdatmp          = _mm_andnot_ps(dummy_mask,dvdatmp);
             dvdasum          = _mm_add_ps(dvdasum,dvdatmp);
             /* The pointers to scratch make sure that this code with compilers that take gmx_restrict seriously (e.g. icc 13) really can't screw things up. */
             fjptrA             = (jnrlistA>=0) ? dvda+jnrA : scratch;
@@ -875,6 +876,7 @@ nb_kernel_ElecGB_VdwCSTab_GeomP1P1_F_sse4_1_single
             FF               = _mm_add_ps(Fp,_mm_mul_ps(gbeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
             fgb              = _mm_mul_ps(gbqqfactor,_mm_mul_ps(FF,gbscale));
             dvdatmp          = _mm_mul_ps(minushalf,_mm_add_ps(vgb,_mm_mul_ps(fgb,r00)));
+            dvdatmp          = _mm_andnot_ps(dummy_mask,dvdatmp);
             dvdasum          = _mm_add_ps(dvdasum,dvdatmp);
             /* The pointers to scratch make sure that this code with compilers that take gmx_restrict seriously (e.g. icc 13) really can't screw things up. */
             fjptrA             = (jnrlistA>=0) ? dvda+jnrA : scratch;
index 8dd0cf706e92cc2cd3c060a31dbf55dc9fd543a9..32294408bcc97e5bda387639805000b1b9f87e87 100644 (file)
@@ -389,6 +389,7 @@ nb_kernel_ElecGB_VdwLJ_GeomP1P1_VF_sse4_1_single
             FF               = _mm_add_ps(Fp,_mm_mul_ps(gbeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
             fgb              = _mm_mul_ps(gbqqfactor,_mm_mul_ps(FF,gbscale));
             dvdatmp          = _mm_mul_ps(minushalf,_mm_add_ps(vgb,_mm_mul_ps(fgb,r00)));
+            dvdatmp          = _mm_andnot_ps(dummy_mask,dvdatmp);
             dvdasum          = _mm_add_ps(dvdasum,dvdatmp);
             /* The pointers to scratch make sure that this code with compilers that take gmx_restrict seriously (e.g. icc 13) really can't screw things up. */
             fjptrA             = (jnrlistA>=0) ? dvda+jnrA : scratch;
@@ -792,6 +793,7 @@ nb_kernel_ElecGB_VdwLJ_GeomP1P1_F_sse4_1_single
             FF               = _mm_add_ps(Fp,_mm_mul_ps(gbeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
             fgb              = _mm_mul_ps(gbqqfactor,_mm_mul_ps(FF,gbscale));
             dvdatmp          = _mm_mul_ps(minushalf,_mm_add_ps(vgb,_mm_mul_ps(fgb,r00)));
+            dvdatmp          = _mm_andnot_ps(dummy_mask,dvdatmp);
             dvdasum          = _mm_add_ps(dvdasum,dvdatmp);
             /* The pointers to scratch make sure that this code with compilers that take gmx_restrict seriously (e.g. icc 13) really can't screw things up. */
             fjptrA             = (jnrlistA>=0) ? dvda+jnrA : scratch;
index 5e2a9a6555cf233a81904df8f0ad3703f60406da..9765943d07f40e61f4e13a8b88abd7b7cf6ecbdf 100644 (file)
@@ -347,6 +347,7 @@ nb_kernel_ElecGB_VdwNone_GeomP1P1_VF_sse4_1_single
             FF               = _mm_add_ps(Fp,_mm_mul_ps(gbeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
             fgb              = _mm_mul_ps(gbqqfactor,_mm_mul_ps(FF,gbscale));
             dvdatmp          = _mm_mul_ps(minushalf,_mm_add_ps(vgb,_mm_mul_ps(fgb,r00)));
+            dvdatmp          = _mm_andnot_ps(dummy_mask,dvdatmp);
             dvdasum          = _mm_add_ps(dvdasum,dvdatmp);
             /* The pointers to scratch make sure that this code with compilers that take gmx_restrict seriously (e.g. icc 13) really can't screw things up. */
             fjptrA             = (jnrlistA>=0) ? dvda+jnrA : scratch;
@@ -702,6 +703,7 @@ nb_kernel_ElecGB_VdwNone_GeomP1P1_F_sse4_1_single
             FF               = _mm_add_ps(Fp,_mm_mul_ps(gbeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
             fgb              = _mm_mul_ps(gbqqfactor,_mm_mul_ps(FF,gbscale));
             dvdatmp          = _mm_mul_ps(minushalf,_mm_add_ps(vgb,_mm_mul_ps(fgb,r00)));
+            dvdatmp          = _mm_andnot_ps(dummy_mask,dvdatmp);
             dvdasum          = _mm_add_ps(dvdasum,dvdatmp);
             /* The pointers to scratch make sure that this code with compilers that take gmx_restrict seriously (e.g. icc 13) really can't screw things up. */
             fjptrA             = (jnrlistA>=0) ? dvda+jnrA : scratch;
index 975f9e7e47fb4e5cf7dd4e75706a636cba9345f7..c30b3d7e2b8f112a708d83e75f821ab6340e2b95 100644 (file)
@@ -616,6 +616,9 @@ void
             FF               = _mm_add_ps(Fp,_mm_mul_ps(gbeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
             fgb              = _mm_mul_ps(gbqqfactor,_mm_mul_ps(FF,gbscale));
             dvdatmp          = _mm_mul_ps(minushalf,_mm_add_ps(vgb,_mm_mul_ps(fgb,r{I}{J})));
+            /*                 #if ROUND == 'Epilogue' */
+            dvdatmp          = _mm_andnot_ps(dummy_mask,dvdatmp);
+            /*                 #endif */
             dvdasum          = _mm_add_ps(dvdasum,dvdatmp);
             /*                 #if ROUND == 'Loop' */
             fjptrA           = dvda+jnrA;
index 3b107c9dbd9f5ad2420e26350e0fbd8176899fb6..1b89d9ac396012047fc7fde0fca6f4b7e634ba14 100644 (file)
@@ -2161,6 +2161,7 @@ void init_forcerec(FILE *fp,
             fr->bMolPBC = dd_bonded_molpbc(cr->dd,fr->ePBC);
         }
     }
+    fr->bGB = (ir->implicit_solvent == eisGBSA);
 
     fr->rc_scaling = ir->refcoord_scaling;
     copy_rvec(ir->posres_com,fr->posres_com);
@@ -2177,7 +2178,7 @@ void init_forcerec(FILE *fp,
     switch(fr->eeltype)
     {
         case eelCUT:
-            fr->nbkernel_elec_interaction = GMX_NBKERNEL_ELEC_COULOMB;
+            fr->nbkernel_elec_interaction = (fr->bGB) ? GMX_NBKERNEL_ELEC_GENERALIZEDBORN : GMX_NBKERNEL_ELEC_COULOMB;
             break;
 
         case eelRF:
@@ -2428,7 +2429,6 @@ void init_forcerec(FILE *fp,
         set_bham_b_max(fp,fr,mtop);
     }
 
-    fr->bGB = (ir->implicit_solvent == eisGBSA);
        fr->gb_epsilon_solvent = ir->gb_epsilon_solvent;
 
     /* Copy the GBSA data (radius, volume and surftens for each
index ae95462fcb52c986465fdbf00296f65bb3b802e3..c2cd4a7f5ee627a860cd48d60a3840c510d1ad02 100644 (file)
@@ -498,22 +498,22 @@ calc_gb_rad_still(t_commrec *cr, t_forcerec *fr,int natoms, gmx_localtop_t *top,
     real rinv,idr2,idr6,vaj,dccf,cosq,sinq,prod,gpi2;
     real factor;
     real vai, prod_ai, icf4,icf6;
-    
+
     factor  = 0.5*ONE_4PI_EPS0;
     n       = 0;
-    
+
     for(i=0;i<born->nr;i++)
     {
         born->gpol_still_work[i]=0;
     }
-     
-       for(i=0;i<nl->nri;i++ )
+
+    for(i=0;i<nl->nri;i++ )
     {
         ai      = nl->iinr[i];
-        
+
         nj0     = nl->jindex[i];            
         nj1     = nl->jindex[i+1];
-    
+
         /* Load shifts for this list */
         shift   = nl->shift[i];
         shX     = fr->shift_vec[shift][0];
@@ -530,8 +530,8 @@ calc_gb_rad_still(t_commrec *cr, t_forcerec *fr,int natoms, gmx_localtop_t *top,
         ix1     = shX + x[ai][0];
         iy1     = shY + x[ai][1];
         iz1     = shZ + x[ai][2];
-                        
-        for(k=nj0;k<nj1;k++)
+
+        for(k=nj0;k<nj1 && nl->jjnr[k]>=0;k++)
         {
             aj    = nl->jjnr[k];
             jx1   = x[aj][0];
@@ -555,7 +555,7 @@ calc_gb_rad_still(t_commrec *cr, t_forcerec *fr,int natoms, gmx_localtop_t *top,
             ratio = dr2 / (rvdw * rvdw);
             vaj   = born->vsolv[aj];
             
-            if(ratio>STILL_P5INV) 
+            if(ratio>STILL_P5INV)
             {
                 ccf=1.0;
                 dccf=0.0;
@@ -573,7 +573,6 @@ calc_gb_rad_still(t_commrec *cr, t_forcerec *fr,int natoms, gmx_localtop_t *top,
             prod          = STILL_P4*vaj;
             icf4          = ccf*idr4;
             icf6          = (4*ccf-dccf)*idr6;
-
             born->gpol_still_work[aj] += prod_ai*icf4;
             gpi             = gpi+prod*icf4;
             
@@ -599,7 +598,6 @@ calc_gb_rad_still(t_commrec *cr, t_forcerec *fr,int natoms, gmx_localtop_t *top,
     {
                if(born->use[i] != 0)
         {
-               
             gpi  = born->gpol[i]+born->gpol_still_work[i];
             gpi2 = gpi * gpi;
             born->bRad[i]   = factor*gmx_invsqrt(gpi2);
@@ -673,7 +671,7 @@ calc_gb_rad_hct(t_commrec *cr,t_forcerec *fr,int natoms, gmx_localtop_t *top,
         
         sum_ai  = 0;
         
-        for(k=nj0;k<nj1;k++)
+        for(k=nj0;k<nj1 && nl->jjnr[k]>=0;k++)
         {
             aj    = nl->jjnr[k];
             
@@ -893,7 +891,7 @@ calc_gb_rad_obc(t_commrec *cr, t_forcerec *fr, int natoms, gmx_localtop_t *top,
         
         sum_ai   = 0;
         
-        for(k=nj0;k<nj1;k++)
+        for(k=nj0;k<nj1 && nl->jjnr[k]>=0;k++)
         {
             aj    = nl->jjnr[k];
             
@@ -1072,7 +1070,7 @@ int calc_gb_rad(t_commrec *cr, t_forcerec *fr, t_inputrec *ir,gmx_localtop_t *to
     real *p;
     int   cnt;
     int ndadx;
-
+    
     if(fr->bAllvsAll && fr->dadx==NULL)
     {
         /* We might need up to 8 atoms of padding before and after, 
@@ -1260,7 +1258,7 @@ int calc_gb_rad(t_commrec *cr, t_forcerec *fr, t_inputrec *ir,gmx_localtop_t *to
     switch(ir->gb_algorithm)
     {
         case egbSTILL:
-            calc_gb_rad_still(cr,fr,born->nr,top,atype,x,nl,born,md); 
+            calc_gb_rad_still(cr,fr,born->nr,top,atype,x,nl,born,md);
             break;
         case egbHCT:
             calc_gb_rad_hct(cr,fr,born->nr,top,atype,x,nl,born,md); 
@@ -1573,7 +1571,7 @@ real calc_gb_chainrule(int natoms, t_nblist *nl, real *dadx, real *dvda, rvec x[
         
         rbai = rb[ai];
         
-        for(k=nj0;k<nj1;k++)
+        for(k=nj0;k<nj1 && nl->jjnr[k]>=0;k++)
         {
             aj = nl->jjnr[k];