From fa181964de6c9e064fe474940c78e00bfd96117f Mon Sep 17 00:00:00 2001 From: Erik Lindahl Date: Sun, 13 Jan 2013 11:21:05 -0800 Subject: [PATCH] Fixed GB interactions for release-4-6 Release-4-6 was not using the correct GB kernels, but the non-GB electrostatics cutoff kernel. In addition, the accelerated kernels unrolled more than a factor 2 had a bug that caused a small error in the forces. Both these issues have been fixed, and the GB results now match release-4.5. Note: The GB _radius_ calculations are still only done in C, but the accelerated options will be re-enabled in a pending patch (separated for clarity). Fixes #1096. Change-Id: I40151b9a9f1920006bfe9a39a8719698a824bfac fix Change-Id: I45630252d6a9c05a7f3695a7e6d07dcd13e0a25c --- ...cGB_VdwCSTab_GeomP1P1_avx_128_fma_double.c | 2 ++ ...ElecGB_VdwLJ_GeomP1P1_avx_128_fma_double.c | 2 ++ ...ecGB_VdwNone_GeomP1P1_avx_128_fma_double.c | 2 ++ .../nb_kernel_template_avx_128_fma_double.pre | 3 ++ ...cGB_VdwCSTab_GeomP1P1_avx_128_fma_single.c | 2 ++ ...ElecGB_VdwLJ_GeomP1P1_avx_128_fma_single.c | 2 ++ ...ecGB_VdwNone_GeomP1P1_avx_128_fma_single.c | 2 ++ .../nb_kernel_template_avx_128_fma_single.pre | 3 ++ ..._ElecGB_VdwCSTab_GeomP1P1_avx_256_double.c | 2 ++ ...nel_ElecGB_VdwLJ_GeomP1P1_avx_256_double.c | 2 ++ ...l_ElecGB_VdwNone_GeomP1P1_avx_256_double.c | 2 ++ .../nb_kernel_template_avx_256_double.pre | 3 ++ ..._ElecGB_VdwCSTab_GeomP1P1_avx_256_single.c | 2 ++ ...nel_ElecGB_VdwLJ_GeomP1P1_avx_256_single.c | 2 ++ ...l_ElecGB_VdwNone_GeomP1P1_avx_256_single.c | 2 ++ .../nb_kernel_template_avx_256_single.pre | 3 ++ .../nb_kernel_ElecGB_VdwLJ_GeomP1P1_c.c | 5 +++- ...nel_ElecGB_VdwCSTab_GeomP1P1_sse2_double.c | 2 ++ ...kernel_ElecGB_VdwLJ_GeomP1P1_sse2_double.c | 2 ++ ...rnel_ElecGB_VdwNone_GeomP1P1_sse2_double.c | 2 ++ .../nb_kernel_template_sse2_double.pre | 3 ++ ...nel_ElecGB_VdwCSTab_GeomP1P1_sse2_single.c | 2 ++ ...kernel_ElecGB_VdwLJ_GeomP1P1_sse2_single.c | 2 ++ ...rnel_ElecGB_VdwNone_GeomP1P1_sse2_single.c | 
2 ++ .../nb_kernel_template_sse2_single.pre | 3 ++ ...l_ElecGB_VdwCSTab_GeomP1P1_sse4_1_double.c | 2 ++ ...rnel_ElecGB_VdwLJ_GeomP1P1_sse4_1_double.c | 2 ++ ...el_ElecGB_VdwNone_GeomP1P1_sse4_1_double.c | 2 ++ .../nb_kernel_template_sse4_1_double.pre | 3 ++ ...l_ElecGB_VdwCSTab_GeomP1P1_sse4_1_single.c | 2 ++ ...rnel_ElecGB_VdwLJ_GeomP1P1_sse4_1_single.c | 2 ++ ...el_ElecGB_VdwNone_GeomP1P1_sse4_1_single.c | 2 ++ .../nb_kernel_template_sse4_1_single.pre | 3 ++ src/mdlib/forcerec.c | 4 +-- src/mdlib/genborn.c | 30 +++++++++---------- 35 files changed, 92 insertions(+), 19 deletions(-) diff --git a/src/gmxlib/nonbonded/nb_kernel_avx_128_fma_double/nb_kernel_ElecGB_VdwCSTab_GeomP1P1_avx_128_fma_double.c b/src/gmxlib/nonbonded/nb_kernel_avx_128_fma_double/nb_kernel_ElecGB_VdwCSTab_GeomP1P1_avx_128_fma_double.c index 1590af00f2..ded395087d 100644 --- a/src/gmxlib/nonbonded/nb_kernel_avx_128_fma_double/nb_kernel_ElecGB_VdwCSTab_GeomP1P1_avx_128_fma_double.c +++ b/src/gmxlib/nonbonded/nb_kernel_avx_128_fma_double/nb_kernel_ElecGB_VdwCSTab_GeomP1P1_avx_128_fma_double.c @@ -384,6 +384,7 @@ nb_kernel_ElecGB_VdwCSTab_GeomP1P1_VF_avx_128_fma_double FF = _mm_macc_pd(_mm_macc_pd(twogbeps,H,G),gbeps,Fp); fgb = _mm_mul_pd(gbqqfactor,_mm_mul_pd(FF,gbscale)); dvdatmp = _mm_mul_pd(minushalf,_mm_macc_pd(fgb,r00,vgb)); + dvdatmp = _mm_unpacklo_pd(dvdatmp,_mm_setzero_pd()); dvdasum = _mm_add_pd(dvdasum,dvdatmp); gmx_mm_increment_1real_pd(dvda+jnrA,_mm_mul_pd(dvdatmp,_mm_mul_pd(isaj0,isaj0))); velec = _mm_mul_pd(qq00,rinv00); @@ -790,6 +791,7 @@ nb_kernel_ElecGB_VdwCSTab_GeomP1P1_F_avx_128_fma_double FF = _mm_macc_pd(_mm_macc_pd(twogbeps,H,G),gbeps,Fp); fgb = _mm_mul_pd(gbqqfactor,_mm_mul_pd(FF,gbscale)); dvdatmp = _mm_mul_pd(minushalf,_mm_macc_pd(fgb,r00,vgb)); + dvdatmp = _mm_unpacklo_pd(dvdatmp,_mm_setzero_pd()); dvdasum = _mm_add_pd(dvdasum,dvdatmp); gmx_mm_increment_1real_pd(dvda+jnrA,_mm_mul_pd(dvdatmp,_mm_mul_pd(isaj0,isaj0))); velec = _mm_mul_pd(qq00,rinv00); diff --git 
a/src/gmxlib/nonbonded/nb_kernel_avx_128_fma_double/nb_kernel_ElecGB_VdwLJ_GeomP1P1_avx_128_fma_double.c b/src/gmxlib/nonbonded/nb_kernel_avx_128_fma_double/nb_kernel_ElecGB_VdwLJ_GeomP1P1_avx_128_fma_double.c index 1180dd8ae2..0436e20856 100644 --- a/src/gmxlib/nonbonded/nb_kernel_avx_128_fma_double/nb_kernel_ElecGB_VdwLJ_GeomP1P1_avx_128_fma_double.c +++ b/src/gmxlib/nonbonded/nb_kernel_avx_128_fma_double/nb_kernel_ElecGB_VdwLJ_GeomP1P1_avx_128_fma_double.c @@ -342,6 +342,7 @@ nb_kernel_ElecGB_VdwLJ_GeomP1P1_VF_avx_128_fma_double FF = _mm_macc_pd(_mm_macc_pd(twogbeps,H,G),gbeps,Fp); fgb = _mm_mul_pd(gbqqfactor,_mm_mul_pd(FF,gbscale)); dvdatmp = _mm_mul_pd(minushalf,_mm_macc_pd(fgb,r00,vgb)); + dvdatmp = _mm_unpacklo_pd(dvdatmp,_mm_setzero_pd()); dvdasum = _mm_add_pd(dvdasum,dvdatmp); gmx_mm_increment_1real_pd(dvda+jnrA,_mm_mul_pd(dvdatmp,_mm_mul_pd(isaj0,isaj0))); velec = _mm_mul_pd(qq00,rinv00); @@ -687,6 +688,7 @@ nb_kernel_ElecGB_VdwLJ_GeomP1P1_F_avx_128_fma_double FF = _mm_macc_pd(_mm_macc_pd(twogbeps,H,G),gbeps,Fp); fgb = _mm_mul_pd(gbqqfactor,_mm_mul_pd(FF,gbscale)); dvdatmp = _mm_mul_pd(minushalf,_mm_macc_pd(fgb,r00,vgb)); + dvdatmp = _mm_unpacklo_pd(dvdatmp,_mm_setzero_pd()); dvdasum = _mm_add_pd(dvdasum,dvdatmp); gmx_mm_increment_1real_pd(dvda+jnrA,_mm_mul_pd(dvdatmp,_mm_mul_pd(isaj0,isaj0))); velec = _mm_mul_pd(qq00,rinv00); diff --git a/src/gmxlib/nonbonded/nb_kernel_avx_128_fma_double/nb_kernel_ElecGB_VdwNone_GeomP1P1_avx_128_fma_double.c b/src/gmxlib/nonbonded/nb_kernel_avx_128_fma_double/nb_kernel_ElecGB_VdwNone_GeomP1P1_avx_128_fma_double.c index 7cea363fd6..82f53a225b 100644 --- a/src/gmxlib/nonbonded/nb_kernel_avx_128_fma_double/nb_kernel_ElecGB_VdwNone_GeomP1P1_avx_128_fma_double.c +++ b/src/gmxlib/nonbonded/nb_kernel_avx_128_fma_double/nb_kernel_ElecGB_VdwNone_GeomP1P1_avx_128_fma_double.c @@ -312,6 +312,7 @@ nb_kernel_ElecGB_VdwNone_GeomP1P1_VF_avx_128_fma_double FF = _mm_macc_pd(_mm_macc_pd(twogbeps,H,G),gbeps,Fp); fgb = 
_mm_mul_pd(gbqqfactor,_mm_mul_pd(FF,gbscale)); dvdatmp = _mm_mul_pd(minushalf,_mm_macc_pd(fgb,r00,vgb)); + dvdatmp = _mm_unpacklo_pd(dvdatmp,_mm_setzero_pd()); dvdasum = _mm_add_pd(dvdasum,dvdatmp); gmx_mm_increment_1real_pd(dvda+jnrA,_mm_mul_pd(dvdatmp,_mm_mul_pd(isaj0,isaj0))); velec = _mm_mul_pd(qq00,rinv00); @@ -621,6 +622,7 @@ nb_kernel_ElecGB_VdwNone_GeomP1P1_F_avx_128_fma_double FF = _mm_macc_pd(_mm_macc_pd(twogbeps,H,G),gbeps,Fp); fgb = _mm_mul_pd(gbqqfactor,_mm_mul_pd(FF,gbscale)); dvdatmp = _mm_mul_pd(minushalf,_mm_macc_pd(fgb,r00,vgb)); + dvdatmp = _mm_unpacklo_pd(dvdatmp,_mm_setzero_pd()); dvdasum = _mm_add_pd(dvdasum,dvdatmp); gmx_mm_increment_1real_pd(dvda+jnrA,_mm_mul_pd(dvdatmp,_mm_mul_pd(isaj0,isaj0))); velec = _mm_mul_pd(qq00,rinv00); diff --git a/src/gmxlib/nonbonded/nb_kernel_avx_128_fma_double/nb_kernel_template_avx_128_fma_double.pre b/src/gmxlib/nonbonded/nb_kernel_avx_128_fma_double/nb_kernel_template_avx_128_fma_double.pre index 79543b4a3a..715156551c 100644 --- a/src/gmxlib/nonbonded/nb_kernel_avx_128_fma_double/nb_kernel_template_avx_128_fma_double.pre +++ b/src/gmxlib/nonbonded/nb_kernel_avx_128_fma_double/nb_kernel_template_avx_128_fma_double.pre @@ -626,6 +626,9 @@ void FF = _mm_macc_pd(_mm_macc_pd(twogbeps,H,G),gbeps,Fp); fgb = _mm_mul_pd(gbqqfactor,_mm_mul_pd(FF,gbscale)); dvdatmp = _mm_mul_pd(minushalf,_mm_macc_pd(fgb,r{I}{J},vgb)); + /* #if ROUND == 'Epilogue' */ + dvdatmp = _mm_unpacklo_pd(dvdatmp,_mm_setzero_pd()); + /* #endif */ dvdasum = _mm_add_pd(dvdasum,dvdatmp); /* #if ROUND == 'Loop' */ gmx_mm_increment_2real_swizzle_pd(dvda+jnrA,dvda+jnrB,_mm_mul_pd(dvdatmp,_mm_mul_pd(isaj{J},isaj{J}))); diff --git a/src/gmxlib/nonbonded/nb_kernel_avx_128_fma_single/nb_kernel_ElecGB_VdwCSTab_GeomP1P1_avx_128_fma_single.c b/src/gmxlib/nonbonded/nb_kernel_avx_128_fma_single/nb_kernel_ElecGB_VdwCSTab_GeomP1P1_avx_128_fma_single.c index 647bb39abd..2a4b32539c 100644 --- 
a/src/gmxlib/nonbonded/nb_kernel_avx_128_fma_single/nb_kernel_ElecGB_VdwCSTab_GeomP1P1_avx_128_fma_single.c +++ b/src/gmxlib/nonbonded/nb_kernel_avx_128_fma_single/nb_kernel_ElecGB_VdwCSTab_GeomP1P1_avx_128_fma_single.c @@ -437,6 +437,7 @@ nb_kernel_ElecGB_VdwCSTab_GeomP1P1_VF_avx_128_fma_single FF = _mm_macc_ps(_mm_macc_ps(twogbeps,H,G),gbeps,Fp); fgb = _mm_mul_ps(gbqqfactor,_mm_mul_ps(FF,gbscale)); dvdatmp = _mm_mul_ps(minushalf,_mm_macc_ps(fgb,r00,vgb)); + dvdatmp = _mm_andnot_ps(dummy_mask,dvdatmp); dvdasum = _mm_add_ps(dvdasum,dvdatmp); /* The pointers to scratch make sure that this code with compilers that take gmx_restrict seriously (e.g. icc 13) really can't screw things up. */ fjptrA = (jnrlistA>=0) ? dvda+jnrA : scratch; @@ -903,6 +904,7 @@ nb_kernel_ElecGB_VdwCSTab_GeomP1P1_F_avx_128_fma_single FF = _mm_macc_ps(_mm_macc_ps(twogbeps,H,G),gbeps,Fp); fgb = _mm_mul_ps(gbqqfactor,_mm_mul_ps(FF,gbscale)); dvdatmp = _mm_mul_ps(minushalf,_mm_macc_ps(fgb,r00,vgb)); + dvdatmp = _mm_andnot_ps(dummy_mask,dvdatmp); dvdasum = _mm_add_ps(dvdasum,dvdatmp); /* The pointers to scratch make sure that this code with compilers that take gmx_restrict seriously (e.g. icc 13) really can't screw things up. */ fjptrA = (jnrlistA>=0) ? 
dvda+jnrA : scratch; diff --git a/src/gmxlib/nonbonded/nb_kernel_avx_128_fma_single/nb_kernel_ElecGB_VdwLJ_GeomP1P1_avx_128_fma_single.c b/src/gmxlib/nonbonded/nb_kernel_avx_128_fma_single/nb_kernel_ElecGB_VdwLJ_GeomP1P1_avx_128_fma_single.c index 1d116d13c7..20a1dc49fb 100644 --- a/src/gmxlib/nonbonded/nb_kernel_avx_128_fma_single/nb_kernel_ElecGB_VdwLJ_GeomP1P1_avx_128_fma_single.c +++ b/src/gmxlib/nonbonded/nb_kernel_avx_128_fma_single/nb_kernel_ElecGB_VdwLJ_GeomP1P1_avx_128_fma_single.c @@ -397,6 +397,7 @@ nb_kernel_ElecGB_VdwLJ_GeomP1P1_VF_avx_128_fma_single FF = _mm_macc_ps(_mm_macc_ps(twogbeps,H,G),gbeps,Fp); fgb = _mm_mul_ps(gbqqfactor,_mm_mul_ps(FF,gbscale)); dvdatmp = _mm_mul_ps(minushalf,_mm_macc_ps(fgb,r00,vgb)); + dvdatmp = _mm_andnot_ps(dummy_mask,dvdatmp); dvdasum = _mm_add_ps(dvdasum,dvdatmp); /* The pointers to scratch make sure that this code with compilers that take gmx_restrict seriously (e.g. icc 13) really can't screw things up. */ fjptrA = (jnrlistA>=0) ? dvda+jnrA : scratch; @@ -806,6 +807,7 @@ nb_kernel_ElecGB_VdwLJ_GeomP1P1_F_avx_128_fma_single FF = _mm_macc_ps(_mm_macc_ps(twogbeps,H,G),gbeps,Fp); fgb = _mm_mul_ps(gbqqfactor,_mm_mul_ps(FF,gbscale)); dvdatmp = _mm_mul_ps(minushalf,_mm_macc_ps(fgb,r00,vgb)); + dvdatmp = _mm_andnot_ps(dummy_mask,dvdatmp); dvdasum = _mm_add_ps(dvdasum,dvdatmp); /* The pointers to scratch make sure that this code with compilers that take gmx_restrict seriously (e.g. icc 13) really can't screw things up. */ fjptrA = (jnrlistA>=0) ? 
dvda+jnrA : scratch; diff --git a/src/gmxlib/nonbonded/nb_kernel_avx_128_fma_single/nb_kernel_ElecGB_VdwNone_GeomP1P1_avx_128_fma_single.c b/src/gmxlib/nonbonded/nb_kernel_avx_128_fma_single/nb_kernel_ElecGB_VdwNone_GeomP1P1_avx_128_fma_single.c index 7a1739a7d5..29a57fce77 100644 --- a/src/gmxlib/nonbonded/nb_kernel_avx_128_fma_single/nb_kernel_ElecGB_VdwNone_GeomP1P1_avx_128_fma_single.c +++ b/src/gmxlib/nonbonded/nb_kernel_avx_128_fma_single/nb_kernel_ElecGB_VdwNone_GeomP1P1_avx_128_fma_single.c @@ -355,6 +355,7 @@ nb_kernel_ElecGB_VdwNone_GeomP1P1_VF_avx_128_fma_single FF = _mm_macc_ps(_mm_macc_ps(twogbeps,H,G),gbeps,Fp); fgb = _mm_mul_ps(gbqqfactor,_mm_mul_ps(FF,gbscale)); dvdatmp = _mm_mul_ps(minushalf,_mm_macc_ps(fgb,r00,vgb)); + dvdatmp = _mm_andnot_ps(dummy_mask,dvdatmp); dvdasum = _mm_add_ps(dvdasum,dvdatmp); /* The pointers to scratch make sure that this code with compilers that take gmx_restrict seriously (e.g. icc 13) really can't screw things up. */ fjptrA = (jnrlistA>=0) ? dvda+jnrA : scratch; @@ -716,6 +717,7 @@ nb_kernel_ElecGB_VdwNone_GeomP1P1_F_avx_128_fma_single FF = _mm_macc_ps(_mm_macc_ps(twogbeps,H,G),gbeps,Fp); fgb = _mm_mul_ps(gbqqfactor,_mm_mul_ps(FF,gbscale)); dvdatmp = _mm_mul_ps(minushalf,_mm_macc_ps(fgb,r00,vgb)); + dvdatmp = _mm_andnot_ps(dummy_mask,dvdatmp); dvdasum = _mm_add_ps(dvdasum,dvdatmp); /* The pointers to scratch make sure that this code with compilers that take gmx_restrict seriously (e.g. icc 13) really can't screw things up. */ fjptrA = (jnrlistA>=0) ? 
dvda+jnrA : scratch; diff --git a/src/gmxlib/nonbonded/nb_kernel_avx_128_fma_single/nb_kernel_template_avx_128_fma_single.pre b/src/gmxlib/nonbonded/nb_kernel_avx_128_fma_single/nb_kernel_template_avx_128_fma_single.pre index 9b422fc672..c818d5b368 100644 --- a/src/gmxlib/nonbonded/nb_kernel_avx_128_fma_single/nb_kernel_template_avx_128_fma_single.pre +++ b/src/gmxlib/nonbonded/nb_kernel_avx_128_fma_single/nb_kernel_template_avx_128_fma_single.pre @@ -630,6 +630,9 @@ void FF = _mm_macc_ps(_mm_macc_ps(twogbeps,H,G),gbeps,Fp); fgb = _mm_mul_ps(gbqqfactor,_mm_mul_ps(FF,gbscale)); dvdatmp = _mm_mul_ps(minushalf,_mm_macc_ps(fgb,r{I}{J},vgb)); + /* #if ROUND == 'Epilogue' */ + dvdatmp = _mm_andnot_ps(dummy_mask,dvdatmp); + /* #endif */ dvdasum = _mm_add_ps(dvdasum,dvdatmp); /* #if ROUND == 'Loop' */ fjptrA = dvda+jnrA; diff --git a/src/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecGB_VdwCSTab_GeomP1P1_avx_256_double.c b/src/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecGB_VdwCSTab_GeomP1P1_avx_256_double.c index 93dddf903f..842ae1ed36 100644 --- a/src/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecGB_VdwCSTab_GeomP1P1_avx_256_double.c +++ b/src/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecGB_VdwCSTab_GeomP1P1_avx_256_double.c @@ -429,6 +429,7 @@ nb_kernel_ElecGB_VdwCSTab_GeomP1P1_VF_avx_256_double FF = _mm256_add_pd(Fp,_mm256_mul_pd(gbeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps)))); fgb = _mm256_mul_pd(gbqqfactor,_mm256_mul_pd(FF,gbscale)); dvdatmp = _mm256_mul_pd(minushalf,_mm256_add_pd(vgb,_mm256_mul_pd(fgb,r00))); + dvdatmp = _mm256_andnot_ps(dummy_mask,dvdatmp); dvdasum = _mm256_add_pd(dvdasum,dvdatmp); /* The pointers to scratch make sure that this code with compilers that take gmx_restrict seriously (e.g. icc 13) really can't screw things up. */ fjptrA = (jnrlistA>=0) ? 
dvda+jnrA : scratch; @@ -892,6 +893,7 @@ nb_kernel_ElecGB_VdwCSTab_GeomP1P1_F_avx_256_double FF = _mm256_add_pd(Fp,_mm256_mul_pd(gbeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps)))); fgb = _mm256_mul_pd(gbqqfactor,_mm256_mul_pd(FF,gbscale)); dvdatmp = _mm256_mul_pd(minushalf,_mm256_add_pd(vgb,_mm256_mul_pd(fgb,r00))); + dvdatmp = _mm256_andnot_ps(dummy_mask,dvdatmp); dvdasum = _mm256_add_pd(dvdasum,dvdatmp); /* The pointers to scratch make sure that this code with compilers that take gmx_restrict seriously (e.g. icc 13) really can't screw things up. */ fjptrA = (jnrlistA>=0) ? dvda+jnrA : scratch; diff --git a/src/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecGB_VdwLJ_GeomP1P1_avx_256_double.c b/src/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecGB_VdwLJ_GeomP1P1_avx_256_double.c index ceac3dcc4f..eb117a2e90 100644 --- a/src/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecGB_VdwLJ_GeomP1P1_avx_256_double.c +++ b/src/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecGB_VdwLJ_GeomP1P1_avx_256_double.c @@ -397,6 +397,7 @@ nb_kernel_ElecGB_VdwLJ_GeomP1P1_VF_avx_256_double FF = _mm256_add_pd(Fp,_mm256_mul_pd(gbeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps)))); fgb = _mm256_mul_pd(gbqqfactor,_mm256_mul_pd(FF,gbscale)); dvdatmp = _mm256_mul_pd(minushalf,_mm256_add_pd(vgb,_mm256_mul_pd(fgb,r00))); + dvdatmp = _mm256_andnot_ps(dummy_mask,dvdatmp); dvdasum = _mm256_add_pd(dvdasum,dvdatmp); /* The pointers to scratch make sure that this code with compilers that take gmx_restrict seriously (e.g. icc 13) really can't screw things up. */ fjptrA = (jnrlistA>=0) ? 
dvda+jnrA : scratch; @@ -809,6 +810,7 @@ nb_kernel_ElecGB_VdwLJ_GeomP1P1_F_avx_256_double FF = _mm256_add_pd(Fp,_mm256_mul_pd(gbeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps)))); fgb = _mm256_mul_pd(gbqqfactor,_mm256_mul_pd(FF,gbscale)); dvdatmp = _mm256_mul_pd(minushalf,_mm256_add_pd(vgb,_mm256_mul_pd(fgb,r00))); + dvdatmp = _mm256_andnot_ps(dummy_mask,dvdatmp); dvdasum = _mm256_add_pd(dvdasum,dvdatmp); /* The pointers to scratch make sure that this code with compilers that take gmx_restrict seriously (e.g. icc 13) really can't screw things up. */ fjptrA = (jnrlistA>=0) ? dvda+jnrA : scratch; diff --git a/src/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecGB_VdwNone_GeomP1P1_avx_256_double.c b/src/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecGB_VdwNone_GeomP1P1_avx_256_double.c index d240932c4e..907f9efc89 100644 --- a/src/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecGB_VdwNone_GeomP1P1_avx_256_double.c +++ b/src/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecGB_VdwNone_GeomP1P1_avx_256_double.c @@ -355,6 +355,7 @@ nb_kernel_ElecGB_VdwNone_GeomP1P1_VF_avx_256_double FF = _mm256_add_pd(Fp,_mm256_mul_pd(gbeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps)))); fgb = _mm256_mul_pd(gbqqfactor,_mm256_mul_pd(FF,gbscale)); dvdatmp = _mm256_mul_pd(minushalf,_mm256_add_pd(vgb,_mm256_mul_pd(fgb,r00))); + dvdatmp = _mm256_andnot_ps(dummy_mask,dvdatmp); dvdasum = _mm256_add_pd(dvdasum,dvdatmp); /* The pointers to scratch make sure that this code with compilers that take gmx_restrict seriously (e.g. icc 13) really can't screw things up. */ fjptrA = (jnrlistA>=0) ? 
dvda+jnrA : scratch; @@ -719,6 +720,7 @@ nb_kernel_ElecGB_VdwNone_GeomP1P1_F_avx_256_double FF = _mm256_add_pd(Fp,_mm256_mul_pd(gbeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps)))); fgb = _mm256_mul_pd(gbqqfactor,_mm256_mul_pd(FF,gbscale)); dvdatmp = _mm256_mul_pd(minushalf,_mm256_add_pd(vgb,_mm256_mul_pd(fgb,r00))); + dvdatmp = _mm256_andnot_ps(dummy_mask,dvdatmp); dvdasum = _mm256_add_pd(dvdasum,dvdatmp); /* The pointers to scratch make sure that this code with compilers that take gmx_restrict seriously (e.g. icc 13) really can't screw things up. */ fjptrA = (jnrlistA>=0) ? dvda+jnrA : scratch; diff --git a/src/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_template_avx_256_double.pre b/src/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_template_avx_256_double.pre index 81f76e0f16..2d1922cfac 100644 --- a/src/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_template_avx_256_double.pre +++ b/src/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_template_avx_256_double.pre @@ -628,6 +628,9 @@ void FF = _mm256_add_pd(Fp,_mm256_mul_pd(gbeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps)))); fgb = _mm256_mul_pd(gbqqfactor,_mm256_mul_pd(FF,gbscale)); dvdatmp = _mm256_mul_pd(minushalf,_mm256_add_pd(vgb,_mm256_mul_pd(fgb,r{I}{J}))); + /* #if ROUND == 'Epilogue' */ + dvdatmp = _mm256_andnot_ps(dummy_mask,dvdatmp); + /* #endif */ dvdasum = _mm256_add_pd(dvdasum,dvdatmp); /* #if ROUND == 'Loop' */ fjptrA = dvda+jnrA; diff --git a/src/gmxlib/nonbonded/nb_kernel_avx_256_single/nb_kernel_ElecGB_VdwCSTab_GeomP1P1_avx_256_single.c b/src/gmxlib/nonbonded/nb_kernel_avx_256_single/nb_kernel_ElecGB_VdwCSTab_GeomP1P1_avx_256_single.c index 828620dde2..1ac827fc3e 100644 --- a/src/gmxlib/nonbonded/nb_kernel_avx_256_single/nb_kernel_ElecGB_VdwCSTab_GeomP1P1_avx_256_single.c +++ b/src/gmxlib/nonbonded/nb_kernel_avx_256_single/nb_kernel_ElecGB_VdwCSTab_GeomP1P1_avx_256_single.c @@ -522,6 +522,7 @@ nb_kernel_ElecGB_VdwCSTab_GeomP1P1_VF_avx_256_single FF = 
_mm256_add_ps(Fp,_mm256_mul_ps(gbeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps)))); fgb = _mm256_mul_ps(gbqqfactor,_mm256_mul_ps(FF,gbscale)); dvdatmp = _mm256_mul_ps(minushalf,_mm256_add_ps(vgb,_mm256_mul_ps(fgb,r00))); + dvdatmp = _mm256_andnot_ps(dummy_mask,dvdatmp); dvdasum = _mm256_add_ps(dvdasum,dvdatmp); /* The pointers to scratch make sure that this code with compilers that take gmx_restrict seriously (e.g. icc 13) really can't screw things up. */ fjptrA = (jnrlistA>=0) ? dvda+jnrA : scratch; @@ -1095,6 +1096,7 @@ nb_kernel_ElecGB_VdwCSTab_GeomP1P1_F_avx_256_single FF = _mm256_add_ps(Fp,_mm256_mul_ps(gbeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps)))); fgb = _mm256_mul_ps(gbqqfactor,_mm256_mul_ps(FF,gbscale)); dvdatmp = _mm256_mul_ps(minushalf,_mm256_add_ps(vgb,_mm256_mul_ps(fgb,r00))); + dvdatmp = _mm256_andnot_ps(dummy_mask,dvdatmp); dvdasum = _mm256_add_ps(dvdasum,dvdatmp); /* The pointers to scratch make sure that this code with compilers that take gmx_restrict seriously (e.g. icc 13) really can't screw things up. */ fjptrA = (jnrlistA>=0) ? 
dvda+jnrA : scratch; diff --git a/src/gmxlib/nonbonded/nb_kernel_avx_256_single/nb_kernel_ElecGB_VdwLJ_GeomP1P1_avx_256_single.c b/src/gmxlib/nonbonded/nb_kernel_avx_256_single/nb_kernel_ElecGB_VdwLJ_GeomP1P1_avx_256_single.c index 963e6755a3..858595f791 100644 --- a/src/gmxlib/nonbonded/nb_kernel_avx_256_single/nb_kernel_ElecGB_VdwLJ_GeomP1P1_avx_256_single.c +++ b/src/gmxlib/nonbonded/nb_kernel_avx_256_single/nb_kernel_ElecGB_VdwLJ_GeomP1P1_avx_256_single.c @@ -473,6 +473,7 @@ nb_kernel_ElecGB_VdwLJ_GeomP1P1_VF_avx_256_single FF = _mm256_add_ps(Fp,_mm256_mul_ps(gbeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps)))); fgb = _mm256_mul_ps(gbqqfactor,_mm256_mul_ps(FF,gbscale)); dvdatmp = _mm256_mul_ps(minushalf,_mm256_add_ps(vgb,_mm256_mul_ps(fgb,r00))); + dvdatmp = _mm256_andnot_ps(dummy_mask,dvdatmp); dvdasum = _mm256_add_ps(dvdasum,dvdatmp); /* The pointers to scratch make sure that this code with compilers that take gmx_restrict seriously (e.g. icc 13) really can't screw things up. */ fjptrA = (jnrlistA>=0) ? dvda+jnrA : scratch; @@ -969,6 +970,7 @@ nb_kernel_ElecGB_VdwLJ_GeomP1P1_F_avx_256_single FF = _mm256_add_ps(Fp,_mm256_mul_ps(gbeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps)))); fgb = _mm256_mul_ps(gbqqfactor,_mm256_mul_ps(FF,gbscale)); dvdatmp = _mm256_mul_ps(minushalf,_mm256_add_ps(vgb,_mm256_mul_ps(fgb,r00))); + dvdatmp = _mm256_andnot_ps(dummy_mask,dvdatmp); dvdasum = _mm256_add_ps(dvdasum,dvdatmp); /* The pointers to scratch make sure that this code with compilers that take gmx_restrict seriously (e.g. icc 13) really can't screw things up. */ fjptrA = (jnrlistA>=0) ? 
dvda+jnrA : scratch; diff --git a/src/gmxlib/nonbonded/nb_kernel_avx_256_single/nb_kernel_ElecGB_VdwNone_GeomP1P1_avx_256_single.c b/src/gmxlib/nonbonded/nb_kernel_avx_256_single/nb_kernel_ElecGB_VdwNone_GeomP1P1_avx_256_single.c index c78677a444..7a7829a65f 100644 --- a/src/gmxlib/nonbonded/nb_kernel_avx_256_single/nb_kernel_ElecGB_VdwNone_GeomP1P1_avx_256_single.c +++ b/src/gmxlib/nonbonded/nb_kernel_avx_256_single/nb_kernel_ElecGB_VdwNone_GeomP1P1_avx_256_single.c @@ -415,6 +415,7 @@ nb_kernel_ElecGB_VdwNone_GeomP1P1_VF_avx_256_single FF = _mm256_add_ps(Fp,_mm256_mul_ps(gbeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps)))); fgb = _mm256_mul_ps(gbqqfactor,_mm256_mul_ps(FF,gbscale)); dvdatmp = _mm256_mul_ps(minushalf,_mm256_add_ps(vgb,_mm256_mul_ps(fgb,r00))); + dvdatmp = _mm256_andnot_ps(dummy_mask,dvdatmp); dvdasum = _mm256_add_ps(dvdasum,dvdatmp); /* The pointers to scratch make sure that this code with compilers that take gmx_restrict seriously (e.g. icc 13) really can't screw things up. */ fjptrA = (jnrlistA>=0) ? dvda+jnrA : scratch; @@ -847,6 +848,7 @@ nb_kernel_ElecGB_VdwNone_GeomP1P1_F_avx_256_single FF = _mm256_add_ps(Fp,_mm256_mul_ps(gbeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps)))); fgb = _mm256_mul_ps(gbqqfactor,_mm256_mul_ps(FF,gbscale)); dvdatmp = _mm256_mul_ps(minushalf,_mm256_add_ps(vgb,_mm256_mul_ps(fgb,r00))); + dvdatmp = _mm256_andnot_ps(dummy_mask,dvdatmp); dvdasum = _mm256_add_ps(dvdasum,dvdatmp); /* The pointers to scratch make sure that this code with compilers that take gmx_restrict seriously (e.g. icc 13) really can't screw things up. */ fjptrA = (jnrlistA>=0) ? 
dvda+jnrA : scratch; diff --git a/src/gmxlib/nonbonded/nb_kernel_avx_256_single/nb_kernel_template_avx_256_single.pre b/src/gmxlib/nonbonded/nb_kernel_avx_256_single/nb_kernel_template_avx_256_single.pre index 43b20cf23a..5b320ce4d0 100644 --- a/src/gmxlib/nonbonded/nb_kernel_avx_256_single/nb_kernel_template_avx_256_single.pre +++ b/src/gmxlib/nonbonded/nb_kernel_avx_256_single/nb_kernel_template_avx_256_single.pre @@ -683,6 +683,9 @@ void FF = _mm256_add_ps(Fp,_mm256_mul_ps(gbeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps)))); fgb = _mm256_mul_ps(gbqqfactor,_mm256_mul_ps(FF,gbscale)); dvdatmp = _mm256_mul_ps(minushalf,_mm256_add_ps(vgb,_mm256_mul_ps(fgb,r{I}{J}))); + /* #if ROUND == 'Epilogue' */ + dvdatmp = _mm256_andnot_ps(dummy_mask,dvdatmp); + /* #endif */ dvdasum = _mm256_add_ps(dvdasum,dvdatmp); /* #if ROUND == 'Loop' */ fjptrA = dvda+jnrA; diff --git a/src/gmxlib/nonbonded/nb_kernel_c/nb_kernel_ElecGB_VdwLJ_GeomP1P1_c.c b/src/gmxlib/nonbonded/nb_kernel_c/nb_kernel_ElecGB_VdwLJ_GeomP1P1_c.c index 553824cb7e..5d41717b32 100644 --- a/src/gmxlib/nonbonded/nb_kernel_c/nb_kernel_ElecGB_VdwLJ_GeomP1P1_c.c +++ b/src/gmxlib/nonbonded/nb_kernel_c/nb_kernel_ElecGB_VdwLJ_GeomP1P1_c.c @@ -149,7 +149,7 @@ nb_kernel_ElecGB_VdwLJ_GeomP1P1_VF_c vgbsum = 0.0; vvdwsum = 0.0; dvdasum = 0.0; - + printf("inr=%d\n",inr); /* Start inner kernel loop */ for(jidx=j_index_start; jidx=0) ? dvda+jnrA : scratch; @@ -879,6 +880,7 @@ nb_kernel_ElecGB_VdwCSTab_GeomP1P1_F_sse2_single FF = _mm_add_ps(Fp,_mm_mul_ps(gbeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps)))); fgb = _mm_mul_ps(gbqqfactor,_mm_mul_ps(FF,gbscale)); dvdatmp = _mm_mul_ps(minushalf,_mm_add_ps(vgb,_mm_mul_ps(fgb,r00))); + dvdatmp = _mm_andnot_ps(dummy_mask,dvdatmp); dvdasum = _mm_add_ps(dvdasum,dvdatmp); /* The pointers to scratch make sure that this code with compilers that take gmx_restrict seriously (e.g. icc 13) really can't screw things up. */ fjptrA = (jnrlistA>=0) ? 
dvda+jnrA : scratch; diff --git a/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecGB_VdwLJ_GeomP1P1_sse2_single.c b/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecGB_VdwLJ_GeomP1P1_sse2_single.c index 1f559cea45..fa59ea9b03 100644 --- a/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecGB_VdwLJ_GeomP1P1_sse2_single.c +++ b/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecGB_VdwLJ_GeomP1P1_sse2_single.c @@ -391,6 +391,7 @@ nb_kernel_ElecGB_VdwLJ_GeomP1P1_VF_sse2_single FF = _mm_add_ps(Fp,_mm_mul_ps(gbeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps)))); fgb = _mm_mul_ps(gbqqfactor,_mm_mul_ps(FF,gbscale)); dvdatmp = _mm_mul_ps(minushalf,_mm_add_ps(vgb,_mm_mul_ps(fgb,r00))); + dvdatmp = _mm_andnot_ps(dummy_mask,dvdatmp); dvdasum = _mm_add_ps(dvdasum,dvdatmp); /* The pointers to scratch make sure that this code with compilers that take gmx_restrict seriously (e.g. icc 13) really can't screw things up. */ fjptrA = (jnrlistA>=0) ? dvda+jnrA : scratch; @@ -796,6 +797,7 @@ nb_kernel_ElecGB_VdwLJ_GeomP1P1_F_sse2_single FF = _mm_add_ps(Fp,_mm_mul_ps(gbeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps)))); fgb = _mm_mul_ps(gbqqfactor,_mm_mul_ps(FF,gbscale)); dvdatmp = _mm_mul_ps(minushalf,_mm_add_ps(vgb,_mm_mul_ps(fgb,r00))); + dvdatmp = _mm_andnot_ps(dummy_mask,dvdatmp); dvdasum = _mm_add_ps(dvdasum,dvdatmp); /* The pointers to scratch make sure that this code with compilers that take gmx_restrict seriously (e.g. icc 13) really can't screw things up. */ fjptrA = (jnrlistA>=0) ? 
dvda+jnrA : scratch; diff --git a/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecGB_VdwNone_GeomP1P1_sse2_single.c b/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecGB_VdwNone_GeomP1P1_sse2_single.c index 2ba293f393..76118bc503 100644 --- a/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecGB_VdwNone_GeomP1P1_sse2_single.c +++ b/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecGB_VdwNone_GeomP1P1_sse2_single.c @@ -349,6 +349,7 @@ nb_kernel_ElecGB_VdwNone_GeomP1P1_VF_sse2_single FF = _mm_add_ps(Fp,_mm_mul_ps(gbeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps)))); fgb = _mm_mul_ps(gbqqfactor,_mm_mul_ps(FF,gbscale)); dvdatmp = _mm_mul_ps(minushalf,_mm_add_ps(vgb,_mm_mul_ps(fgb,r00))); + dvdatmp = _mm_andnot_ps(dummy_mask,dvdatmp); dvdasum = _mm_add_ps(dvdasum,dvdatmp); /* The pointers to scratch make sure that this code with compilers that take gmx_restrict seriously (e.g. icc 13) really can't screw things up. */ fjptrA = (jnrlistA>=0) ? dvda+jnrA : scratch; @@ -706,6 +707,7 @@ nb_kernel_ElecGB_VdwNone_GeomP1P1_F_sse2_single FF = _mm_add_ps(Fp,_mm_mul_ps(gbeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps)))); fgb = _mm_mul_ps(gbqqfactor,_mm_mul_ps(FF,gbscale)); dvdatmp = _mm_mul_ps(minushalf,_mm_add_ps(vgb,_mm_mul_ps(fgb,r00))); + dvdatmp = _mm_andnot_ps(dummy_mask,dvdatmp); dvdasum = _mm_add_ps(dvdasum,dvdatmp); /* The pointers to scratch make sure that this code with compilers that take gmx_restrict seriously (e.g. icc 13) really can't screw things up. */ fjptrA = (jnrlistA>=0) ? 
dvda+jnrA : scratch; diff --git a/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_template_sse2_single.pre b/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_template_sse2_single.pre index 572b896d98..8e5d4146e6 100644 --- a/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_template_sse2_single.pre +++ b/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_template_sse2_single.pre @@ -617,6 +617,9 @@ void FF = _mm_add_ps(Fp,_mm_mul_ps(gbeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps)))); fgb = _mm_mul_ps(gbqqfactor,_mm_mul_ps(FF,gbscale)); dvdatmp = _mm_mul_ps(minushalf,_mm_add_ps(vgb,_mm_mul_ps(fgb,r{I}{J}))); + /* #if ROUND == 'Epilogue' */ + dvdatmp = _mm_andnot_ps(dummy_mask,dvdatmp); + /* #endif */ dvdasum = _mm_add_ps(dvdasum,dvdatmp); /* #if ROUND == 'Loop' */ fjptrA = dvda+jnrA; diff --git a/src/gmxlib/nonbonded/nb_kernel_sse4_1_double/nb_kernel_ElecGB_VdwCSTab_GeomP1P1_sse4_1_double.c b/src/gmxlib/nonbonded/nb_kernel_sse4_1_double/nb_kernel_ElecGB_VdwCSTab_GeomP1P1_sse4_1_double.c index ac3ee765df..bcc54baf06 100644 --- a/src/gmxlib/nonbonded/nb_kernel_sse4_1_double/nb_kernel_ElecGB_VdwCSTab_GeomP1P1_sse4_1_double.c +++ b/src/gmxlib/nonbonded/nb_kernel_sse4_1_double/nb_kernel_ElecGB_VdwCSTab_GeomP1P1_sse4_1_double.c @@ -370,6 +370,7 @@ nb_kernel_ElecGB_VdwCSTab_GeomP1P1_VF_sse4_1_double FF = _mm_add_pd(Fp,_mm_mul_pd(gbeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps)))); fgb = _mm_mul_pd(gbqqfactor,_mm_mul_pd(FF,gbscale)); dvdatmp = _mm_mul_pd(minushalf,_mm_add_pd(vgb,_mm_mul_pd(fgb,r00))); + dvdatmp = _mm_unpacklo_pd(dvdatmp,_mm_setzero_pd()); dvdasum = _mm_add_pd(dvdasum,dvdatmp); gmx_mm_increment_1real_pd(dvda+jnrA,_mm_mul_pd(dvdatmp,_mm_mul_pd(isaj0,isaj0))); velec = _mm_mul_pd(qq00,rinv00); @@ -766,6 +767,7 @@ nb_kernel_ElecGB_VdwCSTab_GeomP1P1_F_sse4_1_double FF = _mm_add_pd(Fp,_mm_mul_pd(gbeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps)))); fgb = _mm_mul_pd(gbqqfactor,_mm_mul_pd(FF,gbscale)); dvdatmp = _mm_mul_pd(minushalf,_mm_add_pd(vgb,_mm_mul_pd(fgb,r00))); + 
dvdatmp = _mm_unpacklo_pd(dvdatmp,_mm_setzero_pd()); dvdasum = _mm_add_pd(dvdasum,dvdatmp); gmx_mm_increment_1real_pd(dvda+jnrA,_mm_mul_pd(dvdatmp,_mm_mul_pd(isaj0,isaj0))); velec = _mm_mul_pd(qq00,rinv00); diff --git a/src/gmxlib/nonbonded/nb_kernel_sse4_1_double/nb_kernel_ElecGB_VdwLJ_GeomP1P1_sse4_1_double.c b/src/gmxlib/nonbonded/nb_kernel_sse4_1_double/nb_kernel_ElecGB_VdwLJ_GeomP1P1_sse4_1_double.c index 48251a8ea4..75a8a9a8d2 100644 --- a/src/gmxlib/nonbonded/nb_kernel_sse4_1_double/nb_kernel_ElecGB_VdwLJ_GeomP1P1_sse4_1_double.c +++ b/src/gmxlib/nonbonded/nb_kernel_sse4_1_double/nb_kernel_ElecGB_VdwLJ_GeomP1P1_sse4_1_double.c @@ -336,6 +336,7 @@ nb_kernel_ElecGB_VdwLJ_GeomP1P1_VF_sse4_1_double FF = _mm_add_pd(Fp,_mm_mul_pd(gbeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps)))); fgb = _mm_mul_pd(gbqqfactor,_mm_mul_pd(FF,gbscale)); dvdatmp = _mm_mul_pd(minushalf,_mm_add_pd(vgb,_mm_mul_pd(fgb,r00))); + dvdatmp = _mm_unpacklo_pd(dvdatmp,_mm_setzero_pd()); dvdasum = _mm_add_pd(dvdasum,dvdatmp); gmx_mm_increment_1real_pd(dvda+jnrA,_mm_mul_pd(dvdatmp,_mm_mul_pd(isaj0,isaj0))); velec = _mm_mul_pd(qq00,rinv00); @@ -677,6 +678,7 @@ nb_kernel_ElecGB_VdwLJ_GeomP1P1_F_sse4_1_double FF = _mm_add_pd(Fp,_mm_mul_pd(gbeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps)))); fgb = _mm_mul_pd(gbqqfactor,_mm_mul_pd(FF,gbscale)); dvdatmp = _mm_mul_pd(minushalf,_mm_add_pd(vgb,_mm_mul_pd(fgb,r00))); + dvdatmp = _mm_unpacklo_pd(dvdatmp,_mm_setzero_pd()); dvdasum = _mm_add_pd(dvdasum,dvdatmp); gmx_mm_increment_1real_pd(dvda+jnrA,_mm_mul_pd(dvdatmp,_mm_mul_pd(isaj0,isaj0))); velec = _mm_mul_pd(qq00,rinv00); diff --git a/src/gmxlib/nonbonded/nb_kernel_sse4_1_double/nb_kernel_ElecGB_VdwNone_GeomP1P1_sse4_1_double.c b/src/gmxlib/nonbonded/nb_kernel_sse4_1_double/nb_kernel_ElecGB_VdwNone_GeomP1P1_sse4_1_double.c index 932e7669c4..988834b9fb 100644 --- a/src/gmxlib/nonbonded/nb_kernel_sse4_1_double/nb_kernel_ElecGB_VdwNone_GeomP1P1_sse4_1_double.c +++ 
b/src/gmxlib/nonbonded/nb_kernel_sse4_1_double/nb_kernel_ElecGB_VdwNone_GeomP1P1_sse4_1_double.c @@ -306,6 +306,7 @@ nb_kernel_ElecGB_VdwNone_GeomP1P1_VF_sse4_1_double FF = _mm_add_pd(Fp,_mm_mul_pd(gbeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps)))); fgb = _mm_mul_pd(gbqqfactor,_mm_mul_pd(FF,gbscale)); dvdatmp = _mm_mul_pd(minushalf,_mm_add_pd(vgb,_mm_mul_pd(fgb,r00))); + dvdatmp = _mm_unpacklo_pd(dvdatmp,_mm_setzero_pd()); dvdasum = _mm_add_pd(dvdasum,dvdatmp); gmx_mm_increment_1real_pd(dvda+jnrA,_mm_mul_pd(dvdatmp,_mm_mul_pd(isaj0,isaj0))); velec = _mm_mul_pd(qq00,rinv00); @@ -611,6 +612,7 @@ nb_kernel_ElecGB_VdwNone_GeomP1P1_F_sse4_1_double FF = _mm_add_pd(Fp,_mm_mul_pd(gbeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps)))); fgb = _mm_mul_pd(gbqqfactor,_mm_mul_pd(FF,gbscale)); dvdatmp = _mm_mul_pd(minushalf,_mm_add_pd(vgb,_mm_mul_pd(fgb,r00))); + dvdatmp = _mm_unpacklo_pd(dvdatmp,_mm_setzero_pd()); dvdasum = _mm_add_pd(dvdasum,dvdatmp); gmx_mm_increment_1real_pd(dvda+jnrA,_mm_mul_pd(dvdatmp,_mm_mul_pd(isaj0,isaj0))); velec = _mm_mul_pd(qq00,rinv00); diff --git a/src/gmxlib/nonbonded/nb_kernel_sse4_1_double/nb_kernel_template_sse4_1_double.pre b/src/gmxlib/nonbonded/nb_kernel_sse4_1_double/nb_kernel_template_sse4_1_double.pre index 3548d76cb6..9c84a4233f 100644 --- a/src/gmxlib/nonbonded/nb_kernel_sse4_1_double/nb_kernel_template_sse4_1_double.pre +++ b/src/gmxlib/nonbonded/nb_kernel_sse4_1_double/nb_kernel_template_sse4_1_double.pre @@ -617,6 +617,9 @@ void FF = _mm_add_pd(Fp,_mm_mul_pd(gbeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps)))); fgb = _mm_mul_pd(gbqqfactor,_mm_mul_pd(FF,gbscale)); dvdatmp = _mm_mul_pd(minushalf,_mm_add_pd(vgb,_mm_mul_pd(fgb,r{I}{J}))); + /* #if ROUND == 'Epilogue' */ + dvdatmp = _mm_unpacklo_pd(dvdatmp,_mm_setzero_pd()); + /* #endif */ dvdasum = _mm_add_pd(dvdasum,dvdatmp); /* #if ROUND == 'Loop' */ gmx_mm_increment_2real_swizzle_pd(dvda+jnrA,dvda+jnrB,_mm_mul_pd(dvdatmp,_mm_mul_pd(isaj{J},isaj{J}))); diff --git 
a/src/gmxlib/nonbonded/nb_kernel_sse4_1_single/nb_kernel_ElecGB_VdwCSTab_GeomP1P1_sse4_1_single.c b/src/gmxlib/nonbonded/nb_kernel_sse4_1_single/nb_kernel_ElecGB_VdwCSTab_GeomP1P1_sse4_1_single.c index ab5c1b5ab2..63810cd6a8 100644 --- a/src/gmxlib/nonbonded/nb_kernel_sse4_1_single/nb_kernel_ElecGB_VdwCSTab_GeomP1P1_sse4_1_single.c +++ b/src/gmxlib/nonbonded/nb_kernel_sse4_1_single/nb_kernel_ElecGB_VdwCSTab_GeomP1P1_sse4_1_single.c @@ -421,6 +421,7 @@ nb_kernel_ElecGB_VdwCSTab_GeomP1P1_VF_sse4_1_single FF = _mm_add_ps(Fp,_mm_mul_ps(gbeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps)))); fgb = _mm_mul_ps(gbqqfactor,_mm_mul_ps(FF,gbscale)); dvdatmp = _mm_mul_ps(minushalf,_mm_add_ps(vgb,_mm_mul_ps(fgb,r00))); + dvdatmp = _mm_andnot_ps(dummy_mask,dvdatmp); dvdasum = _mm_add_ps(dvdasum,dvdatmp); /* The pointers to scratch make sure that this code with compilers that take gmx_restrict seriously (e.g. icc 13) really can't screw things up. */ fjptrA = (jnrlistA>=0) ? dvda+jnrA : scratch; @@ -875,6 +876,7 @@ nb_kernel_ElecGB_VdwCSTab_GeomP1P1_F_sse4_1_single FF = _mm_add_ps(Fp,_mm_mul_ps(gbeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps)))); fgb = _mm_mul_ps(gbqqfactor,_mm_mul_ps(FF,gbscale)); dvdatmp = _mm_mul_ps(minushalf,_mm_add_ps(vgb,_mm_mul_ps(fgb,r00))); + dvdatmp = _mm_andnot_ps(dummy_mask,dvdatmp); dvdasum = _mm_add_ps(dvdasum,dvdatmp); /* The pointers to scratch make sure that this code with compilers that take gmx_restrict seriously (e.g. icc 13) really can't screw things up. */ fjptrA = (jnrlistA>=0) ? 
dvda+jnrA : scratch; diff --git a/src/gmxlib/nonbonded/nb_kernel_sse4_1_single/nb_kernel_ElecGB_VdwLJ_GeomP1P1_sse4_1_single.c b/src/gmxlib/nonbonded/nb_kernel_sse4_1_single/nb_kernel_ElecGB_VdwLJ_GeomP1P1_sse4_1_single.c index 8dd0cf706e..32294408bc 100644 --- a/src/gmxlib/nonbonded/nb_kernel_sse4_1_single/nb_kernel_ElecGB_VdwLJ_GeomP1P1_sse4_1_single.c +++ b/src/gmxlib/nonbonded/nb_kernel_sse4_1_single/nb_kernel_ElecGB_VdwLJ_GeomP1P1_sse4_1_single.c @@ -389,6 +389,7 @@ nb_kernel_ElecGB_VdwLJ_GeomP1P1_VF_sse4_1_single FF = _mm_add_ps(Fp,_mm_mul_ps(gbeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps)))); fgb = _mm_mul_ps(gbqqfactor,_mm_mul_ps(FF,gbscale)); dvdatmp = _mm_mul_ps(minushalf,_mm_add_ps(vgb,_mm_mul_ps(fgb,r00))); + dvdatmp = _mm_andnot_ps(dummy_mask,dvdatmp); dvdasum = _mm_add_ps(dvdasum,dvdatmp); /* The pointers to scratch make sure that this code with compilers that take gmx_restrict seriously (e.g. icc 13) really can't screw things up. */ fjptrA = (jnrlistA>=0) ? dvda+jnrA : scratch; @@ -792,6 +793,7 @@ nb_kernel_ElecGB_VdwLJ_GeomP1P1_F_sse4_1_single FF = _mm_add_ps(Fp,_mm_mul_ps(gbeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps)))); fgb = _mm_mul_ps(gbqqfactor,_mm_mul_ps(FF,gbscale)); dvdatmp = _mm_mul_ps(minushalf,_mm_add_ps(vgb,_mm_mul_ps(fgb,r00))); + dvdatmp = _mm_andnot_ps(dummy_mask,dvdatmp); dvdasum = _mm_add_ps(dvdasum,dvdatmp); /* The pointers to scratch make sure that this code with compilers that take gmx_restrict seriously (e.g. icc 13) really can't screw things up. */ fjptrA = (jnrlistA>=0) ? 
dvda+jnrA : scratch; diff --git a/src/gmxlib/nonbonded/nb_kernel_sse4_1_single/nb_kernel_ElecGB_VdwNone_GeomP1P1_sse4_1_single.c b/src/gmxlib/nonbonded/nb_kernel_sse4_1_single/nb_kernel_ElecGB_VdwNone_GeomP1P1_sse4_1_single.c index 5e2a9a6555..9765943d07 100644 --- a/src/gmxlib/nonbonded/nb_kernel_sse4_1_single/nb_kernel_ElecGB_VdwNone_GeomP1P1_sse4_1_single.c +++ b/src/gmxlib/nonbonded/nb_kernel_sse4_1_single/nb_kernel_ElecGB_VdwNone_GeomP1P1_sse4_1_single.c @@ -347,6 +347,7 @@ nb_kernel_ElecGB_VdwNone_GeomP1P1_VF_sse4_1_single FF = _mm_add_ps(Fp,_mm_mul_ps(gbeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps)))); fgb = _mm_mul_ps(gbqqfactor,_mm_mul_ps(FF,gbscale)); dvdatmp = _mm_mul_ps(minushalf,_mm_add_ps(vgb,_mm_mul_ps(fgb,r00))); + dvdatmp = _mm_andnot_ps(dummy_mask,dvdatmp); dvdasum = _mm_add_ps(dvdasum,dvdatmp); /* The pointers to scratch make sure that this code with compilers that take gmx_restrict seriously (e.g. icc 13) really can't screw things up. */ fjptrA = (jnrlistA>=0) ? dvda+jnrA : scratch; @@ -702,6 +703,7 @@ nb_kernel_ElecGB_VdwNone_GeomP1P1_F_sse4_1_single FF = _mm_add_ps(Fp,_mm_mul_ps(gbeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps)))); fgb = _mm_mul_ps(gbqqfactor,_mm_mul_ps(FF,gbscale)); dvdatmp = _mm_mul_ps(minushalf,_mm_add_ps(vgb,_mm_mul_ps(fgb,r00))); + dvdatmp = _mm_andnot_ps(dummy_mask,dvdatmp); dvdasum = _mm_add_ps(dvdasum,dvdatmp); /* The pointers to scratch make sure that this code with compilers that take gmx_restrict seriously (e.g. icc 13) really can't screw things up. */ fjptrA = (jnrlistA>=0) ? 
dvda+jnrA : scratch; diff --git a/src/gmxlib/nonbonded/nb_kernel_sse4_1_single/nb_kernel_template_sse4_1_single.pre b/src/gmxlib/nonbonded/nb_kernel_sse4_1_single/nb_kernel_template_sse4_1_single.pre index 975f9e7e47..c30b3d7e2b 100644 --- a/src/gmxlib/nonbonded/nb_kernel_sse4_1_single/nb_kernel_template_sse4_1_single.pre +++ b/src/gmxlib/nonbonded/nb_kernel_sse4_1_single/nb_kernel_template_sse4_1_single.pre @@ -616,6 +616,9 @@ void FF = _mm_add_ps(Fp,_mm_mul_ps(gbeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps)))); fgb = _mm_mul_ps(gbqqfactor,_mm_mul_ps(FF,gbscale)); dvdatmp = _mm_mul_ps(minushalf,_mm_add_ps(vgb,_mm_mul_ps(fgb,r{I}{J}))); + /* #if ROUND == 'Epilogue' */ + dvdatmp = _mm_andnot_ps(dummy_mask,dvdatmp); + /* #endif */ dvdasum = _mm_add_ps(dvdasum,dvdatmp); /* #if ROUND == 'Loop' */ fjptrA = dvda+jnrA; diff --git a/src/mdlib/forcerec.c b/src/mdlib/forcerec.c index 3b107c9dbd..1b89d9ac39 100644 --- a/src/mdlib/forcerec.c +++ b/src/mdlib/forcerec.c @@ -2161,6 +2161,7 @@ void init_forcerec(FILE *fp, fr->bMolPBC = dd_bonded_molpbc(cr->dd,fr->ePBC); } } + fr->bGB = (ir->implicit_solvent == eisGBSA); fr->rc_scaling = ir->refcoord_scaling; copy_rvec(ir->posres_com,fr->posres_com); @@ -2177,7 +2178,7 @@ void init_forcerec(FILE *fp, switch(fr->eeltype) { case eelCUT: - fr->nbkernel_elec_interaction = GMX_NBKERNEL_ELEC_COULOMB; + fr->nbkernel_elec_interaction = (fr->bGB) ? 
GMX_NBKERNEL_ELEC_GENERALIZEDBORN : GMX_NBKERNEL_ELEC_COULOMB; break; case eelRF: @@ -2428,7 +2429,6 @@ void init_forcerec(FILE *fp, set_bham_b_max(fp,fr,mtop); } - fr->bGB = (ir->implicit_solvent == eisGBSA); fr->gb_epsilon_solvent = ir->gb_epsilon_solvent; /* Copy the GBSA data (radius, volume and surftens for each diff --git a/src/mdlib/genborn.c b/src/mdlib/genborn.c index ae95462fcb..c2cd4a7f5e 100644 --- a/src/mdlib/genborn.c +++ b/src/mdlib/genborn.c @@ -498,22 +498,22 @@ calc_gb_rad_still(t_commrec *cr, t_forcerec *fr,int natoms, gmx_localtop_t *top, real rinv,idr2,idr6,vaj,dccf,cosq,sinq,prod,gpi2; real factor; real vai, prod_ai, icf4,icf6; - + factor = 0.5*ONE_4PI_EPS0; n = 0; - + for(i=0;i<born->nr;i++) { born->gpol_still_work[i]=0; } - - for(i=0;i<nl->nri;i++ ) + + for(i=0;i<nl->nri;i++ ) { ai = nl->iinr[i]; - + nj0 = nl->jindex[i]; nj1 = nl->jindex[i+1]; - + /* Load shifts for this list */ shift = nl->shift[i]; shX = fr->shift_vec[shift][0]; @@ -530,8 +530,8 @@ calc_gb_rad_still(t_commrec *cr, t_forcerec *fr,int natoms, gmx_localtop_t *top, ix1 = shX + x[ai][0]; iy1 = shY + x[ai][1]; iz1 = shZ + x[ai][2]; - - for(k=nj0;k<nj1;k++) + + for(k=nj0;k<nj1 && nl->jjnr[k]>=0;k++) { aj = nl->jjnr[k]; jx1 = x[aj][0]; @@ -555,7 +555,7 @@ calc_gb_rad_still(t_commrec *cr, t_forcerec *fr,int natoms, gmx_localtop_t *top, ratio = dr2 / (rvdw * rvdw); vaj = born->vsolv[aj]; - if(ratio>STILL_P5INV) + if(ratio>STILL_P5INV) { ccf=1.0; dccf=0.0; @@ -573,7 +573,6 @@ calc_gb_rad_still(t_commrec *cr, t_forcerec *fr,int natoms, gmx_localtop_t *top, prod = STILL_P4*vaj; icf4 = ccf*idr4; icf6 = (4*ccf-dccf)*idr6; - born->gpol_still_work[aj] += prod_ai*icf4; gpi = gpi+prod*icf4; @@ -599,7 +598,6 @@ calc_gb_rad_still(t_commrec *cr, t_forcerec *fr,int natoms, gmx_localtop_t *top, { if(born->use[i] != 0) { - gpi = born->gpol[i]+born->gpol_still_work[i]; gpi2 = gpi * gpi; born->bRad[i] = factor*gmx_invsqrt(gpi2); @@ -673,7 +671,7 @@ calc_gb_rad_hct(t_commrec *cr,t_forcerec *fr,int natoms, gmx_localtop_t *top, sum_ai = 0; - 
for(k=nj0;k<nj1;k++) + for(k=nj0;k<nj1 && nl->jjnr[k]>=0;k++) { aj = nl->jjnr[k]; @@ -893,7 +891,7 @@ calc_gb_rad_obc(t_commrec *cr, t_forcerec *fr, int natoms, gmx_localtop_t *top, sum_ai = 0; - for(k=nj0;k<nj1;k++) + for(k=nj0;k<nj1 && nl->jjnr[k]>=0;k++) { aj = nl->jjnr[k]; @@ -1072,7 +1070,7 @@ int calc_gb_rad(t_commrec *cr, t_forcerec *fr, t_inputrec *ir,gmx_localtop_t *to real *p; int cnt; int ndadx; - + if(fr->bAllvsAll && fr->dadx==NULL) { /* We might need up to 8 atoms of padding before and after, @@ -1260,7 +1258,7 @@ int calc_gb_rad(t_commrec *cr, t_forcerec *fr, t_inputrec *ir,gmx_localtop_t *to switch(ir->gb_algorithm) { case egbSTILL: - calc_gb_rad_still(cr,fr,born->nr,top,atype,x,nl,born,md); + calc_gb_rad_still(cr,fr,born->nr,top,atype,x,nl,born,md); break; case egbHCT: calc_gb_rad_hct(cr,fr,born->nr,top,atype,x,nl,born,md); @@ -1573,7 +1571,7 @@ real calc_gb_chainrule(int natoms, t_nblist *nl, real *dadx, real *dvda, rvec x[ rbai = rb[ai]; - for(k=nj0;k<nj1;k++) + for(k=nj0;k<nj1 && nl->jjnr[k]>=0;k++) { aj = nl->jjnr[k]; -- 2.22.0