Merge release-5-0 into master
author Mark Abraham <mark.j.abraham@gmail.com>
Wed, 1 Oct 2014 20:02:02 +0000 (22:02 +0200)
committer Mark Abraham <mark.j.abraham@gmail.com>
Wed, 1 Oct 2014 21:18:54 +0000 (23:18 +0200)
Conflicts:
CMakeLists.txt
Version numbers not bumped; fixed to use the right
name for RelWithDebInfo.

cmake/gmxCFlags.cmake
Fixed to use the right name for RelWithDebInfo.

src/gromacs/listed-forces/bonded.cpp
The new RB SIMD function in bonded.cpp had unused variables, which
are now eliminated.

src/gromacs/mdlib/domdec.cpp
Bug fixes from release-5-0 incorporated. std::max now used in code
newly arrived from release-5-0.

md.cpp had no conflict, but fr->nbv->bUseGPU had to
be replaced by use_GPU(fr->nbv) to work in the master branch.

Change-Id: I65326b691745111fbdaa9435be6c92fa1acf6e7d

25 files changed:
1  2 
CMakeLists.txt
cmake/gmxCFlags.cmake
src/gromacs/gmxana/gmx_tune_pme.c
src/gromacs/gmxlib/calcgrid.c
src/gromacs/gmxpreprocess/toppush.c
src/gromacs/legacyheaders/domdec.h
src/gromacs/listed-forces/bonded.cpp
src/gromacs/mdlib/domdec.cpp
src/gromacs/mdlib/force.c
src/gromacs/mdlib/mdatom.c
src/gromacs/mdlib/minimize.c
src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda.cu
src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernels.cuh
src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_file_generator/nbnxn_kernel_simd_template.c.pre
src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_ref.c
src/gromacs/mdlib/nbnxn_kernels/simd_2xnn/nbnxn_kernel_simd_2xnn.c
src/gromacs/mdlib/nbnxn_kernels/simd_4xn/nbnxn_kernel_simd_4xn.c
src/gromacs/mdlib/nbnxn_search.c
src/gromacs/mdlib/ns.c
src/gromacs/mdlib/sim_util.c
src/gromacs/mdlib/update.c
src/gromacs/timing/wallcycle.c
src/gromacs/timing/wallcycle.h
src/programs/mdrun/md.cpp
src/programs/mdrun/pme_loadbal.c

diff --cc CMakeLists.txt
index 70185949c4ac8cae13677e083501bd2f14c71124,53498532af1377e335ee0747ace076bf86f54d6a..7fbe947522f707b751ec25248446b94e39af19e4
@@@ -81,10 -128,34 +81,10 @@@ if(CMAKE_CONFIGURATION_TYPES
          "List of configuration types"
          FORCE)
  endif()
- set(build_types_with_explicit_flags RELEASE DEBUG RELWITHDEBUGINFO RELWITHASSERT MINSIZEREL PROFILE)
 -set(build_types_with_explicit_flags RELEASE DEBUG RELWITHDEBINFO RELWITHASSERT MINSIZEREL)
++set(build_types_with_explicit_flags RELEASE DEBUG RELWITHDEBINFO RELWITHASSERT MINSIZEREL PROFILE)
  
 -enable_language(C)
 -enable_language(CXX)
  set_property(GLOBAL PROPERTY FIND_LIBRARY_USE_LIB64_PATHS ON)
  
 -set(CPACK_PACKAGE_NAME "gromacs")
 -set(CPACK_PACKAGE_VERSION ${PROJECT_VERSION})
 -set(CPACK_SOURCE_PACKAGE_FILE_NAME "${CPACK_PACKAGE_NAME}-${CPACK_PACKAGE_VERSION}")
 -set(CPACK_PACKAGE_VENDOR "gromacs.org")
 -set(CPACK_PACKAGE_DESCRIPTION_SUMMARY "Gromacs - a toolkit for high-performance molecular simulation")
 -set(CPACK_RESOURCE_FILE_WELCOME "${CMAKE_SOURCE_DIR}/admin/InstallWelcome.txt")
 -# Its GPL/LGPL, so they do not have to agree to a license for mere usage, but some installers require this...
 -set(CPACK_RESOURCE_FILE_LICENSE "${CMAKE_SOURCE_DIR}/COPYING")
 -set(CPACK_RESOURCE_FILE_README "${CMAKE_SOURCE_DIR}/admin/InstallInfo.txt")
 -set(CPACK_SOURCE_IGNORE_FILES "\\\\.isreposource$;\\\\.git/;\\\\.gitignore$;\\\\.gitattributes;")
 -set(CPACK_PROJECT_CONFIG_FILE "${CMAKE_SOURCE_DIR}/CPackInit.cmake")
 -# CPack source archives include only the directories we list here.
 -# This variable is a list of pairs of names of source and destination
 -# directories. Most of these are used for content GROMACS generates as
 -# part of the configuration or build.
 -set(CPACK_SOURCE_INSTALLED_DIRECTORIES "${CMAKE_SOURCE_DIR};/;${CMAKE_BINARY_DIR}/src/programs/completion;src/programs/completion;${CMAKE_BINARY_DIR}/docs/man/man1;docs/man/man1;${CMAKE_BINARY_DIR}/docs/man/man7;docs/man/man7;${CMAKE_BINARY_DIR}/docs/old-html/final;docs/old-html/final;${CMAKE_BINARY_DIR}/docs/install-guide/final;/")
 -set(CPACK_PACKAGE_CONTACT "gmx-users@gromacs.org")
 -set(CPACK_GMX_BUILD_HELP "${GMX_BUILD_HELP}") #Works even though GMX_BUILD_HELP is defined later because it is off by default.
 -
 -#must come after all cpack settings!
 -include(CPack)
 -
  # Set a default valgrind suppression file.
  # This unfortunately needs to duplicate information from CTest to work as
  # expected...
index 011cada36e16dfdbe6eb9008041156c85e4c070d,124107b65c41c8105f528ebb91ae1c6e3a31cb09..3a0e3bbe15428b144c8fc1510112d46ee442b4a3
@@@ -64,7 -64,7 +64,7 @@@ function(gmx_set_cmake_compiler_flags
          # be set up elsewhere and passed to this function, but it is
          # inconvenient in CMake to pass more than one list, and such a
          # list is only used here.
-         foreach(build_type RELWITHDEBUGINFO RELWITHASSERT MINSIZEREL PROFILE)
 -        foreach(build_type RELWITHDEBINFO RELWITHASSERT MINSIZEREL)
++        foreach(build_type RELWITHDEBINFO RELWITHASSERT MINSIZEREL PROFILE)
              set(GMXC_${language}FLAGS_${build_type} "${GMXC_${language}FLAGS_RELEASE}")
          endforeach()
          # Copy the flags that are only used by the real Release build
Simple merge
Simple merge
Simple merge
Simple merge
index b1ded380a424d9dcc554099fae8883af00314e12,0eb8d55791b983c0b5909786b7dffdb1c5ef462a..dfb846d9e700d1492bd6314d68537a9e21fb8860
@@@ -2117,6 -2103,155 +2117,152 @@@ pdihs_noener_simd(int nbonds
      }
  }
  
 -    real                  ddphi;
+ /* This is mostly a copy of pdihs_noener_simd above, but with using
+  * the RB potential instead of a harmonic potential.
+  * This function can replace rbdihs() when no energy and virial are needed.
+  */
+ static void
+ rbdihs_noener_simd(int nbonds,
+                    const t_iatom forceatoms[], const t_iparams forceparams[],
+                    const rvec x[], rvec f[],
+                    const t_pbc *pbc, const t_graph gmx_unused *g,
+                    real gmx_unused lambda,
+                    const t_mdatoms gmx_unused *md, t_fcdata gmx_unused *fcd,
+                    int gmx_unused *global_atom_index)
+ {
+     const int             nfa1 = 5;
+     int                   i, iu, s, j;
+     int                   type, ai[GMX_SIMD_REAL_WIDTH], aj[GMX_SIMD_REAL_WIDTH], ak[GMX_SIMD_REAL_WIDTH], al[GMX_SIMD_REAL_WIDTH];
 -    real                 *parm, *phi, *p, *q, *sf_i, *msf_l;
+     real                  dr_array[3*DIM*GMX_SIMD_REAL_WIDTH+GMX_SIMD_REAL_WIDTH], *dr;
+     real                  buf_array[(NR_RBDIHS + 4)*GMX_SIMD_REAL_WIDTH+GMX_SIMD_REAL_WIDTH], *buf;
 -    sf_i  = buf + (NR_RBDIHS + 2)*GMX_SIMD_REAL_WIDTH;
 -    msf_l = buf + (NR_RBDIHS + 3)*GMX_SIMD_REAL_WIDTH;
++    real                 *parm, *p, *q;
+     gmx_simd_real_t       phi_S;
+     gmx_simd_real_t       ddphi_S, cosfac_S;
+     gmx_simd_real_t       mx_S, my_S, mz_S;
+     gmx_simd_real_t       nx_S, ny_S, nz_S;
+     gmx_simd_real_t       nrkj_m2_S, nrkj_n2_S;
+     gmx_simd_real_t       parm_S, c_S;
+     gmx_simd_real_t       sin_S, cos_S;
+     gmx_simd_real_t       sf_i_S, msf_l_S;
+     pbc_simd_t            pbc_simd;
+     gmx_simd_real_t       pi_S  = gmx_simd_set1_r(M_PI);
+     gmx_simd_real_t       one_S = gmx_simd_set1_r(1.0);
+     /* Ensure SIMD register alignment */
+     dr  = gmx_simd_align_r(dr_array);
+     buf = gmx_simd_align_r(buf_array);
+     /* Extract aligned pointer for parameters and variables */
+     parm  = buf;
+     p     = buf + (NR_RBDIHS + 0)*GMX_SIMD_REAL_WIDTH;
+     q     = buf + (NR_RBDIHS + 1)*GMX_SIMD_REAL_WIDTH;
+     set_pbc_simd(pbc, &pbc_simd);
+     /* nbonds is the number of dihedrals times nfa1, here we step GMX_SIMD_REAL_WIDTH dihs */
+     for (i = 0; (i < nbonds); i += GMX_SIMD_REAL_WIDTH*nfa1)
+     {
+         /* Collect atoms quadruplets for GMX_SIMD_REAL_WIDTH dihedrals.
+          * iu indexes into forceatoms, we should not let iu go beyond nbonds.
+          */
+         iu = i;
+         for (s = 0; s < GMX_SIMD_REAL_WIDTH; s++)
+         {
+             type  = forceatoms[iu];
+             ai[s] = forceatoms[iu+1];
+             aj[s] = forceatoms[iu+2];
+             ak[s] = forceatoms[iu+3];
+             al[s] = forceatoms[iu+4];
+             /* We don't need the first parameter, since that's a constant
+              * which only affects the energies, not the forces.
+              */
+             for (j = 1; j < NR_RBDIHS; j++)
+             {
+                 parm[j*GMX_SIMD_REAL_WIDTH + s] =
+                     forceparams[type].rbdihs.rbcA[j];
+             }
+             /* At the end fill the arrays with identical entries */
+             if (iu + nfa1 < nbonds)
+             {
+                 iu += nfa1;
+             }
+         }
+         /* Caclulate GMX_SIMD_REAL_WIDTH dihedral angles at once */
+         dih_angle_simd(x, ai, aj, ak, al, &pbc_simd,
+                        dr,
+                        &phi_S,
+                        &mx_S, &my_S, &mz_S,
+                        &nx_S, &ny_S, &nz_S,
+                        &nrkj_m2_S,
+                        &nrkj_n2_S,
+                        p, q);
+         /* Change to polymer convention */
+         phi_S = gmx_simd_sub_r(phi_S, pi_S);
+         gmx_simd_sincos_r(phi_S, &sin_S, &cos_S);
+         ddphi_S   = gmx_simd_setzero_r();
+         c_S       = one_S;
+         cosfac_S  = one_S;
+         for (j = 1; j < NR_RBDIHS; j++)
+         {
+             parm_S   = gmx_simd_load_r(parm + j*GMX_SIMD_REAL_WIDTH);
+             ddphi_S  = gmx_simd_fmadd_r(gmx_simd_mul_r(c_S, parm_S), cosfac_S, ddphi_S);
+             cosfac_S = gmx_simd_mul_r(cosfac_S, cos_S);
+             c_S      = gmx_simd_add_r(c_S, one_S);
+         }
+         /* Note that here we do not use the minus sign which is present
+          * in the normal RB code. This is corrected for through (m)sf below.
+          */
+         ddphi_S  = gmx_simd_mul_r(ddphi_S, sin_S);
+         sf_i_S   = gmx_simd_mul_r(ddphi_S, nrkj_m2_S);
+         msf_l_S  = gmx_simd_mul_r(ddphi_S, nrkj_n2_S);
+         /* After this m?_S will contain f[i] */
+         mx_S     = gmx_simd_mul_r(sf_i_S, mx_S);
+         my_S     = gmx_simd_mul_r(sf_i_S, my_S);
+         mz_S     = gmx_simd_mul_r(sf_i_S, mz_S);
+         /* After this m?_S will contain -f[l] */
+         nx_S     = gmx_simd_mul_r(msf_l_S, nx_S);
+         ny_S     = gmx_simd_mul_r(msf_l_S, ny_S);
+         nz_S     = gmx_simd_mul_r(msf_l_S, nz_S);
+         gmx_simd_store_r(dr + 0*GMX_SIMD_REAL_WIDTH, mx_S);
+         gmx_simd_store_r(dr + 1*GMX_SIMD_REAL_WIDTH, my_S);
+         gmx_simd_store_r(dr + 2*GMX_SIMD_REAL_WIDTH, mz_S);
+         gmx_simd_store_r(dr + 3*GMX_SIMD_REAL_WIDTH, nx_S);
+         gmx_simd_store_r(dr + 4*GMX_SIMD_REAL_WIDTH, ny_S);
+         gmx_simd_store_r(dr + 5*GMX_SIMD_REAL_WIDTH, nz_S);
+         iu = i;
+         s  = 0;
+         do
+         {
+             do_dih_fup_noshiftf_precalc(ai[s], aj[s], ak[s], al[s],
+                                         p[s], q[s],
+                                         dr[     XX *GMX_SIMD_REAL_WIDTH+s],
+                                         dr[     YY *GMX_SIMD_REAL_WIDTH+s],
+                                         dr[     ZZ *GMX_SIMD_REAL_WIDTH+s],
+                                         dr[(DIM+XX)*GMX_SIMD_REAL_WIDTH+s],
+                                         dr[(DIM+YY)*GMX_SIMD_REAL_WIDTH+s],
+                                         dr[(DIM+ZZ)*GMX_SIMD_REAL_WIDTH+s],
+                                         f);
+             s++;
+             iu += nfa1;
+         }
+         while (s < GMX_SIMD_REAL_WIDTH && iu < nbonds);
+     }
+ }
  #endif /* GMX_SIMD_HAVE_REAL */
  
  
index 13934d5fa68fd157027fac7fc3d4cb8b84662b6a,6cc6b73895d08a33eadcd83abcb492bd2dffcbef..06f2602916cdedd2b27728612b90bd533f09eadb
@@@ -6739,6 -6754,13 +6748,13 @@@ gmx_domdec_t *init_domain_decomposition
      comm->cellsize_limit = 0;
      comm->bBondComm      = FALSE;
  
 -    comm->cellsize_limit = max(comm->cellsize_limit,
 -                               ir->rlistlong - max(ir->rvdw, ir->rcoulomb));
+     /* Atoms should be able to move by up to half the list buffer size (if > 0)
+      * within nstlist steps. Since boundaries are allowed to displace by half
+      * a cell size, DD cells should be at least the size of the list buffer.
+      */
++    comm->cellsize_limit = std::max(comm->cellsize_limit,
++                                    ir->rlistlong - std::max(ir->rvdw, ir->rcoulomb));
      if (comm->bInterCGBondeds)
      {
          if (comm_distance_min > 0)
@@@ -9332,9 -9374,10 +9362,9 @@@ void dd_partition_system(FIL
               * and every 100 partitionings,
               * so the extra communication cost is negligible.
               */
-             n         = std::max(100, nstglobalcomm);
+             const int nddp_chk_dlb = 100;
 -
              bCheckDLB = (comm->n_load_collect == 0 ||
-                          comm->n_load_have % n == n-1);
+                          comm->n_load_have % nddp_chk_dlb == nddp_chk_dlb - 1);
          }
          else
          {
Simple merge
Simple merge
Simple merge
Simple merge
Simple merge
Simple merge
Simple merge
Simple merge
Simple merge
index 9864f8789f9385d2e536ed33f5988bacff85186e,3d98d597c7d0e9c009e02d816e631b3f36846a25..0504150c008e23e09337d2774f24662f049a6474
@@@ -1881,6 -1909,21 +1881,21 @@@ double do_md(FILE *fplog, t_commrec *cr
                      }
                      dd_bcast(cr->dd, sizeof(gmx_bool), &bPMETuneRunning);
  
 -                        fr->nbv->bUseGPU && DOMAINDECOMP(cr) &&
+                     if (bPMETuneRunning &&
++                        use_GPU(fr->nbv) && DOMAINDECOMP(cr) &&
+                         !(cr->duty & DUTY_PME))
+                     {
+                         /* Lock DLB=auto to off (does nothing when DLB=yes/no).
+                          * With GPUs + separate PME ranks, we don't want DLB.
+                          * This could happen when we scan coarse grids and
+                          * it would then never be turned off again.
+                          * This would hurt performance at the final, optimal
+                          * grid spacing, where DLB almost never helps.
+                          * Also, DLB can limit the cut-off for PME tuning.
+                          */
+                         dd_dlb_set_lock(cr->dd, TRUE);
+                     }
                      if (bPMETuneRunning || step_rel > ir->nstlist*50)
                      {
                          bPMETuneTry     = FALSE;
Simple merge