Merge release-4-6 into master
author Roland Schulz <roland@utk.edu>
Sun, 20 Jan 2013 23:41:51 +0000 (18:41 -0500)
committer Roland Schulz <roland@utk.edu>
Sun, 20 Jan 2013 23:49:12 +0000 (18:49 -0500)
Conflicts:
CMakeLists.txt (OPENMM, regressiontests)
COPYING (OpenMM, renumbering)
cmake/FortranCInterface.cmake
include/CMakeLists.txt (F77, not needed)
include/disre.h (rename detection failed - no conflict)
include/types/iteratedconstraints.h (rename detection failed -
     no conflict)

new both here and in src/kernel - only contrib copies kept:
src/contrib/md_openmm.c
src/contrib/md_openmm.h
src/contrib/openmm_wrapper.cpp
src/contrib/openmm_wrapper.h
src/kernel/md_openmm.c
src/kernel/openmm_wrapper.cpp
src/kernel/openmm_wrapper.h
src/programs/mdrun/md_openmm.c
src/programs/mdrun/openmm_wrapper.cpp
src/programs/mdrun/openmm_wrapper.h
src/programs/mdrun/runner.c

src/gmxlib/rando.c (rename detection failed - no conflict)
src/gromacs/gmxlib/main.c
src/gromacs/gmxlib/string2.c
src/gromacs/gmxlib/tpxio.c (incremented to 92 - updated if statement)
src/gromacs/gmxlib/vmdio.c
src/gromacs/legacyheaders/domdec.h (removed export)
src/gromacs/legacyheaders/gmx_omp_nthreads.h
src/gromacs/legacyheaders/main.h
src/gromacs/legacyheaders/names.h (removed export)
src/gromacs/linearalgebra/eigensolver.c
src/gromacs/linearalgebra/gmx_arpack.c
src/gromacs/linearalgebra/gmx_arpack.h
src/gromacs/linearalgebra/gmx_lapack.h (removed #define F77)
src/kernel/CMakeLists.txt (OpenMM changes moved to
       src/programs/mdrun/CMakeLists.txt)
src/tools/CMakeLists.txt
src/tools/gmx_bar.c
src/tools/gmx_covar.c
tests/CMakeLists.txt (fixed paths)

Extra changes:
src/config.h.cmakein (removed double gmx_header_config.h include)
src/gromacs/CMakeLists.txt (removed OpenMM)
src/gromacs/linearalgebra/gmx_blas.h (removed #define F77)
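
Note on the F77 removals above: with the hard-coded fallback gone from
gmx_blas.h and gmx_lapack.h, the Fortran name mangling has to come from the
build system (cf. the cmake/FortranCInterface.cmake conflict). As a rough
sketch, the stock CMake FortranCInterface module can detect it at configure
time - the header name and macro namespace below are illustrative, not the
ones GROMACS uses:

    enable_language(Fortran)
    include(FortranCInterface)
    # Write a header defining mangling macros detected from the actual
    # Fortran compiler in use, e.g. FC_GLOBAL(name,NAME).
    FortranCInterface_HEADER(${CMAKE_BINARY_DIR}/FC.h MACRO_NAMESPACE "FC_")

C callers would then use FC_GLOBAL(dgemm,DGEMM)(...) instead of relying on a
hard-coded lowercase-plus-underscore convention.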

Change-Id: I04fb402d503c8d5d97ee364c598e0f474956052f

105 files changed:
CMakeLists.txt
COPYING
cmake/FindFFTW.cmake
cmake/gmxCFlags.cmake
cmake/gmxGetCompilerInfo.cmake
cmake/gmxManageGPU.cmake
src/config.h.cmakein
src/contrib/CMakeLists.txt
src/contrib/FindOpenMM.cmake
src/gromacs/CMakeLists.txt
src/gromacs/gmxlib/cinvsqrtdata.c
src/gromacs/gmxlib/disre.c
src/gromacs/gmxlib/gmx_detect_hardware.c
src/gromacs/gmxlib/gmx_omp_nthreads.c
src/gromacs/gmxlib/gpu_utils/gpu_utils.cu
src/gromacs/gmxlib/libxdrf.c
src/gromacs/gmxlib/main.c
src/gromacs/gmxlib/mtop_util.c
src/gromacs/gmxlib/names.c
src/gromacs/gmxlib/network.c
src/gromacs/gmxlib/nonbonded/nb_kernel_avx_128_fma_double/nb_kernel_ElecGB_VdwCSTab_GeomP1P1_avx_128_fma_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_avx_128_fma_double/nb_kernel_ElecGB_VdwLJ_GeomP1P1_avx_128_fma_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_avx_128_fma_double/nb_kernel_ElecGB_VdwNone_GeomP1P1_avx_128_fma_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_avx_128_fma_double/nb_kernel_template_avx_128_fma_double.pre
src/gromacs/gmxlib/nonbonded/nb_kernel_avx_128_fma_single/nb_kernel_ElecGB_VdwCSTab_GeomP1P1_avx_128_fma_single.c
src/gromacs/gmxlib/nonbonded/nb_kernel_avx_128_fma_single/nb_kernel_ElecGB_VdwLJ_GeomP1P1_avx_128_fma_single.c
src/gromacs/gmxlib/nonbonded/nb_kernel_avx_128_fma_single/nb_kernel_ElecGB_VdwNone_GeomP1P1_avx_128_fma_single.c
src/gromacs/gmxlib/nonbonded/nb_kernel_avx_128_fma_single/nb_kernel_template_avx_128_fma_single.pre
src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecGB_VdwCSTab_GeomP1P1_avx_256_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecGB_VdwLJ_GeomP1P1_avx_256_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecGB_VdwNone_GeomP1P1_avx_256_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_template_avx_256_double.pre
src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_single/nb_kernel_ElecGB_VdwCSTab_GeomP1P1_avx_256_single.c
src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_single/nb_kernel_ElecGB_VdwLJ_GeomP1P1_avx_256_single.c
src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_single/nb_kernel_ElecGB_VdwNone_GeomP1P1_avx_256_single.c
src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_single/nb_kernel_template_avx_256_single.pre
src/gromacs/gmxlib/nonbonded/nb_kernel_c/nb_kernel_ElecGB_VdwLJ_GeomP1P1_c.c
src/gromacs/gmxlib/nonbonded/nb_kernel_sse2_double/nb_kernel_ElecGB_VdwCSTab_GeomP1P1_sse2_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_sse2_double/nb_kernel_ElecGB_VdwLJ_GeomP1P1_sse2_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_sse2_double/nb_kernel_ElecGB_VdwNone_GeomP1P1_sse2_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_sse2_double/nb_kernel_template_sse2_double.pre
src/gromacs/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecGB_VdwCSTab_GeomP1P1_sse2_single.c
src/gromacs/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecGB_VdwLJ_GeomP1P1_sse2_single.c
src/gromacs/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecGB_VdwNone_GeomP1P1_sse2_single.c
src/gromacs/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_template_sse2_single.pre
src/gromacs/gmxlib/nonbonded/nb_kernel_sse4_1_double/nb_kernel_ElecGB_VdwCSTab_GeomP1P1_sse4_1_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_sse4_1_double/nb_kernel_ElecGB_VdwLJ_GeomP1P1_sse4_1_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_sse4_1_double/nb_kernel_ElecGB_VdwNone_GeomP1P1_sse4_1_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_sse4_1_double/nb_kernel_template_sse4_1_double.pre
src/gromacs/gmxlib/nonbonded/nb_kernel_sse4_1_single/nb_kernel_ElecGB_VdwCSTab_GeomP1P1_sse4_1_single.c
src/gromacs/gmxlib/nonbonded/nb_kernel_sse4_1_single/nb_kernel_ElecGB_VdwLJ_GeomP1P1_sse4_1_single.c
src/gromacs/gmxlib/nonbonded/nb_kernel_sse4_1_single/nb_kernel_ElecGB_VdwNone_GeomP1P1_sse4_1_single.c
src/gromacs/gmxlib/nonbonded/nb_kernel_sse4_1_single/nb_kernel_template_sse4_1_single.pre
src/gromacs/gmxlib/orires.c
src/gromacs/gmxlib/tpxio.c
src/gromacs/gmxlib/txtdump.c
src/gromacs/gmxpreprocess/readir.c
src/gromacs/legacyheaders/disre.h
src/gromacs/legacyheaders/domdec.h
src/gromacs/legacyheaders/gmx_omp_nthreads.h
src/gromacs/legacyheaders/gpu_utils.h
src/gromacs/legacyheaders/main.h
src/gromacs/legacyheaders/mdebin.h
src/gromacs/legacyheaders/mdrun.h
src/gromacs/legacyheaders/names.h
src/gromacs/legacyheaders/types/hw_info.h
src/gromacs/legacyheaders/types/inputrec.h
src/gromacs/legacyheaders/types/iteratedconstraints.h
src/gromacs/linearalgebra/eigensolver.c
src/gromacs/linearalgebra/gmx_arpack.c
src/gromacs/linearalgebra/gmx_arpack.h
src/gromacs/linearalgebra/gmx_blas.h
src/gromacs/linearalgebra/gmx_lapack.h
src/gromacs/mdlib/coupling.c
src/gromacs/mdlib/domdec.c
src/gromacs/mdlib/forcerec.c
src/gromacs/mdlib/genborn.c
src/gromacs/mdlib/iteratedconstraints.c
src/gromacs/mdlib/mdebin.c
src/gromacs/mdlib/mdebin_bar.c
src/gromacs/mdlib/mdebin_bar.h
src/gromacs/mdlib/minimize.c
src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda.cu
src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu
src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh
src/gromacs/mdlib/nbnxn_search.c
src/gromacs/mdlib/qm_gamess.c
src/gromacs/mdlib/qm_mopac.c
src/gromacs/mdlib/update.c
src/programs/gmxcheck/tpbcmp.c
src/programs/gmxdump/gmxdump.c
src/programs/mdrun/CMakeLists.txt
src/programs/mdrun/md.c
src/programs/mdrun/mdrun.c
src/programs/mdrun/membed.c
src/programs/mdrun/pme_loadbal.c
src/programs/mdrun/repl_ex.c
src/programs/mdrun/runner.c
src/tools/gmx_bar.c
src/tools/gmx_disre.c
src/tools/gmx_energy.c
src/tools/gmx_membed.c
src/tools/gmx_trjconv.c
src/tools/gmx_tune_pme.c
tests/CMakeLists.txt

diff --cc CMakeLists.txt
index c81554dda3d8c8b79af41ff4acaea356381bd1d0,41ea6a8d197aa5283d248196ca1f655dca22ebf4..6c8f22e99a1089036b0c7b61a3fbfcf8082558e1
@@@ -191,8 -225,15 +191,13 @@@ mark_as_advanced(GMX_SKIP_DEFAULT_CFLAG
  # These files should be removed from the source tree when a CMake version that
  # includes the features in question becomes required for building GROMACS.
  include(CheckCCompilerFlag)
 -if(CMAKE_CXX_COMPILER_LOADED)
 -    include(CheckCXXCompilerFlag)
 -endif()
 +include(CheckCXXCompilerFlag)
  
+ # Get compiler version information; this needs to be done early, as checks
+ # that depend on the compiler version follow below.
+ include(gmxGetCompilerInfo)
+ get_compiler_version()
  # First exclude compilers known to not work with OpenMP although claim to support it:
  # gcc 4.2.1 and gcc-llvm 4.2.1 (also claims to be 4.2.1) on Mac OS X
  # This fixes redmine 900 and needs to run before OpenMP flags are set below.
@@@ -541,19 -553,9 +491,9 @@@ if(GMX_THREAD_MPI
      set(PKG_CFLAGS "${PKG_CFLAGS} -DGMX_THREAD_MPI")
      set(GMX_MPI 1)
  else(GMX_THREAD_MPI)
 -    tmpi_get_source_list(THREAD_MPI_SRC NOMPI)
 +    tmpi_get_source_list(THREAD_MPI_SRC CXX NOMPI)
  endif(GMX_THREAD_MPI)
  
- if(GMX_OPENMM)
-     set(CUDA_BUILD_EMULATION OFF)
-     find_package(CUDA 3.1 REQUIRED)
-     add_definitions(-DGMX_OPENMM)
-     if(CMAKE_BUILD_TYPE STREQUAL "DEBUG")    
-         set(CUDA_VERBOSE_BUILD ON)
-     endif()
-     find_package(OpenMM) 
- endif(GMX_OPENMM)
  if(GMX_GPU)
      # now that we have detected the dependencies, do the second configure pass
      gmx_gpu_setup()
@@@ -755,18 -711,20 +695,18 @@@ elseif(${GMX_CPU_ACCELERATION} STREQUA
          endif()
      endif(NOT GNU_SSE4_CFLAG AND NOT MSVC_SSE4_CFLAG)
  
-     GMX_TEST_CXXFLAG(GNU_SSE4_CXXFLAG "-msse4.1" GROMACS_CXX_FLAG)
 -    if (CMAKE_CXX_COMPILER_LOADED)
 -        GMX_TEST_CXXFLAG(GNU_SSE4_CXXFLAG "-msse4.1" ACCELERATION_CXX_FLAGS)
 -        if (NOT GNU_SSE4_CXXFLAG AND GMX_NATIVE_WINDOWS)
 -            GMX_TEST_CXXFLAG(MSVC_SSE4_CXXFLAG "/arch:SSE4.1" ACCELERATION_CXX_FLAGS)
 -        endif(NOT GNU_SSE4_CXXFLAG AND GMX_NATIVE_WINDOWS)
 -        if (NOT GNU_SSE4_CXXFLAG AND NOT MSVC_SSE4_CXXFLAG) 
 -            message(WARNING "No C++ SSE4.1 flag found. Consider a newer compiler, or use SSE2 for slightly lower performance.")
 -            # Not surprising if we end up here! MSVC current does not support the SSE4.1 flag. However, it appears to accept SSE4.1
 -            # intrinsics when SSE2 support is enabled, so we try that instead.
 -            if (GMX_NATIVE_WINDOWS)
 -                GMX_TEST_CXXFLAG(MSVC_SSE2_CXXFLAG "/arch:SSE2" ACCELERATION_CXX_FLAGS)
 -            endif()
 -        endif(NOT GNU_SSE4_CXXFLAG AND NOT MSVC_SSE4_CXXFLAG)
 -    endif()
++    GMX_TEST_CXXFLAG(GNU_SSE4_CXXFLAG "-msse4.1" ACCELERATION_CXX_FLAGS)
 +    if (NOT GNU_SSE4_CXXFLAG AND GMX_NATIVE_WINDOWS)
 +        GMX_TEST_CXXFLAG(MSVC_SSE4_CXXFLAG "/arch:SSE4.1" ACCELERATION_CXX_FLAGS)
 +    endif(NOT GNU_SSE4_CXXFLAG AND GMX_NATIVE_WINDOWS)
 +    if (NOT GNU_SSE4_CXXFLAG AND NOT MSVC_SSE4_CXXFLAG)
 +        message(WARNING "No C++ SSE4.1 flag found. Consider a newer compiler, or use SSE2 for slightly lower performance.")
 +        # Not surprising if we end up here! MSVC currently does not support the SSE4.1 flag. However, it appears to accept SSE4.1
 +        # intrinsics when SSE2 support is enabled, so we try that instead.
 +        if (GMX_NATIVE_WINDOWS)
 +            GMX_TEST_CXXFLAG(MSVC_SSE2_CXXFLAG "/arch:SSE2" ACCELERATION_CXX_FLAGS)
 +        endif()
 +    endif(NOT GNU_SSE4_CXXFLAG AND NOT MSVC_SSE4_CXXFLAG)
  
      # This must come after we have added the -msse4.1 flag on some platforms.
      check_include_file(smmintrin.h  HAVE_SMMINTRIN_H ${ACCELERATION_C_FLAGS})
@@@ -1177,6 -1138,8 +1117,8 @@@ ADD_CUSTOM_TARGET(uninstal
  
  include(CTest)
  mark_as_advanced(BUILD_TESTING)
 -add_custom_target(gmxtests DEPENDS grompp mdrun pdb2gmx gmxcheck editconf)
+ #gmxtests target builds all binaries required for running gmxtest
++add_custom_target(gmxtests DEPENDS grompp mdrun pdb2gmx gmxcheck gmx)
  IF(BUILD_TESTING)
      enable_testing()
      add_subdirectory(tests)
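
The SSE4.1 hunk above uses the GROMACS helper GMX_TEST_CXXFLAG. For
reference, a minimal sketch of the same probe-then-fall-back pattern using
only the stock CheckCXXCompilerFlag module (result variable names are
illustrative):

    include(CheckCXXCompilerFlag)
    check_cxx_compiler_flag("-msse4.1" HAVE_GNU_SSE4_CXXFLAG)
    if(NOT HAVE_GNU_SSE4_CXXFLAG)
        # MSVC has no SSE4.1 switch, but its SSE2 mode accepts the intrinsics.
        check_cxx_compiler_flag("/arch:SSE2" HAVE_MSVC_SSE2_CXXFLAG)
    endif()
    if(NOT HAVE_GNU_SSE4_CXXFLAG AND NOT HAVE_MSVC_SSE2_CXXFLAG)
        message(WARNING "No C++ SSE4.1 flag found; consider a newer compiler.")
    endif()
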
diff --cc COPYING
index d180161274d7785335c0556c42ab2ef4e60e2644,43c15989e4edd30cb685f8b5500ae209203c93e2..93293d42f28d0514cddbf4f3030a6f5487be992c
+++ b/COPYING
 -GROMACS is free software, distributed under the GNU Lesser General
 -Public License (LGPL) Version 2.1. See section 1 for details. GROMACS
 -includes optional code covered by several different licences as
 -described below.  The GROMACS package in its entirety may be copied,
 -modified or distributed according to the conditions described in
 -section 1.  However, in the interests of clarity and completeness,
 -some individual parts of GROMACS that can be used under their
 -respective licenses are also noted here.
 -
 -This file contains the licenses for the following bodies of code:
 -1. GROMACS
 -2. Trajectory file reading using VMD plugins
 -3. Internal FFT (fftpack)
 -4. The memtestG80 library
 -5. thread_mpi
 -6. Blas
 -7. Lapack
 -
 -Our chosen method for packaging distributions (CPack) only permits a
 -package to have a single license file, so we are unfortunately forced
 -to combine all of this information into a single license file. Sorry
 -about that.
 -
 -============================================
 +GROMACS is free software, distributed under the GNU General Public License
 +(GPL) Version 2. See section 1 for details. GROMACS includes optional code
 +covered by several different licences as described below. The GROMACS
 +package in its entirety has to be used, copied, and distributed under
 +the GPLv2 conditions. The individual parts can be used under their respective
 +licenses.
 +
 +This file contains the licenses for:
 + 1. GROMACS
 + 2. Trajectory file reading using VMD plugins 
 + 3. Internal FFT (fftpack)
 + 4. The memtestG80 library
 + 5. thread_mpi
 + 6. Blas
 + 7. Lapack
-  8. OpenMM (binary distributions only)
-  9. Subset of Boost C++ library
- 10. Google Test and Google Mock
++ 8. Subset of Boost C++ library
++ 9. Google Test and Google Mock
  
  1. GROMACS
 -
 -As our use of the LGPL conveys upon a licensee the option to
 -redistribute the library under the terms of the plain GPL, we must
 -include a copy of that GPL for their reference. The applicable GPL
 -license comes after the applicable LGPL license in this file.
 -
 -============================================
 -
 -                  GNU LESSER GENERAL PUBLIC LICENSE
 -                       Version 2.1, February 1999
 -
 - Copyright (C) 1991, 1999 Free Software Foundation, Inc.
 - 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
 - Everyone is permitted to copy and distribute verbatim copies
 - of this license document, but changing it is not allowed.
 -
 -[This is the first released version of the Lesser GPL.  It also counts
 - as the successor of the GNU Library Public License, version 2, hence
 - the version number 2.1.]
 -
 -                            Preamble
 -
 -  The licenses for most software are designed to take away your
 -freedom to share and change it.  By contrast, the GNU General Public
 -Licenses are intended to guarantee your freedom to share and change
 -free software--to make sure the software is free for all its users.
 -
 -  This license, the Lesser General Public License, applies to some
 -specially designated software packages--typically libraries--of the
 -Free Software Foundation and other authors who decide to use it.  You
 -can use it too, but we suggest you first think carefully about whether
 -this license or the ordinary General Public License is the better
 -strategy to use in any particular case, based on the explanations below.
 -
 -  When we speak of free software, we are referring to freedom of use,
 -not price.  Our General Public Licenses are designed to make sure that
 -you have the freedom to distribute copies of free software (and charge
 -for this service if you wish); that you receive source code or can get
 -it if you want it; that you can change the software and use pieces of
 -it in new free programs; and that you are informed that you can do
 -these things.
 -
 -  To protect your rights, we need to make restrictions that forbid
 -distributors to deny you these rights or to ask you to surrender these
 -rights.  These restrictions translate to certain responsibilities for
 -you if you distribute copies of the library or if you modify it.
 -
 -  For example, if you distribute copies of the library, whether gratis
 -or for a fee, you must give the recipients all the rights that we gave
 -you.  You must make sure that they, too, receive or can get the source
 -code.  If you link other code with the library, you must provide
 -complete object files to the recipients, so that they can relink them
 -with the library after making changes to the library and recompiling
 -it.  And you must show them these terms so they know their rights.
 -
 -  We protect your rights with a two-step method: (1) we copyright the
 -library, and (2) we offer you this license, which gives you legal
 -permission to copy, distribute and/or modify the library.
 -
 -  To protect each distributor, we want to make it very clear that
 -there is no warranty for the free library.  Also, if the library is
 -modified by someone else and passed on, the recipients should know
 -that what they have is not the original version, so that the original
 -author's reputation will not be affected by problems that might be
 -introduced by others.
 -\f
 -  Finally, software patents pose a constant threat to the existence of
 -any free program.  We wish to make sure that a company cannot
 -effectively restrict the users of a free program by obtaining a
 -restrictive license from a patent holder.  Therefore, we insist that
 -any patent license obtained for a version of the library must be
 -consistent with the full freedom of use specified in this license.
 -
 -  Most GNU software, including some libraries, is covered by the
 -ordinary GNU General Public License.  This license, the GNU Lesser
 -General Public License, applies to certain designated libraries, and
 -is quite different from the ordinary General Public License.  We use
 -this license for certain libraries in order to permit linking those
 -libraries into non-free programs.
 -
 -  When a program is linked with a library, whether statically or using
 -a shared library, the combination of the two is legally speaking a
 -combined work, a derivative of the original library.  The ordinary
 -General Public License therefore permits such linking only if the
 -entire combination fits its criteria of freedom.  The Lesser General
 -Public License permits more lax criteria for linking other code with
 -the library.
 -
 -  We call this license the "Lesser" General Public License because it
 -does Less to protect the user's freedom than the ordinary General
 -Public License.  It also provides other free software developers Less
 -of an advantage over competing non-free programs.  These disadvantages
 -are the reason we use the ordinary General Public License for many
 -libraries.  However, the Lesser license provides advantages in certain
 -special circumstances.
 -
 -  For example, on rare occasions, there may be a special need to
 -encourage the widest possible use of a certain library, so that it becomes
 -a de-facto standard.  To achieve this, non-free programs must be
 -allowed to use the library.  A more frequent case is that a free
 -library does the same job as widely used non-free libraries.  In this
 -case, there is little to gain by limiting the free library to free
 -software only, so we use the Lesser General Public License.
 -
 -  In other cases, permission to use a particular library in non-free
 -programs enables a greater number of people to use a large body of
 -free software.  For example, permission to use the GNU C Library in
 -non-free programs enables many more people to use the whole GNU
 -operating system, as well as its variant, the GNU/Linux operating
 -system.
 -
 -  Although the Lesser General Public License is Less protective of the
 -users' freedom, it does ensure that the user of a program that is
 -linked with the Library has the freedom and the wherewithal to run
 -that program using a modified version of the Library.
 -
 -  The precise terms and conditions for copying, distribution and
 -modification follow.  Pay close attention to the difference between a
 -"work based on the library" and a "work that uses the library".  The
 -former contains code derived from the library, whereas the latter must
 -be combined with the library in order to run.
 -\f
 -                  GNU LESSER GENERAL PUBLIC LICENSE
 -   TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
 -
 -  0. This License Agreement applies to any software library or other
 -program which contains a notice placed by the copyright holder or
 -other authorized party saying it may be distributed under the terms of
 -this Lesser General Public License (also called "this License").
 -Each licensee is addressed as "you".
 -
 -  A "library" means a collection of software functions and/or data
 -prepared so as to be conveniently linked with application programs
 -(which use some of those functions and data) to form executables.
 -
 -  The "Library", below, refers to any such software library or work
 -which has been distributed under these terms.  A "work based on the
 -Library" means either the Library or any derivative work under
 -copyright law: that is to say, a work containing the Library or a
 -portion of it, either verbatim or with modifications and/or translated
 -straightforwardly into another language.  (Hereinafter, translation is
 -included without limitation in the term "modification".)
 -
 -  "Source code" for a work means the preferred form of the work for
 -making modifications to it.  For a library, complete source code means
 -all the source code for all modules it contains, plus any associated
 -interface definition files, plus the scripts used to control compilation
 -and installation of the library.
 -
 -  Activities other than copying, distribution and modification are not
 -covered by this License; they are outside its scope.  The act of
 -running a program using the Library is not restricted, and output from
 -such a program is covered only if its contents constitute a work based
 -on the Library (independent of the use of the Library in a tool for
 -writing it).  Whether that is true depends on what the Library does
 -and what the program that uses the Library does.
 -
 -  1. You may copy and distribute verbatim copies of the Library's
 -complete source code as you receive it, in any medium, provided that
 -you conspicuously and appropriately publish on each copy an
 -appropriate copyright notice and disclaimer of warranty; keep intact
 -all the notices that refer to this License and to the absence of any
 -warranty; and distribute a copy of this License along with the
 -Library.
 -
 -  You may charge a fee for the physical act of transferring a copy,
 -and you may at your option offer warranty protection in exchange for a
 -fee.
 -\f
 -  2. You may modify your copy or copies of the Library or any portion
 -of it, thus forming a work based on the Library, and copy and
 -distribute such modifications or work under the terms of Section 1
 -above, provided that you also meet all of these conditions:
 -
 -    a) The modified work must itself be a software library.
 -
 -    b) You must cause the files modified to carry prominent notices
 -    stating that you changed the files and the date of any change.
 -
 -    c) You must cause the whole of the work to be licensed at no
 -    charge to all third parties under the terms of this License.
 -
 -    d) If a facility in the modified Library refers to a function or a
 -    table of data to be supplied by an application program that uses
 -    the facility, other than as an argument passed when the facility
 -    is invoked, then you must make a good faith effort to ensure that,
 -    in the event an application does not supply such function or
 -    table, the facility still operates, and performs whatever part of
 -    its purpose remains meaningful.
 -
 -    (For example, a function in a library to compute square roots has
 -    a purpose that is entirely well-defined independent of the
 -    application.  Therefore, Subsection 2d requires that any
 -    application-supplied function or table used by this function must
 -    be optional: if the application does not supply it, the square
 -    root function must still compute square roots.)
 -
 -These requirements apply to the modified work as a whole.  If
 -identifiable sections of that work are not derived from the Library,
 -and can be reasonably considered independent and separate works in
 -themselves, then this License, and its terms, do not apply to those
 -sections when you distribute them as separate works.  But when you
 -distribute the same sections as part of a whole which is a work based
 -on the Library, the distribution of the whole must be on the terms of
 -this License, whose permissions for other licensees extend to the
 -entire whole, and thus to each and every part regardless of who wrote
 -it.
 -
 -Thus, it is not the intent of this section to claim rights or contest
 -your rights to work written entirely by you; rather, the intent is to
 -exercise the right to control the distribution of derivative or
 -collective works based on the Library.
 -
 -In addition, mere aggregation of another work not based on the Library
 -with the Library (or with a work based on the Library) on a volume of
 -a storage or distribution medium does not bring the other work under
 -the scope of this License.
 -
 -  3. You may opt to apply the terms of the ordinary GNU General Public
 -License instead of this License to a given copy of the Library.  To do
 -this, you must alter all the notices that refer to this License, so
 -that they refer to the ordinary GNU General Public License, version 2,
 -instead of to this License.  (If a newer version than version 2 of the
 -ordinary GNU General Public License has appeared, then you can specify
 -that version instead if you wish.)  Do not make any other change in
 -these notices.
 -\f
 -  Once this change is made in a given copy, it is irreversible for
 -that copy, so the ordinary GNU General Public License applies to all
 -subsequent copies and derivative works made from that copy.
 -
 -  This option is useful when you wish to copy part of the code of
 -the Library into a program that is not a library.
 -
 -  4. You may copy and distribute the Library (or a portion or
 -derivative of it, under Section 2) in object code or executable form
 -under the terms of Sections 1 and 2 above provided that you accompany
 -it with the complete corresponding machine-readable source code, which
 -must be distributed under the terms of Sections 1 and 2 above on a
 -medium customarily used for software interchange.
 -
 -  If distribution of object code is made by offering access to copy
 -from a designated place, then offering equivalent access to copy the
 -source code from the same place satisfies the requirement to
 -distribute the source code, even though third parties are not
 -compelled to copy the source along with the object code.
 -
 -  5. A program that contains no derivative of any portion of the
 -Library, but is designed to work with the Library by being compiled or
 -linked with it, is called a "work that uses the Library".  Such a
 -work, in isolation, is not a derivative work of the Library, and
 -therefore falls outside the scope of this License.
 -
 -  However, linking a "work that uses the Library" with the Library
 -creates an executable that is a derivative of the Library (because it
 -contains portions of the Library), rather than a "work that uses the
 -library".  The executable is therefore covered by this License.
 -Section 6 states terms for distribution of such executables.
 -
 -  When a "work that uses the Library" uses material from a header file
 -that is part of the Library, the object code for the work may be a
 -derivative work of the Library even though the source code is not.
 -Whether this is true is especially significant if the work can be
 -linked without the Library, or if the work is itself a library.  The
 -threshold for this to be true is not precisely defined by law.
 -
 -  If such an object file uses only numerical parameters, data
 -structure layouts and accessors, and small macros and small inline
 -functions (ten lines or less in length), then the use of the object
 -file is unrestricted, regardless of whether it is legally a derivative
 -work.  (Executables containing this object code plus portions of the
 -Library will still fall under Section 6.)
 -
 -  Otherwise, if the work is a derivative of the Library, you may
 -distribute the object code for the work under the terms of Section 6.
 -Any executables containing that work also fall under Section 6,
 -whether or not they are linked directly with the Library itself.
 -\f
 -  6. As an exception to the Sections above, you may also combine or
 -link a "work that uses the Library" with the Library to produce a
 -work containing portions of the Library, and distribute that work
 -under terms of your choice, provided that the terms permit
 -modification of the work for the customer's own use and reverse
 -engineering for debugging such modifications.
 -
 -  You must give prominent notice with each copy of the work that the
 -Library is used in it and that the Library and its use are covered by
 -this License.  You must supply a copy of this License.  If the work
 -during execution displays copyright notices, you must include the
 -copyright notice for the Library among them, as well as a reference
 -directing the user to the copy of this License.  Also, you must do one
 -of these things:
 -
 -    a) Accompany the work with the complete corresponding
 -    machine-readable source code for the Library including whatever
 -    changes were used in the work (which must be distributed under
 -    Sections 1 and 2 above); and, if the work is an executable linked
 -    with the Library, with the complete machine-readable "work that
 -    uses the Library", as object code and/or source code, so that the
 -    user can modify the Library and then relink to produce a modified
 -    executable containing the modified Library.  (It is understood
 -    that the user who changes the contents of definitions files in the
 -    Library will not necessarily be able to recompile the application
 -    to use the modified definitions.)
 -
 -    b) Use a suitable shared library mechanism for linking with the
 -    Library.  A suitable mechanism is one that (1) uses at run time a
 -    copy of the library already present on the user's computer system,
 -    rather than copying library functions into the executable, and (2)
 -    will operate properly with a modified version of the library, if
 -    the user installs one, as long as the modified version is
 -    interface-compatible with the version that the work was made with.
 -
 -    c) Accompany the work with a written offer, valid for at
 -    least three years, to give the same user the materials
 -    specified in Subsection 6a, above, for a charge no more
 -    than the cost of performing this distribution.
 -
 -    d) If distribution of the work is made by offering access to copy
 -    from a designated place, offer equivalent access to copy the above
 -    specified materials from the same place.
 -
 -    e) Verify that the user has already received a copy of these
 -    materials or that you have already sent this user a copy.
 -
 -  For an executable, the required form of the "work that uses the
 -Library" must include any data and utility programs needed for
 -reproducing the executable from it.  However, as a special exception,
 -the materials to be distributed need not include anything that is
 -normally distributed (in either source or binary form) with the major
 -components (compiler, kernel, and so on) of the operating system on
 -which the executable runs, unless that component itself accompanies
 -the executable.
 -
 -  It may happen that this requirement contradicts the license
 -restrictions of other proprietary libraries that do not normally
 -accompany the operating system.  Such a contradiction means you cannot
 -use both them and the Library together in an executable that you
 -distribute.
 -\f
 -  7. You may place library facilities that are a work based on the
 -Library side-by-side in a single library together with other library
 -facilities not covered by this License, and distribute such a combined
 -library, provided that the separate distribution of the work based on
 -the Library and of the other library facilities is otherwise
 -permitted, and provided that you do these two things:
 -
 -    a) Accompany the combined library with a copy of the same work
 -    based on the Library, uncombined with any other library
 -    facilities.  This must be distributed under the terms of the
 -    Sections above.
 -
 -    b) Give prominent notice with the combined library of the fact
 -    that part of it is a work based on the Library, and explaining
 -    where to find the accompanying uncombined form of the same work.
 -
 -  8. You may not copy, modify, sublicense, link with, or distribute
 -the Library except as expressly provided under this License.  Any
 -attempt otherwise to copy, modify, sublicense, link with, or
 -distribute the Library is void, and will automatically terminate your
 -rights under this License.  However, parties who have received copies,
 -or rights, from you under this License will not have their licenses
 -terminated so long as such parties remain in full compliance.
 -
 -  9. You are not required to accept this License, since you have not
 -signed it.  However, nothing else grants you permission to modify or
 -distribute the Library or its derivative works.  These actions are
 -prohibited by law if you do not accept this License.  Therefore, by
 -modifying or distributing the Library (or any work based on the
 -Library), you indicate your acceptance of this License to do so, and
 -all its terms and conditions for copying, distributing or modifying
 -the Library or works based on it.
 -
 -  10. Each time you redistribute the Library (or any work based on the
 -Library), the recipient automatically receives a license from the
 -original licensor to copy, distribute, link with or modify the Library
 -subject to these terms and conditions.  You may not impose any further
 -restrictions on the recipients' exercise of the rights granted herein.
 -You are not responsible for enforcing compliance by third parties with
 -this License.
 -\f
 -  11. If, as a consequence of a court judgment or allegation of patent
 -infringement or for any other reason (not limited to patent issues),
 -conditions are imposed on you (whether by court order, agreement or
 -otherwise) that contradict the conditions of this License, they do not
 -excuse you from the conditions of this License.  If you cannot
 -distribute so as to satisfy simultaneously your obligations under this
 -License and any other pertinent obligations, then as a consequence you
 -may not distribute the Library at all.  For example, if a patent
 -license would not permit royalty-free redistribution of the Library by
 -all those who receive copies directly or indirectly through you, then
 -the only way you could satisfy both it and this License would be to
 -refrain entirely from distribution of the Library.
 -
 -If any portion of this section is held invalid or unenforceable under any
 -particular circumstance, the balance of the section is intended to apply,
 -and the section as a whole is intended to apply in other circumstances.
 -
 -It is not the purpose of this section to induce you to infringe any
 -patents or other property right claims or to contest validity of any
 -such claims; this section has the sole purpose of protecting the
 -integrity of the free software distribution system which is
 -implemented by public license practices.  Many people have made
 -generous contributions to the wide range of software distributed
 -through that system in reliance on consistent application of that
 -system; it is up to the author/donor to decide if he or she is willing
 -to distribute software through any other system and a licensee cannot
 -impose that choice.
 -
 -This section is intended to make thoroughly clear what is believed to
 -be a consequence of the rest of this License.
 -
 -  12. If the distribution and/or use of the Library is restricted in
 -certain countries either by patents or by copyrighted interfaces, the
 -original copyright holder who places the Library under this License may add
 -an explicit geographical distribution limitation excluding those countries,
 -so that distribution is permitted only in or among countries not thus
 -excluded.  In such case, this License incorporates the limitation as if
 -written in the body of this License.
 -
 -  13. The Free Software Foundation may publish revised and/or new
 -versions of the Lesser General Public License from time to time.
 -Such new versions will be similar in spirit to the present version,
 -but may differ in detail to address new problems or concerns.
 -
 -Each version is given a distinguishing version number.  If the Library
 -specifies a version number of this License which applies to it and
 -"any later version", you have the option of following the terms and
 -conditions either of that version or of any later version published by
 -the Free Software Foundation.  If the Library does not specify a
 -license version number, you may choose any version ever published by
 -the Free Software Foundation.
 -\f
 -  14. If you wish to incorporate parts of the Library into other free
 -programs whose distribution conditions are incompatible with these,
 -write to the author to ask for permission.  For software which is
 -copyrighted by the Free Software Foundation, write to the Free
 -Software Foundation; we sometimes make exceptions for this.  Our
 -decision will be guided by the two goals of preserving the free status
 -of all derivatives of our free software and of promoting the sharing
 -and reuse of software generally.
 -
 -                            NO WARRANTY
 -
 -  15. BECAUSE THE LIBRARY IS LICENSED FREE OF CHARGE, THERE IS NO
 -WARRANTY FOR THE LIBRARY, TO THE EXTENT PERMITTED BY APPLICABLE LAW.
 -EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR
 -OTHER PARTIES PROVIDE THE LIBRARY "AS IS" WITHOUT WARRANTY OF ANY
 -KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE
 -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 -PURPOSE.  THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE
 -LIBRARY IS WITH YOU.  SHOULD THE LIBRARY PROVE DEFECTIVE, YOU ASSUME
 -THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
 -
 -  16. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN
 -WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY
 -AND/OR REDISTRIBUTE THE LIBRARY AS PERMITTED ABOVE, BE LIABLE TO YOU
 -FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR
 -CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE
 -LIBRARY (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING
 -RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A
 -FAILURE OF THE LIBRARY TO OPERATE WITH ANY OTHER SOFTWARE), EVEN IF
 -SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH
 -DAMAGES.
 -
 -                     END OF TERMS AND CONDITIONS
 -\f
 -           How to Apply These Terms to Your New Libraries
 -
 -  If you develop a new library, and you want it to be of the greatest
 -possible use to the public, we recommend making it free software that
 -everyone can redistribute and change.  You can do so by permitting
 -redistribution under these terms (or, alternatively, under the terms of the
 -ordinary General Public License).
 -
 -  To apply these terms, attach the following notices to the library.  It is
 -safest to attach them to the start of each source file to most effectively
 -convey the exclusion of warranty; and each file should have at least the
 -"copyright" line and a pointer to where the full notice is found.
 -
 -    <one line to give the library's name and a brief idea of what it does.>
 -    Copyright (C) <year>  <name of author>
 -
 -    This library is free software; you can redistribute it and/or
 -    modify it under the terms of the GNU Lesser General Public
 -    License as published by the Free Software Foundation; either
 -    version 2.1 of the License, or (at your option) any later version.
 -
 -    This library is distributed in the hope that it will be useful,
 -    but WITHOUT ANY WARRANTY; without even the implied warranty of
 -    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 -    Lesser General Public License for more details.
 -
 -    You should have received a copy of the GNU Lesser General Public
 -    License along with this library; if not, write to the Free Software
 -    Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
 -
 -Also add information on how to contact you by electronic and paper mail.
 -
 -You should also get your employer (if you work as a programmer) or your
 -school, if any, to sign a "copyright disclaimer" for the library, if
 -necessary.  Here is a sample; alter the names:
 -
 -  Yoyodyne, Inc., hereby disclaims all copyright interest in the
 -  library `Frob' (a library for tweaking knobs) written by James Random Hacker.
 -
 -  <signature of Ty Coon>, 1 April 1990
 -  Ty Coon, President of Vice
 -
 -That's all there is to it!
 -
 -============================================
 +====================================
  
                      GNU GENERAL PUBLIC LICENSE
                         Version 2, June 1991
@@@ -556,70 -1070,3 +555,65 @@@ in Gromacs (primarily full & sparse mat
  better idea to use the full reference implementation.
  
  Erik Lindahl, 2008-10-07.
- 8. OpenMM (binary distributions only)
- =====================================
 +
- There are several licenses which cover different parts of OpenMM as described
- in the file openmm/licenses/Licenses.txt accompanying the binary distribution.
- 9. Subset of Boost C++ library
 +
- 10. Google Test and Google Mock
++8. Subset of Boost C++ library
 +==============================
 +Files: src/external/boost/boost/*
 +
 +Boost Software License - Version 1.0 - August 17th, 2003
 +
 +Permission is hereby granted, free of charge, to any person or organization
 +obtaining a copy of the software and accompanying documentation covered by
 +this license (the "Software") to use, reproduce, display, distribute,
 +execute, and transmit the Software, and to prepare derivative works of the
 +Software, and to permit third-parties to whom the Software is furnished to
 +do so, all subject to the following:
 +
 +The copyright notices in the Software and this entire statement, including
 +the above license grant, this restriction and the following disclaimer,
 +must be included in all copies of the Software, in whole or in part, and
 +all derivative works of the Software, unless such copies or derivative
 +works are solely in the form of machine-executable object code generated by
 +a source language processor.
 +
 +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 +FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
 +SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
 +FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
 +ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 +DEALINGS IN THE SOFTWARE.
 +
- OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++ 9. Google Test and Google Mock
 +===============================
 +Files: src/external/gmock-1.6.0/*
 +
 +Copyright 2008, Google Inc.
 +All rights reserved.
 +
 +Redistribution and use in source and binary forms, with or without
 +modification, are permitted provided that the following conditions are
 +met:
 +
 +    * Redistributions of source code must retain the above copyright
 +notice, this list of conditions and the following disclaimer.
 +    * Redistributions in binary form must reproduce the above
 +copyright notice, this list of conditions and the following disclaimer
 +in the documentation and/or other materials provided with the
 +distribution.
 +    * Neither the name of Google Inc. nor the names of its
 +contributors may be used to endorse or promote products derived from
 +this software without specific prior written permission.
 +
 +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
++OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
Simple merge
Simple merge
index d70020a9aa7379ec4c0f3d4b845a6369cdb328c7,01430fc0160ef4484fca068a30e250420d877db6..d7f88cc02528f9a39c4b8d4e798b9e28425ae6ac
@@@ -1,6 -1,46 +1,12 @@@
 -#
 -# This file is part of the GROMACS molecular simulation package.
 -#
 -# Copyright (c) 2012, by the GROMACS development team, led by
 -# David van der Spoel, Berk Hess, Erik Lindahl, and including many
 -# others, as listed in the AUTHORS file in the top-level source
 -# directory and at http://www.gromacs.org.
 -#
 -# GROMACS is free software; you can redistribute it and/or
 -# modify it under the terms of the GNU Lesser General Public License
 -# as published by the Free Software Foundation; either version 2.1
 -# of the License, or (at your option) any later version.
 -#
 -# GROMACS is distributed in the hope that it will be useful,
 -# but WITHOUT ANY WARRANTY; without even the implied warranty of
 -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 -# Lesser General Public License for more details.
 -#
 -# You should have received a copy of the GNU Lesser General Public
 -# License along with GROMACS; if not, see
 -# http://www.gnu.org/licenses, or write to the Free Software Foundation,
 -# Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
 -#
 -# If you want to redistribute modifications to GROMACS, please
 -# consider that scientific software is very special. Version
 -# control is crucial - bugs must be traceable. We will be happy to
 -# consider code for inclusion in the official distribution, but
 -# derived work must not be called official GROMACS. Details are found
 -# in the README & COPYING files - if they are missing, get the
 -# official version at http://www.gromacs.org.
 -#
 -# To help us fund GROMACS development, we humbly ask that you cite
 -# the research papers on the package. Check out http://www.gromacs.org.
 -#
  # This macro attempts to parse the version string of the C compiler in use.
- # Currently supported are only compilers that accept "-dumpversion" argument:
- # gcc, Intel Compiler (on Linux and Mac OS), Open64, EkoPath.
+ # With CMake 2.8.9 CMake provides a CMAKE_[C|CXX]_COMPILER_VERSION variable
+ # so we will use that if available.
+ #
+ # Currently supported are:
+ # - with cmake >2.8.8 all compilers supported by CMake
+ # - with cmake <=2.8.8: compilers that accept "-dumpversion" argument:
+ #   gcc, Intel Compiler (on Linux and Mac OS), Open64, EkoPath, clang
+ #   (and probably other gcc-compatible compilers).
  #
  # C_COMPILER_VERSION    - version string of the current C compiler (CMAKE_C_COMPILER)
  # CXX_COMPILER_VERSION  - version string of the current C++ compiler (CMAKE_CXX_COMPILER)
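
A condensed sketch of the detection order the comment above describes:
prefer the built-in CMAKE_C_COMPILER_VERSION variable (CMake 2.8.9+), and
fall back to asking gcc-compatible compilers directly (output handling is
simplified relative to the real get_compiler_version() macro):

    if(DEFINED CMAKE_C_COMPILER_VERSION)
        set(C_COMPILER_VERSION "${CMAKE_C_COMPILER_VERSION}")
    else()
        # Works for gcc, clang, Intel (Linux/Mac OS), Open64, EkoPath, ...
        execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion
                        OUTPUT_VARIABLE C_COMPILER_VERSION
                        OUTPUT_STRIP_TRAILING_WHITESPACE
                        ERROR_QUIET)
    endif()
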
Simple merge
Simple merge
index b43b007dff191b8f90c63f07bcd0ed82730aa89f,6e3d99a73d6a25a95db37eb02566ec2bd18a1082..4ed8a48a09f64fe5a8b979d6a1ff59b3ac22b2bc
@@@ -1,7 -1,118 +1,84 @@@
 -#
 -# This file is part of the GROMACS molecular simulation package.
 -#
 -# Copyright (c) 2012, by the GROMACS development team, led by
 -# David van der Spoel, Berk Hess, Erik Lindahl, and including many
 -# others, as listed in the AUTHORS file in the top-level source
 -# directory and at http://www.gromacs.org.
 -#
 -# GROMACS is free software; you can redistribute it and/or
 -# modify it under the terms of the GNU Lesser General Public License
 -# as published by the Free Software Foundation; either version 2.1
 -# of the License, or (at your option) any later version.
 -#
 -# GROMACS is distributed in the hope that it will be useful,
 -# but WITHOUT ANY WARRANTY; without even the implied warranty of
 -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 -# Lesser General Public License for more details.
 -#
 -# You should have received a copy of the GNU Lesser General Public
 -# License along with GROMACS; if not, see
 -# http://www.gnu.org/licenses, or write to the Free Software Foundation,
 -# Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
 -#
 -# If you want to redistribute modifications to GROMACS, please
 -# consider that scientific software is very special. Version
 -# control is crucial - bugs must be traceable. We will be happy to
 -# consider code for inclusion in the official distribution, but
 -# derived work must not be called official GROMACS. Details are found
 -# in the README & COPYING files - if they are missing, get the
 -# official version at http://www.gromacs.org.
 -#
 -# To help us fund GROMACS development, we humbly ask that you cite
 -# the research papers on the package. Check out http://www.gromacs.org.
 -#
  set(CONTRIB_PROGRAMS 
       #add here any programs you want to compile
  )
  
+ # Uncomment the next line to build OpenMM:
+ #option(GMX_OPENMM "Accelerated execution on GPUs through the OpenMM library (no longer supported)" ON)
+ # At run time, you may need to set the environment variable
+ # OPENMM_PLUGIN_DIR=PATH_TO_GROMACS/openmm/lib/plugins
+ # to make things work
+ if(GMX_OPENMM)
+     if(GMX_GPU)
+         message(FATAL_ERROR "The OpenMM build is not compatible with the native GPU build")
+     endif()
+     enable_language(CXX)
+     set (GMX_BINARY_SUFFIX "-openmm")
+     set (GMX_LIBS_SUFFIX "_openmm")
+ #######################################################################
+ # Check for options incompatible with OpenMM build                    #
+ #######################################################################
+     # we'll use the built-in fft to avoid unnecessary dependencies
+     string(TOUPPER ${GMX_FFT_LIBRARY} GMX_FFT_LIBRARY)
+     if(NOT ${GMX_FFT_LIBRARY} STREQUAL "FFTPACK")
+         message(STATUS "No external FFT libraries needed for the OpenMM build, switching to fftpack!")
+         set(GMX_FFT_LIBRARY "fftpack" CACHE STRING 
+               "No external FFT libraries needed for the OpenMM build, switching to  fftpack!" FORCE)
+     endif()
+     if(GMX_MPI)
+         message(FATAL_ERROR "The OpenMM build is not compatible with MPI!")
+     endif(GMX_MPI)
+     if(GMX_THREAD_MPI)
+         message(STATUS "Thread-MPI not compatible with OpenMM, disabled!")
+         set(GMX_THREAD_MPI OFF CACHE BOOL
+               "Thread-MPI not compatible with OpenMM build, disabled!" FORCE)
+     endif(GMX_THREAD_MPI)
+     if(GMX_OPENMP)
+         message(STATUS "OpenMP multithreading not compatible with OpenMM, disabled")
+         set(GMX_OPENMP OFF CACHE BOOL
+             "OpenMP multithreading not compatible with OpenMM, disabled!" FORCE)
+     endif()
+     if(GMX_SOFTWARE_INVSQRT)
+         set(GMX_SOFTWARE_INVSQRT OFF CACHE STRING 
+                 "The OpenMM build does not need GROMACS software 1/sqrt!" FORCE)
+     endif(GMX_SOFTWARE_INVSQRT)
+     string(TOUPPER ${GMX_CPU_ACCELERATION} GMX_CPU_ACCELERATION)
+     if(NOT GMX_CPU_ACCELERATION STREQUAL "NONE")
+         message(STATUS "Switching off CPU-based acceleration, the OpenMM build does not support/need any!")
+         set(GMX_CPU_ACCELERATION "None" CACHE STRING
+             "Switching off CPU-based acceleration, the OpenMM build does not support/need any!" FORCE)
+     endif()
+     if(GMX_FAHCORE)
+         message(FATAL_ERROR "The OpenMM build does not support FAH build!")
+     endif(GMX_FAHCORE)
+     if(GMX_DOUBLE)
+         message(FATAL_ERROR  "The OpenMM-build does not support double precision calculations!")
+     endif()
+     set(CUDA_BUILD_EMULATION OFF)
+     find_package(CUDA 3.1 REQUIRED)
+     add_definitions(-DGMX_OPENMM)
+     if(CMAKE_BUILD_TYPE STREQUAL "DEBUG")    
+         set(CUDA_VERBOSE_BUILD ON)
+     endif()
+     list(APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/src/contrib)
+     find_package(OpenMM) 
+     # mark as advanced the unused variables
+     mark_as_advanced(FORCE GMX_CPU_ACCELERATION GMX_MPI GMX_FFT_LIBRARY 
+         GMX_QMMM_PROGRAM GMX_THREAD_MPI GMX_DOUBLE)
+ else(GMX_OPENMM)
+      mark_as_advanced(CLEAR GMX_CPU_ACCELERATION GMX_MPI GMX_FFT_LIBRARY 
+         GMX_QMMM_PROGRAM GMX_THREAD_MPI GMX_DOUBLE)
+ endif(GMX_OPENMM)
  foreach(PROG ${CONTRIB_PROGRAMS})
          add_executable(${PROG} ${PROG}.c ${NGMX_COMMON_SOURCE})
        set_target_properties(${PROG} PROPERTIES OUTPUT_NAME "${PROG}${GMX_BINARY_SUFFIX}")
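
The OpenMM block above mixes two ways of handling incompatible options;
distilled to a sketch (not the full list of checks): abort outright where no
sane override exists, otherwise force the cache entry off so later configure
passes stay consistent:

    if(GMX_OPENMM)
        if(GMX_MPI)
            # No way to reconcile these two - stop the configure run.
            message(FATAL_ERROR "The OpenMM build is not compatible with MPI!")
        endif()
        if(GMX_THREAD_MPI)
            # Overridable - switch it off and record why in the cache.
            set(GMX_THREAD_MPI OFF CACHE BOOL
                "Thread-MPI not compatible with OpenMM build, disabled!" FORCE)
        endif()
    endif()
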
Simple merge
index 425cc60d89b7b24c083de6c8dded03726c38eeee,0000000000000000000000000000000000000000..773f8d587ce2d9e5a3f6fc19983ecf133cc57f8d
mode 100644,000000..100644
--- /dev/null
@@@ -1,103 -1,0 +1,103 @@@
-     if (GMX_OPENMM OR GMX_GPU)
 +set(LIBGROMACS_SOURCES)
 +
 +add_subdirectory(legacyheaders)
 +add_subdirectory(gmxlib)
 +add_subdirectory(mdlib)
 +add_subdirectory(gmxpreprocess)
 +add_subdirectory(analysisdata)
 +add_subdirectory(commandline)
 +add_subdirectory(linearalgebra)
 +add_subdirectory(onlinehelp)
 +add_subdirectory(options)
 +add_subdirectory(selection)
 +add_subdirectory(trajectoryanalysis)
 +add_subdirectory(utility)
 +
 +file(GLOB LIBGROMACS_HEADERS *.h)
 +install(FILES ${LIBGROMACS_HEADERS} DESTINATION ${INCL_INSTALL_DIR}/gromacs
 +        COMPONENT development)
 +
 +list(APPEND LIBGROMACS_SOURCES ${GMXLIB_SOURCES} ${MDLIB_SOURCES})
 +
 +configure_file(${CMAKE_CURRENT_SOURCE_DIR}/version.h.cmakein ${CMAKE_CURRENT_BINARY_DIR}/version.h)
 +install(FILES ${CMAKE_CURRENT_BINARY_DIR}/version.h
 +    DESTINATION ${INCL_INSTALL_DIR}/gromacs
 +    COMPONENT development)
 +
 +# Add target that generates gitversion.c every time make is run
 +# if git version info is requested
 +# This code is here instead of utility/CMakeLists.txt because CMake
 +# ignores set_source_file_properties from subdirectories.
 +if (GMX_GIT_VERSION_INFO)
 +    set(GENERATED_VERSION_FILE ${CMAKE_CURRENT_BINARY_DIR}/utility/gitversion.c)
 +    add_custom_target(gmx_version ALL
 +            COMMAND ${CMAKE_COMMAND}
 +                -D GIT_EXECUTABLE="${GIT_EXECUTABLE}"
 +                -D GIT_VERSION="${GIT_VERSION}"
 +                -D PROJECT_VERSION="${PROJECT_VERSION}"
 +                -D PROJECT_SOURCE_DIR="${PROJECT_SOURCE_DIR}"
 +                -D VERSION_C_CMAKEIN="${CMAKE_CURRENT_SOURCE_DIR}/utility/gitversion.c.cmakein"
 +                -D VERSION_C_OUT=${GENERATED_VERSION_FILE}
 +                -P ${CMAKE_SOURCE_DIR}/cmake/gmxGenerateVersionInfo.cmake
 +            WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
 +            DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/utility/gitversion.c.cmakein
 +            COMMENT "Generating git version information")
 +    set_source_files_properties(${GENERATED_VERSION_FILE}
 +                                PROPERTIES GENERATED true)
 +    list(APPEND LIBGROMACS_SOURCES ${GENERATED_VERSION_FILE})
 +endif()
 +
 +# apply gcc 4.4.x bug workaround
 +if(GMX_USE_GCC44_BUG_WORKAROUND)
 +   include(gmxGCC44O3BugWorkaround)
 +   gmx_apply_gcc44_bug_workaround("gmxlib/bondfree.c")
 +   gmx_apply_gcc44_bug_workaround("mdlib/force.c")
 +   gmx_apply_gcc44_bug_workaround("mdlib/constr.c")
 +endif()
 +
 +add_library(libgromacs ${LIBGROMACS_SOURCES})
 +if (GMX_GIT_VERSION_INFO)
 +    add_dependencies(libgromacs gmx_version)
 +endif ()
 +
 +if(GMX_BUILD_OWN_FFTW)
 +    # This dependency has to be made here rather than the CMakeLists.txt that
 +    # does the FFTW build, because of the order in which
 +    # add_subdirectory() calls are made in the top-level CMakeLists.txt; the
 +    # md library target does not necessarily exist yet. Also enabling and
 +    # disabling GMX_BUILD_OWN_FFTW changes dependencies correctly.
 +    add_dependencies(libgromacs gmxfftw)
 +endif()
 +
 +target_link_libraries(libgromacs ${GMX_GPU_LIBRARIES}
 +                      ${GMX_EXTRA_LIBRARIES} ${FFT_LIBRARIES} ${XML_LIBRARIES}
 +                      ${THREAD_LIB} ${GMX_SHARED_LINKER_FLAGS})
 +set_target_properties(libgromacs PROPERTIES
 +                      OUTPUT_NAME "gromacs${GMX_LIBS_SUFFIX}"
 +                      SOVERSION ${SOVERSION}
 +                      COMPILE_FLAGS "${OpenMP_C_FLAGS}")
 +
 +install(TARGETS libgromacs DESTINATION ${LIB_INSTALL_DIR} COMPONENT libraries)
 +
 +configure_file(${CMAKE_CURRENT_SOURCE_DIR}/libgromacs.pc.cmakein
 +               ${CMAKE_CURRENT_BINARY_DIR}/libgromacs.pc @ONLY)
 +install(FILES ${CMAKE_CURRENT_BINARY_DIR}/libgromacs.pc
 +        DESTINATION ${LIB_INSTALL_DIR}/pkgconfig
 +        RENAME "libgromacs${GMX_LIBS_SUFFIX}.pc"
 +        COMPONENT development)
 +
 +if (INSTALL_CUDART_LIB) #can be set manually by the user
-         message(WARNING "INSTALL_CUDART_LIB only makes sense with GMX_OPENMM or GMX_GPU")
++    if (GMX_GPU)
 +        foreach(CUDA_LIB ${CUDA_LIBRARIES})
 +            string(REGEX MATCH "cudart" IS_CUDART ${CUDA_LIB})
 +            if(IS_CUDART) #libcuda should not be installed
 +                #also install name-links (the linker uses those)
 +                file(GLOB CUDA_LIBS ${CUDA_LIB}*)
 +                install(FILES ${CUDA_LIBS} DESTINATION
 +                    ${LIB_INSTALL_DIR} COMPONENT libraries)
 +            endif()
 +        endforeach()
 +    else()
++        message(WARNING "INSTALL_CUDART_LIB only makes sense with GMX_GPU")
 +    endif()
 +endif ()
index f426643ced340a9dd5ace20a83735a308f6394db,0000000000000000000000000000000000000000..405fa477c9497e9a70859ccfa4255481d7672b23
mode 100644,000000..100644
--- /dev/null
@@@ -1,658 -1,0 +1,646 @@@
- #ifndef F77_FUNC
- /*! \brief Macro for Fortran name-mangling
-  *
-  * Use Fortran name mangling from autoconf macros if defined, 
-  * or lowercase+underscore by default. Since there is no easy way to convert
-  * between lower and upper case in macros, you should call fortran routines
-  * as F77_FUNC(routine,ROUTINE)(param1,param2,...)
-  */
- #define F77_FUNC(name,NAME) name ## _
- #endif
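
A minimal, hypothetical sketch of the calling convention the removed comment
describes (the BLAS routine dscal is only an illustrative example; with the
default mangling it resolves to dscal_):

    #define F77_FUNC(name,NAME) name ## _

    /* Declare the Fortran BLAS routine dscal: x := alpha*x.
     * Fortran passes every argument by reference. */
    void F77_FUNC(dscal,DSCAL)(int *n, double *alpha, double *x, int *incx);

    static void scale_vector(int n, double alpha, double *x)
    {
        int incx = 1;
        F77_FUNC(dscal,DSCAL)(&n, &alpha, x, &incx);
    }
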
 +/*
 + * 
 + *                This source code is part of
 + * 
 + *                 G   R   O   M   A   C   S
 + * 
 + *          GROningen MAchine for Chemical Simulations
 + * 
 + *                        VERSION 3.2.0
 + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
 + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 + * Copyright (c) 2001-2004, The GROMACS development team,
 + * check out http://www.gromacs.org for more information.
 +
 + * This program is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU General Public License
 + * as published by the Free Software Foundation; either version 2
 + * of the License, or (at your option) any later version.
 + * 
 + * If you want to redistribute modifications, please consider that
 + * scientific software is very special. Version control is crucial -
 + * bugs must be traceable. We will be happy to consider code for
 + * inclusion in the official distribution, but derived work must not
 + * be called official GROMACS. Details are found in the README & COPYING
 + * files - if they are missing, get the official version at www.gromacs.org.
 + * 
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the papers on the package - you can find them in the top README file.
 + * 
 + * For more info, check our website at http://www.gromacs.org
 + * 
 + * And Hey:
 + * GROningen Mixture of Alchemy and Childrens' Stories
 + */
 +/* This file is completely threadsafe - keep it that way! */
 +#ifdef HAVE_CONFIG_H
 +#include <config.h>
 +#endif
 +
 +
 +struct gmx_invsqrtdata 
 +{
 +  unsigned int    exptab[256];    /*!< Exponential lookup table */
 +  unsigned int    fracttab[4096]; /*!< Mantissa lookup table    */
 +};
 +
 +
 +struct gmx_invsqrtdata 
 +F77_FUNC(gmxinvsqrtdata,GMXINVSQRTDATA) = 
 +{ 
 +    /* data for exponent table - 256 floats */
 +    { 
 +        0x5f000000,0x5e800000,0x5e800000,0x5e000000,
 +        0x5e000000,0x5d800000,0x5d800000,0x5d000000,
 +        0x5d000000,0x5c800000,0x5c800000,0x5c000000,
 +        0x5c000000,0x5b800000,0x5b800000,0x5b000000,
 +        0x5b000000,0x5a800000,0x5a800000,0x5a000000,
 +        0x5a000000,0x59800000,0x59800000,0x59000000,
 +        0x59000000,0x58800000,0x58800000,0x58000000,
 +        0x58000000,0x57800000,0x57800000,0x57000000,
 +        0x57000000,0x56800000,0x56800000,0x56000000,
 +        0x56000000,0x55800000,0x55800000,0x55000000,
 +        0x55000000,0x54800000,0x54800000,0x54000000,
 +        0x54000000,0x53800000,0x53800000,0x53000000,
 +        0x53000000,0x52800000,0x52800000,0x52000000,
 +        0x52000000,0x51800000,0x51800000,0x51000000,
 +        0x51000000,0x50800000,0x50800000,0x50000000,
 +        0x50000000,0x4f800000,0x4f800000,0x4f000000,
 +        0x4f000000,0x4e800000,0x4e800000,0x4e000000,
 +        0x4e000000,0x4d800000,0x4d800000,0x4d000000,
 +        0x4d000000,0x4c800000,0x4c800000,0x4c000000,
 +        0x4c000000,0x4b800000,0x4b800000,0x4b000000,
 +        0x4b000000,0x4a800000,0x4a800000,0x4a000000,
 +        0x4a000000,0x49800000,0x49800000,0x49000000,
 +        0x49000000,0x48800000,0x48800000,0x48000000,
 +        0x48000000,0x47800000,0x47800000,0x47000000,
 +        0x47000000,0x46800000,0x46800000,0x46000000,
 +        0x46000000,0x45800000,0x45800000,0x45000000,
 +        0x45000000,0x44800000,0x44800000,0x44000000,
 +        0x44000000,0x43800000,0x43800000,0x43000000,
 +        0x43000000,0x42800000,0x42800000,0x42000000,
 +        0x42000000,0x41800000,0x41800000,0x41000000,
 +        0x41000000,0x40800000,0x40800000,0x40000000,
 +        0x40000000,0x3f800000,0x3f800000,0x3f000000,
 +        0x3f000000,0x3e800000,0x3e800000,0x3e000000,
 +        0x3e000000,0x3d800000,0x3d800000,0x3d000000,
 +        0x3d000000,0x3c800000,0x3c800000,0x3c000000,
 +        0x3c000000,0x3b800000,0x3b800000,0x3b000000,
 +        0x3b000000,0x3a800000,0x3a800000,0x3a000000,
 +        0x3a000000,0x39800000,0x39800000,0x39000000,
 +        0x39000000,0x38800000,0x38800000,0x38000000,
 +        0x38000000,0x37800000,0x37800000,0x37000000,
 +        0x37000000,0x36800000,0x36800000,0x36000000,
 +        0x36000000,0x35800000,0x35800000,0x35000000,
 +        0x35000000,0x34800000,0x34800000,0x34000000,
 +        0x34000000,0x33800000,0x33800000,0x33000000,
 +        0x33000000,0x32800000,0x32800000,0x32000000,
 +        0x32000000,0x31800000,0x31800000,0x31000000,
 +        0x31000000,0x30800000,0x30800000,0x30000000,
 +        0x30000000,0x2f800000,0x2f800000,0x2f000000,
 +        0x2f000000,0x2e800000,0x2e800000,0x2e000000,
 +        0x2e000000,0x2d800000,0x2d800000,0x2d000000,
 +        0x2d000000,0x2c800000,0x2c800000,0x2c000000,
 +        0x2c000000,0x2b800000,0x2b800000,0x2b000000,
 +        0x2b000000,0x2a800000,0x2a800000,0x2a000000,
 +        0x2a000000,0x29800000,0x29800000,0x29000000,
 +        0x29000000,0x28800000,0x28800000,0x28000000,
 +        0x28000000,0x27800000,0x27800000,0x27000000,
 +        0x27000000,0x26800000,0x26800000,0x26000000,
 +        0x26000000,0x25800000,0x25800000,0x25000000,
 +        0x25000000,0x24800000,0x24800000,0x24000000,
 +        0x24000000,0x23800000,0x23800000,0x23000000,
 +        0x23000000,0x22800000,0x22800000,0x22000000,
 +        0x22000000,0x21800000,0x21800000,0x21000000,
 +        0x21000000,0x20800000,0x20800000,0x20000000,
 +        0x20000000,0x1f800000,0x1f800000,0x1f000000 
 +    } ,
 +    /* data for fraction table - 4096 floats */
 +    {
 +        0x3504f3,0x34f9a4,0x34ee57,0x34e30c,0x34d7c3,0x34cc7c,0x34c137,0x34b5f5,
 +        0x34aab4,0x349f76,0x34943a,0x348900,0x347dc7,0x347291,0x34675e,0x345c2c,
 +        0x3450fc,0x3445ce,0x343aa3,0x342f79,0x342452,0x34192c,0x340e09,0x3402e8,
 +        0x33f7c9,0x33ecac,0x33e191,0x33d678,0x33cb61,0x33c04c,0x33b539,0x33aa28,
 +        0x339f19,0x33940d,0x338902,0x337df9,0x3372f3,0x3367ee,0x335cec,0x3351eb,
 +        0x3346ed,0x333bf0,0x3330f6,0x3325fd,0x331b07,0x331013,0x330520,0x32fa30,
 +        0x32ef41,0x32e455,0x32d96b,0x32ce82,0x32c39c,0x32b8b7,0x32add5,0x32a2f5,
 +        0x329816,0x328d3a,0x32825f,0x327787,0x326cb0,0x3261dc,0x325709,0x324c38,
 +        0x32416a,0x32369d,0x322bd2,0x32210a,0x321643,0x320b7e,0x3200bb,0x31f5fa,
 +        0x31eb3b,0x31e07e,0x31d5c3,0x31cb0a,0x31c053,0x31b59d,0x31aaea,0x31a038,
 +        0x319589,0x318adb,0x318030,0x317586,0x316ade,0x316038,0x315594,0x314af2,
 +        0x314052,0x3135b4,0x312b18,0x31207d,0x3115e5,0x310b4e,0x3100b9,0x30f627,
 +        0x30eb96,0x30e107,0x30d67a,0x30cbee,0x30c165,0x30b6dd,0x30ac58,0x30a1d4,
 +        0x309752,0x308cd2,0x308254,0x3077d8,0x306d5e,0x3062e5,0x30586e,0x304dfa,
 +        0x304387,0x303916,0x302ea7,0x302439,0x3019ce,0x300f64,0x3004fc,0x2ffa96,
 +        0x2ff032,0x2fe5d0,0x2fdb6f,0x2fd111,0x2fc6b4,0x2fbc59,0x2fb200,0x2fa7a9,
 +        0x2f9d53,0x2f9300,0x2f88ae,0x2f7e5e,0x2f7410,0x2f69c3,0x2f5f79,0x2f5530,
 +        0x2f4ae9,0x2f40a4,0x2f3661,0x2f2c1f,0x2f21df,0x2f17a1,0x2f0d65,0x2f032b,
 +        0x2ef8f2,0x2eeebc,0x2ee487,0x2eda53,0x2ed022,0x2ec5f2,0x2ebbc5,0x2eb199,
 +        0x2ea76e,0x2e9d46,0x2e931f,0x2e88fa,0x2e7ed7,0x2e74b5,0x2e6a96,0x2e6078,
 +        0x2e565c,0x2e4c41,0x2e4229,0x2e3812,0x2e2dfd,0x2e23e9,0x2e19d8,0x2e0fc8,
 +        0x2e05ba,0x2dfbad,0x2df1a3,0x2de79a,0x2ddd93,0x2dd38d,0x2dc989,0x2dbf87,
 +        0x2db587,0x2dab89,0x2da18c,0x2d9791,0x2d8d97,0x2d83a0,0x2d79aa,0x2d6fb6,
 +        0x2d65c3,0x2d5bd2,0x2d51e3,0x2d47f6,0x2d3e0a,0x2d3420,0x2d2a38,0x2d2051,
 +        0x2d166c,0x2d0c89,0x2d02a8,0x2cf8c8,0x2ceeea,0x2ce50d,0x2cdb33,0x2cd15a,
 +        0x2cc782,0x2cbdad,0x2cb3d9,0x2caa06,0x2ca036,0x2c9667,0x2c8c99,0x2c82ce,
 +        0x2c7904,0x2c6f3b,0x2c6575,0x2c5bb0,0x2c51ed,0x2c482b,0x2c3e6b,0x2c34ad,
 +        0x2c2af0,0x2c2135,0x2c177b,0x2c0dc4,0x2c040e,0x2bfa59,0x2bf0a6,0x2be6f5,
 +        0x2bdd46,0x2bd398,0x2bc9eb,0x2bc041,0x2bb698,0x2bacf0,0x2ba34b,0x2b99a6,
 +        0x2b9004,0x2b8663,0x2b7cc4,0x2b7326,0x2b698a,0x2b5ff0,0x2b5657,0x2b4cc0,
 +        0x2b432a,0x2b3996,0x2b3004,0x2b2673,0x2b1ce4,0x2b1357,0x2b09cb,0x2b0040,
 +        0x2af6b7,0x2aed30,0x2ae3ab,0x2ada27,0x2ad0a4,0x2ac724,0x2abda4,0x2ab427,
 +        0x2aaaab,0x2aa130,0x2a97b7,0x2a8e40,0x2a84ca,0x2a7b56,0x2a71e3,0x2a6872,
 +        0x2a5f03,0x2a5595,0x2a4c29,0x2a42be,0x2a3955,0x2a2fed,0x2a2687,0x2a1d23,
 +        0x2a13c0,0x2a0a5e,0x2a00fe,0x29f7a0,0x29ee43,0x29e4e8,0x29db8e,0x29d236,
 +        0x29c8e0,0x29bf8b,0x29b637,0x29ace5,0x29a395,0x299a46,0x2990f8,0x2987ad,
 +        0x297e62,0x297519,0x296bd2,0x29628c,0x295948,0x295005,0x2946c4,0x293d85,
 +        0x293446,0x292b0a,0x2921cf,0x291895,0x290f5d,0x290626,0x28fcf1,0x28f3be,
 +        0x28ea8c,0x28e15b,0x28d82c,0x28cefe,0x28c5d2,0x28bca8,0x28b37f,0x28aa57,
 +        0x28a131,0x28980c,0x288ee9,0x2885c7,0x287ca7,0x287389,0x286a6b,0x286150,
 +        0x285835,0x284f1c,0x284605,0x283cef,0x2833db,0x282ac8,0x2821b7,0x2818a7,
 +        0x280f98,0x28068b,0x27fd80,0x27f475,0x27eb6d,0x27e266,0x27d960,0x27d05c,
 +        0x27c759,0x27be57,0x27b557,0x27ac59,0x27a35c,0x279a60,0x279166,0x27886d,
 +        0x277f76,0x277680,0x276d8c,0x276499,0x275ba7,0x2752b7,0x2749c9,0x2740db,
 +        0x2737f0,0x272f05,0x27261c,0x271d35,0x27144f,0x270b6a,0x270287,0x26f9a5,
 +        0x26f0c4,0x26e7e5,0x26df08,0x26d62c,0x26cd51,0x26c477,0x26bba0,0x26b2c9,
 +        0x26a9f4,0x26a120,0x26984e,0x268f7d,0x2686ad,0x267ddf,0x267512,0x266c47,
 +        0x26637d,0x265ab4,0x2651ed,0x264927,0x264063,0x2637a0,0x262ede,0x26261e,
 +        0x261d5f,0x2614a2,0x260be6,0x26032b,0x25fa72,0x25f1ba,0x25e903,0x25e04e,
 +        0x25d79a,0x25cee7,0x25c636,0x25bd87,0x25b4d8,0x25ac2b,0x25a37f,0x259ad5,
 +        0x25922c,0x258985,0x2580de,0x257839,0x256f96,0x2566f4,0x255e53,0x2555b3,
 +        0x254d15,0x254479,0x253bdd,0x253343,0x252aaa,0x252213,0x25197d,0x2510e8,
 +        0x250855,0x24ffc3,0x24f732,0x24eea3,0x24e615,0x24dd88,0x24d4fc,0x24cc72,
 +        0x24c3ea,0x24bb62,0x24b2dc,0x24aa57,0x24a1d4,0x249952,0x2490d1,0x248852,
 +        0x247fd3,0x247756,0x246edb,0x246661,0x245de8,0x245570,0x244cfa,0x244485,
 +        0x243c11,0x24339f,0x242b2e,0x2422be,0x241a4f,0x2411e2,0x240976,0x24010c,
 +        0x23f8a2,0x23f03a,0x23e7d4,0x23df6e,0x23d70a,0x23cea7,0x23c646,0x23bde6,
 +        0x23b587,0x23ad29,0x23a4cc,0x239c71,0x239417,0x238bbf,0x238368,0x237b12,
 +        0x2372bd,0x236a69,0x236217,0x2359c6,0x235177,0x234928,0x2340db,0x23388f,
 +        0x233045,0x2327fb,0x231fb3,0x23176c,0x230f27,0x2306e2,0x22fe9f,0x22f65e,
 +        0x22ee1d,0x22e5de,0x22dda0,0x22d563,0x22cd28,0x22c4ed,0x22bcb4,0x22b47c,
 +        0x22ac46,0x22a411,0x229bdd,0x2293aa,0x228b78,0x228348,0x227b19,0x2272eb,
 +        0x226abe,0x226293,0x225a69,0x225240,0x224a18,0x2241f2,0x2239cc,0x2231a8,
 +        0x222985,0x222164,0x221944,0x221124,0x220907,0x2200ea,0x21f8ce,0x21f0b4,
 +        0x21e89b,0x21e083,0x21d86d,0x21d057,0x21c843,0x21c030,0x21b81e,0x21b00e,
 +        0x21a7fe,0x219ff0,0x2197e3,0x218fd8,0x2187cd,0x217fc4,0x2177bc,0x216fb5,
 +        0x2167af,0x215faa,0x2157a7,0x214fa5,0x2147a4,0x213fa4,0x2137a5,0x212fa8,
 +        0x2127ac,0x211fb1,0x2117b7,0x210fbe,0x2107c7,0x20ffd0,0x20f7db,0x20efe7,
 +        0x20e7f5,0x20e003,0x20d813,0x20d023,0x20c835,0x20c048,0x20b85d,0x20b072,
 +        0x20a889,0x20a0a1,0x2098ba,0x2090d4,0x2088ef,0x20810b,0x207929,0x207148,
 +        0x206968,0x206189,0x2059ab,0x2051cf,0x2049f3,0x204219,0x203a40,0x203268,
 +        0x202a91,0x2022bb,0x201ae7,0x201313,0x200b41,0x200370,0x1ffba0,0x1ff3d1,
 +        0x1fec04,0x1fe437,0x1fdc6c,0x1fd4a2,0x1fccd9,0x1fc511,0x1fbd4a,0x1fb584,
 +        0x1fadc0,0x1fa5fc,0x1f9e3a,0x1f9679,0x1f8eb9,0x1f86fa,0x1f7f3c,0x1f777f,
 +        0x1f6fc4,0x1f680a,0x1f6050,0x1f5898,0x1f50e1,0x1f492b,0x1f4176,0x1f39c3,
 +        0x1f3210,0x1f2a5f,0x1f22af,0x1f1aff,0x1f1351,0x1f0ba4,0x1f03f8,0x1efc4e,
 +        0x1ef4a4,0x1eecfb,0x1ee554,0x1eddae,0x1ed608,0x1ece64,0x1ec6c1,0x1ebf1f,
 +        0x1eb77f,0x1eafdf,0x1ea840,0x1ea0a3,0x1e9906,0x1e916b,0x1e89d1,0x1e8238,
 +        0x1e7aa0,0x1e7309,0x1e6b73,0x1e63de,0x1e5c4a,0x1e54b8,0x1e4d26,0x1e4596,
 +        0x1e3e06,0x1e3678,0x1e2eeb,0x1e275f,0x1e1fd4,0x1e184a,0x1e10c1,0x1e0939,
 +        0x1e01b3,0x1dfa2d,0x1df2a8,0x1deb25,0x1de3a2,0x1ddc21,0x1dd4a1,0x1dcd22,
 +        0x1dc5a3,0x1dbe26,0x1db6aa,0x1daf2f,0x1da7b6,0x1da03d,0x1d98c5,0x1d914e,
 +        0x1d89d9,0x1d8264,0x1d7af1,0x1d737e,0x1d6c0d,0x1d649c,0x1d5d2d,0x1d55bf,
 +        0x1d4e52,0x1d46e5,0x1d3f7a,0x1d3810,0x1d30a7,0x1d293f,0x1d21d8,0x1d1a73,
 +        0x1d130e,0x1d0baa,0x1d0447,0x1cfce6,0x1cf585,0x1cee25,0x1ce6c7,0x1cdf69,
 +        0x1cd80d,0x1cd0b1,0x1cc957,0x1cc1fe,0x1cbaa5,0x1cb34e,0x1cabf8,0x1ca4a2,
 +        0x1c9d4e,0x1c95fb,0x1c8ea9,0x1c8758,0x1c8008,0x1c78b8,0x1c716a,0x1c6a1d,
 +        0x1c62d1,0x1c5b86,0x1c543c,0x1c4cf3,0x1c45ab,0x1c3e65,0x1c371f,0x1c2fda,
 +        0x1c2896,0x1c2153,0x1c1a11,0x1c12d0,0x1c0b90,0x1c0452,0x1bfd14,0x1bf5d7,
 +        0x1bee9b,0x1be760,0x1be027,0x1bd8ee,0x1bd1b6,0x1bca7f,0x1bc349,0x1bbc15,
 +        0x1bb4e1,0x1badae,0x1ba67c,0x1b9f4c,0x1b981c,0x1b90ed,0x1b89bf,0x1b8292,
 +        0x1b7b67,0x1b743c,0x1b6d12,0x1b65e9,0x1b5ec1,0x1b579a,0x1b5074,0x1b4950,
 +        0x1b422c,0x1b3b09,0x1b33e7,0x1b2cc6,0x1b25a6,0x1b1e87,0x1b1769,0x1b104c,
 +        0x1b0930,0x1b0215,0x1afafb,0x1af3e2,0x1aecc9,0x1ae5b2,0x1ade9c,0x1ad787,
 +        0x1ad073,0x1ac95f,0x1ac24d,0x1abb3c,0x1ab42b,0x1aad1c,0x1aa60d,0x1a9f00,
 +        0x1a97f3,0x1a90e8,0x1a89dd,0x1a82d4,0x1a7bcb,0x1a74c3,0x1a6dbd,0x1a66b7,
 +        0x1a5fb2,0x1a58ae,0x1a51ab,0x1a4aa9,0x1a43a8,0x1a3ca8,0x1a35a9,0x1a2eab,
 +        0x1a27ae,0x1a20b1,0x1a19b6,0x1a12bc,0x1a0bc2,0x1a04ca,0x19fdd2,0x19f6dc,
 +        0x19efe6,0x19e8f2,0x19e1fe,0x19db0b,0x19d419,0x19cd28,0x19c638,0x19bf49,
 +        0x19b85b,0x19b16e,0x19aa82,0x19a396,0x199cac,0x1995c3,0x198eda,0x1987f3,
 +        0x19810c,0x197a26,0x197342,0x196c5e,0x19657b,0x195e99,0x1957b8,0x1950d8,
 +        0x1949f8,0x19431a,0x193c3d,0x193560,0x192e85,0x1927aa,0x1920d1,0x1919f8,
 +        0x191320,0x190c49,0x190573,0x18fe9e,0x18f7ca,0x18f0f7,0x18ea24,0x18e353,
 +        0x18dc82,0x18d5b3,0x18cee4,0x18c816,0x18c149,0x18ba7d,0x18b3b2,0x18ace8,
 +        0x18a61f,0x189f56,0x18988f,0x1891c8,0x188b03,0x18843e,0x187d7a,0x1876b7,
 +        0x186ff5,0x186934,0x186274,0x185bb4,0x1854f6,0x184e38,0x18477c,0x1840c0,
 +        0x183a05,0x18334b,0x182c92,0x1825da,0x181f23,0x18186c,0x1811b7,0x180b02,
 +        0x18044e,0x17fd9b,0x17f6e9,0x17f038,0x17e988,0x17e2d9,0x17dc2a,0x17d57d,
 +        0x17ced0,0x17c824,0x17c179,0x17bacf,0x17b426,0x17ad7e,0x17a6d6,0x17a030,
 +        0x17998a,0x1792e5,0x178c41,0x17859e,0x177efc,0x17785b,0x1771ba,0x176b1b,
 +        0x17647c,0x175dde,0x175741,0x1750a5,0x174a0a,0x17436f,0x173cd6,0x17363d,
 +        0x172fa5,0x17290f,0x172278,0x171be3,0x17154f,0x170ebb,0x170829,0x170197,
 +        0x16fb06,0x16f476,0x16ede7,0x16e759,0x16e0cb,0x16da3e,0x16d3b3,0x16cd28,
 +        0x16c69e,0x16c014,0x16b98c,0x16b305,0x16ac7e,0x16a5f8,0x169f73,0x1698ef,
 +        0x16926c,0x168be9,0x168568,0x167ee7,0x167867,0x1671e8,0x166b6a,0x1664ec,
 +        0x165e70,0x1657f4,0x165179,0x164aff,0x164486,0x163e0d,0x163796,0x16311f,
 +        0x162aa9,0x162434,0x161dc0,0x16174d,0x1610da,0x160a68,0x1603f8,0x15fd88,
 +        0x15f718,0x15f0aa,0x15ea3c,0x15e3d0,0x15dd64,0x15d6f9,0x15d08e,0x15ca25,
 +        0x15c3bc,0x15bd55,0x15b6ee,0x15b087,0x15aa22,0x15a3be,0x159d5a,0x1596f7,
 +        0x159095,0x158a34,0x1583d3,0x157d74,0x157715,0x1570b7,0x156a5a,0x1563fd,
 +        0x155da2,0x155747,0x1550ed,0x154a94,0x15443c,0x153de4,0x15378e,0x153138,
 +        0x152ae3,0x15248e,0x151e3b,0x1517e8,0x151197,0x150b45,0x1504f5,0x14fea6,
 +        0x14f857,0x14f209,0x14ebbc,0x14e570,0x14df25,0x14d8da,0x14d290,0x14cc47,
 +        0x14c5ff,0x14bfb7,0x14b971,0x14b32b,0x14ace6,0x14a6a1,0x14a05e,0x149a1b,
 +        0x1493d9,0x148d98,0x148758,0x148118,0x147ada,0x14749c,0x146e5f,0x146822,
 +        0x1461e7,0x145bac,0x145572,0x144f38,0x144900,0x1442c8,0x143c91,0x14365b,
 +        0x143026,0x1429f1,0x1423be,0x141d8b,0x141758,0x141127,0x140af6,0x1404c6,
 +        0x13fe97,0x13f869,0x13f23b,0x13ec0f,0x13e5e3,0x13dfb7,0x13d98d,0x13d363,
 +        0x13cd3a,0x13c712,0x13c0eb,0x13bac4,0x13b49e,0x13ae79,0x13a855,0x13a231,
 +        0x139c0e,0x1395ec,0x138fcb,0x1389ab,0x13838b,0x137d6c,0x13774e,0x137130,
 +        0x136b13,0x1364f8,0x135edc,0x1358c2,0x1352a8,0x134c8f,0x134677,0x134060,
 +        0x133a49,0x133433,0x132e1e,0x13280a,0x1321f6,0x131be3,0x1315d1,0x130fc0,
 +        0x1309af,0x13039f,0x12fd90,0x12f782,0x12f174,0x12eb67,0x12e55b,0x12df50,
 +        0x12d945,0x12d33b,0x12cd32,0x12c72a,0x12c122,0x12bb1b,0x12b515,0x12af10,
 +        0x12a90b,0x12a307,0x129d04,0x129702,0x129100,0x128aff,0x1284ff,0x127eff,
 +        0x127900,0x127302,0x126d05,0x126708,0x12610d,0x125b11,0x125517,0x124f1d,
 +        0x124925,0x12432c,0x123d35,0x12373e,0x123148,0x122b53,0x12255e,0x121f6b,
 +        0x121978,0x121385,0x120d94,0x1207a3,0x1201b3,0x11fbc3,0x11f5d4,0x11efe6,
 +        0x11e9f9,0x11e40d,0x11de21,0x11d836,0x11d24b,0x11cc62,0x11c679,0x11c090,
 +        0x11baa9,0x11b4c2,0x11aedc,0x11a8f7,0x11a312,0x119d2e,0x11974b,0x119168,
 +        0x118b87,0x1185a6,0x117fc5,0x1179e5,0x117407,0x116e28,0x11684b,0x11626e,
 +        0x115c92,0x1156b6,0x1150dc,0x114b02,0x114529,0x113f50,0x113978,0x1133a1,
 +        0x112dca,0x1127f5,0x112220,0x111c4b,0x111678,0x1110a5,0x110ad3,0x110501,
 +        0x10ff30,0x10f960,0x10f391,0x10edc2,0x10e7f4,0x10e226,0x10dc5a,0x10d68e,
 +        0x10d0c3,0x10caf8,0x10c52e,0x10bf65,0x10b99c,0x10b3d5,0x10ae0e,0x10a847,
 +        0x10a281,0x109cbc,0x1096f8,0x109134,0x108b72,0x1085af,0x107fee,0x107a2d,
 +        0x10746d,0x106ead,0x1068ee,0x106330,0x105d73,0x1057b6,0x1051fa,0x104c3e,
 +        0x104684,0x1040ca,0x103b10,0x103558,0x102fa0,0x1029e8,0x102432,0x101e7c,
 +        0x1018c6,0x101312,0x100d5e,0x1007ab,0x1001f8,0xffc46,0xff695,0xff0e4,
 +        0xfeb35,0xfe585,0xfdfd7,0xfda29,0xfd47c,0xfcecf,0xfc923,0xfc378,
 +        0xfbdce,0xfb824,0xfb27b,0xfacd2,0xfa72a,0xfa183,0xf9bdd,0xf9637,
 +        0xf9092,0xf8aed,0xf854a,0xf7fa6,0xf7a04,0xf7462,0xf6ec1,0xf6920,
 +        0xf6381,0xf5de1,0xf5843,0xf52a5,0xf4d08,0xf476b,0xf41cf,0xf3c34,
 +        0xf369a,0xf3100,0xf2b66,0xf25ce,0xf2036,0xf1a9f,0xf1508,0xf0f72,
 +        0xf09dd,0xf0448,0xefeb4,0xef921,0xef38e,0xeedfc,0xee86b,0xee2da,
 +        0xedd4a,0xed7ba,0xed22b,0xecc9d,0xec710,0xec183,0xebbf7,0xeb66b,
 +        0xeb0e0,0xeab56,0xea5cc,0xea043,0xe9abb,0xe9533,0xe8fac,0xe8a26,
 +        0xe84a0,0xe7f1b,0xe7996,0xe7413,0xe6e8f,0xe690d,0xe638b,0xe5e0a,
 +        0xe5889,0xe5309,0xe4d8a,0xe480b,0xe428d,0xe3d0f,0xe3792,0xe3216,
 +        0xe2c9b,0xe2720,0xe21a5,0xe1c2c,0xe16b3,0xe113a,0xe0bc3,0xe064c,
 +        0xe00d5,0xdfb5f,0xdf5ea,0xdf075,0xdeb01,0xde58e,0xde01b,0xddaa9,
 +        0xdd538,0xdcfc7,0xdca57,0xdc4e7,0xdbf78,0xdba0a,0xdb49c,0xdaf2f,
 +        0xda9c2,0xda457,0xd9eeb,0xd9981,0xd9417,0xd8ead,0xd8945,0xd83dc,
 +        0xd7e75,0xd790e,0xd73a8,0xd6e42,0xd68dd,0xd6379,0xd5e15,0xd58b2,
 +        0xd534f,0xd4ded,0xd488c,0xd432b,0xd3dcb,0xd386c,0xd330d,0xd2dae,
 +        0xd2851,0xd22f4,0xd1d97,0xd183b,0xd12e0,0xd0d86,0xd082c,0xd02d2,
 +        0xcfd79,0xcf821,0xcf2ca,0xced73,0xce81c,0xce2c7,0xcdd72,0xcd81d,
 +        0xcd2c9,0xccd76,0xcc823,0xcc2d1,0xcbd7f,0xcb82f,0xcb2de,0xcad8f,
 +        0xca83f,0xca2f1,0xc9da3,0xc9856,0xc9309,0xc8dbd,0xc8871,0xc8326,
 +        0xc7ddc,0xc7892,0xc7349,0xc6e01,0xc68b9,0xc6372,0xc5e2b,0xc58e5,
 +        0xc539f,0xc4e5a,0xc4916,0xc43d2,0xc3e8f,0xc394c,0xc340a,0xc2ec9,
 +        0xc2988,0xc2448,0xc1f08,0xc19c9,0xc148b,0xc0f4d,0xc0a10,0xc04d3,
 +        0xbff97,0xbfa5b,0xbf521,0xbefe6,0xbeaad,0xbe573,0xbe03b,0xbdb03,
 +        0xbd5cb,0xbd095,0xbcb5e,0xbc629,0xbc0f4,0xbbbbf,0xbb68b,0xbb158,
 +        0xbac25,0xba6f3,0xba1c1,0xb9c90,0xb9760,0xb9230,0xb8d01,0xb87d2,
 +        0xb82a4,0xb7d76,0xb7849,0xb731d,0xb6df1,0xb68c6,0xb639b,0xb5e71,
 +        0xb5948,0xb541f,0xb4ef6,0xb49cf,0xb44a7,0xb3f81,0xb3a5b,0xb3535,
 +        0xb3010,0xb2aec,0xb25c8,0xb20a5,0xb1b82,0xb1660,0xb113e,0xb0c1d,
 +        0xb06fd,0xb01dd,0xafcbe,0xaf79f,0xaf281,0xaed64,0xae847,0xae32a,
 +        0xade0e,0xad8f3,0xad3d8,0xacebe,0xac9a4,0xac48b,0xabf73,0xaba5b,
 +        0xab544,0xab02d,0xaab17,0xaa601,0xaa0ec,0xa9bd7,0xa96c3,0xa91b0,
 +        0xa8c9d,0xa878a,0xa8279,0xa7d67,0xa7857,0xa7347,0xa6e37,0xa6928,
 +        0xa641a,0xa5f0c,0xa59fe,0xa54f2,0xa4fe5,0xa4ada,0xa45ce,0xa40c4,
 +        0xa3bba,0xa36b0,0xa31a7,0xa2c9f,0xa2797,0xa2290,0xa1d89,0xa1883,
 +        0xa137d,0xa0e78,0xa0974,0xa0470,0x9ff6c,0x9fa69,0x9f567,0x9f065,
 +        0x9eb64,0x9e663,0x9e163,0x9dc63,0x9d764,0x9d266,0x9cd68,0x9c86a,
 +        0x9c36d,0x9be71,0x9b975,0x9b47a,0x9af7f,0x9aa85,0x9a58b,0x9a092,
 +        0x99b9a,0x996a1,0x991aa,0x98cb3,0x987bd,0x982c7,0x97dd1,0x978dc,
 +        0x973e8,0x96ef4,0x96a01,0x9650e,0x9601c,0x95b2b,0x9563a,0x95149,
 +        0x94c59,0x94769,0x9427a,0x93d8c,0x9389e,0x933b1,0x92ec4,0x929d8,
 +        0x924ec,0x92001,0x91b16,0x9162c,0x91142,0x90c59,0x90770,0x90288,
 +        0x8fda1,0x8f8ba,0x8f3d3,0x8eeed,0x8ea08,0x8e523,0x8e03e,0x8db5b,
 +        0x8d677,0x8d194,0x8ccb2,0x8c7d0,0x8c2ef,0x8be0e,0x8b92e,0x8b44e,
 +        0x8af6f,0x8aa91,0x8a5b2,0x8a0d5,0x89bf8,0x8971b,0x8923f,0x88d64,
 +        0x88889,0x883ae,0x87ed4,0x879fb,0x87522,0x87049,0x86b71,0x8669a,
 +        0x861c3,0x85ced,0x85817,0x85341,0x84e6d,0x84998,0x844c5,0x83ff1,
 +        0x83b1e,0x8364c,0x8317a,0x82ca9,0x827d8,0x82308,0x81e39,0x81969,
 +        0x8149b,0x80fcd,0x80aff,0x80632,0x80165,0x7fc99,0x7f7cd,0x7f302,
 +        0x7ee37,0x7e96d,0x7e4a4,0x7dfdb,0x7db12,0x7d64a,0x7d182,0x7ccbb,
 +        0x7c7f5,0x7c32f,0x7be69,0x7b9a4,0x7b4df,0x7b01b,0x7ab58,0x7a695,
 +        0x7a1d2,0x79d10,0x7984f,0x7938e,0x78ecd,0x78a0d,0x7854d,0x7808e,
 +        0x77bd0,0x77712,0x77254,0x76d97,0x768da,0x7641e,0x75f63,0x75aa8,
 +        0x755ed,0x75133,0x74c79,0x747c0,0x74308,0x73e50,0x73998,0x734e1,
 +        0x7302a,0x72b74,0x726be,0x72209,0x71d55,0x718a0,0x713ed,0x70f3a,
 +        0x70a87,0x705d5,0x70123,0x6fc72,0x6f7c1,0x6f311,0x6ee61,0x6e9b2,
 +        0x6e503,0x6e055,0x6dba7,0x6d6f9,0x6d24d,0x6cda0,0x6c8f4,0x6c449,
 +        0x6bf9e,0x6baf4,0x6b64a,0x6b1a0,0x6acf7,0x6a84f,0x6a3a7,0x69eff,
 +        0x69a58,0x695b2,0x6910c,0x68c66,0x687c1,0x6831d,0x67e78,0x679d5,
 +        0x67532,0x6708f,0x66bed,0x6674b,0x662aa,0x65e09,0x65969,0x654c9,
 +        0x65029,0x64b8a,0x646ec,0x6424e,0x63db1,0x63914,0x63477,0x62fdb,
 +        0x62b40,0x626a5,0x6220a,0x61d70,0x618d6,0x6143d,0x60fa4,0x60b0c,
 +        0x60674,0x601dd,0x5fd46,0x5f8b0,0x5f41a,0x5ef85,0x5eaf0,0x5e65b,
 +        0x5e1c7,0x5dd34,0x5d8a1,0x5d40e,0x5cf7c,0x5caea,0x5c659,0x5c1c9,
 +        0x5bd38,0x5b8a9,0x5b419,0x5af8a,0x5aafc,0x5a66e,0x5a1e1,0x59d54,
 +        0x598c7,0x5943b,0x58fb0,0x58b24,0x5869a,0x58210,0x57d86,0x578fd,
 +        0x57474,0x56feb,0x56b64,0x566dc,0x56255,0x55dcf,0x55949,0x554c3,
 +        0x5503e,0x54bb9,0x54735,0x542b1,0x53e2e,0x539ab,0x53529,0x530a7,
 +        0x52c25,0x527a4,0x52324,0x51ea4,0x51a24,0x515a5,0x51126,0x50ca8,
 +        0x5082a,0x503ad,0x4ff30,0x4fab4,0x4f638,0x4f1bc,0x4ed41,0x4e8c6,
 +        0x4e44c,0x4dfd3,0x4db59,0x4d6e0,0x4d268,0x4cdf0,0x4c979,0x4c502,
 +        0x4c08b,0x4bc15,0x4b79f,0x4b32a,0x4aeb5,0x4aa41,0x4a5cd,0x4a15a,
 +        0x49ce7,0x49874,0x49402,0x48f91,0x48b1f,0x486af,0x4823e,0x47dce,
 +        0x4795f,0x474f0,0x47082,0x46c14,0x467a6,0x46339,0x45ecc,0x45a60,
 +        0x455f4,0x45189,0x44d1e,0x448b3,0x44449,0x43fdf,0x43b76,0x4370d,
 +        0x432a5,0x42e3d,0x429d6,0x4256f,0x42108,0x41ca2,0x4183c,0x413d7,
 +        0x40f72,0x40b0e,0x406aa,0x40247,0x3fde4,0x3f981,0x3f51f,0x3f0bd,
 +        0x3ec5c,0x3e7fb,0x3e39b,0x3df3b,0x3dadb,0x3d67c,0x3d21d,0x3cdbf,
 +        0x3c961,0x3c504,0x3c0a7,0x3bc4a,0x3b7ee,0x3b393,0x3af37,0x3aadd,
 +        0x3a682,0x3a228,0x39dcf,0x39976,0x3951d,0x390c5,0x38c6d,0x38816,
 +        0x383bf,0x37f69,0x37b13,0x376bd,0x37268,0x36e13,0x369bf,0x3656b,
 +        0x36117,0x35cc4,0x35872,0x3541f,0x34fce,0x34b7c,0x3472b,0x342db,
 +        0x33e8b,0x33a3b,0x335ec,0x3319d,0x32d4f,0x32901,0x324b3,0x32066,
 +        0x31c1a,0x317cd,0x31381,0x30f36,0x30aeb,0x306a1,0x30256,0x2fe0d,
 +        0x2f9c3,0x2f57a,0x2f132,0x2ecea,0x2e8a2,0x2e45b,0x2e014,0x2dbce,
 +        0x2d788,0x2d343,0x2cefd,0x2cab9,0x2c675,0x2c231,0x2bded,0x2b9aa,
 +        0x2b568,0x2b125,0x2ace4,0x2a8a2,0x2a461,0x2a021,0x29be1,0x297a1,
 +        0x29362,0x28f23,0x28ae4,0x286a6,0x28269,0x27e2c,0x279ef,0x275b2,
 +        0x27176,0x26d3b,0x26900,0x264c5,0x2608b,0x25c51,0x25817,0x253de,
 +        0x24fa6,0x24b6d,0x24735,0x242fe,0x23ec7,0x23a90,0x2365a,0x23224,
 +        0x22def,0x229ba,0x22585,0x22151,0x21d1d,0x218ea,0x214b7,0x21084,
 +        0x20c52,0x20821,0x203ef,0x1ffbe,0x1fb8e,0x1f75e,0x1f32e,0x1eeff,
 +        0x1ead0,0x1e6a1,0x1e273,0x1de45,0x1da18,0x1d5eb,0x1d1bf,0x1cd93,
 +        0x1c967,0x1c53c,0x1c111,0x1bce6,0x1b8bc,0x1b493,0x1b069,0x1ac40,
 +        0x1a818,0x1a3f0,0x19fc8,0x19ba1,0x1977a,0x19354,0x18f2d,0x18b08,
 +        0x186e2,0x182be,0x17e99,0x17a75,0x17651,0x1722e,0x16e0b,0x169e9,
 +        0x165c6,0x161a5,0x15d83,0x15963,0x15542,0x15122,0x14d02,0x148e3,
 +        0x144c4,0x140a5,0x13c87,0x13869,0x1344c,0x1302f,0x12c12,0x127f6,
 +        0x123da,0x11fbf,0x11ba4,0x11789,0x1136f,0x10f55,0x10b3c,0x10723,
 +        0x1030a,0xfef2,0xfada,0xf6c2,0xf2ab,0xee95,0xea7e,0xe668,
 +        0xe253,0xde3e,0xda29,0xd614,0xd200,0xcded,0xc9da,0xc5c7,
 +        0xc1b4,0xbda2,0xb990,0xb57f,0xb16e,0xad5e,0xa94e,0xa53e,
 +        0xa12e,0x9d1f,0x9911,0x9503,0x90f5,0x8ce7,0x88da,0x84ce,
 +        0x80c1,0x7cb5,0x78aa,0x749f,0x7094,0x6c89,0x687f,0x6476,
 +        0x606d,0x5c64,0x585b,0x5453,0x504b,0x4c44,0x483d,0x4436,
 +        0x4030,0x3c2a,0x3825,0x3420,0x301b,0x2c17,0x2813,0x240f,
 +        0x200c,0x1c09,0x1807,0x1405,0x1003,0xc02,0x801,0x400,
 +        0x7fffff,0x7ff001,0x7fe006,0x7fd00d,0x7fc018,0x7fb025,0x7fa036,0x7f9049,
 +        0x7f8060,0x7f7079,0x7f6095,0x7f50b5,0x7f40d7,0x7f30fc,0x7f2124,0x7f114f,
 +        0x7f017e,0x7ef1af,0x7ee1e2,0x7ed219,0x7ec253,0x7eb290,0x7ea2d0,0x7e9312,
 +        0x7e8358,0x7e73a0,0x7e63eb,0x7e543a,0x7e448b,0x7e34df,0x7e2536,0x7e1590,
 +        0x7e05ec,0x7df64c,0x7de6ae,0x7dd714,0x7dc77c,0x7db7e7,0x7da855,0x7d98c6,
 +        0x7d893a,0x7d79b0,0x7d6a2a,0x7d5aa6,0x7d4b25,0x7d3ba7,0x7d2c2c,0x7d1cb3,
 +        0x7d0d3e,0x7cfdcb,0x7cee5b,0x7cdeee,0x7ccf84,0x7cc01d,0x7cb0b8,0x7ca156,
 +        0x7c91f7,0x7c829b,0x7c7342,0x7c63eb,0x7c5497,0x7c4546,0x7c35f8,0x7c26ad,
 +        0x7c1764,0x7c081e,0x7bf8db,0x7be99b,0x7bda5d,0x7bcb23,0x7bbbeb,0x7bacb5,
 +        0x7b9d83,0x7b8e53,0x7b7f26,0x7b6ffc,0x7b60d4,0x7b51b0,0x7b428e,0x7b336e,
 +        0x7b2452,0x7b1538,0x7b0621,0x7af70c,0x7ae7fb,0x7ad8ec,0x7ac9e0,0x7abad6,
 +        0x7aabcf,0x7a9ccb,0x7a8dca,0x7a7ecb,0x7a6fcf,0x7a60d5,0x7a51df,0x7a42eb,
 +        0x7a33f9,0x7a250b,0x7a161f,0x7a0735,0x79f84f,0x79e96b,0x79da89,0x79cbab,
 +        0x79bccf,0x79adf5,0x799f1f,0x79904a,0x798179,0x7972aa,0x7963de,0x795515,
 +        0x79464e,0x793789,0x7928c8,0x791a09,0x790b4c,0x78fc92,0x78eddb,0x78df27,
 +        0x78d075,0x78c1c5,0x78b319,0x78a46e,0x7895c7,0x788722,0x78787f,0x7869e0,
 +        0x785b42,0x784ca8,0x783e10,0x782f7a,0x7820e7,0x781257,0x7803c9,0x77f53e,
 +        0x77e6b5,0x77d82f,0x77c9ab,0x77bb2a,0x77acac,0x779e30,0x778fb6,0x77813f,
 +        0x7772cb,0x776459,0x7755ea,0x77477d,0x773913,0x772aab,0x771c46,0x770de3,
 +        0x76ff83,0x76f125,0x76e2ca,0x76d472,0x76c61b,0x76b7c8,0x76a977,0x769b28,
 +        0x768cdc,0x767e92,0x76704b,0x766206,0x7653c4,0x764584,0x763747,0x76290c,
 +        0x761ad3,0x760c9d,0x75fe6a,0x75f039,0x75e20a,0x75d3de,0x75c5b5,0x75b78e,
 +        0x75a969,0x759b46,0x758d27,0x757f09,0x7570ee,0x7562d6,0x7554bf,0x7546ac,
 +        0x75389a,0x752a8c,0x751c7f,0x750e75,0x75006d,0x74f268,0x74e465,0x74d665,
 +        0x74c867,0x74ba6b,0x74ac72,0x749e7b,0x749087,0x748295,0x7474a5,0x7466b8,
 +        0x7458cd,0x744ae4,0x743cfe,0x742f1a,0x742139,0x74135a,0x74057d,0x73f7a3,
 +        0x73e9cb,0x73dbf5,0x73ce22,0x73c051,0x73b282,0x73a4b6,0x7396ec,0x738925,
 +        0x737b60,0x736d9d,0x735fdc,0x73521e,0x734462,0x7336a9,0x7328f1,0x731b3c,
 +        0x730d8a,0x72ffd9,0x72f22c,0x72e480,0x72d6d7,0x72c92f,0x72bb8b,0x72ade8,
 +        0x72a048,0x7292aa,0x72850f,0x727775,0x7269de,0x725c4a,0x724eb7,0x724127,
 +        0x723399,0x72260e,0x721884,0x720afd,0x71fd79,0x71eff6,0x71e276,0x71d4f8,
 +        0x71c77c,0x71ba02,0x71ac8b,0x719f16,0x7191a3,0x718433,0x7176c5,0x716959,
 +        0x715bef,0x714e87,0x714122,0x7133bf,0x71265e,0x711900,0x710ba3,0x70fe49,
 +        0x70f0f1,0x70e39b,0x70d648,0x70c8f6,0x70bba7,0x70ae5a,0x70a110,0x7093c7,
 +        0x708681,0x70793d,0x706bfb,0x705ebb,0x70517d,0x704442,0x703709,0x7029d2,
 +        0x701c9d,0x700f6a,0x70023a,0x6ff50c,0x6fe7e0,0x6fdab6,0x6fcd8e,0x6fc068,
 +        0x6fb345,0x6fa624,0x6f9904,0x6f8be7,0x6f7ecd,0x6f71b4,0x6f649d,0x6f5789,
 +        0x6f4a77,0x6f3d67,0x6f3059,0x6f234d,0x6f1643,0x6f093c,0x6efc36,0x6eef33,
 +        0x6ee232,0x6ed533,0x6ec836,0x6ebb3b,0x6eae42,0x6ea14c,0x6e9457,0x6e8765,
 +        0x6e7a74,0x6e6d86,0x6e609a,0x6e53b0,0x6e46c8,0x6e39e3,0x6e2cff,0x6e201d,
 +        0x6e133e,0x6e0661,0x6df985,0x6decac,0x6ddfd5,0x6dd300,0x6dc62d,0x6db95c,
 +        0x6dac8d,0x6d9fc0,0x6d92f5,0x6d862d,0x6d7966,0x6d6ca2,0x6d5fdf,0x6d531f,
 +        0x6d4660,0x6d39a4,0x6d2cea,0x6d2032,0x6d137c,0x6d06c7,0x6cfa15,0x6ced65,
 +        0x6ce0b7,0x6cd40b,0x6cc761,0x6cbab9,0x6cae14,0x6ca170,0x6c94ce,0x6c882e,
 +        0x6c7b90,0x6c6ef5,0x6c625b,0x6c55c3,0x6c492d,0x6c3c9a,0x6c3008,0x6c2378,
 +        0x6c16ea,0x6c0a5f,0x6bfdd5,0x6bf14d,0x6be4c8,0x6bd844,0x6bcbc2,0x6bbf42,
 +        0x6bb2c5,0x6ba649,0x6b99cf,0x6b8d57,0x6b80e2,0x6b746e,0x6b67fc,0x6b5b8c,
 +        0x6b4f1e,0x6b42b2,0x6b3648,0x6b29e0,0x6b1d7a,0x6b1116,0x6b04b4,0x6af854,
 +        0x6aebf5,0x6adf99,0x6ad33f,0x6ac6e6,0x6aba90,0x6aae3b,0x6aa1e9,0x6a9598,
 +        0x6a8949,0x6a7cfd,0x6a70b2,0x6a6469,0x6a5822,0x6a4bdd,0x6a3f9a,0x6a3359,
 +        0x6a271a,0x6a1adc,0x6a0ea1,0x6a0267,0x69f630,0x69e9fa,0x69ddc6,0x69d195,
 +        0x69c565,0x69b937,0x69ad0b,0x69a0e0,0x6994b8,0x698892,0x697c6d,0x69704a,
 +        0x69642a,0x69580b,0x694bee,0x693fd3,0x6933ba,0x6927a2,0x691b8d,0x690f79,
 +        0x690368,0x68f758,0x68eb4a,0x68df3e,0x68d334,0x68c72b,0x68bb25,0x68af20,
 +        0x68a31d,0x68971d,0x688b1d,0x687f20,0x687325,0x68672c,0x685b34,0x684f3e,
 +        0x68434a,0x683758,0x682b68,0x681f7a,0x68138d,0x6807a2,0x67fbb9,0x67efd2,
 +        0x67e3ed,0x67d80a,0x67cc28,0x67c048,0x67b46a,0x67a88e,0x679cb4,0x6790dc,
 +        0x678505,0x677930,0x676d5d,0x67618c,0x6755bd,0x6749ef,0x673e23,0x673259,
 +        0x672691,0x671acb,0x670f06,0x670343,0x66f782,0x66ebc3,0x66e006,0x66d44a,
 +        0x66c891,0x66bcd8,0x66b122,0x66a56e,0x6699bb,0x668e0a,0x66825b,0x6676ae,
 +        0x666b02,0x665f58,0x6653b0,0x66480a,0x663c66,0x6630c3,0x662522,0x661983,
 +        0x660de5,0x66024a,0x65f6b0,0x65eb17,0x65df81,0x65d3ec,0x65c859,0x65bcc8,
 +        0x65b139,0x65a5ab,0x659a1f,0x658e95,0x65830d,0x657786,0x656c01,0x65607e,
 +        0x6554fc,0x65497c,0x653dfe,0x653282,0x652707,0x651b8e,0x651017,0x6504a2,
 +        0x64f92e,0x64edbc,0x64e24c,0x64d6dd,0x64cb70,0x64c005,0x64b49c,0x64a934,
 +        0x649dce,0x64926a,0x648707,0x647ba6,0x647047,0x6464ea,0x64598e,0x644e34,
 +        0x6442db,0x643784,0x642c2f,0x6420dc,0x64158a,0x640a3a,0x63feec,0x63f39f,
 +        0x63e854,0x63dd0b,0x63d1c3,0x63c67d,0x63bb39,0x63aff7,0x63a4b6,0x639976,
 +        0x638e39,0x6382fd,0x6377c3,0x636c8a,0x636153,0x63561e,0x634aea,0x633fb8,
 +        0x633488,0x632959,0x631e2c,0x631301,0x6307d7,0x62fcaf,0x62f189,0x62e664,
 +        0x62db41,0x62d01f,0x62c500,0x62b9e1,0x62aec5,0x62a3aa,0x629890,0x628d79,
 +        0x628263,0x62774e,0x626c3b,0x62612a,0x62561b,0x624b0d,0x624000,0x6234f6,
 +        0x6229ed,0x621ee5,0x6213df,0x6208db,0x61fdd8,0x61f2d7,0x61e7d8,0x61dcda,
 +        0x61d1de,0x61c6e3,0x61bbea,0x61b0f3,0x61a5fd,0x619b09,0x619016,0x618525,
 +        0x617a36,0x616f48,0x61645b,0x615971,0x614e88,0x6143a0,0x6138ba,0x612dd6,
 +        0x6122f3,0x611812,0x610d32,0x610254,0x60f778,0x60ec9d,0x60e1c4,0x60d6ec,
 +        0x60cc16,0x60c141,0x60b66e,0x60ab9c,0x60a0cc,0x6095fe,0x608b31,0x608066,
 +        0x60759c,0x606ad4,0x60600e,0x605549,0x604a85,0x603fc3,0x603503,0x602a44,
 +        0x601f87,0x6014cb,0x600a11,0x5fff58,0x5ff4a1,0x5fe9eb,0x5fdf37,0x5fd485,
 +        0x5fc9d4,0x5fbf24,0x5fb476,0x5fa9ca,0x5f9f1f,0x5f9476,0x5f89ce,0x5f7f28,
 +        0x5f7483,0x5f69df,0x5f5f3e,0x5f549d,0x5f49ff,0x5f3f62,0x5f34c6,0x5f2a2c,
 +        0x5f1f93,0x5f14fc,0x5f0a66,0x5effd2,0x5ef53f,0x5eeaae,0x5ee01f,0x5ed591,
 +        0x5ecb04,0x5ec079,0x5eb5ef,0x5eab67,0x5ea0e0,0x5e965b,0x5e8bd8,0x5e8155,
 +        0x5e76d5,0x5e6c55,0x5e61d8,0x5e575c,0x5e4ce1,0x5e4268,0x5e37f0,0x5e2d79,
 +        0x5e2305,0x5e1891,0x5e0e1f,0x5e03af,0x5df940,0x5deed3,0x5de467,0x5dd9fc,
 +        0x5dcf93,0x5dc52b,0x5dbac5,0x5db061,0x5da5fd,0x5d9b9c,0x5d913b,0x5d86dc,
 +        0x5d7c7f,0x5d7223,0x5d67c9,0x5d5d70,0x5d5318,0x5d48c2,0x5d3e6d,0x5d341a,
 +        0x5d29c8,0x5d1f78,0x5d1529,0x5d0adc,0x5d0090,0x5cf645,0x5cebfc,0x5ce1b4,
 +        0x5cd76e,0x5ccd29,0x5cc2e6,0x5cb8a4,0x5cae63,0x5ca424,0x5c99e6,0x5c8faa,
 +        0x5c856f,0x5c7b36,0x5c70fe,0x5c66c7,0x5c5c92,0x5c525e,0x5c482c,0x5c3dfb,
 +        0x5c33cc,0x5c299d,0x5c1f71,0x5c1546,0x5c0b1c,0x5c00f3,0x5bf6cc,0x5beca7,
 +        0x5be282,0x5bd85f,0x5bce3e,0x5bc41e,0x5bb9ff,0x5bafe2,0x5ba5c6,0x5b9bac,
 +        0x5b9193,0x5b877b,0x5b7d65,0x5b7350,0x5b693d,0x5b5f2a,0x5b551a,0x5b4b0a,
 +        0x5b40fd,0x5b36f0,0x5b2ce5,0x5b22db,0x5b18d3,0x5b0ecc,0x5b04c6,0x5afac2,
 +        0x5af0bf,0x5ae6bd,0x5adcbd,0x5ad2be,0x5ac8c1,0x5abec5,0x5ab4ca,0x5aaad1,
 +        0x5aa0d9,0x5a96e2,0x5a8ced,0x5a82f9,0x5a7906,0x5a6f15,0x5a6525,0x5a5b37,
 +        0x5a514a,0x5a475e,0x5a3d74,0x5a338b,0x5a29a3,0x5a1fbd,0x5a15d8,0x5a0bf4,
 +        0x5a0212,0x59f831,0x59ee51,0x59e473,0x59da96,0x59d0ba,0x59c6e0,0x59bd07,
 +        0x59b330,0x59a959,0x599f84,0x5995b1,0x598bde,0x59820e,0x59783e,0x596e70,
 +        0x5964a3,0x595ad7,0x59510d,0x594744,0x593d7c,0x5933b6,0x5929f1,0x59202d,
 +        0x59166b,0x590caa,0x5902ea,0x58f92b,0x58ef6e,0x58e5b3,0x58dbf8,0x58d23f,
 +        0x58c887,0x58bed0,0x58b51b,0x58ab67,0x58a1b4,0x589803,0x588e53,0x5884a4,
 +        0x587af7,0x58714b,0x5867a0,0x585df6,0x58544e,0x584aa7,0x584101,0x58375d,
 +        0x582dba,0x582418,0x581a77,0x5810d8,0x58073a,0x57fd9d,0x57f402,0x57ea68,
 +        0x57e0cf,0x57d737,0x57cda1,0x57c40c,0x57ba78,0x57b0e6,0x57a754,0x579dc5,
 +        0x579436,0x578aa9,0x57811c,0x577792,0x576e08,0x576480,0x575af9,0x575173,
 +        0x5747ee,0x573e6b,0x5734e9,0x572b68,0x5721e9,0x57186b,0x570eee,0x570572,
 +        0x56fbf8,0x56f27e,0x56e906,0x56df90,0x56d61a,0x56cca6,0x56c333,0x56b9c1,
 +        0x56b051,0x56a6e2,0x569d74,0x569407,0x568a9b,0x568131,0x5677c8,0x566e60,
 +        0x5664fa,0x565b95,0x565231,0x5648ce,0x563f6c,0x56360c,0x562cad,0x56234f,
 +        0x5619f2,0x561097,0x56073c,0x55fde3,0x55f48c,0x55eb35,0x55e1e0,0x55d88c,
 +        0x55cf39,0x55c5e7,0x55bc97,0x55b347,0x55a9f9,0x55a0ad,0x559761,0x558e17,
 +        0x5584cd,0x557b86,0x55723f,0x5568f9,0x555fb5,0x555672,0x554d30,0x5543ef,
 +        0x553ab0,0x553171,0x552834,0x551ef8,0x5515be,0x550c84,0x55034c,0x54fa15,
 +        0x54f0df,0x54e7aa,0x54de77,0x54d544,0x54cc13,0x54c2e3,0x54b9b4,0x54b087,
 +        0x54a75a,0x549e2f,0x549505,0x548bdc,0x5482b5,0x54798e,0x547069,0x546745,
 +        0x545e22,0x545500,0x544be0,0x5442c0,0x5439a2,0x543085,0x542769,0x541e4f,
 +        0x541535,0x540c1d,0x540306,0x53f9f0,0x53f0db,0x53e7c7,0x53deb5,0x53d5a3,
 +        0x53cc93,0x53c384,0x53ba76,0x53b169,0x53a85e,0x539f54,0x53964a,0x538d42,
 +        0x53843b,0x537b36,0x537231,0x53692e,0x53602b,0x53572a,0x534e2a,0x53452b,
 +        0x533c2e,0x533331,0x532a36,0x53213b,0x531842,0x530f4a,0x530654,0x52fd5e,
 +        0x52f469,0x52eb76,0x52e284,0x52d993,0x52d0a3,0x52c7b4,0x52bec6,0x52b5d9,
 +        0x52acee,0x52a404,0x529b1b,0x529233,0x52894c,0x528066,0x527781,0x526e9e,
 +        0x5265bb,0x525cda,0x5253fa,0x524b1b,0x52423d,0x523960,0x523084,0x5227aa,
 +        0x521ed0,0x5215f8,0x520d21,0x52044b,0x51fb76,0x51f2a2,0x51e9cf,0x51e0fe,
 +        0x51d82d,0x51cf5e,0x51c68f,0x51bdc2,0x51b4f6,0x51ac2b,0x51a361,0x519a98,
 +        0x5191d1,0x51890a,0x518045,0x517780,0x516ebd,0x5165fb,0x515d3a,0x51547a,
 +        0x514bbb,0x5142fd,0x513a41,0x513185,0x5128cb,0x512011,0x511759,0x510ea2,
 +        0x5105ec,0x50fd36,0x50f483,0x50ebd0,0x50e31e,0x50da6d,0x50d1be,0x50c90f,
 +        0x50c062,0x50b7b5,0x50af0a,0x50a660,0x509db7,0x50950f,0x508c68,0x5083c2,
 +        0x507b1d,0x507279,0x5069d7,0x506135,0x505894,0x504ff5,0x504757,0x503eb9,
 +        0x50361d,0x502d82,0x5024e8,0x501c4f,0x5013b7,0x500b20,0x50028a,0x4ff9f5,
 +        0x4ff162,0x4fe8cf,0x4fe03d,0x4fd7ad,0x4fcf1d,0x4fc68f,0x4fbe01,0x4fb575,
 +        0x4facea,0x4fa460,0x4f9bd7,0x4f934e,0x4f8ac7,0x4f8241,0x4f79bc,0x4f7139,
 +        0x4f68b6,0x4f6034,0x4f57b3,0x4f4f33,0x4f46b5,0x4f3e37,0x4f35bb,0x4f2d3f,
 +        0x4f24c5,0x4f1c4b,0x4f13d3,0x4f0b5b,0x4f02e5,0x4efa70,0x4ef1fb,0x4ee988,
 +        0x4ee116,0x4ed8a5,0x4ed035,0x4ec7c6,0x4ebf58,0x4eb6ea,0x4eae7e,0x4ea613,
 +        0x4e9daa,0x4e9541,0x4e8cd9,0x4e8472,0x4e7c0c,0x4e73a7,0x4e6b43,0x4e62e1,
 +        0x4e5a7f,0x4e521e,0x4e49be,0x4e4160,0x4e3902,0x4e30a5,0x4e284a,0x4e1fef,
 +        0x4e1796,0x4e0f3d,0x4e06e5,0x4dfe8f,0x4df639,0x4dede5,0x4de591,0x4ddd3f,
 +        0x4dd4ed,0x4dcc9d,0x4dc44d,0x4dbbff,0x4db3b1,0x4dab65,0x4da319,0x4d9acf,
 +        0x4d9285,0x4d8a3d,0x4d81f5,0x4d79af,0x4d7169,0x4d6925,0x4d60e2,0x4d589f,
 +        0x4d505e,0x4d481d,0x4d3fde,0x4d379f,0x4d2f62,0x4d2725,0x4d1eea,0x4d16af,
 +        0x4d0e76,0x4d063d,0x4cfe05,0x4cf5cf,0x4ced99,0x4ce565,0x4cdd31,0x4cd4fe,
 +        0x4ccccd,0x4cc49c,0x4cbc6c,0x4cb43e,0x4cac10,0x4ca3e3,0x4c9bb8,0x4c938d,
 +        0x4c8b63,0x4c833a,0x4c7b12,0x4c72eb,0x4c6ac6,0x4c62a1,0x4c5a7d,0x4c525a,
 +        0x4c4a38,0x4c4217,0x4c39f7,0x4c31d7,0x4c29b9,0x4c219c,0x4c1980,0x4c1165,
 +        0x4c094b,0x4c0131,0x4bf919,0x4bf102,0x4be8eb,0x4be0d6,0x4bd8c1,0x4bd0ae,
 +        0x4bc89b,0x4bc089,0x4bb879,0x4bb069,0x4ba85a,0x4ba04d,0x4b9840,0x4b9034,
 +        0x4b8829,0x4b801f,0x4b7816,0x4b700e,0x4b6807,0x4b6001,0x4b57fc,0x4b4ff7,
 +        0x4b47f4,0x4b3ff2,0x4b37f0,0x4b2ff0,0x4b27f0,0x4b1ff2,0x4b17f4,0x4b0ff7,
 +        0x4b07fc,0x4b0001,0x4af807,0x4af00e,0x4ae816,0x4ae01f,0x4ad829,0x4ad034,
 +        0x4ac83f,0x4ac04c,0x4ab85a,0x4ab068,0x4aa878,0x4aa088,0x4a989a,0x4a90ac,
 +        0x4a88bf,0x4a80d3,0x4a78e8,0x4a70fe,0x4a6915,0x4a612d,0x4a5946,0x4a5160,
 +        0x4a497a,0x4a4196,0x4a39b2,0x4a31d0,0x4a29ee,0x4a220d,0x4a1a2d,0x4a124f,
 +        0x4a0a71,0x4a0294,0x49fab7,0x49f2dc,0x49eb02,0x49e328,0x49db50,0x49d378,
 +        0x49cba2,0x49c3cc,0x49bbf7,0x49b423,0x49ac50,0x49a47e,0x499cad,0x4994dd,
 +        0x498d0d,0x49853f,0x497d71,0x4975a5,0x496dd9,0x49660e,0x495e44,0x49567b,
 +        0x494eb3,0x4946ec,0x493f25,0x493760,0x492f9b,0x4927d8,0x492015,0x491853,
 +        0x491092,0x4908d2,0x490113,0x48f955,0x48f198,0x48e9db,0x48e21f,0x48da65,
 +        0x48d2ab,0x48caf2,0x48c33a,0x48bb83,0x48b3cd,0x48ac18,0x48a463,0x489cb0,
 +        0x4894fd,0x488d4b,0x48859a,0x487dea,0x48763b,0x486e8d,0x4866df,0x485f33,
 +        0x485787,0x484fdd,0x484833,0x48408a,0x4838e2,0x48313b,0x482994,0x4821ef,
 +        0x481a4a,0x4812a6,0x480b04,0x480362,0x47fbc1,0x47f420,0x47ec81,0x47e4e3,
 +        0x47dd45,0x47d5a8,0x47ce0c,0x47c672,0x47bed7,0x47b73e,0x47afa6,0x47a80e,
 +        0x47a078,0x4798e2,0x47914d,0x4789b9,0x478226,0x477a93,0x477302,0x476b71,
 +        0x4763e2,0x475c53,0x4754c5,0x474d37,0x4745ab,0x473e20,0x473695,0x472f0b,
 +        0x472783,0x471ffa,0x471873,0x4710ed,0x470968,0x4701e3,0x46fa5f,0x46f2dc,
 +        0x46eb5a,0x46e3d9,0x46dc59,0x46d4d9,0x46cd5a,0x46c5dd,0x46be60,0x46b6e4,
 +        0x46af68,0x46a7ee,0x46a074,0x4698fb,0x469184,0x468a0c,0x468296,0x467b21,
 +        0x4673ac,0x466c39,0x4664c6,0x465d54,0x4655e3,0x464e72,0x464703,0x463f94,
 +        0x463826,0x4630b9,0x46294d,0x4621e2,0x461a77,0x46130e,0x460ba5,0x46043d,
 +        0x45fcd6,0x45f56f,0x45ee0a,0x45e6a5,0x45df41,0x45d7de,0x45d07c,0x45c91a,
 +        0x45c1ba,0x45ba5a,0x45b2fb,0x45ab9d,0x45a440,0x459ce4,0x459588,0x458e2d,
 +        0x4586d3,0x457f7a,0x457822,0x4570ca,0x456974,0x45621e,0x455ac9,0x455374,
 +        0x454c21,0x4544ce,0x453d7d,0x45362c,0x452edb,0x45278c,0x45203e,0x4518f0,
 +        0x4511a3,0x450a57,0x45030c,0x44fbc1,0x44f477,0x44ed2e,0x44e5e6,0x44de9f,
 +        0x44d759,0x44d013,0x44c8ce,0x44c18a,0x44ba47,0x44b305,0x44abc3,0x44a482,
 +        0x449d42,0x449603,0x448ec5,0x448787,0x44804a,0x44790e,0x4471d3,0x446a99,
 +        0x44635f,0x445c26,0x4454ee,0x444db7,0x444681,0x443f4b,0x443816,0x4430e2,
 +        0x4429af,0x44227c,0x441b4b,0x44141a,0x440cea,0x4405ba,0x43fe8c,0x43f75e,
 +        0x43f031,0x43e905,0x43e1da,0x43daaf,0x43d385,0x43cc5c,0x43c534,0x43be0d,
 +        0x43b6e6,0x43afc0,0x43a89b,0x43a177,0x439a54,0x439331,0x438c0f,0x4384ee,
 +        0x437dcd,0x4376ae,0x436f8f,0x436871,0x436154,0x435a37,0x43531b,0x434c00,
 +        0x4344e6,0x433dcd,0x4336b4,0x432f9c,0x432885,0x43216f,0x431a5a,0x431345,
 +        0x430c31,0x43051e,0x42fe0b,0x42f6f9,0x42efe9,0x42e8d8,0x42e1c9,0x42daba,
 +        0x42d3ad,0x42cca0,0x42c593,0x42be88,0x42b77d,0x42b073,0x42a96a,0x42a261,
 +        0x429b59,0x429452,0x428d4c,0x428647,0x427f42,0x42783e,0x42713b,0x426a39,
 +        0x426337,0x425c36,0x425536,0x424e37,0x424738,0x42403a,0x42393d,0x423241,
 +        0x422b45,0x42244a,0x421d50,0x421657,0x420f5e,0x420866,0x42016f,0x41fa79,
 +        0x41f383,0x41ec8e,0x41e59a,0x41dea7,0x41d7b4,0x41d0c2,0x41c9d1,0x41c2e1,
 +        0x41bbf1,0x41b503,0x41ae14,0x41a727,0x41a03a,0x41994e,0x419263,0x418b79,
 +        0x41848f,0x417da6,0x4176be,0x416fd7,0x4168f0,0x41620a,0x415b25,0x415440,
 +        0x414d5c,0x414679,0x413f97,0x4138b6,0x4131d5,0x412af5,0x412415,0x411d37,
 +        0x411659,0x410f7c,0x41089f,0x4101c3,0x40fae9,0x40f40e,0x40ed35,0x40e65c,
 +        0x40df84,0x40d8ad,0x40d1d6,0x40cb00,0x40c42b,0x40bd57,0x40b683,0x40afb0,
 +        0x40a8de,0x40a20c,0x409b3b,0x40946b,0x408d9c,0x4086cd,0x408000,0x407932,
 +        0x407266,0x406b9a,0x4064cf,0x405e05,0x40573b,0x405072,0x4049aa,0x4042e3,
 +        0x403c1c,0x403556,0x402e91,0x4027cc,0x402109,0x401a45,0x401383,0x400cc1,
 +        0x400600,0x3fff40,0x3ff880,0x3ff1c2,0x3feb03,0x3fe446,0x3fdd89,0x3fd6cd,
 +        0x3fd012,0x3fc957,0x3fc29d,0x3fbbe4,0x3fb52c,0x3fae74,0x3fa7bd,0x3fa107,
 +        0x3f9a51,0x3f939c,0x3f8ce8,0x3f8634,0x3f7f81,0x3f78cf,0x3f721e,0x3f6b6d,
 +        0x3f64bd,0x3f5e0e,0x3f575f,0x3f50b1,0x3f4a04,0x3f4357,0x3f3cac,0x3f3601,
 +        0x3f2f56,0x3f28ac,0x3f2203,0x3f1b5b,0x3f14b3,0x3f0e0c,0x3f0766,0x3f00c1,
 +        0x3efa1c,0x3ef377,0x3eecd4,0x3ee631,0x3edf8f,0x3ed8ee,0x3ed24d,0x3ecbad,
 +        0x3ec50e,0x3ebe6f,0x3eb7d1,0x3eb134,0x3eaa97,0x3ea3fb,0x3e9d60,0x3e96c6,
 +        0x3e902c,0x3e8993,0x3e82fa,0x3e7c62,0x3e75cb,0x3e6f35,0x3e689f,0x3e620a,
 +        0x3e5b76,0x3e54e2,0x3e4e4f,0x3e47bd,0x3e412b,0x3e3a9a,0x3e340a,0x3e2d7a,
 +        0x3e26eb,0x3e205d,0x3e19cf,0x3e1342,0x3e0cb6,0x3e062b,0x3dffa0,0x3df916,
 +        0x3df28c,0x3dec03,0x3de57b,0x3ddef4,0x3dd86d,0x3dd1e7,0x3dcb61,0x3dc4dc,
 +        0x3dbe58,0x3db7d5,0x3db152,0x3daad0,0x3da44f,0x3d9dce,0x3d974e,0x3d90ce,
 +        0x3d8a4f,0x3d83d1,0x3d7d54,0x3d76d7,0x3d705b,0x3d69e0,0x3d6365,0x3d5ceb,
 +        0x3d5671,0x3d4ff9,0x3d4980,0x3d4309,0x3d3c92,0x3d361c,0x3d2fa7,0x3d2932,
 +        0x3d22be,0x3d1c4a,0x3d15d7,0x3d0f65,0x3d08f4,0x3d0283,0x3cfc13,0x3cf5a3,
 +        0x3cef34,0x3ce8c6,0x3ce259,0x3cdbec,0x3cd57f,0x3ccf14,0x3cc8a9,0x3cc23f,
 +        0x3cbbd5,0x3cb56c,0x3caf04,0x3ca89c,0x3ca235,0x3c9bcf,0x3c9569,0x3c8f04,
 +        0x3c889f,0x3c823c,0x3c7bd8,0x3c7576,0x3c6f14,0x3c68b3,0x3c6253,0x3c5bf3,
 +        0x3c5593,0x3c4f35,0x3c48d7,0x3c427a,0x3c3c1d,0x3c35c1,0x3c2f66,0x3c290b,
 +        0x3c22b1,0x3c1c57,0x3c15ff,0x3c0fa7,0x3c094f,0x3c02f8,0x3bfca2,0x3bf64c,
 +        0x3beff7,0x3be9a3,0x3be34f,0x3bdcfc,0x3bd6aa,0x3bd058,0x3bca07,0x3bc3b7,
 +        0x3bbd67,0x3bb718,0x3bb0c9,0x3baa7b,0x3ba42e,0x3b9de1,0x3b9795,0x3b914a,
 +        0x3b8aff,0x3b84b5,0x3b7e6c,0x3b7823,0x3b71db,0x3b6b93,0x3b654c,0x3b5f06,
 +        0x3b58c0,0x3b527b,0x3b4c36,0x3b45f3,0x3b3faf,0x3b396d,0x3b332b,0x3b2cea,
 +        0x3b26a9,0x3b2069,0x3b1a2a,0x3b13eb,0x3b0dad,0x3b076f,0x3b0132,0x3afaf6,
 +        0x3af4ba,0x3aee7f,0x3ae845,0x3ae20b,0x3adbd2,0x3ad599,0x3acf61,0x3ac92a,
 +        0x3ac2f3,0x3abcbd,0x3ab688,0x3ab053,0x3aaa1f,0x3aa3eb,0x3a9db8,0x3a9786,
 +        0x3a9154,0x3a8b23,0x3a84f2,0x3a7ec2,0x3a7893,0x3a7264,0x3a6c36,0x3a6609,
 +        0x3a5fdc,0x3a59b0,0x3a5384,0x3a4d59,0x3a472f,0x3a4105,0x3a3adc,0x3a34b4,
 +        0x3a2e8c,0x3a2864,0x3a223e,0x3a1c18,0x3a15f2,0x3a0fcd,0x3a09a9,0x3a0385,
 +        0x39fd62,0x39f740,0x39f11e,0x39eafd,0x39e4dc,0x39debc,0x39d89d,0x39d27e,
 +        0x39cc60,0x39c642,0x39c025,0x39ba09,0x39b3ed,0x39add2,0x39a7b7,0x39a19d,
 +        0x399b84,0x39956b,0x398f53,0x39893b,0x398324,0x397d0e,0x3976f8,0x3970e3,
 +        0x396ace,0x3964ba,0x395ea7,0x395894,0x395282,0x394c70,0x39465f,0x39404f,
 +        0x393a3f,0x393430,0x392e21,0x392813,0x392206,0x391bf9,0x3915ed,0x390fe1,
 +        0x3909d6,0x3903cb,0x38fdc1,0x38f7b8,0x38f1af,0x38eba7,0x38e5a0,0x38df99,
 +        0x38d993,0x38d38d,0x38cd88,0x38c783,0x38c17f,0x38bb7c,0x38b579,0x38af77,
 +        0x38a975,0x38a374,0x389d73,0x389774,0x389174,0x388b76,0x388577,0x387f7a,
 +        0x38797d,0x387381,0x386d85,0x38678a,0x38618f,0x385b95,0x38559b,0x384fa2,
 +        0x3849aa,0x3843b2,0x383dbb,0x3837c5,0x3831cf,0x382bd9,0x3825e4,0x381ff0,
 +        0x3819fd,0x381409,0x380e17,0x380825,0x380234,0x37fc43,0x37f653,0x37f063,
 +        0x37ea74,0x37e485,0x37de97,0x37d8aa,0x37d2bd,0x37ccd1,0x37c6e5,0x37c0fa,
 +        0x37bb10,0x37b526,0x37af3d,0x37a954,0x37a36c,0x379d84,0x37979d,0x3791b6,
 +        0x378bd0,0x3785eb,0x378006,0x377a22,0x37743e,0x376e5b,0x376879,0x376297,
 +        0x375cb5,0x3756d5,0x3750f4,0x374b15,0x374535,0x373f57,0x373979,0x37339b,
 +        0x372dbf,0x3727e2,0x372206,0x371c2b,0x371651,0x371077,0x370a9d,0x3704c4,
 +        0x36feec,0x36f914,0x36f33d,0x36ed66,0x36e790,0x36e1ba,0x36dbe5,0x36d611,
 +        0x36d03d,0x36ca69,0x36c497,0x36bec4,0x36b8f3,0x36b321,0x36ad51,0x36a781,
 +        0x36a1b1,0x369be2,0x369614,0x369046,0x368a79,0x3684ac,0x367ee0,0x367915,
 +        0x36734a,0x366d7f,0x3667b5,0x3661ec,0x365c23,0x36565b,0x365093,0x364acc,
 +        0x364505,0x363f3f,0x363979,0x3633b4,0x362df0,0x36282c,0x362269,0x361ca6,
 +        0x3616e4,0x361122,0x360b61,0x3605a0,0x35ffe0,0x35fa20,0x35f461,0x35eea3,
 +        0x35e8e5,0x35e328,0x35dd6b,0x35d7af,0x35d1f3,0x35cc38,0x35c67d,0x35c0c3,
 +        0x35bb09,0x35b550,0x35af98,0x35a9e0,0x35a429,0x359e72,0x3598bb,0x359306,
 +        0x358d50,0x35879c,0x3581e8,0x357c34,0x357681,0x3570ce,0x356b1c,0x35656b,
 +        0x355fba,0x355a09,0x355459,0x354eaa,0x3548fb,0x35434d,0x353d9f,0x3537f2,
 +        0x353245,0x352c99,0x3526ee,0x352143,0x351b98,0x3515ee,0x351045,0x350a9c  
 +    }
 +};
 +
 +
 +
 +/* Shortcuts so we don't have to bother with the structure in C */
 +
 +/* Pointer to exponential table */
 +const unsigned int *
 +gmx_invsqrt_exptab   = F77_FUNC(gmxinvsqrtdata,GMXINVSQRTDATA).exptab;
 +
 +/* Pointer to fraction table */
 +const unsigned int *
 +gmx_invsqrt_fracttab = F77_FUNC(gmxinvsqrtdata,GMXINVSQRTDATA).fracttab;
 +
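
The tables above seed a table-driven software 1/sqrt(x). Below is a minimal
sketch, assuming the usual indexing for such tables (the 8 exponent bits
select an exptab entry; the exponent's lowest bit together with the top 11
mantissa bits, i.e. bits 12-23, selects a fracttab entry), with one
Newton-Raphson step refining the seed. The helper is illustrative, not the
actual GROMACS kernel:

    typedef union { unsigned int bits; float f; } t_convert;

    /* Look up a 1/sqrt(x) seed from the exponent/fraction tables and
     * refine it with one Newton-Raphson iteration. Assumes x > 0. */
    static float lookup_invsqrt(float x,
                                const unsigned int exptab[256],
                                const unsigned int fracttab[4096])
    {
        t_convert in, out;
        float     lu;

        in.f     = x;
        out.bits = exptab[(in.bits >> 23) & 0xff] |
                   fracttab[(in.bits >> 12) & 0xfff];
        lu       = out.f;
        return 0.5f*lu*(3.0f - x*lu*lu); /* y = y*(1.5 - 0.5*x*y*y) */
    }

gmx_invsqrt_exptab and gmx_invsqrt_fracttab above are the pointers such a
helper would receive.
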
index dc7bd69ca1a925af381b347adce9055893d488e8,0000000000000000000000000000000000000000..3534d996ba150d454aaf46289176f0a3efee2cc5
mode 100644,000000..100644
--- /dev/null
@@@ -1,604 -1,0 +1,619 @@@
-                  t_fcdata *fcd,t_state *state)
 +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
 + *
 + * 
 + *                This source code is part of
 + * 
 + *                 G   R   O   M   A   C   S
 + * 
 + *          GROningen MAchine for Chemical Simulations
 + * 
 + *                        VERSION 3.2.0
 + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
 + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 + * Copyright (c) 2001-2004, The GROMACS development team,
 + * check out http://www.gromacs.org for more information.
 +
 + * This program is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU General Public License
 + * as published by the Free Software Foundation; either version 2
 + * of the License, or (at your option) any later version.
 + * 
 + * If you want to redistribute modifications, please consider that
 + * scientific software is very special. Version control is crucial -
 + * bugs must be traceable. We will be happy to consider code for
 + * inclusion in the official distribution, but derived work must not
 + * be called official GROMACS. Details are found in the README & COPYING
 + * files - if they are missing, get the official version at www.gromacs.org.
 + * 
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the papers on the package - you can find them in the top README file.
 + * 
 + * For more info, check our website at http://www.gromacs.org
 + * 
 + * And Hey:
 + * GROningen Mixture of Alchemy and Childrens' Stories
 + */
 +/* This file is completely threadsafe - keep it that way! */
 +#ifdef HAVE_CONFIG_H
 +#include <config.h>
 +#endif
 +
 +#include <math.h>
 +#include "typedefs.h"
 +#include "sysstuff.h"
 +#include "smalloc.h"
 +#include "macros.h"
 +#include "vec.h"
 +#include "futil.h"
 +#include "xvgr.h"
 +#include "gmx_fatal.h"
 +#include "bondf.h"
 +#include "copyrite.h"
 +#include "disre.h"
 +#include "main.h"
 +#include "mtop_util.h"
 +
 +void init_disres(FILE *fplog,const gmx_mtop_t *mtop,
 +                 t_inputrec *ir,const t_commrec *cr,gmx_bool bPartDecomp,
-     if (cr && cr->ms != NULL && ptr != NULL)
++                 t_fcdata *fcd,t_state *state, gmx_bool bIsREMD)
 +{
 +    int          fa,nmol,i,npair,np;
 +    t_iparams    *ip;
 +    t_disresdata *dd;
 +    history_t    *hist;
 +    gmx_mtop_ilistloop_t iloop;
 +    t_ilist      *il;
 +    char         *ptr;
 +    
 +    dd = &(fcd->disres);
 +
 +    if (gmx_mtop_ftype_count(mtop,F_DISRES) == 0)
 +    {
 +        dd->nres = 0;
 +
 +        return;
 +    }
 +    
 +    if (fplog)
 +    {
 +        fprintf(fplog,"Initializing the distance restraints\n");
 +    }
 +    
 +
 +    if (ir->eDisre == edrEnsemble)
 +    {
 +        gmx_fatal(FARGS,"Sorry, distance restraints with ensemble averaging over multiple molecules in one system are not functional in this version of GROMACS");
 +    }
 +
 +    dd->dr_weighting = ir->eDisreWeighting;
 +    dd->dr_fc        = ir->dr_fc;
 +    if (EI_DYNAMICS(ir->eI))
 +    {
 +        dd->dr_tau   = ir->dr_tau;
 +    }
 +    else
 +    {
 +        dd->dr_tau   = 0.0;
 +    }
 +    if (dd->dr_tau == 0.0)
 +    {
 +        dd->dr_bMixed = FALSE;
 +        dd->ETerm = 0.0;
 +    }
 +    else
 +    {
 +        dd->dr_bMixed = ir->bDisreMixed;
 +        dd->ETerm = exp(-(ir->delta_t/ir->dr_tau));
 +    }
 +    dd->ETerm1        = 1.0 - dd->ETerm;
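 +    /* Sketch of the exponential running average these factors implement
 +     * for the time-averaged r^-3 data (Delta_t is the MD time step):
 +     *   <r^-3>(t) = ETerm1 * r^-3(t) + ETerm * <r^-3>(t - Delta_t)
 +     * so ETerm = exp(-Delta_t/tau) is the memory factor and ETerm1 the
 +     * weight of the current step; the "history lack" handling below
 +     * additionally corrects for missing history early in a run. */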
 +    
 +    ip = mtop->ffparams.iparams;
 +
 +    dd->nres  = 0;
 +    dd->npair = 0;
 +    iloop = gmx_mtop_ilistloop_init(mtop);
 +    while (gmx_mtop_ilistloop_next(iloop,&il,&nmol))
 +    {
 +        np = 0;
 +        for(fa=0; fa<il[F_DISRES].nr; fa+=3)
 +        {
 +            np++;
 +            npair = mtop->ffparams.iparams[il[F_DISRES].iatoms[fa]].disres.npair;
 +            if (np == npair)
 +            {
 +                dd->nres  += (ir->eDisre==edrEnsemble ? 1 : nmol)*npair;
 +                dd->npair += nmol*npair;
 +                np = 0;
 +            }
 +        }
 +    }
 +
 +    if (cr && PAR(cr) && !bPartDecomp)
 +    {
 +        /* Temporary check, will be removed when disre is implemented with DD */
 +        const char *notestr="NOTE: atoms involved in distance restraints should be within the longest cut-off distance; if this is not the case, mdrun generates a fatal error, in which case use particle decomposition (mdrun option -pd)";
 +        
 +        if (MASTER(cr))
 +            fprintf(stderr,"\n%s\n\n",notestr);
 +        if (fplog)
 +            fprintf(fplog,"%s\n",notestr);
 +
 +        if (dd->dr_tau != 0 || ir->eDisre == edrEnsemble || cr->ms != NULL ||
 +            dd->nres != dd->npair)
 +        {
 +            gmx_fatal(FARGS,"Time or ensemble averaged or multiple pair distance restraints do not work (yet) with domain decomposition, use particle decomposition (mdrun option -pd)");
 +        }
 +        if (ir->nstdisreout != 0)
 +        {
 +            if (fplog)
 +            {
 +                fprintf(fplog,"\nWARNING: Cannot write distance restraint data to the energy file with domain decomposition\n\n");
 +            }
 +            if (MASTER(cr))
 +            {
 +                fprintf(stderr,"\nWARNING: Cannot write distance restraint data to the energy file with domain decomposition\n");
 +            }
 +            ir->nstdisreout = 0;
 +        }
 +    }
 +
 +    snew(dd->rt,dd->npair);
 +    
 +    if (dd->dr_tau != 0.0)
 +    {
 +        hist = &state->hist;
 +        /* Set the "history lack" factor to 1 */
 +        state->flags |= (1<<estDISRE_INITF);
 +        hist->disre_initf = 1.0;
 +        /* Allocate space for the r^-3 time averages */
 +        state->flags |= (1<<estDISRE_RM3TAV);
 +        hist->ndisrepairs = dd->npair;
 +        snew(hist->disre_rm3tav,hist->ndisrepairs);
 +    }
 +    /* Allocate space for a copy of rm3tav,
 +     * so we can call do_force without modifying the state.
 +     */
 +    snew(dd->rm3tav,dd->npair);
 +
 +    /* Allocate Rt_6 and Rtav_6 consecutively in memory so they can be
 +     * averaged over the processors in one call (in calc_disre_R_6)
 +     */
 +    snew(dd->Rt_6,2*dd->nres);
 +    dd->Rtav_6 = &(dd->Rt_6[dd->nres]);
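 +    /* Hypothetical illustration of the single call this layout allows:
 +     * summing 2*nres reals starting at Rt_6 covers Rtav_6 as well, e.g.
 +     *   gmx_sum_sim(2*dd->nres, dd->Rt_6, ms);
 +     * rather than one communication call per array. */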
 +
 +    ptr = getenv("GMX_DISRE_ENSEMBLE_SIZE");
-         check_multi_int(fplog,cr->ms,dd->nsystems,
-                         "the number of systems per ensemble");
++    if (cr && cr->ms != NULL && ptr != NULL && !bIsREMD)
 +    {
 +#ifdef GMX_MPI
 +        dd->nsystems = 0;
 +        sscanf(ptr,"%d",&dd->nsystems);
 +        if (fplog)
 +        {
 +            fprintf(fplog,"Found GMX_DISRE_ENSEMBLE_SIZE set to %d systems per ensemble\n",dd->nsystems);
 +        }
++        /* This check is only valid on MASTER(cr), so probably
++         * ensemble-averaged distance restraints are broken on more
++         * than one processor per simulation system. */
++        if (MASTER(cr))
++        {
++            check_multi_int(fplog,cr->ms,dd->nsystems,
++                            "the number of systems per ensemble",
++                            FALSE);
++        }
++        gmx_bcast_sim(sizeof(int), &dd->nsystems, cr);
++
 +        /* We used to allow any value of nsystems which was a divisor
 +         * of ms->nsim. But this required an extra communicator which
 +         * was stored in t_fcdata. This pulled in mpi.h in nearly all C files.
 +         */
 +        if (!(cr->ms->nsim == 1 || cr->ms->nsim == dd->nsystems))
 +        {
 +            gmx_fatal(FARGS,"GMX_DISRE_ENSEMBLE_SIZE (%d) is not equal to 1 or the number of systems (option -multi) %d",dd->nsystems,cr->ms->nsim);
 +        }
-         if (cr && cr->ms)
 +        if (fplog)
 +        {
 +            fprintf(fplog,"Our ensemble consists of systems:");
 +            for(i=0; i<dd->nsystems; i++)
 +            {
 +                fprintf(fplog," %d",
 +                        (cr->ms->sim/dd->nsystems)*dd->nsystems+i);
 +            }
 +            fprintf(fplog,"\n");
 +        }
 +        snew(dd->Rtl_6,dd->nres);
 +#endif
 +    }
 +    else
 +    {
 +        dd->nsystems = 1;
 +        dd->Rtl_6 = dd->Rt_6;
 +    }
 +    
 +    if (dd->npair > 0)
 +    {
 +        if (fplog) {
 +            fprintf(fplog,"There are %d distance restraints involving %d atom pairs\n",dd->nres,dd->npair);
 +        }
-                             "the number of distance restraints");
++        /* We have to avoid g_disre dereferencing cr blindly, have
++         * mdrun skip the consistency check when no ensemble averaging
++         * of distance restraints is being done, and only perform the
++         * check from the appropriate processes (since check_multi_int
++         * is too broken to check whether the communication will
++         * succeed...) */
++        if (cr && cr->ms && dd->nsystems > 1 && MASTER(cr))
 +        {
 +            check_multi_int(fplog,cr->ms,fcd->disres.nres,
++                            "the number of distance restraints",
++                            FALSE);
 +        }
 +        please_cite(fplog,"Tropp80a");
 +        please_cite(fplog,"Torda89a");
 +    }
 +}
 +
 +void calc_disres_R_6(const gmx_multisim_t *ms,
 +                     int nfa,const t_iatom forceatoms[],const t_iparams ip[],
 +                     const rvec x[],const t_pbc *pbc,
 +                     t_fcdata *fcd,history_t *hist)
 +{
 +    atom_id     ai,aj;
 +    int         fa,res,i,pair,ki,kj,m;
 +    int         type,npair,np;
 +    rvec        dx;
 +    real        *rt,*rm3tav,*Rtl_6,*Rt_6,*Rtav_6;
 +    real        rt_1,rt_3,rt2;
 +    ivec        it,jt,dt;
 +    t_disresdata *dd;
 +    real        ETerm,ETerm1,cf1=0,cf2=0,invn=0;
 +    gmx_bool        bTav;
 +
 +    dd = &(fcd->disres);
 +    bTav         = (dd->dr_tau != 0);
 +    ETerm        = dd->ETerm;
 +    ETerm1       = dd->ETerm1;
 +    rt           = dd->rt;
 +    rm3tav       = dd->rm3tav;
 +    Rtl_6        = dd->Rtl_6;
 +    Rt_6         = dd->Rt_6;
 +    Rtav_6       = dd->Rtav_6;
 +    
 +    if (bTav)
 +    {
 +        /* scaling factor to smoothly turn on the restraint forces *
 +         * when using time averaging                               */
 +        dd->exp_min_t_tau = hist->disre_initf*ETerm;
 +        
 +        cf1 = dd->exp_min_t_tau;
 +        cf2 = 1.0/(1.0 - dd->exp_min_t_tau);
 +    }
 +    
 +    if (dd->nsystems > 1)
 +    {
 +        invn = 1.0/dd->nsystems;
 +    }
 +    
 +    /* 'loop' over all atom pairs (pair_nr=fa/3) involved in restraints, *
 +     * the total number of atom pairs is nfa/3                           */
 +    res = 0;
 +    fa  = 0;
 +    while (fa < nfa)
 +    {
 +        type  = forceatoms[fa];
 +        npair = ip[type].disres.npair;
 +        
 +        Rtav_6[res] = 0.0;
 +        Rt_6[res]   = 0.0;
 +        
 +        /* Loop over the atom pairs of 'this' restraint */
 +        np = 0;
 +        while (fa < nfa && np < npair)
 +        {
 +            pair = fa/3;
 +            ai   = forceatoms[fa+1];
 +            aj   = forceatoms[fa+2];
 +            
 +            if (pbc)
 +            {
 +                pbc_dx_aiuc(pbc,x[ai],x[aj],dx);
 +            }
 +            else
 +            {
 +                rvec_sub(x[ai],x[aj],dx);
 +            }
 +            rt2  = iprod(dx,dx);
 +            rt_1 = gmx_invsqrt(rt2);
 +            rt_3 = rt_1*rt_1*rt_1;
 +            
 +            rt[pair]         = sqrt(rt2);
 +            if (bTav)
 +            {
 +                /* Here we update rm3tav in t_fcdata using the data
 +                 * in history_t.
 +                 * Thus the results stay correct when this routine
 +                 * is called multiple times.
 +                 */
 +                rm3tav[pair] = cf2*((ETerm - cf1)*hist->disre_rm3tav[pair] +
 +                                    ETerm1*rt_3);
 +            }
 +            else
 +            {
 +                rm3tav[pair] = rt_3;
 +            }
 +
 +            Rt_6[res]       += rt_3*rt_3;
 +            Rtav_6[res]     += rm3tav[pair]*rm3tav[pair];
 +
 +            fa += 3;
 +            np++;
 +        }
 +        if (dd->nsystems > 1)
 +        {
 +            Rtl_6[res]   = Rt_6[res];
 +            Rt_6[res]   *= invn;
 +            Rtav_6[res] *= invn;
 +        }
 +
 +        res++;
 +    }
 +    
 +#ifdef GMX_MPI
 +    if (dd->nsystems > 1)
 +    {
 +        gmx_sum_sim(2*dd->nres,Rt_6,ms);
 +    }
 +#endif
 +}
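 +
 +/* For illustration (parameter values assumed, not taken from this
 + * merge): with a 2 fs time step and dr_tau = 100 ps, ETerm =
 + * exp(-dt/tau) is about 0.99998 and ETerm1 = 1 - ETerm, so each call
 + * mixes a small fraction of the instantaneous r^-3 into the running
 + * average:
 + *
 + *     rm3tav(t) = ETerm*rm3tav(t-dt) + ETerm1*r(t)^-3
 + *
 + * The cf1/cf2 factors above correct this average for the history that
 + * is still missing while t is small compared to dr_tau.
 + */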
 +
 +real ta_disres(int nfa,const t_iatom forceatoms[],const t_iparams ip[],
 +               const rvec x[],rvec f[],rvec fshift[],
 +               const t_pbc *pbc,const t_graph *g,
 +               real lambda,real *dvdlambda,
 +               const t_mdatoms *md,t_fcdata *fcd,
 +               int *global_atom_index)
 +{
 +    const real sixth=1.0/6.0;
 +    const real seven_three=7.0/3.0;
 +    
 +    atom_id     ai,aj;
 +    int         fa,res,npair,p,pair,ki=CENTRAL,m;
 +    int         type;
 +    rvec        dx;
 +    real        weight_rt_1;
 +    real        smooth_fc,Rt,Rtav,rt2,*Rtl_6,*Rt_6,*Rtav_6;
 +    real        k0,f_scal=0,fmax_scal,fk_scal,fij;
 +    real        tav_viol,instant_viol,mixed_viol,violtot,vtot;
 +    real        tav_viol_Rtav7,instant_viol_Rtav7;
 +    real        up1,up2,low;
 +    gmx_bool        bConservative,bMixed,bViolation;
 +    ivec        it,jt,dt;
 +    t_disresdata *dd;
 +    int         dr_weighting;
 +    gmx_bool        dr_bMixed;
 +    
 +    dd = &(fcd->disres);
 +    dr_weighting = dd->dr_weighting;
 +    dr_bMixed    = dd->dr_bMixed;
 +    Rtl_6        = dd->Rtl_6;
 +    Rt_6         = dd->Rt_6;
 +    Rtav_6       = dd->Rtav_6;
 +
 +    tav_viol=instant_viol=mixed_viol=tav_viol_Rtav7=instant_viol_Rtav7=0;
 +
 +    smooth_fc = dd->dr_fc;
 +    if (dd->dr_tau != 0)
 +    {
 +        /* scaling factor to smoothly turn on the restraint forces *
 +         * when using time averaging                               */
 +        smooth_fc *= (1.0 - dd->exp_min_t_tau); 
 +    }
 +    
 +    violtot = 0;
 +    vtot    = 0;
 +    
 +    /* 'loop' over all atom pairs (pair_nr=fa/3) involved in restraints, *
 +     * the total number of atom pairs is nfa/3                           */
 +    res  = 0;
 +    fa   = 0;
 +    while (fa < nfa)
 +    {
 +        type  = forceatoms[fa];
 +        /* Take action depending on restraint, calculate scalar force */
 +        npair = ip[type].disres.npair;
 +        up1   = ip[type].disres.up1;
 +        up2   = ip[type].disres.up2;
 +        low   = ip[type].disres.low;
 +        k0    = smooth_fc*ip[type].disres.kfac;
 +        
 +        /* save some flops when there is only one pair */
 +        if (ip[type].disres.type != 2)
 +        {
 +            bConservative = (dr_weighting == edrwConservative) && (npair > 1);
 +            bMixed        = dr_bMixed;
 +            Rt   = pow(Rt_6[res],-sixth);
 +            Rtav = pow(Rtav_6[res],-sixth);
 +        }
 +        else
 +        {
 +            /* When rtype=2 use the instantaneous, not the ensemble-averaged distance */
 +            bConservative = (npair > 1);
 +            bMixed        = FALSE;
 +            Rt   = pow(Rtl_6[res],-sixth);
 +            Rtav = Rt;
 +        }
 +        
 +        if (Rtav > up1)
 +        {
 +            bViolation = TRUE;
 +            tav_viol = Rtav - up1;
 +        }
 +        else if (Rtav < low)
 +        {
 +            bViolation = TRUE;
 +            tav_viol = Rtav - low;
 +        }
 +        else
 +        {
 +            bViolation = FALSE;
 +        }
 +        
 +        if (bViolation)
 +        {
 +            /* NOTE:
 +             * there is no real potential when time averaging is applied
 +             */
 +            vtot += 0.5*k0*sqr(tav_viol);
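 +            /* 1/inf == 0, so this is a crude runtime check for vtot
 +             * having overflowed to infinity. */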
 +            if (1/vtot == 0)
 +            {
 +                printf("vtot is inf: %f\n",vtot);
 +            }
 +            if (!bMixed)
 +            {
 +                f_scal   = -k0*tav_viol;
 +                violtot += fabs(tav_viol);
 +            }
 +            else
 +            {
 +                if (Rt > up1)
 +                {
 +                    if (tav_viol > 0)
 +                    {
 +                        instant_viol = Rt - up1;
 +                    }
 +                    else
 +                    {
 +                        bViolation = FALSE;
 +                    }
 +                }
 +                else if (Rt < low)
 +                {
 +                    if (tav_viol < 0)
 +                    {
 +                        instant_viol = Rt - low;
 +                    }
 +                    else
 +                    {
 +                        bViolation = FALSE;
 +                    }
 +                }
 +                else
 +                {
 +                    bViolation = FALSE;
 +                }
 +                if (bViolation)
 +                {
 +                    mixed_viol = sqrt(tav_viol*instant_viol);
 +                    f_scal     = -k0*mixed_viol;
 +                    violtot   += mixed_viol;
 +                }
 +            }
 +        }
 +
 +        if (bViolation)
 +        {
 +            fmax_scal = -k0*(up2-up1);
 +            /* Correct the force for the number of restraints */
 +            if (bConservative)
 +            {
 +                f_scal  = max(f_scal,fmax_scal);
 +                if (!bMixed)
 +                {
 +                    f_scal *= Rtav/Rtav_6[res];
 +                }
 +                else
 +                {
 +                    f_scal /= 2*mixed_viol;
 +                    tav_viol_Rtav7     = tav_viol*Rtav/Rtav_6[res];
 +                    instant_viol_Rtav7 = instant_viol*Rt/Rt_6[res];
 +                }
 +            }
 +            else
 +            {
 +                f_scal /= (real)npair;
 +                f_scal  = max(f_scal,fmax_scal);
 +            }    
 +            
 +            /* Exert the force ... */
 +            
 +            /* Loop over the atom pairs of 'this' restraint */
 +            for(p=0; p<npair; p++)
 +            {
 +                pair = fa/3;
 +                ai   = forceatoms[fa+1];
 +                aj   = forceatoms[fa+2];
 +                
 +                if (pbc)
 +                {
 +                    ki = pbc_dx_aiuc(pbc,x[ai],x[aj],dx);
 +                }
 +                else
 +                {
 +                    rvec_sub(x[ai],x[aj],dx);
 +                }
 +                rt2 = iprod(dx,dx);
 +                
 +                weight_rt_1 = gmx_invsqrt(rt2);
 +                
 +                if (bConservative)
 +                {
 +                    if (!dr_bMixed)
 +                    {
 +                        weight_rt_1 *= pow(dd->rm3tav[pair],seven_three);
 +                    }
 +                    else
 +                    {
 +                        weight_rt_1 *= tav_viol_Rtav7*pow(dd->rm3tav[pair],seven_three)+
 +                            instant_viol_Rtav7*pow(dd->rt[pair],-7);
 +                    }
 +                }
 +                
 +                fk_scal  = f_scal*weight_rt_1;
 +                
 +                if (g)
 +                {
 +                    ivec_sub(SHIFT_IVEC(g,ai),SHIFT_IVEC(g,aj),dt);
 +                    ki=IVEC2IS(dt);
 +                }
 +                
 +                for(m=0; m<DIM; m++)
 +                {
 +                    fij            = fk_scal*dx[m];
 +                    
 +                    f[ai][m]      += fij;
 +                    f[aj][m]      -= fij;
 +                    fshift[ki][m] += fij;
 +                    fshift[CENTRAL][m] -= fij;
 +                }
 +                fa += 3;
 +            }
 +        }
 +        else
 +        {
 +            /* No violation, so no force or potential contributions */
 +            fa += 3*npair;
 +        }
 +        res++;
 +    }
 +    
 +    dd->sumviol = violtot;
 +    
 +    /* Return energy */
 +    return vtot;
 +}
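 +
 +/* Sketch of the flat-bottomed restraint potential computed above (no
 + * mixing), with R the time- and/or ensemble-averaged distance and k0
 + * the smoothed force constant; beyond up2 the force is capped via
 + * fmax_scal:
 + *
 + *     V(R) = 0.5*k0*(R - low)^2   for R < low
 + *     V(R) = 0                    for low <= R <= up1
 + *     V(R) = 0.5*k0*(R - up1)^2   for up1 < R
 + */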
 +
 +void update_disres_history(t_fcdata *fcd,history_t *hist)
 +{
 +    t_disresdata *dd;
 +    int pair;
 +    
 +    dd = &(fcd->disres);
 +    if (dd->dr_tau != 0)
 +    {
 +        /* Copy the new time averages that have been calculated
 +         * in calc_disres_R_6.
 +         */
 +        hist->disre_initf = dd->exp_min_t_tau;
 +        for(pair=0; pair<dd->npair; pair++)
 +        {
 +            hist->disre_rm3tav[pair] = dd->rm3tav[pair];
 +        }
 +    }
 +}
index e7c2d65e1e198742921394b90671956468d54023,0000000000000000000000000000000000000000..150027514c99ebdb8529272c2c6d65f647b7b9e3
mode 100644,000000..100644
--- /dev/null
@@@ -1,619 -1,0 +1,623 @@@
-                                   "      Each PP %s can only use one GPU, so only %d GPU%s%s will be used.",
 +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
 + *
 + * 
 + * This file is part of GROMACS.
 + * Copyright (c) 2012-  
 + *
 + * Written by the Gromacs development team under coordination of
 + * David van der Spoel, Berk Hess, and Erik Lindahl.
 + *
 + * This library is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU Lesser General Public License
 + * as published by the Free Software Foundation; either version 2
 + * of the License, or (at your option) any later version.
 + *
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the research papers on the package. Check out http://www.gromacs.org
 + * 
 + * And Hey:
 + * GROup of MAchos and Cynical Suckers
 + */
 +#ifdef HAVE_CONFIG_H
 +#include <config.h>
 +#endif
 +
 +#include <stdlib.h>
 +#include <assert.h>
 +#include <string.h>
 +
 +#include "types/enums.h"
 +#include "types/hw_info.h"
 +#include "types/commrec.h"
 +#include "gmx_fatal.h"
 +#include "gmx_fatal_collective.h"
 +#include "smalloc.h"
 +#include "gpu_utils.h"
 +#include "statutil.h"
 +#include "gmx_detect_hardware.h"
 +#include "main.h"
 +#include "md_logging.h"
 +
 +#if ((defined(WIN32) || defined( _WIN32 ) || defined(WIN64) || defined( _WIN64 )) && !(defined (__CYGWIN__) || defined (__CYGWIN32__)))
 +#include "windows.h"
 +#endif
 +
 +/* The user cannot pass more than 10 distinct GPU IDs, since each ID is
 + * assumed to be a single digit. However, as multiple processes can
 + * share a GPU, the ID string can still contain more than 10 entries.
 + * To account for potential extreme cases we'll set the limit to a pretty
 + * ridiculous number. */
 +static unsigned int max_gpu_ids_user = 64;
 +
 +static const char* invalid_gpuid_hint =
 +    "A delimiter-free sequence of valid numeric IDs of available GPUs is expected.";
 +
 +/* FW decl. */
 +void limit_num_gpus_used(gmx_hw_info_t *hwinfo, int count);
 +
 +static void sprint_gpus(char *sbuf, const gmx_gpu_info_t *gpu_info, gmx_bool bPrintAll)
 +{
 +    int      i, ndev;
 +    char     stmp[STRLEN];
 +
 +    ndev = gpu_info->ncuda_dev;
 +
 +    sbuf[0] = '\0';
 +    for (i = 0; i < ndev; i++)
 +    {
 +        get_gpu_device_info_string(stmp, gpu_info, i);
 +        strcat(sbuf, "  ");
 +        strcat(sbuf, stmp);
 +        if (i < ndev - 1)
 +        {
 +            strcat(sbuf, "\n");
 +        }
 +    }
 +}
 +
 +static void print_gpu_detection_stats(FILE *fplog,
 +                                      const gmx_gpu_info_t *gpu_info,
 +                                      const t_commrec *cr)
 +{
 +    char onhost[266],stmp[STRLEN];
 +    int  ngpu;
 +
 +    ngpu = gpu_info->ncuda_dev;
 +
 +#if defined GMX_MPI && !defined GMX_THREAD_MPI
 +    /* We only print the detection on one of the possibly multiple nodes */
 +    strncpy(onhost," on host ",10);
 +    gmx_gethostname(onhost+9,256);
 +#else
 +    /* We detect all relevant GPUs */
 +    strncpy(onhost,"",1);
 +#endif
 +
 +    if (ngpu > 0)
 +    {
 +        sprint_gpus(stmp, gpu_info, TRUE);
 +        md_print_warn(cr, fplog, "%d GPU%s detected%s:\n%s\n",
 +                      ngpu, (ngpu > 1) ? "s" : "", onhost, stmp);
 +    }
 +    else
 +    {
 +        md_print_warn(cr, fplog, "No GPUs detected%s\n", onhost);
 +    }
 +}
 +
 +static void print_gpu_use_stats(FILE *fplog,
 +                                const gmx_gpu_info_t *gpu_info,
 +                                const t_commrec *cr)
 +{
 +    char sbuf[STRLEN], stmp[STRLEN];
 +    int  i, ngpu, ngpu_all;
 +
 +    ngpu     = gpu_info->ncuda_dev_use;
 +    ngpu_all = gpu_info->ncuda_dev;
 +
 +    /* Issue note if GPUs are available but not used */
 +    if (ngpu_all > 0 && ngpu < 1)
 +    {
 +        sprintf(sbuf,
 +                "%d compatible GPU%s detected in the system, but none will be used.\n"
 +                "Consider trying GPU acceleration with the Verlet scheme!",
 +                ngpu_all, (ngpu_all > 1) ? "s" : "");
 +    }
 +    else
 +    {
 +        sprintf(sbuf, "%d GPU%s %sselected for this run: ",
 +                ngpu, (ngpu > 1) ? "s" : "",
 +                gpu_info->bUserSet ? "user-" : "auto-");
 +        for (i = 0; i < ngpu; i++)
 +        {
 +            sprintf(stmp, "#%d", get_gpu_device_id(gpu_info, i));
 +            if (i < ngpu - 1)
 +            {
 +                strcat(stmp, ", ");
 +            }
 +            strcat(sbuf, stmp);
 +        }
 +    }
 +    md_print_info(cr, fplog, "%s\n\n", sbuf);
 +}
 +
 +/* Parse a "plain" GPU ID string which contains a sequence of digits corresponding
 + * to GPU IDs; the order will indicate the process/tMPI thread - GPU assignment. */
 +static void parse_gpu_id_plain_string(const char *idstr, int *nid, int *idlist)
 +{
 +    int  i;
 +    size_t len_idstr;
 +
 +    len_idstr = strlen(idstr);
 +
 +    if (len_idstr > max_gpu_ids_user)
 +    {
 +        gmx_fatal(FARGS,"%d GPU IDs provided, but only at most %d are supported",
 +                  len_idstr, max_gpu_ids_user);
 +    }
 +
 +    *nid = len_idstr;
 +
 +    for (i = 0; i < *nid; i++)
 +    {
 +        if (idstr[i] < '0' || idstr[i] > '9')
 +        {
 +            gmx_fatal(FARGS, "Invalid character in GPU ID string: '%c'\n%s\n",
 +                      idstr[i], invalid_gpuid_hint);
 +        }
 +        idlist[i] = idstr[i] - '0';
 +    }
 +}
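 +
 +/* Example (hypothetical input): the string "0011" yields nid = 4 and
 + * idlist = {0, 0, 1, 1}, i.e. the first two PP ranks/tMPI threads on
 + * the node use GPU #0 and the next two use GPU #1. */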
 +
 +static void parse_gpu_id_csv_string(const char *idstr, int *nid, int *idlist)
 +{
 +    /* XXX implement CSV format to support more than 10 different GPUs in a box. */
 +    gmx_incons("Not implemented yet");
 +}
 +
 +void gmx_check_hw_runconf_consistency(FILE *fplog, gmx_hw_info_t *hwinfo,
 +                                      const t_commrec *cr, int ntmpi_requested,
 +                                      gmx_bool bUseGPU)
 +{
 +    int      npppn, ntmpi_pp, ngpu;
 +    char     sbuf[STRLEN], th_or_proc[STRLEN], th_or_proc_plural[STRLEN], pernode[STRLEN];
 +    char     gpu_plural[2];
 +    gmx_bool bGPUBin, btMPI, bMPI, bMaxMpiThreadsSet, bNthreadsAuto, bEmulateGPU;
 +
 +    assert(hwinfo);
 +    assert(cr);
 +
 +    btMPI = bMPI = FALSE;
 +    bNthreadsAuto = FALSE;
 +#if defined(GMX_THREAD_MPI)
 +    btMPI = TRUE;
 +    bNthreadsAuto = (ntmpi_requested < 1);
 +#elif defined(GMX_LIB_MPI)
 +    bMPI  = TRUE;
 +#endif
 +
 +#ifdef GMX_GPU
 +    bGPUBin      = TRUE;
 +#else
 +    bGPUBin      = FALSE;
 +#endif
 +
 +    /* GPU emulation detection is done later, but we need it here as well
 +     * -- uncool, but there's no elegant workaround */
 +    bEmulateGPU       = (getenv("GMX_EMULATE_GPU") != NULL);
 +    bMaxMpiThreadsSet = (getenv("GMX_MAX_MPI_THREADS") != NULL);
 +
 +    if (SIMMASTER(cr))
 +    {
 +        /* check the acceleration mdrun is compiled with against hardware capabilities */
 +        /* TODO: Here we assume homogeneous hardware which is not necessarily the case!
 +         *       Might not hurt to add an extra check over MPI. */
 +        gmx_cpuid_acceleration_check(hwinfo->cpuid_info, fplog);
 +    }
 +
 +    /* Below we only do consistency checks for PP and GPUs,
 +     * this is irrelevant for PME only nodes, so in that case we return here.
 +     */
 +    if (!(cr->duty & DUTY_PP))
 +    {
 +        return;
 +    }
 +
 +    /* Need to ensure that we have enough GPUs:
 +     * - need one GPU per PP node
 +     * - no GPU oversubscription with tMPI
 +     * => keep on the GPU support, otherwise turn off (or bail if forced)
 +     * */
 +    /* number of PP processes per node */
 +    npppn = cr->nrank_pp_intranode;
 +
 +    pernode[0] = '\0';
 +    th_or_proc_plural[0] = '\0';
 +    if (btMPI)
 +    {
 +        sprintf(th_or_proc, "thread-MPI thread");
 +        if (npppn > 1)
 +        {
 +            sprintf(th_or_proc_plural, "s");
 +        }
 +    }
 +    else if (bMPI)
 +    {
 +        sprintf(th_or_proc, "MPI process");
 +        if (npppn > 1)
 +        {
 +            sprintf(th_or_proc_plural, "es");
 +        }
 +        sprintf(pernode, " per node");
 +    }
 +    else
 +    {
 +        /* neither MPI nor tMPI */
 +        sprintf(th_or_proc, "process");
 +    }
 +
 +    if (bGPUBin)
 +    {
 +        print_gpu_detection_stats(fplog, &hwinfo->gpu_info, cr);
 +    }
 +
 +    if (bUseGPU && hwinfo->bCanUseGPU && !bEmulateGPU)
 +    {
 +        ngpu = hwinfo->gpu_info.ncuda_dev_use;
 +        sprintf(gpu_plural, "%s", (ngpu > 1) ? "s" : "");
 +
 +        /* the number of tMPI threads was auto-adjusted */
 +        if (btMPI && bNthreadsAuto && SIMMASTER(cr))
 +        {
 +            if (npppn < ngpu)
 +            {
 +                if (hwinfo->gpu_info.bUserSet)
 +                {
 +                    /* The user manually provided more GPUs than threads we could
 +                     * automatically start. */
 +                    gmx_fatal(FARGS,
 +                              "%d GPU%s provided, but only %d PP thread-MPI thread%s coud be started.\n"
 +                              "%s requires one PP tread-MPI thread per GPU; use fewer GPUs%s.",
 +                              ngpu, gpu_plural, npppn, th_or_proc_plural,
 +                              ShortProgram(), bMaxMpiThreadsSet ? "\nor allow more threads to be used" : "");
 +                }
 +                else
 +                {
 +                    /* There are more GPUs than tMPI threads; we have to limit the number of GPUs used. */
 +                    md_print_warn(cr,fplog,
 +                                  "NOTE: %d GPU%s were detected, but only %d PP thread-MPI thread%s can be started.\n"
 +                                  "      %s can use one GPU per PP tread-MPI thread, so only %d GPU%s will be used.%s\n",
 +                                  ngpu, gpu_plural, npppn, th_or_proc_plural,
 +                                  ShortProgram(), npppn, npppn > 1 ? "s" : "",
 +                                  bMaxMpiThreadsSet ? "\n      Also, you can allow more threads to be used by increasing GMX_MAX_MPI_THREADS" : "");
 +
 +                    if (cr->rank_pp_intranode == 0)
 +                    {
 +                        limit_num_gpus_used(hwinfo, npppn);
 +                        ngpu = hwinfo->gpu_info.ncuda_dev_use;
 +                        sprintf(gpu_plural, "%s", (ngpu > 1) ? "s" : "");
 +                    }
 +                }
 +            }
 +        }
 +
 +        if (ngpu != npppn)
 +        {
 +            if (hwinfo->gpu_info.bUserSet)
 +            {
 +                gmx_fatal(FARGS,
 +                          "Incorrect launch configuration: mismatching number of PP %s%s and GPUs%s.\n"
 +                          "%s was started with %d PP %s%s%s, but you provided %d GPU%s.",
 +                          th_or_proc, btMPI ? "s" : "es" , pernode,
 +                          ShortProgram(), npppn, th_or_proc, th_or_proc_plural, pernode, ngpu, gpu_plural);
 +            }
 +            else
 +            {
 +                if (ngpu > npppn)
 +                {
 +                    md_print_warn(cr,fplog,
 +                                  "NOTE: potentially sub-optimal launch configuration, %s started with less\n"
 +                                  "      PP %s%s%s than GPU%s available.\n"
-             same_count = 0;
++                                  "      Each PP %s can use only one GPU, %d GPU%s%s will be used.\n",
 +                                  ShortProgram(),
 +                                  th_or_proc, th_or_proc_plural, pernode, gpu_plural,
 +                                  th_or_proc, npppn, gpu_plural, pernode);
 +
 +                    if (bMPI || (btMPI && cr->rank_pp_intranode == 0))
 +                    {
 +                        limit_num_gpus_used(hwinfo, npppn);
 +                        ngpu = hwinfo->gpu_info.ncuda_dev_use;
 +                        sprintf(gpu_plural, "%s", (ngpu > 1) ? "s" : "");
 +                    }
 +                }
 +                else
 +                {
 +                    /* Avoid duplicate error messages.
 +                     * Unfortunately we can only do this at the physical node
 +                     * level, since the hardware setup and MPI process count
 +                     * might differ between physical nodes.
 +                     */
 +                    if (cr->rank_pp_intranode == 0)
 +                    {
 +                        gmx_fatal(FARGS,
 +                                  "Incorrect launch configuration: mismatching number of PP %s%s and GPUs%s.\n"
 +                                  "%s was started with %d PP %s%s%s, but only %d GPU%s were detected.",
 +                                  th_or_proc, btMPI ? "s" : "es" , pernode,
 +                                  ShortProgram(), npppn, th_or_proc, th_or_proc_plural, pernode, ngpu, gpu_plural);
 +                    }
 +#ifdef GMX_MPI
 +                    else
 +                    {
 +                        /* Prevent other ranks from continuing after an inconsistency */
 +                        MPI_Barrier(cr->mpi_comm_mygroup);
 +                    }
 +#endif
 +                }
 +            }
 +        }
 +
++        hwinfo->gpu_info.bDevShare = FALSE;
 +        if (hwinfo->gpu_info.bUserSet && (cr->rank_pp_intranode == 0))
 +        {
 +            int i, j, same_count;
 +            gmx_bool bSomeSame, bAllDifferent;
 +
-                               "      multiple %s%s; this should be avoided as it generally\n"
-                               "      causes performance loss.",
++            same_count = 0; /* number of GPUs shared among ranks */
 +            bSomeSame = FALSE;
 +            bAllDifferent = TRUE;
 +
 +            for (i = 0; i < ngpu - 1; i++)
 +            {
 +                for (j = i + 1; j < ngpu; j++)
 +                {
 +                    bSomeSame       |= hwinfo->gpu_info.cuda_dev_use[i] == hwinfo->gpu_info.cuda_dev_use[j];
 +                    bAllDifferent   &= hwinfo->gpu_info.cuda_dev_use[i] != hwinfo->gpu_info.cuda_dev_use[j];
 +                    same_count      += hwinfo->gpu_info.cuda_dev_use[i] == hwinfo->gpu_info.cuda_dev_use[j];
 +                }
 +            }
 +
++            /* store whether any GPUs are shared/oversubscribed among ranks */
++            hwinfo->gpu_info.bDevShare = bSomeSame;
++
 +            if (btMPI && !bAllDifferent)
 +            {
 +                gmx_fatal(FARGS,
 +                          "Invalid GPU assignment: can't share a GPU among multiple thread-MPI threads.\n"
 +                          "Use MPI if you are sure that you want to assign GPU to multiple threads.");
 +            }
 +
 +            if (bSomeSame)
 +            {
 +                md_print_warn(cr,fplog,
 +                              "NOTE: Potentially sub-optimal launch configuration: you assigned %s to\n"
++                              "      multiple %s%s; this should be avoided as it can cause\n"
++                              "      performance loss.\n",
 +                              same_count > 1 ? "GPUs" : "a GPU", th_or_proc, btMPI ? "s" : "es");
 +            }
 +        }
 +        print_gpu_use_stats(fplog, &hwinfo->gpu_info, cr);
 +    }
 +}
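 +
 +/* Illustrative outcomes of the checks above (numbers assumed): with 4
 + * PP ranks per node and only 2 auto-selected GPUs, mdrun stops with
 + * the "mismatching number of PP ..." error; with 2 PP ranks and 4
 + * auto-selected GPUs it prints the sub-optimal-launch note and limits
 + * itself to 2 GPUs via limit_num_gpus_used(). */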
 +
 +/* Return the number of hardware threads supported by the current CPU.
 + * We assume that this is equal to the number of CPUs reported to be
 + * online by the OS at the time of the call.
 + */
 +static int get_nthreads_hw_avail(FILE *fplog, const t_commrec *cr)
 +{
 +    int ret = 0;
 +
 +#if ((defined(WIN32) || defined( _WIN32 ) || defined(WIN64) || defined( _WIN64 )) && !(defined (__CYGWIN__) || defined (__CYGWIN32__)))
 +    /* Windows */
 +    SYSTEM_INFO sysinfo;
 +    GetSystemInfo( &sysinfo );
 +    ret = sysinfo.dwNumberOfProcessors;
 +#elif defined HAVE_SYSCONF
 +    /* We are probably on Unix.
 +     * Now check if we have the argument to use before executing the call
 +     */
 +#if defined(_SC_NPROCESSORS_ONLN)
 +    ret = sysconf(_SC_NPROCESSORS_ONLN);
 +#elif defined(_SC_NPROC_ONLN)
 +    ret = sysconf(_SC_NPROC_ONLN);
 +#elif defined(_SC_NPROCESSORS_CONF)
 +    ret = sysconf(_SC_NPROCESSORS_CONF);
 +#elif defined(_SC_NPROC_CONF)
 +    ret = sysconf(_SC_NPROC_CONF);
 +#endif /* End of check for sysconf argument values */
 +
 +#else
 +    /* Neither windows nor Unix. No fscking idea how many CPUs we have! */
 +    ret = -1;
 +#endif
 +
 +    if (debug)
 +    {
 +        fprintf(debug, "Detected %d processors, will use this as the number "
 +                "of supported hardware threads.\n", ret);
 +    }
 +
 +#ifdef GMX_OPENMP
 +    if (ret != gmx_omp_get_num_procs())
 +    {
 +        md_print_warn(cr, fplog,
 +                      "Number of CPUs detected (%d) does not match the number reported by OpenMP (%d).\n"
 +                      "Consider setting the launch configuration manually!",
 +                      ret, gmx_omp_get_num_procs());
 +    }
 +#endif
 +
 +    return ret;
 +}
 +
 +void gmx_detect_hardware(FILE *fplog, gmx_hw_info_t *hwinfo,
 +                         const t_commrec *cr,
 +                         gmx_bool bForceUseGPU, gmx_bool bTryUseGPU,
 +                         const char *gpu_id)
 +{
 +    int             i;
 +    const char      *env;
 +    char            sbuf[STRLEN], stmp[STRLEN];
 +    gmx_hw_info_t   *hw;
 +    gmx_gpu_info_t  gpuinfo_auto, gpuinfo_user;
 +    gmx_bool        bGPUBin;
 +
 +    assert(hwinfo);
 +
 +    /* detect CPUID info; no fuss, we don't detect system-wide
 +     * -- sloppy, but that's it for now */
 +    if (gmx_cpuid_init(&hwinfo->cpuid_info) != 0)
 +    {
 +        gmx_fatal_collective(FARGS, cr, NULL, "CPUID detection failed!");
 +    }
 +
 +    /* detect number of hardware threads */
 +    hwinfo->nthreads_hw_avail = get_nthreads_hw_avail(fplog, cr);
 +
 +    /* detect GPUs */
 +    hwinfo->gpu_info.ncuda_dev_use  = 0;
 +    hwinfo->gpu_info.cuda_dev_use   = NULL;
 +    hwinfo->gpu_info.ncuda_dev      = 0;
 +    hwinfo->gpu_info.cuda_dev       = NULL;
 +
 +#ifdef GMX_GPU
 +    bGPUBin      = TRUE;
 +#else
 +    bGPUBin      = FALSE;
 +#endif
 +
 +    /* Bail if binary is not compiled with GPU acceleration, but this is either
 +     * explicitly (-nb gpu) or implicitly (gpu ID passed) requested. */
 +    if (bForceUseGPU && !bGPUBin)
 +    {
 +        gmx_fatal(FARGS, "GPU acceleration requested, but %s was compiled without GPU support!", ShortProgram());
 +    }
 +    if (gpu_id != NULL && !bGPUBin)
 +    {
 +        gmx_fatal(FARGS, "GPU ID string set, but %s was compiled without GPU support!", ShortProgram());
 +    }
 +
 +    /* run the detection if the binary was compiled with GPU support */
 +    if (bGPUBin && getenv("GMX_DISABLE_GPU_DETECTION")==NULL)
 +    {
 +        char detection_error[STRLEN];
 +
 +        if (detect_cuda_gpus(&hwinfo->gpu_info, detection_error) != 0)
 +        {
 +            if (detection_error != NULL && detection_error[0] != '\0')
 +            {
 +                sprintf(sbuf, ":\n      %s\n", detection_error);
 +            }
 +            else
 +            {
 +                sprintf(sbuf, ".");
 +            }
 +            md_print_warn(cr, fplog,
 +                          "NOTE: Error occurred during GPU detection%s"
 +                          "      Can not use GPU acceleration, will fall back to CPU kernels.\n",
 +                          sbuf);
 +        }
 +    }
 +
 +    if (bForceUseGPU || bTryUseGPU)
 +    {
 +        env = getenv("GMX_GPU_ID");
 +        if (env != NULL && gpu_id != NULL)
 +        {
 +            gmx_fatal(FARGS,"GMX_GPU_ID and -gpu_id can not be used at the same time");
 +        }
 +        if (env == NULL)
 +        {
 +            env = gpu_id;
 +        }
 +
 +        /* parse GPU IDs if the user passed any */
 +        if (env != NULL)
 +        {
 +            int *gpuid, *checkres;
 +            int nid, res;
 +
 +            snew(gpuid, max_gpu_ids_user);
 +            snew(checkres, max_gpu_ids_user);
 +
 +            parse_gpu_id_plain_string(env, &nid, gpuid);
 +
 +            if (nid == 0)
 +            {
 +                gmx_fatal(FARGS, "Empty GPU ID string encountered.\n%s\n", invalid_gpuid_hint);
 +            }
 +
 +            res = check_select_cuda_gpus(checkres, &hwinfo->gpu_info, gpuid, nid);
 +
 +            if (!res)
 +            {
 +                print_gpu_detection_stats(fplog, &hwinfo->gpu_info, cr);
 +
 +                sprintf(sbuf, "Some of the requested GPUs do not exist, behave strangely, or are not compatible:\n");
 +                for (i = 0; i < nid; i++)
 +                {
 +                    if (checkres[i] != egpuCompatible)
 +                    {
 +                        sprintf(stmp, "    GPU #%d: %s\n",
 +                                gpuid[i], gpu_detect_res_str[checkres[i]]);
 +                        strcat(sbuf, stmp);
 +                    }
 +                }
 +                gmx_fatal(FARGS, "%s", sbuf);
 +            }
 +
 +            hwinfo->gpu_info.bUserSet = TRUE;
 +
 +            sfree(gpuid);
 +            sfree(checkres);
 +        }
 +        else
 +        {
 +            pick_compatible_gpus(&hwinfo->gpu_info);
 +            hwinfo->gpu_info.bUserSet = FALSE;
 +        }
 +
 +        /* decide whether we can use GPU */
 +        hwinfo->bCanUseGPU = (hwinfo->gpu_info.ncuda_dev_use > 0);
 +        if (!hwinfo->bCanUseGPU && bForceUseGPU)
 +        {
 +            gmx_fatal(FARGS, "GPU acceleration requested, but no compatible GPUs were detected.");
 +        }
 +    }
 +}
 +
 +void limit_num_gpus_used(gmx_hw_info_t *hwinfo, int count)
 +{
 +    int ndev_use;
 +
 +    assert(hwinfo);
 +
 +    ndev_use = hwinfo->gpu_info.ncuda_dev_use;
 +
 +    if (count > ndev_use)
 +    {
 +        /* won't increase the # of GPUs */
 +        return;
 +    }
 +
 +    if (count < 1)
 +    {
 +        char sbuf[STRLEN];
 +        sprintf(sbuf, "Limiting the number of GPUs to <1 doesn't make sense (detected %d, %d requested)!",
 +                ndev_use, count);
 +        gmx_incons(sbuf);
 +    }
 +
 +    /* TODO: improve this implementation: either sort GPUs or remove the weakest here */
 +    hwinfo->gpu_info.ncuda_dev_use = count;
 +}
 +
 +void gmx_hardware_info_free(gmx_hw_info_t *hwinfo)
 +{
 +    if (hwinfo)
 +    {
 +        gmx_cpuid_done(hwinfo->cpuid_info);
 +        free_gpu_info(&hwinfo->gpu_info);
 +        sfree(hwinfo);
 +    }
 +}
index 673025b7697dad412210da03097c2329f1b1cb78,0000000000000000000000000000000000000000..b5a5a7ace2967e619487a51dd288dd7a770c1857
mode 100644,000000..100644
--- /dev/null
@@@ -1,447 -1,0 +1,466 @@@
- void gmx_omp_nthreads_read_env(int *nthreads_omp)
 +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
 + *
 + * 
 + *                This source code is part of
 + * 
 + *                 G   R   O   M   A   C   S
 + * 
 + *          GROningen MAchine for Chemical Simulations
 + * 
 + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
 + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 + * Copyright (c) 2001-2010, The GROMACS development team,
 + * check out http://www.gromacs.org for more information.
 + *
 + * This program is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU General Public License
 + * as published by the Free Software Foundation; either version 2
 + * of the License, or (at your option) any later version.
 + * 
 + * If you want to redistribute modifications, please consider that
 + * scientific software is very special. Version control is crucial -
 + * bugs must be traceable. We will be happy to consider code for
 + * inclusion in the official distribution, but derived work must not
 + * be called official GROMACS. Details are found in the README & COPYING
 + * files - if they are missing, get the official version at www.gromacs.org.
 + * 
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the papers on the package - you can find them in the top README file.
 + * 
 + * For more info, check our website at http://www.gromacs.org
 + * 
 + * And Hey:
 + * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon
 + */
 +
 +#include <stdio.h>
 +#include <stdlib.h>
 +#include <string.h>
 +#include <assert.h>
 +
 +#ifdef HAVE_CONFIG_H
 +#include <config.h>
 +#endif
 +
 +#include "gmx_fatal.h"
 +#include "typedefs.h"
 +#include "macros.h"
 +#include "network.h"
 +#include "statutil.h"
 +#include "gmx_omp.h"
 +#include "gmx_omp_nthreads.h"
 +#include "md_logging.h"
 +
 +/** Structure with the number of threads for each OpenMP multi-threaded
 + *  algorithmic module in mdrun. */
 +typedef struct
 +{
 +    int gnth;               /**< Global num. of threads per PP or PP+PME process/tMPI thread. */
 +    int gnth_pme;           /**< Global num. of threads per PME only process/tMPI thread. */
 +
 +    int nth[emntNR];        /**< Number of threads for each module, indexed with module_nth_t */
 +    gmx_bool initialized;   /**< TRUE if the module has been initialized. */
 +} omp_module_nthreads_t;
 +
 +/** Names of environment variables to set the per module number of threads.
 + *
 + *  Indexed with the values of module_nth_t.
 + * */
 +static const char *modth_env_var[emntNR] =
 +{
 +    "GMX_DEFAULT_NUM_THREADS should never be set",
 +    "GMX_DOMDEC_NUM_THREADS", "GMX_PAIRSEARCH_NUM_THREADS",
 +    "GMX_NONBONDED_NUM_THREADS", "GMX_BONDED_NUM_THREADS",
 +    "GMX_PME_NUM_THREADS", "GMX_UPDATE_NUM_THREADS",
 +    "GMX_VSITE_NUM_THREADS",
 +    "GMX_LINCS_NUM_THREADS", "GMX_SETTLE_NUM_THREADS"
 +};
 +
 +/** Names of the modules. */
 +static const char *mod_name[emntNR] =
 +{
 +    "default", "domain decomposition", "pair search", "non-bonded",
 +    "bonded", "PME", "update", "LINCS", "SETTLE"
 +};
 +
 +/** Number of threads for each algorithmic module.
 + *
 + *  File-scope global variable that gets set once in \init_module_nthreads
 + *  and queried via gmx_omp_nthreads_get.
 + *
 + *  All fields are initialized to 0 which should result in errors if
 + *  the init call is omitted.
 + * */
 +static omp_module_nthreads_t modth = { 0, 0, {0, 0, 0, 0, 0, 0, 0, 0, 0}, FALSE};
 +
 +
 +/** Determine the number of threads for module \mod.
 + *
 + *  \mod takes values from the module_nth_t enum and maps these to the
 + *  corresponding value in modth_env_var.
 + *
 + *  Each number of threads per module takes the default value unless the
 + *  GMX_*_NUM_THREADS env var is set, in which case its value overrides
 + *  the default.
 + *
 + *  The "group" scheme supports OpenMP only in PME and in this case all but
 + *  the PME nthread values default to 1.
 + */
 +static int pick_module_nthreads(FILE *fplog, int m,
 +                                gmx_bool bSimMaster,
 +                                gmx_bool bFullOmpSupport,
 +                                gmx_bool bSepPME)
 +{
 +    char *env;
 +    int  nth;
 +    char sbuf[STRLEN];
 +    gmx_bool bOMP;
 +
 +#ifdef GMX_OPENMP
 +    bOMP = TRUE;
 +#else
 +    bOMP = FALSE;
 +#endif /* GMX_OPENMP */
 +
 +    /* The default should never be set through a GMX_*_NUM_THREADS env var
 +     * as it's always equal to gnth. */
 +    if (m == emntDefault)
 +    {
 +        return modth.nth[emntDefault];
 +    }
 +
 +    /* check the environment variable */
 +    if ((env = getenv(modth_env_var[m])) != NULL)
 +    {
 +        sscanf(env, "%d", &nth);
 +
 +        if (!bOMP)
 +        {
 +            gmx_warning("%s=%d is set, but %s is compiled without OpenMP!",
 +                        modth_env_var[m], nth, ShortProgram());
 +        }
 +
 +        /* with the verlet codepath, when any GMX_*_NUM_THREADS env var is set,
 +         * OMP_NUM_THREADS also has to be set */
 +        if (bFullOmpSupport && getenv("OMP_NUM_THREADS") == NULL)
 +        {
 +            gmx_fatal(FARGS, "%s=%d is set, the default number of threads also "
 +                      "needs to be set with OMP_NUM_THREADS!",
 +                      modth_env_var[m], nth);
 +        }
 +
 +        /* with the group scheme warn if any env var except PME is set */
 +        if (!bFullOmpSupport)
 +        {
 +            if (m != emntPME)
 +            {
 +                gmx_warning("%s=%d is set, but OpenMP multithreading is not "
 +                            "supported in %s!",
 +                            modth_env_var[m], nth, mod_name[m]);
 +                nth = 1;
 +            }
 +        }
 +
 +        /* only babble if we are really overriding with a different value */
 +        if ((bSepPME && m == emntPME && nth != modth.gnth_pme) || (nth != modth.gnth))
 +        {
 +            sprintf(sbuf, "%s=%d set, overriding the default number of %s threads",
 +                    modth_env_var[m], nth, mod_name[m]);
 +            if (bSimMaster)
 +            {
 +                fprintf(stderr, "\n%s\n", sbuf);
 +            }
 +            if (fplog)
 +            {
 +                fprintf(fplog, "%s\n", sbuf);
 +            }
 +        }
 +    }
 +    else
 +    {
 +        /* pick the global PME node nthreads if we are setting the number
 +         * of threads in separate PME nodes  */
 +        nth = (bSepPME && m == emntPME) ? modth.gnth_pme : modth.gnth;
 +    }
 +
 +    return modth.nth[m] = nth;
 +}
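 +
 +/* Example (hypothetical environment): with OMP_NUM_THREADS=8 and
 + * GMX_PME_NUM_THREADS=4 set, every module gets 8 threads except PME,
 + * which is overridden to 4, and a note about the override is written
 + * to stderr and the log file. */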
 +
-         if (*nthreads_omp > 0 && nt_omp != *nthreads_omp)
++void gmx_omp_nthreads_read_env(int *nthreads_omp,
++                               gmx_bool bIsSimMaster)
 +{
 +    char *env;
++    gmx_bool bCommandLineSetNthreadsOMP = *nthreads_omp > 0;
++    char buffer[STRLEN];
 +
 +    assert(nthreads_omp);
 +
 +    if ((env = getenv("OMP_NUM_THREADS")) != NULL)
 +    {
 +        int nt_omp;
 +
 +        sscanf(env,"%d",&nt_omp);
 +        if (nt_omp <= 0)
 +        {
 +            gmx_fatal(FARGS,"OMP_NUM_THREADS is invalid: '%s'",env);
 +        }
 +
-             gmx_fatal(FARGS,"OMP_NUM_THREADS (%d) and the number of threads requested on the command line (%d) have different values",nt_omp,*nthreads_omp);
++        if (bCommandLineSetNthreadsOMP && nt_omp != *nthreads_omp)
 +        {
-         /* Setting the number of OpenMP threads.
-          * NOTE: with tMPI this function is only called on the master node,
-          * but with MPI on all nodes which means lots of messages on stderr.
-          */
-         fprintf(stderr,"Getting the number of OpenMP threads from OMP_NUM_THREADS: %d\n",nt_omp);
++            gmx_fatal(FARGS,"Environment variable OMP_NUM_THREADS (%d) and the number of threads requested on the command line (%d) have different values. Either omit one, or set them both to the same value.",nt_omp,*nthreads_omp);
 +        }
 +
++        /* Setting the number of OpenMP threads. */
 +        *nthreads_omp = nt_omp;
++
++        /* Output the results */
++        sprintf(buffer,
++                "The number of OpenMP threads was set by environment variable OMP_NUM_THREADS to %d%s\n",
++                nt_omp,
++                bCommandLineSetNthreadsOMP ? " (and the command-line setting agreed with that)" : "");
++        if (bIsSimMaster)
++        {
++            /* This prints once per simulation for multi-simulations,
++             * which might help diagnose issues with inhomogeneous
++             * cluster setups. */
++            fputs(buffer, stderr);
++        }
++        if (debug)
++        {
++            /* This prints once per process for real MPI (i.e. once
++             * per debug file), and once per simulation for thread MPI
++             * (because of logic in the calling function). */
++            fputs(buffer, debug);
++        }
 +    }
 +}
 +
 +void gmx_omp_nthreads_init(FILE *fplog, t_commrec *cr,
 +                           int nthreads_hw_avail,
 +                           int omp_nthreads_req,
 +                           int omp_nthreads_pme_req,
 +                           gmx_bool bThisNodePMEOnly,
 +                           gmx_bool bFullOmpSupport)
 +{
 +    int  nth, nth_pmeonly, gmx_maxth, nppn;
 +    char *env;
 +    gmx_bool bSepPME, bOMP;
 +
 +#ifdef GMX_OPENMP
 +    bOMP = TRUE;
 +#else
 +    bOMP = FALSE;
 +#endif /* GMX_OPENMP */
 +
 +    /* number of MPI processes/threads per physical node */
 +    nppn = cr->nrank_intranode;
 +
 +    bSepPME = ( (cr->duty & DUTY_PP) && !(cr->duty & DUTY_PME)) ||
 +              (!(cr->duty & DUTY_PP) &&  (cr->duty & DUTY_PME));
 +
 +#ifdef GMX_THREAD_MPI
 +    /* modth is shared among tMPI threads, so for thread safety the
 +     * detection is done on the master only. It is not thread-safe with
 +     * multiple simulations, but that is anyway not supported by tMPI. */
 +    if (SIMMASTER(cr))
 +#endif
 +    {
 +        /* just return if the initialization has already been done */
 +        if (modth.initialized)
 +        {
 +            return;
 +        }
 +
 +        /* With full OpenMP support (verlet scheme) set the number of threads
 +         * per process / default:
 +         * - 1 if not compiled with OpenMP or
 +         * - OMP_NUM_THREADS if the env. var is set, or
 +         * - omp_nthreads_req = #of threads requested by the user on the mdrun
 +         *   command line, otherwise
 +         * - take the max number of available threads and distribute them
 +         *   on the processes/tMPI threads.
 +         * ~ The GMX_*_NUM_THREADS env var overrides the number of threads of
 +         *   the respective module and it has to be used in conjunction with
 +         *   OMP_NUM_THREADS.
 +         *
 +         * With the group scheme OpenMP multithreading is only supported in PME,
 +         * for all other modules nthreads is set to 1.
 +         * The number of PME threads is equal to:
 +         * - 1 if not compiled with OpenMP or
 +         * - GMX_PME_NUM_THREADS if defined, otherwise
 +         * - OMP_NUM_THREADS if defined, otherwise
 +         * - 1
 +         */
 +        nth = 1;
 +        if ((env = getenv("OMP_NUM_THREADS")) != NULL)
 +        {
 +            if (!bOMP && (strncmp(env, "1", 1) != 0))
 +            {
 +                gmx_warning("OMP_NUM_THREADS is set, but %s was compiled without OpenMP support!",
 +                            ShortProgram());
 +            }
 +            else
 +            {
 +                nth = gmx_omp_get_max_threads();
 +            }
 +        }
 +        else if (omp_nthreads_req > 0)
 +        {
 +            nth = omp_nthreads_req;
 +        }
 +        else if (bFullOmpSupport && bOMP)
 +        {
 +            /* max available threads per node */
 +            nth = nthreads_hw_avail;
 +
 +            /* divide the threads among the MPI processes/tMPI threads */
 +            if (nth >= nppn)
 +            {
 +                nth /= nppn;
 +            }
 +            else
 +            {
 +                nth = 1;
 +            }
 +        }
 +
 +        /* now we have the global values, set them:
 +         * - 1 if not compiled with OpenMP and for the group scheme
 +         * - nth for the verlet scheme when compiled with OpenMP
 +         */
 +        if (bFullOmpSupport && bOMP)
 +        {
 +            modth.gnth = nth;
 +        }
 +        else
 +        {
 +            modth.gnth = 1;
 +        }
 +
 +        if (bSepPME)
 +        {
 +            if (omp_nthreads_pme_req > 0)
 +            {
 +                modth.gnth_pme = omp_nthreads_pme_req;
 +            }
 +            else
 +            {
 +                modth.gnth_pme = nth;
 +            }
 +        }
 +        else
 +        {
 +            modth.gnth_pme = 0;
 +        }
 +
 +        /* now set the per-module values */
 +        modth.nth[emntDefault] = modth.gnth;
 +        pick_module_nthreads(fplog, emntDomdec, SIMMASTER(cr), bFullOmpSupport, bSepPME);
 +        pick_module_nthreads(fplog, emntPairsearch, SIMMASTER(cr), bFullOmpSupport, bSepPME);
 +        pick_module_nthreads(fplog, emntNonbonded, SIMMASTER(cr), bFullOmpSupport, bSepPME);
 +        pick_module_nthreads(fplog, emntBonded, SIMMASTER(cr), bFullOmpSupport, bSepPME);
 +        pick_module_nthreads(fplog, emntPME, SIMMASTER(cr), bFullOmpSupport, bSepPME);
 +        pick_module_nthreads(fplog, emntUpdate, SIMMASTER(cr), bFullOmpSupport, bSepPME);
 +        pick_module_nthreads(fplog, emntVSITE, SIMMASTER(cr), bFullOmpSupport, bSepPME);
 +        pick_module_nthreads(fplog, emntLINCS, SIMMASTER(cr), bFullOmpSupport, bSepPME);
 +        pick_module_nthreads(fplog, emntSETTLE, SIMMASTER(cr), bFullOmpSupport, bSepPME);
 +
 +        /* set the number of threads globally */
 +        if (bOMP)
 +        {
 +#ifndef GMX_THREAD_MPI
 +            if (bThisNodePMEOnly)
 +            {
 +                gmx_omp_set_num_threads(modth.gnth_pme);
 +            }
 +            else
 +#endif /* GMX_THREAD_MPI */
 +            {
 +                if (bFullOmpSupport)
 +                {
 +                    gmx_omp_set_num_threads(nth);
 +                }
 +                else
 +                {
 +                    gmx_omp_set_num_threads(1);
 +                }
 +            }
 +        }
 +
 +        modth.initialized = TRUE;
 +    }
 +#ifdef GMX_THREAD_MPI
 +    /* Non-master threads have to wait for the detection to be done. */
 +    if (PAR(cr))
 +    {
 +        MPI_Barrier(cr->mpi_comm_mysim);
 +    }
 +#endif
 +
 +    /* inform the user about the settings */
 +    if (bOMP)
 +    {
 +#ifdef GMX_THREAD_MPI
 +        const char *mpi_str="per tMPI thread";
 +#else
 +        const char *mpi_str="per MPI process";
 +#endif
 +
 +        /* for group scheme we print PME threads info only */
 +        if (bFullOmpSupport)
 +        {
 +            md_print_info(cr, fplog, "Using %d OpenMP thread%s %s\n",
 +                          modth.gnth,modth.gnth > 1 ? "s" : "",
 +                          cr->nnodes > 1 ? mpi_str : "");
 +        }
 +        if (bSepPME && modth.gnth_pme != modth.gnth)
 +        {
 +            md_print_info(cr, fplog, "Using %d OpenMP thread%s %s for PME\n",
 +                          modth.gnth_pme,modth.gnth_pme > 1 ? "s" : "",
 +                          cr->nnodes > 1 ? mpi_str : "");
 +        }
 +    }
 +
 +    /* detect and warn about oversubscription
 +     * TODO: enable this for separate PME nodes as well! */
 +    if (!bSepPME && cr->rank_pp_intranode == 0)
 +    {
 +        char sbuf[STRLEN], sbuf1[STRLEN], sbuf2[STRLEN];
 +
 +        if (modth.gnth*nppn > nthreads_hw_avail)
 +        {
 +            sprintf(sbuf, "threads");
 +            sbuf1[0] = '\0';
 +            sprintf(sbuf2, "O");
 +#ifdef GMX_MPI
 +            if (modth.gnth == 1)
 +            {
 +#ifdef GMX_THREAD_MPI
 +                sprintf(sbuf, "thread-MPI threads");
 +#else
 +                sprintf(sbuf, "MPI processes");
 +                sprintf(sbuf1, " per node");
 +                sprintf(sbuf2, "On node %d: o", cr->sim_nodeid);
 +#endif
 +            }
 +#endif
 +            md_print_warn(cr, fplog,
 +                          "WARNING: %sversubscribing the available %d logical CPU cores%s with %d %s.\n"
 +                          "         This will cause considerable performance loss!",
 +                          sbuf2, nthreads_hw_avail, sbuf1, nppn*modth.gnth, sbuf);
 +        }
 +    }
 +}
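 +
 +/* Worked example (assumed hardware): on a node with 16 hardware
 + * threads and 4 MPI ranks, the automatic choice above is
 + * nth = 16/4 = 4 OpenMP threads per rank; forcing 8 threads per rank
 + * instead (4*8 = 32 > 16) triggers the oversubscription warning at
 + * the end of gmx_omp_nthreads_init(). */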
 +
 +int gmx_omp_nthreads_get(int mod)
 +{
 +    if (mod < 0 || mod >= emntNR)
 +    {
 +        /* invalid module queried */
 +        return -1;
 +    }
 +    else
 +    {
 +        return modth.nth[mod];
 +    }
 +}
index c29b74dff18465f448d0084bae4d465e5745645d,0000000000000000000000000000000000000000..8ebb232db7e4a83700ca162bd96a84b42b46e307
mode 100644,000000..100644
--- /dev/null
@@@ -1,962 -1,0 +1,868 @@@
-  * Copyright (c) 2001-2010, The GROMACS development team,
 +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
 + *
 + * 
 + *                This source code is part of
 + * 
 + *                 G   R   O   M   A   C   S
 + * 
 + *          GROningen MAchine for Chemical Simulations
 + * 
 + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
 + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
- /*! Number of supported GPUs */
- #define NB_GPUS (sizeof(SupportedGPUs)/sizeof(SupportedGPUs[0]))
++ * Copyright (c) 2001-2010,2012 The GROMACS development team,
 + * check out http://www.gromacs.org for more information.
 + *
 + * This program is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU General Public License
 + * as published by the Free Software Foundation; either version 2
 + * of the License, or (at your option) any later version.
 + * 
 + * If you want to redistribute modifications, please consider that
 + * scientific software is very special. Version control is crucial -
 + * bugs must be traceable. We will be happy to consider code for
 + * inclusion in the official distribution, but derived work must not
 + * be called official GROMACS. Details are found in the README & COPYING
 + * files - if they are missing, get the official version at www.gromacs.org.
 + * 
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the papers on the package - you can find them in the top README file.
 + * 
 + * For more info, check our website at http://www.gromacs.org
 + * 
 + * And Hey:
 + * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon
 + */
 +
 +#include <stdio.h>
 +#include <stdlib.h>
 +#include <assert.h>
 +
 +#include "smalloc.h"
 +#include "string2.h"
 +#include "types/hw_info.h"
 +
 +#include "gpu_utils.h"
 +#include "../cuda_tools/cudautils.cuh"
 +#include "memtestG80_core.h"
 +
 +
 +#define QUICK_MEM       250 /*!< Amount of memory to be used in quick memtest. */
 +#define QUICK_TESTS     (MOD_20_32BIT | LOGIC_4_ITER_SHMEM | RANDOM_BLOCKS) /*!< Bit flag with the types of tests
 +                                                                               to run in quick memtest. */
 +#define QUICK_ITER      3 /*!< Number of iterations in quick memtest. */
 +
 +#define FULL_TESTS      0x3FFF /*!< Bit flag with all tests set on for the full memtest. */
 +#define FULL_ITER       25 /*!< Number of iterations in full memtest. */
 +
 +#define TIMED_TESTS     (MOD_20_32BIT | LOGIC_4_ITER_SHMEM | RANDOM_BLOCKS) /*!< Bit flag with the types of tests to
 +                                                                               run in time constrained memtest. */
 +
- // TODO put this list into an external file and include it so that the list is easily accessible
- /*! List of supported GPUs. */
- static const char * const SupportedGPUs[] = {
-     /* GT400 */
-     "Geforce GTX 480",
-     "Geforce GTX 470",
-     "Geforce GTX 465",
-     "Geforce GTX 460",
-     "Tesla C2070",
-     "Tesla C2050",
-     "Tesla S2070",
-     "Tesla S2050",
-     "Tesla M2070",
-     "Tesla M2050",
-     "Quadro 5000",
-     "Quadro 6000",
-     /* GT200 */
-     "Geforce GTX 295",
-     "Geforce GTX 285",
-     "Geforce GTX 280",
-     "Geforce GTX 275",
-     "Geforce GTX 260",
-     "GeForce GTS 250",
-     "GeForce GTS 150",
-     "GeForce GTX 285M",
-     "GeForce GTX 280M",
-     "Tesla S1070",
-     "Tesla C1060",
-     "Tesla M1060",
-     "Quadro FX 5800",
-     "Quadro FX 4800",
-     "Quadro CX",
-     "Quadro Plex 2200 D2",
-     "Quadro Plex 2200 S4",
-     /* G90 */
-     "GeForce 9800 G", /* GX2, GTX, GTX+, GT */
-     "GeForce 9800M GTX",
-     "Quadro FX 4700",
-     "Quadro Plex 2100 D4"
- };
 +static int cuda_max_device_count = 32; /*! Max number of devices supported by CUDA (for consistency checking).
 +                                           In reality it is 16 with CUDA <= v5.0, but let's stay on the safe side. */
 +
 +/*! Dummy kernel used for sanity checking. */
 +__global__ void k_dummy_test(){}
 +
 +
 +/*! Bit-flags which refer to memtestG80 test types and are used in do_memtest to specify which tests to run. */
 +enum memtest_G80_test_types {
 +    MOVING_INVERSIONS_10 =      0x1,
 +    MOVING_INVERSIONS_RAND =    0x2,
 +    WALKING_8BIT_M86 =          0x4,
 +    WALKING_0_8BIT =            0x8,
 +    WALKING_1_8BIT =            0x10,
 +    WALKING_0_32BIT =           0x20,
 +    WALKING_1_32BIT =           0x40,
 +    RANDOM_BLOCKS =             0x80,
 +    MOD_20_32BIT =              0x100,
 +    LOGIC_1_ITER =              0x200,
 +    LOGIC_4_ITER =              0x400,
 +    LOGIC_1_ITER_SHMEM =        0x800,
 +    LOGIC_4_ITER_SHMEM =        0x1000
 +};
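 +
 +/* A minimal sketch of how the test bit-flags above are meant to be used:
 + * sets such as QUICK_TESTS are built with bitwise OR, and a single test is
 + * selected by masking. The helper name below is hypothetical. */
 +static int selects_test(unsigned int which_tests, unsigned int test)
 +{
 +    /* a test is selected when masking leaves exactly its bit set */
 +    return (which_tests & test) == test;
 +}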
 +
- /*! 
-  * \brief Checks whether the GPU with the given name is supported in Gromacs-OpenMM.
-  * 
-  * \param[in] gpu_name  the name of the CUDA device
-  * \returns             TRUE if the device is supported, otherwise FALSE
-  */
- static bool is_gmx_openmm_supported_gpu_name(char *gpuName)
- {
-     size_t i;
-     for (i = 0; i < NB_GPUS; i++)
-     {
-         trim(gpuName);
-         if (gmx_strncasecmp(gpuName, SupportedGPUs[i], strlen(SupportedGPUs[i])) == 0)
-             return 1;
-     }
-     return 0;
- }
- /*! \brief Checks whether the GPU with the given device id is supported in Gromacs-OpenMM.
-  *
-  * \param[in] dev_id    the device id of the GPU or -1 if the device has already been selected
-  * \param[out] gpu_name Set to contain the name of the CUDA device, if NULL passed, no device name is set. 
-  * \returns             TRUE if the device is supported, otherwise FALSE
-  * 
-  */
- gmx_bool is_gmx_openmm_supported_gpu(int dev_id, char *gpu_name)
- {
-     cudaDeviceProp dev_prop;
-     if (debug) fprintf(debug, "Checking compatibility with device #%d, %s\n", dev_id, gpu_name);
-     if (do_sanity_checks(dev_id, &dev_prop) != 0)
-         return -1;
-     if (gpu_name != NULL)
-     { 
-         strcpy(gpu_name, dev_prop.name);
-     }
-     return is_gmx_openmm_supported_gpu_name(dev_prop.name);
- }
 +
 +/*! 
 +  * \brief Runs GPU sanity checks.
 +  *
 +  * Runs a series of checks to determine whether the given GPU and the
 +  * underlying CUDA driver/runtime function properly.
 +  * Returns the properties of the device with the given ID, or of the device
 +  * that has already been initialized earlier if \dev_id == -1.
 +  *
 +  * \param[in]  dev_id      the device ID of the GPU or -1 if the device has already been initialized
 +  * \param[out] dev_prop    pointer to the structure in which the device properties will be returned
 +  * \returns                0 if the device looks OK
 +  *
 +  * TODO: introduce errors codes and handle errors more smoothly.
 +  */
 +static int do_sanity_checks(int dev_id, cudaDeviceProp *dev_prop)
 +{
 +    cudaError_t cu_err;
 +    int         dev_count, id;
 +
 +    cu_err = cudaGetDeviceCount(&dev_count);
 +    if (cu_err != cudaSuccess)
 +    {
 +        fprintf(stderr, "Error %d while querying device count: %s\n", cu_err,
 +                cudaGetErrorString(cu_err));
 +        return -1;
 +    }
 +
 +    /* no CUDA compatible device at all */
 +    if (dev_count == 0)
 +        return -1;
 +
 +    /* things might go horribly wrong if cudart is not compatible with the driver */
 +    if (dev_count < 0 || dev_count > cuda_max_device_count)
 +        return -1;
 +
 +    if (dev_id == -1) /* device already selected; let's not destroy the context */
 +    {
 +        cu_err = cudaGetDevice(&id);
 +        if (cu_err != cudaSuccess)
 +        {
 +            fprintf(stderr, "Error %d while querying device id: %s\n", cu_err,
 +                    cudaGetErrorString(cu_err));
 +            return -1;
 +        }
 +    }
 +    else
 +    {
 +        id = dev_id;
 +        if (id > dev_count - 1) /* pfff there's no such device */
 +        {
 +            fprintf(stderr, "The requested device with id %d does not seem to exist (device count=%d)\n",
 +                    dev_id, dev_count);
 +            return -1;
 +        }
 +    }
 +
 +    memset(dev_prop, 0, sizeof(cudaDeviceProp));
 +    cu_err = cudaGetDeviceProperties(dev_prop, id);
 +    if (cu_err != cudaSuccess)
 +    {
 +        fprintf(stderr, "Error %d while querying device properties: %s\n", cu_err,
 +                cudaGetErrorString(cu_err));
 +        return -1;
 +    }
 +
 +    /* both major & minor are 9999 if no CUDA capable devices are present */
 +    if (dev_prop->major == 9999 && dev_prop->minor == 9999)
 +        return -1;
 +    /* we don't care about emulation mode */
 +    if (dev_prop->major == 0)
 +        return -1;
 +
 +    if (id != -1)
 +    {
 +        cu_err = cudaSetDevice(id);
 +        if (cu_err != cudaSuccess)
 +        {
 +            fprintf(stderr, "Error %d while switching to device #%d: %s\n",
 +                    cu_err, id, cudaGetErrorString(cu_err));
 +            return -1;
 +        }
 +    }
 +
 +    /* try to execute a dummy kernel */
 +    k_dummy_test<<<1, 512>>>();
 +    if (cudaThreadSynchronize() != cudaSuccess)
 +    {
 +        return -1;
 +    }
 +
 +    /* destroy context if we created one */
 +    if (id != -1)
 +    {
 +#if CUDA_VERSION < 4000
 +        cu_err = cudaThreadExit();
 +        CU_RET_ERR(cu_err, "cudaThreadExit failed");
 +#else
 +        cu_err = cudaDeviceReset();
 +        CU_RET_ERR(cu_err, "cudaDeviceReset failed");
 +#endif
 +    }
 +
 +    return 0;
 +}
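 +
 +/* A minimal usage sketch for do_sanity_checks(), assuming a caller that
 + * probes device 0 before committing to it; the function name is
 + * hypothetical. */
 +static int probe_device_zero(void)
 +{
 +    cudaDeviceProp prop;
 +
 +    if (do_sanity_checks(0, &prop) != 0)
 +    {
 +        fprintf(stderr, "GPU #0 failed the sanity checks\n");
 +        return -1;
 +    }
 +    fprintf(stderr, "GPU #0 (%s) looks OK\n", prop.name);
 +    return 0;
 +}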
 +
 +
 +/*!
 + * \brief Runs a set of memory tests specified by the given bit-flags.
 + * Tries to allocate and run the tests on \p megs MB of memory, or on the
 + * greatest amount that can be allocated (>10 MB).
 + * If an error is detected, it stops without finishing the remaining
 + * steps/iterations and returns a value greater than zero.
 + * In case of other errors (e.g. kernel launch errors, device querying errors)
 + * -1 is returned.
 + *
 + * \param[in] which_tests   variable with bit-flags of the requested tests
 + * \param[in] megs          amount of memory that will be tested in MB
 + * \param[in] iter          number of iterations
 + * \returns                 0 if no error was detected, otherwise >0
 + */
 +static int do_memtest(unsigned int which_tests, int megs, int iter)
 +{
 +    memtestState    tester;
 +    int             i;
 +    uint            err_count = 0; //, err_iter;
 +
 +    // no parameter check as this fn won't be called externally
 +
 +    // let's try to allocate the mem
 +    while (!tester.allocate(megs) && (megs - 10 > 0))
 +        { megs -= 10; tester.deallocate(); }
 +
 +    if (megs <= 10)
 +    {
 +        fprintf(stderr, "Unable to allocate GPU memory!\n");
 +        return -1;
 +    }
 +
 +    // keep only the lowest 14 test bits (clear the upper 18)
 +    which_tests &= 0x3FFF;
 +    for (i = 0; i < iter; i++)
 +    {
 +        // Moving Inversions (ones and zeros)
 +        if ((MOVING_INVERSIONS_10 & which_tests) == MOVING_INVERSIONS_10)
 +        {
 +            tester.gpuMovingInversionsOnesZeros(err_count);
 +            if (err_count > 0)
 +                return MOVING_INVERSIONS_10;
 +        }
 +        // Moving Inversions (random)
 +        if ((MOVING_INVERSIONS_RAND & which_tests) == MOVING_INVERSIONS_RAND)
 +        {
 +            tester.gpuMovingInversionsRandom(err_count);
 +            if (err_count > 0)
 +                return MOVING_INVERSIONS_RAND;
 +        }
 +        // Memtest86 Walking 8-bit
 +        if ((WALKING_8BIT_M86 & which_tests) == WALKING_8BIT_M86)
 +        {
 +            for (uint shift = 0; shift < 8; shift++)
 +            {
 +                tester.gpuWalking8BitM86(err_count, shift);
 +                if (err_count > 0)
 +                    return WALKING_8BIT_M86;
 +            }
 +        }
 +        // True Walking zeros (8-bit)
 +        if ((WALKING_0_8BIT & which_tests) == WALKING_0_8BIT)
 +        {
 +            for (uint shift = 0; shift < 8; shift++)
 +            {
 +                tester.gpuWalking8Bit(err_count, false, shift);
 +                if (err_count > 0)
 +                    return WALKING_0_8BIT;
 +            }
 +        }
 +        // True Walking ones (8-bit)
 +        if ((WALKING_1_8BIT & which_tests) == WALKING_1_8BIT)
 +        {
 +            for (uint shift = 0; shift < 8; shift++)
 +            {
 +                tester.gpuWalking8Bit(err_count, true, shift);
 +                if (err_count > 0)
 +                    return WALKING_1_8BIT;
 +            }
 +        }
 +        // Memtest86 Walking zeros (32-bit)
 +        if ((WALKING_0_32BIT & which_tests) == WALKING_0_32BIT)
 +        {
 +            for (uint shift = 0; shift < 32; shift++)
 +            {
 +                tester.gpuWalking32Bit(err_count, false, shift);
 +                if (err_count > 0)
 +                    return WALKING_0_32BIT;
 +            }
 +        }
 +        // Memtest86 Walking ones (32-bit)
 +        if ((WALKING_1_32BIT & which_tests) == WALKING_1_32BIT)
 +        {
 +            for (uint shift = 0; shift < 32; shift++)
 +            {
 +                tester.gpuWalking32Bit(err_count, true, shift);
 +                if (err_count > 0)
 +                    return WALKING_1_32BIT;
 +            }
 +        }
 +        // Random blocks
 +        if ((RANDOM_BLOCKS & which_tests) == RANDOM_BLOCKS)
 +        {
 +            tester.gpuRandomBlocks(err_count,rand());
 +            if (err_count > 0)
 +                return RANDOM_BLOCKS;
 +
 +        }
 +
 +        // Memtest86 Modulo-20
 +        if ((MOD_20_32BIT & which_tests) == MOD_20_32BIT)
 +        {
 +            for (uint shift = 0; shift < 20; shift++)
 +            {
 +                tester.gpuModuloX(err_count, shift, rand(), 20, 2);
 +                if (err_count > 0)
 +                    return MOD_20_32BIT;
 +            }
 +        }
 +        // Logic (one iteration)
 +        if ((LOGIC_1_ITER & which_tests) == LOGIC_1_ITER)
 +        {
 +            tester.gpuShortLCG0(err_count,1);
 +            if (err_count > 0)
 +                return LOGIC_1_ITER;
 +        }
 +        // Logic (4 iterations)
 +        if ((LOGIC_4_ITER & which_tests) == LOGIC_4_ITER)
 +        {
 +            tester.gpuShortLCG0(err_count,4);
 +            if (err_count > 0)
 +                return LOGIC_4_ITER;
 +
 +        }
 +        // Logic (shared memory, one iteration)
 +        if ((LOGIC_1_ITER_SHMEM & which_tests) == LOGIC_1_ITER_SHMEM)
 +        {
 +            tester.gpuShortLCG0Shmem(err_count,1);
 +            if (err_count > 0)
 +                return LOGIC_1_ITER_SHMEM;
 +        }
 +        // Logic (shared-memory, 4 iterations)
 +        if ((LOGIC_4_ITER_SHMEM & which_tests) == LOGIC_4_ITER_SHMEM)
 +        {
 +            tester.gpuShortLCG0Shmem(err_count,4);
 +            if (err_count > 0)
 +                return LOGIC_4_ITER_SHMEM;
 +        }
 +    }
 +
 +    tester.deallocate();
 +    return err_count;
 +}
 +
 +/*! \brief Runs a quick memory test and returns 0 if no error is detected.
 + * If an error is detected it stops before completing the test and returns a
 + * value greater than 0. In case of other errors (e.g. kernel launch errors,
 + * device querying errors) -1 is returned.
 + *
 + * \param[in] dev_id    the device id of the GPU or -1 if the device has already been selected
 + * \returns             0 if no error was detected, otherwise >0
 + */
 +int do_quick_memtest(int dev_id)
 +{
 +    cudaDeviceProp  dev_prop;
 +    int             devmem, res, time=0;
 +
 +    if (debug) { time = getTimeMilliseconds(); }
 +
 +    if (do_sanity_checks(dev_id, &dev_prop) != 0)
 +    {
 +        // something went wrong
 +        return -1;
 +    }
 +
 +    if (debug)
 +    {
 +        devmem = dev_prop.totalGlobalMem/(1024*1024); // in MiB
 +        fprintf(debug, ">> Running QUICK memtests on %d MiB (out of total %d MiB), %d iterations\n",
 +            QUICK_MEM, devmem, QUICK_ITER);
 +    }
 +
 +    res = do_memtest(QUICK_TESTS, QUICK_MEM, QUICK_ITER);
 +
 +    if (debug)
 +    {
 +        fprintf(debug, "Q-RES = %d\n", res);
 +        fprintf(debug, "Q-runtime: %d ms\n", getTimeMilliseconds() - time);
 +    }
 +
 +    /* destroy context only if we created it */
 +    if (dev_id != -1) cudaThreadExit();
 +    return res;
 +}
 +
 +/*! \brief Runs a full memory test and returns 0 if no error is detected.
 + * If an error is detected it stops before completing the test and returns a
 + * value greater than 0. In case of other errors (e.g. kernel launch errors,
 + * device querying errors) -1 is returned.
 + *
 + * \param[in] dev_id    the device id of the GPU or -1 if the device has already been selected
 + * \returns             0 if no error was detected, otherwise >0
 + */
 +int do_full_memtest(int dev_id)
 +{
 +    cudaDeviceProp  dev_prop;
 +    int             devmem, res, time=0;
 +
 +    if (debug) { time = getTimeMilliseconds(); }
 +
 +    if (do_sanity_checks(dev_id, &dev_prop) != 0)
 +    {
 +        // something went wrong
 +        return -1;
 +    }
 +
 +    devmem = dev_prop.totalGlobalMem/(1024*1024); // in MiB
 +
 +    if (debug) 
 +    { 
 +        fprintf(debug, ">> Running FULL memtests on %d MiB (out of total %d MiB), %d iterations\n",
 +            devmem, devmem, FULL_ITER); 
 +    }
 +
 +    /* do all test on the entire memory */
 +    res = do_memtest(FULL_TESTS, devmem, FULL_ITER);
 +
 +    if (debug)
 +    {
 +        fprintf(debug, "F-RES = %d\n", res);
 +        fprintf(debug, "F-runtime: %d ms\n", getTimeMilliseconds() - time);
 +    }
 +
 +    /* destroy context only if we created it */
 +    if (dev_id != -1) cudaThreadExit();
 +    return res;
 +}
 +
 +/*! \brief Runs a time-constrained memory test and returns 0 if no error is detected.
 + * If an error is detected it stops before completing the test and returns a value greater
 + * than zero. In case of other errors (e.g. kernel launch errors, device querying errors) -1
 + * is returned. Note that test iterations are not interrupted, therefore the total runtime of
 + * the test will always be a multiple of one iteration's runtime.
 + *
 + * \param[in] dev_id        the device id of the GPU or -1 if the device has already been selected
 + * \param[in] time_constr   the time limit of the testing
 + * \returns                 0 if no error was detected, otherwise >0
 + */
 +int do_timed_memtest(int dev_id, int time_constr)
 +{
 +    cudaDeviceProp  dev_prop;
 +    int             devmem, res=0, time=0, startt;
 +
 +    if (debug) { time = getTimeMilliseconds(); }
 +
 +    time_constr *= 1000;  /* convert to ms for convenience */
 +    startt = getTimeMilliseconds();
 +
 +    if (do_sanity_checks(dev_id, &dev_prop) != 0)
 +    {
 +        // something went wrong
 +        return -1;
 +    }
 +
 +    devmem = dev_prop.totalGlobalMem/(1024*1024); // in MiB
 +
 +    if (debug) 
 +    { 
 +        fprintf(debug, ">> Running time constrained memtests on %d MiB (out of total %d MiB), time limit of %d ms\n",
 +            devmem, devmem, time_constr);
 +    }
 +
 +    /* do the TIMED_TESTS set, one step at a time on the entire memory 
 +       that can be allocated, and stop when the given time is exceeded */
 +    while ( ((int)getTimeMilliseconds() - startt) < time_constr)
 +    {        
 +        res = do_memtest(TIMED_TESTS, devmem, 1);
 +        if (res != 0) break;
 +    }
 +
 +    if (debug)
 +    {
 +        fprintf(debug, "T-RES = %d\n", res);
 +        fprintf(debug, "T-runtime: %d ms\n", getTimeMilliseconds() - time);
 +    }
 +
 +    /* destroy context only if we created it */
 +    if (dev_id != -1) cudaThreadExit();
 +    return res;
 +}
 +
 +/*! \brief Initializes the GPU with the given index.
 + *
 + * The variable \p mygpu is the index of the GPU to initialize in the
 + * gpu_info.cuda_dev array.
 + *
 + * \param[in]  mygpu        index of the GPU to initialize
 + * \param[out] result_str   the message related to the error that occurred
 + *                          during the initialization (if there was any).
 + * \param[in] gpu_info      GPU info of all detected devices in the system.
 + * \returns                 true if no error occurs during initialization.
 + */
 +gmx_bool init_gpu(int mygpu, char *result_str, const gmx_gpu_info_t *gpu_info)
 +{
 +    cudaError_t stat;
 +    char sbuf[STRLEN];
 +    int gpuid;
 +
 +    assert(gpu_info);
 +    assert(result_str);
 +
 +    if (mygpu < 0 || mygpu >= gpu_info->ncuda_dev_use)
 +    {
 +        sprintf(sbuf, "Trying to initialize a non-existent GPU: "
 +                "there are %d %s-selected GPU(s), but #%d was requested.",
 +                 gpu_info->ncuda_dev_use, gpu_info->bUserSet ? "user" : "auto", mygpu);
 +        gmx_incons(sbuf);
 +    }
 +
 +    gpuid = gpu_info->cuda_dev[gpu_info->cuda_dev_use[mygpu]].id;
 +
 +    stat = cudaSetDevice(gpuid);
 +    strncpy(result_str, cudaGetErrorString(stat), STRLEN);
 +
 +    if (debug)
 +    {
 +        fprintf(stderr, "Initialized GPU ID #%d: %s\n", gpuid, gpu_info->cuda_dev[gpuid].prop.name);
 +    }
 +
 +    return (stat == cudaSuccess);
 +}
 +
 +/*! \brief Frees up the CUDA GPU used by the active context at the time of calling.
 + *
 + * The context is explicitly destroyed and therefore all data uploaded to the GPU
 + * is lost. This should only be called when none of this data is required anymore.
 + *
 + * \param[out] result_str   the message related to the error that occurred
 + *                          during the cleanup (if there was any).
 + * \returns                 true if no error occurs during the freeing.
 + */
 +gmx_bool free_gpu(char *result_str)
 +{
 +    cudaError_t stat;
 +
 +    assert(result_str);
 +
 +    if (debug)
 +    {
 +        int gpuid;
 +        stat = cudaGetDevice(&gpuid);
 +        CU_RET_ERR(stat, "cudaGetDevice failed");
 +        fprintf(stderr, "Cleaning up context on GPU ID #%d\n", gpuid);
 +    }
 +
 +#if CUDA_VERSION < 4000
 +    stat = cudaThreadExit();
 +#else
 +    stat = cudaDeviceReset();
 +#endif
 +    strncpy(result_str, cudaGetErrorString(stat), STRLEN);
 +
 +    return (stat == cudaSuccess);
 +}
 +
 +/*! \brief Returns true if the gpu characterized by the device properties is
 + *  supported by the native gpu acceleration.
 + *
 + * \param[in] dev_prop  the CUDA device properties of the GPU to test.
 + * \returns             true if the GPU properties passed indicate a compatible
 + *                      GPU, otherwise false.
 + */
 +static bool is_gmx_supported_gpu(const cudaDeviceProp *dev_prop)
 +{
 +    return (dev_prop->major >= 2);
 +}
 +
 +/*! \brief Helper function that checks whether a given GPU status indicates compatible GPU.
 + *
 + * \param[in] stat  GPU status.
 + * \returns         true if the provided status is egpuCompatible, otherwise false.
 + */
 +static bool is_compatible_gpu(int stat)
 +{
 +    return (stat == egpuCompatible);
 +}
 +
 +/*! \brief Checks if a GPU with a given ID is supported by the native GROMACS acceleration.
 + *
 + *  Returns a status value which indicates compatibility or one of the following
 + *  errors: incompatibility, non-existence, or insanity (=unexpected behavior).
 + *  It also returns the respective device's properties in \dev_prop (if applicable).
 + *
 + *  \param[in]  dev_id   the ID of the GPU to check.
 + *  \param[out] dev_prop the CUDA device properties of the device checked.
 + *  \returns             the status of the requested device
 + */
 +static int is_gmx_supported_gpu_id(int dev_id, cudaDeviceProp *dev_prop)
 +{
 +    cudaError_t stat;
 +    int         ndev;
 +
 +    stat = cudaGetDeviceCount(&ndev);
 +    if (stat != cudaSuccess)
 +    {
 +        return egpuInsane;
 +    }
 +
 +    if (dev_id > ndev - 1)
 +    {
 +        return egpuNonexistent;
 +    }
 +
 +    /* TODO: currently we do not make a distinction between the type of errors
 +     * that can appear during sanity checks. This needs to be improved, e.g if
 +     * the dummy test kernel fails to execute with a "device busy message" we
 +     * should appropriately report that the device is busy instead of insane.
 +     */
 +    if (do_sanity_checks(dev_id, dev_prop) == 0)
 +    {
 +        if (is_gmx_supported_gpu(dev_prop))
 +        {
 +            return egpuCompatible;
 +        }
 +        else
 +        {
 +            return egpuIncompatible;
 +        }
 +    }
 +    else
 +    {
 +        return egpuInsane;
 +    }
 +}
 +
 +
 +/*! \brief Detect all NVIDIA GPUs in the system.
 + *
 + *  Will detect every NVIDIA GPU supported by the device driver in use. Also
 + *  checks the compatibility of each and fills the gpu_info->cuda_dev array
 + *  with the required information on each device: ID, device properties,
 + *  status.
 + *
 + *  \param[in] gpu_info    pointer to structure holding GPU information.
 + *  \param[out] err_str    The error message of any CUDA API error that caused
 + *                         the detection to fail (if there was any). The memory
 + *                         the pointer points to should be managed externally.
 + *  \returns               non-zero if the detection encountered a failure, zero otherwise.
 + */
 +int detect_cuda_gpus(gmx_gpu_info_t *gpu_info, char *err_str)
 +{
 +    int             i, ndev, checkres, retval;
 +    cudaError_t     stat;
 +    cudaDeviceProp  prop;
 +    cuda_dev_info_t *devs;
 +
 +    assert(gpu_info);
 +    assert(err_str);
 +
 +    ndev    = 0;
 +    devs    = NULL;
 +
 +    stat = cudaGetDeviceCount(&ndev);
 +    if (stat != cudaSuccess)
 +    {
 +        const char *s;
 +
 +        /* cudaGetDeviceCount failed which means that there is something
 +         * wrong with the machine: driver-runtime mismatch, all GPUs being
 +         * busy in exclusive mode, or some other condition which should
 +         * result in us issuing a warning and falling back to CPUs. */
 +        retval = -1;
 +        s = cudaGetErrorString(stat);
 +        strncpy(err_str, s, STRLEN*sizeof(err_str[0]));
 +    }
 +    else
 +    {
 +        snew(devs, ndev);
 +        for (i = 0; i < ndev; i++)
 +        {
 +            checkres = is_gmx_supported_gpu_id(i, &prop);
 +
 +            devs[i].id   = i;
 +            devs[i].prop = prop;
 +            devs[i].stat = checkres;
 +        }
 +        retval = 0;
 +    }
 +
 +    gpu_info->ncuda_dev = ndev;
 +    gpu_info->cuda_dev  = devs;
 +
 +    return retval;
 +}
 +
 +/*! \brief Select the GPUs compatible with the native GROMACS acceleration.
 + *
 + * This function selects the compatible gpus and initializes
 + * gpu_info->cuda_dev_use and gpu_info->ncuda_dev_use.
 + *
 + * Given the list of GPUs available in the system, it checks each GPU in
 + * gpu_info->cuda_dev and puts the indices (into gpu_info->cuda_dev) of
 + * the compatible ones into cuda_dev_use, thereby marking the respective
 + * GPUs as "available for use."
 + * Note that \detect_cuda_gpus must have been called before.
 + *
 + * \param[in]    gpu_info    pointer to structure holding GPU information
 + */
 +void pick_compatible_gpus(gmx_gpu_info_t *gpu_info)
 +{
 +    int i, ncompat;
 +    int *compat;
 +
 +    assert(gpu_info);
 +    /* cuda_dev/ncuda_dev have to be either NULL/0 or not (NULL/0) */
 +    assert((gpu_info->ncuda_dev != 0 ? 0 : 1) ^ (gpu_info->cuda_dev == NULL ? 0 : 1));
 +
 +    snew(compat, gpu_info->ncuda_dev);
 +    ncompat = 0;
 +    for (i = 0; i < gpu_info->ncuda_dev; i++)
 +    {
 +        if (is_compatible_gpu(gpu_info->cuda_dev[i].stat))
 +        {
 +            ncompat++;
 +            compat[ncompat - 1] = i;
 +        }
 +    }
 +
 +    gpu_info->ncuda_dev_use = ncompat;
 +    snew(gpu_info->cuda_dev_use, ncompat);
 +    memcpy(gpu_info->cuda_dev_use, compat, ncompat*sizeof(*compat));
 +    sfree(compat);
 +}
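 +
 +/* A minimal sketch of the intended detect -> pick -> init call sequence for
 + * automatic GPU selection, using the functions defined in this file; the
 + * wrapper name and its error handling are hypothetical. */
 +static int use_first_compatible_gpu(gmx_gpu_info_t *gpu_info)
 +{
 +    char err_str[STRLEN];
 +
 +    if (detect_cuda_gpus(gpu_info, err_str) != 0)
 +    {
 +        fprintf(stderr, "GPU detection failed: %s\n", err_str);
 +        return -1;
 +    }
 +    pick_compatible_gpus(gpu_info);
 +    if (gpu_info->ncuda_dev_use == 0)
 +    {
 +        return -1; /* no compatible GPU found */
 +    }
 +    /* initialize the first (index 0) of the compatible GPUs */
 +    return init_gpu(0, err_str, gpu_info) ? 0 : -1;
 +}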
 +
 +/*! \brief Check the existence/compatibility of a set of GPUs specified by their device IDs.
 + *
 + * Given a list of GPU device IDs in \requested_devs, check for the
 + * existence and compatibility of the respective GPUs and fill in \gpu_info
 + * with the collected information. Also provide the caller with an array with
 + * the result of checks in \checkres.
 + *
 + * \param[out]  checkres    check result for each ID passed in \requested_devs
 + * \param[in]   gpu_info    pointer to structure holding GPU information
 + * \param[in]   requested_devs array of requested device IDs
 + * \param[in]   count       number of IDs in \requested_devs
 + * \returns                 TRUE if every requested GPU is compatible
 + */
 +gmx_bool check_select_cuda_gpus(int *checkres, gmx_gpu_info_t *gpu_info,
 +                                const int *requested_devs, int count)
 +{
 +    int i, id;
 +    bool bAllOk;
 +
 +    assert(checkres);
 +    assert(gpu_info);
 +    assert(requested_devs);
 +    assert(count >= 0);
 +
 +    if (count == 0)
 +    {
 +        return TRUE;
 +    }
 +
 +    /* we will assume that all GPUs requested are valid IDs,
 +       otherwise we'll bail anyways */
 +    gpu_info->ncuda_dev_use = count;
 +    snew(gpu_info->cuda_dev_use, count);
 +
 +    bAllOk = true;
 +    for (i = 0; i < count; i++)
 +    {
 +        id = requested_devs[i];
 +
 +        /* devices are stored in increasing order of IDs in cuda_dev */
 +        gpu_info->cuda_dev_use[i] = id;
 +
 +        checkres[i] = (id >= gpu_info->ncuda_dev) ?
 +            egpuNonexistent : gpu_info->cuda_dev[id].stat;
 +
 +        bAllOk = bAllOk && is_compatible_gpu(checkres[i]);
 +    }
 +
 +    return bAllOk;
 +}
 +
 +/*! \brief Frees the cuda_dev and cuda_dev_use array fields of \gpu_info.
 + *
 + * \param[in]    gpu_info    pointer to structure holding GPU information
 + */
 +void free_gpu_info(const gmx_gpu_info_t *gpu_info)
 +{
 +    if (gpu_info == NULL)
 +    {
 +        return;
 +    }
 +
 +    sfree(gpu_info->cuda_dev_use);
 +    sfree(gpu_info->cuda_dev);
 +}
 +
 +/*! \brief Formats and returns a device information string for a given GPU.
 + *
 + * Given an index *directly* into the array of available GPUs (cuda_dev)
 + * returns a formatted info string for the respective GPU which includes
 + * ID, name, compute capability, and detection status.
 + *
 + * \param[out]  s           pointer to output string (has to be allocated externally)
 + * \param[in]   gpu_info    pointer to structure holding GPU information
 + * \param[in]   index       an index *directly* into the array of available GPUs
 + */
 +void get_gpu_device_info_string(char *s, const gmx_gpu_info_t *gpu_info, int index)
 +{
 +    assert(s);
 +    assert(gpu_info);
 +
 +    if (index < 0 || index >= gpu_info->ncuda_dev)
 +    {
 +        return;
 +    }
 +
 +    cuda_dev_info_t *dinfo = &gpu_info->cuda_dev[index];
 +
 +    bool bGpuExists =
 +        dinfo->stat == egpuCompatible ||
 +        dinfo->stat == egpuIncompatible;
 +
 +    if (!bGpuExists)
 +    {
 +        sprintf(s, "#%d: %s, stat: %s",
 +                dinfo->id, "N/A",
 +                gpu_detect_res_str[dinfo->stat]);
 +    }
 +    else
 +    {
 +        sprintf(s, "#%d: NVIDIA %s, compute cap.: %d.%d, ECC: %3s, stat: %s",
 +                dinfo->id, dinfo->prop.name,
 +                dinfo->prop.major, dinfo->prop.minor,
 +                dinfo->prop.ECCEnabled ? "yes" : " no",
 +                gpu_detect_res_str[dinfo->stat]);
 +    }
 +}
 +
 +/*! \brief Returns the device ID of the GPU with a given index into the array of used GPUs.
 + *
 + * Getter function which, given an index into the array of GPUs in use
 + * (cuda_dev_use) -- typically a tMPI/MPI rank --, returns the device ID of the
 + * respective CUDA GPU.
 + *
 + * \param[in]    gpu_info   pointer to structure holding GPU information
 + * \param[in]    idx        index into the array of used GPUs
 + * \returns                 device ID of the requested GPU
 + */
 +int get_gpu_device_id(const gmx_gpu_info_t *gpu_info, int idx)
 +{
 +    assert(gpu_info);
 +    if (idx < 0 || idx >= gpu_info->ncuda_dev_use)
 +    {
 +        return -1;
 +    }
 +
 +    return gpu_info->cuda_dev[gpu_info->cuda_dev_use[idx]].id;
 +}
 +
 +/*! \brief Returns the device ID of the GPU currently in use.
 + *
 + * The GPU used is the one that is active at the time of the call in the active context.
 + *
 + * \returns                 device ID of the GPU in use at the time of the call
 + */
 +int get_current_gpu_device_id(void)
 +{
 +    int gpuid;
 +    CU_RET_ERR(cudaGetDevice(&gpuid), "cudaGetDevice failed");
 +
 +    return gpuid;
 +}
index 9a2be229fe2fb115feb62764182520abad476773,0000000000000000000000000000000000000000..b76cac81fea6813e6369144d9135afff72424ca3
mode 100644,000000..100644
--- /dev/null
@@@ -1,1911 -1,0 +1,1495 @@@
- #ifdef GMX_FORTRAN
- /* NOTE: DO NOT USE THESE ANYWHERE IN GROMACS ITSELF. 
-    These are necessary for the backward-compatile io routines for 3d party
-    tools */
- #define MAXID 256
- static FILE *xdrfiles[MAXID];
- static XDR *xdridptr[MAXID];
- static char xdrmodes[MAXID];
- static unsigned int cnt;
- #ifdef GMX_THREAD_MPI
- /* we need this because of the global variables above for FORTRAN binding. 
-    The I/O operations are going to be slow. */
- static tMPI_Thread_mutex_t xdr_fortran_mutex=TMPI_THREAD_MUTEX_INITIALIZER;
- #endif
- static void xdr_fortran_lock(void)
- {
- #ifdef GMX_THREAD_MPI
-     tMPI_Thread_mutex_lock(&xdr_fortran_mutex);
- #endif
- }
- static void xdr_fortran_unlock(void)
- {
- #ifdef GMX_THREAD_MPI
-     tMPI_Thread_mutex_unlock(&xdr_fortran_mutex);
- #endif
- }
- /* the open&close prototypes */
- static int xdropen(XDR *xdrs, const char *filename, const char *type);
- static int xdrclose(XDR *xdrs);
- typedef void (* F77_FUNC(xdrfproc,XDRFPROC))(int *, void *, int *);
- int ftocstr(char *ds, int dl, char *ss, int sl)
-     /* dst, src ptrs */
-     /* dst max len */
-     /* src len */
- {
-     char *p;
-     p = ss + sl;
-     while ( --p >= ss && *p == ' ' );
-     sl = p - ss + 1;
-     dl--;
-     ds[0] = 0;
-     if (sl > dl)
-       return 1;
-     while (sl--)
-       (*ds++ = *ss++);
-     *ds = '\0';
-     return 0;
- }
- int ctofstr(char *ds, int dl, char *ss)
-      /* dest space */
-      /* max dest length */
-      /* src string (0-term) */
- {
-     while (dl && *ss) {
-       *ds++ = *ss++;
-       dl--;
-     }
-     while (dl--)
-       *ds++ = ' ';
-     return 0;
- }
- void
- F77_FUNC(xdrfbool,XDRFBOOL)(int *xdrid, int *pb, int *ret) 
- {
-         xdr_fortran_lock();
-       *ret = xdr_bool(xdridptr[*xdrid], pb);
-       cnt += XDR_INT_SIZE;
-         xdr_fortran_unlock();
- }
- void
- F77_FUNC(xdrfchar,XDRFCHAR)(int *xdrid, char *cp, int *ret)
- {
-         xdr_fortran_lock();
-       *ret = xdr_char(xdridptr[*xdrid], cp);
-       cnt += sizeof(char);
-         xdr_fortran_unlock();
- }
- void
- F77_FUNC(xdrfdouble,XDRFDOUBLE)(int *xdrid, double *dp, int *ret)
- {
-         xdr_fortran_lock();
-       *ret = xdr_double(xdridptr[*xdrid], dp);
-       cnt += sizeof(double);
-         xdr_fortran_unlock();
- }
- void
- F77_FUNC(xdrffloat,XDRFFLOAT)(int *xdrid, float *fp, int *ret)
- {
-         xdr_fortran_lock();
-       *ret = xdr_float(xdridptr[*xdrid], fp);
-       cnt += sizeof(float);
-         xdr_fortran_unlock();
- }
- void
- F77_FUNC(xdrfint,XDRFINT)(int *xdrid, int *ip, int *ret)
- {
-         xdr_fortran_lock();
-       *ret = xdr_int(xdridptr[*xdrid], ip);
-       cnt += XDR_INT_SIZE;
-         xdr_fortran_unlock();
- }
- void
- F77_FUNC(xdrfshort,XDRFSHORT)(int *xdrid, short *sp, int *ret)
- {
-         xdr_fortran_lock();
-       *ret = xdr_short(xdridptr[*xdrid], sp);
-       cnt += sizeof(sp);
-         xdr_fortran_unlock();
- }
- void
- F77_FUNC(xdrfuchar,XDRFUCHAR)(int *xdrid, unsigned char *ucp, int *ret)
- {
-         xdr_fortran_lock();
-       *ret = xdr_u_char(xdridptr[*xdrid], (u_char *)ucp);
-       cnt += sizeof(char);
-         xdr_fortran_unlock();
- }
- void
- F77_FUNC(xdrfushort,XDRFUSHORT)(int *xdrid, unsigned short *usp, int *ret)
- {
-         xdr_fortran_lock();
-       *ret = xdr_u_short(xdridptr[*xdrid], (unsigned short *)usp);
-       cnt += sizeof(unsigned short);
-         xdr_fortran_unlock();
- }
- void 
- F77_FUNC(xdrf3dfcoord,XDRF3DFCOORD)(int *xdrid, float *fp, int *size, float *precision, int *ret)
- {
-         xdr_fortran_lock();
-       *ret = xdr3dfcoord(xdridptr[*xdrid], fp, size, precision);
-         xdr_fortran_unlock();
- }
- void
- F77_FUNC(xdrfstring,XDRFSTRING)(int *xdrid, char * sp_ptr,
-                               int *maxsize, int *ret, int sp_len)
- {
-       char *tsp;
-         xdr_fortran_lock();
-       tsp = (char*) malloc((size_t)(((sp_len) + 1) * sizeof(char)));
-       if (tsp == NULL) {
-           *ret = -1;
-           return;
-       }
-       if (ftocstr(tsp, *maxsize+1, sp_ptr, sp_len)) {
-           *ret = -1;
-           free(tsp);
-             xdr_fortran_unlock();
-           return;
-       }
-         *ret = xdr_string(xdridptr[*xdrid], (char **) &tsp, (unsigned int) *maxsize);
-       ctofstr( sp_ptr, sp_len , tsp);
-       cnt += *maxsize;
-       free(tsp);
-         xdr_fortran_unlock();
- }
- void
- F77_FUNC(xdrfwrapstring,XDRFWRAPSTRING)(int *xdrid, char *sp_ptr,
-                                       int *ret, int sp_len)
- {
-       char *tsp;
-       int maxsize;
-         xdr_fortran_lock();
-       maxsize = (sp_len) + 1;
-       tsp = (char*) malloc((size_t)(maxsize * sizeof(char)));
-       if (tsp == NULL) {
-           *ret = -1;
-           return;
-             xdr_fortran_unlock();
-       }
-       if (ftocstr(tsp, maxsize, sp_ptr, sp_len)) {
-           *ret = -1;
-           free(tsp);
-           return;
-             xdr_fortran_unlock();
-       }
-       *ret = xdr_string(xdridptr[*xdrid], (char **) &tsp, (u_int)maxsize);
-       ctofstr( sp_ptr, sp_len, tsp);
-       cnt += maxsize;
-       free(tsp);
-         xdr_fortran_unlock();
- }
- void
- F77_FUNC(xdrfopaque,XDRFOPAQUE)(int *xdrid, caddr_t *cp, int *ccnt, int *ret)
- {
-         xdr_fortran_lock();
-       *ret = xdr_opaque(xdridptr[*xdrid], (caddr_t)*cp, (u_int)*ccnt);
-       cnt += *ccnt;
-         xdr_fortran_unlock();
- }
- void
- F77_FUNC(xdrfsetpos,XDRFSETPOS)(int *xdrid, int *pos, int *ret)
- {
-         xdr_fortran_lock();
-       *ret = xdr_setpos(xdridptr[*xdrid], (u_int) *pos);
-         xdr_fortran_unlock();
- }
- void
- F77_FUNC(xdrf,XDRF)(int *xdrid, int *pos)
- {
-         xdr_fortran_lock();
-       *pos = xdr_getpos(xdridptr[*xdrid]);
-         xdr_fortran_unlock();
- }
- void
- F77_FUNC(xdrfvector,XDRFVECTOR)(int *xdrid, char *cp, int *size, F77_FUNC(xdrfproc,XDRFPROC) elproc, int *ret) 
- {
-       int lcnt;
-       cnt = 0;
-         xdr_fortran_lock();
-       for (lcnt = 0; lcnt < *size; lcnt++) {
-               elproc(xdrid, (cp+cnt) , ret);
-       }
-         xdr_fortran_unlock();
- }
- void
- F77_FUNC(xdrfclose,XDRFCLOSE)(int *xdrid, int *ret)
- {
-         xdr_fortran_lock();
-       *ret = xdrclose(xdridptr[*xdrid]);
-       cnt = 0;
-         xdr_fortran_unlock();
- }
- void
- F77_FUNC(xdrfopen,XDRFOPEN)(int *xdrid, char *fp_ptr, char *mode_ptr,
-                           int *ret, int fp_len, int mode_len)
- {
-       char fname[512];
-       char fmode[3];
-         xdr_fortran_lock();
-       if (ftocstr(fname, sizeof(fname), fp_ptr, fp_len)) {
-               *ret = 0;
-       }
-       if (ftocstr(fmode, sizeof(fmode), mode_ptr,
-                       mode_len)) {
-               *ret = 0;
-       }
-       *xdrid = xdropen(NULL, fname, fmode);
-       if (*xdrid == 0)
-               *ret = 0;
-       else 
-               *ret = 1;       
-         xdr_fortran_unlock();
- }
- /*__________________________________________________________________________
-  |
-  | xdropen - open xdr file
-  |
-  | This versions differs from xdrstdio_create, because I need to know
-  | the state of the file (read or write)  and the file descriptor
-  | so I can close the file (something xdr_destroy doesn't do).
-  |
-  | It assumes xdr_fortran_mutex is locked.
-  |
-  | NOTE: THIS FUNCTION IS NOW OBSOLETE AND ONLY PROVIDED FOR BACKWARD
-  |       COMPATIBILITY OF 3D PARTY TOOLS. IT SHOULD NOT BE USED ANYWHERE 
-  |       IN GROMACS ITSELF. 
- */
- int xdropen(XDR *xdrs, const char *filename, const char *type) {
-     static int init_done = 0;
-     enum xdr_op lmode;
-     int xdrid;
-     char newtype[5];
- #ifdef GMX_THREAD_MPI
-     if (!tMPI_Thread_mutex_trylock( &xdr_fortran_mutex ))  
-     {
-         tMPI_Thread_mutex_unlock( &xdr_fortran_mutex );
-         gmx_incons("xdropen called without locked mutex. NEVER call this function.");
-     }
- #endif 
-     if (init_done == 0) {
-       for (xdrid = 1; xdrid < MAXID; xdrid++) {
-           xdridptr[xdrid] = NULL;
-       }
-       init_done = 1;
-     }
-     xdrid = 1;
-     while (xdrid < MAXID && xdridptr[xdrid] != NULL) {
-       xdrid++;
-     }
-     if (xdrid == MAXID) {
-       return 0;
-     }
-     if (*type == 'w' || *type == 'W')
-     {
-         xdrmodes[xdrid] = 'w';
-         strcpy(newtype, "wb+");
-         lmode = XDR_ENCODE;
-     }
-     else if (*type == 'a' || *type == 'A')
-     {
-         xdrmodes[xdrid] = 'a';
-         strcpy(newtype, "ab+");
-         lmode = XDR_ENCODE;
-     }
-     else if (gmx_strncasecmp(type, "r+", 2) == 0)
-     {
-         xdrmodes[xdrid] = 'a';
-         strcpy(newtype, "rb+");
-         lmode = XDR_ENCODE;
-     }
-     else
-     {
-         xdrmodes[xdrid] = 'r';
-         strcpy(newtype, "rb");
-         lmode = XDR_DECODE;
-     }
-     xdrfiles[xdrid] = fopen(filename, newtype);
-       
-     if (xdrfiles[xdrid] == NULL) {
-       xdrs = NULL;
-       return 0;
-     }
-     
-     /* next test isn't useful in the case of C language
-      * but is used for the Fortran interface
-      * (C users are expected to pass the address of an already allocated
-      * XDR staructure)
-      */
-     if (xdrs == NULL) {
-       xdridptr[xdrid] = (XDR *) malloc((size_t)sizeof(XDR));
-       xdrstdio_create(xdridptr[xdrid], xdrfiles[xdrid], lmode);
-     } else {
-       xdridptr[xdrid] = xdrs;
-       xdrstdio_create(xdrs, xdrfiles[xdrid], lmode);
-     }
-     return xdrid;
- }
- /*_________________________________________________________________________
-  |
-  | xdrclose - close a xdr file
-  |
-  | This will flush the xdr buffers, and destroy the xdr stream.
-  | It also closes the associated file descriptor (this is *not*
-  | done by xdr_destroy).
-  |
-  | It assumes xdr_fortran_mutex is locked.
-  |
-  | NOTE: THIS FUNCTION IS NOW OBSOLETE AND ONLY PROVIDED FOR BACKWARD
-  |       COMPATIBILITY OF 3D PARTY TOOLS. IT SHOULD NOT BE USED ANYWHERE 
-  |       IN GROMACS ITSELF. 
- */
-  
- int xdrclose(XDR *xdrs) {
-     int xdrid;
-     int rc = 0;
- #ifdef GMX_THREAD_MPI
-     if (!tMPI_Thread_mutex_trylock( &xdr_fortran_mutex ))  
-     {
-         tMPI_Thread_mutex_unlock( &xdr_fortran_mutex );
-         gmx_incons("xdropen called without locked mutex. NEVER call this function");
-     }
- #endif
-     if (xdrs == NULL) {
-       fprintf(stderr, "xdrclose: passed a NULL pointer\n");
-       exit(1);
-     }
-     for (xdrid = 1; xdrid < MAXID && rc==0; xdrid++) {
-       if (xdridptr[xdrid] == xdrs) {
-           
-           xdr_destroy(xdrs);
-           rc = fclose(xdrfiles[xdrid]);
-           xdridptr[xdrid] = NULL;
-           return !rc; /* xdr routines return 0 when ok */
-       }
-     } 
-     fprintf(stderr, "xdrclose: no such open xdr file\n");
-     exit(1);
-     
-     /* to make some compilers happy: */
-     return 0;    
- }
- #endif /* GMX_FORTRAN */
 +/*
 + * 
 + *                This source code is part of
 + * 
 + *                 G   R   O   M   A   C   S
 + * 
 + *          GROningen MAchine for Chemical Simulations
 + * 
 + *                        VERSION 3.2.0
 + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
 + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 + * Copyright (c) 2001-2004, The GROMACS development team,
 + * check out http://www.gromacs.org for more information.
 +
 + * This program is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU General Public License
 + * as published by the Free Software Foundation; either version 2
 + * of the License, or (at your option) any later version.
 + * 
 + * If you want to redistribute modifications, please consider that
 + * scientific software is very special. Version control is crucial -
 + * bugs must be traceable. We will be happy to consider code for
 + * inclusion in the official distribution, but derived work must not
 + * be called official GROMACS. Details are found in the README & COPYING
 + * files - if they are missing, get the official version at www.gromacs.org.
 + * 
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the papers on the package - you can find them in the top README file.
 + * 
 + * For more info, check our website at http://www.gromacs.org
 + * 
 + * And Hey:
 + * GROningen Mixture of Alchemy and Childrens' Stories
 + */
 +#ifdef HAVE_CONFIG_H
 +#include <config.h>
 +#endif
 +
 +#include <limits.h>
 +#include <math.h>
 +#include <stdio.h>
 +#include <stdlib.h>
 +#include <string.h>
 +#include "statutil.h"
 +#include "xdrf.h"
 +#include "string2.h"
 +#include "futil.h"
 +#include "gmx_fatal.h"
 +
 +
 +#if 0
 +#ifdef HAVE_FSEEKO
 +#  define gmx_fseek(A,B,C) fseeko(A,B,C)
 +#  define gmx_ftell(A) ftello(A)
 +#  define gmx_off_t off_t
 +#else
 +#  define gmx_fseek(A,B,C) fseek(A,B,C)
 +#  define gmx_ftell(A) ftell(A)
 +#  define gmx_off_t int
 +#endif
 +#endif
 +
 +
 +/* This is just for clarity - it can never be anything but 4! */
 +#define XDR_INT_SIZE 4
 +
 +/* same order as the definition of xdr_datatype */
 +const char *xdr_datatype_names[] =
 +{
 +    "int",
 +    "float",
 +    "double",
 +    "large int",
 +    "char",
 +    "string"
 +};
 +
 +
 +/*___________________________________________________________________________
 + |
 + | what follows are the C routines to read/write compressed coordinates together
 + | with some routines to assist in this task (those are marked
 + | static and cannot be called from user programs)
 +*/
 +#define MAXABS (INT_MAX-2)
 +
 +#ifndef MIN
 +#define MIN(x,y) ((x) < (y) ? (x):(y))
 +#endif
 +#ifndef MAX
 +#define MAX(x,y) ((x) > (y) ? (x):(y))
 +#endif
 +#ifndef SQR
 +#define SQR(x) ((x)*(x))
 +#endif
 +static const int magicints[] = {
 +    0, 0, 0, 0, 0, 0, 0, 0, 0,
 +    8, 10, 12, 16, 20, 25, 32, 40, 50, 64,
 +    80, 101, 128, 161, 203, 256, 322, 406, 512, 645,
 +    812, 1024, 1290, 1625, 2048, 2580, 3250, 4096, 5060, 6501,
 +    8192, 10321, 13003, 16384, 20642, 26007, 32768, 41285, 52015, 65536,
 +    82570, 104031, 131072, 165140, 208063, 262144, 330280, 416127, 524287, 660561,
 +    832255, 1048576, 1321122, 1664510, 2097152, 2642245, 3329021, 4194304, 5284491, 6658042,
 +    8388607, 10568983, 13316085, 16777216 };
 +
 +#define FIRSTIDX 9
 +/* note that magicints[FIRSTIDX-1] == 0 */
 +#define LASTIDX (sizeof(magicints) / sizeof(*magicints))
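 +
 +/* To get a feel for the table: magicints grows by roughly a factor of
 + * 2^(1/3) per index, so three index steps double the per-coordinate range
 + * (e.g. magicints[30] == 1024 and magicints[33] == 2048), and
 + * magicints[FIRSTIDX] == 8 is the smallest usable range. */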
 +
 +
 +/*____________________________________________________________________________
 + |
 + | sendbits - encode num into buf using the specified number of bits
 + |
 + | This routine appends the value of num to the bits already present in
 + | the array buf. You need to give it the number of bits to use, and you had
 + | better make sure that this number of bits is enough to hold the value.
 + | Also, num must be positive.
 + |
 +*/
 +
 +static void sendbits(int buf[], int num_of_bits, int num) {
 +    
 +    unsigned int cnt, lastbyte;
 +    int lastbits;
 +    unsigned char * cbuf;
 +    
 +    cbuf = ((unsigned char *)buf) + 3 * sizeof(*buf);
 +    cnt = (unsigned int) buf[0];
 +    lastbits = buf[1];
 +    lastbyte =(unsigned int) buf[2];
 +    while (num_of_bits >= 8) {
 +      lastbyte = (lastbyte << 8) | ((num >> (num_of_bits -8)) /* & 0xff*/);
 +      cbuf[cnt++] = lastbyte >> lastbits;
 +      num_of_bits -= 8;
 +    }
 +    if (num_of_bits > 0) {
 +      lastbyte = (lastbyte << num_of_bits) | num;
 +      lastbits += num_of_bits;
 +      if (lastbits >= 8) {
 +          lastbits -= 8;
 +          cbuf[cnt++] = lastbyte >> lastbits;
 +      }
 +    }
 +    buf[0] = cnt;
 +    buf[1] = lastbits;
 +    buf[2] = lastbyte;
 +    if (lastbits>0) {
 +      cbuf[cnt] = lastbyte << (8 - lastbits);
 +    }
 +}
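 +
 +/* A minimal sketch of driving sendbits() directly, assuming a caller that
 + * packs one 5-bit value: buf[0..2] hold the packer state (byte count,
 + * pending bit count, pending bits) and the packed bytes start at buf[3]. */
 +static void sendbits_example(void)
 +{
 +    int buf[3 + 4];
 +
 +    buf[0] = buf[1] = buf[2] = 0; /* reset the packer state */
 +    sendbits(buf, 5, 22);         /* appends the bit pattern 10110 */
 +}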
 +
 +/*_________________________________________________________________________
 + |
 + | sizeofint - calculate bitsize of an integer
 + |
 + | return the number of bits needed to store an integer with given max size
 + |
 +*/
 +
 +static int sizeofint(const int size) {
 +    int num = 1;
 +    int num_of_bits = 0;
 +    
 +    while (size >= num && num_of_bits < 32) {
 +      num_of_bits++;
 +      num <<= 1;
 +    }
 +    return num_of_bits;
 +}
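 +
 +/* Worked example: sizeofint(1000) returns 10, since 2^9 = 512 <= 1000 but
 + * 2^10 = 1024 > 1000, so 10 bits suffice for any value up to 1000. */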
 +
 +/*___________________________________________________________________________
 + |
 + | sizeofints - calculate 'bitsize' of compressed ints
 + |
 + | given the number of small unsigned integers and the maximum value
 + | return the number of bits needed to read or write them with the
 + | routines receiveints and sendints. You need this parameter when
 + | calling these routines. Note that for many calls I can use
 + | the variable 'smallidx' which is exactly the number of bits, and
 + | so I don't need to call 'sizeofints' for those calls.
 +*/
 +
 +static int sizeofints( const int num_of_ints, unsigned int sizes[]) {
 +    int i, num;
 +    int bytes[32];
 +    unsigned int num_of_bytes, num_of_bits, bytecnt, tmp;
 +    num_of_bytes = 1;
 +    bytes[0] = 1;
 +    num_of_bits = 0;
 +    for (i=0; i < num_of_ints; i++) { 
 +      tmp = 0;
 +      for (bytecnt = 0; bytecnt < num_of_bytes; bytecnt++) {
 +          tmp = bytes[bytecnt] * sizes[i] + tmp;
 +          bytes[bytecnt] = tmp & 0xff;
 +          tmp >>= 8;
 +      }
 +      while (tmp != 0) {
 +          bytes[bytecnt++] = tmp & 0xff;
 +          tmp >>= 8;
 +      }
 +      num_of_bytes = bytecnt;
 +    }
 +    num = 1;
 +    num_of_bytes--;
 +    while (bytes[num_of_bytes] >= num) {
 +      num_of_bits++;
 +      num *= 2;
 +    }
 +    return num_of_bits + num_of_bytes * 8;
 +
 +}
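 +
 +/* Worked example: for sizes {5, 6, 7} the product is 210, and
 + * sizeofints(3, sizes) returns 8, since 2^7 = 128 < 210 <= 256 = 2^8. */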
 +    
 +/*____________________________________________________________________________
 + |
 + | sendints - send a small set of small integers in compressed format
 + |
 + | this routine is used internally by xdr3dfcoord, to send a set of
 + | small integers to the buffer. 
 + | Multiplication with fixed (specified maximum) sizes is used to get
 + | to one big, multibyte integer. Although the routine could be
 + | modified to handle sizes bigger than 16777216, or more than just
 + | a few integers, this is not done, because the gain in compression
 + | isn't worth the effort. Note that overflowing the multiplication
 + | or the byte buffer (32 bytes) is unchecked and causes bad results.
 + |
 + */
 + 
 +static void sendints(int buf[], const int num_of_ints, const int num_of_bits,
 +      unsigned int sizes[], unsigned int nums[]) {
 +
 +    int i, num_of_bytes, bytecnt;
 +    unsigned int bytes[32], tmp;
 +
 +    tmp = nums[0];
 +    num_of_bytes = 0;
 +    do {
 +      bytes[num_of_bytes++] = tmp & 0xff;
 +      tmp >>= 8;
 +    } while (tmp != 0);
 +
 +    for (i = 1; i < num_of_ints; i++) {
 +      if (nums[i] >= sizes[i]) {
 +          fprintf(stderr,"major breakdown in sendints num %u doesn't "
 +                  "match size %u\n", nums[i], sizes[i]);
 +          exit(1);
 +      }
 +      /* use one step multiply */    
 +      tmp = nums[i];
 +      for (bytecnt = 0; bytecnt < num_of_bytes; bytecnt++) {
 +          tmp = bytes[bytecnt] * sizes[i] + tmp;
 +          bytes[bytecnt] = tmp & 0xff;
 +          tmp >>= 8;
 +      }
 +      while (tmp != 0) {
 +          bytes[bytecnt++] = tmp & 0xff;
 +          tmp >>= 8;
 +      }
 +      num_of_bytes = bytecnt;
 +    }
 +    if (num_of_bits >= num_of_bytes * 8) {
 +      for (i = 0; i < num_of_bytes; i++) {
 +          sendbits(buf, 8, bytes[i]);
 +      }
 +      sendbits(buf, num_of_bits - num_of_bytes * 8, 0);
 +    } else {
 +      for (i = 0; i < num_of_bytes-1; i++) {
 +          sendbits(buf, 8, bytes[i]);
 +      }
 +      sendbits(buf, num_of_bits- (num_of_bytes -1) * 8, bytes[i]);
 +    }
 +}
 +
 +
 +/*___________________________________________________________________________
 + |
 + | receivebits - decode number from buf using specified number of bits
 + | 
 + | extract the number of bits from the array buf and construct an integer
 + | from it. Return that value.
 + |
 +*/
 +
 +static int receivebits(int buf[], int num_of_bits) {
 +
 +    int cnt, num, lastbits; 
 +    unsigned int lastbyte;
 +    unsigned char * cbuf;
 +    int mask = (1 << num_of_bits) -1;
 +
 +    cbuf = ((unsigned char *)buf) + 3 * sizeof(*buf);
 +    cnt = buf[0];
 +    lastbits = (unsigned int) buf[1];
 +    lastbyte = (unsigned int) buf[2];
 +    
 +    num = 0;
 +    while (num_of_bits >= 8) {
 +      lastbyte = ( lastbyte << 8 ) | cbuf[cnt++];
 +      num |=  (lastbyte >> lastbits) << (num_of_bits - 8);
 +      num_of_bits -=8;
 +    }
 +    if (num_of_bits > 0) {
 +      if (lastbits < num_of_bits) {
 +          lastbits += 8;
 +          lastbyte = (lastbyte << 8) | cbuf[cnt++];
 +      }
 +      lastbits -= num_of_bits;
 +      num |= (lastbyte >> lastbits) & ((1 << num_of_bits) -1);
 +    }
 +    num &= mask;
 +    buf[0] = cnt;
 +    buf[1] = lastbits;
 +    buf[2] = lastbyte;
 +    return num; 
 +}
 +
 +/*____________________________________________________________________________
 + |
 + | receiveints - decode 'small' integers from the buf array
 + |
 + | this routine is the inverse from sendints() and decodes the small integers
 + | written to buf by calculating the remainder and doing divisions with
 + | the given sizes[]. You need to specify the total number of bits to be
 + | used from buf in num_of_bits.
 + |
 +*/
 +
 +static void receiveints(int buf[], const int num_of_ints, int num_of_bits,
 +      unsigned int sizes[], int nums[]) {
 +    int bytes[32];
 +    int i, j, num_of_bytes, p, num;
 +    
 +    bytes[0] = bytes[1] = bytes[2] = bytes[3] = 0;
 +    num_of_bytes = 0;
 +    while (num_of_bits > 8) {
 +      bytes[num_of_bytes++] = receivebits(buf, 8);
 +      num_of_bits -= 8;
 +    }
 +    if (num_of_bits > 0) {
 +      bytes[num_of_bytes++] = receivebits(buf, num_of_bits);
 +    }
 +    for (i = num_of_ints-1; i > 0; i--) {
 +      num = 0;
 +      for (j = num_of_bytes-1; j >=0; j--) {
 +          num = (num << 8) | bytes[j];
 +          p = num / sizes[i];
 +          bytes[j] = p;
 +          num = num - p * sizes[i];
 +      }
 +      nums[i] = num;
 +    }
 +    nums[0] = bytes[0] | (bytes[1] << 8) | (bytes[2] << 16) | (bytes[3] << 24);
 +}
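 +
 +/* A minimal round-trip sketch for sendints()/receiveints(), assuming a
 + * fresh buffer; the function name is hypothetical. Encoding the triplet
 + * {2, 3, 4} with radices {5, 6, 7} and rewinding the packer state before
 + * decoding recovers the original values. */
 +static void ints_roundtrip_example(void)
 +{
 +    unsigned int sizes[3] = { 5, 6, 7 };
 +    unsigned int in[3]    = { 2, 3, 4 };
 +    int          out[3];
 +    int          buf[3 + 8];
 +    int          nbits    = sizeofints(3, sizes); /* 8 bits, since 5*6*7 = 210 */
 +
 +    buf[0] = buf[1] = buf[2] = 0;
 +    sendints(buf, 3, nbits, sizes, in);
 +    buf[0] = buf[1] = buf[2] = 0;           /* rewind before decoding */
 +    receiveints(buf, 3, nbits, sizes, out); /* out is now {2, 3, 4} */
 +}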
 +    
 +/*____________________________________________________________________________
 + |
 + | xdr3dfcoord - read or write compressed 3d coordinates to xdr file.
 + |
 + | this routine reads or writes (depending on how you opened the file with
 + | xdropen() ) a large number of 3d coordinates (stored in *fp).
 + | The number of coordinates triplets to write is given by *size. On
 + | read this number may be zero, in which case it reads as many as were written
 + | or it may specify the number of triplets to read (which should match the
 + | number written).
 + | Compression is achieved by first converting all floating-point numbers to integers
 + | using multiplication by *precision and rounding to the nearest integer.
 + | Then the minimum and maximum value are calculated to determine the range.
 + | The limited range of integers so found is used to compress the coordinates.
 + | In addition, the differences between successive coordinates are calculated.
 + | If the difference happens to be 'small' then only the difference is saved,
 + | compressing the data even more. The notion of 'small' is changed dynamically
 + | and is enlarged or reduced whenever needed or possible.
 + | Extra compression is achieved in the case of GROMOS and coordinates of
 + | water molecules. GROMOS first writes out the oxygen position, followed by
 + | the two hydrogens. In order to make the differences smaller (and thereby
 + | compress the data better) the order is changed into first one hydrogen,
 + | then the oxygen, followed by the other hydrogen. This is rather special, but
 + | it shouldn't harm in the general case.
 + |
 + */
 + 
 +int xdr3dfcoord(XDR *xdrs, float *fp, int *size, float *precision) 
 +{
 +    int *ip = NULL;
 +    int *buf = NULL;
 +    gmx_bool bRead;
 +        
 +    /* preallocate a small buffer and ip on the stack - if we need more
 +       we can always malloc(). This is faster for small values of size: */
 +    unsigned prealloc_size=3*16;
 +    int prealloc_ip[3*16], prealloc_buf[3*20];
 +    int we_should_free=0;
 +
 +    int minint[3], maxint[3], mindiff, *lip, diff;
 +    int lint1, lint2, lint3, oldlint1, oldlint2, oldlint3, smallidx;
 +    int minidx, maxidx;
 +    unsigned sizeint[3], sizesmall[3], bitsizeint[3], size3, *luip;
 +    int flag, k;
 +    int smallnum, smaller, larger, i, is_small, is_smaller, run, prevrun;
 +    float *lfp, lf;
 +    int tmp, *thiscoord,  prevcoord[3];
 +    unsigned int tmpcoord[30];
 +
 +    int bufsize, xdrid, lsize;
 +    unsigned int bitsize;
 +    float inv_precision;
 +    int errval = 1;
 +    int rc;
 +      
 +    bRead = (xdrs->x_op == XDR_DECODE);
 +    bitsizeint[0] = bitsizeint[1] = bitsizeint[2] = 0;
 +    prevcoord[0]  = prevcoord[1]  = prevcoord[2]  = 0;
 +   
 +    if (!bRead)
 +    {
 +      /* xdrs is open for writing */
 +
 +      if (xdr_int(xdrs, size) == 0)
 +          return 0;
 +      size3 = *size * 3;
 +      /* when the number of coordinates is small, don't try to compress; just
 +       * write them as floats using xdr_vector
 +       */
 +      if (*size <= 9 ) {
 +            return (xdr_vector(xdrs, (char *) fp, (unsigned int)size3, 
 +                    (unsigned int)sizeof(*fp), (xdrproc_t)xdr_float));
 +      }
 +      
 +      if(xdr_float(xdrs, precision) == 0)
 +            return 0;
 +
 +        if (size3 <= prealloc_size)
 +        {
 +            ip=prealloc_ip;
 +            buf=prealloc_buf;
 +        }
 +        else
 +        {
 +            we_should_free=1;
 +          bufsize = size3 * 1.2;
 +          ip = (int *)malloc((size_t)(size3 * sizeof(*ip)));
 +          buf = (int *)malloc((size_t)(bufsize * sizeof(*buf)));
 +          if (ip == NULL || buf==NULL) 
 +            {
 +              fprintf(stderr,"malloc failed\n");
 +              exit(1);
 +          }
 +        }
 +      /* buf[0-2] are special and do not contain actual data */
 +      buf[0] = buf[1] = buf[2] = 0;
 +      minint[0] = minint[1] = minint[2] = INT_MAX;
 +      maxint[0] = maxint[1] = maxint[2] = INT_MIN;
 +      prevrun = -1;
 +      lfp = fp;
 +      lip = ip;
 +      mindiff = INT_MAX;
 +      oldlint1 = oldlint2 = oldlint3 = 0;
 +      while(lfp < fp + size3 ) {
 +          /* find nearest integer */
 +          if (*lfp >= 0.0)
 +              lf = *lfp * *precision + 0.5;
 +          else
 +              lf = *lfp * *precision - 0.5;
 +          if (fabs(lf) > MAXABS) {
 +              /* scaling would cause overflow */
 +              errval = 0;
 +          }
 +          lint1 = lf;
 +          if (lint1 < minint[0]) minint[0] = lint1;
 +          if (lint1 > maxint[0]) maxint[0] = lint1;
 +          *lip++ = lint1;
 +          lfp++;
 +          if (*lfp >= 0.0)
 +              lf = *lfp * *precision + 0.5;
 +          else
 +              lf = *lfp * *precision - 0.5;
 +          if (fabs(lf) > MAXABS) {
 +              /* scaling would cause overflow */
 +              errval = 0;
 +          }
 +          lint2 = lf;
 +          if (lint2 < minint[1]) minint[1] = lint2;
 +          if (lint2 > maxint[1]) maxint[1] = lint2;
 +          *lip++ = lint2;
 +          lfp++;
 +          if (*lfp >= 0.0)
 +              lf = *lfp * *precision + 0.5;
 +          else
 +              lf = *lfp * *precision - 0.5;
 +          if (fabs(lf) > MAXABS) {
 +              /* scaling would cause overflow */
 +              errval = 0;
 +          }
 +          lint3 = lf;
 +          if (lint3 < minint[2]) minint[2] = lint3;
 +          if (lint3 > maxint[2]) maxint[2] = lint3;
 +          *lip++ = lint3;
 +          lfp++;
 +          diff = abs(oldlint1-lint1)+abs(oldlint2-lint2)+abs(oldlint3-lint3);
 +          if (diff < mindiff && lfp > fp + 3)
 +              mindiff = diff;
 +          oldlint1 = lint1;
 +          oldlint2 = lint2;
 +          oldlint3 = lint3;
 +      }
 +      if ( (xdr_int(xdrs, &(minint[0])) == 0) ||
 +               (xdr_int(xdrs, &(minint[1])) == 0) ||
 +               (xdr_int(xdrs, &(minint[2])) == 0) ||
 +           (xdr_int(xdrs, &(maxint[0])) == 0) ||
 +               (xdr_int(xdrs, &(maxint[1])) == 0) ||
 +               (xdr_int(xdrs, &(maxint[2])) == 0))
 +      {
 +            if (we_should_free)
 +            {
 +                free(ip);
 +                free(buf);
 +            }
 +            return 0;
 +      }
 +      
 +      if ((float)maxint[0] - (float)minint[0] >= MAXABS ||
 +              (float)maxint[1] - (float)minint[1] >= MAXABS ||
 +              (float)maxint[2] - (float)minint[2] >= MAXABS) {
 +          /* turning the values into unsigned by subtracting minint
 +           * would cause overflow
 +           */
 +          errval = 0;
 +      }
 +      sizeint[0] = maxint[0] - minint[0]+1;
 +      sizeint[1] = maxint[1] - minint[1]+1;
 +      sizeint[2] = maxint[2] - minint[2]+1;
 +      
 +      /* check if one of the sizes is too big to be multiplied */
 +      if ((sizeint[0] | sizeint[1] | sizeint[2] ) > 0xffffff) {
 +          bitsizeint[0] = sizeofint(sizeint[0]);
 +          bitsizeint[1] = sizeofint(sizeint[1]);
 +          bitsizeint[2] = sizeofint(sizeint[2]);
 +          bitsize = 0; /* flag the use of large sizes */
 +      } else {
 +          bitsize = sizeofints(3, sizeint);
 +      }
 +      lip = ip;
 +      luip = (unsigned int *) ip;
 +      smallidx = FIRSTIDX;
 +      while (smallidx < LASTIDX && magicints[smallidx] < mindiff) {
 +          smallidx++;
 +      }
 +      if(xdr_int(xdrs, &smallidx) == 0)
 +        {
 +            if (we_should_free)
 +            {
 +                free(ip);
 +                free(buf);
 +            }
 +            return 0;
 +        }
 +              
 +      maxidx = MIN(LASTIDX, smallidx + 8);
 +      minidx = maxidx - 8; /* often this equals smallidx */
 +      smaller = magicints[MAX(FIRSTIDX, smallidx-1)] / 2;
 +      smallnum = magicints[smallidx] / 2;
 +      sizesmall[0] = sizesmall[1] = sizesmall[2] = magicints[smallidx];
 +      larger = magicints[maxidx] / 2;
 +      i = 0;
 +      while (i < *size) {
 +          is_small = 0;
 +          thiscoord = (int *)(luip) + i * 3;
 +          if (smallidx < maxidx && i >= 1 &&
 +                  abs(thiscoord[0] - prevcoord[0]) < larger &&
 +                  abs(thiscoord[1] - prevcoord[1]) < larger &&
 +                  abs(thiscoord[2] - prevcoord[2]) < larger) {
 +              is_smaller = 1;
 +          } else if (smallidx > minidx) {
 +              is_smaller = -1;
 +          } else {
 +              is_smaller = 0;
 +          }
 +          if (i + 1 < *size) {
 +              if (abs(thiscoord[0] - thiscoord[3]) < smallnum &&
 +                      abs(thiscoord[1] - thiscoord[4]) < smallnum &&
 +                      abs(thiscoord[2] - thiscoord[5]) < smallnum) {
 +                  /* interchange first with second atom for better
 +                   * compression of water molecules
 +                   */
 +                  tmp = thiscoord[0]; thiscoord[0] = thiscoord[3];
 +                      thiscoord[3] = tmp;
 +                  tmp = thiscoord[1]; thiscoord[1] = thiscoord[4];
 +                      thiscoord[4] = tmp;
 +                  tmp = thiscoord[2]; thiscoord[2] = thiscoord[5];
 +                      thiscoord[5] = tmp;
 +                  is_small = 1;
 +              }
 +    
 +          }
 +          tmpcoord[0] = thiscoord[0] - minint[0];
 +          tmpcoord[1] = thiscoord[1] - minint[1];
 +          tmpcoord[2] = thiscoord[2] - minint[2];
 +          if (bitsize == 0) {
 +              sendbits(buf, bitsizeint[0], tmpcoord[0]);
 +              sendbits(buf, bitsizeint[1], tmpcoord[1]);
 +              sendbits(buf, bitsizeint[2], tmpcoord[2]);
 +          } else {
 +              sendints(buf, 3, bitsize, sizeint, tmpcoord);
 +          }
 +          prevcoord[0] = thiscoord[0];
 +          prevcoord[1] = thiscoord[1];
 +          prevcoord[2] = thiscoord[2];
 +          thiscoord = thiscoord + 3;
 +          i++;
 +          
 +          run = 0;
 +          if (is_small == 0 && is_smaller == -1)
 +              is_smaller = 0;
 +          while (is_small && run < 8*3) {
 +              if (is_smaller == -1 && (
 +                      SQR(thiscoord[0] - prevcoord[0]) +
 +                      SQR(thiscoord[1] - prevcoord[1]) +
 +                      SQR(thiscoord[2] - prevcoord[2]) >= smaller * smaller)) {
 +                  is_smaller = 0;
 +              }
 +
 +              tmpcoord[run++] = thiscoord[0] - prevcoord[0] + smallnum;
 +              tmpcoord[run++] = thiscoord[1] - prevcoord[1] + smallnum;
 +              tmpcoord[run++] = thiscoord[2] - prevcoord[2] + smallnum;
 +              
 +              prevcoord[0] = thiscoord[0];
 +              prevcoord[1] = thiscoord[1];
 +              prevcoord[2] = thiscoord[2];
 +
 +              i++;
 +              thiscoord = thiscoord + 3;
 +              is_small = 0;
 +              if (i < *size &&
 +                      abs(thiscoord[0] - prevcoord[0]) < smallnum &&
 +                      abs(thiscoord[1] - prevcoord[1]) < smallnum &&
 +                      abs(thiscoord[2] - prevcoord[2]) < smallnum) {
 +                  is_small = 1;
 +              }
 +          }
 +          if (run != prevrun || is_smaller != 0) {
 +              prevrun = run;
 +              sendbits(buf, 1, 1); /* flag the change in run-length */
 +              sendbits(buf, 5, run+is_smaller+1);
 +          } else {
 +              sendbits(buf, 1, 0); /* flag the fact that runlength did not change */
 +          }
 +          for (k=0; k < run; k+=3) {
 +              sendints(buf, 3, smallidx, sizesmall, &tmpcoord[k]);    
 +          }
 +          if (is_smaller != 0) {
 +              smallidx += is_smaller;
 +              if (is_smaller < 0) {
 +                  smallnum = smaller;
 +                  smaller = magicints[smallidx-1] / 2;
 +              } else {
 +                  smaller = smallnum;
 +                  smallnum = magicints[smallidx] / 2;
 +              }
 +              sizesmall[0] = sizesmall[1] = sizesmall[2] = magicints[smallidx];
 +          }
 +      }
 +      if (buf[1] != 0) buf[0]++;
 +      /* buf[0] holds the length in bytes */
 +      if(xdr_int(xdrs, &(buf[0])) == 0)
 +        {
 +            if (we_should_free)
 +            {
 +                free(ip);
 +                free(buf);
 +            }
 +            return 0;
 +        }
 +
 +      
 +        rc=errval * (xdr_opaque(xdrs, (char *)&(buf[3]), (unsigned int)buf[0]));
 +        if (we_should_free)
 +        {
 +            free(ip);
 +            free(buf);
 +        }
 +        return rc;
 +      
 +    } else {
 +      
 +      /* xdrs is open for reading */
 +      
 +      if (xdr_int(xdrs, &lsize) == 0) 
 +          return 0;
 +      if (*size != 0 && lsize != *size) {
 +          fprintf(stderr, "wrong number of coordinates in xdr3dfcoord; "
 +                  "%d arg vs %d in file", *size, lsize);
 +      }
 +      *size = lsize;
 +      size3 = *size * 3;
 +      if (*size <= 9) {
 +          *precision = -1;
 +            return (xdr_vector(xdrs, (char *) fp, (unsigned int)size3, 
 +                    (unsigned int)sizeof(*fp), (xdrproc_t)xdr_float));
 +      }
 +      if(xdr_float(xdrs, precision) == 0)
 +              return 0;
 +
 +        if (size3 <= prealloc_size)
 +        {
 +            ip=prealloc_ip;
 +            buf=prealloc_buf;
 +        }
 +        else
 +        {
 +            we_should_free=1;
 +          bufsize = size3 * 1.2;
 +          ip = (int *)malloc((size_t)(size3 * sizeof(*ip)));
 +          buf = (int *)malloc((size_t)(bufsize * sizeof(*buf)));
 +          if (ip == NULL || buf==NULL) 
 +            {
 +              fprintf(stderr,"malloc failed\n");
 +              exit(1);
 +          }
 +        }
 +
 +      buf[0] = buf[1] = buf[2] = 0;
 +      
 +      if ( (xdr_int(xdrs, &(minint[0])) == 0) ||
 +               (xdr_int(xdrs, &(minint[1])) == 0) ||
 +               (xdr_int(xdrs, &(minint[2])) == 0) ||
 +               (xdr_int(xdrs, &(maxint[0])) == 0) ||
 +               (xdr_int(xdrs, &(maxint[1])) == 0) ||
 +               (xdr_int(xdrs, &(maxint[2])) == 0))
 +      {
 +            if (we_should_free)
 +            {
 +                free(ip);
 +                free(buf);
 +            }
 +            return 0;
 +      }
 +                      
 +      sizeint[0] = maxint[0] - minint[0]+1;
 +      sizeint[1] = maxint[1] - minint[1]+1;
 +      sizeint[2] = maxint[2] - minint[2]+1;
 +      
 +      /* check if one of the sizes is too big to be multiplied */
 +      if ((sizeint[0] | sizeint[1] | sizeint[2] ) > 0xffffff) {
 +          bitsizeint[0] = sizeofint(sizeint[0]);
 +          bitsizeint[1] = sizeofint(sizeint[1]);
 +          bitsizeint[2] = sizeofint(sizeint[2]);
 +          bitsize = 0; /* flag the use of large sizes */
 +      } else {
 +          bitsize = sizeofints(3, sizeint);
 +      }
 +      
 +      if (xdr_int(xdrs, &smallidx) == 0)      
 +        {
 +            if (we_should_free)
 +            {
 +                free(ip);
 +                free(buf);
 +            }
 +            return 0;
 +        }
 +
 +      maxidx = MIN(LASTIDX, smallidx + 8);
 +      minidx = maxidx - 8; /* often this equals smallidx */
 +      smaller = magicints[MAX(FIRSTIDX, smallidx-1)] / 2;
 +      smallnum = magicints[smallidx] / 2;
 +      sizesmall[0] = sizesmall[1] = sizesmall[2] = magicints[smallidx] ;
 +      larger = magicints[maxidx];
 +
 +      /* buf[0] holds the length in bytes */
 +
 +      if (xdr_int(xdrs, &(buf[0])) == 0)
 +        {
 +            if (we_should_free)
 +            {
 +                free(ip);
 +                free(buf);
 +            }
 +            return 0;
 +        }
 +
 +
 +      if (xdr_opaque(xdrs, (char *)&(buf[3]), (unsigned int)buf[0]) == 0)
 +        {
 +            if (we_should_free)
 +            {
 +                free(ip);
 +                free(buf);
 +            }
 +            return 0;
 +        }
 +
 +
 +
 +      buf[0] = buf[1] = buf[2] = 0;
 +      
 +      lfp = fp;
 +      inv_precision = 1.0 / (*precision);
 +      run = 0;
 +      i = 0;
 +      lip = ip;
 +      while ( i < lsize ) {
 +          thiscoord = (int *)(lip) + i * 3;
 +
 +          if (bitsize == 0) {
 +              thiscoord[0] = receivebits(buf, bitsizeint[0]);
 +              thiscoord[1] = receivebits(buf, bitsizeint[1]);
 +              thiscoord[2] = receivebits(buf, bitsizeint[2]);
 +          } else {
 +              receiveints(buf, 3, bitsize, sizeint, thiscoord);
 +          }
 +          
 +          i++;
 +          thiscoord[0] += minint[0];
 +          thiscoord[1] += minint[1];
 +          thiscoord[2] += minint[2];
 +          
 +          prevcoord[0] = thiscoord[0];
 +          prevcoord[1] = thiscoord[1];
 +          prevcoord[2] = thiscoord[2];
 +          
 +         
 +          flag = receivebits(buf, 1);
 +          is_smaller = 0;
 +          if (flag == 1) {
 +              run = receivebits(buf, 5);
 +              is_smaller = run % 3;
 +              run -= is_smaller;
 +              is_smaller--;
 +          }
 +          if (run > 0) {
 +              thiscoord += 3;
 +              for (k = 0; k < run; k+=3) {
 +                  receiveints(buf, 3, smallidx, sizesmall, thiscoord);
 +                  i++;
 +                  thiscoord[0] += prevcoord[0] - smallnum;
 +                  thiscoord[1] += prevcoord[1] - smallnum;
 +                  thiscoord[2] += prevcoord[2] - smallnum;
 +                  if (k == 0) {
 +                      /* interchange first with second atom for better
 +                       * compression of water molecules
 +                       */
 +                      tmp = thiscoord[0]; thiscoord[0] = prevcoord[0];
 +                              prevcoord[0] = tmp;
 +                      tmp = thiscoord[1]; thiscoord[1] = prevcoord[1];
 +                              prevcoord[1] = tmp;
 +                      tmp = thiscoord[2]; thiscoord[2] = prevcoord[2];
 +                              prevcoord[2] = tmp;
 +                      *lfp++ = prevcoord[0] * inv_precision;
 +                      *lfp++ = prevcoord[1] * inv_precision;
 +                      *lfp++ = prevcoord[2] * inv_precision;
 +                  } else {
 +                      prevcoord[0] = thiscoord[0];
 +                      prevcoord[1] = thiscoord[1];
 +                      prevcoord[2] = thiscoord[2];
 +                  }
 +                  *lfp++ = thiscoord[0] * inv_precision;
 +                  *lfp++ = thiscoord[1] * inv_precision;
 +                  *lfp++ = thiscoord[2] * inv_precision;
 +              }
 +          } else {
 +              *lfp++ = thiscoord[0] * inv_precision;
 +              *lfp++ = thiscoord[1] * inv_precision;
 +              *lfp++ = thiscoord[2] * inv_precision;          
 +          }
 +          smallidx += is_smaller;
 +          if (is_smaller < 0) {
 +              smallnum = smaller;
 +              if (smallidx > FIRSTIDX) {
 +                  smaller = magicints[smallidx - 1] /2;
 +              } else {
 +                  smaller = 0;
 +              }
 +          } else if (is_smaller > 0) {
 +              smaller = smallnum;
 +              smallnum = magicints[smallidx] / 2;
 +          }
 +          sizesmall[0] = sizesmall[1] = sizesmall[2] = magicints[smallidx] ;
 +      }
 +    }
 +    if (we_should_free)
 +    {
 +        free(ip);
 +        free(buf);
 +    }
 +    return 1;
 +}
 +
 +
 +
 +/******************************************************************
 +
 +  XTC files have a relatively simple structure: each frame
 +  has a header of 16 bytes, followed by the compressed
 +  coordinates of that frame. Due to the compression, the
 +  byte value 00 does not occur in the coordinate data.
 +  The first 4 bytes of the header are the magic number
 +  1995 (0x000007CB). If we find this number we are guaranteed
 +  to be in a header, because so many zero bytes cannot occur
 +  in the compressed coordinates.
 +  The second 4 bytes are the number of atoms in the frame, which
 +  is assumed to be constant. The third 4 bytes are the frame number.
 +  The last 4 bytes are a floating point representation of the time.
 +
 +********************************************************************/
 +
 +/* Must match definition in xtcio.c */
 +#ifndef XTC_MAGIC
 +#define XTC_MAGIC 1995
 +#endif
 +
 +static const int header_size = 16;
 +
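 +/* Illustrative sketch (an editor's addition, not in the original sources):
 + * the 16-byte frame header described above, assuming the 4-byte XDR integer
 + * and float encoding used throughout this file. The struct name is
 + * hypothetical; the code below reads these fields one by one with
 + * xdr_int()/xdr_float() rather than through a struct.
 + */
 +typedef struct
 +{
 +    int   magic;  /* XTC_MAGIC (1995) */
 +    int   natoms; /* number of atoms, assumed constant for the whole file */
 +    int   step;   /* frame number */
 +    float time;   /* simulation time */
 +} xtc_frame_header_sketch;
 +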
 +/* Check whether we are at the start of a header.
 +   As a side effect this also consumes one int from the stream. */
 +static int xtc_at_header_start(FILE *fp, XDR *xdrs,
 +                             int natoms, int * timestep, float * time){
 +  int i_inp[3];
 +  float f_inp[10];
 +  int i;
 +  gmx_off_t off;
 +
 +
 +  if((off = gmx_ftell(fp)) < 0){
 +    return -1;
 +  }
 +  /* read magic, natoms and timestep */
 +  for(i = 0;i<3;i++){
 +    if(!xdr_int(xdrs, &(i_inp[i]))){
 +      gmx_fseek(fp,off+XDR_INT_SIZE,SEEK_SET);
 +      return -1;
 +    }    
 +  }
 +  /* quick return */
 +  if(i_inp[0] != XTC_MAGIC){
 +    if(gmx_fseek(fp,off+XDR_INT_SIZE,SEEK_SET)){
 +      return -1;
 +    }
 +    return 0;
 +  }
 +  /* read time and box */
 +  for(i = 0;i<10;i++){
 +    if(!xdr_float(xdrs, &(f_inp[i]))){
 +      gmx_fseek(fp,off+XDR_INT_SIZE,SEEK_SET);
 +      return -1;
 +    }    
 +  }
 +  /* Make a rigorous check to see if we are at the beginning of a header.
 +     Hopefully there are no ambiguous cases. */
 +  /* This check makes use of the fact that the box matrix has 3 zeroes in the
 +     upper right triangle and that the first element must be nonzero unless
 +     the entire matrix is zero. */
 +  if(i_inp[1] == natoms && 
 +     ((f_inp[1] != 0 && f_inp[6] == 0) ||
 +      (f_inp[1] == 0 && f_inp[5] == 0 && f_inp[9] == 0))){
 +    if(gmx_fseek(fp,off+XDR_INT_SIZE,SEEK_SET)){
 +      return -1;
 +    }
 +    *time = f_inp[0];
 +    *timestep = i_inp[2];
 +    return 1;
 +  }
 +  if(gmx_fseek(fp,off+XDR_INT_SIZE,SEEK_SET)){
 +    return -1;
 +  }
 +  return 0;         
 +}
 +
 +static int 
 +xtc_get_next_frame_number(FILE *fp, XDR *xdrs, int natoms)
 +{
 +    gmx_off_t off;
 +    int step;  
 +    float time;
 +    int ret;
 +
 +    if((off = gmx_ftell(fp)) < 0){
 +      return -1;
 +    }
 +
 +    /* read one int just to make sure we don't read this frame but the next */
 +    xdr_int(xdrs,&step);
 +    while(1){
 +      ret = xtc_at_header_start(fp,xdrs,natoms,&step,&time);
 +      if(ret == 1){
 +      if(gmx_fseek(fp,off,SEEK_SET)){
 +        return -1;
 +      }
 +      return step;
 +      }else if(ret == -1){
 +      if(gmx_fseek(fp,off,SEEK_SET)){
 +        return -1;
 +      }
 +      }
 +    }
 +    return -1;
 +}
 +
 +
 +static float xtc_get_next_frame_time(FILE *fp, XDR *xdrs, int natoms,
 +                                     gmx_bool * bOK)
 +{
 +    gmx_off_t off;
 +    float time;
 +    int step;
 +    int ret;
 +    *bOK = 0;
 +
 +    if ((off = gmx_ftell(fp)) < 0)
 +    {
 +        return -1;
 +    }
 +    /* read one int just to make sure we don't read this frame but the next */
 +    xdr_int(xdrs, &step);
 +    while (1)
 +    {
 +        ret = xtc_at_header_start(fp, xdrs, natoms, &step, &time);
 +        if (ret == 1)
 +        {
 +            *bOK = 1;
 +            if (gmx_fseek(fp,off,SEEK_SET))
 +            {
 +                *bOK = 0;
 +                return -1;
 +            }
 +            return time;
 +        }
 +        else if (ret == -1)
 +        {
 +            if (gmx_fseek(fp,off,SEEK_SET))
 +            {
 +                return -1;
 +            }
 +            return -1;
 +        }
 +    }
 +    return -1;
 +}
 +
 +
 +static float 
 +xtc_get_current_frame_time(FILE *fp, XDR *xdrs, int natoms, gmx_bool * bOK)
 +{
 +    gmx_off_t off;
 +    int step;  
 +    float time;
 +    int ret;
 +    *bOK = 0;
 +
 +    if ((off = gmx_ftell(fp)) < 0)
 +    {
 +        return -1;
 +    }
 +
 +    while (1)
 +    {
 +        ret = xtc_at_header_start(fp, xdrs, natoms, &step, &time);
 +        if (ret == 1)
 +        {
 +            *bOK = 1;
 +            if (gmx_fseek(fp,off,SEEK_SET))
 +            {
 +                *bOK = 0;
 +                return -1;
 +            }
 +            return time;
 +        }
 +        else if (ret == -1)
 +        {
 +            if (gmx_fseek(fp,off,SEEK_SET))
 +            {
 +                return -1;
 +            }
 +            return -1;
 +        }
 +        else if (ret == 0)
 +        {
 +            /*Go back.*/
 +            if (gmx_fseek(fp,-2*XDR_INT_SIZE,SEEK_CUR))
 +            {
 +                return -1;
 +            }
 +        }
 +    }
 +    return -1;
 +}
 +
 +/* Currently not used, just for completeness */
 +static int 
 +xtc_get_current_frame_number(FILE *fp,XDR *xdrs,int natoms, gmx_bool * bOK)
 +{
 +    gmx_off_t off;
 +    int ret;  
 +    int step;
 +    float time;
 +    *bOK = 0;
 +    
 +    if((off = gmx_ftell(fp)) < 0){
 +      return -1;
 +    }
 +
 +
 +    while(1){
 +      ret = xtc_at_header_start(fp,xdrs,natoms,&step,&time);
 +      if(ret == 1){
 +      *bOK = 1;
 +      if(gmx_fseek(fp,off,SEEK_SET)){
 +              *bOK = 0;
 +        return -1;
 +      }
 +      return step;
 +      }else if(ret == -1){
 +      if(gmx_fseek(fp,off,SEEK_SET)){
 +        return -1;
 +      }
 +      return -1;
 +                
 +      }else if(ret == 0){
 +                /*Go back.*/
 +                if(gmx_fseek(fp,-2*XDR_INT_SIZE,SEEK_CUR)){
 +                        return -1;
 +                }
 +      }
 +    }
 +    return -1;
 +}
 +
 +
 +
 +static gmx_off_t xtc_get_next_frame_start(FILE *fp, XDR *xdrs, int natoms)
 +{
 +    int inp;
 +    gmx_off_t res;
 +    int ret;
 +    int step;
 +    float time;
 +    /* read one int just to make sure we don't read this frame but the next */
 +    xdr_int(xdrs,&step);
 +    while(1)
 +    {
 +      ret = xtc_at_header_start(fp,xdrs,natoms,&step,&time);
 +      if(ret == 1){
 +      if((res = gmx_ftell(fp)) >= 0){
 +        return res - XDR_INT_SIZE;
 +      }else{
 +        return res;
 +      }
 +      }else if(ret == -1){
 +      return -1;
 +      }
 +    }
 +    return -1;
 +}
 +
 +
 +static
 +float 
 +xdr_xtc_estimate_dt(FILE *fp, XDR *xdrs, int natoms, gmx_bool * bOK)
 +{
 +  float  res;
 +  float  tinit;
 +  gmx_off_t off;
 +  
 +  *bOK = 0;
 +  if((off   = gmx_ftell(fp)) < 0){
 +    return -1;
 +  }
 +  
 +    tinit = xtc_get_current_frame_time(fp,xdrs,natoms,bOK);
 +    
 +    if(!(*bOK))
 +    {
 +        return -1;
 +    }
 +    
 +    res = xtc_get_next_frame_time(fp,xdrs,natoms,bOK);
 +    
 +    if(!(*bOK))
 +    {
 +        return -1;
 +    }
 +    
 +    res -= tinit;
 +    if (0 != gmx_fseek(fp,off,SEEK_SET)) {
 +      *bOK = 0;
 +      return -1;
 +    }
 +    return res;
 +}
 +
 +
 +int 
 +xdr_xtc_seek_frame(int frame, FILE *fp, XDR *xdrs, int natoms)
 +{
 +    gmx_off_t low = 0;
 +    gmx_off_t high,pos;
 +
 +    
 +    int fr;
 +    gmx_off_t  offset;
 +    if(gmx_fseek(fp,0,SEEK_END)){
 +      return -1;
 +    }
 +
 +    if((high = gmx_ftell(fp)) < 0){
 +      return -1;
 +    }
 +    
 +    /* round to 4 bytes  */
 +    high /= XDR_INT_SIZE;
 +    high *= XDR_INT_SIZE;
 +    offset = ((high/2)/XDR_INT_SIZE)*XDR_INT_SIZE;
 +    
 +    if(gmx_fseek(fp,offset,SEEK_SET)){
 +      return -1;
 +    }
 +    
 +    while(1)
 +    {
 +        fr = xtc_get_next_frame_number(fp,xdrs,natoms);
 +        if(fr < 0)
 +        {
 +            return -1;
 +        }
 +        if(fr != frame && abs(low-high) > header_size)
 +        {
 +            if(fr < frame)
 +            {
 +                low = offset;      
 +            }
 +            else
 +            {
 +                high = offset;      
 +            }
 +            /* round to 4 bytes */
 +            offset = (((high+low)/2)/4)*4;
 +            
 +            if(gmx_fseek(fp,offset,SEEK_SET)){
 +            return -1;
 +          }
 +        }
 +        else
 +        {
 +            break;
 +        }
 +    }
 +    if(offset <= header_size)
 +    {
 +        offset = low;
 +    }
 +    
 +    if(gmx_fseek(fp,offset,SEEK_SET)){
 +      return -1;
 +    }
 +
 +    if((pos = xtc_get_next_frame_start(fp,xdrs,natoms)) < 0){
 +        /* we probably hit the end of the file */
 +        return -1;
 +    }
 +    
 +    if(gmx_fseek(fp,pos,SEEK_SET)){
 +      return -1;
 +    }
 +    
 +    return 0;
 +}
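 +
 +/* Illustrative usage sketch (hypothetical caller, an editor's addition):
 + * position the stream at the start of frame 100; a non-zero return means
 + * the frame could not be found. The frame can then be read with the
 + * xdr3dfcoord() routine above.
 + *
 + *     if (xdr_xtc_seek_frame(100, fp, xdrs, natoms) == 0)
 + *     {
 + *         // stream now points at the header of frame 100
 + *     }
 + */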
 +
 +     
 +
 +int xdr_xtc_seek_time(real time, FILE *fp, XDR *xdrs, int natoms,gmx_bool bSeekForwardOnly)
 +{
 +    float t;
 +    float dt;
 +    gmx_bool bOK = FALSE;
 +    gmx_off_t low = 0;
 +    gmx_off_t high, offset, pos;
 +    int res;
 +    int dt_sign = 0;
 +
 +    if (bSeekForwardOnly)
 +    {
 +        low = gmx_ftell(fp);
 +    }
 +    if (gmx_fseek(fp,0,SEEK_END))
 +    {
 +        return -1;
 +    }
 +
 +    if ((high = gmx_ftell(fp)) < 0)
 +    {
 +        return -1;
 +    }
 +    /* round to int  */
 +    high /= XDR_INT_SIZE;
 +    high *= XDR_INT_SIZE;
 +    offset = (((high-low) / 2) / XDR_INT_SIZE) * XDR_INT_SIZE;
 +
 +    if (gmx_fseek(fp,offset,SEEK_SET))
 +    {
 +        return -1;
 +    }
 +
 +    
 +    /*
 +     * No need to call xdr_xtc_estimate_dt here, since it is called first thing in the loop
 +    dt = xdr_xtc_estimate_dt(fp, xdrs, natoms, &bOK);
 +
 +    if (!bOK)
 +    {
 +        return -1;
 +    }
 +    */
 +
 +    while (1)
 +    {
 +        dt = xdr_xtc_estimate_dt(fp, xdrs, natoms, &bOK);
 +        if (!bOK)
 +        {
 +            return -1;
 +        }
 +        else
 +        {
 +            if (dt > 0)
 +            {
 +                if (dt_sign == -1)
 +                {
 +                    /* Found a place in the trajectory that has a positive
 +                     time step while another has a negative time step */
 +                    return -2;
 +                }
 +                dt_sign = 1;
 +            }
 +            else if (dt < 0)
 +            {
 +                if (dt_sign == 1)
 +                {
 +                    /* Found a place in the trajectory that has a positive
 +                     time step while another has a negative time step */
 +                    return -2;
 +                }
 +                dt_sign = -1;
 +            }
 +        }
 +        t = xtc_get_next_frame_time(fp, xdrs, natoms, &bOK);
 +        if (!bOK)
 +        {
 +            return -1;
 +        }
 +
 +        /* If we are before the target time and the time step is positive or 0,
 +         or we are after the target time and the time step is negative, or the
 +         difference between the current time and the target time is bigger than
 +         dt, and in all cases the distance between high and low is bigger than
 +         one frame, then do another step of the binary search. Otherwise stop
 +         and check whether we reached the solution. */
 +        if ((((t < time && dt_sign >= 0) || (t > time && dt_sign == -1)) || ((t
 +            - time) >= dt && dt_sign >= 0)
 +            || ((time - t) >= -dt && dt_sign < 0)) && (abs(low - high)
 +            > header_size))
 +        {
 +            if (dt >= 0 && dt_sign != -1)
 +            {
 +                if (t < time)
 +                {
 +                    low = offset;
 +                }
 +                else
 +                {
 +                    high = offset;
 +                }
 +            }
 +            else if (dt <= 0 && dt_sign == -1)
 +            {
 +                if (t >= time)
 +                {
 +                    low = offset;
 +                }
 +                else
 +                {
 +                    high = offset;
 +                }
 +            }
 +            else
 +            {
 +                /* We should never reach here */
 +                return -1;
 +            }
 +            /* round to 4 bytes */
 +            offset = (((high + low) / 2) / XDR_INT_SIZE) * XDR_INT_SIZE;
 +            if (gmx_fseek(fp,offset,SEEK_SET))
 +            {
 +                return -1;
 +            }
 +        }
 +        else
 +        {
 +            if (abs(low - high) <= header_size)
 +            {
 +                break;
 +            }
 +            /* re-estimate dt */
 +            if (xdr_xtc_estimate_dt(fp, xdrs, natoms, &bOK) != dt)
 +            {
 +                if (bOK)
 +                {
 +                    dt = xdr_xtc_estimate_dt(fp, xdrs, natoms, &bOK);
 +                }
 +            }
 +            if (t >= time && t - time < dt)
 +            {
 +                break;
 +            }
 +        }
 +    }
 +
 +    if (offset <= header_size)
 +    {
 +        offset = low;
 +    }
 +
 +    gmx_fseek(fp,offset,SEEK_SET);
 +
 +    if ((pos = xtc_get_next_frame_start(fp, xdrs, natoms)) < 0)
 +    {
 +        return -1;
 +    }
 +
 +    if (gmx_fseek(fp,pos,SEEK_SET))
 +    {
 +        return -1;
 +    }
 +    return 0;
 +}
 +
 +float 
 +xdr_xtc_get_last_frame_time(FILE *fp, XDR *xdrs, int natoms, gmx_bool * bOK)
 +{
 +    float  time;
 +    gmx_off_t  off;
 +    int res;
 +    *bOK = 1;
 +    off = gmx_ftell(fp);  
 +    if(off < 0){
 +      *bOK = 0;
 +      return -1;
 +    }
 +    
 +    if( (res = gmx_fseek(fp,-3*XDR_INT_SIZE,SEEK_END)) != 0){
 +      *bOK = 0;
 +      return -1;
 +    }
 +
 +    time = xtc_get_current_frame_time(fp, xdrs, natoms, bOK);
 +    if(!(*bOK)){
 +      return -1;
 +    }
 +    
 +    if( (res = gmx_fseek(fp,off,SEEK_SET)) != 0){
 +      *bOK = 0;
 +      return -1;
 +    } 
 +    return time;
 +}
 +
 +
 +int
 +xdr_xtc_get_last_frame_number(FILE *fp, XDR *xdrs, int natoms, gmx_bool * bOK)
 +{
 +    int    frame;
 +    gmx_off_t  off;
 +    int res;
 +    *bOK = 1;
 +    
 +    if((off = gmx_ftell(fp)) < 0){
 +      *bOK = 0;
 +      return -1;
 +    }
 +
 +    
 +    if(gmx_fseek(fp,-3*XDR_INT_SIZE,SEEK_END)){
 +      *bOK = 0;
 +      return -1;
 +    }
 +
 +    frame = xtc_get_current_frame_number(fp, xdrs, natoms, bOK);
 +    if(!(*bOK)){
 +      return -1;
 +    }
 +
 +
 +    if(gmx_fseek(fp,off,SEEK_SET)){
 +      *bOK = 0;
 +      return -1;
 +    }    
 +
 +    return frame;
 +}
index 678b402faf018a6ffc228269e64515064a545fb9,0000000000000000000000000000000000000000..baf41cdcd62bd8aaab54f006663fffcb5b6522d6
mode 100644,000000..100644
--- /dev/null
@@@ -1,562 -1,0 +1,567 @@@
-                      const char *name)
 +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
 + *
 + * 
 + *                This source code is part of
 + * 
 + *                 G   R   O   M   A   C   S
 + * 
 + *          GROningen MAchine for Chemical Simulations
 + * 
 + *                        VERSION 3.2.0
 + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
 + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 + * Copyright (c) 2001-2004, The GROMACS development team,
 + * check out http://www.gromacs.org for more information.
 +
 + * This program is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU General Public License
 + * as published by the Free Software Foundation; either version 2
 + * of the License, or (at your option) any later version.
 + * 
 + * If you want to redistribute modifications, please consider that
 + * scientific software is very special. Version control is crucial -
 + * bugs must be traceable. We will be happy to consider code for
 + * inclusion in the official distribution, but derived work must not
 + * be called official GROMACS. Details are found in the README & COPYING
 + * files - if they are missing, get the official version at www.gromacs.org.
 + * 
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the papers on the package - you can find them in the top README file.
 + * 
 + * For more info, check our website at http://www.gromacs.org
 + * 
 + * And Hey:
 + * GROningen Mixture of Alchemy and Childrens' Stories
 + */
 +#ifdef HAVE_CONFIG_H
 +#include <config.h>
 +#endif
 +#include "gromacs/utility/gmx_header_config.h"
 +
 +#include <stdio.h>
 +#include <stdlib.h>
 +#include <string.h>
 +#include <limits.h>
 +#include <time.h>
 +
 +#ifdef HAVE_SYS_TIME_H
 +#include <sys/time.h>
 +#endif
 +
 +#include "smalloc.h"
 +#include "gmx_fatal.h"
 +#include "network.h"
 +#include "main.h"
 +#include "macros.h"
 +#include "futil.h"
 +#include "filenm.h"
 +#include "gmxfio.h"
 +#include "string2.h"
 +#include "copyrite.h"
 +
 +#ifdef GMX_THREAD_MPI
 +#include "thread_mpi.h"
 +#endif
 +
 +/* The source code in this file should be thread-safe.
 +   Please keep it that way. */
 +
 +
 +#ifdef HAVE_UNISTD_H
 +#include <unistd.h>
 +#endif
 +
 +#ifdef GMX_NATIVE_WINDOWS
 +#include <process.h>
 +#endif
 +
 +
 +/* Portable version of ctime_r implemented in src/gmxlib/string2.c, but we do not want it declared in public installed headers */
 +char *
 +gmx_ctime_r(const time_t *clock,char *buf, int n);
 +
 +
 +#define BUFSIZE       1024
 +
 +
 +static void par_fn(char *base,int ftp,const t_commrec *cr,
 +                 gmx_bool bAppendSimId,gmx_bool bAppendNodeId,
 +                 char buf[],int bufsize)
 +{
 +  int n;
 +  
 +  if((size_t)bufsize<(strlen(base)+10))
 +     gmx_mem("Character buffer too small!");
 +
 +  /* Copy to buf, and strip extension */
 +  strcpy(buf,base);
 +  buf[strlen(base) - strlen(ftp2ext(fn2ftp(base))) - 1] = '\0';
 +
 +  if (bAppendSimId) {
 +    sprintf(buf+strlen(buf),"%d",cr->ms->sim);
 +  }
 +  if (bAppendNodeId) {
 +    strcat(buf,"_node");
 +    sprintf(buf+strlen(buf),"%d",cr->nodeid);
 +  }
 +  strcat(buf,".");
 +  
 +  /* Add extension again */
 +  strcat(buf,(ftp == efTPX) ? "tpr" : (ftp == efEDR) ? "edr" : ftp2ext(ftp));
 +  if (debug)
 +  {
 +      fprintf(debug, "node %d par_fn '%s'\n",cr->nodeid,buf);
 +      if (fn2ftp(buf) == efLOG)
 +      {
 +          fprintf(debug,"log\n");
 +      }
 +  }
 +}
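 +
 +/* Worked example (hypothetical values, an editor's addition): with base
 + * "md.log", bAppendSimId=FALSE, bAppendNodeId=TRUE and cr->nodeid==3,
 + * buf becomes "md_node3.log"; with bAppendSimId=TRUE and cr->ms->sim==2
 + * as well, it becomes "md2_node3.log".
 + */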
 +
 +void check_multi_int(FILE *log,const gmx_multisim_t *ms,int val,
-   if (NULL != log)
++                     const char *name,
++                     gmx_bool bQuiet)
 +{
 +  int  *ibuf,p;
 +  gmx_bool bCompatible;
 +
-       if (NULL != log)
++  if (NULL != log && !bQuiet)
 +      fprintf(log,"Multi-checking %s ... ",name);
 +  
 +  if (ms == NULL)
 +    gmx_fatal(FARGS,
 +            "check_multi_int called with a NULL communication pointer");
 +
 +  snew(ibuf,ms->nsim);
 +  ibuf[ms->sim] = val;
 +  gmx_sumi_sim(ms->nsim,ibuf,ms);
 +  
 +  bCompatible = TRUE;
 +  for(p=1; p<ms->nsim; p++)
 +    bCompatible = bCompatible && (ibuf[p-1] == ibuf[p]);
 +  
 +  if (bCompatible) 
 +  {
-                            gmx_large_int_t val, const char *name)
++      if (NULL != log && !bQuiet)
 +          fprintf(log,"OK\n");
 +  }
 +  else 
 +  {
 +      if (NULL != log)
 +      {
 +          fprintf(log,"\n%s is not equal for all subsystems\n",name);
 +          for(p=0; p<ms->nsim; p++)
 +              fprintf(log,"  subsystem %d: %d\n",p,ibuf[p]);
 +      }
 +      gmx_fatal(FARGS,"The %d subsystems are not compatible\n",ms->nsim);
 +  }
 +  
 +  sfree(ibuf);
 +}
 +
 +void check_multi_large_int(FILE *log,const gmx_multisim_t *ms,
-   if (NULL != log)
++                           gmx_large_int_t val, const char *name,
++                           gmx_bool bQuiet)
 +{
 +  gmx_large_int_t  *ibuf;
 +  int p;
 +  gmx_bool bCompatible;
 +
-       if (NULL != log)
++  if (NULL != log && !bQuiet)
 +      fprintf(log,"Multi-checking %s ... ",name);
 +  
 +  if (ms == NULL)
 +    gmx_fatal(FARGS,
 +            "check_multi_int called with a NULL communication pointer");
 +
 +  snew(ibuf,ms->nsim);
 +  ibuf[ms->sim] = val;
 +  gmx_sumli_sim(ms->nsim,ibuf,ms);
 +  
 +  bCompatible = TRUE;
 +  for(p=1; p<ms->nsim; p++)
 +    bCompatible = bCompatible && (ibuf[p-1] == ibuf[p]);
 +  
 +  if (bCompatible) 
 +  {
-   fprintf(stderr,"NODEID=%d argc=%d\n",cr->nodeid,*argc);
++      if (NULL != log && !bQuiet)
 +          fprintf(log,"OK\n");
 +  }
 +  else 
 +  {
 +      if (NULL != log)
 +      {
 +          fprintf(log,"\n%s is not equal for all subsystems\n",name);
 +          for(p=0; p<ms->nsim; p++)
 +          {
 +              char strbuf[255];
 +              /* first make the format string */
 +              snprintf(strbuf, 255, "  subsystem %%d: %s\n", 
 +                       gmx_large_int_pfmt);
 +              fprintf(log,strbuf,p,ibuf[p]);
 +          }
 +      }
 +      gmx_fatal(FARGS,"The %d subsystems are not compatible\n",ms->nsim);
 +  }
 +  
 +  sfree(ibuf);
 +}
 +
 +
 +char *gmx_gethostname(char *name, size_t len)
 +{
 +    if (len < 8)
 +    {
 +        gmx_incons("gmx_gethostname called with len<8");
 +    }
 +#ifdef HAVE_UNISTD_H
 +    if (gethostname(name, len-1) != 0)
 +    {
 +        strncpy(name, "unknown",8);
 +    }
 +#else
 +    strncpy(name, "unknown",8);
 +#endif
 +
 +    return name;
 +}
 +
 +
 +void gmx_log_open(const char *lognm,const t_commrec *cr,gmx_bool bMasterOnly, 
 +                  gmx_bool bAppendFiles, FILE** fplog)
 +{
 +    int  len,testlen,pid;
 +    char buf[256],host[256];
 +    time_t t;
 +    char timebuf[STRLEN];
 +    FILE *fp=*fplog;
 +    char *tmpnm;
 +  
 +    debug_gmx();
 +  
 +    /* Communicate the filename for logfile */
 +    if (cr->nnodes > 1 && !bMasterOnly
 +#ifdef GMX_THREAD_MPI
 +        /* With thread MPI the non-master log files are opened later
 +         * when the files names are already known on all nodes.
 +         */
 +        && FALSE
 +#endif
 +        )
 +    {
 +        if (MASTER(cr))
 +        {
 +            len = strlen(lognm) + 1;
 +        }
 +        gmx_bcast(sizeof(len),&len,cr);
 +        if (!MASTER(cr))
 +        {
 +            snew(tmpnm,len+8);
 +        }
 +        else
 +        {
 +            tmpnm=gmx_strdup(lognm);
 +        }
 +        gmx_bcast(len*sizeof(*tmpnm),tmpnm,cr);
 +    }
 +    else
 +    {
 +        tmpnm=gmx_strdup(lognm);
 +    }
 +  
 +    debug_gmx();
 +
 +    if (!bMasterOnly && !MASTER(cr))
 +    {
 +        /* Since log always ends with '.log' let's use this info */
 +        par_fn(tmpnm,efLOG,cr,FALSE,!bMasterOnly,buf,255);
 +        fp = gmx_fio_fopen(buf, bAppendFiles ? "a+" : "w+" );
 +    }
 +    else if (!bAppendFiles)
 +    {
 +        fp = gmx_fio_fopen(tmpnm, bAppendFiles ? "a+" : "w+" );
 +    }
 +
 +    sfree(tmpnm);
 +
 +    gmx_fatal_set_log_file(fp);
 +  
 +    /* Get some machine parameters */
 +    gmx_gethostname(host,256);
 +
 +    time(&t);
 +
 +#ifndef NO_GETPID
 +#   ifdef GMX_NATIVE_WINDOWS
 +    pid = _getpid();
 +#   else
 +    pid = getpid();
 +#   endif
 +#else
 +      pid = 0;
 +#endif
 +
 +    if (bAppendFiles)
 +    {
 +        fprintf(fp,
 +                "\n"
 +                "\n"
 +                "-----------------------------------------------------------\n"
 +                "Restarting from checkpoint, appending to previous log file.\n"
 +                "\n"
 +            );
 +    }
 +      
 +    gmx_ctime_r(&t,timebuf,STRLEN);
 +
 +    fprintf(fp,
 +            "Log file opened on %s"
 +            "Host: %s  pid: %d  nodeid: %d  nnodes:  %d\n",
 +            timebuf,host,pid,cr->nodeid,cr->nnodes);
 +    gmx_print_version_info(fp);
 +    fprintf(fp, "\n\n");
 +
 +    fflush(fp);
 +    debug_gmx();
 +
 +    *fplog = fp;
 +}
 +
 +void gmx_log_close(FILE *fp)
 +{
 +  if (fp) {
 +    gmx_fatal_set_log_file(NULL);
 +    gmx_fio_fclose(fp);
 +  }
 +}
 +
 +static void comm_args(const t_commrec *cr,int *argc,char ***argv)
 +{
 +  int i,len;
 +  
 +  if (PAR(cr))
 +    gmx_bcast(sizeof(*argc),argc,cr);
 +  
 +  if (!MASTER(cr))
 +    snew(*argv,*argc+1);
++  if (debug)
++  {
++      fprintf(debug,"NODEID=%d argc=%d\n",cr->nodeid,*argc);
++  }
 +  for(i=0; (i<*argc); i++) {
 +    if (MASTER(cr))
 +      len = strlen((*argv)[i])+1;
 +    gmx_bcast(sizeof(len),&len,cr);
 +    if (!MASTER(cr))
 +      snew((*argv)[i],len);
 +    /*gmx_bcast(len*sizeof((*argv)[i][0]),(*argv)[i],cr);*/
 +    gmx_bcast(len*sizeof(char),(*argv)[i],cr);
 +  }
 +  debug_gmx();
 +}
 +
 +void init_multisystem(t_commrec *cr,int nsim, char **multidirs,
 +                      int nfile, const t_filenm fnm[],gmx_bool bParFn)
 +{
 +    gmx_multisim_t *ms;
 +    int  nnodes,nnodpersim,sim,i,ftp;
 +    char buf[256];
 +#ifdef GMX_MPI
 +    MPI_Group mpi_group_world;
 +#endif  
 +    int *rank;
 +
 +#ifndef GMX_MPI
 +    if (nsim > 1)
 +    {
 +        gmx_fatal(FARGS,"This binary is compiled without MPI support, can not do multiple simulations.");
 +    }
 +#endif
 +
 +    nnodes  = cr->nnodes;
 +    if (nnodes % nsim != 0)
 +    {
 +        gmx_fatal(FARGS,"The number of nodes (%d) is not a multiple of the number of simulations (%d)",nnodes,nsim);
 +    }
 +
 +    nnodpersim = nnodes/nsim;
 +    sim = cr->nodeid/nnodpersim;
 +
 +    if (debug)
 +    {
 +        fprintf(debug,"We have %d simulations, %d nodes per simulation, local simulation is %d\n",nsim,nnodpersim,sim);
 +    }
 +
 +    snew(ms,1);
 +    cr->ms = ms;
 +    ms->nsim = nsim;
 +    ms->sim  = sim;
 +#ifdef GMX_MPI
 +    /* Create a communicator for the master nodes */
 +    snew(rank,ms->nsim);
 +    for(i=0; i<ms->nsim; i++)
 +    {
 +        rank[i] = i*nnodpersim;
 +    }
 +    MPI_Comm_group(MPI_COMM_WORLD,&mpi_group_world);
 +    MPI_Group_incl(mpi_group_world,nsim,rank,&ms->mpi_group_masters);
 +    sfree(rank);
 +    MPI_Comm_create(MPI_COMM_WORLD,ms->mpi_group_masters,
 +                    &ms->mpi_comm_masters);
 +
 +#if !defined(GMX_THREAD_MPI) && !defined(MPI_IN_PLACE_EXISTS)
 +    /* initialize the MPI_IN_PLACE replacement buffers */
 +    snew(ms->mpb, 1);
 +    ms->mpb->ibuf=NULL;
 +    ms->mpb->libuf=NULL;
 +    ms->mpb->fbuf=NULL;
 +    ms->mpb->dbuf=NULL;
 +    ms->mpb->ibuf_alloc=0;
 +    ms->mpb->libuf_alloc=0;
 +    ms->mpb->fbuf_alloc=0;
 +    ms->mpb->dbuf_alloc=0;
 +#endif
 +
 +#endif
 +
 +    /* Reduce the intra-simulation communication */
 +    cr->sim_nodeid = cr->nodeid % nnodpersim;
 +    cr->nnodes = nnodpersim;
 +#ifdef GMX_MPI
 +    MPI_Comm_split(MPI_COMM_WORLD,sim,cr->sim_nodeid,&cr->mpi_comm_mysim);
 +    cr->mpi_comm_mygroup = cr->mpi_comm_mysim;
 +    cr->nodeid = cr->sim_nodeid;
 +#endif
 +
 +    if (debug)
 +    {
 +        fprintf(debug,"This is simulation %d",cr->ms->sim);
 +        if (PAR(cr))
 +        {
 +            fprintf(debug,", local number of nodes %d, local nodeid %d",
 +                    cr->nnodes,cr->sim_nodeid);
 +        }
 +        fprintf(debug,"\n\n");
 +    }
 +
 +    if (multidirs)
 +    {
 +        int ret;
 +        if (debug)
 +        {
 +            fprintf(debug,"Changing to directory %s\n",multidirs[cr->ms->sim]);
 +        }
 +        gmx_chdir(multidirs[cr->ms->sim]);
 +    }
 +    else if (bParFn)
 +    {
 +        /* Patch output and tpx, cpt and rerun input file names */
 +        for(i=0; (i<nfile); i++)
 +        {
 +            /* Because of possible multiple extensions per type we must look 
 +             * at the actual file name 
 +             */
 +            if (is_output(&fnm[i]) ||
 +                fnm[i].ftp == efTPX || fnm[i].ftp == efCPT ||
 +                strcmp(fnm[i].opt,"-rerun") == 0)
 +            {
 +                ftp = fn2ftp(fnm[i].fns[0]);
 +                par_fn(fnm[i].fns[0],ftp,cr,TRUE,FALSE,buf,255);
 +                sfree(fnm[i].fns[0]);
 +                fnm[i].fns[0] = gmx_strdup(buf);
 +            }
 +        }
 +    }
 +}
 +
 +t_commrec *init_par(int *argc,char ***argv_ptr)
 +{
 +    t_commrec *cr;
 +    char      **argv;
 +    int       i;
 +    gmx_bool      pe=FALSE;
 +
 +    snew(cr,1);
 +
 +    argv = argv_ptr ? *argv_ptr : NULL;
 +
 +#if defined GMX_MPI && !defined GMX_THREAD_MPI
 +    cr->sim_nodeid = gmx_setup(argc,argv,&cr->nnodes);
 +
 +    if (!PAR(cr) && (cr->sim_nodeid != 0))
 +    {
 +        gmx_comm("(!PAR(cr) && (cr->sim_nodeid != 0))");
 +    }
 +
 +    cr->mpi_comm_mysim   = MPI_COMM_WORLD;
 +    cr->mpi_comm_mygroup = cr->mpi_comm_mysim;
 +#else
 +    /* These should never be accessed */
 +    cr->mpi_comm_mysim   = NULL;
 +    cr->mpi_comm_mygroup = NULL;
 +    cr->nnodes           = 1;
 +    cr->sim_nodeid       = 0;
 +#endif
 +
 +    cr->nodeid = cr->sim_nodeid;
 +
 +    cr->duty = (DUTY_PP | DUTY_PME);
 +
 +    /* Communicate arguments if parallel */
 +#ifndef GMX_THREAD_MPI
 +    if (PAR(cr))
 +    {
 +        comm_args(cr,argc,argv_ptr);
 +    }
 +#endif /* GMX_THREAD_MPI */
 +
 +#ifdef GMX_MPI
 +#if !defined(GMX_THREAD_MPI) && !defined(MPI_IN_PLACE_EXISTS)
 +  /* initialize the MPI_IN_PLACE replacement buffers */
 +  snew(cr->mpb, 1);
 +  cr->mpb->ibuf=NULL;
 +  cr->mpb->libuf=NULL;
 +  cr->mpb->fbuf=NULL;
 +  cr->mpb->dbuf=NULL;
 +  cr->mpb->ibuf_alloc=0;
 +  cr->mpb->libuf_alloc=0;
 +  cr->mpb->fbuf_alloc=0;
 +  cr->mpb->dbuf_alloc=0;
 +#endif
 +#endif
 +
 +    return cr;
 +}
 +
 +t_commrec *init_par_threads(const t_commrec *cro)
 +{
 +#ifdef GMX_THREAD_MPI
 +    int initialized;
 +    t_commrec *cr;
 +
 +    /* make a thread-specific commrec */
 +    snew(cr,1);
 +    /* now copy the whole thing, so settings like the number of PME nodes
 +       get propagated. */
 +    *cr=*cro;
 +
 +    /* and we start setting our own thread-specific values for things */
 +    MPI_Initialized(&initialized);
 +    if (!initialized)
 +    {
 +        gmx_comm("Initializing threads without comm");
 +    }
 +    /* once threads are used together with MPI, we'll
 +       fill the cr structure with distinct data here. This might even work: */
 +    cr->sim_nodeid = gmx_setup(0,NULL, &cr->nnodes);
 +
 +    cr->mpi_comm_mysim = MPI_COMM_WORLD;
 +    cr->mpi_comm_mygroup = cr->mpi_comm_mysim;
 +    cr->nodeid = cr->sim_nodeid;
 +    cr->duty = (DUTY_PP | DUTY_PME);
 +
 +    return cr;
 +#else
 +    return NULL;
 +#endif
 +}
index a35ffd5de7e21289f778ae9ce740d2f91b6c1836,0000000000000000000000000000000000000000..bbae1df69c79a73d71297276bf43988fe7d16854
mode 100644,000000..100644
--- /dev/null
@@@ -1,1129 -1,0 +1,1131 @@@
 +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
 + *
 + * 
 + * This file is part of Gromacs        Copyright (c) 1991-2008
 + * David van der Spoel, Erik Lindahl, Berk Hess, University of Groningen.
 + *
 + * This program is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU General Public License
 + * as published by the Free Software Foundation; either version 2
 + * of the License, or (at your option) any later version.
 + *
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the research papers on the package. Check out http://www.gromacs.org
 + * 
 + * And Hey:
 + * Gnomes, ROck Monsters And Chili Sauce
 + */
 +
 +#ifdef HAVE_CONFIG_H
 +#include <config.h>
 +#endif
 +
 +#include <string.h>
 +#include "smalloc.h"
 +#include "typedefs.h"
 +#include "mtop_util.h"
 +#include "topsort.h"
 +#include "symtab.h"
 +#include "gmx_fatal.h"
 +
 +static int gmx_mtop_maxresnr(const gmx_mtop_t *mtop,int maxres_renum)
 +{
 +    int maxresnr,mt,r;
 +    const t_atoms *atoms;
 +
 +    maxresnr = 0;
 +
 +    for(mt=0; mt<mtop->nmoltype; mt++)
 +    {
 +        atoms = &mtop->moltype[mt].atoms;
 +        if (atoms->nres > maxres_renum)
 +        {
 +            for(r=0; r<atoms->nres; r++)
 +            {
 +                if (atoms->resinfo[r].nr > maxresnr)
 +                {
 +                    maxresnr = atoms->resinfo[r].nr;
 +                }
 +            }
 +        }
 +    }
 +
 +    return maxresnr;
 +}
 +
 +void gmx_mtop_finalize(gmx_mtop_t *mtop)
 +{
 +    char *env;
 +
 +    mtop->maxres_renum = 1;
 +    
 +    env = getenv("GMX_MAXRESRENUM");
 +    if (env != NULL)
 +    {
 +        sscanf(env,"%d",&mtop->maxres_renum);
 +    }
 +    if (mtop->maxres_renum == -1)
 +    {
 +        /* -1 signals renumber residues in all molecules */
 +        mtop->maxres_renum = INT_MAX;
 +    }
 +
 +    mtop->maxresnr = gmx_mtop_maxresnr(mtop,mtop->maxres_renum);
 +}
 +
 +int ncg_mtop(const gmx_mtop_t *mtop)
 +{
 +    int ncg;
 +    int mb;
 +    
 +    ncg = 0;
 +    for(mb=0; mb<mtop->nmolblock; mb++)
 +    {
 +        ncg +=
 +            mtop->molblock[mb].nmol*
 +            mtop->moltype[mtop->molblock[mb].type].cgs.nr;
 +    }
 +    
 +    return ncg;
 +}
 +
 +void gmx_mtop_remove_chargegroups(gmx_mtop_t *mtop)
 +{
 +    int mt;
 +    t_block *cgs;
 +    int i;
 +
 +    for(mt=0; mt<mtop->nmoltype; mt++)
 +    {
 +        cgs = &mtop->moltype[mt].cgs;
 +        if (cgs->nr < mtop->moltype[mt].atoms.nr)
 +        {
 +            cgs->nr = mtop->moltype[mt].atoms.nr;
 +            srenew(cgs->index,cgs->nr+1);
 +            for(i=0; i<cgs->nr+1; i++)
 +            {
 +                cgs->index[i] = i;
 +            }
 +        }
 +    }
 +}
 +
 +
 +typedef struct
 +{
 +    int a_start;
 +    int a_end;
 +    int na_mol;
 +} mb_at_t;
 +
 +typedef struct gmx_mtop_atomlookup
 +{
 +    const gmx_mtop_t *mtop;
 +    int     nmb;
 +    int     mb_start;
 +    mb_at_t *mba;
 +} t_gmx_mtop_atomlookup;
 +
 +
 +gmx_mtop_atomlookup_t
 +gmx_mtop_atomlookup_init(const gmx_mtop_t *mtop)
 +{
 +    t_gmx_mtop_atomlookup *alook;
 +    int mb;
 +    int a_start,a_end,na,na_start=-1;
 +
 +    snew(alook,1);
 +
 +    alook->mtop     = mtop;
 +    alook->nmb      = mtop->nmolblock;
 +    alook->mb_start = 0;
 +    snew(alook->mba,alook->nmb);
 +
 +    a_start = 0;
 +    for(mb=0; mb<mtop->nmolblock; mb++)
 +    {
 +        na    = mtop->molblock[mb].nmol*mtop->molblock[mb].natoms_mol;
 +        a_end = a_start + na;
 +
 +        alook->mba[mb].a_start = a_start;
 +        alook->mba[mb].a_end   = a_end;
 +        alook->mba[mb].na_mol  = mtop->molblock[mb].natoms_mol;
 +
 +        /* We start the binary search with the largest block */
 +        if (mb == 0 || na > na_start)
 +        {
 +            alook->mb_start = mb;
 +            na_start        = na;
 +        }
 +
 +        a_start = a_end;
 +    }
 +
 +    return alook;
 +}
 +
 +gmx_mtop_atomlookup_t
 +gmx_mtop_atomlookup_settle_init(const gmx_mtop_t *mtop)
 +{
 +     t_gmx_mtop_atomlookup *alook;
 +     int mb;
 +     int na,na_start=-1;
 +
 +     alook = gmx_mtop_atomlookup_init(mtop);
 +
 +     /* Check if the starting molblock has settle */
 +     if (mtop->moltype[mtop->molblock[alook->mb_start].type].ilist[F_SETTLE].nr  == 0)
 +     {
 +         /* Search the largest molblock with settle */
 +         alook->mb_start = -1;
 +         for(mb=0; mb<mtop->nmolblock; mb++)
 +         {
 +             if (mtop->moltype[mtop->molblock[mb].type].ilist[F_SETTLE].nr > 0)
 +             {
 +                 na = alook->mba[mb].a_end - alook->mba[mb].a_start;
 +                 if (alook->mb_start == -1 || na > na_start)
 +                 {
 +                     alook->mb_start = mb;
 +                     na_start        = na;
 +                 }
 +             }
 +         }
 +
 +         if (alook->mb_start == -1)
 +         {
 +             gmx_incons("gmx_mtop_atomlookup_settle_init called without settles");
 +         }
 +     }
 +
 +     return alook;
 +}
 +
 +void
 +gmx_mtop_atomlookup_destroy(gmx_mtop_atomlookup_t alook)
 +{
 +    sfree(alook->mba);
 +    sfree(alook);
 +}
 +
 +void gmx_mtop_atomnr_to_atom(const gmx_mtop_atomlookup_t alook,
 +                             int atnr_global,
 +                             t_atom **atom)
 +{
 +    int mb0,mb1,mb;
 +    int a_start,atnr_mol;
 +
 +#ifdef DEBUG_MTOP
 +    if (atnr_global < 0 || atnr_global >= alook->mtop->natoms)
 +    {
 +        gmx_fatal(FARGS,"gmx_mtop_atomnr_to_atom was called with atnr_global=%d which is not in the atom range of this system (%d-%d)",
 +                  atnr_global,0,alook->mtop->natoms-1);
 +    }
 +#endif
 +
 +    mb0 = -1;
 +    mb1 = alook->nmb;
 +    mb  = alook->mb_start;
 +        
 +    while (TRUE)
 +    {
 +        a_start = alook->mba[mb].a_start;
 +        if (atnr_global < a_start)
 +        {
 +            mb1 = mb;
 +        }
 +        else if (atnr_global >= alook->mba[mb].a_end)
 +        {
 +            mb0 = mb;
 +        }
 +        else
 +        {
 +            break;
 +        }
 +        mb = ((mb0 + mb1 + 1)>>1);
 +    }
 +    
 +    atnr_mol = (atnr_global - a_start) % alook->mba[mb].na_mol;
 +
 +    *atom = &alook->mtop->moltype[alook->mtop->molblock[mb].type].atoms.atom[atnr_mol];
 +}
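 +
 +/* Illustrative usage sketch (hypothetical caller, an editor's addition):
 + *
 + *     gmx_mtop_atomlookup_t alook = gmx_mtop_atomlookup_init(mtop);
 + *     t_atom *atom;
 + *     gmx_mtop_atomnr_to_atom(alook, atnr_global, &atom);
 + *     // atom now points at the t_atom entry for global atom atnr_global
 + *     gmx_mtop_atomlookup_destroy(alook);
 + */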
 +
 +void gmx_mtop_atomnr_to_ilist(const gmx_mtop_atomlookup_t alook,
 +                              int atnr_global,
 +                              t_ilist **ilist_mol,int *atnr_offset)
 +{
 +    int mb0,mb1,mb;
 +    int a_start,atnr_local;
 +
 +#ifdef DEBUG_MTOP
 +    if (atnr_global < 0 || atnr_global >= alook->mtop->natoms)
 +    {
 +        gmx_fatal(FARGS,"gmx_mtop_atomnr_to_ilist was called with atnr_global=%d which is not in the atom range of this system (%d-%d)",
 +                  atnr_global,0,alook->mtop->natoms-1);
 +    }
 +#endif
 +
 +    mb0 = -1;
 +    mb1 = alook->nmb;
 +    mb  = alook->mb_start;
 +        
 +    while (TRUE)
 +    {
 +        a_start = alook->mba[mb].a_start;
 +        if (atnr_global < a_start)
 +        {
 +            mb1 = mb;
 +        }
 +        else if (atnr_global >= alook->mba[mb].a_end)
 +        {
 +            mb0 = mb;
 +        }
 +        else
 +        {
 +            break;
 +        }
 +        mb = ((mb0 + mb1 + 1)>>1);
 +    }
 +
 +    *ilist_mol = alook->mtop->moltype[alook->mtop->molblock[mb].type].ilist;
 +    
 +    atnr_local = (atnr_global - a_start) % alook->mba[mb].na_mol;
 +
 +    *atnr_offset = atnr_global - atnr_local;
 +}
 +
 +void gmx_mtop_atomnr_to_molblock_ind(const gmx_mtop_atomlookup_t alook,
 +                                     int atnr_global,
 +                                     int *molb,int *molnr,int *atnr_mol)
 +{
 +    int mb0,mb1,mb;
 +    int a_start;
 +
 +#ifdef DEBUG_MTOP
 +    if (atnr_global < 0 || atnr_global >= alook->mtop->natoms)
 +    {
 +        gmx_fatal(FARGS,"gmx_mtop_atomnr_to_molblock_ind was called with atnr_global=%d which is not in the atom range of this system (%d-%d)",
 +                  atnr_global,0,alook->mtop->natoms-1);
 +    }
 +#endif
 +
 +    mb0 = -1;
 +    mb1 = alook->nmb;
 +    mb  = alook->mb_start;
 +        
 +    while (TRUE)
 +    {
 +        a_start = alook->mba[mb].a_start;
 +        if (atnr_global < a_start)
 +        {
 +            mb1 = mb;
 +        }
 +        else if (atnr_global >= alook->mba[mb].a_end)
 +        {
 +            mb0 = mb;
 +        }
 +        else
 +        {
 +            break;
 +        }
 +        mb = ((mb0 + mb1 + 1)>>1);
 +    }
 +
 +    *molb  = mb;
 +    *molnr = (atnr_global - a_start) / alook->mba[mb].na_mol;
 +    *atnr_mol = atnr_global - a_start - (*molnr)*alook->mba[mb].na_mol;
 +}
 +
 +void gmx_mtop_atominfo_global(const gmx_mtop_t *mtop,int atnr_global,
 +                              char **atomname,int *resnr,char **resname)
 +{
 +    int mb,a_start,a_end,maxresnr,at_loc;
 +    gmx_molblock_t *molb;
 +    t_atoms *atoms=NULL;
 +    
 +    if (atnr_global < 0 || atnr_global >= mtop->natoms)
 +    {
 +        gmx_fatal(FARGS,"gmx_mtop_atominfo_global was called with atnr_global=%d which is not in the atom range of this system (%d-%d)",
 +                  atnr_global,0,mtop->natoms-1);
 +    }
 +    
 +    mb = -1;
 +    a_end = 0;
 +    maxresnr = mtop->maxresnr;
 +    do
 +    {
 +        if (mb >= 0)
 +        {
 +            if (atoms->nres <= mtop->maxres_renum)
 +            {
 +                /* Single residue molecule, keep counting */
 +                maxresnr += mtop->molblock[mb].nmol*atoms->nres;
 +            }
 +        }
 +        mb++;
 +        atoms = &mtop->moltype[mtop->molblock[mb].type].atoms;
 +        a_start = a_end;
 +        a_end = a_start + mtop->molblock[mb].nmol*atoms->nr;
 +    }
 +    while (atnr_global >= a_end);
 +
 +    at_loc = (atnr_global - a_start) % atoms->nr;
 +    *atomname = *(atoms->atomname[at_loc]);
 +    if (atoms->nres > mtop->maxres_renum)
 +    {
 +        *resnr = atoms->resinfo[atoms->atom[at_loc].resind].nr;
 +    }
 +    else
 +    {
 +        /* Single residue molecule, keep counting */
 +        *resnr = maxresnr + 1 + (atnr_global - a_start)/atoms->nr*atoms->nres + atoms->atom[at_loc].resind;
 +    }
 +    *resname  = *(atoms->resinfo[atoms->atom[at_loc].resind].name);
 +}
 +
 +typedef struct gmx_mtop_atomloop_all
 +{
 +    const gmx_mtop_t *mtop;
 +    int        mblock;
 +    t_atoms    *atoms;
 +    int        mol;
 +    int        maxresnr;
 +    int        at_local;
 +    int        at_global;
 +} t_gmx_mtop_atomloop_all;
 +
 +gmx_mtop_atomloop_all_t
 +gmx_mtop_atomloop_all_init(const gmx_mtop_t *mtop)
 +{
 +    struct gmx_mtop_atomloop_all *aloop;
 +
 +    snew(aloop,1);
 +
 +    aloop->mtop         = mtop;
 +    aloop->mblock       = 0;
 +    aloop->atoms        =
 +        &mtop->moltype[mtop->molblock[aloop->mblock].type].atoms;
 +    aloop->mol          = 0;
 +    aloop->maxresnr     = mtop->maxresnr;
 +    aloop->at_local     = -1;
 +    aloop->at_global    = -1;
 +
 +    return aloop;
 +}
 +
 +static void gmx_mtop_atomloop_all_destroy(gmx_mtop_atomloop_all_t aloop)
 +{
 +    sfree(aloop);
 +}
 +
 +gmx_bool gmx_mtop_atomloop_all_next(gmx_mtop_atomloop_all_t aloop,
 +                                int *at_global,t_atom **atom)
 +{
 +    if (aloop == NULL)
 +    {
 +        gmx_incons("gmx_mtop_atomloop_all_next called without calling gmx_mtop_atomloop_all_init");
 +    }
 +
 +    aloop->at_local++;
 +    aloop->at_global++;
 +
 +    if (aloop->at_local >= aloop->atoms->nr)
 +    {
 +        if (aloop->atoms->nres <= aloop->mtop->maxres_renum)
 +        {
 +            /* Single residue molecule, increase the count with one */
 +            aloop->maxresnr += aloop->atoms->nres;
 +        }
 +        aloop->mol++;
 +        aloop->at_local = 0;
 +        if (aloop->mol >= aloop->mtop->molblock[aloop->mblock].nmol)
 +        {
 +            aloop->mblock++;
 +            if (aloop->mblock >= aloop->mtop->nmolblock)
 +            {
 +                gmx_mtop_atomloop_all_destroy(aloop);
 +                return FALSE;
 +            }
 +            aloop->atoms = &aloop->mtop->moltype[aloop->mtop->molblock[aloop->mblock].type].atoms;
 +            aloop->mol = 0;
 +        }
 +    }
 +
 +    *at_global = aloop->at_global;
 +    *atom      = &aloop->atoms->atom[aloop->at_local];
 +
 +    return TRUE;
 +}
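 +
 +/* Illustrative usage sketch (hypothetical caller, an editor's addition);
 + * note that the loop structure frees itself when _next() returns FALSE:
 + *
 + *     int at_global;
 + *     t_atom *atom;
 + *     gmx_mtop_atomloop_all_t aloop = gmx_mtop_atomloop_all_init(mtop);
 + *     while (gmx_mtop_atomloop_all_next(aloop, &at_global, &atom))
 + *     {
 + *         // visit every atom in the system exactly once
 + *     }
 + */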
 +
 +void gmx_mtop_atomloop_all_names(gmx_mtop_atomloop_all_t aloop,
 +                                 char **atomname,int *resnr,char **resname)
 +{
 +    int resind_mol;
 +
 +    *atomname = *(aloop->atoms->atomname[aloop->at_local]);
 +    resind_mol = aloop->atoms->atom[aloop->at_local].resind;
 +    *resnr = aloop->atoms->resinfo[resind_mol].nr;
 +    if (aloop->atoms->nres <= aloop->mtop->maxres_renum)
 +    {
 +        *resnr = aloop->maxresnr + 1 + resind_mol;
 +    }
 +    *resname  = *(aloop->atoms->resinfo[resind_mol].name);
 +}
 +
 +void gmx_mtop_atomloop_all_moltype(gmx_mtop_atomloop_all_t aloop,
 +                                   gmx_moltype_t **moltype,int *at_mol)
 +{
 +    *moltype = &aloop->mtop->moltype[aloop->mtop->molblock[aloop->mblock].type];
 +    *at_mol  = aloop->at_local;
 +}
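/* Illustrative usage sketch (editor's addition, not part of this patch):
 * the intended pattern for the "all" atom loop, mirroring its use in
 * gen_local_top() further below. The iterator frees itself when _next()
 * returns FALSE, so there is no public destroy call to make.
 */
{
    gmx_mtop_atomloop_all_t aloop;
    int    at_global;
    t_atom *atom;
    real   qtot = 0;

    aloop = gmx_mtop_atomloop_all_init(mtop);
    while (gmx_mtop_atomloop_all_next(aloop,&at_global,&atom))
    {
        /* e.g. sum up the charges of the whole system */
        qtot += atom->q;
    }
}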
 +
 +typedef struct gmx_mtop_atomloop_block
 +{
 +    const gmx_mtop_t *mtop;
 +    int        mblock;
 +    t_atoms    *atoms;
 +    int        at_local;
 +} t_gmx_mtop_atomloop_block;
 +
 +gmx_mtop_atomloop_block_t
 +gmx_mtop_atomloop_block_init(const gmx_mtop_t *mtop)
 +{
 +    struct gmx_mtop_atomloop_block *aloop;
 +
 +    snew(aloop,1);
 +
 +    aloop->mtop      = mtop;
 +    aloop->mblock    = 0;
 +    aloop->atoms     = &mtop->moltype[mtop->molblock[aloop->mblock].type].atoms;
 +    aloop->at_local  = -1;
 +
 +    return aloop;
 +}
 +
 +static void gmx_mtop_atomloop_block_destroy(gmx_mtop_atomloop_block_t aloop)
 +{
 +    sfree(aloop);
 +}
 +
 +gmx_bool gmx_mtop_atomloop_block_next(gmx_mtop_atomloop_block_t aloop,
 +                                  t_atom **atom,int *nmol)
 +{
 +    if (aloop == NULL)
 +    {
 +        gmx_incons("gmx_mtop_atomloop_block_next called without calling gmx_mtop_atomloop_block_init");
 +    }
 +
 +    aloop->at_local++;
 +
 +    if (aloop->at_local >= aloop->atoms->nr)
 +    {
 +        aloop->mblock++;
 +        if (aloop->mblock >= aloop->mtop->nmolblock)
 +        {
 +            gmx_mtop_atomloop_block_destroy(aloop);
 +            return FALSE;
 +        }
 +        aloop->atoms = &aloop->mtop->moltype[aloop->mtop->molblock[aloop->mblock].type].atoms;
 +        aloop->at_local = 0;
 +    }
 +    
 +    *atom = &aloop->atoms->atom[aloop->at_local];
 +    *nmol = aloop->mtop->molblock[aloop->mblock].nmol;
 +   
 +    return TRUE;
 +}
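/* Illustrative usage sketch (editor's addition, not part of this patch):
 * the "block" loop visits each atom of each molecule type once and
 * reports the molecule count, which makes per-type sums cheap, e.g. a
 * total mass over a populated gmx_mtop_t *mtop:
 */
{
    gmx_mtop_atomloop_block_t bloop;
    t_atom *atom;
    int    nmol;
    double mtot = 0;

    bloop = gmx_mtop_atomloop_block_init(mtop);
    while (gmx_mtop_atomloop_block_next(bloop,&atom,&nmol))
    {
        mtot += nmol*atom->m;
    }
}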
 +
 +typedef struct gmx_mtop_ilistloop
 +{
 +    const gmx_mtop_t *mtop;
 +    int           mblock;
 +} t_gmx_mtop_ilist;
 +
 +gmx_mtop_ilistloop_t
 +gmx_mtop_ilistloop_init(const gmx_mtop_t *mtop)
 +{
 +    struct gmx_mtop_ilistloop *iloop;
 +
 +    snew(iloop,1);
 +
 +    iloop->mtop      = mtop;
 +    iloop->mblock    = -1;
 +
 +    return iloop;
 +}
 +
 +static void gmx_mtop_ilistloop_destroy(gmx_mtop_ilistloop_t iloop)
 +{
 +    sfree(iloop);
 +}
 +
 +gmx_bool gmx_mtop_ilistloop_next(gmx_mtop_ilistloop_t iloop,
 +                             t_ilist **ilist_mol,int *nmol)
 +{
 +    if (iloop == NULL)
 +    {
 +        gmx_incons("gmx_mtop_ilistloop_next called without calling gmx_mtop_ilistloop_init");
 +    }
 +
 +    iloop->mblock++;
 +    if (iloop->mblock == iloop->mtop->nmolblock)
 +    {
 +        gmx_mtop_ilistloop_destroy(iloop);
 +        return FALSE;
 +    }
 +
 +    *ilist_mol =
 +        iloop->mtop->moltype[iloop->mtop->molblock[iloop->mblock].type].ilist;
 +
 +    *nmol = iloop->mtop->molblock[iloop->mblock].nmol;
 +
 +    return TRUE;
 +}
 +
 +typedef struct gmx_mtop_ilistloop_all
 +{
 +    const gmx_mtop_t *mtop;
 +    int           mblock;
 +    int           mol;
 +    int           a_offset;
 +} t_gmx_mtop_ilist_all;
 +
 +gmx_mtop_ilistloop_all_t
 +gmx_mtop_ilistloop_all_init(const gmx_mtop_t *mtop)
 +{
 +    struct gmx_mtop_ilistloop_all *iloop;
 +
 +    snew(iloop,1);
 +
 +    iloop->mtop      = mtop;
 +    iloop->mblock    = 0;
 +    iloop->mol       = -1;
 +    iloop->a_offset  = 0;
 +
 +    return iloop;
 +}
 +
 +static void gmx_mtop_ilistloop_all_destroy(gmx_mtop_ilistloop_all_t iloop)
 +{
 +    sfree(iloop);
 +}
 +
 +gmx_bool gmx_mtop_ilistloop_all_next(gmx_mtop_ilistloop_all_t iloop,
 +                                 t_ilist **ilist_mol,int *atnr_offset)
 +{
 +
 +    if (iloop == NULL)
 +    {
 +        gmx_incons("gmx_mtop_ilistloop_all_next called without calling gmx_mtop_ilistloop_all_init");
 +    }
 +    
 +    if (iloop->mol >= 0)
 +    {
 +        iloop->a_offset += iloop->mtop->molblock[iloop->mblock].natoms_mol;
 +    }
 +
 +    iloop->mol++;
 +
 +    if (iloop->mol >= iloop->mtop->molblock[iloop->mblock].nmol)
 +    {
 +        iloop->mblock++;
 +        iloop->mol = 0;
 +        if (iloop->mblock == iloop->mtop->nmolblock)
 +        {
 +            gmx_mtop_ilistloop_all_destroy(iloop);
 +            return FALSE;
 +        }
 +    }
 +    
 +    *ilist_mol =
 +        iloop->mtop->moltype[iloop->mtop->molblock[iloop->mblock].type].ilist;
 +
 +    *atnr_offset = iloop->a_offset;
 +
 +    return TRUE;
 +}
 +
 +int gmx_mtop_ftype_count(const gmx_mtop_t *mtop,int ftype)
 +{
 +    gmx_mtop_ilistloop_t iloop;
 +    t_ilist *il;
 +    int n,nmol;
 +
 +    n = 0;
 +
 +    iloop = gmx_mtop_ilistloop_init(mtop);
 +    while (gmx_mtop_ilistloop_next(iloop,&il,&nmol))
 +    {
 +        n += nmol*il[ftype].nr/(1+NRAL(ftype));
 +    }
 +
 +    return n;
 +}
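/* Editor's note: each entry of t_ilist.iatoms holds one parameter-type
 * index followed by NRAL(ftype) atom indices, hence the division by
 * 1+NRAL(ftype) above. A typical call would be:
 *
 *     int nsettle = gmx_mtop_ftype_count(mtop,F_SETTLE);
 */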
 +
 +t_block gmx_mtop_global_cgs(const gmx_mtop_t *mtop)
 +{
 +    t_block cgs_gl,*cgs_mol;
 +    int mb,mol,cg;
 +    gmx_molblock_t *molb;
 +    t_atoms *atoms;
 +    
 +    /* In most cases this is too much, but we realloc at the end */
 +    snew(cgs_gl.index,mtop->natoms+1);
 +    
 +    cgs_gl.nr       = 0;
 +    cgs_gl.index[0] = 0;
 +    for(mb=0; mb<mtop->nmolblock; mb++)
 +    {
 +        molb    = &mtop->molblock[mb];
 +        cgs_mol = &mtop->moltype[molb->type].cgs;
 +        for(mol=0; mol<molb->nmol; mol++)
 +        {
 +            for(cg=0; cg<cgs_mol->nr; cg++)
 +            {
 +                cgs_gl.index[cgs_gl.nr+1] =
 +                    cgs_gl.index[cgs_gl.nr] +
 +                    cgs_mol->index[cg+1] - cgs_mol->index[cg];
 +                cgs_gl.nr++;
 +            }
 +        }
 +    }
 +    cgs_gl.nalloc_index = cgs_gl.nr + 1;
 +    srenew(cgs_gl.index,cgs_gl.nalloc_index);
 +
 +    return cgs_gl;
 +}
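/* Illustrative usage sketch (editor's addition, not part of this patch):
 * reading the returned t_block. Charge group cg spans the global atoms
 * index[cg] up to, but not including, index[cg+1].
 */
{
    t_block cgs_gl;
    int     cg;

    cgs_gl = gmx_mtop_global_cgs(mtop);
    for(cg=0; cg<cgs_gl.nr; cg++)
    {
        /* atoms cgs_gl.index[cg] .. cgs_gl.index[cg+1]-1 form group cg */
    }
}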
 +
 +static void atomcat(t_atoms *dest, t_atoms *src, int copies,
 +                    int maxres_renum, int *maxresnr)
 +{
 +    int i,j,l,size;
 +    int srcnr=src->nr;
 +    int destnr=dest->nr;
 +
 +    if (srcnr)
 +    {
 +        size=destnr+copies*srcnr;
 +        srenew(dest->atom,size);
 +        srenew(dest->atomname,size);
 +        srenew(dest->atomtype,size);
 +        srenew(dest->atomtypeB,size);
 +    }
 +    if (src->nres)
 +    {
 +        size=dest->nres+copies*src->nres;
 +        srenew(dest->resinfo,size);
 +    }
 +    
 +    /* residue information */
 +    for (l=dest->nres,j=0; (j<copies); j++,l+=src->nres)
 +    {
 +        memcpy((char *) &(dest->resinfo[l]),(char *) &(src->resinfo[0]),
 +               (size_t)(src->nres*sizeof(src->resinfo[0])));
 +    }
 +    
 +    for (l=destnr,j=0; (j<copies); j++,l+=srcnr)
 +    {
 +        memcpy((char *) &(dest->atomname[l]),(char *) &(src->atomname[0]),
 +               (size_t)(srcnr*sizeof(src->atomname[0])));
 +        memcpy((char *) &(dest->atomtype[l]),(char *) &(src->atomtype[0]),
 +               (size_t)(srcnr*sizeof(src->atomtype[0])));
 +        memcpy((char *) &(dest->atomtypeB[l]),(char *) &(src->atomtypeB[0]),
 +               (size_t)(srcnr*sizeof(src->atomtypeB[0])));
 +        memcpy((char *) &(dest->atom[l]),(char *) &(src->atom[0]),
 +               (size_t)(srcnr*sizeof(src->atom[0])));
 +    }
 +    
 +    /* Increment residue indices */
 +    for (l=destnr,j=0; (j<copies); j++)
 +    {
 +        for (i=0; (i<srcnr); i++,l++)
 +        {
 +            dest->atom[l].resind = dest->nres+j*src->nres+src->atom[i].resind;
 +        }
 +    }    
 +    
 +    if (src->nres <= maxres_renum)
 +    {
 +        /* Single residue molecule, continue counting residues */
 +        for (j=0; (j<copies); j++)
 +        {
 +            for (l=0; l<src->nres; l++)
 +            {
 +                (*maxresnr)++;
 +                dest->resinfo[dest->nres+j*src->nres+l].nr = *maxresnr;
 +            }
 +        }
 +    }
 +    
 +    dest->nres += copies*src->nres;
 +    dest->nr   += copies*src->nr;
 +}
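/* Editor's note, a worked example of the renumbering above (illustrative
 * numbers): with maxres_renum = 1 and a one-residue molecule
 * (src->nres == 1) copied three times starting from *maxresnr == 100,
 * the three copies get residue numbers 101, 102 and 103, while a
 * 200-residue protein (nres > maxres_renum) keeps the residue numbers
 * already stored in its resinfo entries.
 */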
 +
 +t_atoms gmx_mtop_global_atoms(const gmx_mtop_t *mtop)
 +{
 +    t_atoms atoms;
 +    int maxresnr,mb;
 +    gmx_molblock_t *molb;
 +
 +    init_t_atoms(&atoms,0,FALSE);
 +
 +    maxresnr = mtop->maxresnr;
 +    for(mb=0; mb<mtop->nmolblock; mb++)
 +    {
 +        molb = &mtop->molblock[mb];
 +        atomcat(&atoms,&mtop->moltype[molb->type].atoms,molb->nmol,
 +                mtop->maxres_renum,&maxresnr);
 +    }
 +    
 +    return atoms;
 +}
 +
 +void gmx_mtop_make_atomic_charge_groups(gmx_mtop_t *mtop,
 +                                        gmx_bool bKeepSingleMolCG)
 +{
 +    int     mb,cg;
 +    t_block *cgs_mol;
 +    
 +    for(mb=0; mb<mtop->nmolblock; mb++)
 +    {
 +        cgs_mol = &mtop->moltype[mtop->molblock[mb].type].cgs;
 +        if (!(bKeepSingleMolCG && cgs_mol->nr == 1))
 +        {
 +            cgs_mol->nr           = mtop->molblock[mb].natoms_mol;
 +            cgs_mol->nalloc_index = cgs_mol->nr + 1;
 +            srenew(cgs_mol->index,cgs_mol->nalloc_index);
 +            for(cg=0; cg<cgs_mol->nr+1; cg++)
 +            {
 +                cgs_mol->index[cg] = cg;
 +            }
 +        }
 +    }
 +}
 +
 +/*
 + * The cat routines below are old code from src/kernel/topcat.c
 + */ 
 +
 +static void blockcat(t_block *dest,t_block *src,int copies, 
 +                     int dnum,int snum)
 +{
 +    int i,j,l,nra,size;
 +    
 +    if (src->nr)
 +    {
 +        size=(dest->nr+copies*src->nr+1);
 +        srenew(dest->index,size);
 +    }
 +    
 +    nra = dest->index[dest->nr];
 +    for (l=dest->nr,j=0; (j<copies); j++)
 +    {
 +        for (i=0; (i<src->nr); i++)
 +        {
 +            dest->index[l++] = nra + src->index[i];
 +        }
 +        nra += src->index[src->nr];
 +    }
 +    dest->nr += copies*src->nr;
 +    dest->index[dest->nr] = nra;
 +}
 +
 +static void blockacat(t_blocka *dest,t_blocka *src,int copies, 
 +                      int dnum,int snum)
 +{
 +    int i,j,l,size;
 +    int destnr  = dest->nr;
 +    int destnra = dest->nra;
 +    
 +    if (src->nr)
 +    {
 +        size=(dest->nr+copies*src->nr+1);
 +        srenew(dest->index,size);
 +    }
 +    if (src->nra)
 +    {
 +        size=(dest->nra+copies*src->nra);
 +        srenew(dest->a,size);
 +    }
 +    
 +    for (l=destnr,j=0; (j<copies); j++)
 +    {
 +        for (i=0; (i<src->nr); i++)
 +        {
 +            dest->index[l++] = dest->nra+src->index[i];
 +        }
 +        dest->nra += src->nra;
 +    }
 +    for (l=destnra,j=0; (j<copies); j++)
 +    {
 +        for (i=0; (i<src->nra); i++)
 +        {
 +            dest->a[l++] = dnum+src->a[i];
 +        }
 +        dnum+=snum;
 +        dest->nr += src->nr;
 +    }
 +    dest->index[dest->nr] = dest->nra;
 +}
 +
 +static void ilistcat(int ftype,t_ilist *dest,t_ilist *src,int copies, 
 +                     int dnum,int snum)
 +{
 +    int nral,c,i,a;
 +
 +    nral = NRAL(ftype);
 +
 +    dest->nalloc = dest->nr + copies*src->nr;
 +    srenew(dest->iatoms,dest->nalloc);
 +
 +    for(c=0; c<copies; c++)
 +    {
 +        for(i=0; i<src->nr; )
 +        {
 +            dest->iatoms[dest->nr++] = src->iatoms[i++];
 +            for(a=0; a<nral; a++)
 +            {
 +                dest->iatoms[dest->nr++] = dnum + src->iatoms[i++];
 +            }
 +        }
 +        dnum += snum;
 +    }
 +}
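/* Editor's note, a worked example of the concatenation above
 * (illustrative numbers): a bond (NRAL == 2) of parameter type 5 between
 * atoms 0 and 1 of a 3-atom molecule (snum == 3), copied twice starting
 * at dnum == 0, produces
 *
 *     iatoms = { 5, 0, 1,   5, 3, 4 }
 *
 * i.e. the type index is copied as-is and the atom indices are shifted
 * by dnum, which advances by snum per molecule copy.
 */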
 +
 +static void set_posres_params(t_idef *idef,gmx_molblock_t *molb,
 +                              int i0,int a_offset)
 +{
 +    t_ilist *il;
 +    int i1,i,a_molb;
 +    t_iparams *ip;
 +
 +    il = &idef->il[F_POSRES];
 +    i1 = il->nr/2;
 +    idef->iparams_posres_nalloc = i1;
 +    srenew(idef->iparams_posres,idef->iparams_posres_nalloc);
 +    for(i=i0; i<i1; i++)
 +    {
 +        ip = &idef->iparams_posres[i];
 +        /* Copy the force constants */
 +        *ip    = idef->iparams[il->iatoms[i*2]];
 +        a_molb = il->iatoms[i*2+1] - a_offset;
 +        if (molb->nposres_xA == 0)
 +        {
 +            gmx_incons("Position restraint coordinates are missing");
 +        }
 +        ip->posres.pos0A[XX] = molb->posres_xA[a_molb][XX];
 +        ip->posres.pos0A[YY] = molb->posres_xA[a_molb][YY];
 +        ip->posres.pos0A[ZZ] = molb->posres_xA[a_molb][ZZ];
 +        if (molb->nposres_xB > 0)
 +        {
 +            ip->posres.pos0B[XX] = molb->posres_xB[a_molb][XX];
 +            ip->posres.pos0B[YY] = molb->posres_xB[a_molb][YY];
 +            ip->posres.pos0B[ZZ] = molb->posres_xB[a_molb][ZZ];
 +        }
 +        else
 +        {
 +            ip->posres.pos0B[XX] = ip->posres.pos0A[XX];
 +            ip->posres.pos0B[YY] = ip->posres.pos0A[YY];
 +            ip->posres.pos0B[ZZ] = ip->posres.pos0A[ZZ];
 +        }
 +        /* Set the parameter index for idef->iparams_posres */
 +        il->iatoms[i*2] = i;
 +    }
 +}
 +
 +static void set_fbposres_params(t_idef *idef,gmx_molblock_t *molb,
 +                              int i0,int a_offset)
 +{
 +    t_ilist *il;
 +    int i1,i,a_molb;
 +    t_iparams *ip;
 +
 +    il = &idef->il[F_FBPOSRES];
 +    i1 = il->nr/2;
 +    idef->iparams_fbposres_nalloc = i1;
 +    srenew(idef->iparams_fbposres,idef->iparams_fbposres_nalloc);
 +    for(i=i0; i<i1; i++)
 +    {
 +        ip = &idef->iparams_fbposres[i];
 +        /* Copy the force constants */
 +        *ip    = idef->iparams[il->iatoms[i*2]];
 +        a_molb = il->iatoms[i*2+1] - a_offset;
 +        if (molb->nposres_xA == 0)
 +        {
 +            gmx_incons("Position restraint coordinates are missing");
 +        }
 +        /* Take flat-bottom posres reference from normal position restraints */
 +        ip->fbposres.pos0[XX] = molb->posres_xA[a_molb][XX];
 +        ip->fbposres.pos0[YY] = molb->posres_xA[a_molb][YY];
 +        ip->fbposres.pos0[ZZ] = molb->posres_xA[a_molb][ZZ];
 +        /* Note: no B-type for flat-bottom posres */
 +
 +        /* Set the parameter index for idef->iparams_fbposres */
 +        il->iatoms[i*2] = i;
 +    }
 +}
 +
 +static void gen_local_top(const gmx_mtop_t *mtop,const t_inputrec *ir,
 +                          gmx_bool bMergeConstr,
 +                          gmx_localtop_t *top)
 +{
 +    int mb,srcnr,destnr,ftype,natoms,mol,nposre_old,nfbposre_old;
 +    gmx_molblock_t *molb;
 +    gmx_moltype_t *molt;
 +    const gmx_ffparams_t *ffp;
 +    t_idef *idef;
 +    real   *qA,*qB;
 +    gmx_mtop_atomloop_all_t aloop;
 +    int    ag;
 +    t_atom *atom;
 +
 +    top->atomtypes = mtop->atomtypes;
 +    
 +    ffp = &mtop->ffparams;
 +    
 +    idef = &top->idef;
 +    idef->ntypes   = ffp->ntypes;
 +    idef->atnr     = ffp->atnr;
 +    idef->functype = ffp->functype;
 +    idef->iparams  = ffp->iparams;
 +    idef->iparams_posres = NULL;
 +    idef->iparams_posres_nalloc = 0;
 +    idef->iparams_fbposres = NULL;
 +    idef->iparams_fbposres_nalloc = 0;
 +    idef->fudgeQQ  = ffp->fudgeQQ;
 +    idef->cmap_grid = ffp->cmap_grid;
 +    idef->ilsort   = ilsortUNKNOWN;
 +
 +    init_block(&top->cgs);
 +    init_blocka(&top->excls);
 +    for(ftype=0; ftype<F_NRE; ftype++)
 +    {
 +        idef->il[ftype].nr     = 0;
 +        idef->il[ftype].nalloc = 0;
 +        idef->il[ftype].iatoms = NULL;
 +    }
 +
 +    natoms = 0;
 +    for(mb=0; mb<mtop->nmolblock; mb++)
 +    {
 +        molb = &mtop->molblock[mb];
 +        molt = &mtop->moltype[molb->type];
 +        
 +        srcnr  = molt->atoms.nr;
 +        destnr = natoms;
 +        
 +        blockcat(&top->cgs,&molt->cgs,molb->nmol,destnr,srcnr);
 +
 +        blockacat(&top->excls,&molt->excls,molb->nmol,destnr,srcnr);
 +
 +        nposre_old = idef->il[F_POSRES].nr;
 +        nfbposre_old = idef->il[F_FBPOSRES].nr;
 +        for(ftype=0; ftype<F_NRE; ftype++)
 +        {
 +            if (bMergeConstr &&
 +                ftype == F_CONSTR && molt->ilist[F_CONSTRNC].nr > 0)
 +            {
 +                /* Merge all constraints into one ilist.
 +                 * This simplifies the constraint code.
 +                 */
 +                for(mol=0; mol<molb->nmol; mol++)
 +                {
 +                    ilistcat(ftype,&idef->il[F_CONSTR],&molt->ilist[F_CONSTR],
 +                             1,destnr+mol*srcnr,srcnr);
 +                    ilistcat(ftype,&idef->il[F_CONSTR],&molt->ilist[F_CONSTRNC],
 +                             1,destnr+mol*srcnr,srcnr);
 +                }
 +            }
 +            else if (!(bMergeConstr && ftype == F_CONSTRNC))
 +            {
 +                ilistcat(ftype,&idef->il[ftype],&molt->ilist[ftype],
 +                         molb->nmol,destnr,srcnr);
 +            }
 +        }
 +        if (idef->il[F_POSRES].nr > nposre_old)
 +        {
++            /* Executing this line stops gmxdump -sys working
++             * correctly. I'm not aware of an elegant fix. */
 +            set_posres_params(idef,molb,nposre_old/2,natoms);
 +        }
 +        if (idef->il[F_FBPOSRES].nr > nfbposre_old)
 +        {
 +            set_fbposres_params(idef,molb,nfbposre_old/2,natoms);
 +        }
 +
 +        natoms += molb->nmol*srcnr;
 +    }
 +
 +    if (ir == NULL)
 +    {
 +        top->idef.ilsort = ilsortUNKNOWN;
 +    }
 +    else
 +    {
 +        if (ir->efep != efepNO && gmx_mtop_bondeds_free_energy(mtop))
 +        {
 +            snew(qA,mtop->natoms);
 +            snew(qB,mtop->natoms);
 +            aloop = gmx_mtop_atomloop_all_init(mtop);
 +            while (gmx_mtop_atomloop_all_next(aloop,&ag,&atom))
 +            {
 +                qA[ag] = atom->q;
 +                qB[ag] = atom->qB;
 +            }
 +            gmx_sort_ilist_fe(&top->idef,qA,qB);
 +            sfree(qA);
 +            sfree(qB);
 +        }
 +        else
 +        {
 +            top->idef.ilsort = ilsortNO_FE;
 +        }
 +    }
 +}
 +
 +gmx_localtop_t *gmx_mtop_generate_local_top(const gmx_mtop_t *mtop,
 +                                            const t_inputrec *ir)
 +{
 +    gmx_localtop_t *top;
 +
 +    snew(top,1);
 +
 +    gen_local_top(mtop,ir,TRUE,top);
 +
 +    return top;
 +}
 +
 +t_topology gmx_mtop_t_to_t_topology(gmx_mtop_t *mtop)
 +{
 +    int mt,mb;
 +    gmx_localtop_t ltop;
 +    t_topology top;
 +
 +    gen_local_top(mtop,NULL,FALSE,&ltop);
 +
 +    top.name      = mtop->name;
 +    top.idef      = ltop.idef;
 +    top.atomtypes = ltop.atomtypes;
 +    top.cgs       = ltop.cgs;
 +    top.excls     = ltop.excls;
 +    top.atoms     = gmx_mtop_global_atoms(mtop);
 +    top.mols      = mtop->mols;
 +    top.symtab    = mtop->symtab;
 +
 +    /* We only need to free the moltype and molblock data,
 +     * all other pointers have been copied to top.
 +     *
 +     * Well, except for the group data, but we can't free those, because they
 +     * are used somewhere even after a call to this function.
 +     */
 +    for(mt=0; mt<mtop->nmoltype; mt++)
 +    {
 +        done_moltype(&mtop->moltype[mt]);
 +    }
 +    sfree(mtop->moltype);
 +
 +    for(mb=0; mb<mtop->nmolblock; mb++)
 +    {
 +        done_molblock(&mtop->molblock[mb]);
 +    }
 +    sfree(mtop->molblock);
 +
 +    return top;
 +}
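/* Illustrative usage sketch (editor's addition, not part of this patch):
 * converting for old-style single-topology consumers. Note that the
 * conversion frees the moltype and molblock contents, so mtop must not
 * be used for per-molecule-type access afterwards.
 */
{
    t_topology top;

    top = gmx_mtop_t_to_t_topology(mtop);
    /* top.atoms now holds the fully expanded, renumbered atom data */
}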
index 356785630b2a155d3b60fdceaeebe5911d12acef,0000000000000000000000000000000000000000..dcc917329173e491f88ed910cccfabef9bc62439
mode 100644,000000..100644
--- /dev/null
@@@ -1,271 -1,0 +1,275 @@@
 +/*
 + * 
 + *                This source code is part of
 + * 
 + *                 G   R   O   M   A   C   S
 + * 
 + *          GROningen MAchine for Chemical Simulations
 + * 
 + *                        VERSION 3.2.0
 + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
 + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 + * Copyright (c) 2001-2004, The GROMACS development team,
 + * check out http://www.gromacs.org for more information.
 +
 + * This program is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU General Public License
 + * as published by the Free Software Foundation; either version 2
 + * of the License, or (at your option) any later version.
 + * 
 + * If you want to redistribute modifications, please consider that
 + * scientific software is very special. Version control is crucial -
 + * bugs must be traceable. We will be happy to consider code for
 + * inclusion in the official distribution, but derived work must not
 + * be called official GROMACS. Details are found in the README & COPYING
 + * files - if they are missing, get the official version at www.gromacs.org.
 + * 
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the papers on the package - you can find them in the top README file.
 + * 
 + * For more info, check our website at http://www.gromacs.org
 + * 
 + * And Hey:
 + * GROningen Mixture of Alchemy and Childrens' Stories
 + */
 +#ifdef HAVE_CONFIG_H
 +#include <config.h>
 +#endif
 +
 +#include "typedefs.h"
 +#include "names.h"
 +
 +/* note: these arrays should correspond to enums in include/types/enums.h */
 +
 +const char *epbc_names[epbcNR+1]=
 +{
 +  "xyz", "no", "xy", "screw", NULL
 +};
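/* Editor's note: each array here is indexed by the matching enum and is
 * NULL-terminated, so converting a value to text is a plain lookup, e.g.
 * for some periodic-boundary setting ePBC (illustrative):
 *
 *     fprintf(fp,"pbc = %s\n",epbc_names[ePBC]);
 *
 * The trailing NULL lets string parsers scan an array without knowing
 * its NR count.
 */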
 +
 +const char *ens_names[ensNR+1]=
 +{
 +  "Grid","Simple", NULL
 +};
 +
 +const char *ei_names[eiNR+1]=
 +{
 +  "md", "steep", "cg", "bd", "sd", "nm", "l-bfgs", "tpi", "tpic", "sd1", "md-vv", "md-vv-avek",NULL 
 +};
 +
 +const char *bool_names[BOOL_NR+1]=
 +{
 +  "FALSE","TRUE", NULL
 +};
 +
 +const char *yesno_names[BOOL_NR+1]=
 +{
 +  "no","yes", NULL
 +};
 +
 +const char *ptype_str[eptNR+1] = {
 +  "Atom", "Nucleus", "Shell", "Bond", "VSite", NULL
 +};
 +
 +const char *ecutscheme_names[ecutsNR+1] = {
 +  "Group", "Verlet", NULL
 +};
 +
 +const char *eel_names[eelNR+1] = {
 +  "Cut-off", "Reaction-Field", "Generalized-Reaction-Field",
 +  "PME", "Ewald", "P3M-AD", "Poisson", "Switch", "Shift", "User", 
 +  "Generalized-Born", "Reaction-Field-nec", "Encad-shift", 
 +  "PME-User", "PME-Switch", "PME-User-Switch", 
 +  "Reaction-Field-zero", NULL
 +};
 +
 +const char *eewg_names[eewgNR+1] = {
 +  "3d", "3dc", NULL
 +};
 +
 +const char *evdw_names[evdwNR+1] = {
 +  "Cut-off", "Switch", "Shift", "User", "Encad-shift", NULL
 +};
 +
 +const char *econstr_names[econtNR+1] = {
 +  "Lincs", "Shake", NULL
 +};
 +
 +const char *eintmod_names[eintmodNR+1] = { 
 +  "Potential-shift-Verlet","Potential-shift","None","Potential-switch","Exact-cutoff", NULL
 +};
 +
 +const char *egrp_nm[egNR+1] = { 
 +  "Coul-SR","LJ-SR","Buck-SR", "Coul-LR", "LJ-LR", "Buck-LR",
 +  "Coul-14", "LJ-14", NULL
 +};
 +
 +const char *etcoupl_names[etcNR+1] = {
 +  "No", "Berendsen", "Nose-Hoover", "yes", "Andersen", "Andersen-massive", "V-rescale", NULL
 +}; /* yes is alias for berendsen */
 +
 +const char *epcoupl_names[epcNR+1] = {
 +  "No", "Berendsen", "Parrinello-Rahman", "Isotropic", "MTTK", NULL
 +}; /* isotropic is alias for berendsen */
 +
 +const char *epcoupltype_names[epctNR+1] = {
 +  "Isotropic", "Semiisotropic", "Anisotropic", "Surface-Tension", NULL
 +};
 +
 +const char *erefscaling_names[erscNR+1] = {
 +  "No", "All", "COM", NULL
 +};
 +
 +const char *edisre_names[edrNR+1] = {
 +  "No", "Simple", "Ensemble", NULL
 +};
 +
 +const char *edisreweighting_names[edrwNR+1] = {
 +  "Conservative", "Equal", NULL
 +};
 +
 +const char *enbf_names[eNBF_NR+1] = {
 +  "", "LJ", "Buckingham", NULL
 +};
 +
 +const char *ecomb_names[eCOMB_NR+1] = {
 +  "", "Geometric", "Arithmetic", "GeomSigEps", NULL
 +};
 +
 +const char *gtypes[egcNR+1] = {
 +  "T-Coupling", "Energy Mon.", "Acceleration", "Freeze",
 +  "User1", "User2", "VCM", "XTC", "Or. Res. Fit", "QMMM", NULL
 +};
 +
 +const char *esimtemp_names[esimtempNR+1] = {
 +  "geometric", "exponential", "linear", NULL
 +};
 +
 +const char *efep_names[efepNR+1] = {
 +  "no", "yes", "static", "slow-growth", "expanded", NULL
 +};
 +
 +const char *efpt_names[efptNR+1] = {
 +  "fep-lambdas", "mass-lambdas", "coul-lambdas", "vdw-lambdas", "bonded-lambdas", "restraint-lambdas", "temperature-lambdas", NULL
 +};
 +
++const char *efpt_singular_names[efptNR+1] = {
++  "fep-lambda", "mass-lambda", "coul-lambda", "vdw-lambda", "bonded-lambda", "restraint-lambda", "temperature-lambda", NULL
++};
++
 +const char *elamstats_names[elamstatsNR+1] = {
 +  "no", "metropolis-transition", "barker-transition", "minvar", "wang-landau", "weighted-wang-landau", NULL
 +};
 +
 +const char *elmcmove_names[elmcmoveNR+1] = {
 +  "no", "metropolis", "barker", "gibbs", "metropolized-gibbs", NULL
 +};
 +
 +const char *elmceq_names[elmceqNR+1] = {
 +  "no", "yes", "wl-delta", "number-all-lambda", "number-steps", "number-samples", "count-ratio", NULL
 +};
 +
 +const char *separate_dhdl_file_names[esepdhdlfileNR+1] = {
 +  "yes", "no", NULL
 +};
 +
 +const char *dhdl_derivatives_names[edhdlderivativesNR+1] = {
 +  "yes", "no", NULL
 +};
 +
 +const char *esol_names[esolNR+1] = {
 +  "No", "SPC", "TIP4p", NULL
 +};
 +
 +const char *edispc_names[edispcNR+1] = {
 +  "No", "EnerPres", "Ener", "AllEnerPres", "AllEner", NULL
 +};
 +
 +const char *ecm_names[ecmNR+1] = { 
 +  "Linear", "Angular", "None", NULL 
 +};
 +
 +const char *eann_names[eannNR+1] = {
 +  "No", "Single", "Periodic", NULL
 +};
 +
 +const char *eis_names[eisNR+1] = {
 +      "No", "GBSA", NULL
 +};
 +
 +const char *egb_names[egbNR+1] = {
 +  "Still", "HCT", "OBC", NULL
 +};
 +
 +const char *esa_names[esaNR+1] = {
 +  "Ace-approximation", "None", "Still", NULL
 +};
 +
 +const char *ewt_names[ewtNR+1] = {
 +  "9-3", "10-4", "table", "12-6", NULL
 +};
 +
 +const char *epull_names[epullNR+1] = { 
 +  "no", "umbrella", "constraint", "constant-force", NULL
 +};
 +
 +const char *epullg_names[epullgNR+1] = { 
 +  "distance", "direction", "cylinder", "position", "direction-periodic", NULL
 +};
 +
 +const char *erotg_names[erotgNR+1] = { 
 +  "iso", "iso-pf", "pm", "pm-pf", "rm", "rm-pf", "rm2", "rm2-pf", "flex", "flex-t", "flex2", "flex2-t", NULL
 +};
 +
 +const char *erotg_fitnames[erotgFitNR+1] = { 
 +  "rmsd", "norm", "potential", NULL
 +};
 +
 +const char *eQMmethod_names[eQMmethodNR+1] = {
 +  "AM1", "PM3", "RHF",
 +  "UHF", "DFT", "B3LYP", "MP2", "CASSCF","B3LYPLAN",
 +  "DIRECT", NULL
 +};
 +
 +const char *eQMbasis_names[eQMbasisNR+1] = {
 +  "STO3G", "STO-3G", "3-21G",
 +  "3-21G*", "3-21+G*", "6-21G",
 +  "6-31G", "6-31G*", "6-31+G*",
 +  "6-311G", NULL
 +};
 +
 +const char *eQMMMscheme_names[eQMMMschemeNR+1] = {
 +  "normal", "ONIOM", NULL
 +};
 +
 +const char *eMultentOpt_names[eMultentOptNR+1] = {
 +  "multiple_entries", "no", "use_last", NULL
 +};
 +
 +const char *eAdresstype_names[eAdressNR+1] = {
 +  "off","constant", "xsplit", "sphere", NULL 
 +};
 +
 +const char *eAdressICtype_names[eAdressICNR+1] = {
 +  "off", "thermoforce", NULL 
 +};
 +
 +const char *eAdressSITEtype_names[eAdressSITENR+1] = {
 +  "com","cog", "atom", "atomperatom", NULL
 +};
 +
 +const char *gmx_nblist_geometry_names[GMX_NBLIST_GEOMETRY_NR+1] = {
 +    "Particle-Particle", "Water3-Particle", "Water3-Water3", "Water4-Particle", "Water4-Water4", "CG-CG", NULL
 +};
 +
 +const char *gmx_nbkernel_elec_names[GMX_NBKERNEL_ELEC_NR+1] =
 +{
 +    "None", "Coulomb", "Reaction-Field", "Cubic-Spline-Table", "Generalized-Born", "Ewald", NULL
 +};
 +
 +const char *gmx_nbkernel_vdw_names[GMX_NBKERNEL_VDW_NR+1] =
 +{
 +    "None", "Lennard-Jones", "Buckingham", "Cubic-Spline-Table", NULL
 +};
 +
 +
 +
index 9921ef31f79ad2e61dca101cc2c854e4f121d059,0000000000000000000000000000000000000000..ec210de8f73881e3be8d3dedd5d90c41997524f1
mode 100644,000000..100644
--- /dev/null
@@@ -1,774 -1,0 +1,777 @@@
-   fprintf(stderr,"NNODES=%d, MYRANK=%d, HOSTNAME=%s\n",
-         mpi_num_nodes,mpi_my_rank,mpi_hostname);
 +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
 + * 
 + *                This source code is part of
 + * 
 + *                 G   R   O   M   A   C   S
 + * 
 + *          GROningen MAchine for Chemical Simulations
 + * 
 + *                        VERSION 3.2.0
 + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
 + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 + * Copyright (c) 2001-2004, The GROMACS development team,
 + * check out http://www.gromacs.org for more information.
 +
 + * This program is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU General Public License
 + * as published by the Free Software Foundation; either version 2
 + * of the License, or (at your option) any later version.
 + * 
 + * If you want to redistribute modifications, please consider that
 + * scientific software is very special. Version control is crucial -
 + * bugs must be traceable. We will be happy to consider code for
 + * inclusion in the official distribution, but derived work must not
 + * be called official GROMACS. Details are found in the README & COPYING
 + * files - if they are missing, get the official version at www.gromacs.org.
 + * 
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the papers on the package - you can find them in the top README file.
 + * 
 + * For more info, check our website at http://www.gromacs.org
 + * 
 + * And Hey:
 + * GROningen Mixture of Alchemy and Childrens' Stories
 + */
 +#ifdef HAVE_CONFIG_H
 +#include <config.h>
 +#endif
 +
 +#include <string.h>
 +#include "gmx_fatal.h"
 +#include "main.h"
 +#include "smalloc.h"
 +#include "network.h"
 +#include "copyrite.h"
 +#include "statutil.h"
 +#include <ctype.h>
 +#include "macros.h"
 +
 +#ifdef GMX_LIB_MPI
 +#include <mpi.h>
 +#endif
 +
 +#ifdef GMX_THREAD_MPI
 +#include "tmpi.h"
 +#endif
 +
 +
 +/* The source code in this file should be thread-safe. 
 +      Please keep it that way. */
 +
 +gmx_bool gmx_mpi_initialized(void)
 +{
 +  int n;
 +#ifndef GMX_MPI
 +  return 0;
 +#else
 +  MPI_Initialized(&n);
 +  
 +  return n;
 +#endif
 +}
 +
 +int gmx_setup(int *argc,char **argv,int *nnodes)
 +{
 +#ifndef GMX_MPI
 +  gmx_call("gmx_setup");
 +  return 0;
 +#else
 +  int    resultlen;               /* actual length of node name      */
 +  int  mpi_num_nodes;
 +  int  mpi_my_rank;
 +  char mpi_hostname[MPI_MAX_PROCESSOR_NAME];
 +
 +  /* Call the MPI routines */
 +#ifdef GMX_LIB_MPI
 +#ifdef GMX_FAHCORE
 +  (void) fah_MPI_Init(argc,&argv);
 +#else
 +  (void) MPI_Init(argc,&argv);
 +#endif
 +#endif
 +  (void) MPI_Comm_size( MPI_COMM_WORLD, &mpi_num_nodes );
 +  (void) MPI_Comm_rank( MPI_COMM_WORLD, &mpi_my_rank );
 +  (void) MPI_Get_processor_name( mpi_hostname, &resultlen );
 + 
 +#ifdef GMX_LIB_MPI 
++  if (debug)
++  {
++      fprintf(debug,"NNODES=%d, MYRANK=%d, HOSTNAME=%s\n",
++              mpi_num_nodes,mpi_my_rank,mpi_hostname);
++  }
 +#endif
 +  
 +  *nnodes=mpi_num_nodes;
 +  
 +  return mpi_my_rank;
 +#endif
 +}
 +
 +int  gmx_node_num(void)
 +{
 +#ifndef GMX_MPI
 +  return 1;
 +#else
 +  int i;
 +  (void) MPI_Comm_size(MPI_COMM_WORLD, &i);
 +  return i;
 +#endif
 +}
 +
 +int gmx_node_rank(void)
 +{
 +#ifndef GMX_MPI
 +  return 0;
 +#else
 +  int i;
 +  (void) MPI_Comm_rank(MPI_COMM_WORLD, &i);
 +  return i;
 +#endif
 +}
 +
 +
 +int gmx_hostname_num()
 +{
 +#ifndef GMX_MPI
 +  return 0;
 +#else
 +#ifdef GMX_THREAD_MPI
 +  /* thread-MPI currently puts the thread number in the process name;
 +   * we might want to change this, as it is inconsistent with what
 +   * most MPI implementations do when running on a single node.
 +   */
 +  return 0;
 +#else
 +  int  resultlen,hostnum,i,j;
 +  char mpi_hostname[MPI_MAX_PROCESSOR_NAME],hostnum_str[MPI_MAX_PROCESSOR_NAME];
 +
 +  MPI_Get_processor_name(mpi_hostname,&resultlen);
 +  /* This procedure can only differentiate nodes with host names
 +   * that end in unique numbers.
 +   */
 +  i = 0;
 +  j = 0;
 +  /* Only parse the host name up to the first dot */
 +  while(i < resultlen && mpi_hostname[i] != '.') {
 +    if (isdigit(mpi_hostname[i])) {
 +      hostnum_str[j++] = mpi_hostname[i];
 +    }
 +    i++;
 +  }
 +  hostnum_str[j] = '\0';
 +  if (j == 0) {
 +    hostnum = 0;
 +  } else {
 +    /* Use only the last 9 decimal digits, so we don't overflow an int */
 +    hostnum = strtol(hostnum_str + max(0,j-9), NULL, 10);
 +  }
 +
 +  if (debug) {
 +    fprintf(debug,"In gmx_setup_nodecomm: hostname '%s', hostnum %d\n",
 +        mpi_hostname,hostnum);
 +  }
 +  return hostnum;
 +#endif
 +#endif
 +}
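/* Editor's note, worked examples for the parsing above (illustrative
 * host names):
 *   "node017.cluster.org" -> digits before the first dot are "017"
 *                            -> hostnum = 17
 *   "login.cluster.org"   -> no digits before the first dot
 *                            -> hostnum = 0
 * Hosts whose names differ only after the first dot, or only in
 * non-digit characters, therefore map to the same hostnum.
 */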
 +
 +void gmx_setup_nodecomm(FILE *fplog,t_commrec *cr)
 +{
 +    gmx_nodecomm_t *nc;
 +    int  n,rank,hostnum,ng,ni;
 +
 +    /* Many MPI implementations do not optimize MPI_Allreduce
 +     * (and probably also other global communication calls)
 +     * for multi-core nodes connected by a network.
 +     * We can optimize such communication by using one MPI call
 +     * within each node and one between the nodes.
 +     * For MVAPICH2 and Intel MPI this reduces the time for
 +     * the global_stat communication by 25%
 +     * for 2x2-core 3 GHz Woodcrest connected by mixed DDR/SDR Infiniband.
 +     * B. Hess, November 2007
 +     */
 +
 +    nc = &cr->nc;
 +
 +    nc->bUse = FALSE;
 +#ifndef GMX_THREAD_MPI
 +#ifdef GMX_MPI
 +    MPI_Comm_size(cr->mpi_comm_mygroup,&n);
 +    MPI_Comm_rank(cr->mpi_comm_mygroup,&rank);
 +
 +    hostnum = gmx_hostname_num();
 +
 +    if (debug)
 +    {
 +        fprintf(debug,"In gmx_setup_nodecomm: splitting communicator of size %d\n",n);
 +    }
 +
 +
 +    /* The intra-node communicator, split on node number */
 +    MPI_Comm_split(cr->mpi_comm_mygroup,hostnum,rank,&nc->comm_intra);
 +    MPI_Comm_rank(nc->comm_intra,&nc->rank_intra);
 +    if (debug)
 +    {
 +        fprintf(debug,"In gmx_setup_nodecomm: node rank %d rank_intra %d\n",
 +                rank,nc->rank_intra);
 +    }
 +    /* The inter-node communicator, split on rank_intra.
 +     * We actually only need the one for rank=0,
 +     * but it is easier to create them all.
 +     */
 +    MPI_Comm_split(cr->mpi_comm_mygroup,nc->rank_intra,rank,&nc->comm_inter);
 +    /* Check if this really created two step communication */
 +    MPI_Comm_size(nc->comm_inter,&ng);
 +    MPI_Comm_size(nc->comm_intra,&ni);
 +    if (debug)
 +    {
 +        fprintf(debug,"In gmx_setup_nodecomm: groups %d, my group size %d\n",
 +                ng,ni);
 +    }
 +
 +    if (getenv("GMX_NO_NODECOMM") == NULL &&
 +        ((ng > 1 && ng < n) || (ni > 1 && ni < n)))
 +    {
 +        nc->bUse = TRUE;
 +        if (fplog)
 +        {
 +            fprintf(fplog,"Using two-step summing over %d groups of %.1f processes on average\n\n",
 +                    ng,(real)n/(real)ng);
 +        }
 +        if (nc->rank_intra > 0)
 +        {
 +            MPI_Comm_free(&nc->comm_inter);
 +        }
 +    }
 +    else
 +    {
 +        /* One group or all processes in a separate group, use normal summing */
 +        MPI_Comm_free(&nc->comm_inter);
 +        MPI_Comm_free(&nc->comm_intra);
 +        if (debug)
 +        {
 +            fprintf(debug,"In gmx_setup_nodecomm: not unsing separate inter- and intra-node communicators.\n");
 +        }
 +    }
 +#endif
 +#else
 +    /* tMPI runs only on a single node so just use the nodeid */
 +    nc->rank_intra = cr->nodeid;
 +#endif
 +}
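/* Editor's addition, a stand-alone sketch of the two-step reduction idea
 * above in plain MPI (illustrative names; assumes MPI_IN_PLACE support):
 * split by a per-node key, reduce within each node, combine the node
 * roots, then broadcast the result back within each node.
 */
#include <mpi.h>

static void twostep_sumd(double r[],int nr,int hostnum)
{
    MPI_Comm intra,inter;
    int      rank,rank_intra;

    MPI_Comm_rank(MPI_COMM_WORLD,&rank);
    /* Intra-node communicator: ranks with equal hostnum end up together */
    MPI_Comm_split(MPI_COMM_WORLD,hostnum,rank,&intra);
    MPI_Comm_rank(intra,&rank_intra);
    /* Inter-node communicator; only the one for rank_intra==0 is used */
    MPI_Comm_split(MPI_COMM_WORLD,rank_intra,rank,&inter);

    if (rank_intra == 0)
    {
        /* Sum within the node, then sum the node roots across nodes */
        MPI_Reduce(MPI_IN_PLACE,r,nr,MPI_DOUBLE,MPI_SUM,0,intra);
        MPI_Allreduce(MPI_IN_PLACE,r,nr,MPI_DOUBLE,MPI_SUM,inter);
    }
    else
    {
        MPI_Reduce(r,NULL,nr,MPI_DOUBLE,MPI_SUM,0,intra);
    }
    /* Distribute the global sum back to all ranks on the node */
    MPI_Bcast(r,nr,MPI_DOUBLE,0,intra);

    MPI_Comm_free(&inter);
    MPI_Comm_free(&intra);
}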
 +
 +void gmx_init_intranode_counters(t_commrec *cr)
 +{
 +    /* counters for PP+PME and PP-only processes on my physical node */
 +    int nrank_intranode, rank_intranode;
 +    int nrank_pp_intranode, rank_pp_intranode;
 +    /* thread-MPI is not initialized when not running in parallel */
 +#if defined GMX_MPI && !defined GMX_THREAD_MPI
 +    int nrank_world, rank_world;
 +    int i, mynum, *num, *num_s, *num_pp, *num_pp_s;
 +
 +    MPI_Comm_size(MPI_COMM_WORLD,&nrank_world);
 +    MPI_Comm_rank(MPI_COMM_WORLD,&rank_world);
 +
 +    /* Get the node number from the hostname to identify the nodes */
 +    mynum = gmx_hostname_num();
 +
 +    /* We can't rely on MPI_IN_PLACE, so we need send and receive buffers */
 +    snew(num,   nrank_world);
 +    snew(num_s, nrank_world);
 +    snew(num_pp,   nrank_world);
 +    snew(num_pp_s, nrank_world);
 +
 +    num_s[rank_world]    = mynum;
 +    num_pp_s[rank_world] = (cr->duty & DUTY_PP) ? mynum : -1;
 +
 +    MPI_Allreduce(num_s,    num,    nrank_world, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
 +    MPI_Allreduce(num_pp_s, num_pp, nrank_world, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
 +
 +    nrank_intranode    = 0;
 +    rank_intranode     = 0;
 +    nrank_pp_intranode = 0;
 +    rank_pp_intranode  = 0;
 +    for(i=0; i<nrank_world; i++)
 +    {
 +        if (num[i] == mynum)
 +        {
 +            nrank_intranode++;
 +            if (i < rank_world)
 +            {
 +                rank_intranode++;
 +            }
 +        }
 +        if ((cr->duty & DUTY_PP) && num_pp[i] == mynum)
 +        {
 +            nrank_pp_intranode++;
 +            if (i < rank_world)
 +            {
 +                rank_pp_intranode++;
 +            }
 +        }
 +    }
 +    sfree(num);
 +    sfree(num_s);
 +    sfree(num_pp);
 +    sfree(num_pp_s);
 +#else
 +    /* Serial or thread-MPI code: we run within a single physical node */
 +    nrank_intranode    = cr->nnodes;
 +    rank_intranode     = cr->sim_nodeid;
 +    nrank_pp_intranode = cr->nnodes - cr->npmenodes;
 +    rank_pp_intranode  = cr->nodeid;
 +#endif
 +
 +    if (debug)
 +    {
 +        char sbuf[STRLEN];
 +        if (cr->duty & DUTY_PP && cr->duty & DUTY_PME)
 +        {
 +            sprintf(sbuf, "PP+PME");
 +        }
 +        else
 +        {
 +            sprintf(sbuf, "%s", cr->duty & DUTY_PP ? "PP" : "PME");
 +        }
 +        fprintf(debug, "On %3s node %d: nrank_intranode=%d, rank_intranode=%d, "
 +                "nrank_pp_intranode=%d, rank_pp_intranode=%d\n",
 +                sbuf, cr->sim_nodeid,
 +                nrank_intranode, rank_intranode,
 +                nrank_pp_intranode, rank_pp_intranode);
 +    }
 +
 +    cr->nrank_intranode    = nrank_intranode;
 +    cr->rank_intranode     = rank_intranode;
 +    cr->nrank_pp_intranode = nrank_pp_intranode;
 +    cr->rank_pp_intranode  = rank_pp_intranode;
 +}
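/* Editor's note, a worked example of the counting above (illustrative
 * values): with four ranks whose gathered host numbers are
 * num = {5, 5, 7, 7}, rank 2 (mynum == 7) sees two matches, so
 * nrank_intranode == 2; no match has a smaller world rank, so
 * rank_intranode == 0, and rank 3 correspondingly gets
 * rank_intranode == 1.
 */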
 +
 +
 +void gmx_barrier(const t_commrec *cr)
 +{
 +#ifndef GMX_MPI
 +  gmx_call("gmx_barrier");
 +#else
 +  MPI_Barrier(cr->mpi_comm_mygroup);
 +#endif
 +}
 +
 +void gmx_abort(int noderank,int nnodes,int errorno)
 +{
 +#ifndef GMX_MPI
 +  gmx_call("gmx_abort");
 +#else
 +#ifdef GMX_THREAD_MPI
 +  fprintf(stderr,"Halting program %s\n",ShortProgram());
 +  thanx(stderr);
 +  exit(1);
 +#else
 +  if (nnodes > 1)
 +  {
 +      fprintf(stderr,"Halting parallel program %s on CPU %d out of %d\n",
 +              ShortProgram(),noderank,nnodes);
 +  }
 +  else
 +  {
 +      fprintf(stderr,"Halting program %s\n",ShortProgram());
 +  }
 +
 +  thanx(stderr);
 +  MPI_Abort(MPI_COMM_WORLD,errorno);
 +  exit(1);
 +#endif
 +#endif
 +}
 +
 +void gmx_bcast(int nbytes,void *b,const t_commrec *cr)
 +{
 +#ifndef GMX_MPI
 +  gmx_call("gmx_bast");
 +#else
 +  MPI_Bcast(b,nbytes,MPI_BYTE,MASTERRANK(cr),cr->mpi_comm_mygroup);
 +#endif
 +}
 +
 +void gmx_bcast_sim(int nbytes,void *b,const t_commrec *cr)
 +{
 +#ifndef GMX_MPI
 +  gmx_call("gmx_bast");
 +#else
 +  MPI_Bcast(b,nbytes,MPI_BYTE,MASTERRANK(cr),cr->mpi_comm_mysim);
 +#endif
 +}
 +
 +void gmx_sumd(int nr,double r[],const t_commrec *cr)
 +{
 +#ifndef GMX_MPI
 +    gmx_call("gmx_sumd");
 +#else
 +#if defined(MPI_IN_PLACE_EXISTS) || defined(GMX_THREAD_MPI)
 +    if (cr->nc.bUse) {
 +        if (cr->nc.rank_intra == 0)
 +        {
 +            /* Use two step summing. */
 +            MPI_Reduce(MPI_IN_PLACE,r,nr,MPI_DOUBLE,MPI_SUM,0,
 +                       cr->nc.comm_intra);
 +            /* Sum the roots of the internal (intra) buffers. */
 +            MPI_Allreduce(MPI_IN_PLACE,r,nr,MPI_DOUBLE,MPI_SUM,
 +                          cr->nc.comm_inter);
 +        }
 +        else
 +        {
 +            /* This is here because of the silly MPI specification
 +                that MPI_IN_PLACE should be put in sendbuf instead of recvbuf */
 +            MPI_Reduce(r,NULL,nr,MPI_DOUBLE,MPI_SUM,0,cr->nc.comm_intra);
 +        }
 +        MPI_Bcast(r,nr,MPI_DOUBLE,0,cr->nc.comm_intra);
 +    } 
 +    else 
 +    {
 +        MPI_Allreduce(MPI_IN_PLACE,r,nr,MPI_DOUBLE,MPI_SUM, 
 +                      cr->mpi_comm_mygroup);
 +    }
 +#else
 +    int i;
 +
 +    if (nr > cr->mpb->dbuf_alloc) {
 +        cr->mpb->dbuf_alloc = nr;
 +        srenew(cr->mpb->dbuf,cr->mpb->dbuf_alloc);
 +    }
 +    if (cr->nc.bUse) {
 +        /* Use two step summing */
 +        MPI_Allreduce(r,cr->mpb->dbuf,nr,MPI_DOUBLE,MPI_SUM,cr->nc.comm_intra);
 +        if (cr->nc.rank_intra == 0) {
 +            /* Sum with the buffers reversed */
 +            MPI_Allreduce(cr->mpb->dbuf,r,nr,MPI_DOUBLE,MPI_SUM, 
 +                          cr->nc.comm_inter);
 +        }
 +        MPI_Bcast(r,nr,MPI_DOUBLE,0,cr->nc.comm_intra);
 +    } else {
 +        MPI_Allreduce(r,cr->mpb->dbuf,nr,MPI_DOUBLE,MPI_SUM,
 +                      cr->mpi_comm_mygroup);
 +        for(i=0; i<nr; i++)
 +            r[i] = cr->mpb->dbuf[i];
 +    }
 +#endif
 +#endif
 +}
 +
 +void gmx_sumf(int nr,float r[],const t_commrec *cr)
 +{
 +#ifndef GMX_MPI
 +    gmx_call("gmx_sumf");
 +#else
 +#if defined(MPI_IN_PLACE_EXISTS) || defined(GMX_THREAD_MPI)
 +    if (cr->nc.bUse) {
 +        /* Use two step summing.  */
 +        if (cr->nc.rank_intra == 0)
 +        {
 +            MPI_Reduce(MPI_IN_PLACE,r,nr,MPI_FLOAT,MPI_SUM,0,
 +                       cr->nc.comm_intra);
 +            /* Sum the roots of the internal (intra) buffers */
 +            MPI_Allreduce(MPI_IN_PLACE,r,nr,MPI_FLOAT,MPI_SUM,
 +                          cr->nc.comm_inter);
 +        }
 +        else
 +        {
 +            /* This is here because of the silly MPI specification
 +                that MPI_IN_PLACE should be put in sendbuf instead of recvbuf */
 +            MPI_Reduce(r,NULL,nr,MPI_FLOAT,MPI_SUM,0,cr->nc.comm_intra);
 +        }
 +        MPI_Bcast(r,nr,MPI_FLOAT,0,cr->nc.comm_intra);
 +    } 
 +    else 
 +    {
 +        MPI_Allreduce(MPI_IN_PLACE,r,nr,MPI_FLOAT,MPI_SUM,cr->mpi_comm_mygroup);
 +    }
 +#else
 +    int i;
 +
 +    if (nr > cr->mpb->fbuf_alloc) {
 +        cr->mpb->fbuf_alloc = nr;
 +        srenew(cr->mpb->fbuf,cr->mpb->fbuf_alloc);
 +    }
 +    if (cr->nc.bUse) {
 +        /* Use two step summing */
 +        MPI_Allreduce(r,cr->mpb->fbuf,nr,MPI_FLOAT,MPI_SUM,cr->nc.comm_intra);
 +        if (cr->nc.rank_intra == 0) {
 +            /* Sum with the buffers reversed */
 +            MPI_Allreduce(cr->mpb->fbuf,r,nr,MPI_FLOAT,MPI_SUM, 
 +                          cr->nc.comm_inter);
 +        }
 +        MPI_Bcast(r,nr,MPI_FLOAT,0,cr->nc.comm_intra);
 +    } else {
 +        MPI_Allreduce(r,cr->mpb->fbuf,nr,MPI_FLOAT,MPI_SUM,
 +                      cr->mpi_comm_mygroup);
 +        for(i=0; i<nr; i++)
 +            r[i] = cr->mpb->fbuf[i];
 +    }
 +#endif
 +#endif
 +}
 +
 +void gmx_sumi(int nr,int r[],const t_commrec *cr)
 +{
 +#ifndef GMX_MPI
 +    gmx_call("gmx_sumi");
 +#else
 +#if defined(MPI_IN_PLACE_EXISTS) || defined(GMX_THREAD_MPI)
 +    if (cr->nc.bUse) {
 +        /* Use two step summing */
 +        if (cr->nc.rank_intra == 0) 
 +        {
 +            MPI_Reduce(MPI_IN_PLACE,r,nr,MPI_INT,MPI_SUM,0,cr->nc.comm_intra);
 +            /* Sum with the buffers reversed */
 +            MPI_Allreduce(MPI_IN_PLACE,r,nr,MPI_INT,MPI_SUM,cr->nc.comm_inter);
 +        }
 +        else
 +        {
 +            /* This is here because of the silly MPI specification
 +                that MPI_IN_PLACE should be put in sendbuf instead of recvbuf */
 +            MPI_Reduce(r,NULL,nr,MPI_INT,MPI_SUM,0,cr->nc.comm_intra);
 +        }
 +        MPI_Bcast(r,nr,MPI_INT,0,cr->nc.comm_intra);
 +    } 
 +    else 
 +    {
 +        MPI_Allreduce(MPI_IN_PLACE,r,nr,MPI_INT,MPI_SUM,cr->mpi_comm_mygroup);
 +    }
 +#else
 +    int i;
 +
 +    if (nr > cr->mpb->ibuf_alloc) {
 +        cr->mpb->ibuf_alloc = nr;
 +        srenew(cr->mpb->ibuf,cr->mpb->ibuf_alloc);
 +    }
 +    if (cr->nc.bUse) {
 +        /* Use two step summing */
 +        MPI_Allreduce(r,cr->mpb->ibuf,nr,MPI_INT,MPI_SUM,cr->nc.comm_intra);
 +        if (cr->nc.rank_intra == 0) {
 +            /* Sum with the buffers reversed */
 +            MPI_Allreduce(cr->mpb->ibuf,r,nr,MPI_INT,MPI_SUM,cr->nc.comm_inter);
 +        }
 +        MPI_Bcast(r,nr,MPI_INT,0,cr->nc.comm_intra);
 +    } else {
 +        MPI_Allreduce(r,cr->mpb->ibuf,nr,MPI_INT,MPI_SUM,cr->mpi_comm_mygroup);
 +        for(i=0; i<nr; i++)
 +            r[i] = cr->mpb->ibuf[i];
 +    }
 +#endif
 +#endif
 +}
 +
 +void gmx_sumli(int nr,gmx_large_int_t r[],const t_commrec *cr)
 +{
 +#ifndef GMX_MPI
 +    gmx_call("gmx_sumli");
 +#else
 +#if defined(MPI_IN_PLACE_EXISTS) || defined(GMX_THREAD_MPI)
 +    if (cr->nc.bUse) {
 +        /* Use two step summing */
 +        if (cr->nc.rank_intra == 0) 
 +        {
 +            MPI_Reduce(MPI_IN_PLACE,r,nr,GMX_MPI_LARGE_INT,MPI_SUM,0,
 +                       cr->nc.comm_intra);
 +            /* Sum with the buffers reversed */
 +            MPI_Allreduce(MPI_IN_PLACE,r,nr,GMX_MPI_LARGE_INT,MPI_SUM,
 +                          cr->nc.comm_inter);
 +        }
 +        else
 +        {
 +            /* This is here because of the silly MPI specification
 +                that MPI_IN_PLACE should be put in sendbuf instead of recvbuf */
 +            MPI_Reduce(r,NULL,nr,GMX_MPI_LARGE_INT,MPI_SUM,0,cr->nc.comm_intra);
 +        }
 +        MPI_Bcast(r,nr,GMX_MPI_LARGE_INT,0,cr->nc.comm_intra);
 +    } 
 +    else 
 +    {
 +        MPI_Allreduce(MPI_IN_PLACE,r,nr,GMX_MPI_LARGE_INT,MPI_SUM,cr->mpi_comm_mygroup);
 +    }
 +#else
 +    int i;
 +
 +    if (nr > cr->mpb->libuf_alloc) {
 +        cr->mpb->libuf_alloc = nr;
 +        srenew(cr->mpb->libuf,cr->mpb->libuf_alloc);
 +    }
 +    if (cr->nc.bUse) {
 +        /* Use two step summing */
 +        MPI_Allreduce(r,cr->mpb->libuf,nr,GMX_MPI_LARGE_INT,MPI_SUM,
 +                      cr->nc.comm_intra);
 +        if (cr->nc.rank_intra == 0) {
 +            /* Sum with the buffers reversed */
 +            MPI_Allreduce(cr->mpb->libuf,r,nr,GMX_MPI_LARGE_INT,MPI_SUM,
 +                          cr->nc.comm_inter);
 +        }
 +        MPI_Bcast(r,nr,GMX_MPI_LARGE_INT,0,cr->nc.comm_intra);
 +    } else {
 +        MPI_Allreduce(r,cr->mpb->libuf,nr,GMX_MPI_LARGE_INT,MPI_SUM,
 +                      cr->mpi_comm_mygroup);
 +        for(i=0; i<nr; i++)
 +            r[i] = cr->mpb->libuf[i];
 +    }
 +#endif
 +#endif
 +}
 +
 +
 +
 +#ifdef GMX_MPI
 +void gmx_sumd_comm(int nr,double r[],MPI_Comm mpi_comm)
 +{
 +#if defined(MPI_IN_PLACE_EXISTS) || defined(GMX_THREAD_MPI)
 +    MPI_Allreduce(MPI_IN_PLACE,r,nr,MPI_DOUBLE,MPI_SUM,mpi_comm);
 +#else
 +    /* This function is only used in code that is not performance-critical
 +       (during setup, when comm_rec is not the appropriate communication
 +       structure), so this isn't as bad as it looks. */
 +    double *buf;
 +    int i;
 +
 +    snew(buf, nr);
 +    MPI_Allreduce(r,buf,nr,MPI_DOUBLE,MPI_SUM,mpi_comm);
 +    for(i=0; i<nr; i++)
 +        r[i] = buf[i];
 +    sfree(buf);
 +#endif
 +}
 +#endif
 +
 +#ifdef GMX_MPI
 +void gmx_sumf_comm(int nr,float r[],MPI_Comm mpi_comm)
 +{
 +#if defined(MPI_IN_PLACE_EXISTS) || defined(GMX_THREAD_MPI)
 +    MPI_Allreduce(MPI_IN_PLACE,r,nr,MPI_FLOAT,MPI_SUM,mpi_comm);
 +#else
 +    /* This function is only used in code that is not performance-critical
 +       (during setup, when comm_rec is not the appropriate communication
 +       structure), so this isn't as bad as it looks. */
 +    float *buf;
 +    int i;
 +
 +    snew(buf, nr);
 +    MPI_Allreduce(r,buf,nr,MPI_FLOAT,MPI_SUM,mpi_comm);
 +    for(i=0; i<nr; i++)
 +        r[i] = buf[i];
 +    sfree(buf);
 +#endif
 +}
 +#endif
 +
 +void gmx_sumd_sim(int nr,double r[],const gmx_multisim_t *ms)
 +{
 +#ifndef GMX_MPI
 +  gmx_call("gmx_sumd_sim");
 +#else
 +  gmx_sumd_comm(nr,r,ms->mpi_comm_masters);
 +#endif
 +}
 +
 +void gmx_sumf_sim(int nr,float r[],const gmx_multisim_t *ms)
 +{
 +#ifndef GMX_MPI
 +  gmx_call("gmx_sumf_sim");
 +#else
 +  gmx_sumf_comm(nr,r,ms->mpi_comm_masters);
 +#endif
 +}
 +
 +void gmx_sumi_sim(int nr,int r[], const gmx_multisim_t *ms)
 +{
 +#ifndef GMX_MPI
 +    gmx_call("gmx_sumi_sim");
 +#else
 +#if defined(MPI_IN_PLACE_EXISTS) || defined(GMX_THREAD_MPI)
 +    MPI_Allreduce(MPI_IN_PLACE,r,nr,MPI_INT,MPI_SUM,ms->mpi_comm_masters);
 +#else
 +    /* this is thread-unsafe, but it will do for now: */
 +    int i;
 +
 +    if (nr > ms->mpb->ibuf_alloc) {
 +        ms->mpb->ibuf_alloc = nr;
 +        srenew(ms->mpb->ibuf,ms->mpb->ibuf_alloc);
 +    }
 +    MPI_Allreduce(r,ms->mpb->ibuf,nr,MPI_INT,MPI_SUM,ms->mpi_comm_masters);
 +    for(i=0; i<nr; i++)
 +        r[i] = ms->mpb->ibuf[i];
 +#endif
 +#endif
 +}
 +
 +void gmx_sumli_sim(int nr,gmx_large_int_t r[], const gmx_multisim_t *ms)
 +{
 +#ifndef GMX_MPI
 +    gmx_call("gmx_sumli_sim");
 +#else
 +#if defined(MPI_IN_PLACE_EXISTS) || defined(GMX_THREAD_MPI)
 +    MPI_Allreduce(MPI_IN_PLACE,r,nr,GMX_MPI_LARGE_INT,MPI_SUM,
 +                  ms->mpi_comm_masters);
 +#else
 +    /* this is thread-unsafe, but it will do for now: */
 +    int i;
 +
 +    if (nr > ms->mpb->libuf_alloc) {
 +        ms->mpb->libuf_alloc = nr;
 +        srenew(ms->mpb->libuf,ms->mpb->libuf_alloc);
 +    }
 +    MPI_Allreduce(r,ms->mpb->libuf,nr,GMX_MPI_LARGE_INT,MPI_SUM,
 +                  ms->mpi_comm_masters);
 +    for(i=0; i<nr; i++)
 +        r[i] = ms->mpb->libuf[i];
 +#endif
 +#endif
 +}
 +
 +
 +void gmx_finalize_par(void)
 +{
 +#ifndef GMX_MPI
 +    /* Compiled without MPI, no MPI finalizing needed */
 +    return;
 +#else
 +    int initialized,finalized;
 +    int ret;
 +
 +    MPI_Initialized(&initialized);
 +    if (!initialized)
 +    {
 +        return;
 +    }
 +    /* just as a check; we don't want to finalize twice */
 +    MPI_Finalized(&finalized);
 +    if (finalized)
 +    {
 +      return;
 +    }
 +
 +  /* We sync the processes here to try to avoid problems
 +   * with buggy MPI implementations that could cause
 +   * unfinished processes to terminate.
 +   */
 +  MPI_Barrier(MPI_COMM_WORLD);
 +
 +  /*
 +  if (DOMAINDECOMP(cr)) {
 +    if (cr->npmenodes > 0 || cr->dd->bCartesian) 
 +      MPI_Comm_free(&cr->mpi_comm_mygroup);
 +    if (cr->dd->bCartesian)
 +      MPI_Comm_free(&cr->mpi_comm_mysim);
 +  }
 +  */
 +
 +  /* Apparently certain mpich implementations cause problems
 +   * with MPI_Finalize. In that case comment out MPI_Finalize.
 +   */
 +  if (debug)
 +    fprintf(debug,"Will call MPI_Finalize now\n");
 +
 +  ret = MPI_Finalize();
 +  if (debug)
 +    fprintf(debug,"Return code from MPI_Finalize = %d\n",ret);
 +#endif
 +}
 +
index 8e3b8debc185ccf527120223eb9809391cbcb9e0,0000000000000000000000000000000000000000..c6255e34cda8fb667d76a4524f96231a299e40b9
mode 100644,000000..100644
--- /dev/null
@@@ -1,843 -1,0 +1,845 @@@
 +/*
 + * Note: this file was generated by the Gromacs avx_128_fma_double kernel generator.
 + *
 + *                This source code is part of
 + *
 + *                 G   R   O   M   A   C   S
 + *
 + * Copyright (c) 2001-2012, The GROMACS Development Team
 + *
 + * Gromacs is a library for molecular simulation and trajectory analysis,
 + * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
 + * a full list of developers and information, check out http://www.gromacs.org
 + *
 + * This program is free software; you can redistribute it and/or modify it under
 + * the terms of the GNU Lesser General Public License as published by the Free
 + * Software Foundation; either version 2 of the License, or (at your option) any
 + * later version.
 + *
 + * To help fund GROMACS development, we humbly ask that you cite
 + * the papers people have written on it - you can find them on the website.
 + */
 +#ifdef HAVE_CONFIG_H
 +#include <config.h>
 +#endif
 +
 +#include <math.h>
 +
 +#include "../nb_kernel.h"
 +#include "types/simple.h"
 +#include "vec.h"
 +#include "nrnb.h"
 +
 +#include "gmx_math_x86_avx_128_fma_double.h"
 +#include "kernelutil_x86_avx_128_fma_double.h"
 +
 +/*
 + * Gromacs nonbonded kernel:   nb_kernel_ElecGB_VdwCSTab_GeomP1P1_VF_avx_128_fma_double
 + * Electrostatics interaction: GeneralizedBorn
 + * VdW interaction:            CubicSplineTable
 + * Geometry:                   Particle-Particle
 + * Calculate force/pot:        PotentialAndForce
 + */
 +void
 +nb_kernel_ElecGB_VdwCSTab_GeomP1P1_VF_avx_128_fma_double
 +                    (t_nblist * gmx_restrict                nlist,
 +                     rvec * gmx_restrict                    xx,
 +                     rvec * gmx_restrict                    ff,
 +                     t_forcerec * gmx_restrict              fr,
 +                     t_mdatoms * gmx_restrict               mdatoms,
 +                     nb_kernel_data_t * gmx_restrict        kernel_data,
 +                     t_nrnb * gmx_restrict                  nrnb)
 +{
 +    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
 +     * just 0 for non-waters.
 +     * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
 +     * jnr indices corresponding to data put in the two positions in the SIMD register.
 +     */
 +    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
 +    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
 +    int              jnrA,jnrB;
 +    int              j_coord_offsetA,j_coord_offsetB;
 +    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
 +    real             rcutoff_scalar;
 +    real             *shiftvec,*fshift,*x,*f;
 +    __m128d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
 +    int              vdwioffset0;
 +    __m128d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
 +    int              vdwjidx0A,vdwjidx0B;
 +    __m128d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
 +    __m128d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
 +    __m128d          velec,felec,velecsum,facel,crf,krf,krf2;
 +    real             *charge;
 +    __m128i          gbitab;
 +    __m128d          vgb,fgb,vgbsum,dvdasum,gbscale,gbtabscale,isaprod,gbqqfactor,gbinvepsdiff,dvdaj,gbeps,twogbeps,dvdatmp;
 +    __m128d          minushalf = _mm_set1_pd(-0.5);
 +    real             *invsqrta,*dvda,*gbtab;
 +    int              nvdwtype;
 +    __m128d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
 +    int              *vdwtype;
 +    real             *vdwparam;
 +    __m128d          one_sixth   = _mm_set1_pd(1.0/6.0);
 +    __m128d          one_twelfth = _mm_set1_pd(1.0/12.0);
 +    __m128i          vfitab;
 +    __m128i          ifour       = _mm_set1_epi32(4);
 +    __m128d          rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
 +    real             *vftab;
 +    __m128d          dummy_mask,cutoff_mask;
 +    __m128d          signbit   = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
 +    __m128d          one     = _mm_set1_pd(1.0);
 +    __m128d          two     = _mm_set1_pd(2.0);
 +    x                = xx[0];
 +    f                = ff[0];
 +
 +    nri              = nlist->nri;
 +    iinr             = nlist->iinr;
 +    jindex           = nlist->jindex;
 +    jjnr             = nlist->jjnr;
 +    shiftidx         = nlist->shift;
 +    gid              = nlist->gid;
 +    shiftvec         = fr->shift_vec[0];
 +    fshift           = fr->fshift[0];
 +    facel            = _mm_set1_pd(fr->epsfac);
 +    charge           = mdatoms->chargeA;
 +    nvdwtype         = fr->ntype;
 +    vdwparam         = fr->nbfp;
 +    vdwtype          = mdatoms->typeA;
 +
 +    vftab            = kernel_data->table_vdw->data;
 +    vftabscale       = _mm_set1_pd(kernel_data->table_vdw->scale);
 +
 +    invsqrta         = fr->invsqrta;
 +    dvda             = fr->dvda;
 +    gbtabscale       = _mm_set1_pd(fr->gbtab.scale);
 +    gbtab            = fr->gbtab.data;
 +    gbinvepsdiff     = _mm_set1_pd((1.0/fr->epsilon_r) - (1.0/fr->gb_epsilon_solvent));
 +
 +    /* Avoid stupid compiler warnings */
 +    jnrA = jnrB = 0;
 +    j_coord_offsetA = 0;
 +    j_coord_offsetB = 0;
 +
 +    outeriter        = 0;
 +    inneriter        = 0;
 +
 +    /* Start outer loop over neighborlists */
 +    for(iidx=0; iidx<nri; iidx++)
 +    {
 +        /* Load shift vector for this list */
 +        i_shift_offset   = DIM*shiftidx[iidx];
 +
 +        /* Load limits for loop over neighbors */
 +        j_index_start    = jindex[iidx];
 +        j_index_end      = jindex[iidx+1];
 +
 +        /* Get outer coordinate index */
 +        inr              = iinr[iidx];
 +        i_coord_offset   = DIM*inr;
 +
 +        /* Load i particle coords and add shift vector */
 +        gmx_mm_load_shift_and_1rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
 +
 +        fix0             = _mm_setzero_pd();
 +        fiy0             = _mm_setzero_pd();
 +        fiz0             = _mm_setzero_pd();
 +
 +        /* Load parameters for i particles */
 +        iq0              = _mm_mul_pd(facel,_mm_load1_pd(charge+inr+0));
 +        isai0            = _mm_load1_pd(invsqrta+inr+0);
 +        vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
 +
 +        /* Reset potential sums */
 +        velecsum         = _mm_setzero_pd();
 +        vgbsum           = _mm_setzero_pd();
 +        vvdwsum          = _mm_setzero_pd();
 +        dvdasum          = _mm_setzero_pd();
 +
 +        /* Start inner kernel loop */
 +        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
 +        {
 +
 +            /* Get j neighbor index, and coordinate index */
 +            jnrA             = jjnr[jidx];
 +            jnrB             = jjnr[jidx+1];
 +            j_coord_offsetA  = DIM*jnrA;
 +            j_coord_offsetB  = DIM*jnrB;
 +
 +            /* load j atom coordinates */
 +            gmx_mm_load_1rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
 +                                              &jx0,&jy0,&jz0);
 +
 +            /* Calculate displacement vector */
 +            dx00             = _mm_sub_pd(ix0,jx0);
 +            dy00             = _mm_sub_pd(iy0,jy0);
 +            dz00             = _mm_sub_pd(iz0,jz0);
 +
 +            /* Calculate squared distance and things based on it */
 +            rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
 +
 +            rinv00           = gmx_mm_invsqrt_pd(rsq00);
 +
 +            /* Load parameters for j particles */
 +            jq0              = gmx_mm_load_2real_swizzle_pd(charge+jnrA+0,charge+jnrB+0);
 +            isaj0            = gmx_mm_load_2real_swizzle_pd(invsqrta+jnrA+0,invsqrta+jnrB+0);
 +            vdwjidx0A        = 2*vdwtype[jnrA+0];
 +            vdwjidx0B        = 2*vdwtype[jnrB+0];
 +
 +            /**************************
 +             * CALCULATE INTERACTIONS *
 +             **************************/
 +
 +            r00              = _mm_mul_pd(rsq00,rinv00);
 +
 +            /* Compute parameters for interactions between i and j atoms */
 +            qq00             = _mm_mul_pd(iq0,jq0);
 +            gmx_mm_load_2pair_swizzle_pd(vdwparam+vdwioffset0+vdwjidx0A,
 +                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
 +
 +            /* Calculate table index by multiplying r with table scale and truncating to integer */
 +            rt               = _mm_mul_pd(r00,vftabscale);
 +            vfitab           = _mm_cvttpd_epi32(rt);
 +#ifdef __XOP__
 +            vfeps            = _mm_frcz_pd(rt);
 +#else
 +            vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
 +#endif
 +            twovfeps         = _mm_add_pd(vfeps,vfeps);
 +            vfitab           = _mm_slli_epi32(vfitab,3);
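 +            /* rt is the distance in table units; its fractional part is the
 +             * interpolation offset eps within the interval.  AMD XOP provides
 +             * _mm_frcz_pd to extract it directly, while the portable branch
 +             * subtracts the floored value.  The shift by 3 multiplies the
 +             * index by the table stride of 8 reals per point: one Y,F,G,H
 +             * spline quadruplet for dispersion and one for repulsion
 +             * (reached by adding ifour below).
 +             */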
 +
 +            /* GENERALIZED BORN AND COULOMB ELECTROSTATICS */
 +            isaprod          = _mm_mul_pd(isai0,isaj0);
 +            gbqqfactor       = _mm_xor_pd(signbit,_mm_mul_pd(qq00,_mm_mul_pd(isaprod,gbinvepsdiff)));
 +            gbscale          = _mm_mul_pd(isaprod,gbtabscale);
 +
 +            /* Calculate generalized born table index - this is a separate table from the normal one,
 +             * but we use the same procedure by multiplying r with scale and truncating to integer.
 +             */
 +            rt               = _mm_mul_pd(r00,gbscale);
 +            gbitab           = _mm_cvttpd_epi32(rt);
 +#ifdef __XOP__
 +            gbeps            = _mm_frcz_pd(rt);
 +#else
 +            gbeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
 +#endif
 +            gbitab           = _mm_slli_epi32(gbitab,2);
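 +            /* The GB table stores a single Y,F,G,H quadruplet per point
 +             * (stride 4, hence the shift by 2) and is indexed with the
 +             * per-pair scale gbscale = isaprod*gbtabscale.
 +             */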
 +
 +            Y                = _mm_load_pd( gbtab + _mm_extract_epi32(gbitab,0) );
 +            F                = _mm_load_pd( gbtab + _mm_extract_epi32(gbitab,1) );
 +            GMX_MM_TRANSPOSE2_PD(Y,F);
 +            G                = _mm_load_pd( gbtab + _mm_extract_epi32(gbitab,0) +2);
 +            H                = _mm_load_pd( gbtab + _mm_extract_epi32(gbitab,1) +2);
 +            GMX_MM_TRANSPOSE2_PD(G,H);
 +            Fp               = _mm_macc_pd(gbeps,_mm_macc_pd(gbeps,H,G),F);
 +            VV               = _mm_macc_pd(gbeps,Fp,Y);
 +            vgb              = _mm_mul_pd(gbqqfactor,VV);
 +
 +            twogbeps         = _mm_add_pd(gbeps,gbeps);
 +            FF               = _mm_macc_pd(_mm_macc_pd(twogbeps,H,G),gbeps,Fp);
 +            fgb              = _mm_mul_pd(gbqqfactor,_mm_mul_pd(FF,gbscale));
 +            dvdatmp          = _mm_mul_pd(minushalf,_mm_macc_pd(fgb,r00,vgb));
 +            dvdasum          = _mm_add_pd(dvdasum,dvdatmp);
 +            gmx_mm_increment_2real_swizzle_pd(dvda+jnrA,dvda+jnrB,_mm_mul_pd(dvdatmp,_mm_mul_pd(isaj0,isaj0)));
 +            velec            = _mm_mul_pd(qq00,rinv00);
 +            felec            = _mm_mul_pd(_mm_msub_pd(velec,rinv00,fgb),rinv00);
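 +            /* Each Y,F,G,H quadruplet defines a cubic spline segment:
 +             *   VV = Y + eps*F + eps^2*G + eps^3*H        (potential)
 +             *   FF = F + 2*eps*G + 3*eps^2*H = dVV/deps   (derivative)
 +             * so fgb = gbqqfactor*FF*gbscale is dVgb/dr.  dvdatmp,
 +             * -(Vgb + r*dVgb/dr)/2, is the chain-rule contribution to
 +             * dV/d(Born radius), shared between atoms i and j and scaled
 +             * by isai^2 and isaj^2 on accumulation.  felec is the combined
 +             * Coulomb+GB force divided by r, ready to be applied along the
 +             * displacement components.
 +             */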
 +
 +            /* CUBIC SPLINE TABLE DISPERSION */
 +            Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
 +            F                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
 +            GMX_MM_TRANSPOSE2_PD(Y,F);
 +            G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
 +            H                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
 +            GMX_MM_TRANSPOSE2_PD(G,H);
 +            Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(H,vfeps,G),F);
 +            VV               = _mm_macc_pd(vfeps,Fp,Y);
 +            vvdw6            = _mm_mul_pd(c6_00,VV);
 +            FF               = _mm_macc_pd(vfeps,_mm_macc_pd(twovfeps,H,G),Fp);
 +            fvdw6            = _mm_mul_pd(c6_00,FF);
 +
 +            /* CUBIC SPLINE TABLE REPULSION */
 +            vfitab           = _mm_add_epi32(vfitab,ifour);
 +            Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
 +            F                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
 +            GMX_MM_TRANSPOSE2_PD(Y,F);
 +            G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
 +            H                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
 +            GMX_MM_TRANSPOSE2_PD(G,H);
 +            Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(H,vfeps,G),F);
 +            VV               = _mm_macc_pd(vfeps,Fp,Y);
 +            vvdw12           = _mm_mul_pd(c12_00,VV);
 +            FF               = _mm_macc_pd(vfeps,_mm_macc_pd(twovfeps,H,G),Fp);
 +            fvdw12           = _mm_mul_pd(c12_00,FF);
 +            vvdw             = _mm_add_pd(vvdw12,vvdw6);
 +            fvdw             = _mm_xor_pd(signbit,_mm_mul_pd(_mm_add_pd(fvdw6,fvdw12),_mm_mul_pd(vftabscale,rinv00)));
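 +            /* FF from the tables is dV/deps and deps/dr = vftabscale, so
 +             * the scalar force is -(fvdw6+fvdw12)*vftabscale*rinv00 (force
 +             * divided by r); the XOR with signbit does the negation.
 +             */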
 +
 +            /* Update potential sum for this i atom from the interaction with this j atom. */
 +            velecsum         = _mm_add_pd(velecsum,velec);
 +            vgbsum           = _mm_add_pd(vgbsum,vgb);
 +            vvdwsum          = _mm_add_pd(vvdwsum,vvdw);
 +
 +            fscal            = _mm_add_pd(felec,fvdw);
 +
 +            /* Update vectorial force */
 +            fix0             = _mm_macc_pd(dx00,fscal,fix0);
 +            fiy0             = _mm_macc_pd(dy00,fscal,fiy0);
 +            fiz0             = _mm_macc_pd(dz00,fscal,fiz0);
 +            
 +            gmx_mm_decrement_1rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,
 +                                                   _mm_mul_pd(dx00,fscal),
 +                                                   _mm_mul_pd(dy00,fscal),
 +                                                   _mm_mul_pd(dz00,fscal));
 +
 +            /* Inner loop uses 95 flops */
 +        }
 +
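 +        /* The loop above consumes j atoms in pairs, matching the two lanes
 +         * of __m128d; an odd-length list leaves one atom for the
 +         * single-lane tail below, which zeroes the unused upper lane
 +         * before accumulating anything.
 +         */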
 +        if(jidx<j_index_end)
 +        {
 +
 +            jnrA             = jjnr[jidx];
 +            j_coord_offsetA  = DIM*jnrA;
 +
 +            /* load j atom coordinates */
 +            gmx_mm_load_1rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
 +                                              &jx0,&jy0,&jz0);
 +
 +            /* Calculate displacement vector */
 +            dx00             = _mm_sub_pd(ix0,jx0);
 +            dy00             = _mm_sub_pd(iy0,jy0);
 +            dz00             = _mm_sub_pd(iz0,jz0);
 +
 +            /* Calculate squared distance and things based on it */
 +            rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
 +
 +            rinv00           = gmx_mm_invsqrt_pd(rsq00);
 +
 +            /* Load parameters for j particles */
 +            jq0              = _mm_load_sd(charge+jnrA+0);
 +            isaj0            = _mm_load_sd(invsqrta+jnrA+0);
 +            vdwjidx0A        = 2*vdwtype[jnrA+0];
 +
 +            /**************************
 +             * CALCULATE INTERACTIONS *
 +             **************************/
 +
 +            r00              = _mm_mul_pd(rsq00,rinv00);
 +
 +            /* Compute parameters for interactions between i and j atoms */
 +            qq00             = _mm_mul_pd(iq0,jq0);
 +            gmx_mm_load_1pair_swizzle_pd(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
 +
 +            /* Calculate table index by multiplying r with table scale and truncating to integer */
 +            rt               = _mm_mul_pd(r00,vftabscale);
 +            vfitab           = _mm_cvttpd_epi32(rt);
 +#ifdef __XOP__
 +            vfeps            = _mm_frcz_pd(rt);
 +#else
 +            vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
 +#endif
 +            twovfeps         = _mm_add_pd(vfeps,vfeps);
 +            vfitab           = _mm_slli_epi32(vfitab,3);
 +
 +            /* GENERALIZED BORN AND COULOMB ELECTROSTATICS */
 +            isaprod          = _mm_mul_pd(isai0,isaj0);
 +            gbqqfactor       = _mm_xor_pd(signbit,_mm_mul_pd(qq00,_mm_mul_pd(isaprod,gbinvepsdiff)));
 +            gbscale          = _mm_mul_pd(isaprod,gbtabscale);
 +
 +            /* Calculate generalized born table index - this is a separate table from the normal one,
 +             * but we use the same procedure by multiplying r with scale and truncating to integer.
 +             */
 +            rt               = _mm_mul_pd(r00,gbscale);
 +            gbitab           = _mm_cvttpd_epi32(rt);
 +#ifdef __XOP__
 +            gbeps            = _mm_frcz_pd(rt);
 +#else
 +            gbeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
 +#endif
 +            gbitab           = _mm_slli_epi32(gbitab,2);
 +
 +            Y                = _mm_load_pd( gbtab + _mm_extract_epi32(gbitab,0) );
 +            F                = _mm_setzero_pd();
 +            GMX_MM_TRANSPOSE2_PD(Y,F);
 +            G                = _mm_load_pd( gbtab + _mm_extract_epi32(gbitab,0) +2);
 +            H                = _mm_setzero_pd();
 +            GMX_MM_TRANSPOSE2_PD(G,H);
 +            Fp               = _mm_macc_pd(gbeps,_mm_macc_pd(gbeps,H,G),F);
 +            VV               = _mm_macc_pd(gbeps,Fp,Y);
 +            vgb              = _mm_mul_pd(gbqqfactor,VV);
 +
 +            twogbeps         = _mm_add_pd(gbeps,gbeps);
 +            FF               = _mm_macc_pd(_mm_macc_pd(twogbeps,H,G),gbeps,Fp);
 +            fgb              = _mm_mul_pd(gbqqfactor,_mm_mul_pd(FF,gbscale));
 +            dvdatmp          = _mm_mul_pd(minushalf,_mm_macc_pd(fgb,r00,vgb));
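++            /* Only lane 0 holds a real j atom in this tail iteration;
++             * zero the upper lane so that garbage (possibly NaN from the
++             * invsqrt of a padding distance) never reaches dvdasum.
++             */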
++            dvdatmp          = _mm_unpacklo_pd(dvdatmp,_mm_setzero_pd());
 +            dvdasum          = _mm_add_pd(dvdasum,dvdatmp);
 +            gmx_mm_increment_1real_pd(dvda+jnrA,_mm_mul_pd(dvdatmp,_mm_mul_pd(isaj0,isaj0)));
 +            velec            = _mm_mul_pd(qq00,rinv00);
 +            felec            = _mm_mul_pd(_mm_msub_pd(velec,rinv00,fgb),rinv00);
 +
 +            /* CUBIC SPLINE TABLE DISPERSION */
 +            Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
 +            F                = _mm_setzero_pd();
 +            GMX_MM_TRANSPOSE2_PD(Y,F);
 +            G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
 +            H                = _mm_setzero_pd();
 +            GMX_MM_TRANSPOSE2_PD(G,H);
 +            Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(H,vfeps,G),F);
 +            VV               = _mm_macc_pd(vfeps,Fp,Y);
 +            vvdw6            = _mm_mul_pd(c6_00,VV);
 +            FF               = _mm_macc_pd(vfeps,_mm_macc_pd(twovfeps,H,G),Fp);
 +            fvdw6            = _mm_mul_pd(c6_00,FF);
 +
 +            /* CUBIC SPLINE TABLE REPULSION */
 +            vfitab           = _mm_add_epi32(vfitab,ifour);
 +            Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
 +            F                = _mm_setzero_pd();
 +            GMX_MM_TRANSPOSE2_PD(Y,F);
 +            G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
 +            H                = _mm_setzero_pd();
 +            GMX_MM_TRANSPOSE2_PD(G,H);
 +            Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(H,vfeps,G),F);
 +            VV               = _mm_macc_pd(vfeps,Fp,Y);
 +            vvdw12           = _mm_mul_pd(c12_00,VV);
 +            FF               = _mm_macc_pd(vfeps,_mm_macc_pd(twovfeps,H,G),Fp);
 +            fvdw12           = _mm_mul_pd(c12_00,FF);
 +            vvdw             = _mm_add_pd(vvdw12,vvdw6);
 +            fvdw             = _mm_xor_pd(signbit,_mm_mul_pd(_mm_add_pd(fvdw6,fvdw12),_mm_mul_pd(vftabscale,rinv00)));
 +
 +            /* Update potential sum for this i atom from the interaction with this j atom. */
 +            velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
 +            velecsum         = _mm_add_pd(velecsum,velec);
 +            vgb              = _mm_unpacklo_pd(vgb,_mm_setzero_pd());
 +            vgbsum           = _mm_add_pd(vgbsum,vgb);
 +            vvdw             = _mm_unpacklo_pd(vvdw,_mm_setzero_pd());
 +            vvdwsum          = _mm_add_pd(vvdwsum,vvdw);
 +
 +            fscal            = _mm_add_pd(felec,fvdw);
 +
 +            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
 +
 +            /* Update vectorial force */
 +            fix0             = _mm_macc_pd(dx00,fscal,fix0);
 +            fiy0             = _mm_macc_pd(dy00,fscal,fiy0);
 +            fiz0             = _mm_macc_pd(dz00,fscal,fiz0);
 +            
 +            gmx_mm_decrement_1rvec_1ptr_swizzle_pd(f+j_coord_offsetA,
 +                                                   _mm_mul_pd(dx00,fscal),
 +                                                   _mm_mul_pd(dy00,fscal),
 +                                                   _mm_mul_pd(dz00,fscal));
 +
 +            /* Inner loop uses 95 flops */
 +        }
 +
 +        /* End of innermost loop */
 +
 +        gmx_mm_update_iforce_1atom_swizzle_pd(fix0,fiy0,fiz0,
 +                                              f+i_coord_offset,fshift+i_shift_offset);
 +
 +        ggid                        = gid[iidx];
 +        /* Update potential energies */
 +        gmx_mm_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
 +        gmx_mm_update_1pot_pd(vgbsum,kernel_data->energygrp_polarization+ggid);
 +        gmx_mm_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
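 +        /* isai0 is constant over the inner loop, so its square is applied
 +         * to dvdasum once here rather than per j atom.
 +         */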
 +        dvdasum = _mm_mul_pd(dvdasum, _mm_mul_pd(isai0,isai0));
 +        gmx_mm_update_1pot_pd(dvdasum,dvda+inr);
 +
 +        /* Increment number of inner iterations */
 +        inneriter                  += j_index_end - j_index_start;
 +
 +        /* Outer loop uses 10 flops */
 +    }
 +
 +    /* Increment number of outer iterations */
 +    outeriter        += nri;
 +
 +    /* Update outer/inner flops */
 +
 +    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_VF,outeriter*10 + inneriter*95);
 +}
 +/*
 + * Gromacs nonbonded kernel:   nb_kernel_ElecGB_VdwCSTab_GeomP1P1_F_avx_128_fma_double
 + * Electrostatics interaction: GeneralizedBorn
 + * VdW interaction:            CubicSplineTable
 + * Geometry:                   Particle-Particle
 + * Calculate force/pot:        Force
 + */
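 +/* This force-only variant omits the potential sums (velecsum, vgbsum,
 + * vvdwsum) and the energy-group updates; only the dvda accumulation
 + * needed for the Born-radius chain rule is kept.
 + */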
 +void
 +nb_kernel_ElecGB_VdwCSTab_GeomP1P1_F_avx_128_fma_double
 +                    (t_nblist * gmx_restrict                nlist,
 +                     rvec * gmx_restrict                    xx,
 +                     rvec * gmx_restrict                    ff,
 +                     t_forcerec * gmx_restrict              fr,
 +                     t_mdatoms * gmx_restrict               mdatoms,
 +                     nb_kernel_data_t * gmx_restrict        kernel_data,
 +                     t_nrnb * gmx_restrict                  nrnb)
 +{
 +    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
 +     * just 0 for non-waters.
 +     * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
 +     * jnr indices corresponding to data put in the two positions in the SIMD register.
 +     */
 +    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
 +    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
 +    int              jnrA,jnrB;
 +    int              j_coord_offsetA,j_coord_offsetB;
 +    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
 +    real             rcutoff_scalar;
 +    real             *shiftvec,*fshift,*x,*f;
 +    __m128d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
 +    int              vdwioffset0;
 +    __m128d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
 +    int              vdwjidx0A,vdwjidx0B;
 +    __m128d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
 +    __m128d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
 +    __m128d          velec,felec,velecsum,facel,crf,krf,krf2;
 +    real             *charge;
 +    __m128i          gbitab;
 +    __m128d          vgb,fgb,vgbsum,dvdasum,gbscale,gbtabscale,isaprod,gbqqfactor,gbinvepsdiff,dvdaj,gbeps,twogbeps,dvdatmp;
 +    __m128d          minushalf = _mm_set1_pd(-0.5);
 +    real             *invsqrta,*dvda,*gbtab;
 +    int              nvdwtype;
 +    __m128d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
 +    int              *vdwtype;
 +    real             *vdwparam;
 +    __m128d          one_sixth   = _mm_set1_pd(1.0/6.0);
 +    __m128d          one_twelfth = _mm_set1_pd(1.0/12.0);
 +    __m128i          vfitab;
 +    __m128i          ifour       = _mm_set1_epi32(4);
 +    __m128d          rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
 +    real             *vftab;
 +    __m128d          dummy_mask,cutoff_mask;
 +    __m128d          signbit   = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
 +    __m128d          one     = _mm_set1_pd(1.0);
 +    __m128d          two     = _mm_set1_pd(2.0);
 +    x                = xx[0];
 +    f                = ff[0];
 +
 +    nri              = nlist->nri;
 +    iinr             = nlist->iinr;
 +    jindex           = nlist->jindex;
 +    jjnr             = nlist->jjnr;
 +    shiftidx         = nlist->shift;
 +    gid              = nlist->gid;
 +    shiftvec         = fr->shift_vec[0];
 +    fshift           = fr->fshift[0];
 +    facel            = _mm_set1_pd(fr->epsfac);
 +    charge           = mdatoms->chargeA;
 +    nvdwtype         = fr->ntype;
 +    vdwparam         = fr->nbfp;
 +    vdwtype          = mdatoms->typeA;
 +
 +    vftab            = kernel_data->table_vdw->data;
 +    vftabscale       = _mm_set1_pd(kernel_data->table_vdw->scale);
 +
 +    invsqrta         = fr->invsqrta;
 +    dvda             = fr->dvda;
 +    gbtabscale       = _mm_set1_pd(fr->gbtab.scale);
 +    gbtab            = fr->gbtab.data;
 +    gbinvepsdiff     = _mm_set1_pd((1.0/fr->epsilon_r) - (1.0/fr->gb_epsilon_solvent));
 +
 +    /* Avoid spurious "may be used uninitialized" compiler warnings */
 +    jnrA = jnrB = 0;
 +    j_coord_offsetA = 0;
 +    j_coord_offsetB = 0;
 +
 +    outeriter        = 0;
 +    inneriter        = 0;
 +
 +    /* Start outer loop over neighborlists */
 +    for(iidx=0; iidx<nri; iidx++)
 +    {
 +        /* Load shift vector for this list */
 +        i_shift_offset   = DIM*shiftidx[iidx];
 +
 +        /* Load limits for loop over neighbors */
 +        j_index_start    = jindex[iidx];
 +        j_index_end      = jindex[iidx+1];
 +
 +        /* Get outer coordinate index */
 +        inr              = iinr[iidx];
 +        i_coord_offset   = DIM*inr;
 +
 +        /* Load i particle coords and add shift vector */
 +        gmx_mm_load_shift_and_1rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
 +
 +        fix0             = _mm_setzero_pd();
 +        fiy0             = _mm_setzero_pd();
 +        fiz0             = _mm_setzero_pd();
 +
 +        /* Load parameters for i particles */
 +        iq0              = _mm_mul_pd(facel,_mm_load1_pd(charge+inr+0));
 +        isai0            = _mm_load1_pd(invsqrta+inr+0);
 +        vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
 +
 +        dvdasum          = _mm_setzero_pd();
 +
 +        /* Start inner kernel loop */
 +        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
 +        {
 +
 +            /* Get j neighbor index, and coordinate index */
 +            jnrA             = jjnr[jidx];
 +            jnrB             = jjnr[jidx+1];
 +            j_coord_offsetA  = DIM*jnrA;
 +            j_coord_offsetB  = DIM*jnrB;
 +
 +            /* load j atom coordinates */
 +            gmx_mm_load_1rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
 +                                              &jx0,&jy0,&jz0);
 +
 +            /* Calculate displacement vector */
 +            dx00             = _mm_sub_pd(ix0,jx0);
 +            dy00             = _mm_sub_pd(iy0,jy0);
 +            dz00             = _mm_sub_pd(iz0,jz0);
 +
 +            /* Calculate squared distance and things based on it */
 +            rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
 +
 +            rinv00           = gmx_mm_invsqrt_pd(rsq00);
 +
 +            /* Load parameters for j particles */
 +            jq0              = gmx_mm_load_2real_swizzle_pd(charge+jnrA+0,charge+jnrB+0);
 +            isaj0            = gmx_mm_load_2real_swizzle_pd(invsqrta+jnrA+0,invsqrta+jnrB+0);
 +            vdwjidx0A        = 2*vdwtype[jnrA+0];
 +            vdwjidx0B        = 2*vdwtype[jnrB+0];
 +
 +            /**************************
 +             * CALCULATE INTERACTIONS *
 +             **************************/
 +
 +            r00              = _mm_mul_pd(rsq00,rinv00);
 +
 +            /* Compute parameters for interactions between i and j atoms */
 +            qq00             = _mm_mul_pd(iq0,jq0);
 +            gmx_mm_load_2pair_swizzle_pd(vdwparam+vdwioffset0+vdwjidx0A,
 +                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
 +
 +            /* Calculate table index by multiplying r with table scale and truncating to integer */
 +            rt               = _mm_mul_pd(r00,vftabscale);
 +            vfitab           = _mm_cvttpd_epi32(rt);
 +#ifdef __XOP__
 +            vfeps            = _mm_frcz_pd(rt);
 +#else
 +            vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
 +#endif
 +            twovfeps         = _mm_add_pd(vfeps,vfeps);
 +            vfitab           = _mm_slli_epi32(vfitab,3);
 +
 +            /* GENERALIZED BORN AND COULOMB ELECTROSTATICS */
 +            isaprod          = _mm_mul_pd(isai0,isaj0);
 +            gbqqfactor       = _mm_xor_pd(signbit,_mm_mul_pd(qq00,_mm_mul_pd(isaprod,gbinvepsdiff)));
 +            gbscale          = _mm_mul_pd(isaprod,gbtabscale);
 +
 +            /* Calculate generalized born table index - this is a separate table from the normal one,
 +             * but we use the same procedure by multiplying r with scale and truncating to integer.
 +             */
 +            rt               = _mm_mul_pd(r00,gbscale);
 +            gbitab           = _mm_cvttpd_epi32(rt);
 +#ifdef __XOP__
 +            gbeps            = _mm_frcz_pd(rt);
 +#else
 +            gbeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
 +#endif
 +            gbitab           = _mm_slli_epi32(gbitab,2);
 +
 +            Y                = _mm_load_pd( gbtab + _mm_extract_epi32(gbitab,0) );
 +            F                = _mm_load_pd( gbtab + _mm_extract_epi32(gbitab,1) );
 +            GMX_MM_TRANSPOSE2_PD(Y,F);
 +            G                = _mm_load_pd( gbtab + _mm_extract_epi32(gbitab,0) +2);
 +            H                = _mm_load_pd( gbtab + _mm_extract_epi32(gbitab,1) +2);
 +            GMX_MM_TRANSPOSE2_PD(G,H);
 +            Fp               = _mm_macc_pd(gbeps,_mm_macc_pd(gbeps,H,G),F);
 +            VV               = _mm_macc_pd(gbeps,Fp,Y);
 +            vgb              = _mm_mul_pd(gbqqfactor,VV);
 +
 +            twogbeps         = _mm_add_pd(gbeps,gbeps);
 +            FF               = _mm_macc_pd(_mm_macc_pd(twogbeps,H,G),gbeps,Fp);
 +            fgb              = _mm_mul_pd(gbqqfactor,_mm_mul_pd(FF,gbscale));
 +            dvdatmp          = _mm_mul_pd(minushalf,_mm_macc_pd(fgb,r00,vgb));
 +            dvdasum          = _mm_add_pd(dvdasum,dvdatmp);
 +            gmx_mm_increment_2real_swizzle_pd(dvda+jnrA,dvda+jnrB,_mm_mul_pd(dvdatmp,_mm_mul_pd(isaj0,isaj0)));
 +            velec            = _mm_mul_pd(qq00,rinv00);
 +            felec            = _mm_mul_pd(_mm_msub_pd(velec,rinv00,fgb),rinv00);
 +
 +            /* CUBIC SPLINE TABLE DISPERSION */
 +            Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
 +            F                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
 +            GMX_MM_TRANSPOSE2_PD(Y,F);
 +            G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
 +            H                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
 +            GMX_MM_TRANSPOSE2_PD(G,H);
 +            Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(H,vfeps,G),F);
 +            FF               = _mm_macc_pd(vfeps,_mm_macc_pd(twovfeps,H,G),Fp);
 +            fvdw6            = _mm_mul_pd(c6_00,FF);
 +
 +            /* CUBIC SPLINE TABLE REPULSION */
 +            vfitab           = _mm_add_epi32(vfitab,ifour);
 +            Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
 +            F                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
 +            GMX_MM_TRANSPOSE2_PD(Y,F);
 +            G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
 +            H                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
 +            GMX_MM_TRANSPOSE2_PD(G,H);
 +            Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(H,vfeps,G),F);
 +            FF               = _mm_macc_pd(vfeps,_mm_macc_pd(twovfeps,H,G),Fp);
 +            fvdw12           = _mm_mul_pd(c12_00,FF);
 +            fvdw             = _mm_xor_pd(signbit,_mm_mul_pd(_mm_add_pd(fvdw6,fvdw12),_mm_mul_pd(vftabscale,rinv00)));
 +
 +            fscal            = _mm_add_pd(felec,fvdw);
 +
 +            /* Update vectorial force */
 +            fix0             = _mm_macc_pd(dx00,fscal,fix0);
 +            fiy0             = _mm_macc_pd(dy00,fscal,fiy0);
 +            fiz0             = _mm_macc_pd(dz00,fscal,fiz0);
 +            
 +            gmx_mm_decrement_1rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,
 +                                                   _mm_mul_pd(dx00,fscal),
 +                                                   _mm_mul_pd(dy00,fscal),
 +                                                   _mm_mul_pd(dz00,fscal));
 +
 +            /* Inner loop uses 85 flops */
 +        }
 +
 +        if(jidx<j_index_end)
 +        {
 +
 +            jnrA             = jjnr[jidx];
 +            j_coord_offsetA  = DIM*jnrA;
 +
 +            /* load j atom coordinates */
 +            gmx_mm_load_1rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
 +                                              &jx0,&jy0,&jz0);
 +
 +            /* Calculate displacement vector */
 +            dx00             = _mm_sub_pd(ix0,jx0);
 +            dy00             = _mm_sub_pd(iy0,jy0);
 +            dz00             = _mm_sub_pd(iz0,jz0);
 +
 +            /* Calculate squared distance and things based on it */
 +            rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
 +
 +            rinv00           = gmx_mm_invsqrt_pd(rsq00);
 +
 +            /* Load parameters for j particles */
 +            jq0              = _mm_load_sd(charge+jnrA+0);
 +            isaj0            = _mm_load_sd(invsqrta+jnrA+0);
 +            vdwjidx0A        = 2*vdwtype[jnrA+0];
 +
 +            /**************************
 +             * CALCULATE INTERACTIONS *
 +             **************************/
 +
 +            r00              = _mm_mul_pd(rsq00,rinv00);
 +
 +            /* Compute parameters for interactions between i and j atoms */
 +            qq00             = _mm_mul_pd(iq0,jq0);
 +            gmx_mm_load_1pair_swizzle_pd(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
 +
 +            /* Calculate table index by multiplying r with table scale and truncating to integer */
 +            rt               = _mm_mul_pd(r00,vftabscale);
 +            vfitab           = _mm_cvttpd_epi32(rt);
 +#ifdef __XOP__
 +            vfeps            = _mm_frcz_pd(rt);
 +#else
 +            vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
 +#endif
 +            twovfeps         = _mm_add_pd(vfeps,vfeps);
 +            vfitab           = _mm_slli_epi32(vfitab,3);
 +
 +            /* GENERALIZED BORN AND COULOMB ELECTROSTATICS */
 +            isaprod          = _mm_mul_pd(isai0,isaj0);
 +            gbqqfactor       = _mm_xor_pd(signbit,_mm_mul_pd(qq00,_mm_mul_pd(isaprod,gbinvepsdiff)));
 +            gbscale          = _mm_mul_pd(isaprod,gbtabscale);
 +
 +            /* Calculate generalized born table index - this is a separate table from the normal one,
 +             * but we use the same procedure by multiplying r with scale and truncating to integer.
 +             */
 +            rt               = _mm_mul_pd(r00,gbscale);
 +            gbitab           = _mm_cvttpd_epi32(rt);
 +#ifdef __XOP__
 +            gbeps            = _mm_frcz_pd(rt);
 +#else
 +            gbeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
 +#endif
 +            gbitab           = _mm_slli_epi32(gbitab,2);
 +
 +            Y                = _mm_load_pd( gbtab + _mm_extract_epi32(gbitab,0) );
 +            F                = _mm_setzero_pd();
 +            GMX_MM_TRANSPOSE2_PD(Y,F);
 +            G                = _mm_load_pd( gbtab + _mm_extract_epi32(gbitab,0) +2);
 +            H                = _mm_setzero_pd();
 +            GMX_MM_TRANSPOSE2_PD(G,H);
 +            Fp               = _mm_macc_pd(gbeps,_mm_macc_pd(gbeps,H,G),F);
 +            VV               = _mm_macc_pd(gbeps,Fp,Y);
 +            vgb              = _mm_mul_pd(gbqqfactor,VV);
 +
 +            twogbeps         = _mm_add_pd(gbeps,gbeps);
 +            FF               = _mm_macc_pd(_mm_macc_pd(twogbeps,H,G),gbeps,Fp);
 +            fgb              = _mm_mul_pd(gbqqfactor,_mm_mul_pd(FF,gbscale));
 +            dvdatmp          = _mm_mul_pd(minushalf,_mm_macc_pd(fgb,r00,vgb));
++            dvdatmp          = _mm_unpacklo_pd(dvdatmp,_mm_setzero_pd());
 +            dvdasum          = _mm_add_pd(dvdasum,dvdatmp);
 +            gmx_mm_increment_1real_pd(dvda+jnrA,_mm_mul_pd(dvdatmp,_mm_mul_pd(isaj0,isaj0)));
 +            velec            = _mm_mul_pd(qq00,rinv00);
 +            felec            = _mm_mul_pd(_mm_msub_pd(velec,rinv00,fgb),rinv00);
 +
 +            /* CUBIC SPLINE TABLE DISPERSION */
 +            Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
 +            F                = _mm_setzero_pd();
 +            GMX_MM_TRANSPOSE2_PD(Y,F);
 +            G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
 +            H                = _mm_setzero_pd();
 +            GMX_MM_TRANSPOSE2_PD(G,H);
 +            Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(H,vfeps,G),F);
 +            FF               = _mm_macc_pd(vfeps,_mm_macc_pd(twovfeps,H,G),Fp);
 +            fvdw6            = _mm_mul_pd(c6_00,FF);
 +
 +            /* CUBIC SPLINE TABLE REPULSION */
 +            vfitab           = _mm_add_epi32(vfitab,ifour);
 +            Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
 +            F                = _mm_setzero_pd();
 +            GMX_MM_TRANSPOSE2_PD(Y,F);
 +            G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
 +            H                = _mm_setzero_pd();
 +            GMX_MM_TRANSPOSE2_PD(G,H);
 +            Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(H,vfeps,G),F);
 +            FF               = _mm_macc_pd(vfeps,_mm_macc_pd(twovfeps,H,G),Fp);
 +            fvdw12           = _mm_mul_pd(c12_00,FF);
 +            fvdw             = _mm_xor_pd(signbit,_mm_mul_pd(_mm_add_pd(fvdw6,fvdw12),_mm_mul_pd(vftabscale,rinv00)));
 +
 +            fscal            = _mm_add_pd(felec,fvdw);
 +
 +            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
 +
 +            /* Update vectorial force */
 +            fix0             = _mm_macc_pd(dx00,fscal,fix0);
 +            fiy0             = _mm_macc_pd(dy00,fscal,fiy0);
 +            fiz0             = _mm_macc_pd(dz00,fscal,fiz0);
 +            
 +            gmx_mm_decrement_1rvec_1ptr_swizzle_pd(f+j_coord_offsetA,
 +                                                   _mm_mul_pd(dx00,fscal),
 +                                                   _mm_mul_pd(dy00,fscal),
 +                                                   _mm_mul_pd(dz00,fscal));
 +
 +            /* Inner loop uses 85 flops */
 +        }
 +
 +        /* End of innermost loop */
 +
 +        gmx_mm_update_iforce_1atom_swizzle_pd(fix0,fiy0,fiz0,
 +                                              f+i_coord_offset,fshift+i_shift_offset);
 +
 +        dvdasum = _mm_mul_pd(dvdasum, _mm_mul_pd(isai0,isai0));
 +        gmx_mm_update_1pot_pd(dvdasum,dvda+inr);
 +
 +        /* Increment number of inner iterations */
 +        inneriter                  += j_index_end - j_index_start;
 +
 +        /* Outer loop uses 7 flops */
 +    }
 +
 +    /* Increment number of outer iterations */
 +    outeriter        += nri;
 +
 +    /* Update outer/inner flops */
 +
 +    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_F,outeriter*7 + inneriter*85);
 +}
index ea36c2a7293495c4dfcc7ff5d0721232a22aad4e,0000000000000000000000000000000000000000..200a6f56fe33c21c639e42ae3bd01722dee3c15c
mode 100644,000000..100644
--- /dev/null
@@@ -1,721 -1,0 +1,723 @@@
 +/*
 + * Note: this file was generated by the Gromacs avx_128_fma_double kernel generator.
 + *
 + *                This source code is part of
 + *
 + *                 G   R   O   M   A   C   S
 + *
 + * Copyright (c) 2001-2012, The GROMACS Development Team
 + *
 + * Gromacs is a library for molecular simulation and trajectory analysis,
 + * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
 + * a full list of developers and information, check out http://www.gromacs.org
 + *
 + * This program is free software; you can redistribute it and/or modify it under
 + * the terms of the GNU Lesser General Public License as published by the Free
 + * Software Foundation; either version 2 of the License, or (at your option) any
 + * later version.
 + *
 + * To help fund GROMACS development, we humbly ask that you cite
 + * the papers people have written on it - you can find them on the website.
 + */
 +#ifdef HAVE_CONFIG_H
 +#include <config.h>
 +#endif
 +
 +#include <math.h>
 +
 +#include "../nb_kernel.h"
 +#include "types/simple.h"
 +#include "vec.h"
 +#include "nrnb.h"
 +
 +#include "gmx_math_x86_avx_128_fma_double.h"
 +#include "kernelutil_x86_avx_128_fma_double.h"
 +
 +/*
 + * Gromacs nonbonded kernel:   nb_kernel_ElecGB_VdwLJ_GeomP1P1_VF_avx_128_fma_double
 + * Electrostatics interaction: GeneralizedBorn
 + * VdW interaction:            LennardJones
 + * Geometry:                   Particle-Particle
 + * Calculate force/pot:        PotentialAndForce
 + */
 +void
 +nb_kernel_ElecGB_VdwLJ_GeomP1P1_VF_avx_128_fma_double
 +                    (t_nblist * gmx_restrict                nlist,
 +                     rvec * gmx_restrict                    xx,
 +                     rvec * gmx_restrict                    ff,
 +                     t_forcerec * gmx_restrict              fr,
 +                     t_mdatoms * gmx_restrict               mdatoms,
 +                     nb_kernel_data_t * gmx_restrict        kernel_data,
 +                     t_nrnb * gmx_restrict                  nrnb)
 +{
 +    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
 +     * just 0 for non-waters.
 +     * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
 +     * jnr indices corresponding to data put in the two positions in the SIMD register.
 +     */
 +    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
 +    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
 +    int              jnrA,jnrB;
 +    int              j_coord_offsetA,j_coord_offsetB;
 +    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
 +    real             rcutoff_scalar;
 +    real             *shiftvec,*fshift,*x,*f;
 +    __m128d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
 +    int              vdwioffset0;
 +    __m128d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
 +    int              vdwjidx0A,vdwjidx0B;
 +    __m128d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
 +    __m128d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
 +    __m128d          velec,felec,velecsum,facel,crf,krf,krf2;
 +    real             *charge;
 +    __m128i          gbitab;
 +    __m128d          vgb,fgb,vgbsum,dvdasum,gbscale,gbtabscale,isaprod,gbqqfactor,gbinvepsdiff,dvdaj,gbeps,twogbeps,dvdatmp;
 +    __m128d          minushalf = _mm_set1_pd(-0.5);
 +    real             *invsqrta,*dvda,*gbtab;
 +    int              nvdwtype;
 +    __m128d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
 +    int              *vdwtype;
 +    real             *vdwparam;
 +    __m128d          one_sixth   = _mm_set1_pd(1.0/6.0);
 +    __m128d          one_twelfth = _mm_set1_pd(1.0/12.0);
 +    __m128i          vfitab;
 +    __m128i          ifour       = _mm_set1_epi32(4);
 +    __m128d          rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
 +    real             *vftab;
 +    __m128d          dummy_mask,cutoff_mask;
 +    __m128d          signbit   = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
 +    __m128d          one     = _mm_set1_pd(1.0);
 +    __m128d          two     = _mm_set1_pd(2.0);
 +    x                = xx[0];
 +    f                = ff[0];
 +
 +    nri              = nlist->nri;
 +    iinr             = nlist->iinr;
 +    jindex           = nlist->jindex;
 +    jjnr             = nlist->jjnr;
 +    shiftidx         = nlist->shift;
 +    gid              = nlist->gid;
 +    shiftvec         = fr->shift_vec[0];
 +    fshift           = fr->fshift[0];
 +    facel            = _mm_set1_pd(fr->epsfac);
 +    charge           = mdatoms->chargeA;
 +    nvdwtype         = fr->ntype;
 +    vdwparam         = fr->nbfp;
 +    vdwtype          = mdatoms->typeA;
 +
 +    invsqrta         = fr->invsqrta;
 +    dvda             = fr->dvda;
 +    gbtabscale       = _mm_set1_pd(fr->gbtab.scale);
 +    gbtab            = fr->gbtab.data;
 +    gbinvepsdiff     = _mm_set1_pd((1.0/fr->epsilon_r) - (1.0/fr->gb_epsilon_solvent));
 +
 +    /* Avoid spurious "may be used uninitialized" compiler warnings */
 +    jnrA = jnrB = 0;
 +    j_coord_offsetA = 0;
 +    j_coord_offsetB = 0;
 +
 +    outeriter        = 0;
 +    inneriter        = 0;
 +
 +    /* Start outer loop over neighborlists */
 +    for(iidx=0; iidx<nri; iidx++)
 +    {
 +        /* Load shift vector for this list */
 +        i_shift_offset   = DIM*shiftidx[iidx];
 +
 +        /* Load limits for loop over neighbors */
 +        j_index_start    = jindex[iidx];
 +        j_index_end      = jindex[iidx+1];
 +
 +        /* Get outer coordinate index */
 +        inr              = iinr[iidx];
 +        i_coord_offset   = DIM*inr;
 +
 +        /* Load i particle coords and add shift vector */
 +        gmx_mm_load_shift_and_1rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
 +
 +        fix0             = _mm_setzero_pd();
 +        fiy0             = _mm_setzero_pd();
 +        fiz0             = _mm_setzero_pd();
 +
 +        /* Load parameters for i particles */
 +        iq0              = _mm_mul_pd(facel,_mm_load1_pd(charge+inr+0));
 +        isai0            = _mm_load1_pd(invsqrta+inr+0);
 +        vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
 +
 +        /* Reset potential sums */
 +        velecsum         = _mm_setzero_pd();
 +        vgbsum           = _mm_setzero_pd();
 +        vvdwsum          = _mm_setzero_pd();
 +        dvdasum          = _mm_setzero_pd();
 +
 +        /* Start inner kernel loop */
 +        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
 +        {
 +
 +            /* Get j neighbor index, and coordinate index */
 +            jnrA             = jjnr[jidx];
 +            jnrB             = jjnr[jidx+1];
 +            j_coord_offsetA  = DIM*jnrA;
 +            j_coord_offsetB  = DIM*jnrB;
 +
 +            /* load j atom coordinates */
 +            gmx_mm_load_1rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
 +                                              &jx0,&jy0,&jz0);
 +
 +            /* Calculate displacement vector */
 +            dx00             = _mm_sub_pd(ix0,jx0);
 +            dy00             = _mm_sub_pd(iy0,jy0);
 +            dz00             = _mm_sub_pd(iz0,jz0);
 +
 +            /* Calculate squared distance and things based on it */
 +            rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
 +
 +            rinv00           = gmx_mm_invsqrt_pd(rsq00);
 +
 +            rinvsq00         = _mm_mul_pd(rinv00,rinv00);
 +
 +            /* Load parameters for j particles */
 +            jq0              = gmx_mm_load_2real_swizzle_pd(charge+jnrA+0,charge+jnrB+0);
 +            isaj0            = gmx_mm_load_2real_swizzle_pd(invsqrta+jnrA+0,invsqrta+jnrB+0);
 +            vdwjidx0A        = 2*vdwtype[jnrA+0];
 +            vdwjidx0B        = 2*vdwtype[jnrB+0];
 +
 +            /**************************
 +             * CALCULATE INTERACTIONS *
 +             **************************/
 +
 +            r00              = _mm_mul_pd(rsq00,rinv00);
 +
 +            /* Compute parameters for interactions between i and j atoms */
 +            qq00             = _mm_mul_pd(iq0,jq0);
 +            gmx_mm_load_2pair_swizzle_pd(vdwparam+vdwioffset0+vdwjidx0A,
 +                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
 +
 +            /* GENERALIZED BORN AND COULOMB ELECTROSTATICS */
 +            isaprod          = _mm_mul_pd(isai0,isaj0);
 +            gbqqfactor       = _mm_xor_pd(signbit,_mm_mul_pd(qq00,_mm_mul_pd(isaprod,gbinvepsdiff)));
 +            gbscale          = _mm_mul_pd(isaprod,gbtabscale);
 +
 +            /* Calculate generalized born table index - this is a separate table from the normal one,
 +             * but we use the same procedure by multiplying r with scale and truncating to integer.
 +             */
 +            rt               = _mm_mul_pd(r00,gbscale);
 +            gbitab           = _mm_cvttpd_epi32(rt);
 +#ifdef __XOP__
 +            gbeps            = _mm_frcz_pd(rt);
 +#else
 +            gbeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
 +#endif
 +            gbitab           = _mm_slli_epi32(gbitab,2);
 +
 +            Y                = _mm_load_pd( gbtab + _mm_extract_epi32(gbitab,0) );
 +            F                = _mm_load_pd( gbtab + _mm_extract_epi32(gbitab,1) );
 +            GMX_MM_TRANSPOSE2_PD(Y,F);
 +            G                = _mm_load_pd( gbtab + _mm_extract_epi32(gbitab,0) +2);
 +            H                = _mm_load_pd( gbtab + _mm_extract_epi32(gbitab,1) +2);
 +            GMX_MM_TRANSPOSE2_PD(G,H);
 +            Fp               = _mm_macc_pd(gbeps,_mm_macc_pd(gbeps,H,G),F);
 +            VV               = _mm_macc_pd(gbeps,Fp,Y);
 +            vgb              = _mm_mul_pd(gbqqfactor,VV);
 +
 +            twogbeps         = _mm_add_pd(gbeps,gbeps);
 +            FF               = _mm_macc_pd(_mm_macc_pd(twogbeps,H,G),gbeps,Fp);
 +            fgb              = _mm_mul_pd(gbqqfactor,_mm_mul_pd(FF,gbscale));
 +            dvdatmp          = _mm_mul_pd(minushalf,_mm_macc_pd(fgb,r00,vgb));
 +            dvdasum          = _mm_add_pd(dvdasum,dvdatmp);
 +            gmx_mm_increment_2real_swizzle_pd(dvda+jnrA,dvda+jnrB,_mm_mul_pd(dvdatmp,_mm_mul_pd(isaj0,isaj0)));
 +            velec            = _mm_mul_pd(qq00,rinv00);
 +            felec            = _mm_mul_pd(_mm_msub_pd(velec,rinv00,fgb),rinv00);
 +
 +            /* LENNARD-JONES DISPERSION/REPULSION */
 +
 +            rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
 +            vvdw6            = _mm_mul_pd(c6_00,rinvsix);
 +            vvdw12           = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
 +            vvdw             = _mm_msub_pd( vvdw12,one_twelfth, _mm_mul_pd(vvdw6,one_sixth) );
 +            fvdw             = _mm_mul_pd(_mm_sub_pd(vvdw12,vvdw6),rinvsq00);
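 +            /* fr->nbfp stores the LJ parameters premultiplied by 6 (c6)
 +             * and 12 (c12): the potential then needs the one_sixth and
 +             * one_twelfth factors, while the force term
 +             * (12*c12*r^-12 - 6*c6*r^-6)/r^2 is simply
 +             * (vvdw12-vvdw6)*rinvsq00 with no extra multiplies.
 +             */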
 +
 +            /* Update potential sum for this i atom from the interaction with this j atom. */
 +            velecsum         = _mm_add_pd(velecsum,velec);
 +            vgbsum           = _mm_add_pd(vgbsum,vgb);
 +            vvdwsum          = _mm_add_pd(vvdwsum,vvdw);
 +
 +            fscal            = _mm_add_pd(felec,fvdw);
 +
 +            /* Update vectorial force */
 +            fix0             = _mm_macc_pd(dx00,fscal,fix0);
 +            fiy0             = _mm_macc_pd(dy00,fscal,fiy0);
 +            fiz0             = _mm_macc_pd(dz00,fscal,fiz0);
 +            
 +            gmx_mm_decrement_1rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,
 +                                                   _mm_mul_pd(dx00,fscal),
 +                                                   _mm_mul_pd(dy00,fscal),
 +                                                   _mm_mul_pd(dz00,fscal));
 +
 +            /* Inner loop uses 74 flops */
 +        }
 +
 +        if(jidx<j_index_end)
 +        {
 +
 +            jnrA             = jjnr[jidx];
 +            j_coord_offsetA  = DIM*jnrA;
 +
 +            /* load j atom coordinates */
 +            gmx_mm_load_1rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
 +                                              &jx0,&jy0,&jz0);
 +
 +            /* Calculate displacement vector */
 +            dx00             = _mm_sub_pd(ix0,jx0);
 +            dy00             = _mm_sub_pd(iy0,jy0);
 +            dz00             = _mm_sub_pd(iz0,jz0);
 +
 +            /* Calculate squared distance and things based on it */
 +            rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
 +
 +            rinv00           = gmx_mm_invsqrt_pd(rsq00);
 +
 +            rinvsq00         = _mm_mul_pd(rinv00,rinv00);
 +
 +            /* Load parameters for j particles */
 +            jq0              = _mm_load_sd(charge+jnrA+0);
 +            isaj0            = _mm_load_sd(invsqrta+jnrA+0);
 +            vdwjidx0A        = 2*vdwtype[jnrA+0];
 +
 +            /**************************
 +             * CALCULATE INTERACTIONS *
 +             **************************/
 +
 +            r00              = _mm_mul_pd(rsq00,rinv00);
 +
 +            /* Compute parameters for interactions between i and j atoms */
 +            qq00             = _mm_mul_pd(iq0,jq0);
 +            gmx_mm_load_1pair_swizzle_pd(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
 +
 +            /* GENERALIZED BORN AND COULOMB ELECTROSTATICS */
 +            isaprod          = _mm_mul_pd(isai0,isaj0);
 +            gbqqfactor       = _mm_xor_pd(signbit,_mm_mul_pd(qq00,_mm_mul_pd(isaprod,gbinvepsdiff)));
 +            gbscale          = _mm_mul_pd(isaprod,gbtabscale);
 +
 +            /* Calculate generalized born table index - this is a separate table from the normal one,
 +             * but we use the same procedure by multiplying r with scale and truncating to integer.
 +             */
 +            rt               = _mm_mul_pd(r00,gbscale);
 +            gbitab           = _mm_cvttpd_epi32(rt);
 +#ifdef __XOP__
 +            gbeps            = _mm_frcz_pd(rt);
 +#else
 +            gbeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
 +#endif
 +            gbitab           = _mm_slli_epi32(gbitab,2);
 +
 +            Y                = _mm_load_pd( gbtab + _mm_extract_epi32(gbitab,0) );
 +            F                = _mm_setzero_pd();
 +            GMX_MM_TRANSPOSE2_PD(Y,F);
 +            G                = _mm_load_pd( gbtab + _mm_extract_epi32(gbitab,0) +2);
 +            H                = _mm_setzero_pd();
 +            GMX_MM_TRANSPOSE2_PD(G,H);
 +            Fp               = _mm_macc_pd(gbeps,_mm_macc_pd(gbeps,H,G),F);
 +            VV               = _mm_macc_pd(gbeps,Fp,Y);
 +            vgb              = _mm_mul_pd(gbqqfactor,VV);
 +
 +            twogbeps         = _mm_add_pd(gbeps,gbeps);
 +            FF               = _mm_macc_pd(_mm_macc_pd(twogbeps,H,G),gbeps,Fp);
 +            fgb              = _mm_mul_pd(gbqqfactor,_mm_mul_pd(FF,gbscale));
 +            dvdatmp          = _mm_mul_pd(minushalf,_mm_macc_pd(fgb,r00,vgb));
++            dvdatmp          = _mm_unpacklo_pd(dvdatmp,_mm_setzero_pd());
 +            dvdasum          = _mm_add_pd(dvdasum,dvdatmp);
 +            gmx_mm_increment_1real_pd(dvda+jnrA,_mm_mul_pd(dvdatmp,_mm_mul_pd(isaj0,isaj0)));
 +            velec            = _mm_mul_pd(qq00,rinv00);
 +            felec            = _mm_mul_pd(_mm_msub_pd(velec,rinv00,fgb),rinv00);
 +
 +            /* LENNARD-JONES DISPERSION/REPULSION */
 +
 +            rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
 +            vvdw6            = _mm_mul_pd(c6_00,rinvsix);
 +            vvdw12           = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
 +            vvdw             = _mm_msub_pd( vvdw12,one_twelfth, _mm_mul_pd(vvdw6,one_sixth) );
 +            fvdw             = _mm_mul_pd(_mm_sub_pd(vvdw12,vvdw6),rinvsq00);
 +
 +            /* Update potential sum for this i atom from the interaction with this j atom. */
 +            velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
 +            velecsum         = _mm_add_pd(velecsum,velec);
 +            vgb              = _mm_unpacklo_pd(vgb,_mm_setzero_pd());
 +            vgbsum           = _mm_add_pd(vgbsum,vgb);
 +            vvdw             = _mm_unpacklo_pd(vvdw,_mm_setzero_pd());
 +            vvdwsum          = _mm_add_pd(vvdwsum,vvdw);
 +
 +            fscal            = _mm_add_pd(felec,fvdw);
 +
 +            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
 +
 +            /* Update vectorial force */
 +            fix0             = _mm_macc_pd(dx00,fscal,fix0);
 +            fiy0             = _mm_macc_pd(dy00,fscal,fiy0);
 +            fiz0             = _mm_macc_pd(dz00,fscal,fiz0);
 +            
 +            gmx_mm_decrement_1rvec_1ptr_swizzle_pd(f+j_coord_offsetA,
 +                                                   _mm_mul_pd(dx00,fscal),
 +                                                   _mm_mul_pd(dy00,fscal),
 +                                                   _mm_mul_pd(dz00,fscal));
 +
 +            /* Inner loop uses 74 flops */
 +        }
 +
 +        /* End of innermost loop */
 +
 +        gmx_mm_update_iforce_1atom_swizzle_pd(fix0,fiy0,fiz0,
 +                                              f+i_coord_offset,fshift+i_shift_offset);
 +
 +        ggid                        = gid[iidx];
 +        /* Update potential energies */
 +        gmx_mm_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
 +        gmx_mm_update_1pot_pd(vgbsum,kernel_data->energygrp_polarization+ggid);
 +        gmx_mm_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
 +        dvdasum = _mm_mul_pd(dvdasum, _mm_mul_pd(isai0,isai0));
 +        gmx_mm_update_1pot_pd(dvdasum,dvda+inr);
 +
 +        /* Increment number of inner iterations */
 +        inneriter                  += j_index_end - j_index_start;
 +
 +        /* Outer loop uses 10 flops */
 +    }
 +
 +    /* Increment number of outer iterations */
 +    outeriter        += nri;
 +
 +    /* Update outer/inner flops */
 +
 +    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_VF,outeriter*10 + inneriter*74);
 +}
 +/*
 + * Gromacs nonbonded kernel:   nb_kernel_ElecGB_VdwLJ_GeomP1P1_F_avx_128_fma_double
 + * Electrostatics interaction: GeneralizedBorn
 + * VdW interaction:            LennardJones
 + * Geometry:                   Particle-Particle
 + * Calculate force/pot:        Force
 + */
 +void
 +nb_kernel_ElecGB_VdwLJ_GeomP1P1_F_avx_128_fma_double
 +                    (t_nblist * gmx_restrict                nlist,
 +                     rvec * gmx_restrict                    xx,
 +                     rvec * gmx_restrict                    ff,
 +                     t_forcerec * gmx_restrict              fr,
 +                     t_mdatoms * gmx_restrict               mdatoms,
 +                     nb_kernel_data_t * gmx_restrict        kernel_data,
 +                     t_nrnb * gmx_restrict                  nrnb)
 +{
 +    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
 +     * just 0 for non-waters.
 +     * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
 +     * jnr indices corresponding to data put in the two positions in the SIMD register.
 +     */
 +    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
 +    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
 +    int              jnrA,jnrB;
 +    int              j_coord_offsetA,j_coord_offsetB;
 +    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
 +    real             rcutoff_scalar;
 +    real             *shiftvec,*fshift,*x,*f;
 +    __m128d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
 +    int              vdwioffset0;
 +    __m128d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
 +    int              vdwjidx0A,vdwjidx0B;
 +    __m128d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
 +    __m128d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
 +    __m128d          velec,felec,velecsum,facel,crf,krf,krf2;
 +    real             *charge;
 +    __m128i          gbitab;
 +    __m128d          vgb,fgb,vgbsum,dvdasum,gbscale,gbtabscale,isaprod,gbqqfactor,gbinvepsdiff,dvdaj,gbeps,twogbeps,dvdatmp;
 +    __m128d          minushalf = _mm_set1_pd(-0.5);
 +    real             *invsqrta,*dvda,*gbtab;
 +    int              nvdwtype;
 +    __m128d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
 +    int              *vdwtype;
 +    real             *vdwparam;
 +    __m128d          one_sixth   = _mm_set1_pd(1.0/6.0);
 +    __m128d          one_twelfth = _mm_set1_pd(1.0/12.0);
 +    __m128i          vfitab;
 +    __m128i          ifour       = _mm_set1_epi32(4);
 +    __m128d          rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
 +    real             *vftab;
 +    __m128d          dummy_mask,cutoff_mask;
 +    __m128d          signbit   = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
 +    __m128d          one     = _mm_set1_pd(1.0);
 +    __m128d          two     = _mm_set1_pd(2.0);
 +    x                = xx[0];
 +    f                = ff[0];
 +
 +    nri              = nlist->nri;
 +    iinr             = nlist->iinr;
 +    jindex           = nlist->jindex;
 +    jjnr             = nlist->jjnr;
 +    shiftidx         = nlist->shift;
 +    gid              = nlist->gid;
 +    shiftvec         = fr->shift_vec[0];
 +    fshift           = fr->fshift[0];
 +    facel            = _mm_set1_pd(fr->epsfac);
 +    charge           = mdatoms->chargeA;
 +    nvdwtype         = fr->ntype;
 +    vdwparam         = fr->nbfp;
 +    vdwtype          = mdatoms->typeA;
 +
 +    invsqrta         = fr->invsqrta;
 +    dvda             = fr->dvda;
 +    gbtabscale       = _mm_set1_pd(fr->gbtab.scale);
 +    gbtab            = fr->gbtab.data;
 +    gbinvepsdiff     = _mm_set1_pd((1.0/fr->epsilon_r) - (1.0/fr->gb_epsilon_solvent));
 +
 +    /* Avoid stupid compiler warnings */
 +    jnrA = jnrB = 0;
 +    j_coord_offsetA = 0;
 +    j_coord_offsetB = 0;
 +
 +    outeriter        = 0;
 +    inneriter        = 0;
 +
 +    /* Start outer loop over neighborlists */
 +    for(iidx=0; iidx<nri; iidx++)
 +    {
 +        /* Load shift vector for this list */
 +        i_shift_offset   = DIM*shiftidx[iidx];
 +
 +        /* Load limits for loop over neighbors */
 +        j_index_start    = jindex[iidx];
 +        j_index_end      = jindex[iidx+1];
 +
 +        /* Get outer coordinate index */
 +        inr              = iinr[iidx];
 +        i_coord_offset   = DIM*inr;
 +
 +        /* Load i particle coords and add shift vector */
 +        gmx_mm_load_shift_and_1rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
 +
 +        fix0             = _mm_setzero_pd();
 +        fiy0             = _mm_setzero_pd();
 +        fiz0             = _mm_setzero_pd();
 +
 +        /* Load parameters for i particles */
 +        iq0              = _mm_mul_pd(facel,_mm_load1_pd(charge+inr+0));
 +        isai0            = _mm_load1_pd(invsqrta+inr+0);
 +        vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
 +
 +        dvdasum          = _mm_setzero_pd();
 +
 +        /* Start inner kernel loop */
 +        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
 +        {
 +
 +            /* Get j neighbor index, and coordinate index */
 +            jnrA             = jjnr[jidx];
 +            jnrB             = jjnr[jidx+1];
 +            j_coord_offsetA  = DIM*jnrA;
 +            j_coord_offsetB  = DIM*jnrB;
 +
 +            /* load j atom coordinates */
 +            gmx_mm_load_1rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
 +                                              &jx0,&jy0,&jz0);
 +
 +            /* Calculate displacement vector */
 +            dx00             = _mm_sub_pd(ix0,jx0);
 +            dy00             = _mm_sub_pd(iy0,jy0);
 +            dz00             = _mm_sub_pd(iz0,jz0);
 +
 +            /* Calculate squared distance and things based on it */
 +            rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
 +
 +            rinv00           = gmx_mm_invsqrt_pd(rsq00);
 +
 +            rinvsq00         = _mm_mul_pd(rinv00,rinv00);
 +
 +            /* Load parameters for j particles */
 +            jq0              = gmx_mm_load_2real_swizzle_pd(charge+jnrA+0,charge+jnrB+0);
 +            isaj0            = gmx_mm_load_2real_swizzle_pd(invsqrta+jnrA+0,invsqrta+jnrB+0);
 +            vdwjidx0A        = 2*vdwtype[jnrA+0];
 +            vdwjidx0B        = 2*vdwtype[jnrB+0];
 +
 +            /**************************
 +             * CALCULATE INTERACTIONS *
 +             **************************/
 +
 +            r00              = _mm_mul_pd(rsq00,rinv00);
 +
 +            /* Compute parameters for interactions between i and j atoms */
 +            qq00             = _mm_mul_pd(iq0,jq0);
 +            gmx_mm_load_2pair_swizzle_pd(vdwparam+vdwioffset0+vdwjidx0A,
 +                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
 +
 +            /* GENERALIZED BORN AND COULOMB ELECTROSTATICS */
 +            isaprod          = _mm_mul_pd(isai0,isaj0);
 +            gbqqfactor       = _mm_xor_pd(signbit,_mm_mul_pd(qq00,_mm_mul_pd(isaprod,gbinvepsdiff)));
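 +            /* XOR with the sign-bit mask negates both lanes cheaply:
 +             * gbqqfactor = -qq00*isaprod*gbinvepsdiff. */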
 +            gbscale          = _mm_mul_pd(isaprod,gbtabscale);
 +
 +            /* Calculate generalized born table index - this is a separate table from the normal one,
 +             * but we use the same procedure by multiplying r with scale and truncating to integer.
 +             */
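 +            /* Scalar sketch of the table lookup below (illustrative only, assuming
 +             * a quadruplet layout Y,F,G,H per table point, as the stride-4 index suggests):
 +             *   idx = 4*(int)(r*gbscale);   eps = r*gbscale - (int)(r*gbscale);
 +             *   Fp  = F + eps*(G + eps*H);
 +             *   VV  = Y + eps*Fp;                 (potential)
 +             *   FF  = Fp + eps*(G + 2*eps*H);     (derivative)
 +             */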
 +            rt               = _mm_mul_pd(r00,gbscale);
 +            gbitab           = _mm_cvttpd_epi32(rt);
 +#ifdef __XOP__
 +            gbeps            = _mm_frcz_pd(rt);
 +#else
 +            gbeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
 +#endif
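 +            /* gbeps is the fractional part of rt: XOP hardware extracts it directly
 +             * with _mm_frcz_pd, otherwise we subtract the SSE4.1 floor of rt. */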
 +            gbitab           = _mm_slli_epi32(gbitab,2);
 +
 +            Y                = _mm_load_pd( gbtab + _mm_extract_epi32(gbitab,0) );
 +            F                = _mm_load_pd( gbtab + _mm_extract_epi32(gbitab,1) );
 +            GMX_MM_TRANSPOSE2_PD(Y,F);
 +            G                = _mm_load_pd( gbtab + _mm_extract_epi32(gbitab,0) +2);
 +            H                = _mm_load_pd( gbtab + _mm_extract_epi32(gbitab,1) +2);
 +            GMX_MM_TRANSPOSE2_PD(G,H);
 +            Fp               = _mm_macc_pd(gbeps,_mm_macc_pd(gbeps,H,G),F);
 +            VV               = _mm_macc_pd(gbeps,Fp,Y);
 +            vgb              = _mm_mul_pd(gbqqfactor,VV);
 +
 +            twogbeps         = _mm_add_pd(gbeps,gbeps);
 +            FF               = _mm_macc_pd(_mm_macc_pd(twogbeps,H,G),gbeps,Fp);
 +            fgb              = _mm_mul_pd(gbqqfactor,_mm_mul_pd(FF,gbscale));
 +            dvdatmp          = _mm_mul_pd(minushalf,_mm_macc_pd(fgb,r00,vgb));
 +            dvdasum          = _mm_add_pd(dvdasum,dvdatmp);
 +            gmx_mm_increment_2real_swizzle_pd(dvda+jnrA,dvda+jnrB,_mm_mul_pd(dvdatmp,_mm_mul_pd(isaj0,isaj0)));
 +            velec            = _mm_mul_pd(qq00,rinv00);
 +            felec            = _mm_mul_pd(_mm_msub_pd(velec,rinv00,fgb),rinv00);
 +
 +            /* LENNARD-JONES DISPERSION/REPULSION */
 +
 +            rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
 +            fvdw             = _mm_mul_pd(_mm_msub_pd(c12_00,rinvsix,c6_00),_mm_mul_pd(rinvsix,rinvsq00));
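 +            /* Folded LJ force: fvdw = (c12*r^-12 - c6*r^-6)*r^-2; presumably c6/c12
 +             * already carry the 6/12 prefactors here (cf. one_sixth/one_twelfth in the
 +             * potential-computing kernels), so fscal*dx below is the full force. */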
 +
 +            fscal            = _mm_add_pd(felec,fvdw);
 +
 +            /* Update vectorial force */
 +            fix0             = _mm_macc_pd(dx00,fscal,fix0);
 +            fiy0             = _mm_macc_pd(dy00,fscal,fiy0);
 +            fiz0             = _mm_macc_pd(dz00,fscal,fiz0);
 +            
 +            gmx_mm_decrement_1rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,
 +                                                   _mm_mul_pd(dx00,fscal),
 +                                                   _mm_mul_pd(dy00,fscal),
 +                                                   _mm_mul_pd(dz00,fscal));
 +
 +            /* Inner loop uses 67 flops */
 +        }
 +
 +        if(jidx<j_index_end)
 +        {
 +
 +            jnrA             = jjnr[jidx];
 +            j_coord_offsetA  = DIM*jnrA;
 +
 +            /* load j atom coordinates */
 +            gmx_mm_load_1rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
 +                                              &jx0,&jy0,&jz0);
 +
 +            /* Calculate displacement vector */
 +            dx00             = _mm_sub_pd(ix0,jx0);
 +            dy00             = _mm_sub_pd(iy0,jy0);
 +            dz00             = _mm_sub_pd(iz0,jz0);
 +
 +            /* Calculate squared distance and things based on it */
 +            rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
 +
 +            rinv00           = gmx_mm_invsqrt_pd(rsq00);
 +
 +            rinvsq00         = _mm_mul_pd(rinv00,rinv00);
 +
 +            /* Load parameters for j particles */
 +            jq0              = _mm_load_sd(charge+jnrA+0);
 +            isaj0            = _mm_load_sd(invsqrta+jnrA+0);
 +            vdwjidx0A        = 2*vdwtype[jnrA+0];
 +
 +            /**************************
 +             * CALCULATE INTERACTIONS *
 +             **************************/
 +
 +            r00              = _mm_mul_pd(rsq00,rinv00);
 +
 +            /* Compute parameters for interactions between i and j atoms */
 +            qq00             = _mm_mul_pd(iq0,jq0);
 +            gmx_mm_load_1pair_swizzle_pd(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
 +
 +            /* GENERALIZED BORN AND COULOMB ELECTROSTATICS */
 +            isaprod          = _mm_mul_pd(isai0,isaj0);
 +            gbqqfactor       = _mm_xor_pd(signbit,_mm_mul_pd(qq00,_mm_mul_pd(isaprod,gbinvepsdiff)));
 +            gbscale          = _mm_mul_pd(isaprod,gbtabscale);
 +
 +            /* Calculate generalized born table index - this is a separate table from the normal one,
 +             * but we use the same procedure by multiplying r with scale and truncating to integer.
 +             */
 +            rt               = _mm_mul_pd(r00,gbscale);
 +            gbitab           = _mm_cvttpd_epi32(rt);
 +#ifdef __XOP__
 +            gbeps            = _mm_frcz_pd(rt);
 +#else
 +            gbeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
 +#endif
 +            gbitab           = _mm_slli_epi32(gbitab,2);
 +
 +            Y                = _mm_load_pd( gbtab + _mm_extract_epi32(gbitab,0) );
 +            F                = _mm_setzero_pd();
 +            GMX_MM_TRANSPOSE2_PD(Y,F);
 +            G                = _mm_load_pd( gbtab + _mm_extract_epi32(gbitab,0) +2);
 +            H                = _mm_setzero_pd();
 +            GMX_MM_TRANSPOSE2_PD(G,H);
 +            Fp               = _mm_macc_pd(gbeps,_mm_macc_pd(gbeps,H,G),F);
 +            VV               = _mm_macc_pd(gbeps,Fp,Y);
 +            vgb              = _mm_mul_pd(gbqqfactor,VV);
 +
 +            twogbeps         = _mm_add_pd(gbeps,gbeps);
 +            FF               = _mm_macc_pd(_mm_macc_pd(twogbeps,H,G),gbeps,Fp);
 +            fgb              = _mm_mul_pd(gbqqfactor,_mm_mul_pd(FF,gbscale));
 +            dvdatmp          = _mm_mul_pd(minushalf,_mm_macc_pd(fgb,r00,vgb));
++            dvdatmp          = _mm_unpacklo_pd(dvdatmp,_mm_setzero_pd());
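 +            /* Single-j epilogue: zero the unused upper lane so the two-lane dvda
 +             * accumulator (and, below, the force update) only picks up the valid
 +             * contribution. */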
 +            dvdasum          = _mm_add_pd(dvdasum,dvdatmp);
 +            gmx_mm_increment_1real_pd(dvda+jnrA,_mm_mul_pd(dvdatmp,_mm_mul_pd(isaj0,isaj0)));
 +            velec            = _mm_mul_pd(qq00,rinv00);
 +            felec            = _mm_mul_pd(_mm_msub_pd(velec,rinv00,fgb),rinv00);
 +
 +            /* LENNARD-JONES DISPERSION/REPULSION */
 +
 +            rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
 +            fvdw             = _mm_mul_pd(_mm_msub_pd(c12_00,rinvsix,c6_00),_mm_mul_pd(rinvsix,rinvsq00));
 +
 +            fscal            = _mm_add_pd(felec,fvdw);
 +
 +            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
 +
 +            /* Update vectorial force */
 +            fix0             = _mm_macc_pd(dx00,fscal,fix0);
 +            fiy0             = _mm_macc_pd(dy00,fscal,fiy0);
 +            fiz0             = _mm_macc_pd(dz00,fscal,fiz0);
 +            
 +            gmx_mm_decrement_1rvec_1ptr_swizzle_pd(f+j_coord_offsetA,
 +                                                   _mm_mul_pd(dx00,fscal),
 +                                                   _mm_mul_pd(dy00,fscal),
 +                                                   _mm_mul_pd(dz00,fscal));
 +
 +            /* Inner loop uses 67 flops */
 +        }
 +
 +        /* End of innermost loop */
 +
 +        gmx_mm_update_iforce_1atom_swizzle_pd(fix0,fiy0,fiz0,
 +                                              f+i_coord_offset,fshift+i_shift_offset);
 +
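 +        /* Scale the accumulated dV/da by isai0^2 for the i atom, mirroring the
 +         * isaj^2 scaling applied to the j contributions in the inner loop. */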
 +        dvdasum = _mm_mul_pd(dvdasum, _mm_mul_pd(isai0,isai0));
 +        gmx_mm_update_1pot_pd(dvdasum,dvda+inr);
 +
 +        /* Increment number of inner iterations */
 +        inneriter                  += j_index_end - j_index_start;
 +
 +        /* Outer loop uses 7 flops */
 +    }
 +
 +    /* Increment number of outer iterations */
 +    outeriter        += nri;
 +
 +    /* Update outer/inner flops */
 +
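 +    /* The 7/67 factors mirror the per-iteration flop comments above; inc_nrnb
 +     * feeds them into mdrun's flop accounting for the performance report. */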
 +    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_F,outeriter*7 + inneriter*67);
 +}
index 4d4396f10550fa995b332ebab214ae3321faba99,0000000000000000000000000000000000000000..db102833b3eae06e5f8b41b4debd224fcfdcb41e
mode 100644,000000..100644
--- /dev/null
@@@ -1,650 -1,0 +1,652 @@@
 +/*
 + * Note: this file was generated by the Gromacs avx_128_fma_double kernel generator.
 + *
 + *                This source code is part of
 + *
 + *                 G   R   O   M   A   C   S
 + *
 + * Copyright (c) 2001-2012, The GROMACS Development Team
 + *
 + * Gromacs is a library for molecular simulation and trajectory analysis,
 + * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
 + * a full list of developers and information, check out http://www.gromacs.org
 + *
 + * This program is free software; you can redistribute it and/or modify it under
 + * the terms of the GNU Lesser General Public License as published by the Free
 + * Software Foundation; either version 2 of the License, or (at your option) any
 + * later version.
 + *
 + * To help fund GROMACS development, we humbly ask that you cite
 + * the papers people have written on it - you can find them on the website.
 + */
 +#ifdef HAVE_CONFIG_H
 +#include <config.h>
 +#endif
 +
 +#include <math.h>
 +
 +#include "../nb_kernel.h"
 +#include "types/simple.h"
 +#include "vec.h"
 +#include "nrnb.h"
 +
 +#include "gmx_math_x86_avx_128_fma_double.h"
 +#include "kernelutil_x86_avx_128_fma_double.h"
 +
 +/*
 + * Gromacs nonbonded kernel:   nb_kernel_ElecGB_VdwNone_GeomP1P1_VF_avx_128_fma_double
 + * Electrostatics interaction: GeneralizedBorn
 + * VdW interaction:            None
 + * Geometry:                   Particle-Particle
 + * Calculate force/pot:        PotentialAndForce
 + */
 +void
 +nb_kernel_ElecGB_VdwNone_GeomP1P1_VF_avx_128_fma_double
 +                    (t_nblist * gmx_restrict                nlist,
 +                     rvec * gmx_restrict                    xx,
 +                     rvec * gmx_restrict                    ff,
 +                     t_forcerec * gmx_restrict              fr,
 +                     t_mdatoms * gmx_restrict               mdatoms,
 +                     nb_kernel_data_t * gmx_restrict        kernel_data,
 +                     t_nrnb * gmx_restrict                  nrnb)
 +{
 +    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
 +     * just 0 for non-waters.
 +     * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
 +     * jnr indices corresponding to data put in the two positions in the SIMD register.
 +     */
 +    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
 +    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
 +    int              jnrA,jnrB;
 +    int              j_coord_offsetA,j_coord_offsetB;
 +    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
 +    real             rcutoff_scalar;
 +    real             *shiftvec,*fshift,*x,*f;
 +    __m128d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
 +    int              vdwioffset0;
 +    __m128d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
 +    int              vdwjidx0A,vdwjidx0B;
 +    __m128d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
 +    __m128d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
 +    __m128d          velec,felec,velecsum,facel,crf,krf,krf2;
 +    real             *charge;
 +    __m128i          gbitab;
 +    __m128d          vgb,fgb,vgbsum,dvdasum,gbscale,gbtabscale,isaprod,gbqqfactor,gbinvepsdiff,dvdaj,gbeps,twogbeps,dvdatmp;
 +    __m128d          minushalf = _mm_set1_pd(-0.5);
 +    real             *invsqrta,*dvda,*gbtab;
 +    __m128i          vfitab;
 +    __m128i          ifour       = _mm_set1_epi32(4);
 +    __m128d          rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
 +    real             *vftab;
 +    __m128d          dummy_mask,cutoff_mask;
 +    __m128d          signbit   = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
 +    __m128d          one     = _mm_set1_pd(1.0);
 +    __m128d          two     = _mm_set1_pd(2.0);
 +    x                = xx[0];
 +    f                = ff[0];
 +
 +    nri              = nlist->nri;
 +    iinr             = nlist->iinr;
 +    jindex           = nlist->jindex;
 +    jjnr             = nlist->jjnr;
 +    shiftidx         = nlist->shift;
 +    gid              = nlist->gid;
 +    shiftvec         = fr->shift_vec[0];
 +    fshift           = fr->fshift[0];
 +    facel            = _mm_set1_pd(fr->epsfac);
 +    charge           = mdatoms->chargeA;
 +
 +    invsqrta         = fr->invsqrta;
 +    dvda             = fr->dvda;
 +    gbtabscale       = _mm_set1_pd(fr->gbtab.scale);
 +    gbtab            = fr->gbtab.data;
 +    gbinvepsdiff     = _mm_set1_pd((1.0/fr->epsilon_r) - (1.0/fr->gb_epsilon_solvent));
 +
 +    /* Avoid stupid compiler warnings */
 +    jnrA = jnrB = 0;
 +    j_coord_offsetA = 0;
 +    j_coord_offsetB = 0;
 +
 +    outeriter        = 0;
 +    inneriter        = 0;
 +
 +    /* Start outer loop over neighborlists */
 +    for(iidx=0; iidx<nri; iidx++)
 +    {
 +        /* Load shift vector for this list */
 +        i_shift_offset   = DIM*shiftidx[iidx];
 +
 +        /* Load limits for loop over neighbors */
 +        j_index_start    = jindex[iidx];
 +        j_index_end      = jindex[iidx+1];
 +
 +        /* Get outer coordinate index */
 +        inr              = iinr[iidx];
 +        i_coord_offset   = DIM*inr;
 +
 +        /* Load i particle coords and add shift vector */
 +        gmx_mm_load_shift_and_1rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
 +
 +        fix0             = _mm_setzero_pd();
 +        fiy0             = _mm_setzero_pd();
 +        fiz0             = _mm_setzero_pd();
 +
 +        /* Load parameters for i particles */
 +        iq0              = _mm_mul_pd(facel,_mm_load1_pd(charge+inr+0));
 +        isai0            = _mm_load1_pd(invsqrta+inr+0);
 +
 +        /* Reset potential sums */
 +        velecsum         = _mm_setzero_pd();
 +        vgbsum           = _mm_setzero_pd();
 +        dvdasum          = _mm_setzero_pd();
 +
 +        /* Start inner kernel loop */
 +        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
 +        {
 +
 +            /* Get j neighbor index, and coordinate index */
 +            jnrA             = jjnr[jidx];
 +            jnrB             = jjnr[jidx+1];
 +            j_coord_offsetA  = DIM*jnrA;
 +            j_coord_offsetB  = DIM*jnrB;
 +
 +            /* load j atom coordinates */
 +            gmx_mm_load_1rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
 +                                              &jx0,&jy0,&jz0);
 +
 +            /* Calculate displacement vector */
 +            dx00             = _mm_sub_pd(ix0,jx0);
 +            dy00             = _mm_sub_pd(iy0,jy0);
 +            dz00             = _mm_sub_pd(iz0,jz0);
 +
 +            /* Calculate squared distance and things based on it */
 +            rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
 +
 +            rinv00           = gmx_mm_invsqrt_pd(rsq00);
 +
 +            /* Load parameters for j particles */
 +            jq0              = gmx_mm_load_2real_swizzle_pd(charge+jnrA+0,charge+jnrB+0);
 +            isaj0            = gmx_mm_load_2real_swizzle_pd(invsqrta+jnrA+0,invsqrta+jnrB+0);
 +
 +            /**************************
 +             * CALCULATE INTERACTIONS *
 +             **************************/
 +
 +            r00              = _mm_mul_pd(rsq00,rinv00);
 +
 +            /* Compute parameters for interactions between i and j atoms */
 +            qq00             = _mm_mul_pd(iq0,jq0);
 +
 +            /* GENERALIZED BORN AND COULOMB ELECTROSTATICS */
 +            isaprod          = _mm_mul_pd(isai0,isaj0);
 +            gbqqfactor       = _mm_xor_pd(signbit,_mm_mul_pd(qq00,_mm_mul_pd(isaprod,gbinvepsdiff)));
 +            gbscale          = _mm_mul_pd(isaprod,gbtabscale);
 +
 +            /* Calculate generalized born table index - this is a separate table from the normal one,
 +             * but we use the same procedure by multiplying r with scale and truncating to integer.
 +             */
 +            rt               = _mm_mul_pd(r00,gbscale);
 +            gbitab           = _mm_cvttpd_epi32(rt);
 +#ifdef __XOP__
 +            gbeps            = _mm_frcz_pd(rt);
 +#else
 +            gbeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
 +#endif
 +            gbitab           = _mm_slli_epi32(gbitab,2);
 +
 +            Y                = _mm_load_pd( gbtab + _mm_extract_epi32(gbitab,0) );
 +            F                = _mm_load_pd( gbtab + _mm_extract_epi32(gbitab,1) );
 +            GMX_MM_TRANSPOSE2_PD(Y,F);
 +            G                = _mm_load_pd( gbtab + _mm_extract_epi32(gbitab,0) +2);
 +            H                = _mm_load_pd( gbtab + _mm_extract_epi32(gbitab,1) +2);
 +            GMX_MM_TRANSPOSE2_PD(G,H);
 +            Fp               = _mm_macc_pd(gbeps,_mm_macc_pd(gbeps,H,G),F);
 +            VV               = _mm_macc_pd(gbeps,Fp,Y);
 +            vgb              = _mm_mul_pd(gbqqfactor,VV);
 +
 +            twogbeps         = _mm_add_pd(gbeps,gbeps);
 +            FF               = _mm_macc_pd(_mm_macc_pd(twogbeps,H,G),gbeps,Fp);
 +            fgb              = _mm_mul_pd(gbqqfactor,_mm_mul_pd(FF,gbscale));
 +            dvdatmp          = _mm_mul_pd(minushalf,_mm_macc_pd(fgb,r00,vgb));
 +            dvdasum          = _mm_add_pd(dvdasum,dvdatmp);
 +            gmx_mm_increment_2real_swizzle_pd(dvda+jnrA,dvda+jnrB,_mm_mul_pd(dvdatmp,_mm_mul_pd(isaj0,isaj0)));
 +            velec            = _mm_mul_pd(qq00,rinv00);
 +            felec            = _mm_mul_pd(_mm_msub_pd(velec,rinv00,fgb),rinv00);
 +
 +            /* Update potential sum for this i atom from the interaction with this j atom. */
 +            velecsum         = _mm_add_pd(velecsum,velec);
 +            vgbsum           = _mm_add_pd(vgbsum,vgb);
 +
 +            fscal            = felec;
 +
 +            /* Update vectorial force */
 +            fix0             = _mm_macc_pd(dx00,fscal,fix0);
 +            fiy0             = _mm_macc_pd(dy00,fscal,fiy0);
 +            fiz0             = _mm_macc_pd(dz00,fscal,fiz0);
 +            
 +            gmx_mm_decrement_1rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,
 +                                                   _mm_mul_pd(dx00,fscal),
 +                                                   _mm_mul_pd(dy00,fscal),
 +                                                   _mm_mul_pd(dz00,fscal));
 +
 +            /* Inner loop uses 61 flops */
 +        }
 +
 +        if(jidx<j_index_end)
 +        {
 +
 +            jnrA             = jjnr[jidx];
 +            j_coord_offsetA  = DIM*jnrA;
 +
 +            /* load j atom coordinates */
 +            gmx_mm_load_1rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
 +                                              &jx0,&jy0,&jz0);
 +
 +            /* Calculate displacement vector */
 +            dx00             = _mm_sub_pd(ix0,jx0);
 +            dy00             = _mm_sub_pd(iy0,jy0);
 +            dz00             = _mm_sub_pd(iz0,jz0);
 +
 +            /* Calculate squared distance and things based on it */
 +            rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
 +
 +            rinv00           = gmx_mm_invsqrt_pd(rsq00);
 +
 +            /* Load parameters for j particles */
 +            jq0              = _mm_load_sd(charge+jnrA+0);
 +            isaj0            = _mm_load_sd(invsqrta+jnrA+0);
 +
 +            /**************************
 +             * CALCULATE INTERACTIONS *
 +             **************************/
 +
 +            r00              = _mm_mul_pd(rsq00,rinv00);
 +
 +            /* Compute parameters for interactions between i and j atoms */
 +            qq00             = _mm_mul_pd(iq0,jq0);
 +
 +            /* GENERALIZED BORN AND COULOMB ELECTROSTATICS */
 +            isaprod          = _mm_mul_pd(isai0,isaj0);
 +            gbqqfactor       = _mm_xor_pd(signbit,_mm_mul_pd(qq00,_mm_mul_pd(isaprod,gbinvepsdiff)));
 +            gbscale          = _mm_mul_pd(isaprod,gbtabscale);
 +
 +            /* Calculate generalized born table index - this is a separate table from the normal one,
 +             * but we use the same procedure by multiplying r with scale and truncating to integer.
 +             */
 +            rt               = _mm_mul_pd(r00,gbscale);
 +            gbitab           = _mm_cvttpd_epi32(rt);
 +#ifdef __XOP__
 +            gbeps            = _mm_frcz_pd(rt);
 +#else
 +            gbeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
 +#endif
 +            gbitab           = _mm_slli_epi32(gbitab,2);
 +
 +            Y                = _mm_load_pd( gbtab + _mm_extract_epi32(gbitab,0) );
 +            F                = _mm_setzero_pd();
 +            GMX_MM_TRANSPOSE2_PD(Y,F);
 +            G                = _mm_load_pd( gbtab + _mm_extract_epi32(gbitab,0) +2);
 +            H                = _mm_setzero_pd();
 +            GMX_MM_TRANSPOSE2_PD(G,H);
 +            Fp               = _mm_macc_pd(gbeps,_mm_macc_pd(gbeps,H,G),F);
 +            VV               = _mm_macc_pd(gbeps,Fp,Y);
 +            vgb              = _mm_mul_pd(gbqqfactor,VV);
 +
 +            twogbeps         = _mm_add_pd(gbeps,gbeps);
 +            FF               = _mm_macc_pd(_mm_macc_pd(twogbeps,H,G),gbeps,Fp);
 +            fgb              = _mm_mul_pd(gbqqfactor,_mm_mul_pd(FF,gbscale));
 +            dvdatmp          = _mm_mul_pd(minushalf,_mm_macc_pd(fgb,r00,vgb));
++            dvdatmp          = _mm_unpacklo_pd(dvdatmp,_mm_setzero_pd());
 +            dvdasum          = _mm_add_pd(dvdasum,dvdatmp);
 +            gmx_mm_increment_1real_pd(dvda+jnrA,_mm_mul_pd(dvdatmp,_mm_mul_pd(isaj0,isaj0)));
 +            velec            = _mm_mul_pd(qq00,rinv00);
 +            felec            = _mm_mul_pd(_mm_msub_pd(velec,rinv00,fgb),rinv00);
 +
 +            /* Update potential sum for this i atom from the interaction with this j atom. */
 +            velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
 +            velecsum         = _mm_add_pd(velecsum,velec);
 +            vgb              = _mm_unpacklo_pd(vgb,_mm_setzero_pd());
 +            vgbsum           = _mm_add_pd(vgbsum,vgb);
 +
 +            fscal            = felec;
 +
 +            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
 +
 +            /* Update vectorial force */
 +            fix0             = _mm_macc_pd(dx00,fscal,fix0);
 +            fiy0             = _mm_macc_pd(dy00,fscal,fiy0);
 +            fiz0             = _mm_macc_pd(dz00,fscal,fiz0);
 +            
 +            gmx_mm_decrement_1rvec_1ptr_swizzle_pd(f+j_coord_offsetA,
 +                                                   _mm_mul_pd(dx00,fscal),
 +                                                   _mm_mul_pd(dy00,fscal),
 +                                                   _mm_mul_pd(dz00,fscal));
 +
 +            /* Inner loop uses 61 flops */
 +        }
 +
 +        /* End of innermost loop */
 +
 +        gmx_mm_update_iforce_1atom_swizzle_pd(fix0,fiy0,fiz0,
 +                                              f+i_coord_offset,fshift+i_shift_offset);
 +
 +        ggid                        = gid[iidx];
 +        /* Update potential energies */
 +        gmx_mm_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
 +        gmx_mm_update_1pot_pd(vgbsum,kernel_data->energygrp_polarization+ggid);
 +        dvdasum = _mm_mul_pd(dvdasum, _mm_mul_pd(isai0,isai0));
 +        gmx_mm_update_1pot_pd(dvdasum,dvda+inr);
 +
 +        /* Increment number of inner iterations */
 +        inneriter                  += j_index_end - j_index_start;
 +
 +        /* Outer loop uses 9 flops */
 +    }
 +
 +    /* Increment number of outer iterations */
 +    outeriter        += nri;
 +
 +    /* Update outer/inner flops */
 +
 +    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VF,outeriter*9 + inneriter*61);
 +}
 +/*
 + * Gromacs nonbonded kernel:   nb_kernel_ElecGB_VdwNone_GeomP1P1_F_avx_128_fma_double
 + * Electrostatics interaction: GeneralizedBorn
 + * VdW interaction:            None
 + * Geometry:                   Particle-Particle
 + * Calculate force/pot:        Force
 + */
 +void
 +nb_kernel_ElecGB_VdwNone_GeomP1P1_F_avx_128_fma_double
 +                    (t_nblist * gmx_restrict                nlist,
 +                     rvec * gmx_restrict                    xx,
 +                     rvec * gmx_restrict                    ff,
 +                     t_forcerec * gmx_restrict              fr,
 +                     t_mdatoms * gmx_restrict               mdatoms,
 +                     nb_kernel_data_t * gmx_restrict        kernel_data,
 +                     t_nrnb * gmx_restrict                  nrnb)
 +{
 +    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
 +     * just 0 for non-waters.
 +     * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
 +     * jnr indices corresponding to data put in the two positions in the SIMD register.
 +     */
 +    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
 +    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
 +    int              jnrA,jnrB;
 +    int              j_coord_offsetA,j_coord_offsetB;
 +    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
 +    real             rcutoff_scalar;
 +    real             *shiftvec,*fshift,*x,*f;
 +    __m128d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
 +    int              vdwioffset0;
 +    __m128d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
 +    int              vdwjidx0A,vdwjidx0B;
 +    __m128d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
 +    __m128d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
 +    __m128d          velec,felec,velecsum,facel,crf,krf,krf2;
 +    real             *charge;
 +    __m128i          gbitab;
 +    __m128d          vgb,fgb,vgbsum,dvdasum,gbscale,gbtabscale,isaprod,gbqqfactor,gbinvepsdiff,dvdaj,gbeps,twogbeps,dvdatmp;
 +    __m128d          minushalf = _mm_set1_pd(-0.5);
 +    real             *invsqrta,*dvda,*gbtab;
 +    __m128i          vfitab;
 +    __m128i          ifour       = _mm_set1_epi32(4);
 +    __m128d          rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
 +    real             *vftab;
 +    __m128d          dummy_mask,cutoff_mask;
 +    __m128d          signbit   = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
 +    __m128d          one     = _mm_set1_pd(1.0);
 +    __m128d          two     = _mm_set1_pd(2.0);
 +    x                = xx[0];
 +    f                = ff[0];
 +
 +    nri              = nlist->nri;
 +    iinr             = nlist->iinr;
 +    jindex           = nlist->jindex;
 +    jjnr             = nlist->jjnr;
 +    shiftidx         = nlist->shift;
 +    gid              = nlist->gid;
 +    shiftvec         = fr->shift_vec[0];
 +    fshift           = fr->fshift[0];
 +    facel            = _mm_set1_pd(fr->epsfac);
 +    charge           = mdatoms->chargeA;
 +
 +    invsqrta         = fr->invsqrta;
 +    dvda             = fr->dvda;
 +    gbtabscale       = _mm_set1_pd(fr->gbtab.scale);
 +    gbtab            = fr->gbtab.data;
 +    gbinvepsdiff     = _mm_set1_pd((1.0/fr->epsilon_r) - (1.0/fr->gb_epsilon_solvent));
 +
 +    /* Avoid stupid compiler warnings */
 +    jnrA = jnrB = 0;
 +    j_coord_offsetA = 0;
 +    j_coord_offsetB = 0;
 +
 +    outeriter        = 0;
 +    inneriter        = 0;
 +
 +    /* Start outer loop over neighborlists */
 +    for(iidx=0; iidx<nri; iidx++)
 +    {
 +        /* Load shift vector for this list */
 +        i_shift_offset   = DIM*shiftidx[iidx];
 +
 +        /* Load limits for loop over neighbors */
 +        j_index_start    = jindex[iidx];
 +        j_index_end      = jindex[iidx+1];
 +
 +        /* Get outer coordinate index */
 +        inr              = iinr[iidx];
 +        i_coord_offset   = DIM*inr;
 +
 +        /* Load i particle coords and add shift vector */
 +        gmx_mm_load_shift_and_1rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
 +
 +        fix0             = _mm_setzero_pd();
 +        fiy0             = _mm_setzero_pd();
 +        fiz0             = _mm_setzero_pd();
 +
 +        /* Load parameters for i particles */
 +        iq0              = _mm_mul_pd(facel,_mm_load1_pd(charge+inr+0));
 +        isai0            = _mm_load1_pd(invsqrta+inr+0);
 +
 +        dvdasum          = _mm_setzero_pd();
 +
 +        /* Start inner kernel loop */
 +        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
 +        {
 +
 +            /* Get j neighbor index, and coordinate index */
 +            jnrA             = jjnr[jidx];
 +            jnrB             = jjnr[jidx+1];
 +            j_coord_offsetA  = DIM*jnrA;
 +            j_coord_offsetB  = DIM*jnrB;
 +
 +            /* load j atom coordinates */
 +            gmx_mm_load_1rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
 +                                              &jx0,&jy0,&jz0);
 +
 +            /* Calculate displacement vector */
 +            dx00             = _mm_sub_pd(ix0,jx0);
 +            dy00             = _mm_sub_pd(iy0,jy0);
 +            dz00             = _mm_sub_pd(iz0,jz0);
 +
 +            /* Calculate squared distance and things based on it */
 +            rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
 +
 +            rinv00           = gmx_mm_invsqrt_pd(rsq00);
 +
 +            /* Load parameters for j particles */
 +            jq0              = gmx_mm_load_2real_swizzle_pd(charge+jnrA+0,charge+jnrB+0);
 +            isaj0            = gmx_mm_load_2real_swizzle_pd(invsqrta+jnrA+0,invsqrta+jnrB+0);
 +
 +            /**************************
 +             * CALCULATE INTERACTIONS *
 +             **************************/
 +
 +            r00              = _mm_mul_pd(rsq00,rinv00);
 +
 +            /* Compute parameters for interactions between i and j atoms */
 +            qq00             = _mm_mul_pd(iq0,jq0);
 +
 +            /* GENERALIZED BORN AND COULOMB ELECTROSTATICS */
 +            isaprod          = _mm_mul_pd(isai0,isaj0);
 +            gbqqfactor       = _mm_xor_pd(signbit,_mm_mul_pd(qq00,_mm_mul_pd(isaprod,gbinvepsdiff)));
 +            gbscale          = _mm_mul_pd(isaprod,gbtabscale);
 +
 +            /* Calculate generalized born table index - this is a separate table from the normal one,
 +             * but we use the same procedure by multiplying r with scale and truncating to integer.
 +             */
 +            rt               = _mm_mul_pd(r00,gbscale);
 +            gbitab           = _mm_cvttpd_epi32(rt);
 +#ifdef __XOP__
 +            gbeps            = _mm_frcz_pd(rt);
 +#else
 +            gbeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
 +#endif
 +            gbitab           = _mm_slli_epi32(gbitab,2);
 +
 +            Y                = _mm_load_pd( gbtab + _mm_extract_epi32(gbitab,0) );
 +            F                = _mm_load_pd( gbtab + _mm_extract_epi32(gbitab,1) );
 +            GMX_MM_TRANSPOSE2_PD(Y,F);
 +            G                = _mm_load_pd( gbtab + _mm_extract_epi32(gbitab,0) +2);
 +            H                = _mm_load_pd( gbtab + _mm_extract_epi32(gbitab,1) +2);
 +            GMX_MM_TRANSPOSE2_PD(G,H);
 +            Fp               = _mm_macc_pd(gbeps,_mm_macc_pd(gbeps,H,G),F);
 +            VV               = _mm_macc_pd(gbeps,Fp,Y);
 +            vgb              = _mm_mul_pd(gbqqfactor,VV);
 +
 +            twogbeps         = _mm_add_pd(gbeps,gbeps);
 +            FF               = _mm_macc_pd(_mm_macc_pd(twogbeps,H,G),gbeps,Fp);
 +            fgb              = _mm_mul_pd(gbqqfactor,_mm_mul_pd(FF,gbscale));
 +            dvdatmp          = _mm_mul_pd(minushalf,_mm_macc_pd(fgb,r00,vgb));
 +            dvdasum          = _mm_add_pd(dvdasum,dvdatmp);
 +            gmx_mm_increment_2real_swizzle_pd(dvda+jnrA,dvda+jnrB,_mm_mul_pd(dvdatmp,_mm_mul_pd(isaj0,isaj0)));
 +            velec            = _mm_mul_pd(qq00,rinv00);
 +            felec            = _mm_mul_pd(_mm_msub_pd(velec,rinv00,fgb),rinv00);
 +
 +            fscal            = felec;
 +
 +            /* Update vectorial force */
 +            fix0             = _mm_macc_pd(dx00,fscal,fix0);
 +            fiy0             = _mm_macc_pd(dy00,fscal,fiy0);
 +            fiz0             = _mm_macc_pd(dz00,fscal,fiz0);
 +            
 +            gmx_mm_decrement_1rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,
 +                                                   _mm_mul_pd(dx00,fscal),
 +                                                   _mm_mul_pd(dy00,fscal),
 +                                                   _mm_mul_pd(dz00,fscal));
 +
 +            /* Inner loop uses 59 flops */
 +        }
 +
 +        if(jidx<j_index_end)
 +        {
 +
 +            jnrA             = jjnr[jidx];
 +            j_coord_offsetA  = DIM*jnrA;
 +
 +            /* load j atom coordinates */
 +            gmx_mm_load_1rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
 +                                              &jx0,&jy0,&jz0);
 +
 +            /* Calculate displacement vector */
 +            dx00             = _mm_sub_pd(ix0,jx0);
 +            dy00             = _mm_sub_pd(iy0,jy0);
 +            dz00             = _mm_sub_pd(iz0,jz0);
 +
 +            /* Calculate squared distance and things based on it */
 +            rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
 +
 +            rinv00           = gmx_mm_invsqrt_pd(rsq00);
 +
 +            /* Load parameters for j particles */
 +            jq0              = _mm_load_sd(charge+jnrA+0);
 +            isaj0            = _mm_load_sd(invsqrta+jnrA+0);
 +
 +            /**************************
 +             * CALCULATE INTERACTIONS *
 +             **************************/
 +
 +            r00              = _mm_mul_pd(rsq00,rinv00);
 +
 +            /* Compute parameters for interactions between i and j atoms */
 +            qq00             = _mm_mul_pd(iq0,jq0);
 +
 +            /* GENERALIZED BORN AND COULOMB ELECTROSTATICS */
 +            isaprod          = _mm_mul_pd(isai0,isaj0);
 +            gbqqfactor       = _mm_xor_pd(signbit,_mm_mul_pd(qq00,_mm_mul_pd(isaprod,gbinvepsdiff)));
 +            gbscale          = _mm_mul_pd(isaprod,gbtabscale);
 +
 +            /* Calculate generalized born table index - this is a separate table from the normal one,
 +             * but we use the same procedure by multiplying r with scale and truncating to integer.
 +             */
 +            rt               = _mm_mul_pd(r00,gbscale);
 +            gbitab           = _mm_cvttpd_epi32(rt);
 +#ifdef __XOP__
 +            gbeps            = _mm_frcz_pd(rt);
 +#else
 +            gbeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
 +#endif
 +            gbitab           = _mm_slli_epi32(gbitab,2);
 +
 +            Y                = _mm_load_pd( gbtab + _mm_extract_epi32(gbitab,0) );
 +            F                = _mm_setzero_pd();
 +            GMX_MM_TRANSPOSE2_PD(Y,F);
 +            G                = _mm_load_pd( gbtab + _mm_extract_epi32(gbitab,0) +2);
 +            H                = _mm_setzero_pd();
 +            GMX_MM_TRANSPOSE2_PD(G,H);
 +            Fp               = _mm_macc_pd(gbeps,_mm_macc_pd(gbeps,H,G),F);
 +            VV               = _mm_macc_pd(gbeps,Fp,Y);
 +            vgb              = _mm_mul_pd(gbqqfactor,VV);
 +
 +            twogbeps         = _mm_add_pd(gbeps,gbeps);
 +            FF               = _mm_macc_pd(_mm_macc_pd(twogbeps,H,G),gbeps,Fp);
 +            fgb              = _mm_mul_pd(gbqqfactor,_mm_mul_pd(FF,gbscale));
 +            dvdatmp          = _mm_mul_pd(minushalf,_mm_macc_pd(fgb,r00,vgb));
++            dvdatmp          = _mm_unpacklo_pd(dvdatmp,_mm_setzero_pd());
 +            dvdasum          = _mm_add_pd(dvdasum,dvdatmp);
 +            gmx_mm_increment_1real_pd(dvda+jnrA,_mm_mul_pd(dvdatmp,_mm_mul_pd(isaj0,isaj0)));
 +            velec            = _mm_mul_pd(qq00,rinv00);
 +            felec            = _mm_mul_pd(_mm_msub_pd(velec,rinv00,fgb),rinv00);
 +
 +            fscal            = felec;
 +
 +            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
 +
 +            /* Update vectorial force */
 +            fix0             = _mm_macc_pd(dx00,fscal,fix0);
 +            fiy0             = _mm_macc_pd(dy00,fscal,fiy0);
 +            fiz0             = _mm_macc_pd(dz00,fscal,fiz0);
 +            
 +            gmx_mm_decrement_1rvec_1ptr_swizzle_pd(f+j_coord_offsetA,
 +                                                   _mm_mul_pd(dx00,fscal),
 +                                                   _mm_mul_pd(dy00,fscal),
 +                                                   _mm_mul_pd(dz00,fscal));
 +
 +            /* Inner loop uses 59 flops */
 +        }
 +
 +        /* End of innermost loop */
 +
 +        gmx_mm_update_iforce_1atom_swizzle_pd(fix0,fiy0,fiz0,
 +                                              f+i_coord_offset,fshift+i_shift_offset);
 +
 +        dvdasum = _mm_mul_pd(dvdasum, _mm_mul_pd(isai0,isai0));
 +        gmx_mm_update_1pot_pd(dvdasum,dvda+inr);
 +
 +        /* Increment number of inner iterations */
 +        inneriter                  += j_index_end - j_index_start;
 +
 +        /* Outer loop uses 7 flops */
 +    }
 +
 +    /* Increment number of outer iterations */
 +    outeriter        += nri;
 +
 +    /* Update outer/inner flops */
 +
 +    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_F,outeriter*7 + inneriter*59);
 +}
index 43f7f24fd8fb531c7126c9f935e1ac888bdec843,0000000000000000000000000000000000000000..6405a27a2210f604384b7e20b5dcff69dad8a54b
mode 100644,000000..100644
--- /dev/null
@@@ -1,1068 -1,0 +1,1071 @@@
 +/* #if 0 */
 +#error This file must be processed with the Gromacs pre-preprocessor
 +/* #endif */
 +/* #if INCLUDE_HEADER */
 +#ifdef HAVE_CONFIG_H
 +#include <config.h>
 +#endif
 +
 +#include <math.h>
 +
 +#include "../nb_kernel.h"
 +#include "types/simple.h"
 +#include "vec.h"
 +#include "nrnb.h"
 +
 +#include "gmx_math_x86_avx_128_fma_double.h"
 +#include "kernelutil_x86_avx_128_fma_double.h"
 +/* #endif */
 +
 +/* ## List of variables set by the generating script:                                    */
 +/* ##                                                                                    */
 +/* ## Settings that apply to the entire kernel:                                          */
 +/* ## KERNEL_ELEC:           String, choice for electrostatic interactions               */
 +/* ## KERNEL_VDW:            String, choice for van der Waals interactions               */
 +/* ## KERNEL_NAME:           String, name of this kernel                                 */
 +/* ## KERNEL_VF:             String telling if we calculate potential, force, or both    */
 +/* ## GEOMETRY_I/GEOMETRY_J: String, name of each geometry, e.g. 'Water3' or '1Particle' */
 +/* ##                                                                                    */
 +/* ## Settings that apply to particles in the outer (I) or inner (J) loops:              */
 +/* ## PARTICLES_I[]/         Arrays with lists of i/j particles to use in kernel. It is  */
 +/* ## PARTICLES_J[]:         just [0] for particle geometry, but can be longer for water */
 +/* ## PARTICLES_ELEC_I[]/    Arrays with lists of i/j particle that have electrostatics  */
 +/* ## PARTICLES_ELEC_J[]:    interactions that should be calculated in this kernel.      */
 +/* ## PARTICLES_VDW_I[]/     Arrays with the list of i/j particle that have VdW          */
 +/* ## PARTICLES_VDW_J[]:     interactions that should be calculated in this kernel.      */
 +/* ##                                                                                    */
 +/* ## Settings for pairs of interactions (e.g. 2nd i particle against 1st j particle)    */
 +/* ## PAIRS_IJ[]:            Array with (i,j) tuples of pairs for which interactions     */
 +/* ##                        should be calculated in this kernel. Zero-charge particles  */
 +/* ##                        do not have interactions with particles without vdw, and    */
 +/* ##                        Vdw-only interactions are not evaluated in a no-vdw-kernel. */
 +/* ## INTERACTION_FLAGS[][]: 2D matrix, dimension e.g. 3*3 for water-water interactions. */
 +/* ##                        For each i-j pair, the element [I][J] is a list of strings  */
 +/* ##                        defining properties/flags of this interaction. Examples     */
 +/* ##                        include 'electrostatics'/'vdw' if that type of interaction  */
 +/* ##                        should be evaluated, 'rsq'/'rinv'/'rinvsq' if those values  */
 +/* ##                        are needed, and 'exactcutoff' or 'shift','switch' to        */
 +/* ##                        decide if the force/potential should be modified. This way  */
 +/* ##                        we only calculate values absolutely needed for each case.   */
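 +/* ##                                                                                    */
 +/* ## Illustrative example (not generator output): with GEOMETRY_I=='Water3' and         */
 +/* ## PARTICLES_I==[0,1,2], a template line such as 'fix{I} = _mm_setzero_pd();'         */
 +/* ## inside '#for I in PARTICLES_I' expands to the three statements fix0/fix1/fix2.     */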
 +
 +/* ## Calculate the size and offset for (merged/interleaved) table data */
 +
 +/*
 + * Gromacs nonbonded kernel:   {KERNEL_NAME}
 + * Electrostatics interaction: {KERNEL_ELEC}
 + * VdW interaction:            {KERNEL_VDW}
 + * Geometry:                   {GEOMETRY_I}-{GEOMETRY_J}
 + * Calculate force/pot:        {KERNEL_VF}
 + */
 +void
 +{KERNEL_NAME}
 +                    (t_nblist * gmx_restrict                nlist,
 +                     rvec * gmx_restrict                    xx,
 +                     rvec * gmx_restrict                    ff,
 +                     t_forcerec * gmx_restrict              fr,
 +                     t_mdatoms * gmx_restrict               mdatoms,
 +                     nb_kernel_data_t * gmx_restrict        kernel_data,
 +                     t_nrnb * gmx_restrict                  nrnb)
 +{
 +    /* ## Not all variables are used for all kernels, but any optimizing compiler fixes that, */
 +    /* ## so there is no point in going to extremes to exclude variables that are not needed. */
 +    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
 +     * just 0 for non-waters.
 +     * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
 +     * jnr indices corresponding to data put in the two positions in the SIMD register.
 +     */
 +    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
 +    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
 +    int              jnrA,jnrB;
 +    int              j_coord_offsetA,j_coord_offsetB;
 +    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
 +    real             rcutoff_scalar;
 +    real             *shiftvec,*fshift,*x,*f;
 +    __m128d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
 +    /* #for I in PARTICLES_I */
 +    int              vdwioffset{I};
 +    __m128d          ix{I},iy{I},iz{I},fix{I},fiy{I},fiz{I},iq{I},isai{I};
 +    /* #endfor */
 +    /* #for J in PARTICLES_J */
 +    int              vdwjidx{J}A,vdwjidx{J}B;
 +    __m128d          jx{J},jy{J},jz{J},fjx{J},fjy{J},fjz{J},jq{J},isaj{J};
 +    /* #endfor */
 +    /* #for I,J in PAIRS_IJ */
 +    __m128d          dx{I}{J},dy{I}{J},dz{I}{J},rsq{I}{J},rinv{I}{J},rinvsq{I}{J},r{I}{J},qq{I}{J},c6_{I}{J},c12_{I}{J};
 +    /* #endfor */
 +    /* #if KERNEL_ELEC != 'None' */
 +    __m128d          velec,felec,velecsum,facel,crf,krf,krf2;
 +    real             *charge;
 +    /* #endif */
 +    /* #if 'GeneralizedBorn' in KERNEL_ELEC */
 +    __m128i          gbitab;
 +    __m128d          vgb,fgb,vgbsum,dvdasum,gbscale,gbtabscale,isaprod,gbqqfactor,gbinvepsdiff,dvdaj,gbeps,twogbeps,dvdatmp;
 +    __m128d          minushalf = _mm_set1_pd(-0.5);
 +    real             *invsqrta,*dvda,*gbtab;
 +    /* #endif */
 +    /* #if KERNEL_VDW != 'None' */
 +    int              nvdwtype;
 +    __m128d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
 +    int              *vdwtype;
 +    real             *vdwparam;
 +    __m128d          one_sixth   = _mm_set1_pd(1.0/6.0);
 +    __m128d          one_twelfth = _mm_set1_pd(1.0/12.0);
 +    /* #endif */
 +    /* #if 'Table' in KERNEL_ELEC or 'GeneralizedBorn' in KERNEL_ELEC or 'Table' in KERNEL_VDW */
 +    __m128i          vfitab;
 +    __m128i          ifour       = _mm_set1_epi32(4);
 +    __m128d          rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
 +    real             *vftab;
 +    /* #endif */
 +    /* #if 'Ewald' in KERNEL_ELEC */
 +    __m128i          ewitab;
 +    __m128d          ewtabscale,eweps,twoeweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
 +    real             *ewtab;
 +    /* #endif */
 +    /* #if 'PotentialSwitch' in [KERNEL_MOD_ELEC,KERNEL_MOD_VDW] */
 +    __m128d          rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
 +    real             rswitch_scalar,d_scalar;
 +    /* #endif */
 +    __m128d          dummy_mask,cutoff_mask;
 +    __m128d          signbit   = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
 +    __m128d          one     = _mm_set1_pd(1.0);
 +    __m128d          two     = _mm_set1_pd(2.0);
 +    x                = xx[0];
 +    f                = ff[0];
 +
 +    nri              = nlist->nri;
 +    iinr             = nlist->iinr;
 +    jindex           = nlist->jindex;
 +    jjnr             = nlist->jjnr;
 +    shiftidx         = nlist->shift;
 +    gid              = nlist->gid;
 +    shiftvec         = fr->shift_vec[0];
 +    fshift           = fr->fshift[0];
 +    /* #if KERNEL_ELEC != 'None' */
 +    facel            = _mm_set1_pd(fr->epsfac);
 +    charge           = mdatoms->chargeA;
 +    /*     #if 'ReactionField' in KERNEL_ELEC */
 +    krf              = _mm_set1_pd(fr->ic->k_rf);
 +    krf2             = _mm_set1_pd(fr->ic->k_rf*2.0);
 +    crf              = _mm_set1_pd(fr->ic->c_rf);
 +    /*     #endif */
 +    /* #endif */
 +    /* #if KERNEL_VDW != 'None' */
 +    nvdwtype         = fr->ntype;
 +    vdwparam         = fr->nbfp;
 +    vdwtype          = mdatoms->typeA;
 +    /* #endif */
 +
 +    /* #if 'Table' in KERNEL_ELEC and 'Table' in KERNEL_VDW */
 +    vftab            = kernel_data->table_elec_vdw->data;
 +    vftabscale       = _mm_set1_pd(kernel_data->table_elec_vdw->scale);
 +    /* #elif 'Table' in KERNEL_ELEC */
 +    vftab            = kernel_data->table_elec->data;
 +    vftabscale       = _mm_set1_pd(kernel_data->table_elec->scale);
 +    /* #elif 'Table' in KERNEL_VDW */
 +    vftab            = kernel_data->table_vdw->data;
 +    vftabscale       = _mm_set1_pd(kernel_data->table_vdw->scale);
 +    /* #endif */
 +
 +    /* #if 'Ewald' in KERNEL_ELEC */
 +    sh_ewald         = _mm_set1_pd(fr->ic->sh_ewald);
 +    /*     #if KERNEL_VF=='Force' and KERNEL_MOD_ELEC!='PotentialSwitch' */
 +    ewtab            = fr->ic->tabq_coul_F;
 +    ewtabscale       = _mm_set1_pd(fr->ic->tabq_scale);
 +    ewtabhalfspace   = _mm_set1_pd(0.5/fr->ic->tabq_scale);
 +    /*     #else */
 +    ewtab            = fr->ic->tabq_coul_FDV0;
 +    ewtabscale       = _mm_set1_pd(fr->ic->tabq_scale);
 +    ewtabhalfspace   = _mm_set1_pd(0.5/fr->ic->tabq_scale);
 +    /*     #endif */
 +    /* #endif */
 +
 +    /* #if KERNEL_ELEC=='GeneralizedBorn' */
 +    invsqrta         = fr->invsqrta;
 +    dvda             = fr->dvda;
 +    gbtabscale       = _mm_set1_pd(fr->gbtab.scale);
 +    gbtab            = fr->gbtab.data;
 +    gbinvepsdiff     = _mm_set1_pd((1.0/fr->epsilon_r) - (1.0/fr->gb_epsilon_solvent));
 +    /* #endif */
 +
 +    /* #if 'Water' in GEOMETRY_I */
 +    /* Setup water-specific parameters */
 +    inr              = nlist->iinr[0];
 +    /*     #for I in PARTICLES_ELEC_I */
 +    iq{I}              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+{I}]));
 +    /*     #endfor */
 +    /*     #for I in PARTICLES_VDW_I */
 +    vdwioffset{I}      = 2*nvdwtype*vdwtype[inr+{I}];
 +    /*     #endfor */
 +    /* #endif */
 +
 +    /* #if 'Water' in GEOMETRY_J */
 +    /*     #for J in PARTICLES_ELEC_J */
 +    jq{J}              = _mm_set1_pd(charge[inr+{J}]);
 +    /*     #endfor */
 +    /*     #for J in PARTICLES_VDW_J */
 +    vdwjidx{J}A        = 2*vdwtype[inr+{J}];
 +    /*     #endfor */
 +    /*     #for I,J in PAIRS_IJ */
 +    /*         #if 'electrostatics' in INTERACTION_FLAGS[I][J] */
 +    qq{I}{J}             = _mm_mul_pd(iq{I},jq{J});
 +    /*         #endif */
 +    /*         #if 'vdw' in INTERACTION_FLAGS[I][J] */
 +    c6_{I}{J}            = _mm_set1_pd(vdwparam[vdwioffset{I}+vdwjidx{J}A]);
 +    c12_{I}{J}           = _mm_set1_pd(vdwparam[vdwioffset{I}+vdwjidx{J}A+1]);
 +    /*         #endif */
 +    /*     #endfor */
 +    /* #endif */
 +
 +    /* #if KERNEL_MOD_ELEC!='None' or KERNEL_MOD_VDW!='None' */
 +    /*     #if KERNEL_ELEC!='None' */
 +    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
 +    rcutoff_scalar   = fr->rcoulomb;
 +    /*     #else */
 +    rcutoff_scalar   = fr->rvdw;
 +    /*     #endif */
 +    rcutoff          = _mm_set1_pd(rcutoff_scalar);
 +    rcutoff2         = _mm_mul_pd(rcutoff,rcutoff);
 +    /* #endif */
 +
 +    /* #if KERNEL_MOD_VDW=='PotentialShift' */
 +    sh_vdw_invrcut6  = _mm_set1_pd(fr->ic->sh_invrc6);
 +    rvdw             = _mm_set1_pd(fr->rvdw);
 +    /* #endif */
 +
 +    /* #if 'PotentialSwitch' in [KERNEL_MOD_ELEC,KERNEL_MOD_VDW] */
 +    /*     #if KERNEL_MOD_ELEC=='PotentialSwitch'  */
 +    rswitch_scalar   = fr->rcoulomb_switch;
 +    rswitch          = _mm_set1_pd(rswitch_scalar);
 +    /*     #else */
 +    rswitch_scalar   = fr->rvdw_switch;
 +    rswitch          = _mm_set1_pd(rswitch_scalar);
 +    /*     #endif */
 +    /* Setup switch parameters */
 +    d_scalar         = rcutoff_scalar-rswitch_scalar;
 +    d                = _mm_set1_pd(d_scalar);
 +    swV3             = _mm_set1_pd(-10.0/(d_scalar*d_scalar*d_scalar));
 +    swV4             = _mm_set1_pd( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
 +    swV5             = _mm_set1_pd( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
 +    /*     #if 'Force' in KERNEL_VF */
 +    swF2             = _mm_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar));
 +    swF3             = _mm_set1_pd( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
 +    swF4             = _mm_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
 +    /*     #endif */
 +    /* #endif */
 +
 +    /* Avoid stupid compiler warnings */
 +    jnrA = jnrB = 0;
 +    j_coord_offsetA = 0;
 +    j_coord_offsetB = 0;
 +
 +    /* ## Keep track of the floating point operations we issue for reporting! */
 +    /* #define OUTERFLOPS 0 */
 +    outeriter        = 0;
 +    inneriter        = 0;
 +
 +    /* Start outer loop over neighborlists */
 +    for(iidx=0; iidx<nri; iidx++)
 +    {
 +        /* Load shift vector for this list */
 +        i_shift_offset   = DIM*shiftidx[iidx];
 +
 +        /* Load limits for loop over neighbors */
 +        j_index_start    = jindex[iidx];
 +        j_index_end      = jindex[iidx+1];
 +
 +        /* Get outer coordinate index */
 +        inr              = iinr[iidx];
 +        i_coord_offset   = DIM*inr;
 +
 +        /* Load i particle coords and add shift vector */
 +        /* #if GEOMETRY_I == 'Particle' */
 +        gmx_mm_load_shift_and_1rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
 +        /* #elif GEOMETRY_I == 'Water3' */
 +        gmx_mm_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
 +                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
 +        /* #elif GEOMETRY_I == 'Water4' */
 +        /*     #if 0 in PARTICLES_I                 */
 +        gmx_mm_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
 +                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
 +        /*     #else                                */
 +        gmx_mm_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
 +                                                 &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
 +        /*     #endif                               */
 +        /* #endif                                   */
 +
 +        /* #if 'Force' in KERNEL_VF */
 +        /*     #for I in PARTICLES_I */
 +        fix{I}             = _mm_setzero_pd();
 +        fiy{I}             = _mm_setzero_pd();
 +        fiz{I}             = _mm_setzero_pd();
 +        /*     #endfor */
 +        /* #endif */
 +
 +        /* ## For water we already preloaded parameters at the start of the kernel */
 +        /* #if not 'Water' in GEOMETRY_I */
 +        /* Load parameters for i particles */
 +        /*     #for I in PARTICLES_ELEC_I */
 +        iq{I}              = _mm_mul_pd(facel,_mm_load1_pd(charge+inr+{I}));
 +        /*         #define OUTERFLOPS OUTERFLOPS+1 */
 +        /*         #if KERNEL_ELEC=='GeneralizedBorn' */
 +        isai{I}            = _mm_load1_pd(invsqrta+inr+{I});
 +        /*         #endif */
 +        /*     #endfor */
 +        /*     #for I in PARTICLES_VDW_I */
 +        vdwioffset{I}      = 2*nvdwtype*vdwtype[inr+{I}];
 +        /*     #endfor */
 +        /* #endif */
 +
 +        /* #if 'Potential' in KERNEL_VF */
 +        /* Reset potential sums */
 +        /*     #if KERNEL_ELEC != 'None' */
 +        velecsum         = _mm_setzero_pd();
 +        /*     #endif */
 +        /*     #if 'GeneralizedBorn' in KERNEL_ELEC */
 +        vgbsum           = _mm_setzero_pd();
 +        /*     #endif */
 +        /*     #if KERNEL_VDW != 'None' */
 +        vvdwsum          = _mm_setzero_pd();
 +        /*     #endif */
 +        /* #endif */
 +        /*     #if 'GeneralizedBorn' in KERNEL_ELEC and 'Force' in KERNEL_VF */
 +        dvdasum          = _mm_setzero_pd();
 +        /*     #endif */
 +
 +        /* #for ROUND in ['Loop','Epilogue'] */
 +
 +        /* #if ROUND =='Loop' */
 +        /* Start inner kernel loop */
 +        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
 +        {
 +        /* ## First round is normal loop (next statement resets indentation) */
 +        /*     #if 0 */
 +        }
 +        /*     #endif */
 +        /* #else */
 +        if(jidx<j_index_end)
 +        {
 +        /* ## Second round is epilogue */
 +        /* #endif */
 +        /* #define INNERFLOPS 0 */
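 +        /* ## A sketch of how this ROUND construct expands, assuming the simplest   */
 +        /* ## case GEOMETRY_I == GEOMETRY_J == 'Particle': the 'Loop' round keeps   */
 +        /* ## the for-statement above and consumes j entries in pairs, while the    */
 +        /* ## 'Epilogue' round keeps only the if-statement and handles at most one  */
 +        /* ## leftover j atom, loading a single rvec and clearing the unused upper  */
 +        /* ## SIMD lane (via _mm_unpacklo_pd) before any accumulation.              */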
 +
 +            /* #if ROUND =='Loop' */
 +            /* Get j neighbor index, and coordinate index */
 +            jnrA             = jjnr[jidx];
 +            jnrB             = jjnr[jidx+1];
 +            j_coord_offsetA  = DIM*jnrA;
 +            j_coord_offsetB  = DIM*jnrB;
 +
 +            /* load j atom coordinates */
 +            /*     #if GEOMETRY_J == 'Particle'             */
 +            gmx_mm_load_1rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
 +                                              &jx0,&jy0,&jz0);
 +            /*     #elif GEOMETRY_J == 'Water3'             */
 +            gmx_mm_load_3rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
 +                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
 +            /*     #elif GEOMETRY_J == 'Water4'             */
 +            /*         #if 0 in PARTICLES_J                 */
 +            gmx_mm_load_4rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
 +                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
 +                                              &jy2,&jz2,&jx3,&jy3,&jz3);
 +            /*         #else                                */
 +            gmx_mm_load_3rvec_2ptr_swizzle_pd(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
 +                                              &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
 +            /*         #endif                               */
 +            /*     #endif                                   */
 +            /* #else */
 +            jnrA             = jjnr[jidx];
 +            j_coord_offsetA  = DIM*jnrA;
 +
 +            /* load j atom coordinates */
 +            /*     #if GEOMETRY_J == 'Particle'             */
 +            gmx_mm_load_1rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
 +                                              &jx0,&jy0,&jz0);
 +            /*     #elif GEOMETRY_J == 'Water3'             */
 +            gmx_mm_load_3rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
 +                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
 +            /*     #elif GEOMETRY_J == 'Water4'             */
 +            /*         #if 0 in PARTICLES_J                 */
 +            gmx_mm_load_4rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
 +                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
 +                                              &jy2,&jz2,&jx3,&jy3,&jz3);
 +            /*         #else                                */
 +            gmx_mm_load_3rvec_1ptr_swizzle_pd(x+j_coord_offsetA+DIM,
 +                                              &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
 +            /*         #endif                               */
 +            /*     #endif                                   */
 +            /* #endif */
 +
 +            /* Calculate displacement vector */
 +            /* #for I,J in PAIRS_IJ */
 +            dx{I}{J}             = _mm_sub_pd(ix{I},jx{J});
 +            dy{I}{J}             = _mm_sub_pd(iy{I},jy{J});
 +            dz{I}{J}             = _mm_sub_pd(iz{I},jz{J});
 +            /*     #define INNERFLOPS INNERFLOPS+3 */
 +            /* #endfor */
 +
 +            /* Calculate squared distance and things based on it */
 +            /* #for I,J in PAIRS_IJ */
 +            rsq{I}{J}            = gmx_mm_calc_rsq_pd(dx{I}{J},dy{I}{J},dz{I}{J});
 +            /*     #define INNERFLOPS INNERFLOPS+5 */
 +            /* #endfor */
 +
 +            /* #for I,J in PAIRS_IJ */
 +            /*     #if 'rinv' in INTERACTION_FLAGS[I][J] */
 +            rinv{I}{J}           = gmx_mm_invsqrt_pd(rsq{I}{J});
 +            /*         #define INNERFLOPS INNERFLOPS+5 */
 +            /*     #endif */
 +            /* #endfor */
 +
 +            /* #for I,J in PAIRS_IJ */
 +            /*     #if 'rinvsq' in INTERACTION_FLAGS[I][J] */
 +            /*         # if 'rinv' not in INTERACTION_FLAGS[I][J] */
 +            rinvsq{I}{J}         = gmx_mm_inv_pd(rsq{I}{J});
 +            /*             #define INNERFLOPS INNERFLOPS+4 */
 +            /*         #else */
 +            rinvsq{I}{J}         = _mm_mul_pd(rinv{I}{J},rinv{I}{J});
 +            /*             #define INNERFLOPS INNERFLOPS+1 */
 +            /*         #endif */
 +            /*     #endif */
 +            /* #endfor */
 +
 +            /* #if not 'Water' in GEOMETRY_J */
 +            /* Load parameters for j particles */
 +            /*     #for J in PARTICLES_ELEC_J */
 +            /*         #if ROUND =='Loop' */
 +            jq{J}              = gmx_mm_load_2real_swizzle_pd(charge+jnrA+{J},charge+jnrB+{J});
 +            /*         #else */
 +            jq{J}              = _mm_load_sd(charge+jnrA+{J});
 +            /*         #endif */
 +            /*         #if KERNEL_ELEC=='GeneralizedBorn' */
 +            /*             #if ROUND =='Loop' */
 +            isaj{J}            = gmx_mm_load_2real_swizzle_pd(invsqrta+jnrA+{J},invsqrta+jnrB+{J});
 +            /*             #else */
 +            isaj{J}            = _mm_load_sd(invsqrta+jnrA+{J});
 +            /*             #endif */
 +            /*         #endif */
 +            /*     #endfor */
 +            /*     #for J in PARTICLES_VDW_J */
 +            vdwjidx{J}A        = 2*vdwtype[jnrA+{J}];
 +            /*         #if ROUND =='Loop' */
 +            vdwjidx{J}B        = 2*vdwtype[jnrB+{J}];
 +            /*         #endif */
 +            /*     #endfor */
 +            /* #endif */
 +
 +            /* #if 'Force' in KERNEL_VF and not 'Particle' in GEOMETRY_I */
 +            /*     #for J in PARTICLES_J */
 +            fjx{J}             = _mm_setzero_pd();
 +            fjy{J}             = _mm_setzero_pd();
 +            fjz{J}             = _mm_setzero_pd();
 +            /*     #endfor */
 +            /* #endif */
 +
 +            /* #for I,J in PAIRS_IJ */
 +
 +            /**************************
 +             * CALCULATE INTERACTIONS *
 +             **************************/
 +
 +            /*     #if 'exactcutoff' in INTERACTION_FLAGS[I][J] */
 +            /*         ## We always calculate rinv/rinvsq above to enable pipelining in compilers (performance tested on x86) */
 +            if (gmx_mm_any_lt(rsq{I}{J},rcutoff2))
 +            {
 +                /*     #if 0    ## this and the next two lines are a hack to maintain auto-indentation in the template file */
 +            }
 +            /*         #endif */
 +            /*         #define INNERFLOPS INNERFLOPS+1 */
 +            /*     #endif */
 +
 +            /*     #if 'r' in INTERACTION_FLAGS[I][J] */
 +            r{I}{J}              = _mm_mul_pd(rsq{I}{J},rinv{I}{J});
 +            /*         #define INNERFLOPS INNERFLOPS+1 */
 +            /*     #endif */
 +
 +            /*     ## For water geometries we already loaded parameters at the start of the kernel */
 +            /*     #if not 'Water' in GEOMETRY_J */
 +            /* Compute parameters for interactions between i and j atoms */
 +            /*         #if 'electrostatics' in INTERACTION_FLAGS[I][J] */
 +            qq{I}{J}             = _mm_mul_pd(iq{I},jq{J});
 +            /*             #define INNERFLOPS INNERFLOPS+1 */
 +            /*         #endif */
 +            /*         #if 'vdw' in INTERACTION_FLAGS[I][J] */
 +            /*             #if ROUND == 'Loop' */
 +            gmx_mm_load_2pair_swizzle_pd(vdwparam+vdwioffset{I}+vdwjidx{J}A,
 +                                         vdwparam+vdwioffset{I}+vdwjidx{J}B,&c6_{I}{J},&c12_{I}{J});
 +            /*             #else */
 +            gmx_mm_load_1pair_swizzle_pd(vdwparam+vdwioffset{I}+vdwjidx{J}A,&c6_{I}{J},&c12_{I}{J});
 +            /*             #endif */
 +            /*         #endif */
 +            /*     #endif */
 +
 +            /*     #if 'table' in INTERACTION_FLAGS[I][J] */
 +            /* Calculate table index by multiplying r with table scale and truncating to integer */
 +            rt               = _mm_mul_pd(r{I}{J},vftabscale);
 +            vfitab           = _mm_cvttpd_epi32(rt);
 +#ifdef __XOP__
 +            vfeps            = _mm_frcz_pd(rt);
 +#else
 +            vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
 +#endif
 +            twovfeps         = _mm_add_pd(vfeps,vfeps);
 +            /*         #define INNERFLOPS INNERFLOPS+4                          */
 +            /*         #if 'Table' in KERNEL_ELEC and 'Table' in KERNEL_VDW     */
 +            /*             ## 3 tables, 4 data per point: multiply index by 12 */
 +            vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
 +            /*         #elif 'Table' in KERNEL_ELEC                             */
 +            /*             ## 1 table, 4 data per point: multiply index by 4   */
 +            vfitab           = _mm_slli_epi32(vfitab,2);
 +            /*         #elif 'Table' in KERNEL_VDW                              */
 +            /*             ## 2 tables, 4 data per point: multiply index by 8  */
 +            vfitab           = _mm_slli_epi32(vfitab,3);
 +            /*         #endif                                                   */
 +            /*     #endif */
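 +            /* ## Worked example of the index arithmetic above: each table point   */
 +            /* ## stores 4 values (Y,F,G,H), so the element stride per point is    */
 +            /* ## 4*ntables. For 3 tables (i + (i<<1)) << 2 = (3*i)*4 = 12*i, for  */
 +            /* ## 1 table i<<2 = 4*i, and for 2 tables i<<3 = 8*i.                 */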
 +
 +            /*     ## ELECTROSTATIC INTERACTIONS */
 +            /*     #if 'electrostatics' in INTERACTION_FLAGS[I][J] */
 +
 +            /*         #if KERNEL_ELEC=='Coulomb' */
 +
 +            /* COULOMB ELECTROSTATICS */
 +            velec            = _mm_mul_pd(qq{I}{J},rinv{I}{J});
 +            /*             #define INNERFLOPS INNERFLOPS+1 */
 +            /*             #if 'Force' in KERNEL_VF */
 +            felec            = _mm_mul_pd(velec,rinvsq{I}{J});
 +            /*                 #define INNERFLOPS INNERFLOPS+2 */
 +            /*             #endif */
 +
 +            /*         #elif KERNEL_ELEC=='ReactionField' */
 +
 +            /* REACTION-FIELD ELECTROSTATICS */
 +            /*             #if 'Potential' in KERNEL_VF */
 +            velec            = _mm_mul_pd(qq{I}{J},_mm_sub_pd(_mm_macc_pd(krf,rsq{I}{J},rinv{I}{J}),crf));
 +            /*                 #define INNERFLOPS INNERFLOPS+4 */
 +            /*             #endif */
 +            /*             #if 'Force' in KERNEL_VF */
 +            felec            = _mm_mul_pd(qq{I}{J},_mm_msub_pd(rinv{I}{J},rinvsq{I}{J},krf2));
 +            /*                 #define INNERFLOPS INNERFLOPS+3 */
 +            /*             #endif */
 +
 +            /*         #elif KERNEL_ELEC=='GeneralizedBorn' */
 +
 +            /* GENERALIZED BORN AND COULOMB ELECTROSTATICS */
 +            isaprod          = _mm_mul_pd(isai{I},isaj{J});
 +            gbqqfactor       = _mm_xor_pd(signbit,_mm_mul_pd(qq{I}{J},_mm_mul_pd(isaprod,gbinvepsdiff)));
 +            gbscale          = _mm_mul_pd(isaprod,gbtabscale);
 +            /*             #define INNERFLOPS INNERFLOPS+5 */
 +
 +            /* Calculate generalized born table index - this is a separate table from the normal one,
 +             * but we use the same procedure by multiplying r with scale and truncating to integer.
 +             */
 +            rt               = _mm_mul_pd(r{I}{J},gbscale);
 +            gbitab           = _mm_cvttpd_epi32(rt);
 +#ifdef __XOP__
 +            gbeps            = _mm_frcz_pd(rt);
 +#else
 +            gbeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
 +#endif
 +            gbitab           = _mm_slli_epi32(gbitab,2);
 +
 +            Y                = _mm_load_pd( gbtab + _mm_extract_epi32(gbitab,0) );
 +            /*             #if ROUND == 'Loop' */
 +            F                = _mm_load_pd( gbtab + _mm_extract_epi32(gbitab,1) );
 +            /*             #else */
 +            F                = _mm_setzero_pd();
 +            /*             #endif */
 +            GMX_MM_TRANSPOSE2_PD(Y,F);
 +            G                = _mm_load_pd( gbtab + _mm_extract_epi32(gbitab,0) +2);
 +            /*             #if ROUND == 'Loop' */
 +            H                = _mm_load_pd( gbtab + _mm_extract_epi32(gbitab,1) +2);
 +            /*             #else */
 +            H                = _mm_setzero_pd();
 +            /*             #endif */
 +            GMX_MM_TRANSPOSE2_PD(G,H);
 +            Fp               = _mm_macc_pd(gbeps,_mm_macc_pd(gbeps,H,G),F);
 +            VV               = _mm_macc_pd(gbeps,Fp,Y);
 +            vgb              = _mm_mul_pd(gbqqfactor,VV);
 +            /*             #define INNERFLOPS INNERFLOPS+10 */
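 +            /* ## The loads and transposes above gather one spline row (Y,F,G,H)   */
 +            /* ## per lane; with eps the fractional offset within the bin this     */
 +            /* ## evaluates VV = Y + eps*F + eps^2*G + eps^3*H via Horner's rule   */
 +            /* ## (Fp = F + eps*(G + eps*H)). Below, FF = Fp + eps*(G + 2*eps*H)   */
 +            /* ## = dVV/deps, and fgb applies the chain rule d/dr = gbscale*d/deps. */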
 +
 +            /*             #if 'Force' in KERNEL_VF */
 +            twogbeps         = _mm_add_pd(gbeps,gbeps);
 +            FF               = _mm_macc_pd(_mm_macc_pd(twogbeps,H,G),gbeps,Fp);
 +            fgb              = _mm_mul_pd(gbqqfactor,_mm_mul_pd(FF,gbscale));
 +            dvdatmp          = _mm_mul_pd(minushalf,_mm_macc_pd(fgb,r{I}{J},vgb));
++            /*                 #if ROUND == 'Epilogue' */
++            dvdatmp          = _mm_unpacklo_pd(dvdatmp,_mm_setzero_pd());
++            /*                 #endif */
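 +            /* ## In the epilogue only lane 0 holds a real j atom; the unpacklo    */
 +            /* ## above zeroes lane 1 of dvdatmp so that dvdasum does not pick up  */
 +            /* ## a stale value from the unused SIMD lane.                         */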
 +            dvdasum          = _mm_add_pd(dvdasum,dvdatmp);
 +            /*             #if ROUND == 'Loop' */
 +            gmx_mm_increment_2real_swizzle_pd(dvda+jnrA,dvda+jnrB,_mm_mul_pd(dvdatmp,_mm_mul_pd(isaj{J},isaj{J})));
 +            /*             #else */
 +            gmx_mm_increment_1real_pd(dvda+jnrA,_mm_mul_pd(dvdatmp,_mm_mul_pd(isaj{J},isaj{J})));
 +            /*             #endif */
 +            /*                 #define INNERFLOPS INNERFLOPS+13 */
 +            /*             #endif */
 +            velec            = _mm_mul_pd(qq{I}{J},rinv{I}{J});
 +            /*                 #define INNERFLOPS INNERFLOPS+1 */
 +            /*             #if 'Force' in KERNEL_VF */
 +            felec            = _mm_mul_pd(_mm_msub_pd(velec,rinv{I}{J},fgb),rinv{I}{J});
 +            /*                 #define INNERFLOPS INNERFLOPS+3 */
 +            /*             #endif */
 +
 +            /*         #elif KERNEL_ELEC=='Ewald' */
 +            /* EWALD ELECTROSTATICS */
 +
 +            /* Calculate Ewald table index by multiplying r with scale and truncating to integer */
 +            ewrt             = _mm_mul_pd(r{I}{J},ewtabscale);
 +            ewitab           = _mm_cvttpd_epi32(ewrt);
 +#ifdef __XOP__
 +            eweps            = _mm_frcz_pd(ewrt);
 +#else
 +            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
 +#endif
 +            twoeweps         = _mm_add_pd(eweps,eweps);
 +            /*             #define INNERFLOPS INNERFLOPS+4 */
 +            /*             #if 'Potential' in KERNEL_VF or KERNEL_MOD_ELEC=='PotentialSwitch' */
 +            ewitab           = _mm_slli_epi32(ewitab,2);
 +            ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
 +            /*                 #if ROUND == 'Loop' */
 +            ewtabD           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
 +            /*                 #else */
 +            ewtabD           = _mm_setzero_pd();
 +            /*                 #endif */
 +            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
 +            ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
 +            /*                 #if ROUND == 'Loop' */
 +            ewtabFn          = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
 +            /*                 #else */
 +            ewtabFn          = _mm_setzero_pd();
 +            /*                 #endif */
 +            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
 +            felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
 +            /*                 #define INNERFLOPS INNERFLOPS+2 */
 +            /*                 #if KERNEL_MOD_ELEC=='PotentialShift' */            
 +            velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
 +            velec            = _mm_mul_pd(qq{I}{J},_mm_sub_pd(_mm_sub_pd(rinv{I}{J},sh_ewald),velec));
 +            /*                     #define INNERFLOPS INNERFLOPS+7 */
 +            /*                 #else */
 +            velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
 +            velec            = _mm_mul_pd(qq{I}{J},_mm_sub_pd(rinv{I}{J},velec));
 +            /*                     #define INNERFLOPS INNERFLOPS+6 */
 +            /*                 #endif */
 +            /*                 #if 'Force' in KERNEL_VF */
 +            felec            = _mm_mul_pd(_mm_mul_pd(qq{I}{J},rinv{I}{J}),_mm_sub_pd(rinvsq{I}{J},felec));
 +            /*                      #define INNERFLOPS INNERFLOPS+3 */
 +            /*                 #endif */
 +            /*             #elif KERNEL_VF=='Force' */
 +            /*                 #if ROUND == 'Loop' */
 +            gmx_mm_load_2pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),ewtab+_mm_extract_epi32(ewitab,1),
 +                                         &ewtabF,&ewtabFn);
 +            /*                 #else */
 +            gmx_mm_load_1pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
 +            /*                 #endif */
 +            felec            = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
 +            felec            = _mm_mul_pd(_mm_mul_pd(qq{I}{J},rinv{I}{J}),_mm_sub_pd(rinvsq{I}{J},felec));
 +            /*                 #define INNERFLOPS INNERFLOPS+7 */
 +            /*             #endif */
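 +            /* ## A reading of the expressions above: the table stores the Ewald    */
 +            /* ## correction force F, and the potential is recovered by integrating */
 +            /* ## F over the covered fraction of the bin with the trapezoid rule,   */
 +            /* ##     V(eps) = ewtabV - (h/2)*eps*(F(0) + F(eps)),                  */
 +            /* ## assuming ewtabhalfspace holds half the table spacing h.           */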
 +
 +            /*         #elif KERNEL_ELEC=='CubicSplineTable' */
 +
 +            /* CUBIC SPLINE TABLE ELECTROSTATICS */
 +            Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
 +            /*             #if ROUND == 'Loop' */
 +            F                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
 +            /*             #else */
 +            F                = _mm_setzero_pd();
 +            /*             #endif */
 +            GMX_MM_TRANSPOSE2_PD(Y,F);
 +            G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
 +            /*             #if ROUND == 'Loop' */
 +            H                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
 +            /*             #else */
 +            H                = _mm_setzero_pd();
 +            /*             #endif */
 +            GMX_MM_TRANSPOSE2_PD(G,H);
 +            Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
 +            /*             #define INNERFLOPS INNERFLOPS+4 */
 +            /*             #if 'Potential' in KERNEL_VF */
 +            VV               = _mm_macc_pd(vfeps,Fp,Y);
 +            velec            = _mm_mul_pd(qq{I}{J},VV);
 +            /*                 #define INNERFLOPS INNERFLOPS+3 */
 +            /*             #endif */
 +            /*             #if 'Force' in KERNEL_VF */
 +            FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
 +            felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq{I}{J},FF),_mm_mul_pd(vftabscale,rinv{I}{J})));
 +            /*                 #define INNERFLOPS INNERFLOPS+7 */
 +            /*             #endif */
 +            /*         #endif */
 +            /*         ## End of check for electrostatics interaction forms */
 +            /*     #endif */
 +            /*     ## END OF ELECTROSTATIC INTERACTION CHECK FOR PAIR I-J */
 +
 +            /*     #if 'vdw' in INTERACTION_FLAGS[I][J] */
 +
 +            /*         #if KERNEL_VDW=='LennardJones' */
 +
 +            /* LENNARD-JONES DISPERSION/REPULSION */
 +
 +            rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq{I}{J},rinvsq{I}{J}),rinvsq{I}{J});
 +            /*             #define INNERFLOPS INNERFLOPS+2 */
 +            /*             #if 'Potential' in KERNEL_VF or KERNEL_MOD_VDW=='PotentialSwitch' */
 +            vvdw6            = _mm_mul_pd(c6_{I}{J},rinvsix);
 +            vvdw12           = _mm_mul_pd(c12_{I}{J},_mm_mul_pd(rinvsix,rinvsix));
 +            /*                 #define INNERFLOPS INNERFLOPS+3 */
 +            /*                 #if KERNEL_MOD_VDW=='PotentialShift' */
 +            vvdw             = _mm_msub_pd(_mm_nmacc_pd(c12_{I}{J},_mm_mul_pd(sh_vdw_invrcut6,sh_vdw_invrcut6),vvdw12),one_twelfth,
 +                                           _mm_mul_pd(_mm_nmacc_pd( c6_{I}{J},sh_vdw_invrcut6,vvdw6),one_sixth));
 +            /*                     #define INNERFLOPS INNERFLOPS+8 */
 +            /*                 #else */
 +            vvdw             = _mm_msub_pd( vvdw12,one_twelfth, _mm_mul_pd(vvdw6,one_sixth) );
 +            /*                     #define INNERFLOPS INNERFLOPS+3 */
 +            /*                 #endif */
 +            /*                 ## Check for force inside potential check, i.e. this means we already did the potential part */
 +            /*                 #if 'Force' in KERNEL_VF */
 +            fvdw             = _mm_mul_pd(_mm_sub_pd(vvdw12,vvdw6),rinvsq{I}{J});
 +            /*                     #define INNERFLOPS INNERFLOPS+2 */
 +            /*                 #endif */
 +            /*             #elif KERNEL_VF=='Force' */
 +            /*                 ## Force-only LennardJones makes it possible to save 1 flop (they do add up...) */
 +            fvdw             = _mm_mul_pd(_mm_msub_pd(c12_{I}{J},rinvsix,c6_{I}{J}),_mm_mul_pd(rinvsix,rinvsq{I}{J}));
 +            /*                 #define INNERFLOPS INNERFLOPS+4 */
 +            /*             #endif */
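 +            /* ## Note that vvdw = vvdw12/12 - vvdw6/6 together with                */
 +            /* ## fvdw = (vvdw12 - vvdw6)*rinvsq matches V = C12/r^12 - C6/r^6      */
 +            /* ## under the usual GROMACS convention that vdwparam stores 6*C6 and  */
 +            /* ## 12*C12, so the force F/r = (12*C12/r^12 - 6*C6/r^6)/r^2 comes     */
 +            /* ## out without extra multiplications.                                */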
 +
 +            /*         #elif KERNEL_VDW=='CubicSplineTable' */
 +
 +            /* CUBIC SPLINE TABLE DISPERSION */
 +            /*             #if 'Table' in KERNEL_ELEC */
 +            vfitab           = _mm_add_epi32(vfitab,ifour);
 +            /*             #endif                     */
 +            Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
 +            /*             #if ROUND == 'Loop' */
 +            F                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
 +            /*             #else */
 +            F                = _mm_setzero_pd();
 +            /*             #endif */
 +            GMX_MM_TRANSPOSE2_PD(Y,F);
 +            G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
 +            /*             #if ROUND == 'Loop' */
 +            H                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
 +            /*             #else */
 +            H                = _mm_setzero_pd();
 +            /*             #endif */
 +            GMX_MM_TRANSPOSE2_PD(G,H);
 +            Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(H,vfeps,G),F);
 +            /*             #define INNERFLOPS INNERFLOPS+4 */
 +            /*             #if 'Potential' in KERNEL_VF */
 +            VV               = _mm_macc_pd(vfeps,Fp,Y);
 +            vvdw6            = _mm_mul_pd(c6_{I}{J},VV);
 +            /*                 #define INNERFLOPS INNERFLOPS+3 */
 +            /*             #endif */
 +            /*             #if 'Force' in KERNEL_VF */
 +            FF               = _mm_macc_pd(vfeps,_mm_macc_pd(twovfeps,H,G),Fp);
 +            fvdw6            = _mm_mul_pd(c6_{I}{J},FF);
 +            /*                 #define INNERFLOPS INNERFLOPS+4 */
 +            /*             #endif */
 +
 +            /* CUBIC SPLINE TABLE REPULSION */
 +            vfitab           = _mm_add_epi32(vfitab,ifour);
 +            Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
 +            /*             #if ROUND == 'Loop' */
 +            F                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
 +            /*             #else */
 +            F                = _mm_setzero_pd();
 +            /*             #endif */
 +            GMX_MM_TRANSPOSE2_PD(Y,F);
 +            G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
 +            /*             #if ROUND == 'Loop' */
 +            H                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
 +            /*             #else */
 +            H                = _mm_setzero_pd();
 +            /*             #endif */
 +            GMX_MM_TRANSPOSE2_PD(G,H);
 +            Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(H,vfeps,G),F);
 +            /*             #define INNERFLOPS INNERFLOPS+4 */
 +            /*             #if 'Potential' in KERNEL_VF */
 +            VV               = _mm_macc_pd(vfeps,Fp,Y);
 +            vvdw12           = _mm_mul_pd(c12_{I}{J},VV);
 +            /*                 #define INNERFLOPS INNERFLOPS+3 */
 +            /*             #endif */
 +            /*             #if 'Force' in KERNEL_VF */
 +            FF               = _mm_macc_pd(vfeps,_mm_macc_pd(twovfeps,H,G),Fp);
 +            fvdw12           = _mm_mul_pd(c12_{I}{J},FF);
 +            /*                 #define INNERFLOPS INNERFLOPS+5 */
 +            /*             #endif */
 +            /*             #if 'Potential' in KERNEL_VF */
 +            vvdw             = _mm_add_pd(vvdw12,vvdw6);
 +            /*                 #define INNERFLOPS INNERFLOPS+1 */
 +            /*             #endif */
 +            /*             #if 'Force' in KERNEL_VF */
 +            fvdw             = _mm_xor_pd(signbit,_mm_mul_pd(_mm_add_pd(fvdw6,fvdw12),_mm_mul_pd(vftabscale,rinv{I}{J})));
 +            /*                 #define INNERFLOPS INNERFLOPS+4 */
 +            /*             #endif */
 +            /*         #endif */
 +            /*         ## End of check for vdw interaction forms */
 +            /*     #endif */
 +            /*     ## END OF VDW INTERACTION CHECK FOR PAIR I-J */
 +
 +            /*     #if 'switch' in INTERACTION_FLAGS[I][J] */
 +            d                = _mm_sub_pd(r{I}{J},rswitch);
 +            d                = _mm_max_pd(d,_mm_setzero_pd());
 +            d2               = _mm_mul_pd(d,d);
 +            sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
 +            /*         #define INNERFLOPS INNERFLOPS+10 */
 +
 +            /*         #if 'Force' in KERNEL_VF */
 +            dsw              = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
 +            /*             #define INNERFLOPS INNERFLOPS+5 */
 +            /*         #endif */
 +
 +            /* Evaluate switch function */
 +            /*         #if 'Force' in KERNEL_VF */
 +            /* fscal' = f'/r = -(v*sw)'/r = -(v'*sw + v*dsw)/r = -v'*sw/r - v*dsw/r = fscal*sw - v*dsw/r */
 +            /*             #if 'electrostatics' in INTERACTION_FLAGS[I][J] and KERNEL_MOD_ELEC=='PotentialSwitch' */
 +            felec            = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv{I}{J},_mm_mul_pd(velec,dsw)) );
 +            /*                 #define INNERFLOPS INNERFLOPS+4 */
 +            /*             #endif */
 +            /*             #if 'vdw' in INTERACTION_FLAGS[I][J] and KERNEL_MOD_VDW=='PotentialSwitch' */
 +            fvdw             = _mm_msub_pd( fvdw,sw , _mm_mul_pd(rinv{I}{J},_mm_mul_pd(vvdw,dsw)) );
 +            /*                 #define INNERFLOPS INNERFLOPS+4 */
 +            /*             #endif */
 +            /*         #endif */
 +            /*         #if 'Potential' in KERNEL_VF */
 +            /*             #if 'electrostatics' in INTERACTION_FLAGS[I][J] and KERNEL_MOD_ELEC=='PotentialSwitch' */
 +            velec            = _mm_mul_pd(velec,sw);
 +            /*                 #define INNERFLOPS INNERFLOPS+1 */
 +            /*             #endif */
 +            /*             #if 'vdw' in INTERACTION_FLAGS[I][J] and KERNEL_MOD_VDW=='PotentialSwitch' */
 +            vvdw             = _mm_mul_pd(vvdw,sw);
 +            /*                 #define INNERFLOPS INNERFLOPS+1 */
 +            /*             #endif */
 +            /*         #endif */
 +            /*     #endif */
 +            /*     #if 'exactcutoff' in INTERACTION_FLAGS[I][J] */
 +            cutoff_mask      = _mm_cmplt_pd(rsq{I}{J},rcutoff2);
 +            /*         #define INNERFLOPS INNERFLOPS+1 */
 +            /*     #endif */
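 +            /* ## cutoff_mask is all-ones where rsq < rcutoff2 and zero elsewhere;  */
 +            /* ## AND-ing energies and forces with it below discards pairs beyond   */
 +            /* ## the cutoff while keeping the SIMD arithmetic above branch-free.   */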
 +
 +            /*     #if 'Potential' in KERNEL_VF */
 +            /* Update potential sum for this i atom from the interaction with this j atom. */
 +            /*         #if 'electrostatics' in INTERACTION_FLAGS[I][J] */
 +            /*             #if 'exactcutoff' in INTERACTION_FLAGS[I][J] */
 +            velec            = _mm_and_pd(velec,cutoff_mask);
 +            /*                 #define INNERFLOPS INNERFLOPS+1 */
 +            /*             #endif                                       */
 +            /*             #if ROUND == 'Epilogue' */
 +            velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
 +            /*             #endif */
 +            velecsum         = _mm_add_pd(velecsum,velec);
 +            /*             #define INNERFLOPS INNERFLOPS+1 */
 +            /*             #if KERNEL_ELEC=='GeneralizedBorn' */
 +            /*             #if 'exactcutoff' in INTERACTION_FLAGS[I][J] */
 +            vgb              = _mm_and_pd(vgb,cutoff_mask);
 +            /*                 #define INNERFLOPS INNERFLOPS+1 */
 +            /*             #endif                                       */
 +            /*             #if ROUND == 'Epilogue' */
 +            vgb              = _mm_unpacklo_pd(vgb,_mm_setzero_pd());
 +            /*             #endif */
 +            vgbsum           = _mm_add_pd(vgbsum,vgb);
 +            /*                 #define INNERFLOPS INNERFLOPS+1 */
 +            /*             #endif */
 +            /*         #endif */
 +            /*         #if 'vdw' in INTERACTION_FLAGS[I][J] */
 +            /*             #if 'exactcutoff' in INTERACTION_FLAGS[I][J] */
 +            vvdw             = _mm_and_pd(vvdw,cutoff_mask);
 +            /*                 #define INNERFLOPS INNERFLOPS+1 */
 +            /*             #endif                                       */
 +            /*             #if ROUND == 'Epilogue' */
 +            vvdw             = _mm_unpacklo_pd(vvdw,_mm_setzero_pd());
 +            /*             #endif */
 +            vvdwsum          = _mm_add_pd(vvdwsum,vvdw);
 +            /*             #define INNERFLOPS INNERFLOPS+1 */
 +            /*         #endif */
 +            /*     #endif */
 +
 +            /*     #if 'Force' in KERNEL_VF */
 +
 +            /*         #if 'electrostatics' in INTERACTION_FLAGS[I][J] and 'vdw' in INTERACTION_FLAGS[I][J] */
 +            fscal            = _mm_add_pd(felec,fvdw);
 +            /*             #define INNERFLOPS INNERFLOPS+1 */
 +            /*         #elif 'electrostatics' in INTERACTION_FLAGS[I][J] */
 +            fscal            = felec;
 +            /*         #elif 'vdw' in INTERACTION_FLAGS[I][J] */
 +            fscal            = fvdw;
 +            /*         #endif */
 +
 +            /*             #if 'exactcutoff' in INTERACTION_FLAGS[I][J] */
 +            fscal            = _mm_and_pd(fscal,cutoff_mask);
 +            /*                 #define INNERFLOPS INNERFLOPS+1 */
 +            /*             #endif                                       */
 +
 +            /*             #if ROUND == 'Epilogue' */
 +            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
 +            /*             #endif */
 +
 +            /* ## Construction of vectorial force built into FMA instructions now */
 +            /* #define INNERFLOPS INNERFLOPS+3      */
 +            
 +            /* Update vectorial force */
 +            fix{I}             = _mm_macc_pd(dx{I}{J},fscal,fix{I});
 +            fiy{I}             = _mm_macc_pd(dy{I}{J},fscal,fiy{I});
 +            fiz{I}             = _mm_macc_pd(dz{I}{J},fscal,fiz{I});
 +            /*             #define INNERFLOPS INNERFLOPS+6 */
 +            
 +            /* #if GEOMETRY_I == 'Particle'             */
 +            /*     #if ROUND == 'Loop' */
 +            gmx_mm_decrement_1rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,
 +                                                   _mm_mul_pd(dx{I}{J},fscal),
 +                                                   _mm_mul_pd(dy{I}{J},fscal),
 +                                                   _mm_mul_pd(dz{I}{J},fscal));
 +            /*     #else */
 +            gmx_mm_decrement_1rvec_1ptr_swizzle_pd(f+j_coord_offsetA,
 +                                                   _mm_mul_pd(dx{I}{J},fscal),
 +                                                   _mm_mul_pd(dy{I}{J},fscal),
 +                                                   _mm_mul_pd(dz{I}{J},fscal));
 +            /*     #endif */
 +            /*     #define INNERFLOPS INNERFLOPS+3      */
 +            /* #else                                    */
 +            fjx{J}             = _mm_macc_pd(dx{I}{J},fscal,fjx{J});
 +            fjy{J}             = _mm_macc_pd(dy{I}{J},fscal,fjy{J});
 +            fjz{J}             = _mm_macc_pd(dz{I}{J},fscal,fjz{J});
 +            /*     #define INNERFLOPS INNERFLOPS+3      */
 +            /* #endif                                   */
 +
 +            /*     #endif */
 +
 +            /*     #if 'exactcutoff' in INTERACTION_FLAGS[I][J] */
 +            /*         #if 0    ## This and the next two lines are a hack to maintain indentation in the template file */
 +            {
 +                /*     #endif */
 +            }
 +            /*     #endif */
 +            /*    ## End of check for the interaction being outside the cutoff */
 +
 +            /* #endfor */
 +            /* ## End of loop over i-j interaction pairs */
 +
 +            /* #if 'Water' in GEOMETRY_I and GEOMETRY_J == 'Particle' */
 +            /*     #if ROUND == 'Loop' */
 +            gmx_mm_decrement_1rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
 +            /*     #else */
 +            gmx_mm_decrement_1rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0);
 +            /*     #endif */
 +            /*     #define INNERFLOPS INNERFLOPS+3      */
 +            /* #elif GEOMETRY_J == 'Water3'             */
 +            /*     #if ROUND == 'Loop' */
 +            gmx_mm_decrement_3rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
 +            /*     #else */
 +            gmx_mm_decrement_3rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
 +            /*     #endif */
 +            /*     #define INNERFLOPS INNERFLOPS+9      */
 +            /* #elif GEOMETRY_J == 'Water4'             */
 +            /*     #if 0 in PARTICLES_J                 */
 +            /*         #if ROUND == 'Loop' */
 +            gmx_mm_decrement_4rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
 +            /*         #else */
 +            gmx_mm_decrement_4rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
 +            /*         #endif */
 +            /*         #define INNERFLOPS INNERFLOPS+12 */
 +            /*     #else                                */
 +            /*         #if ROUND == 'Loop' */
 +            gmx_mm_decrement_3rvec_2ptr_swizzle_pd(f+j_coord_offsetA+DIM,f+j_coord_offsetB+DIM,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
 +            /*         #else */
 +            gmx_mm_decrement_3rvec_1ptr_swizzle_pd(f+j_coord_offsetA+DIM,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
 +            /*         #endif */
 +            /*         #define INNERFLOPS INNERFLOPS+9  */
 +            /*     #endif                               */
 +            /* #endif                                   */
 +
 +            /* Inner loop uses {INNERFLOPS} flops */
 +        }
 +
 +        /* #endfor */
 +
 +        /* End of innermost loop */
 +
 +        /* #if 'Force' in KERNEL_VF */
 +        /*     #if GEOMETRY_I == 'Particle'            */
 +        gmx_mm_update_iforce_1atom_swizzle_pd(fix0,fiy0,fiz0,
 +                                              f+i_coord_offset,fshift+i_shift_offset);
 +        /*         #define OUTERFLOPS OUTERFLOPS+6     */
 +        /*     #elif GEOMETRY_I == 'Water3'            */
 +        gmx_mm_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
 +                                              f+i_coord_offset,fshift+i_shift_offset);
 +        /*         #define OUTERFLOPS OUTERFLOPS+18    */
 +        /*     #elif GEOMETRY_I == 'Water4'            */
 +        /*         #if 0 in PARTICLES_I                */
 +        gmx_mm_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
 +                                              f+i_coord_offset,fshift+i_shift_offset);
 +        /*             #define OUTERFLOPS OUTERFLOPS+24    */
 +        /*         #else                               */
 +        gmx_mm_update_iforce_3atom_swizzle_pd(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
 +                                              f+i_coord_offset+DIM,fshift+i_shift_offset);
 +        /*             #define OUTERFLOPS OUTERFLOPS+18    */
 +        /*         #endif                              */
 +        /*     #endif                                  */
 +        /* #endif                                      */
 +
 +        /* #if 'Potential' in KERNEL_VF */
 +        ggid                        = gid[iidx];
 +        /* Update potential energies */
 +        /*     #if KERNEL_ELEC != 'None' */
 +        gmx_mm_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
 +        /*         #define OUTERFLOPS OUTERFLOPS+1 */
 +        /*     #endif */
 +        /*     #if 'GeneralizedBorn' in KERNEL_ELEC */
 +        gmx_mm_update_1pot_pd(vgbsum,kernel_data->energygrp_polarization+ggid);
 +        /*         #define OUTERFLOPS OUTERFLOPS+1 */
 +        /*     #endif */
 +        /*     #if KERNEL_VDW != 'None' */
 +        gmx_mm_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
 +        /*         #define OUTERFLOPS OUTERFLOPS+1 */
 +        /*     #endif */
 +        /* #endif */
 +        /*     #if 'GeneralizedBorn' in KERNEL_ELEC and 'Force' in KERNEL_VF */
 +        dvdasum = _mm_mul_pd(dvdasum, _mm_mul_pd(isai{I},isai{I}));
 +        gmx_mm_update_1pot_pd(dvdasum,dvda+inr);
 +        /*     #endif */
 +
 +        /* Increment number of inner iterations */
 +        inneriter                  += j_index_end - j_index_start;
 +
 +        /* Outer loop uses {OUTERFLOPS} flops */
 +    }
 +
 +    /* Increment number of outer iterations */
 +    outeriter        += nri;
 +
 +    /* Update outer/inner flops */
 +    /* ## NB: This is not important, it just affects the flopcount. However, since our preprocessor is */
 +    /* ## primitive and replaces aggressively even in strings inside these directives, we need to      */
 +    /* ## assemble the main part of the name (containing KERNEL/ELEC/VDW) directly in the source.      */
 +    /* #if GEOMETRY_I == 'Water3'            */
 +    /*     #define ISUFFIX '_W3'             */
 +    /* #elif GEOMETRY_I == 'Water4'          */
 +    /*     #define ISUFFIX '_W4'             */
 +    /* #else                                 */
 +    /*     #define ISUFFIX ''                */
 +    /* #endif                                */
 +    /* #if GEOMETRY_J == 'Water3'            */
 +    /*     #define JSUFFIX 'W3'              */
 +    /* #elif GEOMETRY_J == 'Water4'          */
 +    /*     #define JSUFFIX 'W4'              */
 +    /* #else                                 */
 +    /*     #define JSUFFIX ''                */
 +    /* #endif                                */
 +    /* #if 'PotentialAndForce' in KERNEL_VF  */
 +    /*     #define VFSUFFIX  '_VF'           */
 +    /* #elif 'Potential' in KERNEL_VF        */
 +    /*     #define VFSUFFIX '_V'             */
 +    /* #else                                 */
 +    /*     #define VFSUFFIX '_F'             */
 +    /* #endif                                */
 +
 +    /* #if KERNEL_ELEC != 'None' and KERNEL_VDW != 'None' */
 +    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW{ISUFFIX}{JSUFFIX}{VFSUFFIX},outeriter*{OUTERFLOPS} + inneriter*{INNERFLOPS});
 +    /* #elif KERNEL_ELEC != 'None' */
 +    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC{ISUFFIX}{JSUFFIX}{VFSUFFIX},outeriter*{OUTERFLOPS} + inneriter*{INNERFLOPS});
 +    /* #else */
 +    inc_nrnb(nrnb,eNR_NBKERNEL_VDW{ISUFFIX}{JSUFFIX}{VFSUFFIX},outeriter*{OUTERFLOPS} + inneriter*{INNERFLOPS});
 +    /* #endif  */
 +}
index d2eb10eb9cdb31339bd18725888fe228682651c8,0000000000000000000000000000000000000000..bddbca0d1855559e23d7848e48f330443fa959d1
mode 100644,000000..100644
--- /dev/null
@@@ -1,963 -1,0 +1,965 @@@
 +/*
 + * Note: this file was generated by the Gromacs avx_128_fma_single kernel generator.
 + *
 + *                This source code is part of
 + *
 + *                 G   R   O   M   A   C   S
 + *
 + * Copyright (c) 2001-2012, The GROMACS Development Team
 + *
 + * Gromacs is a library for molecular simulation and trajectory analysis,
 + * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
 + * a full list of developers and information, check out http://www.gromacs.org
 + *
 + * This program is free software; you can redistribute it and/or modify it under
 + * the terms of the GNU Lesser General Public License as published by the Free
 + * Software Foundation; either version 2 of the License, or (at your option) any
 + * later version.
 + *
 + * To help fund GROMACS development, we humbly ask that you cite
 + * the papers people have written on it - you can find them on the website.
 + */
 +#ifdef HAVE_CONFIG_H
 +#include <config.h>
 +#endif
 +
 +#include <math.h>
 +
 +#include "../nb_kernel.h"
 +#include "types/simple.h"
 +#include "vec.h"
 +#include "nrnb.h"
 +
 +#include "gmx_math_x86_avx_128_fma_single.h"
 +#include "kernelutil_x86_avx_128_fma_single.h"
 +
 +/*
 + * Gromacs nonbonded kernel:   nb_kernel_ElecGB_VdwCSTab_GeomP1P1_VF_avx_128_fma_single
 + * Electrostatics interaction: GeneralizedBorn
 + * VdW interaction:            CubicSplineTable
 + * Geometry:                   Particle-Particle
 + * Calculate force/pot:        PotentialAndForce
 + */
 +void
 +nb_kernel_ElecGB_VdwCSTab_GeomP1P1_VF_avx_128_fma_single
 +                    (t_nblist * gmx_restrict                nlist,
 +                     rvec * gmx_restrict                    xx,
 +                     rvec * gmx_restrict                    ff,
 +                     t_forcerec * gmx_restrict              fr,
 +                     t_mdatoms * gmx_restrict               mdatoms,
 +                     nb_kernel_data_t * gmx_restrict        kernel_data,
 +                     t_nrnb * gmx_restrict                  nrnb)
 +{
 +    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
 +     * just 0 for non-waters.
 +     * Suffixes A,B,C,D refer to j loop unrolling done with AVX_128, e.g. for the four different
 +     * jnr indices corresponding to data put in the four positions in the SIMD register.
 +     */
 +    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
 +    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
 +    int              jnrA,jnrB,jnrC,jnrD;
 +    int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
 +    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
 +    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
 +    real             rcutoff_scalar;
 +    real             *shiftvec,*fshift,*x,*f;
 +    real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
 +    real             scratch[4*DIM];
 +    __m128           fscal,rcutoff,rcutoff2,jidxall;
 +    int              vdwioffset0;
 +    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
 +    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
 +    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
 +    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
 +    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
 +    real             *charge;
 +    __m128i          gbitab;
 +    __m128           vgb,fgb,vgbsum,dvdasum,gbscale,gbtabscale,isaprod,gbqqfactor,gbinvepsdiff,gbeps,twogbeps,dvdatmp;
 +    __m128           minushalf = _mm_set1_ps(-0.5);
 +    real             *invsqrta,*dvda,*gbtab;
 +    int              nvdwtype;
 +    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
 +    int              *vdwtype;
 +    real             *vdwparam;
 +    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
 +    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
 +    __m128i          vfitab;
 +    __m128i          ifour       = _mm_set1_epi32(4);
 +    __m128           rt,vfeps,twovfeps,vftabscale,Y,F,G,H,Fp,VV,FF;
 +    real             *vftab;
 +    __m128           dummy_mask,cutoff_mask;
 +    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
 +    __m128           one     = _mm_set1_ps(1.0);
 +    __m128           two     = _mm_set1_ps(2.0);
 +    x                = xx[0];
 +    f                = ff[0];
 +
 +    nri              = nlist->nri;
 +    iinr             = nlist->iinr;
 +    jindex           = nlist->jindex;
 +    jjnr             = nlist->jjnr;
 +    shiftidx         = nlist->shift;
 +    gid              = nlist->gid;
 +    shiftvec         = fr->shift_vec[0];
 +    fshift           = fr->fshift[0];
 +    facel            = _mm_set1_ps(fr->epsfac);
 +    charge           = mdatoms->chargeA;
 +    nvdwtype         = fr->ntype;
 +    vdwparam         = fr->nbfp;
 +    vdwtype          = mdatoms->typeA;
 +
 +    vftab            = kernel_data->table_vdw->data;
 +    vftabscale       = _mm_set1_ps(kernel_data->table_vdw->scale);
 +
 +    invsqrta         = fr->invsqrta;
 +    dvda             = fr->dvda;
 +    gbtabscale       = _mm_set1_ps(fr->gbtab.scale);
 +    gbtab            = fr->gbtab.data;
 +    gbinvepsdiff     = _mm_set1_ps((1.0/fr->epsilon_r) - (1.0/fr->gb_epsilon_solvent));
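 +    /* A sketch of the underlying formula, assuming the Still-style Generalized
 +     * Born expression used by GROMACS: Vgb = -(1/epsilon_r - 1/epsilon_solvent)
 +     * * q_i*q_j/f_GB, so gbinvepsdiff is the dielectric prefactor while the
 +     * tabulated spline in the inner loop supplies the 1/f_GB dependence.
 +     */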
 +
 +    /* Avoid stupid compiler warnings */
 +    jnrA = jnrB = jnrC = jnrD = 0;
 +    j_coord_offsetA = 0;
 +    j_coord_offsetB = 0;
 +    j_coord_offsetC = 0;
 +    j_coord_offsetD = 0;
 +
 +    outeriter        = 0;
 +    inneriter        = 0;
 +
 +    for(iidx=0;iidx<4*DIM;iidx++)
 +    {
 +        scratch[iidx] = 0.0;
 +    }
 +
 +    /* Start outer loop over neighborlists */
 +    for(iidx=0; iidx<nri; iidx++)
 +    {
 +        /* Load shift vector for this list */
 +        i_shift_offset   = DIM*shiftidx[iidx];
 +
 +        /* Load limits for loop over neighbors */
 +        j_index_start    = jindex[iidx];
 +        j_index_end      = jindex[iidx+1];
 +
 +        /* Get outer coordinate index */
 +        inr              = iinr[iidx];
 +        i_coord_offset   = DIM*inr;
 +
 +        /* Load i particle coords and add shift vector */
 +        gmx_mm_load_shift_and_1rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
 +
 +        fix0             = _mm_setzero_ps();
 +        fiy0             = _mm_setzero_ps();
 +        fiz0             = _mm_setzero_ps();
 +
 +        /* Load parameters for i particles */
 +        iq0              = _mm_mul_ps(facel,_mm_load1_ps(charge+inr+0));
 +        isai0            = _mm_load1_ps(invsqrta+inr+0);
 +        vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
 +
 +        /* Reset potential sums */
 +        velecsum         = _mm_setzero_ps();
 +        vgbsum           = _mm_setzero_ps();
 +        vvdwsum          = _mm_setzero_ps();
 +        dvdasum          = _mm_setzero_ps();
 +
 +        /* Start inner kernel loop */
 +        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
 +        {
 +
 +            /* Get j neighbor index, and coordinate index */
 +            jnrA             = jjnr[jidx];
 +            jnrB             = jjnr[jidx+1];
 +            jnrC             = jjnr[jidx+2];
 +            jnrD             = jjnr[jidx+3];
 +            j_coord_offsetA  = DIM*jnrA;
 +            j_coord_offsetB  = DIM*jnrB;
 +            j_coord_offsetC  = DIM*jnrC;
 +            j_coord_offsetD  = DIM*jnrD;
 +
 +            /* load j atom coordinates */
 +            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
 +                                              x+j_coord_offsetC,x+j_coord_offsetD,
 +                                              &jx0,&jy0,&jz0);
 +
 +            /* Calculate displacement vector */
 +            dx00             = _mm_sub_ps(ix0,jx0);
 +            dy00             = _mm_sub_ps(iy0,jy0);
 +            dz00             = _mm_sub_ps(iz0,jz0);
 +
 +            /* Calculate squared distance and things based on it */
 +            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
 +
 +            rinv00           = gmx_mm_invsqrt_ps(rsq00);
 +
 +            /* Load parameters for j particles */
 +            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
 +                                                              charge+jnrC+0,charge+jnrD+0);
 +            isaj0            = gmx_mm_load_4real_swizzle_ps(invsqrta+jnrA+0,invsqrta+jnrB+0,
 +                                                              invsqrta+jnrC+0,invsqrta+jnrD+0);
 +            vdwjidx0A        = 2*vdwtype[jnrA+0];
 +            vdwjidx0B        = 2*vdwtype[jnrB+0];
 +            vdwjidx0C        = 2*vdwtype[jnrC+0];
 +            vdwjidx0D        = 2*vdwtype[jnrD+0];
 +
 +            /**************************
 +             * CALCULATE INTERACTIONS *
 +             **************************/
 +
 +            r00              = _mm_mul_ps(rsq00,rinv00);
 +
 +            /* Compute parameters for interactions between i and j atoms */
 +            qq00             = _mm_mul_ps(iq0,jq0);
 +            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
 +                                         vdwparam+vdwioffset0+vdwjidx0B,
 +                                         vdwparam+vdwioffset0+vdwjidx0C,
 +                                         vdwparam+vdwioffset0+vdwjidx0D,
 +                                         &c6_00,&c12_00);
 +
 +            /* Calculate table index by multiplying r with table scale and truncating to integer */
 +            rt               = _mm_mul_ps(r00,vftabscale);
 +            vfitab           = _mm_cvttps_epi32(rt);
 +#ifdef __XOP__
 +            vfeps            = _mm_frcz_ps(rt);
 +#else
 +            vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
 +#endif
 +            twovfeps         = _mm_add_ps(vfeps,vfeps);
 +            vfitab           = _mm_slli_epi32(vfitab,3);
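 +            /* This kernel uses two cubic spline tables (dispersion and repulsion)
 +             * with 4 floats per point, so the element stride per point is 8 and
 +             * the shift by 3 above turns the bin index into a table offset.
 +             */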
 +
 +            /* GENERALIZED BORN AND COULOMB ELECTROSTATICS */
 +            isaprod          = _mm_mul_ps(isai0,isaj0);
 +            gbqqfactor       = _mm_xor_ps(signbit,_mm_mul_ps(qq00,_mm_mul_ps(isaprod,gbinvepsdiff)));
 +            gbscale          = _mm_mul_ps(isaprod,gbtabscale);
 +
 +            /* Calculate generalized born table index - this is a separate table from the normal one,
 +             * but we use the same procedure by multiplying r with scale and truncating to integer.
 +             */
 +            rt               = _mm_mul_ps(r00,gbscale);
 +            gbitab           = _mm_cvttps_epi32(rt);
 +#ifdef __XOP__
 +            gbeps            = _mm_frcz_ps(rt);
 +#else
 +            gbeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
 +#endif
 +            gbitab           = _mm_slli_epi32(gbitab,2);
 +
 +            Y                = _mm_load_ps( gbtab + _mm_extract_epi32(gbitab,0) );
 +            F                = _mm_load_ps( gbtab + _mm_extract_epi32(gbitab,1) );
 +            G                = _mm_load_ps( gbtab + _mm_extract_epi32(gbitab,2) );
 +            H                = _mm_load_ps( gbtab + _mm_extract_epi32(gbitab,3) );
 +            _MM_TRANSPOSE4_PS(Y,F,G,H);
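 +            /* The four rows loaded above each hold (Y,F,G,H) for one j atom;
 +             * the 4x4 transpose converts them to structure-of-arrays form so
 +             * that Y, F, G and H each carry all four j lanes.
 +             */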
 +            Fp               = _mm_macc_ps(gbeps,_mm_macc_ps(gbeps,H,G),F);
 +            VV               = _mm_macc_ps(gbeps,Fp,Y);
 +            vgb              = _mm_mul_ps(gbqqfactor,VV);
 +
 +            twogbeps         = _mm_add_ps(gbeps,gbeps);
 +            FF               = _mm_macc_ps(_mm_macc_ps(twogbeps,H,G),gbeps,Fp);
 +            fgb              = _mm_mul_ps(gbqqfactor,_mm_mul_ps(FF,gbscale));
 +            dvdatmp          = _mm_mul_ps(minushalf,_mm_macc_ps(fgb,r00,vgb));
 +            dvdasum          = _mm_add_ps(dvdasum,dvdatmp);
 +            fjptrA           = dvda+jnrA;
 +            fjptrB           = dvda+jnrB;
 +            fjptrC           = dvda+jnrC;
 +            fjptrD           = dvda+jnrD;
 +            gmx_mm_increment_4real_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,_mm_mul_ps(dvdatmp,_mm_mul_ps(isaj0,isaj0)));
 +            velec            = _mm_mul_ps(qq00,rinv00);
 +            felec            = _mm_mul_ps(_mm_msub_ps(velec,rinv00,fgb),rinv00);
 +
 +            /* CUBIC SPLINE TABLE DISPERSION */
 +            Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
 +            F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
 +            G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
 +            H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
 +            _MM_TRANSPOSE4_PS(Y,F,G,H);
 +            Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
 +            VV               = _mm_macc_ps(vfeps,Fp,Y);
 +            vvdw6            = _mm_mul_ps(c6_00,VV);
 +            FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
 +            fvdw6            = _mm_mul_ps(c6_00,FF);
 +
 +            /* CUBIC SPLINE TABLE REPULSION */
 +            vfitab           = _mm_add_epi32(vfitab,ifour);
 +            Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
 +            F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
 +            G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
 +            H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
 +            _MM_TRANSPOSE4_PS(Y,F,G,H);
 +            Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
 +            VV               = _mm_macc_ps(vfeps,Fp,Y);
 +            vvdw12           = _mm_mul_ps(c12_00,VV);
 +            FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
 +            fvdw12           = _mm_mul_ps(c12_00,FF);
 +            vvdw             = _mm_add_ps(vvdw12,vvdw6);
 +            fvdw             = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
 +
 +            /* Update potential sum for this i atom from the interaction with this j atom. */
 +            velecsum         = _mm_add_ps(velecsum,velec);
 +            vgbsum           = _mm_add_ps(vgbsum,vgb);
 +            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
 +
 +            fscal            = _mm_add_ps(felec,fvdw);
 +
 +            /* Update vectorial force */
 +            fix0             = _mm_macc_ps(dx00,fscal,fix0);
 +            fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
 +            fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
 +
 +            fjptrA             = f+j_coord_offsetA;
 +            fjptrB             = f+j_coord_offsetB;
 +            fjptrC             = f+j_coord_offsetC;
 +            fjptrD             = f+j_coord_offsetD;
 +            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
 +                                                   _mm_mul_ps(dx00,fscal),
 +                                                   _mm_mul_ps(dy00,fscal),
 +                                                   _mm_mul_ps(dz00,fscal));
 +
 +            /* Inner loop uses 95 flops */
 +        }
 +
 +        if(jidx<j_index_end)
 +        {
 +
 +            /* Get j neighbor index, and coordinate index */
 +            jnrlistA         = jjnr[jidx];
 +            jnrlistB         = jjnr[jidx+1];
 +            jnrlistC         = jjnr[jidx+2];
 +            jnrlistD         = jjnr[jidx+3];
 +            /* Sign of each element will be negative for non-real atoms.
 +             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
 +             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
 +             */
 +            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 +            jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
 +            jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
 +            jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
 +            jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
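 +            /* Negative (dummy) entries are clamped to index 0 so that the
 +             * loads below stay within valid memory; their contributions are
 +             * removed again through dummy_mask.
 +             */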
 +            j_coord_offsetA  = DIM*jnrA;
 +            j_coord_offsetB  = DIM*jnrB;
 +            j_coord_offsetC  = DIM*jnrC;
 +            j_coord_offsetD  = DIM*jnrD;
 +
 +            /* load j atom coordinates */
 +            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
 +                                              x+j_coord_offsetC,x+j_coord_offsetD,
 +                                              &jx0,&jy0,&jz0);
 +
 +            /* Calculate displacement vector */
 +            dx00             = _mm_sub_ps(ix0,jx0);
 +            dy00             = _mm_sub_ps(iy0,jy0);
 +            dz00             = _mm_sub_ps(iz0,jz0);
 +
 +            /* Calculate squared distance and things based on it */
 +            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
 +
 +            rinv00           = gmx_mm_invsqrt_ps(rsq00);
 +
 +            /* Load parameters for j particles */
 +            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
 +                                                              charge+jnrC+0,charge+jnrD+0);
 +            isaj0            = gmx_mm_load_4real_swizzle_ps(invsqrta+jnrA+0,invsqrta+jnrB+0,
 +                                                              invsqrta+jnrC+0,invsqrta+jnrD+0);
 +            vdwjidx0A        = 2*vdwtype[jnrA+0];
 +            vdwjidx0B        = 2*vdwtype[jnrB+0];
 +            vdwjidx0C        = 2*vdwtype[jnrC+0];
 +            vdwjidx0D        = 2*vdwtype[jnrD+0];
 +
 +            /**************************
 +             * CALCULATE INTERACTIONS *
 +             **************************/
 +
 +            r00              = _mm_mul_ps(rsq00,rinv00);
 +            r00              = _mm_andnot_ps(dummy_mask,r00);
 +
 +            /* Compute parameters for interactions between i and j atoms */
 +            qq00             = _mm_mul_ps(iq0,jq0);
 +            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
 +                                         vdwparam+vdwioffset0+vdwjidx0B,
 +                                         vdwparam+vdwioffset0+vdwjidx0C,
 +                                         vdwparam+vdwioffset0+vdwjidx0D,
 +                                         &c6_00,&c12_00);
 +
 +            /* Calculate table index by multiplying r with table scale and truncate to integer */
 +            rt               = _mm_mul_ps(r00,vftabscale);
 +            vfitab           = _mm_cvttps_epi32(rt);
 +#ifdef __XOP__
 +            vfeps            = _mm_frcz_ps(rt);
 +#else
 +            vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
 +#endif
 +            twovfeps         = _mm_add_ps(vfeps,vfeps);
 +            vfitab           = _mm_slli_epi32(vfitab,3);
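 +            /* The VdW table stores eight floats per point: Y,F,G,H for
 +             * dispersion followed by Y,F,G,H for repulsion, hence the shift
 +             * by 3 (index*8) here and the ifour offset before the repulsion
 +             * lookup; the GB table has a single four-float set per point
 +             * (shift by 2).
 +             */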
 +
 +            /* GENERALIZED BORN AND COULOMB ELECTROSTATICS */
 +            isaprod          = _mm_mul_ps(isai0,isaj0);
 +            gbqqfactor       = _mm_xor_ps(signbit,_mm_mul_ps(qq00,_mm_mul_ps(isaprod,gbinvepsdiff)));
 +            gbscale          = _mm_mul_ps(isaprod,gbtabscale);
 +
 +            /* Calculate generalized born table index - this is a separate table from the normal one,
 +             * but we use the same procedure by multiplying r with scale and truncating to integer.
 +             */
 +            rt               = _mm_mul_ps(r00,gbscale);
 +            gbitab           = _mm_cvttps_epi32(rt);
 +#ifdef __XOP__
 +            gbeps            = _mm_frcz_ps(rt);
 +#else
 +            gbeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
 +#endif
 +            gbitab           = _mm_slli_epi32(gbitab,2);
 +
 +            Y                = _mm_load_ps( gbtab + _mm_extract_epi32(gbitab,0) );
 +            F                = _mm_load_ps( gbtab + _mm_extract_epi32(gbitab,1) );
 +            G                = _mm_load_ps( gbtab + _mm_extract_epi32(gbitab,2) );
 +            H                = _mm_load_ps( gbtab + _mm_extract_epi32(gbitab,3) );
 +            _MM_TRANSPOSE4_PS(Y,F,G,H);
 +            Fp               = _mm_macc_ps(gbeps,_mm_macc_ps(gbeps,H,G),F);
 +            VV               = _mm_macc_ps(gbeps,Fp,Y);
 +            vgb              = _mm_mul_ps(gbqqfactor,VV);
 +
 +            twogbeps         = _mm_add_ps(gbeps,gbeps);
 +            FF               = _mm_macc_ps(_mm_macc_ps(twogbeps,H,G),gbeps,Fp);
 +            fgb              = _mm_mul_ps(gbqqfactor,_mm_mul_ps(FF,gbscale));
 +            dvdatmp          = _mm_mul_ps(minushalf,_mm_macc_ps(fgb,r00,vgb));
++            dvdatmp          = _mm_andnot_ps(dummy_mask,dvdatmp);
 +            dvdasum          = _mm_add_ps(dvdasum,dvdatmp);
 +            /* The pointers to scratch make sure that this code really can't screw things up, even with compilers that take gmx_restrict seriously (e.g. icc 13). */
 +            fjptrA             = (jnrlistA>=0) ? dvda+jnrA : scratch;
 +            fjptrB             = (jnrlistB>=0) ? dvda+jnrB : scratch;
 +            fjptrC             = (jnrlistC>=0) ? dvda+jnrC : scratch;
 +            fjptrD             = (jnrlistD>=0) ? dvda+jnrD : scratch;
 +            gmx_mm_increment_4real_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,_mm_mul_ps(dvdatmp,_mm_mul_ps(isaj0,isaj0)));
 +            velec            = _mm_mul_ps(qq00,rinv00);
 +            felec            = _mm_mul_ps(_mm_msub_ps(velec,rinv00,fgb),rinv00);
 +
 +            /* CUBIC SPLINE TABLE DISPERSION */
 +            Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
 +            F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
 +            G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
 +            H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
 +            _MM_TRANSPOSE4_PS(Y,F,G,H);
 +            Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
 +            VV               = _mm_macc_ps(vfeps,Fp,Y);
 +            vvdw6            = _mm_mul_ps(c6_00,VV);
 +            FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
 +            fvdw6            = _mm_mul_ps(c6_00,FF);
 +
 +            /* CUBIC SPLINE TABLE REPULSION */
 +            vfitab           = _mm_add_epi32(vfitab,ifour);
 +            Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
 +            F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
 +            G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
 +            H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
 +            _MM_TRANSPOSE4_PS(Y,F,G,H);
 +            Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
 +            VV               = _mm_macc_ps(vfeps,Fp,Y);
 +            vvdw12           = _mm_mul_ps(c12_00,VV);
 +            FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
 +            fvdw12           = _mm_mul_ps(c12_00,FF);
 +            vvdw             = _mm_add_ps(vvdw12,vvdw6);
 +            fvdw             = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
 +
 +            /* Update potential sum for this i atom from the interaction with this j atom. */
 +            velec            = _mm_andnot_ps(dummy_mask,velec);
 +            velecsum         = _mm_add_ps(velecsum,velec);
 +            vgb              = _mm_andnot_ps(dummy_mask,vgb);
 +            vgbsum           = _mm_add_ps(vgbsum,vgb);
 +            vvdw             = _mm_andnot_ps(dummy_mask,vvdw);
 +            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
 +
 +            fscal            = _mm_add_ps(felec,fvdw);
 +
 +            fscal            = _mm_andnot_ps(dummy_mask,fscal);
 +
 +            /* Update vectorial force */
 +            fix0             = _mm_macc_ps(dx00,fscal,fix0);
 +            fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
 +            fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
 +
 +            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
 +            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
 +            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
 +            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
 +            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
 +                                                   _mm_mul_ps(dx00,fscal),
 +                                                   _mm_mul_ps(dy00,fscal),
 +                                                   _mm_mul_ps(dz00,fscal));
 +
 +            /* Inner loop uses 96 flops */
 +        }
 +
 +        /* End of innermost loop */
 +
 +        gmx_mm_update_iforce_1atom_swizzle_ps(fix0,fiy0,fiz0,
 +                                              f+i_coord_offset,fshift+i_shift_offset);
 +
 +        ggid                        = gid[iidx];
 +        /* Update potential energies */
 +        gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
 +        gmx_mm_update_1pot_ps(vgbsum,kernel_data->energygrp_polarization+ggid);
 +        gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
 +        dvdasum = _mm_mul_ps(dvdasum, _mm_mul_ps(isai0,isai0));
 +        gmx_mm_update_1pot_ps(dvdasum,dvda+inr);
 +
 +        /* Increment number of inner iterations */
 +        inneriter                  += j_index_end - j_index_start;
 +
 +        /* Outer loop uses 10 flops */
 +    }
 +
 +    /* Increment number of outer iterations */
 +    outeriter        += nri;
 +
 +    /* Update outer/inner flops */
 +
 +    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_VF,outeriter*10 + inneriter*96);
 +}
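 +
 +/*
 + * A minimal scalar sketch, not part of the generated kernels and not called
 + * anywhere, of what one SIMD lane of the generalized-Born + cubic-spline
 + * electrostatics above computes.  It assumes only what the kernel itself
 + * uses: the "real" type from types/simple.h and the four-coefficient
 + * (Y,F,G,H) table layout.  The function name and signature are illustrative.
 + */
 +static real
 +gb_cstab_felec_lane(real r, real qq, real isaprod, real gbinvepsdiff,
 +                    real gbtabscale, const real *gbtab, real *vgb_out)
 +{
 +    real        rinv    = 1.0/r;
 +    real        gbscale = isaprod*gbtabscale;   /* per-pair table scale    */
 +    real        rt      = r*gbscale;
 +    int         n       = (int)rt;              /* table point index       */
 +    real        eps     = rt - (real)n;         /* table fraction in [0,1) */
 +    const real *p       = gbtab + 4*n;          /* Y,F,G,H for this point  */
 +    real        Y       = p[0];
 +    real        F       = p[1];
 +    real        G       = p[2];
 +    real        H       = p[3];
 +    /* Cubic spline in Horner form: V(eps) = Y + eps*(F + eps*(G + eps*H)) */
 +    real        Fp      = F + eps*(G + eps*H);
 +    real        VV      = Y + eps*Fp;
 +    real        FF      = Fp + eps*(G + 2.0*eps*H);   /* dV/d(eps)  */
 +    real        qqf     = -qq*isaprod*gbinvepsdiff;   /* gbqqfactor */
 +
 +    *vgb_out = qqf*VV;                                /* vgb        */
 +    /* felec = (velec/r - dVgb/dr)/r, velec = qq/r, dVgb/dr = qqf*FF*gbscale */
 +    return (qq*rinv*rinv - qqf*FF*gbscale)*rinv;
 +}
 +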
 +/*
 + * Gromacs nonbonded kernel:   nb_kernel_ElecGB_VdwCSTab_GeomP1P1_F_avx_128_fma_single
 + * Electrostatics interaction: GeneralizedBorn
 + * VdW interaction:            CubicSplineTable
 + * Geometry:                   Particle-Particle
 + * Calculate force/pot:        Force
 + */
 +void
 +nb_kernel_ElecGB_VdwCSTab_GeomP1P1_F_avx_128_fma_single
 +                    (t_nblist * gmx_restrict                nlist,
 +                     rvec * gmx_restrict                    xx,
 +                     rvec * gmx_restrict                    ff,
 +                     t_forcerec * gmx_restrict              fr,
 +                     t_mdatoms * gmx_restrict               mdatoms,
 +                     nb_kernel_data_t * gmx_restrict        kernel_data,
 +                     t_nrnb * gmx_restrict                  nrnb)
 +{
 +    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
 +     * just 0 for non-waters.
 +     * Suffixes A,B,C,D refer to j loop unrolling done with AVX_128, e.g. for the four different
 +     * jnr indices corresponding to data put in the four positions in the SIMD register.
 +     */
 +    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
 +    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
 +    int              jnrA,jnrB,jnrC,jnrD;
 +    int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
 +    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
 +    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
 +    real             rcutoff_scalar;
 +    real             *shiftvec,*fshift,*x,*f;
 +    real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
 +    real             scratch[4*DIM];
 +    __m128           fscal,rcutoff,rcutoff2,jidxall;
 +    int              vdwioffset0;
 +    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
 +    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
 +    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
 +    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
 +    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
 +    real             *charge;
 +    __m128i          gbitab;
 +    __m128           vgb,fgb,vgbsum,dvdasum,gbscale,gbtabscale,isaprod,gbqqfactor,gbinvepsdiff,gbeps,twogbeps,dvdatmp;
 +    __m128           minushalf = _mm_set1_ps(-0.5);
 +    real             *invsqrta,*dvda,*gbtab;
 +    int              nvdwtype;
 +    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
 +    int              *vdwtype;
 +    real             *vdwparam;
 +    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
 +    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
 +    __m128i          vfitab;
 +    __m128i          ifour       = _mm_set1_epi32(4);
 +    __m128           rt,vfeps,twovfeps,vftabscale,Y,F,G,H,Fp,VV,FF;
 +    real             *vftab;
 +    __m128           dummy_mask,cutoff_mask;
 +    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
 +    __m128           one     = _mm_set1_ps(1.0);
 +    __m128           two     = _mm_set1_ps(2.0);
 +    x                = xx[0];
 +    f                = ff[0];
 +
 +    nri              = nlist->nri;
 +    iinr             = nlist->iinr;
 +    jindex           = nlist->jindex;
 +    jjnr             = nlist->jjnr;
 +    shiftidx         = nlist->shift;
 +    gid              = nlist->gid;
 +    shiftvec         = fr->shift_vec[0];
 +    fshift           = fr->fshift[0];
 +    facel            = _mm_set1_ps(fr->epsfac);
 +    charge           = mdatoms->chargeA;
 +    nvdwtype         = fr->ntype;
 +    vdwparam         = fr->nbfp;
 +    vdwtype          = mdatoms->typeA;
 +
 +    vftab            = kernel_data->table_vdw->data;
 +    vftabscale       = _mm_set1_ps(kernel_data->table_vdw->scale);
 +
 +    invsqrta         = fr->invsqrta;
 +    dvda             = fr->dvda;
 +    gbtabscale       = _mm_set1_ps(fr->gbtab.scale);
 +    gbtab            = fr->gbtab.data;
 +    gbinvepsdiff     = _mm_set1_ps((1.0/fr->epsilon_r) - (1.0/fr->gb_epsilon_solvent));
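 +    /* (1/epsilon_r - 1/epsilon_solvent) is the dielectric prefactor of the
 +     * generalized Born pair energy; it enters the inner loops folded into
 +     * gbqqfactor.
 +     */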
 +
 +    /* Avoid stupid compiler warnings */
 +    jnrA = jnrB = jnrC = jnrD = 0;
 +    j_coord_offsetA = 0;
 +    j_coord_offsetB = 0;
 +    j_coord_offsetC = 0;
 +    j_coord_offsetD = 0;
 +
 +    outeriter        = 0;
 +    inneriter        = 0;
 +
 +    for(iidx=0;iidx<4*DIM;iidx++)
 +    {
 +        scratch[iidx] = 0.0;
 +    }
 +
 +    /* Start outer loop over neighborlists */
 +    for(iidx=0; iidx<nri; iidx++)
 +    {
 +        /* Load shift vector for this list */
 +        i_shift_offset   = DIM*shiftidx[iidx];
 +
 +        /* Load limits for loop over neighbors */
 +        j_index_start    = jindex[iidx];
 +        j_index_end      = jindex[iidx+1];
 +
 +        /* Get outer coordinate index */
 +        inr              = iinr[iidx];
 +        i_coord_offset   = DIM*inr;
 +
 +        /* Load i particle coords and add shift vector */
 +        gmx_mm_load_shift_and_1rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
 +
 +        fix0             = _mm_setzero_ps();
 +        fiy0             = _mm_setzero_ps();
 +        fiz0             = _mm_setzero_ps();
 +
 +        /* Load parameters for i particles */
 +        iq0              = _mm_mul_ps(facel,_mm_load1_ps(charge+inr+0));
 +        isai0            = _mm_load1_ps(invsqrta+inr+0);
 +        vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
 +
 +        dvdasum          = _mm_setzero_ps();
 +
 +        /* Start inner kernel loop */
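 +        /* The unrolled loop below only runs while all four jjnr entries are
 +         * real atoms (jjnr[jidx+3]>=0); a final, partially filled quadruplet
 +         * is handled by the masked block that follows it.
 +         */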
 +        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
 +        {
 +
 +            /* Get j neighbor index, and coordinate index */
 +            jnrA             = jjnr[jidx];
 +            jnrB             = jjnr[jidx+1];
 +            jnrC             = jjnr[jidx+2];
 +            jnrD             = jjnr[jidx+3];
 +            j_coord_offsetA  = DIM*jnrA;
 +            j_coord_offsetB  = DIM*jnrB;
 +            j_coord_offsetC  = DIM*jnrC;
 +            j_coord_offsetD  = DIM*jnrD;
 +
 +            /* load j atom coordinates */
 +            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
 +                                              x+j_coord_offsetC,x+j_coord_offsetD,
 +                                              &jx0,&jy0,&jz0);
 +
 +            /* Calculate displacement vector */
 +            dx00             = _mm_sub_ps(ix0,jx0);
 +            dy00             = _mm_sub_ps(iy0,jy0);
 +            dz00             = _mm_sub_ps(iz0,jz0);
 +
 +            /* Calculate squared distance and things based on it */
 +            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
 +
 +            rinv00           = gmx_mm_invsqrt_ps(rsq00);
 +
 +            /* Load parameters for j particles */
 +            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
 +                                                              charge+jnrC+0,charge+jnrD+0);
 +            isaj0            = gmx_mm_load_4real_swizzle_ps(invsqrta+jnrA+0,invsqrta+jnrB+0,
 +                                                              invsqrta+jnrC+0,invsqrta+jnrD+0);
 +            vdwjidx0A        = 2*vdwtype[jnrA+0];
 +            vdwjidx0B        = 2*vdwtype[jnrB+0];
 +            vdwjidx0C        = 2*vdwtype[jnrC+0];
 +            vdwjidx0D        = 2*vdwtype[jnrD+0];
 +
 +            /**************************
 +             * CALCULATE INTERACTIONS *
 +             **************************/
 +
 +            r00              = _mm_mul_ps(rsq00,rinv00);
 +
 +            /* Compute parameters for interactions between i and j atoms */
 +            qq00             = _mm_mul_ps(iq0,jq0);
 +            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
 +                                         vdwparam+vdwioffset0+vdwjidx0B,
 +                                         vdwparam+vdwioffset0+vdwjidx0C,
 +                                         vdwparam+vdwioffset0+vdwjidx0D,
 +                                         &c6_00,&c12_00);
 +
 +            /* Calculate table index by multiplying r with table scale and truncate to integer */
 +            rt               = _mm_mul_ps(r00,vftabscale);
 +            vfitab           = _mm_cvttps_epi32(rt);
 +#ifdef __XOP__
 +            vfeps            = _mm_frcz_ps(rt);
 +#else
 +            vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
 +#endif
 +            twovfeps         = _mm_add_ps(vfeps,vfeps);
 +            vfitab           = _mm_slli_epi32(vfitab,3);
 +
 +            /* GENERALIZED BORN AND COULOMB ELECTROSTATICS */
 +            isaprod          = _mm_mul_ps(isai0,isaj0);
 +            gbqqfactor       = _mm_xor_ps(signbit,_mm_mul_ps(qq00,_mm_mul_ps(isaprod,gbinvepsdiff)));
 +            gbscale          = _mm_mul_ps(isaprod,gbtabscale);
 +
 +            /* Calculate generalized born table index - this is a separate table from the normal one,
 +             * but we use the same procedure by multiplying r with scale and truncating to integer.
 +             */
 +            rt               = _mm_mul_ps(r00,gbscale);
 +            gbitab           = _mm_cvttps_epi32(rt);
 +#ifdef __XOP__
 +            gbeps            = _mm_frcz_ps(rt);
 +#else
 +            gbeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
 +#endif
 +            gbitab           = _mm_slli_epi32(gbitab,2);
 +
 +            Y                = _mm_load_ps( gbtab + _mm_extract_epi32(gbitab,0) );
 +            F                = _mm_load_ps( gbtab + _mm_extract_epi32(gbitab,1) );
 +            G                = _mm_load_ps( gbtab + _mm_extract_epi32(gbitab,2) );
 +            H                = _mm_load_ps( gbtab + _mm_extract_epi32(gbitab,3) );
 +            _MM_TRANSPOSE4_PS(Y,F,G,H);
 +            Fp               = _mm_macc_ps(gbeps,_mm_macc_ps(gbeps,H,G),F);
 +            VV               = _mm_macc_ps(gbeps,Fp,Y);
 +            vgb              = _mm_mul_ps(gbqqfactor,VV);
 +
 +            twogbeps         = _mm_add_ps(gbeps,gbeps);
 +            FF               = _mm_macc_ps(_mm_macc_ps(twogbeps,H,G),gbeps,Fp);
 +            fgb              = _mm_mul_ps(gbqqfactor,_mm_mul_ps(FF,gbscale));
 +            dvdatmp          = _mm_mul_ps(minushalf,_mm_macc_ps(fgb,r00,vgb));
 +            dvdasum          = _mm_add_ps(dvdasum,dvdatmp);
 +            fjptrA           = dvda+jnrA;
 +            fjptrB           = dvda+jnrB;
 +            fjptrC           = dvda+jnrC;
 +            fjptrD           = dvda+jnrD;
 +            gmx_mm_increment_4real_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,_mm_mul_ps(dvdatmp,_mm_mul_ps(isaj0,isaj0)));
 +            velec            = _mm_mul_ps(qq00,rinv00);
 +            felec            = _mm_mul_ps(_mm_msub_ps(velec,rinv00,fgb),rinv00);
 +
 +            /* CUBIC SPLINE TABLE DISPERSION */
 +            Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
 +            F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
 +            G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
 +            H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
 +            _MM_TRANSPOSE4_PS(Y,F,G,H);
 +            Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
 +            FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
 +            fvdw6            = _mm_mul_ps(c6_00,FF);
 +
 +            /* CUBIC SPLINE TABLE REPULSION */
 +            vfitab           = _mm_add_epi32(vfitab,ifour);
 +            Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
 +            F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
 +            G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
 +            H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
 +            _MM_TRANSPOSE4_PS(Y,F,G,H);
 +            Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
 +            FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
 +            fvdw12           = _mm_mul_ps(c12_00,FF);
 +            fvdw             = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
 +
 +            fscal            = _mm_add_ps(felec,fvdw);
 +
 +            /* Update vectorial force */
 +            fix0             = _mm_macc_ps(dx00,fscal,fix0);
 +            fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
 +            fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
 +
 +            fjptrA             = f+j_coord_offsetA;
 +            fjptrB             = f+j_coord_offsetB;
 +            fjptrC             = f+j_coord_offsetC;
 +            fjptrD             = f+j_coord_offsetD;
 +            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
 +                                                   _mm_mul_ps(dx00,fscal),
 +                                                   _mm_mul_ps(dy00,fscal),
 +                                                   _mm_mul_ps(dz00,fscal));
 +
 +            /* Inner loop uses 85 flops */
 +        }
 +
 +        if(jidx<j_index_end)
 +        {
 +
 +            /* Get j neighbor index, and coordinate index */
 +            jnrlistA         = jjnr[jidx];
 +            jnrlistB         = jjnr[jidx+1];
 +            jnrlistC         = jjnr[jidx+2];
 +            jnrlistD         = jjnr[jidx+3];
 +            /* Sign of each element will be negative for non-real atoms.
 +             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
 +             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
 +             */
 +            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 +            jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
 +            jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
 +            jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
 +            jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
 +            j_coord_offsetA  = DIM*jnrA;
 +            j_coord_offsetB  = DIM*jnrB;
 +            j_coord_offsetC  = DIM*jnrC;
 +            j_coord_offsetD  = DIM*jnrD;
 +
 +            /* load j atom coordinates */
 +            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
 +                                              x+j_coord_offsetC,x+j_coord_offsetD,
 +                                              &jx0,&jy0,&jz0);
 +
 +            /* Calculate displacement vector */
 +            dx00             = _mm_sub_ps(ix0,jx0);
 +            dy00             = _mm_sub_ps(iy0,jy0);
 +            dz00             = _mm_sub_ps(iz0,jz0);
 +
 +            /* Calculate squared distance and things based on it */
 +            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
 +
 +            rinv00           = gmx_mm_invsqrt_ps(rsq00);
 +
 +            /* Load parameters for j particles */
 +            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
 +                                                              charge+jnrC+0,charge+jnrD+0);
 +            isaj0            = gmx_mm_load_4real_swizzle_ps(invsqrta+jnrA+0,invsqrta+jnrB+0,
 +                                                              invsqrta+jnrC+0,invsqrta+jnrD+0);
 +            vdwjidx0A        = 2*vdwtype[jnrA+0];
 +            vdwjidx0B        = 2*vdwtype[jnrB+0];
 +            vdwjidx0C        = 2*vdwtype[jnrC+0];
 +            vdwjidx0D        = 2*vdwtype[jnrD+0];
 +
 +            /**************************
 +             * CALCULATE INTERACTIONS *
 +             **************************/
 +
 +            r00              = _mm_mul_ps(rsq00,rinv00);
 +            r00              = _mm_andnot_ps(dummy_mask,r00);
 +
 +            /* Compute parameters for interactions between i and j atoms */
 +            qq00             = _mm_mul_ps(iq0,jq0);
 +            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
 +                                         vdwparam+vdwioffset0+vdwjidx0B,
 +                                         vdwparam+vdwioffset0+vdwjidx0C,
 +                                         vdwparam+vdwioffset0+vdwjidx0D,
 +                                         &c6_00,&c12_00);
 +
 +            /* Calculate table index by multiplying r with table scale and truncate to integer */
 +            rt               = _mm_mul_ps(r00,vftabscale);
 +            vfitab           = _mm_cvttps_epi32(rt);
 +#ifdef __XOP__
 +            vfeps            = _mm_frcz_ps(rt);
 +#else
 +            vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
 +#endif
 +            twovfeps         = _mm_add_ps(vfeps,vfeps);
 +            vfitab           = _mm_slli_epi32(vfitab,3);
 +
 +            /* GENERALIZED BORN AND COULOMB ELECTROSTATICS */
 +            isaprod          = _mm_mul_ps(isai0,isaj0);
 +            gbqqfactor       = _mm_xor_ps(signbit,_mm_mul_ps(qq00,_mm_mul_ps(isaprod,gbinvepsdiff)));
 +            gbscale          = _mm_mul_ps(isaprod,gbtabscale);
 +
 +            /* Calculate generalized born table index - this is a separate table from the normal one,
 +             * but we use the same procedure by multiplying r with scale and truncating to integer.
 +             */
 +            rt               = _mm_mul_ps(r00,gbscale);
 +            gbitab           = _mm_cvttps_epi32(rt);
 +#ifdef __XOP__
 +            gbeps            = _mm_frcz_ps(rt);
 +#else
 +            gbeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
 +#endif
 +            gbitab           = _mm_slli_epi32(gbitab,2);
 +
 +            Y                = _mm_load_ps( gbtab + _mm_extract_epi32(gbitab,0) );
 +            F                = _mm_load_ps( gbtab + _mm_extract_epi32(gbitab,1) );
 +            G                = _mm_load_ps( gbtab + _mm_extract_epi32(gbitab,2) );
 +            H                = _mm_load_ps( gbtab + _mm_extract_epi32(gbitab,3) );
 +            _MM_TRANSPOSE4_PS(Y,F,G,H);
 +            Fp               = _mm_macc_ps(gbeps,_mm_macc_ps(gbeps,H,G),F);
 +            VV               = _mm_macc_ps(gbeps,Fp,Y);
 +            vgb              = _mm_mul_ps(gbqqfactor,VV);
 +
 +            twogbeps         = _mm_add_ps(gbeps,gbeps);
 +            FF               = _mm_macc_ps(_mm_macc_ps(twogbeps,H,G),gbeps,Fp);
 +            fgb              = _mm_mul_ps(gbqqfactor,_mm_mul_ps(FF,gbscale));
 +            dvdatmp          = _mm_mul_ps(minushalf,_mm_macc_ps(fgb,r00,vgb));
++            dvdatmp          = _mm_andnot_ps(dummy_mask,dvdatmp);
 +            dvdasum          = _mm_add_ps(dvdasum,dvdatmp);
 +            /* The pointers to scratch make sure that this code really can't screw things up, even with compilers that take gmx_restrict seriously (e.g. icc 13). */
 +            fjptrA             = (jnrlistA>=0) ? dvda+jnrA : scratch;
 +            fjptrB             = (jnrlistB>=0) ? dvda+jnrB : scratch;
 +            fjptrC             = (jnrlistC>=0) ? dvda+jnrC : scratch;
 +            fjptrD             = (jnrlistD>=0) ? dvda+jnrD : scratch;
 +            gmx_mm_increment_4real_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,_mm_mul_ps(dvdatmp,_mm_mul_ps(isaj0,isaj0)));
 +            velec            = _mm_mul_ps(qq00,rinv00);
 +            felec            = _mm_mul_ps(_mm_msub_ps(velec,rinv00,fgb),rinv00);
 +
 +            /* CUBIC SPLINE TABLE DISPERSION */
 +            Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
 +            F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
 +            G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
 +            H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
 +            _MM_TRANSPOSE4_PS(Y,F,G,H);
 +            Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
 +            FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
 +            fvdw6            = _mm_mul_ps(c6_00,FF);
 +
 +            /* CUBIC SPLINE TABLE REPULSION */
 +            vfitab           = _mm_add_epi32(vfitab,ifour);
 +            Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
 +            F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
 +            G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
 +            H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
 +            _MM_TRANSPOSE4_PS(Y,F,G,H);
 +            Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
 +            FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
 +            fvdw12           = _mm_mul_ps(c12_00,FF);
 +            fvdw             = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
 +
 +            fscal            = _mm_add_ps(felec,fvdw);
 +
 +            fscal            = _mm_andnot_ps(dummy_mask,fscal);
 +
 +            /* Update vectorial force */
 +            fix0             = _mm_macc_ps(dx00,fscal,fix0);
 +            fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
 +            fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
 +
 +            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
 +            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
 +            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
 +            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
 +            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
 +                                                   _mm_mul_ps(dx00,fscal),
 +                                                   _mm_mul_ps(dy00,fscal),
 +                                                   _mm_mul_ps(dz00,fscal));
 +
 +            /* Inner loop uses 86 flops */
 +        }
 +
 +        /* End of innermost loop */
 +
 +        gmx_mm_update_iforce_1atom_swizzle_ps(fix0,fiy0,fiz0,
 +                                              f+i_coord_offset,fshift+i_shift_offset);
 +
 +        dvdasum = _mm_mul_ps(dvdasum, _mm_mul_ps(isai0,isai0));
 +        gmx_mm_update_1pot_ps(dvdasum,dvda+inr);
 +
 +        /* Increment number of inner iterations */
 +        inneriter                  += j_index_end - j_index_start;
 +
 +        /* Outer loop uses 7 flops */
 +    }
 +
 +    /* Increment number of outer iterations */
 +    outeriter        += nri;
 +
 +    /* Update outer/inner flops */
 +
 +    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_F,outeriter*7 + inneriter*86);
 +}
index a0def42c3d188147def2514db8f6e8a351a168a6,0000000000000000000000000000000000000000..fc0ff76f4e5097b7712034f78baff0c3f76ea958
mode 100644,000000..100644
--- /dev/null
@@@ -1,849 -1,0 +1,851 @@@
 +/*
 + * Note: this file was generated by the Gromacs avx_128_fma_single kernel generator.
 + *
 + *                This source code is part of
 + *
 + *                 G   R   O   M   A   C   S
 + *
 + * Copyright (c) 2001-2012, The GROMACS Development Team
 + *
 + * Gromacs is a library for molecular simulation and trajectory analysis,
 + * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
 + * a full list of developers and information, check out http://www.gromacs.org
 + *
 + * This program is free software; you can redistribute it and/or modify it under
 + * the terms of the GNU Lesser General Public License as published by the Free
 + * Software Foundation; either version 2 of the License, or (at your option) any
 + * later version.
 + *
 + * To help fund GROMACS development, we humbly ask that you cite
 + * the papers people have written on it - you can find them on the website.
 + */
 +#ifdef HAVE_CONFIG_H
 +#include <config.h>
 +#endif
 +
 +#include <math.h>
 +
 +#include "../nb_kernel.h"
 +#include "types/simple.h"
 +#include "vec.h"
 +#include "nrnb.h"
 +
 +#include "gmx_math_x86_avx_128_fma_single.h"
 +#include "kernelutil_x86_avx_128_fma_single.h"
 +
 +/*
 + * Gromacs nonbonded kernel:   nb_kernel_ElecGB_VdwLJ_GeomP1P1_VF_avx_128_fma_single
 + * Electrostatics interaction: GeneralizedBorn
 + * VdW interaction:            LennardJones
 + * Geometry:                   Particle-Particle
 + * Calculate force/pot:        PotentialAndForce
 + */
 +void
 +nb_kernel_ElecGB_VdwLJ_GeomP1P1_VF_avx_128_fma_single
 +                    (t_nblist * gmx_restrict                nlist,
 +                     rvec * gmx_restrict                    xx,
 +                     rvec * gmx_restrict                    ff,
 +                     t_forcerec * gmx_restrict              fr,
 +                     t_mdatoms * gmx_restrict               mdatoms,
 +                     nb_kernel_data_t * gmx_restrict        kernel_data,
 +                     t_nrnb * gmx_restrict                  nrnb)
 +{
 +    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
 +     * just 0 for non-waters.
 +     * Suffixes A,B,C,D refer to j loop unrolling done with AVX_128, e.g. for the four different
 +     * jnr indices corresponding to data put in the four positions in the SIMD register.
 +     */
 +    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
 +    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
 +    int              jnrA,jnrB,jnrC,jnrD;
 +    int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
 +    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
 +    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
 +    real             rcutoff_scalar;
 +    real             *shiftvec,*fshift,*x,*f;
 +    real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
 +    real             scratch[4*DIM];
 +    __m128           fscal,rcutoff,rcutoff2,jidxall;
 +    int              vdwioffset0;
 +    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
 +    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
 +    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
 +    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
 +    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
 +    real             *charge;
 +    __m128i          gbitab;
 +    __m128           vgb,fgb,vgbsum,dvdasum,gbscale,gbtabscale,isaprod,gbqqfactor,gbinvepsdiff,gbeps,twogbeps,dvdatmp;
 +    __m128           minushalf = _mm_set1_ps(-0.5);
 +    real             *invsqrta,*dvda,*gbtab;
 +    int              nvdwtype;
 +    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
 +    int              *vdwtype;
 +    real             *vdwparam;
 +    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
 +    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
 +    __m128i          vfitab;
 +    __m128i          ifour       = _mm_set1_epi32(4);
 +    __m128           rt,vfeps,twovfeps,vftabscale,Y,F,G,H,Fp,VV,FF;
 +    real             *vftab;
 +    __m128           dummy_mask,cutoff_mask;
 +    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
 +    __m128           one     = _mm_set1_ps(1.0);
 +    __m128           two     = _mm_set1_ps(2.0);
 +    x                = xx[0];
 +    f                = ff[0];
 +
 +    nri              = nlist->nri;
 +    iinr             = nlist->iinr;
 +    jindex           = nlist->jindex;
 +    jjnr             = nlist->jjnr;
 +    shiftidx         = nlist->shift;
 +    gid              = nlist->gid;
 +    shiftvec         = fr->shift_vec[0];
 +    fshift           = fr->fshift[0];
 +    facel            = _mm_set1_ps(fr->epsfac);
 +    charge           = mdatoms->chargeA;
 +    nvdwtype         = fr->ntype;
 +    vdwparam         = fr->nbfp;
 +    vdwtype          = mdatoms->typeA;
 +
 +    invsqrta         = fr->invsqrta;
 +    dvda             = fr->dvda;
 +    gbtabscale       = _mm_set1_ps(fr->gbtab.scale);
 +    gbtab            = fr->gbtab.data;
 +    gbinvepsdiff     = _mm_set1_ps((1.0/fr->epsilon_r) - (1.0/fr->gb_epsilon_solvent));
 +
 +    /* Avoid stupid compiler warnings */
 +    jnrA = jnrB = jnrC = jnrD = 0;
 +    j_coord_offsetA = 0;
 +    j_coord_offsetB = 0;
 +    j_coord_offsetC = 0;
 +    j_coord_offsetD = 0;
 +
 +    outeriter        = 0;
 +    inneriter        = 0;
 +
 +    for(iidx=0;iidx<4*DIM;iidx++)
 +    {
 +        scratch[iidx] = 0.0;
 +    }
 +
 +    /* Start outer loop over neighborlists */
 +    for(iidx=0; iidx<nri; iidx++)
 +    {
 +        /* Load shift vector for this list */
 +        i_shift_offset   = DIM*shiftidx[iidx];
 +
 +        /* Load limits for loop over neighbors */
 +        j_index_start    = jindex[iidx];
 +        j_index_end      = jindex[iidx+1];
 +
 +        /* Get outer coordinate index */
 +        inr              = iinr[iidx];
 +        i_coord_offset   = DIM*inr;
 +
 +        /* Load i particle coords and add shift vector */
 +        gmx_mm_load_shift_and_1rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
 +
 +        fix0             = _mm_setzero_ps();
 +        fiy0             = _mm_setzero_ps();
 +        fiz0             = _mm_setzero_ps();
 +
 +        /* Load parameters for i particles */
 +        iq0              = _mm_mul_ps(facel,_mm_load1_ps(charge+inr+0));
 +        isai0            = _mm_load1_ps(invsqrta+inr+0);
 +        vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
 +
 +        /* Reset potential sums */
 +        velecsum         = _mm_setzero_ps();
 +        vgbsum           = _mm_setzero_ps();
 +        vvdwsum          = _mm_setzero_ps();
 +        dvdasum          = _mm_setzero_ps();
 +
 +        /* Start inner kernel loop */
 +        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
 +        {
 +
 +            /* Get j neighbor index, and coordinate index */
 +            jnrA             = jjnr[jidx];
 +            jnrB             = jjnr[jidx+1];
 +            jnrC             = jjnr[jidx+2];
 +            jnrD             = jjnr[jidx+3];
 +            j_coord_offsetA  = DIM*jnrA;
 +            j_coord_offsetB  = DIM*jnrB;
 +            j_coord_offsetC  = DIM*jnrC;
 +            j_coord_offsetD  = DIM*jnrD;
 +
 +            /* load j atom coordinates */
 +            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
 +                                              x+j_coord_offsetC,x+j_coord_offsetD,
 +                                              &jx0,&jy0,&jz0);
 +
 +            /* Calculate displacement vector */
 +            dx00             = _mm_sub_ps(ix0,jx0);
 +            dy00             = _mm_sub_ps(iy0,jy0);
 +            dz00             = _mm_sub_ps(iz0,jz0);
 +
 +            /* Calculate squared distance and things based on it */
 +            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
 +
 +            rinv00           = gmx_mm_invsqrt_ps(rsq00);
 +
 +            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
 +
 +            /* Load parameters for j particles */
 +            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
 +                                                              charge+jnrC+0,charge+jnrD+0);
 +            isaj0            = gmx_mm_load_4real_swizzle_ps(invsqrta+jnrA+0,invsqrta+jnrB+0,
 +                                                              invsqrta+jnrC+0,invsqrta+jnrD+0);
 +            vdwjidx0A        = 2*vdwtype[jnrA+0];
 +            vdwjidx0B        = 2*vdwtype[jnrB+0];
 +            vdwjidx0C        = 2*vdwtype[jnrC+0];
 +            vdwjidx0D        = 2*vdwtype[jnrD+0];
 +
 +            /**************************
 +             * CALCULATE INTERACTIONS *
 +             **************************/
 +
 +            r00              = _mm_mul_ps(rsq00,rinv00);
 +
 +            /* Compute parameters for interactions between i and j atoms */
 +            qq00             = _mm_mul_ps(iq0,jq0);
 +            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
 +                                         vdwparam+vdwioffset0+vdwjidx0B,
 +                                         vdwparam+vdwioffset0+vdwjidx0C,
 +                                         vdwparam+vdwioffset0+vdwjidx0D,
 +                                         &c6_00,&c12_00);
 +
 +            /* GENERALIZED BORN AND COULOMB ELECTROSTATICS */
 +            isaprod          = _mm_mul_ps(isai0,isaj0);
 +            gbqqfactor       = _mm_xor_ps(signbit,_mm_mul_ps(qq00,_mm_mul_ps(isaprod,gbinvepsdiff)));
 +            gbscale          = _mm_mul_ps(isaprod,gbtabscale);
 +
 +            /* Calculate generalized born table index - this is a separate table from the normal one,
 +             * but we use the same procedure by multiplying r with scale and truncating to integer.
 +             */
 +            rt               = _mm_mul_ps(r00,gbscale);
 +            gbitab           = _mm_cvttps_epi32(rt);
 +#ifdef __XOP__
 +            gbeps            = _mm_frcz_ps(rt);
 +#else
 +            gbeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
 +#endif
 +            gbitab           = _mm_slli_epi32(gbitab,2);
 +
 +            Y                = _mm_load_ps( gbtab + _mm_extract_epi32(gbitab,0) );
 +            F                = _mm_load_ps( gbtab + _mm_extract_epi32(gbitab,1) );
 +            G                = _mm_load_ps( gbtab + _mm_extract_epi32(gbitab,2) );
 +            H                = _mm_load_ps( gbtab + _mm_extract_epi32(gbitab,3) );
 +            _MM_TRANSPOSE4_PS(Y,F,G,H);
 +            Fp               = _mm_macc_ps(gbeps,_mm_macc_ps(gbeps,H,G),F);
 +            VV               = _mm_macc_ps(gbeps,Fp,Y);
 +            vgb              = _mm_mul_ps(gbqqfactor,VV);
 +
 +            twogbeps         = _mm_add_ps(gbeps,gbeps);
 +            FF               = _mm_macc_ps(_mm_macc_ps(twogbeps,H,G),gbeps,Fp);
 +            fgb              = _mm_mul_ps(gbqqfactor,_mm_mul_ps(FF,gbscale));
 +            dvdatmp          = _mm_mul_ps(minushalf,_mm_macc_ps(fgb,r00,vgb));
 +            dvdasum          = _mm_add_ps(dvdasum,dvdatmp);
 +            fjptrA           = dvda+jnrA;
 +            fjptrB           = dvda+jnrB;
 +            fjptrC           = dvda+jnrC;
 +            fjptrD           = dvda+jnrD;
 +            gmx_mm_increment_4real_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,_mm_mul_ps(dvdatmp,_mm_mul_ps(isaj0,isaj0)));
 +            velec            = _mm_mul_ps(qq00,rinv00);
 +            felec            = _mm_mul_ps(_mm_msub_ps(velec,rinv00,fgb),rinv00);
 +
 +            /* LENNARD-JONES DISPERSION/REPULSION */
 +
 +            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
 +            vvdw6            = _mm_mul_ps(c6_00,rinvsix);
 +            vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
 +            vvdw             = _mm_msub_ps(vvdw12,one_twelfth,_mm_mul_ps(vvdw6,one_sixth));
 +            fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
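 +            /* The 1/6 and 1/12 factors in vvdw indicate that vdwparam
 +             * stores the Lennard-Jones coefficients premultiplied by 6 and
 +             * 12, so fvdw = (12*C12/r^12 - 6*C6/r^6)/r^2 comes out directly
 +             * as F/r.
 +             */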
 +
 +            /* Update potential sum for this i atom from the interaction with this j atom. */
 +            velecsum         = _mm_add_ps(velecsum,velec);
 +            vgbsum           = _mm_add_ps(vgbsum,vgb);
 +            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
 +
 +            fscal            = _mm_add_ps(felec,fvdw);
 +
 +            /* Update vectorial force */
 +            fix0             = _mm_macc_ps(dx00,fscal,fix0);
 +            fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
 +            fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
 +
 +            fjptrA             = f+j_coord_offsetA;
 +            fjptrB             = f+j_coord_offsetB;
 +            fjptrC             = f+j_coord_offsetC;
 +            fjptrD             = f+j_coord_offsetD;
 +            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
 +                                                   _mm_mul_ps(dx00,fscal),
 +                                                   _mm_mul_ps(dy00,fscal),
 +                                                   _mm_mul_ps(dz00,fscal));
 +
 +            /* Inner loop uses 74 flops */
 +        }
 +
 +        if(jidx<j_index_end)
 +        {
 +
 +            /* Get j neighbor index, and coordinate index */
 +            jnrlistA         = jjnr[jidx];
 +            jnrlistB         = jjnr[jidx+1];
 +            jnrlistC         = jjnr[jidx+2];
 +            jnrlistD         = jjnr[jidx+3];
 +            /* Sign of each element will be negative for non-real atoms.
 +             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
 +             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
 +             */
 +            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 +            jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
 +            jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
 +            jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
 +            jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
 +            j_coord_offsetA  = DIM*jnrA;
 +            j_coord_offsetB  = DIM*jnrB;
 +            j_coord_offsetC  = DIM*jnrC;
 +            j_coord_offsetD  = DIM*jnrD;
 +
 +            /* load j atom coordinates */
 +            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
 +                                              x+j_coord_offsetC,x+j_coord_offsetD,
 +                                              &jx0,&jy0,&jz0);
 +
 +            /* Calculate displacement vector */
 +            dx00             = _mm_sub_ps(ix0,jx0);
 +            dy00             = _mm_sub_ps(iy0,jy0);
 +            dz00             = _mm_sub_ps(iz0,jz0);
 +
 +            /* Calculate squared distance and things based on it */
 +            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
 +
 +            rinv00           = gmx_mm_invsqrt_ps(rsq00);
 +
 +            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
 +
 +            /* Load parameters for j particles */
 +            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
 +                                                              charge+jnrC+0,charge+jnrD+0);
 +            isaj0            = gmx_mm_load_4real_swizzle_ps(invsqrta+jnrA+0,invsqrta+jnrB+0,
 +                                                              invsqrta+jnrC+0,invsqrta+jnrD+0);
 +            vdwjidx0A        = 2*vdwtype[jnrA+0];
 +            vdwjidx0B        = 2*vdwtype[jnrB+0];
 +            vdwjidx0C        = 2*vdwtype[jnrC+0];
 +            vdwjidx0D        = 2*vdwtype[jnrD+0];
 +
 +            /**************************
 +             * CALCULATE INTERACTIONS *
 +             **************************/
 +
 +            r00              = _mm_mul_ps(rsq00,rinv00);
 +            r00              = _mm_andnot_ps(dummy_mask,r00);
 +
 +            /* Compute parameters for interactions between i and j atoms */
 +            qq00             = _mm_mul_ps(iq0,jq0);
 +            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
 +                                         vdwparam+vdwioffset0+vdwjidx0B,
 +                                         vdwparam+vdwioffset0+vdwjidx0C,
 +                                         vdwparam+vdwioffset0+vdwjidx0D,
 +                                         &c6_00,&c12_00);
 +
 +            /* GENERALIZED BORN AND COULOMB ELECTROSTATICS */
 +            isaprod          = _mm_mul_ps(isai0,isaj0);
 +            gbqqfactor       = _mm_xor_ps(signbit,_mm_mul_ps(qq00,_mm_mul_ps(isaprod,gbinvepsdiff)));
 +            gbscale          = _mm_mul_ps(isaprod,gbtabscale);
 +
 +            /* Calculate generalized born table index - this is a separate table from the normal one,
 +             * but we use the same procedure by multiplying r with scale and truncating to integer.
 +             */
 +            rt               = _mm_mul_ps(r00,gbscale);
 +            gbitab           = _mm_cvttps_epi32(rt);
 +#ifdef __XOP__
 +            gbeps            = _mm_frcz_ps(rt);
 +#else
 +            gbeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
 +#endif
 +            gbitab           = _mm_slli_epi32(gbitab,2);
 +
 +            Y                = _mm_load_ps( gbtab + _mm_extract_epi32(gbitab,0) );
 +            F                = _mm_load_ps( gbtab + _mm_extract_epi32(gbitab,1) );
 +            G                = _mm_load_ps( gbtab + _mm_extract_epi32(gbitab,2) );
 +            H                = _mm_load_ps( gbtab + _mm_extract_epi32(gbitab,3) );
 +            _MM_TRANSPOSE4_PS(Y,F,G,H);
 +            Fp               = _mm_macc_ps(gbeps,_mm_macc_ps(gbeps,H,G),F);
 +            VV               = _mm_macc_ps(gbeps,Fp,Y);
 +            vgb              = _mm_mul_ps(gbqqfactor,VV);
 +
 +            twogbeps         = _mm_add_ps(gbeps,gbeps);
 +            FF               = _mm_macc_ps(_mm_macc_ps(twogbeps,H,G),gbeps,Fp);
 +            fgb              = _mm_mul_ps(gbqqfactor,_mm_mul_ps(FF,gbscale));
 +            dvdatmp          = _mm_mul_ps(minushalf,_mm_macc_ps(fgb,r00,vgb));
++            dvdatmp          = _mm_andnot_ps(dummy_mask,dvdatmp);
 +            dvdasum          = _mm_add_ps(dvdasum,dvdatmp);
 +            /* The pointers to scratch make sure that this code really can't screw things up, even with compilers that take gmx_restrict seriously (e.g. icc 13). */
 +            fjptrA             = (jnrlistA>=0) ? dvda+jnrA : scratch;
 +            fjptrB             = (jnrlistB>=0) ? dvda+jnrB : scratch;
 +            fjptrC             = (jnrlistC>=0) ? dvda+jnrC : scratch;
 +            fjptrD             = (jnrlistD>=0) ? dvda+jnrD : scratch;
 +            gmx_mm_increment_4real_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,_mm_mul_ps(dvdatmp,_mm_mul_ps(isaj0,isaj0)));
 +            velec            = _mm_mul_ps(qq00,rinv00);
 +            felec            = _mm_mul_ps(_mm_msub_ps(velec,rinv00,fgb),rinv00);
 +
 +            /* LENNARD-JONES DISPERSION/REPULSION */
 +
 +            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
 +            vvdw6            = _mm_mul_ps(c6_00,rinvsix);
 +            vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
 +            vvdw             = _mm_msub_ps(vvdw12,one_twelfth,_mm_mul_ps(vvdw6,one_sixth));
 +            fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
 +
 +            /* Update potential sum for this i atom from the interaction with this j atom. */
 +            velec            = _mm_andnot_ps(dummy_mask,velec);
 +            velecsum         = _mm_add_ps(velecsum,velec);
 +            vgb              = _mm_andnot_ps(dummy_mask,vgb);
 +            vgbsum           = _mm_add_ps(vgbsum,vgb);
 +            vvdw             = _mm_andnot_ps(dummy_mask,vvdw);
 +            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
 +
 +            fscal            = _mm_add_ps(felec,fvdw);
 +
 +            fscal            = _mm_andnot_ps(dummy_mask,fscal);
 +
 +            /* Update vectorial force */
 +            fix0             = _mm_macc_ps(dx00,fscal,fix0);
 +            fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
 +            fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
 +
 +            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
 +            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
 +            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
 +            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
 +            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
 +                                                   _mm_mul_ps(dx00,fscal),
 +                                                   _mm_mul_ps(dy00,fscal),
 +                                                   _mm_mul_ps(dz00,fscal));
 +
 +            /* Inner loop uses 75 flops */
 +        }
 +
 +        /* End of innermost loop */
 +
 +        gmx_mm_update_iforce_1atom_swizzle_ps(fix0,fiy0,fiz0,
 +                                              f+i_coord_offset,fshift+i_shift_offset);
 +
 +        ggid                        = gid[iidx];
 +        /* Update potential energies */
 +        gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
 +        gmx_mm_update_1pot_ps(vgbsum,kernel_data->energygrp_polarization+ggid);
 +        gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
 +        dvdasum = _mm_mul_ps(dvdasum, _mm_mul_ps(isai0,isai0));
 +        gmx_mm_update_1pot_ps(dvdasum,dvda+inr);
 +
 +        /* Increment number of inner iterations */
 +        inneriter                  += j_index_end - j_index_start;
 +
 +        /* Outer loop uses 10 flops */
 +    }
 +
 +    /* Increment number of outer iterations */
 +    outeriter        += nri;
 +
 +    /* Update outer/inner flops */
 +
 +    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_VF,outeriter*10 + inneriter*75);
 +}
 +/*
 + * Gromacs nonbonded kernel:   nb_kernel_ElecGB_VdwLJ_GeomP1P1_F_avx_128_fma_single
 + * Electrostatics interaction: GeneralizedBorn
 + * VdW interaction:            LennardJones
 + * Geometry:                   Particle-Particle
 + * Calculate force/pot:        Force
 + */
 +void
 +nb_kernel_ElecGB_VdwLJ_GeomP1P1_F_avx_128_fma_single
 +                    (t_nblist * gmx_restrict                nlist,
 +                     rvec * gmx_restrict                    xx,
 +                     rvec * gmx_restrict                    ff,
 +                     t_forcerec * gmx_restrict              fr,
 +                     t_mdatoms * gmx_restrict               mdatoms,
 +                     nb_kernel_data_t * gmx_restrict        kernel_data,
 +                     t_nrnb * gmx_restrict                  nrnb)
 +{
 +    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
 +     * just 0 for non-waters.
 +     * Suffixes A,B,C,D refer to j loop unrolling done with AVX_128, e.g. for the four different
 +     * jnr indices corresponding to data put in the four positions in the SIMD register.
 +     */
 +    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
 +    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
 +    int              jnrA,jnrB,jnrC,jnrD;
 +    int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
 +    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
 +    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
 +    real             rcutoff_scalar;
 +    real             *shiftvec,*fshift,*x,*f;
 +    real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
 +    real             scratch[4*DIM];
 +    __m128           fscal,rcutoff,rcutoff2,jidxall;
 +    int              vdwioffset0;
 +    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
 +    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
 +    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
 +    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
 +    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
 +    real             *charge;
 +    __m128i          gbitab;
 +    __m128           vgb,fgb,vgbsum,dvdasum,gbscale,gbtabscale,isaprod,gbqqfactor,gbinvepsdiff,gbeps,twogbeps,dvdatmp;
 +    __m128           minushalf = _mm_set1_ps(-0.5);
 +    real             *invsqrta,*dvda,*gbtab;
 +    int              nvdwtype;
 +    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
 +    int              *vdwtype;
 +    real             *vdwparam;
 +    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
 +    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
 +    __m128i          vfitab;
 +    __m128i          ifour       = _mm_set1_epi32(4);
 +    __m128           rt,vfeps,twovfeps,vftabscale,Y,F,G,H,Fp,VV,FF;
 +    real             *vftab;
 +    __m128           dummy_mask,cutoff_mask;
 +    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
 +    __m128           one     = _mm_set1_ps(1.0);
 +    __m128           two     = _mm_set1_ps(2.0);
 +    x                = xx[0];
 +    f                = ff[0];
 +
 +    nri              = nlist->nri;
 +    iinr             = nlist->iinr;
 +    jindex           = nlist->jindex;
 +    jjnr             = nlist->jjnr;
 +    shiftidx         = nlist->shift;
 +    gid              = nlist->gid;
 +    shiftvec         = fr->shift_vec[0];
 +    fshift           = fr->fshift[0];
 +    facel            = _mm_set1_ps(fr->epsfac);
 +    charge           = mdatoms->chargeA;
 +    nvdwtype         = fr->ntype;
 +    vdwparam         = fr->nbfp;
 +    vdwtype          = mdatoms->typeA;
 +
 +    invsqrta         = fr->invsqrta;
 +    dvda             = fr->dvda;
 +    gbtabscale       = _mm_set1_ps(fr->gbtab.scale);
 +    gbtab            = fr->gbtab.data;
 +    gbinvepsdiff     = _mm_set1_ps((1.0/fr->epsilon_r) - (1.0/fr->gb_epsilon_solvent));
 +
 +    /* Avoid stupid compiler warnings */
 +    jnrA = jnrB = jnrC = jnrD = 0;
 +    j_coord_offsetA = 0;
 +    j_coord_offsetB = 0;
 +    j_coord_offsetC = 0;
 +    j_coord_offsetD = 0;
 +
 +    outeriter        = 0;
 +    inneriter        = 0;
 +
 +    for(iidx=0; iidx<4*DIM; iidx++)
 +    {
 +        scratch[iidx] = 0.0;
 +    }
 +
 +    /* Start outer loop over neighborlists */
 +    for(iidx=0; iidx<nri; iidx++)
 +    {
 +        /* Load shift vector for this list */
 +        i_shift_offset   = DIM*shiftidx[iidx];
 +
 +        /* Load limits for loop over neighbors */
 +        j_index_start    = jindex[iidx];
 +        j_index_end      = jindex[iidx+1];
 +
 +        /* Get outer coordinate index */
 +        inr              = iinr[iidx];
 +        i_coord_offset   = DIM*inr;
 +
 +        /* Load i particle coords and add shift vector */
 +        gmx_mm_load_shift_and_1rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
 +
 +        fix0             = _mm_setzero_ps();
 +        fiy0             = _mm_setzero_ps();
 +        fiz0             = _mm_setzero_ps();
 +
 +        /* Load parameters for i particles */
 +        iq0              = _mm_mul_ps(facel,_mm_load1_ps(charge+inr+0));
 +        isai0            = _mm_load1_ps(invsqrta+inr+0);
 +        vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
 +
 +        dvdasum          = _mm_setzero_ps();
 +
 +        /* Start inner kernel loop */
 +        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
 +        {
 +
 +            /* Get j neighbor index, and coordinate index */
 +            jnrA             = jjnr[jidx];
 +            jnrB             = jjnr[jidx+1];
 +            jnrC             = jjnr[jidx+2];
 +            jnrD             = jjnr[jidx+3];
 +            j_coord_offsetA  = DIM*jnrA;
 +            j_coord_offsetB  = DIM*jnrB;
 +            j_coord_offsetC  = DIM*jnrC;
 +            j_coord_offsetD  = DIM*jnrD;
 +
 +            /* load j atom coordinates */
 +            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
 +                                              x+j_coord_offsetC,x+j_coord_offsetD,
 +                                              &jx0,&jy0,&jz0);
 +
 +            /* Calculate displacement vector */
 +            dx00             = _mm_sub_ps(ix0,jx0);
 +            dy00             = _mm_sub_ps(iy0,jy0);
 +            dz00             = _mm_sub_ps(iz0,jz0);
 +
 +            /* Calculate squared distance and things based on it */
 +            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
 +
 +            rinv00           = gmx_mm_invsqrt_ps(rsq00);
 +
 +            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
 +
 +            /* Load parameters for j particles */
 +            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
 +                                                              charge+jnrC+0,charge+jnrD+0);
 +            isaj0            = gmx_mm_load_4real_swizzle_ps(invsqrta+jnrA+0,invsqrta+jnrB+0,
 +                                                              invsqrta+jnrC+0,invsqrta+jnrD+0);
 +            vdwjidx0A        = 2*vdwtype[jnrA+0];
 +            vdwjidx0B        = 2*vdwtype[jnrB+0];
 +            vdwjidx0C        = 2*vdwtype[jnrC+0];
 +            vdwjidx0D        = 2*vdwtype[jnrD+0];
 +
 +            /**************************
 +             * CALCULATE INTERACTIONS *
 +             **************************/
 +
 +            r00              = _mm_mul_ps(rsq00,rinv00);
 +
 +            /* Compute parameters for interactions between i and j atoms */
 +            qq00             = _mm_mul_ps(iq0,jq0);
 +            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
 +                                         vdwparam+vdwioffset0+vdwjidx0B,
 +                                         vdwparam+vdwioffset0+vdwjidx0C,
 +                                         vdwparam+vdwioffset0+vdwjidx0D,
 +                                         &c6_00,&c12_00);
 +
 +            /* GENERALIZED BORN AND COULOMB ELECTROSTATICS */
 +            isaprod          = _mm_mul_ps(isai0,isaj0);
 +            gbqqfactor       = _mm_xor_ps(signbit,_mm_mul_ps(qq00,_mm_mul_ps(isaprod,gbinvepsdiff)));
 +            gbscale          = _mm_mul_ps(isaprod,gbtabscale);
 +
 +            /* Calculate generalized born table index - this is a separate table from the normal one,
 +             * but we use the same procedure: multiply r by the scale and truncate to an integer.
 +             */
 +            rt               = _mm_mul_ps(r00,gbscale);
 +            gbitab           = _mm_cvttps_epi32(rt);
 +#ifdef __XOP__
 +            gbeps            = _mm_frcz_ps(rt);
 +#else
 +            gbeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
 +#endif
 +            gbitab           = _mm_slli_epi32(gbitab,2);
 +
 +            Y                = _mm_load_ps( gbtab + _mm_extract_epi32(gbitab,0) );
 +            F                = _mm_load_ps( gbtab + _mm_extract_epi32(gbitab,1) );
 +            G                = _mm_load_ps( gbtab + _mm_extract_epi32(gbitab,2) );
 +            H                = _mm_load_ps( gbtab + _mm_extract_epi32(gbitab,3) );
 +            _MM_TRANSPOSE4_PS(Y,F,G,H);
 +            Fp               = _mm_macc_ps(gbeps,_mm_macc_ps(gbeps,H,G),F);
 +            VV               = _mm_macc_ps(gbeps,Fp,Y);
 +            vgb              = _mm_mul_ps(gbqqfactor,VV);
 +
 +            twogbeps         = _mm_add_ps(gbeps,gbeps);
 +            FF               = _mm_macc_ps(_mm_macc_ps(twogbeps,H,G),gbeps,Fp);
 +            fgb              = _mm_mul_ps(gbqqfactor,_mm_mul_ps(FF,gbscale));
 +            dvdatmp          = _mm_mul_ps(minushalf,_mm_macc_ps(fgb,r00,vgb));
 +            dvdasum          = _mm_add_ps(dvdasum,dvdatmp);
 +            fjptrA           = dvda+jnrA;
 +            fjptrB           = dvda+jnrB;
 +            fjptrC           = dvda+jnrC;
 +            fjptrD           = dvda+jnrD;
 +            gmx_mm_increment_4real_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,_mm_mul_ps(dvdatmp,_mm_mul_ps(isaj0,isaj0)));
 +            velec            = _mm_mul_ps(qq00,rinv00);
 +            felec            = _mm_mul_ps(_mm_msub_ps(velec,rinv00,fgb),rinv00);
 +
 +            /* LENNARD-JONES DISPERSION/REPULSION */
 +
 +            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
 +            fvdw             = _mm_mul_ps(_mm_msub_ps(c12_00,rinvsix,c6_00),_mm_mul_ps(rinvsix,rinvsq00));
 +
 +            fscal            = _mm_add_ps(felec,fvdw);
 +
 +            /* Update vectorial force */
 +            fix0             = _mm_macc_ps(dx00,fscal,fix0);
 +            fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
 +            fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
 +
 +            fjptrA             = f+j_coord_offsetA;
 +            fjptrB             = f+j_coord_offsetB;
 +            fjptrC             = f+j_coord_offsetC;
 +            fjptrD             = f+j_coord_offsetD;
 +            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
 +                                                   _mm_mul_ps(dx00,fscal),
 +                                                   _mm_mul_ps(dy00,fscal),
 +                                                   _mm_mul_ps(dz00,fscal));
 +
 +            /* Inner loop uses 67 flops */
 +        }
 +
 +        if(jidx<j_index_end)
 +        {
 +
 +            /* Get j neighbor index, and coordinate index */
 +            jnrlistA         = jjnr[jidx];
 +            jnrlistB         = jjnr[jidx+1];
 +            jnrlistC         = jjnr[jidx+2];
 +            jnrlistD         = jjnr[jidx+3];
 +            /* The sign of each element will be negative for non-real atoms.
 +             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
 +             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
 +             */
 +            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 +            jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
 +            jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
 +            jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
 +            jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
 +            j_coord_offsetA  = DIM*jnrA;
 +            j_coord_offsetB  = DIM*jnrB;
 +            j_coord_offsetC  = DIM*jnrC;
 +            j_coord_offsetD  = DIM*jnrD;
 +
 +            /* load j atom coordinates */
 +            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
 +                                              x+j_coord_offsetC,x+j_coord_offsetD,
 +                                              &jx0,&jy0,&jz0);
 +
 +            /* Calculate displacement vector */
 +            dx00             = _mm_sub_ps(ix0,jx0);
 +            dy00             = _mm_sub_ps(iy0,jy0);
 +            dz00             = _mm_sub_ps(iz0,jz0);
 +
 +            /* Calculate squared distance and things based on it */
 +            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
 +
 +            rinv00           = gmx_mm_invsqrt_ps(rsq00);
 +
 +            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
 +
 +            /* Load parameters for j particles */
 +            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
 +                                                              charge+jnrC+0,charge+jnrD+0);
 +            isaj0            = gmx_mm_load_4real_swizzle_ps(invsqrta+jnrA+0,invsqrta+jnrB+0,
 +                                                              invsqrta+jnrC+0,invsqrta+jnrD+0);
 +            vdwjidx0A        = 2*vdwtype[jnrA+0];
 +            vdwjidx0B        = 2*vdwtype[jnrB+0];
 +            vdwjidx0C        = 2*vdwtype[jnrC+0];
 +            vdwjidx0D        = 2*vdwtype[jnrD+0];
 +
 +            /**************************
 +             * CALCULATE INTERACTIONS *
 +             **************************/
 +
 +            r00              = _mm_mul_ps(rsq00,rinv00);
 +            r00              = _mm_andnot_ps(dummy_mask,r00);
 +
 +            /* Compute parameters for interactions between i and j atoms */
 +            qq00             = _mm_mul_ps(iq0,jq0);
 +            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
 +                                         vdwparam+vdwioffset0+vdwjidx0B,
 +                                         vdwparam+vdwioffset0+vdwjidx0C,
 +                                         vdwparam+vdwioffset0+vdwjidx0D,
 +                                         &c6_00,&c12_00);
 +
 +            /* GENERALIZED BORN AND COULOMB ELECTROSTATICS */
 +            isaprod          = _mm_mul_ps(isai0,isaj0);
 +            gbqqfactor       = _mm_xor_ps(signbit,_mm_mul_ps(qq00,_mm_mul_ps(isaprod,gbinvepsdiff)));
 +            gbscale          = _mm_mul_ps(isaprod,gbtabscale);
 +
 +            /* Calculate generalized born table index - this is a separate table from the normal one,
 +             * but we use the same procedure: multiply r by the scale and truncate to an integer.
 +             */
 +            rt               = _mm_mul_ps(r00,gbscale);
 +            gbitab           = _mm_cvttps_epi32(rt);
 +#ifdef __XOP__
 +            gbeps            = _mm_frcz_ps(rt);
 +#else
 +            gbeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
 +#endif
 +            gbitab           = _mm_slli_epi32(gbitab,2);
 +
 +            Y                = _mm_load_ps( gbtab + _mm_extract_epi32(gbitab,0) );
 +            F                = _mm_load_ps( gbtab + _mm_extract_epi32(gbitab,1) );
 +            G                = _mm_load_ps( gbtab + _mm_extract_epi32(gbitab,2) );
 +            H                = _mm_load_ps( gbtab + _mm_extract_epi32(gbitab,3) );
 +            _MM_TRANSPOSE4_PS(Y,F,G,H);
 +            Fp               = _mm_macc_ps(gbeps,_mm_macc_ps(gbeps,H,G),F);
 +            VV               = _mm_macc_ps(gbeps,Fp,Y);
 +            vgb              = _mm_mul_ps(gbqqfactor,VV);
 +
 +            twogbeps         = _mm_add_ps(gbeps,gbeps);
 +            FF               = _mm_macc_ps(_mm_macc_ps(twogbeps,H,G),gbeps,Fp);
 +            fgb              = _mm_mul_ps(gbqqfactor,_mm_mul_ps(FF,gbscale));
 +            dvdatmp          = _mm_mul_ps(minushalf,_mm_macc_ps(fgb,r00,vgb));
++            dvdatmp          = _mm_andnot_ps(dummy_mask,dvdatmp);
 +            dvdasum          = _mm_add_ps(dvdasum,dvdatmp);
 +            /* The pointers to scratch make sure that this code really can't screw things up, even with compilers that take gmx_restrict seriously (e.g. icc 13). */
 +            fjptrA             = (jnrlistA>=0) ? dvda+jnrA : scratch;
 +            fjptrB             = (jnrlistB>=0) ? dvda+jnrB : scratch;
 +            fjptrC             = (jnrlistC>=0) ? dvda+jnrC : scratch;
 +            fjptrD             = (jnrlistD>=0) ? dvda+jnrD : scratch;
 +            gmx_mm_increment_4real_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,_mm_mul_ps(dvdatmp,_mm_mul_ps(isaj0,isaj0)));
 +            velec            = _mm_mul_ps(qq00,rinv00);
 +            felec            = _mm_mul_ps(_mm_msub_ps(velec,rinv00,fgb),rinv00);
 +
 +            /* LENNARD-JONES DISPERSION/REPULSION */
 +
 +            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
 +            fvdw             = _mm_mul_ps(_mm_msub_ps(c12_00,rinvsix,c6_00),_mm_mul_ps(rinvsix,rinvsq00));
 +
 +            fscal            = _mm_add_ps(felec,fvdw);
 +
 +            fscal            = _mm_andnot_ps(dummy_mask,fscal);
 +
 +            /* Update vectorial force */
 +            fix0             = _mm_macc_ps(dx00,fscal,fix0);
 +            fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
 +            fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
 +
 +            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
 +            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
 +            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
 +            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
 +            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
 +                                                   _mm_mul_ps(dx00,fscal),
 +                                                   _mm_mul_ps(dy00,fscal),
 +                                                   _mm_mul_ps(dz00,fscal));
 +
 +            /* Inner loop uses 68 flops */
 +        }
 +
 +        /* End of innermost loop */
 +
 +        gmx_mm_update_iforce_1atom_swizzle_ps(fix0,fiy0,fiz0,
 +                                              f+i_coord_offset,fshift+i_shift_offset);
 +
 +        dvdasum = _mm_mul_ps(dvdasum, _mm_mul_ps(isai0,isai0));
 +        gmx_mm_update_1pot_ps(dvdasum,dvda+inr);
 +
 +        /* Increment number of inner iterations */
 +        inneriter                  += j_index_end - j_index_start;
 +
 +        /* Outer loop uses 7 flops */
 +    }
 +
 +    /* Increment number of outer iterations */
 +    outeriter        += nri;
 +
 +    /* Update outer/inner flops */
 +
 +    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_F,outeriter*7 + inneriter*68);
 +}
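 +
 +/* A minimal sketch of the dummy-mask and scratch-pointer techniques used in
 + * the epilogue loops above (illustration only, not generated code; the
 + * helper names are hypothetical). The intrinsics are available through the
 + * kernelutil header this file already includes, and it assumes, as in the
 + * kernels, that padding entries in jjnr are negative.
 + */
 +static __m128
 +clear_padding_lanes(const int *jlist, __m128 val)
 +{
 +    /* Lanes holding negative (padding) indices compare true -> all bits set */
 +    __m128i idx  = _mm_loadu_si128((const __m128i *)jlist);
 +    __m128  mask = _mm_castsi128_ps(_mm_cmplt_epi32(idx,_mm_setzero_si128()));
 +
 +    /* andnot keeps lanes where the mask is zero and clears padding lanes */
 +    return _mm_andnot_ps(mask,val);
 +}
 +
 +static void
 +scatter_real_or_scratch(real *dest, real *scratch, int jnrlist, real contrib)
 +{
 +    /* Padding lanes are redirected to a scratch buffer, so the store is
 +     * harmless and stays safe under gmx_restrict-honoring compilers.   */
 +    real *ptr = (jnrlist>=0) ? dest : scratch;
 +    *ptr += contrib;
 +}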
index 696a9dde11469c63a204a60e358fcb126a3e1cb4,0000000000000000000000000000000000000000..2470c88c5a5c778919bddac4cb57b3c80606c6a2
mode 100644,000000..100644
--- /dev/null
@@@ -1,754 -1,0 +1,756 @@@
 +/*
 + * Note: this file was generated by the Gromacs avx_128_fma_single kernel generator.
 + *
 + *                This source code is part of
 + *
 + *                 G   R   O   M   A   C   S
 + *
 + * Copyright (c) 2001-2012, The GROMACS Development Team
 + *
 + * Gromacs is a library for molecular simulation and trajectory analysis,
 + * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
 + * a full list of developers and information, check out http://www.gromacs.org
 + *
 + * This program is free software; you can redistribute it and/or modify it under
 + * the terms of the GNU Lesser General Public License as published by the Free
 + * Software Foundation; either version 2 of the License, or (at your option) any
 + * later version.
 + *
 + * To help fund GROMACS development, we humbly ask that you cite
 + * the papers people have written on it - you can find them on the website.
 + */
 +#ifdef HAVE_CONFIG_H
 +#include <config.h>
 +#endif
 +
 +#include <math.h>
 +
 +#include "../nb_kernel.h"
 +#include "types/simple.h"
 +#include "vec.h"
 +#include "nrnb.h"
 +
 +#include "gmx_math_x86_avx_128_fma_single.h"
 +#include "kernelutil_x86_avx_128_fma_single.h"
 +
 +/*
 + * Gromacs nonbonded kernel:   nb_kernel_ElecGB_VdwNone_GeomP1P1_VF_avx_128_fma_single
 + * Electrostatics interaction: GeneralizedBorn
 + * VdW interaction:            None
 + * Geometry:                   Particle-Particle
 + * Calculate force/pot:        PotentialAndForce
 + */
 +void
 +nb_kernel_ElecGB_VdwNone_GeomP1P1_VF_avx_128_fma_single
 +                    (t_nblist * gmx_restrict                nlist,
 +                     rvec * gmx_restrict                    xx,
 +                     rvec * gmx_restrict                    ff,
 +                     t_forcerec * gmx_restrict              fr,
 +                     t_mdatoms * gmx_restrict               mdatoms,
 +                     nb_kernel_data_t * gmx_restrict        kernel_data,
 +                     t_nrnb * gmx_restrict                  nrnb)
 +{
 +    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
 +     * just 0 for non-waters.
 +     * Suffixes A,B,C,D refer to j loop unrolling done with AVX_128, e.g. for the four different
 +     * jnr indices corresponding to data put in the four positions in the SIMD register.
 +     */
 +    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
 +    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
 +    int              jnrA,jnrB,jnrC,jnrD;
 +    int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
 +    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
 +    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
 +    real             rcutoff_scalar;
 +    real             *shiftvec,*fshift,*x,*f;
 +    real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
 +    real             scratch[4*DIM];
 +    __m128           fscal,rcutoff,rcutoff2,jidxall;
 +    int              vdwioffset0;
 +    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
 +    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
 +    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
 +    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
 +    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
 +    real             *charge;
 +    __m128i          gbitab;
 +    __m128           vgb,fgb,vgbsum,dvdasum,gbscale,gbtabscale,isaprod,gbqqfactor,gbinvepsdiff,gbeps,twogbeps,dvdatmp;
 +    __m128           minushalf = _mm_set1_ps(-0.5);
 +    real             *invsqrta,*dvda,*gbtab;
 +    __m128i          vfitab;
 +    __m128i          ifour       = _mm_set1_epi32(4);
 +    __m128           rt,vfeps,twovfeps,vftabscale,Y,F,G,H,Fp,VV,FF;
 +    real             *vftab;
 +    __m128           dummy_mask,cutoff_mask;
 +    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
 +    __m128           one     = _mm_set1_ps(1.0);
 +    __m128           two     = _mm_set1_ps(2.0);
 +    x                = xx[0];
 +    f                = ff[0];
 +
 +    nri              = nlist->nri;
 +    iinr             = nlist->iinr;
 +    jindex           = nlist->jindex;
 +    jjnr             = nlist->jjnr;
 +    shiftidx         = nlist->shift;
 +    gid              = nlist->gid;
 +    shiftvec         = fr->shift_vec[0];
 +    fshift           = fr->fshift[0];
 +    facel            = _mm_set1_ps(fr->epsfac);
 +    charge           = mdatoms->chargeA;
 +
 +    invsqrta         = fr->invsqrta;
 +    dvda             = fr->dvda;
 +    gbtabscale       = _mm_set1_ps(fr->gbtab.scale);
 +    gbtab            = fr->gbtab.data;
 +    gbinvepsdiff     = _mm_set1_ps((1.0/fr->epsilon_r) - (1.0/fr->gb_epsilon_solvent));
 +
 +    /* Avoid stupid compiler warnings */
 +    jnrA = jnrB = jnrC = jnrD = 0;
 +    j_coord_offsetA = 0;
 +    j_coord_offsetB = 0;
 +    j_coord_offsetC = 0;
 +    j_coord_offsetD = 0;
 +
 +    outeriter        = 0;
 +    inneriter        = 0;
 +
 +    for(iidx=0; iidx<4*DIM; iidx++)
 +    {
 +        scratch[iidx] = 0.0;
 +    }
 +
 +    /* Start outer loop over neighborlists */
 +    for(iidx=0; iidx<nri; iidx++)
 +    {
 +        /* Load shift vector for this list */
 +        i_shift_offset   = DIM*shiftidx[iidx];
 +
 +        /* Load limits for loop over neighbors */
 +        j_index_start    = jindex[iidx];
 +        j_index_end      = jindex[iidx+1];
 +
 +        /* Get outer coordinate index */
 +        inr              = iinr[iidx];
 +        i_coord_offset   = DIM*inr;
 +
 +        /* Load i particle coords and add shift vector */
 +        gmx_mm_load_shift_and_1rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
 +
 +        fix0             = _mm_setzero_ps();
 +        fiy0             = _mm_setzero_ps();
 +        fiz0             = _mm_setzero_ps();
 +
 +        /* Load parameters for i particles */
 +        iq0              = _mm_mul_ps(facel,_mm_load1_ps(charge+inr+0));
 +        isai0            = _mm_load1_ps(invsqrta+inr+0);
 +
 +        /* Reset potential sums */
 +        velecsum         = _mm_setzero_ps();
 +        vgbsum           = _mm_setzero_ps();
 +        dvdasum          = _mm_setzero_ps();
 +
 +        /* Start inner kernel loop */
 +        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
 +        {
 +
 +            /* Get j neighbor index, and coordinate index */
 +            jnrA             = jjnr[jidx];
 +            jnrB             = jjnr[jidx+1];
 +            jnrC             = jjnr[jidx+2];
 +            jnrD             = jjnr[jidx+3];
 +            j_coord_offsetA  = DIM*jnrA;
 +            j_coord_offsetB  = DIM*jnrB;
 +            j_coord_offsetC  = DIM*jnrC;
 +            j_coord_offsetD  = DIM*jnrD;
 +
 +            /* load j atom coordinates */
 +            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
 +                                              x+j_coord_offsetC,x+j_coord_offsetD,
 +                                              &jx0,&jy0,&jz0);
 +
 +            /* Calculate displacement vector */
 +            dx00             = _mm_sub_ps(ix0,jx0);
 +            dy00             = _mm_sub_ps(iy0,jy0);
 +            dz00             = _mm_sub_ps(iz0,jz0);
 +
 +            /* Calculate squared distance and things based on it */
 +            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
 +
 +            rinv00           = gmx_mm_invsqrt_ps(rsq00);
 +
 +            /* Load parameters for j particles */
 +            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
 +                                                              charge+jnrC+0,charge+jnrD+0);
 +            isaj0            = gmx_mm_load_4real_swizzle_ps(invsqrta+jnrA+0,invsqrta+jnrB+0,
 +                                                              invsqrta+jnrC+0,invsqrta+jnrD+0);
 +
 +            /**************************
 +             * CALCULATE INTERACTIONS *
 +             **************************/
 +
 +            r00              = _mm_mul_ps(rsq00,rinv00);
 +
 +            /* Compute parameters for interactions between i and j atoms */
 +            qq00             = _mm_mul_ps(iq0,jq0);
 +
 +            /* GENERALIZED BORN AND COULOMB ELECTROSTATICS */
 +            isaprod          = _mm_mul_ps(isai0,isaj0);
 +            gbqqfactor       = _mm_xor_ps(signbit,_mm_mul_ps(qq00,_mm_mul_ps(isaprod,gbinvepsdiff)));
 +            gbscale          = _mm_mul_ps(isaprod,gbtabscale);
 +
 +            /* Calculate generalized born table index - this is a separate table from the normal one,
 +             * but we use the same procedure: multiply r by the scale and truncate to an integer.
 +             */
 +            rt               = _mm_mul_ps(r00,gbscale);
 +            gbitab           = _mm_cvttps_epi32(rt);
 +#ifdef __XOP__
 +            gbeps            = _mm_frcz_ps(rt);
 +#else
 +            gbeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
 +#endif
 +            gbitab           = _mm_slli_epi32(gbitab,2);
 +
 +            Y                = _mm_load_ps( gbtab + _mm_extract_epi32(gbitab,0) );
 +            F                = _mm_load_ps( gbtab + _mm_extract_epi32(gbitab,1) );
 +            G                = _mm_load_ps( gbtab + _mm_extract_epi32(gbitab,2) );
 +            H                = _mm_load_ps( gbtab + _mm_extract_epi32(gbitab,3) );
 +            _MM_TRANSPOSE4_PS(Y,F,G,H);
 +            Fp               = _mm_macc_ps(gbeps,_mm_macc_ps(gbeps,H,G),F);
 +            VV               = _mm_macc_ps(gbeps,Fp,Y);
 +            vgb              = _mm_mul_ps(gbqqfactor,VV);
 +
 +            twogbeps         = _mm_add_ps(gbeps,gbeps);
 +            FF               = _mm_macc_ps(_mm_macc_ps(twogbeps,H,G),gbeps,Fp);
 +            fgb              = _mm_mul_ps(gbqqfactor,_mm_mul_ps(FF,gbscale));
 +            dvdatmp          = _mm_mul_ps(minushalf,_mm_macc_ps(fgb,r00,vgb));
 +            dvdasum          = _mm_add_ps(dvdasum,dvdatmp);
 +            fjptrA           = dvda+jnrA;
 +            fjptrB           = dvda+jnrB;
 +            fjptrC           = dvda+jnrC;
 +            fjptrD           = dvda+jnrD;
 +            gmx_mm_increment_4real_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,_mm_mul_ps(dvdatmp,_mm_mul_ps(isaj0,isaj0)));
 +            velec            = _mm_mul_ps(qq00,rinv00);
 +            felec            = _mm_mul_ps(_mm_msub_ps(velec,rinv00,fgb),rinv00);
 +
 +            /* Update potential sum for this i atom from the interaction with this j atom. */
 +            velecsum         = _mm_add_ps(velecsum,velec);
 +            vgbsum           = _mm_add_ps(vgbsum,vgb);
 +
 +            fscal            = felec;
 +
 +            /* Update vectorial force */
 +            fix0             = _mm_macc_ps(dx00,fscal,fix0);
 +            fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
 +            fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
 +
 +            fjptrA             = f+j_coord_offsetA;
 +            fjptrB             = f+j_coord_offsetB;
 +            fjptrC             = f+j_coord_offsetC;
 +            fjptrD             = f+j_coord_offsetD;
 +            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
 +                                                   _mm_mul_ps(dx00,fscal),
 +                                                   _mm_mul_ps(dy00,fscal),
 +                                                   _mm_mul_ps(dz00,fscal));
 +
 +            /* Inner loop uses 61 flops */
 +        }
 +
 +        if(jidx<j_index_end)
 +        {
 +
 +            /* Get j neighbor index, and coordinate index */
 +            jnrlistA         = jjnr[jidx];
 +            jnrlistB         = jjnr[jidx+1];
 +            jnrlistC         = jjnr[jidx+2];
 +            jnrlistD         = jjnr[jidx+3];
 +            /* The sign of each element will be negative for non-real atoms.
 +             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
 +             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
 +             */
 +            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 +            jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
 +            jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
 +            jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
 +            jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
 +            j_coord_offsetA  = DIM*jnrA;
 +            j_coord_offsetB  = DIM*jnrB;
 +            j_coord_offsetC  = DIM*jnrC;
 +            j_coord_offsetD  = DIM*jnrD;
 +
 +            /* load j atom coordinates */
 +            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
 +                                              x+j_coord_offsetC,x+j_coord_offsetD,
 +                                              &jx0,&jy0,&jz0);
 +
 +            /* Calculate displacement vector */
 +            dx00             = _mm_sub_ps(ix0,jx0);
 +            dy00             = _mm_sub_ps(iy0,jy0);
 +            dz00             = _mm_sub_ps(iz0,jz0);
 +
 +            /* Calculate squared distance and things based on it */
 +            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
 +
 +            rinv00           = gmx_mm_invsqrt_ps(rsq00);
 +
 +            /* Load parameters for j particles */
 +            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
 +                                                              charge+jnrC+0,charge+jnrD+0);
 +            isaj0            = gmx_mm_load_4real_swizzle_ps(invsqrta+jnrA+0,invsqrta+jnrB+0,
 +                                                              invsqrta+jnrC+0,invsqrta+jnrD+0);
 +
 +            /**************************
 +             * CALCULATE INTERACTIONS *
 +             **************************/
 +
 +            r00              = _mm_mul_ps(rsq00,rinv00);
 +            r00              = _mm_andnot_ps(dummy_mask,r00);
 +
 +            /* Compute parameters for interactions between i and j atoms */
 +            qq00             = _mm_mul_ps(iq0,jq0);
 +
 +            /* GENERALIZED BORN AND COULOMB ELECTROSTATICS */
 +            isaprod          = _mm_mul_ps(isai0,isaj0);
 +            gbqqfactor       = _mm_xor_ps(signbit,_mm_mul_ps(qq00,_mm_mul_ps(isaprod,gbinvepsdiff)));
 +            gbscale          = _mm_mul_ps(isaprod,gbtabscale);
 +
 +            /* Calculate generalized born table index - this is a separate table from the normal one,
 +             * but we use the same procedure: multiply r by the scale and truncate to an integer.
 +             */
 +            rt               = _mm_mul_ps(r00,gbscale);
 +            gbitab           = _mm_cvttps_epi32(rt);
 +#ifdef __XOP__
 +            gbeps            = _mm_frcz_ps(rt);
 +#else
 +            gbeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
 +#endif
 +            gbitab           = _mm_slli_epi32(gbitab,2);
 +
 +            Y                = _mm_load_ps( gbtab + _mm_extract_epi32(gbitab,0) );
 +            F                = _mm_load_ps( gbtab + _mm_extract_epi32(gbitab,1) );
 +            G                = _mm_load_ps( gbtab + _mm_extract_epi32(gbitab,2) );
 +            H                = _mm_load_ps( gbtab + _mm_extract_epi32(gbitab,3) );
 +            _MM_TRANSPOSE4_PS(Y,F,G,H);
 +            Fp               = _mm_macc_ps(gbeps,_mm_macc_ps(gbeps,H,G),F);
 +            VV               = _mm_macc_ps(gbeps,Fp,Y);
 +            vgb              = _mm_mul_ps(gbqqfactor,VV);
 +
 +            twogbeps         = _mm_add_ps(gbeps,gbeps);
 +            FF               = _mm_macc_ps(_mm_macc_ps(twogbeps,H,G),gbeps,Fp);
 +            fgb              = _mm_mul_ps(gbqqfactor,_mm_mul_ps(FF,gbscale));
 +            dvdatmp          = _mm_mul_ps(minushalf,_mm_macc_ps(fgb,r00,vgb));
++            dvdatmp          = _mm_andnot_ps(dummy_mask,dvdatmp);
 +            dvdasum          = _mm_add_ps(dvdasum,dvdatmp);
 +            /* The pointers to scratch make sure that this code really can't screw things up, even with compilers that take gmx_restrict seriously (e.g. icc 13). */
 +            fjptrA             = (jnrlistA>=0) ? dvda+jnrA : scratch;
 +            fjptrB             = (jnrlistB>=0) ? dvda+jnrB : scratch;
 +            fjptrC             = (jnrlistC>=0) ? dvda+jnrC : scratch;
 +            fjptrD             = (jnrlistD>=0) ? dvda+jnrD : scratch;
 +            gmx_mm_increment_4real_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,_mm_mul_ps(dvdatmp,_mm_mul_ps(isaj0,isaj0)));
 +            velec            = _mm_mul_ps(qq00,rinv00);
 +            felec            = _mm_mul_ps(_mm_msub_ps(velec,rinv00,fgb),rinv00);
 +
 +            /* Update potential sum for this i atom from the interaction with this j atom. */
 +            velec            = _mm_andnot_ps(dummy_mask,velec);
 +            velecsum         = _mm_add_ps(velecsum,velec);
 +            vgb              = _mm_andnot_ps(dummy_mask,vgb);
 +            vgbsum           = _mm_add_ps(vgbsum,vgb);
 +
 +            fscal            = felec;
 +
 +            fscal            = _mm_andnot_ps(dummy_mask,fscal);
 +
 +            /* Update vectorial force */
 +            fix0             = _mm_macc_ps(dx00,fscal,fix0);
 +            fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
 +            fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
 +
 +            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
 +            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
 +            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
 +            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
 +            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
 +                                                   _mm_mul_ps(dx00,fscal),
 +                                                   _mm_mul_ps(dy00,fscal),
 +                                                   _mm_mul_ps(dz00,fscal));
 +
 +            /* Inner loop uses 62 flops */
 +        }
 +
 +        /* End of innermost loop */
 +
 +        gmx_mm_update_iforce_1atom_swizzle_ps(fix0,fiy0,fiz0,
 +                                              f+i_coord_offset,fshift+i_shift_offset);
 +
 +        ggid                        = gid[iidx];
 +        /* Update potential energies */
 +        gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
 +        gmx_mm_update_1pot_ps(vgbsum,kernel_data->energygrp_polarization+ggid);
 +        dvdasum = _mm_mul_ps(dvdasum, _mm_mul_ps(isai0,isai0));
 +        gmx_mm_update_1pot_ps(dvdasum,dvda+inr);
 +
 +        /* Increment number of inner iterations */
 +        inneriter                  += j_index_end - j_index_start;
 +
 +        /* Outer loop uses 9 flops */
 +    }
 +
 +    /* Increment number of outer iterations */
 +    outeriter        += nri;
 +
 +    /* Update outer/inner flops */
 +
 +    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VF,outeriter*9 + inneriter*62);
 +}
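 +
 +/* A scalar sketch of the Generalized Born table lookup in the kernels above
 + * (illustration only, not generated code; gb_table_ref is a hypothetical
 + * name). Each table point packs four values Y,F,G,H; r is scaled, split
 + * into an integer bin and a fractional eps, and the potential and its
 + * derivative are evaluated from the cubic form, matching what the SIMD
 + * code computes after its gather and transpose.
 + */
 +static void
 +gb_table_ref(const real *tab, real r, real scale, real qqfactor,
 +             real *vgb, real *fgb)
 +{
 +    real        rt  = r*scale;
 +    int         n   = (int)rt;             /* truncate, as _mm_cvttps_epi32 */
 +    real        eps = rt - n;              /* fractional distance into bin  */
 +    const real *p   = tab + 4*n;           /* table stride of 4: Y,F,G,H    */
 +    real        Y = p[0], F = p[1], G = p[2], H = p[3];
 +
 +    real Fp = F + eps*(G + eps*H);                        /* interpolated F    */
 +    *vgb    = qqfactor*(Y + eps*Fp);                      /* potential         */
 +    *fgb    = qqfactor*(Fp + eps*(G + 2.0*eps*H))*scale;  /* scaled derivative */
 +}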
 +/*
 + * Gromacs nonbonded kernel:   nb_kernel_ElecGB_VdwNone_GeomP1P1_F_avx_128_fma_single
 + * Electrostatics interaction: GeneralizedBorn
 + * VdW interaction:            None
 + * Geometry:                   Particle-Particle
 + * Calculate force/pot:        Force
 + */
 +void
 +nb_kernel_ElecGB_VdwNone_GeomP1P1_F_avx_128_fma_single
 +                    (t_nblist * gmx_restrict                nlist,
 +                     rvec * gmx_restrict                    xx,
 +                     rvec * gmx_restrict                    ff,
 +                     t_forcerec * gmx_restrict              fr,
 +                     t_mdatoms * gmx_restrict               mdatoms,
 +                     nb_kernel_data_t * gmx_restrict        kernel_data,
 +                     t_nrnb * gmx_restrict                  nrnb)
 +{
 +    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
 +     * just 0 for non-waters.
 +     * Suffixes A,B,C,D refer to j loop unrolling done with AVX_128, e.g. for the four different
 +     * jnr indices corresponding to data put in the four positions in the SIMD register.
 +     */
 +    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
 +    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
 +    int              jnrA,jnrB,jnrC,jnrD;
 +    int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
 +    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
 +    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
 +    real             rcutoff_scalar;
 +    real             *shiftvec,*fshift,*x,*f;
 +    real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
 +    real             scratch[4*DIM];
 +    __m128           fscal,rcutoff,rcutoff2,jidxall;
 +    int              vdwioffset0;
 +    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
 +    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
 +    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
 +    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
 +    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
 +    real             *charge;
 +    __m128i          gbitab;
 +    __m128           vgb,fgb,vgbsum,dvdasum,gbscale,gbtabscale,isaprod,gbqqfactor,gbinvepsdiff,gbeps,twogbeps,dvdatmp;
 +    __m128           minushalf = _mm_set1_ps(-0.5);
 +    real             *invsqrta,*dvda,*gbtab;
 +    __m128i          vfitab;
 +    __m128i          ifour       = _mm_set1_epi32(4);
 +    __m128           rt,vfeps,twovfeps,vftabscale,Y,F,G,H,Fp,VV,FF;
 +    real             *vftab;
 +    __m128           dummy_mask,cutoff_mask;
 +    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
 +    __m128           one     = _mm_set1_ps(1.0);
 +    __m128           two     = _mm_set1_ps(2.0);
 +    x                = xx[0];
 +    f                = ff[0];
 +
 +    nri              = nlist->nri;
 +    iinr             = nlist->iinr;
 +    jindex           = nlist->jindex;
 +    jjnr             = nlist->jjnr;
 +    shiftidx         = nlist->shift;
 +    gid              = nlist->gid;
 +    shiftvec         = fr->shift_vec[0];
 +    fshift           = fr->fshift[0];
 +    facel            = _mm_set1_ps(fr->epsfac);
 +    charge           = mdatoms->chargeA;
 +
 +    invsqrta         = fr->invsqrta;
 +    dvda             = fr->dvda;
 +    gbtabscale       = _mm_set1_ps(fr->gbtab.scale);
 +    gbtab            = fr->gbtab.data;
 +    gbinvepsdiff     = _mm_set1_ps((1.0/fr->epsilon_r) - (1.0/fr->gb_epsilon_solvent));
 +
 +    /* Avoid stupid compiler warnings */
 +    jnrA = jnrB = jnrC = jnrD = 0;
 +    j_coord_offsetA = 0;
 +    j_coord_offsetB = 0;
 +    j_coord_offsetC = 0;
 +    j_coord_offsetD = 0;
 +
 +    outeriter        = 0;
 +    inneriter        = 0;
 +
 +    for(iidx=0; iidx<4*DIM; iidx++)
 +    {
 +        scratch[iidx] = 0.0;
 +    }
 +
 +    /* Start outer loop over neighborlists */
 +    for(iidx=0; iidx<nri; iidx++)
 +    {
 +        /* Load shift vector for this list */
 +        i_shift_offset   = DIM*shiftidx[iidx];
 +
 +        /* Load limits for loop over neighbors */
 +        j_index_start    = jindex[iidx];
 +        j_index_end      = jindex[iidx+1];
 +
 +        /* Get outer coordinate index */
 +        inr              = iinr[iidx];
 +        i_coord_offset   = DIM*inr;
 +
 +        /* Load i particle coords and add shift vector */
 +        gmx_mm_load_shift_and_1rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
 +
 +        fix0             = _mm_setzero_ps();
 +        fiy0             = _mm_setzero_ps();
 +        fiz0             = _mm_setzero_ps();
 +
 +        /* Load parameters for i particles */
 +        iq0              = _mm_mul_ps(facel,_mm_load1_ps(charge+inr+0));
 +        isai0            = _mm_load1_ps(invsqrta+inr+0);
 +
 +        dvdasum          = _mm_setzero_ps();
 +
 +        /* Start inner kernel loop */
 +        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
 +        {
 +
 +            /* Get j neighbor index, and coordinate index */
 +            jnrA             = jjnr[jidx];
 +            jnrB             = jjnr[jidx+1];
 +            jnrC             = jjnr[jidx+2];
 +            jnrD             = jjnr[jidx+3];
 +            j_coord_offsetA  = DIM*jnrA;
 +            j_coord_offsetB  = DIM*jnrB;
 +            j_coord_offsetC  = DIM*jnrC;
 +            j_coord_offsetD  = DIM*jnrD;
 +
 +            /* load j atom coordinates */
 +            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
 +                                              x+j_coord_offsetC,x+j_coord_offsetD,
 +                                              &jx0,&jy0,&jz0);
 +
 +            /* Calculate displacement vector */
 +            dx00             = _mm_sub_ps(ix0,jx0);
 +            dy00             = _mm_sub_ps(iy0,jy0);
 +            dz00             = _mm_sub_ps(iz0,jz0);
 +
 +            /* Calculate squared distance and things based on it */
 +            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
 +
 +            rinv00           = gmx_mm_invsqrt_ps(rsq00);
 +
 +            /* Load parameters for j particles */
 +            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
 +                                                              charge+jnrC+0,charge+jnrD+0);
 +            isaj0            = gmx_mm_load_4real_swizzle_ps(invsqrta+jnrA+0,invsqrta+jnrB+0,
 +                                                              invsqrta+jnrC+0,invsqrta+jnrD+0);
 +
 +            /**************************
 +             * CALCULATE INTERACTIONS *
 +             **************************/
 +
 +            r00              = _mm_mul_ps(rsq00,rinv00);
 +
 +            /* Compute parameters for interactions between i and j atoms */
 +            qq00             = _mm_mul_ps(iq0,jq0);
 +
 +            /* GENERALIZED BORN AND COULOMB ELECTROSTATICS */
 +            isaprod          = _mm_mul_ps(isai0,isaj0);
 +            gbqqfactor       = _mm_xor_ps(signbit,_mm_mul_ps(qq00,_mm_mul_ps(isaprod,gbinvepsdiff)));
 +            gbscale          = _mm_mul_ps(isaprod,gbtabscale);
 +
 +            /* Calculate generalized born table index - this is a separate table from the normal one,
 +             * but we use the same procedure: multiply r by the scale and truncate to an integer.
 +             */
 +            rt               = _mm_mul_ps(r00,gbscale);
 +            gbitab           = _mm_cvttps_epi32(rt);
 +#ifdef __XOP__
 +            gbeps            = _mm_frcz_ps(rt);
 +#else
 +            gbeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
 +#endif
 +            gbitab           = _mm_slli_epi32(gbitab,2);
 +
 +            Y                = _mm_load_ps( gbtab + _mm_extract_epi32(gbitab,0) );
 +            F                = _mm_load_ps( gbtab + _mm_extract_epi32(gbitab,1) );
 +            G                = _mm_load_ps( gbtab + _mm_extract_epi32(gbitab,2) );
 +            H                = _mm_load_ps( gbtab + _mm_extract_epi32(gbitab,3) );
 +            _MM_TRANSPOSE4_PS(Y,F,G,H);
 +            Fp               = _mm_macc_ps(gbeps,_mm_macc_ps(gbeps,H,G),F);
 +            VV               = _mm_macc_ps(gbeps,Fp,Y);
 +            vgb              = _mm_mul_ps(gbqqfactor,VV);
 +
 +            twogbeps         = _mm_add_ps(gbeps,gbeps);
 +            FF               = _mm_macc_ps(_mm_macc_ps(twogbeps,H,G),gbeps,Fp);
 +            fgb              = _mm_mul_ps(gbqqfactor,_mm_mul_ps(FF,gbscale));
 +            dvdatmp          = _mm_mul_ps(minushalf,_mm_macc_ps(fgb,r00,vgb));
 +            dvdasum          = _mm_add_ps(dvdasum,dvdatmp);
 +            fjptrA           = dvda+jnrA;
 +            fjptrB           = dvda+jnrB;
 +            fjptrC           = dvda+jnrC;
 +            fjptrD           = dvda+jnrD;
 +            gmx_mm_increment_4real_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,_mm_mul_ps(dvdatmp,_mm_mul_ps(isaj0,isaj0)));
 +            velec            = _mm_mul_ps(qq00,rinv00);
 +            felec            = _mm_mul_ps(_mm_msub_ps(velec,rinv00,fgb),rinv00);
 +
 +            fscal            = felec;
 +
 +            /* Update vectorial force */
 +            fix0             = _mm_macc_ps(dx00,fscal,fix0);
 +            fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
 +            fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
 +
 +            fjptrA             = f+j_coord_offsetA;
 +            fjptrB             = f+j_coord_offsetB;
 +            fjptrC             = f+j_coord_offsetC;
 +            fjptrD             = f+j_coord_offsetD;
 +            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
 +                                                   _mm_mul_ps(dx00,fscal),
 +                                                   _mm_mul_ps(dy00,fscal),
 +                                                   _mm_mul_ps(dz00,fscal));
 +
 +            /* Inner loop uses 59 flops */
 +        }
 +
 +        if(jidx<j_index_end)
 +        {
 +
 +            /* Get j neighbor index, and coordinate index */
 +            jnrlistA         = jjnr[jidx];
 +            jnrlistB         = jjnr[jidx+1];
 +            jnrlistC         = jjnr[jidx+2];
 +            jnrlistD         = jjnr[jidx+3];
 +            /* The sign of each element will be negative for non-real atoms.
 +             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
 +             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
 +             */
 +            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 +            jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
 +            jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
 +            jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
 +            jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
 +            j_coord_offsetA  = DIM*jnrA;
 +            j_coord_offsetB  = DIM*jnrB;
 +            j_coord_offsetC  = DIM*jnrC;
 +            j_coord_offsetD  = DIM*jnrD;
 +
 +            /* load j atom coordinates */
 +            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
 +                                              x+j_coord_offsetC,x+j_coord_offsetD,
 +                                              &jx0,&jy0,&jz0);
 +
 +            /* Calculate displacement vector */
 +            dx00             = _mm_sub_ps(ix0,jx0);
 +            dy00             = _mm_sub_ps(iy0,jy0);
 +            dz00             = _mm_sub_ps(iz0,jz0);
 +
 +            /* Calculate squared distance and things based on it */
 +            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
 +
 +            rinv00           = gmx_mm_invsqrt_ps(rsq00);
 +
 +            /* Load parameters for j particles */
 +            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
 +                                                              charge+jnrC+0,charge+jnrD+0);
 +            isaj0            = gmx_mm_load_4real_swizzle_ps(invsqrta+jnrA+0,invsqrta+jnrB+0,
 +                                                              invsqrta+jnrC+0,invsqrta+jnrD+0);
 +
 +            /**************************
 +             * CALCULATE INTERACTIONS *
 +             **************************/
 +
 +            r00              = _mm_mul_ps(rsq00,rinv00);
 +            r00              = _mm_andnot_ps(dummy_mask,r00);
 +
 +            /* Compute parameters for interactions between i and j atoms */
 +            qq00             = _mm_mul_ps(iq0,jq0);
 +
 +            /* GENERALIZED BORN AND COULOMB ELECTROSTATICS */
 +            isaprod          = _mm_mul_ps(isai0,isaj0);
 +            gbqqfactor       = _mm_xor_ps(signbit,_mm_mul_ps(qq00,_mm_mul_ps(isaprod,gbinvepsdiff)));
 +            gbscale          = _mm_mul_ps(isaprod,gbtabscale);
 +
 +            /* Calculate generalized born table index - this is a separate table from the normal one,
 +             * but we use the same procedure: multiply r by the scale and truncate to an integer.
 +             */
 +            rt               = _mm_mul_ps(r00,gbscale);
 +            gbitab           = _mm_cvttps_epi32(rt);
 +#ifdef __XOP__
 +            gbeps            = _mm_frcz_ps(rt);
 +#else
 +            gbeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
 +#endif
 +            gbitab           = _mm_slli_epi32(gbitab,2);
 +
 +            Y                = _mm_load_ps( gbtab + _mm_extract_epi32(gbitab,0) );
 +            F                = _mm_load_ps( gbtab + _mm_extract_epi32(gbitab,1) );
 +            G                = _mm_load_ps( gbtab + _mm_extract_epi32(gbitab,2) );
 +            H                = _mm_load_ps( gbtab + _mm_extract_epi32(gbitab,3) );
 +            _MM_TRANSPOSE4_PS(Y,F,G,H);
 +            Fp               = _mm_macc_ps(gbeps,_mm_macc_ps(gbeps,H,G),F);
 +            VV               = _mm_macc_ps(gbeps,Fp,Y);
 +            vgb              = _mm_mul_ps(gbqqfactor,VV);
 +
 +            twogbeps         = _mm_add_ps(gbeps,gbeps);
 +            FF               = _mm_macc_ps(_mm_macc_ps(twogbeps,H,G),gbeps,Fp);
 +            fgb              = _mm_mul_ps(gbqqfactor,_mm_mul_ps(FF,gbscale));
 +            dvdatmp          = _mm_mul_ps(minushalf,_mm_macc_ps(fgb,r00,vgb));
++            dvdatmp          = _mm_andnot_ps(dummy_mask,dvdatmp);
 +            dvdasum          = _mm_add_ps(dvdasum,dvdatmp);
 +            /* The pointers to scratch make sure that this code really can't screw things up, even with compilers that take gmx_restrict seriously (e.g. icc 13). */
 +            fjptrA             = (jnrlistA>=0) ? dvda+jnrA : scratch;
 +            fjptrB             = (jnrlistB>=0) ? dvda+jnrB : scratch;
 +            fjptrC             = (jnrlistC>=0) ? dvda+jnrC : scratch;
 +            fjptrD             = (jnrlistD>=0) ? dvda+jnrD : scratch;
 +            gmx_mm_increment_4real_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,_mm_mul_ps(dvdatmp,_mm_mul_ps(isaj0,isaj0)));
 +            velec            = _mm_mul_ps(qq00,rinv00);
 +            felec            = _mm_mul_ps(_mm_msub_ps(velec,rinv00,fgb),rinv00);
 +
 +            fscal            = felec;
 +
 +            fscal            = _mm_andnot_ps(dummy_mask,fscal);
 +
 +            /* Update vectorial force */
 +            fix0             = _mm_macc_ps(dx00,fscal,fix0);
 +            fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
 +            fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
 +
 +            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
 +            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
 +            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
 +            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
 +            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
 +                                                   _mm_mul_ps(dx00,fscal),
 +                                                   _mm_mul_ps(dy00,fscal),
 +                                                   _mm_mul_ps(dz00,fscal));
 +
 +            /* Inner loop uses 60 flops */
 +        }
 +
 +        /* End of innermost loop */
 +
 +        gmx_mm_update_iforce_1atom_swizzle_ps(fix0,fiy0,fiz0,
 +                                              f+i_coord_offset,fshift+i_shift_offset);
 +
 +        dvdasum = _mm_mul_ps(dvdasum, _mm_mul_ps(isai0,isai0));
 +        gmx_mm_update_1pot_ps(dvdasum,dvda+inr);
 +
 +        /* Increment number of inner iterations */
 +        inneriter                  += j_index_end - j_index_start;
 +
 +        /* Outer loop uses 7 flops */
 +    }
 +
 +    /* Increment number of outer iterations */
 +    outeriter        += nri;
 +
 +    /* Update outer/inner flops */
 +
 +    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_F,outeriter*7 + inneriter*60);
 +}
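 +
 +/* A scalar sketch of the dvda (Born-radius derivative) bookkeeping shared
 + * by all GB kernels above (illustration only, not generated code; dvda_ref
 + * is a hypothetical name). The j contribution is scaled by isaj^2 and
 + * scattered immediately, while the i contributions are accumulated and
 + * scaled by isai^2 once per outer iteration.
 + */
 +static void
 +dvda_ref(real vgb, real fgb, real r, real isaj,
 +         real *dvdasum, real *dvda_j)
 +{
 +    real dvdatmp = -0.5*(fgb*r + vgb);   /* minushalf*(fgb*r00 + vgb) */
 +
 +    *dvda_j  += dvdatmp*isaj*isaj;       /* per-j scatter               */
 +    *dvdasum += dvdatmp;                 /* i sum; *isai*isai at the end */
 +}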
index 24d7dcc3d5ec046917bd2bc2dbc049f6ce3fbd38,0000000000000000000000000000000000000000..e99699cbbffcff16655224e87b5aeefc399212d3
mode 100644,000000..100644
--- /dev/null
@@@ -1,1027 -1,0 +1,1030 @@@
 +/* #if 0 */
 +#error This file must be processed with the Gromacs pre-preprocessor
 +/* #endif */
 +/* #if INCLUDE_HEADER */
 +#ifdef HAVE_CONFIG_H
 +#include <config.h>
 +#endif
 +
 +#include <math.h>
 +
 +#include "../nb_kernel.h"
 +#include "types/simple.h"
 +#include "vec.h"
 +#include "nrnb.h"
 +
 +#include "gmx_math_x86_avx_128_fma_single.h"
 +#include "kernelutil_x86_avx_128_fma_single.h"
 +/* #endif */
 +
 +/* ## List of variables set by the generating script:                                    */
 +/* ##                                                                                    */
 +/* ## Settings that apply to the entire kernel:                                          */
 +/* ## KERNEL_ELEC:           String, choice for electrostatic interactions               */
 +/* ## KERNEL_VDW:            String, choice for van der Waals interactions               */
 +/* ## KERNEL_NAME:           String, name of this kernel                                 */
 +/* ## KERNEL_VF:             String telling if we calculate potential, force, or both    */
 +/* ## GEOMETRY_I/GEOMETRY_J: String, name of each geometry, e.g. 'Water3' or '1Particle' */
 +/* ##                                                                                    */
 +/* ## Settings that apply to particles in the outer (I) or inner (J) loops:              */
 +/* ## PARTICLES_I[]/         Arrays with lists of i/j particles to use in kernel. It is  */
 +/* ## PARTICLES_J[]:         just [0] for particle geometry, but can be longer for water */
 +/* ## PARTICLES_ELEC_I[]/    Arrays with lists of i/j particle that have electrostatics  */
 +/* ## PARTICLES_ELEC_J[]:    interactions that should be calculated in this kernel.      */
 +/* ## PARTICLES_VDW_I[]/     Arrays with the list of i/j particle that have VdW          */
 +/* ## PARTICLES_VDW_J[]:     interactions that should be calculated in this kernel.      */
 +/* ##                                                                                    */
 +/* ## Settings for pairs of interactions (e.g. 2nd i particle against 1st j particle)    */
 +/* ## PAIRS_IJ[]:            Array with (i,j) tuples of pairs for which interactions     */
 +/* ##                        should be calculated in this kernel. Zero-charge particles  */
 +/* ##                        do not have interactions with particles without vdw, and    */
 +/* ##                        Vdw-only interactions are not evaluated in a no-vdw-kernel. */
 +/* ## INTERACTION_FLAGS[][]: 2D matrix, dimension e.g. 3*3 for water-water interactions. */
 +/* ##                        For each i-j pair, the element [I][J] is a list of strings  */
 +/* ##                        defining properties/flags of this interaction. Examples     */
 +/* ##                        include 'electrostatics'/'vdw' if that type of interaction  */
 +/* ##                        should be evaluated, 'rsq'/'rinv'/'rinvsq' if those values  */
 +/* ##                        are needed, and 'exactcutoff' or 'shift','switch' to        */
 +/* ##                        decide if the force/potential should be modified. This way  */
 +/* ##                        we only calculate values absolutely needed for each case.   */
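 +/* ## Example (illustrative sketch, not generator output): with GEOMETRY_I set to       */
 +/* ## 'Water3' the script uses PARTICLES_I=[0,1,2], so a '#for I in PARTICLES_I' block  */
 +/* ## around a line such as 'fixI = _mm_setzero_ps();' expands into the three           */
 +/* ## statements fix0 = ..., fix1 = ..., fix2 = _mm_setzero_ps(); seen in this file.    */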
 +
 +/* ## Calculate the size and offset for (merged/interleaved) table data */
 +
 +/*
 + * Gromacs nonbonded kernel:   {KERNEL_NAME}
 + * Electrostatics interaction: {KERNEL_ELEC}
 + * VdW interaction:            {KERNEL_VDW}
 + * Geometry:                   {GEOMETRY_I}-{GEOMETRY_J}
 + * Calculate force/pot:        {KERNEL_VF}
 + */
 +void
 +{KERNEL_NAME}
 +                    (t_nblist * gmx_restrict                nlist,
 +                     rvec * gmx_restrict                    xx,
 +                     rvec * gmx_restrict                    ff,
 +                     t_forcerec * gmx_restrict              fr,
 +                     t_mdatoms * gmx_restrict               mdatoms,
 +                     nb_kernel_data_t * gmx_restrict        kernel_data,
 +                     t_nrnb * gmx_restrict                  nrnb)
 +{
 +    /* ## Not all variables are used for all kernels, but any optimizing compiler fixes that, */
 +    /* ## so there is no point in going to extremes to exclude variables that are not needed. */
 +    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
 +     * just 0 for non-waters.
 +     * Suffixes A,B,C,D refer to j loop unrolling done with AVX_128, e.g. for the four different
 +     * jnr indices corresponding to data put in the four positions in the SIMD register.
 +     */
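 +    /* ## Illustrative reading of the naming scheme: dx00 below holds ix0-jx0 for all   */
 +    /* ## four unrolled j atoms A..D at once, and fjptrA..fjptrD point at the force     */
 +    /* ## arrays (or scratch) through which those four atoms are updated.               */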
 +    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
 +    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
 +    int              jnrA,jnrB,jnrC,jnrD;
 +    int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
 +    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
 +    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
 +    real             rcutoff_scalar;
 +    real             *shiftvec,*fshift,*x,*f;
 +    real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
 +    real             scratch[4*DIM];
 +    __m128           fscal,rcutoff,rcutoff2,jidxall;
 +    /* #for I in PARTICLES_I */
 +    int              vdwioffset{I};
 +    __m128           ix{I},iy{I},iz{I},fix{I},fiy{I},fiz{I},iq{I},isai{I};
 +    /* #endfor */
 +    /* #for J in PARTICLES_J */
 +    int              vdwjidx{J}A,vdwjidx{J}B,vdwjidx{J}C,vdwjidx{J}D;
 +    __m128           jx{J},jy{J},jz{J},fjx{J},fjy{J},fjz{J},jq{J},isaj{J};
 +    /* #endfor */
 +    /* #for I,J in PAIRS_IJ */
 +    __m128           dx{I}{J},dy{I}{J},dz{I}{J},rsq{I}{J},rinv{I}{J},rinvsq{I}{J},r{I}{J},qq{I}{J},c6_{I}{J},c12_{I}{J};
 +    /* #endfor */
 +    /* #if KERNEL_ELEC != 'None' */
 +    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
 +    real             *charge;
 +    /* #endif */
 +    /* #if 'GeneralizedBorn' in KERNEL_ELEC */
 +    __m128i          gbitab;
 +    __m128           vgb,fgb,vgbsum,dvdasum,gbscale,gbtabscale,isaprod,gbqqfactor,gbinvepsdiff,gbeps,twogbeps,dvdatmp;
 +    __m128           minushalf = _mm_set1_ps(-0.5);
 +    real             *invsqrta,*dvda,*gbtab;
 +    /* #endif */
 +    /* #if KERNEL_VDW != 'None' */
 +    int              nvdwtype;
 +    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
 +    int              *vdwtype;
 +    real             *vdwparam;
 +    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
 +    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
 +    /* #endif */
 +    /* #if 'Table' in KERNEL_ELEC or 'GeneralizedBorn' in KERNEL_ELEC or 'Table' in KERNEL_VDW */
 +    __m128i          vfitab;
 +    __m128i          ifour       = _mm_set1_epi32(4);
 +    __m128           rt,vfeps,twovfeps,vftabscale,Y,F,G,H,Fp,VV,FF;
 +    real             *vftab;
 +    /* #endif */
 +    /* #if 'Ewald' in KERNEL_ELEC */
 +    __m128i          ewitab;
 +    __m128           ewtabscale,eweps,twoeweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
 +    __m128           beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
 +    real             *ewtab;
 +    /* #endif */
 +    /* #if 'PotentialSwitch' in [KERNEL_MOD_ELEC,KERNEL_MOD_VDW] */
 +    __m128           rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
 +    real             rswitch_scalar,d_scalar;
 +    /* #endif */
 +    __m128           dummy_mask,cutoff_mask;
 +    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
 +    __m128           one     = _mm_set1_ps(1.0);
 +    __m128           two     = _mm_set1_ps(2.0);
 +    x                = xx[0];
 +    f                = ff[0];
 +
 +    nri              = nlist->nri;
 +    iinr             = nlist->iinr;
 +    jindex           = nlist->jindex;
 +    jjnr             = nlist->jjnr;
 +    shiftidx         = nlist->shift;
 +    gid              = nlist->gid;
 +    shiftvec         = fr->shift_vec[0];
 +    fshift           = fr->fshift[0];
 +    /* #if KERNEL_ELEC != 'None' */
 +    facel            = _mm_set1_ps(fr->epsfac);
 +    charge           = mdatoms->chargeA;
 +    /*     #if 'ReactionField' in KERNEL_ELEC */
 +    krf              = _mm_set1_ps(fr->ic->k_rf);
 +    krf2             = _mm_set1_ps(fr->ic->k_rf*2.0);
 +    crf              = _mm_set1_ps(fr->ic->c_rf);
 +    /*     #endif */
 +    /* #endif */
 +    /* #if KERNEL_VDW != 'None' */
 +    nvdwtype         = fr->ntype;
 +    vdwparam         = fr->nbfp;
 +    vdwtype          = mdatoms->typeA;
 +    /* #endif */
 +
 +    /* #if 'Table' in KERNEL_ELEC and 'Table' in KERNEL_VDW */
 +    vftab            = kernel_data->table_elec_vdw->data;
 +    vftabscale       = _mm_set1_ps(kernel_data->table_elec_vdw->scale);
 +    /* #elif 'Table' in KERNEL_ELEC */
 +    vftab            = kernel_data->table_elec->data;
 +    vftabscale       = _mm_set1_ps(kernel_data->table_elec->scale);
 +    /* #elif 'Table' in KERNEL_VDW */
 +    vftab            = kernel_data->table_vdw->data;
 +    vftabscale       = _mm_set1_ps(kernel_data->table_vdw->scale);
 +    /* #endif */
 +
 +    /* #if 'Ewald' in KERNEL_ELEC */
 +    sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
 +    beta             = _mm_set1_ps(fr->ic->ewaldcoeff);
 +    beta2            = _mm_mul_ps(beta,beta);
 +    beta3            = _mm_mul_ps(beta,beta2);
 +    /*     #if KERNEL_VF=='Force' and KERNEL_MOD_ELEC!='PotentialSwitch' */
 +    ewtab            = fr->ic->tabq_coul_F;
 +    ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
 +    ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
 +    /*     #else */
 +    ewtab            = fr->ic->tabq_coul_FDV0;
 +    ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
 +    ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
 +    /*     #endif */
 +    /* #endif */
 +
 +    /* #if KERNEL_ELEC=='GeneralizedBorn' */
 +    invsqrta         = fr->invsqrta;
 +    dvda             = fr->dvda;
 +    gbtabscale       = _mm_set1_ps(fr->gbtab.scale);
 +    gbtab            = fr->gbtab.data;
 +    gbinvepsdiff     = _mm_set1_ps((1.0/fr->epsilon_r) - (1.0/fr->gb_epsilon_solvent));
 +    /* #endif */
 +
 +    /* #if 'Water' in GEOMETRY_I */
 +    /* Setup water-specific parameters */
 +    inr              = nlist->iinr[0];
 +    /*     #for I in PARTICLES_ELEC_I */
 +    iq{I}              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+{I}]));
 +    /*     #endfor */
 +    /*     #for I in PARTICLES_VDW_I */
 +    vdwioffset{I}      = 2*nvdwtype*vdwtype[inr+{I}];
 +    /*     #endfor */
 +    /* #endif */
 +
 +    /* #if 'Water' in GEOMETRY_J */
 +    /*     #for J in PARTICLES_ELEC_J */
 +    jq{J}              = _mm_set1_ps(charge[inr+{J}]);
 +    /*     #endfor */
 +    /*     #for J in PARTICLES_VDW_J */
 +    vdwjidx{J}A        = 2*vdwtype[inr+{J}];
 +    /*     #endfor */
 +    /*     #for I,J in PAIRS_IJ */
 +    /*         #if 'electrostatics' in INTERACTION_FLAGS[I][J] */
 +    qq{I}{J}             = _mm_mul_ps(iq{I},jq{J});
 +    /*         #endif */
 +    /*         #if 'vdw' in INTERACTION_FLAGS[I][J] */
 +    c6_{I}{J}            = _mm_set1_ps(vdwparam[vdwioffset{I}+vdwjidx{J}A]);
 +    c12_{I}{J}           = _mm_set1_ps(vdwparam[vdwioffset{I}+vdwjidx{J}A+1]);
 +    /*         #endif */
 +    /*     #endfor */
 +    /* #endif */
 +
 +    /* #if KERNEL_MOD_ELEC!='None' or KERNEL_MOD_VDW!='None' */
 +    /*     #if KERNEL_ELEC!='None' */
 +    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
 +    rcutoff_scalar   = fr->rcoulomb;
 +    /*     #else */
 +    rcutoff_scalar   = fr->rvdw;
 +    /*     #endif */
 +    rcutoff          = _mm_set1_ps(rcutoff_scalar);
 +    rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
 +    /* #endif */
 +
 +    /* #if KERNEL_MOD_VDW=='PotentialShift' */
 +    sh_vdw_invrcut6  = _mm_set1_ps(fr->ic->sh_invrc6);
 +    rvdw             = _mm_set1_ps(fr->rvdw);
 +    /* #endif */
 +
 +    /* #if 'PotentialSwitch' in [KERNEL_MOD_ELEC,KERNEL_MOD_VDW] */
 +    /*     #if KERNEL_MOD_ELEC=='PotentialSwitch'  */
 +    rswitch_scalar   = fr->rcoulomb_switch;
 +    rswitch          = _mm_set1_ps(rswitch_scalar);
 +    /*     #else */
 +    rswitch_scalar   = fr->rvdw_switch;
 +    rswitch          = _mm_set1_ps(rswitch_scalar);
 +    /*     #endif */
 +    /* Setup switch parameters */
 +    d_scalar         = rcutoff_scalar-rswitch_scalar;
 +    d                = _mm_set1_ps(d_scalar);
 +    swV3             = _mm_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
 +    swV4             = _mm_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
 +    swV5             = _mm_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
 +    /*     #if 'Force' in KERNEL_VF */
 +    swF2             = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
 +    swF3             = _mm_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
 +    swF4             = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
 +    /*     #endif */
 +    /* #endif */
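 +    /* ## Sketch of the algebra behind these coefficients: with                         */
 +    /* ## x = (r - rswitch)/(rcutoff - rswitch), the inner-loop polynomial              */
 +    /* ## sw = 1 + d^3*(swV3 + d*(swV4 + d*swV5)) equals the quintic switch             */
 +    /* ## 1 - 10x^3 + 15x^4 - 6x^5, which is 1 at rswitch, 0 at rcutoff, and has        */
 +    /* ## zero slope at both ends; swF2..swF4 likewise give its derivative dsw.         */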
 +
 +    /* Avoid stupid compiler warnings */
 +    jnrA = jnrB = jnrC = jnrD = 0;
 +    j_coord_offsetA = 0;
 +    j_coord_offsetB = 0;
 +    j_coord_offsetC = 0;
 +    j_coord_offsetD = 0;
 +
 +    /* ## Keep track of the floating point operations we issue for reporting! */
 +    /* #define OUTERFLOPS 0 */
 +    outeriter        = 0;
 +    inneriter        = 0;
 +
 +    for(iidx=0;iidx<4*DIM;iidx++)
 +    {
 +        scratch[iidx] = 0.0;
 +    }
 +
 +    /* Start outer loop over neighborlists */
 +    for(iidx=0; iidx<nri; iidx++)
 +    {
 +        /* Load shift vector for this list */
 +        i_shift_offset   = DIM*shiftidx[iidx];
 +
 +        /* Load limits for loop over neighbors */
 +        j_index_start    = jindex[iidx];
 +        j_index_end      = jindex[iidx+1];
 +
 +        /* Get outer coordinate index */
 +        inr              = iinr[iidx];
 +        i_coord_offset   = DIM*inr;
 +
 +        /* Load i particle coords and add shift vector */
 +        /* #if GEOMETRY_I == 'Particle' */
 +        gmx_mm_load_shift_and_1rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
 +        /* #elif GEOMETRY_I == 'Water3' */
 +        gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
 +                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
 +        /* #elif GEOMETRY_I == 'Water4' */
 +        /*     #if 0 in PARTICLES_I                 */
 +        gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
 +                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
 +        /*     #else                                */
 +        gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
 +                                                 &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
 +        /*     #endif                               */
 +        /* #endif                                   */
 +
 +        /* #if 'Force' in KERNEL_VF */
 +        /*     #for I in PARTICLES_I */
 +        fix{I}             = _mm_setzero_ps();
 +        fiy{I}             = _mm_setzero_ps();
 +        fiz{I}             = _mm_setzero_ps();
 +        /*     #endfor */
 +        /* #endif */
 +
 +        /* ## For water we already preloaded parameters at the start of the kernel */
 +        /* #if not 'Water' in GEOMETRY_I */
 +        /* Load parameters for i particles */
 +        /*     #for I in PARTICLES_ELEC_I */
 +        iq{I}              = _mm_mul_ps(facel,_mm_load1_ps(charge+inr+{I}));
 +        /*         #define OUTERFLOPS OUTERFLOPS+1 */
 +        /*         #if KERNEL_ELEC=='GeneralizedBorn' */
 +        isai{I}            = _mm_load1_ps(invsqrta+inr+{I});
 +        /*         #endif */
 +        /*     #endfor */
 +        /*     #for I in PARTICLES_VDW_I */
 +        vdwioffset{I}      = 2*nvdwtype*vdwtype[inr+{I}];
 +        /*     #endfor */
 +        /* #endif */
 +
 +        /* #if 'Potential' in KERNEL_VF */
 +        /* Reset potential sums */
 +        /*     #if KERNEL_ELEC != 'None' */
 +        velecsum         = _mm_setzero_ps();
 +        /*     #endif */
 +        /*     #if 'GeneralizedBorn' in KERNEL_ELEC */
 +        vgbsum           = _mm_setzero_ps();
 +        /*     #endif */
 +        /*     #if KERNEL_VDW != 'None' */
 +        vvdwsum          = _mm_setzero_ps();
 +        /*     #endif */
 +        /* #endif */
 +        /*     #if 'GeneralizedBorn' in KERNEL_ELEC and 'Force' in KERNEL_VF */
 +        dvdasum          = _mm_setzero_ps();
 +        /*     #endif */
 +
 +        /* #for ROUND in ['Loop','Epilogue'] */
 +
 +        /* #if ROUND =='Loop' */
 +        /* Start inner kernel loop */
 +        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
 +        {
 +        /* ## First round is normal loop (next statement resets indentation) */
 +        /*     #if 0 */
 +        }
 +        /*     #endif */
 +        /* #else */
 +        if(jidx<j_index_end)
 +        {
 +        /* ## Second round is epilogue */
 +        /* #endif */
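 +        /* ## Illustrative note on the two rounds: the 'Loop' round emits the unrolled  */
 +        /* ## quad loop above, which only runs while all four jjnr entries are real     */
 +        /* ## (>=0); the 'Epilogue' round emits one final pass in which negative jjnr   */
 +        /* ## entries are padding and are cleared via dummy_mask.                       */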
 +        /* #define INNERFLOPS 0 */
 +
 +            /* Get j neighbor index, and coordinate index */
 +            /* #if ROUND =='Loop' */
 +            jnrA             = jjnr[jidx];
 +            jnrB             = jjnr[jidx+1];
 +            jnrC             = jjnr[jidx+2];
 +            jnrD             = jjnr[jidx+3];
 +            /* #else */
 +            jnrlistA         = jjnr[jidx];
 +            jnrlistB         = jjnr[jidx+1];
 +            jnrlistC         = jjnr[jidx+2];
 +            jnrlistD         = jjnr[jidx+3];
 +            /* The value of each element will be negative for non-real atoms.
 +             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
 +             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
 +             */
 +            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 +            jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
 +            jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
 +            jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
 +            jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
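 +            /* ## Worked example (illustrative): for the jjnr quad {12, 7, -1, -1}      */
 +            /* ## the comparison above yields dummy_mask = {0,0,0xFFFFFFFF,0xFFFFFFFF}  */
 +            /* ## and jnrA..jnrD become {12, 7, 0, 0}: the padded lanes load valid      */
 +            /* ## atom-0 data, and their results are masked out before accumulation.    */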
 +            /* #endif */
 +            j_coord_offsetA  = DIM*jnrA;
 +            j_coord_offsetB  = DIM*jnrB;
 +            j_coord_offsetC  = DIM*jnrC;
 +            j_coord_offsetD  = DIM*jnrD;
 +
 +            /* load j atom coordinates */
 +            /* #if GEOMETRY_J == 'Particle'             */
 +            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
 +                                              x+j_coord_offsetC,x+j_coord_offsetD,
 +                                              &jx0,&jy0,&jz0);
 +            /* #elif GEOMETRY_J == 'Water3'             */
 +            gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
 +                                              x+j_coord_offsetC,x+j_coord_offsetD,
 +                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
 +            /* #elif GEOMETRY_J == 'Water4'             */
 +            /*     #if 0 in PARTICLES_J                 */
 +            gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
 +                                              x+j_coord_offsetC,x+j_coord_offsetD,
 +                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
 +                                              &jy2,&jz2,&jx3,&jy3,&jz3);
 +            /*     #else                                */
 +            gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
 +                                              x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
 +                                              &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
 +            /*     #endif                               */
 +            /* #endif                                   */
 +
 +            /* Calculate displacement vector */
 +            /* #for I,J in PAIRS_IJ */
 +            dx{I}{J}             = _mm_sub_ps(ix{I},jx{J});
 +            dy{I}{J}             = _mm_sub_ps(iy{I},jy{J});
 +            dz{I}{J}             = _mm_sub_ps(iz{I},jz{J});
 +            /*     #define INNERFLOPS INNERFLOPS+3 */
 +            /* #endfor */
 +
 +            /* Calculate squared distance and things based on it */
 +            /* #for I,J in PAIRS_IJ */
 +            rsq{I}{J}            = gmx_mm_calc_rsq_ps(dx{I}{J},dy{I}{J},dz{I}{J});
 +            /*     #define INNERFLOPS INNERFLOPS+5 */
 +            /* #endfor */
 +
 +            /* #for I,J in PAIRS_IJ */
 +            /*     #if 'rinv' in INTERACTION_FLAGS[I][J] */
 +            rinv{I}{J}           = gmx_mm_invsqrt_ps(rsq{I}{J});
 +            /*         #define INNERFLOPS INNERFLOPS+5 */
 +            /*     #endif */
 +            /* #endfor */
 +
 +            /* #for I,J in PAIRS_IJ */
 +            /*     #if 'rinvsq' in INTERACTION_FLAGS[I][J] */
 +            /*         # if 'rinv' not in INTERACTION_FLAGS[I][J] */
 +            rinvsq{I}{J}         = gmx_mm_inv_ps(rsq{I}{J});
 +            /*             #define INNERFLOPS INNERFLOPS+4 */
 +            /*         #else */
 +            rinvsq{I}{J}         = _mm_mul_ps(rinv{I}{J},rinv{I}{J});
 +            /*             #define INNERFLOPS INNERFLOPS+1 */
 +            /*         #endif */
 +            /*     #endif */
 +            /* #endfor */
 +
 +            /* #if not 'Water' in GEOMETRY_J */
 +            /* Load parameters for j particles */
 +            /*     #for J in PARTICLES_ELEC_J */
 +            jq{J}              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+{J},charge+jnrB+{J},
 +                                                              charge+jnrC+{J},charge+jnrD+{J});
 +            /*         #if KERNEL_ELEC=='GeneralizedBorn' */
 +            isaj{J}            = gmx_mm_load_4real_swizzle_ps(invsqrta+jnrA+{J},invsqrta+jnrB+{J},
 +                                                              invsqrta+jnrC+{J},invsqrta+jnrD+{J});
 +            /*         #endif */
 +            /*     #endfor */
 +            /*     #for J in PARTICLES_VDW_J */
 +            vdwjidx{J}A        = 2*vdwtype[jnrA+{J}];
 +            vdwjidx{J}B        = 2*vdwtype[jnrB+{J}];
 +            vdwjidx{J}C        = 2*vdwtype[jnrC+{J}];
 +            vdwjidx{J}D        = 2*vdwtype[jnrD+{J}];
 +            /*     #endfor */
 +            /* #endif */
 +
 +            /* #if 'Force' in KERNEL_VF and not 'Particle' in GEOMETRY_I */
 +            /*     #for J in PARTICLES_J */
 +            fjx{J}             = _mm_setzero_ps();
 +            fjy{J}             = _mm_setzero_ps();
 +            fjz{J}             = _mm_setzero_ps();
 +            /*     #endfor */
 +            /* #endif */
 +
 +            /* #for I,J in PAIRS_IJ */
 +
 +            /**************************
 +             * CALCULATE INTERACTIONS *
 +             **************************/
 +
 +            /*     ## Note special check for TIP4P-TIP4P. Since we are cutting off all hydrogen interactions we also cut the LJ-only O-O interaction */
 +            /*     #if 'exactcutoff' in INTERACTION_FLAGS[I][J] or (GEOMETRY_I=='Water4' and GEOMETRY_J=='Water4' and 'exactcutoff' in INTERACTION_FLAGS[1][1]) */
 +            /*         ## We always calculate rinv/rinvsq above to enable pipelining in compilers (performance tested on x86) */
 +            if (gmx_mm_any_lt(rsq{I}{J},rcutoff2))
 +            {
 +                /*     #if 0    ## this and the next two lines are a hack to maintain auto-indentation in the template file */
 +            }
 +            /*         #endif */
 +            /*         #define INNERFLOPS INNERFLOPS+1 */
 +            /*     #endif */
 +
 +            /*     #if 'r' in INTERACTION_FLAGS[I][J] */
 +            r{I}{J}              = _mm_mul_ps(rsq{I}{J},rinv{I}{J});
 +            /*         #if ROUND == 'Epilogue' */
 +            r{I}{J}              = _mm_andnot_ps(dummy_mask,r{I}{J});
 +            /*             #define INNERFLOPS INNERFLOPS+1 */
 +            /*         #endif */
 +            /*         #define INNERFLOPS INNERFLOPS+1 */
 +            /*     #endif */
 +
 +            /*     ## For water geometries we already loaded parameters at the start of the kernel */
 +            /*     #if not 'Water' in GEOMETRY_J */
 +            /* Compute parameters for interactions between i and j atoms */
 +            /*         #if 'electrostatics' in INTERACTION_FLAGS[I][J] */
 +            qq{I}{J}             = _mm_mul_ps(iq{I},jq{J});
 +            /*             #define INNERFLOPS INNERFLOPS+1 */
 +            /*         #endif */
 +            /*         #if 'vdw' in INTERACTION_FLAGS[I][J] */
 +            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset{I}+vdwjidx{J}A,
 +                                         vdwparam+vdwioffset{I}+vdwjidx{J}B,
 +                                         vdwparam+vdwioffset{I}+vdwjidx{J}C,
 +                                         vdwparam+vdwioffset{I}+vdwjidx{J}D,
 +                                         &c6_{I}{J},&c12_{I}{J});
 +            /*         #endif */
 +            /*     #endif */
 +
 +            /*     #if 'table' in INTERACTION_FLAGS[I][J] */
 +            /* Calculate table index by multiplying r with table scale and truncate to integer */
 +            rt               = _mm_mul_ps(r{I}{J},vftabscale);
 +            vfitab           = _mm_cvttps_epi32(rt);
 +#ifdef __XOP__
 +            vfeps            = _mm_frcz_ps(rt);
 +#else
 +            vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
 +#endif
 +            twovfeps         = _mm_add_ps(vfeps,vfeps);
 +            /*         #define INNERFLOPS INNERFLOPS+4                          */
 +            /*         #if 'Table' in KERNEL_ELEC and 'Table' in KERNEL_VDW     */
 +            /*             ## 3 tables, 4 floats per point: multiply index by 12 */
 +            vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
 +            /*         #elif 'Table' in KERNEL_ELEC                             */
 +            /*             ## 1 table, 4 floats per point: multiply index by 4   */
 +            vfitab           = _mm_slli_epi32(vfitab,2);
 +            /*         #elif 'Table' in KERNEL_VDW                              */
 +            /*             ## 2 tables, 4 floats per point: multiply index by 8  */
 +            vfitab           = _mm_slli_epi32(vfitab,3);
 +            /*         #endif                                                   */
 +            /*     #endif */
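 +            /* ## Illustrative check of the shift arithmetic above: for the merged      */
 +            /* ## table, ((i + (i<<1)) << 2) == (3*i)*4 == 12*i, while (i<<2) == 4*i    */
 +            /* ## and (i<<3) == 8*i cover the one- and two-table layouts.               */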
 +
 +            /*     ## ELECTROSTATIC INTERACTIONS */
 +            /*     #if 'electrostatics' in INTERACTION_FLAGS[I][J] */
 +
 +            /*         #if KERNEL_ELEC=='Coulomb' */
 +
 +            /* COULOMB ELECTROSTATICS */
 +            velec            = _mm_mul_ps(qq{I}{J},rinv{I}{J});
 +            /*             #define INNERFLOPS INNERFLOPS+1 */
 +            /*             #if 'Force' in KERNEL_VF */
 +            felec            = _mm_mul_ps(velec,rinvsq{I}{J});
 +            /*                 #define INNERFLOPS INNERFLOPS+2 */
 +            /*             #endif */
 +
 +            /*         #elif KERNEL_ELEC=='ReactionField' */
 +
 +            /* REACTION-FIELD ELECTROSTATICS */
 +            /*             #if 'Potential' in KERNEL_VF */
 +            velec            = _mm_mul_ps(qq{I}{J},_mm_sub_ps(_mm_macc_ps(krf,rsq{I}{J},rinv{I}{J}),crf));
 +            /*                 #define INNERFLOPS INNERFLOPS+4 */
 +            /*             #endif */
 +            /*             #if 'Force' in KERNEL_VF */
 +            felec            = _mm_mul_ps(qq{I}{J},_mm_msub_ps(rinv{I}{J},rinvsq{I}{J},krf2));
 +            /*                 #define INNERFLOPS INNERFLOPS+3 */
 +            /*             #endif */
 +
 +            /*         #elif KERNEL_ELEC=='GeneralizedBorn' */
 +
 +            /* GENERALIZED BORN AND COULOMB ELECTROSTATICS */
 +            isaprod          = _mm_mul_ps(isai{I},isaj{J});
 +            gbqqfactor       = _mm_xor_ps(signbit,_mm_mul_ps(qq{I}{J},_mm_mul_ps(isaprod,gbinvepsdiff)));
 +            gbscale          = _mm_mul_ps(isaprod,gbtabscale);
 +            /*             #define INNERFLOPS INNERFLOPS+5 */
 +
 +            /* Calculate generalized born table index - this is a separate table from the normal one,
 +             * but we use the same procedure by multiplying r with scale and truncating to integer.
 +             */
 +            rt               = _mm_mul_ps(r{I}{J},gbscale);
 +            gbitab           = _mm_cvttps_epi32(rt);
 +#ifdef __XOP__
 +            gbeps            = _mm_frcz_ps(rt);
 +#else
 +            gbeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
 +#endif
 +            gbitab           = _mm_slli_epi32(gbitab,2);
 +
 +            Y                = _mm_load_ps( gbtab + _mm_extract_epi32(gbitab,0) );
 +            F                = _mm_load_ps( gbtab + _mm_extract_epi32(gbitab,1) );
 +            G                = _mm_load_ps( gbtab + _mm_extract_epi32(gbitab,2) );
 +            H                = _mm_load_ps( gbtab + _mm_extract_epi32(gbitab,3) );
 +            _MM_TRANSPOSE4_PS(Y,F,G,H);
 +            Fp               = _mm_macc_ps(gbeps,_mm_macc_ps(gbeps,H,G),F);
 +            VV               = _mm_macc_ps(gbeps,Fp,Y);
 +            vgb              = _mm_mul_ps(gbqqfactor,VV);
 +            /*             #define INNERFLOPS INNERFLOPS+10 */
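 +            /* ## Spline sketch: each table point stores (Y,F,G,H); with gbeps the      */
 +            /* ## fractional bin offset, Fp = F + eps*(G + eps*H) so that               */
 +            /* ## VV = Y + eps*Fp = Y + F*eps + G*eps^2 + H*eps^3 (Horner form),        */
 +            /* ## and FF below is its derivative F + 2*G*eps + 3*H*eps^2.               */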
 +
 +            /*             #if 'Force' in KERNEL_VF */
 +            twogbeps         = _mm_add_ps(gbeps,gbeps);
 +            FF               = _mm_macc_ps(_mm_macc_ps(twogbeps,H,G),gbeps,Fp);
 +            fgb              = _mm_mul_ps(gbqqfactor,_mm_mul_ps(FF,gbscale));
 +            dvdatmp          = _mm_mul_ps(minushalf,_mm_macc_ps(fgb,r{I}{J},vgb));
++            /*                 #if ROUND == 'Epilogue' */
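++            /* Clear dvdatmp for the padded dummy lanes so they cannot                  */
++            /* contaminate dvdasum or the dvda increments below.                        */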
++            dvdatmp          = _mm_andnot_ps(dummy_mask,dvdatmp);
++            /*                 #endif */
 +            dvdasum          = _mm_add_ps(dvdasum,dvdatmp);
 +            /*                 #if ROUND == 'Loop' */
 +            fjptrA           = dvda+jnrA;
 +            fjptrB           = dvda+jnrB;
 +            fjptrC           = dvda+jnrC;
 +            fjptrD           = dvda+jnrD;
 +            /*                 #else */
 +            /* The pointers to scratch make sure that this code really can't screw things up, even with compilers that take gmx_restrict seriously (e.g. icc 13). */
 +            fjptrA             = (jnrlistA>=0) ? dvda+jnrA : scratch;
 +            fjptrB             = (jnrlistB>=0) ? dvda+jnrB : scratch;
 +            fjptrC             = (jnrlistC>=0) ? dvda+jnrC : scratch;
 +            fjptrD             = (jnrlistD>=0) ? dvda+jnrD : scratch;
 +            /*                 #endif */
 +            gmx_mm_increment_4real_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,_mm_mul_ps(dvdatmp,_mm_mul_ps(isaj{J},isaj{J})));
 +            /*                 #define INNERFLOPS INNERFLOPS+13 */
 +            /*             #endif */
 +            velec            = _mm_mul_ps(qq{I}{J},rinv{I}{J});
 +            /*                 #define INNERFLOPS INNERFLOPS+1 */
 +            /*             #if 'Force' in KERNEL_VF */
 +            felec            = _mm_mul_ps(_mm_msub_ps(velec,rinv{I}{J},fgb),rinv{I}{J});
 +            /*                 #define INNERFLOPS INNERFLOPS+3 */
 +            /*             #endif */
 +
 +            /*         #elif KERNEL_ELEC=='Ewald' */
 +            /* EWALD ELECTROSTATICS */
 +
 +            /* Analytical PME correction */
 +            zeta2            = _mm_mul_ps(beta2,rsq{I}{J});
 +            /*             #if 'Force' in KERNEL_VF */
 +            rinv3            = _mm_mul_ps(rinvsq{I}{J},rinv{I}{J});
 +            pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
 +            felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
 +            felec            = _mm_mul_ps(qq{I}{J},felec);
 +            /*             #endif */
 +            /*             #if 'Potential' in KERNEL_VF or KERNEL_MOD_ELEC=='PotentialSwitch' */
 +            pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
 +            /*                 #if KERNEL_MOD_ELEC=='PotentialShift' */
 +            velec            = _mm_nmacc_ps(pmecorrV,beta,_mm_sub_ps(rinv{I}{J},sh_ewald));
 +            /*                 #else */
 +            velec            = _mm_nmacc_ps(pmecorrV,beta,rinv{I}{J});
 +            /*                 #endif */
 +            velec            = _mm_mul_ps(qq{I}{J},velec);
 +            /*             #endif */
 +
 +            /*         #elif KERNEL_ELEC=='CubicSplineTable' */
 +
 +            /* CUBIC SPLINE TABLE ELECTROSTATICS */
 +            Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
 +            F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
 +            G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
 +            H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
 +            _MM_TRANSPOSE4_PS(Y,F,G,H);
 +            Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
 +            /*             #define INNERFLOPS INNERFLOPS+4 */
 +            /*             #if 'Potential' in KERNEL_VF */
 +            VV               = _mm_macc_ps(vfeps,Fp,Y);
 +            velec            = _mm_mul_ps(qq{I}{J},VV);
 +            /*                 #define INNERFLOPS INNERFLOPS+3 */
 +            /*             #endif */
 +            /*             #if 'Force' in KERNEL_VF */
 +            FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
 +            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq{I}{J},FF),_mm_mul_ps(vftabscale,rinv{I}{J})));
 +            /*                 #define INNERFLOPS INNERFLOPS+7 */
 +            /*             #endif */
 +            /*         #endif */
 +            /*         ## End of check for electrostatics interaction forms */
 +            /*     #endif */
 +            /*     ## END OF ELECTROSTATIC INTERACTION CHECK FOR PAIR I-J */
 +
 +            /*     #if 'vdw' in INTERACTION_FLAGS[I][J] */
 +
 +            /*         #if KERNEL_VDW=='LennardJones' */
 +
 +            /* LENNARD-JONES DISPERSION/REPULSION */
 +
 +            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq{I}{J},rinvsq{I}{J}),rinvsq{I}{J});
 +            /*             #define INNERFLOPS INNERFLOPS+2 */
 +            /*             #if 'Potential' in KERNEL_VF or KERNEL_MOD_VDW=='PotentialSwitch' */
 +            vvdw6            = _mm_mul_ps(c6_{I}{J},rinvsix);
 +            vvdw12           = _mm_mul_ps(c12_{I}{J},_mm_mul_ps(rinvsix,rinvsix));
 +            /*                 #define INNERFLOPS INNERFLOPS+3 */
 +            /*                 #if KERNEL_MOD_VDW=='PotentialShift' */
 +            vvdw             = _mm_msub_ps(_mm_nmacc_ps(c12_{I}{J},_mm_mul_ps(sh_vdw_invrcut6,sh_vdw_invrcut6),vvdw12),one_twelfth,
 +                                          _mm_mul_ps( _mm_nmacc_ps(c6_{I}{J},sh_vdw_invrcut6,vvdw6),one_sixth));
 +            /*                     #define INNERFLOPS INNERFLOPS+8 */
 +            /*                 #else */
 +            vvdw             = _mm_msub_ps(vvdw12,one_twelfth,_mm_mul_ps(vvdw6,one_sixth));
 +            /*                     #define INNERFLOPS INNERFLOPS+3 */
 +            /*                 #endif */
 +            /*                 ## Check for force inside potential check, i.e. this means we already did the potential part */
 +            /*                 #if 'Force' in KERNEL_VF */
 +            fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq{I}{J});
 +            /*                     #define INNERFLOPS INNERFLOPS+2 */
 +            /*                 #endif */
 +            /*             #elif KERNEL_VF=='Force' */
 +            /*                 ## Force-only LennardJones makes it possible to save 1 flop (they do add up...) */
 +            fvdw             = _mm_mul_ps(_mm_msub_ps(c12_{I}{J},rinvsix,c6_{I}{J}),_mm_mul_ps(rinvsix,rinvsq{I}{J}));
 +            /*                 #define INNERFLOPS INNERFLOPS+4 */
 +            /*             #endif */
 +
 +            /*         #elif KERNEL_VDW=='CubicSplineTable' */
 +
 +            /* CUBIC SPLINE TABLE DISPERSION */
 +            /*             #if 'Table' in KERNEL_ELEC */
 +            vfitab           = _mm_add_epi32(vfitab,ifour);
 +            /*             #endif                     */
 +            Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
 +            F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
 +            G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
 +            H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
 +            _MM_TRANSPOSE4_PS(Y,F,G,H);
 +            Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
 +            /*             #define INNERFLOPS INNERFLOPS+4 */
 +            /*             #if 'Potential' in KERNEL_VF */
 +            VV               = _mm_macc_ps(vfeps,Fp,Y);
 +            vvdw6            = _mm_mul_ps(c6_{I}{J},VV);
 +            /*                 #define INNERFLOPS INNERFLOPS+3 */
 +            /*             #endif */
 +            /*             #if 'Force' in KERNEL_VF */
 +            FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
 +            fvdw6            = _mm_mul_ps(c6_{I}{J},FF);
 +            /*                 #define INNERFLOPS INNERFLOPS+4 */
 +            /*             #endif */
 +
 +            /* CUBIC SPLINE TABLE REPULSION */
 +            vfitab           = _mm_add_epi32(vfitab,ifour);
 +            Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
 +            F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
 +            G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
 +            H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
 +            _MM_TRANSPOSE4_PS(Y,F,G,H);
 +            Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
 +            /*             #define INNERFLOPS INNERFLOPS+4 */
 +            /*             #if 'Potential' in KERNEL_VF */
 +            VV               = _mm_macc_ps(vfeps,Fp,Y);
 +            vvdw12           = _mm_mul_ps(c12_{I}{J},VV);
 +            /*                 #define INNERFLOPS INNERFLOPS+3 */
 +            /*             #endif */
 +            /*             #if 'Force' in KERNEL_VF */
 +            FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
 +            fvdw12           = _mm_mul_ps(c12_{I}{J},FF);
 +            /*                 #define INNERFLOPS INNERFLOPS+5 */
 +            /*             #endif */
 +            /*             #if 'Potential' in KERNEL_VF */
 +            vvdw             = _mm_add_ps(vvdw12,vvdw6);
 +            /*                 #define INNERFLOPS INNERFLOPS+1 */
 +            /*             #endif */
 +            /*             #if 'Force' in KERNEL_VF */
 +            fvdw             = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv{I}{J})));
 +            /*                 #define INNERFLOPS INNERFLOPS+4 */
 +            /*             #endif */
 +            /*         #endif */
 +            /*         ## End of check for vdw interaction forms */
 +            /*     #endif */
 +            /*     ## END OF VDW INTERACTION CHECK FOR PAIR I-J */
 +
 +            /*     #if 'switch' in INTERACTION_FLAGS[I][J] */
 +            d                = _mm_sub_ps(r{I}{J},rswitch);
 +            d                = _mm_max_ps(d,_mm_setzero_ps());
 +            d2               = _mm_mul_ps(d,d);
 +            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
 +            /*         #define INNERFLOPS INNERFLOPS+10 */
 +
 +            /*         #if 'Force' in KERNEL_VF */
 +            dsw              = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
 +            /*             #define INNERFLOPS INNERFLOPS+5 */
 +            /*         #endif */
 +
 +            /* Evaluate switch function */
 +            /*         #if 'Force' in KERNEL_VF */
 +            /* fscal' = f'/r = -(v*sw)'/r = -(v'*sw + v*dsw)/r = -v'*sw/r - v*dsw/r = fscal*sw - v*dsw/r */
 +            /*             #if 'electrostatics' in INTERACTION_FLAGS[I][J] and KERNEL_MOD_ELEC=='PotentialSwitch' */
 +            felec            = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv{I}{J},_mm_mul_ps(velec,dsw)) );
 +            /*                 #define INNERFLOPS INNERFLOPS+4 */
 +            /*             #endif */
 +            /*             #if 'vdw' in INTERACTION_FLAGS[I][J] and KERNEL_MOD_VDW=='PotentialSwitch' */
 +            fvdw             = _mm_msub_ps( fvdw,sw , _mm_mul_ps(rinv{I}{J},_mm_mul_ps(vvdw,dsw)) );
 +            /*                 #define INNERFLOPS INNERFLOPS+4 */
 +            /*             #endif */
 +            /*         #endif */
 +            /*         #if 'Potential' in KERNEL_VF */
 +            /*             #if 'electrostatics' in INTERACTION_FLAGS[I][J] and KERNEL_MOD_ELEC=='PotentialSwitch' */
 +            velec            = _mm_mul_ps(velec,sw);
 +            /*                 #define INNERFLOPS INNERFLOPS+1 */
 +            /*             #endif */
 +            /*             #if 'vdw' in INTERACTION_FLAGS[I][J] and KERNEL_MOD_VDW=='PotentialSwitch' */
 +            vvdw             = _mm_mul_ps(vvdw,sw);
 +            /*                 #define INNERFLOPS INNERFLOPS+1 */
 +            /*             #endif */
 +            /*         #endif */
 +            /*     #endif */
 +            /*     ## Note special check for TIP4P-TIP4P. Since we are cutting off all hydrogen interactions we also cut the LJ-only O-O interaction */
 +            /*     #if 'exactcutoff' in INTERACTION_FLAGS[I][J] or (GEOMETRY_I=='Water4' and GEOMETRY_J=='Water4' and 'exactcutoff' in INTERACTION_FLAGS[1][1]) */
 +            cutoff_mask      = _mm_cmplt_ps(rsq{I}{J},rcutoff2);
 +            /*         #define INNERFLOPS INNERFLOPS+1 */
 +            /*     #endif */
 +
 +            /*     #if 'Potential' in KERNEL_VF */
 +            /* Update potential sum for this i atom from the interaction with this j atom. */
 +            /*         #if 'electrostatics' in INTERACTION_FLAGS[I][J] */
 +            /*             #if 'exactcutoff' in INTERACTION_FLAGS[I][J] */
 +            velec            = _mm_and_ps(velec,cutoff_mask);
 +            /*                 #define INNERFLOPS INNERFLOPS+1 */
 +            /*             #endif                                       */
 +            /*             #if ROUND == 'Epilogue' */
 +            velec            = _mm_andnot_ps(dummy_mask,velec);
 +            /*             #endif */
 +            velecsum         = _mm_add_ps(velecsum,velec);
 +            /*             #define INNERFLOPS INNERFLOPS+1 */
 +            /*             #if KERNEL_ELEC=='GeneralizedBorn' */
 +            /*             #if 'exactcutoff' in INTERACTION_FLAGS[I][J] */
 +            vgb              = _mm_and_ps(vgb,cutoff_mask);
 +            /*                 #define INNERFLOPS INNERFLOPS+1 */
 +            /*             #endif                                       */
 +            /*             #if ROUND == 'Epilogue' */
 +            vgb              = _mm_andnot_ps(dummy_mask,vgb);
 +            /*             #endif */
 +            vgbsum           = _mm_add_ps(vgbsum,vgb);
 +            /*                 #define INNERFLOPS INNERFLOPS+1 */
 +            /*             #endif */
 +            /*         #endif */
 +            /*         #if 'vdw' in INTERACTION_FLAGS[I][J] */
 +            /*     ## Note special check for TIP4P-TIP4P. Since we are cutting off all hydrogen interactions we also cut the LJ-only O-O interaction */
 +            /*     #if 'exactcutoff' in INTERACTION_FLAGS[I][J] or (GEOMETRY_I=='Water4' and GEOMETRY_J=='Water4' and 'exactcutoff' in INTERACTION_FLAGS[1][1]) */
 +            vvdw             = _mm_and_ps(vvdw,cutoff_mask);
 +            /*                 #define INNERFLOPS INNERFLOPS+1 */
 +            /*             #endif                                       */
 +            /*             #if ROUND == 'Epilogue' */
 +            vvdw             = _mm_andnot_ps(dummy_mask,vvdw);
 +            /*             #endif */
 +            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
 +            /*             #define INNERFLOPS INNERFLOPS+1 */
 +            /*         #endif */
 +            /*     #endif */
 +
 +            /*     #if 'Force' in KERNEL_VF */
 +
 +            /*         #if 'electrostatics' in INTERACTION_FLAGS[I][J] and 'vdw' in INTERACTION_FLAGS[I][J] */
 +            fscal            = _mm_add_ps(felec,fvdw);
 +            /*             #define INNERFLOPS INNERFLOPS+1 */
 +            /*         #elif 'electrostatics' in INTERACTION_FLAGS[I][J] */
 +            fscal            = felec;
 +            /*         #elif 'vdw' in INTERACTION_FLAGS[I][J] */
 +            fscal            = fvdw;
 +            /*        #endif */
 +
 +            /*     ## Note special check for TIP4P-TIP4P. Since we are cutting off all hydrogen interactions we also cut the LJ-only O-O interaction */
 +            /*     #if 'exactcutoff' in INTERACTION_FLAGS[I][J] or (GEOMETRY_I=='Water4' and GEOMETRY_J=='Water4' and 'exactcutoff' in INTERACTION_FLAGS[1][1]) */
 +            fscal            = _mm_and_ps(fscal,cutoff_mask);
 +            /*                 #define INNERFLOPS INNERFLOPS+1 */
 +            /*             #endif                                       */
 +
 +            /*             #if ROUND == 'Epilogue' */
 +            fscal            = _mm_andnot_ps(dummy_mask,fscal);
 +            /*             #endif */
 +
 +            /* ## Construction of vectorial force built into FMA instructions now */
 +            /* #define INNERFLOPS INNERFLOPS+3      */
 +
 +            /* Update vectorial force */
 +            fix{I}             = _mm_macc_ps(dx{I}{J},fscal,fix{I});
 +            fiy{I}             = _mm_macc_ps(dy{I}{J},fscal,fiy{I});
 +            fiz{I}             = _mm_macc_ps(dz{I}{J},fscal,fiz{I});
 +            /*             #define INNERFLOPS INNERFLOPS+6 */
 +
 +            /* #if GEOMETRY_I == 'Particle'             */
 +            /*     #if ROUND == 'Loop' */
 +            fjptrA             = f+j_coord_offsetA;
 +            fjptrB             = f+j_coord_offsetB;
 +            fjptrC             = f+j_coord_offsetC;
 +            fjptrD             = f+j_coord_offsetD;
 +            /*     #else */
 +            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
 +            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
 +            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
 +            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
 +            /*     #endif */
 +            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
 +                                                   _mm_mul_ps(dx{I}{J},fscal),
 +                                                   _mm_mul_ps(dy{I}{J},fscal),
 +                                                   _mm_mul_ps(dz{I}{J},fscal));
 +            /*     #define INNERFLOPS INNERFLOPS+3      */
 +            /* #else                                    */
 +            fjx{J}             = _mm_macc_ps(dx{I}{J},fscal,fjx{J});
 +            fjy{J}             = _mm_macc_ps(dy{I}{J},fscal,fjy{J});
 +            fjz{J}             = _mm_macc_ps(dz{I}{J},fscal,fjz{J});
 +            /*     #define INNERFLOPS INNERFLOPS+3      */
 +            /* #endif                                   */
 +
 +            /*     #endif */
 +
 +            /*     ## Note special check for TIP4P-TIP4P. Since we are cutting off all hydrogen interactions we also cut the LJ-only O-O interaction */
 +            /*     #if 'exactcutoff' in INTERACTION_FLAGS[I][J] or (GEOMETRY_I=='Water4' and GEOMETRY_J=='Water4' and 'exactcutoff' in INTERACTION_FLAGS[1][1]) */
 +            /*         #if 0    ## This and the next two lines are a hack to maintain indentation in the template file */
 +            {
 +                /*     #endif */
 +            }
 +            /*     #endif */
 +            /*     ## End of check for the interaction being outside the cutoff */
 +
 +            /* #endfor */
 +            /* ## End of loop over i-j interaction pairs */
 +
 +            /* #if GEOMETRY_I != 'Particle' */
 +            /*     #if ROUND == 'Loop' */
 +            fjptrA             = f+j_coord_offsetA;
 +            fjptrB             = f+j_coord_offsetB;
 +            fjptrC             = f+j_coord_offsetC;
 +            fjptrD             = f+j_coord_offsetD;
 +            /*     #else */
 +            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
 +            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
 +            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
 +            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
 +            /*     #endif */
 +            /* #endif */
 +
 +            /* #if 'Water' in GEOMETRY_I and GEOMETRY_J == 'Particle' */
 +            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 +            /* #elif GEOMETRY_J == 'Water3'               */
 +            gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
 +                                                   fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
 +            /*     #define INNERFLOPS INNERFLOPS+9      */
 +            /* #elif GEOMETRY_J == 'Water4'             */
 +            /*     #if 0 in PARTICLES_J                 */
 +            gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
 +                                                   fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
 +                                                   fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
 +            /*     #define INNERFLOPS INNERFLOPS+12     */
 +            /*     #else                                */
 +            gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA+DIM,fjptrB+DIM,fjptrC+DIM,fjptrD+DIM,
 +                                                   fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
 +            /*     #define INNERFLOPS INNERFLOPS+9      */
 +            /*     #endif                               */
 +            /* #endif                                   */
 +
 +            /* Inner loop uses {INNERFLOPS} flops */
 +        }
 +
 +        /* #endfor */
 +
 +        /* End of innermost loop */
 +
 +        /* #if 'Force' in KERNEL_VF */
 +        /*     #if GEOMETRY_I == 'Particle'            */
 +        gmx_mm_update_iforce_1atom_swizzle_ps(fix0,fiy0,fiz0,
 +                                              f+i_coord_offset,fshift+i_shift_offset);
 +        /*         #define OUTERFLOPS OUTERFLOPS+6     */
 +        /*     #elif GEOMETRY_I == 'Water3'            */
 +        gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
 +                                              f+i_coord_offset,fshift+i_shift_offset);
 +        /*         #define OUTERFLOPS OUTERFLOPS+18    */
 +        /*     #elif GEOMETRY_I == 'Water4'            */
 +        /*         #if 0 in PARTICLES_I                */
 +        gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
 +                                              f+i_coord_offset,fshift+i_shift_offset);
 +        /*             #define OUTERFLOPS OUTERFLOPS+24    */
 +        /*         #else                               */
 +        gmx_mm_update_iforce_3atom_swizzle_ps(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
 +                                              f+i_coord_offset+DIM,fshift+i_shift_offset);
 +        /*             #define OUTERFLOPS OUTERFLOPS+18    */
 +        /*         #endif                              */
 +        /*     #endif                                  */
 +        /* #endif                                      */
 +
 +        /* #if 'Potential' in KERNEL_VF */
 +        ggid                        = gid[iidx];
 +        /* Update potential energies */
 +        /*     #if KERNEL_ELEC != 'None' */
 +        gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
 +        /*         #define OUTERFLOPS OUTERFLOPS+1 */
 +        /*     #endif */
 +        /*     #if 'GeneralizedBorn' in KERNEL_ELEC */
 +        gmx_mm_update_1pot_ps(vgbsum,kernel_data->energygrp_polarization+ggid);
 +        /*         #define OUTERFLOPS OUTERFLOPS+1 */
 +        /*     #endif */
 +        /*     #if KERNEL_VDW != 'None' */
 +        gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
 +        /*         #define OUTERFLOPS OUTERFLOPS+1 */
 +        /*     #endif */
 +        /* #endif */
 +        /*     #if 'GeneralizedBorn' in KERNEL_ELEC and 'Force' in KERNEL_VF */
 +        dvdasum = _mm_mul_ps(dvdasum, _mm_mul_ps(isai{I},isai{I}));
 +        gmx_mm_update_1pot_ps(dvdasum,dvda+inr);
 +        /*     #endif */
 +
 +        /* Increment number of inner iterations */
 +        inneriter                  += j_index_end - j_index_start;
 +
 +        /* Outer loop uses {OUTERFLOPS} flops */
 +    }
 +
 +    /* Increment number of outer iterations */
 +    outeriter        += nri;
 +
 +    /* Update outer/inner flops */
 +    /* ## NB: This is not important, it just affects the flopcount. However, since our preprocessor is */
 +    /* ## primitive and replaces aggressively even in strings inside these directives, we need to      */
 +    /* ## assemble the main part of the name (containing KERNEL/ELEC/VDW) directly in the source.      */
 +    /* #if GEOMETRY_I == 'Water3'            */
 +    /*     #define ISUFFIX '_W3'             */
 +    /* #elif GEOMETRY_I == 'Water4'          */
 +    /*     #define ISUFFIX '_W4'             */
 +    /* #else                                 */
 +    /*     #define ISUFFIX ''                */
 +    /* #endif                                */
 +    /* #if GEOMETRY_J == 'Water3'            */
 +    /*     #define JSUFFIX 'W3'              */
 +    /* #elif GEOMETRY_J == 'Water4'          */
 +    /*     #define JSUFFIX 'W4'              */
 +    /* #else                                 */
 +    /*     #define JSUFFIX ''                */
 +    /* #endif                                */
 +    /* #if 'PotentialAndForce' in KERNEL_VF  */
 +    /*     #define VFSUFFIX  '_VF'           */
 +    /* #elif 'Potential' in KERNEL_VF        */
 +    /*     #define VFSUFFIX '_V'             */
 +    /* #else                                 */
 +    /*     #define VFSUFFIX '_F'             */
 +    /* #endif                                */
 +
 +    /* #if KERNEL_ELEC != 'None' and KERNEL_VDW != 'None' */
 +    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW{ISUFFIX}{JSUFFIX}{VFSUFFIX},outeriter*{OUTERFLOPS} + inneriter*{INNERFLOPS});
 +    /* #elif KERNEL_ELEC != 'None' */
 +    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC{ISUFFIX}{JSUFFIX}{VFSUFFIX},outeriter*{OUTERFLOPS} + inneriter*{INNERFLOPS});
 +    /* #else */
 +    inc_nrnb(nrnb,eNR_NBKERNEL_VDW{ISUFFIX}{JSUFFIX}{VFSUFFIX},outeriter*{OUTERFLOPS} + inneriter*{INNERFLOPS});
 +    /* #endif  */
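 +/* ## Example (illustrative): for a Water3-Water3 PotentialAndForce kernel with both    */
 +/* ## electrostatics and VdW, the suffixes above resolve the first call to              */
 +/* ## inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,...), with the accumulated            */
 +/* ## OUTERFLOPS/INNERFLOPS counts substituted into the flop expression.                */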
 +}
index 3ab68aaedb747e5e5ea77473180ea329aa93a606,0000000000000000000000000000000000000000..8188a7c0d95315a13b42890eb4159d355b5a7ded
mode 100644,000000..100644
--- /dev/null
@@@ -1,957 -1,0 +1,959 @@@
 +/*
 + * Note: this file was generated by the Gromacs avx_256_double kernel generator.
 + *
 + *                This source code is part of
 + *
 + *                 G   R   O   M   A   C   S
 + *
 + * Copyright (c) 2001-2012, The GROMACS Development Team
 + *
 + * Gromacs is a library for molecular simulation and trajectory analysis,
 + * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
 + * a full list of developers and information, check out http://www.gromacs.org
 + *
 + * This program is free software; you can redistribute it and/or modify it under
 + * the terms of the GNU Lesser General Public License as published by the Free
 + * Software Foundation; either version 2 of the License, or (at your option) any
 + * later version.
 + *
 + * To help fund GROMACS development, we humbly ask that you cite
 + * the papers people have written on it - you can find them on the website.
 + */
 +#ifdef HAVE_CONFIG_H
 +#include <config.h>
 +#endif
 +
 +#include <math.h>
 +
 +#include "../nb_kernel.h"
 +#include "types/simple.h"
 +#include "vec.h"
 +#include "nrnb.h"
 +
 +#include "gmx_math_x86_avx_256_double.h"
 +#include "kernelutil_x86_avx_256_double.h"
 +
 +/*
 + * Gromacs nonbonded kernel:   nb_kernel_ElecGB_VdwCSTab_GeomP1P1_VF_avx_256_double
 + * Electrostatics interaction: GeneralizedBorn
 + * VdW interaction:            CubicSplineTable
 + * Geometry:                   Particle-Particle
 + * Calculate force/pot:        PotentialAndForce
 + */
 +void
 +nb_kernel_ElecGB_VdwCSTab_GeomP1P1_VF_avx_256_double
 +                    (t_nblist * gmx_restrict                nlist,
 +                     rvec * gmx_restrict                    xx,
 +                     rvec * gmx_restrict                    ff,
 +                     t_forcerec * gmx_restrict              fr,
 +                     t_mdatoms * gmx_restrict               mdatoms,
 +                     nb_kernel_data_t * gmx_restrict        kernel_data,
 +                     t_nrnb * gmx_restrict                  nrnb)
 +{
 +    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
 +     * just 0 for non-waters.
 +     * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
 +     * jnr indices corresponding to data put in the four positions in the SIMD register.
 +     */
 +    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
 +    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
 +    int              jnrA,jnrB,jnrC,jnrD;
 +    int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
 +    int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
 +    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
 +    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
 +    real             rcutoff_scalar;
 +    real             *shiftvec,*fshift,*x,*f;
 +    real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
 +    real             scratch[4*DIM];
 +    __m256d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
 +    real *           vdwioffsetptr0;
 +    __m256d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
 +    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
 +    __m256d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
 +    __m256d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
 +    __m256d          velec,felec,velecsum,facel,crf,krf,krf2;
 +    real             *charge;
 +    __m128i          gbitab;
 +    __m256d          vgb,fgb,vgbsum,dvdasum,gbscale,gbtabscale,isaprod,gbqqfactor,gbinvepsdiff,gbeps,dvdatmp;
 +    __m256d          minushalf = _mm256_set1_pd(-0.5);
 +    real             *invsqrta,*dvda,*gbtab;
 +    int              nvdwtype;
 +    __m256d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
 +    int              *vdwtype;
 +    real             *vdwparam;
 +    __m256d          one_sixth   = _mm256_set1_pd(1.0/6.0);
 +    __m256d          one_twelfth = _mm256_set1_pd(1.0/12.0);
 +    __m128i          vfitab;
 +    __m128i          ifour       = _mm_set1_epi32(4);
 +    __m256d          rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
 +    real             *vftab;
 +    __m256d          dummy_mask,cutoff_mask;
 +    __m128           tmpmask0,tmpmask1;
 +    __m256d          signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
 +    __m256d          one     = _mm256_set1_pd(1.0);
 +    __m256d          two     = _mm256_set1_pd(2.0);
 +    x                = xx[0];
 +    f                = ff[0];
 +
 +    nri              = nlist->nri;
 +    iinr             = nlist->iinr;
 +    jindex           = nlist->jindex;
 +    jjnr             = nlist->jjnr;
 +    shiftidx         = nlist->shift;
 +    gid              = nlist->gid;
 +    shiftvec         = fr->shift_vec[0];
 +    fshift           = fr->fshift[0];
 +    facel            = _mm256_set1_pd(fr->epsfac);
 +    charge           = mdatoms->chargeA;
 +    nvdwtype         = fr->ntype;
 +    vdwparam         = fr->nbfp;
 +    vdwtype          = mdatoms->typeA;
 +
 +    vftab            = kernel_data->table_vdw->data;
 +    vftabscale       = _mm256_set1_pd(kernel_data->table_vdw->scale);
 +
 +    invsqrta         = fr->invsqrta;
 +    dvda             = fr->dvda;
 +    gbtabscale       = _mm256_set1_pd(fr->gbtab.scale);
 +    gbtab            = fr->gbtab.data;
 +    gbinvepsdiff     = _mm256_set1_pd((1.0/fr->epsilon_r) - (1.0/fr->gb_epsilon_solvent));
 +
 +    /* Avoid stupid compiler warnings */
 +    jnrA = jnrB = jnrC = jnrD = 0;
 +    j_coord_offsetA = 0;
 +    j_coord_offsetB = 0;
 +    j_coord_offsetC = 0;
 +    j_coord_offsetD = 0;
 +
 +    outeriter        = 0;
 +    inneriter        = 0;
 +
 +    for(iidx=0;iidx<4*DIM;iidx++)
 +    {
 +        scratch[iidx] = 0.0;
 +    }
 +
 +    /* Start outer loop over neighborlists */
 +    for(iidx=0; iidx<nri; iidx++)
 +    {
 +        /* Load shift vector for this list */
 +        i_shift_offset   = DIM*shiftidx[iidx];
 +
 +        /* Load limits for loop over neighbors */
 +        j_index_start    = jindex[iidx];
 +        j_index_end      = jindex[iidx+1];
 +
 +        /* Get outer coordinate index */
 +        inr              = iinr[iidx];
 +        i_coord_offset   = DIM*inr;
 +
 +        /* Load i particle coords and add shift vector */
 +        gmx_mm256_load_shift_and_1rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
 +
 +        fix0             = _mm256_setzero_pd();
 +        fiy0             = _mm256_setzero_pd();
 +        fiz0             = _mm256_setzero_pd();
 +
 +        /* Load parameters for i particles */
 +        iq0              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+0]));
 +        isai0            = _mm256_set1_pd(invsqrta[inr+0]);
 +        vdwioffsetptr0   = vdwparam+2*nvdwtype*vdwtype[inr+0];
 +
 +        /* Reset potential sums */
 +        velecsum         = _mm256_setzero_pd();
 +        vgbsum           = _mm256_setzero_pd();
 +        vvdwsum          = _mm256_setzero_pd();
 +        dvdasum          = _mm256_setzero_pd();
 +
 +        /* Start inner kernel loop */
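 +        /* The main loop below only runs while all four jjnr entries are real
 +         * (non-negative); the remainder, padded with negative indices, is
 +         * handled by the masked epilogue that follows it.
 +         */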
 +        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
 +        {
 +
 +            /* Get j neighbor index, and coordinate index */
 +            jnrA             = jjnr[jidx];
 +            jnrB             = jjnr[jidx+1];
 +            jnrC             = jjnr[jidx+2];
 +            jnrD             = jjnr[jidx+3];
 +            j_coord_offsetA  = DIM*jnrA;
 +            j_coord_offsetB  = DIM*jnrB;
 +            j_coord_offsetC  = DIM*jnrC;
 +            j_coord_offsetD  = DIM*jnrD;
 +
 +            /* load j atom coordinates */
 +            gmx_mm256_load_1rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
 +                                                 x+j_coord_offsetC,x+j_coord_offsetD,
 +                                                 &jx0,&jy0,&jz0);
 +
 +            /* Calculate displacement vector */
 +            dx00             = _mm256_sub_pd(ix0,jx0);
 +            dy00             = _mm256_sub_pd(iy0,jy0);
 +            dz00             = _mm256_sub_pd(iz0,jz0);
 +
 +            /* Calculate squared distance and things based on it */
 +            rsq00            = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
 +
 +            rinv00           = gmx_mm256_invsqrt_pd(rsq00);
 +
 +            /* Load parameters for j particles */
 +            jq0              = gmx_mm256_load_4real_swizzle_pd(charge+jnrA+0,charge+jnrB+0,
 +                                                                 charge+jnrC+0,charge+jnrD+0);
 +            isaj0            = gmx_mm256_load_4real_swizzle_pd(invsqrta+jnrA+0,invsqrta+jnrB+0,
 +                                                                 invsqrta+jnrC+0,invsqrta+jnrD+0);
 +            vdwjidx0A        = 2*vdwtype[jnrA+0];
 +            vdwjidx0B        = 2*vdwtype[jnrB+0];
 +            vdwjidx0C        = 2*vdwtype[jnrC+0];
 +            vdwjidx0D        = 2*vdwtype[jnrD+0];
 +
 +            /**************************
 +             * CALCULATE INTERACTIONS *
 +             **************************/
 +
 +            r00              = _mm256_mul_pd(rsq00,rinv00);
 +
 +            /* Compute parameters for interactions between i and j atoms */
 +            qq00             = _mm256_mul_pd(iq0,jq0);
 +            gmx_mm256_load_4pair_swizzle_pd(vdwioffsetptr0+vdwjidx0A,
 +                                            vdwioffsetptr0+vdwjidx0B,
 +                                            vdwioffsetptr0+vdwjidx0C,
 +                                            vdwioffsetptr0+vdwjidx0D,
 +                                            &c6_00,&c12_00);
 +
 +            /* Calculate table index by multiplying r with table scale and truncate to integer */
 +            rt               = _mm256_mul_pd(r00,vftabscale);
 +            vfitab           = _mm256_cvttpd_epi32(rt);
 +            vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
 +            vfitab           = _mm_slli_epi32(vfitab,3);
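 +            /* The shift by 3 scales the index by 8: four spline entries
 +             * (Y,F,G,H) for each of the dispersion and repulsion sub-tables;
 +             * the repulsion lookup below adds ifour to reach the second set.
 +             */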
 +
 +            /* GENERALIZED BORN AND COULOMB ELECTROSTATICS */
 +            isaprod          = _mm256_mul_pd(isai0,isaj0);
 +            gbqqfactor       = _mm256_xor_pd(signbit,_mm256_mul_pd(qq00,_mm256_mul_pd(isaprod,gbinvepsdiff)));
 +            gbscale          = _mm256_mul_pd(isaprod,gbtabscale);
 +
 +            /* Calculate generalized born table index - this is a separate table from the normal one,
 +             * but we use the same procedure by multiplying r with scale and truncating to integer.
 +             */
 +            rt               = _mm256_mul_pd(r00,gbscale);
 +            gbitab           = _mm256_cvttpd_epi32(rt);
 +            gbeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
 +            gbitab           = _mm_slli_epi32(gbitab,2);
 +            Y                = _mm256_load_pd( gbtab + _mm_extract_epi32(gbitab,0) );
 +            F                = _mm256_load_pd( gbtab + _mm_extract_epi32(gbitab,1) );
 +            G                = _mm256_load_pd( gbtab + _mm_extract_epi32(gbitab,2) );
 +            H                = _mm256_load_pd( gbtab + _mm_extract_epi32(gbitab,3) );
 +            GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
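 +            /* With fractional table offset eps (here gbeps), the expressions
 +             * below evaluate the cubic spline VV = Y + eps*(F + eps*(G + eps*H))
 +             * and its derivative FF = F + 2*G*eps + 3*H*eps*eps.
 +             */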
 +            Heps             = _mm256_mul_pd(gbeps,H);
 +            Fp               = _mm256_add_pd(F,_mm256_mul_pd(gbeps,_mm256_add_pd(G,Heps)));
 +            VV               = _mm256_add_pd(Y,_mm256_mul_pd(gbeps,Fp));
 +            vgb              = _mm256_mul_pd(gbqqfactor,VV);
 +
 +            FF               = _mm256_add_pd(Fp,_mm256_mul_pd(gbeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
 +            fgb              = _mm256_mul_pd(gbqqfactor,_mm256_mul_pd(FF,gbscale));
 +            dvdatmp          = _mm256_mul_pd(minushalf,_mm256_add_pd(vgb,_mm256_mul_pd(fgb,r00)));
 +            dvdasum          = _mm256_add_pd(dvdasum,dvdatmp);
 +            fjptrA           = dvda+jnrA;
 +            fjptrB           = dvda+jnrB;
 +            fjptrC           = dvda+jnrC;
 +            fjptrD           = dvda+jnrD;
 +            gmx_mm256_increment_4real_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
 +                                                 _mm256_mul_pd(dvdatmp,_mm256_mul_pd(isaj0,isaj0)));
 +            velec            = _mm256_mul_pd(qq00,rinv00);
 +            felec            = _mm256_mul_pd(_mm256_sub_pd(_mm256_mul_pd(velec,rinv00),fgb),rinv00);
 +
 +            /* CUBIC SPLINE TABLE DISPERSION */
 +            Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
 +            F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
 +            G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
 +            H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
 +            GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
 +            Heps             = _mm256_mul_pd(vfeps,H);
 +            Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
 +            VV               = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
 +            vvdw6            = _mm256_mul_pd(c6_00,VV);
 +            FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
 +            fvdw6            = _mm256_mul_pd(c6_00,FF);
 +
 +            /* CUBIC SPLINE TABLE REPULSION */
 +            vfitab           = _mm_add_epi32(vfitab,ifour);
 +            Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
 +            F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
 +            G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
 +            H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
 +            GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
 +            Heps             = _mm256_mul_pd(vfeps,H);
 +            Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
 +            VV               = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
 +            vvdw12           = _mm256_mul_pd(c12_00,VV);
 +            FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
 +            fvdw12           = _mm256_mul_pd(c12_00,FF);
 +            vvdw             = _mm256_add_pd(vvdw12,vvdw6);
 +            fvdw             = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_add_pd(fvdw6,fvdw12),_mm256_mul_pd(vftabscale,rinv00)));
 +
 +            /* Update potential sum for this i atom from the interaction with this j atom. */
 +            velecsum         = _mm256_add_pd(velecsum,velec);
 +            vgbsum           = _mm256_add_pd(vgbsum,vgb);
 +            vvdwsum          = _mm256_add_pd(vvdwsum,vvdw);
 +
 +            fscal            = _mm256_add_pd(felec,fvdw);
 +
 +            /* Calculate temporary vectorial force */
 +            tx               = _mm256_mul_pd(fscal,dx00);
 +            ty               = _mm256_mul_pd(fscal,dy00);
 +            tz               = _mm256_mul_pd(fscal,dz00);
 +
 +            /* Update vectorial force */
 +            fix0             = _mm256_add_pd(fix0,tx);
 +            fiy0             = _mm256_add_pd(fiy0,ty);
 +            fiz0             = _mm256_add_pd(fiz0,tz);
 +
 +            fjptrA             = f+j_coord_offsetA;
 +            fjptrB             = f+j_coord_offsetB;
 +            fjptrC             = f+j_coord_offsetC;
 +            fjptrD             = f+j_coord_offsetD;
 +            gmx_mm256_decrement_1rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
 +
 +            /* Inner loop uses 91 flops */
 +        }
 +
 +        if(jidx<j_index_end)
 +        {
 +
 +            /* Get j neighbor index, and coordinate index */
 +            jnrlistA         = jjnr[jidx];
 +            jnrlistB         = jjnr[jidx+1];
 +            jnrlistC         = jjnr[jidx+2];
 +            jnrlistD         = jjnr[jidx+3];
 +            /* Sign of each element will be negative for non-real atoms.
 +             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
 +             * so use it as val = _mm256_andnot_pd(mask,val) to clear dummy entries.
 +             */
 +            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 +
 +            tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
 +            tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
 +            dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
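 +            /* The two permutes duplicate each 32-bit comparison result into a
 +             * pair of elements, widening the four integer masks to the four
 +             * 64-bit double lanes of dummy_mask.
 +             */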
 +
 +            jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
 +            jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
 +            jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
 +            jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
 +            j_coord_offsetA  = DIM*jnrA;
 +            j_coord_offsetB  = DIM*jnrB;
 +            j_coord_offsetC  = DIM*jnrC;
 +            j_coord_offsetD  = DIM*jnrD;
 +
 +            /* load j atom coordinates */
 +            gmx_mm256_load_1rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
 +                                                 x+j_coord_offsetC,x+j_coord_offsetD,
 +                                                 &jx0,&jy0,&jz0);
 +
 +            /* Calculate displacement vector */
 +            dx00             = _mm256_sub_pd(ix0,jx0);
 +            dy00             = _mm256_sub_pd(iy0,jy0);
 +            dz00             = _mm256_sub_pd(iz0,jz0);
 +
 +            /* Calculate squared distance and things based on it */
 +            rsq00            = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
 +
 +            rinv00           = gmx_mm256_invsqrt_pd(rsq00);
 +
 +            /* Load parameters for j particles */
 +            jq0              = gmx_mm256_load_4real_swizzle_pd(charge+jnrA+0,charge+jnrB+0,
 +                                                                 charge+jnrC+0,charge+jnrD+0);
 +            isaj0            = gmx_mm256_load_4real_swizzle_pd(invsqrta+jnrA+0,invsqrta+jnrB+0,
 +                                                                 invsqrta+jnrC+0,invsqrta+jnrD+0);
 +            vdwjidx0A        = 2*vdwtype[jnrA+0];
 +            vdwjidx0B        = 2*vdwtype[jnrB+0];
 +            vdwjidx0C        = 2*vdwtype[jnrC+0];
 +            vdwjidx0D        = 2*vdwtype[jnrD+0];
 +
 +            /**************************
 +             * CALCULATE INTERACTIONS *
 +             **************************/
 +
 +            r00              = _mm256_mul_pd(rsq00,rinv00);
 +            r00              = _mm256_andnot_pd(dummy_mask,r00);
 +
 +            /* Compute parameters for interactions between i and j atoms */
 +            qq00             = _mm256_mul_pd(iq0,jq0);
 +            gmx_mm256_load_4pair_swizzle_pd(vdwioffsetptr0+vdwjidx0A,
 +                                            vdwioffsetptr0+vdwjidx0B,
 +                                            vdwioffsetptr0+vdwjidx0C,
 +                                            vdwioffsetptr0+vdwjidx0D,
 +                                            &c6_00,&c12_00);
 +
 +            /* Calculate table index by multiplying r with table scale and truncate to integer */
 +            rt               = _mm256_mul_pd(r00,vftabscale);
 +            vfitab           = _mm256_cvttpd_epi32(rt);
 +            vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
 +            vfitab           = _mm_slli_epi32(vfitab,3);
 +
 +            /* GENERALIZED BORN AND COULOMB ELECTROSTATICS */
 +            isaprod          = _mm256_mul_pd(isai0,isaj0);
 +            gbqqfactor       = _mm256_xor_pd(signbit,_mm256_mul_pd(qq00,_mm256_mul_pd(isaprod,gbinvepsdiff)));
 +            gbscale          = _mm256_mul_pd(isaprod,gbtabscale);
 +
 +            /* Calculate generalized born table index - this is a separate table from the normal one,
 +             * but we use the same procedure by multiplying r with scale and truncating to integer.
 +             */
 +            rt               = _mm256_mul_pd(r00,gbscale);
 +            gbitab           = _mm256_cvttpd_epi32(rt);
 +            gbeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
 +            gbitab           = _mm_slli_epi32(gbitab,2);
 +            Y                = _mm256_load_pd( gbtab + _mm_extract_epi32(gbitab,0) );
 +            F                = _mm256_load_pd( gbtab + _mm_extract_epi32(gbitab,1) );
 +            G                = _mm256_load_pd( gbtab + _mm_extract_epi32(gbitab,2) );
 +            H                = _mm256_load_pd( gbtab + _mm_extract_epi32(gbitab,3) );
 +            GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
 +            Heps             = _mm256_mul_pd(gbeps,H);
 +            Fp               = _mm256_add_pd(F,_mm256_mul_pd(gbeps,_mm256_add_pd(G,Heps)));
 +            VV               = _mm256_add_pd(Y,_mm256_mul_pd(gbeps,Fp));
 +            vgb              = _mm256_mul_pd(gbqqfactor,VV);
 +
 +            FF               = _mm256_add_pd(Fp,_mm256_mul_pd(gbeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
 +            fgb              = _mm256_mul_pd(gbqqfactor,_mm256_mul_pd(FF,gbscale));
 +            dvdatmp          = _mm256_mul_pd(minushalf,_mm256_add_pd(vgb,_mm256_mul_pd(fgb,r00)));
++            dvdatmp          = _mm256_andnot_pd(dummy_mask,dvdatmp);
 +            dvdasum          = _mm256_add_pd(dvdasum,dvdatmp);
 +            /* The pointers to scratch make sure that this code really can't screw things up, even with compilers that take gmx_restrict seriously (e.g. icc 13). */
 +            fjptrA             = (jnrlistA>=0) ? dvda+jnrA : scratch;
 +            fjptrB             = (jnrlistB>=0) ? dvda+jnrB : scratch;
 +            fjptrC             = (jnrlistC>=0) ? dvda+jnrC : scratch;
 +            fjptrD             = (jnrlistD>=0) ? dvda+jnrD : scratch;
 +            gmx_mm256_increment_4real_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
 +                                                 _mm256_mul_pd(dvdatmp,_mm256_mul_pd(isaj0,isaj0)));
 +            velec            = _mm256_mul_pd(qq00,rinv00);
 +            felec            = _mm256_mul_pd(_mm256_sub_pd(_mm256_mul_pd(velec,rinv00),fgb),rinv00);
 +
 +            /* CUBIC SPLINE TABLE DISPERSION */
 +            Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
 +            F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
 +            G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
 +            H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
 +            GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
 +            Heps             = _mm256_mul_pd(vfeps,H);
 +            Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
 +            VV               = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
 +            vvdw6            = _mm256_mul_pd(c6_00,VV);
 +            FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
 +            fvdw6            = _mm256_mul_pd(c6_00,FF);
 +
 +            /* CUBIC SPLINE TABLE REPULSION */
 +            vfitab           = _mm_add_epi32(vfitab,ifour);
 +            Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
 +            F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
 +            G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
 +            H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
 +            GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
 +            Heps             = _mm256_mul_pd(vfeps,H);
 +            Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
 +            VV               = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
 +            vvdw12           = _mm256_mul_pd(c12_00,VV);
 +            FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
 +            fvdw12           = _mm256_mul_pd(c12_00,FF);
 +            vvdw             = _mm256_add_pd(vvdw12,vvdw6);
 +            fvdw             = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_add_pd(fvdw6,fvdw12),_mm256_mul_pd(vftabscale,rinv00)));
 +
 +            /* Update potential sum for this i atom from the interaction with this j atom. */
 +            velec            = _mm256_andnot_pd(dummy_mask,velec);
 +            velecsum         = _mm256_add_pd(velecsum,velec);
 +            vgb              = _mm256_andnot_pd(dummy_mask,vgb);
 +            vgbsum           = _mm256_add_pd(vgbsum,vgb);
 +            vvdw             = _mm256_andnot_pd(dummy_mask,vvdw);
 +            vvdwsum          = _mm256_add_pd(vvdwsum,vvdw);
 +
 +            fscal            = _mm256_add_pd(felec,fvdw);
 +
 +            fscal            = _mm256_andnot_pd(dummy_mask,fscal);
 +
 +            /* Calculate temporary vectorial force */
 +            tx               = _mm256_mul_pd(fscal,dx00);
 +            ty               = _mm256_mul_pd(fscal,dy00);
 +            tz               = _mm256_mul_pd(fscal,dz00);
 +
 +            /* Update vectorial force */
 +            fix0             = _mm256_add_pd(fix0,tx);
 +            fiy0             = _mm256_add_pd(fiy0,ty);
 +            fiz0             = _mm256_add_pd(fiz0,tz);
 +
 +            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
 +            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
 +            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
 +            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
 +            gmx_mm256_decrement_1rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
 +
 +            /* Inner loop uses 92 flops */
 +        }
 +
 +        /* End of innermost loop */
 +
 +        gmx_mm256_update_iforce_1atom_swizzle_pd(fix0,fiy0,fiz0,
 +                                                 f+i_coord_offset,fshift+i_shift_offset);
 +
 +        ggid                        = gid[iidx];
 +        /* Update potential energies */
 +        gmx_mm256_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
 +        gmx_mm256_update_1pot_pd(vgbsum,kernel_data->energygrp_polarization+ggid);
 +        gmx_mm256_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
 +        dvdasum = _mm256_mul_pd(dvdasum, _mm256_mul_pd(isai0,isai0));
 +        gmx_mm256_update_1pot_pd(dvdasum,dvda+inr);
 +
 +        /* Increment number of inner iterations */
 +        inneriter                  += j_index_end - j_index_start;
 +
 +        /* Outer loop uses 10 flops */
 +    }
 +
 +    /* Increment number of outer iterations */
 +    outeriter        += nri;
 +
 +    /* Update outer/inner flops */
 +
 +    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_VF,outeriter*10 + inneriter*92);
 +}
 +/*
 + * Gromacs nonbonded kernel:   nb_kernel_ElecGB_VdwCSTab_GeomP1P1_F_avx_256_double
 + * Electrostatics interaction: GeneralizedBorn
 + * VdW interaction:            CubicSplineTable
 + * Geometry:                   Particle-Particle
 + * Calculate force/pot:        Force
 + */
 +void
 +nb_kernel_ElecGB_VdwCSTab_GeomP1P1_F_avx_256_double
 +                    (t_nblist * gmx_restrict                nlist,
 +                     rvec * gmx_restrict                    xx,
 +                     rvec * gmx_restrict                    ff,
 +                     t_forcerec * gmx_restrict              fr,
 +                     t_mdatoms * gmx_restrict               mdatoms,
 +                     nb_kernel_data_t * gmx_restrict        kernel_data,
 +                     t_nrnb * gmx_restrict                  nrnb)
 +{
 +    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
 +     * just 0 for non-waters.
 +     * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
 +     * jnr indices corresponding to data put in the four positions in the SIMD register.
 +     */
 +    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
 +    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
 +    int              jnrA,jnrB,jnrC,jnrD;
 +    int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
 +    int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
 +    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
 +    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
 +    real             rcutoff_scalar;
 +    real             *shiftvec,*fshift,*x,*f;
 +    real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
 +    real             scratch[4*DIM];
 +    __m256d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
 +    real *           vdwioffsetptr0;
 +    __m256d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
 +    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
 +    __m256d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
 +    __m256d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
 +    __m256d          velec,felec,velecsum,facel,crf,krf,krf2;
 +    real             *charge;
 +    __m128i          gbitab;
 +    __m256d          vgb,fgb,vgbsum,dvdasum,gbscale,gbtabscale,isaprod,gbqqfactor,gbinvepsdiff,gbeps,dvdatmp;
 +    __m256d          minushalf = _mm256_set1_pd(-0.5);
 +    real             *invsqrta,*dvda,*gbtab;
 +    int              nvdwtype;
 +    __m256d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
 +    int              *vdwtype;
 +    real             *vdwparam;
 +    __m256d          one_sixth   = _mm256_set1_pd(1.0/6.0);
 +    __m256d          one_twelfth = _mm256_set1_pd(1.0/12.0);
 +    __m128i          vfitab;
 +    __m128i          ifour       = _mm_set1_epi32(4);
 +    __m256d          rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
 +    real             *vftab;
 +    __m256d          dummy_mask,cutoff_mask;
 +    __m128           tmpmask0,tmpmask1;
 +    __m256d          signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
 +    __m256d          one     = _mm256_set1_pd(1.0);
 +    __m256d          two     = _mm256_set1_pd(2.0);
 +    x                = xx[0];
 +    f                = ff[0];
 +
 +    nri              = nlist->nri;
 +    iinr             = nlist->iinr;
 +    jindex           = nlist->jindex;
 +    jjnr             = nlist->jjnr;
 +    shiftidx         = nlist->shift;
 +    gid              = nlist->gid;
 +    shiftvec         = fr->shift_vec[0];
 +    fshift           = fr->fshift[0];
 +    facel            = _mm256_set1_pd(fr->epsfac);
 +    charge           = mdatoms->chargeA;
 +    nvdwtype         = fr->ntype;
 +    vdwparam         = fr->nbfp;
 +    vdwtype          = mdatoms->typeA;
 +
 +    vftab            = kernel_data->table_vdw->data;
 +    vftabscale       = _mm256_set1_pd(kernel_data->table_vdw->scale);
 +
 +    invsqrta         = fr->invsqrta;
 +    dvda             = fr->dvda;
 +    gbtabscale       = _mm256_set1_pd(fr->gbtab.scale);
 +    gbtab            = fr->gbtab.data;
 +    gbinvepsdiff     = _mm256_set1_pd((1.0/fr->epsilon_r) - (1.0/fr->gb_epsilon_solvent));
 +
 +    /* Avoid stupid compiler warnings */
 +    jnrA = jnrB = jnrC = jnrD = 0;
 +    j_coord_offsetA = 0;
 +    j_coord_offsetB = 0;
 +    j_coord_offsetC = 0;
 +    j_coord_offsetD = 0;
 +
 +    outeriter        = 0;
 +    inneriter        = 0;
 +
 +    for(iidx=0;iidx<4*DIM;iidx++)
 +    {
 +        scratch[iidx] = 0.0;
 +    }
 +
 +    /* Start outer loop over neighborlists */
 +    for(iidx=0; iidx<nri; iidx++)
 +    {
 +        /* Load shift vector for this list */
 +        i_shift_offset   = DIM*shiftidx[iidx];
 +
 +        /* Load limits for loop over neighbors */
 +        j_index_start    = jindex[iidx];
 +        j_index_end      = jindex[iidx+1];
 +
 +        /* Get outer coordinate index */
 +        inr              = iinr[iidx];
 +        i_coord_offset   = DIM*inr;
 +
 +        /* Load i particle coords and add shift vector */
 +        gmx_mm256_load_shift_and_1rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
 +
 +        fix0             = _mm256_setzero_pd();
 +        fiy0             = _mm256_setzero_pd();
 +        fiz0             = _mm256_setzero_pd();
 +
 +        /* Load parameters for i particles */
 +        iq0              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+0]));
 +        isai0            = _mm256_set1_pd(invsqrta[inr+0]);
 +        vdwioffsetptr0   = vdwparam+2*nvdwtype*vdwtype[inr+0];
 +
 +        dvdasum          = _mm256_setzero_pd();
 +
 +        /* Start inner kernel loop */
 +        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
 +        {
 +
 +            /* Get j neighbor index, and coordinate index */
 +            jnrA             = jjnr[jidx];
 +            jnrB             = jjnr[jidx+1];
 +            jnrC             = jjnr[jidx+2];
 +            jnrD             = jjnr[jidx+3];
 +            j_coord_offsetA  = DIM*jnrA;
 +            j_coord_offsetB  = DIM*jnrB;
 +            j_coord_offsetC  = DIM*jnrC;
 +            j_coord_offsetD  = DIM*jnrD;
 +
 +            /* load j atom coordinates */
 +            gmx_mm256_load_1rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
 +                                                 x+j_coord_offsetC,x+j_coord_offsetD,
 +                                                 &jx0,&jy0,&jz0);
 +
 +            /* Calculate displacement vector */
 +            dx00             = _mm256_sub_pd(ix0,jx0);
 +            dy00             = _mm256_sub_pd(iy0,jy0);
 +            dz00             = _mm256_sub_pd(iz0,jz0);
 +
 +            /* Calculate squared distance and things based on it */
 +            rsq00            = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
 +
 +            rinv00           = gmx_mm256_invsqrt_pd(rsq00);
 +
 +            /* Load parameters for j particles */
 +            jq0              = gmx_mm256_load_4real_swizzle_pd(charge+jnrA+0,charge+jnrB+0,
 +                                                                 charge+jnrC+0,charge+jnrD+0);
 +            isaj0            = gmx_mm256_load_4real_swizzle_pd(invsqrta+jnrA+0,invsqrta+jnrB+0,
 +                                                                 invsqrta+jnrC+0,invsqrta+jnrD+0);
 +            vdwjidx0A        = 2*vdwtype[jnrA+0];
 +            vdwjidx0B        = 2*vdwtype[jnrB+0];
 +            vdwjidx0C        = 2*vdwtype[jnrC+0];
 +            vdwjidx0D        = 2*vdwtype[jnrD+0];
 +
 +            /**************************
 +             * CALCULATE INTERACTIONS *
 +             **************************/
 +
 +            r00              = _mm256_mul_pd(rsq00,rinv00);
 +
 +            /* Compute parameters for interactions between i and j atoms */
 +            qq00             = _mm256_mul_pd(iq0,jq0);
 +            gmx_mm256_load_4pair_swizzle_pd(vdwioffsetptr0+vdwjidx0A,
 +                                            vdwioffsetptr0+vdwjidx0B,
 +                                            vdwioffsetptr0+vdwjidx0C,
 +                                            vdwioffsetptr0+vdwjidx0D,
 +                                            &c6_00,&c12_00);
 +
 +            /* Calculate table index by multiplying r with table scale and truncate to integer */
 +            rt               = _mm256_mul_pd(r00,vftabscale);
 +            vfitab           = _mm256_cvttpd_epi32(rt);
 +            vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
 +            vfitab           = _mm_slli_epi32(vfitab,3);
 +
 +            /* GENERALIZED BORN AND COULOMB ELECTROSTATICS */
 +            isaprod          = _mm256_mul_pd(isai0,isaj0);
 +            gbqqfactor       = _mm256_xor_pd(signbit,_mm256_mul_pd(qq00,_mm256_mul_pd(isaprod,gbinvepsdiff)));
 +            gbscale          = _mm256_mul_pd(isaprod,gbtabscale);
 +
 +            /* Calculate generalized born table index - this is a separate table from the normal one,
 +             * but we use the same procedure by multiplying r with scale and truncating to integer.
 +             */
 +            rt               = _mm256_mul_pd(r00,gbscale);
 +            gbitab           = _mm256_cvttpd_epi32(rt);
 +            gbeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
 +            gbitab           = _mm_slli_epi32(gbitab,2);
 +            Y                = _mm256_load_pd( gbtab + _mm_extract_epi32(gbitab,0) );
 +            F                = _mm256_load_pd( gbtab + _mm_extract_epi32(gbitab,1) );
 +            G                = _mm256_load_pd( gbtab + _mm_extract_epi32(gbitab,2) );
 +            H                = _mm256_load_pd( gbtab + _mm_extract_epi32(gbitab,3) );
 +            GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
 +            Heps             = _mm256_mul_pd(gbeps,H);
 +            Fp               = _mm256_add_pd(F,_mm256_mul_pd(gbeps,_mm256_add_pd(G,Heps)));
 +            VV               = _mm256_add_pd(Y,_mm256_mul_pd(gbeps,Fp));
 +            vgb              = _mm256_mul_pd(gbqqfactor,VV);
 +
 +            FF               = _mm256_add_pd(Fp,_mm256_mul_pd(gbeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
 +            fgb              = _mm256_mul_pd(gbqqfactor,_mm256_mul_pd(FF,gbscale));
 +            dvdatmp          = _mm256_mul_pd(minushalf,_mm256_add_pd(vgb,_mm256_mul_pd(fgb,r00)));
 +            dvdasum          = _mm256_add_pd(dvdasum,dvdatmp);
 +            fjptrA           = dvda+jnrA;
 +            fjptrB           = dvda+jnrB;
 +            fjptrC           = dvda+jnrC;
 +            fjptrD           = dvda+jnrD;
 +            gmx_mm256_increment_4real_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
 +                                                 _mm256_mul_pd(dvdatmp,_mm256_mul_pd(isaj0,isaj0)));
 +            velec            = _mm256_mul_pd(qq00,rinv00);
 +            felec            = _mm256_mul_pd(_mm256_sub_pd(_mm256_mul_pd(velec,rinv00),fgb),rinv00);
 +
 +            /* CUBIC SPLINE TABLE DISPERSION */
 +            Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
 +            F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
 +            G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
 +            H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
 +            GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
 +            Heps             = _mm256_mul_pd(vfeps,H);
 +            Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
 +            FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
 +            fvdw6            = _mm256_mul_pd(c6_00,FF);
 +
 +            /* CUBIC SPLINE TABLE REPULSION */
 +            vfitab           = _mm_add_epi32(vfitab,ifour);
 +            Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
 +            F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
 +            G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
 +            H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
 +            GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
 +            Heps             = _mm256_mul_pd(vfeps,H);
 +            Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
 +            FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
 +            fvdw12           = _mm256_mul_pd(c12_00,FF);
 +            fvdw             = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_add_pd(fvdw6,fvdw12),_mm256_mul_pd(vftabscale,rinv00)));
 +
 +            fscal            = _mm256_add_pd(felec,fvdw);
 +
 +            /* Calculate temporary vectorial force */
 +            tx               = _mm256_mul_pd(fscal,dx00);
 +            ty               = _mm256_mul_pd(fscal,dy00);
 +            tz               = _mm256_mul_pd(fscal,dz00);
 +
 +            /* Update vectorial force */
 +            fix0             = _mm256_add_pd(fix0,tx);
 +            fiy0             = _mm256_add_pd(fiy0,ty);
 +            fiz0             = _mm256_add_pd(fiz0,tz);
 +
 +            fjptrA             = f+j_coord_offsetA;
 +            fjptrB             = f+j_coord_offsetB;
 +            fjptrC             = f+j_coord_offsetC;
 +            fjptrD             = f+j_coord_offsetD;
 +            gmx_mm256_decrement_1rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
 +
 +            /* Inner loop uses 81 flops */
 +        }
 +
 +        if(jidx<j_index_end)
 +        {
 +
 +            /* Get j neighbor index, and coordinate index */
 +            jnrlistA         = jjnr[jidx];
 +            jnrlistB         = jjnr[jidx+1];
 +            jnrlistC         = jjnr[jidx+2];
 +            jnrlistD         = jjnr[jidx+3];
 +            /* Sign of each element will be negative for non-real atoms.
 +             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
 +             * so use it as val = _mm256_andnot_pd(mask,val) to clear dummy entries.
 +             */
 +            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 +
 +            tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
 +            tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
 +            dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
 +
 +            jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
 +            jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
 +            jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
 +            jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
 +            j_coord_offsetA  = DIM*jnrA;
 +            j_coord_offsetB  = DIM*jnrB;
 +            j_coord_offsetC  = DIM*jnrC;
 +            j_coord_offsetD  = DIM*jnrD;
 +
 +            /* load j atom coordinates */
 +            gmx_mm256_load_1rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
 +                                                 x+j_coord_offsetC,x+j_coord_offsetD,
 +                                                 &jx0,&jy0,&jz0);
 +
 +            /* Calculate displacement vector */
 +            dx00             = _mm256_sub_pd(ix0,jx0);
 +            dy00             = _mm256_sub_pd(iy0,jy0);
 +            dz00             = _mm256_sub_pd(iz0,jz0);
 +
 +            /* Calculate squared distance and things based on it */
 +            rsq00            = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
 +
 +            rinv00           = gmx_mm256_invsqrt_pd(rsq00);
 +
 +            /* Load parameters for j particles */
 +            jq0              = gmx_mm256_load_4real_swizzle_pd(charge+jnrA+0,charge+jnrB+0,
 +                                                                 charge+jnrC+0,charge+jnrD+0);
 +            isaj0            = gmx_mm256_load_4real_swizzle_pd(invsqrta+jnrA+0,invsqrta+jnrB+0,
 +                                                                 invsqrta+jnrC+0,invsqrta+jnrD+0);
 +            vdwjidx0A        = 2*vdwtype[jnrA+0];
 +            vdwjidx0B        = 2*vdwtype[jnrB+0];
 +            vdwjidx0C        = 2*vdwtype[jnrC+0];
 +            vdwjidx0D        = 2*vdwtype[jnrD+0];
 +
 +            /**************************
 +             * CALCULATE INTERACTIONS *
 +             **************************/
 +
 +            r00              = _mm256_mul_pd(rsq00,rinv00);
 +            r00              = _mm256_andnot_pd(dummy_mask,r00);
 +
 +            /* Compute parameters for interactions between i and j atoms */
 +            qq00             = _mm256_mul_pd(iq0,jq0);
 +            gmx_mm256_load_4pair_swizzle_pd(vdwioffsetptr0+vdwjidx0A,
 +                                            vdwioffsetptr0+vdwjidx0B,
 +                                            vdwioffsetptr0+vdwjidx0C,
 +                                            vdwioffsetptr0+vdwjidx0D,
 +                                            &c6_00,&c12_00);
 +
 +            /* Calculate table index by multiplying r with table scale and truncate to integer */
 +            rt               = _mm256_mul_pd(r00,vftabscale);
 +            vfitab           = _mm256_cvttpd_epi32(rt);
 +            vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
 +            vfitab           = _mm_slli_epi32(vfitab,3);
 +
 +            /* GENERALIZED BORN AND COULOMB ELECTROSTATICS */
 +            isaprod          = _mm256_mul_pd(isai0,isaj0);
 +            gbqqfactor       = _mm256_xor_pd(signbit,_mm256_mul_pd(qq00,_mm256_mul_pd(isaprod,gbinvepsdiff)));
 +            gbscale          = _mm256_mul_pd(isaprod,gbtabscale);
 +
 +            /* Calculate generalized born table index - this is a separate table from the normal one,
 +             * but we use the same procedure by multiplying r with scale and truncating to integer.
 +             */
 +            rt               = _mm256_mul_pd(r00,gbscale);
 +            gbitab           = _mm256_cvttpd_epi32(rt);
 +            gbeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
 +            gbitab           = _mm_slli_epi32(gbitab,2);
 +            Y                = _mm256_load_pd( gbtab + _mm_extract_epi32(gbitab,0) );
 +            F                = _mm256_load_pd( gbtab + _mm_extract_epi32(gbitab,1) );
 +            G                = _mm256_load_pd( gbtab + _mm_extract_epi32(gbitab,2) );
 +            H                = _mm256_load_pd( gbtab + _mm_extract_epi32(gbitab,3) );
 +            GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
 +            Heps             = _mm256_mul_pd(gbeps,H);
 +            Fp               = _mm256_add_pd(F,_mm256_mul_pd(gbeps,_mm256_add_pd(G,Heps)));
 +            VV               = _mm256_add_pd(Y,_mm256_mul_pd(gbeps,Fp));
 +            vgb              = _mm256_mul_pd(gbqqfactor,VV);
 +
 +            FF               = _mm256_add_pd(Fp,_mm256_mul_pd(gbeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
 +            fgb              = _mm256_mul_pd(gbqqfactor,_mm256_mul_pd(FF,gbscale));
 +            dvdatmp          = _mm256_mul_pd(minushalf,_mm256_add_pd(vgb,_mm256_mul_pd(fgb,r00)));
++            dvdatmp          = _mm256_andnot_pd(dummy_mask,dvdatmp);
 +            dvdasum          = _mm256_add_pd(dvdasum,dvdatmp);
 +            /* The pointers to scratch make sure that this code really can't screw things up, even with compilers that take gmx_restrict seriously (e.g. icc 13). */
 +            fjptrA             = (jnrlistA>=0) ? dvda+jnrA : scratch;
 +            fjptrB             = (jnrlistB>=0) ? dvda+jnrB : scratch;
 +            fjptrC             = (jnrlistC>=0) ? dvda+jnrC : scratch;
 +            fjptrD             = (jnrlistD>=0) ? dvda+jnrD : scratch;
 +            gmx_mm256_increment_4real_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
 +                                                 _mm256_mul_pd(dvdatmp,_mm256_mul_pd(isaj0,isaj0)));
 +            velec            = _mm256_mul_pd(qq00,rinv00);
 +            felec            = _mm256_mul_pd(_mm256_sub_pd(_mm256_mul_pd(velec,rinv00),fgb),rinv00);
 +
 +            /* CUBIC SPLINE TABLE DISPERSION */
 +            Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
 +            F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
 +            G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
 +            H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
 +            GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
 +            Heps             = _mm256_mul_pd(vfeps,H);
 +            Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
 +            FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
 +            fvdw6            = _mm256_mul_pd(c6_00,FF);
 +
 +            /* CUBIC SPLINE TABLE REPULSION */
 +            vfitab           = _mm_add_epi32(vfitab,ifour);
 +            Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
 +            F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
 +            G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
 +            H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
 +            GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
 +            Heps             = _mm256_mul_pd(vfeps,H);
 +            Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
 +            FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
 +            fvdw12           = _mm256_mul_pd(c12_00,FF);
 +            fvdw             = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_add_pd(fvdw6,fvdw12),_mm256_mul_pd(vftabscale,rinv00)));
 +
 +            fscal            = _mm256_add_pd(felec,fvdw);
 +
 +            fscal            = _mm256_andnot_pd(dummy_mask,fscal);
 +
 +            /* Calculate temporary vectorial force */
 +            tx               = _mm256_mul_pd(fscal,dx00);
 +            ty               = _mm256_mul_pd(fscal,dy00);
 +            tz               = _mm256_mul_pd(fscal,dz00);
 +
 +            /* Update vectorial force */
 +            fix0             = _mm256_add_pd(fix0,tx);
 +            fiy0             = _mm256_add_pd(fiy0,ty);
 +            fiz0             = _mm256_add_pd(fiz0,tz);
 +
 +            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
 +            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
 +            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
 +            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
 +            gmx_mm256_decrement_1rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
 +
 +            /* Inner loop uses 82 flops */
 +        }
 +
 +        /* End of innermost loop */
 +
 +        gmx_mm256_update_iforce_1atom_swizzle_pd(fix0,fiy0,fiz0,
 +                                                 f+i_coord_offset,fshift+i_shift_offset);
 +
 +        dvdasum = _mm256_mul_pd(dvdasum, _mm256_mul_pd(isai0,isai0));
 +        gmx_mm256_update_1pot_pd(dvdasum,dvda+inr);
 +
 +        /* Increment number of inner iterations */
 +        inneriter                  += j_index_end - j_index_start;
 +
 +        /* Outer loop uses 7 flops */
 +    }
 +
 +    /* Increment number of outer iterations */
 +    outeriter        += nri;
 +
 +    /* Update outer/inner flops */
 +
 +    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_F,outeriter*7 + inneriter*82);
 +}
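
As a side note on the masked epilogues above: a minimal self-contained sketch of the padding trick (hypothetical helper, not part of the patch) is

    #include <immintrin.h>

    /* Zero the SIMD lanes whose neighbor-list entries are negative padding,
     * the same branch-free trick the kernels apply via dummy_mask and
     * _mm256_andnot_pd().
     */
    static __m256d
    clear_padding_lanes(const int *jjnr4, __m256d val)
    {
        /* 32-bit all-ones where jjnr4[k] < 0 */
        __m128 m32 = _mm_castsi128_ps(
            _mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)jjnr4),
                            _mm_setzero_si128()));
        /* duplicate each 32-bit mask into a 64-bit double lane */
        __m128  lo   = _mm_permute_ps(m32, _MM_SHUFFLE(1,1,0,0));
        __m128  hi   = _mm_permute_ps(m32, _MM_SHUFFLE(3,3,2,2));
        __m256d mask = _mm256_castps_pd(
            _mm256_insertf128_ps(_mm256_castps128_ps256(lo), hi, 1));
        return _mm256_andnot_pd(mask, val);   /* padded lanes become 0.0 */
    }

Clamping the padded indices to 0 keeps the gathers in bounds, while writes for those lanes are redirected to the local scratch buffer, so the epilogue needs no per-lane branches.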
index abb078218264047f347d7ae3230a47d36d410667,0000000000000000000000000000000000000000..9edb654bddf411c67fef4e81b81692ab21941775
mode 100644,000000..100644
--- /dev/null
@@@ -1,855 -1,0 +1,857 @@@
 +/*
 + * Note: this file was generated by the Gromacs avx_256_double kernel generator.
 + *
 + *                This source code is part of
 + *
 + *                 G   R   O   M   A   C   S
 + *
 + * Copyright (c) 2001-2012, The GROMACS Development Team
 + *
 + * Gromacs is a library for molecular simulation and trajectory analysis,
 + * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
 + * a full list of developers and information, check out http://www.gromacs.org
 + *
 + * This program is free software; you can redistribute it and/or modify it under
 + * the terms of the GNU Lesser General Public License as published by the Free
 + * Software Foundation; either version 2 of the License, or (at your option) any
 + * later version.
 + *
 + * To help fund GROMACS development, we humbly ask that you cite
 + * the papers people have written on it - you can find them on the website.
 + */
 +#ifdef HAVE_CONFIG_H
 +#include <config.h>
 +#endif
 +
 +#include <math.h>
 +
 +#include "../nb_kernel.h"
 +#include "types/simple.h"
 +#include "vec.h"
 +#include "nrnb.h"
 +
 +#include "gmx_math_x86_avx_256_double.h"
 +#include "kernelutil_x86_avx_256_double.h"
 +
 +/*
 + * Gromacs nonbonded kernel:   nb_kernel_ElecGB_VdwLJ_GeomP1P1_VF_avx_256_double
 + * Electrostatics interaction: GeneralizedBorn
 + * VdW interaction:            LennardJones
 + * Geometry:                   Particle-Particle
 + * Calculate force/pot:        PotentialAndForce
 + */
 +void
 +nb_kernel_ElecGB_VdwLJ_GeomP1P1_VF_avx_256_double
 +                    (t_nblist * gmx_restrict                nlist,
 +                     rvec * gmx_restrict                    xx,
 +                     rvec * gmx_restrict                    ff,
 +                     t_forcerec * gmx_restrict              fr,
 +                     t_mdatoms * gmx_restrict               mdatoms,
 +                     nb_kernel_data_t * gmx_restrict        kernel_data,
 +                     t_nrnb * gmx_restrict                  nrnb)
 +{
 +    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
 +     * just 0 for non-waters.
 +     * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
 +     * jnr indices corresponding to data put in the four positions in the SIMD register.
 +     */
 +    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
 +    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
 +    int              jnrA,jnrB,jnrC,jnrD;
 +    int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
 +    int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
 +    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
 +    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
 +    real             rcutoff_scalar;
 +    real             *shiftvec,*fshift,*x,*f;
 +    real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
 +    real             scratch[4*DIM];
 +    __m256d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
 +    real *           vdwioffsetptr0;
 +    __m256d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
 +    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
 +    __m256d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
 +    __m256d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
 +    __m256d          velec,felec,velecsum,facel,crf,krf,krf2;
 +    real             *charge;
 +    __m128i          gbitab;
 +    __m256d          vgb,fgb,vgbsum,dvdasum,gbscale,gbtabscale,isaprod,gbqqfactor,gbinvepsdiff,gbeps,dvdatmp;
 +    __m256d          minushalf = _mm256_set1_pd(-0.5);
 +    real             *invsqrta,*dvda,*gbtab;
 +    int              nvdwtype;
 +    __m256d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
 +    int              *vdwtype;
 +    real             *vdwparam;
 +    __m256d          one_sixth   = _mm256_set1_pd(1.0/6.0);
 +    __m256d          one_twelfth = _mm256_set1_pd(1.0/12.0);
 +    __m128i          vfitab;
 +    __m128i          ifour       = _mm_set1_epi32(4);
 +    __m256d          rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
 +    real             *vftab;
 +    __m256d          dummy_mask,cutoff_mask;
 +    __m128           tmpmask0,tmpmask1;
 +    __m256d          signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
 +    __m256d          one     = _mm256_set1_pd(1.0);
 +    __m256d          two     = _mm256_set1_pd(2.0);
 +    x                = xx[0];
 +    f                = ff[0];
 +
 +    nri              = nlist->nri;
 +    iinr             = nlist->iinr;
 +    jindex           = nlist->jindex;
 +    jjnr             = nlist->jjnr;
 +    shiftidx         = nlist->shift;
 +    gid              = nlist->gid;
 +    shiftvec         = fr->shift_vec[0];
 +    fshift           = fr->fshift[0];
 +    facel            = _mm256_set1_pd(fr->epsfac);
 +    charge           = mdatoms->chargeA;
 +    nvdwtype         = fr->ntype;
 +    vdwparam         = fr->nbfp;
 +    vdwtype          = mdatoms->typeA;
 +
 +    invsqrta         = fr->invsqrta;
 +    dvda             = fr->dvda;
 +    gbtabscale       = _mm256_set1_pd(fr->gbtab.scale);
 +    gbtab            = fr->gbtab.data;
 +    gbinvepsdiff     = _mm256_set1_pd((1.0/fr->epsilon_r) - (1.0/fr->gb_epsilon_solvent));
 +
 +    /* Avoid stupid compiler warnings */
 +    jnrA = jnrB = jnrC = jnrD = 0;
 +    j_coord_offsetA = 0;
 +    j_coord_offsetB = 0;
 +    j_coord_offsetC = 0;
 +    j_coord_offsetD = 0;
 +
 +    outeriter        = 0;
 +    inneriter        = 0;
 +
 +    for(iidx=0;iidx<4*DIM;iidx++)
 +    {
 +        scratch[iidx] = 0.0;
 +    }
 +
 +    /* Start outer loop over neighborlists */
 +    for(iidx=0; iidx<nri; iidx++)
 +    {
 +        /* Load shift vector for this list */
 +        i_shift_offset   = DIM*shiftidx[iidx];
 +
 +        /* Load limits for loop over neighbors */
 +        j_index_start    = jindex[iidx];
 +        j_index_end      = jindex[iidx+1];
 +
 +        /* Get outer coordinate index */
 +        inr              = iinr[iidx];
 +        i_coord_offset   = DIM*inr;
 +
 +        /* Load i particle coords and add shift vector */
 +        gmx_mm256_load_shift_and_1rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
 +
 +        fix0             = _mm256_setzero_pd();
 +        fiy0             = _mm256_setzero_pd();
 +        fiz0             = _mm256_setzero_pd();
 +
 +        /* Load parameters for i particles */
 +        iq0              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+0]));
 +        isai0            = _mm256_set1_pd(invsqrta[inr+0]);
 +        vdwioffsetptr0   = vdwparam+2*nvdwtype*vdwtype[inr+0];
 +
 +        /* Reset potential sums */
 +        velecsum         = _mm256_setzero_pd();
 +        vgbsum           = _mm256_setzero_pd();
 +        vvdwsum          = _mm256_setzero_pd();
 +        dvdasum          = _mm256_setzero_pd();
 +
 +        /* Start inner kernel loop */
 +        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
 +        {
 +
 +            /* Get j neighbor index, and coordinate index */
 +            jnrA             = jjnr[jidx];
 +            jnrB             = jjnr[jidx+1];
 +            jnrC             = jjnr[jidx+2];
 +            jnrD             = jjnr[jidx+3];
 +            j_coord_offsetA  = DIM*jnrA;
 +            j_coord_offsetB  = DIM*jnrB;
 +            j_coord_offsetC  = DIM*jnrC;
 +            j_coord_offsetD  = DIM*jnrD;
 +
 +            /* load j atom coordinates */
 +            gmx_mm256_load_1rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
 +                                                 x+j_coord_offsetC,x+j_coord_offsetD,
 +                                                 &jx0,&jy0,&jz0);
 +
 +            /* Calculate displacement vector */
 +            dx00             = _mm256_sub_pd(ix0,jx0);
 +            dy00             = _mm256_sub_pd(iy0,jy0);
 +            dz00             = _mm256_sub_pd(iz0,jz0);
 +
 +            /* Calculate squared distance and things based on it */
 +            rsq00            = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
 +
 +            rinv00           = gmx_mm256_invsqrt_pd(rsq00);
 +
 +            rinvsq00         = _mm256_mul_pd(rinv00,rinv00);
 +
 +            /* Load parameters for j particles */
 +            jq0              = gmx_mm256_load_4real_swizzle_pd(charge+jnrA+0,charge+jnrB+0,
 +                                                                 charge+jnrC+0,charge+jnrD+0);
 +            isaj0            = gmx_mm256_load_4real_swizzle_pd(invsqrta+jnrA+0,invsqrta+jnrB+0,
 +                                                                 invsqrta+jnrC+0,invsqrta+jnrD+0);
 +            vdwjidx0A        = 2*vdwtype[jnrA+0];
 +            vdwjidx0B        = 2*vdwtype[jnrB+0];
 +            vdwjidx0C        = 2*vdwtype[jnrC+0];
 +            vdwjidx0D        = 2*vdwtype[jnrD+0];
 +
 +            /**************************
 +             * CALCULATE INTERACTIONS *
 +             **************************/
 +
 +            r00              = _mm256_mul_pd(rsq00,rinv00);
 +
 +            /* Compute parameters for interactions between i and j atoms */
 +            qq00             = _mm256_mul_pd(iq0,jq0);
 +            gmx_mm256_load_4pair_swizzle_pd(vdwioffsetptr0+vdwjidx0A,
 +                                            vdwioffsetptr0+vdwjidx0B,
 +                                            vdwioffsetptr0+vdwjidx0C,
 +                                            vdwioffsetptr0+vdwjidx0D,
 +                                            &c6_00,&c12_00);
 +
 +            /* GENERALIZED BORN AND COULOMB ELECTROSTATICS */
 +            isaprod          = _mm256_mul_pd(isai0,isaj0);
 +            gbqqfactor       = _mm256_xor_pd(signbit,_mm256_mul_pd(qq00,_mm256_mul_pd(isaprod,gbinvepsdiff)));
 +            gbscale          = _mm256_mul_pd(isaprod,gbtabscale);
 +
 +            /* Calculate generalized born table index - this is a separate table from the normal one,
 +             * but we use the same procedure by multiplying r with scale and truncating to integer.
 +             */
 +            rt               = _mm256_mul_pd(r00,gbscale);
 +            gbitab           = _mm256_cvttpd_epi32(rt);
 +            gbeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
 +            gbitab           = _mm_slli_epi32(gbitab,2);
 +            Y                = _mm256_load_pd( gbtab + _mm_extract_epi32(gbitab,0) );
 +            F                = _mm256_load_pd( gbtab + _mm_extract_epi32(gbitab,1) );
 +            G                = _mm256_load_pd( gbtab + _mm_extract_epi32(gbitab,2) );
 +            H                = _mm256_load_pd( gbtab + _mm_extract_epi32(gbitab,3) );
 +            GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
 +            Heps             = _mm256_mul_pd(gbeps,H);
 +            Fp               = _mm256_add_pd(F,_mm256_mul_pd(gbeps,_mm256_add_pd(G,Heps)));
 +            VV               = _mm256_add_pd(Y,_mm256_mul_pd(gbeps,Fp));
 +            vgb              = _mm256_mul_pd(gbqqfactor,VV);
 +
 +            FF               = _mm256_add_pd(Fp,_mm256_mul_pd(gbeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
 +            fgb              = _mm256_mul_pd(gbqqfactor,_mm256_mul_pd(FF,gbscale));
 +            dvdatmp          = _mm256_mul_pd(minushalf,_mm256_add_pd(vgb,_mm256_mul_pd(fgb,r00)));
 +            dvdasum          = _mm256_add_pd(dvdasum,dvdatmp);
 +            fjptrA           = dvda+jnrA;
 +            fjptrB           = dvda+jnrB;
 +            fjptrC           = dvda+jnrC;
 +            fjptrD           = dvda+jnrD;
 +            gmx_mm256_increment_4real_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
 +                                                 _mm256_mul_pd(dvdatmp,_mm256_mul_pd(isaj0,isaj0)));
 +            velec            = _mm256_mul_pd(qq00,rinv00);
 +            felec            = _mm256_mul_pd(_mm256_sub_pd(_mm256_mul_pd(velec,rinv00),fgb),rinv00);
 +
 +            /* LENNARD-JONES DISPERSION/REPULSION */
 +
 +            rinvsix          = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
 +            vvdw6            = _mm256_mul_pd(c6_00,rinvsix);
 +            vvdw12           = _mm256_mul_pd(c12_00,_mm256_mul_pd(rinvsix,rinvsix));
 +            vvdw             = _mm256_sub_pd( _mm256_mul_pd(vvdw12,one_twelfth) , _mm256_mul_pd(vvdw6,one_sixth) );
 +            fvdw             = _mm256_mul_pd(_mm256_sub_pd(vvdw12,vvdw6),rinvsq00);
 +
 +            /* Update potential sum for this i atom from the interaction with this j atom. */
 +            velecsum         = _mm256_add_pd(velecsum,velec);
 +            vgbsum           = _mm256_add_pd(vgbsum,vgb);
 +            vvdwsum          = _mm256_add_pd(vvdwsum,vvdw);
 +
 +            fscal            = _mm256_add_pd(felec,fvdw);
 +
 +            /* Calculate temporary vectorial force */
 +            tx               = _mm256_mul_pd(fscal,dx00);
 +            ty               = _mm256_mul_pd(fscal,dy00);
 +            tz               = _mm256_mul_pd(fscal,dz00);
 +
 +            /* Update vectorial force */
 +            fix0             = _mm256_add_pd(fix0,tx);
 +            fiy0             = _mm256_add_pd(fiy0,ty);
 +            fiz0             = _mm256_add_pd(fiz0,tz);
 +
 +            fjptrA             = f+j_coord_offsetA;
 +            fjptrB             = f+j_coord_offsetB;
 +            fjptrC             = f+j_coord_offsetC;
 +            fjptrD             = f+j_coord_offsetD;
 +            gmx_mm256_decrement_1rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
 +
 +            /* Inner loop uses 70 flops */
 +        }
 +
 +        if(jidx<j_index_end)
 +        {
 +
 +            /* Get j neighbor index, and coordinate index */
 +            jnrlistA         = jjnr[jidx];
 +            jnrlistB         = jjnr[jidx+1];
 +            jnrlistC         = jjnr[jidx+2];
 +            jnrlistD         = jjnr[jidx+3];
 +            /* Sign of each element will be negative for non-real atoms.
 +             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
 +             * so use it as val = _mm256_andnot_pd(mask,val) to clear dummy entries.
 +             */
 +            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 +
 +            tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
 +            tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
 +            dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
 +
 +            jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
 +            jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
 +            jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
 +            jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
 +            j_coord_offsetA  = DIM*jnrA;
 +            j_coord_offsetB  = DIM*jnrB;
 +            j_coord_offsetC  = DIM*jnrC;
 +            j_coord_offsetD  = DIM*jnrD;
 +
 +            /* load j atom coordinates */
 +            gmx_mm256_load_1rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
 +                                                 x+j_coord_offsetC,x+j_coord_offsetD,
 +                                                 &jx0,&jy0,&jz0);
 +
 +            /* Calculate displacement vector */
 +            dx00             = _mm256_sub_pd(ix0,jx0);
 +            dy00             = _mm256_sub_pd(iy0,jy0);
 +            dz00             = _mm256_sub_pd(iz0,jz0);
 +
 +            /* Calculate squared distance and things based on it */
 +            rsq00            = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
 +
 +            rinv00           = gmx_mm256_invsqrt_pd(rsq00);
 +
 +            rinvsq00         = _mm256_mul_pd(rinv00,rinv00);
 +
 +            /* Load parameters for j particles */
 +            jq0              = gmx_mm256_load_4real_swizzle_pd(charge+jnrA+0,charge+jnrB+0,
 +                                                                 charge+jnrC+0,charge+jnrD+0);
 +            isaj0            = gmx_mm256_load_4real_swizzle_pd(invsqrta+jnrA+0,invsqrta+jnrB+0,
 +                                                                 invsqrta+jnrC+0,invsqrta+jnrD+0);
 +            vdwjidx0A        = 2*vdwtype[jnrA+0];
 +            vdwjidx0B        = 2*vdwtype[jnrB+0];
 +            vdwjidx0C        = 2*vdwtype[jnrC+0];
 +            vdwjidx0D        = 2*vdwtype[jnrD+0];
 +
 +            /**************************
 +             * CALCULATE INTERACTIONS *
 +             **************************/
 +
 +            r00              = _mm256_mul_pd(rsq00,rinv00);
 +            r00              = _mm256_andnot_pd(dummy_mask,r00);
 +
 +            /* Compute parameters for interactions between i and j atoms */
 +            qq00             = _mm256_mul_pd(iq0,jq0);
 +            gmx_mm256_load_4pair_swizzle_pd(vdwioffsetptr0+vdwjidx0A,
 +                                            vdwioffsetptr0+vdwjidx0B,
 +                                            vdwioffsetptr0+vdwjidx0C,
 +                                            vdwioffsetptr0+vdwjidx0D,
 +                                            &c6_00,&c12_00);
 +
 +            /* GENERALIZED BORN AND COULOMB ELECTROSTATICS */
 +            isaprod          = _mm256_mul_pd(isai0,isaj0);
 +            gbqqfactor       = _mm256_xor_pd(signbit,_mm256_mul_pd(qq00,_mm256_mul_pd(isaprod,gbinvepsdiff)));
 +            gbscale          = _mm256_mul_pd(isaprod,gbtabscale);
 +
 +            /* Calculate generalized born table index - this is a separate table from the normal one,
 +             * but we use the same procedure by multiplying r with scale and truncating to integer.
 +             */
 +            rt               = _mm256_mul_pd(r00,gbscale);
 +            gbitab           = _mm256_cvttpd_epi32(rt);
 +            gbeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
 +            gbitab           = _mm_slli_epi32(gbitab,2);
 +            Y                = _mm256_load_pd( gbtab + _mm_extract_epi32(gbitab,0) );
 +            F                = _mm256_load_pd( gbtab + _mm_extract_epi32(gbitab,1) );
 +            G                = _mm256_load_pd( gbtab + _mm_extract_epi32(gbitab,2) );
 +            H                = _mm256_load_pd( gbtab + _mm_extract_epi32(gbitab,3) );
 +            GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
 +            Heps             = _mm256_mul_pd(gbeps,H);
 +            Fp               = _mm256_add_pd(F,_mm256_mul_pd(gbeps,_mm256_add_pd(G,Heps)));
 +            VV               = _mm256_add_pd(Y,_mm256_mul_pd(gbeps,Fp));
 +            vgb              = _mm256_mul_pd(gbqqfactor,VV);
 +
 +            FF               = _mm256_add_pd(Fp,_mm256_mul_pd(gbeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
 +            fgb              = _mm256_mul_pd(gbqqfactor,_mm256_mul_pd(FF,gbscale));
 +            dvdatmp          = _mm256_mul_pd(minushalf,_mm256_add_pd(vgb,_mm256_mul_pd(fgb,r00)));
++            dvdatmp          = _mm256_andnot_pd(dummy_mask,dvdatmp);
 +            dvdasum          = _mm256_add_pd(dvdasum,dvdatmp);
 +            /* The pointers to scratch redirect dummy entries to a harmless buffer, so that even compilers that take gmx_restrict seriously (e.g. icc 13) really can't screw things up. */
 +            fjptrA             = (jnrlistA>=0) ? dvda+jnrA : scratch;
 +            fjptrB             = (jnrlistB>=0) ? dvda+jnrB : scratch;
 +            fjptrC             = (jnrlistC>=0) ? dvda+jnrC : scratch;
 +            fjptrD             = (jnrlistD>=0) ? dvda+jnrD : scratch;
 +            gmx_mm256_increment_4real_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
 +                                                 _mm256_mul_pd(dvdatmp,_mm256_mul_pd(isaj0,isaj0)));
 +            velec            = _mm256_mul_pd(qq00,rinv00);
 +            felec            = _mm256_mul_pd(_mm256_sub_pd(_mm256_mul_pd(velec,rinv00),fgb),rinv00);
 +
 +            /* LENNARD-JONES DISPERSION/REPULSION */
 +
 +            rinvsix          = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
 +            vvdw6            = _mm256_mul_pd(c6_00,rinvsix);
 +            vvdw12           = _mm256_mul_pd(c12_00,_mm256_mul_pd(rinvsix,rinvsix));
 +            vvdw             = _mm256_sub_pd( _mm256_mul_pd(vvdw12,one_twelfth) , _mm256_mul_pd(vvdw6,one_sixth) );
 +            fvdw             = _mm256_mul_pd(_mm256_sub_pd(vvdw12,vvdw6),rinvsq00);
 +
 +            /* Update potential sum for this i atom from the interaction with this j atom. */
 +            velec            = _mm256_andnot_pd(dummy_mask,velec);
 +            velecsum         = _mm256_add_pd(velecsum,velec);
 +            vgb              = _mm256_andnot_pd(dummy_mask,vgb);
 +            vgbsum           = _mm256_add_pd(vgbsum,vgb);
 +            vvdw             = _mm256_andnot_pd(dummy_mask,vvdw);
 +            vvdwsum          = _mm256_add_pd(vvdwsum,vvdw);
 +
 +            fscal            = _mm256_add_pd(felec,fvdw);
 +
 +            fscal            = _mm256_andnot_pd(dummy_mask,fscal);
 +
 +            /* Calculate temporary vectorial force */
 +            tx               = _mm256_mul_pd(fscal,dx00);
 +            ty               = _mm256_mul_pd(fscal,dy00);
 +            tz               = _mm256_mul_pd(fscal,dz00);
 +
 +            /* Update vectorial force */
 +            fix0             = _mm256_add_pd(fix0,tx);
 +            fiy0             = _mm256_add_pd(fiy0,ty);
 +            fiz0             = _mm256_add_pd(fiz0,tz);
 +
 +            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
 +            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
 +            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
 +            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
 +            gmx_mm256_decrement_1rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
 +
 +            /* Inner loop uses 71 flops */
 +        }
 +
 +        /* End of innermost loop */
 +
 +        gmx_mm256_update_iforce_1atom_swizzle_pd(fix0,fiy0,fiz0,
 +                                                 f+i_coord_offset,fshift+i_shift_offset);
 +
 +        ggid                        = gid[iidx];
 +        /* Update potential energies */
 +        gmx_mm256_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
 +        gmx_mm256_update_1pot_pd(vgbsum,kernel_data->energygrp_polarization+ggid);
 +        gmx_mm256_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
 +        dvdasum = _mm256_mul_pd(dvdasum, _mm256_mul_pd(isai0,isai0));
 +        gmx_mm256_update_1pot_pd(dvdasum,dvda+inr);
 +
 +        /* Increment number of inner iterations */
 +        inneriter                  += j_index_end - j_index_start;
 +
 +        /* Outer loop uses 10 flops */
 +    }
 +
 +    /* Increment number of outer iterations */
 +    outeriter        += nri;
 +
 +    /* Update outer/inner flops */
 +
 +    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_VF,outeriter*10 + inneriter*71);
 +}
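 +
 +/* For reference, a minimal scalar sketch of the cubic-spline table lookup
 + * used for the Generalized Born term above (the Y/F/G/H quadruplets). This
 + * is a hypothetical helper, not part of the generated file; it mirrors the
 + * per-lane arithmetic of the AVX code, assuming the table is laid out as
 + * quadruplets of Y,F,G,H per table point, as gbtab is.
 + */
 +static void
 +gb_spline_lookup(const double *tab, double r, double scale,
 +                 double *vv, double *ff)
 +{
 +    double rt  = r*scale;          /* table coordinate                    */
 +    int    n   = (int)rt;          /* truncated table-point index         */
 +    double eps = rt - (double)n;   /* fractional offset within the bin    */
 +    double Y   = tab[4*n];
 +    double F   = tab[4*n+1];
 +    double G   = tab[4*n+2];
 +    double H   = tab[4*n+3];
 +    double Fp  = F + eps*(G + eps*H);
 +    *vv = Y + eps*Fp;               /* V       = Y + eps*F + eps^2*G + eps^3*H */
 +    *ff = Fp + eps*(G + 2.0*eps*H); /* dV/deps = F + 2*eps*G + 3*eps^2*H       */
 +}
 +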
 +/*
 + * Gromacs nonbonded kernel:   nb_kernel_ElecGB_VdwLJ_GeomP1P1_F_avx_256_double
 + * Electrostatics interaction: GeneralizedBorn
 + * VdW interaction:            LennardJones
 + * Geometry:                   Particle-Particle
 + * Calculate force/pot:        Force
 + */
 +void
 +nb_kernel_ElecGB_VdwLJ_GeomP1P1_F_avx_256_double
 +                    (t_nblist * gmx_restrict                nlist,
 +                     rvec * gmx_restrict                    xx,
 +                     rvec * gmx_restrict                    ff,
 +                     t_forcerec * gmx_restrict              fr,
 +                     t_mdatoms * gmx_restrict               mdatoms,
 +                     nb_kernel_data_t * gmx_restrict        kernel_data,
 +                     t_nrnb * gmx_restrict                  nrnb)
 +{
 +    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
 +     * just 0 for non-waters.
 +     * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
 +     * jnr indices corresponding to data put in the four positions in the SIMD register.
 +     */
 +    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
 +    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
 +    int              jnrA,jnrB,jnrC,jnrD;
 +    int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
 +    int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
 +    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
 +    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
 +    real             rcutoff_scalar;
 +    real             *shiftvec,*fshift,*x,*f;
 +    real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
 +    real             scratch[4*DIM];
 +    __m256d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
 +    real *           vdwioffsetptr0;
 +    __m256d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
 +    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
 +    __m256d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
 +    __m256d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
 +    __m256d          velec,felec,velecsum,facel,crf,krf,krf2;
 +    real             *charge;
 +    __m128i          gbitab;
 +    __m256d          vgb,fgb,vgbsum,dvdasum,gbscale,gbtabscale,isaprod,gbqqfactor,gbinvepsdiff,gbeps,dvdatmp;
 +    __m256d          minushalf = _mm256_set1_pd(-0.5);
 +    real             *invsqrta,*dvda,*gbtab;
 +    int              nvdwtype;
 +    __m256d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
 +    int              *vdwtype;
 +    real             *vdwparam;
 +    __m256d          one_sixth   = _mm256_set1_pd(1.0/6.0);
 +    __m256d          one_twelfth = _mm256_set1_pd(1.0/12.0);
 +    __m128i          vfitab;
 +    __m128i          ifour       = _mm_set1_epi32(4);
 +    __m256d          rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
 +    real             *vftab;
 +    __m256d          dummy_mask,cutoff_mask;
 +    __m128           tmpmask0,tmpmask1;
 +    __m256d          signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
 +    __m256d          one     = _mm256_set1_pd(1.0);
 +    __m256d          two     = _mm256_set1_pd(2.0);
 +    x                = xx[0];
 +    f                = ff[0];
 +
 +    nri              = nlist->nri;
 +    iinr             = nlist->iinr;
 +    jindex           = nlist->jindex;
 +    jjnr             = nlist->jjnr;
 +    shiftidx         = nlist->shift;
 +    gid              = nlist->gid;
 +    shiftvec         = fr->shift_vec[0];
 +    fshift           = fr->fshift[0];
 +    facel            = _mm256_set1_pd(fr->epsfac);
 +    charge           = mdatoms->chargeA;
 +    nvdwtype         = fr->ntype;
 +    vdwparam         = fr->nbfp;
 +    vdwtype          = mdatoms->typeA;
 +
 +    invsqrta         = fr->invsqrta;
 +    dvda             = fr->dvda;
 +    gbtabscale       = _mm256_set1_pd(fr->gbtab.scale);
 +    gbtab            = fr->gbtab.data;
 +    gbinvepsdiff     = _mm256_set1_pd((1.0/fr->epsilon_r) - (1.0/fr->gb_epsilon_solvent));
 +
 +    /* Avoid stupid compiler warnings */
 +    jnrA = jnrB = jnrC = jnrD = 0;
 +    j_coord_offsetA = 0;
 +    j_coord_offsetB = 0;
 +    j_coord_offsetC = 0;
 +    j_coord_offsetD = 0;
 +
 +    outeriter        = 0;
 +    inneriter        = 0;
 +
 +    for(iidx=0;iidx<4*DIM;iidx++)
 +    {
 +        scratch[iidx] = 0.0;
 +    }
 +
 +    /* Start outer loop over neighborlists */
 +    for(iidx=0; iidx<nri; iidx++)
 +    {
 +        /* Load shift vector for this list */
 +        i_shift_offset   = DIM*shiftidx[iidx];
 +
 +        /* Load limits for loop over neighbors */
 +        j_index_start    = jindex[iidx];
 +        j_index_end      = jindex[iidx+1];
 +
 +        /* Get outer coordinate index */
 +        inr              = iinr[iidx];
 +        i_coord_offset   = DIM*inr;
 +
 +        /* Load i particle coords and add shift vector */
 +        gmx_mm256_load_shift_and_1rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
 +
 +        fix0             = _mm256_setzero_pd();
 +        fiy0             = _mm256_setzero_pd();
 +        fiz0             = _mm256_setzero_pd();
 +
 +        /* Load parameters for i particles */
 +        iq0              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+0]));
 +        isai0            = _mm256_set1_pd(invsqrta[inr+0]);
 +        vdwioffsetptr0   = vdwparam+2*nvdwtype*vdwtype[inr+0];
 +
 +        dvdasum          = _mm256_setzero_pd();
 +
 +        /* Start inner kernel loop */
 +        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
 +        {
 +
 +            /* Get j neighbor index, and coordinate index */
 +            jnrA             = jjnr[jidx];
 +            jnrB             = jjnr[jidx+1];
 +            jnrC             = jjnr[jidx+2];
 +            jnrD             = jjnr[jidx+3];
 +            j_coord_offsetA  = DIM*jnrA;
 +            j_coord_offsetB  = DIM*jnrB;
 +            j_coord_offsetC  = DIM*jnrC;
 +            j_coord_offsetD  = DIM*jnrD;
 +
 +            /* load j atom coordinates */
 +            gmx_mm256_load_1rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
 +                                                 x+j_coord_offsetC,x+j_coord_offsetD,
 +                                                 &jx0,&jy0,&jz0);
 +
 +            /* Calculate displacement vector */
 +            dx00             = _mm256_sub_pd(ix0,jx0);
 +            dy00             = _mm256_sub_pd(iy0,jy0);
 +            dz00             = _mm256_sub_pd(iz0,jz0);
 +
 +            /* Calculate squared distance and things based on it */
 +            rsq00            = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
 +
 +            rinv00           = gmx_mm256_invsqrt_pd(rsq00);
 +
 +            rinvsq00         = _mm256_mul_pd(rinv00,rinv00);
 +
 +            /* Load parameters for j particles */
 +            jq0              = gmx_mm256_load_4real_swizzle_pd(charge+jnrA+0,charge+jnrB+0,
 +                                                                 charge+jnrC+0,charge+jnrD+0);
 +            isaj0            = gmx_mm256_load_4real_swizzle_pd(invsqrta+jnrA+0,invsqrta+jnrB+0,
 +                                                                 invsqrta+jnrC+0,invsqrta+jnrD+0);
 +            vdwjidx0A        = 2*vdwtype[jnrA+0];
 +            vdwjidx0B        = 2*vdwtype[jnrB+0];
 +            vdwjidx0C        = 2*vdwtype[jnrC+0];
 +            vdwjidx0D        = 2*vdwtype[jnrD+0];
 +
 +            /**************************
 +             * CALCULATE INTERACTIONS *
 +             **************************/
 +
 +            r00              = _mm256_mul_pd(rsq00,rinv00);
 +
 +            /* Compute parameters for interactions between i and j atoms */
 +            qq00             = _mm256_mul_pd(iq0,jq0);
 +            gmx_mm256_load_4pair_swizzle_pd(vdwioffsetptr0+vdwjidx0A,
 +                                            vdwioffsetptr0+vdwjidx0B,
 +                                            vdwioffsetptr0+vdwjidx0C,
 +                                            vdwioffsetptr0+vdwjidx0D,
 +                                            &c6_00,&c12_00);
 +
 +            /* GENERALIZED BORN AND COULOMB ELECTROSTATICS */
 +            isaprod          = _mm256_mul_pd(isai0,isaj0);
 +            gbqqfactor       = _mm256_xor_pd(signbit,_mm256_mul_pd(qq00,_mm256_mul_pd(isaprod,gbinvepsdiff)));
 +            gbscale          = _mm256_mul_pd(isaprod,gbtabscale);
 +
 +            /* Calculate generalized born table index - this is a separate table from the normal one,
 +             * but we use the same procedure by multiplying r with scale and truncating to integer.
 +             */
 +            rt               = _mm256_mul_pd(r00,gbscale);
 +            gbitab           = _mm256_cvttpd_epi32(rt);
 +            gbeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
 +            gbitab           = _mm_slli_epi32(gbitab,2);
 +            Y                = _mm256_load_pd( gbtab + _mm_extract_epi32(gbitab,0) );
 +            F                = _mm256_load_pd( gbtab + _mm_extract_epi32(gbitab,1) );
 +            G                = _mm256_load_pd( gbtab + _mm_extract_epi32(gbitab,2) );
 +            H                = _mm256_load_pd( gbtab + _mm_extract_epi32(gbitab,3) );
 +            GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
 +            Heps             = _mm256_mul_pd(gbeps,H);
 +            Fp               = _mm256_add_pd(F,_mm256_mul_pd(gbeps,_mm256_add_pd(G,Heps)));
 +            VV               = _mm256_add_pd(Y,_mm256_mul_pd(gbeps,Fp));
 +            vgb              = _mm256_mul_pd(gbqqfactor,VV);
 +
 +            FF               = _mm256_add_pd(Fp,_mm256_mul_pd(gbeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
 +            fgb              = _mm256_mul_pd(gbqqfactor,_mm256_mul_pd(FF,gbscale));
 +            dvdatmp          = _mm256_mul_pd(minushalf,_mm256_add_pd(vgb,_mm256_mul_pd(fgb,r00)));
 +            dvdasum          = _mm256_add_pd(dvdasum,dvdatmp);
 +            fjptrA           = dvda+jnrA;
 +            fjptrB           = dvda+jnrB;
 +            fjptrC           = dvda+jnrC;
 +            fjptrD           = dvda+jnrD;
 +            gmx_mm256_increment_4real_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
 +                                                 _mm256_mul_pd(dvdatmp,_mm256_mul_pd(isaj0,isaj0)));
 +            velec            = _mm256_mul_pd(qq00,rinv00);
 +            felec            = _mm256_mul_pd(_mm256_sub_pd(_mm256_mul_pd(velec,rinv00),fgb),rinv00);
 +
 +            /* LENNARD-JONES DISPERSION/REPULSION */
 +
 +            rinvsix          = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
 +            fvdw             = _mm256_mul_pd(_mm256_sub_pd(_mm256_mul_pd(c12_00,rinvsix),c6_00),_mm256_mul_pd(rinvsix,rinvsq00));
 +
 +            fscal            = _mm256_add_pd(felec,fvdw);
 +
 +            /* Calculate temporary vectorial force */
 +            tx               = _mm256_mul_pd(fscal,dx00);
 +            ty               = _mm256_mul_pd(fscal,dy00);
 +            tz               = _mm256_mul_pd(fscal,dz00);
 +
 +            /* Update vectorial force */
 +            fix0             = _mm256_add_pd(fix0,tx);
 +            fiy0             = _mm256_add_pd(fiy0,ty);
 +            fiz0             = _mm256_add_pd(fiz0,tz);
 +
 +            fjptrA             = f+j_coord_offsetA;
 +            fjptrB             = f+j_coord_offsetB;
 +            fjptrC             = f+j_coord_offsetC;
 +            fjptrD             = f+j_coord_offsetD;
 +            gmx_mm256_decrement_1rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
 +
 +            /* Inner loop uses 63 flops */
 +        }
 +
 +        if(jidx<j_index_end)
 +        {
 +
 +            /* Get j neighbor index, and coordinate index */
 +            jnrlistA         = jjnr[jidx];
 +            jnrlistB         = jjnr[jidx+1];
 +            jnrlistC         = jjnr[jidx+2];
 +            jnrlistD         = jjnr[jidx+3];
 +            /* Sign of each element will be negative for non-real atoms.
 +             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
 +             * so use it as val = _mm256_andnot_pd(mask,val) to clear dummy entries.
 +             */
 +            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 +
 +            tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
 +            tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
 +            dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
 +
 +            jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
 +            jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
 +            jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
 +            jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
 +            j_coord_offsetA  = DIM*jnrA;
 +            j_coord_offsetB  = DIM*jnrB;
 +            j_coord_offsetC  = DIM*jnrC;
 +            j_coord_offsetD  = DIM*jnrD;
 +
 +            /* load j atom coordinates */
 +            gmx_mm256_load_1rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
 +                                                 x+j_coord_offsetC,x+j_coord_offsetD,
 +                                                 &jx0,&jy0,&jz0);
 +
 +            /* Calculate displacement vector */
 +            dx00             = _mm256_sub_pd(ix0,jx0);
 +            dy00             = _mm256_sub_pd(iy0,jy0);
 +            dz00             = _mm256_sub_pd(iz0,jz0);
 +
 +            /* Calculate squared distance and things based on it */
 +            rsq00            = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
 +
 +            rinv00           = gmx_mm256_invsqrt_pd(rsq00);
 +
 +            rinvsq00         = _mm256_mul_pd(rinv00,rinv00);
 +
 +            /* Load parameters for j particles */
 +            jq0              = gmx_mm256_load_4real_swizzle_pd(charge+jnrA+0,charge+jnrB+0,
 +                                                                 charge+jnrC+0,charge+jnrD+0);
 +            isaj0            = gmx_mm256_load_4real_swizzle_pd(invsqrta+jnrA+0,invsqrta+jnrB+0,
 +                                                                 invsqrta+jnrC+0,invsqrta+jnrD+0);
 +            vdwjidx0A        = 2*vdwtype[jnrA+0];
 +            vdwjidx0B        = 2*vdwtype[jnrB+0];
 +            vdwjidx0C        = 2*vdwtype[jnrC+0];
 +            vdwjidx0D        = 2*vdwtype[jnrD+0];
 +
 +            /**************************
 +             * CALCULATE INTERACTIONS *
 +             **************************/
 +
 +            r00              = _mm256_mul_pd(rsq00,rinv00);
 +            r00              = _mm256_andnot_pd(dummy_mask,r00);
 +
 +            /* Compute parameters for interactions between i and j atoms */
 +            qq00             = _mm256_mul_pd(iq0,jq0);
 +            gmx_mm256_load_4pair_swizzle_pd(vdwioffsetptr0+vdwjidx0A,
 +                                            vdwioffsetptr0+vdwjidx0B,
 +                                            vdwioffsetptr0+vdwjidx0C,
 +                                            vdwioffsetptr0+vdwjidx0D,
 +                                            &c6_00,&c12_00);
 +
 +            /* GENERALIZED BORN AND COULOMB ELECTROSTATICS */
 +            isaprod          = _mm256_mul_pd(isai0,isaj0);
 +            gbqqfactor       = _mm256_xor_pd(signbit,_mm256_mul_pd(qq00,_mm256_mul_pd(isaprod,gbinvepsdiff)));
 +            gbscale          = _mm256_mul_pd(isaprod,gbtabscale);
 +
 +            /* Calculate generalized born table index - this is a separate table from the normal one,
 +             * but we use the same procedure by multiplying r with scale and truncating to integer.
 +             */
 +            rt               = _mm256_mul_pd(r00,gbscale);
 +            gbitab           = _mm256_cvttpd_epi32(rt);
 +            gbeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
 +            gbitab           = _mm_slli_epi32(gbitab,2);
 +            Y                = _mm256_load_pd( gbtab + _mm_extract_epi32(gbitab,0) );
 +            F                = _mm256_load_pd( gbtab + _mm_extract_epi32(gbitab,1) );
 +            G                = _mm256_load_pd( gbtab + _mm_extract_epi32(gbitab,2) );
 +            H                = _mm256_load_pd( gbtab + _mm_extract_epi32(gbitab,3) );
 +            GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
 +            Heps             = _mm256_mul_pd(gbeps,H);
 +            Fp               = _mm256_add_pd(F,_mm256_mul_pd(gbeps,_mm256_add_pd(G,Heps)));
 +            VV               = _mm256_add_pd(Y,_mm256_mul_pd(gbeps,Fp));
 +            vgb              = _mm256_mul_pd(gbqqfactor,VV);
 +
 +            FF               = _mm256_add_pd(Fp,_mm256_mul_pd(gbeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
 +            fgb              = _mm256_mul_pd(gbqqfactor,_mm256_mul_pd(FF,gbscale));
 +            dvdatmp          = _mm256_mul_pd(minushalf,_mm256_add_pd(vgb,_mm256_mul_pd(fgb,r00)));
++            dvdatmp          = _mm256_andnot_pd(dummy_mask,dvdatmp);
 +            dvdasum          = _mm256_add_pd(dvdasum,dvdatmp);
 +            /* The pointers to scratch redirect dummy entries to a harmless buffer, so that even compilers that take gmx_restrict seriously (e.g. icc 13) really can't screw things up. */
 +            fjptrA             = (jnrlistA>=0) ? dvda+jnrA : scratch;
 +            fjptrB             = (jnrlistB>=0) ? dvda+jnrB : scratch;
 +            fjptrC             = (jnrlistC>=0) ? dvda+jnrC : scratch;
 +            fjptrD             = (jnrlistD>=0) ? dvda+jnrD : scratch;
 +            gmx_mm256_increment_4real_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
 +                                                 _mm256_mul_pd(dvdatmp,_mm256_mul_pd(isaj0,isaj0)));
 +            velec            = _mm256_mul_pd(qq00,rinv00);
 +            felec            = _mm256_mul_pd(_mm256_sub_pd(_mm256_mul_pd(velec,rinv00),fgb),rinv00);
 +
 +            /* LENNARD-JONES DISPERSION/REPULSION */
 +
 +            rinvsix          = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
 +            fvdw             = _mm256_mul_pd(_mm256_sub_pd(_mm256_mul_pd(c12_00,rinvsix),c6_00),_mm256_mul_pd(rinvsix,rinvsq00));
 +
 +            fscal            = _mm256_add_pd(felec,fvdw);
 +
 +            fscal            = _mm256_andnot_pd(dummy_mask,fscal);
 +
 +            /* Calculate temporary vectorial force */
 +            tx               = _mm256_mul_pd(fscal,dx00);
 +            ty               = _mm256_mul_pd(fscal,dy00);
 +            tz               = _mm256_mul_pd(fscal,dz00);
 +
 +            /* Update vectorial force */
 +            fix0             = _mm256_add_pd(fix0,tx);
 +            fiy0             = _mm256_add_pd(fiy0,ty);
 +            fiz0             = _mm256_add_pd(fiz0,tz);
 +
 +            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
 +            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
 +            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
 +            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
 +            gmx_mm256_decrement_1rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
 +
 +            /* Inner loop uses 64 flops */
 +        }
 +
 +        /* End of innermost loop */
 +
 +        gmx_mm256_update_iforce_1atom_swizzle_pd(fix0,fiy0,fiz0,
 +                                                 f+i_coord_offset,fshift+i_shift_offset);
 +
 +        dvdasum = _mm256_mul_pd(dvdasum, _mm256_mul_pd(isai0,isai0));
 +        gmx_mm256_update_1pot_pd(dvdasum,dvda+inr);
 +
 +        /* Increment number of inner iterations */
 +        inneriter                  += j_index_end - j_index_start;
 +
 +        /* Outer loop uses 7 flops */
 +    }
 +
 +    /* Increment number of outer iterations */
 +    outeriter        += nri;
 +
 +    /* Update outer/inner flops */
 +
 +    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_F,outeriter*7 + inneriter*64);
 +}
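 +
 +/* For reference, a scalar sketch of the force-only Lennard-Jones evaluation
 + * above. As the one_twelfth/one_sixth factors in the VF kernel imply, the
 + * c6/c12 parameters are assumed to be stored premultiplied by 6 and 12, so
 + * the force needs no extra constants. Hypothetical helper, not part of the
 + * generated file.
 + */
 +static double
 +lj_fscal(double c6, double c12, double rinvsq)
 +{
 +    double rinvsix = rinvsq*rinvsq*rinvsq;
 +    /* F(r)/r = (c12*r^-12 - c6*r^-6)*r^-2; the VF kernel additionally
 +     * forms V = c12*r^-12/12 - c6*r^-6/6. */
 +    return (c12*rinvsix - c6)*rinvsix*rinvsq;
 +}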
index 27c0bb96b2ece012f609ed02305ac24fcadfe699,0000000000000000000000000000000000000000..40a10c2a31b68e6df0e6f62714d968bed522866a
mode 100644,000000..100644
--- /dev/null
@@@ -1,760 -1,0 +1,762 @@@
 +/*
 + * Note: this file was generated by the Gromacs avx_256_double kernel generator.
 + *
 + *                This source code is part of
 + *
 + *                 G   R   O   M   A   C   S
 + *
 + * Copyright (c) 2001-2012, The GROMACS Development Team
 + *
 + * Gromacs is a library for molecular simulation and trajectory analysis,
 + * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
 + * a full list of developers and information, check out http://www.gromacs.org
 + *
 + * This program is free software; you can redistribute it and/or modify it under
 + * the terms of the GNU Lesser General Public License as published by the Free
 + * Software Foundation; either version 2 of the License, or (at your option) any
 + * later version.
 + *
 + * To help fund GROMACS development, we humbly ask that you cite
 + * the papers people have written on it - you can find them on the website.
 + */
 +#ifdef HAVE_CONFIG_H
 +#include <config.h>
 +#endif
 +
 +#include <math.h>
 +
 +#include "../nb_kernel.h"
 +#include "types/simple.h"
 +#include "vec.h"
 +#include "nrnb.h"
 +
 +#include "gmx_math_x86_avx_256_double.h"
 +#include "kernelutil_x86_avx_256_double.h"
 +
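 +/* Reading off the kernels below: with qq = epsfac*q_i*q_j and
 + * isaprod = invsqrta_i*invsqrta_j (invsqrta presumably the inverse square
 + * root of the Born radius), the tabulated Generalized Born pair term is
 + *     gbqqfactor = -qq*isaprod*(1/epsilon_r - 1/gb_epsilon_solvent)
 + *     vgb        = gbqqfactor * V(rt),  rt = r*isaprod*gbtabscale
 + *     fgb        = gbqqfactor * V'(rt) * isaprod*gbtabscale
 + * The combined scalar force is felec = (qq/r^2 - fgb)/r, and
 + * dvdatmp = -(vgb + fgb*r)/2 accumulates the Born-radius derivative,
 + * scaled by invsqrta_j^2 for the j atoms and invsqrta_i^2 for the i atom.
 + */
 +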
 +/*
 + * Gromacs nonbonded kernel:   nb_kernel_ElecGB_VdwNone_GeomP1P1_VF_avx_256_double
 + * Electrostatics interaction: GeneralizedBorn
 + * VdW interaction:            None
 + * Geometry:                   Particle-Particle
 + * Calculate force/pot:        PotentialAndForce
 + */
 +void
 +nb_kernel_ElecGB_VdwNone_GeomP1P1_VF_avx_256_double
 +                    (t_nblist * gmx_restrict                nlist,
 +                     rvec * gmx_restrict                    xx,
 +                     rvec * gmx_restrict                    ff,
 +                     t_forcerec * gmx_restrict              fr,
 +                     t_mdatoms * gmx_restrict               mdatoms,
 +                     nb_kernel_data_t * gmx_restrict        kernel_data,
 +                     t_nrnb * gmx_restrict                  nrnb)
 +{
 +    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
 +     * just 0 for non-waters.
 +     * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
 +     * jnr indices corresponding to data put in the four positions in the SIMD register.
 +     */
 +    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
 +    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
 +    int              jnrA,jnrB,jnrC,jnrD;
 +    int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
 +    int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
 +    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
 +    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
 +    real             rcutoff_scalar;
 +    real             *shiftvec,*fshift,*x,*f;
 +    real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
 +    real             scratch[4*DIM];
 +    __m256d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
 +    real *           vdwioffsetptr0;
 +    __m256d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
 +    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
 +    __m256d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
 +    __m256d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
 +    __m256d          velec,felec,velecsum,facel,crf,krf,krf2;
 +    real             *charge;
 +    __m128i          gbitab;
 +    __m256d          vgb,fgb,vgbsum,dvdasum,gbscale,gbtabscale,isaprod,gbqqfactor,gbinvepsdiff,gbeps,dvdatmp;
 +    __m256d          minushalf = _mm256_set1_pd(-0.5);
 +    real             *invsqrta,*dvda,*gbtab;
 +    __m128i          vfitab;
 +    __m128i          ifour       = _mm_set1_epi32(4);
 +    __m256d          rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
 +    real             *vftab;
 +    __m256d          dummy_mask,cutoff_mask;
 +    __m128           tmpmask0,tmpmask1;
 +    __m256d          signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
 +    __m256d          one     = _mm256_set1_pd(1.0);
 +    __m256d          two     = _mm256_set1_pd(2.0);
 +    x                = xx[0];
 +    f                = ff[0];
 +
 +    nri              = nlist->nri;
 +    iinr             = nlist->iinr;
 +    jindex           = nlist->jindex;
 +    jjnr             = nlist->jjnr;
 +    shiftidx         = nlist->shift;
 +    gid              = nlist->gid;
 +    shiftvec         = fr->shift_vec[0];
 +    fshift           = fr->fshift[0];
 +    facel            = _mm256_set1_pd(fr->epsfac);
 +    charge           = mdatoms->chargeA;
 +
 +    invsqrta         = fr->invsqrta;
 +    dvda             = fr->dvda;
 +    gbtabscale       = _mm256_set1_pd(fr->gbtab.scale);
 +    gbtab            = fr->gbtab.data;
 +    gbinvepsdiff     = _mm256_set1_pd((1.0/fr->epsilon_r) - (1.0/fr->gb_epsilon_solvent));
 +
 +    /* Avoid stupid compiler warnings */
 +    jnrA = jnrB = jnrC = jnrD = 0;
 +    j_coord_offsetA = 0;
 +    j_coord_offsetB = 0;
 +    j_coord_offsetC = 0;
 +    j_coord_offsetD = 0;
 +
 +    outeriter        = 0;
 +    inneriter        = 0;
 +
 +    for(iidx=0;iidx<4*DIM;iidx++)
 +    {
 +        scratch[iidx] = 0.0;
 +    }
 +
 +    /* Start outer loop over neighborlists */
 +    for(iidx=0; iidx<nri; iidx++)
 +    {
 +        /* Load shift vector for this list */
 +        i_shift_offset   = DIM*shiftidx[iidx];
 +
 +        /* Load limits for loop over neighbors */
 +        j_index_start    = jindex[iidx];
 +        j_index_end      = jindex[iidx+1];
 +
 +        /* Get outer coordinate index */
 +        inr              = iinr[iidx];
 +        i_coord_offset   = DIM*inr;
 +
 +        /* Load i particle coords and add shift vector */
 +        gmx_mm256_load_shift_and_1rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
 +
 +        fix0             = _mm256_setzero_pd();
 +        fiy0             = _mm256_setzero_pd();
 +        fiz0             = _mm256_setzero_pd();
 +
 +        /* Load parameters for i particles */
 +        iq0              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+0]));
 +        isai0            = _mm256_set1_pd(invsqrta[inr+0]);
 +
 +        /* Reset potential sums */
 +        velecsum         = _mm256_setzero_pd();
 +        vgbsum           = _mm256_setzero_pd();
 +        dvdasum          = _mm256_setzero_pd();
 +
 +        /* Start inner kernel loop */
 +        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
 +        {
 +
 +            /* Get j neighbor index, and coordinate index */
 +            jnrA             = jjnr[jidx];
 +            jnrB             = jjnr[jidx+1];
 +            jnrC             = jjnr[jidx+2];
 +            jnrD             = jjnr[jidx+3];
 +            j_coord_offsetA  = DIM*jnrA;
 +            j_coord_offsetB  = DIM*jnrB;
 +            j_coord_offsetC  = DIM*jnrC;
 +            j_coord_offsetD  = DIM*jnrD;
 +
 +            /* load j atom coordinates */
 +            gmx_mm256_load_1rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
 +                                                 x+j_coord_offsetC,x+j_coord_offsetD,
 +                                                 &jx0,&jy0,&jz0);
 +
 +            /* Calculate displacement vector */
 +            dx00             = _mm256_sub_pd(ix0,jx0);
 +            dy00             = _mm256_sub_pd(iy0,jy0);
 +            dz00             = _mm256_sub_pd(iz0,jz0);
 +
 +            /* Calculate squared distance and things based on it */
 +            rsq00            = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
 +
 +            rinv00           = gmx_mm256_invsqrt_pd(rsq00);
 +
 +            /* Load parameters for j particles */
 +            jq0              = gmx_mm256_load_4real_swizzle_pd(charge+jnrA+0,charge+jnrB+0,
 +                                                                 charge+jnrC+0,charge+jnrD+0);
 +            isaj0            = gmx_mm256_load_4real_swizzle_pd(invsqrta+jnrA+0,invsqrta+jnrB+0,
 +                                                                 invsqrta+jnrC+0,invsqrta+jnrD+0);
 +
 +            /**************************
 +             * CALCULATE INTERACTIONS *
 +             **************************/
 +
 +            r00              = _mm256_mul_pd(rsq00,rinv00);
 +
 +            /* Compute parameters for interactions between i and j atoms */
 +            qq00             = _mm256_mul_pd(iq0,jq0);
 +
 +            /* GENERALIZED BORN AND COULOMB ELECTROSTATICS */
 +            isaprod          = _mm256_mul_pd(isai0,isaj0);
 +            gbqqfactor       = _mm256_xor_pd(signbit,_mm256_mul_pd(qq00,_mm256_mul_pd(isaprod,gbinvepsdiff)));
 +            gbscale          = _mm256_mul_pd(isaprod,gbtabscale);
 +
 +            /* Calculate generalized born table index - this is a separate table from the normal one,
 +             * but we use the same procedure by multiplying r with scale and truncating to integer.
 +             */
 +            rt               = _mm256_mul_pd(r00,gbscale);
 +            gbitab           = _mm256_cvttpd_epi32(rt);
 +            gbeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
 +            gbitab           = _mm_slli_epi32(gbitab,2);
 +            Y                = _mm256_load_pd( gbtab + _mm_extract_epi32(gbitab,0) );
 +            F                = _mm256_load_pd( gbtab + _mm_extract_epi32(gbitab,1) );
 +            G                = _mm256_load_pd( gbtab + _mm_extract_epi32(gbitab,2) );
 +            H                = _mm256_load_pd( gbtab + _mm_extract_epi32(gbitab,3) );
 +            GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
 +            Heps             = _mm256_mul_pd(gbeps,H);
 +            Fp               = _mm256_add_pd(F,_mm256_mul_pd(gbeps,_mm256_add_pd(G,Heps)));
 +            VV               = _mm256_add_pd(Y,_mm256_mul_pd(gbeps,Fp));
 +            vgb              = _mm256_mul_pd(gbqqfactor,VV);
 +
 +            FF               = _mm256_add_pd(Fp,_mm256_mul_pd(gbeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
 +            fgb              = _mm256_mul_pd(gbqqfactor,_mm256_mul_pd(FF,gbscale));
 +            dvdatmp          = _mm256_mul_pd(minushalf,_mm256_add_pd(vgb,_mm256_mul_pd(fgb,r00)));
 +            dvdasum          = _mm256_add_pd(dvdasum,dvdatmp);
 +            fjptrA           = dvda+jnrA;
 +            fjptrB           = dvda+jnrB;
 +            fjptrC           = dvda+jnrC;
 +            fjptrD           = dvda+jnrD;
 +            gmx_mm256_increment_4real_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
 +                                                 _mm256_mul_pd(dvdatmp,_mm256_mul_pd(isaj0,isaj0)));
 +            velec            = _mm256_mul_pd(qq00,rinv00);
 +            felec            = _mm256_mul_pd(_mm256_sub_pd(_mm256_mul_pd(velec,rinv00),fgb),rinv00);
 +
 +            /* Update potential sum for this i atom from the interaction with this j atom. */
 +            velecsum         = _mm256_add_pd(velecsum,velec);
 +            vgbsum           = _mm256_add_pd(vgbsum,vgb);
 +
 +            fscal            = felec;
 +
 +            /* Calculate temporary vectorial force */
 +            tx               = _mm256_mul_pd(fscal,dx00);
 +            ty               = _mm256_mul_pd(fscal,dy00);
 +            tz               = _mm256_mul_pd(fscal,dz00);
 +
 +            /* Update vectorial force */
 +            fix0             = _mm256_add_pd(fix0,tx);
 +            fiy0             = _mm256_add_pd(fiy0,ty);
 +            fiz0             = _mm256_add_pd(fiz0,tz);
 +
 +            fjptrA             = f+j_coord_offsetA;
 +            fjptrB             = f+j_coord_offsetB;
 +            fjptrC             = f+j_coord_offsetC;
 +            fjptrD             = f+j_coord_offsetD;
 +            gmx_mm256_decrement_1rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
 +
 +            /* Inner loop uses 57 flops */
 +        }
 +
 +        if(jidx<j_index_end)
 +        {
 +
 +            /* Get j neighbor index, and coordinate index */
 +            jnrlistA         = jjnr[jidx];
 +            jnrlistB         = jjnr[jidx+1];
 +            jnrlistC         = jjnr[jidx+2];
 +            jnrlistD         = jjnr[jidx+3];
 +            /* Sign of each element will be negative for non-real atoms.
 +             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
 +             * so use it as val = _mm256_andnot_pd(mask,val) to clear dummy entries.
 +             */
 +            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 +
 +            tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
 +            tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
 +            dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
 +
 +            jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
 +            jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
 +            jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
 +            jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
 +            j_coord_offsetA  = DIM*jnrA;
 +            j_coord_offsetB  = DIM*jnrB;
 +            j_coord_offsetC  = DIM*jnrC;
 +            j_coord_offsetD  = DIM*jnrD;
 +
 +            /* load j atom coordinates */
 +            gmx_mm256_load_1rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
 +                                                 x+j_coord_offsetC,x+j_coord_offsetD,
 +                                                 &jx0,&jy0,&jz0);
 +
 +            /* Calculate displacement vector */
 +            dx00             = _mm256_sub_pd(ix0,jx0);
 +            dy00             = _mm256_sub_pd(iy0,jy0);
 +            dz00             = _mm256_sub_pd(iz0,jz0);
 +
 +            /* Calculate squared distance and things based on it */
 +            rsq00            = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
 +
 +            rinv00           = gmx_mm256_invsqrt_pd(rsq00);
 +
 +            /* Load parameters for j particles */
 +            jq0              = gmx_mm256_load_4real_swizzle_pd(charge+jnrA+0,charge+jnrB+0,
 +                                                                 charge+jnrC+0,charge+jnrD+0);
 +            isaj0            = gmx_mm256_load_4real_swizzle_pd(invsqrta+jnrA+0,invsqrta+jnrB+0,
 +                                                                 invsqrta+jnrC+0,invsqrta+jnrD+0);
 +
 +            /**************************
 +             * CALCULATE INTERACTIONS *
 +             **************************/
 +
 +            r00              = _mm256_mul_pd(rsq00,rinv00);
 +            r00              = _mm256_andnot_pd(dummy_mask,r00);
 +
 +            /* Compute parameters for interactions between i and j atoms */
 +            qq00             = _mm256_mul_pd(iq0,jq0);
 +
 +            /* GENERALIZED BORN AND COULOMB ELECTROSTATICS */
 +            isaprod          = _mm256_mul_pd(isai0,isaj0);
 +            gbqqfactor       = _mm256_xor_pd(signbit,_mm256_mul_pd(qq00,_mm256_mul_pd(isaprod,gbinvepsdiff)));
 +            gbscale          = _mm256_mul_pd(isaprod,gbtabscale);
 +
 +            /* Calculate generalized born table index - this is a separate table from the normal one,
 +             * but we use the same procedure by multiplying r with scale and truncating to integer.
 +             */
 +            rt               = _mm256_mul_pd(r00,gbscale);
 +            gbitab           = _mm256_cvttpd_epi32(rt);
 +            gbeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
 +            gbitab           = _mm_slli_epi32(gbitab,2);
 +            Y                = _mm256_load_pd( gbtab + _mm_extract_epi32(gbitab,0) );
 +            F                = _mm256_load_pd( gbtab + _mm_extract_epi32(gbitab,1) );
 +            G                = _mm256_load_pd( gbtab + _mm_extract_epi32(gbitab,2) );
 +            H                = _mm256_load_pd( gbtab + _mm_extract_epi32(gbitab,3) );
 +            GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
 +            Heps             = _mm256_mul_pd(gbeps,H);
 +            Fp               = _mm256_add_pd(F,_mm256_mul_pd(gbeps,_mm256_add_pd(G,Heps)));
 +            VV               = _mm256_add_pd(Y,_mm256_mul_pd(gbeps,Fp));
 +            vgb              = _mm256_mul_pd(gbqqfactor,VV);
 +
 +            FF               = _mm256_add_pd(Fp,_mm256_mul_pd(gbeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
 +            fgb              = _mm256_mul_pd(gbqqfactor,_mm256_mul_pd(FF,gbscale));
 +            dvdatmp          = _mm256_mul_pd(minushalf,_mm256_add_pd(vgb,_mm256_mul_pd(fgb,r00)));
++            dvdatmp          = _mm256_andnot_pd(dummy_mask,dvdatmp);
 +            dvdasum          = _mm256_add_pd(dvdasum,dvdatmp);
 +            /* The pointers to scratch redirect dummy entries to a harmless buffer, so that even compilers that take gmx_restrict seriously (e.g. icc 13) really can't screw things up. */
 +            fjptrA             = (jnrlistA>=0) ? dvda+jnrA : scratch;
 +            fjptrB             = (jnrlistB>=0) ? dvda+jnrB : scratch;
 +            fjptrC             = (jnrlistC>=0) ? dvda+jnrC : scratch;
 +            fjptrD             = (jnrlistD>=0) ? dvda+jnrD : scratch;
 +            gmx_mm256_increment_4real_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
 +                                                 _mm256_mul_pd(dvdatmp,_mm256_mul_pd(isaj0,isaj0)));
 +            velec            = _mm256_mul_pd(qq00,rinv00);
 +            felec            = _mm256_mul_pd(_mm256_sub_pd(_mm256_mul_pd(velec,rinv00),fgb),rinv00);
 +
 +            /* Update potential sum for this i atom from the interaction with this j atom. */
 +            velec            = _mm256_andnot_pd(dummy_mask,velec);
 +            velecsum         = _mm256_add_pd(velecsum,velec);
 +            vgb              = _mm256_andnot_pd(dummy_mask,vgb);
 +            vgbsum           = _mm256_add_pd(vgbsum,vgb);
 +
 +            fscal            = felec;
 +
 +            fscal            = _mm256_andnot_pd(dummy_mask,fscal);
 +
 +            /* Calculate temporary vectorial force */
 +            tx               = _mm256_mul_pd(fscal,dx00);
 +            ty               = _mm256_mul_pd(fscal,dy00);
 +            tz               = _mm256_mul_pd(fscal,dz00);
 +
 +            /* Update vectorial force */
 +            fix0             = _mm256_add_pd(fix0,tx);
 +            fiy0             = _mm256_add_pd(fiy0,ty);
 +            fiz0             = _mm256_add_pd(fiz0,tz);
 +
 +            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
 +            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
 +            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
 +            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
 +            gmx_mm256_decrement_1rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
 +
 +            /* Inner loop uses 58 flops */
 +        }
 +
 +        /* End of innermost loop */
 +
 +        gmx_mm256_update_iforce_1atom_swizzle_pd(fix0,fiy0,fiz0,
 +                                                 f+i_coord_offset,fshift+i_shift_offset);
 +
 +        ggid                        = gid[iidx];
 +        /* Update potential energies */
 +        gmx_mm256_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
 +        gmx_mm256_update_1pot_pd(vgbsum,kernel_data->energygrp_polarization+ggid);
 +        dvdasum = _mm256_mul_pd(dvdasum, _mm256_mul_pd(isai0,isai0));
 +        gmx_mm256_update_1pot_pd(dvdasum,dvda+inr);
 +
 +        /* Increment number of inner iterations */
 +        inneriter                  += j_index_end - j_index_start;
 +
 +        /* Outer loop uses 9 flops */
 +    }
 +
 +    /* Increment number of outer iterations */
 +    outeriter        += nri;
 +
 +    /* Update outer/inner flops */
 +
 +    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VF,outeriter*9 + inneriter*58);
 +}
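 +
 +/* For reference, a sketch of how the epilogue loops above widen the four
 + * 32-bit "index is negative" compare results into a 4x64-bit double mask,
 + * written with plain SSE/AVX intrinsics instead of the gmx_* wrappers
 + * (hypothetical helper, not part of the generated file; assumes
 + * <immintrin.h> is in scope, as it is for the kernels here). Masked
 + * quantities are then cleared with _mm256_andnot_pd(mask,val).
 + */
 +static __m256d
 +build_dummy_mask(const int *jjnr_block)   /* four padded j indices */
 +{
 +    /* all-ones in each 32-bit lane whose neighbor index is negative */
 +    __m128 m32 = _mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)jjnr_block),
 +                                                  _mm_setzero_si128()));
 +    /* duplicate each 32-bit result so it fills a 64-bit double lane */
 +    __m128 lo  = _mm_shuffle_ps(m32,m32,_MM_SHUFFLE(1,1,0,0));
 +    __m128 hi  = _mm_shuffle_ps(m32,m32,_MM_SHUFFLE(3,3,2,2));
 +    return _mm256_castps_pd(_mm256_insertf128_ps(_mm256_castps128_ps256(lo),hi,1));
 +}
 +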
 +/*
 + * Gromacs nonbonded kernel:   nb_kernel_ElecGB_VdwNone_GeomP1P1_F_avx_256_double
 + * Electrostatics interaction: GeneralizedBorn
 + * VdW interaction:            None
 + * Geometry:                   Particle-Particle
 + * Calculate force/pot:        Force
 + */
 +void
 +nb_kernel_ElecGB_VdwNone_GeomP1P1_F_avx_256_double
 +                    (t_nblist * gmx_restrict                nlist,
 +                     rvec * gmx_restrict                    xx,
 +                     rvec * gmx_restrict                    ff,
 +                     t_forcerec * gmx_restrict              fr,
 +                     t_mdatoms * gmx_restrict               mdatoms,
 +                     nb_kernel_data_t * gmx_restrict        kernel_data,
 +                     t_nrnb * gmx_restrict                  nrnb)
 +{
 +    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
 +     * just 0 for non-waters.
 +     * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
 +     * jnr indices corresponding to data put in the four positions in the SIMD register.
 +     */
 +    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
 +    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
 +    int              jnrA,jnrB,jnrC,jnrD;
 +    int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
 +    int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
 +    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
 +    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
 +    real             rcutoff_scalar;
 +    real             *shiftvec,*fshift,*x,*f;
 +    real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
 +    real             scratch[4*DIM];
 +    __m256d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
 +    real *           vdwioffsetptr0;
 +    __m256d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
 +    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
 +    __m256d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
 +    __m256d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
 +    __m256d          velec,felec,velecsum,facel,crf,krf,krf2;
 +    real             *charge;
 +    __m128i          gbitab;
 +    __m256d          vgb,fgb,vgbsum,dvdasum,gbscale,gbtabscale,isaprod,gbqqfactor,gbinvepsdiff,gbeps,dvdatmp;
 +    __m256d          minushalf = _mm256_set1_pd(-0.5);
 +    real             *invsqrta,*dvda,*gbtab;
 +    __m128i          vfitab;
 +    __m128i          ifour       = _mm_set1_epi32(4);
 +    __m256d          rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
 +    real             *vftab;
 +    __m256d          dummy_mask,cutoff_mask;
 +    __m128           tmpmask0,tmpmask1;
 +    __m256d          signbit = _mm256_set1_pd(-0.0); /* only the sign bit of each double set */
 +    __m256d          one     = _mm256_set1_pd(1.0);
 +    __m256d          two     = _mm256_set1_pd(2.0);
 +    x                = xx[0];
 +    f                = ff[0];
 +
 +    nri              = nlist->nri;
 +    iinr             = nlist->iinr;
 +    jindex           = nlist->jindex;
 +    jjnr             = nlist->jjnr;
 +    shiftidx         = nlist->shift;
 +    gid              = nlist->gid;
 +    shiftvec         = fr->shift_vec[0];
 +    fshift           = fr->fshift[0];
 +    facel            = _mm256_set1_pd(fr->epsfac);
 +    charge           = mdatoms->chargeA;
 +
 +    invsqrta         = fr->invsqrta;
 +    dvda             = fr->dvda;
 +    gbtabscale       = _mm256_set1_pd(fr->gbtab.scale);
 +    gbtab            = fr->gbtab.data;
 +    gbinvepsdiff     = _mm256_set1_pd((1.0/fr->epsilon_r) - (1.0/fr->gb_epsilon_solvent));
 +
 +    /* Avoid stupid compiler warnings */
 +    jnrA = jnrB = jnrC = jnrD = 0;
 +    j_coord_offsetA = 0;
 +    j_coord_offsetB = 0;
 +    j_coord_offsetC = 0;
 +    j_coord_offsetD = 0;
 +
 +    outeriter        = 0;
 +    inneriter        = 0;
 +
 +    for(iidx=0;iidx<4*DIM;iidx++)
 +    {
 +        scratch[iidx] = 0.0;
 +    }
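 +    /* The scratch buffer serves as a harmless dump target: in the epilogue,
 +     * force and dvda stores for dummy (negative-index) j entries are
 +     * redirected into it, so masked-out contributions never reach real
 +     * atom data.
 +     */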
 +
 +    /* Start outer loop over neighborlists */
 +    for(iidx=0; iidx<nri; iidx++)
 +    {
 +        /* Load shift vector for this list */
 +        i_shift_offset   = DIM*shiftidx[iidx];
 +
 +        /* Load limits for loop over neighbors */
 +        j_index_start    = jindex[iidx];
 +        j_index_end      = jindex[iidx+1];
 +
 +        /* Get outer coordinate index */
 +        inr              = iinr[iidx];
 +        i_coord_offset   = DIM*inr;
 +
 +        /* Load i particle coords and add shift vector */
 +        gmx_mm256_load_shift_and_1rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
 +
 +        fix0             = _mm256_setzero_pd();
 +        fiy0             = _mm256_setzero_pd();
 +        fiz0             = _mm256_setzero_pd();
 +
 +        /* Load parameters for i particles */
 +        iq0              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+0]));
 +        isai0            = _mm256_set1_pd(invsqrta[inr+0]);
 +
 +        dvdasum          = _mm256_setzero_pd();
 +
 +        /* Start inner kernel loop */
 +        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
 +        {
 +
 +            /* Get j neighbor index, and coordinate index */
 +            jnrA             = jjnr[jidx];
 +            jnrB             = jjnr[jidx+1];
 +            jnrC             = jjnr[jidx+2];
 +            jnrD             = jjnr[jidx+3];
 +            j_coord_offsetA  = DIM*jnrA;
 +            j_coord_offsetB  = DIM*jnrB;
 +            j_coord_offsetC  = DIM*jnrC;
 +            j_coord_offsetD  = DIM*jnrD;
 +
 +            /* load j atom coordinates */
 +            gmx_mm256_load_1rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
 +                                                 x+j_coord_offsetC,x+j_coord_offsetD,
 +                                                 &jx0,&jy0,&jz0);
 +
 +            /* Calculate displacement vector */
 +            dx00             = _mm256_sub_pd(ix0,jx0);
 +            dy00             = _mm256_sub_pd(iy0,jy0);
 +            dz00             = _mm256_sub_pd(iz0,jz0);
 +
 +            /* Calculate squared distance and things based on it */
 +            rsq00            = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
 +
 +            rinv00           = gmx_mm256_invsqrt_pd(rsq00);
 +
 +            /* Load parameters for j particles */
 +            jq0              = gmx_mm256_load_4real_swizzle_pd(charge+jnrA+0,charge+jnrB+0,
 +                                                                 charge+jnrC+0,charge+jnrD+0);
 +            isaj0            = gmx_mm256_load_4real_swizzle_pd(invsqrta+jnrA+0,invsqrta+jnrB+0,
 +                                                                 invsqrta+jnrC+0,invsqrta+jnrD+0);
 +
 +            /**************************
 +             * CALCULATE INTERACTIONS *
 +             **************************/
 +
 +            r00              = _mm256_mul_pd(rsq00,rinv00);
 +
 +            /* Compute parameters for interactions between i and j atoms */
 +            qq00             = _mm256_mul_pd(iq0,jq0);
 +
 +            /* GENERALIZED BORN AND COULOMB ELECTROSTATICS */
 +            isaprod          = _mm256_mul_pd(isai0,isaj0);
 +            gbqqfactor       = _mm256_xor_pd(signbit,_mm256_mul_pd(qq00,_mm256_mul_pd(isaprod,gbinvepsdiff)));
 +            gbscale          = _mm256_mul_pd(isaprod,gbtabscale);
 +
 +            /* Calculate generalized born table index - this is a separate table from the normal one,
 +             * but we use the same procedure by multiplying r with scale and truncating to integer.
 +             */
 +            rt               = _mm256_mul_pd(r00,gbscale);
 +            gbitab           = _mm256_cvttpd_epi32(rt);
 +            gbeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
 +            gbitab           = _mm_slli_epi32(gbitab,2);
 +            Y                = _mm256_load_pd( gbtab + _mm_extract_epi32(gbitab,0) );
 +            F                = _mm256_load_pd( gbtab + _mm_extract_epi32(gbitab,1) );
 +            G                = _mm256_load_pd( gbtab + _mm_extract_epi32(gbitab,2) );
 +            H                = _mm256_load_pd( gbtab + _mm_extract_epi32(gbitab,3) );
 +            GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
 +            Heps             = _mm256_mul_pd(gbeps,H);
 +            Fp               = _mm256_add_pd(F,_mm256_mul_pd(gbeps,_mm256_add_pd(G,Heps)));
 +            VV               = _mm256_add_pd(Y,_mm256_mul_pd(gbeps,Fp));
 +            vgb              = _mm256_mul_pd(gbqqfactor,VV);
 +
 +            FF               = _mm256_add_pd(Fp,_mm256_mul_pd(gbeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
 +            fgb              = _mm256_mul_pd(gbqqfactor,_mm256_mul_pd(FF,gbscale));
 +            dvdatmp          = _mm256_mul_pd(minushalf,_mm256_add_pd(vgb,_mm256_mul_pd(fgb,r00)));
 +            dvdasum          = _mm256_add_pd(dvdasum,dvdatmp);
 +            fjptrA           = dvda+jnrA;
 +            fjptrB           = dvda+jnrB;
 +            fjptrC           = dvda+jnrC;
 +            fjptrD           = dvda+jnrD;
 +            gmx_mm256_increment_4real_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
 +                                                 _mm256_mul_pd(dvdatmp,_mm256_mul_pd(isaj0,isaj0)));
 +            velec            = _mm256_mul_pd(qq00,rinv00);
 +            felec            = _mm256_mul_pd(_mm256_sub_pd(_mm256_mul_pd(velec,rinv00),fgb),rinv00);
 +
 +            fscal            = felec;
 +
 +            /* Calculate temporary vectorial force */
 +            tx               = _mm256_mul_pd(fscal,dx00);
 +            ty               = _mm256_mul_pd(fscal,dy00);
 +            tz               = _mm256_mul_pd(fscal,dz00);
 +
 +            /* Update vectorial force */
 +            fix0             = _mm256_add_pd(fix0,tx);
 +            fiy0             = _mm256_add_pd(fiy0,ty);
 +            fiz0             = _mm256_add_pd(fiz0,tz);
 +
 +            fjptrA             = f+j_coord_offsetA;
 +            fjptrB             = f+j_coord_offsetB;
 +            fjptrC             = f+j_coord_offsetC;
 +            fjptrD             = f+j_coord_offsetD;
 +            gmx_mm256_decrement_1rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
 +
 +            /* Inner loop uses 55 flops */
 +        }
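 +        /* The quad loop above runs only while all four entries of jjnr are
 +         * real atoms (jjnr[jidx+3]>=0); the epilogue below handles a final
 +         * partial quad by clamping dummy indices to 0 and masking their
 +         * contributions out of r, dvda and fscal.
 +         */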
 +
 +        if(jidx<j_index_end)
 +        {
 +
 +            /* Get j neighbor index, and coordinate index */
 +            jnrlistA         = jjnr[jidx];
 +            jnrlistB         = jjnr[jidx+1];
 +            jnrlistC         = jjnr[jidx+2];
 +            jnrlistD         = jjnr[jidx+3];
 +            /* Sign of each element will be negative for non-real atoms.
 +             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
 +             * so use it as val = _mm256_andnot_pd(mask,val) to clear dummy entries.
 +             */
 +            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 +
 +            tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
 +            tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
 +            dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
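 +            /* Worked example with hypothetical indices: for jjnr[jidx..jidx+3] =
 +             * {12,17,-1,-1} the comparison gives 32-bit lanes {0,0,~0,~0}; the
 +             * permutes duplicate each lane pairwise and the cast/set_m128 widen
 +             * them, so dummy_mask covers the four 64-bit lanes as {0,0,~0,~0}
 +             * and andnot clears exactly the C and D slots.
 +             */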
 +
 +            jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
 +            jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
 +            jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
 +            jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
 +            j_coord_offsetA  = DIM*jnrA;
 +            j_coord_offsetB  = DIM*jnrB;
 +            j_coord_offsetC  = DIM*jnrC;
 +            j_coord_offsetD  = DIM*jnrD;
 +
 +            /* load j atom coordinates */
 +            gmx_mm256_load_1rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
 +                                                 x+j_coord_offsetC,x+j_coord_offsetD,
 +                                                 &jx0,&jy0,&jz0);
 +
 +            /* Calculate displacement vector */
 +            dx00             = _mm256_sub_pd(ix0,jx0);
 +            dy00             = _mm256_sub_pd(iy0,jy0);
 +            dz00             = _mm256_sub_pd(iz0,jz0);
 +
 +            /* Calculate squared distance and things based on it */
 +            rsq00            = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
 +
 +            rinv00           = gmx_mm256_invsqrt_pd(rsq00);
 +
 +            /* Load parameters for j particles */
 +            jq0              = gmx_mm256_load_4real_swizzle_pd(charge+jnrA+0,charge+jnrB+0,
 +                                                                 charge+jnrC+0,charge+jnrD+0);
 +            isaj0            = gmx_mm256_load_4real_swizzle_pd(invsqrta+jnrA+0,invsqrta+jnrB+0,
 +                                                                 invsqrta+jnrC+0,invsqrta+jnrD+0);
 +
 +            /**************************
 +             * CALCULATE INTERACTIONS *
 +             **************************/
 +
 +            r00              = _mm256_mul_pd(rsq00,rinv00);
 +            r00              = _mm256_andnot_pd(dummy_mask,r00);
 +
 +            /* Compute parameters for interactions between i and j atoms */
 +            qq00             = _mm256_mul_pd(iq0,jq0);
 +
 +            /* GENERALIZED BORN AND COULOMB ELECTROSTATICS */
 +            isaprod          = _mm256_mul_pd(isai0,isaj0);
 +            gbqqfactor       = _mm256_xor_pd(signbit,_mm256_mul_pd(qq00,_mm256_mul_pd(isaprod,gbinvepsdiff)));
 +            gbscale          = _mm256_mul_pd(isaprod,gbtabscale);
 +
 +            /* Calculate generalized born table index - this is a separate table from the normal one,
 +             * but we use the same procedure by multiplying r with scale and truncating to integer.
 +             */
 +            rt               = _mm256_mul_pd(r00,gbscale);
 +            gbitab           = _mm256_cvttpd_epi32(rt);
 +            gbeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
 +            gbitab           = _mm_slli_epi32(gbitab,2);
 +            Y                = _mm256_load_pd( gbtab + _mm_extract_epi32(gbitab,0) );
 +            F                = _mm256_load_pd( gbtab + _mm_extract_epi32(gbitab,1) );
 +            G                = _mm256_load_pd( gbtab + _mm_extract_epi32(gbitab,2) );
 +            H                = _mm256_load_pd( gbtab + _mm_extract_epi32(gbitab,3) );
 +            GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
 +            Heps             = _mm256_mul_pd(gbeps,H);
 +            Fp               = _mm256_add_pd(F,_mm256_mul_pd(gbeps,_mm256_add_pd(G,Heps)));
 +            VV               = _mm256_add_pd(Y,_mm256_mul_pd(gbeps,Fp));
 +            vgb              = _mm256_mul_pd(gbqqfactor,VV);
 +
 +            FF               = _mm256_add_pd(Fp,_mm256_mul_pd(gbeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
 +            fgb              = _mm256_mul_pd(gbqqfactor,_mm256_mul_pd(FF,gbscale));
 +            dvdatmp          = _mm256_mul_pd(minushalf,_mm256_add_pd(vgb,_mm256_mul_pd(fgb,r00)));
++            dvdatmp          = _mm256_andnot_pd(dummy_mask,dvdatmp);
 +            dvdasum          = _mm256_add_pd(dvdasum,dvdatmp);
 +            /* The pointers to scratch make sure that this code really can't screw things up even with compilers that take gmx_restrict seriously (e.g. icc 13). */
 +            fjptrA             = (jnrlistA>=0) ? dvda+jnrA : scratch;
 +            fjptrB             = (jnrlistB>=0) ? dvda+jnrB : scratch;
 +            fjptrC             = (jnrlistC>=0) ? dvda+jnrC : scratch;
 +            fjptrD             = (jnrlistD>=0) ? dvda+jnrD : scratch;
 +            gmx_mm256_increment_4real_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
 +                                                 _mm256_mul_pd(dvdatmp,_mm256_mul_pd(isaj0,isaj0)));
 +            velec            = _mm256_mul_pd(qq00,rinv00);
 +            felec            = _mm256_mul_pd(_mm256_sub_pd(_mm256_mul_pd(velec,rinv00),fgb),rinv00);
 +
 +            fscal            = felec;
 +
 +            fscal            = _mm256_andnot_pd(dummy_mask,fscal);
 +
 +            /* Calculate temporary vectorial force */
 +            tx               = _mm256_mul_pd(fscal,dx00);
 +            ty               = _mm256_mul_pd(fscal,dy00);
 +            tz               = _mm256_mul_pd(fscal,dz00);
 +
 +            /* Update vectorial force */
 +            fix0             = _mm256_add_pd(fix0,tx);
 +            fiy0             = _mm256_add_pd(fiy0,ty);
 +            fiz0             = _mm256_add_pd(fiz0,tz);
 +
 +            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
 +            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
 +            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
 +            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
 +            gmx_mm256_decrement_1rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
 +
 +            /* Inner loop uses 56 flops */
 +        }
 +
 +        /* End of innermost loop */
 +
 +        gmx_mm256_update_iforce_1atom_swizzle_pd(fix0,fiy0,fiz0,
 +                                                 f+i_coord_offset,fshift+i_shift_offset);
 +
 +        dvdasum = _mm256_mul_pd(dvdasum, _mm256_mul_pd(isai0,isai0));
 +        gmx_mm256_update_1pot_pd(dvdasum,dvda+inr);
 +
 +        /* Increment number of inner iterations */
 +        inneriter                  += j_index_end - j_index_start;
 +
 +        /* Outer loop uses 7 flops */
 +    }
 +
 +    /* Increment number of outer iterations */
 +    outeriter        += nri;
 +
 +    /* Update outer/inner flops */
 +
 +    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_F,outeriter*7 + inneriter*56);
 +}
index 9f36946353161cecea954211e171a9aae4b784f2,0000000000000000000000000000000000000000..2f0e86719395d773f98f8e170853deca4475dff1
mode 100644,000000..100644
--- /dev/null
@@@ -1,1047 -1,0 +1,1050 @@@
 +/* #if 0 */
 +#error This file must be processed with the Gromacs pre-preprocessor
 +/* #endif */
 +/* #if INCLUDE_HEADER */
 +#ifdef HAVE_CONFIG_H
 +#include <config.h>
 +#endif
 +
 +#include <math.h>
 +
 +#include "../nb_kernel.h"
 +#include "types/simple.h"
 +#include "vec.h"
 +#include "nrnb.h"
 +
 +#include "gmx_math_x86_avx_256_double.h"
 +#include "kernelutil_x86_avx_256_double.h"
 +/* #endif */
 +
 +/* ## List of variables set by the generating script:                                    */
 +/* ##                                                                                    */
 +/* ## Settings that apply to the entire kernel:                                          */
 +/* ## KERNEL_ELEC:           String, choice for electrostatic interactions               */
 +/* ## KERNEL_VDW:            String, choice for van der Waals interactions               */
 +/* ## KERNEL_NAME:           String, name of this kernel                                 */
 +/* ## KERNEL_VF:             String telling if we calculate potential, force, or both    */
 +/* ## GEOMETRY_I/GEOMETRY_J: String, name of each geometry, e.g. 'Water3' or '1Particle' */
 +/* ##                                                                                    */
 +/* ## Settings that apply to particles in the outer (I) or inner (J) loops:              */
 +/* ## PARTICLES_I[]/         Arrays with lists of i/j particles to use in kernel. It is  */
 +/* ## PARTICLES_J[]:         just [0] for particle geometry, but can be longer for water */
 +/* ## PARTICLES_ELEC_I[]/    Arrays with lists of i/j particle that have electrostatics  */
 +/* ## PARTICLES_ELEC_J[]:    interactions that should be calculated in this kernel.      */
 +/* ## PARTICLES_VDW_I[]/     Arrays with the list of i/j particle that have VdW          */
 +/* ## PARTICLES_VDW_J[]:     interactions that should be calculated in this kernel.      */
 +/* ##                                                                                    */
 +/* ## Settings for pairs of interactions (e.g. 2nd i particle against 1st j particle)    */
 +/* ## PAIRS_IJ[]:            Array with (i,j) tuples of pairs for which interactions     */
 +/* ##                        should be calculated in this kernel. Zero-charge particles  */
 +/* ##                        do not have interactions with particles without vdw, and    */
 +/* ##                        Vdw-only interactions are not evaluated in a no-vdw-kernel. */
 +/* ## INTERACTION_FLAGS[][]: 2D matrix, dimension e.g. 3*3 for water-water interactions. */
 +/* ##                        For each i-j pair, the element [I][J] is a list of strings  */
 +/* ##                        defining properties/flags of this interaction. Examples     */
 +/* ##                        include 'electrostatics'/'vdw' if that type of interaction  */
 +/* ##                        should be evaluated, 'rsq'/'rinv'/'rinvsq' if those values  */
 +/* ##                        are needed, and 'exactcutoff' or 'shift','switch' to        */
 +/* ##                        decide if the force/potential should be modified. This way  */
 +/* ##                        we only calculate values absolutely needed for each case.   */
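 +/* ##                                                                                    */
 +/* ## As a concrete example (matching the expanded kernels earlier in this patch),       */
 +/* ## nb_kernel_ElecGB_VdwNone_GeomP1P1_F_avx_256_double is generated with               */
 +/* ## KERNEL_ELEC='GeneralizedBorn', KERNEL_VDW='None', KERNEL_VF='Force',               */
 +/* ## GEOMETRY_I=GEOMETRY_J='Particle', and PARTICLES_I=PARTICLES_J=[0].                 */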
 +
 +/* ## Calculate the size and offset for (merged/interleaved) table data */
 +
 +/*
 + * Gromacs nonbonded kernel:   {KERNEL_NAME}
 + * Electrostatics interaction: {KERNEL_ELEC}
 + * VdW interaction:            {KERNEL_VDW}
 + * Geometry:                   {GEOMETRY_I}-{GEOMETRY_J}
 + * Calculate force/pot:        {KERNEL_VF}
 + */
 +void
 +{KERNEL_NAME}
 +                    (t_nblist * gmx_restrict                nlist,
 +                     rvec * gmx_restrict                    xx,
 +                     rvec * gmx_restrict                    ff,
 +                     t_forcerec * gmx_restrict              fr,
 +                     t_mdatoms * gmx_restrict               mdatoms,
 +                     nb_kernel_data_t * gmx_restrict        kernel_data,
 +                     t_nrnb * gmx_restrict                  nrnb)
 +{
 +    /* ## Not all variables are used for all kernels, but any optimizing compiler fixes that, */
 +    /* ## so there is no point in going to extremes to exclude variables that are not needed. */
 +    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
 +     * just 0 for non-waters.
 +     * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
 +     * jnr indices corresponding to data put in the four positions in the SIMD register.
 +     */
 +    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
 +    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
 +    int              jnrA,jnrB,jnrC,jnrD;
 +    int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
 +    int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
 +    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
 +    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
 +    real             rcutoff_scalar;
 +    real             *shiftvec,*fshift,*x,*f;
 +    real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
 +    real             scratch[4*DIM];
 +    __m256d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
 +    /* #for I in PARTICLES_I */
 +    real *           vdwioffsetptr{I};
 +    __m256d          ix{I},iy{I},iz{I},fix{I},fiy{I},fiz{I},iq{I},isai{I};
 +    /* #endfor */
 +    /* #for J in PARTICLES_J */
 +    int              vdwjidx{J}A,vdwjidx{J}B,vdwjidx{J}C,vdwjidx{J}D;
 +    __m256d          jx{J},jy{J},jz{J},fjx{J},fjy{J},fjz{J},jq{J},isaj{J};
 +    /* #endfor */
 +    /* #for I,J in PAIRS_IJ */
 +    __m256d          dx{I}{J},dy{I}{J},dz{I}{J},rsq{I}{J},rinv{I}{J},rinvsq{I}{J},r{I}{J},qq{I}{J},c6_{I}{J},c12_{I}{J};
 +    /* #endfor */
 +    /* #if KERNEL_ELEC != 'None' */
 +    __m256d          velec,felec,velecsum,facel,crf,krf,krf2;
 +    real             *charge;
 +    /* #endif */
 +    /* #if 'GeneralizedBorn' in KERNEL_ELEC */
 +    __m128i          gbitab;
 +    __m256d          vgb,fgb,vgbsum,dvdasum,gbscale,gbtabscale,isaprod,gbqqfactor,gbinvepsdiff,gbeps,dvdatmp;
 +    __m256d          minushalf = _mm256_set1_pd(-0.5);
 +    real             *invsqrta,*dvda,*gbtab;
 +    /* #endif */
 +    /* #if KERNEL_VDW != 'None' */
 +    int              nvdwtype;
 +    __m256d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
 +    int              *vdwtype;
 +    real             *vdwparam;
 +    __m256d          one_sixth   = _mm256_set1_pd(1.0/6.0);
 +    __m256d          one_twelfth = _mm256_set1_pd(1.0/12.0);
 +    /* #endif */
 +    /* #if 'Table' in KERNEL_ELEC or 'GeneralizedBorn' in KERNEL_ELEC or 'Table' in KERNEL_VDW */
 +    __m128i          vfitab;
 +    __m128i          ifour       = _mm_set1_epi32(4);
 +    __m256d          rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
 +    real             *vftab;
 +    /* #endif */
 +    /* #if 'Ewald' in KERNEL_ELEC */
 +    __m128i          ewitab;
 +    __m256d          ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
 +    __m256d          beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
 +    real             *ewtab;
 +    /* #endif */
 +    /* #if 'PotentialSwitch' in [KERNEL_MOD_ELEC,KERNEL_MOD_VDW] */
 +    __m256d          rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
 +    real             rswitch_scalar,d_scalar;
 +    /* #endif */
 +    __m256d          dummy_mask,cutoff_mask;
 +    __m128           tmpmask0,tmpmask1;
 +    __m256d          signbit = _mm256_set1_pd(-0.0); /* only the sign bit of each double set */
 +    __m256d          one     = _mm256_set1_pd(1.0);
 +    __m256d          two     = _mm256_set1_pd(2.0);
 +    x                = xx[0];
 +    f                = ff[0];
 +
 +    nri              = nlist->nri;
 +    iinr             = nlist->iinr;
 +    jindex           = nlist->jindex;
 +    jjnr             = nlist->jjnr;
 +    shiftidx         = nlist->shift;
 +    gid              = nlist->gid;
 +    shiftvec         = fr->shift_vec[0];
 +    fshift           = fr->fshift[0];
 +    /* #if KERNEL_ELEC != 'None' */
 +    facel            = _mm256_set1_pd(fr->epsfac);
 +    charge           = mdatoms->chargeA;
 +    /*     #if 'ReactionField' in KERNEL_ELEC */
 +    krf              = _mm256_set1_pd(fr->ic->k_rf);
 +    krf2             = _mm256_set1_pd(fr->ic->k_rf*2.0);
 +    crf              = _mm256_set1_pd(fr->ic->c_rf);
 +    /*     #endif */
 +    /* #endif */
 +    /* #if KERNEL_VDW != 'None' */
 +    nvdwtype         = fr->ntype;
 +    vdwparam         = fr->nbfp;
 +    vdwtype          = mdatoms->typeA;
 +    /* #endif */
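 +    /* ## The (c6,c12) pairs are stored interleaved in fr->nbfp: row i of the     */
 +    /* ## nvdwtype*nvdwtype matrix starts at 2*nvdwtype*vdwtype[i], and entry j   */
 +    /* ## sits at offset 2*vdwtype[j] (+0 for c6, +1 for c12), as used below.     */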
 +
 +    /* #if 'Table' in KERNEL_ELEC and 'Table' in KERNEL_VDW */
 +    vftab            = kernel_data->table_elec_vdw->data;
 +    vftabscale       = _mm256_set1_pd(kernel_data->table_elec_vdw->scale);
 +    /* #elif 'Table' in KERNEL_ELEC */
 +    vftab            = kernel_data->table_elec->data;
 +    vftabscale       = _mm256_set1_pd(kernel_data->table_elec->scale);
 +    /* #elif 'Table' in KERNEL_VDW */
 +    vftab            = kernel_data->table_vdw->data;
 +    vftabscale       = _mm256_set1_pd(kernel_data->table_vdw->scale);
 +    /* #endif */
 +
 +    /* #if 'Ewald' in KERNEL_ELEC */
 +    sh_ewald         = _mm256_set1_pd(fr->ic->sh_ewald);
 +    beta             = _mm256_set1_pd(fr->ic->ewaldcoeff);
 +    beta2            = _mm256_mul_pd(beta,beta);
 +    beta3            = _mm256_mul_pd(beta,beta2);
 +
 +    /*     #if KERNEL_VF=='Force' and KERNEL_MOD_ELEC!='PotentialSwitch' */
 +    ewtab            = fr->ic->tabq_coul_F;
 +    ewtabscale       = _mm256_set1_pd(fr->ic->tabq_scale);
 +    ewtabhalfspace   = _mm256_set1_pd(0.5/fr->ic->tabq_scale);
 +    /*     #else */
 +    ewtab            = fr->ic->tabq_coul_FDV0;
 +    ewtabscale       = _mm256_set1_pd(fr->ic->tabq_scale);
 +    ewtabhalfspace   = _mm256_set1_pd(0.5/fr->ic->tabq_scale);
 +    /*     #endif */
 +    /* #endif */
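 +    /* ## Force-only kernels without a potential modifier can use the compact     */
 +    /* ## force-only table tabq_coul_F, interpolating between F[i] and F[i+1];    */
 +    /* ## all other variants use tabq_coul_FDV0 with four packed values per       */
 +    /* ## point, matching the ewtabF/ewtabD/ewtabV/ewtabFn loads further down.    */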
 +
 +    /* #if KERNEL_ELEC=='GeneralizedBorn' */
 +    invsqrta         = fr->invsqrta;
 +    dvda             = fr->dvda;
 +    gbtabscale       = _mm256_set1_pd(fr->gbtab.scale);
 +    gbtab            = fr->gbtab.data;
 +    gbinvepsdiff     = _mm256_set1_pd((1.0/fr->epsilon_r) - (1.0/fr->gb_epsilon_solvent));
 +    /* #endif */
 +
 +    /* #if 'Water' in GEOMETRY_I */
 +    /* Setup water-specific parameters */
 +    inr              = nlist->iinr[0];
 +    /*     #for I in PARTICLES_ELEC_I */
 +    iq{I}              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+{I}]));
 +    /*     #endfor */
 +    /*     #for I in PARTICLES_VDW_I */
 +    vdwioffsetptr{I}   = vdwparam+2*nvdwtype*vdwtype[inr+{I}];
 +    /*     #endfor */
 +    /* #endif */
 +
 +    /* #if 'Water' in GEOMETRY_J */
 +    /*     #for J in PARTICLES_ELEC_J */
 +    jq{J}              = _mm256_set1_pd(charge[inr+{J}]);
 +    /*     #endfor */
 +    /*     #for J in PARTICLES_VDW_J */
 +    vdwjidx{J}A        = 2*vdwtype[inr+{J}];
 +    /*     #endfor */
 +    /*     #for I,J in PAIRS_IJ */
 +    /*         #if 'electrostatics' in INTERACTION_FLAGS[I][J] */
 +    qq{I}{J}             = _mm256_mul_pd(iq{I},jq{J});
 +    /*         #endif */
 +    /*         #if 'vdw' in INTERACTION_FLAGS[I][J] */
 +    c6_{I}{J}            = _mm256_set1_pd(vdwioffsetptr{I}[vdwjidx{J}A]);
 +    c12_{I}{J}           = _mm256_set1_pd(vdwioffsetptr{I}[vdwjidx{J}A+1]);
 +    /*         #endif */
 +    /*     #endfor */
 +    /* #endif */
 +
 +    /* #if KERNEL_MOD_ELEC!='None' or KERNEL_MOD_VDW!='None' */
 +    /*     #if KERNEL_ELEC!='None' */
 +    /* When we use explicit cutoffs, the value must be identical for elec and VdW, so we use elec as an arbitrary choice */
 +    rcutoff_scalar   = fr->rcoulomb;
 +    /*     #else */
 +    rcutoff_scalar   = fr->rvdw;
 +    /*     #endif */
 +    rcutoff          = _mm256_set1_pd(rcutoff_scalar);
 +    rcutoff2         = _mm256_mul_pd(rcutoff,rcutoff);
 +    /* #endif */
 +
 +    /* #if KERNEL_MOD_VDW=='PotentialShift' */
 +    sh_vdw_invrcut6  = _mm256_set1_pd(fr->ic->sh_invrc6);
 +    rvdw             = _mm256_set1_pd(fr->rvdw);
 +    /* #endif */
 +
 +    /* #if 'PotentialSwitch' in [KERNEL_MOD_ELEC,KERNEL_MOD_VDW] */
 +    /*     #if KERNEL_MOD_ELEC=='PotentialSwitch'  */
 +    rswitch_scalar   = fr->rcoulomb_switch;
 +    rswitch          = _mm256_set1_pd(rswitch_scalar);
 +    /*     #else */
 +    rswitch_scalar   = fr->rvdw_switch;
 +    rswitch          = _mm256_set1_pd(rswitch_scalar);
 +    /*     #endif */
 +    /* Setup switch parameters */
 +    d_scalar         = rcutoff_scalar-rswitch_scalar;
 +    d                = _mm256_set1_pd(d_scalar);
 +    swV3             = _mm256_set1_pd(-10.0/(d_scalar*d_scalar*d_scalar));
 +    swV4             = _mm256_set1_pd( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
 +    swV5             = _mm256_set1_pd( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
 +    /*     #if 'Force' in KERNEL_VF */
 +    swF2             = _mm256_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar));
 +    swF3             = _mm256_set1_pd( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
 +    swF4             = _mm256_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
 +    /*     #endif */
 +    /* #endif */
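 +    /* ## With d = r - rswitch and D = rcutoff_scalar - rswitch_scalar, the       */
 +    /* ## coefficients above implement the quintic                                */
 +    /* ## sw(d) = 1 - 10(d/D)^3 + 15(d/D)^4 - 6(d/D)^5, which falls smoothly      */
 +    /* ## from 1 at rswitch to 0 at rcutoff with vanishing first and second       */
 +    /* ## derivatives at both ends; swF2..swF4 are the matching coefficients      */
 +    /* ## of d(sw)/dd used for the force further down.                            */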
 +
 +    /* Avoid stupid compiler warnings */
 +    jnrA = jnrB = jnrC = jnrD = 0;
 +    j_coord_offsetA = 0;
 +    j_coord_offsetB = 0;
 +    j_coord_offsetC = 0;
 +    j_coord_offsetD = 0;
 +
 +    /* ## Keep track of the floating point operations we issue for reporting! */
 +    /* #define OUTERFLOPS 0 */
 +    outeriter        = 0;
 +    inneriter        = 0;
 +
 +    for(iidx=0;iidx<4*DIM;iidx++)
 +    {
 +        scratch[iidx] = 0.0;
 +    }
 +
 +    /* Start outer loop over neighborlists */
 +    for(iidx=0; iidx<nri; iidx++)
 +    {
 +        /* Load shift vector for this list */
 +        i_shift_offset   = DIM*shiftidx[iidx];
 +
 +        /* Load limits for loop over neighbors */
 +        j_index_start    = jindex[iidx];
 +        j_index_end      = jindex[iidx+1];
 +
 +        /* Get outer coordinate index */
 +        inr              = iinr[iidx];
 +        i_coord_offset   = DIM*inr;
 +
 +        /* Load i particle coords and add shift vector */
 +        /* #if GEOMETRY_I == 'Particle' */
 +        gmx_mm256_load_shift_and_1rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
 +        /* #elif GEOMETRY_I == 'Water3' */
 +        gmx_mm256_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
 +                                                    &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
 +        /* #elif GEOMETRY_I == 'Water4' */
 +        /*     #if 0 in PARTICLES_I                 */
 +        gmx_mm256_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
 +                                                    &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
 +        /*     #else                                */
 +        gmx_mm256_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
 +                                                    &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
 +        /*     #endif                               */
 +        /* #endif                                   */
 +
 +        /* #if 'Force' in KERNEL_VF */
 +        /*     #for I in PARTICLES_I */
 +        fix{I}             = _mm256_setzero_pd();
 +        fiy{I}             = _mm256_setzero_pd();
 +        fiz{I}             = _mm256_setzero_pd();
 +        /*     #endfor */
 +        /* #endif */
 +
 +        /* ## For water we already preloaded parameters at the start of the kernel */
 +        /* #if not 'Water' in GEOMETRY_I */
 +        /* Load parameters for i particles */
 +        /*     #for I in PARTICLES_ELEC_I */
 +        iq{I}              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+{I}]));
 +        /*         #define OUTERFLOPS OUTERFLOPS+1 */
 +        /*         #if KERNEL_ELEC=='GeneralizedBorn' */
 +        isai{I}            = _mm256_set1_pd(invsqrta[inr+{I}]);
 +        /*         #endif */
 +        /*     #endfor */
 +        /*     #for I in PARTICLES_VDW_I */
 +        vdwioffsetptr{I}   = vdwparam+2*nvdwtype*vdwtype[inr+{I}];
 +        /*     #endfor */
 +        /* #endif */
 +
 +        /* #if 'Potential' in KERNEL_VF */
 +        /* Reset potential sums */
 +        /*     #if KERNEL_ELEC != 'None' */
 +        velecsum         = _mm256_setzero_pd();
 +        /*     #endif */
 +        /*     #if 'GeneralizedBorn' in KERNEL_ELEC */
 +        vgbsum           = _mm256_setzero_pd();
 +        /*     #endif */
 +        /*     #if KERNEL_VDW != 'None' */
 +        vvdwsum          = _mm256_setzero_pd();
 +        /*     #endif */
 +        /* #endif */
 +        /*     #if 'GeneralizedBorn' in KERNEL_ELEC and 'Force' in KERNEL_VF */
 +        dvdasum          = _mm256_setzero_pd();
 +        /*     #endif */
 +
 +        /* #for ROUND in ['Loop','Epilogue'] */
 +
 +        /* #if ROUND =='Loop' */
 +        /* Start inner kernel loop */
 +        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
 +        {
 +        /* ## First round is normal loop (next statement resets indentation) */
 +        /*     #if 0 */
 +        }
 +        /*     #endif */
 +        /* #else */
 +        if(jidx<j_index_end)
 +        {
 +        /* ## Second round is epilogue */
 +        /* #endif */
 +        /* #define INNERFLOPS 0 */
 +
 +            /* Get j neighbor index, and coordinate index */
 +            /* #if ROUND =='Loop' */
 +            jnrA             = jjnr[jidx];
 +            jnrB             = jjnr[jidx+1];
 +            jnrC             = jjnr[jidx+2];
 +            jnrD             = jjnr[jidx+3];
 +            /* #else */
 +            jnrlistA         = jjnr[jidx];
 +            jnrlistB         = jjnr[jidx+1];
 +            jnrlistC         = jjnr[jidx+2];
 +            jnrlistD         = jjnr[jidx+3];
 +            /* Sign of each element will be negative for non-real atoms.
 +             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
 +             * so use it as val = _mm256_andnot_pd(mask,val) to clear dummy entries.
 +             */
 +            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 +
 +            tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
 +            tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
 +            dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
 +
 +            jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
 +            jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
 +            jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
 +            jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
 +            /* #endif */
 +            j_coord_offsetA  = DIM*jnrA;
 +            j_coord_offsetB  = DIM*jnrB;
 +            j_coord_offsetC  = DIM*jnrC;
 +            j_coord_offsetD  = DIM*jnrD;
 +
 +            /* load j atom coordinates */
 +            /* #if GEOMETRY_J == 'Particle'             */
 +            gmx_mm256_load_1rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
 +                                                 x+j_coord_offsetC,x+j_coord_offsetD,
 +                                                 &jx0,&jy0,&jz0);
 +            /* #elif GEOMETRY_J == 'Water3'             */
 +            gmx_mm256_load_3rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
 +                                                 x+j_coord_offsetC,x+j_coord_offsetD,
 +                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
 +            /* #elif GEOMETRY_J == 'Water4'             */
 +            /*     #if 0 in PARTICLES_J                 */
 +            gmx_mm256_load_4rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
 +                                                 x+j_coord_offsetC,x+j_coord_offsetD,
 +                                                 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
 +                                                 &jy2,&jz2,&jx3,&jy3,&jz3);
 +            /*     #else                                */
 +            gmx_mm256_load_3rvec_4ptr_swizzle_pd(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
 +                                                 x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
 +                                                 &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
 +            /*     #endif                               */
 +            /* #endif                                   */
 +
 +            /* Calculate displacement vector */
 +            /* #for I,J in PAIRS_IJ */
 +            dx{I}{J}             = _mm256_sub_pd(ix{I},jx{J});
 +            dy{I}{J}             = _mm256_sub_pd(iy{I},jy{J});
 +            dz{I}{J}             = _mm256_sub_pd(iz{I},jz{J});
 +            /*     #define INNERFLOPS INNERFLOPS+3 */
 +            /* #endfor */
 +
 +            /* Calculate squared distance and things based on it */
 +            /* #for I,J in PAIRS_IJ */
 +            rsq{I}{J}            = gmx_mm256_calc_rsq_pd(dx{I}{J},dy{I}{J},dz{I}{J});
 +            /*     #define INNERFLOPS INNERFLOPS+5 */
 +            /* #endfor */
 +
 +            /* #for I,J in PAIRS_IJ */
 +            /*     #if 'rinv' in INTERACTION_FLAGS[I][J] */
 +            rinv{I}{J}           = gmx_mm256_invsqrt_pd(rsq{I}{J});
 +            /*         #define INNERFLOPS INNERFLOPS+5 */
 +            /*     #endif */
 +            /* #endfor */
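 +            /*     ## The 5 flops charged for gmx_mm256_invsqrt_pd (and 4 for        */
 +            /*     ## gmx_mm256_inv_pd below) assume iteratively refined estimates   */
 +            /*     ## rather than single instructions; the exact cost depends on the */
 +            /*     ## implementation in the included avx_256_double math headers.    */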
 +
 +            /* #for I,J in PAIRS_IJ */
 +            /*     #if 'rinvsq' in INTERACTION_FLAGS[I][J] */
 +            /*         # if 'rinv' not in INTERACTION_FLAGS[I][J] */
 +            rinvsq{I}{J}         = gmx_mm256_inv_pd(rsq{I}{J});
 +            /*             #define INNERFLOPS INNERFLOPS+4 */
 +            /*         #else */
 +            rinvsq{I}{J}         = _mm256_mul_pd(rinv{I}{J},rinv{I}{J});
 +            /*             #define INNERFLOPS INNERFLOPS+1 */
 +            /*         #endif */
 +            /*     #endif */
 +            /* #endfor */
 +
 +            /* #if not 'Water' in GEOMETRY_J */
 +            /* Load parameters for j particles */
 +            /*     #for J in PARTICLES_ELEC_J */
 +            jq{J}              = gmx_mm256_load_4real_swizzle_pd(charge+jnrA+{J},charge+jnrB+{J},
 +                                                                 charge+jnrC+{J},charge+jnrD+{J});
 +            /*         #if KERNEL_ELEC=='GeneralizedBorn' */
 +            isaj{J}            = gmx_mm256_load_4real_swizzle_pd(invsqrta+jnrA+{J},invsqrta+jnrB+{J},
 +                                                                 invsqrta+jnrC+{J},invsqrta+jnrD+{J});
 +            /*         #endif */
 +            /*     #endfor */
 +            /*     #for J in PARTICLES_VDW_J */
 +            vdwjidx{J}A        = 2*vdwtype[jnrA+{J}];
 +            vdwjidx{J}B        = 2*vdwtype[jnrB+{J}];
 +            vdwjidx{J}C        = 2*vdwtype[jnrC+{J}];
 +            vdwjidx{J}D        = 2*vdwtype[jnrD+{J}];
 +            /*     #endfor */
 +            /* #endif */
 +
 +            /* #if 'Force' in KERNEL_VF and not 'Particle' in GEOMETRY_I */
 +            /*     #for J in PARTICLES_J */
 +            fjx{J}             = _mm256_setzero_pd();
 +            fjy{J}             = _mm256_setzero_pd();
 +            fjz{J}             = _mm256_setzero_pd();
 +            /*     #endfor */
 +            /* #endif */
 +
 +            /* #for I,J in PAIRS_IJ */
 +
 +            /**************************
 +             * CALCULATE INTERACTIONS *
 +             **************************/
 +
 +            /*     ## Note special check for TIP4P-TIP4P. Since we are cutting off all hydrogen interactions we also cut the LJ-only O-O interaction */
 +            /*     #if 'exactcutoff' in INTERACTION_FLAGS[I][J] or (GEOMETRY_I=='Water4' and GEOMETRY_J=='Water4' and 'exactcutoff' in INTERACTION_FLAGS[1][1]) */
 +            /*         ## We always calculate rinv/rinvsq above to enable pipelining in compilers (performance tested on x86) */
 +            if (gmx_mm256_any_lt(rsq{I}{J},rcutoff2))
 +            {
 +                /*     #if 0    ## this and the next two lines are a hack to maintain auto-indentation in the template file */
 +            }
 +            /*         #endif */
 +            /*         #define INNERFLOPS INNERFLOPS+1 */
 +            /*     #endif */
 +
 +            /*     #if 'r' in INTERACTION_FLAGS[I][J] */
 +            r{I}{J}              = _mm256_mul_pd(rsq{I}{J},rinv{I}{J});
 +            /*         #if ROUND == 'Epilogue' */
 +            r{I}{J}              = _mm256_andnot_pd(dummy_mask,r{I}{J});
 +            /*             #define INNERFLOPS INNERFLOPS+1 */
 +            /*         #endif */
 +            /*         #define INNERFLOPS INNERFLOPS+1 */
 +            /*     #endif */
 +
 +            /*     ## For water geometries we already loaded parameters at the start of the kernel */
 +            /*     #if not 'Water' in GEOMETRY_J */
 +            /* Compute parameters for interactions between i and j atoms */
 +            /*         #if 'electrostatics' in INTERACTION_FLAGS[I][J] */
 +            qq{I}{J}             = _mm256_mul_pd(iq{I},jq{J});
 +            /*             #define INNERFLOPS INNERFLOPS+1 */
 +            /*         #endif */
 +            /*         #if 'vdw' in INTERACTION_FLAGS[I][J] */
 +            gmx_mm256_load_4pair_swizzle_pd(vdwioffsetptr{I}+vdwjidx{J}A,
 +                                            vdwioffsetptr{I}+vdwjidx{J}B,
 +                                            vdwioffsetptr{I}+vdwjidx{J}C,
 +                                            vdwioffsetptr{I}+vdwjidx{J}D,
 +                                            &c6_{I}{J},&c12_{I}{J});
 +            /*         #endif */
 +            /*     #endif */
 +
 +            /*     #if 'table' in INTERACTION_FLAGS[I][J] */
 +            /* Calculate table index by multiplying r with table scale and truncating to integer */
 +            rt               = _mm256_mul_pd(r{I}{J},vftabscale);
 +            vfitab           = _mm256_cvttpd_epi32(rt);
 +            vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
 +            /*         #define INNERFLOPS INNERFLOPS+4                          */
 +            /*         #if 'Table' in KERNEL_ELEC and 'Table' in KERNEL_VDW     */
 +            /*             ## 3 tables, 4 bytes per point: multiply index by 12 */
 +            vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
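 +            /*             ## (vfitab + (vfitab<<1)) << 2 equals vfitab*3*4, i.e.    */
 +            /*             ## the multiply by 12 done with shifts and adds only      */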
 +            /*         #elif 'Table' in KERNEL_ELEC                             */
 +            /*             ## 1 table, 4 bytes per point: multiply index by 4   */
 +            vfitab           = _mm_slli_epi32(vfitab,2);
 +            /*         #elif 'Table' in KERNEL_VDW                              */
 +            /*             ## 2 tables, 4 bytes per point: multiply index by 8  */
 +            vfitab           = _mm_slli_epi32(vfitab,3);
 +            /*         #endif                                                   */
 +            /*     #endif */
 +
 +            /*     ## ELECTROSTATIC INTERACTIONS */
 +            /*     #if 'electrostatics' in INTERACTION_FLAGS[I][J] */
 +
 +            /*         #if KERNEL_ELEC=='Coulomb' */
 +
 +            /* COULOMB ELECTROSTATICS */
 +            velec            = _mm256_mul_pd(qq{I}{J},rinv{I}{J});
 +            /*             #define INNERFLOPS INNERFLOPS+1 */
 +            /*             #if 'Force' in KERNEL_VF */
 +            felec            = _mm256_mul_pd(velec,rinvsq{I}{J});
 +            /*                 #define INNERFLOPS INNERFLOPS+1 */
 +            /*             #endif */
 +
 +            /*         #elif KERNEL_ELEC=='ReactionField' */
 +
 +            /* REACTION-FIELD ELECTROSTATICS */
 +            /*             #if 'Potential' in KERNEL_VF */
 +            velec            = _mm256_mul_pd(qq{I}{J},_mm256_sub_pd(_mm256_add_pd(rinv{I}{J},_mm256_mul_pd(krf,rsq{I}{J})),crf));
 +            /*                 #define INNERFLOPS INNERFLOPS+4 */
 +            /*             #endif */
 +            /*             #if 'Force' in KERNEL_VF */
 +            felec            = _mm256_mul_pd(qq{I}{J},_mm256_sub_pd(_mm256_mul_pd(rinv{I}{J},rinvsq{I}{J}),krf2));
 +            /*                 #define INNERFLOPS INNERFLOPS+3 */
 +            /*             #endif */
 +
 +            /*         #elif KERNEL_ELEC=='GeneralizedBorn' */
 +
 +            /* GENERALIZED BORN AND COULOMB ELECTROSTATICS */
 +            isaprod          = _mm256_mul_pd(isai{I},isaj{J});
 +            gbqqfactor       = _mm256_xor_pd(signbit,_mm256_mul_pd(qq{I}{J},_mm256_mul_pd(isaprod,gbinvepsdiff)));
 +            gbscale          = _mm256_mul_pd(isaprod,gbtabscale);
 +            /*             #define INNERFLOPS INNERFLOPS+5 */
 +
 +            /* Calculate generalized born table index - this is a separate table from the normal one,
 +             * but we use the same procedure by multiplying r with scale and truncating to integer.
 +             */
 +            rt               = _mm256_mul_pd(r{I}{J},gbscale);
 +            gbitab           = _mm256_cvttpd_epi32(rt);
 +            gbeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
 +            gbitab           = _mm_slli_epi32(gbitab,2);
 +            Y                = _mm256_load_pd( gbtab + _mm_extract_epi32(gbitab,0) );
 +            F                = _mm256_load_pd( gbtab + _mm_extract_epi32(gbitab,1) );
 +            G                = _mm256_load_pd( gbtab + _mm_extract_epi32(gbitab,2) );
 +            H                = _mm256_load_pd( gbtab + _mm_extract_epi32(gbitab,3) );
 +            GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
 +            Heps             = _mm256_mul_pd(gbeps,H);
 +            Fp               = _mm256_add_pd(F,_mm256_mul_pd(gbeps,_mm256_add_pd(G,Heps)));
 +            VV               = _mm256_add_pd(Y,_mm256_mul_pd(gbeps,Fp));
 +            vgb              = _mm256_mul_pd(gbqqfactor,VV);
 +            /*             #define INNERFLOPS INNERFLOPS+10 */
 +
 +            /*             #if 'Force' in KERNEL_VF */
 +            FF               = _mm256_add_pd(Fp,_mm256_mul_pd(gbeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
 +            fgb              = _mm256_mul_pd(gbqqfactor,_mm256_mul_pd(FF,gbscale));
 +            dvdatmp          = _mm256_mul_pd(minushalf,_mm256_add_pd(vgb,_mm256_mul_pd(fgb,r{I}{J})));
++            /*                 #if ROUND == 'Epilogue' */
++            dvdatmp          = _mm256_andnot_pd(dummy_mask,dvdatmp);
++            /*                 #endif */
 +            dvdasum          = _mm256_add_pd(dvdasum,dvdatmp);
 +            /*                 #if ROUND == 'Loop' */
 +            fjptrA           = dvda+jnrA;
 +            fjptrB           = dvda+jnrB;
 +            fjptrC           = dvda+jnrC;
 +            fjptrD           = dvda+jnrD;
 +            /*                 #else */
 +            /* The pointers to scratch make sure that this code really can't screw things up even with compilers that take gmx_restrict seriously (e.g. icc 13). */
 +            fjptrA             = (jnrlistA>=0) ? dvda+jnrA : scratch;
 +            fjptrB             = (jnrlistB>=0) ? dvda+jnrB : scratch;
 +            fjptrC             = (jnrlistC>=0) ? dvda+jnrC : scratch;
 +            fjptrD             = (jnrlistD>=0) ? dvda+jnrD : scratch;
 +            /*                 #endif */
 +            gmx_mm256_increment_4real_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
 +                                                 _mm256_mul_pd(dvdatmp,_mm256_mul_pd(isaj{J},isaj{J})));
 +            /*                 #define INNERFLOPS INNERFLOPS+12 */
 +            /*             #endif */
 +            velec            = _mm256_mul_pd(qq{I}{J},rinv{I}{J});
 +            /*                 #define INNERFLOPS INNERFLOPS+1 */
 +            /*             #if 'Force' in KERNEL_VF */
 +            felec            = _mm256_mul_pd(_mm256_sub_pd(_mm256_mul_pd(velec,rinv{I}{J}),fgb),rinv{I}{J});
 +            /*                 #define INNERFLOPS INNERFLOPS+3 */
 +            /*             #endif */
 +
 +            /*         #elif KERNEL_ELEC=='Ewald' */
 +            /* EWALD ELECTROSTATICS */
 +
 +            /* Calculate Ewald table index by multiplying r with scale and truncating to integer */
 +            ewrt             = _mm256_mul_pd(r{I}{J},ewtabscale);
 +            ewitab           = _mm256_cvttpd_epi32(ewrt);
 +            eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
 +            /*             #define INNERFLOPS INNERFLOPS+4 */
 +            /*             #if 'Potential' in KERNEL_VF or KERNEL_MOD_ELEC=='PotentialSwitch' */
 +            ewitab           = _mm_slli_epi32(ewitab,2);
 +            ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
 +            ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
 +            ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
 +            ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
 +            GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
 +            felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
 +            /*                 #define INNERFLOPS INNERFLOPS+2 */
 +            /*                 #if KERNEL_MOD_ELEC=='PotentialShift' */
 +            velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
 +            velec            = _mm256_mul_pd(qq{I}{J},_mm256_sub_pd(_mm256_sub_pd(rinv{I}{J},sh_ewald),velec));
 +            /*                     #define INNERFLOPS INNERFLOPS+7 */
 +            /*                 #else */
 +            velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
 +            velec            = _mm256_mul_pd(qq{I}{J},_mm256_sub_pd(rinv{I}{J},velec));
 +            /*                     #define INNERFLOPS INNERFLOPS+6 */
 +            /*                 #endif */
 +            /*                 #if 'Force' in KERNEL_VF */
 +            felec            = _mm256_mul_pd(_mm256_mul_pd(qq{I}{J},rinv{I}{J}),_mm256_sub_pd(rinvsq{I}{J},felec));
 +            /*                      #define INNERFLOPS INNERFLOPS+3 */
 +            /*                 #endif */
 +            /*             #elif KERNEL_VF=='Force' */
 +            gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
 +                                            ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
 +                                            &ewtabF,&ewtabFn);
 +            felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
 +            felec            = _mm256_mul_pd(_mm256_mul_pd(qq{I}{J},rinv{I}{J}),_mm256_sub_pd(rinvsq{I}{J},felec));
 +            /*                 #define INNERFLOPS INNERFLOPS+7 */
 +            /*             #endif */
 +
 +            /*         #elif KERNEL_ELEC=='CubicSplineTable' */
 +
 +            /* CUBIC SPLINE TABLE ELECTROSTATICS */
 +            Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
 +            F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
 +            G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
 +            H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
 +            GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
 +            Heps             = _mm256_mul_pd(vfeps,H);
 +            Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
 +            /*             #define INNERFLOPS INNERFLOPS+4 */
 +            /*             #if 'Potential' in KERNEL_VF */
 +            VV               = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
 +            velec            = _mm256_mul_pd(qq{I}{J},VV);
 +            /*                 #define INNERFLOPS INNERFLOPS+3 */
 +            /*             #endif */
 +            /*             #if 'Force' in KERNEL_VF */
 +            FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
 +            felec            = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq{I}{J},FF),_mm256_mul_pd(vftabscale,rinv{I}{J})));
 +            /*                 #define INNERFLOPS INNERFLOPS+7 */
 +            /*             #endif */
 +            /*         #endif */
 +            /*         ## End of check for electrostatics interaction forms */
 +            /*     #endif */
 +            /*     ## END OF ELECTROSTATIC INTERACTION CHECK FOR PAIR I-J */
 +
 +            /*     #if 'vdw' in INTERACTION_FLAGS[I][J] */
 +
 +            /*         #if KERNEL_VDW=='LennardJones' */
 +
 +            /* LENNARD-JONES DISPERSION/REPULSION */
 +
 +            rinvsix          = _mm256_mul_pd(_mm256_mul_pd(rinvsq{I}{J},rinvsq{I}{J}),rinvsq{I}{J});
 +            /*             #define INNERFLOPS INNERFLOPS+2 */
 +            /*             #if 'Potential' in KERNEL_VF or KERNEL_MOD_VDW=='PotentialSwitch' */
 +            vvdw6            = _mm256_mul_pd(c6_{I}{J},rinvsix);
 +            vvdw12           = _mm256_mul_pd(c12_{I}{J},_mm256_mul_pd(rinvsix,rinvsix));
 +            /*                 #define INNERFLOPS INNERFLOPS+3 */
 +            /*                 #if KERNEL_MOD_VDW=='PotentialShift' */
 +            vvdw             = _mm256_sub_pd(_mm256_mul_pd( _mm256_sub_pd(vvdw12 , _mm256_mul_pd(c12_{I}{J},_mm256_mul_pd(sh_vdw_invrcut6,sh_vdw_invrcut6))), one_twelfth) ,
 +                                          _mm256_mul_pd( _mm256_sub_pd(vvdw6,_mm256_mul_pd(c6_{I}{J},sh_vdw_invrcut6)),one_sixth));
 +            /*                     #define INNERFLOPS INNERFLOPS+8 */
 +            /*                 #else */
 +            vvdw             = _mm256_sub_pd( _mm256_mul_pd(vvdw12,one_twelfth) , _mm256_mul_pd(vvdw6,one_sixth) );
 +            /*                     #define INNERFLOPS INNERFLOPS+3 */
 +            /*                 #endif */
 +            /*                 ## Check for force inside potential check, i.e. this means we already did the potential part */
 +            /*                 #if 'Force' in KERNEL_VF */
 +            fvdw             = _mm256_mul_pd(_mm256_sub_pd(vvdw12,vvdw6),rinvsq{I}{J});
 +            /*                     #define INNERFLOPS INNERFLOPS+2 */
 +            /*                 #endif */
 +            /*             #elif KERNEL_VF=='Force' */
 +            /*                 ## Force-only LennardJones makes it possible to save 1 flop (they do add up...) */
 +            fvdw             = _mm256_mul_pd(_mm256_sub_pd(_mm256_mul_pd(c12_{I}{J},rinvsix),c6_{I}{J}),_mm256_mul_pd(rinvsix,rinvsq{I}{J}));
 +            /*                 #define INNERFLOPS INNERFLOPS+4 */
 +            /*             #endif */
 +
 +            /*         #elif KERNEL_VDW=='CubicSplineTable' */
 +
 +            /* CUBIC SPLINE TABLE DISPERSION */
 +            /*             #if 'Table' in KERNEL_ELEC */
 +            vfitab           = _mm_add_epi32(vfitab,ifour);
 +            /*             #endif                     */
 +            Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
 +            F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
 +            G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
 +            H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
 +            GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
 +            Heps             = _mm256_mul_pd(vfeps,H);
 +            Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
 +            /*             #define INNERFLOPS INNERFLOPS+4 */
 +            /*             #if 'Potential' in KERNEL_VF */
 +            VV               = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
 +            vvdw6            = _mm256_mul_pd(c6_{I}{J},VV);
 +            /*                 #define INNERFLOPS INNERFLOPS+3 */
 +            /*             #endif */
 +            /*             #if 'Force' in KERNEL_VF */
 +            FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
 +            fvdw6            = _mm256_mul_pd(c6_{I}{J},FF);
 +            /*                 #define INNERFLOPS INNERFLOPS+4 */
 +            /*             #endif */
 +
 +            /* CUBIC SPLINE TABLE REPULSION */
 +            vfitab           = _mm_add_epi32(vfitab,ifour);
 +            Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
 +            F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
 +            G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
 +            H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
 +            GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
 +            Heps             = _mm256_mul_pd(vfeps,H);
 +            Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
 +            /*             #define INNERFLOPS INNERFLOPS+4 */
 +            /*             #if 'Potential' in KERNEL_VF */
 +            VV               = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
 +            vvdw12           = _mm256_mul_pd(c12_{I}{J},VV);
 +            /*                 #define INNERFLOPS INNERFLOPS+3 */
 +            /*             #endif */
 +            /*             #if 'Force' in KERNEL_VF */
 +            FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
 +            fvdw12           = _mm256_mul_pd(c12_{I}{J},FF);
 +            /*                 #define INNERFLOPS INNERFLOPS+5 */
 +            /*             #endif */
 +            /*             #if 'Potential' in KERNEL_VF */
 +            vvdw             = _mm256_add_pd(vvdw12,vvdw6);
 +            /*                 #define INNERFLOPS INNERFLOPS+1 */
 +            /*             #endif */
 +            /*             #if 'Force' in KERNEL_VF */
 +            fvdw             = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_add_pd(fvdw6,fvdw12),_mm256_mul_pd(vftabscale,rinv{I}{J})));
 +            /*                 #define INNERFLOPS INNERFLOPS+4 */
 +            /*             #endif */
 +            /*         #endif */
 +            /*         ## End of check for vdw interaction forms */
 +            /*     #endif */
 +            /*     ## END OF VDW INTERACTION CHECK FOR PAIR I-J */
 +
 +            /*     #if 'switch' in INTERACTION_FLAGS[I][J] */
 +            d                = _mm256_sub_pd(r{I}{J},rswitch);
 +            d                = _mm256_max_pd(d,_mm256_setzero_pd());
 +            d2               = _mm256_mul_pd(d,d);
 +            sw               = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
 +            /*         #define INNERFLOPS INNERFLOPS+10 */
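 +            /*         ## In scalar form: sw(d) = 1 + d^3*(swV3 + d*(swV4 + d*swV5)) with        */
 +            /*         ## d = max(r - rswitch, 0), so sw is 1 inside rswitch and the polynomial   */
 +            /*         ## (whose coefficients are precomputed from rswitch and the cutoff in the  */
 +            /*         ## kernel preamble) takes it smoothly to 0 at the cutoff.                  */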
 +
 +            /*         #if 'Force' in KERNEL_VF */
 +            dsw              = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
 +            /*             #define INNERFLOPS INNERFLOPS+5 */
 +            /*         #endif */
 +
 +            /* Evaluate switch function */
 +            /*         #if 'Force' in KERNEL_VF */
 +            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
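 +            /*         ## I.e. with F = -dV/dr and fscal = F/r, switching V -> V*sw gives       */
 +            /*         ## F_sw = -(V'*sw + V*dsw), hence fscal_sw = fscal*sw - V*dsw/r, which   */
 +            /*         ## is what the felec/fvdw updates below apply (dsw is the sw derivative). */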
 +            /*             #if 'electrostatics' in INTERACTION_FLAGS[I][J] and KERNEL_MOD_ELEC=='PotentialSwitch' */
 +            felec            = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv{I}{J},_mm256_mul_pd(velec,dsw)) );
 +            /*                 #define INNERFLOPS INNERFLOPS+4 */
 +            /*             #endif */
 +            /*             #if 'vdw' in INTERACTION_FLAGS[I][J] and KERNEL_MOD_VDW=='PotentialSwitch' */
 +            fvdw             = _mm256_sub_pd( _mm256_mul_pd(fvdw,sw) , _mm256_mul_pd(rinv{I}{J},_mm256_mul_pd(vvdw,dsw)) );
 +            /*                 #define INNERFLOPS INNERFLOPS+4 */
 +            /*             #endif */
 +            /*         #endif */
 +            /*         #if 'Potential' in KERNEL_VF */
 +            /*             #if 'electrostatics' in INTERACTION_FLAGS[I][J] and KERNEL_MOD_ELEC=='PotentialSwitch' */
 +            velec            = _mm256_mul_pd(velec,sw);
 +            /*                 #define INNERFLOPS INNERFLOPS+1 */
 +            /*             #endif */
 +            /*             #if 'vdw' in INTERACTION_FLAGS[I][J] and KERNEL_MOD_VDW=='PotentialSwitch' */
 +            vvdw             = _mm256_mul_pd(vvdw,sw);
 +            /*                 #define INNERFLOPS INNERFLOPS+1 */
 +            /*             #endif */
 +            /*         #endif */
 +            /*     #endif */
 +            /*     ## Note special check for TIP4P-TIP4P. Since we are cutting off all hydrogen interactions we also cut the LJ-only O-O interaction */
 +            /*     #if 'exactcutoff' in INTERACTION_FLAGS[I][J] or (GEOMETRY_I=='Water4' and GEOMETRY_J=='Water4' and 'exactcutoff' in INTERACTION_FLAGS[1][1]) */
 +            cutoff_mask      = _mm256_cmp_pd(rsq{I}{J},rcutoff2,_CMP_LT_OQ);
 +            /*         #define INNERFLOPS INNERFLOPS+1 */
 +            /*     #endif */
 +
 +            /*     #if 'Potential' in KERNEL_VF */
 +            /* Update potential sum for this i atom from the interaction with this j atom. */
 +            /*         #if 'electrostatics' in INTERACTION_FLAGS[I][J] */
 +            /*             #if 'exactcutoff' in INTERACTION_FLAGS[I][J] */
 +            velec            = _mm256_and_pd(velec,cutoff_mask);
 +            /*                 #define INNERFLOPS INNERFLOPS+1 */
 +            /*             #endif                                       */
 +            /*             #if ROUND == 'Epilogue' */
 +            velec            = _mm256_andnot_pd(dummy_mask,velec);
 +            /*             #endif */
 +            velecsum         = _mm256_add_pd(velecsum,velec);
 +            /*             #define INNERFLOPS INNERFLOPS+1 */
 +            /*             #if KERNEL_ELEC=='GeneralizedBorn' */
 +            /*             #if 'exactcutoff' in INTERACTION_FLAGS[I][J] */
 +            vgb              = _mm256_and_pd(vgb,cutoff_mask);
 +            /*                 #define INNERFLOPS INNERFLOPS+1 */
 +            /*             #endif                                       */
 +            /*             #if ROUND == 'Epilogue' */
 +            vgb              = _mm256_andnot_pd(dummy_mask,vgb);
 +            /*             #endif */
 +            vgbsum           = _mm256_add_pd(vgbsum,vgb);
 +            /*                 #define INNERFLOPS INNERFLOPS+1 */
 +            /*             #endif */
 +            /*         #endif */
 +            /*         #if 'vdw' in INTERACTION_FLAGS[I][J] */
 +            /*     ## Note special check for TIP4P-TIP4P. Since we are cutting off all hydrogen interactions we also cut the LJ-only O-O interaction */
 +            /*     #if 'exactcutoff' in INTERACTION_FLAGS[I][J] or (GEOMETRY_I=='Water4' and GEOMETRY_J=='Water4' and 'exactcutoff' in INTERACTION_FLAGS[1][1]) */
 +            vvdw             = _mm256_and_pd(vvdw,cutoff_mask);
 +            /*                 #define INNERFLOPS INNERFLOPS+1 */
 +            /*             #endif                                       */
 +            /*             #if ROUND == 'Epilogue' */
 +            vvdw             = _mm256_andnot_pd(dummy_mask,vvdw);
 +            /*             #endif */
 +            vvdwsum          = _mm256_add_pd(vvdwsum,vvdw);
 +            /*             #define INNERFLOPS INNERFLOPS+1 */
 +            /*         #endif */
 +            /*     #endif */
 +
 +            /*     #if 'Force' in KERNEL_VF */
 +
 +            /*         #if 'electrostatics' in INTERACTION_FLAGS[I][J] and 'vdw' in INTERACTION_FLAGS[I][J] */
 +            fscal            = _mm256_add_pd(felec,fvdw);
 +            /*             #define INNERFLOPS INNERFLOPS+1 */
 +            /*         #elif 'electrostatics' in INTERACTION_FLAGS[I][J] */
 +            fscal            = felec;
 +            /*         #elif 'vdw' in INTERACTION_FLAGS[I][J] */
 +            fscal            = fvdw;
 +            /*         #endif */
 +
 +            /*     ## Note special check for TIP4P-TIP4P. Since we are cutting off all hydrogen interactions we also cut the LJ-only O-O interaction */
 +            /*     #if 'exactcutoff' in INTERACTION_FLAGS[I][J] or (GEOMETRY_I=='Water4' and GEOMETRY_J=='Water4' and 'exactcutoff' in INTERACTION_FLAGS[1][1]) */
 +            fscal            = _mm256_and_pd(fscal,cutoff_mask);
 +            /*                 #define INNERFLOPS INNERFLOPS+1 */
 +            /*             #endif                                       */
 +
 +            /*             #if ROUND == 'Epilogue' */
 +            fscal            = _mm256_andnot_pd(dummy_mask,fscal);
 +            /*             #endif */
 +
 +            /* Calculate temporary vectorial force */
 +            tx               = _mm256_mul_pd(fscal,dx{I}{J});
 +            ty               = _mm256_mul_pd(fscal,dy{I}{J});
 +            tz               = _mm256_mul_pd(fscal,dz{I}{J});
 +
 +            /* Update vectorial force */
 +            fix{I}             = _mm256_add_pd(fix{I},tx);
 +            fiy{I}             = _mm256_add_pd(fiy{I},ty);
 +            fiz{I}             = _mm256_add_pd(fiz{I},tz);
 +            /*             #define INNERFLOPS INNERFLOPS+6 */
 +
 +            /* #if GEOMETRY_I == 'Particle'             */
 +            /*     #if ROUND == 'Loop' */
 +            fjptrA             = f+j_coord_offsetA;
 +            fjptrB             = f+j_coord_offsetB;
 +            fjptrC             = f+j_coord_offsetC;
 +            fjptrD             = f+j_coord_offsetD;
 +            /*     #else */
 +            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
 +            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
 +            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
 +            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
 +            /*     #endif */
 +            gmx_mm256_decrement_1rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
 +            /*     #define INNERFLOPS INNERFLOPS+3      */
 +            /* #else                                    */
 +            fjx{J}             = _mm256_add_pd(fjx{J},tx);
 +            fjy{J}             = _mm256_add_pd(fjy{J},ty);
 +            fjz{J}             = _mm256_add_pd(fjz{J},tz);
 +            /*     #define INNERFLOPS INNERFLOPS+3      */
 +            /* #endif                                   */
 +
 +            /*     #endif */
 +
 +            /*     ## Note special check for TIP4P-TIP4P. Since we are cutting off all hydrogen interactions we also cut the LJ-only O-O interaction */
 +            /*     #if 'exactcutoff' in INTERACTION_FLAGS[I][J] or (GEOMETRY_I=='Water4' and GEOMETRY_J=='Water4' and 'exactcutoff' in INTERACTION_FLAGS[1][1]) */
 +            /*         #if 0    ## This and the next two lines are a hack to maintain indentation in the template file */
 +            {
 +                /*     #endif */
 +            }
 +            /*     #endif */
 +            /*    ## End of check for the interaction being outside the cutoff */
 +
 +            /* #endfor */
 +            /* ## End of loop over i-j interaction pairs */
 +
 +            /* #if GEOMETRY_I != 'Particle' */
 +            /*     #if ROUND == 'Loop' */
 +            fjptrA             = f+j_coord_offsetA;
 +            fjptrB             = f+j_coord_offsetB;
 +            fjptrC             = f+j_coord_offsetC;
 +            fjptrD             = f+j_coord_offsetD;
 +            /*     #else */
 +            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
 +            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
 +            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
 +            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
 +            /*     #endif */
 +            /* #endif */
 +
 +            /* #if 'Water' in GEOMETRY_I and GEOMETRY_J == 'Particle' */
 +            gmx_mm256_decrement_1rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 +            /*     #define INNERFLOPS INNERFLOPS+3      */
 +            /* #elif GEOMETRY_J == 'Water3'             */
 +            gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
 +                                                      fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
 +            /*     #define INNERFLOPS INNERFLOPS+9      */
 +            /* #elif GEOMETRY_J == 'Water4'             */
 +            /*     #if 0 in PARTICLES_J                 */
 +            gmx_mm256_decrement_4rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
 +                                                      fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
 +                                                      fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
 +            /*     #define INNERFLOPS INNERFLOPS+12     */
 +            /*     #else                                */
 +            gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(fjptrA+DIM,fjptrB+DIM,fjptrC+DIM,fjptrD+DIM,
 +                                                      fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
 +            /*     #define INNERFLOPS INNERFLOPS+9      */
 +            /*     #endif                               */
 +            /* #endif                                   */
 +
 +            /* Inner loop uses {INNERFLOPS} flops */
 +        }
 +
 +        /* #endfor */
 +
 +        /* End of innermost loop */
 +
 +        /* #if 'Force' in KERNEL_VF */
 +        /*     #if GEOMETRY_I == 'Particle'            */
 +        gmx_mm256_update_iforce_1atom_swizzle_pd(fix0,fiy0,fiz0,
 +                                                 f+i_coord_offset,fshift+i_shift_offset);
 +        /*         #define OUTERFLOPS OUTERFLOPS+6     */
 +        /*     #elif GEOMETRY_I == 'Water3'            */
 +        gmx_mm256_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
 +                                                 f+i_coord_offset,fshift+i_shift_offset);
 +        /*         #define OUTERFLOPS OUTERFLOPS+18    */
 +        /*     #elif GEOMETRY_I == 'Water4'            */
 +        /*         #if 0 in PARTICLES_I                */
 +        gmx_mm256_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
 +                                                 f+i_coord_offset,fshift+i_shift_offset);
 +        /*             #define OUTERFLOPS OUTERFLOPS+24    */
 +        /*         #else                               */
 +        gmx_mm256_update_iforce_3atom_swizzle_pd(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
 +                                                 f+i_coord_offset+DIM,fshift+i_shift_offset);
 +        /*             #define OUTERFLOPS OUTERFLOPS+18    */
 +        /*         #endif                              */
 +        /*     #endif                                  */
 +        /* #endif                                      */
 +
 +        /* #if 'Potential' in KERNEL_VF */
 +        ggid                        = gid[iidx];
 +        /* Update potential energies */
 +        /*     #if KERNEL_ELEC != 'None' */
 +        gmx_mm256_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
 +        /*         #define OUTERFLOPS OUTERFLOPS+1 */
 +        /*     #endif */
 +        /*     #if 'GeneralizedBorn' in KERNEL_ELEC */
 +        gmx_mm256_update_1pot_pd(vgbsum,kernel_data->energygrp_polarization+ggid);
 +        /*         #define OUTERFLOPS OUTERFLOPS+1 */
 +        /*     #endif */
 +        /*     #if KERNEL_VDW != 'None' */
 +        gmx_mm256_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
 +        /*         #define OUTERFLOPS OUTERFLOPS+1 */
 +        /*     #endif */
 +        /* #endif */
 +        /*     #if 'GeneralizedBorn' in KERNEL_ELEC and 'Force' in KERNEL_VF */
 +        dvdasum = _mm256_mul_pd(dvdasum, _mm256_mul_pd(isai{I},isai{I}));
 +        gmx_mm256_update_1pot_pd(dvdasum,dvda+inr);
 +        /*     #endif */
 +
 +        /* Increment number of inner iterations */
 +        inneriter                  += j_index_end - j_index_start;
 +
 +        /* Outer loop uses {OUTERFLOPS} flops */
 +    }
 +
 +    /* Increment number of outer iterations */
 +    outeriter        += nri;
 +
 +    /* Update outer/inner flops */
 +    /* ## NB: This is not important, it just affects the flopcount. However, since our preprocessor is */
 +    /* ## primitive and replaces aggressively even in strings inside these directives, we need to      */
 +    /* ## assemble the main part of the name (containing KERNEL/ELEC/VDW) directly in the source.      */
 +    /* #if GEOMETRY_I == 'Water3'            */
 +    /*     #define ISUFFIX '_W3'             */
 +    /* #elif GEOMETRY_I == 'Water4'          */
 +    /*     #define ISUFFIX '_W4'             */
 +    /* #else                                 */
 +    /*     #define ISUFFIX ''                */
 +    /* #endif                                */
 +    /* #if GEOMETRY_J == 'Water3'            */
 +    /*     #define JSUFFIX 'W3'              */
 +    /* #elif GEOMETRY_J == 'Water4'          */
 +    /*     #define JSUFFIX 'W4'              */
 +    /* #else                                 */
 +    /*     #define JSUFFIX ''                */
 +    /* #endif                                */
 +    /* #if 'PotentialAndForce' in KERNEL_VF  */
 +    /*     #define VFSUFFIX  '_VF'           */
 +    /* #elif 'Potential' in KERNEL_VF        */
 +    /*     #define VFSUFFIX '_V'             */
 +    /* #else                                 */
 +    /*     #define VFSUFFIX '_F'             */
 +    /* #endif                                */
 +
 +    /* #if KERNEL_ELEC != 'None' and KERNEL_VDW != 'None' */
 +    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW{ISUFFIX}{JSUFFIX}{VFSUFFIX},outeriter*{OUTERFLOPS} + inneriter*{INNERFLOPS});
 +    /* #elif KERNEL_ELEC != 'None' */
 +    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC{ISUFFIX}{JSUFFIX}{VFSUFFIX},outeriter*{OUTERFLOPS} + inneriter*{INNERFLOPS});
 +    /* #else */
 +    inc_nrnb(nrnb,eNR_NBKERNEL_VDW{ISUFFIX}{JSUFFIX}{VFSUFFIX},outeriter*{OUTERFLOPS} + inneriter*{INNERFLOPS});
 +    /* #endif  */
 +}
index 26bc33c0a493fac6ee22095376bda604cec774d4,0000000000000000000000000000000000000000..2156c94d180c8b0d4d363ae6da19b394f8143f7f
mode 100644,000000..100644
--- /dev/null
@@@ -1,1177 -1,0 +1,1179 @@@
 +/*
 + * Note: this file was generated by the Gromacs avx_256_single kernel generator.
 + *
 + *                This source code is part of
 + *
 + *                 G   R   O   M   A   C   S
 + *
 + * Copyright (c) 2001-2012, The GROMACS Development Team
 + *
 + * Gromacs is a library for molecular simulation and trajectory analysis,
 + * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
 + * a full list of developers and information, check out http://www.gromacs.org
 + *
 + * This program is free software; you can redistribute it and/or modify it under
 + * the terms of the GNU Lesser General Public License as published by the Free
 + * Software Foundation; either version 2 of the License, or (at your option) any
 + * later version.
 + *
 + * To help fund GROMACS development, we humbly ask that you cite
 + * the papers people have written on it - you can find them on the website.
 + */
 +#ifdef HAVE_CONFIG_H
 +#include <config.h>
 +#endif
 +
 +#include <math.h>
 +
 +#include "../nb_kernel.h"
 +#include "types/simple.h"
 +#include "vec.h"
 +#include "nrnb.h"
 +
 +#include "gmx_math_x86_avx_256_single.h"
 +#include "kernelutil_x86_avx_256_single.h"
 +
 +/*
 + * Gromacs nonbonded kernel:   nb_kernel_ElecGB_VdwCSTab_GeomP1P1_VF_avx_256_single
 + * Electrostatics interaction: GeneralizedBorn
 + * VdW interaction:            CubicSplineTable
 + * Geometry:                   Particle-Particle
 + * Calculate force/pot:        PotentialAndForce
 + */
 +void
 +nb_kernel_ElecGB_VdwCSTab_GeomP1P1_VF_avx_256_single
 +                    (t_nblist * gmx_restrict                nlist,
 +                     rvec * gmx_restrict                    xx,
 +                     rvec * gmx_restrict                    ff,
 +                     t_forcerec * gmx_restrict              fr,
 +                     t_mdatoms * gmx_restrict               mdatoms,
 +                     nb_kernel_data_t * gmx_restrict        kernel_data,
 +                     t_nrnb * gmx_restrict                  nrnb)
 +{
 +    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
 +     * just 0 for non-waters.
 +     * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, e.g. for the eight different
 +     * jnr indices corresponding to data put in the eight positions in the SIMD register.
 +     */
 +    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
 +    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
 +    int              jnrA,jnrB,jnrC,jnrD;
 +    int              jnrE,jnrF,jnrG,jnrH;
 +    int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
 +    int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
 +    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
 +    int              j_coord_offsetE,j_coord_offsetF,j_coord_offsetG,j_coord_offsetH;
 +    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
 +    real             rcutoff_scalar;
 +    real             *shiftvec,*fshift,*x,*f;
 +    real             *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH;
 +    real             scratch[4*DIM];
 +    __m256           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
 +    real *           vdwioffsetptr0;
 +    __m256           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
 +    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D,vdwjidx0E,vdwjidx0F,vdwjidx0G,vdwjidx0H;
 +    __m256           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
 +    __m256           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
 +    __m256           velec,felec,velecsum,facel,crf,krf,krf2;
 +    real             *charge;
 +    __m256i          gbitab;
 +    __m128i          gbitab_lo,gbitab_hi;
 +    __m256           vgb,fgb,vgbsum,dvdasum,gbscale,gbtabscale,isaprod,gbqqfactor,gbinvepsdiff,gbeps,dvdatmp;
 +    __m256           minushalf = _mm256_set1_ps(-0.5);
 +    real             *invsqrta,*dvda,*gbtab;
 +    int              nvdwtype;
 +    __m256           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
 +    int              *vdwtype;
 +    real             *vdwparam;
 +    __m256           one_sixth   = _mm256_set1_ps(1.0/6.0);
 +    __m256           one_twelfth = _mm256_set1_ps(1.0/12.0);
 +    __m256i          vfitab;
 +    __m128i          vfitab_lo,vfitab_hi;
 +    __m128i          ifour       = _mm_set1_epi32(4);
 +    __m256           rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
 +    real             *vftab;
 +    __m256           dummy_mask,cutoff_mask;
 +    __m256           signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
 +    __m256           one     = _mm256_set1_ps(1.0);
 +    __m256           two     = _mm256_set1_ps(2.0);
 +    x                = xx[0];
 +    f                = ff[0];
 +
 +    nri              = nlist->nri;
 +    iinr             = nlist->iinr;
 +    jindex           = nlist->jindex;
 +    jjnr             = nlist->jjnr;
 +    shiftidx         = nlist->shift;
 +    gid              = nlist->gid;
 +    shiftvec         = fr->shift_vec[0];
 +    fshift           = fr->fshift[0];
 +    facel            = _mm256_set1_ps(fr->epsfac);
 +    charge           = mdatoms->chargeA;
 +    nvdwtype         = fr->ntype;
 +    vdwparam         = fr->nbfp;
 +    vdwtype          = mdatoms->typeA;
 +
 +    vftab            = kernel_data->table_vdw->data;
 +    vftabscale       = _mm256_set1_ps(kernel_data->table_vdw->scale);
 +
 +    invsqrta         = fr->invsqrta;
 +    dvda             = fr->dvda;
 +    gbtabscale       = _mm256_set1_ps(fr->gbtab.scale);
 +    gbtab            = fr->gbtab.data;
 +    gbinvepsdiff     = _mm256_set1_ps((1.0/fr->epsilon_r) - (1.0/fr->gb_epsilon_solvent));
 +
 +    /* Avoid stupid compiler warnings */
 +    jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0;
 +    j_coord_offsetA = 0;
 +    j_coord_offsetB = 0;
 +    j_coord_offsetC = 0;
 +    j_coord_offsetD = 0;
 +    j_coord_offsetE = 0;
 +    j_coord_offsetF = 0;
 +    j_coord_offsetG = 0;
 +    j_coord_offsetH = 0;
 +
 +    outeriter        = 0;
 +    inneriter        = 0;
 +
 +    for(iidx=0;iidx<4*DIM;iidx++)
 +    {
 +        scratch[iidx] = 0.0;
 +    }
 +
 +    /* Start outer loop over neighborlists */
 +    for(iidx=0; iidx<nri; iidx++)
 +    {
 +        /* Load shift vector for this list */
 +        i_shift_offset   = DIM*shiftidx[iidx];
 +
 +        /* Load limits for loop over neighbors */
 +        j_index_start    = jindex[iidx];
 +        j_index_end      = jindex[iidx+1];
 +
 +        /* Get outer coordinate index */
 +        inr              = iinr[iidx];
 +        i_coord_offset   = DIM*inr;
 +
 +        /* Load i particle coords and add shift vector */
 +        gmx_mm256_load_shift_and_1rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
 +
 +        fix0             = _mm256_setzero_ps();
 +        fiy0             = _mm256_setzero_ps();
 +        fiz0             = _mm256_setzero_ps();
 +
 +        /* Load parameters for i particles */
 +        iq0              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+0]));
 +        isai0            = _mm256_set1_ps(invsqrta[inr+0]);
 +        vdwioffsetptr0   = vdwparam+2*nvdwtype*vdwtype[inr+0];
 +
 +        /* Reset potential sums */
 +        velecsum         = _mm256_setzero_ps();
 +        vgbsum           = _mm256_setzero_ps();
 +        vvdwsum          = _mm256_setzero_ps();
 +        dvdasum          = _mm256_setzero_ps();
 +
 +        /* Start inner kernel loop */
 +        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+7]>=0; jidx+=8)
 +        {
 +
 +            /* Get j neighbor index, and coordinate index */
 +            jnrA             = jjnr[jidx];
 +            jnrB             = jjnr[jidx+1];
 +            jnrC             = jjnr[jidx+2];
 +            jnrD             = jjnr[jidx+3];
 +            jnrE             = jjnr[jidx+4];
 +            jnrF             = jjnr[jidx+5];
 +            jnrG             = jjnr[jidx+6];
 +            jnrH             = jjnr[jidx+7];
 +            j_coord_offsetA  = DIM*jnrA;
 +            j_coord_offsetB  = DIM*jnrB;
 +            j_coord_offsetC  = DIM*jnrC;
 +            j_coord_offsetD  = DIM*jnrD;
 +            j_coord_offsetE  = DIM*jnrE;
 +            j_coord_offsetF  = DIM*jnrF;
 +            j_coord_offsetG  = DIM*jnrG;
 +            j_coord_offsetH  = DIM*jnrH;
 +
 +            /* load j atom coordinates */
 +            gmx_mm256_load_1rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
 +                                                 x+j_coord_offsetC,x+j_coord_offsetD,
 +                                                 x+j_coord_offsetE,x+j_coord_offsetF,
 +                                                 x+j_coord_offsetG,x+j_coord_offsetH,
 +                                                 &jx0,&jy0,&jz0);
 +
 +            /* Calculate displacement vector */
 +            dx00             = _mm256_sub_ps(ix0,jx0);
 +            dy00             = _mm256_sub_ps(iy0,jy0);
 +            dz00             = _mm256_sub_ps(iz0,jz0);
 +
 +            /* Calculate squared distance and things based on it */
 +            rsq00            = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
 +
 +            rinv00           = gmx_mm256_invsqrt_ps(rsq00);
 +
 +            /* Load parameters for j particles */
 +            jq0              = gmx_mm256_load_8real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
 +                                                                 charge+jnrC+0,charge+jnrD+0,
 +                                                                 charge+jnrE+0,charge+jnrF+0,
 +                                                                 charge+jnrG+0,charge+jnrH+0);
 +            isaj0            = gmx_mm256_load_8real_swizzle_ps(invsqrta+jnrA+0,invsqrta+jnrB+0,
 +                                                                 invsqrta+jnrC+0,invsqrta+jnrD+0,
 +                                                                 invsqrta+jnrE+0,invsqrta+jnrF+0,
 +                                                                 invsqrta+jnrG+0,invsqrta+jnrH+0);
 +            vdwjidx0A        = 2*vdwtype[jnrA+0];
 +            vdwjidx0B        = 2*vdwtype[jnrB+0];
 +            vdwjidx0C        = 2*vdwtype[jnrC+0];
 +            vdwjidx0D        = 2*vdwtype[jnrD+0];
 +            vdwjidx0E        = 2*vdwtype[jnrE+0];
 +            vdwjidx0F        = 2*vdwtype[jnrF+0];
 +            vdwjidx0G        = 2*vdwtype[jnrG+0];
 +            vdwjidx0H        = 2*vdwtype[jnrH+0];
 +
 +            /**************************
 +             * CALCULATE INTERACTIONS *
 +             **************************/
 +
 +            r00              = _mm256_mul_ps(rsq00,rinv00);
 +
 +            /* Compute parameters for interactions between i and j atoms */
 +            qq00             = _mm256_mul_ps(iq0,jq0);
 +            gmx_mm256_load_8pair_swizzle_ps(vdwioffsetptr0+vdwjidx0A,
 +                                            vdwioffsetptr0+vdwjidx0B,
 +                                            vdwioffsetptr0+vdwjidx0C,
 +                                            vdwioffsetptr0+vdwjidx0D,
 +                                            vdwioffsetptr0+vdwjidx0E,
 +                                            vdwioffsetptr0+vdwjidx0F,
 +                                            vdwioffsetptr0+vdwjidx0G,
 +                                            vdwioffsetptr0+vdwjidx0H,
 +                                            &c6_00,&c12_00);
 +
 +            /* Calculate table index by multiplying r with table scale and truncate to integer */
 +            rt               = _mm256_mul_ps(r00,vftabscale);
 +            vfitab           = _mm256_cvttps_epi32(rt);
 +            vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
 +            /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
 +            vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
 +            vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
 +            vfitab_lo        = _mm_slli_epi32(vfitab_lo,3);
 +            vfitab_hi        = _mm_slli_epi32(vfitab_hi,3);
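 +            /* Each point of the VdW table stores the four cubic-spline coefficients
 +             * Y,F,G,H for dispersion followed by four for repulsion, hence the shift
 +             * by 3 (index*8) here and the ifour offset for the repulsion lookup below.
 +             */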
 +
 +            /* GENERALIZED BORN AND COULOMB ELECTROSTATICS */
 +            isaprod          = _mm256_mul_ps(isai0,isaj0);
 +            gbqqfactor       = _mm256_xor_ps(signbit,_mm256_mul_ps(qq00,_mm256_mul_ps(isaprod,gbinvepsdiff)));
 +            gbscale          = _mm256_mul_ps(isaprod,gbtabscale);
 +
 +            /* Calculate generalized born table index - this is a separate table from the normal one,
 +             * but we use the same procedure by multiplying r with scale and truncating to integer.
 +             */
 +            rt               = _mm256_mul_ps(r00,gbscale);
 +            gbitab           = _mm256_cvttps_epi32(rt);
 +            gbeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
 +            /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
 +            gbitab_lo        = _mm256_extractf128_si256(gbitab,0x0);
 +            gbitab_hi        = _mm256_extractf128_si256(gbitab,0x1);
 +            gbitab_lo        = _mm_slli_epi32(gbitab_lo,2);
 +            gbitab_hi        = _mm_slli_epi32(gbitab_hi,2);
 +            Y                = gmx_mm256_set_m128(_mm_load_ps(gbtab + _mm_extract_epi32(gbitab_hi,0)),
 +                                                  _mm_load_ps(gbtab + _mm_extract_epi32(gbitab_lo,0)));
 +            F                = gmx_mm256_set_m128(_mm_load_ps(gbtab + _mm_extract_epi32(gbitab_hi,1)),
 +                                                  _mm_load_ps(gbtab + _mm_extract_epi32(gbitab_lo,1)));
 +            G                = gmx_mm256_set_m128(_mm_load_ps(gbtab + _mm_extract_epi32(gbitab_hi,2)),
 +                                                  _mm_load_ps(gbtab + _mm_extract_epi32(gbitab_lo,2)));
 +            H                = gmx_mm256_set_m128(_mm_load_ps(gbtab + _mm_extract_epi32(gbitab_hi,3)),
 +                                                  _mm_load_ps(gbtab + _mm_extract_epi32(gbitab_lo,3)));
 +            GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
 +            Heps             = _mm256_mul_ps(gbeps,H);
 +            Fp               = _mm256_add_ps(F,_mm256_mul_ps(gbeps,_mm256_add_ps(G,Heps)));
 +            VV               = _mm256_add_ps(Y,_mm256_mul_ps(gbeps,Fp));
 +            vgb              = _mm256_mul_ps(gbqqfactor,VV);
 +
 +            FF               = _mm256_add_ps(Fp,_mm256_mul_ps(gbeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
 +            fgb              = _mm256_mul_ps(gbqqfactor,_mm256_mul_ps(FF,gbscale));
 +            dvdatmp          = _mm256_mul_ps(minushalf,_mm256_add_ps(vgb,_mm256_mul_ps(fgb,r00)));
 +            dvdasum          = _mm256_add_ps(dvdasum,dvdatmp);
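 +            /* dvdatmp = -(vgb + fgb*r)/2 is this pair's chain-rule contribution to the
 +             * derivative of the GB energy with respect to the Born radii: it is summed
 +             * for the i atom in dvdasum (scaled by isai^2 after the outer loop) and
 +             * spread to the j atoms below, scaled by isaj^2.
 +             */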
 +            fjptrA           = dvda+jnrA;
 +            fjptrB           = dvda+jnrB;
 +            fjptrC           = dvda+jnrC;
 +            fjptrD           = dvda+jnrD;
 +            fjptrE           = dvda+jnrE;
 +            fjptrF           = dvda+jnrF;
 +            fjptrG           = dvda+jnrG;
 +            fjptrH           = dvda+jnrH;
 +            gmx_mm256_increment_8real_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
 +                                                 _mm256_mul_ps(dvdatmp,_mm256_mul_ps(isaj0,isaj0)));
 +            velec            = _mm256_mul_ps(qq00,rinv00);
 +            felec            = _mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(velec,rinv00),fgb),rinv00);
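 +            /* With velec = qq/r, the combined Coulomb+GB scalar force is
 +             * felec = (velec/r - fgb)/r, defined so that the vectorial force below
 +             * is simply fscal times the displacement (dx00,dy00,dz00).
 +             */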
 +
 +            /* CUBIC SPLINE TABLE DISPERSION */
 +            Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
 +                                                  _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
 +            F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
 +                                                  _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
 +            G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
 +                                                  _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
 +            H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
 +                                                  _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
 +            GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
 +            Heps             = _mm256_mul_ps(vfeps,H);
 +            Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
 +            VV               = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
 +            vvdw6            = _mm256_mul_ps(c6_00,VV);
 +            FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
 +            fvdw6            = _mm256_mul_ps(c6_00,FF);
 +
 +            /* CUBIC SPLINE TABLE REPULSION */
 +            vfitab_lo        = _mm_add_epi32(vfitab_lo,ifour);
 +            vfitab_hi        = _mm_add_epi32(vfitab_hi,ifour);
 +            Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
 +                                                  _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
 +            F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
 +                                                  _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
 +            G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
 +                                                  _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
 +            H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
 +                                                  _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
 +            GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
 +            Heps             = _mm256_mul_ps(vfeps,H);
 +            Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
 +            VV               = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
 +            vvdw12           = _mm256_mul_ps(c12_00,VV);
 +            FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
 +            fvdw12           = _mm256_mul_ps(c12_00,FF);
 +            vvdw             = _mm256_add_ps(vvdw12,vvdw6);
 +            fvdw             = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_add_ps(fvdw6,fvdw12),_mm256_mul_ps(vftabscale,rinv00)));
 +
 +            /* Update potential sum for this i atom from the interaction with this j atom. */
 +            velecsum         = _mm256_add_ps(velecsum,velec);
 +            vgbsum           = _mm256_add_ps(vgbsum,vgb);
 +            vvdwsum          = _mm256_add_ps(vvdwsum,vvdw);
 +
 +            fscal            = _mm256_add_ps(felec,fvdw);
 +
 +            /* Calculate temporary vectorial force */
 +            tx               = _mm256_mul_ps(fscal,dx00);
 +            ty               = _mm256_mul_ps(fscal,dy00);
 +            tz               = _mm256_mul_ps(fscal,dz00);
 +
 +            /* Update vectorial force */
 +            fix0             = _mm256_add_ps(fix0,tx);
 +            fiy0             = _mm256_add_ps(fiy0,ty);
 +            fiz0             = _mm256_add_ps(fiz0,tz);
 +
 +            fjptrA             = f+j_coord_offsetA;
 +            fjptrB             = f+j_coord_offsetB;
 +            fjptrC             = f+j_coord_offsetC;
 +            fjptrD             = f+j_coord_offsetD;
 +            fjptrE             = f+j_coord_offsetE;
 +            fjptrF             = f+j_coord_offsetF;
 +            fjptrG             = f+j_coord_offsetG;
 +            fjptrH             = f+j_coord_offsetH;
 +            gmx_mm256_decrement_1rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,tx,ty,tz);
 +
 +            /* Inner loop uses 91 flops */
 +        }
 +
 +        if(jidx<j_index_end)
 +        {
 +
 +            /* Get j neighbor index, and coordinate index */
 +            jnrlistA         = jjnr[jidx];
 +            jnrlistB         = jjnr[jidx+1];
 +            jnrlistC         = jjnr[jidx+2];
 +            jnrlistD         = jjnr[jidx+3];
 +            jnrlistE         = jjnr[jidx+4];
 +            jnrlistF         = jjnr[jidx+5];
 +            jnrlistG         = jjnr[jidx+6];
 +            jnrlistH         = jjnr[jidx+7];
 +            /* Sign of each element will be negative for non-real atoms.
 +             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
 +             * so use it as val = _mm256_andnot_ps(mask,val) to clear dummy entries.
 +             */
 +            dummy_mask = gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx+4)),_mm_setzero_si128())),
 +                                            gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128())));
 +                                            
 +            jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
 +            jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
 +            jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
 +            jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
 +            jnrE       = (jnrlistE>=0) ? jnrlistE : 0;
 +            jnrF       = (jnrlistF>=0) ? jnrlistF : 0;
 +            jnrG       = (jnrlistG>=0) ? jnrlistG : 0;
 +            jnrH       = (jnrlistH>=0) ? jnrlistH : 0;
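 +            /* Dummy entries are redirected to atom 0 so the coordinate and parameter
 +             * gathers below stay in bounds; their contributions are cleared with
 +             * dummy_mask before accumulation, and their stores are diverted to the
 +             * scratch buffer.
 +             */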
 +            j_coord_offsetA  = DIM*jnrA;
 +            j_coord_offsetB  = DIM*jnrB;
 +            j_coord_offsetC  = DIM*jnrC;
 +            j_coord_offsetD  = DIM*jnrD;
 +            j_coord_offsetE  = DIM*jnrE;
 +            j_coord_offsetF  = DIM*jnrF;
 +            j_coord_offsetG  = DIM*jnrG;
 +            j_coord_offsetH  = DIM*jnrH;
 +
 +            /* load j atom coordinates */
 +            gmx_mm256_load_1rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
 +                                                 x+j_coord_offsetC,x+j_coord_offsetD,
 +                                                 x+j_coord_offsetE,x+j_coord_offsetF,
 +                                                 x+j_coord_offsetG,x+j_coord_offsetH,
 +                                                 &jx0,&jy0,&jz0);
 +
 +            /* Calculate displacement vector */
 +            dx00             = _mm256_sub_ps(ix0,jx0);
 +            dy00             = _mm256_sub_ps(iy0,jy0);
 +            dz00             = _mm256_sub_ps(iz0,jz0);
 +
 +            /* Calculate squared distance and things based on it */
 +            rsq00            = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
 +
 +            rinv00           = gmx_mm256_invsqrt_ps(rsq00);
 +
 +            /* Load parameters for j particles */
 +            jq0              = gmx_mm256_load_8real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
 +                                                                 charge+jnrC+0,charge+jnrD+0,
 +                                                                 charge+jnrE+0,charge+jnrF+0,
 +                                                                 charge+jnrG+0,charge+jnrH+0);
 +            isaj0            = gmx_mm256_load_8real_swizzle_ps(invsqrta+jnrA+0,invsqrta+jnrB+0,
 +                                                                 invsqrta+jnrC+0,invsqrta+jnrD+0,
 +                                                                 invsqrta+jnrE+0,invsqrta+jnrF+0,
 +                                                                 invsqrta+jnrG+0,invsqrta+jnrH+0);
 +            vdwjidx0A        = 2*vdwtype[jnrA+0];
 +            vdwjidx0B        = 2*vdwtype[jnrB+0];
 +            vdwjidx0C        = 2*vdwtype[jnrC+0];
 +            vdwjidx0D        = 2*vdwtype[jnrD+0];
 +            vdwjidx0E        = 2*vdwtype[jnrE+0];
 +            vdwjidx0F        = 2*vdwtype[jnrF+0];
 +            vdwjidx0G        = 2*vdwtype[jnrG+0];
 +            vdwjidx0H        = 2*vdwtype[jnrH+0];
 +
 +            /**************************
 +             * CALCULATE INTERACTIONS *
 +             **************************/
 +
 +            r00              = _mm256_mul_ps(rsq00,rinv00);
 +            r00              = _mm256_andnot_ps(dummy_mask,r00);
 +
 +            /* Compute parameters for interactions between i and j atoms */
 +            qq00             = _mm256_mul_ps(iq0,jq0);
 +            gmx_mm256_load_8pair_swizzle_ps(vdwioffsetptr0+vdwjidx0A,
 +                                            vdwioffsetptr0+vdwjidx0B,
 +                                            vdwioffsetptr0+vdwjidx0C,
 +                                            vdwioffsetptr0+vdwjidx0D,
 +                                            vdwioffsetptr0+vdwjidx0E,
 +                                            vdwioffsetptr0+vdwjidx0F,
 +                                            vdwioffsetptr0+vdwjidx0G,
 +                                            vdwioffsetptr0+vdwjidx0H,
 +                                            &c6_00,&c12_00);
 +
 +            /* Calculate table index by multiplying r with table scale and truncate to integer */
 +            rt               = _mm256_mul_ps(r00,vftabscale);
 +            vfitab           = _mm256_cvttps_epi32(rt);
 +            vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
 +            /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
 +            vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
 +            vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
 +            vfitab_lo        = _mm_slli_epi32(vfitab_lo,3);
 +            vfitab_hi        = _mm_slli_epi32(vfitab_hi,3);
 +
 +            /* GENERALIZED BORN AND COULOMB ELECTROSTATICS */
 +            isaprod          = _mm256_mul_ps(isai0,isaj0);
 +            gbqqfactor       = _mm256_xor_ps(signbit,_mm256_mul_ps(qq00,_mm256_mul_ps(isaprod,gbinvepsdiff)));
 +            gbscale          = _mm256_mul_ps(isaprod,gbtabscale);
 +
 +            /* Calculate generalized born table index - this is a separate table from the normal one,
 +             * but we use the same procedure by multiplying r with scale and truncating to integer.
 +             */
 +            rt               = _mm256_mul_ps(r00,gbscale);
 +            gbitab           = _mm256_cvttps_epi32(rt);
 +            gbeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
 +            /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
 +            gbitab_lo        = _mm256_extractf128_si256(gbitab,0x0);
 +            gbitab_hi        = _mm256_extractf128_si256(gbitab,0x1);
 +            gbitab_lo        = _mm_slli_epi32(gbitab_lo,2);
 +            gbitab_hi        = _mm_slli_epi32(gbitab_hi,2);
 +            Y                = gmx_mm256_set_m128(_mm_load_ps(gbtab + _mm_extract_epi32(gbitab_hi,0)),
 +                                                  _mm_load_ps(gbtab + _mm_extract_epi32(gbitab_lo,0)));
 +            F                = gmx_mm256_set_m128(_mm_load_ps(gbtab + _mm_extract_epi32(gbitab_hi,1)),
 +                                                  _mm_load_ps(gbtab + _mm_extract_epi32(gbitab_lo,1)));
 +            G                = gmx_mm256_set_m128(_mm_load_ps(gbtab + _mm_extract_epi32(gbitab_hi,2)),
 +                                                  _mm_load_ps(gbtab + _mm_extract_epi32(gbitab_lo,2)));
 +            H                = gmx_mm256_set_m128(_mm_load_ps(gbtab + _mm_extract_epi32(gbitab_hi,3)),
 +                                                  _mm_load_ps(gbtab + _mm_extract_epi32(gbitab_lo,3)));
 +            GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
 +            Heps             = _mm256_mul_ps(gbeps,H);
 +            Fp               = _mm256_add_ps(F,_mm256_mul_ps(gbeps,_mm256_add_ps(G,Heps)));
 +            VV               = _mm256_add_ps(Y,_mm256_mul_ps(gbeps,Fp));
 +            vgb              = _mm256_mul_ps(gbqqfactor,VV);
 +
 +            FF               = _mm256_add_ps(Fp,_mm256_mul_ps(gbeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
 +            fgb              = _mm256_mul_ps(gbqqfactor,_mm256_mul_ps(FF,gbscale));
 +            dvdatmp          = _mm256_mul_ps(minushalf,_mm256_add_ps(vgb,_mm256_mul_ps(fgb,r00)));
++            dvdatmp          = _mm256_andnot_ps(dummy_mask,dvdatmp);
 +            dvdasum          = _mm256_add_ps(dvdasum,dvdatmp);
 +            /* The pointers to scratch make sure that this code works correctly even with compilers that take gmx_restrict seriously (e.g. icc 13). */
 +            fjptrA             = (jnrlistA>=0) ? dvda+jnrA : scratch;
 +            fjptrB             = (jnrlistB>=0) ? dvda+jnrB : scratch;
 +            fjptrC             = (jnrlistC>=0) ? dvda+jnrC : scratch;
 +            fjptrD             = (jnrlistD>=0) ? dvda+jnrD : scratch;
 +            fjptrE             = (jnrlistE>=0) ? dvda+jnrE : scratch;
 +            fjptrF             = (jnrlistF>=0) ? dvda+jnrF : scratch;
 +            fjptrG             = (jnrlistG>=0) ? dvda+jnrG : scratch;
 +            fjptrH             = (jnrlistH>=0) ? dvda+jnrH : scratch;
 +            gmx_mm256_increment_8real_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
 +                                                 _mm256_mul_ps(dvdatmp,_mm256_mul_ps(isaj0,isaj0)));
 +            velec            = _mm256_mul_ps(qq00,rinv00);
 +            felec            = _mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(velec,rinv00),fgb),rinv00);
 +
 +            /* CUBIC SPLINE TABLE DISPERSION */
 +            Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
 +                                                  _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
 +            F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
 +                                                  _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
 +            G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
 +                                                  _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
 +            H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
 +                                                  _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
 +            GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
 +            Heps             = _mm256_mul_ps(vfeps,H);
 +            Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
 +            VV               = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
 +            vvdw6            = _mm256_mul_ps(c6_00,VV);
 +            FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
 +            fvdw6            = _mm256_mul_ps(c6_00,FF);
 +
 +            /* CUBIC SPLINE TABLE REPULSION */
 +            vfitab_lo        = _mm_add_epi32(vfitab_lo,ifour);
 +            vfitab_hi        = _mm_add_epi32(vfitab_hi,ifour);
 +            Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
 +                                                  _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
 +            F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
 +                                                  _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
 +            G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
 +                                                  _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
 +            H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
 +                                                  _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
 +            GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
 +            Heps             = _mm256_mul_ps(vfeps,H);
 +            Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
 +            VV               = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
 +            vvdw12           = _mm256_mul_ps(c12_00,VV);
 +            FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
 +            fvdw12           = _mm256_mul_ps(c12_00,FF);
 +            vvdw             = _mm256_add_ps(vvdw12,vvdw6);
 +            fvdw             = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_add_ps(fvdw6,fvdw12),_mm256_mul_ps(vftabscale,rinv00)));
 +
 +            /* Update potential sum for this i atom from the interaction with this j atom. */
 +            velec            = _mm256_andnot_ps(dummy_mask,velec);
 +            velecsum         = _mm256_add_ps(velecsum,velec);
 +            vgb              = _mm256_andnot_ps(dummy_mask,vgb);
 +            vgbsum           = _mm256_add_ps(vgbsum,vgb);
 +            vvdw             = _mm256_andnot_ps(dummy_mask,vvdw);
 +            vvdwsum          = _mm256_add_ps(vvdwsum,vvdw);
 +
 +            fscal            = _mm256_add_ps(felec,fvdw);
 +
 +            fscal            = _mm256_andnot_ps(dummy_mask,fscal);
 +
 +            /* Calculate temporary vectorial force */
 +            tx               = _mm256_mul_ps(fscal,dx00);
 +            ty               = _mm256_mul_ps(fscal,dy00);
 +            tz               = _mm256_mul_ps(fscal,dz00);
 +
 +            /* Update vectorial force */
 +            fix0             = _mm256_add_ps(fix0,tx);
 +            fiy0             = _mm256_add_ps(fiy0,ty);
 +            fiz0             = _mm256_add_ps(fiz0,tz);
 +
 +            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
 +            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
 +            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
 +            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
 +            fjptrE             = (jnrlistE>=0) ? f+j_coord_offsetE : scratch;
 +            fjptrF             = (jnrlistF>=0) ? f+j_coord_offsetF : scratch;
 +            fjptrG             = (jnrlistG>=0) ? f+j_coord_offsetG : scratch;
 +            fjptrH             = (jnrlistH>=0) ? f+j_coord_offsetH : scratch;
 +            gmx_mm256_decrement_1rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,tx,ty,tz);
 +
 +            /* Inner loop uses 92 flops */
 +        }
 +
 +        /* End of innermost loop */
 +
 +        gmx_mm256_update_iforce_1atom_swizzle_ps(fix0,fiy0,fiz0,
 +                                                 f+i_coord_offset,fshift+i_shift_offset);
 +
 +        ggid                        = gid[iidx];
 +        /* Update potential energies */
 +        gmx_mm256_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
 +        gmx_mm256_update_1pot_ps(vgbsum,kernel_data->energygrp_polarization+ggid);
 +        gmx_mm256_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
 +        dvdasum = _mm256_mul_ps(dvdasum, _mm256_mul_ps(isai0,isai0));
 +        gmx_mm256_update_1pot_ps(dvdasum,dvda+inr);
 +
 +        /* Increment number of inner iterations */
 +        inneriter                  += j_index_end - j_index_start;
 +
 +        /* Outer loop uses 10 flops */
 +    }
 +
 +    /* Increment number of outer iterations */
 +    outeriter        += nri;
 +
 +    /* Update outer/inner flops */
 +
 +    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_VF,outeriter*10 + inneriter*92);
 +}
 +/*
 + * Gromacs nonbonded kernel:   nb_kernel_ElecGB_VdwCSTab_GeomP1P1_F_avx_256_single
 + * Electrostatics interaction: GeneralizedBorn
 + * VdW interaction:            CubicSplineTable
 + * Geometry:                   Particle-Particle
 + * Calculate force/pot:        Force
 + */
 +void
 +nb_kernel_ElecGB_VdwCSTab_GeomP1P1_F_avx_256_single
 +                    (t_nblist * gmx_restrict                nlist,
 +                     rvec * gmx_restrict                    xx,
 +                     rvec * gmx_restrict                    ff,
 +                     t_forcerec * gmx_restrict              fr,
 +                     t_mdatoms * gmx_restrict               mdatoms,
 +                     nb_kernel_data_t * gmx_restrict        kernel_data,
 +                     t_nrnb * gmx_restrict                  nrnb)
 +{
 +    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
 +     * just 0 for non-waters.
 +     * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, e.g. for the eight different
 +     * jnr indices corresponding to data put in the eight positions in the SIMD register.
 +     */
 +    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
 +    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
 +    int              jnrA,jnrB,jnrC,jnrD;
 +    int              jnrE,jnrF,jnrG,jnrH;
 +    int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
 +    int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
 +    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
 +    int              j_coord_offsetE,j_coord_offsetF,j_coord_offsetG,j_coord_offsetH;
 +    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
 +    real             rcutoff_scalar;
 +    real             *shiftvec,*fshift,*x,*f;
 +    real             *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH;
 +    real             scratch[4*DIM];
 +    __m256           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
 +    real *           vdwioffsetptr0;
 +    __m256           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
 +    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D,vdwjidx0E,vdwjidx0F,vdwjidx0G,vdwjidx0H;
 +    __m256           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
 +    __m256           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
 +    __m256           velec,felec,velecsum,facel,crf,krf,krf2;
 +    real             *charge;
 +    __m256i          gbitab;
 +    __m128i          gbitab_lo,gbitab_hi;
 +    __m256           vgb,fgb,vgbsum,dvdasum,gbscale,gbtabscale,isaprod,gbqqfactor,gbinvepsdiff,gbeps,dvdatmp;
 +    __m256           minushalf = _mm256_set1_ps(-0.5);
 +    real             *invsqrta,*dvda,*gbtab;
 +    int              nvdwtype;
 +    __m256           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
 +    int              *vdwtype;
 +    real             *vdwparam;
 +    __m256           one_sixth   = _mm256_set1_ps(1.0/6.0);
 +    __m256           one_twelfth = _mm256_set1_ps(1.0/12.0);
 +    __m256i          vfitab;
 +    __m128i          vfitab_lo,vfitab_hi;
 +    __m128i          ifour       = _mm_set1_epi32(4);
 +    __m256           rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
 +    real             *vftab;
 +    __m256           dummy_mask,cutoff_mask;
 +    __m256           signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
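 +    /* xor with signbit flips the sign of a packed float without an arithmetic op */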
 +    __m256           one     = _mm256_set1_ps(1.0);
 +    __m256           two     = _mm256_set1_ps(2.0);
 +    x                = xx[0];
 +    f                = ff[0];
 +
 +    nri              = nlist->nri;
 +    iinr             = nlist->iinr;
 +    jindex           = nlist->jindex;
 +    jjnr             = nlist->jjnr;
 +    shiftidx         = nlist->shift;
 +    gid              = nlist->gid;
 +    shiftvec         = fr->shift_vec[0];
 +    fshift           = fr->fshift[0];
 +    facel            = _mm256_set1_ps(fr->epsfac);
 +    charge           = mdatoms->chargeA;
 +    nvdwtype         = fr->ntype;
 +    vdwparam         = fr->nbfp;
 +    vdwtype          = mdatoms->typeA;
 +
 +    vftab            = kernel_data->table_vdw->data;
 +    vftabscale       = _mm256_set1_ps(kernel_data->table_vdw->scale);
 +
 +    invsqrta         = fr->invsqrta;
 +    dvda             = fr->dvda;
 +    gbtabscale       = _mm256_set1_ps(fr->gbtab.scale);
 +    gbtab            = fr->gbtab.data;
 +    gbinvepsdiff     = _mm256_set1_ps((1.0/fr->epsilon_r) - (1.0/fr->gb_epsilon_solvent));
 +
 +    /* Avoid stupid compiler warnings */
 +    jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0;
 +    j_coord_offsetA = 0;
 +    j_coord_offsetB = 0;
 +    j_coord_offsetC = 0;
 +    j_coord_offsetD = 0;
 +    j_coord_offsetE = 0;
 +    j_coord_offsetF = 0;
 +    j_coord_offsetG = 0;
 +    j_coord_offsetH = 0;
 +
 +    outeriter        = 0;
 +    inneriter        = 0;
 +
 +    for(iidx=0;iidx<4*DIM;iidx++)
 +    {
 +        scratch[iidx] = 0.0;
 +    }
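 +    /* scratch serves as a dump target: in the masked epilogue below, force and
 +     * dvda stores for padding entries are directed here so the SIMD stores can
 +     * remain unconditional. */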
 +
 +    /* Start outer loop over neighborlists */
 +    for(iidx=0; iidx<nri; iidx++)
 +    {
 +        /* Load shift vector for this list */
 +        i_shift_offset   = DIM*shiftidx[iidx];
 +
 +        /* Load limits for loop over neighbors */
 +        j_index_start    = jindex[iidx];
 +        j_index_end      = jindex[iidx+1];
 +
 +        /* Get outer coordinate index */
 +        inr              = iinr[iidx];
 +        i_coord_offset   = DIM*inr;
 +
 +        /* Load i particle coords and add shift vector */
 +        gmx_mm256_load_shift_and_1rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
 +
 +        fix0             = _mm256_setzero_ps();
 +        fiy0             = _mm256_setzero_ps();
 +        fiz0             = _mm256_setzero_ps();
 +
 +        /* Load parameters for i particles */
 +        iq0              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+0]));
 +        isai0            = _mm256_set1_ps(invsqrta[inr+0]);
 +        vdwioffsetptr0   = vdwparam+2*nvdwtype*vdwtype[inr+0];
 +
 +        dvdasum          = _mm256_setzero_ps();
 +
 +        /* Start inner kernel loop */
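 +        /* This loop only runs while a full group of eight real j entries remains
 +         * (jjnr[jidx+7]>=0); the padded remainder is handled by the masked
 +         * epilogue below. */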
 +        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+7]>=0; jidx+=8)
 +        {
 +
 +            /* Get j neighbor index, and coordinate index */
 +            jnrA             = jjnr[jidx];
 +            jnrB             = jjnr[jidx+1];
 +            jnrC             = jjnr[jidx+2];
 +            jnrD             = jjnr[jidx+3];
 +            jnrE             = jjnr[jidx+4];
 +            jnrF             = jjnr[jidx+5];
 +            jnrG             = jjnr[jidx+6];
 +            jnrH             = jjnr[jidx+7];
 +            j_coord_offsetA  = DIM*jnrA;
 +            j_coord_offsetB  = DIM*jnrB;
 +            j_coord_offsetC  = DIM*jnrC;
 +            j_coord_offsetD  = DIM*jnrD;
 +            j_coord_offsetE  = DIM*jnrE;
 +            j_coord_offsetF  = DIM*jnrF;
 +            j_coord_offsetG  = DIM*jnrG;
 +            j_coord_offsetH  = DIM*jnrH;
 +
 +            /* load j atom coordinates */
 +            gmx_mm256_load_1rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
 +                                                 x+j_coord_offsetC,x+j_coord_offsetD,
 +                                                 x+j_coord_offsetE,x+j_coord_offsetF,
 +                                                 x+j_coord_offsetG,x+j_coord_offsetH,
 +                                                 &jx0,&jy0,&jz0);
 +
 +            /* Calculate displacement vector */
 +            dx00             = _mm256_sub_ps(ix0,jx0);
 +            dy00             = _mm256_sub_ps(iy0,jy0);
 +            dz00             = _mm256_sub_ps(iz0,jz0);
 +
 +            /* Calculate squared distance and things based on it */
 +            rsq00            = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
 +
 +            rinv00           = gmx_mm256_invsqrt_ps(rsq00);
 +
 +            /* Load parameters for j particles */
 +            jq0              = gmx_mm256_load_8real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
 +                                                                 charge+jnrC+0,charge+jnrD+0,
 +                                                                 charge+jnrE+0,charge+jnrF+0,
 +                                                                 charge+jnrG+0,charge+jnrH+0);
 +            isaj0            = gmx_mm256_load_8real_swizzle_ps(invsqrta+jnrA+0,invsqrta+jnrB+0,
 +                                                                 invsqrta+jnrC+0,invsqrta+jnrD+0,
 +                                                                 invsqrta+jnrE+0,invsqrta+jnrF+0,
 +                                                                 invsqrta+jnrG+0,invsqrta+jnrH+0);
 +            vdwjidx0A        = 2*vdwtype[jnrA+0];
 +            vdwjidx0B        = 2*vdwtype[jnrB+0];
 +            vdwjidx0C        = 2*vdwtype[jnrC+0];
 +            vdwjidx0D        = 2*vdwtype[jnrD+0];
 +            vdwjidx0E        = 2*vdwtype[jnrE+0];
 +            vdwjidx0F        = 2*vdwtype[jnrF+0];
 +            vdwjidx0G        = 2*vdwtype[jnrG+0];
 +            vdwjidx0H        = 2*vdwtype[jnrH+0];
 +
 +            /**************************
 +             * CALCULATE INTERACTIONS *
 +             **************************/
 +
 +            r00              = _mm256_mul_ps(rsq00,rinv00);
 +
 +            /* Compute parameters for interactions between i and j atoms */
 +            qq00             = _mm256_mul_ps(iq0,jq0);
 +            gmx_mm256_load_8pair_swizzle_ps(vdwioffsetptr0+vdwjidx0A,
 +                                            vdwioffsetptr0+vdwjidx0B,
 +                                            vdwioffsetptr0+vdwjidx0C,
 +                                            vdwioffsetptr0+vdwjidx0D,
 +                                            vdwioffsetptr0+vdwjidx0E,
 +                                            vdwioffsetptr0+vdwjidx0F,
 +                                            vdwioffsetptr0+vdwjidx0G,
 +                                            vdwioffsetptr0+vdwjidx0H,
 +                                            &c6_00,&c12_00);
 +
 +            /* Calculate table index by multiplying r with table scale and truncate to integer */
 +            rt               = _mm256_mul_ps(r00,vftabscale);
 +            vfitab           = _mm256_cvttps_epi32(rt);
 +            vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
 +            /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
 +            vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
 +            vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
 +            vfitab_lo        = _mm_slli_epi32(vfitab_lo,3);
 +            vfitab_hi        = _mm_slli_epi32(vfitab_hi,3);
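 +            /* The shift by 3 multiplies the index by 8: each VdW table point holds
 +             * two interleaved 4-float cubic splines (dispersion, then repulsion at
 +             * offset ifour). The GB table below holds one spline, hence a shift by 2. */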
 +
 +            /* GENERALIZED BORN AND COULOMB ELECTROSTATICS */
 +            isaprod          = _mm256_mul_ps(isai0,isaj0);
 +            gbqqfactor       = _mm256_xor_ps(signbit,_mm256_mul_ps(qq00,_mm256_mul_ps(isaprod,gbinvepsdiff)));
 +            gbscale          = _mm256_mul_ps(isaprod,gbtabscale);
 +
 +            /* Calculate generalized born table index - this is a separate table from the normal one,
 +             * but we use the same procedure by multiplying r with scale and truncating to integer.
 +             */
 +            rt               = _mm256_mul_ps(r00,gbscale);
 +            gbitab           = _mm256_cvttps_epi32(rt);
 +            gbeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
 +            /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
 +            gbitab_lo        = _mm256_extractf128_si256(gbitab,0x0);
 +            gbitab_hi        = _mm256_extractf128_si256(gbitab,0x1);
 +            gbitab_lo        = _mm_slli_epi32(gbitab_lo,2);
 +            gbitab_hi        = _mm_slli_epi32(gbitab_hi,2);
 +            Y                = gmx_mm256_set_m128(_mm_load_ps(gbtab + _mm_extract_epi32(gbitab_hi,0)),
 +                                                  _mm_load_ps(gbtab + _mm_extract_epi32(gbitab_lo,0)));
 +            F                = gmx_mm256_set_m128(_mm_load_ps(gbtab + _mm_extract_epi32(gbitab_hi,1)),
 +                                                  _mm_load_ps(gbtab + _mm_extract_epi32(gbitab_lo,1)));
 +            G                = gmx_mm256_set_m128(_mm_load_ps(gbtab + _mm_extract_epi32(gbitab_hi,2)),
 +                                                  _mm_load_ps(gbtab + _mm_extract_epi32(gbitab_lo,2)));
 +            H                = gmx_mm256_set_m128(_mm_load_ps(gbtab + _mm_extract_epi32(gbitab_hi,3)),
 +                                                  _mm_load_ps(gbtab + _mm_extract_epi32(gbitab_lo,3)));
 +            GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
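 +            /* The half-transpose turns the eight gathered 4-float table rows into
 +             * SoA form; each lane then evaluates the cubic spline, in scalar terms
 +             * (a sketch): Fp = F + eps*(G + eps*H), potential VV = Y + eps*Fp, and
 +             * derivative FF = dVV/deps = F + eps*(2*G + 3*eps*H). */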
 +            Heps             = _mm256_mul_ps(gbeps,H);
 +            Fp               = _mm256_add_ps(F,_mm256_mul_ps(gbeps,_mm256_add_ps(G,Heps)));
 +            VV               = _mm256_add_ps(Y,_mm256_mul_ps(gbeps,Fp));
 +            vgb              = _mm256_mul_ps(gbqqfactor,VV);
 +
 +            FF               = _mm256_add_ps(Fp,_mm256_mul_ps(gbeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
 +            fgb              = _mm256_mul_ps(gbqqfactor,_mm256_mul_ps(FF,gbscale));
 +            dvdatmp          = _mm256_mul_ps(minushalf,_mm256_add_ps(vgb,_mm256_mul_ps(fgb,r00)));
 +            dvdasum          = _mm256_add_ps(dvdasum,dvdatmp);
 +            fjptrA           = dvda+jnrA;
 +            fjptrB           = dvda+jnrB;
 +            fjptrC           = dvda+jnrC;
 +            fjptrD           = dvda+jnrD;
 +            fjptrE           = dvda+jnrE;
 +            fjptrF           = dvda+jnrF;
 +            fjptrG           = dvda+jnrG;
 +            fjptrH           = dvda+jnrH;
 +            gmx_mm256_increment_8real_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
 +                                                 _mm256_mul_ps(dvdatmp,_mm256_mul_ps(isaj0,isaj0)));
 +            velec            = _mm256_mul_ps(qq00,rinv00);
 +            felec            = _mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(velec,rinv00),fgb),rinv00);
 +
 +            /* CUBIC SPLINE TABLE DISPERSION */
 +            Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
 +                                                  _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
 +            F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
 +                                                  _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
 +            G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
 +                                                  _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
 +            H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
 +                                                  _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
 +            GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
 +            Heps             = _mm256_mul_ps(vfeps,H);
 +            Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
 +            FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
 +            fvdw6            = _mm256_mul_ps(c6_00,FF);
 +
 +            /* CUBIC SPLINE TABLE REPULSION */
 +            vfitab_lo        = _mm_add_epi32(vfitab_lo,ifour);
 +            vfitab_hi        = _mm_add_epi32(vfitab_hi,ifour);
 +            Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
 +                                                  _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
 +            F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
 +                                                  _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
 +            G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
 +                                                  _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
 +            H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
 +                                                  _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
 +            GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
 +            Heps             = _mm256_mul_ps(vfeps,H);
 +            Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
 +            FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
 +            fvdw12           = _mm256_mul_ps(c12_00,FF);
 +            fvdw             = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_add_ps(fvdw6,fvdw12),_mm256_mul_ps(vftabscale,rinv00)));
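 +            /* FF from the spline is dV/d(eps); multiplying by vftabscale converts it
 +             * to dV/dr, the rinv00 factor yields the scalar force used with dx/dy/dz,
 +             * and the xor with signbit supplies the minus sign in F = -dV/dr. */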
 +
 +            fscal            = _mm256_add_ps(felec,fvdw);
 +
 +            /* Calculate temporary vectorial force */
 +            tx               = _mm256_mul_ps(fscal,dx00);
 +            ty               = _mm256_mul_ps(fscal,dy00);
 +            tz               = _mm256_mul_ps(fscal,dz00);
 +
 +            /* Update vectorial force */
 +            fix0             = _mm256_add_ps(fix0,tx);
 +            fiy0             = _mm256_add_ps(fiy0,ty);
 +            fiz0             = _mm256_add_ps(fiz0,tz);
 +
 +            fjptrA             = f+j_coord_offsetA;
 +            fjptrB             = f+j_coord_offsetB;
 +            fjptrC             = f+j_coord_offsetC;
 +            fjptrD             = f+j_coord_offsetD;
 +            fjptrE             = f+j_coord_offsetE;
 +            fjptrF             = f+j_coord_offsetF;
 +            fjptrG             = f+j_coord_offsetG;
 +            fjptrH             = f+j_coord_offsetH;
 +            gmx_mm256_decrement_1rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,tx,ty,tz);
 +
 +            /* Inner loop uses 81 flops */
 +        }
 +
 +        if(jidx<j_index_end)
 +        {
 +
 +            /* Get j neighbor index, and coordinate index */
 +            jnrlistA         = jjnr[jidx];
 +            jnrlistB         = jjnr[jidx+1];
 +            jnrlistC         = jjnr[jidx+2];
 +            jnrlistD         = jjnr[jidx+3];
 +            jnrlistE         = jjnr[jidx+4];
 +            jnrlistF         = jjnr[jidx+5];
 +            jnrlistG         = jjnr[jidx+6];
 +            jnrlistH         = jjnr[jidx+7];
 +            /* Sign of each element will be negative for non-real atoms.
 +             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
 +             * so use it as val = _mm256_andnot_ps(mask,val) to clear dummy entries.
 +             */
 +            dummy_mask = gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx+4)),_mm_setzero_si128())),
 +                                            gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128())));
 +
 +            jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
 +            jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
 +            jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
 +            jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
 +            jnrE       = (jnrlistE>=0) ? jnrlistE : 0;
 +            jnrF       = (jnrlistF>=0) ? jnrlistF : 0;
 +            jnrG       = (jnrlistG>=0) ? jnrlistG : 0;
 +            jnrH       = (jnrlistH>=0) ? jnrlistH : 0;
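 +            /* Negative (padding) indices were clamped to 0 so the loads below stay
 +             * in bounds; their contributions are removed later via dummy_mask and
 +             * the scratch pointers. */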
 +            j_coord_offsetA  = DIM*jnrA;
 +            j_coord_offsetB  = DIM*jnrB;
 +            j_coord_offsetC  = DIM*jnrC;
 +            j_coord_offsetD  = DIM*jnrD;
 +            j_coord_offsetE  = DIM*jnrE;
 +            j_coord_offsetF  = DIM*jnrF;
 +            j_coord_offsetG  = DIM*jnrG;
 +            j_coord_offsetH  = DIM*jnrH;
 +
 +            /* load j atom coordinates */
 +            gmx_mm256_load_1rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
 +                                                 x+j_coord_offsetC,x+j_coord_offsetD,
 +                                                 x+j_coord_offsetE,x+j_coord_offsetF,
 +                                                 x+j_coord_offsetG,x+j_coord_offsetH,
 +                                                 &jx0,&jy0,&jz0);
 +
 +            /* Calculate displacement vector */
 +            dx00             = _mm256_sub_ps(ix0,jx0);
 +            dy00             = _mm256_sub_ps(iy0,jy0);
 +            dz00             = _mm256_sub_ps(iz0,jz0);
 +
 +            /* Calculate squared distance and things based on it */
 +            rsq00            = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
 +
 +            rinv00           = gmx_mm256_invsqrt_ps(rsq00);
 +
 +            /* Load parameters for j particles */
 +            jq0              = gmx_mm256_load_8real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
 +                                                                 charge+jnrC+0,charge+jnrD+0,
 +                                                                 charge+jnrE+0,charge+jnrF+0,
 +                                                                 charge+jnrG+0,charge+jnrH+0);
 +            isaj0            = gmx_mm256_load_8real_swizzle_ps(invsqrta+jnrA+0,invsqrta+jnrB+0,
 +                                                                 invsqrta+jnrC+0,invsqrta+jnrD+0,
 +                                                                 invsqrta+jnrE+0,invsqrta+jnrF+0,
 +                                                                 invsqrta+jnrG+0,invsqrta+jnrH+0);
 +            vdwjidx0A        = 2*vdwtype[jnrA+0];
 +            vdwjidx0B        = 2*vdwtype[jnrB+0];
 +            vdwjidx0C        = 2*vdwtype[jnrC+0];
 +            vdwjidx0D        = 2*vdwtype[jnrD+0];
 +            vdwjidx0E        = 2*vdwtype[jnrE+0];
 +            vdwjidx0F        = 2*vdwtype[jnrF+0];
 +            vdwjidx0G        = 2*vdwtype[jnrG+0];
 +            vdwjidx0H        = 2*vdwtype[jnrH+0];
 +
 +            /**************************
 +             * CALCULATE INTERACTIONS *
 +             **************************/
 +
 +            r00              = _mm256_mul_ps(rsq00,rinv00);
 +            r00              = _mm256_andnot_ps(dummy_mask,r00);
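 +            /* Zeroing r for dummy lanes keeps the table indices computed from it at
 +             * 0, so the gathers below never read past the end of the tables. */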
 +
 +            /* Compute parameters for interactions between i and j atoms */
 +            qq00             = _mm256_mul_ps(iq0,jq0);
 +            gmx_mm256_load_8pair_swizzle_ps(vdwioffsetptr0+vdwjidx0A,
 +                                            vdwioffsetptr0+vdwjidx0B,
 +                                            vdwioffsetptr0+vdwjidx0C,
 +                                            vdwioffsetptr0+vdwjidx0D,
 +                                            vdwioffsetptr0+vdwjidx0E,
 +                                            vdwioffsetptr0+vdwjidx0F,
 +                                            vdwioffsetptr0+vdwjidx0G,
 +                                            vdwioffsetptr0+vdwjidx0H,
 +                                            &c6_00,&c12_00);
 +
 +            /* Calculate table index by multiplying r with table scale and truncate to integer */
 +            rt               = _mm256_mul_ps(r00,vftabscale);
 +            vfitab           = _mm256_cvttps_epi32(rt);
 +            vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
 +            /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
 +            vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
 +            vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
 +            vfitab_lo        = _mm_slli_epi32(vfitab_lo,3);
 +            vfitab_hi        = _mm_slli_epi32(vfitab_hi,3);
 +
 +            /* GENERALIZED BORN AND COULOMB ELECTROSTATICS */
 +            isaprod          = _mm256_mul_ps(isai0,isaj0);
 +            gbqqfactor       = _mm256_xor_ps(signbit,_mm256_mul_ps(qq00,_mm256_mul_ps(isaprod,gbinvepsdiff)));
 +            gbscale          = _mm256_mul_ps(isaprod,gbtabscale);
 +
 +            /* Calculate generalized born table index - this is a separate table from the normal one,
 +             * but we use the same procedure by multiplying r with scale and truncating to integer.
 +             */
 +            rt               = _mm256_mul_ps(r00,gbscale);
 +            gbitab           = _mm256_cvttps_epi32(rt);
 +            gbeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
 +            /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
 +            gbitab_lo        = _mm256_extractf128_si256(gbitab,0x0);
 +            gbitab_hi        = _mm256_extractf128_si256(gbitab,0x1);
 +            gbitab_lo        = _mm_slli_epi32(gbitab_lo,2);
 +            gbitab_hi        = _mm_slli_epi32(gbitab_hi,2);
 +            Y                = gmx_mm256_set_m128(_mm_load_ps(gbtab + _mm_extract_epi32(gbitab_hi,0)),
 +                                                  _mm_load_ps(gbtab + _mm_extract_epi32(gbitab_lo,0)));
 +            F                = gmx_mm256_set_m128(_mm_load_ps(gbtab + _mm_extract_epi32(gbitab_hi,1)),
 +                                                  _mm_load_ps(gbtab + _mm_extract_epi32(gbitab_lo,1)));
 +            G                = gmx_mm256_set_m128(_mm_load_ps(gbtab + _mm_extract_epi32(gbitab_hi,2)),
 +                                                  _mm_load_ps(gbtab + _mm_extract_epi32(gbitab_lo,2)));
 +            H                = gmx_mm256_set_m128(_mm_load_ps(gbtab + _mm_extract_epi32(gbitab_hi,3)),
 +                                                  _mm_load_ps(gbtab + _mm_extract_epi32(gbitab_lo,3)));
 +            GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
 +            Heps             = _mm256_mul_ps(gbeps,H);
 +            Fp               = _mm256_add_ps(F,_mm256_mul_ps(gbeps,_mm256_add_ps(G,Heps)));
 +            VV               = _mm256_add_ps(Y,_mm256_mul_ps(gbeps,Fp));
 +            vgb              = _mm256_mul_ps(gbqqfactor,VV);
 +
 +            FF               = _mm256_add_ps(Fp,_mm256_mul_ps(gbeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
 +            fgb              = _mm256_mul_ps(gbqqfactor,_mm256_mul_ps(FF,gbscale));
 +            dvdatmp          = _mm256_mul_ps(minushalf,_mm256_add_ps(vgb,_mm256_mul_ps(fgb,r00)));
++            dvdatmp          = _mm256_andnot_ps(dummy_mask,dvdatmp);
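 +            /* Masking dvdatmp keeps padding lanes out of dvdasum; the scratch
 +             * pointers below already protect the per-j dvda stores. */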
 +            dvdasum          = _mm256_add_ps(dvdasum,dvdatmp);
 +            /* The pointers to scratch make sure that this code stays correct even with
 +             * compilers that take gmx_restrict seriously (e.g. icc 13): stores for dummy
 +             * entries go to scratch instead of aliasing real output. */
 +            fjptrA             = (jnrlistA>=0) ? dvda+jnrA : scratch;
 +            fjptrB             = (jnrlistB>=0) ? dvda+jnrB : scratch;
 +            fjptrC             = (jnrlistC>=0) ? dvda+jnrC : scratch;
 +            fjptrD             = (jnrlistD>=0) ? dvda+jnrD : scratch;
 +            fjptrE             = (jnrlistE>=0) ? dvda+jnrE : scratch;
 +            fjptrF             = (jnrlistF>=0) ? dvda+jnrF : scratch;
 +            fjptrG             = (jnrlistG>=0) ? dvda+jnrG : scratch;
 +            fjptrH             = (jnrlistH>=0) ? dvda+jnrH : scratch;
 +            gmx_mm256_increment_8real_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
 +                                                 _mm256_mul_ps(dvdatmp,_mm256_mul_ps(isaj0,isaj0)));
 +            velec            = _mm256_mul_ps(qq00,rinv00);
 +            felec            = _mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(velec,rinv00),fgb),rinv00);
 +
 +            /* CUBIC SPLINE TABLE DISPERSION */
 +            Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
 +                                                  _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
 +            F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
 +                                                  _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
 +            G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
 +                                                  _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
 +            H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
 +                                                  _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
 +            GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
 +            Heps             = _mm256_mul_ps(vfeps,H);
 +            Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
 +            FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
 +            fvdw6            = _mm256_mul_ps(c6_00,FF);
 +
 +            /* CUBIC SPLINE TABLE REPULSION */
 +            vfitab_lo        = _mm_add_epi32(vfitab_lo,ifour);
 +            vfitab_hi        = _mm_add_epi32(vfitab_hi,ifour);
 +            Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
 +                                                  _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
 +            F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
 +                                                  _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
 +            G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
 +                                                  _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
 +            H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
 +                                                  _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
 +            GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
 +            Heps             = _mm256_mul_ps(vfeps,H);
 +            Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
 +            FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
 +            fvdw12           = _mm256_mul_ps(c12_00,FF);
 +            fvdw             = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_add_ps(fvdw6,fvdw12),_mm256_mul_ps(vftabscale,rinv00)));
 +
 +            fscal            = _mm256_add_ps(felec,fvdw);
 +
 +            fscal            = _mm256_andnot_ps(dummy_mask,fscal);
 +
 +            /* Calculate temporary vectorial force */
 +            tx               = _mm256_mul_ps(fscal,dx00);
 +            ty               = _mm256_mul_ps(fscal,dy00);
 +            tz               = _mm256_mul_ps(fscal,dz00);
 +
 +            /* Update vectorial force */
 +            fix0             = _mm256_add_ps(fix0,tx);
 +            fiy0             = _mm256_add_ps(fiy0,ty);
 +            fiz0             = _mm256_add_ps(fiz0,tz);
 +
 +            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
 +            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
 +            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
 +            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
 +            fjptrE             = (jnrlistE>=0) ? f+j_coord_offsetE : scratch;
 +            fjptrF             = (jnrlistF>=0) ? f+j_coord_offsetF : scratch;
 +            fjptrG             = (jnrlistG>=0) ? f+j_coord_offsetG : scratch;
 +            fjptrH             = (jnrlistH>=0) ? f+j_coord_offsetH : scratch;
 +            gmx_mm256_decrement_1rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,tx,ty,tz);
 +
 +            /* Inner loop uses 82 flops */
 +        }
 +
 +        /* End of innermost loop */
 +
 +        gmx_mm256_update_iforce_1atom_swizzle_ps(fix0,fiy0,fiz0,
 +                                                 f+i_coord_offset,fshift+i_shift_offset);
 +
 +        dvdasum = _mm256_mul_ps(dvdasum, _mm256_mul_ps(isai0,isai0));
 +        gmx_mm256_update_1pot_ps(dvdasum,dvda+inr);
 +
 +        /* Increment number of inner iterations */
 +        inneriter                  += j_index_end - j_index_start;
 +
 +        /* Outer loop uses 7 flops */
 +    }
 +
 +    /* Increment number of outer iterations */
 +    outeriter        += nri;
 +
 +    /* Update outer/inner flops */
 +
 +    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_F,outeriter*7 + inneriter*82);
 +}
index f626f719200621bf6b1e032012b1f269bb4e617b,0000000000000000000000000000000000000000..464ff1a4fa98df3fb02d40cb02381d04a32357fb
mode 100644,000000..100644
--- /dev/null
@@@ -1,1023 -1,0 +1,1025 @@@
 +/*
 + * Note: this file was generated by the Gromacs avx_256_single kernel generator.
 + *
 + *                This source code is part of
 + *
 + *                 G   R   O   M   A   C   S
 + *
 + * Copyright (c) 2001-2012, The GROMACS Development Team
 + *
 + * Gromacs is a library for molecular simulation and trajectory analysis,
 + * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
 + * a full list of developers and information, check out http://www.gromacs.org
 + *
 + * This program is free software; you can redistribute it and/or modify it under
 + * the terms of the GNU Lesser General Public License as published by the Free
 + * Software Foundation; either version 2 of the License, or (at your option) any
 + * later version.
 + *
 + * To help fund GROMACS development, we humbly ask that you cite
 + * the papers people have written on it - you can find them on the website.
 + */
 +#ifdef HAVE_CONFIG_H
 +#include <config.h>
 +#endif
 +
 +#include <math.h>
 +
 +#include "../nb_kernel.h"
 +#include "types/simple.h"
 +#include "vec.h"
 +#include "nrnb.h"
 +
 +#include "gmx_math_x86_avx_256_single.h"
 +#include "kernelutil_x86_avx_256_single.h"
 +
 +/*
 + * Gromacs nonbonded kernel:   nb_kernel_ElecGB_VdwLJ_GeomP1P1_VF_avx_256_single
 + * Electrostatics interaction: GeneralizedBorn
 + * VdW interaction:            LennardJones
 + * Geometry:                   Particle-Particle
 + * Calculate force/pot:        PotentialAndForce
 + */
 +void
 +nb_kernel_ElecGB_VdwLJ_GeomP1P1_VF_avx_256_single
 +                    (t_nblist * gmx_restrict                nlist,
 +                     rvec * gmx_restrict                    xx,
 +                     rvec * gmx_restrict                    ff,
 +                     t_forcerec * gmx_restrict              fr,
 +                     t_mdatoms * gmx_restrict               mdatoms,
 +                     nb_kernel_data_t * gmx_restrict        kernel_data,
 +                     t_nrnb * gmx_restrict                  nrnb)
 +{
 +    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
 +     * just 0 for non-waters.
 +     * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, i.e. the eight different
 +     * jnr indices corresponding to data put in the eight positions of the SIMD register.
 +     */
 +    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
 +    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
 +    int              jnrA,jnrB,jnrC,jnrD;
 +    int              jnrE,jnrF,jnrG,jnrH;
 +    int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
 +    int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
 +    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
 +    int              j_coord_offsetE,j_coord_offsetF,j_coord_offsetG,j_coord_offsetH;
 +    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
 +    real             rcutoff_scalar;
 +    real             *shiftvec,*fshift,*x,*f;
 +    real             *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH;
 +    real             scratch[4*DIM];
 +    __m256           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
 +    real *           vdwioffsetptr0;
 +    __m256           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
 +    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D,vdwjidx0E,vdwjidx0F,vdwjidx0G,vdwjidx0H;
 +    __m256           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
 +    __m256           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
 +    __m256           velec,felec,velecsum,facel,crf,krf,krf2;
 +    real             *charge;
 +    __m256i          gbitab;
 +    __m128i          gbitab_lo,gbitab_hi;
 +    __m256           vgb,fgb,vgbsum,dvdasum,gbscale,gbtabscale,isaprod,gbqqfactor,gbinvepsdiff,gbeps,dvdatmp;
 +    __m256           minushalf = _mm256_set1_ps(-0.5);
 +    real             *invsqrta,*dvda,*gbtab;
 +    int              nvdwtype;
 +    __m256           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
 +    int              *vdwtype;
 +    real             *vdwparam;
 +    __m256           one_sixth   = _mm256_set1_ps(1.0/6.0);
 +    __m256           one_twelfth = _mm256_set1_ps(1.0/12.0);
 +    __m256i          vfitab;
 +    __m128i          vfitab_lo,vfitab_hi;
 +    __m128i          ifour       = _mm_set1_epi32(4);
 +    __m256           rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
 +    real             *vftab;
 +    __m256           dummy_mask,cutoff_mask;
 +    __m256           signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
 +    __m256           one     = _mm256_set1_ps(1.0);
 +    __m256           two     = _mm256_set1_ps(2.0);
 +    x                = xx[0];
 +    f                = ff[0];
 +
 +    nri              = nlist->nri;
 +    iinr             = nlist->iinr;
 +    jindex           = nlist->jindex;
 +    jjnr             = nlist->jjnr;
 +    shiftidx         = nlist->shift;
 +    gid              = nlist->gid;
 +    shiftvec         = fr->shift_vec[0];
 +    fshift           = fr->fshift[0];
 +    facel            = _mm256_set1_ps(fr->epsfac);
 +    charge           = mdatoms->chargeA;
 +    nvdwtype         = fr->ntype;
 +    vdwparam         = fr->nbfp;
 +    vdwtype          = mdatoms->typeA;
 +
 +    invsqrta         = fr->invsqrta;
 +    dvda             = fr->dvda;
 +    gbtabscale       = _mm256_set1_ps(fr->gbtab.scale);
 +    gbtab            = fr->gbtab.data;
 +    gbinvepsdiff     = _mm256_set1_ps((1.0/fr->epsilon_r) - (1.0/fr->gb_epsilon_solvent));
 +
 +    /* Avoid stupid compiler warnings */
 +    jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0;
 +    j_coord_offsetA = 0;
 +    j_coord_offsetB = 0;
 +    j_coord_offsetC = 0;
 +    j_coord_offsetD = 0;
 +    j_coord_offsetE = 0;
 +    j_coord_offsetF = 0;
 +    j_coord_offsetG = 0;
 +    j_coord_offsetH = 0;
 +
 +    outeriter        = 0;
 +    inneriter        = 0;
 +
 +    for(iidx=0;iidx<4*DIM;iidx++)
 +    {
 +        scratch[iidx] = 0.0;
 +    }
 +
 +    /* Start outer loop over neighborlists */
 +    for(iidx=0; iidx<nri; iidx++)
 +    {
 +        /* Load shift vector for this list */
 +        i_shift_offset   = DIM*shiftidx[iidx];
 +
 +        /* Load limits for loop over neighbors */
 +        j_index_start    = jindex[iidx];
 +        j_index_end      = jindex[iidx+1];
 +
 +        /* Get outer coordinate index */
 +        inr              = iinr[iidx];
 +        i_coord_offset   = DIM*inr;
 +
 +        /* Load i particle coords and add shift vector */
 +        gmx_mm256_load_shift_and_1rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
 +
 +        fix0             = _mm256_setzero_ps();
 +        fiy0             = _mm256_setzero_ps();
 +        fiz0             = _mm256_setzero_ps();
 +
 +        /* Load parameters for i particles */
 +        iq0              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+0]));
 +        isai0            = _mm256_set1_ps(invsqrta[inr+0]);
 +        vdwioffsetptr0   = vdwparam+2*nvdwtype*vdwtype[inr+0];
 +
 +        /* Reset potential sums */
 +        velecsum         = _mm256_setzero_ps();
 +        vgbsum           = _mm256_setzero_ps();
 +        vvdwsum          = _mm256_setzero_ps();
 +        dvdasum          = _mm256_setzero_ps();
 +
 +        /* Start inner kernel loop */
 +        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+7]>=0; jidx+=8)
 +        {
 +
 +            /* Get j neighbor index, and coordinate index */
 +            jnrA             = jjnr[jidx];
 +            jnrB             = jjnr[jidx+1];
 +            jnrC             = jjnr[jidx+2];
 +            jnrD             = jjnr[jidx+3];
 +            jnrE             = jjnr[jidx+4];
 +            jnrF             = jjnr[jidx+5];
 +            jnrG             = jjnr[jidx+6];
 +            jnrH             = jjnr[jidx+7];
 +            j_coord_offsetA  = DIM*jnrA;
 +            j_coord_offsetB  = DIM*jnrB;
 +            j_coord_offsetC  = DIM*jnrC;
 +            j_coord_offsetD  = DIM*jnrD;
 +            j_coord_offsetE  = DIM*jnrE;
 +            j_coord_offsetF  = DIM*jnrF;
 +            j_coord_offsetG  = DIM*jnrG;
 +            j_coord_offsetH  = DIM*jnrH;
 +
 +            /* load j atom coordinates */
 +            gmx_mm256_load_1rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
 +                                                 x+j_coord_offsetC,x+j_coord_offsetD,
 +                                                 x+j_coord_offsetE,x+j_coord_offsetF,
 +                                                 x+j_coord_offsetG,x+j_coord_offsetH,
 +                                                 &jx0,&jy0,&jz0);
 +
 +            /* Calculate displacement vector */
 +            dx00             = _mm256_sub_ps(ix0,jx0);
 +            dy00             = _mm256_sub_ps(iy0,jy0);
 +            dz00             = _mm256_sub_ps(iz0,jz0);
 +
 +            /* Calculate squared distance and things based on it */
 +            rsq00            = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
 +
 +            rinv00           = gmx_mm256_invsqrt_ps(rsq00);
 +
 +            rinvsq00         = _mm256_mul_ps(rinv00,rinv00);
 +
 +            /* Load parameters for j particles */
 +            jq0              = gmx_mm256_load_8real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
 +                                                                 charge+jnrC+0,charge+jnrD+0,
 +                                                                 charge+jnrE+0,charge+jnrF+0,
 +                                                                 charge+jnrG+0,charge+jnrH+0);
 +            isaj0            = gmx_mm256_load_8real_swizzle_ps(invsqrta+jnrA+0,invsqrta+jnrB+0,
 +                                                                 invsqrta+jnrC+0,invsqrta+jnrD+0,
 +                                                                 invsqrta+jnrE+0,invsqrta+jnrF+0,
 +                                                                 invsqrta+jnrG+0,invsqrta+jnrH+0);
 +            vdwjidx0A        = 2*vdwtype[jnrA+0];
 +            vdwjidx0B        = 2*vdwtype[jnrB+0];
 +            vdwjidx0C        = 2*vdwtype[jnrC+0];
 +            vdwjidx0D        = 2*vdwtype[jnrD+0];
 +            vdwjidx0E        = 2*vdwtype[jnrE+0];
 +            vdwjidx0F        = 2*vdwtype[jnrF+0];
 +            vdwjidx0G        = 2*vdwtype[jnrG+0];
 +            vdwjidx0H        = 2*vdwtype[jnrH+0];
 +
 +            /**************************
 +             * CALCULATE INTERACTIONS *
 +             **************************/
 +
 +            r00              = _mm256_mul_ps(rsq00,rinv00);
 +
 +            /* Compute parameters for interactions between i and j atoms */
 +            qq00             = _mm256_mul_ps(iq0,jq0);
 +            gmx_mm256_load_8pair_swizzle_ps(vdwioffsetptr0+vdwjidx0A,
 +                                            vdwioffsetptr0+vdwjidx0B,
 +                                            vdwioffsetptr0+vdwjidx0C,
 +                                            vdwioffsetptr0+vdwjidx0D,
 +                                            vdwioffsetptr0+vdwjidx0E,
 +                                            vdwioffsetptr0+vdwjidx0F,
 +                                            vdwioffsetptr0+vdwjidx0G,
 +                                            vdwioffsetptr0+vdwjidx0H,
 +                                            &c6_00,&c12_00);
 +
 +            /* GENERALIZED BORN AND COULOMB ELECTROSTATICS */
 +            isaprod          = _mm256_mul_ps(isai0,isaj0);
 +            gbqqfactor       = _mm256_xor_ps(signbit,_mm256_mul_ps(qq00,_mm256_mul_ps(isaprod,gbinvepsdiff)));
 +            gbscale          = _mm256_mul_ps(isaprod,gbtabscale);
 +
 +            /* Calculate generalized born table index - this is a separate table from the normal one,
 +             * but we use the same procedure by multiplying r with scale and truncating to integer.
 +             */
 +            rt               = _mm256_mul_ps(r00,gbscale);
 +            gbitab           = _mm256_cvttps_epi32(rt);
 +            gbeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
 +            /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
 +            gbitab_lo        = _mm256_extractf128_si256(gbitab,0x0);
 +            gbitab_hi        = _mm256_extractf128_si256(gbitab,0x1);
 +            gbitab_lo        = _mm_slli_epi32(gbitab_lo,2);
 +            gbitab_hi        = _mm_slli_epi32(gbitab_hi,2);
 +            Y                = gmx_mm256_set_m128(_mm_load_ps(gbtab + _mm_extract_epi32(gbitab_hi,0)),
 +                                                  _mm_load_ps(gbtab + _mm_extract_epi32(gbitab_lo,0)));
 +            F                = gmx_mm256_set_m128(_mm_load_ps(gbtab + _mm_extract_epi32(gbitab_hi,1)),
 +                                                  _mm_load_ps(gbtab + _mm_extract_epi32(gbitab_lo,1)));
 +            G                = gmx_mm256_set_m128(_mm_load_ps(gbtab + _mm_extract_epi32(gbitab_hi,2)),
 +                                                  _mm_load_ps(gbtab + _mm_extract_epi32(gbitab_lo,2)));
 +            H                = gmx_mm256_set_m128(_mm_load_ps(gbtab + _mm_extract_epi32(gbitab_hi,3)),
 +                                                  _mm_load_ps(gbtab + _mm_extract_epi32(gbitab_lo,3)));
 +            GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
 +            Heps             = _mm256_mul_ps(gbeps,H);
 +            Fp               = _mm256_add_ps(F,_mm256_mul_ps(gbeps,_mm256_add_ps(G,Heps)));
 +            VV               = _mm256_add_ps(Y,_mm256_mul_ps(gbeps,Fp));
 +            vgb              = _mm256_mul_ps(gbqqfactor,VV);
 +
 +            FF               = _mm256_add_ps(Fp,_mm256_mul_ps(gbeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
 +            fgb              = _mm256_mul_ps(gbqqfactor,_mm256_mul_ps(FF,gbscale));
 +            dvdatmp          = _mm256_mul_ps(minushalf,_mm256_add_ps(vgb,_mm256_mul_ps(fgb,r00)));
 +            dvdasum          = _mm256_add_ps(dvdasum,dvdatmp);
 +            fjptrA           = dvda+jnrA;
 +            fjptrB           = dvda+jnrB;
 +            fjptrC           = dvda+jnrC;
 +            fjptrD           = dvda+jnrD;
 +            fjptrE           = dvda+jnrE;
 +            fjptrF           = dvda+jnrF;
 +            fjptrG           = dvda+jnrG;
 +            fjptrH           = dvda+jnrH;
 +            gmx_mm256_increment_8real_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
 +                                                 _mm256_mul_ps(dvdatmp,_mm256_mul_ps(isaj0,isaj0)));
 +            velec            = _mm256_mul_ps(qq00,rinv00);
 +            felec            = _mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(velec,rinv00),fgb),rinv00);
 +
 +            /* LENNARD-JONES DISPERSION/REPULSION */
 +
 +            rinvsix          = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
 +            vvdw6            = _mm256_mul_ps(c6_00,rinvsix);
 +            vvdw12           = _mm256_mul_ps(c12_00,_mm256_mul_ps(rinvsix,rinvsix));
 +            vvdw             = _mm256_sub_ps( _mm256_mul_ps(vvdw12,one_twelfth) , _mm256_mul_ps(vvdw6,one_sixth) );
 +            fvdw             = _mm256_mul_ps(_mm256_sub_ps(vvdw12,vvdw6),rinvsq00);
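 +            /* Note: this is consistent with fr->nbfp storing 6*C6 and 12*C12, so
 +             * fvdw = (12*C12*r^-12 - 6*C6*r^-6)*r^-2 directly, while the
 +             * one_twelfth/one_sixth factors above recover the plain potential. */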
 +
 +            /* Update potential sum for this i atom from the interaction with this j atom. */
 +            velecsum         = _mm256_add_ps(velecsum,velec);
 +            vgbsum           = _mm256_add_ps(vgbsum,vgb);
 +            vvdwsum          = _mm256_add_ps(vvdwsum,vvdw);
 +
 +            fscal            = _mm256_add_ps(felec,fvdw);
 +
 +            /* Calculate temporary vectorial force */
 +            tx               = _mm256_mul_ps(fscal,dx00);
 +            ty               = _mm256_mul_ps(fscal,dy00);
 +            tz               = _mm256_mul_ps(fscal,dz00);
 +
 +            /* Update vectorial force */
 +            fix0             = _mm256_add_ps(fix0,tx);
 +            fiy0             = _mm256_add_ps(fiy0,ty);
 +            fiz0             = _mm256_add_ps(fiz0,tz);
 +
 +            fjptrA             = f+j_coord_offsetA;
 +            fjptrB             = f+j_coord_offsetB;
 +            fjptrC             = f+j_coord_offsetC;
 +            fjptrD             = f+j_coord_offsetD;
 +            fjptrE             = f+j_coord_offsetE;
 +            fjptrF             = f+j_coord_offsetF;
 +            fjptrG             = f+j_coord_offsetG;
 +            fjptrH             = f+j_coord_offsetH;
 +            gmx_mm256_decrement_1rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,tx,ty,tz);
 +
 +            /* Inner loop uses 70 flops */
 +        }
 +
 +        if(jidx<j_index_end)
 +        {
 +
 +            /* Get j neighbor index, and coordinate index */
 +            jnrlistA         = jjnr[jidx];
 +            jnrlistB         = jjnr[jidx+1];
 +            jnrlistC         = jjnr[jidx+2];
 +            jnrlistD         = jjnr[jidx+3];
 +            jnrlistE         = jjnr[jidx+4];
 +            jnrlistF         = jjnr[jidx+5];
 +            jnrlistG         = jjnr[jidx+6];
 +            jnrlistH         = jjnr[jidx+7];
 +            /* Sign of each element will be negative for non-real atoms.
 +             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
 +             * so use it as val = _mm256_andnot_ps(mask,val) to clear dummy entries.
 +             */
 +            dummy_mask = gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx+4)),_mm_setzero_si128())),
 +                                            gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128())));
 +
 +            jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
 +            jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
 +            jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
 +            jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
 +            jnrE       = (jnrlistE>=0) ? jnrlistE : 0;
 +            jnrF       = (jnrlistF>=0) ? jnrlistF : 0;
 +            jnrG       = (jnrlistG>=0) ? jnrlistG : 0;
 +            jnrH       = (jnrlistH>=0) ? jnrlistH : 0;
 +            j_coord_offsetA  = DIM*jnrA;
 +            j_coord_offsetB  = DIM*jnrB;
 +            j_coord_offsetC  = DIM*jnrC;
 +            j_coord_offsetD  = DIM*jnrD;
 +            j_coord_offsetE  = DIM*jnrE;
 +            j_coord_offsetF  = DIM*jnrF;
 +            j_coord_offsetG  = DIM*jnrG;
 +            j_coord_offsetH  = DIM*jnrH;
 +
 +            /* load j atom coordinates */
 +            gmx_mm256_load_1rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
 +                                                 x+j_coord_offsetC,x+j_coord_offsetD,
 +                                                 x+j_coord_offsetE,x+j_coord_offsetF,
 +                                                 x+j_coord_offsetG,x+j_coord_offsetH,
 +                                                 &jx0,&jy0,&jz0);
 +
 +            /* Calculate displacement vector */
 +            dx00             = _mm256_sub_ps(ix0,jx0);
 +            dy00             = _mm256_sub_ps(iy0,jy0);
 +            dz00             = _mm256_sub_ps(iz0,jz0);
 +
 +            /* Calculate squared distance and things based on it */
 +            rsq00            = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
 +
 +            rinv00           = gmx_mm256_invsqrt_ps(rsq00);
 +
 +            rinvsq00         = _mm256_mul_ps(rinv00,rinv00);
 +
 +            /* Load parameters for j particles */
 +            jq0              = gmx_mm256_load_8real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
 +                                                                 charge+jnrC+0,charge+jnrD+0,
 +                                                                 charge+jnrE+0,charge+jnrF+0,
 +                                                                 charge+jnrG+0,charge+jnrH+0);
 +            isaj0            = gmx_mm256_load_8real_swizzle_ps(invsqrta+jnrA+0,invsqrta+jnrB+0,
 +                                                                 invsqrta+jnrC+0,invsqrta+jnrD+0,
 +                                                                 invsqrta+jnrE+0,invsqrta+jnrF+0,
 +                                                                 invsqrta+jnrG+0,invsqrta+jnrH+0);
 +            vdwjidx0A        = 2*vdwtype[jnrA+0];
 +            vdwjidx0B        = 2*vdwtype[jnrB+0];
 +            vdwjidx0C        = 2*vdwtype[jnrC+0];
 +            vdwjidx0D        = 2*vdwtype[jnrD+0];
 +            vdwjidx0E        = 2*vdwtype[jnrE+0];
 +            vdwjidx0F        = 2*vdwtype[jnrF+0];
 +            vdwjidx0G        = 2*vdwtype[jnrG+0];
 +            vdwjidx0H        = 2*vdwtype[jnrH+0];
 +
 +            /**************************
 +             * CALCULATE INTERACTIONS *
 +             **************************/
 +
 +            r00              = _mm256_mul_ps(rsq00,rinv00);
 +            r00              = _mm256_andnot_ps(dummy_mask,r00);
 +
 +            /* Compute parameters for interactions between i and j atoms */
 +            qq00             = _mm256_mul_ps(iq0,jq0);
 +            gmx_mm256_load_8pair_swizzle_ps(vdwioffsetptr0+vdwjidx0A,
 +                                            vdwioffsetptr0+vdwjidx0B,
 +                                            vdwioffsetptr0+vdwjidx0C,
 +                                            vdwioffsetptr0+vdwjidx0D,
 +                                            vdwioffsetptr0+vdwjidx0E,
 +                                            vdwioffsetptr0+vdwjidx0F,
 +                                            vdwioffsetptr0+vdwjidx0G,
 +                                            vdwioffsetptr0+vdwjidx0H,
 +                                            &c6_00,&c12_00);
 +
 +            /* GENERALIZED BORN AND COULOMB ELECTROSTATICS */
 +            isaprod          = _mm256_mul_ps(isai0,isaj0);
 +            gbqqfactor       = _mm256_xor_ps(signbit,_mm256_mul_ps(qq00,_mm256_mul_ps(isaprod,gbinvepsdiff)));
 +            gbscale          = _mm256_mul_ps(isaprod,gbtabscale);
 +
 +            /* Calculate generalized born table index - this is a separate table from the normal one,
 +             * but we use the same procedure by multiplying r with scale and truncating to integer.
 +             */
 +            rt               = _mm256_mul_ps(r00,gbscale);
 +            gbitab           = _mm256_cvttps_epi32(rt);
 +            gbeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
 +            /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
 +            gbitab_lo        = _mm256_extractf128_si256(gbitab,0x0);
 +            gbitab_hi        = _mm256_extractf128_si256(gbitab,0x1);
 +            gbitab_lo        = _mm_slli_epi32(gbitab_lo,2);
 +            gbitab_hi        = _mm_slli_epi32(gbitab_hi,2);
 +            Y                = gmx_mm256_set_m128(_mm_load_ps(gbtab + _mm_extract_epi32(gbitab_hi,0)),
 +                                                  _mm_load_ps(gbtab + _mm_extract_epi32(gbitab_lo,0)));
 +            F                = gmx_mm256_set_m128(_mm_load_ps(gbtab + _mm_extract_epi32(gbitab_hi,1)),
 +                                                  _mm_load_ps(gbtab + _mm_extract_epi32(gbitab_lo,1)));
 +            G                = gmx_mm256_set_m128(_mm_load_ps(gbtab + _mm_extract_epi32(gbitab_hi,2)),
 +                                                  _mm_load_ps(gbtab + _mm_extract_epi32(gbitab_lo,2)));
 +            H                = gmx_mm256_set_m128(_mm_load_ps(gbtab + _mm_extract_epi32(gbitab_hi,3)),
 +                                                  _mm_load_ps(gbtab + _mm_extract_epi32(gbitab_lo,3)));
 +            GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
 +            Heps             = _mm256_mul_ps(gbeps,H);
 +            Fp               = _mm256_add_ps(F,_mm256_mul_ps(gbeps,_mm256_add_ps(G,Heps)));
 +            VV               = _mm256_add_ps(Y,_mm256_mul_ps(gbeps,Fp));
 +            vgb              = _mm256_mul_ps(gbqqfactor,VV);
 +
 +            FF               = _mm256_add_ps(Fp,_mm256_mul_ps(gbeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
 +            fgb              = _mm256_mul_ps(gbqqfactor,_mm256_mul_ps(FF,gbscale));
 +            dvdatmp          = _mm256_mul_ps(minushalf,_mm256_add_ps(vgb,_mm256_mul_ps(fgb,r00)));
++            dvdatmp          = _mm256_andnot_ps(dummy_mask,dvdatmp);
 +            dvdasum          = _mm256_add_ps(dvdasum,dvdatmp);
 +            /* The pointers to scratch make sure that this code stays correct even with
 +             * compilers that take gmx_restrict seriously (e.g. icc 13): stores for dummy
 +             * entries go to scratch instead of aliasing real output. */
 +            fjptrA             = (jnrlistA>=0) ? dvda+jnrA : scratch;
 +            fjptrB             = (jnrlistB>=0) ? dvda+jnrB : scratch;
 +            fjptrC             = (jnrlistC>=0) ? dvda+jnrC : scratch;
 +            fjptrD             = (jnrlistD>=0) ? dvda+jnrD : scratch;
 +            fjptrE             = (jnrlistE>=0) ? dvda+jnrE : scratch;
 +            fjptrF             = (jnrlistF>=0) ? dvda+jnrF : scratch;
 +            fjptrG             = (jnrlistG>=0) ? dvda+jnrG : scratch;
 +            fjptrH             = (jnrlistH>=0) ? dvda+jnrH : scratch;
 +            gmx_mm256_increment_8real_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
 +                                                 _mm256_mul_ps(dvdatmp,_mm256_mul_ps(isaj0,isaj0)));
 +            velec            = _mm256_mul_ps(qq00,rinv00);
 +            felec            = _mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(velec,rinv00),fgb),rinv00);
 +
 +            /* LENNARD-JONES DISPERSION/REPULSION */
 +
 +            rinvsix          = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
 +            vvdw6            = _mm256_mul_ps(c6_00,rinvsix);
 +            vvdw12           = _mm256_mul_ps(c12_00,_mm256_mul_ps(rinvsix,rinvsix));
 +            vvdw             = _mm256_sub_ps( _mm256_mul_ps(vvdw12,one_twelfth) , _mm256_mul_ps(vvdw6,one_sixth) );
 +            fvdw             = _mm256_mul_ps(_mm256_sub_ps(vvdw12,vvdw6),rinvsq00);
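 +            /* Note: assuming the usual GROMACS convention, c6_00/c12_00 come from nbfp
 +             * premultiplied by 6 and 12, so the force above needs no extra factors while
 +             * the potential is rescaled by one_twelfth and one_sixth.
 +             */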
 +
 +            /* Update potential sum for this i atom from the interaction with this j atom. */
 +            velec            = _mm256_andnot_ps(dummy_mask,velec);
 +            velecsum         = _mm256_add_ps(velecsum,velec);
 +            vgb              = _mm256_andnot_ps(dummy_mask,vgb);
 +            vgbsum           = _mm256_add_ps(vgbsum,vgb);
 +            vvdw             = _mm256_andnot_ps(dummy_mask,vvdw);
 +            vvdwsum          = _mm256_add_ps(vvdwsum,vvdw);
 +
 +            fscal            = _mm256_add_ps(felec,fvdw);
 +
 +            fscal            = _mm256_andnot_ps(dummy_mask,fscal);
 +
 +            /* Calculate temporary vectorial force */
 +            tx               = _mm256_mul_ps(fscal,dx00);
 +            ty               = _mm256_mul_ps(fscal,dy00);
 +            tz               = _mm256_mul_ps(fscal,dz00);
 +
 +            /* Update vectorial force */
 +            fix0             = _mm256_add_ps(fix0,tx);
 +            fiy0             = _mm256_add_ps(fiy0,ty);
 +            fiz0             = _mm256_add_ps(fiz0,tz);
 +
 +            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
 +            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
 +            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
 +            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
 +            fjptrE             = (jnrlistE>=0) ? f+j_coord_offsetE : scratch;
 +            fjptrF             = (jnrlistF>=0) ? f+j_coord_offsetF : scratch;
 +            fjptrG             = (jnrlistG>=0) ? f+j_coord_offsetG : scratch;
 +            fjptrH             = (jnrlistH>=0) ? f+j_coord_offsetH : scratch;
 +            gmx_mm256_decrement_1rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,tx,ty,tz);
 +
 +            /* Inner loop uses 71 flops */
 +        }
 +
 +        /* End of innermost loop */
 +
 +        gmx_mm256_update_iforce_1atom_swizzle_ps(fix0,fiy0,fiz0,
 +                                                 f+i_coord_offset,fshift+i_shift_offset);
 +
 +        ggid                        = gid[iidx];
 +        /* Update potential energies */
 +        gmx_mm256_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
 +        gmx_mm256_update_1pot_ps(vgbsum,kernel_data->energygrp_polarization+ggid);
 +        gmx_mm256_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
 +        dvdasum = _mm256_mul_ps(dvdasum, _mm256_mul_ps(isai0,isai0));
 +        gmx_mm256_update_1pot_ps(dvdasum,dvda+inr);
 +
 +        /* Increment number of inner iterations */
 +        inneriter                  += j_index_end - j_index_start;
 +
 +        /* Outer loop uses 10 flops */
 +    }
 +
 +    /* Increment number of outer iterations */
 +    outeriter        += nri;
 +
 +    /* Update outer/inner flops */
 +
 +    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_VF,outeriter*10 + inneriter*71);
 +}
 +/*
 + * Gromacs nonbonded kernel:   nb_kernel_ElecGB_VdwLJ_GeomP1P1_F_avx_256_single
 + * Electrostatics interaction: GeneralizedBorn
 + * VdW interaction:            LennardJones
 + * Geometry:                   Particle-Particle
 + * Calculate force/pot:        Force
 + */
 +void
 +nb_kernel_ElecGB_VdwLJ_GeomP1P1_F_avx_256_single
 +                    (t_nblist * gmx_restrict                nlist,
 +                     rvec * gmx_restrict                    xx,
 +                     rvec * gmx_restrict                    ff,
 +                     t_forcerec * gmx_restrict              fr,
 +                     t_mdatoms * gmx_restrict               mdatoms,
 +                     nb_kernel_data_t * gmx_restrict        kernel_data,
 +                     t_nrnb * gmx_restrict                  nrnb)
 +{
 +    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
 +     * just 0 for non-waters.
 +     * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, e.g. for the eight different
 +     * jnr indices corresponding to data put in the eight positions in the SIMD register.
 +     */
 +    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
 +    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
 +    int              jnrA,jnrB,jnrC,jnrD;
 +    int              jnrE,jnrF,jnrG,jnrH;
 +    int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
 +    int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
 +    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
 +    int              j_coord_offsetE,j_coord_offsetF,j_coord_offsetG,j_coord_offsetH;
 +    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
 +    real             rcutoff_scalar;
 +    real             *shiftvec,*fshift,*x,*f;
 +    real             *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH;
 +    real             scratch[4*DIM];
 +    __m256           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
 +    real *           vdwioffsetptr0;
 +    __m256           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
 +    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D,vdwjidx0E,vdwjidx0F,vdwjidx0G,vdwjidx0H;
 +    __m256           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
 +    __m256           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
 +    __m256           velec,felec,velecsum,facel,crf,krf,krf2;
 +    real             *charge;
 +    __m256i          gbitab;
 +    __m128i          gbitab_lo,gbitab_hi;
 +    __m256           vgb,fgb,vgbsum,dvdasum,gbscale,gbtabscale,isaprod,gbqqfactor,gbinvepsdiff,gbeps,dvdatmp;
 +    __m256           minushalf = _mm256_set1_ps(-0.5);
 +    real             *invsqrta,*dvda,*gbtab;
 +    int              nvdwtype;
 +    __m256           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
 +    int              *vdwtype;
 +    real             *vdwparam;
 +    __m256           one_sixth   = _mm256_set1_ps(1.0/6.0);
 +    __m256           one_twelfth = _mm256_set1_ps(1.0/12.0);
 +    __m256i          vfitab;
 +    __m128i          vfitab_lo,vfitab_hi;
 +    __m128i          ifour       = _mm_set1_epi32(4);
 +    __m256           rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
 +    real             *vftab;
 +    __m256           dummy_mask,cutoff_mask;
 +    __m256           signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
 +    __m256           one     = _mm256_set1_ps(1.0);
 +    __m256           two     = _mm256_set1_ps(2.0);
 +    x                = xx[0];
 +    f                = ff[0];
 +
 +    nri              = nlist->nri;
 +    iinr             = nlist->iinr;
 +    jindex           = nlist->jindex;
 +    jjnr             = nlist->jjnr;
 +    shiftidx         = nlist->shift;
 +    gid              = nlist->gid;
 +    shiftvec         = fr->shift_vec[0];
 +    fshift           = fr->fshift[0];
 +    facel            = _mm256_set1_ps(fr->epsfac);
 +    charge           = mdatoms->chargeA;
 +    nvdwtype         = fr->ntype;
 +    vdwparam         = fr->nbfp;
 +    vdwtype          = mdatoms->typeA;
 +
 +    invsqrta         = fr->invsqrta;
 +    dvda             = fr->dvda;
 +    gbtabscale       = _mm256_set1_ps(fr->gbtab.scale);
 +    gbtab            = fr->gbtab.data;
 +    gbinvepsdiff     = _mm256_set1_ps((1.0/fr->epsilon_r) - (1.0/fr->gb_epsilon_solvent));
 +
 +    /* Avoid stupid compiler warnings */
 +    jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0;
 +    j_coord_offsetA = 0;
 +    j_coord_offsetB = 0;
 +    j_coord_offsetC = 0;
 +    j_coord_offsetD = 0;
 +    j_coord_offsetE = 0;
 +    j_coord_offsetF = 0;
 +    j_coord_offsetG = 0;
 +    j_coord_offsetH = 0;
 +
 +    outeriter        = 0;
 +    inneriter        = 0;
 +
 +    for(iidx=0;iidx<4*DIM;iidx++)
 +    {
 +        scratch[iidx] = 0.0;
 +    }
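 +    /* Note: scratch acts as a harmless dump target; the masked epilogue below redirects
 +     * j pointers for padded (negative) neighbor entries here, so the swizzled
 +     * increment/decrement routines never touch real atom data for dummy lanes.
 +     */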
 +
 +    /* Start outer loop over neighborlists */
 +    for(iidx=0; iidx<nri; iidx++)
 +    {
 +        /* Load shift vector for this list */
 +        i_shift_offset   = DIM*shiftidx[iidx];
 +
 +        /* Load limits for loop over neighbors */
 +        j_index_start    = jindex[iidx];
 +        j_index_end      = jindex[iidx+1];
 +
 +        /* Get outer coordinate index */
 +        inr              = iinr[iidx];
 +        i_coord_offset   = DIM*inr;
 +
 +        /* Load i particle coords and add shift vector */
 +        gmx_mm256_load_shift_and_1rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
 +
 +        fix0             = _mm256_setzero_ps();
 +        fiy0             = _mm256_setzero_ps();
 +        fiz0             = _mm256_setzero_ps();
 +
 +        /* Load parameters for i particles */
 +        iq0              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+0]));
 +        isai0            = _mm256_set1_ps(invsqrta[inr+0]);
 +        vdwioffsetptr0   = vdwparam+2*nvdwtype*vdwtype[inr+0];
 +
 +        dvdasum          = _mm256_setzero_ps();
 +
 +        /* Start inner kernel loop */
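 +        /* This unrolled loop requires a full octet of real j atoms: padding entries in
 +         * the neighbor list are negative, so jjnr[jidx+7]>=0 guarantees that all eight
 +         * SIMD lanes are valid; any remainder is handled by the masked loop below.
 +         */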
 +        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+7]>=0; jidx+=8)
 +        {
 +
 +            /* Get j neighbor index, and coordinate index */
 +            jnrA             = jjnr[jidx];
 +            jnrB             = jjnr[jidx+1];
 +            jnrC             = jjnr[jidx+2];
 +            jnrD             = jjnr[jidx+3];
 +            jnrE             = jjnr[jidx+4];
 +            jnrF             = jjnr[jidx+5];
 +            jnrG             = jjnr[jidx+6];
 +            jnrH             = jjnr[jidx+7];
 +            j_coord_offsetA  = DIM*jnrA;
 +            j_coord_offsetB  = DIM*jnrB;
 +            j_coord_offsetC  = DIM*jnrC;
 +            j_coord_offsetD  = DIM*jnrD;
 +            j_coord_offsetE  = DIM*jnrE;
 +            j_coord_offsetF  = DIM*jnrF;
 +            j_coord_offsetG  = DIM*jnrG;
 +            j_coord_offsetH  = DIM*jnrH;
 +
 +            /* load j atom coordinates */
 +            gmx_mm256_load_1rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
 +                                                 x+j_coord_offsetC,x+j_coord_offsetD,
 +                                                 x+j_coord_offsetE,x+j_coord_offsetF,
 +                                                 x+j_coord_offsetG,x+j_coord_offsetH,
 +                                                 &jx0,&jy0,&jz0);
 +
 +            /* Calculate displacement vector */
 +            dx00             = _mm256_sub_ps(ix0,jx0);
 +            dy00             = _mm256_sub_ps(iy0,jy0);
 +            dz00             = _mm256_sub_ps(iz0,jz0);
 +
 +            /* Calculate squared distance and things based on it */
 +            rsq00            = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
 +
 +            rinv00           = gmx_mm256_invsqrt_ps(rsq00);
 +
 +            rinvsq00         = _mm256_mul_ps(rinv00,rinv00);
 +
 +            /* Load parameters for j particles */
 +            jq0              = gmx_mm256_load_8real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
 +                                                                 charge+jnrC+0,charge+jnrD+0,
 +                                                                 charge+jnrE+0,charge+jnrF+0,
 +                                                                 charge+jnrG+0,charge+jnrH+0);
 +            isaj0            = gmx_mm256_load_8real_swizzle_ps(invsqrta+jnrA+0,invsqrta+jnrB+0,
 +                                                                 invsqrta+jnrC+0,invsqrta+jnrD+0,
 +                                                                 invsqrta+jnrE+0,invsqrta+jnrF+0,
 +                                                                 invsqrta+jnrG+0,invsqrta+jnrH+0);
 +            vdwjidx0A        = 2*vdwtype[jnrA+0];
 +            vdwjidx0B        = 2*vdwtype[jnrB+0];
 +            vdwjidx0C        = 2*vdwtype[jnrC+0];
 +            vdwjidx0D        = 2*vdwtype[jnrD+0];
 +            vdwjidx0E        = 2*vdwtype[jnrE+0];
 +            vdwjidx0F        = 2*vdwtype[jnrF+0];
 +            vdwjidx0G        = 2*vdwtype[jnrG+0];
 +            vdwjidx0H        = 2*vdwtype[jnrH+0];
 +
 +            /**************************
 +             * CALCULATE INTERACTIONS *
 +             **************************/
 +
 +            r00              = _mm256_mul_ps(rsq00,rinv00);
 +
 +            /* Compute parameters for interactions between i and j atoms */
 +            qq00             = _mm256_mul_ps(iq0,jq0);
 +            gmx_mm256_load_8pair_swizzle_ps(vdwioffsetptr0+vdwjidx0A,
 +                                            vdwioffsetptr0+vdwjidx0B,
 +                                            vdwioffsetptr0+vdwjidx0C,
 +                                            vdwioffsetptr0+vdwjidx0D,
 +                                            vdwioffsetptr0+vdwjidx0E,
 +                                            vdwioffsetptr0+vdwjidx0F,
 +                                            vdwioffsetptr0+vdwjidx0G,
 +                                            vdwioffsetptr0+vdwjidx0H,
 +                                            &c6_00,&c12_00);
 +
 +            /* GENERALIZED BORN AND COULOMB ELECTROSTATICS */
 +            isaprod          = _mm256_mul_ps(isai0,isaj0);
 +            gbqqfactor       = _mm256_xor_ps(signbit,_mm256_mul_ps(qq00,_mm256_mul_ps(isaprod,gbinvepsdiff)));
 +            gbscale          = _mm256_mul_ps(isaprod,gbtabscale);
 +
 +            /* Calculate generalized born table index - this is a separate table from the normal one,
 +             * but we use the same procedure by multiplying r with scale and truncating to integer.
 +             */
 +            rt               = _mm256_mul_ps(r00,gbscale);
 +            gbitab           = _mm256_cvttps_epi32(rt);
 +            gbeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
 +            /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
 +            gbitab_lo        = _mm256_extractf128_si256(gbitab,0x0);
 +            gbitab_hi        = _mm256_extractf128_si256(gbitab,0x1);
 +            gbitab_lo        = _mm_slli_epi32(gbitab_lo,2);
 +            gbitab_hi        = _mm_slli_epi32(gbitab_hi,2);
 +            Y                = gmx_mm256_set_m128(_mm_load_ps(gbtab + _mm_extract_epi32(gbitab_hi,0)),
 +                                                  _mm_load_ps(gbtab + _mm_extract_epi32(gbitab_lo,0)));
 +            F                = gmx_mm256_set_m128(_mm_load_ps(gbtab + _mm_extract_epi32(gbitab_hi,1)),
 +                                                  _mm_load_ps(gbtab + _mm_extract_epi32(gbitab_lo,1)));
 +            G                = gmx_mm256_set_m128(_mm_load_ps(gbtab + _mm_extract_epi32(gbitab_hi,2)),
 +                                                  _mm_load_ps(gbtab + _mm_extract_epi32(gbitab_lo,2)));
 +            H                = gmx_mm256_set_m128(_mm_load_ps(gbtab + _mm_extract_epi32(gbitab_hi,3)),
 +                                                  _mm_load_ps(gbtab + _mm_extract_epi32(gbitab_lo,3)));
 +            GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
 +            Heps             = _mm256_mul_ps(gbeps,H);
 +            Fp               = _mm256_add_ps(F,_mm256_mul_ps(gbeps,_mm256_add_ps(G,Heps)));
 +            VV               = _mm256_add_ps(Y,_mm256_mul_ps(gbeps,Fp));
 +            vgb              = _mm256_mul_ps(gbqqfactor,VV);
 +
 +            FF               = _mm256_add_ps(Fp,_mm256_mul_ps(gbeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
 +            fgb              = _mm256_mul_ps(gbqqfactor,_mm256_mul_ps(FF,gbscale));
 +            dvdatmp          = _mm256_mul_ps(minushalf,_mm256_add_ps(vgb,_mm256_mul_ps(fgb,r00)));
 +            dvdasum          = _mm256_add_ps(dvdasum,dvdatmp);
 +            fjptrA           = dvda+jnrA;
 +            fjptrB           = dvda+jnrB;
 +            fjptrC           = dvda+jnrC;
 +            fjptrD           = dvda+jnrD;
 +            fjptrE           = dvda+jnrE;
 +            fjptrF           = dvda+jnrF;
 +            fjptrG           = dvda+jnrG;
 +            fjptrH           = dvda+jnrH;
 +            gmx_mm256_increment_8real_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
 +                                                 _mm256_mul_ps(dvdatmp,_mm256_mul_ps(isaj0,isaj0)));
 +            velec            = _mm256_mul_ps(qq00,rinv00);
 +            felec            = _mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(velec,rinv00),fgb),rinv00);
 +
 +            /* LENNARD-JONES DISPERSION/REPULSION */
 +
 +            rinvsix          = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
 +            fvdw             = _mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(c12_00,rinvsix),c6_00),_mm256_mul_ps(rinvsix,rinvsq00));
 +
 +            fscal            = _mm256_add_ps(felec,fvdw);
 +
 +            /* Calculate temporary vectorial force */
 +            tx               = _mm256_mul_ps(fscal,dx00);
 +            ty               = _mm256_mul_ps(fscal,dy00);
 +            tz               = _mm256_mul_ps(fscal,dz00);
 +
 +            /* Update vectorial force */
 +            fix0             = _mm256_add_ps(fix0,tx);
 +            fiy0             = _mm256_add_ps(fiy0,ty);
 +            fiz0             = _mm256_add_ps(fiz0,tz);
 +
 +            fjptrA             = f+j_coord_offsetA;
 +            fjptrB             = f+j_coord_offsetB;
 +            fjptrC             = f+j_coord_offsetC;
 +            fjptrD             = f+j_coord_offsetD;
 +            fjptrE             = f+j_coord_offsetE;
 +            fjptrF             = f+j_coord_offsetF;
 +            fjptrG             = f+j_coord_offsetG;
 +            fjptrH             = f+j_coord_offsetH;
 +            gmx_mm256_decrement_1rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,tx,ty,tz);
 +
 +            /* Inner loop uses 63 flops */
 +        }
 +
 +        if(jidx<j_index_end)
 +        {
 +
 +            /* Get j neighbor index, and coordinate index */
 +            jnrlistA         = jjnr[jidx];
 +            jnrlistB         = jjnr[jidx+1];
 +            jnrlistC         = jjnr[jidx+2];
 +            jnrlistD         = jjnr[jidx+3];
 +            jnrlistE         = jjnr[jidx+4];
 +            jnrlistF         = jjnr[jidx+5];
 +            jnrlistG         = jjnr[jidx+6];
 +            jnrlistH         = jjnr[jidx+7];
 +            /* Sign of each element will be negative for non-real atoms.
 +             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
 +             * so use it as val = _mm256_andnot_ps(mask,val) to clear dummy entries.
 +             */
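 +            /* Note: the two unaligned 128-bit loads and signed compares against zero build
 +             * the 8-lane mask without requiring 256-bit integer operations on AVX1.
 +             */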
 +            dummy_mask = gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx+4)),_mm_setzero_si128())),
 +                                            gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128())));
 +
 +            jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
 +            jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
 +            jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
 +            jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
 +            jnrE       = (jnrlistE>=0) ? jnrlistE : 0;
 +            jnrF       = (jnrlistF>=0) ? jnrlistF : 0;
 +            jnrG       = (jnrlistG>=0) ? jnrlistG : 0;
 +            jnrH       = (jnrlistH>=0) ? jnrlistH : 0;
 +            j_coord_offsetA  = DIM*jnrA;
 +            j_coord_offsetB  = DIM*jnrB;
 +            j_coord_offsetC  = DIM*jnrC;
 +            j_coord_offsetD  = DIM*jnrD;
 +            j_coord_offsetE  = DIM*jnrE;
 +            j_coord_offsetF  = DIM*jnrF;
 +            j_coord_offsetG  = DIM*jnrG;
 +            j_coord_offsetH  = DIM*jnrH;
 +
 +            /* load j atom coordinates */
 +            gmx_mm256_load_1rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
 +                                                 x+j_coord_offsetC,x+j_coord_offsetD,
 +                                                 x+j_coord_offsetE,x+j_coord_offsetF,
 +                                                 x+j_coord_offsetG,x+j_coord_offsetH,
 +                                                 &jx0,&jy0,&jz0);
 +
 +            /* Calculate displacement vector */
 +            dx00             = _mm256_sub_ps(ix0,jx0);
 +            dy00             = _mm256_sub_ps(iy0,jy0);
 +            dz00             = _mm256_sub_ps(iz0,jz0);
 +
 +            /* Calculate squared distance and things based on it */
 +            rsq00            = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
 +
 +            rinv00           = gmx_mm256_invsqrt_ps(rsq00);
 +
 +            rinvsq00         = _mm256_mul_ps(rinv00,rinv00);
 +
 +            /* Load parameters for j particles */
 +            jq0              = gmx_mm256_load_8real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
 +                                                                 charge+jnrC+0,charge+jnrD+0,
 +                                                                 charge+jnrE+0,charge+jnrF+0,
 +                                                                 charge+jnrG+0,charge+jnrH+0);
 +            isaj0            = gmx_mm256_load_8real_swizzle_ps(invsqrta+jnrA+0,invsqrta+jnrB+0,
 +                                                                 invsqrta+jnrC+0,invsqrta+jnrD+0,
 +                                                                 invsqrta+jnrE+0,invsqrta+jnrF+0,
 +                                                                 invsqrta+jnrG+0,invsqrta+jnrH+0);
 +            vdwjidx0A        = 2*vdwtype[jnrA+0];
 +            vdwjidx0B        = 2*vdwtype[jnrB+0];
 +            vdwjidx0C        = 2*vdwtype[jnrC+0];
 +            vdwjidx0D        = 2*vdwtype[jnrD+0];
 +            vdwjidx0E        = 2*vdwtype[jnrE+0];
 +            vdwjidx0F        = 2*vdwtype[jnrF+0];
 +            vdwjidx0G        = 2*vdwtype[jnrG+0];
 +            vdwjidx0H        = 2*vdwtype[jnrH+0];
 +
 +            /**************************
 +             * CALCULATE INTERACTIONS *
 +             **************************/
 +
 +            r00              = _mm256_mul_ps(rsq00,rinv00);
 +            r00              = _mm256_andnot_ps(dummy_mask,r00);
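 +            /* Zeroing r00 for dummy lanes keeps the derived table index at zero, so the
 +             * gbtab loads below stay in bounds even for padded neighbor entries.
 +             */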
 +
 +            /* Compute parameters for interactions between i and j atoms */
 +            qq00             = _mm256_mul_ps(iq0,jq0);
 +            gmx_mm256_load_8pair_swizzle_ps(vdwioffsetptr0+vdwjidx0A,
 +                                            vdwioffsetptr0+vdwjidx0B,
 +                                            vdwioffsetptr0+vdwjidx0C,
 +                                            vdwioffsetptr0+vdwjidx0D,
 +                                            vdwioffsetptr0+vdwjidx0E,
 +                                            vdwioffsetptr0+vdwjidx0F,
 +                                            vdwioffsetptr0+vdwjidx0G,
 +                                            vdwioffsetptr0+vdwjidx0H,
 +                                            &c6_00,&c12_00);
 +
 +            /* GENERALIZED BORN AND COULOMB ELECTROSTATICS */
 +            isaprod          = _mm256_mul_ps(isai0,isaj0);
 +            gbqqfactor       = _mm256_xor_ps(signbit,_mm256_mul_ps(qq00,_mm256_mul_ps(isaprod,gbinvepsdiff)));
 +            gbscale          = _mm256_mul_ps(isaprod,gbtabscale);
 +
 +            /* Calculate generalized born table index - this is a separate table from the normal one,
 +             * but we use the same procedure by multiplying r with scale and truncating to integer.
 +             */
 +            rt               = _mm256_mul_ps(r00,gbscale);
 +            gbitab           = _mm256_cvttps_epi32(rt);
 +            gbeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
 +            /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
 +            gbitab_lo        = _mm256_extractf128_si256(gbitab,0x0);
 +            gbitab_hi        = _mm256_extractf128_si256(gbitab,0x1);
 +            gbitab_lo        = _mm_slli_epi32(gbitab_lo,2);
 +            gbitab_hi        = _mm_slli_epi32(gbitab_hi,2);
 +            Y                = gmx_mm256_set_m128(_mm_load_ps(gbtab + _mm_extract_epi32(gbitab_hi,0)),
 +                                                  _mm_load_ps(gbtab + _mm_extract_epi32(gbitab_lo,0)));
 +            F                = gmx_mm256_set_m128(_mm_load_ps(gbtab + _mm_extract_epi32(gbitab_hi,1)),
 +                                                  _mm_load_ps(gbtab + _mm_extract_epi32(gbitab_lo,1)));
 +            G                = gmx_mm256_set_m128(_mm_load_ps(gbtab + _mm_extract_epi32(gbitab_hi,2)),
 +                                                  _mm_load_ps(gbtab + _mm_extract_epi32(gbitab_lo,2)));
 +            H                = gmx_mm256_set_m128(_mm_load_ps(gbtab + _mm_extract_epi32(gbitab_hi,3)),
 +                                                  _mm_load_ps(gbtab + _mm_extract_epi32(gbitab_lo,3)));
 +            GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
 +            Heps             = _mm256_mul_ps(gbeps,H);
 +            Fp               = _mm256_add_ps(F,_mm256_mul_ps(gbeps,_mm256_add_ps(G,Heps)));
 +            VV               = _mm256_add_ps(Y,_mm256_mul_ps(gbeps,Fp));
 +            vgb              = _mm256_mul_ps(gbqqfactor,VV);
 +
 +            FF               = _mm256_add_ps(Fp,_mm256_mul_ps(gbeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
 +            fgb              = _mm256_mul_ps(gbqqfactor,_mm256_mul_ps(FF,gbscale));
 +            dvdatmp          = _mm256_mul_ps(minushalf,_mm256_add_ps(vgb,_mm256_mul_ps(fgb,r00)));
++            dvdatmp          = _mm256_andnot_ps(dummy_mask,dvdatmp);
 +            dvdasum          = _mm256_add_ps(dvdasum,dvdatmp);
 +            /* The pointers to scratch make sure that this code with compilers that take gmx_restrict seriously (e.g. icc 13) really can't screw things up. */
 +            fjptrA             = (jnrlistA>=0) ? dvda+jnrA : scratch;
 +            fjptrB             = (jnrlistB>=0) ? dvda+jnrB : scratch;
 +            fjptrC             = (jnrlistC>=0) ? dvda+jnrC : scratch;
 +            fjptrD             = (jnrlistD>=0) ? dvda+jnrD : scratch;
 +            fjptrE             = (jnrlistE>=0) ? dvda+jnrE : scratch;
 +            fjptrF             = (jnrlistF>=0) ? dvda+jnrF : scratch;
 +            fjptrG             = (jnrlistG>=0) ? dvda+jnrG : scratch;
 +            fjptrH             = (jnrlistH>=0) ? dvda+jnrH : scratch;
 +            gmx_mm256_increment_8real_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
 +                                                 _mm256_mul_ps(dvdatmp,_mm256_mul_ps(isaj0,isaj0)));
 +            velec            = _mm256_mul_ps(qq00,rinv00);
 +            felec            = _mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(velec,rinv00),fgb),rinv00);
 +
 +            /* LENNARD-JONES DISPERSION/REPULSION */
 +
 +            rinvsix          = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
 +            fvdw             = _mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(c12_00,rinvsix),c6_00),_mm256_mul_ps(rinvsix,rinvsq00));
 +
 +            fscal            = _mm256_add_ps(felec,fvdw);
 +
 +            fscal            = _mm256_andnot_ps(dummy_mask,fscal);
 +
 +            /* Calculate temporary vectorial force */
 +            tx               = _mm256_mul_ps(fscal,dx00);
 +            ty               = _mm256_mul_ps(fscal,dy00);
 +            tz               = _mm256_mul_ps(fscal,dz00);
 +
 +            /* Update vectorial force */
 +            fix0             = _mm256_add_ps(fix0,tx);
 +            fiy0             = _mm256_add_ps(fiy0,ty);
 +            fiz0             = _mm256_add_ps(fiz0,tz);
 +
 +            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
 +            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
 +            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
 +            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
 +            fjptrE             = (jnrlistE>=0) ? f+j_coord_offsetE : scratch;
 +            fjptrF             = (jnrlistF>=0) ? f+j_coord_offsetF : scratch;
 +            fjptrG             = (jnrlistG>=0) ? f+j_coord_offsetG : scratch;
 +            fjptrH             = (jnrlistH>=0) ? f+j_coord_offsetH : scratch;
 +            gmx_mm256_decrement_1rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,tx,ty,tz);
 +
 +            /* Inner loop uses 64 flops */
 +        }
 +
 +        /* End of innermost loop */
 +
 +        gmx_mm256_update_iforce_1atom_swizzle_ps(fix0,fiy0,fiz0,
 +                                                 f+i_coord_offset,fshift+i_shift_offset);
 +
 +        dvdasum = _mm256_mul_ps(dvdasum, _mm256_mul_ps(isai0,isai0));
 +        gmx_mm256_update_1pot_ps(dvdasum,dvda+inr);
 +
 +        /* Increment number of inner iterations */
 +        inneriter                  += j_index_end - j_index_start;
 +
 +        /* Outer loop uses 7 flops */
 +    }
 +
 +    /* Increment number of outer iterations */
 +    outeriter        += nri;
 +
 +    /* Update outer/inner flops */
 +
 +    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_F,outeriter*7 + inneriter*64);
 +}
index f8bb1ffbcd249fa2e56b5771fe748e114c4881b4,0000000000000000000000000000000000000000..2f7d64e61c5a21705c94c3442faf8f140b483c6c
mode 100644,000000..100644
--- /dev/null
@@@ -1,896 -1,0 +1,898 @@@
 +/*
 + * Note: this file was generated by the Gromacs avx_256_single kernel generator.
 + *
 + *                This source code is part of
 + *
 + *                 G   R   O   M   A   C   S
 + *
 + * Copyright (c) 2001-2012, The GROMACS Development Team
 + *
 + * Gromacs is a library for molecular simulation and trajectory analysis,
 + * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
 + * a full list of developers and information, check out http://www.gromacs.org
 + *
 + * This program is free software; you can redistribute it and/or modify it under
 + * the terms of the GNU Lesser General Public License as published by the Free
 + * Software Foundation; either version 2 of the License, or (at your option) any
 + * later version.
 + *
 + * To help fund GROMACS development, we humbly ask that you cite
 + * the papers people have written on it - you can find them on the website.
 + */
 +#ifdef HAVE_CONFIG_H
 +#include <config.h>
 +#endif
 +
 +#include <math.h>
 +
 +#include "../nb_kernel.h"
 +#include "types/simple.h"
 +#include "vec.h"
 +#include "nrnb.h"
 +
 +#include "gmx_math_x86_avx_256_single.h"
 +#include "kernelutil_x86_avx_256_single.h"
 +
 +/*
 + * Gromacs nonbonded kernel:   nb_kernel_ElecGB_VdwNone_GeomP1P1_VF_avx_256_single
 + * Electrostatics interaction: GeneralizedBorn
 + * VdW interaction:            None
 + * Geometry:                   Particle-Particle
 + * Calculate force/pot:        PotentialAndForce
 + */
 +void
 +nb_kernel_ElecGB_VdwNone_GeomP1P1_VF_avx_256_single
 +                    (t_nblist * gmx_restrict                nlist,
 +                     rvec * gmx_restrict                    xx,
 +                     rvec * gmx_restrict                    ff,
 +                     t_forcerec * gmx_restrict              fr,
 +                     t_mdatoms * gmx_restrict               mdatoms,
 +                     nb_kernel_data_t * gmx_restrict        kernel_data,
 +                     t_nrnb * gmx_restrict                  nrnb)
 +{
 +    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
 +     * just 0 for non-waters.
 +     * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, e.g. for the eight different
 +     * jnr indices corresponding to data put in the eight positions in the SIMD register.
 +     */
 +    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
 +    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
 +    int              jnrA,jnrB,jnrC,jnrD;
 +    int              jnrE,jnrF,jnrG,jnrH;
 +    int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
 +    int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
 +    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
 +    int              j_coord_offsetE,j_coord_offsetF,j_coord_offsetG,j_coord_offsetH;
 +    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
 +    real             rcutoff_scalar;
 +    real             *shiftvec,*fshift,*x,*f;
 +    real             *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH;
 +    real             scratch[4*DIM];
 +    __m256           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
 +    real *           vdwioffsetptr0;
 +    __m256           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
 +    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D,vdwjidx0E,vdwjidx0F,vdwjidx0G,vdwjidx0H;
 +    __m256           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
 +    __m256           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
 +    __m256           velec,felec,velecsum,facel,crf,krf,krf2;
 +    real             *charge;
 +    __m256i          gbitab;
 +    __m128i          gbitab_lo,gbitab_hi;
 +    __m256           vgb,fgb,vgbsum,dvdasum,gbscale,gbtabscale,isaprod,gbqqfactor,gbinvepsdiff,gbeps,dvdatmp;
 +    __m256           minushalf = _mm256_set1_ps(-0.5);
 +    real             *invsqrta,*dvda,*gbtab;
 +    __m256i          vfitab;
 +    __m128i          vfitab_lo,vfitab_hi;
 +    __m128i          ifour       = _mm_set1_epi32(4);
 +    __m256           rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
 +    real             *vftab;
 +    __m256           dummy_mask,cutoff_mask;
 +    __m256           signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
 +    __m256           one     = _mm256_set1_ps(1.0);
 +    __m256           two     = _mm256_set1_ps(2.0);
 +    x                = xx[0];
 +    f                = ff[0];
 +
 +    nri              = nlist->nri;
 +    iinr             = nlist->iinr;
 +    jindex           = nlist->jindex;
 +    jjnr             = nlist->jjnr;
 +    shiftidx         = nlist->shift;
 +    gid              = nlist->gid;
 +    shiftvec         = fr->shift_vec[0];
 +    fshift           = fr->fshift[0];
 +    facel            = _mm256_set1_ps(fr->epsfac);
 +    charge           = mdatoms->chargeA;
 +
 +    invsqrta         = fr->invsqrta;
 +    dvda             = fr->dvda;
 +    gbtabscale       = _mm256_set1_ps(fr->gbtab.scale);
 +    gbtab            = fr->gbtab.data;
 +    gbinvepsdiff     = _mm256_set1_ps((1.0/fr->epsilon_r) - (1.0/fr->gb_epsilon_solvent));
 +
 +    /* Avoid stupid compiler warnings */
 +    jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0;
 +    j_coord_offsetA = 0;
 +    j_coord_offsetB = 0;
 +    j_coord_offsetC = 0;
 +    j_coord_offsetD = 0;
 +    j_coord_offsetE = 0;
 +    j_coord_offsetF = 0;
 +    j_coord_offsetG = 0;
 +    j_coord_offsetH = 0;
 +
 +    outeriter        = 0;
 +    inneriter        = 0;
 +
 +    for(iidx=0;iidx<4*DIM;iidx++)
 +    {
 +        scratch[iidx] = 0.0;
 +    }
 +
 +    /* Start outer loop over neighborlists */
 +    for(iidx=0; iidx<nri; iidx++)
 +    {
 +        /* Load shift vector for this list */
 +        i_shift_offset   = DIM*shiftidx[iidx];
 +
 +        /* Load limits for loop over neighbors */
 +        j_index_start    = jindex[iidx];
 +        j_index_end      = jindex[iidx+1];
 +
 +        /* Get outer coordinate index */
 +        inr              = iinr[iidx];
 +        i_coord_offset   = DIM*inr;
 +
 +        /* Load i particle coords and add shift vector */
 +        gmx_mm256_load_shift_and_1rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
 +
 +        fix0             = _mm256_setzero_ps();
 +        fiy0             = _mm256_setzero_ps();
 +        fiz0             = _mm256_setzero_ps();
 +
 +        /* Load parameters for i particles */
 +        iq0              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+0]));
 +        isai0            = _mm256_set1_ps(invsqrta[inr+0]);
 +
 +        /* Reset potential sums */
 +        velecsum         = _mm256_setzero_ps();
 +        vgbsum           = _mm256_setzero_ps();
 +        dvdasum          = _mm256_setzero_ps();
 +
 +        /* Start inner kernel loop */
 +        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+7]>=0; jidx+=8)
 +        {
 +
 +            /* Get j neighbor index, and coordinate index */
 +            jnrA             = jjnr[jidx];
 +            jnrB             = jjnr[jidx+1];
 +            jnrC             = jjnr[jidx+2];
 +            jnrD             = jjnr[jidx+3];
 +            jnrE             = jjnr[jidx+4];
 +            jnrF             = jjnr[jidx+5];
 +            jnrG             = jjnr[jidx+6];
 +            jnrH             = jjnr[jidx+7];
 +            j_coord_offsetA  = DIM*jnrA;
 +            j_coord_offsetB  = DIM*jnrB;
 +            j_coord_offsetC  = DIM*jnrC;
 +            j_coord_offsetD  = DIM*jnrD;
 +            j_coord_offsetE  = DIM*jnrE;
 +            j_coord_offsetF  = DIM*jnrF;
 +            j_coord_offsetG  = DIM*jnrG;
 +            j_coord_offsetH  = DIM*jnrH;
 +
 +            /* load j atom coordinates */
 +            gmx_mm256_load_1rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
 +                                                 x+j_coord_offsetC,x+j_coord_offsetD,
 +                                                 x+j_coord_offsetE,x+j_coord_offsetF,
 +                                                 x+j_coord_offsetG,x+j_coord_offsetH,
 +                                                 &jx0,&jy0,&jz0);
 +
 +            /* Calculate displacement vector */
 +            dx00             = _mm256_sub_ps(ix0,jx0);
 +            dy00             = _mm256_sub_ps(iy0,jy0);
 +            dz00             = _mm256_sub_ps(iz0,jz0);
 +
 +            /* Calculate squared distance and things based on it */
 +            rsq00            = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
 +
 +            rinv00           = gmx_mm256_invsqrt_ps(rsq00);
 +
 +            /* Load parameters for j particles */
 +            jq0              = gmx_mm256_load_8real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
 +                                                                 charge+jnrC+0,charge+jnrD+0,
 +                                                                 charge+jnrE+0,charge+jnrF+0,
 +                                                                 charge+jnrG+0,charge+jnrH+0);
 +            isaj0            = gmx_mm256_load_8real_swizzle_ps(invsqrta+jnrA+0,invsqrta+jnrB+0,
 +                                                                 invsqrta+jnrC+0,invsqrta+jnrD+0,
 +                                                                 invsqrta+jnrE+0,invsqrta+jnrF+0,
 +                                                                 invsqrta+jnrG+0,invsqrta+jnrH+0);
 +
 +            /**************************
 +             * CALCULATE INTERACTIONS *
 +             **************************/
 +
 +            r00              = _mm256_mul_ps(rsq00,rinv00);
 +
 +            /* Compute parameters for interactions between i and j atoms */
 +            qq00             = _mm256_mul_ps(iq0,jq0);
 +
 +            /* GENERALIZED BORN AND COULOMB ELECTROSTATICS */
 +            isaprod          = _mm256_mul_ps(isai0,isaj0);
 +            gbqqfactor       = _mm256_xor_ps(signbit,_mm256_mul_ps(qq00,_mm256_mul_ps(isaprod,gbinvepsdiff)));
 +            gbscale          = _mm256_mul_ps(isaprod,gbtabscale);
 +
 +            /* Calculate generalized born table index - this is a separate table from the normal one,
 +             * but we use the same procedure by multiplying r with scale and truncating to integer.
 +             */
 +            rt               = _mm256_mul_ps(r00,gbscale);
 +            gbitab           = _mm256_cvttps_epi32(rt);
 +            gbeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
 +            /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
 +            gbitab_lo        = _mm256_extractf128_si256(gbitab,0x0);
 +            gbitab_hi        = _mm256_extractf128_si256(gbitab,0x1);
 +            gbitab_lo        = _mm_slli_epi32(gbitab_lo,2);
 +            gbitab_hi        = _mm_slli_epi32(gbitab_hi,2);
 +            Y                = gmx_mm256_set_m128(_mm_load_ps(gbtab + _mm_extract_epi32(gbitab_hi,0)),
 +                                                  _mm_load_ps(gbtab + _mm_extract_epi32(gbitab_lo,0)));
 +            F                = gmx_mm256_set_m128(_mm_load_ps(gbtab + _mm_extract_epi32(gbitab_hi,1)),
 +                                                  _mm_load_ps(gbtab + _mm_extract_epi32(gbitab_lo,1)));
 +            G                = gmx_mm256_set_m128(_mm_load_ps(gbtab + _mm_extract_epi32(gbitab_hi,2)),
 +                                                  _mm_load_ps(gbtab + _mm_extract_epi32(gbitab_lo,2)));
 +            H                = gmx_mm256_set_m128(_mm_load_ps(gbtab + _mm_extract_epi32(gbitab_hi,3)),
 +                                                  _mm_load_ps(gbtab + _mm_extract_epi32(gbitab_lo,3)));
 +            GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
 +            Heps             = _mm256_mul_ps(gbeps,H);
 +            Fp               = _mm256_add_ps(F,_mm256_mul_ps(gbeps,_mm256_add_ps(G,Heps)));
 +            VV               = _mm256_add_ps(Y,_mm256_mul_ps(gbeps,Fp));
 +            vgb              = _mm256_mul_ps(gbqqfactor,VV);
 +
 +            FF               = _mm256_add_ps(Fp,_mm256_mul_ps(gbeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
 +            fgb              = _mm256_mul_ps(gbqqfactor,_mm256_mul_ps(FF,gbscale));
 +            dvdatmp          = _mm256_mul_ps(minushalf,_mm256_add_ps(vgb,_mm256_mul_ps(fgb,r00)));
 +            dvdasum          = _mm256_add_ps(dvdasum,dvdatmp);
 +            fjptrA           = dvda+jnrA;
 +            fjptrB           = dvda+jnrB;
 +            fjptrC           = dvda+jnrC;
 +            fjptrD           = dvda+jnrD;
 +            fjptrE           = dvda+jnrE;
 +            fjptrF           = dvda+jnrF;
 +            fjptrG           = dvda+jnrG;
 +            fjptrH           = dvda+jnrH;
 +            gmx_mm256_increment_8real_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
 +                                                 _mm256_mul_ps(dvdatmp,_mm256_mul_ps(isaj0,isaj0)));
 +            velec            = _mm256_mul_ps(qq00,rinv00);
 +            felec            = _mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(velec,rinv00),fgb),rinv00);
 +
 +            /* Update potential sum for this i atom from the interaction with this j atom. */
 +            velecsum         = _mm256_add_ps(velecsum,velec);
 +            vgbsum           = _mm256_add_ps(vgbsum,vgb);
 +
 +            fscal            = felec;
 +
 +            /* Calculate temporary vectorial force */
 +            tx               = _mm256_mul_ps(fscal,dx00);
 +            ty               = _mm256_mul_ps(fscal,dy00);
 +            tz               = _mm256_mul_ps(fscal,dz00);
 +
 +            /* Update vectorial force */
 +            fix0             = _mm256_add_ps(fix0,tx);
 +            fiy0             = _mm256_add_ps(fiy0,ty);
 +            fiz0             = _mm256_add_ps(fiz0,tz);
 +
 +            fjptrA             = f+j_coord_offsetA;
 +            fjptrB             = f+j_coord_offsetB;
 +            fjptrC             = f+j_coord_offsetC;
 +            fjptrD             = f+j_coord_offsetD;
 +            fjptrE             = f+j_coord_offsetE;
 +            fjptrF             = f+j_coord_offsetF;
 +            fjptrG             = f+j_coord_offsetG;
 +            fjptrH             = f+j_coord_offsetH;
 +            gmx_mm256_decrement_1rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,tx,ty,tz);
 +
 +            /* Inner loop uses 57 flops */
 +        }
 +
 +        if(jidx<j_index_end)
 +        {
 +
 +            /* Get j neighbor index, and coordinate index */
 +            jnrlistA         = jjnr[jidx];
 +            jnrlistB         = jjnr[jidx+1];
 +            jnrlistC         = jjnr[jidx+2];
 +            jnrlistD         = jjnr[jidx+3];
 +            jnrlistE         = jjnr[jidx+4];
 +            jnrlistF         = jjnr[jidx+5];
 +            jnrlistG         = jjnr[jidx+6];
 +            jnrlistH         = jjnr[jidx+7];
 +            /* Sign of each element will be negative for non-real atoms.
 +             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
 +             * so use it as val = _mm256_andnot_ps(mask,val) to clear dummy entries.
 +             */
 +            dummy_mask = gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx+4)),_mm_setzero_si128())),
 +                                            gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128())));
 +
 +            jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
 +            jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
 +            jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
 +            jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
 +            jnrE       = (jnrlistE>=0) ? jnrlistE : 0;
 +            jnrF       = (jnrlistF>=0) ? jnrlistF : 0;
 +            jnrG       = (jnrlistG>=0) ? jnrlistG : 0;
 +            jnrH       = (jnrlistH>=0) ? jnrlistH : 0;
 +            j_coord_offsetA  = DIM*jnrA;
 +            j_coord_offsetB  = DIM*jnrB;
 +            j_coord_offsetC  = DIM*jnrC;
 +            j_coord_offsetD  = DIM*jnrD;
 +            j_coord_offsetE  = DIM*jnrE;
 +            j_coord_offsetF  = DIM*jnrF;
 +            j_coord_offsetG  = DIM*jnrG;
 +            j_coord_offsetH  = DIM*jnrH;
 +
 +            /* load j atom coordinates */
 +            gmx_mm256_load_1rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
 +                                                 x+j_coord_offsetC,x+j_coord_offsetD,
 +                                                 x+j_coord_offsetE,x+j_coord_offsetF,
 +                                                 x+j_coord_offsetG,x+j_coord_offsetH,
 +                                                 &jx0,&jy0,&jz0);
 +
 +            /* Calculate displacement vector */
 +            dx00             = _mm256_sub_ps(ix0,jx0);
 +            dy00             = _mm256_sub_ps(iy0,jy0);
 +            dz00             = _mm256_sub_ps(iz0,jz0);
 +
 +            /* Calculate squared distance and things based on it */
 +            rsq00            = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
 +
 +            rinv00           = gmx_mm256_invsqrt_ps(rsq00);
 +
 +            /* Load parameters for j particles */
 +            jq0              = gmx_mm256_load_8real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
 +                                                                 charge+jnrC+0,charge+jnrD+0,
 +                                                                 charge+jnrE+0,charge+jnrF+0,
 +                                                                 charge+jnrG+0,charge+jnrH+0);
 +            isaj0            = gmx_mm256_load_8real_swizzle_ps(invsqrta+jnrA+0,invsqrta+jnrB+0,
 +                                                                 invsqrta+jnrC+0,invsqrta+jnrD+0,
 +                                                                 invsqrta+jnrE+0,invsqrta+jnrF+0,
 +                                                                 invsqrta+jnrG+0,invsqrta+jnrH+0);
 +
 +            /**************************
 +             * CALCULATE INTERACTIONS *
 +             **************************/
 +
 +            r00              = _mm256_mul_ps(rsq00,rinv00);
 +            r00              = _mm256_andnot_ps(dummy_mask,r00);
 +
 +            /* Compute parameters for interactions between i and j atoms */
 +            qq00             = _mm256_mul_ps(iq0,jq0);
 +
 +            /* GENERALIZED BORN AND COULOMB ELECTROSTATICS */
 +            isaprod          = _mm256_mul_ps(isai0,isaj0);
 +            gbqqfactor       = _mm256_xor_ps(signbit,_mm256_mul_ps(qq00,_mm256_mul_ps(isaprod,gbinvepsdiff)));
 +            gbscale          = _mm256_mul_ps(isaprod,gbtabscale);
 +
 +            /* Calculate generalized born table index - this is a separate table from the normal one,
 +             * but we use the same procedure by multiplying r with scale and truncating to integer.
 +             */
 +            rt               = _mm256_mul_ps(r00,gbscale);
 +            gbitab           = _mm256_cvttps_epi32(rt);
 +            gbeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
 +            /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
 +            gbitab_lo        = _mm256_extractf128_si256(gbitab,0x0);
 +            gbitab_hi        = _mm256_extractf128_si256(gbitab,0x1);
 +            gbitab_lo        = _mm_slli_epi32(gbitab_lo,2);
 +            gbitab_hi        = _mm_slli_epi32(gbitab_hi,2);
 +            Y                = gmx_mm256_set_m128(_mm_load_ps(gbtab + _mm_extract_epi32(gbitab_hi,0)),
 +                                                  _mm_load_ps(gbtab + _mm_extract_epi32(gbitab_lo,0)));
 +            F                = gmx_mm256_set_m128(_mm_load_ps(gbtab + _mm_extract_epi32(gbitab_hi,1)),
 +                                                  _mm_load_ps(gbtab + _mm_extract_epi32(gbitab_lo,1)));
 +            G                = gmx_mm256_set_m128(_mm_load_ps(gbtab + _mm_extract_epi32(gbitab_hi,2)),
 +                                                  _mm_load_ps(gbtab + _mm_extract_epi32(gbitab_lo,2)));
 +            H                = gmx_mm256_set_m128(_mm_load_ps(gbtab + _mm_extract_epi32(gbitab_hi,3)),
 +                                                  _mm_load_ps(gbtab + _mm_extract_epi32(gbitab_lo,3)));
 +            GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
 +            Heps             = _mm256_mul_ps(gbeps,H);
 +            Fp               = _mm256_add_ps(F,_mm256_mul_ps(gbeps,_mm256_add_ps(G,Heps)));
 +            VV               = _mm256_add_ps(Y,_mm256_mul_ps(gbeps,Fp));
 +            vgb              = _mm256_mul_ps(gbqqfactor,VV);
 +
 +            FF               = _mm256_add_ps(Fp,_mm256_mul_ps(gbeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
 +            fgb              = _mm256_mul_ps(gbqqfactor,_mm256_mul_ps(FF,gbscale));
 +            dvdatmp          = _mm256_mul_ps(minushalf,_mm256_add_ps(vgb,_mm256_mul_ps(fgb,r00)));
++            dvdatmp          = _mm256_andnot_ps(dummy_mask,dvdatmp);
 +            dvdasum          = _mm256_add_ps(dvdasum,dvdatmp);
 +            /* The pointers to scratch make sure that this code with compilers that take gmx_restrict seriously (e.g. icc 13) really can't screw things up. */
 +            fjptrA             = (jnrlistA>=0) ? dvda+jnrA : scratch;
 +            fjptrB             = (jnrlistB>=0) ? dvda+jnrB : scratch;
 +            fjptrC             = (jnrlistC>=0) ? dvda+jnrC : scratch;
 +            fjptrD             = (jnrlistD>=0) ? dvda+jnrD : scratch;
 +            fjptrE             = (jnrlistE>=0) ? dvda+jnrE : scratch;
 +            fjptrF             = (jnrlistF>=0) ? dvda+jnrF : scratch;
 +            fjptrG             = (jnrlistG>=0) ? dvda+jnrG : scratch;
 +            fjptrH             = (jnrlistH>=0) ? dvda+jnrH : scratch;
 +            gmx_mm256_increment_8real_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
 +                                                 _mm256_mul_ps(dvdatmp,_mm256_mul_ps(isaj0,isaj0)));
 +            velec            = _mm256_mul_ps(qq00,rinv00);
 +            felec            = _mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(velec,rinv00),fgb),rinv00);
 +
 +            /* Update potential sum for this i atom from the interaction with this j atom. */
 +            velec            = _mm256_andnot_ps(dummy_mask,velec);
 +            velecsum         = _mm256_add_ps(velecsum,velec);
 +            vgb              = _mm256_andnot_ps(dummy_mask,vgb);
 +            vgbsum           = _mm256_add_ps(vgbsum,vgb);
 +
 +            fscal            = felec;
 +
 +            fscal            = _mm256_andnot_ps(dummy_mask,fscal);
 +
 +            /* Calculate temporary vectorial force */
 +            tx               = _mm256_mul_ps(fscal,dx00);
 +            ty               = _mm256_mul_ps(fscal,dy00);
 +            tz               = _mm256_mul_ps(fscal,dz00);
 +
 +            /* Update vectorial force */
 +            fix0             = _mm256_add_ps(fix0,tx);
 +            fiy0             = _mm256_add_ps(fiy0,ty);
 +            fiz0             = _mm256_add_ps(fiz0,tz);
 +
 +            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
 +            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
 +            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
 +            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
 +            fjptrE             = (jnrlistE>=0) ? f+j_coord_offsetE : scratch;
 +            fjptrF             = (jnrlistF>=0) ? f+j_coord_offsetF : scratch;
 +            fjptrG             = (jnrlistG>=0) ? f+j_coord_offsetG : scratch;
 +            fjptrH             = (jnrlistH>=0) ? f+j_coord_offsetH : scratch;
 +            gmx_mm256_decrement_1rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,tx,ty,tz);
 +
 +            /* Inner loop uses 58 flops */
 +        }
 +
 +        /* End of innermost loop */
 +
 +        gmx_mm256_update_iforce_1atom_swizzle_ps(fix0,fiy0,fiz0,
 +                                                 f+i_coord_offset,fshift+i_shift_offset);
 +
 +        ggid                        = gid[iidx];
 +        /* Update potential energies */
 +        gmx_mm256_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
 +        gmx_mm256_update_1pot_ps(vgbsum,kernel_data->energygrp_polarization+ggid);
 +        dvdasum = _mm256_mul_ps(dvdasum, _mm256_mul_ps(isai0,isai0));
 +        gmx_mm256_update_1pot_ps(dvdasum,dvda+inr);
 +
 +        /* Increment number of inner iterations */
 +        inneriter                  += j_index_end - j_index_start;
 +
 +        /* Outer loop uses 9 flops */
 +    }
 +
 +    /* Increment number of outer iterations */
 +    outeriter        += nri;
 +
 +    /* Update outer/inner flops */
 +
 +    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VF,outeriter*9 + inneriter*58);
 +}
 +/*
 + * Gromacs nonbonded kernel:   nb_kernel_ElecGB_VdwNone_GeomP1P1_F_avx_256_single
 + * Electrostatics interaction: GeneralizedBorn
 + * VdW interaction:            None
 + * Geometry:                   Particle-Particle
 + * Calculate force/pot:        Force
 + */
 +void
 +nb_kernel_ElecGB_VdwNone_GeomP1P1_F_avx_256_single
 +                    (t_nblist * gmx_restrict                nlist,
 +                     rvec * gmx_restrict                    xx,
 +                     rvec * gmx_restrict                    ff,
 +                     t_forcerec * gmx_restrict              fr,
 +                     t_mdatoms * gmx_restrict               mdatoms,
 +                     nb_kernel_data_t * gmx_restrict        kernel_data,
 +                     t_nrnb * gmx_restrict                  nrnb)
 +{
 +    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
 +     * just 0 for non-waters.
 +     * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, e.g. for the eight different
 +     * jnr indices corresponding to data put in the eight positions in the SIMD register.
 +     */
 +    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
 +    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
 +    int              jnrA,jnrB,jnrC,jnrD;
 +    int              jnrE,jnrF,jnrG,jnrH;
 +    int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
 +    int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
 +    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
 +    int              j_coord_offsetE,j_coord_offsetF,j_coord_offsetG,j_coord_offsetH;
 +    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
 +    real             rcutoff_scalar;
 +    real             *shiftvec,*fshift,*x,*f;
 +    real             *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH;
 +    real             scratch[4*DIM];
 +    __m256           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
 +    real *           vdwioffsetptr0;
 +    __m256           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
 +    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D,vdwjidx0E,vdwjidx0F,vdwjidx0G,vdwjidx0H;
 +    __m256           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
 +    __m256           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
 +    __m256           velec,felec,velecsum,facel,crf,krf,krf2;
 +    real             *charge;
 +    __m256i          gbitab;
 +    __m128i          gbitab_lo,gbitab_hi;
 +    __m256           vgb,fgb,vgbsum,dvdasum,gbscale,gbtabscale,isaprod,gbqqfactor,gbinvepsdiff,gbeps,dvdatmp;
 +    __m256           minushalf = _mm256_set1_ps(-0.5);
 +    real             *invsqrta,*dvda,*gbtab;
 +    __m256i          vfitab;
 +    __m128i          vfitab_lo,vfitab_hi;
 +    __m128i          ifour       = _mm_set1_epi32(4);
 +    __m256           rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
 +    real             *vftab;
 +    __m256           dummy_mask,cutoff_mask;
 +    __m256           signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
 +    __m256           one     = _mm256_set1_ps(1.0);
 +    __m256           two     = _mm256_set1_ps(2.0);
 +    x                = xx[0];
 +    f                = ff[0];
 +
 +    nri              = nlist->nri;
 +    iinr             = nlist->iinr;
 +    jindex           = nlist->jindex;
 +    jjnr             = nlist->jjnr;
 +    shiftidx         = nlist->shift;
 +    gid              = nlist->gid;
 +    shiftvec         = fr->shift_vec[0];
 +    fshift           = fr->fshift[0];
 +    facel            = _mm256_set1_ps(fr->epsfac);
 +    charge           = mdatoms->chargeA;
 +
 +    invsqrta         = fr->invsqrta;
 +    dvda             = fr->dvda;
 +    gbtabscale       = _mm256_set1_ps(fr->gbtab.scale);
 +    gbtab            = fr->gbtab.data;
 +    gbinvepsdiff     = _mm256_set1_ps((1.0/fr->epsilon_r) - (1.0/fr->gb_epsilon_solvent));
 +
 +    /* Avoid stupid compiler warnings */
 +    jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0;
 +    j_coord_offsetA = 0;
 +    j_coord_offsetB = 0;
 +    j_coord_offsetC = 0;
 +    j_coord_offsetD = 0;
 +    j_coord_offsetE = 0;
 +    j_coord_offsetF = 0;
 +    j_coord_offsetG = 0;
 +    j_coord_offsetH = 0;
 +
 +    outeriter        = 0;
 +    inneriter        = 0;
 +
 +    for(iidx=0;iidx<4*DIM;iidx++)
 +    {
 +        scratch[iidx] = 0.0;
 +    }
 +
 +    /* Start outer loop over neighborlists */
 +    for(iidx=0; iidx<nri; iidx++)
 +    {
 +        /* Load shift vector for this list */
 +        i_shift_offset   = DIM*shiftidx[iidx];
 +
 +        /* Load limits for loop over neighbors */
 +        j_index_start    = jindex[iidx];
 +        j_index_end      = jindex[iidx+1];
 +
 +        /* Get outer coordinate index */
 +        inr              = iinr[iidx];
 +        i_coord_offset   = DIM*inr;
 +
 +        /* Load i particle coords and add shift vector */
 +        gmx_mm256_load_shift_and_1rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
 +
 +        fix0             = _mm256_setzero_ps();
 +        fiy0             = _mm256_setzero_ps();
 +        fiz0             = _mm256_setzero_ps();
 +
 +        /* Load parameters for i particles */
 +        iq0              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+0]));
 +        isai0            = _mm256_set1_ps(invsqrta[inr+0]);
 +
 +        dvdasum          = _mm256_setzero_ps();
 +
 +        /* Start inner kernel loop */
 +        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+7]>=0; jidx+=8)
 +        {
 +
 +            /* Get j neighbor index, and coordinate index */
 +            jnrA             = jjnr[jidx];
 +            jnrB             = jjnr[jidx+1];
 +            jnrC             = jjnr[jidx+2];
 +            jnrD             = jjnr[jidx+3];
 +            jnrE             = jjnr[jidx+4];
 +            jnrF             = jjnr[jidx+5];
 +            jnrG             = jjnr[jidx+6];
 +            jnrH             = jjnr[jidx+7];
 +            j_coord_offsetA  = DIM*jnrA;
 +            j_coord_offsetB  = DIM*jnrB;
 +            j_coord_offsetC  = DIM*jnrC;
 +            j_coord_offsetD  = DIM*jnrD;
 +            j_coord_offsetE  = DIM*jnrE;
 +            j_coord_offsetF  = DIM*jnrF;
 +            j_coord_offsetG  = DIM*jnrG;
 +            j_coord_offsetH  = DIM*jnrH;
 +
 +            /* load j atom coordinates */
 +            gmx_mm256_load_1rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
 +                                                 x+j_coord_offsetC,x+j_coord_offsetD,
 +                                                 x+j_coord_offsetE,x+j_coord_offsetF,
 +                                                 x+j_coord_offsetG,x+j_coord_offsetH,
 +                                                 &jx0,&jy0,&jz0);
 +
 +            /* Calculate displacement vector */
 +            dx00             = _mm256_sub_ps(ix0,jx0);
 +            dy00             = _mm256_sub_ps(iy0,jy0);
 +            dz00             = _mm256_sub_ps(iz0,jz0);
 +
 +            /* Calculate squared distance and things based on it */
 +            rsq00            = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
 +
 +            rinv00           = gmx_mm256_invsqrt_ps(rsq00);
 +
 +            /* Load parameters for j particles */
 +            jq0              = gmx_mm256_load_8real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
 +                                                                 charge+jnrC+0,charge+jnrD+0,
 +                                                                 charge+jnrE+0,charge+jnrF+0,
 +                                                                 charge+jnrG+0,charge+jnrH+0);
 +            isaj0            = gmx_mm256_load_8real_swizzle_ps(invsqrta+jnrA+0,invsqrta+jnrB+0,
 +                                                                 invsqrta+jnrC+0,invsqrta+jnrD+0,
 +                                                                 invsqrta+jnrE+0,invsqrta+jnrF+0,
 +                                                                 invsqrta+jnrG+0,invsqrta+jnrH+0);
 +
 +            /**************************
 +             * CALCULATE INTERACTIONS *
 +             **************************/
 +
 +            r00              = _mm256_mul_ps(rsq00,rinv00);
 +
 +            /* Compute parameters for interactions between i and j atoms */
 +            qq00             = _mm256_mul_ps(iq0,jq0);
 +
 +            /* GENERALIZED BORN AND COULOMB ELECTROSTATICS */
 +            isaprod          = _mm256_mul_ps(isai0,isaj0);
 +            gbqqfactor       = _mm256_xor_ps(signbit,_mm256_mul_ps(qq00,_mm256_mul_ps(isaprod,gbinvepsdiff)));
 +            gbscale          = _mm256_mul_ps(isaprod,gbtabscale);
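 +            /* (Reading the arithmetic above: isaprod = isai*isaj, presumably the
 +             * product of the inverse square roots of the i/j Born radii; it scales
 +             * the Coulomb prefactor via gbqqfactor = -qq*isaprod*(1/eps_r - 1/eps_solvent)
 +             * and sets the per-pair table resolution through gbscale, so the GB
 +             * table below is effectively indexed by r*isaprod.) */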
 +
 +            /* Calculate generalized born table index - this is a separate table from the normal one,
 +             * but we use the same procedure by multiplying r with scale and truncating to integer.
 +             */
 +            rt               = _mm256_mul_ps(r00,gbscale);
 +            gbitab           = _mm256_cvttps_epi32(rt);
 +            gbeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
 +            /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
 +            gbitab_lo        = _mm256_extractf128_si256(gbitab,0x0);
 +            gbitab_hi        = _mm256_extractf128_si256(gbitab,0x1);
 +            gbitab_lo        = _mm_slli_epi32(gbitab_lo,2);
 +            gbitab_hi        = _mm_slli_epi32(gbitab_hi,2);
 +            Y                = gmx_mm256_set_m128(_mm_load_ps(gbtab + _mm_extract_epi32(gbitab_hi,0)),
 +                                                  _mm_load_ps(gbtab + _mm_extract_epi32(gbitab_lo,0)));
 +            F                = gmx_mm256_set_m128(_mm_load_ps(gbtab + _mm_extract_epi32(gbitab_hi,1)),
 +                                                  _mm_load_ps(gbtab + _mm_extract_epi32(gbitab_lo,1)));
 +            G                = gmx_mm256_set_m128(_mm_load_ps(gbtab + _mm_extract_epi32(gbitab_hi,2)),
 +                                                  _mm_load_ps(gbtab + _mm_extract_epi32(gbitab_lo,2)));
 +            H                = gmx_mm256_set_m128(_mm_load_ps(gbtab + _mm_extract_epi32(gbitab_hi,3)),
 +                                                  _mm_load_ps(gbtab + _mm_extract_epi32(gbitab_lo,3)));
 +            GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
 +            Heps             = _mm256_mul_ps(gbeps,H);
 +            Fp               = _mm256_add_ps(F,_mm256_mul_ps(gbeps,_mm256_add_ps(G,Heps)));
 +            VV               = _mm256_add_ps(Y,_mm256_mul_ps(gbeps,Fp));
 +            vgb              = _mm256_mul_ps(gbqqfactor,VV);
 +
 +            FF               = _mm256_add_ps(Fp,_mm256_mul_ps(gbeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
 +            fgb              = _mm256_mul_ps(gbqqfactor,_mm256_mul_ps(FF,gbscale));
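 +            /* (Scalar sketch of the spline evaluation above: with coefficients
 +             * Y,F,G,H and fractional offset gbeps in [0,1),
 +             *     Fp = F + gbeps*(G + gbeps*H);
 +             *     VV = Y + gbeps*Fp;                   table value
 +             *     FF = Fp + gbeps*(G + 2*gbeps*H);     derivative, used for the force
 +             * done here for eight pairs at once.) */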
 +            dvdatmp          = _mm256_mul_ps(minushalf,_mm256_add_ps(vgb,_mm256_mul_ps(fgb,r00)));
 +            dvdasum          = _mm256_add_ps(dvdasum,dvdatmp);
 +            fjptrA           = dvda+jnrA;
 +            fjptrB           = dvda+jnrB;
 +            fjptrC           = dvda+jnrC;
 +            fjptrD           = dvda+jnrD;
 +            fjptrE           = dvda+jnrE;
 +            fjptrF           = dvda+jnrF;
 +            fjptrG           = dvda+jnrG;
 +            fjptrH           = dvda+jnrH;
 +            gmx_mm256_increment_8real_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
 +                                                 _mm256_mul_ps(dvdatmp,_mm256_mul_ps(isaj0,isaj0)));
 +            velec            = _mm256_mul_ps(qq00,rinv00);
 +            felec            = _mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(velec,rinv00),fgb),rinv00);
 +
 +            fscal            = felec;
 +
 +            /* Calculate temporary vectorial force */
 +            tx               = _mm256_mul_ps(fscal,dx00);
 +            ty               = _mm256_mul_ps(fscal,dy00);
 +            tz               = _mm256_mul_ps(fscal,dz00);
 +
 +            /* Update vectorial force */
 +            fix0             = _mm256_add_ps(fix0,tx);
 +            fiy0             = _mm256_add_ps(fiy0,ty);
 +            fiz0             = _mm256_add_ps(fiz0,tz);
 +
 +            fjptrA             = f+j_coord_offsetA;
 +            fjptrB             = f+j_coord_offsetB;
 +            fjptrC             = f+j_coord_offsetC;
 +            fjptrD             = f+j_coord_offsetD;
 +            fjptrE             = f+j_coord_offsetE;
 +            fjptrF             = f+j_coord_offsetF;
 +            fjptrG             = f+j_coord_offsetG;
 +            fjptrH             = f+j_coord_offsetH;
 +            gmx_mm256_decrement_1rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,tx,ty,tz);
 +
 +            /* Inner loop uses 55 flops */
 +        }
 +
 +        if(jidx<j_index_end)
 +        {
 +
 +            /* Get j neighbor index, and coordinate index */
 +            jnrlistA         = jjnr[jidx];
 +            jnrlistB         = jjnr[jidx+1];
 +            jnrlistC         = jjnr[jidx+2];
 +            jnrlistD         = jjnr[jidx+3];
 +            jnrlistE         = jjnr[jidx+4];
 +            jnrlistF         = jjnr[jidx+5];
 +            jnrlistG         = jjnr[jidx+6];
 +            jnrlistH         = jjnr[jidx+7];
 +            /* Sign of each element will be negative for non-real atoms.
 +             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
 +             * so use it as val = _mm256_andnot_ps(mask,val) to clear dummy entries.
 +             */
 +            dummy_mask = gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx+4)),_mm_setzero_si128())),
 +                                            gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128())));
 +                                            
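 +            /* (Scalar sketch of the masking: per lane this is
 +             *     mask = (jnr < 0) ? 0xFFFFFFFF : 0x0;
 +             *     val  = (~mask) & val;    i.e. andnot zeroes the filler lanes
 +             * built from two 128-bit compares glued into one 256-bit mask.) */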
 +            jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
 +            jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
 +            jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
 +            jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
 +            jnrE       = (jnrlistE>=0) ? jnrlistE : 0;
 +            jnrF       = (jnrlistF>=0) ? jnrlistF : 0;
 +            jnrG       = (jnrlistG>=0) ? jnrlistG : 0;
 +            jnrH       = (jnrlistH>=0) ? jnrlistH : 0;
 +            j_coord_offsetA  = DIM*jnrA;
 +            j_coord_offsetB  = DIM*jnrB;
 +            j_coord_offsetC  = DIM*jnrC;
 +            j_coord_offsetD  = DIM*jnrD;
 +            j_coord_offsetE  = DIM*jnrE;
 +            j_coord_offsetF  = DIM*jnrF;
 +            j_coord_offsetG  = DIM*jnrG;
 +            j_coord_offsetH  = DIM*jnrH;
 +
 +            /* load j atom coordinates */
 +            gmx_mm256_load_1rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
 +                                                 x+j_coord_offsetC,x+j_coord_offsetD,
 +                                                 x+j_coord_offsetE,x+j_coord_offsetF,
 +                                                 x+j_coord_offsetG,x+j_coord_offsetH,
 +                                                 &jx0,&jy0,&jz0);
 +
 +            /* Calculate displacement vector */
 +            dx00             = _mm256_sub_ps(ix0,jx0);
 +            dy00             = _mm256_sub_ps(iy0,jy0);
 +            dz00             = _mm256_sub_ps(iz0,jz0);
 +
 +            /* Calculate squared distance and things based on it */
 +            rsq00            = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
 +
 +            rinv00           = gmx_mm256_invsqrt_ps(rsq00);
 +
 +            /* Load parameters for j particles */
 +            jq0              = gmx_mm256_load_8real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
 +                                                                 charge+jnrC+0,charge+jnrD+0,
 +                                                                 charge+jnrE+0,charge+jnrF+0,
 +                                                                 charge+jnrG+0,charge+jnrH+0);
 +            isaj0            = gmx_mm256_load_8real_swizzle_ps(invsqrta+jnrA+0,invsqrta+jnrB+0,
 +                                                                 invsqrta+jnrC+0,invsqrta+jnrD+0,
 +                                                                 invsqrta+jnrE+0,invsqrta+jnrF+0,
 +                                                                 invsqrta+jnrG+0,invsqrta+jnrH+0);
 +
 +            /**************************
 +             * CALCULATE INTERACTIONS *
 +             **************************/
 +
 +            r00              = _mm256_mul_ps(rsq00,rinv00);
 +            r00              = _mm256_andnot_ps(dummy_mask,r00);
 +
 +            /* Compute parameters for interactions between i and j atoms */
 +            qq00             = _mm256_mul_ps(iq0,jq0);
 +
 +            /* GENERALIZED BORN AND COULOMB ELECTROSTATICS */
 +            isaprod          = _mm256_mul_ps(isai0,isaj0);
 +            gbqqfactor       = _mm256_xor_ps(signbit,_mm256_mul_ps(qq00,_mm256_mul_ps(isaprod,gbinvepsdiff)));
 +            gbscale          = _mm256_mul_ps(isaprod,gbtabscale);
 +
 +            /* Calculate generalized born table index - this is a separate table from the normal one,
 +             * but we use the same procedure by multiplying r with scale and truncating to integer.
 +             */
 +            rt               = _mm256_mul_ps(r00,gbscale);
 +            gbitab           = _mm256_cvttps_epi32(rt);
 +            gbeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
 +            /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
 +            gbitab_lo        = _mm256_extractf128_si256(gbitab,0x0);
 +            gbitab_hi        = _mm256_extractf128_si256(gbitab,0x1);
 +            gbitab_lo        = _mm_slli_epi32(gbitab_lo,2);
 +            gbitab_hi        = _mm_slli_epi32(gbitab_hi,2);
 +            Y                = gmx_mm256_set_m128(_mm_load_ps(gbtab + _mm_extract_epi32(gbitab_hi,0)),
 +                                                  _mm_load_ps(gbtab + _mm_extract_epi32(gbitab_lo,0)));
 +            F                = gmx_mm256_set_m128(_mm_load_ps(gbtab + _mm_extract_epi32(gbitab_hi,1)),
 +                                                  _mm_load_ps(gbtab + _mm_extract_epi32(gbitab_lo,1)));
 +            G                = gmx_mm256_set_m128(_mm_load_ps(gbtab + _mm_extract_epi32(gbitab_hi,2)),
 +                                                  _mm_load_ps(gbtab + _mm_extract_epi32(gbitab_lo,2)));
 +            H                = gmx_mm256_set_m128(_mm_load_ps(gbtab + _mm_extract_epi32(gbitab_hi,3)),
 +                                                  _mm_load_ps(gbtab + _mm_extract_epi32(gbitab_lo,3)));
 +            GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
 +            Heps             = _mm256_mul_ps(gbeps,H);
 +            Fp               = _mm256_add_ps(F,_mm256_mul_ps(gbeps,_mm256_add_ps(G,Heps)));
 +            VV               = _mm256_add_ps(Y,_mm256_mul_ps(gbeps,Fp));
 +            vgb              = _mm256_mul_ps(gbqqfactor,VV);
 +
 +            FF               = _mm256_add_ps(Fp,_mm256_mul_ps(gbeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
 +            fgb              = _mm256_mul_ps(gbqqfactor,_mm256_mul_ps(FF,gbscale));
 +            dvdatmp          = _mm256_mul_ps(minushalf,_mm256_add_ps(vgb,_mm256_mul_ps(fgb,r00)));
++            dvdatmp          = _mm256_andnot_ps(dummy_mask,dvdatmp);
 +            dvdasum          = _mm256_add_ps(dvdasum,dvdatmp);
 +            /* The pointers to scratch make sure that compilers that take gmx_restrict seriously (e.g. icc 13) really can't screw this code up. */
 +            fjptrA             = (jnrlistA>=0) ? dvda+jnrA : scratch;
 +            fjptrB             = (jnrlistB>=0) ? dvda+jnrB : scratch;
 +            fjptrC             = (jnrlistC>=0) ? dvda+jnrC : scratch;
 +            fjptrD             = (jnrlistD>=0) ? dvda+jnrD : scratch;
 +            fjptrE             = (jnrlistE>=0) ? dvda+jnrE : scratch;
 +            fjptrF             = (jnrlistF>=0) ? dvda+jnrF : scratch;
 +            fjptrG             = (jnrlistG>=0) ? dvda+jnrG : scratch;
 +            fjptrH             = (jnrlistH>=0) ? dvda+jnrH : scratch;
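 +            /* (Dummy lanes are redirected to the local scratch buffer, so their
 +             * increments land in memory that is never read back; this avoids
 +             * per-lane branching while leaving real dvda entries untouched.) */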
 +            gmx_mm256_increment_8real_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
 +                                                 _mm256_mul_ps(dvdatmp,_mm256_mul_ps(isaj0,isaj0)));
 +            velec            = _mm256_mul_ps(qq00,rinv00);
 +            felec            = _mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(velec,rinv00),fgb),rinv00);
 +
 +            fscal            = felec;
 +
 +            fscal            = _mm256_andnot_ps(dummy_mask,fscal);
 +
 +            /* Calculate temporary vectorial force */
 +            tx               = _mm256_mul_ps(fscal,dx00);
 +            ty               = _mm256_mul_ps(fscal,dy00);
 +            tz               = _mm256_mul_ps(fscal,dz00);
 +
 +            /* Update vectorial force */
 +            fix0             = _mm256_add_ps(fix0,tx);
 +            fiy0             = _mm256_add_ps(fiy0,ty);
 +            fiz0             = _mm256_add_ps(fiz0,tz);
 +
 +            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
 +            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
 +            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
 +            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
 +            fjptrE             = (jnrlistE>=0) ? f+j_coord_offsetE : scratch;
 +            fjptrF             = (jnrlistF>=0) ? f+j_coord_offsetF : scratch;
 +            fjptrG             = (jnrlistG>=0) ? f+j_coord_offsetG : scratch;
 +            fjptrH             = (jnrlistH>=0) ? f+j_coord_offsetH : scratch;
 +            gmx_mm256_decrement_1rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,tx,ty,tz);
 +
 +            /* Inner loop uses 56 flops */
 +        }
 +
 +        /* End of innermost loop */
 +
 +        gmx_mm256_update_iforce_1atom_swizzle_ps(fix0,fiy0,fiz0,
 +                                                 f+i_coord_offset,fshift+i_shift_offset);
 +
 +        dvdasum = _mm256_mul_ps(dvdasum, _mm256_mul_ps(isai0,isai0));
 +        gmx_mm256_update_1pot_ps(dvdasum,dvda+inr);
 +
 +        /* Increment number of inner iterations */
 +        inneriter                  += j_index_end - j_index_start;
 +
 +        /* Outer loop uses 7 flops */
 +    }
 +
 +    /* Increment number of outer iterations */
 +    outeriter        += nri;
 +
 +    /* Update outer/inner flops */
 +
 +    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_F,outeriter*7 + inneriter*56);
 +}
index 89bfdd3514646bab46276d15bdaa493ffd873f85,0000000000000000000000000000000000000000..6cc310d76db7abded10073d5b6ed350b963c9a60
mode 100644,000000..100644
--- /dev/null
@@@ -1,1127 -1,0 +1,1130 @@@
 +/* #if 0 */
 +#error This file must be processed with the Gromacs pre-preprocessor
 +/* #endif */
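 +/* ## The #if 0 above is a template conditional: the pre-preprocessor strips the #error, */
 +/* ## while compiling this file directly (unprocessed) makes the #error fire.            */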
 +/* #if INCLUDE_HEADER */
 +#ifdef HAVE_CONFIG_H
 +#include <config.h>
 +#endif
 +
 +#include <math.h>
 +
 +#include "../nb_kernel.h"
 +#include "types/simple.h"
 +#include "vec.h"
 +#include "nrnb.h"
 +
 +#include "gmx_math_x86_avx_256_single.h"
 +#include "kernelutil_x86_avx_256_single.h"
 +/* #endif */
 +
 +/* ## List of variables set by the generating script:                                    */
 +/* ##                                                                                    */
 +/* ## Settings that apply to the entire kernel:                                          */
 +/* ## KERNEL_ELEC:           String, choice for electrostatic interactions               */
 +/* ## KERNEL_VDW:            String, choice for van der Waals interactions               */
 +/* ## KERNEL_NAME:           String, name of this kernel                                 */
 +/* ## KERNEL_VF:             String telling if we calculate potential, force, or both    */
 +/* ## GEOMETRY_I/GEOMETRY_J: String, name of each geometry, e.g. 'Water3' or '1Particle' */
 +/* ##                                                                                    */
 +/* ## Settings that apply to particles in the outer (I) or inner (J) loops:              */
 +/* ## PARTICLES_I[]/         Arrays with lists of i/j particles to use in kernel. It is  */
 +/* ## PARTICLES_J[]:         just [0] for particle geometry, but can be longer for water */
 +/* ## PARTICLES_ELEC_I[]/    Arrays with lists of i/j particles that have electrostatics */
 +/* ## PARTICLES_ELEC_J[]:    interactions that should be calculated in this kernel.      */
 +/* ## PARTICLES_VDW_I[]/     Arrays with the list of i/j particles that have VdW         */
 +/* ## PARTICLES_VDW_J[]:     interactions that should be calculated in this kernel.      */
 +/* ##                                                                                    */
 +/* ## Settings for pairs of interactions (e.g. 2nd i particle against 1st j particle)    */
 +/* ## PAIRS_IJ[]:            Array with (i,j) tuples of pairs for which interactions     */
 +/* ##                        should be calculated in this kernel. Zero-charge particles  */
 +/* ##                        do not have interactions with particles without vdw, and    */
 +/* ##                        Vdw-only interactions are not evaluated in a no-vdw-kernel. */
 +/* ## INTERACTION_FLAGS[][]: 2D matrix, dimension e.g. 3*3 for water-water interactions. */
 +/* ##                        For each i-j pair, the element [I][J] is a list of strings  */
 +/* ##                        defining properties/flags of this interaction. Examples     */
 +/* ##                        include 'electrostatics'/'vdw' if that type of interaction  */
 +/* ##                        should be evaluated, 'rsq'/'rinv'/'rinvsq' if those values  */
 +/* ##                        are needed, and 'exactcutoff' or 'shift','switch' to        */
 +/* ##                        decide if the force/potential should be modified. This way  */
 +/* ##                        we only calculate values absolutely needed for each case.   */
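 +
 +/* ## As an illustrative note (not a directive): for a Water3 geometry, where            */
 +/* ## PARTICLES_I would be [0,1,2], a block such as                                      */
 +/* ##     #for I in PARTICLES_I                                                          */
 +/* ##         real *           vdwioffsetptr{I};                                         */
 +/* ##     #endfor                                                                        */
 +/* ## expands to the three declarations vdwioffsetptr0/1/2 in the generated kernel.      */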
 +
 +/* ## Calculate the size and offset for (merged/interleaved) table data */
 +
 +/*
 + * Gromacs nonbonded kernel:   {KERNEL_NAME}
 + * Electrostatics interaction: {KERNEL_ELEC}
 + * VdW interaction:            {KERNEL_VDW}
 + * Geometry:                   {GEOMETRY_I}-{GEOMETRY_J}
 + * Calculate force/pot:        {KERNEL_VF}
 + */
 +void
 +{KERNEL_NAME}
 +                    (t_nblist * gmx_restrict                nlist,
 +                     rvec * gmx_restrict                    xx,
 +                     rvec * gmx_restrict                    ff,
 +                     t_forcerec * gmx_restrict              fr,
 +                     t_mdatoms * gmx_restrict               mdatoms,
 +                     nb_kernel_data_t * gmx_restrict        kernel_data,
 +                     t_nrnb * gmx_restrict                  nrnb)
 +{
 +    /* ## Not all variables are used for all kernels, but any optimizing compiler fixes that, */
 +    /* ## so there is no point in going to extremes to exclude variables that are not needed. */
 +    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
 +     * just 0 for non-waters.
 +     * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, i.e. the eight different
 +     * jnr indices corresponding to data put in the eight positions in the SIMD register.
 +     */
 +    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
 +    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
 +    int              jnrA,jnrB,jnrC,jnrD;
 +    int              jnrE,jnrF,jnrG,jnrH;
 +    int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
 +    int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
 +    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
 +    int              j_coord_offsetE,j_coord_offsetF,j_coord_offsetG,j_coord_offsetH;
 +    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
 +    real             rcutoff_scalar;
 +    real             *shiftvec,*fshift,*x,*f;
 +    real             *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH;
 +    real             scratch[4*DIM];
 +    __m256           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
 +    /* #for I in PARTICLES_I */
 +    real *           vdwioffsetptr{I};
 +    __m256           ix{I},iy{I},iz{I},fix{I},fiy{I},fiz{I},iq{I},isai{I};
 +    /* #endfor */
 +    /* #for J in PARTICLES_J */
 +    int              vdwjidx{J}A,vdwjidx{J}B,vdwjidx{J}C,vdwjidx{J}D,vdwjidx{J}E,vdwjidx{J}F,vdwjidx{J}G,vdwjidx{J}H;
 +    __m256           jx{J},jy{J},jz{J},fjx{J},fjy{J},fjz{J},jq{J},isaj{J};
 +    /* #endfor */
 +    /* #for I,J in PAIRS_IJ */
 +    __m256           dx{I}{J},dy{I}{J},dz{I}{J},rsq{I}{J},rinv{I}{J},rinvsq{I}{J},r{I}{J},qq{I}{J},c6_{I}{J},c12_{I}{J};
 +    /* #endfor */
 +    /* #if KERNEL_ELEC != 'None' */
 +    __m256           velec,felec,velecsum,facel,crf,krf,krf2;
 +    real             *charge;
 +    /* #endif */
 +    /* #if 'GeneralizedBorn' in KERNEL_ELEC */
 +    __m256i          gbitab;
 +    __m128i          gbitab_lo,gbitab_hi;
 +    __m256           vgb,fgb,vgbsum,dvdasum,gbscale,gbtabscale,isaprod,gbqqfactor,gbinvepsdiff,gbeps,dvdatmp;
 +    __m256           minushalf = _mm256_set1_ps(-0.5);
 +    real             *invsqrta,*dvda,*gbtab;
 +    /* #endif */
 +    /* #if KERNEL_VDW != 'None' */
 +    int              nvdwtype;
 +    __m256           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
 +    int              *vdwtype;
 +    real             *vdwparam;
 +    __m256           one_sixth   = _mm256_set1_ps(1.0/6.0);
 +    __m256           one_twelfth = _mm256_set1_ps(1.0/12.0);
 +    /* #endif */
 +    /* #if 'Table' in KERNEL_ELEC or 'GeneralizedBorn' in KERNEL_ELEC or 'Table' in KERNEL_VDW */
 +    __m256i          vfitab;
 +    __m128i          vfitab_lo,vfitab_hi;
 +    __m128i          ifour       = _mm_set1_epi32(4);
 +    __m256           rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
 +    real             *vftab;
 +    /* #endif */
 +    /* #if 'Ewald' in KERNEL_ELEC */
 +    __m256i          ewitab;
 +    __m128i          ewitab_lo,ewitab_hi;
 +    __m256           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
 +    __m256           beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
 +    real             *ewtab;
 +    /* #endif */
 +    /* #if 'PotentialSwitch' in [KERNEL_MOD_ELEC,KERNEL_MOD_VDW] */
 +    __m256           rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
 +    real             rswitch_scalar,d_scalar;
 +    /* #endif */
 +    __m256           dummy_mask,cutoff_mask;
 +    __m256           signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
 +    __m256           one     = _mm256_set1_ps(1.0);
 +    __m256           two     = _mm256_set1_ps(2.0);
 +    x                = xx[0];
 +    f                = ff[0];
 +
 +    nri              = nlist->nri;
 +    iinr             = nlist->iinr;
 +    jindex           = nlist->jindex;
 +    jjnr             = nlist->jjnr;
 +    shiftidx         = nlist->shift;
 +    gid              = nlist->gid;
 +    shiftvec         = fr->shift_vec[0];
 +    fshift           = fr->fshift[0];
 +    /* #if KERNEL_ELEC != 'None' */
 +    facel            = _mm256_set1_ps(fr->epsfac);
 +    charge           = mdatoms->chargeA;
 +    /*     #if 'ReactionField' in KERNEL_ELEC */
 +    krf              = _mm256_set1_ps(fr->ic->k_rf);
 +    krf2             = _mm256_set1_ps(fr->ic->k_rf*2.0);
 +    crf              = _mm256_set1_ps(fr->ic->c_rf);
 +    /*     #endif */
 +    /* #endif */
 +    /* #if KERNEL_VDW != 'None' */
 +    nvdwtype         = fr->ntype;
 +    vdwparam         = fr->nbfp;
 +    vdwtype          = mdatoms->typeA;
 +    /* #endif */
 +
 +    /* #if 'Table' in KERNEL_ELEC and 'Table' in KERNEL_VDW */
 +    vftab            = kernel_data->table_elec_vdw->data;
 +    vftabscale       = _mm256_set1_ps(kernel_data->table_elec_vdw->scale);
 +    /* #elif 'Table' in KERNEL_ELEC */
 +    vftab            = kernel_data->table_elec->data;
 +    vftabscale       = _mm256_set1_ps(kernel_data->table_elec->scale);
 +    /* #elif 'Table' in KERNEL_VDW */
 +    vftab            = kernel_data->table_vdw->data;
 +    vftabscale       = _mm256_set1_ps(kernel_data->table_vdw->scale);
 +    /* #endif */
 +
 +    /* #if 'Ewald' in KERNEL_ELEC */
 +    sh_ewald         = _mm256_set1_ps(fr->ic->sh_ewald);
 +    beta             = _mm256_set1_ps(fr->ic->ewaldcoeff);
 +    beta2            = _mm256_mul_ps(beta,beta);
 +    beta3            = _mm256_mul_ps(beta,beta2);
 +
 +    /*     #if KERNEL_VF=='Force' and KERNEL_MOD_ELEC!='PotentialSwitch' */
 +    ewtab            = fr->ic->tabq_coul_F;
 +    ewtabscale       = _mm256_set1_ps(fr->ic->tabq_scale);
 +    ewtabhalfspace   = _mm256_set1_ps(0.5/fr->ic->tabq_scale);
 +    /*     #else */
 +    ewtab            = fr->ic->tabq_coul_FDV0;
 +    ewtabscale       = _mm256_set1_ps(fr->ic->tabq_scale);
 +    ewtabhalfspace   = _mm256_set1_ps(0.5/fr->ic->tabq_scale);
 +    /*     #endif */
 +    /* #endif */
 +
 +    /* #if KERNEL_ELEC=='GeneralizedBorn' */
 +    invsqrta         = fr->invsqrta;
 +    dvda             = fr->dvda;
 +    gbtabscale       = _mm256_set1_ps(fr->gbtab.scale);
 +    gbtab            = fr->gbtab.data;
 +    gbinvepsdiff     = _mm256_set1_ps((1.0/fr->epsilon_r) - (1.0/fr->gb_epsilon_solvent));
 +    /* #endif */
 +
 +    /* #if 'Water' in GEOMETRY_I */
 +    /* Setup water-specific parameters */
 +    inr              = nlist->iinr[0];
 +    /*     #for I in PARTICLES_ELEC_I */
 +    iq{I}              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+{I}]));
 +    /*     #endfor */
 +    /*     #for I in PARTICLES_VDW_I */
 +    vdwioffsetptr{I}   = vdwparam+2*nvdwtype*vdwtype[inr+{I}];
 +    /*     #endfor */
 +    /* #endif */
 +
 +    /* #if 'Water' in GEOMETRY_J */
 +    /*     #for J in PARTICLES_ELEC_J */
 +    jq{J}              = _mm256_set1_ps(charge[inr+{J}]);
 +    /*     #endfor */
 +    /*     #for J in PARTICLES_VDW_J */
 +    vdwjidx{J}A        = 2*vdwtype[inr+{J}];
 +    /*     #endfor */
 +    /*     #for I,J in PAIRS_IJ */
 +    /*         #if 'electrostatics' in INTERACTION_FLAGS[I][J] */
 +    qq{I}{J}             = _mm256_mul_ps(iq{I},jq{J});
 +    /*         #endif */
 +    /*         #if 'vdw' in INTERACTION_FLAGS[I][J] */
 +    c6_{I}{J}            = _mm256_set1_ps(vdwioffsetptr{I}[vdwjidx{J}A]);
 +    c12_{I}{J}           = _mm256_set1_ps(vdwioffsetptr{I}[vdwjidx{J}A+1]);
 +    /*         #endif */
 +    /*     #endfor */
 +    /* #endif */
 +
 +    /* #if KERNEL_MOD_ELEC!='None' or KERNEL_MOD_VDW!='None' */
 +    /*     #if KERNEL_ELEC!='None' */
 +    /* When we use explicit cutoffs, the value must be identical for elec and VdW, so use elec as an arbitrary choice */
 +    rcutoff_scalar   = fr->rcoulomb;
 +    /*     #else */
 +    rcutoff_scalar   = fr->rvdw;
 +    /*     #endif */
 +    rcutoff          = _mm256_set1_ps(rcutoff_scalar);
 +    rcutoff2         = _mm256_mul_ps(rcutoff,rcutoff);
 +    /* #endif */
 +
 +    /* #if KERNEL_MOD_VDW=='PotentialShift' */
 +    sh_vdw_invrcut6  = _mm256_set1_ps(fr->ic->sh_invrc6);
 +    rvdw             = _mm256_set1_ps(fr->rvdw);
 +    /* #endif */
 +
 +    /* #if 'PotentialSwitch' in [KERNEL_MOD_ELEC,KERNEL_MOD_VDW] */
 +    /*     #if KERNEL_MOD_ELEC=='PotentialSwitch'  */
 +    rswitch_scalar   = fr->rcoulomb_switch;
 +    rswitch          = _mm256_set1_ps(rswitch_scalar);
 +    /*     #else */
 +    rswitch_scalar   = fr->rvdw_switch;
 +    rswitch          = _mm256_set1_ps(rswitch_scalar);
 +    /*     #endif */
 +    /* Setup switch parameters */
 +    d_scalar         = rcutoff_scalar-rswitch_scalar;
 +    d                = _mm256_set1_ps(d_scalar);
 +    swV3             = _mm256_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
 +    swV4             = _mm256_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
 +    swV5             = _mm256_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
 +    /*     #if 'Force' in KERNEL_VF */
 +    swF2             = _mm256_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
 +    swF3             = _mm256_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
 +    swF4             = _mm256_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
 +    /*     #endif */
 +    /* #endif */
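 +    /* ## (These are the coefficients of the quintic switching polynomial                  */
 +    /* ## sw(t) = 1 - 10*t^3 + 15*t^4 - 6*t^5 with t = (r - rswitch)/d, which falls        */
 +    /* ## from 1 to 0 with zero first and second derivatives at both ends; the swF*        */
 +    /* ## values are the matching derivative coefficients.)                                */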
 +
 +    /* Avoid stupid compiler warnings */
 +    jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0;
 +    j_coord_offsetA = 0;
 +    j_coord_offsetB = 0;
 +    j_coord_offsetC = 0;
 +    j_coord_offsetD = 0;
 +    j_coord_offsetE = 0;
 +    j_coord_offsetF = 0;
 +    j_coord_offsetG = 0;
 +    j_coord_offsetH = 0;
 +
 +    /* ## Keep track of the floating point operations we issue for reporting! */
 +    /* #define OUTERFLOPS 0 */
 +    outeriter        = 0;
 +    inneriter        = 0;
 +
 +    for(iidx=0;iidx<4*DIM;iidx++)
 +    {
 +        scratch[iidx] = 0.0;
 +    }
 +
 +    /* Start outer loop over neighborlists */
 +    for(iidx=0; iidx<nri; iidx++)
 +    {
 +        /* Load shift vector for this list */
 +        i_shift_offset   = DIM*shiftidx[iidx];
 +
 +        /* Load limits for loop over neighbors */
 +        j_index_start    = jindex[iidx];
 +        j_index_end      = jindex[iidx+1];
 +
 +        /* Get outer coordinate index */
 +        inr              = iinr[iidx];
 +        i_coord_offset   = DIM*inr;
 +
 +        /* Load i particle coords and add shift vector */
 +        /* #if GEOMETRY_I == 'Particle' */
 +        gmx_mm256_load_shift_and_1rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
 +        /* #elif GEOMETRY_I == 'Water3' */
 +        gmx_mm256_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
 +                                                    &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
 +        /* #elif GEOMETRY_I == 'Water4' */
 +        /*     #if 0 in PARTICLES_I                 */
 +        gmx_mm256_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
 +                                                    &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
 +        /*     #else                                */
 +        gmx_mm256_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
 +                                                    &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
 +        /*     #endif                               */
 +        /* #endif                                   */
 +
 +        /* #if 'Force' in KERNEL_VF */
 +        /*     #for I in PARTICLES_I */
 +        fix{I}             = _mm256_setzero_ps();
 +        fiy{I}             = _mm256_setzero_ps();
 +        fiz{I}             = _mm256_setzero_ps();
 +        /*     #endfor */
 +        /* #endif */
 +
 +        /* ## For water we already preloaded parameters at the start of the kernel */
 +        /* #if not 'Water' in GEOMETRY_I */
 +        /* Load parameters for i particles */
 +        /*     #for I in PARTICLES_ELEC_I */
 +        iq{I}              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+{I}]));
 +        /*         #define OUTERFLOPS OUTERFLOPS+1 */
 +        /*         #if KERNEL_ELEC=='GeneralizedBorn' */
 +        isai{I}            = _mm256_set1_ps(invsqrta[inr+{I}]);
 +        /*         #endif */
 +        /*     #endfor */
 +        /*     #for I in PARTICLES_VDW_I */
 +        vdwioffsetptr{I}   = vdwparam+2*nvdwtype*vdwtype[inr+{I}];
 +        /*     #endfor */
 +        /* #endif */
 +
 +        /* #if 'Potential' in KERNEL_VF */
 +        /* Reset potential sums */
 +        /*     #if KERNEL_ELEC != 'None' */
 +        velecsum         = _mm256_setzero_ps();
 +        /*     #endif */
 +        /*     #if 'GeneralizedBorn' in KERNEL_ELEC */
 +        vgbsum           = _mm256_setzero_ps();
 +        /*     #endif */
 +        /*     #if KERNEL_VDW != 'None' */
 +        vvdwsum          = _mm256_setzero_ps();
 +        /*     #endif */
 +        /* #endif */
 +        /*     #if 'GeneralizedBorn' in KERNEL_ELEC and 'Force' in KERNEL_VF */
 +        dvdasum          = _mm256_setzero_ps();
 +        /*     #endif */
 +
 +        /* #for ROUND in ['Loop','Epilogue'] */
 +
 +        /* #if ROUND =='Loop' */
 +        /* Start inner kernel loop */
 +        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+7]>=0; jidx+=8)
 +        {
 +        /* ## First round is normal loop (next statement resets indentation) */
 +        /*     #if 0 */
 +        }
 +        /*     #endif */
 +        /* #else */
 +        if(jidx<j_index_end)
 +        {
 +        /* ## Second round is epilogue */
 +        /* #endif */
 +        /* #define INNERFLOPS 0 */
 +
 +            /* Get j neighbor index, and coordinate index */
 +            /* #if ROUND =='Loop' */
 +            jnrA             = jjnr[jidx];
 +            jnrB             = jjnr[jidx+1];
 +            jnrC             = jjnr[jidx+2];
 +            jnrD             = jjnr[jidx+3];
 +            jnrE             = jjnr[jidx+4];
 +            jnrF             = jjnr[jidx+5];
 +            jnrG             = jjnr[jidx+6];
 +            jnrH             = jjnr[jidx+7];
 +            /* #else */
 +            jnrlistA         = jjnr[jidx];
 +            jnrlistB         = jjnr[jidx+1];
 +            jnrlistC         = jjnr[jidx+2];
 +            jnrlistD         = jjnr[jidx+3];
 +            jnrlistE         = jjnr[jidx+4];
 +            jnrlistF         = jjnr[jidx+5];
 +            jnrlistG         = jjnr[jidx+6];
 +            jnrlistH         = jjnr[jidx+7];
 +            /* Sign of each element will be negative for non-real atoms.
 +             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
 +             * so use it as val = _mm256_andnot_ps(mask,val) to clear dummy entries.
 +             */
 +            dummy_mask = gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx+4)),_mm_setzero_si128())),
 +                                            gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128())));
 +                                            
 +            jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
 +            jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
 +            jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
 +            jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
 +            jnrE       = (jnrlistE>=0) ? jnrlistE : 0;
 +            jnrF       = (jnrlistF>=0) ? jnrlistF : 0;
 +            jnrG       = (jnrlistG>=0) ? jnrlistG : 0;
 +            jnrH       = (jnrlistH>=0) ? jnrlistH : 0;
 +            /* #endif */
 +            j_coord_offsetA  = DIM*jnrA;
 +            j_coord_offsetB  = DIM*jnrB;
 +            j_coord_offsetC  = DIM*jnrC;
 +            j_coord_offsetD  = DIM*jnrD;
 +            j_coord_offsetE  = DIM*jnrE;
 +            j_coord_offsetF  = DIM*jnrF;
 +            j_coord_offsetG  = DIM*jnrG;
 +            j_coord_offsetH  = DIM*jnrH;
 +
 +            /* load j atom coordinates */
 +            /* #if GEOMETRY_J == 'Particle'             */
 +            gmx_mm256_load_1rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
 +                                                 x+j_coord_offsetC,x+j_coord_offsetD,
 +                                                 x+j_coord_offsetE,x+j_coord_offsetF,
 +                                                 x+j_coord_offsetG,x+j_coord_offsetH,
 +                                                 &jx0,&jy0,&jz0);
 +            /* #elif GEOMETRY_J == 'Water3'             */
 +            gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
 +                                                 x+j_coord_offsetC,x+j_coord_offsetD,
 +                                                 x+j_coord_offsetE,x+j_coord_offsetF,
 +                                                 x+j_coord_offsetG,x+j_coord_offsetH,
 +                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
 +            /* #elif GEOMETRY_J == 'Water4'             */
 +            /*     #if 0 in PARTICLES_J                 */
 +            gmx_mm256_load_4rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
 +                                                 x+j_coord_offsetC,x+j_coord_offsetD,
 +                                                 x+j_coord_offsetE,x+j_coord_offsetF,
 +                                                 x+j_coord_offsetG,x+j_coord_offsetH,
 +                                                 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
 +                                                 &jy2,&jz2,&jx3,&jy3,&jz3);
 +            /*     #else                                */
 +            gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
 +                                                 x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
 +                                                 x+j_coord_offsetE+DIM,x+j_coord_offsetF+DIM,
 +                                                 x+j_coord_offsetG+DIM,x+j_coord_offsetH+DIM,
 +                                                 &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
 +            /*     #endif                               */
 +            /* #endif                                   */
 +
 +            /* Calculate displacement vector */
 +            /* #for I,J in PAIRS_IJ */
 +            dx{I}{J}             = _mm256_sub_ps(ix{I},jx{J});
 +            dy{I}{J}             = _mm256_sub_ps(iy{I},jy{J});
 +            dz{I}{J}             = _mm256_sub_ps(iz{I},jz{J});
 +            /*     #define INNERFLOPS INNERFLOPS+3 */
 +            /* #endfor */
 +
 +            /* Calculate squared distance and things based on it */
 +            /* #for I,J in PAIRS_IJ */
 +            rsq{I}{J}            = gmx_mm256_calc_rsq_ps(dx{I}{J},dy{I}{J},dz{I}{J});
 +            /*     #define INNERFLOPS INNERFLOPS+5 */
 +            /* #endfor */
 +
 +            /* #for I,J in PAIRS_IJ */
 +            /*     #if 'rinv' in INTERACTION_FLAGS[I][J] */
 +            rinv{I}{J}           = gmx_mm256_invsqrt_ps(rsq{I}{J});
 +            /*         #define INNERFLOPS INNERFLOPS+5 */
 +            /*     #endif */
 +            /* #endfor */
 +
 +            /* #for I,J in PAIRS_IJ */
 +            /*     #if 'rinvsq' in INTERACTION_FLAGS[I][J] */
 +            /*         # if 'rinv' not in INTERACTION_FLAGS[I][J] */
 +            rinvsq{I}{J}         = gmx_mm256_inv_ps(rsq{I}{J});
 +            /*             #define INNERFLOPS INNERFLOPS+4 */
 +            /*         #else */
 +            rinvsq{I}{J}         = _mm256_mul_ps(rinv{I}{J},rinv{I}{J});
 +            /*             #define INNERFLOPS INNERFLOPS+1 */
 +            /*         #endif */
 +            /*     #endif */
 +            /* #endfor */
 +
 +            /* #if not 'Water' in GEOMETRY_J */
 +            /* Load parameters for j particles */
 +            /*     #for J in PARTICLES_ELEC_J */
 +            jq{J}              = gmx_mm256_load_8real_swizzle_ps(charge+jnrA+{J},charge+jnrB+{J},
 +                                                                 charge+jnrC+{J},charge+jnrD+{J},
 +                                                                 charge+jnrE+{J},charge+jnrF+{J},
 +                                                                 charge+jnrG+{J},charge+jnrH+{J});
 +            /*         #if KERNEL_ELEC=='GeneralizedBorn' */
 +            isaj{J}            = gmx_mm256_load_8real_swizzle_ps(invsqrta+jnrA+{J},invsqrta+jnrB+{J},
 +                                                                 invsqrta+jnrC+{J},invsqrta+jnrD+{J},
 +                                                                 invsqrta+jnrE+{J},invsqrta+jnrF+{J},
 +                                                                 invsqrta+jnrG+{J},invsqrta+jnrH+{J});
 +            /*         #endif */
 +            /*     #endfor */
 +            /*     #for J in PARTICLES_VDW_J */
 +            vdwjidx{J}A        = 2*vdwtype[jnrA+{J}];
 +            vdwjidx{J}B        = 2*vdwtype[jnrB+{J}];
 +            vdwjidx{J}C        = 2*vdwtype[jnrC+{J}];
 +            vdwjidx{J}D        = 2*vdwtype[jnrD+{J}];
 +            vdwjidx{J}E        = 2*vdwtype[jnrE+{J}];
 +            vdwjidx{J}F        = 2*vdwtype[jnrF+{J}];
 +            vdwjidx{J}G        = 2*vdwtype[jnrG+{J}];
 +            vdwjidx{J}H        = 2*vdwtype[jnrH+{J}];
 +            /*     #endfor */
 +            /* #endif */
 +
 +            /* #if 'Force' in KERNEL_VF and not 'Particle' in GEOMETRY_I */
 +            /*     #for J in PARTICLES_J */
 +            fjx{J}             = _mm256_setzero_ps();
 +            fjy{J}             = _mm256_setzero_ps();
 +            fjz{J}             = _mm256_setzero_ps();
 +            /*     #endfor */
 +            /* #endif */
 +
 +            /* #for I,J in PAIRS_IJ */
 +
 +            /**************************
 +             * CALCULATE INTERACTIONS *
 +             **************************/
 +
 +            /*     ## Note special check for TIP4P-TIP4P. Since we are cutting off all hydrogen interactions, we also cut the LJ-only O-O interaction */
 +            /*     #if 'exactcutoff' in INTERACTION_FLAGS[I][J] or (GEOMETRY_I=='Water4' and GEOMETRY_J=='Water4' and 'exactcutoff' in INTERACTION_FLAGS[1][1]) */
 +            /*         ## We always calculate rinv/rinvsq above to enable pipelining in compilers (performance tested on x86) */
 +            if (gmx_mm256_any_lt(rsq{I}{J},rcutoff2))
 +            {
 +            /*     #if 0    ## this and the next two lines are a hack to maintain auto-indentation in template file */
 +            }
 +            /*         #endif */
 +            /*         #define INNERFLOPS INNERFLOPS+1 */
 +            /*     #endif */
 +
 +            /*     #if 'r' in INTERACTION_FLAGS[I][J] */
 +            r{I}{J}              = _mm256_mul_ps(rsq{I}{J},rinv{I}{J});
 +            /*         #if ROUND == 'Epilogue' */
 +            r{I}{J}              = _mm256_andnot_ps(dummy_mask,r{I}{J});
 +            /*             #define INNERFLOPS INNERFLOPS+1 */
 +            /*         #endif */
 +            /*         #define INNERFLOPS INNERFLOPS+1 */
 +            /*     #endif */
 +
 +            /*     ## For water geometries we already loaded parameters at the start of the kernel */
 +            /*     #if not 'Water' in GEOMETRY_J */
 +            /* Compute parameters for interactions between i and j atoms */
 +            /*         #if 'electrostatics' in INTERACTION_FLAGS[I][J] */
 +            qq{I}{J}             = _mm256_mul_ps(iq{I},jq{J});
 +            /*             #define INNERFLOPS INNERFLOPS+1 */
 +            /*         #endif */
 +            /*         #if 'vdw' in INTERACTION_FLAGS[I][J] */
 +            gmx_mm256_load_8pair_swizzle_ps(vdwioffsetptr{I}+vdwjidx{J}A,
 +                                            vdwioffsetptr{I}+vdwjidx{J}B,
 +                                            vdwioffsetptr{I}+vdwjidx{J}C,
 +                                            vdwioffsetptr{I}+vdwjidx{J}D,
 +                                            vdwioffsetptr{I}+vdwjidx{J}E,
 +                                            vdwioffsetptr{I}+vdwjidx{J}F,
 +                                            vdwioffsetptr{I}+vdwjidx{J}G,
 +                                            vdwioffsetptr{I}+vdwjidx{J}H,
 +                                            &c6_{I}{J},&c12_{I}{J});
 +            /*         #endif */
 +            /*     #endif */
 +
 +            /*     #if 'table' in INTERACTION_FLAGS[I][J] */
 +            /* Calculate table index by multiplying r with table scale and truncate to integer */
 +            rt               = _mm256_mul_ps(r{I}{J},vftabscale);
 +            vfitab           = _mm256_cvttps_epi32(rt);
 +            vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
 +            /*         #define INNERFLOPS INNERFLOPS+4                          */
 +            /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
 +            vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
 +            vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
 +            /*         #if 'Table' in KERNEL_ELEC and 'Table' in KERNEL_VDW     */
 +            /*             ## 3 tables, 4 floats per point: multiply index by 12 */
 +            vfitab_lo        = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
 +            vfitab_hi        = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
 +            /*         #elif 'Table' in KERNEL_ELEC                             */
 +            /*             ## 1 table, 4 floats per point: multiply index by 4  */
 +            vfitab_lo        = _mm_slli_epi32(vfitab_lo,2);
 +            vfitab_hi        = _mm_slli_epi32(vfitab_hi,2);
 +            /*         #elif 'Table' in KERNEL_VDW                              */
 +            /*             ## 2 tables, 4 floats per point: multiply index by 8 */
 +            vfitab_lo        = _mm_slli_epi32(vfitab_lo,3);
 +            vfitab_hi        = _mm_slli_epi32(vfitab_hi,3);
 +            /*         #endif                                                   */
 +            /*     #endif */
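 +            /*     ## (Each table point stores 4 floats Y,F,G,H per table, so the         */
 +            /*     ## element offset is index*ntables*4; the shifts above compute         */
 +            /*     ## index*12 as (index + (index<<1))<<2, index*4 as index<<2 and        */
 +            /*     ## index*8 as index<<3, avoiding integer multiplies.)                  */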
 +
 +            /*     ## ELECTROSTATIC INTERACTIONS */
 +            /*     #if 'electrostatics' in INTERACTION_FLAGS[I][J] */
 +
 +            /*         #if KERNEL_ELEC=='Coulomb' */
 +
 +            /* COULOMB ELECTROSTATICS */
 +            velec            = _mm256_mul_ps(qq{I}{J},rinv{I}{J});
 +            /*             #define INNERFLOPS INNERFLOPS+1 */
 +            /*             #if 'Force' in KERNEL_VF */
 +            felec            = _mm256_mul_ps(velec,rinvsq{I}{J});
 +            /*                 #define INNERFLOPS INNERFLOPS+1 */
 +            /*             #endif */
 +
 +            /*         #elif KERNEL_ELEC=='ReactionField' */
 +
 +            /* REACTION-FIELD ELECTROSTATICS */
 +            /*             #if 'Potential' in KERNEL_VF */
 +            velec            = _mm256_mul_ps(qq{I}{J},_mm256_sub_ps(_mm256_add_ps(rinv{I}{J},_mm256_mul_ps(krf,rsq{I}{J})),crf));
 +            /*                 #define INNERFLOPS INNERFLOPS+4 */
 +            /*             #endif */
 +            /*             #if 'Force' in KERNEL_VF */
 +            felec            = _mm256_mul_ps(qq{I}{J},_mm256_sub_ps(_mm256_mul_ps(rinv{I}{J},rinvsq{I}{J}),krf2));
 +            /*                 #define INNERFLOPS INNERFLOPS+3 */
 +            /*             #endif */
 +
 +            /*         #elif KERNEL_ELEC=='GeneralizedBorn' */
 +
 +            /* GENERALIZED BORN AND COULOMB ELECTROSTATICS */
 +            isaprod          = _mm256_mul_ps(isai{I},isaj{J});
 +            gbqqfactor       = _mm256_xor_ps(signbit,_mm256_mul_ps(qq{I}{J},_mm256_mul_ps(isaprod,gbinvepsdiff)));
 +            gbscale          = _mm256_mul_ps(isaprod,gbtabscale);
 +            /*             #define INNERFLOPS INNERFLOPS+5 */
 +
 +            /* Calculate generalized born table index - this is a separate table from the normal one,
 +             * but we use the same procedure by multiplying r with scale and truncating to integer.
 +             */
 +            rt               = _mm256_mul_ps(r{I}{J},gbscale);
 +            gbitab           = _mm256_cvttps_epi32(rt);
 +            gbeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
 +            /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
 +            gbitab_lo        = _mm256_extractf128_si256(gbitab,0x0);
 +            gbitab_hi        = _mm256_extractf128_si256(gbitab,0x1);
 +            gbitab_lo        = _mm_slli_epi32(gbitab_lo,2);
 +            gbitab_hi        = _mm_slli_epi32(gbitab_hi,2);
 +            Y                = gmx_mm256_set_m128(_mm_load_ps(gbtab + _mm_extract_epi32(gbitab_hi,0)),
 +                                                  _mm_load_ps(gbtab + _mm_extract_epi32(gbitab_lo,0)));
 +            F                = gmx_mm256_set_m128(_mm_load_ps(gbtab + _mm_extract_epi32(gbitab_hi,1)),
 +                                                  _mm_load_ps(gbtab + _mm_extract_epi32(gbitab_lo,1)));
 +            G                = gmx_mm256_set_m128(_mm_load_ps(gbtab + _mm_extract_epi32(gbitab_hi,2)),
 +                                                  _mm_load_ps(gbtab + _mm_extract_epi32(gbitab_lo,2)));
 +            H                = gmx_mm256_set_m128(_mm_load_ps(gbtab + _mm_extract_epi32(gbitab_hi,3)),
 +                                                  _mm_load_ps(gbtab + _mm_extract_epi32(gbitab_lo,3)));
 +            GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
 +            Heps             = _mm256_mul_ps(gbeps,H);
 +            Fp               = _mm256_add_ps(F,_mm256_mul_ps(gbeps,_mm256_add_ps(G,Heps)));
 +            VV               = _mm256_add_ps(Y,_mm256_mul_ps(gbeps,Fp));
 +            vgb              = _mm256_mul_ps(gbqqfactor,VV);
 +            /*             #define INNERFLOPS INNERFLOPS+10 */
 +
 +            /*             #if 'Force' in KERNEL_VF */
 +            FF               = _mm256_add_ps(Fp,_mm256_mul_ps(gbeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
 +            fgb              = _mm256_mul_ps(gbqqfactor,_mm256_mul_ps(FF,gbscale));
 +            dvdatmp          = _mm256_mul_ps(minushalf,_mm256_add_ps(vgb,_mm256_mul_ps(fgb,r{I}{J})));
++            /*                 #if ROUND == 'Epilogue' */
++            dvdatmp          = _mm256_andnot_ps(dummy_mask,dvdatmp);
++            /*                 #endif */
 +            dvdasum          = _mm256_add_ps(dvdasum,dvdatmp);
 +            /*                 #if ROUND == 'Loop' */
 +            fjptrA           = dvda+jnrA;
 +            fjptrB           = dvda+jnrB;
 +            fjptrC           = dvda+jnrC;
 +            fjptrD           = dvda+jnrD;
 +            fjptrE           = dvda+jnrE;
 +            fjptrF           = dvda+jnrF;
 +            fjptrG           = dvda+jnrG;
 +            fjptrH           = dvda+jnrH;
 +            /*                 #else */
 +            /* The pointers to scratch make sure that this code really can't screw things up, even with compilers that take gmx_restrict seriously (e.g. icc 13). */
 +            fjptrA             = (jnrlistA>=0) ? dvda+jnrA : scratch;
 +            fjptrB             = (jnrlistB>=0) ? dvda+jnrB : scratch;
 +            fjptrC             = (jnrlistC>=0) ? dvda+jnrC : scratch;
 +            fjptrD             = (jnrlistD>=0) ? dvda+jnrD : scratch;
 +            fjptrE             = (jnrlistE>=0) ? dvda+jnrE : scratch;
 +            fjptrF             = (jnrlistF>=0) ? dvda+jnrF : scratch;
 +            fjptrG             = (jnrlistG>=0) ? dvda+jnrG : scratch;
 +            fjptrH             = (jnrlistH>=0) ? dvda+jnrH : scratch;
 +            /*                 #endif */
 +            gmx_mm256_increment_8real_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
 +                                                 _mm256_mul_ps(dvdatmp,_mm256_mul_ps(isaj{J},isaj{J})));
 +            /*                 #define INNERFLOPS INNERFLOPS+12 */
 +            /*             #endif */
 +            velec            = _mm256_mul_ps(qq{I}{J},rinv{I}{J});
 +            /*                 #define INNERFLOPS INNERFLOPS+1 */
 +            /*             #if 'Force' in KERNEL_VF */
 +            felec            = _mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(velec,rinv{I}{J}),fgb),rinv{I}{J});
 +            /*                 #define INNERFLOPS INNERFLOPS+3 */
 +            /*             #endif */
 +
 +            /*         #elif KERNEL_ELEC=='Ewald' */
 +            /* EWALD ELECTROSTATICS */
 +            
 +            /* Analytical PME correction */
 +            zeta2            = _mm256_mul_ps(beta2,rsq{I}{J});
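 +            /* zeta2 = (beta*r)^2, the argument of the pmecorrF/pmecorrV approximations below */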
 +            /*             #if 'Force' in KERNEL_VF */
 +            rinv3            = _mm256_mul_ps(rinvsq{I}{J},rinv{I}{J});
 +            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
 +            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
 +            felec            = _mm256_mul_ps(qq{I}{J},felec);
 +            /*                 #define INNERFLOPS INNERFLOPS+31 */
 +            /*             #endif */
 +            /*             #if 'Potential' in KERNEL_VF or KERNEL_MOD_ELEC=='PotentialSwitch' */
 +            pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
 +            pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
 +            /*                 #define INNERFLOPS INNERFLOPS+27       */
 +            /*                 #if KERNEL_MOD_ELEC=='PotentialShift' */
 +            velec            = _mm256_sub_ps(_mm256_sub_ps(rinv{I}{J},sh_ewald),pmecorrV);
 +            /*                     #define INNERFLOPS INNERFLOPS+21 */
 +            /*                 #else */
 +            velec            = _mm256_sub_ps(rinv{I}{J},pmecorrV);
 +            /*                 #endif */
 +            velec            = _mm256_mul_ps(qq{I}{J},velec);
 +            /*             #endif */
 +            
 +            /*         #elif KERNEL_ELEC=='CubicSplineTable' */
 +
 +            /* CUBIC SPLINE TABLE ELECTROSTATICS */
 +            Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
 +                                                  _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
 +            F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
 +                                                  _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
 +            G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
 +                                                  _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
 +            H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
 +                                                  _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
 +            GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
 +            Heps             = _mm256_mul_ps(vfeps,H);
 +            Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
 +            /*             #define INNERFLOPS INNERFLOPS+4 */
 +            /*             #if 'Potential' in KERNEL_VF */
 +            VV               = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
 +            velec            = _mm256_mul_ps(qq{I}{J},VV);
 +            /*                 #define INNERFLOPS INNERFLOPS+3 */
 +            /*             #endif */
 +            /*             #if 'Force' in KERNEL_VF */
 +            FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
 +            felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq{I}{J},FF),_mm256_mul_ps(vftabscale,rinv{I}{J})));
 +            /*                 #define INNERFLOPS INNERFLOPS+7 */
 +            /*             #endif */
 +            /*         #endif */
 +            /*         ## End of check for electrostatics interaction forms */
 +            /*     #endif */
 +            /*     ## END OF ELECTROSTATIC INTERACTION CHECK FOR PAIR I-J */
 +
 +            /*     #if 'vdw' in INTERACTION_FLAGS[I][J] */
 +
 +            /*         #if KERNEL_VDW=='LennardJones' */
 +
 +            /* LENNARD-JONES DISPERSION/REPULSION */
 +
 +            rinvsix          = _mm256_mul_ps(_mm256_mul_ps(rinvsq{I}{J},rinvsq{I}{J}),rinvsq{I}{J});
 +            /*             #define INNERFLOPS INNERFLOPS+2 */
 +            /*             #if 'Potential' in KERNEL_VF or KERNEL_MOD_VDW=='PotentialSwitch' */
 +            vvdw6            = _mm256_mul_ps(c6_{I}{J},rinvsix);
 +            vvdw12           = _mm256_mul_ps(c12_{I}{J},_mm256_mul_ps(rinvsix,rinvsix));
 +            /*                 #define INNERFLOPS INNERFLOPS+3 */
 +            /*                 #if KERNEL_MOD_VDW=='PotentialShift' */
 +            vvdw             = _mm256_sub_ps(_mm256_mul_ps( _mm256_sub_ps(vvdw12 , _mm256_mul_ps(c12_{I}{J},_mm256_mul_ps(sh_vdw_invrcut6,sh_vdw_invrcut6))), one_twelfth) ,
 +                                          _mm256_mul_ps( _mm256_sub_ps(vvdw6,_mm256_mul_ps(c6_{I}{J},sh_vdw_invrcut6)),one_sixth));
 +            /*                     #define INNERFLOPS INNERFLOPS+8 */
 +            /*                 #else */
 +            vvdw             = _mm256_sub_ps( _mm256_mul_ps(vvdw12,one_twelfth) , _mm256_mul_ps(vvdw6,one_sixth) );
 +            /*                     #define INNERFLOPS INNERFLOPS+3 */
 +            /*                 #endif */
 +            /*                 ## Check for force inside potential check, i.e. this means we already did the potential part */
 +            /*                 #if 'Force' in KERNEL_VF */
 +            fvdw             = _mm256_mul_ps(_mm256_sub_ps(vvdw12,vvdw6),rinvsq{I}{J});
 +            /*                     #define INNERFLOPS INNERFLOPS+2 */
 +            /*                 #endif */
 +            /*             #elif KERNEL_VF=='Force' */
 +            /*                 ## Force-only LennardJones makes it possible to save 1 flop (they do add up...) */
 +            fvdw             = _mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(c12_{I}{J},rinvsix),c6_{I}{J}),_mm256_mul_ps(rinvsix,rinvsq{I}{J}));
 +            /*                 #define INNERFLOPS INNERFLOPS+4 */
 +            /*             #endif */
 +
 +            /*         #elif KERNEL_VDW=='CubicSplineTable' */
 +
 +            /* CUBIC SPLINE TABLE DISPERSION */
 +            /*             #if 'Table' in KERNEL_ELEC */
 +            vfitab_lo        = _mm_add_epi32(vfitab_lo,ifour);
 +            vfitab_hi        = _mm_add_epi32(vfitab_hi,ifour);
 +            /*             #endif                     */
 +            Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
 +                                                  _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
 +            F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
 +                                                  _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
 +            G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
 +                                                  _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
 +            H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
 +                                                  _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
 +            GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
 +            Heps             = _mm256_mul_ps(vfeps,H);
 +            Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
 +            /*             #define INNERFLOPS INNERFLOPS+4 */
 +            /*             #if 'Potential' in KERNEL_VF */
 +            VV               = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
 +            vvdw6            = _mm256_mul_ps(c6_{I}{J},VV);
 +            /*                 #define INNERFLOPS INNERFLOPS+3 */
 +            /*             #endif */
 +            /*             #if 'Force' in KERNEL_VF */
 +            FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
 +            fvdw6            = _mm256_mul_ps(c6_{I}{J},FF);
 +            /*                 #define INNERFLOPS INNERFLOPS+4 */
 +            /*             #endif */
 +
 +            /* CUBIC SPLINE TABLE REPULSION */
 +            vfitab_lo        = _mm_add_epi32(vfitab_lo,ifour);
 +            vfitab_hi        = _mm_add_epi32(vfitab_hi,ifour);
 +            Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
 +                                                  _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
 +            F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
 +                                                  _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
 +            G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
 +                                                  _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
 +            H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
 +                                                  _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
 +            GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
 +            Heps             = _mm256_mul_ps(vfeps,H);
 +            Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
 +            /*             #define INNERFLOPS INNERFLOPS+4 */
 +            /*             #if 'Potential' in KERNEL_VF */
 +            VV               = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
 +            vvdw12           = _mm256_mul_ps(c12_{I}{J},VV);
 +            /*                 #define INNERFLOPS INNERFLOPS+3 */
 +            /*             #endif */
 +            /*             #if 'Force' in KERNEL_VF */
 +            FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
 +            fvdw12           = _mm256_mul_ps(c12_{I}{J},FF);
 +            /*                 #define INNERFLOPS INNERFLOPS+5 */
 +            /*             #endif */
 +            /*             #if 'Potential' in KERNEL_VF */
 +            vvdw             = _mm256_add_ps(vvdw12,vvdw6);
 +            /*                 #define INNERFLOPS INNERFLOPS+1 */
 +            /*             #endif */
 +            /*             #if 'Force' in KERNEL_VF */
 +            fvdw             = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_add_ps(fvdw6,fvdw12),_mm256_mul_ps(vftabscale,rinv{I}{J})));
 +            /*                 #define INNERFLOPS INNERFLOPS+4 */
 +            /*             #endif */
 +            /*         #endif */
 +            /*         ## End of check for vdw interaction forms */
 +            /*     #endif */
 +            /*     ## END OF VDW INTERACTION CHECK FOR PAIR I-J */
 +
 +            /*     #if 'switch' in INTERACTION_FLAGS[I][J] */
 +            d                = _mm256_sub_ps(r{I}{J},rswitch);
 +            d                = _mm256_max_ps(d,_mm256_setzero_ps());
 +            d2               = _mm256_mul_ps(d,d);
 +            sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
 +            /*         #define INNERFLOPS INNERFLOPS+10 */
 +
 +            /*         #if 'Force' in KERNEL_VF */
 +            dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
 +            /*             #define INNERFLOPS INNERFLOPS+5 */
 +            /*         #endif */
 +
 +            /* Evaluate switch function */
 +            /*         #if 'Force' in KERNEL_VF */
 +            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
 +            /*             #if 'electrostatics' in INTERACTION_FLAGS[I][J] and KERNEL_MOD_ELEC=='PotentialSwitch' */
 +            felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv{I}{J},_mm256_mul_ps(velec,dsw)) );
 +            /*                 #define INNERFLOPS INNERFLOPS+4 */
 +            /*             #endif */
 +            /*             #if 'vdw' in INTERACTION_FLAGS[I][J] and KERNEL_MOD_VDW=='PotentialSwitch' */
 +            fvdw             = _mm256_sub_ps( _mm256_mul_ps(fvdw,sw) , _mm256_mul_ps(rinv{I}{J},_mm256_mul_ps(vvdw,dsw)) );
 +            /*                 #define INNERFLOPS INNERFLOPS+4 */
 +            /*             #endif */
 +            /*         #endif */
 +            /*         #if 'Potential' in KERNEL_VF */
 +            /*             #if 'electrostatics' in INTERACTION_FLAGS[I][J] and KERNEL_MOD_ELEC=='PotentialSwitch' */
 +            velec            = _mm256_mul_ps(velec,sw);
 +            /*                 #define INNERFLOPS INNERFLOPS+1 */
 +            /*             #endif */
 +            /*             #if 'vdw' in INTERACTION_FLAGS[I][J] and KERNEL_MOD_VDW=='PotentialSwitch' */
 +            vvdw             = _mm256_mul_ps(vvdw,sw);
 +            /*                 #define INNERFLOPS INNERFLOPS+1 */
 +            /*             #endif */
 +            /*         #endif */
 +            /*     #endif */
 +            /*     ## Note special check for TIP4P-TIP4P. Since we are cutting off all hydrogen interactions, we also cut the LJ-only O-O interaction */
 +            /*     #if 'exactcutoff' in INTERACTION_FLAGS[I][J] or (GEOMETRY_I=='Water4' and GEOMETRY_J=='Water4' and 'exactcutoff' in INTERACTION_FLAGS[1][1]) */
 +            cutoff_mask      = _mm256_cmp_ps(rsq{I}{J},rcutoff2,_CMP_LT_OQ);
 +            /*         #define INNERFLOPS INNERFLOPS+1 */
 +            /*     #endif */
 +
 +            /*     #if 'Potential' in KERNEL_VF */
 +            /* Update potential sum for this i atom from the interaction with this j atom. */
 +            /*         #if 'electrostatics' in INTERACTION_FLAGS[I][J] */
 +            /*             #if 'exactcutoff' in INTERACTION_FLAGS[I][J] */
 +            velec            = _mm256_and_ps(velec,cutoff_mask);
 +            /*                 #define INNERFLOPS INNERFLOPS+1 */
 +            /*             #endif                                       */
 +            /*             #if ROUND == 'Epilogue' */
 +            velec            = _mm256_andnot_ps(dummy_mask,velec);
 +            /*             #endif */
 +            velecsum         = _mm256_add_ps(velecsum,velec);
 +            /*             #define INNERFLOPS INNERFLOPS+1 */
 +            /*             #if KERNEL_ELEC=='GeneralizedBorn' */
 +            /*             #if 'exactcutoff' in INTERACTION_FLAGS[I][J] */
 +            vgb              = _mm256_and_ps(vgb,cutoff_mask);
 +            /*                 #define INNERFLOPS INNERFLOPS+1 */
 +            /*             #endif                                       */
 +            /*             #if ROUND == 'Epilogue' */
 +            vgb              = _mm256_andnot_ps(dummy_mask,vgb);
 +            /*             #endif */
 +            vgbsum           = _mm256_add_ps(vgbsum,vgb);
 +            /*                 #define INNERFLOPS INNERFLOPS+1 */
 +            /*             #endif */
 +            /*         #endif */
 +            /*         #if 'vdw' in INTERACTION_FLAGS[I][J] */
 +            /*     ## Note special check for TIP4P-TIP4P. Since we are cutting off all hydrogen interactions, we also cut the LJ-only O-O interaction */
 +            /*     #if 'exactcutoff' in INTERACTION_FLAGS[I][J] or (GEOMETRY_I=='Water4' and GEOMETRY_J=='Water4' and 'exactcutoff' in INTERACTION_FLAGS[1][1]) */
 +            vvdw             = _mm256_and_ps(vvdw,cutoff_mask);
 +            /*                 #define INNERFLOPS INNERFLOPS+1 */
 +            /*             #endif                                       */
 +            /*             #if ROUND == 'Epilogue' */
 +            vvdw             = _mm256_andnot_ps(dummy_mask,vvdw);
 +            /*             #endif */
 +            vvdwsum          = _mm256_add_ps(vvdwsum,vvdw);
 +            /*             #define INNERFLOPS INNERFLOPS+1 */
 +            /*         #endif */
 +            /*     #endif */
 +
 +            /*     #if 'Force' in KERNEL_VF */
 +
 +            /*         #if 'electrostatics' in INTERACTION_FLAGS[I][J] and 'vdw' in INTERACTION_FLAGS[I][J] */
 +            fscal            = _mm256_add_ps(felec,fvdw);
 +            /*             #define INNERFLOPS INNERFLOPS+1 */
 +            /*         #elif 'electrostatics' in INTERACTION_FLAGS[I][J] */
 +            fscal            = felec;
 +            /*         #elif 'vdw' in INTERACTION_FLAGS[I][J] */
 +            fscal            = fvdw;
 +            /*        #endif */
 +
 +            /*     ## Note special check for TIP4P-TIP4P. Since we are cutting off all hydrogen interactions, we also cut the LJ-only O-O interaction */
 +            /*     #if 'exactcutoff' in INTERACTION_FLAGS[I][J] or (GEOMETRY_I=='Water4' and GEOMETRY_J=='Water4' and 'exactcutoff' in INTERACTION_FLAGS[1][1]) */
 +            fscal            = _mm256_and_ps(fscal,cutoff_mask);
 +            /*                 #define INNERFLOPS INNERFLOPS+1 */
 +            /*             #endif                                       */
 +
 +            /*             #if ROUND == 'Epilogue' */
 +            fscal            = _mm256_andnot_ps(dummy_mask,fscal);
 +            /*             #endif */
 +
 +            /* Calculate temporary vectorial force */
 +            tx               = _mm256_mul_ps(fscal,dx{I}{J});
 +            ty               = _mm256_mul_ps(fscal,dy{I}{J});
 +            tz               = _mm256_mul_ps(fscal,dz{I}{J});
 +
 +            /* Update vectorial force */
 +            fix{I}             = _mm256_add_ps(fix{I},tx);
 +            fiy{I}             = _mm256_add_ps(fiy{I},ty);
 +            fiz{I}             = _mm256_add_ps(fiz{I},tz);
 +            /*             #define INNERFLOPS INNERFLOPS+6 */
 +
 +            /* #if GEOMETRY_I == 'Particle'             */
 +            /*     #if ROUND == 'Loop' */
 +            fjptrA             = f+j_coord_offsetA;
 +            fjptrB             = f+j_coord_offsetB;
 +            fjptrC             = f+j_coord_offsetC;
 +            fjptrD             = f+j_coord_offsetD;
 +            fjptrE             = f+j_coord_offsetE;
 +            fjptrF             = f+j_coord_offsetF;
 +            fjptrG             = f+j_coord_offsetG;
 +            fjptrH             = f+j_coord_offsetH;
 +            /*     #else */
 +            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
 +            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
 +            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
 +            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
 +            fjptrE             = (jnrlistE>=0) ? f+j_coord_offsetE : scratch;
 +            fjptrF             = (jnrlistF>=0) ? f+j_coord_offsetF : scratch;
 +            fjptrG             = (jnrlistG>=0) ? f+j_coord_offsetG : scratch;
 +            fjptrH             = (jnrlistH>=0) ? f+j_coord_offsetH : scratch;
 +            /*     #endif */
 +            gmx_mm256_decrement_1rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,tx,ty,tz);
 +            /*     #define INNERFLOPS INNERFLOPS+3      */
 +            /* #else                                    */
 +            fjx{J}             = _mm256_add_ps(fjx{J},tx);
 +            fjy{J}             = _mm256_add_ps(fjy{J},ty);
 +            fjz{J}             = _mm256_add_ps(fjz{J},tz);
 +            /*     #define INNERFLOPS INNERFLOPS+3      */
 +            /* #endif                                   */
 +
 +            /*     #endif */
 +
 +            /*     ## Note special check for TIP4P-TIP4P. Since we are cutting off all hydrogen interactions, we also cut the LJ-only O-O interaction */
 +            /*     #if 'exactcutoff' in INTERACTION_FLAGS[I][J] or (GEOMETRY_I=='Water4' and GEOMETRY_J=='Water4' and 'exactcutoff' in INTERACTION_FLAGS[1][1]) */
 +            /*         #if 0    ## This and the next two lines are a hack to maintain indentation in the template file */
 +            {
 +                /*     #endif */
 +            }
 +            /*     #endif */
 +            /*    ## End of check for the interaction being outside the cutoff */
 +
 +            /* #endfor */
 +            /* ## End of loop over i-j interaction pairs */
 +
 +            /* #if GEOMETRY_I != 'Particle' */
 +            /*     #if ROUND == 'Loop' */
 +            fjptrA             = f+j_coord_offsetA;
 +            fjptrB             = f+j_coord_offsetB;
 +            fjptrC             = f+j_coord_offsetC;
 +            fjptrD             = f+j_coord_offsetD;
 +            fjptrE             = f+j_coord_offsetE;
 +            fjptrF             = f+j_coord_offsetF;
 +            fjptrG             = f+j_coord_offsetG;
 +            fjptrH             = f+j_coord_offsetH;
 +            /*     #else */
 +            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
 +            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
 +            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
 +            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
 +            fjptrE             = (jnrlistE>=0) ? f+j_coord_offsetE : scratch;
 +            fjptrF             = (jnrlistF>=0) ? f+j_coord_offsetF : scratch;
 +            fjptrG             = (jnrlistG>=0) ? f+j_coord_offsetG : scratch;
 +            fjptrH             = (jnrlistH>=0) ? f+j_coord_offsetH : scratch;
 +            /*     #endif */
 +            /* #endif */
 +
 +            /* #if 'Water' in GEOMETRY_I and GEOMETRY_J == 'Particle' */
 +            gmx_mm256_decrement_1rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,fjx0,fjy0,fjz0);
 +            /*     #define INNERFLOPS INNERFLOPS+3      */
 +            /* #elif GEOMETRY_J == 'Water3'             */
 +            gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
 +                                                      fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
 +            /*     #define INNERFLOPS INNERFLOPS+9      */
 +            /* #elif GEOMETRY_J == 'Water4'             */
 +            /*     #if 0 in PARTICLES_J                 */
 +            gmx_mm256_decrement_4rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
 +                                                      fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
 +                                                      fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
 +            /*     #define INNERFLOPS INNERFLOPS+12     */
 +            /*     #else                                */
 +            gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA+DIM,fjptrB+DIM,fjptrC+DIM,fjptrD+DIM,
 +                                                      fjptrE+DIM,fjptrF+DIM,fjptrG+DIM,fjptrH+DIM,
 +                                                      fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
 +            /*     #define INNERFLOPS INNERFLOPS+9      */
 +            /*     #endif                               */
 +            /* #endif                                   */
 +
 +            /* Inner loop uses {INNERFLOPS} flops */
 +        }
 +
 +        /* #endfor */
 +
 +        /* End of innermost loop */
 +
 +        /* #if 'Force' in KERNEL_VF */
 +        /*     #if GEOMETRY_I == 'Particle'            */
 +        gmx_mm256_update_iforce_1atom_swizzle_ps(fix0,fiy0,fiz0,
 +                                                 f+i_coord_offset,fshift+i_shift_offset);
 +        /*         #define OUTERFLOPS OUTERFLOPS+6     */
 +        /*     #elif GEOMETRY_I == 'Water3'            */
 +        gmx_mm256_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
 +                                                 f+i_coord_offset,fshift+i_shift_offset);
 +        /*         #define OUTERFLOPS OUTERFLOPS+18    */
 +        /*     #elif GEOMETRY_I == 'Water4'            */
 +        /*         #if 0 in PARTICLES_I                */
 +        gmx_mm256_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
 +                                                 f+i_coord_offset,fshift+i_shift_offset);
 +        /*             #define OUTERFLOPS OUTERFLOPS+24    */
 +        /*         #else                               */
 +        gmx_mm256_update_iforce_3atom_swizzle_ps(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
 +                                                 f+i_coord_offset+DIM,fshift+i_shift_offset);
 +        /*             #define OUTERFLOPS OUTERFLOPS+18    */
 +        /*         #endif                              */
 +        /*     #endif                                  */
 +        /* #endif                                      */
 +
 +        /* #if 'Potential' in KERNEL_VF */
 +        ggid                        = gid[iidx];
 +        /* Update potential energies */
 +        /*     #if KERNEL_ELEC != 'None' */
 +        gmx_mm256_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
 +        /*         #define OUTERFLOPS OUTERFLOPS+1 */
 +        /*     #endif */
 +        /*     #if 'GeneralizedBorn' in KERNEL_ELEC */
 +        gmx_mm256_update_1pot_ps(vgbsum,kernel_data->energygrp_polarization+ggid);
 +        /*         #define OUTERFLOPS OUTERFLOPS+1 */
 +        /*     #endif */
 +        /*     #if KERNEL_VDW != 'None' */
 +        gmx_mm256_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
 +        /*         #define OUTERFLOPS OUTERFLOPS+1 */
 +        /*     #endif */
 +        /* #endif */
 +        /*     #if 'GeneralizedBorn' in KERNEL_ELEC and 'Force' in KERNEL_VF */
 +        dvdasum = _mm256_mul_ps(dvdasum, _mm256_mul_ps(isai{I},isai{I}));
 +        gmx_mm256_update_1pot_ps(dvdasum,dvda+inr);
 +        /*     #endif */
 +
 +        /* Increment number of inner iterations */
 +        inneriter                  += j_index_end - j_index_start;
 +
 +        /* Outer loop uses {OUTERFLOPS} flops */
 +    }
 +
 +    /* Increment number of outer iterations */
 +    outeriter        += nri;
 +
 +    /* Update outer/inner flops */
 +    /* ## NB: This is not important, it just affects the flopcount. However, since our preprocessor is */
 +    /* ## primitive and replaces aggressively even in strings inside these directives, we need to      */
 +    /* ## assemble the main part of the name (containing KERNEL/ELEC/VDW) directly in the source.      */
 +    /* #if GEOMETRY_I == 'Water3'            */
 +    /*     #define ISUFFIX '_W3'             */
 +    /* #elif GEOMETRY_I == 'Water4'          */
 +    /*     #define ISUFFIX '_W4'             */
 +    /* #else                                 */
 +    /*     #define ISUFFIX ''                */
 +    /* #endif                                */
 +    /* #if GEOMETRY_J == 'Water3'            */
 +    /*     #define JSUFFIX 'W3'              */
 +    /* #elif GEOMETRY_J == 'Water4'          */
 +    /*     #define JSUFFIX 'W4'              */
 +    /* #else                                 */
 +    /*     #define JSUFFIX ''                */
 +    /* #endif                                */
 +    /* #if 'PotentialAndForce' in KERNEL_VF  */
 +    /*     #define VFSUFFIX  '_VF'           */
 +    /* #elif 'Potential' in KERNEL_VF        */
 +    /*     #define VFSUFFIX '_V'             */
 +    /* #else                                 */
 +    /*     #define VFSUFFIX '_F'             */
 +    /* #endif                                */
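 +    /* ## Example: GEOMETRY_I=='Water3', GEOMETRY_J=='Water3' in a           */
 +    /* ## PotentialAndForce kernel give ISUFFIX='_W3', JSUFFIX='W3' and      */
 +    /* ## VFSUFFIX='_VF', i.e. the _W3W3_VF variant of the counters below.   */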
 +
 +    /* #if KERNEL_ELEC != 'None' and KERNEL_VDW != 'None' */
 +    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW{ISUFFIX}{JSUFFIX}{VFSUFFIX},outeriter*{OUTERFLOPS} + inneriter*{INNERFLOPS});
 +    /* #elif KERNEL_ELEC != 'None' */
 +    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC{ISUFFIX}{JSUFFIX}{VFSUFFIX},outeriter*{OUTERFLOPS} + inneriter*{INNERFLOPS});
 +    /* #else */
 +    inc_nrnb(nrnb,eNR_NBKERNEL_VDW{ISUFFIX}{JSUFFIX}{VFSUFFIX},outeriter*{OUTERFLOPS} + inneriter*{INNERFLOPS});
 +    /* #endif  */
 +}
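
The table lookups in the kernels above all follow the same cubic-spline scheme:
each table point stores four coefficients Y, F, G and H, r*scale selects the
bin, and the fractional remainder eps yields both the potential and the force
factor. A minimal scalar sketch of the scheme (the function name is
illustrative; the per-bin {Y,F,G,H} layout matches what the generated code
loads):

/* Evaluate one cubic-spline table point: tab[4*i..4*i+3] holds {Y,F,G,H} for
 * bin i. Returns the potential VV; *ff receives the derivative factor FF,
 * which the kernels still multiply by the table scale and rinv (and negate
 * where needed) to obtain the radial force. */
static double spline_table_lookup(const double *tab, double scale, double r,
                                  double *ff)
{
    double        rt   = r*scale;
    int           idx  = (int)rt;       /* truncate to bin index        */
    double        eps  = rt - idx;      /* fractional offset inside bin */
    const double *p    = tab + 4*idx;   /* Y,F,G,H for this bin         */
    double        Heps = eps*p[3];
    double        Fp   = p[1] + eps*(p[2] + Heps);  /* F + eps*(G + H*eps) */

    *ff = Fp + eps*(p[2] + 2.0*Heps);   /* FF, as in the kernels        */
    return p[0] + eps*Fp;               /* VV = Y + eps*Fp              */
}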
index 23ba2990df4d02737f0ab28b357671bdefa8fe8d,0000000000000000000000000000000000000000..bddc5873d4eba690cd3369d5cf54c22f8c7312f0
mode 100644,000000..100644
--- /dev/null
@@@ -1,487 -1,0 +1,490 @@@
 +/*
 + * Note: this file was generated by the Gromacs c kernel generator.
 + *
 + *                This source code is part of
 + *
 + *                 G   R   O   M   A   C   S
 + *
 + * Copyright (c) 2001-2012, The GROMACS Development Team
 + *
 + * Gromacs is a library for molecular simulation and trajectory analysis,
 + * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
 + * a full list of developers and information, check out http://www.gromacs.org
 + *
 + * This program is free software; you can redistribute it and/or modify it under
 + * the terms of the GNU Lesser General Public License as published by the Free
 + * Software Foundation; either version 2 of the License, or (at your option) any
 + * later version.
 + *
 + * To help fund GROMACS development, we humbly ask that you cite
 + * the papers people have written on it - you can find them on the website.
 + */
 +#ifdef HAVE_CONFIG_H
 +#include <config.h>
 +#endif
 +
 +#include <math.h>
 +
 +#include "../nb_kernel.h"
 +#include "types/simple.h"
 +#include "vec.h"
 +#include "nrnb.h"
 +
 +/*
 + * Gromacs nonbonded kernel:   nb_kernel_ElecGB_VdwLJ_GeomP1P1_VF_c
 + * Electrostatics interaction: GeneralizedBorn
 + * VdW interaction:            LennardJones
 + * Geometry:                   Particle-Particle
 + * Calculate force/pot:        PotentialAndForce
 + */
 +void
 +nb_kernel_ElecGB_VdwLJ_GeomP1P1_VF_c
 +                    (t_nblist * gmx_restrict                nlist,
 +                     rvec * gmx_restrict                    xx,
 +                     rvec * gmx_restrict                    ff,
 +                     t_forcerec * gmx_restrict              fr,
 +                     t_mdatoms * gmx_restrict               mdatoms,
 +                     nb_kernel_data_t * gmx_restrict        kernel_data,
 +                     t_nrnb * gmx_restrict                  nrnb)
 +{
 +    int              i_shift_offset,i_coord_offset,j_coord_offset;
 +    int              j_index_start,j_index_end;
 +    int              nri,inr,ggid,iidx,jidx,jnr,outeriter,inneriter;
 +    real             shX,shY,shZ,tx,ty,tz,fscal,rcutoff,rcutoff2;
 +    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
 +    real             *shiftvec,*fshift,*x,*f;
 +    int              vdwioffset0;
 +    real             ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
 +    int              vdwjidx0;
 +    real             jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
 +    real             dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00,cexp1_00,cexp2_00;
 +    real             velec,felec,velecsum,facel,crf,krf,krf2;
 +    real             *charge;
 +    int              gbitab;
 +    real             vgb,fgb,vgbsum,dvdasum,gbscale,gbtabscale,isaprod,gbqqfactor,gbinvepsdiff,dvdaj,gbeps,dvdatmp;
 +    real             *invsqrta,*dvda,*gbtab;
 +    int              nvdwtype;
 +    real             rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,br,vvdwexp,sh_vdw_invrcut6;
 +    int              *vdwtype;
 +    real             *vdwparam;
 +    int              vfitab;
 +    real             rt,vfeps,vftabscale,Y,F,Geps,Heps2,Fp,VV,FF;
 +    real             *vftab;
 +
 +    x                = xx[0];
 +    f                = ff[0];
 +
 +    nri              = nlist->nri;
 +    iinr             = nlist->iinr;
 +    jindex           = nlist->jindex;
 +    jjnr             = nlist->jjnr;
 +    shiftidx         = nlist->shift;
 +    gid              = nlist->gid;
 +    shiftvec         = fr->shift_vec[0];
 +    fshift           = fr->fshift[0];
 +    facel            = fr->epsfac;
 +    charge           = mdatoms->chargeA;
 +    nvdwtype         = fr->ntype;
 +    vdwparam         = fr->nbfp;
 +    vdwtype          = mdatoms->typeA;
 +
 +    invsqrta         = fr->invsqrta;
 +    dvda             = fr->dvda;
 +    gbtabscale       = fr->gbtab.scale;
 +    gbtab            = fr->gbtab.data;
 +    gbinvepsdiff     = (1.0/fr->epsilon_r) - (1.0/fr->gb_epsilon_solvent);
 +
 +    outeriter        = 0;
 +    inneriter        = 0;
 +
 +    /* Start outer loop over neighborlists */
 +    for(iidx=0; iidx<nri; iidx++)
 +    {
 +        /* Load shift vector for this list */
 +        i_shift_offset   = DIM*shiftidx[iidx];
 +        shX              = shiftvec[i_shift_offset+XX];
 +        shY              = shiftvec[i_shift_offset+YY];
 +        shZ              = shiftvec[i_shift_offset+ZZ];
 +
 +        /* Load limits for loop over neighbors */
 +        j_index_start    = jindex[iidx];
 +        j_index_end      = jindex[iidx+1];
 +
 +        /* Get outer coordinate index */
 +        inr              = iinr[iidx];
 +        i_coord_offset   = DIM*inr;
 +
 +        /* Load i particle coords and add shift vector */
 +        ix0              = shX + x[i_coord_offset+DIM*0+XX];
 +        iy0              = shY + x[i_coord_offset+DIM*0+YY];
 +        iz0              = shZ + x[i_coord_offset+DIM*0+ZZ];
 +
 +        fix0             = 0.0;
 +        fiy0             = 0.0;
 +        fiz0             = 0.0;
 +
 +        /* Load parameters for i particles */
 +        iq0              = facel*charge[inr+0];
 +        isai0            = invsqrta[inr+0];
 +        vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
 +
 +        /* Reset potential sums */
 +        velecsum         = 0.0;
 +        vgbsum           = 0.0;
 +        vvdwsum          = 0.0;
 +        dvdasum          = 0.0;
++        printf("inr=%d\n",inr);
 +        /* Start inner kernel loop */
 +        for(jidx=j_index_start; jidx<j_index_end; jidx++)
 +        {
 +            /* Get j neighbor index, and coordinate index */
 +            jnr              = jjnr[jidx];
 +            j_coord_offset   = DIM*jnr;
 +
 +            /* load j atom coordinates */
 +            jx0              = x[j_coord_offset+DIM*0+XX];
 +            jy0              = x[j_coord_offset+DIM*0+YY];
 +            jz0              = x[j_coord_offset+DIM*0+ZZ];
 +
 +            /* Calculate displacement vector */
 +            dx00             = ix0 - jx0;
 +            dy00             = iy0 - jy0;
 +            dz00             = iz0 - jz0;
 +
 +            /* Calculate squared distance and things based on it */
 +            rsq00            = dx00*dx00+dy00*dy00+dz00*dz00;
 +
 +            rinv00           = gmx_invsqrt(rsq00);
 +
 +            rinvsq00         = rinv00*rinv00;
 +
 +            /* Load parameters for j particles */
 +            jq0              = charge[jnr+0];
 +            isaj0            = invsqrta[jnr+0];
 +            vdwjidx0         = 2*vdwtype[jnr+0];
 +
 +            /**************************
 +             * CALCULATE INTERACTIONS *
 +             **************************/
 +
 +            r00              = rsq00*rinv00;
 +
 +            qq00             = iq0*jq0;
 +            c6_00            = vdwparam[vdwioffset0+vdwjidx0];
 +            c12_00           = vdwparam[vdwioffset0+vdwjidx0+1];
 +
 +            /* GENERALIZED BORN AND COULOMB ELECTROSTATICS */
 +            isaprod          = isai0*isaj0;
 +            gbqqfactor       = isaprod*(-qq00)*gbinvepsdiff;
 +            gbscale          = isaprod*gbtabscale;
 +            dvdaj            = dvda[jnr+0];
 +
 +            /* Calculate the generalized Born table index - this is a separate table from the normal one,
 +             * but we use the same procedure, multiplying r by the scale and truncating to an integer.
 +             */
 +            rt               = r00*gbscale;
 +            gbitab           = rt;
 +            gbeps            = rt-gbitab;
 +            gbitab           = 4*gbitab;
 +
 +            Y                = gbtab[gbitab];
 +            F                = gbtab[gbitab+1];
 +            Geps             = gbeps*gbtab[gbitab+2];
 +            Heps2            = gbeps*gbeps*gbtab[gbitab+3];
 +            Fp               = F+Geps+Heps2;
 +            VV               = Y+gbeps*Fp;
 +            vgb              = gbqqfactor*VV;
 +
 +            FF               = Fp+Geps+2.0*Heps2;
 +            fgb              = gbqqfactor*FF*gbscale;
++            printf("  jnr=%d  fgb=%g\n",jnr,fgb);
 +            dvdatmp          = -0.5*(vgb+fgb*r00);
 +            dvdasum          = dvdasum + dvdatmp;
++            printf("  dvdatmp=%g\n",dvdatmp);
 +            dvda[jnr]        = dvdaj+dvdatmp*isaj0*isaj0;
++            printf("  dvda, jcontrib=%g\n",dvdatmp*isaj0*isaj0);
 +            velec            = qq00*rinv00;
 +            felec            = (velec*rinv00-fgb)*rinv00;
 +
 +            /* LENNARD-JONES DISPERSION/REPULSION */
 +
 +            rinvsix          = rinvsq00*rinvsq00*rinvsq00;
 +            vvdw6            = c6_00*rinvsix;
 +            vvdw12           = c12_00*rinvsix*rinvsix;
 +            vvdw             = vvdw12*(1.0/12.0) - vvdw6*(1.0/6.0);
 +            fvdw             = (vvdw12-vvdw6)*rinvsq00;
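 +            /* The 1/12 and 1/6 factors assume that c6_00/c12_00 from fr->nbfp
 +             * are premultiplied by 6 and 12; that convention is also what lets
 +             * the force reduce to (vvdw12-vvdw6)*rinvsq00 above.
 +             */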
 +
 +            /* Update potential sums from outer loop */
 +            velecsum        += velec;
 +            vgbsum          += vgb;
 +            vvdwsum         += vvdw;
 +
 +            fscal            = felec+fvdw;
 +
 +            /* Calculate temporary vectorial force */
 +            tx               = fscal*dx00;
 +            ty               = fscal*dy00;
 +            tz               = fscal*dz00;
 +
 +            /* Update vectorial force */
 +            fix0            += tx;
 +            fiy0            += ty;
 +            fiz0            += tz;
 +            f[j_coord_offset+DIM*0+XX] -= tx;
 +            f[j_coord_offset+DIM*0+YY] -= ty;
 +            f[j_coord_offset+DIM*0+ZZ] -= tz;
 +
 +            /* Inner loop uses 71 flops */
 +        }
 +        /* End of innermost loop */
 +
 +        tx = ty = tz = 0;
 +        f[i_coord_offset+DIM*0+XX] += fix0;
 +        f[i_coord_offset+DIM*0+YY] += fiy0;
 +        f[i_coord_offset+DIM*0+ZZ] += fiz0;
 +        tx                         += fix0;
 +        ty                         += fiy0;
 +        tz                         += fiz0;
 +        fshift[i_shift_offset+XX]  += tx;
 +        fshift[i_shift_offset+YY]  += ty;
 +        fshift[i_shift_offset+ZZ]  += tz;
 +
 +        ggid                        = gid[iidx];
 +        /* Update potential energies */
 +        kernel_data->energygrp_elec[ggid] += velecsum;
 +        kernel_data->energygrp_polarization[ggid] += vgbsum;
 +        kernel_data->energygrp_vdw[ggid] += vvdwsum;
 +        dvda[inr]                   = dvda[inr] + dvdasum*isai0*isai0;
 +
 +        /* Increment number of inner iterations */
 +        inneriter                  += j_index_end - j_index_start;
 +
 +        /* Outer loop uses 16 flops */
 +    }
 +
 +    /* Increment number of outer iterations */
 +    outeriter        += nri;
 +
 +    /* Update outer/inner flops */
 +
 +    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_VF,outeriter*16 + inneriter*71);
 +}
 +/*
 + * Gromacs nonbonded kernel:   nb_kernel_ElecGB_VdwLJ_GeomP1P1_F_c
 + * Electrostatics interaction: GeneralizedBorn
 + * VdW interaction:            LennardJones
 + * Geometry:                   Particle-Particle
 + * Calculate force/pot:        Force
 + */
 +void
 +nb_kernel_ElecGB_VdwLJ_GeomP1P1_F_c
 +                    (t_nblist * gmx_restrict                nlist,
 +                     rvec * gmx_restrict                    xx,
 +                     rvec * gmx_restrict                    ff,
 +                     t_forcerec * gmx_restrict              fr,
 +                     t_mdatoms * gmx_restrict               mdatoms,
 +                     nb_kernel_data_t * gmx_restrict        kernel_data,
 +                     t_nrnb * gmx_restrict                  nrnb)
 +{
 +    int              i_shift_offset,i_coord_offset,j_coord_offset;
 +    int              j_index_start,j_index_end;
 +    int              nri,inr,ggid,iidx,jidx,jnr,outeriter,inneriter;
 +    real             shX,shY,shZ,tx,ty,tz,fscal,rcutoff,rcutoff2;
 +    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
 +    real             *shiftvec,*fshift,*x,*f;
 +    int              vdwioffset0;
 +    real             ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
 +    int              vdwjidx0;
 +    real             jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
 +    real             dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00,cexp1_00,cexp2_00;
 +    real             velec,felec,velecsum,facel,crf,krf,krf2;
 +    real             *charge;
 +    int              gbitab;
 +    real             vgb,fgb,vgbsum,dvdasum,gbscale,gbtabscale,isaprod,gbqqfactor,gbinvepsdiff,dvdaj,gbeps,dvdatmp;
 +    real             *invsqrta,*dvda,*gbtab;
 +    int              nvdwtype;
 +    real             rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,br,vvdwexp,sh_vdw_invrcut6;
 +    int              *vdwtype;
 +    real             *vdwparam;
 +    int              vfitab;
 +    real             rt,vfeps,vftabscale,Y,F,Geps,Heps2,Fp,VV,FF;
 +    real             *vftab;
 +
 +    x                = xx[0];
 +    f                = ff[0];
 +
 +    nri              = nlist->nri;
 +    iinr             = nlist->iinr;
 +    jindex           = nlist->jindex;
 +    jjnr             = nlist->jjnr;
 +    shiftidx         = nlist->shift;
 +    gid              = nlist->gid;
 +    shiftvec         = fr->shift_vec[0];
 +    fshift           = fr->fshift[0];
 +    facel            = fr->epsfac;
 +    charge           = mdatoms->chargeA;
 +    nvdwtype         = fr->ntype;
 +    vdwparam         = fr->nbfp;
 +    vdwtype          = mdatoms->typeA;
 +
 +    invsqrta         = fr->invsqrta;
 +    dvda             = fr->dvda;
 +    gbtabscale       = fr->gbtab.scale;
 +    gbtab            = fr->gbtab.data;
 +    gbinvepsdiff     = (1.0/fr->epsilon_r) - (1.0/fr->gb_epsilon_solvent);
 +
 +    outeriter        = 0;
 +    inneriter        = 0;
 +
 +    /* Start outer loop over neighborlists */
 +    for(iidx=0; iidx<nri; iidx++)
 +    {
 +        /* Load shift vector for this list */
 +        i_shift_offset   = DIM*shiftidx[iidx];
 +        shX              = shiftvec[i_shift_offset+XX];
 +        shY              = shiftvec[i_shift_offset+YY];
 +        shZ              = shiftvec[i_shift_offset+ZZ];
 +
 +        /* Load limits for loop over neighbors */
 +        j_index_start    = jindex[iidx];
 +        j_index_end      = jindex[iidx+1];
 +
 +        /* Get outer coordinate index */
 +        inr              = iinr[iidx];
 +        i_coord_offset   = DIM*inr;
 +
 +        /* Load i particle coords and add shift vector */
 +        ix0              = shX + x[i_coord_offset+DIM*0+XX];
 +        iy0              = shY + x[i_coord_offset+DIM*0+YY];
 +        iz0              = shZ + x[i_coord_offset+DIM*0+ZZ];
 +
 +        fix0             = 0.0;
 +        fiy0             = 0.0;
 +        fiz0             = 0.0;
 +
 +        /* Load parameters for i particles */
 +        iq0              = facel*charge[inr+0];
 +        isai0            = invsqrta[inr+0];
 +        vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
 +
 +        dvdasum          = 0.0;
 +
 +        /* Start inner kernel loop */
 +        for(jidx=j_index_start; jidx<j_index_end; jidx++)
 +        {
 +            /* Get j neighbor index, and coordinate index */
 +            jnr              = jjnr[jidx];
 +            j_coord_offset   = DIM*jnr;
 +
 +            /* load j atom coordinates */
 +            jx0              = x[j_coord_offset+DIM*0+XX];
 +            jy0              = x[j_coord_offset+DIM*0+YY];
 +            jz0              = x[j_coord_offset+DIM*0+ZZ];
 +
 +            /* Calculate displacement vector */
 +            dx00             = ix0 - jx0;
 +            dy00             = iy0 - jy0;
 +            dz00             = iz0 - jz0;
 +
 +            /* Calculate squared distance and things based on it */
 +            rsq00            = dx00*dx00+dy00*dy00+dz00*dz00;
 +
 +            rinv00           = gmx_invsqrt(rsq00);
 +
 +            rinvsq00         = rinv00*rinv00;
 +
 +            /* Load parameters for j particles */
 +            jq0              = charge[jnr+0];
 +            isaj0            = invsqrta[jnr+0];
 +            vdwjidx0         = 2*vdwtype[jnr+0];
 +
 +            /**************************
 +             * CALCULATE INTERACTIONS *
 +             **************************/
 +
 +            r00              = rsq00*rinv00;
 +
 +            qq00             = iq0*jq0;
 +            c6_00            = vdwparam[vdwioffset0+vdwjidx0];
 +            c12_00           = vdwparam[vdwioffset0+vdwjidx0+1];
 +
 +            /* GENERALIZED BORN AND COULOMB ELECTROSTATICS */
 +            isaprod          = isai0*isaj0;
 +            gbqqfactor       = isaprod*(-qq00)*gbinvepsdiff;
 +            gbscale          = isaprod*gbtabscale;
 +            dvdaj            = dvda[jnr+0];
 +
 +            /* Calculate the generalized Born table index - this is a separate table from the normal one,
 +             * but we use the same procedure, multiplying r by the scale and truncating to an integer.
 +             */
 +            rt               = r00*gbscale;
 +            gbitab           = rt;
 +            gbeps            = rt-gbitab;
 +            gbitab           = 4*gbitab;
 +
 +            Y                = gbtab[gbitab];
 +            F                = gbtab[gbitab+1];
 +            Geps             = gbeps*gbtab[gbitab+2];
 +            Heps2            = gbeps*gbeps*gbtab[gbitab+3];
 +            Fp               = F+Geps+Heps2;
 +            VV               = Y+gbeps*Fp;
 +            vgb              = gbqqfactor*VV;
 +
 +            FF               = Fp+Geps+2.0*Heps2;
 +            fgb              = gbqqfactor*FF*gbscale;
 +            dvdatmp          = -0.5*(vgb+fgb*r00);
 +            dvdasum          = dvdasum + dvdatmp;
 +            dvda[jnr]        = dvdaj+dvdatmp*isaj0*isaj0;
 +            velec            = qq00*rinv00;
 +            felec            = (velec*rinv00-fgb)*rinv00;
 +
 +            /* LENNARD-JONES DISPERSION/REPULSION */
 +
 +            rinvsix          = rinvsq00*rinvsq00*rinvsq00;
 +            fvdw             = (c12_00*rinvsix-c6_00)*rinvsix*rinvsq00;
 +
 +            fscal            = felec+fvdw;
 +
 +            /* Calculate temporary vectorial force */
 +            tx               = fscal*dx00;
 +            ty               = fscal*dy00;
 +            tz               = fscal*dz00;
 +
 +            /* Update vectorial force */
 +            fix0            += tx;
 +            fiy0            += ty;
 +            fiz0            += tz;
 +            f[j_coord_offset+DIM*0+XX] -= tx;
 +            f[j_coord_offset+DIM*0+YY] -= ty;
 +            f[j_coord_offset+DIM*0+ZZ] -= tz;
 +
 +            /* Inner loop uses 64 flops */
 +        }
 +        /* End of innermost loop */
 +
 +        tx = ty = tz = 0;
 +        f[i_coord_offset+DIM*0+XX] += fix0;
 +        f[i_coord_offset+DIM*0+YY] += fiy0;
 +        f[i_coord_offset+DIM*0+ZZ] += fiz0;
 +        tx                         += fix0;
 +        ty                         += fiy0;
 +        tz                         += fiz0;
 +        fshift[i_shift_offset+XX]  += tx;
 +        fshift[i_shift_offset+YY]  += ty;
 +        fshift[i_shift_offset+ZZ]  += tz;
 +
 +        dvda[inr]                   = dvda[inr] + dvdasum*isai0*isai0;
 +
 +        /* Increment number of inner iterations */
 +        inneriter                  += j_index_end - j_index_start;
 +
 +        /* Outer loop uses 13 flops */
 +    }
 +
 +    /* Increment number of outer iterations */
 +    outeriter        += nri;
 +
 +    /* Update outer/inner flops */
 +
 +    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_F,outeriter*13 + inneriter*64);
 +}
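
The force-only kernel above saves a multiply per pair in the Lennard-Jones
part (the template's "save 1 flop" note): assuming the premultiplied c6/c12
convention, (vvdw12-vvdw6)*rinvsq00 and (c12_00*rinvsix-c6_00)*rinvsix*rinvsq00
are algebraically identical. A throwaway check of the two forms:

#include <stdio.h>

/* Compare the VF-kernel and F-kernel formulations of the LJ force factor;
 * c6_00, c12_00 and rinvsq00 are named as in the kernels, values arbitrary. */
int main(void)
{
    double c6_00 = 3.2, c12_00 = 7.9, rinvsq00 = 0.64;
    double rinvsix = rinvsq00*rinvsq00*rinvsq00;

    double vvdw6   = c6_00*rinvsix;
    double vvdw12  = c12_00*rinvsix*rinvsix;
    double fvdw_vf = (vvdw12-vvdw6)*rinvsq00;                  /* VF kernel */
    double fvdw_f  = (c12_00*rinvsix-c6_00)*rinvsix*rinvsq00;  /* F kernel  */

    printf("%g %g\n", fvdw_vf, fvdw_f);    /* prints the same value twice */
    return 0;
}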
index a18d1c18495a1a7b68f3c5fc70515194a566fa7c,0000000000000000000000000000000000000000..d0aa93072ab6a152315ae762952e87eb3a7d6015
mode 100644,000000..100644
--- /dev/null
@@@ -1,823 -1,0 +1,825 @@@
 +/*
 + * Note: this file was generated by the Gromacs sse2_double kernel generator.
 + *
 + *                This source code is part of
 + *
 + *                 G   R   O   M   A   C   S
 + *
 + * Copyright (c) 2001-2012, The GROMACS Development Team
 + *
 + * Gromacs is a library for molecular simulation and trajectory analysis,
 + * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
 + * a full list of developers and information, check out http://www.gromacs.org
 + *
 + * This program is free software; you can redistribute it and/or modify it under
 + * the terms of the GNU Lesser General Public License as published by the Free
 + * Software Foundation; either version 2 of the License, or (at your option) any
 + * later version.
 + *
 + * To help fund GROMACS development, we humbly ask that you cite
 + * the papers people have written on it - you can find them on the website.
 + */
 +#ifdef HAVE_CONFIG_H
 +#include <config.h>
 +#endif
 +
 +#include <math.h>
 +
 +#include "../nb_kernel.h"
 +#include "types/simple.h"
 +#include "vec.h"
 +#include "nrnb.h"
 +
 +#include "gmx_math_x86_sse2_double.h"
 +#include "kernelutil_x86_sse2_double.h"
 +
 +/*
 + * Gromacs nonbonded kernel:   nb_kernel_ElecGB_VdwCSTab_GeomP1P1_VF_sse2_double
 + * Electrostatics interaction: GeneralizedBorn
 + * VdW interaction:            CubicSplineTable
 + * Geometry:                   Particle-Particle
 + * Calculate force/pot:        PotentialAndForce
 + */
 +void
 +nb_kernel_ElecGB_VdwCSTab_GeomP1P1_VF_sse2_double
 +                    (t_nblist * gmx_restrict                nlist,
 +                     rvec * gmx_restrict                    xx,
 +                     rvec * gmx_restrict                    ff,
 +                     t_forcerec * gmx_restrict              fr,
 +                     t_mdatoms * gmx_restrict               mdatoms,
 +                     nb_kernel_data_t * gmx_restrict        kernel_data,
 +                     t_nrnb * gmx_restrict                  nrnb)
 +{
 +    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
 +     * just 0 for non-waters.
 +     * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
 +     * jnr indices corresponding to data put in the two positions in the SIMD register.
 +     */
 +    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
 +    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
 +    int              jnrA,jnrB;
 +    int              j_coord_offsetA,j_coord_offsetB;
 +    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
 +    real             rcutoff_scalar;
 +    real             *shiftvec,*fshift,*x,*f;
 +    __m128d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
 +    int              vdwioffset0;
 +    __m128d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
 +    int              vdwjidx0A,vdwjidx0B;
 +    __m128d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
 +    __m128d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
 +    __m128d          velec,felec,velecsum,facel,crf,krf,krf2;
 +    real             *charge;
 +    __m128i          gbitab;
 +    __m128d          vgb,fgb,vgbsum,dvdasum,gbscale,gbtabscale,isaprod,gbqqfactor,gbinvepsdiff,dvdaj,gbeps,dvdatmp;
 +    __m128d          minushalf = _mm_set1_pd(-0.5);
 +    real             *invsqrta,*dvda,*gbtab;
 +    int              nvdwtype;
 +    __m128d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
 +    int              *vdwtype;
 +    real             *vdwparam;
 +    __m128d          one_sixth   = _mm_set1_pd(1.0/6.0);
 +    __m128d          one_twelfth = _mm_set1_pd(1.0/12.0);
 +    __m128i          vfitab;
 +    __m128i          ifour       = _mm_set1_epi32(4);
 +    __m128d          rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
 +    real             *vftab;
 +    __m128d          dummy_mask,cutoff_mask;
 +    __m128d          signbit   = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
 +    __m128d          one     = _mm_set1_pd(1.0);
 +    __m128d          two     = _mm_set1_pd(2.0);
 +    x                = xx[0];
 +    f                = ff[0];
 +
 +    nri              = nlist->nri;
 +    iinr             = nlist->iinr;
 +    jindex           = nlist->jindex;
 +    jjnr             = nlist->jjnr;
 +    shiftidx         = nlist->shift;
 +    gid              = nlist->gid;
 +    shiftvec         = fr->shift_vec[0];
 +    fshift           = fr->fshift[0];
 +    facel            = _mm_set1_pd(fr->epsfac);
 +    charge           = mdatoms->chargeA;
 +    nvdwtype         = fr->ntype;
 +    vdwparam         = fr->nbfp;
 +    vdwtype          = mdatoms->typeA;
 +
 +    vftab            = kernel_data->table_vdw->data;
 +    vftabscale       = _mm_set1_pd(kernel_data->table_vdw->scale);
 +
 +    invsqrta         = fr->invsqrta;
 +    dvda             = fr->dvda;
 +    gbtabscale       = _mm_set1_pd(fr->gbtab.scale);
 +    gbtab            = fr->gbtab.data;
 +    gbinvepsdiff     = _mm_set1_pd((1.0/fr->epsilon_r) - (1.0/fr->gb_epsilon_solvent));
 +
 +    /* Avoid stupid compiler warnings */
 +    jnrA = jnrB = 0;
 +    j_coord_offsetA = 0;
 +    j_coord_offsetB = 0;
 +
 +    outeriter        = 0;
 +    inneriter        = 0;
 +
 +    /* Start outer loop over neighborlists */
 +    for(iidx=0; iidx<nri; iidx++)
 +    {
 +        /* Load shift vector for this list */
 +        i_shift_offset   = DIM*shiftidx[iidx];
 +
 +        /* Load limits for loop over neighbors */
 +        j_index_start    = jindex[iidx];
 +        j_index_end      = jindex[iidx+1];
 +
 +        /* Get outer coordinate index */
 +        inr              = iinr[iidx];
 +        i_coord_offset   = DIM*inr;
 +
 +        /* Load i particle coords and add shift vector */
 +        gmx_mm_load_shift_and_1rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
 +
 +        fix0             = _mm_setzero_pd();
 +        fiy0             = _mm_setzero_pd();
 +        fiz0             = _mm_setzero_pd();
 +
 +        /* Load parameters for i particles */
 +        iq0              = _mm_mul_pd(facel,_mm_load1_pd(charge+inr+0));
 +        isai0            = _mm_load1_pd(invsqrta+inr+0);
 +        vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
 +
 +        /* Reset potential sums */
 +        velecsum         = _mm_setzero_pd();
 +        vgbsum           = _mm_setzero_pd();
 +        vvdwsum          = _mm_setzero_pd();
 +        dvdasum          = _mm_setzero_pd();
 +
 +        /* Start inner kernel loop */
 +        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
 +        {
 +
 +            /* Get j neighbor index, and coordinate index */
 +            jnrA             = jjnr[jidx];
 +            jnrB             = jjnr[jidx+1];
 +            j_coord_offsetA  = DIM*jnrA;
 +            j_coord_offsetB  = DIM*jnrB;
 +            
 +            /* load j atom coordinates */
 +            gmx_mm_load_1rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
 +                                              &jx0,&jy0,&jz0);
 +            
 +            /* Calculate displacement vector */
 +            dx00             = _mm_sub_pd(ix0,jx0);
 +            dy00             = _mm_sub_pd(iy0,jy0);
 +            dz00             = _mm_sub_pd(iz0,jz0);
 +
 +            /* Calculate squared distance and things based on it */
 +            rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
 +
 +            rinv00           = gmx_mm_invsqrt_pd(rsq00);
 +
 +            /* Load parameters for j particles */
 +            jq0              = gmx_mm_load_2real_swizzle_pd(charge+jnrA+0,charge+jnrB+0);
 +            isaj0            = gmx_mm_load_2real_swizzle_pd(invsqrta+jnrA+0,invsqrta+jnrB+0);
 +            vdwjidx0A        = 2*vdwtype[jnrA+0];
 +            vdwjidx0B        = 2*vdwtype[jnrB+0];
 +
 +            /**************************
 +             * CALCULATE INTERACTIONS *
 +             **************************/
 +
 +            r00              = _mm_mul_pd(rsq00,rinv00);
 +
 +            /* Compute parameters for interactions between i and j atoms */
 +            qq00             = _mm_mul_pd(iq0,jq0);
 +            gmx_mm_load_2pair_swizzle_pd(vdwparam+vdwioffset0+vdwjidx0A,
 +                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
 +
 +            /* Calculate table index by multiplying r with the table scale and truncating to integer */
 +            rt               = _mm_mul_pd(r00,vftabscale);
 +            vfitab           = _mm_cvttpd_epi32(rt);
 +            vfeps            = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
 +            vfitab           = _mm_slli_epi32(vfitab,3);
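 +            /* The VdW table stores four spline coefficients (Y,F,G,H) for
 +             * dispersion followed by four for repulsion at each point, i.e.
 +             * eight doubles per bin - hence the shift by 3; the repulsion
 +             * block is reached below by adding ifour. */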
 +
 +            /* GENERALIZED BORN AND COULOMB ELECTROSTATICS */
 +            isaprod          = _mm_mul_pd(isai0,isaj0);
 +            gbqqfactor       = _mm_xor_pd(signbit,_mm_mul_pd(qq00,_mm_mul_pd(isaprod,gbinvepsdiff)));
 +            gbscale          = _mm_mul_pd(isaprod,gbtabscale);
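 +            /* gbqqfactor = -qq*isai*isaj*(1/eps_r - 1/eps_solvent), with the
 +             * negation done by the XOR with signbit; folding isaprod into
 +             * the table scale makes the lookup argument r*isai*isaj. */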
 +
 +            /* Calculate generalized born table index - this is a separate table from the normal one,
 +             * but we use the same procedure by multiplying r with scale and truncating to integer.
 +             */
 +            rt               = _mm_mul_pd(r00,gbscale);
 +            gbitab           = _mm_cvttpd_epi32(rt);
 +            gbeps            = _mm_sub_pd(rt,_mm_cvtepi32_pd(gbitab));
 +            gbitab           = _mm_slli_epi32(gbitab,2);
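 +            /* The GB table stores a single Y,F,G,H quadruplet per point
 +             * (four doubles), hence the shift by 2. */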
 +
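 +            /* Each lane needs a different table row, so the gather is done
 +             * by hand: load two adjacent doubles per lane and transpose the
 +             * 2x2 blocks so Y,F (and G,H) end up lane-wise. */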
 +            Y                = _mm_load_pd( gbtab + gmx_mm_extract_epi32(gbitab,0) );
 +            F                = _mm_load_pd( gbtab + gmx_mm_extract_epi32(gbitab,1) );
 +            GMX_MM_TRANSPOSE2_PD(Y,F);
 +            G                = _mm_load_pd( gbtab + gmx_mm_extract_epi32(gbitab,0) +2);
 +            H                = _mm_load_pd( gbtab + gmx_mm_extract_epi32(gbitab,1) +2);
 +            GMX_MM_TRANSPOSE2_PD(G,H);
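 +            /* Cubic spline in Horner form: VV = Y + eps*(F + eps*(G + eps*H));
 +             * its derivative FF = dVV/deps = F + eps*(2*G + 3*eps*H) is built
 +             * as Fp = F + eps*(G + Heps) and FF = Fp + eps*(G + 2*Heps). */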
 +            Heps             = _mm_mul_pd(gbeps,H);
 +            Fp               = _mm_add_pd(F,_mm_mul_pd(gbeps,_mm_add_pd(G,Heps)));
 +            VV               = _mm_add_pd(Y,_mm_mul_pd(gbeps,Fp));
 +            vgb              = _mm_mul_pd(gbqqfactor,VV);
 +
 +            FF               = _mm_add_pd(Fp,_mm_mul_pd(gbeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
 +            fgb              = _mm_mul_pd(gbqqfactor,_mm_mul_pd(FF,gbscale));
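 +            /* fgb = dVgb/dr (FF*gbscale by the chain rule). dvdatmp below is
 +             * the pair contribution -1/2*(vgb + fgb*r) to the Born-radius
 +             * derivative dvda, scaled by isaj^2 for the j atom here and by
 +             * isai^2 for the i atom after the loop. */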
 +            dvdatmp          = _mm_mul_pd(minushalf,_mm_add_pd(vgb,_mm_mul_pd(fgb,r00)));
 +            dvdasum          = _mm_add_pd(dvdasum,dvdatmp);
 +            gmx_mm_increment_2real_swizzle_pd(dvda+jnrA,dvda+jnrB,_mm_mul_pd(dvdatmp,_mm_mul_pd(isaj0,isaj0)));
 +            velec            = _mm_mul_pd(qq00,rinv00);
 +            felec            = _mm_mul_pd(_mm_sub_pd(_mm_mul_pd(velec,rinv00),fgb),rinv00);
 +
 +            /* CUBIC SPLINE TABLE DISPERSION */
 +            Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
 +            F                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
 +            GMX_MM_TRANSPOSE2_PD(Y,F);
 +            G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
 +            H                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
 +            GMX_MM_TRANSPOSE2_PD(G,H);
 +            Heps             = _mm_mul_pd(vfeps,H);
 +            Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
 +            VV               = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
 +            vvdw6            = _mm_mul_pd(c6_00,VV);
 +            FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
 +            fvdw6            = _mm_mul_pd(c6_00,FF);
 +
 +            /* CUBIC SPLINE TABLE REPULSION */
 +            vfitab           = _mm_add_epi32(vfitab,ifour);
 +            Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
 +            F                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
 +            GMX_MM_TRANSPOSE2_PD(Y,F);
 +            G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
 +            H                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
 +            GMX_MM_TRANSPOSE2_PD(G,H);
 +            Heps             = _mm_mul_pd(vfeps,H);
 +            Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
 +            VV               = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
 +            vvdw12           = _mm_mul_pd(c12_00,VV);
 +            FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
 +            fvdw12           = _mm_mul_pd(c12_00,FF);
 +            vvdw             = _mm_add_pd(vvdw12,vvdw6);
 +            fvdw             = _mm_xor_pd(signbit,_mm_mul_pd(_mm_add_pd(fvdw6,fvdw12),_mm_mul_pd(vftabscale,rinv00)));
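 +            /* FF is dV/deps, so dV/dr = FF*vftabscale; the scalar factor
 +             * applied to the displacement below is -dV/dr*rinv, with the
 +             * sign flip done by the XOR with signbit. */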
 +
 +            /* Update potential sum for this i atom from the interaction with this j atom. */
 +            velecsum         = _mm_add_pd(velecsum,velec);
 +            vgbsum           = _mm_add_pd(vgbsum,vgb);
 +            vvdwsum          = _mm_add_pd(vvdwsum,vvdw);
 +
 +            fscal            = _mm_add_pd(felec,fvdw);
 +
 +            /* Calculate temporary vectorial force */
 +            tx               = _mm_mul_pd(fscal,dx00);
 +            ty               = _mm_mul_pd(fscal,dy00);
 +            tz               = _mm_mul_pd(fscal,dz00);
 +
 +            /* Update vectorial force */
 +            fix0             = _mm_add_pd(fix0,tx);
 +            fiy0             = _mm_add_pd(fiy0,ty);
 +            fiz0             = _mm_add_pd(fiz0,tz);
 +
 +            gmx_mm_decrement_1rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,tx,ty,tz);
 +
 +            /* Inner loop uses 92 flops */
 +        }
 +
 +        if(jidx<j_index_end)
 +        {
 +
 +            jnrA             = jjnr[jidx];
 +            j_coord_offsetA  = DIM*jnrA;
 +
 +            /* load j atom coordinates */
 +            gmx_mm_load_1rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
 +                                              &jx0,&jy0,&jz0);
 +            
 +            /* Calculate displacement vector */
 +            dx00             = _mm_sub_pd(ix0,jx0);
 +            dy00             = _mm_sub_pd(iy0,jy0);
 +            dz00             = _mm_sub_pd(iz0,jz0);
 +
 +            /* Calculate squared distance and things based on it */
 +            rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
 +
 +            rinv00           = gmx_mm_invsqrt_pd(rsq00);
 +
 +            /* Load parameters for j particles */
 +            jq0              = _mm_load_sd(charge+jnrA+0);
 +            isaj0            = _mm_load_sd(invsqrta+jnrA+0);
 +            vdwjidx0A        = 2*vdwtype[jnrA+0];
 +
 +            /**************************
 +             * CALCULATE INTERACTIONS *
 +             **************************/
 +
 +            r00              = _mm_mul_pd(rsq00,rinv00);
 +
 +            /* Compute parameters for interactions between i and j atoms */
 +            qq00             = _mm_mul_pd(iq0,jq0);
 +            gmx_mm_load_1pair_swizzle_pd(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
 +
 +            /* Calculate table index by multiplying r with the table scale and truncating to integer */
 +            rt               = _mm_mul_pd(r00,vftabscale);
 +            vfitab           = _mm_cvttpd_epi32(rt);
 +            vfeps            = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
 +            vfitab           = _mm_slli_epi32(vfitab,3);
 +
 +            /* GENERALIZED BORN AND COULOMB ELECTROSTATICS */
 +            isaprod          = _mm_mul_pd(isai0,isaj0);
 +            gbqqfactor       = _mm_xor_pd(signbit,_mm_mul_pd(qq00,_mm_mul_pd(isaprod,gbinvepsdiff)));
 +            gbscale          = _mm_mul_pd(isaprod,gbtabscale);
 +
 +            /* Calculate generalized born table index - this is a separate table from the normal one,
 +             * but we use the same procedure by multiplying r with scale and truncating to integer.
 +             */
 +            rt               = _mm_mul_pd(r00,gbscale);
 +            gbitab           = _mm_cvttpd_epi32(rt);
 +            gbeps            = _mm_sub_pd(rt,_mm_cvtepi32_pd(gbitab));
 +            gbitab           = _mm_slli_epi32(gbitab,2);
 +
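 +            /* Only one j atom remains, so a single table row is loaded and
 +             * the transpose partner is simply zeros. */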
 +            Y                = _mm_load_pd( gbtab + gmx_mm_extract_epi32(gbitab,0) );
 +            F                = _mm_setzero_pd();
 +            GMX_MM_TRANSPOSE2_PD(Y,F);
 +            G                = _mm_load_pd( gbtab + gmx_mm_extract_epi32(gbitab,0) +2);
 +            H                = _mm_setzero_pd();
 +            GMX_MM_TRANSPOSE2_PD(G,H);
 +            Heps             = _mm_mul_pd(gbeps,H);
 +            Fp               = _mm_add_pd(F,_mm_mul_pd(gbeps,_mm_add_pd(G,Heps)));
 +            VV               = _mm_add_pd(Y,_mm_mul_pd(gbeps,Fp));
 +            vgb              = _mm_mul_pd(gbqqfactor,VV);
 +
 +            FF               = _mm_add_pd(Fp,_mm_mul_pd(gbeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
 +            fgb              = _mm_mul_pd(gbqqfactor,_mm_mul_pd(FF,gbscale));
 +            dvdatmp          = _mm_mul_pd(minushalf,_mm_add_pd(vgb,_mm_mul_pd(fgb,r00)));
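 +            /* Lane 1 holds no valid pair in this remainder iteration; zero
 +             * it before accumulating so that garbage never enters dvdasum
 +             * (the energies and fscal get the same unpacklo masking below). */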
++            dvdatmp          = _mm_unpacklo_pd(dvdatmp,_mm_setzero_pd());
 +            dvdasum          = _mm_add_pd(dvdasum,dvdatmp);
 +            gmx_mm_increment_1real_pd(dvda+jnrA,_mm_mul_pd(dvdatmp,_mm_mul_pd(isaj0,isaj0)));
 +            velec            = _mm_mul_pd(qq00,rinv00);
 +            felec            = _mm_mul_pd(_mm_sub_pd(_mm_mul_pd(velec,rinv00),fgb),rinv00);
 +
 +            /* CUBIC SPLINE TABLE DISPERSION */
 +            Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
 +            F                = _mm_setzero_pd();
 +            GMX_MM_TRANSPOSE2_PD(Y,F);
 +            G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
 +            H                = _mm_setzero_pd();
 +            GMX_MM_TRANSPOSE2_PD(G,H);
 +            Heps             = _mm_mul_pd(vfeps,H);
 +            Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
 +            VV               = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
 +            vvdw6            = _mm_mul_pd(c6_00,VV);
 +            FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
 +            fvdw6            = _mm_mul_pd(c6_00,FF);
 +
 +            /* CUBIC SPLINE TABLE REPULSION */
 +            vfitab           = _mm_add_epi32(vfitab,ifour);
 +            Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
 +            F                = _mm_setzero_pd();
 +            GMX_MM_TRANSPOSE2_PD(Y,F);
 +            G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
 +            H                = _mm_setzero_pd();
 +            GMX_MM_TRANSPOSE2_PD(G,H);
 +            Heps             = _mm_mul_pd(vfeps,H);
 +            Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
 +            VV               = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
 +            vvdw12           = _mm_mul_pd(c12_00,VV);
 +            FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
 +            fvdw12           = _mm_mul_pd(c12_00,FF);
 +            vvdw             = _mm_add_pd(vvdw12,vvdw6);
 +            fvdw             = _mm_xor_pd(signbit,_mm_mul_pd(_mm_add_pd(fvdw6,fvdw12),_mm_mul_pd(vftabscale,rinv00)));
 +
 +            /* Update potential sum for this i atom from the interaction with this j atom. */
 +            velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
 +            velecsum         = _mm_add_pd(velecsum,velec);
 +            vgb              = _mm_unpacklo_pd(vgb,_mm_setzero_pd());
 +            vgbsum           = _mm_add_pd(vgbsum,vgb);
 +            vvdw             = _mm_unpacklo_pd(vvdw,_mm_setzero_pd());
 +            vvdwsum          = _mm_add_pd(vvdwsum,vvdw);
 +
 +            fscal            = _mm_add_pd(felec,fvdw);
 +
 +            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
 +
 +            /* Calculate temporary vectorial force */
 +            tx               = _mm_mul_pd(fscal,dx00);
 +            ty               = _mm_mul_pd(fscal,dy00);
 +            tz               = _mm_mul_pd(fscal,dz00);
 +
 +            /* Update vectorial force */
 +            fix0             = _mm_add_pd(fix0,tx);
 +            fiy0             = _mm_add_pd(fiy0,ty);
 +            fiz0             = _mm_add_pd(fiz0,tz);
 +
 +            gmx_mm_decrement_1rvec_1ptr_swizzle_pd(f+j_coord_offsetA,tx,ty,tz);
 +
 +            /* Inner loop uses 92 flops */
 +        }
 +
 +        /* End of innermost loop */
 +
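 +        /* Reduce the i forces into f and also into fshift, the per-shift
 +         * accumulators from which the virial is computed. */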
 +        gmx_mm_update_iforce_1atom_swizzle_pd(fix0,fiy0,fiz0,
 +                                              f+i_coord_offset,fshift+i_shift_offset);
 +
 +        ggid                        = gid[iidx];
 +        /* Update potential energies */
 +        gmx_mm_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
 +        gmx_mm_update_1pot_pd(vgbsum,kernel_data->energygrp_polarization+ggid);
 +        gmx_mm_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
 +        dvdasum = _mm_mul_pd(dvdasum, _mm_mul_pd(isai0,isai0));
 +        gmx_mm_update_1pot_pd(dvdasum,dvda+inr);
 +
 +        /* Increment number of inner iterations */
 +        inneriter                  += j_index_end - j_index_start;
 +
 +        /* Outer loop uses 10 flops */
 +    }
 +
 +    /* Increment number of outer iterations */
 +    outeriter        += nri;
 +
 +    /* Update outer/inner flops */
 +
 +    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_VF,outeriter*10 + inneriter*92);
 +}
 +/*
 + * Gromacs nonbonded kernel:   nb_kernel_ElecGB_VdwCSTab_GeomP1P1_F_sse2_double
 + * Electrostatics interaction: GeneralizedBorn
 + * VdW interaction:            CubicSplineTable
 + * Geometry:                   Particle-Particle
 + * Calculate force/pot:        Force
 + */
 +void
 +nb_kernel_ElecGB_VdwCSTab_GeomP1P1_F_sse2_double
 +                    (t_nblist * gmx_restrict                nlist,
 +                     rvec * gmx_restrict                    xx,
 +                     rvec * gmx_restrict                    ff,
 +                     t_forcerec * gmx_restrict              fr,
 +                     t_mdatoms * gmx_restrict               mdatoms,
 +                     nb_kernel_data_t * gmx_restrict        kernel_data,
 +                     t_nrnb * gmx_restrict                  nrnb)
 +{
 +    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
 +     * just 0 for non-waters.
 +     * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
 +     * jnr indices corresponding to data put in the two positions in the SIMD register.
 +     */
 +    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
 +    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
 +    int              jnrA,jnrB;
 +    int              j_coord_offsetA,j_coord_offsetB;
 +    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
 +    real             rcutoff_scalar;
 +    real             *shiftvec,*fshift,*x,*f;
 +    __m128d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
 +    int              vdwioffset0;
 +    __m128d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
 +    int              vdwjidx0A,vdwjidx0B;
 +    __m128d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
 +    __m128d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
 +    __m128d          velec,felec,velecsum,facel,crf,krf,krf2;
 +    real             *charge;
 +    __m128i          gbitab;
 +    __m128d          vgb,fgb,vgbsum,dvdasum,gbscale,gbtabscale,isaprod,gbqqfactor,gbinvepsdiff,dvdaj,gbeps,dvdatmp;
 +    __m128d          minushalf = _mm_set1_pd(-0.5);
 +    real             *invsqrta,*dvda,*gbtab;
 +    int              nvdwtype;
 +    __m128d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
 +    int              *vdwtype;
 +    real             *vdwparam;
 +    __m128d          one_sixth   = _mm_set1_pd(1.0/6.0);
 +    __m128d          one_twelfth = _mm_set1_pd(1.0/12.0);
 +    __m128i          vfitab;
 +    __m128i          ifour       = _mm_set1_epi32(4);
 +    __m128d          rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
 +    real             *vftab;
 +    __m128d          dummy_mask,cutoff_mask;
 +    __m128d          signbit   = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
 +    __m128d          one     = _mm_set1_pd(1.0);
 +    __m128d          two     = _mm_set1_pd(2.0);
 +    x                = xx[0];
 +    f                = ff[0];
 +
 +    nri              = nlist->nri;
 +    iinr             = nlist->iinr;
 +    jindex           = nlist->jindex;
 +    jjnr             = nlist->jjnr;
 +    shiftidx         = nlist->shift;
 +    gid              = nlist->gid;
 +    shiftvec         = fr->shift_vec[0];
 +    fshift           = fr->fshift[0];
 +    facel            = _mm_set1_pd(fr->epsfac);
 +    charge           = mdatoms->chargeA;
 +    nvdwtype         = fr->ntype;
 +    vdwparam         = fr->nbfp;
 +    vdwtype          = mdatoms->typeA;
 +
 +    vftab            = kernel_data->table_vdw->data;
 +    vftabscale       = _mm_set1_pd(kernel_data->table_vdw->scale);
 +
 +    invsqrta         = fr->invsqrta;
 +    dvda             = fr->dvda;
 +    gbtabscale       = _mm_set1_pd(fr->gbtab.scale);
 +    gbtab            = fr->gbtab.data;
 +    gbinvepsdiff     = _mm_set1_pd((1.0/fr->epsilon_r) - (1.0/fr->gb_epsilon_solvent));
 +
 +    /* Avoid stupid compiler warnings */
 +    jnrA = jnrB = 0;
 +    j_coord_offsetA = 0;
 +    j_coord_offsetB = 0;
 +
 +    outeriter        = 0;
 +    inneriter        = 0;
 +
 +    /* Start outer loop over neighborlists */
 +    for(iidx=0; iidx<nri; iidx++)
 +    {
 +        /* Load shift vector for this list */
 +        i_shift_offset   = DIM*shiftidx[iidx];
 +
 +        /* Load limits for loop over neighbors */
 +        j_index_start    = jindex[iidx];
 +        j_index_end      = jindex[iidx+1];
 +
 +        /* Get outer coordinate index */
 +        inr              = iinr[iidx];
 +        i_coord_offset   = DIM*inr;
 +
 +        /* Load i particle coords and add shift vector */
 +        gmx_mm_load_shift_and_1rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
 +
 +        fix0             = _mm_setzero_pd();
 +        fiy0             = _mm_setzero_pd();
 +        fiz0             = _mm_setzero_pd();
 +
 +        /* Load parameters for i particles */
 +        iq0              = _mm_mul_pd(facel,_mm_load1_pd(charge+inr+0));
 +        isai0            = _mm_load1_pd(invsqrta+inr+0);
 +        vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
 +
 +        dvdasum          = _mm_setzero_pd();
 +
 +        /* Start inner kernel loop */
 +        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
 +        {
 +
 +            /* Get j neighbor index, and coordinate index */
 +            jnrA             = jjnr[jidx];
 +            jnrB             = jjnr[jidx+1];
 +            j_coord_offsetA  = DIM*jnrA;
 +            j_coord_offsetB  = DIM*jnrB;
 +            
 +            /* load j atom coordinates */
 +            gmx_mm_load_1rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
 +                                              &jx0,&jy0,&jz0);
 +            
 +            /* Calculate displacement vector */
 +            dx00             = _mm_sub_pd(ix0,jx0);
 +            dy00             = _mm_sub_pd(iy0,jy0);
 +            dz00             = _mm_sub_pd(iz0,jz0);
 +
 +            /* Calculate squared distance and things based on it */
 +            rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
 +
 +            rinv00           = gmx_mm_invsqrt_pd(rsq00);
 +
 +            /* Load parameters for j particles */
 +            jq0              = gmx_mm_load_2real_swizzle_pd(charge+jnrA+0,charge+jnrB+0);
 +            isaj0            = gmx_mm_load_2real_swizzle_pd(invsqrta+jnrA+0,invsqrta+jnrB+0);
 +            vdwjidx0A        = 2*vdwtype[jnrA+0];
 +            vdwjidx0B        = 2*vdwtype[jnrB+0];
 +
 +            /**************************
 +             * CALCULATE INTERACTIONS *
 +             **************************/
 +
 +            r00              = _mm_mul_pd(rsq00,rinv00);
 +
 +            /* Compute parameters for interactions between i and j atoms */
 +            qq00             = _mm_mul_pd(iq0,jq0);
 +            gmx_mm_load_2pair_swizzle_pd(vdwparam+vdwioffset0+vdwjidx0A,
 +                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
 +
 +            /* Calculate table index by multiplying r with the table scale and truncating to integer */
 +            rt               = _mm_mul_pd(r00,vftabscale);
 +            vfitab           = _mm_cvttpd_epi32(rt);
 +            vfeps            = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
 +            vfitab           = _mm_slli_epi32(vfitab,3);
 +
 +            /* GENERALIZED BORN AND COULOMB ELECTROSTATICS */
 +            isaprod          = _mm_mul_pd(isai0,isaj0);
 +            gbqqfactor       = _mm_xor_pd(signbit,_mm_mul_pd(qq00,_mm_mul_pd(isaprod,gbinvepsdiff)));
 +            gbscale          = _mm_mul_pd(isaprod,gbtabscale);
 +
 +            /* Calculate generalized born table index - this is a separate table from the normal one,
 +             * but we use the same procedure by multiplying r with scale and truncating to integer.
 +             */
 +            rt               = _mm_mul_pd(r00,gbscale);
 +            gbitab           = _mm_cvttpd_epi32(rt);
 +            gbeps            = _mm_sub_pd(rt,_mm_cvtepi32_pd(gbitab));
 +            gbitab           = _mm_slli_epi32(gbitab,2);
 +
 +            Y                = _mm_load_pd( gbtab + gmx_mm_extract_epi32(gbitab,0) );
 +            F                = _mm_load_pd( gbtab + gmx_mm_extract_epi32(gbitab,1) );
 +            GMX_MM_TRANSPOSE2_PD(Y,F);
 +            G                = _mm_load_pd( gbtab + gmx_mm_extract_epi32(gbitab,0) +2);
 +            H                = _mm_load_pd( gbtab + gmx_mm_extract_epi32(gbitab,1) +2);
 +            GMX_MM_TRANSPOSE2_PD(G,H);
 +            Heps             = _mm_mul_pd(gbeps,H);
 +            Fp               = _mm_add_pd(F,_mm_mul_pd(gbeps,_mm_add_pd(G,Heps)));
 +            VV               = _mm_add_pd(Y,_mm_mul_pd(gbeps,Fp));
 +            vgb              = _mm_mul_pd(gbqqfactor,VV);
 +
 +            FF               = _mm_add_pd(Fp,_mm_mul_pd(gbeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
 +            fgb              = _mm_mul_pd(gbqqfactor,_mm_mul_pd(FF,gbscale));
 +            dvdatmp          = _mm_mul_pd(minushalf,_mm_add_pd(vgb,_mm_mul_pd(fgb,r00)));
 +            dvdasum          = _mm_add_pd(dvdasum,dvdatmp);
 +            gmx_mm_increment_2real_swizzle_pd(dvda+jnrA,dvda+jnrB,_mm_mul_pd(dvdatmp,_mm_mul_pd(isaj0,isaj0)));
 +            velec            = _mm_mul_pd(qq00,rinv00);
 +            felec            = _mm_mul_pd(_mm_sub_pd(_mm_mul_pd(velec,rinv00),fgb),rinv00);
 +
 +            /* CUBIC SPLINE TABLE DISPERSION */
 +            Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
 +            F                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
 +            GMX_MM_TRANSPOSE2_PD(Y,F);
 +            G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
 +            H                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
 +            GMX_MM_TRANSPOSE2_PD(G,H);
 +            Heps             = _mm_mul_pd(vfeps,H);
 +            Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
 +            FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
 +            fvdw6            = _mm_mul_pd(c6_00,FF);
 +
 +            /* CUBIC SPLINE TABLE REPULSION */
 +            vfitab           = _mm_add_epi32(vfitab,ifour);
 +            Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
 +            F                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
 +            GMX_MM_TRANSPOSE2_PD(Y,F);
 +            G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
 +            H                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
 +            GMX_MM_TRANSPOSE2_PD(G,H);
 +            Heps             = _mm_mul_pd(vfeps,H);
 +            Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
 +            FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
 +            fvdw12           = _mm_mul_pd(c12_00,FF);
 +            fvdw             = _mm_xor_pd(signbit,_mm_mul_pd(_mm_add_pd(fvdw6,fvdw12),_mm_mul_pd(vftabscale,rinv00)));
 +
 +            fscal            = _mm_add_pd(felec,fvdw);
 +
 +            /* Calculate temporary vectorial force */
 +            tx               = _mm_mul_pd(fscal,dx00);
 +            ty               = _mm_mul_pd(fscal,dy00);
 +            tz               = _mm_mul_pd(fscal,dz00);
 +
 +            /* Update vectorial force */
 +            fix0             = _mm_add_pd(fix0,tx);
 +            fiy0             = _mm_add_pd(fiy0,ty);
 +            fiz0             = _mm_add_pd(fiz0,tz);
 +
 +            gmx_mm_decrement_1rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,tx,ty,tz);
 +
 +            /* Inner loop uses 82 flops */
 +        }
 +
 +        if(jidx<j_index_end)
 +        {
 +
 +            jnrA             = jjnr[jidx];
 +            j_coord_offsetA  = DIM*jnrA;
 +
 +            /* load j atom coordinates */
 +            gmx_mm_load_1rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
 +                                              &jx0,&jy0,&jz0);
 +            
 +            /* Calculate displacement vector */
 +            dx00             = _mm_sub_pd(ix0,jx0);
 +            dy00             = _mm_sub_pd(iy0,jy0);
 +            dz00             = _mm_sub_pd(iz0,jz0);
 +
 +            /* Calculate squared distance and things based on it */
 +            rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
 +
 +            rinv00           = gmx_mm_invsqrt_pd(rsq00);
 +
 +            /* Load parameters for j particles */
 +            jq0              = _mm_load_sd(charge+jnrA+0);
 +            isaj0            = _mm_load_sd(invsqrta+jnrA+0);
 +            vdwjidx0A        = 2*vdwtype[jnrA+0];
 +
 +            /**************************
 +             * CALCULATE INTERACTIONS *
 +             **************************/
 +
 +            r00              = _mm_mul_pd(rsq00,rinv00);
 +
 +            /* Compute parameters for interactions between i and j atoms */
 +            qq00             = _mm_mul_pd(iq0,jq0);
 +            gmx_mm_load_1pair_swizzle_pd(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
 +
 +            /* Calculate table index by multiplying r with the table scale and truncating to integer */
 +            rt               = _mm_mul_pd(r00,vftabscale);
 +            vfitab           = _mm_cvttpd_epi32(rt);
 +            vfeps            = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
 +            vfitab           = _mm_slli_epi32(vfitab,3);
 +
 +            /* GENERALIZED BORN AND COULOMB ELECTROSTATICS */
 +            isaprod          = _mm_mul_pd(isai0,isaj0);
 +            gbqqfactor       = _mm_xor_pd(signbit,_mm_mul_pd(qq00,_mm_mul_pd(isaprod,gbinvepsdiff)));
 +            gbscale          = _mm_mul_pd(isaprod,gbtabscale);
 +
 +            /* Calculate generalized born table index - this is a separate table from the normal one,
 +             * but we use the same procedure by multiplying r with scale and truncating to integer.
 +             */
 +            rt               = _mm_mul_pd(r00,gbscale);
 +            gbitab           = _mm_cvttpd_epi32(rt);
 +            gbeps            = _mm_sub_pd(rt,_mm_cvtepi32_pd(gbitab));
 +            gbitab           = _mm_slli_epi32(gbitab,2);
 +
 +            Y                = _mm_load_pd( gbtab + gmx_mm_extract_epi32(gbitab,0) );
 +            F                = _mm_setzero_pd();
 +            GMX_MM_TRANSPOSE2_PD(Y,F);
 +            G                = _mm_load_pd( gbtab + gmx_mm_extract_epi32(gbitab,0) +2);
 +            H                = _mm_setzero_pd();
 +            GMX_MM_TRANSPOSE2_PD(G,H);
 +            Heps             = _mm_mul_pd(gbeps,H);
 +            Fp               = _mm_add_pd(F,_mm_mul_pd(gbeps,_mm_add_pd(G,Heps)));
 +            VV               = _mm_add_pd(Y,_mm_mul_pd(gbeps,Fp));
 +            vgb              = _mm_mul_pd(gbqqfactor,VV);
 +
 +            FF               = _mm_add_pd(Fp,_mm_mul_pd(gbeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
 +            fgb              = _mm_mul_pd(gbqqfactor,_mm_mul_pd(FF,gbscale));
 +            dvdatmp          = _mm_mul_pd(minushalf,_mm_add_pd(vgb,_mm_mul_pd(fgb,r00)));
++            dvdatmp          = _mm_unpacklo_pd(dvdatmp,_mm_setzero_pd());
 +            dvdasum          = _mm_add_pd(dvdasum,dvdatmp);
 +            gmx_mm_increment_1real_pd(dvda+jnrA,_mm_mul_pd(dvdatmp,_mm_mul_pd(isaj0,isaj0)));
 +            velec            = _mm_mul_pd(qq00,rinv00);
 +            felec            = _mm_mul_pd(_mm_sub_pd(_mm_mul_pd(velec,rinv00),fgb),rinv00);
 +
 +            /* CUBIC SPLINE TABLE DISPERSION */
 +            Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
 +            F                = _mm_setzero_pd();
 +            GMX_MM_TRANSPOSE2_PD(Y,F);
 +            G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
 +            H                = _mm_setzero_pd();
 +            GMX_MM_TRANSPOSE2_PD(G,H);
 +            Heps             = _mm_mul_pd(vfeps,H);
 +            Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
 +            FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
 +            fvdw6            = _mm_mul_pd(c6_00,FF);
 +
 +            /* CUBIC SPLINE TABLE REPULSION */
 +            vfitab           = _mm_add_epi32(vfitab,ifour);
 +            Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
 +            F                = _mm_setzero_pd();
 +            GMX_MM_TRANSPOSE2_PD(Y,F);
 +            G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
 +            H                = _mm_setzero_pd();
 +            GMX_MM_TRANSPOSE2_PD(G,H);
 +            Heps             = _mm_mul_pd(vfeps,H);
 +            Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
 +            FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
 +            fvdw12           = _mm_mul_pd(c12_00,FF);
 +            fvdw             = _mm_xor_pd(signbit,_mm_mul_pd(_mm_add_pd(fvdw6,fvdw12),_mm_mul_pd(vftabscale,rinv00)));
 +
 +            fscal            = _mm_add_pd(felec,fvdw);
 +
 +            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
 +
 +            /* Calculate temporary vectorial force */
 +            tx               = _mm_mul_pd(fscal,dx00);
 +            ty               = _mm_mul_pd(fscal,dy00);
 +            tz               = _mm_mul_pd(fscal,dz00);
 +
 +            /* Update vectorial force */
 +            fix0             = _mm_add_pd(fix0,tx);
 +            fiy0             = _mm_add_pd(fiy0,ty);
 +            fiz0             = _mm_add_pd(fiz0,tz);
 +
 +            gmx_mm_decrement_1rvec_1ptr_swizzle_pd(f+j_coord_offsetA,tx,ty,tz);
 +
 +            /* Inner loop uses 82 flops */
 +        }
 +
 +        /* End of innermost loop */
 +
 +        gmx_mm_update_iforce_1atom_swizzle_pd(fix0,fiy0,fiz0,
 +                                              f+i_coord_offset,fshift+i_shift_offset);
 +
 +        dvdasum = _mm_mul_pd(dvdasum, _mm_mul_pd(isai0,isai0));
 +        gmx_mm_update_1pot_pd(dvdasum,dvda+inr);
 +
 +        /* Increment number of inner iterations */
 +        inneriter                  += j_index_end - j_index_start;
 +
 +        /* Outer loop uses 7 flops */
 +    }
 +
 +    /* Increment number of outer iterations */
 +    outeriter        += nri;
 +
 +    /* Update outer/inner flops */
 +
 +    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_F,outeriter*7 + inneriter*82);
 +}
index 2c6fa3646815f80c71012154651ce91538231577,0000000000000000000000000000000000000000..b34776616d53d6b270779fca0858e31ce097c05c
mode 100644,000000..100644
--- /dev/null
@@@ -1,713 -1,0 +1,715 @@@
 +/*
 + * Note: this file was generated by the Gromacs sse2_double kernel generator.
 + *
 + *                This source code is part of
 + *
 + *                 G   R   O   M   A   C   S
 + *
 + * Copyright (c) 2001-2012, The GROMACS Development Team
 + *
 + * Gromacs is a library for molecular simulation and trajectory analysis,
 + * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
 + * a full list of developers and information, check out http://www.gromacs.org
 + *
 + * This program is free software; you can redistribute it and/or modify it under
 + * the terms of the GNU Lesser General Public License as published by the Free
 + * Software Foundation; either version 2 of the License, or (at your option) any
 + * later version.
 + *
 + * To help fund GROMACS development, we humbly ask that you cite
 + * the papers people have written on it - you can find them on the website.
 + */
 +#ifdef HAVE_CONFIG_H
 +#include <config.h>
 +#endif
 +
 +#include <math.h>
 +
 +#include "../nb_kernel.h"
 +#include "types/simple.h"
 +#include "vec.h"
 +#include "nrnb.h"
 +
 +#include "gmx_math_x86_sse2_double.h"
 +#include "kernelutil_x86_sse2_double.h"
 +
 +/*
 + * Gromacs nonbonded kernel:   nb_kernel_ElecGB_VdwLJ_GeomP1P1_VF_sse2_double
 + * Electrostatics interaction: GeneralizedBorn
 + * VdW interaction:            LennardJones
 + * Geometry:                   Particle-Particle
 + * Calculate force/pot:        PotentialAndForce
 + */
 +void
 +nb_kernel_ElecGB_VdwLJ_GeomP1P1_VF_sse2_double
 +                    (t_nblist * gmx_restrict                nlist,
 +                     rvec * gmx_restrict                    xx,
 +                     rvec * gmx_restrict                    ff,
 +                     t_forcerec * gmx_restrict              fr,
 +                     t_mdatoms * gmx_restrict               mdatoms,
 +                     nb_kernel_data_t * gmx_restrict        kernel_data,
 +                     t_nrnb * gmx_restrict                  nrnb)
 +{
 +    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
 +     * just 0 for non-waters.
 +     * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
 +     * jnr indices corresponding to data put in the two positions in the SIMD register.
 +     */
 +    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
 +    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
 +    int              jnrA,jnrB;
 +    int              j_coord_offsetA,j_coord_offsetB;
 +    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
 +    real             rcutoff_scalar;
 +    real             *shiftvec,*fshift,*x,*f;
 +    __m128d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
 +    int              vdwioffset0;
 +    __m128d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
 +    int              vdwjidx0A,vdwjidx0B;
 +    __m128d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
 +    __m128d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
 +    __m128d          velec,felec,velecsum,facel,crf,krf,krf2;
 +    real             *charge;
 +    __m128i          gbitab;
 +    __m128d          vgb,fgb,vgbsum,dvdasum,gbscale,gbtabscale,isaprod,gbqqfactor,gbinvepsdiff,dvdaj,gbeps,dvdatmp;
 +    __m128d          minushalf = _mm_set1_pd(-0.5);
 +    real             *invsqrta,*dvda,*gbtab;
 +    int              nvdwtype;
 +    __m128d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
 +    int              *vdwtype;
 +    real             *vdwparam;
 +    __m128d          one_sixth   = _mm_set1_pd(1.0/6.0);
 +    __m128d          one_twelfth = _mm_set1_pd(1.0/12.0);
 +    __m128i          vfitab;
 +    __m128i          ifour       = _mm_set1_epi32(4);
 +    __m128d          rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
 +    real             *vftab;
 +    __m128d          dummy_mask,cutoff_mask;
 +    __m128d          signbit   = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
 +    __m128d          one     = _mm_set1_pd(1.0);
 +    __m128d          two     = _mm_set1_pd(2.0);
 +    x                = xx[0];
 +    f                = ff[0];
 +
 +    nri              = nlist->nri;
 +    iinr             = nlist->iinr;
 +    jindex           = nlist->jindex;
 +    jjnr             = nlist->jjnr;
 +    shiftidx         = nlist->shift;
 +    gid              = nlist->gid;
 +    shiftvec         = fr->shift_vec[0];
 +    fshift           = fr->fshift[0];
 +    facel            = _mm_set1_pd(fr->epsfac);
 +    charge           = mdatoms->chargeA;
 +    nvdwtype         = fr->ntype;
 +    vdwparam         = fr->nbfp;
 +    vdwtype          = mdatoms->typeA;
 +
 +    invsqrta         = fr->invsqrta;
 +    dvda             = fr->dvda;
 +    gbtabscale       = _mm_set1_pd(fr->gbtab.scale);
 +    gbtab            = fr->gbtab.data;
 +    gbinvepsdiff     = _mm_set1_pd((1.0/fr->epsilon_r) - (1.0/fr->gb_epsilon_solvent));
 +
 +    /* Avoid stupid compiler warnings */
 +    jnrA = jnrB = 0;
 +    j_coord_offsetA = 0;
 +    j_coord_offsetB = 0;
 +
 +    outeriter        = 0;
 +    inneriter        = 0;
 +
 +    /* Start outer loop over neighborlists */
 +    for(iidx=0; iidx<nri; iidx++)
 +    {
 +        /* Load shift vector for this list */
 +        i_shift_offset   = DIM*shiftidx[iidx];
 +
 +        /* Load limits for loop over neighbors */
 +        j_index_start    = jindex[iidx];
 +        j_index_end      = jindex[iidx+1];
 +
 +        /* Get outer coordinate index */
 +        inr              = iinr[iidx];
 +        i_coord_offset   = DIM*inr;
 +
 +        /* Load i particle coords and add shift vector */
 +        gmx_mm_load_shift_and_1rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
 +
 +        fix0             = _mm_setzero_pd();
 +        fiy0             = _mm_setzero_pd();
 +        fiz0             = _mm_setzero_pd();
 +
 +        /* Load parameters for i particles */
 +        iq0              = _mm_mul_pd(facel,_mm_load1_pd(charge+inr+0));
 +        isai0            = _mm_load1_pd(invsqrta+inr+0);
 +        vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
 +
 +        /* Reset potential sums */
 +        velecsum         = _mm_setzero_pd();
 +        vgbsum           = _mm_setzero_pd();
 +        vvdwsum          = _mm_setzero_pd();
 +        dvdasum          = _mm_setzero_pd();
 +
 +        /* Start inner kernel loop */
 +        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
 +        {
 +
 +            /* Get j neighbor index, and coordinate index */
 +            jnrA             = jjnr[jidx];
 +            jnrB             = jjnr[jidx+1];
 +            j_coord_offsetA  = DIM*jnrA;
 +            j_coord_offsetB  = DIM*jnrB;
 +            
 +            /* load j atom coordinates */
 +            gmx_mm_load_1rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
 +                                              &jx0,&jy0,&jz0);
 +            
 +            /* Calculate displacement vector */
 +            dx00             = _mm_sub_pd(ix0,jx0);
 +            dy00             = _mm_sub_pd(iy0,jy0);
 +            dz00             = _mm_sub_pd(iz0,jz0);
 +
 +            /* Calculate squared distance and things based on it */
 +            rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
 +
 +            rinv00           = gmx_mm_invsqrt_pd(rsq00);
 +
 +            rinvsq00         = _mm_mul_pd(rinv00,rinv00);
 +
 +            /* Load parameters for j particles */
 +            jq0              = gmx_mm_load_2real_swizzle_pd(charge+jnrA+0,charge+jnrB+0);
 +            isaj0            = gmx_mm_load_2real_swizzle_pd(invsqrta+jnrA+0,invsqrta+jnrB+0);
 +            vdwjidx0A        = 2*vdwtype[jnrA+0];
 +            vdwjidx0B        = 2*vdwtype[jnrB+0];
 +
 +            /**************************
 +             * CALCULATE INTERACTIONS *
 +             **************************/
 +
 +            r00              = _mm_mul_pd(rsq00,rinv00);
 +
 +            /* Compute parameters for interactions between i and j atoms */
 +            qq00             = _mm_mul_pd(iq0,jq0);
 +            gmx_mm_load_2pair_swizzle_pd(vdwparam+vdwioffset0+vdwjidx0A,
 +                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
 +
 +            /* GENERALIZED BORN AND COULOMB ELECTROSTATICS */
 +            isaprod          = _mm_mul_pd(isai0,isaj0);
 +            gbqqfactor       = _mm_xor_pd(signbit,_mm_mul_pd(qq00,_mm_mul_pd(isaprod,gbinvepsdiff)));
 +            gbscale          = _mm_mul_pd(isaprod,gbtabscale);
 +
 +            /* Calculate generalized born table index - this is a separate table from the normal one,
 +             * but we use the same procedure by multiplying r with scale and truncating to integer.
 +             */
 +            rt               = _mm_mul_pd(r00,gbscale);
 +            gbitab           = _mm_cvttpd_epi32(rt);
 +            gbeps            = _mm_sub_pd(rt,_mm_cvtepi32_pd(gbitab));
 +            gbitab           = _mm_slli_epi32(gbitab,2);
 +
 +            Y                = _mm_load_pd( gbtab + gmx_mm_extract_epi32(gbitab,0) );
 +            F                = _mm_load_pd( gbtab + gmx_mm_extract_epi32(gbitab,1) );
 +            GMX_MM_TRANSPOSE2_PD(Y,F);
 +            G                = _mm_load_pd( gbtab + gmx_mm_extract_epi32(gbitab,0) +2);
 +            H                = _mm_load_pd( gbtab + gmx_mm_extract_epi32(gbitab,1) +2);
 +            GMX_MM_TRANSPOSE2_PD(G,H);
 +            Heps             = _mm_mul_pd(gbeps,H);
 +            Fp               = _mm_add_pd(F,_mm_mul_pd(gbeps,_mm_add_pd(G,Heps)));
 +            VV               = _mm_add_pd(Y,_mm_mul_pd(gbeps,Fp));
 +            vgb              = _mm_mul_pd(gbqqfactor,VV);
 +
 +            FF               = _mm_add_pd(Fp,_mm_mul_pd(gbeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
 +            fgb              = _mm_mul_pd(gbqqfactor,_mm_mul_pd(FF,gbscale));
 +            dvdatmp          = _mm_mul_pd(minushalf,_mm_add_pd(vgb,_mm_mul_pd(fgb,r00)));
 +            dvdasum          = _mm_add_pd(dvdasum,dvdatmp);
 +            gmx_mm_increment_2real_swizzle_pd(dvda+jnrA,dvda+jnrB,_mm_mul_pd(dvdatmp,_mm_mul_pd(isaj0,isaj0)));
 +            velec            = _mm_mul_pd(qq00,rinv00);
 +            felec            = _mm_mul_pd(_mm_sub_pd(_mm_mul_pd(velec,rinv00),fgb),rinv00);
 +
 +            /* LENNARD-JONES DISPERSION/REPULSION */
 +
 +            rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
 +            vvdw6            = _mm_mul_pd(c6_00,rinvsix);
 +            vvdw12           = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
 +            vvdw             = _mm_sub_pd( _mm_mul_pd(vvdw12,one_twelfth) , _mm_mul_pd(vvdw6,one_sixth) );
 +            fvdw             = _mm_mul_pd(_mm_sub_pd(vvdw12,vvdw6),rinvsq00);
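 +            /* Analytical LJ from rinv^2. The 1/12 and 1/6 factors in the
 +             * potential suggest c12_00 and c6_00 are stored premultiplied by
 +             * 12 and 6 (an assumption consistent with this arithmetic), so
 +             * V = c12*rinv^12/12 - c6*rinv^6/6 while the force factor is
 +             * simply (c12*rinv^12 - c6*rinv^6)*rinvsq00. */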
 +
 +            /* Update potential sum for this i atom from the interaction with this j atom. */
 +            velecsum         = _mm_add_pd(velecsum,velec);
 +            vgbsum           = _mm_add_pd(vgbsum,vgb);
 +            vvdwsum          = _mm_add_pd(vvdwsum,vvdw);
 +
 +            fscal            = _mm_add_pd(felec,fvdw);
 +
 +            /* Calculate temporary vectorial force */
 +            tx               = _mm_mul_pd(fscal,dx00);
 +            ty               = _mm_mul_pd(fscal,dy00);
 +            tz               = _mm_mul_pd(fscal,dz00);
 +
 +            /* Update vectorial force */
 +            fix0             = _mm_add_pd(fix0,tx);
 +            fiy0             = _mm_add_pd(fiy0,ty);
 +            fiz0             = _mm_add_pd(fiz0,tz);
 +
 +            gmx_mm_decrement_1rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,tx,ty,tz);
 +
 +            /* Inner loop uses 71 flops */
 +        }
 +
 +        if(jidx<j_index_end)
 +        {
 +
 +            jnrA             = jjnr[jidx];
 +            j_coord_offsetA  = DIM*jnrA;
 +
 +            /* load j atom coordinates */
 +            gmx_mm_load_1rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
 +                                              &jx0,&jy0,&jz0);
 +            
 +            /* Calculate displacement vector */
 +            dx00             = _mm_sub_pd(ix0,jx0);
 +            dy00             = _mm_sub_pd(iy0,jy0);
 +            dz00             = _mm_sub_pd(iz0,jz0);
 +
 +            /* Calculate squared distance and things based on it */
 +            rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
 +
 +            rinv00           = gmx_mm_invsqrt_pd(rsq00);
 +
 +            rinvsq00         = _mm_mul_pd(rinv00,rinv00);
 +
 +            /* Load parameters for j particles */
 +            jq0              = _mm_load_sd(charge+jnrA+0);
 +            isaj0            = _mm_load_sd(invsqrta+jnrA+0);
 +            vdwjidx0A        = 2*vdwtype[jnrA+0];
 +
 +            /**************************
 +             * CALCULATE INTERACTIONS *
 +             **************************/
 +
 +            r00              = _mm_mul_pd(rsq00,rinv00);
 +
 +            /* Compute parameters for interactions between i and j atoms */
 +            qq00             = _mm_mul_pd(iq0,jq0);
 +            gmx_mm_load_1pair_swizzle_pd(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
 +
 +            /* GENERALIZED BORN AND COULOMB ELECTROSTATICS */
 +            isaprod          = _mm_mul_pd(isai0,isaj0);
 +            gbqqfactor       = _mm_xor_pd(signbit,_mm_mul_pd(qq00,_mm_mul_pd(isaprod,gbinvepsdiff)));
 +            gbscale          = _mm_mul_pd(isaprod,gbtabscale);
 +
 +            /* Calculate generalized born table index - this is a separate table from the normal one,
 +             * but we use the same procedure by multiplying r with scale and truncating to integer.
 +             */
 +            rt               = _mm_mul_pd(r00,gbscale);
 +            gbitab           = _mm_cvttpd_epi32(rt);
 +            gbeps            = _mm_sub_pd(rt,_mm_cvtepi32_pd(gbitab));
 +            gbitab           = _mm_slli_epi32(gbitab,2);
 +
 +            Y                = _mm_load_pd( gbtab + gmx_mm_extract_epi32(gbitab,0) );
 +            F                = _mm_setzero_pd();
 +            GMX_MM_TRANSPOSE2_PD(Y,F);
 +            G                = _mm_load_pd( gbtab + gmx_mm_extract_epi32(gbitab,0) +2);
 +            H                = _mm_setzero_pd();
 +            GMX_MM_TRANSPOSE2_PD(G,H);
 +            Heps             = _mm_mul_pd(gbeps,H);
 +            Fp               = _mm_add_pd(F,_mm_mul_pd(gbeps,_mm_add_pd(G,Heps)));
 +            VV               = _mm_add_pd(Y,_mm_mul_pd(gbeps,Fp));
 +            vgb              = _mm_mul_pd(gbqqfactor,VV);
 +
 +            FF               = _mm_add_pd(Fp,_mm_mul_pd(gbeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
 +            fgb              = _mm_mul_pd(gbqqfactor,_mm_mul_pd(FF,gbscale));
 +            dvdatmp          = _mm_mul_pd(minushalf,_mm_add_pd(vgb,_mm_mul_pd(fgb,r00)));
++            dvdatmp          = _mm_unpacklo_pd(dvdatmp,_mm_setzero_pd());
 +            dvdasum          = _mm_add_pd(dvdasum,dvdatmp);
 +            gmx_mm_increment_1real_pd(dvda+jnrA,_mm_mul_pd(dvdatmp,_mm_mul_pd(isaj0,isaj0)));
 +            velec            = _mm_mul_pd(qq00,rinv00);
 +            felec            = _mm_mul_pd(_mm_sub_pd(_mm_mul_pd(velec,rinv00),fgb),rinv00);
 +
 +            /* LENNARD-JONES DISPERSION/REPULSION */
 +
 +            rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
 +            vvdw6            = _mm_mul_pd(c6_00,rinvsix);
 +            vvdw12           = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
 +            vvdw             = _mm_sub_pd( _mm_mul_pd(vvdw12,one_twelfth) , _mm_mul_pd(vvdw6,one_sixth) );
 +            fvdw             = _mm_mul_pd(_mm_sub_pd(vvdw12,vvdw6),rinvsq00);
 +
 +            /* Update potential sum for this i atom from the interaction with this j atom. */
 +            velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
 +            velecsum         = _mm_add_pd(velecsum,velec);
 +            vgb              = _mm_unpacklo_pd(vgb,_mm_setzero_pd());
 +            vgbsum           = _mm_add_pd(vgbsum,vgb);
 +            vvdw             = _mm_unpacklo_pd(vvdw,_mm_setzero_pd());
 +            vvdwsum          = _mm_add_pd(vvdwsum,vvdw);
 +
 +            fscal            = _mm_add_pd(felec,fvdw);
 +
 +            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
 +
 +            /* Calculate temporary vectorial force */
 +            tx               = _mm_mul_pd(fscal,dx00);
 +            ty               = _mm_mul_pd(fscal,dy00);
 +            tz               = _mm_mul_pd(fscal,dz00);
 +
 +            /* Update vectorial force */
 +            fix0             = _mm_add_pd(fix0,tx);
 +            fiy0             = _mm_add_pd(fiy0,ty);
 +            fiz0             = _mm_add_pd(fiz0,tz);
 +
 +            gmx_mm_decrement_1rvec_1ptr_swizzle_pd(f+j_coord_offsetA,tx,ty,tz);
 +
 +            /* Inner loop uses 71 flops */
 +        }
 +
 +        /* End of innermost loop */
 +
 +        gmx_mm_update_iforce_1atom_swizzle_pd(fix0,fiy0,fiz0,
 +                                              f+i_coord_offset,fshift+i_shift_offset);
 +
 +        ggid                        = gid[iidx];
 +        /* Update potential energies */
 +        gmx_mm_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
 +        gmx_mm_update_1pot_pd(vgbsum,kernel_data->energygrp_polarization+ggid);
 +        gmx_mm_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
 +        dvdasum = _mm_mul_pd(dvdasum, _mm_mul_pd(isai0,isai0));
 +        gmx_mm_update_1pot_pd(dvdasum,dvda+inr);
 +
 +        /* Increment number of inner iterations */
 +        inneriter                  += j_index_end - j_index_start;
 +
 +        /* Outer loop uses 10 flops */
 +    }
 +
 +    /* Increment number of outer iterations */
 +    outeriter        += nri;
 +
 +    /* Update outer/inner flops */
 +
 +    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_VF,outeriter*10 + inneriter*71);
 +}
 +/*
 + * Gromacs nonbonded kernel:   nb_kernel_ElecGB_VdwLJ_GeomP1P1_F_sse2_double
 + * Electrostatics interaction: GeneralizedBorn
 + * VdW interaction:            LennardJones
 + * Geometry:                   Particle-Particle
 + * Calculate force/pot:        Force
 + */
 +void
 +nb_kernel_ElecGB_VdwLJ_GeomP1P1_F_sse2_double
 +                    (t_nblist * gmx_restrict                nlist,
 +                     rvec * gmx_restrict                    xx,
 +                     rvec * gmx_restrict                    ff,
 +                     t_forcerec * gmx_restrict              fr,
 +                     t_mdatoms * gmx_restrict               mdatoms,
 +                     nb_kernel_data_t * gmx_restrict        kernel_data,
 +                     t_nrnb * gmx_restrict                  nrnb)
 +{
 +    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
 +     * just 0 for non-waters.
 +     * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
 +     * jnr indices corresponding to data put in the two positions in the SIMD register.
 +     */
 +    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
 +    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
 +    int              jnrA,jnrB;
 +    int              j_coord_offsetA,j_coord_offsetB;
 +    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
 +    real             rcutoff_scalar;
 +    real             *shiftvec,*fshift,*x,*f;
 +    __m128d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
 +    int              vdwioffset0;
 +    __m128d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
 +    int              vdwjidx0A,vdwjidx0B;
 +    __m128d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
 +    __m128d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
 +    __m128d          velec,felec,velecsum,facel,crf,krf,krf2;
 +    real             *charge;
 +    __m128i          gbitab;
 +    __m128d          vgb,fgb,vgbsum,dvdasum,gbscale,gbtabscale,isaprod,gbqqfactor,gbinvepsdiff,dvdaj,gbeps,dvdatmp;
 +    __m128d          minushalf = _mm_set1_pd(-0.5);
 +    real             *invsqrta,*dvda,*gbtab;
 +    int              nvdwtype;
 +    __m128d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
 +    int              *vdwtype;
 +    real             *vdwparam;
 +    __m128d          one_sixth   = _mm_set1_pd(1.0/6.0);
 +    __m128d          one_twelfth = _mm_set1_pd(1.0/12.0);
 +    __m128i          vfitab;
 +    __m128i          ifour       = _mm_set1_epi32(4);
 +    __m128d          rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
 +    real             *vftab;
 +    __m128d          dummy_mask,cutoff_mask;
 +    __m128d          signbit   = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
 +    __m128d          one     = _mm_set1_pd(1.0);
 +    __m128d          two     = _mm_set1_pd(2.0);
 +    x                = xx[0];
 +    f                = ff[0];
 +
 +    nri              = nlist->nri;
 +    iinr             = nlist->iinr;
 +    jindex           = nlist->jindex;
 +    jjnr             = nlist->jjnr;
 +    shiftidx         = nlist->shift;
 +    gid              = nlist->gid;
 +    shiftvec         = fr->shift_vec[0];
 +    fshift           = fr->fshift[0];
 +    facel            = _mm_set1_pd(fr->epsfac);
 +    charge           = mdatoms->chargeA;
 +    nvdwtype         = fr->ntype;
 +    vdwparam         = fr->nbfp;
 +    vdwtype          = mdatoms->typeA;
 +
 +    invsqrta         = fr->invsqrta;
 +    dvda             = fr->dvda;
 +    gbtabscale       = _mm_set1_pd(fr->gbtab.scale);
 +    gbtab            = fr->gbtab.data;
 +    gbinvepsdiff     = _mm_set1_pd((1.0/fr->epsilon_r) - (1.0/fr->gb_epsilon_solvent));
 +
 +    /* Avoid stupid compiler warnings */
 +    jnrA = jnrB = 0;
 +    j_coord_offsetA = 0;
 +    j_coord_offsetB = 0;
 +
 +    outeriter        = 0;
 +    inneriter        = 0;
 +
 +    /* Start outer loop over neighborlists */
 +    for(iidx=0; iidx<nri; iidx++)
 +    {
 +        /* Load shift vector for this list */
 +        i_shift_offset   = DIM*shiftidx[iidx];
 +
 +        /* Load limits for loop over neighbors */
 +        j_index_start    = jindex[iidx];
 +        j_index_end      = jindex[iidx+1];
 +
 +        /* Get outer coordinate index */
 +        inr              = iinr[iidx];
 +        i_coord_offset   = DIM*inr;
 +
 +        /* Load i particle coords and add shift vector */
 +        gmx_mm_load_shift_and_1rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
 +
 +        fix0             = _mm_setzero_pd();
 +        fiy0             = _mm_setzero_pd();
 +        fiz0             = _mm_setzero_pd();
 +
 +        /* Load parameters for i particles */
 +        iq0              = _mm_mul_pd(facel,_mm_load1_pd(charge+inr+0));
 +        isai0            = _mm_load1_pd(invsqrta+inr+0);
 +        vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
 +
 +        dvdasum          = _mm_setzero_pd();
 +
 +        /* Start inner kernel loop */
 +        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
 +        {
 +
 +            /* Get j neighbor index, and coordinate index */
 +            jnrA             = jjnr[jidx];
 +            jnrB             = jjnr[jidx+1];
 +            j_coord_offsetA  = DIM*jnrA;
 +            j_coord_offsetB  = DIM*jnrB;
 +            
 +            /* load j atom coordinates */
 +            gmx_mm_load_1rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
 +                                              &jx0,&jy0,&jz0);
 +            
 +            /* Calculate displacement vector */
 +            dx00             = _mm_sub_pd(ix0,jx0);
 +            dy00             = _mm_sub_pd(iy0,jy0);
 +            dz00             = _mm_sub_pd(iz0,jz0);
 +
 +            /* Calculate squared distance and things based on it */
 +            rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
 +
 +            rinv00           = gmx_mm_invsqrt_pd(rsq00);
 +
 +            rinvsq00         = _mm_mul_pd(rinv00,rinv00);
 +
 +            /* Load parameters for j particles */
 +            jq0              = gmx_mm_load_2real_swizzle_pd(charge+jnrA+0,charge+jnrB+0);
 +            isaj0            = gmx_mm_load_2real_swizzle_pd(invsqrta+jnrA+0,invsqrta+jnrB+0);
 +            vdwjidx0A        = 2*vdwtype[jnrA+0];
 +            vdwjidx0B        = 2*vdwtype[jnrB+0];
 +
 +            /**************************
 +             * CALCULATE INTERACTIONS *
 +             **************************/
 +
 +            r00              = _mm_mul_pd(rsq00,rinv00);
 +
 +            /* Compute parameters for interactions between i and j atoms */
 +            qq00             = _mm_mul_pd(iq0,jq0);
 +            gmx_mm_load_2pair_swizzle_pd(vdwparam+vdwioffset0+vdwjidx0A,
 +                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
 +
 +            /* GENERALIZED BORN AND COULOMB ELECTROSTATICS */
 +            isaprod          = _mm_mul_pd(isai0,isaj0);
 +            gbqqfactor       = _mm_xor_pd(signbit,_mm_mul_pd(qq00,_mm_mul_pd(isaprod,gbinvepsdiff)));
 +            gbscale          = _mm_mul_pd(isaprod,gbtabscale);
 +
 +            /* Calculate generalized born table index - this is a separate table from the normal one,
 +             * but we use the same procedure by multiplying r by the scale and truncating to an integer.
 +             */
 +            rt               = _mm_mul_pd(r00,gbscale);
 +            gbitab           = _mm_cvttpd_epi32(rt);
 +            gbeps            = _mm_sub_pd(rt,_mm_cvtepi32_pd(gbitab));
 +            gbitab           = _mm_slli_epi32(gbitab,2);
 +
 +            Y                = _mm_load_pd( gbtab + gmx_mm_extract_epi32(gbitab,0) );
 +            F                = _mm_load_pd( gbtab + gmx_mm_extract_epi32(gbitab,1) );
 +            GMX_MM_TRANSPOSE2_PD(Y,F);
 +            G                = _mm_load_pd( gbtab + gmx_mm_extract_epi32(gbitab,0) +2);
 +            H                = _mm_load_pd( gbtab + gmx_mm_extract_epi32(gbitab,1) +2);
 +            GMX_MM_TRANSPOSE2_PD(G,H);
 +            Heps             = _mm_mul_pd(gbeps,H);
 +            Fp               = _mm_add_pd(F,_mm_mul_pd(gbeps,_mm_add_pd(G,Heps)));
 +            VV               = _mm_add_pd(Y,_mm_mul_pd(gbeps,Fp));
 +            vgb              = _mm_mul_pd(gbqqfactor,VV);
 +
 +            FF               = _mm_add_pd(Fp,_mm_mul_pd(gbeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
 +            fgb              = _mm_mul_pd(gbqqfactor,_mm_mul_pd(FF,gbscale));
 +            dvdatmp          = _mm_mul_pd(minushalf,_mm_add_pd(vgb,_mm_mul_pd(fgb,r00)));
 +            dvdasum          = _mm_add_pd(dvdasum,dvdatmp);
 +            gmx_mm_increment_2real_swizzle_pd(dvda+jnrA,dvda+jnrB,_mm_mul_pd(dvdatmp,_mm_mul_pd(isaj0,isaj0)));
 +            velec            = _mm_mul_pd(qq00,rinv00);
 +            felec            = _mm_mul_pd(_mm_sub_pd(_mm_mul_pd(velec,rinv00),fgb),rinv00);
 +
 +            /* LENNARD-JONES DISPERSION/REPULSION */
 +
 +            rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
 +            fvdw             = _mm_mul_pd(_mm_sub_pd(_mm_mul_pd(c12_00,rinvsix),c6_00),_mm_mul_pd(rinvsix,rinvsq00));
 +
 +            fscal            = _mm_add_pd(felec,fvdw);
 +
 +            /* Calculate temporary vectorial force */
 +            tx               = _mm_mul_pd(fscal,dx00);
 +            ty               = _mm_mul_pd(fscal,dy00);
 +            tz               = _mm_mul_pd(fscal,dz00);
 +
 +            /* Update vectorial force */
 +            fix0             = _mm_add_pd(fix0,tx);
 +            fiy0             = _mm_add_pd(fiy0,ty);
 +            fiz0             = _mm_add_pd(fiz0,tz);
 +
 +            gmx_mm_decrement_1rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,tx,ty,tz);
 +
 +            /* Inner loop uses 64 flops */
 +        }
 +
 +        if(jidx<j_index_end)
 +        {
 +
 +            jnrA             = jjnr[jidx];
 +            j_coord_offsetA  = DIM*jnrA;
 +
 +            /* load j atom coordinates */
 +            gmx_mm_load_1rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
 +                                              &jx0,&jy0,&jz0);
 +            
 +            /* Calculate displacement vector */
 +            dx00             = _mm_sub_pd(ix0,jx0);
 +            dy00             = _mm_sub_pd(iy0,jy0);
 +            dz00             = _mm_sub_pd(iz0,jz0);
 +
 +            /* Calculate squared distance and things based on it */
 +            rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
 +
 +            rinv00           = gmx_mm_invsqrt_pd(rsq00);
 +
 +            rinvsq00         = _mm_mul_pd(rinv00,rinv00);
 +
 +            /* Load parameters for j particles */
 +            jq0              = _mm_load_sd(charge+jnrA+0);
 +            isaj0            = _mm_load_sd(invsqrta+jnrA+0);
 +            vdwjidx0A        = 2*vdwtype[jnrA+0];
 +
 +            /**************************
 +             * CALCULATE INTERACTIONS *
 +             **************************/
 +
 +            r00              = _mm_mul_pd(rsq00,rinv00);
 +
 +            /* Compute parameters for interactions between i and j atoms */
 +            qq00             = _mm_mul_pd(iq0,jq0);
 +            gmx_mm_load_1pair_swizzle_pd(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
 +
 +            /* GENERALIZED BORN AND COULOMB ELECTROSTATICS */
 +            isaprod          = _mm_mul_pd(isai0,isaj0);
 +            gbqqfactor       = _mm_xor_pd(signbit,_mm_mul_pd(qq00,_mm_mul_pd(isaprod,gbinvepsdiff)));
 +            gbscale          = _mm_mul_pd(isaprod,gbtabscale);
 +
 +            /* Calculate generalized born table index - this is a separate table from the normal one,
 +             * but we use the same procedure by multiplying r by the scale and truncating to an integer.
 +             */
 +            rt               = _mm_mul_pd(r00,gbscale);
 +            gbitab           = _mm_cvttpd_epi32(rt);
 +            gbeps            = _mm_sub_pd(rt,_mm_cvtepi32_pd(gbitab));
 +            gbitab           = _mm_slli_epi32(gbitab,2);
 +
 +            Y                = _mm_load_pd( gbtab + gmx_mm_extract_epi32(gbitab,0) );
 +            F                = _mm_setzero_pd();
 +            GMX_MM_TRANSPOSE2_PD(Y,F);
 +            G                = _mm_load_pd( gbtab + gmx_mm_extract_epi32(gbitab,0) +2);
 +            H                = _mm_setzero_pd();
 +            GMX_MM_TRANSPOSE2_PD(G,H);
 +            Heps             = _mm_mul_pd(gbeps,H);
 +            Fp               = _mm_add_pd(F,_mm_mul_pd(gbeps,_mm_add_pd(G,Heps)));
 +            VV               = _mm_add_pd(Y,_mm_mul_pd(gbeps,Fp));
 +            vgb              = _mm_mul_pd(gbqqfactor,VV);
 +
 +            FF               = _mm_add_pd(Fp,_mm_mul_pd(gbeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
 +            fgb              = _mm_mul_pd(gbqqfactor,_mm_mul_pd(FF,gbscale));
 +            dvdatmp          = _mm_mul_pd(minushalf,_mm_add_pd(vgb,_mm_mul_pd(fgb,r00)));
++            dvdatmp          = _mm_unpacklo_pd(dvdatmp,_mm_setzero_pd());
 +            dvdasum          = _mm_add_pd(dvdasum,dvdatmp);
 +            gmx_mm_increment_1real_pd(dvda+jnrA,_mm_mul_pd(dvdatmp,_mm_mul_pd(isaj0,isaj0)));
 +            velec            = _mm_mul_pd(qq00,rinv00);
 +            felec            = _mm_mul_pd(_mm_sub_pd(_mm_mul_pd(velec,rinv00),fgb),rinv00);
 +
 +            /* LENNARD-JONES DISPERSION/REPULSION */
 +
 +            rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
 +            fvdw             = _mm_mul_pd(_mm_sub_pd(_mm_mul_pd(c12_00,rinvsix),c6_00),_mm_mul_pd(rinvsix,rinvsq00));
 +
 +            fscal            = _mm_add_pd(felec,fvdw);
 +
 +            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
 +
 +            /* Calculate temporary vectorial force */
 +            tx               = _mm_mul_pd(fscal,dx00);
 +            ty               = _mm_mul_pd(fscal,dy00);
 +            tz               = _mm_mul_pd(fscal,dz00);
 +
 +            /* Update vectorial force */
 +            fix0             = _mm_add_pd(fix0,tx);
 +            fiy0             = _mm_add_pd(fiy0,ty);
 +            fiz0             = _mm_add_pd(fiz0,tz);
 +
 +            gmx_mm_decrement_1rvec_1ptr_swizzle_pd(f+j_coord_offsetA,tx,ty,tz);
 +
 +            /* Inner loop uses 64 flops */
 +        }
 +
 +        /* End of innermost loop */
 +
 +        gmx_mm_update_iforce_1atom_swizzle_pd(fix0,fiy0,fiz0,
 +                                              f+i_coord_offset,fshift+i_shift_offset);
 +
 +        dvdasum = _mm_mul_pd(dvdasum, _mm_mul_pd(isai0,isai0));
 +        gmx_mm_update_1pot_pd(dvdasum,dvda+inr);
 +
 +        /* Increment number of inner iterations */
 +        inneriter                  += j_index_end - j_index_start;
 +
 +        /* Outer loop uses 7 flops */
 +    }
 +
 +    /* Increment number of outer iterations */
 +    outeriter        += nri;
 +
 +    /* Update outer/inner flops */
 +
 +    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_F,outeriter*7 + inneriter*64);
 +}
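
For reference, the per-pair arithmetic in the inner loops above reduces to the following scalar form. This is a sketch for illustration only; it assumes vdwparam stores 6*C6 and 12*C12, which the one_sixth/one_twelfth constants declared for the potential-computing kernel variants suggest:

    /* Scalar equivalent of the felec/fvdw/fscal assembly in the loops above. */
    static double
    gb_lj_fscal(double qq, double rinv, double fgb, double c6, double c12)
    {
        double rinvsq  = rinv*rinv;
        double rinvsix = rinvsq*rinvsq*rinvsq;
        double velec   = qq*rinv;                           /* Coulomb energy       */
        double felec   = (velec*rinv - fgb)*rinv;           /* Coulomb + GB force/r */
        double fvdw    = (c12*rinvsix - c6)*rinvsix*rinvsq; /* LJ force/r           */
        return felec + fvdw;   /* scales dx,dy,dz to give the force vector */
    }
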
index 830d40a2f0a6ad2383af15ab0f30a8cb116b0741,0000000000000000000000000000000000000000..a5845eedd8a4a132b0c4de15bfe41defbb317241
mode 100644,000000..100644
--- /dev/null
@@@ -1,642 -1,0 +1,644 @@@
 +/*
 + * Note: this file was generated by the Gromacs sse2_double kernel generator.
 + *
 + *                This source code is part of
 + *
 + *                 G   R   O   M   A   C   S
 + *
 + * Copyright (c) 2001-2012, The GROMACS Development Team
 + *
 + * Gromacs is a library for molecular simulation and trajectory analysis,
 + * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
 + * a full list of developers and information, check out http://www.gromacs.org
 + *
 + * This program is free software; you can redistribute it and/or modify it under
 + * the terms of the GNU Lesser General Public License as published by the Free
 + * Software Foundation; either version 2 of the License, or (at your option) any
 + * later version.
 + *
 + * To help fund GROMACS development, we humbly ask that you cite
 + * the papers people have written on it - you can find them on the website.
 + */
 +#ifdef HAVE_CONFIG_H
 +#include <config.h>
 +#endif
 +
 +#include <math.h>
 +
 +#include "../nb_kernel.h"
 +#include "types/simple.h"
 +#include "vec.h"
 +#include "nrnb.h"
 +
 +#include "gmx_math_x86_sse2_double.h"
 +#include "kernelutil_x86_sse2_double.h"
 +
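
The gmx_math/kernelutil headers supply gmx_mm_invsqrt_pd, used for every 1/sqrt(rsq) below; presumably it refines a low-precision reciprocal-square-root estimate with Newton-Raphson steps. A scalar sketch of one such step, an assumption about the helper's internals for orientation only:

    static double
    nr_invsqrt_step(double x, double est)
    {
        /* One Newton-Raphson iteration for y = 1/sqrt(x):
         * y' = y*(3 - x*y*y)/2, roughly doubling the correct bits. */
        return est*(1.5 - 0.5*x*est*est);
    }
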
 +/*
 + * Gromacs nonbonded kernel:   nb_kernel_ElecGB_VdwNone_GeomP1P1_VF_sse2_double
 + * Electrostatics interaction: GeneralizedBorn
 + * VdW interaction:            None
 + * Geometry:                   Particle-Particle
 + * Calculate force/pot:        PotentialAndForce
 + */
 +void
 +nb_kernel_ElecGB_VdwNone_GeomP1P1_VF_sse2_double
 +                    (t_nblist * gmx_restrict                nlist,
 +                     rvec * gmx_restrict                    xx,
 +                     rvec * gmx_restrict                    ff,
 +                     t_forcerec * gmx_restrict              fr,
 +                     t_mdatoms * gmx_restrict               mdatoms,
 +                     nb_kernel_data_t * gmx_restrict        kernel_data,
 +                     t_nrnb * gmx_restrict                  nrnb)
 +{
 +    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
 +     * just 0 for non-waters.
 +     * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
 +     * jnr indices corresponding to data put in the two positions in the SIMD register.
 +     */
 +    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
 +    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
 +    int              jnrA,jnrB;
 +    int              j_coord_offsetA,j_coord_offsetB;
 +    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
 +    real             rcutoff_scalar;
 +    real             *shiftvec,*fshift,*x,*f;
 +    __m128d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
 +    int              vdwioffset0;
 +    __m128d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
 +    int              vdwjidx0A,vdwjidx0B;
 +    __m128d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
 +    __m128d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
 +    __m128d          velec,felec,velecsum,facel,crf,krf,krf2;
 +    real             *charge;
 +    __m128i          gbitab;
 +    __m128d          vgb,fgb,vgbsum,dvdasum,gbscale,gbtabscale,isaprod,gbqqfactor,gbinvepsdiff,dvdaj,gbeps,dvdatmp;
 +    __m128d          minushalf = _mm_set1_pd(-0.5);
 +    real             *invsqrta,*dvda,*gbtab;
 +    __m128i          vfitab;
 +    __m128i          ifour       = _mm_set1_epi32(4);
 +    __m128d          rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
 +    real             *vftab;
 +    __m128d          dummy_mask,cutoff_mask;
 +    __m128d          signbit   = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
 +    __m128d          one     = _mm_set1_pd(1.0);
 +    __m128d          two     = _mm_set1_pd(2.0);
 +    x                = xx[0];
 +    f                = ff[0];
 +
 +    nri              = nlist->nri;
 +    iinr             = nlist->iinr;
 +    jindex           = nlist->jindex;
 +    jjnr             = nlist->jjnr;
 +    shiftidx         = nlist->shift;
 +    gid              = nlist->gid;
 +    shiftvec         = fr->shift_vec[0];
 +    fshift           = fr->fshift[0];
 +    facel            = _mm_set1_pd(fr->epsfac);
 +    charge           = mdatoms->chargeA;
 +
 +    invsqrta         = fr->invsqrta;
 +    dvda             = fr->dvda;
 +    gbtabscale       = _mm_set1_pd(fr->gbtab.scale);
 +    gbtab            = fr->gbtab.data;
 +    gbinvepsdiff     = _mm_set1_pd((1.0/fr->epsilon_r) - (1.0/fr->gb_epsilon_solvent));
 +
 +    /* Avoid stupid compiler warnings */
 +    jnrA = jnrB = 0;
 +    j_coord_offsetA = 0;
 +    j_coord_offsetB = 0;
 +
 +    outeriter        = 0;
 +    inneriter        = 0;
 +
 +    /* Start outer loop over neighborlists */
 +    for(iidx=0; iidx<nri; iidx++)
 +    {
 +        /* Load shift vector for this list */
 +        i_shift_offset   = DIM*shiftidx[iidx];
 +
 +        /* Load limits for loop over neighbors */
 +        j_index_start    = jindex[iidx];
 +        j_index_end      = jindex[iidx+1];
 +
 +        /* Get outer coordinate index */
 +        inr              = iinr[iidx];
 +        i_coord_offset   = DIM*inr;
 +
 +        /* Load i particle coords and add shift vector */
 +        gmx_mm_load_shift_and_1rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
 +
 +        fix0             = _mm_setzero_pd();
 +        fiy0             = _mm_setzero_pd();
 +        fiz0             = _mm_setzero_pd();
 +
 +        /* Load parameters for i particles */
 +        iq0              = _mm_mul_pd(facel,_mm_load1_pd(charge+inr+0));
 +        isai0            = _mm_load1_pd(invsqrta+inr+0);
 +
 +        /* Reset potential sums */
 +        velecsum         = _mm_setzero_pd();
 +        vgbsum           = _mm_setzero_pd();
 +        dvdasum          = _mm_setzero_pd();
 +
 +        /* Start inner kernel loop */
 +        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
 +        {
 +
 +            /* Get j neighbor index, and coordinate index */
 +            jnrA             = jjnr[jidx];
 +            jnrB             = jjnr[jidx+1];
 +            j_coord_offsetA  = DIM*jnrA;
 +            j_coord_offsetB  = DIM*jnrB;
 +            
 +            /* load j atom coordinates */
 +            gmx_mm_load_1rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
 +                                              &jx0,&jy0,&jz0);
 +            
 +            /* Calculate displacement vector */
 +            dx00             = _mm_sub_pd(ix0,jx0);
 +            dy00             = _mm_sub_pd(iy0,jy0);
 +            dz00             = _mm_sub_pd(iz0,jz0);
 +
 +            /* Calculate squared distance and things based on it */
 +            rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
 +
 +            rinv00           = gmx_mm_invsqrt_pd(rsq00);
 +
 +            /* Load parameters for j particles */
 +            jq0              = gmx_mm_load_2real_swizzle_pd(charge+jnrA+0,charge+jnrB+0);
 +            isaj0            = gmx_mm_load_2real_swizzle_pd(invsqrta+jnrA+0,invsqrta+jnrB+0);
 +
 +            /**************************
 +             * CALCULATE INTERACTIONS *
 +             **************************/
 +
 +            r00              = _mm_mul_pd(rsq00,rinv00);
 +
 +            /* Compute parameters for interactions between i and j atoms */
 +            qq00             = _mm_mul_pd(iq0,jq0);
 +
 +            /* GENERALIZED BORN AND COULOMB ELECTROSTATICS */
 +            isaprod          = _mm_mul_pd(isai0,isaj0);
 +            gbqqfactor       = _mm_xor_pd(signbit,_mm_mul_pd(qq00,_mm_mul_pd(isaprod,gbinvepsdiff)));
 +            gbscale          = _mm_mul_pd(isaprod,gbtabscale);
 +
 +            /* Calculate generalized born table index - this is a separate table from the normal one,
 +             * but we use the same procedure by multiplying r by the scale and truncating to an integer.
 +             */
 +            rt               = _mm_mul_pd(r00,gbscale);
 +            gbitab           = _mm_cvttpd_epi32(rt);
 +            gbeps            = _mm_sub_pd(rt,_mm_cvtepi32_pd(gbitab));
 +            gbitab           = _mm_slli_epi32(gbitab,2);
 +
 +            Y                = _mm_load_pd( gbtab + gmx_mm_extract_epi32(gbitab,0) );
 +            F                = _mm_load_pd( gbtab + gmx_mm_extract_epi32(gbitab,1) );
 +            GMX_MM_TRANSPOSE2_PD(Y,F);
 +            G                = _mm_load_pd( gbtab + gmx_mm_extract_epi32(gbitab,0) +2);
 +            H                = _mm_load_pd( gbtab + gmx_mm_extract_epi32(gbitab,1) +2);
 +            GMX_MM_TRANSPOSE2_PD(G,H);
 +            Heps             = _mm_mul_pd(gbeps,H);
 +            Fp               = _mm_add_pd(F,_mm_mul_pd(gbeps,_mm_add_pd(G,Heps)));
 +            VV               = _mm_add_pd(Y,_mm_mul_pd(gbeps,Fp));
 +            vgb              = _mm_mul_pd(gbqqfactor,VV);
 +
 +            FF               = _mm_add_pd(Fp,_mm_mul_pd(gbeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
 +            fgb              = _mm_mul_pd(gbqqfactor,_mm_mul_pd(FF,gbscale));
 +            dvdatmp          = _mm_mul_pd(minushalf,_mm_add_pd(vgb,_mm_mul_pd(fgb,r00)));
 +            dvdasum          = _mm_add_pd(dvdasum,dvdatmp);
 +            gmx_mm_increment_2real_swizzle_pd(dvda+jnrA,dvda+jnrB,_mm_mul_pd(dvdatmp,_mm_mul_pd(isaj0,isaj0)));
 +            velec            = _mm_mul_pd(qq00,rinv00);
 +            felec            = _mm_mul_pd(_mm_sub_pd(_mm_mul_pd(velec,rinv00),fgb),rinv00);
 +
 +            /* Update potential sum for this i atom from the interaction with this j atom. */
 +            velecsum         = _mm_add_pd(velecsum,velec);
 +            vgbsum           = _mm_add_pd(vgbsum,vgb);
 +
 +            fscal            = felec;
 +
 +            /* Calculate temporary vectorial force */
 +            tx               = _mm_mul_pd(fscal,dx00);
 +            ty               = _mm_mul_pd(fscal,dy00);
 +            tz               = _mm_mul_pd(fscal,dz00);
 +
 +            /* Update vectorial force */
 +            fix0             = _mm_add_pd(fix0,tx);
 +            fiy0             = _mm_add_pd(fiy0,ty);
 +            fiz0             = _mm_add_pd(fiz0,tz);
 +
 +            gmx_mm_decrement_1rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,tx,ty,tz);
 +
 +            /* Inner loop uses 58 flops */
 +        }
 +
 +        if(jidx<j_index_end)
 +        {
 +
 +            jnrA             = jjnr[jidx];
 +            j_coord_offsetA  = DIM*jnrA;
 +
 +            /* load j atom coordinates */
 +            gmx_mm_load_1rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
 +                                              &jx0,&jy0,&jz0);
 +            
 +            /* Calculate displacement vector */
 +            dx00             = _mm_sub_pd(ix0,jx0);
 +            dy00             = _mm_sub_pd(iy0,jy0);
 +            dz00             = _mm_sub_pd(iz0,jz0);
 +
 +            /* Calculate squared distance and things based on it */
 +            rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
 +
 +            rinv00           = gmx_mm_invsqrt_pd(rsq00);
 +
 +            /* Load parameters for j particles */
 +            jq0              = _mm_load_sd(charge+jnrA+0);
 +            isaj0            = _mm_load_sd(invsqrta+jnrA+0);
 +
 +            /**************************
 +             * CALCULATE INTERACTIONS *
 +             **************************/
 +
 +            r00              = _mm_mul_pd(rsq00,rinv00);
 +
 +            /* Compute parameters for interactions between i and j atoms */
 +            qq00             = _mm_mul_pd(iq0,jq0);
 +
 +            /* GENERALIZED BORN AND COULOMB ELECTROSTATICS */
 +            isaprod          = _mm_mul_pd(isai0,isaj0);
 +            gbqqfactor       = _mm_xor_pd(signbit,_mm_mul_pd(qq00,_mm_mul_pd(isaprod,gbinvepsdiff)));
 +            gbscale          = _mm_mul_pd(isaprod,gbtabscale);
 +
 +            /* Calculate generalized born table index - this is a separate table from the normal one,
 +             * but we use the same procedure by multiplying r by the scale and truncating to an integer.
 +             */
 +            rt               = _mm_mul_pd(r00,gbscale);
 +            gbitab           = _mm_cvttpd_epi32(rt);
 +            gbeps            = _mm_sub_pd(rt,_mm_cvtepi32_pd(gbitab));
 +            gbitab           = _mm_slli_epi32(gbitab,2);
 +
 +            Y                = _mm_load_pd( gbtab + gmx_mm_extract_epi32(gbitab,0) );
 +            F                = _mm_setzero_pd();
 +            GMX_MM_TRANSPOSE2_PD(Y,F);
 +            G                = _mm_load_pd( gbtab + gmx_mm_extract_epi32(gbitab,0) +2);
 +            H                = _mm_setzero_pd();
 +            GMX_MM_TRANSPOSE2_PD(G,H);
 +            Heps             = _mm_mul_pd(gbeps,H);
 +            Fp               = _mm_add_pd(F,_mm_mul_pd(gbeps,_mm_add_pd(G,Heps)));
 +            VV               = _mm_add_pd(Y,_mm_mul_pd(gbeps,Fp));
 +            vgb              = _mm_mul_pd(gbqqfactor,VV);
 +
 +            FF               = _mm_add_pd(Fp,_mm_mul_pd(gbeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
 +            fgb              = _mm_mul_pd(gbqqfactor,_mm_mul_pd(FF,gbscale));
 +            dvdatmp          = _mm_mul_pd(minushalf,_mm_add_pd(vgb,_mm_mul_pd(fgb,r00)));
++            dvdatmp          = _mm_unpacklo_pd(dvdatmp,_mm_setzero_pd());
 +            dvdasum          = _mm_add_pd(dvdasum,dvdatmp);
 +            gmx_mm_increment_1real_pd(dvda+jnrA,_mm_mul_pd(dvdatmp,_mm_mul_pd(isaj0,isaj0)));
 +            velec            = _mm_mul_pd(qq00,rinv00);
 +            felec            = _mm_mul_pd(_mm_sub_pd(_mm_mul_pd(velec,rinv00),fgb),rinv00);
 +
 +            /* Update potential sum for this i atom from the interaction with this j atom. */
 +            velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
 +            velecsum         = _mm_add_pd(velecsum,velec);
 +            vgb              = _mm_unpacklo_pd(vgb,_mm_setzero_pd());
 +            vgbsum           = _mm_add_pd(vgbsum,vgb);
 +
 +            fscal            = felec;
 +
 +            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
 +
 +            /* Calculate temporary vectorial force */
 +            tx               = _mm_mul_pd(fscal,dx00);
 +            ty               = _mm_mul_pd(fscal,dy00);
 +            tz               = _mm_mul_pd(fscal,dz00);
 +
 +            /* Update vectorial force */
 +            fix0             = _mm_add_pd(fix0,tx);
 +            fiy0             = _mm_add_pd(fiy0,ty);
 +            fiz0             = _mm_add_pd(fiz0,tz);
 +
 +            gmx_mm_decrement_1rvec_1ptr_swizzle_pd(f+j_coord_offsetA,tx,ty,tz);
 +
 +            /* Inner loop uses 58 flops */
 +        }
 +
 +        /* End of innermost loop */
 +
 +        gmx_mm_update_iforce_1atom_swizzle_pd(fix0,fiy0,fiz0,
 +                                              f+i_coord_offset,fshift+i_shift_offset);
 +
 +        ggid                        = gid[iidx];
 +        /* Update potential energies */
 +        gmx_mm_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
 +        gmx_mm_update_1pot_pd(vgbsum,kernel_data->energygrp_polarization+ggid);
 +        dvdasum = _mm_mul_pd(dvdasum, _mm_mul_pd(isai0,isai0));
 +        gmx_mm_update_1pot_pd(dvdasum,dvda+inr);
 +
 +        /* Increment number of inner iterations */
 +        inneriter                  += j_index_end - j_index_start;
 +
 +        /* Outer loop uses 9 flops */
 +    }
 +
 +    /* Increment number of outer iterations */
 +    outeriter        += nri;
 +
 +    /* Update outer/inner flops */
 +
 +    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VF,outeriter*9 + inneriter*58);
 +}
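
When a neighbor list has odd length, the epilogue above processes a single j atom but still operates on full two-lane __m128d registers; the _mm_unpacklo_pd(...,_mm_setzero_pd()) lines clear the unused upper lane so stale data cannot leak into the accumulators. A minimal sketch of that masking idiom (hypothetical helper name):

    #include <emmintrin.h>

    /* Keep only the low lane of v: _mm_unpacklo_pd(a,b) returns {a0,b0},
     * so pairing v with zero yields {v0, 0.0}. */
    static __m128d
    mask_low_lane(__m128d v)
    {
        return _mm_unpacklo_pd(v, _mm_setzero_pd());
    }
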
 +/*
 + * Gromacs nonbonded kernel:   nb_kernel_ElecGB_VdwNone_GeomP1P1_F_sse2_double
 + * Electrostatics interaction: GeneralizedBorn
 + * VdW interaction:            None
 + * Geometry:                   Particle-Particle
 + * Calculate force/pot:        Force
 + */
 +void
 +nb_kernel_ElecGB_VdwNone_GeomP1P1_F_sse2_double
 +                    (t_nblist * gmx_restrict                nlist,
 +                     rvec * gmx_restrict                    xx,
 +                     rvec * gmx_restrict                    ff,
 +                     t_forcerec * gmx_restrict              fr,
 +                     t_mdatoms * gmx_restrict               mdatoms,
 +                     nb_kernel_data_t * gmx_restrict        kernel_data,
 +                     t_nrnb * gmx_restrict                  nrnb)
 +{
 +    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
 +     * just 0 for non-waters.
 +     * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
 +     * jnr indices corresponding to data put in the two positions in the SIMD register.
 +     */
 +    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
 +    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
 +    int              jnrA,jnrB;
 +    int              j_coord_offsetA,j_coord_offsetB;
 +    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
 +    real             rcutoff_scalar;
 +    real             *shiftvec,*fshift,*x,*f;
 +    __m128d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
 +    int              vdwioffset0;
 +    __m128d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
 +    int              vdwjidx0A,vdwjidx0B;
 +    __m128d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
 +    __m128d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
 +    __m128d          velec,felec,velecsum,facel,crf,krf,krf2;
 +    real             *charge;
 +    __m128i          gbitab;
 +    __m128d          vgb,fgb,vgbsum,dvdasum,gbscale,gbtabscale,isaprod,gbqqfactor,gbinvepsdiff,dvdaj,gbeps,dvdatmp;
 +    __m128d          minushalf = _mm_set1_pd(-0.5);
 +    real             *invsqrta,*dvda,*gbtab;
 +    __m128i          vfitab;
 +    __m128i          ifour       = _mm_set1_epi32(4);
 +    __m128d          rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
 +    real             *vftab;
 +    __m128d          dummy_mask,cutoff_mask;
 +    __m128d          signbit   = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
 +    __m128d          one     = _mm_set1_pd(1.0);
 +    __m128d          two     = _mm_set1_pd(2.0);
 +    x                = xx[0];
 +    f                = ff[0];
 +
 +    nri              = nlist->nri;
 +    iinr             = nlist->iinr;
 +    jindex           = nlist->jindex;
 +    jjnr             = nlist->jjnr;
 +    shiftidx         = nlist->shift;
 +    gid              = nlist->gid;
 +    shiftvec         = fr->shift_vec[0];
 +    fshift           = fr->fshift[0];
 +    facel            = _mm_set1_pd(fr->epsfac);
 +    charge           = mdatoms->chargeA;
 +
 +    invsqrta         = fr->invsqrta;
 +    dvda             = fr->dvda;
 +    gbtabscale       = _mm_set1_pd(fr->gbtab.scale);
 +    gbtab            = fr->gbtab.data;
 +    gbinvepsdiff     = _mm_set1_pd((1.0/fr->epsilon_r) - (1.0/fr->gb_epsilon_solvent));
 +
 +    /* Avoid stupid compiler warnings */
 +    jnrA = jnrB = 0;
 +    j_coord_offsetA = 0;
 +    j_coord_offsetB = 0;
 +
 +    outeriter        = 0;
 +    inneriter        = 0;
 +
 +    /* Start outer loop over neighborlists */
 +    for(iidx=0; iidx<nri; iidx++)
 +    {
 +        /* Load shift vector for this list */
 +        i_shift_offset   = DIM*shiftidx[iidx];
 +
 +        /* Load limits for loop over neighbors */
 +        j_index_start    = jindex[iidx];
 +        j_index_end      = jindex[iidx+1];
 +
 +        /* Get outer coordinate index */
 +        inr              = iinr[iidx];
 +        i_coord_offset   = DIM*inr;
 +
 +        /* Load i particle coords and add shift vector */
 +        gmx_mm_load_shift_and_1rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
 +
 +        fix0             = _mm_setzero_pd();
 +        fiy0             = _mm_setzero_pd();
 +        fiz0             = _mm_setzero_pd();
 +
 +        /* Load parameters for i particles */
 +        iq0              = _mm_mul_pd(facel,_mm_load1_pd(charge+inr+0));
 +        isai0            = _mm_load1_pd(invsqrta+inr+0);
 +
 +        dvdasum          = _mm_setzero_pd();
 +
 +        /* Start inner kernel loop */
 +        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
 +        {
 +
 +            /* Get j neighbor index, and coordinate index */
 +            jnrA             = jjnr[jidx];
 +            jnrB             = jjnr[jidx+1];
 +            j_coord_offsetA  = DIM*jnrA;
 +            j_coord_offsetB  = DIM*jnrB;
 +            
 +            /* load j atom coordinates */
 +            gmx_mm_load_1rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
 +                                              &jx0,&jy0,&jz0);
 +            
 +            /* Calculate displacement vector */
 +            dx00             = _mm_sub_pd(ix0,jx0);
 +            dy00             = _mm_sub_pd(iy0,jy0);
 +            dz00             = _mm_sub_pd(iz0,jz0);
 +
 +            /* Calculate squared distance and things based on it */
 +            rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
 +
 +            rinv00           = gmx_mm_invsqrt_pd(rsq00);
 +
 +            /* Load parameters for j particles */
 +            jq0              = gmx_mm_load_2real_swizzle_pd(charge+jnrA+0,charge+jnrB+0);
 +            isaj0            = gmx_mm_load_2real_swizzle_pd(invsqrta+jnrA+0,invsqrta+jnrB+0);
 +
 +            /**************************
 +             * CALCULATE INTERACTIONS *
 +             **************************/
 +
 +            r00              = _mm_mul_pd(rsq00,rinv00);
 +
 +            /* Compute parameters for interactions between i and j atoms */
 +            qq00             = _mm_mul_pd(iq0,jq0);
 +
 +            /* GENERALIZED BORN AND COULOMB ELECTROSTATICS */
 +            isaprod          = _mm_mul_pd(isai0,isaj0);
 +            gbqqfactor       = _mm_xor_pd(signbit,_mm_mul_pd(qq00,_mm_mul_pd(isaprod,gbinvepsdiff)));
 +            gbscale          = _mm_mul_pd(isaprod,gbtabscale);
 +
 +            /* Calculate generalized born table index - this is a separate table from the normal one,
 +             * but we use the same procedure by multiplying r by the scale and truncating to an integer.
 +             */
 +            rt               = _mm_mul_pd(r00,gbscale);
 +            gbitab           = _mm_cvttpd_epi32(rt);
 +            gbeps            = _mm_sub_pd(rt,_mm_cvtepi32_pd(gbitab));
 +            gbitab           = _mm_slli_epi32(gbitab,2);
 +
 +            Y                = _mm_load_pd( gbtab + gmx_mm_extract_epi32(gbitab,0) );
 +            F                = _mm_load_pd( gbtab + gmx_mm_extract_epi32(gbitab,1) );
 +            GMX_MM_TRANSPOSE2_PD(Y,F);
 +            G                = _mm_load_pd( gbtab + gmx_mm_extract_epi32(gbitab,0) +2);
 +            H                = _mm_load_pd( gbtab + gmx_mm_extract_epi32(gbitab,1) +2);
 +            GMX_MM_TRANSPOSE2_PD(G,H);
 +            Heps             = _mm_mul_pd(gbeps,H);
 +            Fp               = _mm_add_pd(F,_mm_mul_pd(gbeps,_mm_add_pd(G,Heps)));
 +            VV               = _mm_add_pd(Y,_mm_mul_pd(gbeps,Fp));
 +            vgb              = _mm_mul_pd(gbqqfactor,VV);
 +
 +            FF               = _mm_add_pd(Fp,_mm_mul_pd(gbeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
 +            fgb              = _mm_mul_pd(gbqqfactor,_mm_mul_pd(FF,gbscale));
 +            dvdatmp          = _mm_mul_pd(minushalf,_mm_add_pd(vgb,_mm_mul_pd(fgb,r00)));
 +            dvdasum          = _mm_add_pd(dvdasum,dvdatmp);
 +            gmx_mm_increment_2real_swizzle_pd(dvda+jnrA,dvda+jnrB,_mm_mul_pd(dvdatmp,_mm_mul_pd(isaj0,isaj0)));
 +            velec            = _mm_mul_pd(qq00,rinv00);
 +            felec            = _mm_mul_pd(_mm_sub_pd(_mm_mul_pd(velec,rinv00),fgb),rinv00);
 +
 +            fscal            = felec;
 +
 +            /* Calculate temporary vectorial force */
 +            tx               = _mm_mul_pd(fscal,dx00);
 +            ty               = _mm_mul_pd(fscal,dy00);
 +            tz               = _mm_mul_pd(fscal,dz00);
 +
 +            /* Update vectorial force */
 +            fix0             = _mm_add_pd(fix0,tx);
 +            fiy0             = _mm_add_pd(fiy0,ty);
 +            fiz0             = _mm_add_pd(fiz0,tz);
 +
 +            gmx_mm_decrement_1rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,tx,ty,tz);
 +
 +            /* Inner loop uses 56 flops */
 +        }
 +
 +        if(jidx<j_index_end)
 +        {
 +
 +            jnrA             = jjnr[jidx];
 +            j_coord_offsetA  = DIM*jnrA;
 +
 +            /* load j atom coordinates */
 +            gmx_mm_load_1rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
 +                                              &jx0,&jy0,&jz0);
 +            
 +            /* Calculate displacement vector */
 +            dx00             = _mm_sub_pd(ix0,jx0);
 +            dy00             = _mm_sub_pd(iy0,jy0);
 +            dz00             = _mm_sub_pd(iz0,jz0);
 +
 +            /* Calculate squared distance and things based on it */
 +            rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
 +
 +            rinv00           = gmx_mm_invsqrt_pd(rsq00);
 +
 +            /* Load parameters for j particles */
 +            jq0              = _mm_load_sd(charge+jnrA+0);
 +            isaj0            = _mm_load_sd(invsqrta+jnrA+0);
 +
 +            /**************************
 +             * CALCULATE INTERACTIONS *
 +             **************************/
 +
 +            r00              = _mm_mul_pd(rsq00,rinv00);
 +
 +            /* Compute parameters for interactions between i and j atoms */
 +            qq00             = _mm_mul_pd(iq0,jq0);
 +
 +            /* GENERALIZED BORN AND COULOMB ELECTROSTATICS */
 +            isaprod          = _mm_mul_pd(isai0,isaj0);
 +            gbqqfactor       = _mm_xor_pd(signbit,_mm_mul_pd(qq00,_mm_mul_pd(isaprod,gbinvepsdiff)));
 +            gbscale          = _mm_mul_pd(isaprod,gbtabscale);
 +
 +            /* Calculate generalized born table index - this is a separate table from the normal one,
 +             * but we use the same procedure by multiplying r by the scale and truncating to an integer.
 +             */
 +            rt               = _mm_mul_pd(r00,gbscale);
 +            gbitab           = _mm_cvttpd_epi32(rt);
 +            gbeps            = _mm_sub_pd(rt,_mm_cvtepi32_pd(gbitab));
 +            gbitab           = _mm_slli_epi32(gbitab,2);
 +
 +            Y                = _mm_load_pd( gbtab + gmx_mm_extract_epi32(gbitab,0) );
 +            F                = _mm_setzero_pd();
 +            GMX_MM_TRANSPOSE2_PD(Y,F);
 +            G                = _mm_load_pd( gbtab + gmx_mm_extract_epi32(gbitab,0) +2);
 +            H                = _mm_setzero_pd();
 +            GMX_MM_TRANSPOSE2_PD(G,H);
 +            Heps             = _mm_mul_pd(gbeps,H);
 +            Fp               = _mm_add_pd(F,_mm_mul_pd(gbeps,_mm_add_pd(G,Heps)));
 +            VV               = _mm_add_pd(Y,_mm_mul_pd(gbeps,Fp));
 +            vgb              = _mm_mul_pd(gbqqfactor,VV);
 +
 +            FF               = _mm_add_pd(Fp,_mm_mul_pd(gbeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
 +            fgb              = _mm_mul_pd(gbqqfactor,_mm_mul_pd(FF,gbscale));
 +            dvdatmp          = _mm_mul_pd(minushalf,_mm_add_pd(vgb,_mm_mul_pd(fgb,r00)));
++            dvdatmp          = _mm_unpacklo_pd(dvdatmp,_mm_setzero_pd());
 +            dvdasum          = _mm_add_pd(dvdasum,dvdatmp);
 +            gmx_mm_increment_1real_pd(dvda+jnrA,_mm_mul_pd(dvdatmp,_mm_mul_pd(isaj0,isaj0)));
 +            velec            = _mm_mul_pd(qq00,rinv00);
 +            felec            = _mm_mul_pd(_mm_sub_pd(_mm_mul_pd(velec,rinv00),fgb),rinv00);
 +
 +            fscal            = felec;
 +
 +            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
 +
 +            /* Calculate temporary vectorial force */
 +            tx               = _mm_mul_pd(fscal,dx00);
 +            ty               = _mm_mul_pd(fscal,dy00);
 +            tz               = _mm_mul_pd(fscal,dz00);
 +
 +            /* Update vectorial force */
 +            fix0             = _mm_add_pd(fix0,tx);
 +            fiy0             = _mm_add_pd(fiy0,ty);
 +            fiz0             = _mm_add_pd(fiz0,tz);
 +
 +            gmx_mm_decrement_1rvec_1ptr_swizzle_pd(f+j_coord_offsetA,tx,ty,tz);
 +
 +            /* Inner loop uses 56 flops */
 +        }
 +
 +        /* End of innermost loop */
 +
 +        gmx_mm_update_iforce_1atom_swizzle_pd(fix0,fiy0,fiz0,
 +                                              f+i_coord_offset,fshift+i_shift_offset);
 +
 +        dvdasum = _mm_mul_pd(dvdasum, _mm_mul_pd(isai0,isai0));
 +        gmx_mm_update_1pot_pd(dvdasum,dvda+inr);
 +
 +        /* Increment number of inner iterations */
 +        inneriter                  += j_index_end - j_index_start;
 +
 +        /* Outer loop uses 7 flops */
 +    }
 +
 +    /* Increment number of outer iterations */
 +    outeriter        += nri;
 +
 +    /* Update outer/inner flops */
 +
 +    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_F,outeriter*7 + inneriter*56);
 +}
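
The Y/F/G/H loads and the Heps/Fp/VV/FF arithmetic repeated in every kernel implement a cubic-spline table lookup: each table point stores four doubles (hence the left shift of gbitab by 2), while gbqqfactor carries the prefactor -qq*isai*isaj*(1/eps_r - 1/eps_solvent), negated cheaply by XOR with the sign-bit mask. A scalar sketch of the lookup (illustrative helper, not part of this diff), where rt is the table coordinate r*gbscale:

    /* Scalar cubic-spline table lookup matching the SIMD Y/F/G/H code. */
    static void
    table_lookup(const double *tab, double rt, double *vv, double *ff)
    {
        int           n    = (int)rt;        /* truncate to bin index    */
        double        eps  = rt - n;         /* fractional offset in bin */
        const double *p    = tab + 4*n;      /* Y,F,G,H for this bin     */
        double        Heps = eps*p[3];
        double        Fp   = p[1] + eps*(p[2] + Heps);

        *vv = p[0] + eps*Fp;                 /* potential  (VV)          */
        *ff = Fp + eps*(p[2] + 2.0*Heps);    /* dV/d(rt)   (FF)          */
    }
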
index 4bf5952cbaa89e27750c5046f8b3261c94753ed8,0000000000000000000000000000000000000000..e9fa63ae175c140635c2d3fe5fee836e5634899a
mode 100644,000000..100644
--- /dev/null
@@@ -1,1053 -1,0 +1,1056 @@@
 +/* #if 0 */
 +#error This file must be processed with the Gromacs pre-preprocessor
 +/* #endif */
 +/* #if INCLUDE_HEADER */
 +#ifdef HAVE_CONFIG_H
 +#include <config.h>
 +#endif
 +
 +#include <math.h>
 +
 +#include "../nb_kernel.h"
 +#include "types/simple.h"
 +#include "vec.h"
 +#include "nrnb.h"
 +
 +#include "gmx_math_x86_sse2_double.h"
 +#include "kernelutil_x86_sse2_double.h"
 +/* #endif */
 +
 +/* ## List of variables set by the generating script:                                    */
 +/* ##                                                                                    */
 +    /* ## Settings that apply to the entire kernel:                                          */
 +/* ## KERNEL_ELEC:           String, choice for electrostatic interactions               */
 +/* ## KERNEL_VDW:            String, choice for van der Waals interactions               */
 +/* ## KERNEL_NAME:           String, name of this kernel                                 */
 +/* ## KERNEL_VF:             String telling if we calculate potential, force, or both    */
 +/* ## GEOMETRY_I/GEOMETRY_J: String, name of each geometry, e.g. 'Water3' or '1Particle' */
 +/* ##                                                                                    */
 +    /* ## Settings that apply to particles in the outer (I) or inner (J) loops:              */
 +/* ## PARTICLES_I[]/         Arrays with lists of i/j particles to use in kernel. It is  */
 +/* ## PARTICLES_J[]:         just [0] for particle geometry, but can be longer for water */
 +/* ## PARTICLES_ELEC_I[]/    Arrays with lists of i/j particle that have electrostatics  */
 +/* ## PARTICLES_ELEC_J[]:    interactions that should be calculated in this kernel.      */
 +/* ## PARTICLES_VDW_I[]/     Arrays with the list of i/j particle that have VdW          */
 +/* ## PARTICLES_VDW_J[]:     interactions that should be calculated in this kernel.      */
 +/* ##                                                                                    */
 +    /* ## Settings for pairs of interactions (e.g. 2nd i particle against 1st j particle)    */
 +/* ## PAIRS_IJ[]:            Array with (i,j) tuples of pairs for which interactions     */
 +/* ##                        should be calculated in this kernel. Zero-charge particles  */
 +/* ##                        do not have interactions with particles without vdw, and    */
 +/* ##                        Vdw-only interactions are not evaluated in a no-vdw-kernel. */
 +/* ## INTERACTION_FLAGS[][]: 2D matrix, dimension e.g. 3*3 for water-water interactions. */
 +/* ##                        For each i-j pair, the element [I][J] is a list of strings  */
 +/* ##                        defining properties/flags of this interaction. Examples     */
 +/* ##                        include 'electrostatics'/'vdw' if that type of interaction  */
 +/* ##                        should be evaluated, 'rsq'/'rinv'/'rinvsq' if those values  */
 +/* ##                        are needed, and 'exactcutoff' or 'shift','switch' to        */
 +/* ##                        decide if the force/potential should be modified. This way  */
 +/* ##                        we only calculate values absolutely needed for each case.   */
 +
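
To make the substitution concrete: for the particle-particle kernels earlier in this diff, PARTICLES_I and PARTICLES_J are both [0] and PAIRS_IJ is [(0,0)], so a template fragment such as

    /* #for I,J in PAIRS_IJ */
    dx{I}{J}             = _mm_sub_pd(ix{I},jx{J});
    /* #endfor */

expands to the single statement

    dx00             = _mm_sub_pd(ix0,jx0);

seen in the generated kernels above; water geometries produce one statement per (i,j) pair.
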
 +/* ## Calculate the size and offset for (merged/interleaved) table data */
 +
 +/*
 + * Gromacs nonbonded kernel:   {KERNEL_NAME}
 + * Electrostatics interaction: {KERNEL_ELEC}
 + * VdW interaction:            {KERNEL_VDW}
 + * Geometry:                   {GEOMETRY_I}-{GEOMETRY_J}
 + * Calculate force/pot:        {KERNEL_VF}
 + */
 +void
 +{KERNEL_NAME}
 +                    (t_nblist * gmx_restrict                nlist,
 +                     rvec * gmx_restrict                    xx,
 +                     rvec * gmx_restrict                    ff,
 +                     t_forcerec * gmx_restrict              fr,
 +                     t_mdatoms * gmx_restrict               mdatoms,
 +                     nb_kernel_data_t * gmx_restrict        kernel_data,
 +                     t_nrnb * gmx_restrict                  nrnb)
 +{
 +    /* ## Not all variables are used for all kernels, but any optimizing compiler fixes that, */
 +    /* ## so there is no point in going to extremes to exclude variables that are not needed. */
 +    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
 +     * just 0 for non-waters.
 +     * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
 +     * jnr indices corresponding to data put in the two positions in the SIMD register.
 +     */
 +    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
 +    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
 +    int              jnrA,jnrB;
 +    int              j_coord_offsetA,j_coord_offsetB;
 +    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
 +    real             rcutoff_scalar;
 +    real             *shiftvec,*fshift,*x,*f;
 +    __m128d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
 +    /* #for I in PARTICLES_I */
 +    int              vdwioffset{I};
 +    __m128d          ix{I},iy{I},iz{I},fix{I},fiy{I},fiz{I},iq{I},isai{I};
 +    /* #endfor */
 +    /* #for J in PARTICLES_J */
 +    int              vdwjidx{J}A,vdwjidx{J}B;
 +    __m128d          jx{J},jy{J},jz{J},fjx{J},fjy{J},fjz{J},jq{J},isaj{J};
 +    /* #endfor */
 +    /* #for I,J in PAIRS_IJ */
 +    __m128d          dx{I}{J},dy{I}{J},dz{I}{J},rsq{I}{J},rinv{I}{J},rinvsq{I}{J},r{I}{J},qq{I}{J},c6_{I}{J},c12_{I}{J};
 +    /* #endfor */
 +    /* #if KERNEL_ELEC != 'None' */
 +    __m128d          velec,felec,velecsum,facel,crf,krf,krf2;
 +    real             *charge;
 +    /* #endif */
 +    /* #if 'GeneralizedBorn' in KERNEL_ELEC */
 +    __m128i          gbitab;
 +    __m128d          vgb,fgb,vgbsum,dvdasum,gbscale,gbtabscale,isaprod,gbqqfactor,gbinvepsdiff,dvdaj,gbeps,dvdatmp;
 +    __m128d          minushalf = _mm_set1_pd(-0.5);
 +    real             *invsqrta,*dvda,*gbtab;
 +    /* #endif */
 +    /* #if KERNEL_VDW != 'None' */
 +    int              nvdwtype;
 +    __m128d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
 +    int              *vdwtype;
 +    real             *vdwparam;
 +    __m128d          one_sixth   = _mm_set1_pd(1.0/6.0);
 +    __m128d          one_twelfth = _mm_set1_pd(1.0/12.0);
 +    /* #endif */
 +    /* #if 'Table' in KERNEL_ELEC or 'GeneralizedBorn' in KERNEL_ELEC or 'Table' in KERNEL_VDW */
 +    __m128i          vfitab;
 +    __m128i          ifour       = _mm_set1_epi32(4);
 +    __m128d          rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
 +    real             *vftab;
 +    /* #endif */
 +    /* #if 'Ewald' in KERNEL_ELEC */
 +    __m128i          ewitab;
 +    __m128d          ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
 +    real             *ewtab;
 +    /* #endif */
 +    /* #if 'PotentialSwitch' in [KERNEL_MOD_ELEC,KERNEL_MOD_VDW] */
 +    __m128d          rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
 +    real             rswitch_scalar,d_scalar;
 +    /* #endif */
 +    __m128d          dummy_mask,cutoff_mask;
 +    __m128d          signbit   = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
 +    __m128d          one     = _mm_set1_pd(1.0);
 +    __m128d          two     = _mm_set1_pd(2.0);
 +    x                = xx[0];
 +    f                = ff[0];
 +
 +    nri              = nlist->nri;
 +    iinr             = nlist->iinr;
 +    jindex           = nlist->jindex;
 +    jjnr             = nlist->jjnr;
 +    shiftidx         = nlist->shift;
 +    gid              = nlist->gid;
 +    shiftvec         = fr->shift_vec[0];
 +    fshift           = fr->fshift[0];
 +    /* #if KERNEL_ELEC != 'None' */
 +    facel            = _mm_set1_pd(fr->epsfac);
 +    charge           = mdatoms->chargeA;
 +    /*     #if 'ReactionField' in KERNEL_ELEC */
 +    krf              = _mm_set1_pd(fr->ic->k_rf);
 +    krf2             = _mm_set1_pd(fr->ic->k_rf*2.0);
 +    crf              = _mm_set1_pd(fr->ic->c_rf);
 +    /*     #endif */
 +    /* #endif */
 +    /* #if KERNEL_VDW != 'None' */
 +    nvdwtype         = fr->ntype;
 +    vdwparam         = fr->nbfp;
 +    vdwtype          = mdatoms->typeA;
 +    /* #endif */
 +
 +    /* #if 'Table' in KERNEL_ELEC and 'Table' in KERNEL_VDW */
 +    vftab            = kernel_data->table_elec_vdw->data;
 +    vftabscale       = _mm_set1_pd(kernel_data->table_elec_vdw->scale);
 +    /* #elif 'Table' in KERNEL_ELEC */
 +    vftab            = kernel_data->table_elec->data;
 +    vftabscale       = _mm_set1_pd(kernel_data->table_elec->scale);
 +    /* #elif 'Table' in KERNEL_VDW */
 +    vftab            = kernel_data->table_vdw->data;
 +    vftabscale       = _mm_set1_pd(kernel_data->table_vdw->scale);
 +    /* #endif */
 +
 +    /* #if 'Ewald' in KERNEL_ELEC */
 +    sh_ewald         = _mm_set1_pd(fr->ic->sh_ewald);
 +    /*     #if KERNEL_VF=='Force' and KERNEL_MOD_ELEC!='PotentialSwitch' */
 +    ewtab            = fr->ic->tabq_coul_F;
 +    ewtabscale       = _mm_set1_pd(fr->ic->tabq_scale);
 +    ewtabhalfspace   = _mm_set1_pd(0.5/fr->ic->tabq_scale);
 +    /*     #else */
 +    ewtab            = fr->ic->tabq_coul_FDV0;
 +    ewtabscale       = _mm_set1_pd(fr->ic->tabq_scale);
 +    ewtabhalfspace   = _mm_set1_pd(0.5/fr->ic->tabq_scale);
 +    /*     #endif */
 +    /* #endif */
 +
 +    /* #if KERNEL_ELEC=='GeneralizedBorn' */
 +    invsqrta         = fr->invsqrta;
 +    dvda             = fr->dvda;
 +    gbtabscale       = _mm_set1_pd(fr->gbtab.scale);
 +    gbtab            = fr->gbtab.data;
 +    gbinvepsdiff     = _mm_set1_pd((1.0/fr->epsilon_r) - (1.0/fr->gb_epsilon_solvent));
 +    /* #endif */
 +
 +    /* #if 'Water' in GEOMETRY_I */
 +    /* Setup water-specific parameters */
 +    inr              = nlist->iinr[0];
 +    /*     #for I in PARTICLES_ELEC_I */
 +    iq{I}              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+{I}]));
 +    /*     #endfor */
 +    /*     #for I in PARTICLES_VDW_I */
 +    vdwioffset{I}      = 2*nvdwtype*vdwtype[inr+{I}];
 +    /*     #endfor */
 +    /* #endif */
 +
 +    /* #if 'Water' in GEOMETRY_J */
 +    /*     #for J in PARTICLES_ELEC_J */
 +    jq{J}              = _mm_set1_pd(charge[inr+{J}]);
 +    /*     #endfor */
 +    /*     #for J in PARTICLES_VDW_J */
 +    vdwjidx{J}A        = 2*vdwtype[inr+{J}];
 +    /*     #endfor */
 +    /*     #for I,J in PAIRS_IJ */
 +    /*         #if 'electrostatics' in INTERACTION_FLAGS[I][J] */
 +    qq{I}{J}             = _mm_mul_pd(iq{I},jq{J});
 +    /*         #endif */
 +    /*         #if 'vdw' in INTERACTION_FLAGS[I][J] */
 +    c6_{I}{J}            = _mm_set1_pd(vdwparam[vdwioffset{I}+vdwjidx{J}A]);
 +    c12_{I}{J}           = _mm_set1_pd(vdwparam[vdwioffset{I}+vdwjidx{J}A+1]);
 +    /*         #endif */
 +    /*     #endfor */
 +    /* #endif */
 +
 +    /* #if KERNEL_MOD_ELEC!='None' or KERNEL_MOD_VDW!='None' */
 +    /*     #if KERNEL_ELEC!='None' */
 +    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
 +    rcutoff_scalar   = fr->rcoulomb;
 +    /*     #else */
 +    rcutoff_scalar   = fr->rvdw;
 +    /*     #endif */
 +    rcutoff          = _mm_set1_pd(rcutoff_scalar);
 +    rcutoff2         = _mm_mul_pd(rcutoff,rcutoff);
 +    /* #endif */
 +
 +    /* #if KERNEL_MOD_VDW=='PotentialShift' */
 +    sh_vdw_invrcut6  = _mm_set1_pd(fr->ic->sh_invrc6);
 +    rvdw             = _mm_set1_pd(fr->rvdw);
 +    /* #endif */
 +
 +    /* #if 'PotentialSwitch' in [KERNEL_MOD_ELEC,KERNEL_MOD_VDW] */
 +    /*     #if KERNEL_MOD_ELEC=='PotentialSwitch'  */
 +    rswitch_scalar   = fr->rcoulomb_switch;
 +    rswitch          = _mm_set1_pd(rswitch_scalar);
 +    /*     #else */
 +    rswitch_scalar   = fr->rvdw_switch;
 +    rswitch          = _mm_set1_pd(rswitch_scalar);
 +    /*     #endif */
 +    /* Setup switch parameters */
 +    d_scalar         = rcutoff_scalar-rswitch_scalar;
 +    d                = _mm_set1_pd(d_scalar);
 +    swV3             = _mm_set1_pd(-10.0/(d_scalar*d_scalar*d_scalar));
 +    swV4             = _mm_set1_pd( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
 +    swV5             = _mm_set1_pd( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
 +    /*     #if 'Force' in KERNEL_VF */
 +    swF2             = _mm_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar));
 +    swF3             = _mm_set1_pd( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
 +    swF4             = _mm_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
 +    /*     #endif */
 +    /* #endif */
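
For reference, with d = rcutoff - rswitch and x = r - rswitch, the coefficients set up above give the standard fifth-order switching function

    sw(r)  = 1 + swV3*x^3 + swV4*x^4 + swV5*x^5
           = 1 - 10*(x/d)^3 + 15*(x/d)^4 - 6*(x/d)^5
    dsw(r) = swF2*x^2 + swF3*x^3 + swF4*x^4

which falls smoothly from 1 at rswitch to 0 at rcutoff, with first and second derivatives vanishing at both ends.
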
 +
 +    /* Avoid stupid compiler warnings */
 +    jnrA = jnrB = 0;
 +    j_coord_offsetA = 0;
 +    j_coord_offsetB = 0;
 +
 +    /* ## Keep track of the floating point operations we issue for reporting! */
 +    /* #define OUTERFLOPS 0 */
 +    outeriter        = 0;
 +    inneriter        = 0;
 +
 +    /* Start outer loop over neighborlists */
 +    for(iidx=0; iidx<nri; iidx++)
 +    {
 +        /* Load shift vector for this list */
 +        i_shift_offset   = DIM*shiftidx[iidx];
 +
 +        /* Load limits for loop over neighbors */
 +        j_index_start    = jindex[iidx];
 +        j_index_end      = jindex[iidx+1];
 +
 +        /* Get outer coordinate index */
 +        inr              = iinr[iidx];
 +        i_coord_offset   = DIM*inr;
 +
 +        /* Load i particle coords and add shift vector */
 +        /* #if GEOMETRY_I == 'Particle' */
 +        gmx_mm_load_shift_and_1rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
 +        /* #elif GEOMETRY_I == 'Water3' */
 +        gmx_mm_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
 +                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
 +        /* #elif GEOMETRY_I == 'Water4' */
 +        /*     #if 0 in PARTICLES_I                 */
 +        gmx_mm_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
 +                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
 +        /*     #else                                */
 +        gmx_mm_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
 +                                                 &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
 +        /*     #endif                               */
 +        /* #endif                                   */
 +
 +        /* #if 'Force' in KERNEL_VF */
 +        /*     #for I in PARTICLES_I */
 +        fix{I}             = _mm_setzero_pd();
 +        fiy{I}             = _mm_setzero_pd();
 +        fiz{I}             = _mm_setzero_pd();
 +        /*     #endfor */
 +        /* #endif */
 +
 +        /* ## For water we already preloaded parameters at the start of the kernel */
 +        /* #if not 'Water' in GEOMETRY_I */
 +        /* Load parameters for i particles */
 +        /*     #for I in PARTICLES_ELEC_I */
 +        iq{I}              = _mm_mul_pd(facel,_mm_load1_pd(charge+inr+{I}));
 +        /*         #define OUTERFLOPS OUTERFLOPS+1 */
 +        /*         #if KERNEL_ELEC=='GeneralizedBorn' */
 +        isai{I}            = _mm_load1_pd(invsqrta+inr+{I});
 +        /*         #endif */
 +        /*     #endfor */
 +        /*     #for I in PARTICLES_VDW_I */
 +        vdwioffset{I}      = 2*nvdwtype*vdwtype[inr+{I}];
 +        /*     #endfor */
 +        /* #endif */
 +
 +        /* #if 'Potential' in KERNEL_VF */
 +        /* Reset potential sums */
 +        /*     #if KERNEL_ELEC != 'None' */
 +        velecsum         = _mm_setzero_pd();
 +        /*     #endif */
 +        /*     #if 'GeneralizedBorn' in KERNEL_ELEC */
 +        vgbsum           = _mm_setzero_pd();
 +        /*     #endif */
 +        /*     #if KERNEL_VDW != 'None' */
 +        vvdwsum          = _mm_setzero_pd();
 +        /*     #endif */
 +        /* #endif */
 +        /*     #if 'GeneralizedBorn' in KERNEL_ELEC and 'Force' in KERNEL_VF */
 +        dvdasum          = _mm_setzero_pd();
 +        /*     #endif */
 +
 +        /* #for ROUND in ['Loop','Epilogue'] */
 +
 +        /* #if ROUND =='Loop' */
 +        /* Start inner kernel loop */
 +        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
 +        {
 +        /* ## First round is normal loop (next statement resets indentation) */
 +        /*     #if 0 */
 +        }
 +        /*     #endif */
 +        /* #else */
 +        if(jidx<j_index_end)
 +        {
 +        /* ## Second round is epilogue */
 +        /* #endif */
 +        /* #define INNERFLOPS 0 */
 +
 +            /* #if ROUND =='Loop' */
 +            /* Get j neighbor index, and coordinate index */
 +            jnrA             = jjnr[jidx];
 +            jnrB             = jjnr[jidx+1];
 +            j_coord_offsetA  = DIM*jnrA;
 +            j_coord_offsetB  = DIM*jnrB;
 +
 +            /* load j atom coordinates */
 +            /*     #if GEOMETRY_J == 'Particle'             */
 +            gmx_mm_load_1rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
 +                                              &jx0,&jy0,&jz0);
 +            /*     #elif GEOMETRY_J == 'Water3'             */
 +            gmx_mm_load_3rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
 +                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
 +            /*     #elif GEOMETRY_J == 'Water4'             */
 +            /*         #if 0 in PARTICLES_J                 */
 +            gmx_mm_load_4rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
 +                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
 +                                              &jy2,&jz2,&jx3,&jy3,&jz3);
 +            /*         #else                                */
 +            gmx_mm_load_3rvec_2ptr_swizzle_pd(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
 +                                              &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
 +            /*         #endif                               */
 +            /*     #endif                                   */
 +            /* #else */
 +            jnrA             = jjnr[jidx];
 +            j_coord_offsetA  = DIM*jnrA;
 +
 +            /* load j atom coordinates */
 +            /*     #if GEOMETRY_J == 'Particle'             */
 +            gmx_mm_load_1rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
 +                                              &jx0,&jy0,&jz0);
 +            /*     #elif GEOMETRY_J == 'Water3'             */
 +            gmx_mm_load_3rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
 +                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
 +            /*     #elif GEOMETRY_J == 'Water4'             */
 +            /*         #if 0 in PARTICLES_J                 */
 +            gmx_mm_load_4rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
 +                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
 +                                              &jy2,&jz2,&jx3,&jy3,&jz3);
 +            /*         #else                                */
 +            gmx_mm_load_3rvec_1ptr_swizzle_pd(x+j_coord_offsetA+DIM,
 +                                              &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
 +            /*         #endif                               */
 +            /*     #endif                                   */
 +            /* #endif */
 +
 +            /* Calculate displacement vector */
 +            /* #for I,J in PAIRS_IJ */
 +            dx{I}{J}             = _mm_sub_pd(ix{I},jx{J});
 +            dy{I}{J}             = _mm_sub_pd(iy{I},jy{J});
 +            dz{I}{J}             = _mm_sub_pd(iz{I},jz{J});
 +            /*     #define INNERFLOPS INNERFLOPS+3 */
 +            /* #endfor */
 +
 +            /* Calculate squared distance and things based on it */
 +            /* #for I,J in PAIRS_IJ */
 +            rsq{I}{J}            = gmx_mm_calc_rsq_pd(dx{I}{J},dy{I}{J},dz{I}{J});
 +            /*     #define INNERFLOPS INNERFLOPS+5 */
 +            /* #endfor */
 +
 +            /* #for I,J in PAIRS_IJ */
 +            /*     #if 'rinv' in INTERACTION_FLAGS[I][J] */
 +            rinv{I}{J}           = gmx_mm_invsqrt_pd(rsq{I}{J});
 +            /*         #define INNERFLOPS INNERFLOPS+5 */
 +            /*     #endif */
 +            /* #endfor */
 +
 +            /* #for I,J in PAIRS_IJ */
 +            /*     #if 'rinvsq' in INTERACTION_FLAGS[I][J] */
 +            /*         # if 'rinv' not in INTERACTION_FLAGS[I][J] */
 +            rinvsq{I}{J}         = gmx_mm_inv_pd(rsq{I}{J});
 +            /*             #define INNERFLOPS INNERFLOPS+4 */
 +            /*         #else */
 +            rinvsq{I}{J}         = _mm_mul_pd(rinv{I}{J},rinv{I}{J});
 +            /*             #define INNERFLOPS INNERFLOPS+1 */
 +            /*         #endif */
 +            /*     #endif */
 +            /* #endfor */
 +
 +            /* #if not 'Water' in GEOMETRY_J */
 +            /* Load parameters for j particles */
 +            /*     #for J in PARTICLES_ELEC_J */
 +            /*         #if ROUND =='Loop' */
 +            jq{J}              = gmx_mm_load_2real_swizzle_pd(charge+jnrA+{J},charge+jnrB+{J});
 +            /*         #else */
 +            jq{J}              = _mm_load_sd(charge+jnrA+{J});
 +            /*         #endif */
 +            /*         #if KERNEL_ELEC=='GeneralizedBorn' */
 +            /*             #if ROUND =='Loop' */
 +            isaj{J}            = gmx_mm_load_2real_swizzle_pd(invsqrta+jnrA+{J},invsqrta+jnrB+{J});
 +            /*             #else */
 +            isaj{J}            = _mm_load_sd(invsqrta+jnrA+{J});
 +            /*             #endif */
 +            /*         #endif */
 +            /*     #endfor */
 +            /*     #for J in PARTICLES_VDW_J */
 +            vdwjidx{J}A        = 2*vdwtype[jnrA+{J}];
 +            /*         #if ROUND =='Loop' */
 +            vdwjidx{J}B        = 2*vdwtype[jnrB+{J}];
 +            /*         #endif */
 +            /*     #endfor */
 +            /* #endif */
 +
 +            /* #if 'Force' in KERNEL_VF and not 'Particle' in GEOMETRY_I */
 +            /*     #for J in PARTICLES_J */
 +            fjx{J}             = _mm_setzero_pd();
 +            fjy{J}             = _mm_setzero_pd();
 +            fjz{J}             = _mm_setzero_pd();
 +            /*     #endfor */
 +            /* #endif */
 +
 +            /* #for I,J in PAIRS_IJ */
 +
 +            /**************************
 +             * CALCULATE INTERACTIONS *
 +             **************************/
 +
 +            /*     #if 'exactcutoff' in INTERACTION_FLAGS[I][J] */
 +            /*         ## We always calculate rinv/rinvsq above to enable pipelining in compilers (performance tested on x86) */
 +            if (gmx_mm_any_lt(rsq{I}{J},rcutoff2))
 +            {
 +                /*     #if 0    ## this and the next two lines are a hack to maintain auto-indentation in template file */
 +            }
 +            /*         #endif */
 +            /*         #define INNERFLOPS INNERFLOPS+1 */
 +            /*     #endif */
 +
 +            /*     #if 'r' in INTERACTION_FLAGS[I][J] */
 +            r{I}{J}              = _mm_mul_pd(rsq{I}{J},rinv{I}{J});
 +            /*         #define INNERFLOPS INNERFLOPS+1 */
 +            /*     #endif */
 +
 +            /*     ## For water geometries we already loaded parameters at the start of the kernel */
 +            /*     #if not 'Water' in GEOMETRY_J */
 +            /* Compute parameters for interactions between i and j atoms */
 +            /*         #if 'electrostatics' in INTERACTION_FLAGS[I][J] */
 +            qq{I}{J}             = _mm_mul_pd(iq{I},jq{J});
 +            /*             #define INNERFLOPS INNERFLOPS+1 */
 +            /*         #endif */
 +            /*         #if 'vdw' in INTERACTION_FLAGS[I][J] */
 +            /*             #if ROUND == 'Loop' */
 +            gmx_mm_load_2pair_swizzle_pd(vdwparam+vdwioffset{I}+vdwjidx{J}A,
 +                                         vdwparam+vdwioffset{I}+vdwjidx{J}B,&c6_{I}{J},&c12_{I}{J});
 +            /*             #else */
 +            gmx_mm_load_1pair_swizzle_pd(vdwparam+vdwioffset{I}+vdwjidx{J}A,&c6_{I}{J},&c12_{I}{J});
 +            /*             #endif */
 +            /*         #endif */
 +            /*     #endif */
 +
 +            /*     #if 'table' in INTERACTION_FLAGS[I][J] */
 +            /* Calculate table index by multiplying r with table scale and truncating to integer */
 +            rt               = _mm_mul_pd(r{I}{J},vftabscale);
 +            vfitab           = _mm_cvttpd_epi32(rt);
 +            vfeps            = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
 +            /*         #define INNERFLOPS INNERFLOPS+4                          */
 +            /*         #if 'Table' in KERNEL_ELEC and 'Table' in KERNEL_VDW     */
 +            /*             ## 3 tables, 4 data per point: multiply index by 12 */
 +            vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
 +            /*         #elif 'Table' in KERNEL_ELEC                             */
 +            /*             ## 1 table, 4 data per point: multiply index by 4   */
 +            vfitab           = _mm_slli_epi32(vfitab,2);
 +            /*         #elif 'Table' in KERNEL_VDW                              */
 +            /*             ## 2 tables, 4 data per point: multiply index by 8  */
 +            vfitab           = _mm_slli_epi32(vfitab,3);
 +            /*         #endif                                                   */
 +            /*     #endif */
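 +            /*     ## The truncated index is scaled to the table layout: each point      */
 +            /*     ## stores 4 reals (Y,F,G,H), so the shifts above compute 12*x as      */
 +            /*     ## (x + (x<<1))<<2 for 3 interleaved tables, 4*x as x<<2 for one,     */
 +            /*     ## and 8*x as x<<3 for the two VdW tables.                            */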
 +
 +            /*     ## ELECTROSTATIC INTERACTIONS */
 +            /*     #if 'electrostatics' in INTERACTION_FLAGS[I][J] */
 +
 +            /*         #if KERNEL_ELEC=='Coulomb' */
 +
 +            /* COULOMB ELECTROSTATICS */
 +            velec            = _mm_mul_pd(qq{I}{J},rinv{I}{J});
 +            /*             #define INNERFLOPS INNERFLOPS+1 */
 +            /*             #if 'Force' in KERNEL_VF */
 +            felec            = _mm_mul_pd(velec,rinvsq{I}{J});
 +            /*                 #define INNERFLOPS INNERFLOPS+2 */
 +            /*             #endif */
 +
 +            /*         #elif KERNEL_ELEC=='ReactionField' */
 +
 +            /* REACTION-FIELD ELECTROSTATICS */
 +            /*             #if 'Potential' in KERNEL_VF */
 +            velec            = _mm_mul_pd(qq{I}{J},_mm_sub_pd(_mm_add_pd(rinv{I}{J},_mm_mul_pd(krf,rsq{I}{J})),crf));
 +            /*                 #define INNERFLOPS INNERFLOPS+4 */
 +            /*             #endif */
 +            /*             #if 'Force' in KERNEL_VF */
 +            felec            = _mm_mul_pd(qq{I}{J},_mm_sub_pd(_mm_mul_pd(rinv{I}{J},rinvsq{I}{J}),krf2));
 +            /*                 #define INNERFLOPS INNERFLOPS+3 */
 +            /*             #endif */
 +
 +            /*         #elif KERNEL_ELEC=='GeneralizedBorn' */
 +
 +            /* GENERALIZED BORN AND COULOMB ELECTROSTATICS */
 +            isaprod          = _mm_mul_pd(isai{I},isaj{J});
 +            gbqqfactor       = _mm_xor_pd(signbit,_mm_mul_pd(qq{I}{J},_mm_mul_pd(isaprod,gbinvepsdiff)));
 +            gbscale          = _mm_mul_pd(isaprod,gbtabscale);
 +            /*             #define INNERFLOPS INNERFLOPS+5 */
 +
 +            /* Calculate generalized born table index - this is a separate table from the normal one,
 +             * but we use the same procedure by multiplying r with scale and truncating to integer.
 +             */
 +            rt               = _mm_mul_pd(r{I}{J},gbscale);
 +            gbitab           = _mm_cvttpd_epi32(rt);
 +            gbeps            = _mm_sub_pd(rt,_mm_cvtepi32_pd(gbitab));
 +            gbitab           = _mm_slli_epi32(gbitab,2);
 +
 +            Y                = _mm_load_pd( gbtab + gmx_mm_extract_epi32(gbitab,0) );
 +            /*             #if ROUND == 'Loop' */
 +            F                = _mm_load_pd( gbtab + gmx_mm_extract_epi32(gbitab,1) );
 +            /*             #else */
 +            F                = _mm_setzero_pd();
 +            /*             #endif */
 +            GMX_MM_TRANSPOSE2_PD(Y,F);
 +            G                = _mm_load_pd( gbtab + gmx_mm_extract_epi32(gbitab,0) +2);
 +            /*             #if ROUND == 'Loop' */
 +            H                = _mm_load_pd( gbtab + gmx_mm_extract_epi32(gbitab,1) +2);
 +            /*             #else */
 +            H                = _mm_setzero_pd();
 +            /*             #endif */
 +            GMX_MM_TRANSPOSE2_PD(G,H);
 +            Heps             = _mm_mul_pd(gbeps,H);
 +            Fp               = _mm_add_pd(F,_mm_mul_pd(gbeps,_mm_add_pd(G,Heps)));
 +            VV               = _mm_add_pd(Y,_mm_mul_pd(gbeps,Fp));
 +            vgb              = _mm_mul_pd(gbqqfactor,VV);
 +            /*             #define INNERFLOPS INNERFLOPS+10 */
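 +            /*             ## Each 4-real table entry (Y,F,G,H) is one cubic spline      */
 +            /*             ## segment: VV = Y + eps*F + eps^2*G + eps^3*H is the value,  */
 +            /*             ## and FF (below) = F + 2*eps*G + 3*eps^2*H = dVV/deps is     */
 +            /*             ## its derivative.                                            */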
 +
 +            /*             #if 'Force' in KERNEL_VF */
 +            FF               = _mm_add_pd(Fp,_mm_mul_pd(gbeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
 +            fgb              = _mm_mul_pd(gbqqfactor,_mm_mul_pd(FF,gbscale));
 +            dvdatmp          = _mm_mul_pd(minushalf,_mm_add_pd(vgb,_mm_mul_pd(fgb,r{I}{J})));
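 +            /*                 ## In the epilogue only SIMD lane 0 holds a real j atom;  */
 +            /*                 ## unpacklo with zero clears the junk in lane 1 before it */
 +            /*                 ## is accumulated into dvdasum.                           */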
++            /*                 #if ROUND == 'Epilogue' */
++            dvdatmp          = _mm_unpacklo_pd(dvdatmp,_mm_setzero_pd());
++            /*                 #endif */
 +            dvdasum          = _mm_add_pd(dvdasum,dvdatmp);
 +            /*             #if ROUND == 'Loop' */
 +            gmx_mm_increment_2real_swizzle_pd(dvda+jnrA,dvda+jnrB,_mm_mul_pd(dvdatmp,_mm_mul_pd(isaj{J},isaj{J})));
 +            /*             #else */
 +            gmx_mm_increment_1real_pd(dvda+jnrA,_mm_mul_pd(dvdatmp,_mm_mul_pd(isaj{J},isaj{J})));
 +            /*             #endif */
 +            /*                 #define INNERFLOPS INNERFLOPS+13 */
 +            /*             #endif */
 +            velec            = _mm_mul_pd(qq{I}{J},rinv{I}{J});
 +            /*                 #define INNERFLOPS INNERFLOPS+1 */
 +            /*             #if 'Force' in KERNEL_VF */
 +            felec            = _mm_mul_pd(_mm_sub_pd(_mm_mul_pd(velec,rinv{I}{J}),fgb),rinv{I}{J});
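 +            /*                 ## felec = (velec*rinv - fgb)*rinv combines the analytic  */
 +            /*                 ## Coulomb derivative with the tabulated GB one; the last */
 +            /*                 ## rinv turns dV/dr into the scalar applied to dx/dy/dz.  */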
 +            /*                 #define INNERFLOPS INNERFLOPS+3 */
 +            /*             #endif */
 +
 +            /*         #elif KERNEL_ELEC=='Ewald' */
 +            /* EWALD ELECTROSTATICS */
 +
 +            /* Calculate Ewald table index by multiplying r with scale and truncating to integer */
 +            ewrt             = _mm_mul_pd(r{I}{J},ewtabscale);
 +            ewitab           = _mm_cvttpd_epi32(ewrt);
 +            eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
 +            /*             #define INNERFLOPS INNERFLOPS+4 */
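 +            /*             ## The Ewald table holds the long-range correction: below it  */
 +            /*             ## is subtracted from the analytic 1/r (potential) and 1/r^3  */
 +            /*             ## (force) Coulomb terms.                                     */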
 +            /*             #if 'Potential' in KERNEL_VF or KERNEL_MOD_ELEC=='PotentialSwitch' */
 +            ewitab           = _mm_slli_epi32(ewitab,2);
 +            ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
 +            /*                 #if ROUND == 'Loop' */
 +            ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
 +            /*                 #else */
 +            ewtabD           = _mm_setzero_pd();
 +            /*                 #endif */
 +            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
 +            ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
 +            /*                 #if ROUND == 'Loop' */
 +            ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
 +            /*                 #else */
 +            ewtabFn          = _mm_setzero_pd();
 +            /*                 #endif */
 +            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
 +            felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
 +            /*                 #define INNERFLOPS INNERFLOPS+2 */
 +            /*                 #if KERNEL_MOD_ELEC=='PotentialShift' */
 +            velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
 +            velec            = _mm_mul_pd(qq{I}{J},_mm_sub_pd(_mm_sub_pd(rinv{I}{J},sh_ewald),velec));
 +            /*                     #define INNERFLOPS INNERFLOPS+7 */
 +            /*                 #else */
 +            velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
 +            velec            = _mm_mul_pd(qq{I}{J},_mm_sub_pd(rinv{I}{J},velec));
 +            /*                     #define INNERFLOPS INNERFLOPS+6 */
 +            /*                 #endif */
 +            /*                 #if 'Force' in KERNEL_VF */
 +            felec            = _mm_mul_pd(_mm_mul_pd(qq{I}{J},rinv{I}{J}),_mm_sub_pd(rinvsq{I}{J},felec));
 +            /*                     #define INNERFLOPS INNERFLOPS+3 */
 +            /*                 #endif */
 +            /*             #elif KERNEL_VF=='Force' */
 +            /*                 #if ROUND == 'Loop' */
 +            gmx_mm_load_2pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
 +                                         &ewtabF,&ewtabFn);
 +            /*                 #else */
 +            gmx_mm_load_1pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
 +            /*                 #endif */
 +            felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
 +            felec            = _mm_mul_pd(_mm_mul_pd(qq{I}{J},rinv{I}{J}),_mm_sub_pd(rinvsq{I}{J},felec));
 +            /*                 #define INNERFLOPS INNERFLOPS+7 */
 +            /*             #endif */
 +
 +            /*         #elif KERNEL_ELEC=='CubicSplineTable' */
 +
 +            /* CUBIC SPLINE TABLE ELECTROSTATICS */
 +            Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
 +            /*             #if ROUND == 'Loop' */
 +            F                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
 +            /*             #else */
 +            F                = _mm_setzero_pd();
 +            /*             #endif */
 +            GMX_MM_TRANSPOSE2_PD(Y,F);
 +            G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
 +            /*             #if ROUND == 'Loop' */
 +            H                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
 +            /*             #else */
 +            H                = _mm_setzero_pd();
 +            /*             #endif */
 +            GMX_MM_TRANSPOSE2_PD(G,H);
 +            Heps             = _mm_mul_pd(vfeps,H);
 +            Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
 +            /*             #define INNERFLOPS INNERFLOPS+4 */
 +            /*             #if 'Potential' in KERNEL_VF */
 +            VV               = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
 +            velec            = _mm_mul_pd(qq{I}{J},VV);
 +            /*                 #define INNERFLOPS INNERFLOPS+3 */
 +            /*             #endif */
 +            /*             #if 'Force' in KERNEL_VF */
 +            FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
 +            felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq{I}{J},FF),_mm_mul_pd(vftabscale,rinv{I}{J})));
 +            /*                 #define INNERFLOPS INNERFLOPS+7 */
 +            /*             #endif */
 +            /*         #endif */
 +            /*         ## End of check for electrostatics interaction forms */
 +            /*     #endif */
 +            /*     ## END OF ELECTROSTATIC INTERACTION CHECK FOR PAIR I-J */
 +
 +            /*     #if 'vdw' in INTERACTION_FLAGS[I][J] */
 +
 +            /*         #if KERNEL_VDW=='LennardJones' */
 +
 +            /* LENNARD-JONES DISPERSION/REPULSION */
 +
 +            rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq{I}{J},rinvsq{I}{J}),rinvsq{I}{J});
 +            /*             #define INNERFLOPS INNERFLOPS+2 */
 +            /*             #if 'Potential' in KERNEL_VF or KERNEL_MOD_VDW=='PotentialSwitch' */
 +            vvdw6            = _mm_mul_pd(c6_{I}{J},rinvsix);
 +            vvdw12           = _mm_mul_pd(c12_{I}{J},_mm_mul_pd(rinvsix,rinvsix));
 +            /*                 #define INNERFLOPS INNERFLOPS+3 */
 +            /*                 #if KERNEL_MOD_VDW=='PotentialShift' */
 +            vvdw             = _mm_sub_pd(_mm_mul_pd( _mm_sub_pd(vvdw12 , _mm_mul_pd(c12_{I}{J},_mm_mul_pd(sh_vdw_invrcut6,sh_vdw_invrcut6))), one_twelfth) ,
 +                                          _mm_mul_pd( _mm_sub_pd(vvdw6,_mm_mul_pd(c6_{I}{J},sh_vdw_invrcut6)),one_sixth));
 +            /*                     #define INNERFLOPS INNERFLOPS+8 */
 +            /*                 #else */
 +            vvdw             = _mm_sub_pd( _mm_mul_pd(vvdw12,one_twelfth) , _mm_mul_pd(vvdw6,one_sixth) );
 +            /*                     #define INNERFLOPS INNERFLOPS+3 */
 +            /*                 #endif */
 +            /*                 ## Check for force inside potential check, i.e. this means we already did the potential part */
 +            /*                 #if 'Force' in KERNEL_VF */
 +            fvdw             = _mm_mul_pd(_mm_sub_pd(vvdw12,vvdw6),rinvsq{I}{J});
 +            /*                     #define INNERFLOPS INNERFLOPS+2 */
 +            /*                 #endif */
 +            /*             #elif KERNEL_VF=='Force' */
 +            /*                 ## Force-only LennardJones makes it possible to save 1 flop (they do add up...) */
 +            fvdw             = _mm_mul_pd(_mm_sub_pd(_mm_mul_pd(c12_{I}{J},rinvsix),c6_{I}{J}),_mm_mul_pd(rinvsix,rinvsq{I}{J}));
 +            /*                 #define INNERFLOPS INNERFLOPS+4 */
 +            /*             #endif */
 +
 +            /*         #elif KERNEL_VDW=='CubicSplineTable' */
 +
 +            /* CUBIC SPLINE TABLE DISPERSION */
 +            /*             #if 'Table' in KERNEL_ELEC */
 +            vfitab           = _mm_add_epi32(vfitab,ifour);
 +            /*             #endif                     */
 +            Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
 +            /*             #if ROUND == 'Loop' */
 +            F                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
 +            /*             #else */
 +            F                = _mm_setzero_pd();
 +            /*             #endif */
 +            GMX_MM_TRANSPOSE2_PD(Y,F);
 +            G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
 +            /*             #if ROUND == 'Loop' */
 +            H                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
 +            /*             #else */
 +            H                = _mm_setzero_pd();
 +            /*             #endif */
 +            GMX_MM_TRANSPOSE2_PD(G,H);
 +            Heps             = _mm_mul_pd(vfeps,H);
 +            Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
 +            /*             #define INNERFLOPS INNERFLOPS+4 */
 +            /*             #if 'Potential' in KERNEL_VF */
 +            VV               = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
 +            vvdw6            = _mm_mul_pd(c6_{I}{J},VV);
 +            /*                 #define INNERFLOPS INNERFLOPS+3 */
 +            /*             #endif */
 +            /*             #if 'Force' in KERNEL_VF */
 +            FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
 +            fvdw6            = _mm_mul_pd(c6_{I}{J},FF);
 +            /*                 #define INNERFLOPS INNERFLOPS+4 */
 +            /*             #endif */
 +
 +            /* CUBIC SPLINE TABLE REPULSION */
 +            vfitab           = _mm_add_epi32(vfitab,ifour);
 +            Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
 +            /*             #if ROUND == 'Loop' */
 +            F                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
 +            /*             #else */
 +            F                = _mm_setzero_pd();
 +            /*             #endif */
 +            GMX_MM_TRANSPOSE2_PD(Y,F);
 +            G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
 +            /*             #if ROUND == 'Loop' */
 +            H                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
 +            /*             #else */
 +            H                = _mm_setzero_pd();
 +            /*             #endif */
 +            GMX_MM_TRANSPOSE2_PD(G,H);
 +            Heps             = _mm_mul_pd(vfeps,H);
 +            Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
 +            /*             #define INNERFLOPS INNERFLOPS+4 */
 +            /*             #if 'Potential' in KERNEL_VF */
 +            VV               = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
 +            vvdw12           = _mm_mul_pd(c12_{I}{J},VV);
 +            /*                 #define INNERFLOPS INNERFLOPS+3 */
 +            /*             #endif */
 +            /*             #if 'Force' in KERNEL_VF */
 +            FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
 +            fvdw12           = _mm_mul_pd(c12_{I}{J},FF);
 +            /*                 #define INNERFLOPS INNERFLOPS+5 */
 +            /*             #endif */
 +            /*             #if 'Potential' in KERNEL_VF */
 +            vvdw             = _mm_add_pd(vvdw12,vvdw6);
 +            /*                 #define INNERFLOPS INNERFLOPS+1 */
 +            /*             #endif */
 +            /*             #if 'Force' in KERNEL_VF */
 +            fvdw             = _mm_xor_pd(signbit,_mm_mul_pd(_mm_add_pd(fvdw6,fvdw12),_mm_mul_pd(vftabscale,rinv{I}{J})));
 +            /*                 #define INNERFLOPS INNERFLOPS+4 */
 +            /*             #endif */
 +            /*         #endif */
 +            /*         ## End of check for vdw interaction forms */
 +            /*     #endif */
 +            /*     ## END OF VDW INTERACTION CHECK FOR PAIR I-J */
 +
 +            /*     #if 'switch' in INTERACTION_FLAGS[I][J] */
 +            d                = _mm_sub_pd(r{I}{J},rswitch);
 +            d                = _mm_max_pd(d,_mm_setzero_pd());
 +            d2               = _mm_mul_pd(d,d);
 +            sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
 +            /*         #define INNERFLOPS INNERFLOPS+10 */
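 +            /*         ## d = max(r - rswitch, 0), so sw stays exactly 1 inside the      */
 +            /*         ## switching radius; beyond it the fifth-degree polynomial        */
 +            /*         ## 1 + d^3*(swV3 + d*swV4 + d^2*swV5) tapers the potential to     */
 +            /*         ## zero at the cutoff. dsw below collects the matching derivative */
 +            /*         ## terms via the swF2/swF3/swF4 coefficients.                     */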
 +
 +            /*         #if 'Force' in KERNEL_VF */
 +            dsw              = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
 +            /*             #define INNERFLOPS INNERFLOPS+5 */
 +            /*         #endif */
 +
 +            /* Evaluate switch function */
 +            /*         #if 'Force' in KERNEL_VF */
 +            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
 +            /*             #if 'electrostatics' in INTERACTION_FLAGS[I][J] and KERNEL_MOD_ELEC=='PotentialSwitch' */
 +            felec            = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv{I}{J},_mm_mul_pd(velec,dsw)) );
 +            /*                 #define INNERFLOPS INNERFLOPS+4 */
 +            /*             #endif */
 +            /*             #if 'vdw' in INTERACTION_FLAGS[I][J] and KERNEL_MOD_VDW=='PotentialSwitch' */
 +            fvdw             = _mm_sub_pd( _mm_mul_pd(fvdw,sw) , _mm_mul_pd(rinv{I}{J},_mm_mul_pd(vvdw,dsw)) );
 +            /*                 #define INNERFLOPS INNERFLOPS+4 */
 +            /*             #endif */
 +            /*         #endif */
 +            /*         #if 'Potential' in KERNEL_VF */
 +            /*             #if 'electrostatics' in INTERACTION_FLAGS[I][J] and KERNEL_MOD_ELEC=='PotentialSwitch' */
 +            velec            = _mm_mul_pd(velec,sw);
 +            /*                 #define INNERFLOPS INNERFLOPS+1 */
 +            /*             #endif */
 +            /*             #if 'vdw' in INTERACTION_FLAGS[I][J] and KERNEL_MOD_VDW=='PotentialSwitch' */
 +            vvdw             = _mm_mul_pd(vvdw,sw);
 +            /*                 #define INNERFLOPS INNERFLOPS+1 */
 +            /*             #endif */
 +            /*         #endif */
 +            /*     #endif */
 +            /*     #if 'exactcutoff' in INTERACTION_FLAGS[I][J] */
 +            cutoff_mask      = _mm_cmplt_pd(rsq{I}{J},rcutoff2);
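 +            /*         ## cmplt gives an all-ones mask for lanes inside the cutoff and   */
 +            /*         ## zero otherwise; and-ing energies/forces with it below discards */
 +            /*         ## contributions from lanes beyond the cutoff.                    */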
 +            /*         #define INNERFLOPS INNERFLOPS+1 */
 +            /*     #endif */
 +
 +            /*     #if 'Potential' in KERNEL_VF */
 +            /* Update potential sum for this i atom from the interaction with this j atom. */
 +            /*         #if 'electrostatics' in INTERACTION_FLAGS[I][J] */
 +            /*             #if 'exactcutoff' in INTERACTION_FLAGS[I][J] */
 +            velec            = _mm_and_pd(velec,cutoff_mask);
 +            /*                 #define INNERFLOPS INNERFLOPS+1 */
 +            /*             #endif                                       */
 +            /*             #if ROUND == 'Epilogue' */
 +            velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
 +            /*             #endif */
 +            velecsum         = _mm_add_pd(velecsum,velec);
 +            /*             #define INNERFLOPS INNERFLOPS+1 */
 +            /*             #if KERNEL_ELEC=='GeneralizedBorn' */
 +            /*             #if 'exactcutoff' in INTERACTION_FLAGS[I][J] */
 +            vgb              = _mm_and_pd(vgb,cutoff_mask);
 +            /*                 #define INNERFLOPS INNERFLOPS+1 */
 +            /*             #endif                                       */
 +            /*             #if ROUND == 'Epilogue' */
 +            vgb              = _mm_unpacklo_pd(vgb,_mm_setzero_pd());
 +            /*             #endif */
 +            vgbsum           = _mm_add_pd(vgbsum,vgb);
 +            /*                 #define INNERFLOPS INNERFLOPS+1 */
 +            /*             #endif */
 +            /*         #endif */
 +            /*         #if 'vdw' in INTERACTION_FLAGS[I][J] */
 +            /*             #if 'exactcutoff' in INTERACTION_FLAGS[I][J] */
 +            vvdw             = _mm_and_pd(vvdw,cutoff_mask);
 +            /*                 #define INNERFLOPS INNERFLOPS+1 */
 +            /*             #endif                                       */
 +            /*             #if ROUND == 'Epilogue' */
 +            vvdw             = _mm_unpacklo_pd(vvdw,_mm_setzero_pd());
 +            /*             #endif */
 +            vvdwsum          = _mm_add_pd(vvdwsum,vvdw);
 +            /*             #define INNERFLOPS INNERFLOPS+1 */
 +            /*         #endif */
 +            /*     #endif */
 +
 +            /*     #if 'Force' in KERNEL_VF */
 +
 +            /*         #if 'electrostatics' in INTERACTION_FLAGS[I][J] and 'vdw' in INTERACTION_FLAGS[I][J] */
 +            fscal            = _mm_add_pd(felec,fvdw);
 +            /*             #define INNERFLOPS INNERFLOPS+1 */
 +            /*         #elif 'electrostatics' in INTERACTION_FLAGS[I][J] */
 +            fscal            = felec;
 +            /*         #elif 'vdw' in INTERACTION_FLAGS[I][J] */
 +            fscal            = fvdw;
 +            /*        #endif */
 +
 +            /*             #if 'exactcutoff' in INTERACTION_FLAGS[I][J] */
 +            fscal            = _mm_and_pd(fscal,cutoff_mask);
 +            /*                 #define INNERFLOPS INNERFLOPS+1 */
 +            /*             #endif                                       */
 +
 +            /*             #if ROUND == 'Epilogue' */
 +            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
 +            /*             #endif */
 +
 +            /* Calculate temporary vectorial force */
 +            tx               = _mm_mul_pd(fscal,dx{I}{J});
 +            ty               = _mm_mul_pd(fscal,dy{I}{J});
 +            tz               = _mm_mul_pd(fscal,dz{I}{J});
 +
 +            /* Update vectorial force */
 +            fix{I}             = _mm_add_pd(fix{I},tx);
 +            fiy{I}             = _mm_add_pd(fiy{I},ty);
 +            fiz{I}             = _mm_add_pd(fiz{I},tz);
 +            /*             #define INNERFLOPS INNERFLOPS+6 */
 +
 +            /* #if GEOMETRY_I == 'Particle'             */
 +            /*     #if ROUND == 'Loop' */
 +            gmx_mm_decrement_1rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,tx,ty,tz);
 +            /*     #else */
 +            gmx_mm_decrement_1rvec_1ptr_swizzle_pd(f+j_coord_offsetA,tx,ty,tz);
 +            /*     #endif */
 +            /*     #define INNERFLOPS INNERFLOPS+3      */
 +            /* #else                                    */
 +            fjx{J}             = _mm_add_pd(fjx{J},tx);
 +            fjy{J}             = _mm_add_pd(fjy{J},ty);
 +            fjz{J}             = _mm_add_pd(fjz{J},tz);
 +            /*     #define INNERFLOPS INNERFLOPS+3      */
 +            /* #endif                                   */
 +
 +            /*     #endif */
 +
 +            /*     #if 'exactcutoff' in INTERACTION_FLAGS[I][J] */
 +            /*         #if 0    ## This and the next two lines are a hack to maintain indentation in template file */
 +            {
 +                /*     #endif */
 +            }
 +            /*     #endif */
 +            /*     ## End of check for the interaction being outside the cutoff */
 +
 +            /* #endfor */
 +            /* ## End of loop over i-j interaction pairs */
 +
 +            /* #if 'Water' in GEOMETRY_I and GEOMETRY_J == 'Particle' */
 +            /*     #if ROUND == 'Loop' */
 +            gmx_mm_decrement_1rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
 +            /*     #else */
 +            gmx_mm_decrement_1rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0);
 +            /*     #endif */
 +            /*     #define INNERFLOPS INNERFLOPS+3      */
 +            /* #elif GEOMETRY_J == 'Water3'             */
 +            /*     #if ROUND == 'Loop' */
 +            gmx_mm_decrement_3rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
 +            /*     #else */
 +            gmx_mm_decrement_3rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
 +            /*     #endif */
 +            /*     #define INNERFLOPS INNERFLOPS+9      */
 +            /* #elif GEOMETRY_J == 'Water4'             */
 +            /*     #if 0 in PARTICLES_J                 */
 +            /*         #if ROUND == 'Loop' */
 +            gmx_mm_decrement_4rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
 +            /*         #else */
 +            gmx_mm_decrement_4rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
 +            /*         #endif */
 +            /*         #define INNERFLOPS INNERFLOPS+12 */
 +            /*     #else                                */
 +            /*         #if ROUND == 'Loop' */
 +            gmx_mm_decrement_3rvec_2ptr_swizzle_pd(f+j_coord_offsetA+DIM,f+j_coord_offsetB+DIM,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
 +            /*         #else */
 +            gmx_mm_decrement_3rvec_1ptr_swizzle_pd(f+j_coord_offsetA+DIM,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
 +            /*         #endif */
 +            /*         #define INNERFLOPS INNERFLOPS+9  */
 +            /*     #endif                               */
 +            /* #endif                                   */
 +
 +            /* Inner loop uses {INNERFLOPS} flops */
 +        }
 +
 +        /* #endfor */
 +
 +        /* End of innermost loop */
 +
 +        /* #if 'Force' in KERNEL_VF */
 +        /*     #if GEOMETRY_I == 'Particle'            */
 +        gmx_mm_update_iforce_1atom_swizzle_pd(fix0,fiy0,fiz0,
 +                                              f+i_coord_offset,fshift+i_shift_offset);
 +        /*         #define OUTERFLOPS OUTERFLOPS+6     */
 +        /*     #elif GEOMETRY_I == 'Water3'            */
 +        gmx_mm_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
 +                                              f+i_coord_offset,fshift+i_shift_offset);
 +        /*         #define OUTERFLOPS OUTERFLOPS+18    */
 +        /*     #elif GEOMETRY_I == 'Water4'            */
 +        /*         #if 0 in PARTICLES_I                */
 +        gmx_mm_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
 +                                              f+i_coord_offset,fshift+i_shift_offset);
 +        /*             #define OUTERFLOPS OUTERFLOPS+24    */
 +        /*         #else                               */
 +        gmx_mm_update_iforce_3atom_swizzle_pd(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
 +                                              f+i_coord_offset+DIM,fshift+i_shift_offset);
 +        /*             #define OUTERFLOPS OUTERFLOPS+18    */
 +        /*         #endif                              */
 +        /*     #endif                                  */
 +        /* #endif                                      */
 +
 +        /* #if 'Potential' in KERNEL_VF */
 +        ggid                        = gid[iidx];
 +        /* Update potential energies */
 +        /*     #if KERNEL_ELEC != 'None' */
 +        gmx_mm_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
 +        /*         #define OUTERFLOPS OUTERFLOPS+1 */
 +        /*     #endif */
 +        /*     #if 'GeneralizedBorn' in KERNEL_ELEC */
 +        gmx_mm_update_1pot_pd(vgbsum,kernel_data->energygrp_polarization+ggid);
 +        /*         #define OUTERFLOPS OUTERFLOPS+1 */
 +        /*     #endif */
 +        /*     #if KERNEL_VDW != 'None' */
 +        gmx_mm_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
 +        /*         #define OUTERFLOPS OUTERFLOPS+1 */
 +        /*     #endif */
 +        /* #endif */
 +        /*     #if 'GeneralizedBorn' in KERNEL_ELEC and 'Force' in KERNEL_VF */
 +        dvdasum = _mm_mul_pd(dvdasum, _mm_mul_pd(isai{I},isai{I}));
 +        gmx_mm_update_1pot_pd(dvdasum,dvda+inr);
 +        /*     #endif */
 +
 +        /* Increment number of inner iterations */
 +        inneriter                  += j_index_end - j_index_start;
 +
 +        /* Outer loop uses {OUTERFLOPS} flops */
 +    }
 +
 +    /* Increment number of outer iterations */
 +    outeriter        += nri;
 +
 +    /* Update outer/inner flops */
 +    /* ## NB: This is not important, it just affects the flopcount. However, since our preprocessor is */
 +    /* ## primitive and replaces aggressively even in strings inside these directives, we need to      */
 +    /* ## assemble the main part of the name (containing KERNEL/ELEC/VDW) directly in the source.      */
 +    /* #if GEOMETRY_I == 'Water3'            */
 +    /*     #define ISUFFIX '_W3'             */
 +    /* #elif GEOMETRY_I == 'Water4'          */
 +    /*     #define ISUFFIX '_W4'             */
 +    /* #else                                 */
 +    /*     #define ISUFFIX ''                */
 +    /* #endif                                */
 +    /* #if GEOMETRY_J == 'Water3'            */
 +    /*     #define JSUFFIX 'W3'              */
 +    /* #elif GEOMETRY_J == 'Water4'          */
 +    /*     #define JSUFFIX 'W4'              */
 +    /* #else                                 */
 +    /*     #define JSUFFIX ''                */
 +    /* #endif                                */
 +    /* #if 'PotentialAndForce' in KERNEL_VF  */
 +    /*     #define VFSUFFIX  '_VF'           */
 +    /* #elif 'Potential' in KERNEL_VF        */
 +    /*     #define VFSUFFIX '_V'             */
 +    /* #else                                 */
 +    /*     #define VFSUFFIX '_F'             */
 +    /* #endif                                */
 +
 +    /* #if KERNEL_ELEC != 'None' and KERNEL_VDW != 'None' */
 +    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW{ISUFFIX}{JSUFFIX}{VFSUFFIX},outeriter*{OUTERFLOPS} + inneriter*{INNERFLOPS});
 +    /* #elif KERNEL_ELEC != 'None' */
 +    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC{ISUFFIX}{JSUFFIX}{VFSUFFIX},outeriter*{OUTERFLOPS} + inneriter*{INNERFLOPS});
 +    /* #else */
 +    inc_nrnb(nrnb,eNR_NBKERNEL_VDW{ISUFFIX}{JSUFFIX}{VFSUFFIX},outeriter*{OUTERFLOPS} + inneriter*{INNERFLOPS});
 +    /* #endif  */
 +}
index 2354469b729abea5cdbb196b5f1c79a878d08395,0000000000000000000000000000000000000000..26bdb4cd3495a292e689505648600bf0f669dc01
mode 100644,000000..100644
--- /dev/null
@@@ -1,943 -1,0 +1,945 @@@
 +/*
 + * Note: this file was generated by the Gromacs sse2_single kernel generator.
 + *
 + *                This source code is part of
 + *
 + *                 G   R   O   M   A   C   S
 + *
 + * Copyright (c) 2001-2012, The GROMACS Development Team
 + *
 + * Gromacs is a library for molecular simulation and trajectory analysis,
 + * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
 + * a full list of developers and information, check out http://www.gromacs.org
 + *
 + * This program is free software; you can redistribute it and/or modify it under
 + * the terms of the GNU Lesser General Public License as published by the Free
 + * Software Foundation; either version 2 of the License, or (at your option) any
 + * later version.
 + *
 + * To help fund GROMACS development, we humbly ask that you cite
 + * the papers people have written on it - you can find them on the website.
 + */
 +#ifdef HAVE_CONFIG_H
 +#include <config.h>
 +#endif
 +
 +#include <math.h>
 +
 +#include "../nb_kernel.h"
 +#include "types/simple.h"
 +#include "vec.h"
 +#include "nrnb.h"
 +
 +#include "gmx_math_x86_sse2_single.h"
 +#include "kernelutil_x86_sse2_single.h"
 +
 +/*
 + * Gromacs nonbonded kernel:   nb_kernel_ElecGB_VdwCSTab_GeomP1P1_VF_sse2_single
 + * Electrostatics interaction: GeneralizedBorn
 + * VdW interaction:            CubicSplineTable
 + * Geometry:                   Particle-Particle
 + * Calculate force/pot:        PotentialAndForce
 + */
 +void
 +nb_kernel_ElecGB_VdwCSTab_GeomP1P1_VF_sse2_single
 +                    (t_nblist * gmx_restrict                nlist,
 +                     rvec * gmx_restrict                    xx,
 +                     rvec * gmx_restrict                    ff,
 +                     t_forcerec * gmx_restrict              fr,
 +                     t_mdatoms * gmx_restrict               mdatoms,
 +                     nb_kernel_data_t * gmx_restrict        kernel_data,
 +                     t_nrnb * gmx_restrict                  nrnb)
 +{
 +    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
 +     * just 0 for non-waters.
 +     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
 +     * jnr indices corresponding to data put in the four positions in the SIMD register.
 +     */
 +    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
 +    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
 +    int              jnrA,jnrB,jnrC,jnrD;
 +    int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
 +    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
 +    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
 +    real             rcutoff_scalar;
 +    real             *shiftvec,*fshift,*x,*f;
 +    real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
 +    real             scratch[4*DIM];
 +    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
 +    int              vdwioffset0;
 +    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
 +    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
 +    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
 +    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
 +    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
 +    real             *charge;
 +    __m128i          gbitab;
 +    __m128           vgb,fgb,vgbsum,dvdasum,gbscale,gbtabscale,isaprod,gbqqfactor,gbinvepsdiff,gbeps,dvdatmp;
 +    __m128           minushalf = _mm_set1_ps(-0.5);
 +    real             *invsqrta,*dvda,*gbtab;
 +    int              nvdwtype;
 +    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
 +    int              *vdwtype;
 +    real             *vdwparam;
 +    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
 +    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
 +    __m128i          vfitab;
 +    __m128i          ifour       = _mm_set1_epi32(4);
 +    __m128           rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
 +    real             *vftab;
 +    __m128           dummy_mask,cutoff_mask;
 +    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
 +    __m128           one     = _mm_set1_ps(1.0);
 +    __m128           two     = _mm_set1_ps(2.0);
 +    x                = xx[0];
 +    f                = ff[0];
 +
 +    nri              = nlist->nri;
 +    iinr             = nlist->iinr;
 +    jindex           = nlist->jindex;
 +    jjnr             = nlist->jjnr;
 +    shiftidx         = nlist->shift;
 +    gid              = nlist->gid;
 +    shiftvec         = fr->shift_vec[0];
 +    fshift           = fr->fshift[0];
 +    facel            = _mm_set1_ps(fr->epsfac);
 +    charge           = mdatoms->chargeA;
 +    nvdwtype         = fr->ntype;
 +    vdwparam         = fr->nbfp;
 +    vdwtype          = mdatoms->typeA;
 +
 +    vftab            = kernel_data->table_vdw->data;
 +    vftabscale       = _mm_set1_ps(kernel_data->table_vdw->scale);
 +
 +    invsqrta         = fr->invsqrta;
 +    dvda             = fr->dvda;
 +    gbtabscale       = _mm_set1_ps(fr->gbtab.scale);
 +    gbtab            = fr->gbtab.data;
 +    gbinvepsdiff     = _mm_set1_ps((1.0/fr->epsilon_r) - (1.0/fr->gb_epsilon_solvent));
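 +    /* (1/epsilon_r - 1/gb_epsilon_solvent) is the dielectric prefactor of the
 +     * generalized Born pair interaction; it enters gbqqfactor in the inner loop.
 +     */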
 +
 +    /* Avoid stupid compiler warnings */
 +    jnrA = jnrB = jnrC = jnrD = 0;
 +    j_coord_offsetA = 0;
 +    j_coord_offsetB = 0;
 +    j_coord_offsetC = 0;
 +    j_coord_offsetD = 0;
 +
 +    outeriter        = 0;
 +    inneriter        = 0;
 +
 +    for(iidx=0;iidx<4*DIM;iidx++)
 +    {
 +        scratch[iidx] = 0.0;
 +    }  
 +
 +    /* Start outer loop over neighborlists */
 +    for(iidx=0; iidx<nri; iidx++)
 +    {
 +        /* Load shift vector for this list */
 +        i_shift_offset   = DIM*shiftidx[iidx];
 +
 +        /* Load limits for loop over neighbors */
 +        j_index_start    = jindex[iidx];
 +        j_index_end      = jindex[iidx+1];
 +
 +        /* Get outer coordinate index */
 +        inr              = iinr[iidx];
 +        i_coord_offset   = DIM*inr;
 +
 +        /* Load i particle coords and add shift vector */
 +        gmx_mm_load_shift_and_1rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
 +        
 +        fix0             = _mm_setzero_ps();
 +        fiy0             = _mm_setzero_ps();
 +        fiz0             = _mm_setzero_ps();
 +
 +        /* Load parameters for i particles */
 +        iq0              = _mm_mul_ps(facel,_mm_load1_ps(charge+inr+0));
 +        isai0            = _mm_load1_ps(invsqrta+inr+0);
 +        vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
 +
 +        /* Reset potential sums */
 +        velecsum         = _mm_setzero_ps();
 +        vgbsum           = _mm_setzero_ps();
 +        vvdwsum          = _mm_setzero_ps();
 +        dvdasum          = _mm_setzero_ps();
 +
 +        /* Start inner kernel loop */
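 +        /* The unrolled loop handles four j atoms at a time and runs while the
 +         * last of the four indices is a real atom; padded (negative) entries at
 +         * the end of the list are left to the masked epilogue below.
 +         */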
 +        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
 +        {
 +
 +            /* Get j neighbor index, and coordinate index */
 +            jnrA             = jjnr[jidx];
 +            jnrB             = jjnr[jidx+1];
 +            jnrC             = jjnr[jidx+2];
 +            jnrD             = jjnr[jidx+3];
 +            j_coord_offsetA  = DIM*jnrA;
 +            j_coord_offsetB  = DIM*jnrB;
 +            j_coord_offsetC  = DIM*jnrC;
 +            j_coord_offsetD  = DIM*jnrD;
 +
 +            /* load j atom coordinates */
 +            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
 +                                              x+j_coord_offsetC,x+j_coord_offsetD,
 +                                              &jx0,&jy0,&jz0);
 +
 +            /* Calculate displacement vector */
 +            dx00             = _mm_sub_ps(ix0,jx0);
 +            dy00             = _mm_sub_ps(iy0,jy0);
 +            dz00             = _mm_sub_ps(iz0,jz0);
 +
 +            /* Calculate squared distance and things based on it */
 +            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
 +
 +            rinv00           = gmx_mm_invsqrt_ps(rsq00);
 +
 +            /* Load parameters for j particles */
 +            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
 +                                                              charge+jnrC+0,charge+jnrD+0);
 +            isaj0            = gmx_mm_load_4real_swizzle_ps(invsqrta+jnrA+0,invsqrta+jnrB+0,
 +                                                              invsqrta+jnrC+0,invsqrta+jnrD+0);
 +            vdwjidx0A        = 2*vdwtype[jnrA+0];
 +            vdwjidx0B        = 2*vdwtype[jnrB+0];
 +            vdwjidx0C        = 2*vdwtype[jnrC+0];
 +            vdwjidx0D        = 2*vdwtype[jnrD+0];
 +
 +            /**************************
 +             * CALCULATE INTERACTIONS *
 +             **************************/
 +
 +            r00              = _mm_mul_ps(rsq00,rinv00);
 +
 +            /* Compute parameters for interactions between i and j atoms */
 +            qq00             = _mm_mul_ps(iq0,jq0);
 +            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
 +                                         vdwparam+vdwioffset0+vdwjidx0B,
 +                                         vdwparam+vdwioffset0+vdwjidx0C,
 +                                         vdwparam+vdwioffset0+vdwjidx0D,
 +                                         &c6_00,&c12_00);
 +
 +            /* Calculate table index by multiplying r with table scale and truncating to integer */
 +            rt               = _mm_mul_ps(r00,vftabscale);
 +            vfitab           = _mm_cvttps_epi32(rt);
 +            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
 +            vfitab           = _mm_slli_epi32(vfitab,3);
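 +            /* Shift by 3 multiplies the index by 8: the VdW table interleaves two
 +             * sub-tables (dispersion and repulsion) with 4 floats per table point.
 +             */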
 +
 +            /* GENERALIZED BORN AND COULOMB ELECTROSTATICS */
 +            isaprod          = _mm_mul_ps(isai0,isaj0);
 +            gbqqfactor       = _mm_xor_ps(signbit,_mm_mul_ps(qq00,_mm_mul_ps(isaprod,gbinvepsdiff)));
 +            gbscale          = _mm_mul_ps(isaprod,gbtabscale);
 +
 +            /* Calculate generalized born table index - this is a separate table from the normal one,
 +             * but we use the same procedure by multiplying r with scale and truncating to integer.
 +             */
 +            rt               = _mm_mul_ps(r00,gbscale);
 +            gbitab           = _mm_cvttps_epi32(rt);
 +            gbeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(gbitab));
 +            gbitab           = _mm_slli_epi32(gbitab,2);
 +
 +            Y                = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,0) );
 +            F                = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,1) );
 +            G                = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,2) );
 +            H                = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,3) );
 +            _MM_TRANSPOSE4_PS(Y,F,G,H);
 +            Heps             = _mm_mul_ps(gbeps,H);
 +            Fp               = _mm_add_ps(F,_mm_mul_ps(gbeps,_mm_add_ps(G,Heps)));
 +            VV               = _mm_add_ps(Y,_mm_mul_ps(gbeps,Fp));
 +            vgb              = _mm_mul_ps(gbqqfactor,VV);
 +
 +            FF               = _mm_add_ps(Fp,_mm_mul_ps(gbeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
 +            fgb              = _mm_mul_ps(gbqqfactor,_mm_mul_ps(FF,gbscale));
 +            dvdatmp          = _mm_mul_ps(minushalf,_mm_add_ps(vgb,_mm_mul_ps(fgb,r00)));
 +            dvdasum          = _mm_add_ps(dvdasum,dvdatmp);
 +            fjptrA           = dvda+jnrA;
 +            fjptrB           = dvda+jnrB;
 +            fjptrC           = dvda+jnrC;
 +            fjptrD           = dvda+jnrD;
 +            gmx_mm_increment_4real_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,_mm_mul_ps(dvdatmp,_mm_mul_ps(isaj0,isaj0)));
 +            velec            = _mm_mul_ps(qq00,rinv00);
 +            felec            = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(velec,rinv00),fgb),rinv00);
 +
 +            /* CUBIC SPLINE TABLE DISPERSION */
 +            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
 +            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
 +            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
 +            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
 +            _MM_TRANSPOSE4_PS(Y,F,G,H);
 +            Heps             = _mm_mul_ps(vfeps,H);
 +            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
 +            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
 +            vvdw6            = _mm_mul_ps(c6_00,VV);
 +            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
 +            fvdw6            = _mm_mul_ps(c6_00,FF);
 +
 +            /* CUBIC SPLINE TABLE REPULSION */
 +            vfitab           = _mm_add_epi32(vfitab,ifour);
 +            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
 +            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
 +            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
 +            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
 +            _MM_TRANSPOSE4_PS(Y,F,G,H);
 +            Heps             = _mm_mul_ps(vfeps,H);
 +            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
 +            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
 +            vvdw12           = _mm_mul_ps(c12_00,VV);
 +            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
 +            fvdw12           = _mm_mul_ps(c12_00,FF);
 +            vvdw             = _mm_add_ps(vvdw12,vvdw6);
 +            fvdw             = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
 +
 +            /* Update potential sum for this i atom from the interaction with this j atom. */
 +            velecsum         = _mm_add_ps(velecsum,velec);
 +            vgbsum           = _mm_add_ps(vgbsum,vgb);
 +            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
 +
 +            fscal            = _mm_add_ps(felec,fvdw);
 +
 +            /* Calculate temporary vectorial force */
 +            tx               = _mm_mul_ps(fscal,dx00);
 +            ty               = _mm_mul_ps(fscal,dy00);
 +            tz               = _mm_mul_ps(fscal,dz00);
 +
 +            /* Update vectorial force */
 +            fix0             = _mm_add_ps(fix0,tx);
 +            fiy0             = _mm_add_ps(fiy0,ty);
 +            fiz0             = _mm_add_ps(fiz0,tz);
 +
 +            fjptrA             = f+j_coord_offsetA;
 +            fjptrB             = f+j_coord_offsetB;
 +            fjptrC             = f+j_coord_offsetC;
 +            fjptrD             = f+j_coord_offsetD;
 +            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
 +            
 +            /* Inner loop uses 92 flops */
 +        }
 +
 +        if(jidx<j_index_end)
 +        {
 +
 +            /* Get j neighbor index, and coordinate index */
 +            jnrlistA         = jjnr[jidx];
 +            jnrlistB         = jjnr[jidx+1];
 +            jnrlistC         = jjnr[jidx+2];
 +            jnrlistD         = jjnr[jidx+3];
 +            /* Sign of each element will be negative for non-real atoms.
 +             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
 +             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
 +             */
 +            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 +            jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
 +            jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
 +            jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
 +            jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
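 +            /* Dummy entries were clamped to index 0 above so the gather loads stay
 +             * in bounds; their contributions are removed later using dummy_mask.
 +             */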
 +            j_coord_offsetA  = DIM*jnrA;
 +            j_coord_offsetB  = DIM*jnrB;
 +            j_coord_offsetC  = DIM*jnrC;
 +            j_coord_offsetD  = DIM*jnrD;
 +
 +            /* load j atom coordinates */
 +            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
 +                                              x+j_coord_offsetC,x+j_coord_offsetD,
 +                                              &jx0,&jy0,&jz0);
 +
 +            /* Calculate displacement vector */
 +            dx00             = _mm_sub_ps(ix0,jx0);
 +            dy00             = _mm_sub_ps(iy0,jy0);
 +            dz00             = _mm_sub_ps(iz0,jz0);
 +
 +            /* Calculate squared distance and things based on it */
 +            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
 +
 +            rinv00           = gmx_mm_invsqrt_ps(rsq00);
 +
 +            /* Load parameters for j particles */
 +            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
 +                                                              charge+jnrC+0,charge+jnrD+0);
 +            isaj0            = gmx_mm_load_4real_swizzle_ps(invsqrta+jnrA+0,invsqrta+jnrB+0,
 +                                                              invsqrta+jnrC+0,invsqrta+jnrD+0);
 +            vdwjidx0A        = 2*vdwtype[jnrA+0];
 +            vdwjidx0B        = 2*vdwtype[jnrB+0];
 +            vdwjidx0C        = 2*vdwtype[jnrC+0];
 +            vdwjidx0D        = 2*vdwtype[jnrD+0];
 +
 +            /**************************
 +             * CALCULATE INTERACTIONS *
 +             **************************/
 +
 +            r00              = _mm_mul_ps(rsq00,rinv00);
 +            r00              = _mm_andnot_ps(dummy_mask,r00);
 +
 +            /* Compute parameters for interactions between i and j atoms */
 +            qq00             = _mm_mul_ps(iq0,jq0);
 +            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
 +                                         vdwparam+vdwioffset0+vdwjidx0B,
 +                                         vdwparam+vdwioffset0+vdwjidx0C,
 +                                         vdwparam+vdwioffset0+vdwjidx0D,
 +                                         &c6_00,&c12_00);
 +
 +            /* Calculate table index by multiplying r with table scale and truncate to integer */
 +            rt               = _mm_mul_ps(r00,vftabscale);
 +            vfitab           = _mm_cvttps_epi32(rt);
 +            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
 +            vfitab           = _mm_slli_epi32(vfitab,3);
 +
 +            /* GENERALIZED BORN AND COULOMB ELECTROSTATICS */
 +            isaprod          = _mm_mul_ps(isai0,isaj0);
 +            gbqqfactor       = _mm_xor_ps(signbit,_mm_mul_ps(qq00,_mm_mul_ps(isaprod,gbinvepsdiff)));
 +            gbscale          = _mm_mul_ps(isaprod,gbtabscale);
 +
 +            /* Calculate generalized born table index - this is a separate table from the normal one,
 +             * but we use the same procedure by multiplying r with scale and truncating to integer.
 +             */
 +            rt               = _mm_mul_ps(r00,gbscale);
 +            gbitab           = _mm_cvttps_epi32(rt);
 +            gbeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(gbitab));
 +            gbitab           = _mm_slli_epi32(gbitab,2);
 +
 +            Y                = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,0) );
 +            F                = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,1) );
 +            G                = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,2) );
 +            H                = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,3) );
 +            _MM_TRANSPOSE4_PS(Y,F,G,H);
 +            Heps             = _mm_mul_ps(gbeps,H);
 +            Fp               = _mm_add_ps(F,_mm_mul_ps(gbeps,_mm_add_ps(G,Heps)));
 +            VV               = _mm_add_ps(Y,_mm_mul_ps(gbeps,Fp));
 +            vgb              = _mm_mul_ps(gbqqfactor,VV);
 +
 +            FF               = _mm_add_ps(Fp,_mm_mul_ps(gbeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
 +            fgb              = _mm_mul_ps(gbqqfactor,_mm_mul_ps(FF,gbscale));
 +            dvdatmp          = _mm_mul_ps(minushalf,_mm_add_ps(vgb,_mm_mul_ps(fgb,r00)));
++            dvdatmp          = _mm_andnot_ps(dummy_mask,dvdatmp);
 +            dvdasum          = _mm_add_ps(dvdasum,dvdatmp);
 +            /* The pointers to scratch make sure that compilers which take gmx_restrict seriously (e.g. icc 13) cannot miscompile this code. */
 +            fjptrA             = (jnrlistA>=0) ? dvda+jnrA : scratch;
 +            fjptrB             = (jnrlistB>=0) ? dvda+jnrB : scratch;
 +            fjptrC             = (jnrlistC>=0) ? dvda+jnrC : scratch;
 +            fjptrD             = (jnrlistD>=0) ? dvda+jnrD : scratch;
 +            gmx_mm_increment_4real_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,_mm_mul_ps(dvdatmp,_mm_mul_ps(isaj0,isaj0)));
 +            velec            = _mm_mul_ps(qq00,rinv00);
 +            felec            = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(velec,rinv00),fgb),rinv00);
 +
 +            /* CUBIC SPLINE TABLE DISPERSION */
 +            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
 +            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
 +            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
 +            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
 +            _MM_TRANSPOSE4_PS(Y,F,G,H);
 +            Heps             = _mm_mul_ps(vfeps,H);
 +            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
 +            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
 +            vvdw6            = _mm_mul_ps(c6_00,VV);
 +            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
 +            fvdw6            = _mm_mul_ps(c6_00,FF);
 +
 +            /* CUBIC SPLINE TABLE REPULSION */
 +            vfitab           = _mm_add_epi32(vfitab,ifour);
 +            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
 +            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
 +            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
 +            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
 +            _MM_TRANSPOSE4_PS(Y,F,G,H);
 +            Heps             = _mm_mul_ps(vfeps,H);
 +            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
 +            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
 +            vvdw12           = _mm_mul_ps(c12_00,VV);
 +            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
 +            fvdw12           = _mm_mul_ps(c12_00,FF);
 +            vvdw             = _mm_add_ps(vvdw12,vvdw6);
 +            fvdw             = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
 +
 +            /* Update potential sum for this i atom from the interaction with this j atom. */
 +            velec            = _mm_andnot_ps(dummy_mask,velec);
 +            velecsum         = _mm_add_ps(velecsum,velec);
 +            vgb              = _mm_andnot_ps(dummy_mask,vgb);
 +            vgbsum           = _mm_add_ps(vgbsum,vgb);
 +            vvdw             = _mm_andnot_ps(dummy_mask,vvdw);
 +            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
 +
 +            fscal            = _mm_add_ps(felec,fvdw);
 +
 +            fscal            = _mm_andnot_ps(dummy_mask,fscal);
 +
 +            /* Calculate temporary vectorial force */
 +            tx               = _mm_mul_ps(fscal,dx00);
 +            ty               = _mm_mul_ps(fscal,dy00);
 +            tz               = _mm_mul_ps(fscal,dz00);
 +
 +            /* Update vectorial force */
 +            fix0             = _mm_add_ps(fix0,tx);
 +            fiy0             = _mm_add_ps(fiy0,ty);
 +            fiz0             = _mm_add_ps(fiz0,tz);
 +
 +            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
 +            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
 +            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
 +            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
 +            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
 +            
 +            /* Inner loop uses 93 flops */
 +        }
 +
 +        /* End of innermost loop */
 +
 +        gmx_mm_update_iforce_1atom_swizzle_ps(fix0,fiy0,fiz0,
 +                                              f+i_coord_offset,fshift+i_shift_offset);
 +
 +        ggid                        = gid[iidx];
 +        /* Update potential energies */
 +        gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
 +        gmx_mm_update_1pot_ps(vgbsum,kernel_data->energygrp_polarization+ggid);
 +        gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
 +        dvdasum = _mm_mul_ps(dvdasum, _mm_mul_ps(isai0,isai0));
 +        gmx_mm_update_1pot_ps(dvdasum,dvda+inr);
 +
 +        /* Increment number of inner iterations */
 +        inneriter                  += j_index_end - j_index_start;
 +
 +        /* Outer loop uses 10 flops */
 +    }
 +
 +    /* Increment number of outer iterations */
 +    outeriter        += nri;
 +
 +    /* Update outer/inner flops */
 +
 +    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_VF,outeriter*10 + inneriter*93);
 +}
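The table lookups above evaluate a cubic spline stored as coefficient quadruplets (Y,F,G,H), one per table point: the truncated index selects the quadruplet and the fractional remainder drives the polynomial. Below is a minimal scalar sketch of that arithmetic, assuming rt is already r times the table scale and a single sub-table (the GB case; the VdW table interleaves dispersion and repulsion quadruplets, hence the shift by 3 and the +4 offset above). The helper name is hypothetical; the real kernels do this for four interactions at once in SSE registers.

    /* Scalar sketch of the Y,F,G,H cubic-spline evaluation (hypothetical
     * helper, not part of the generated kernel). Assumes rt = r * tabscale.
     */
    static void spline_table_eval(const float *table, float rt,
                                  float *VV, float *FF)
    {
        int          idx = (int)rt;           /* truncate to table index */
        float        eps = rt - (float)idx;   /* fractional offset       */
        const float *p   = table + 4*idx;     /* quadruplet Y,F,G,H      */
        float Y = p[0], F = p[1], G = p[2], H = p[3];

        float Heps = eps*H;
        float Fp   = F + eps*(G + Heps);
        *VV = Y + eps*Fp;                     /* potential               */
        *FF = Fp + eps*(G + 2.0f*Heps);       /* derivative w.r.t. eps;  */
        /* the kernel turns FF into a force by scaling with the table   */
        /* scale and 1/r, and negating.                                 */
    }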
 +/*
 + * Gromacs nonbonded kernel:   nb_kernel_ElecGB_VdwCSTab_GeomP1P1_F_sse2_single
 + * Electrostatics interaction: GeneralizedBorn
 + * VdW interaction:            CubicSplineTable
 + * Geometry:                   Particle-Particle
 + * Calculate force/pot:        Force
 + */
 +void
 +nb_kernel_ElecGB_VdwCSTab_GeomP1P1_F_sse2_single
 +                    (t_nblist * gmx_restrict                nlist,
 +                     rvec * gmx_restrict                    xx,
 +                     rvec * gmx_restrict                    ff,
 +                     t_forcerec * gmx_restrict              fr,
 +                     t_mdatoms * gmx_restrict               mdatoms,
 +                     nb_kernel_data_t * gmx_restrict        kernel_data,
 +                     t_nrnb * gmx_restrict                  nrnb)
 +{
 +    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
 +     * just 0 for non-waters.
 +     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
 +     * jnr indices corresponding to data put in the four positions in the SIMD register.
 +     */
 +    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
 +    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
 +    int              jnrA,jnrB,jnrC,jnrD;
 +    int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
 +    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
 +    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
 +    real             rcutoff_scalar;
 +    real             *shiftvec,*fshift,*x,*f;
 +    real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
 +    real             scratch[4*DIM];
 +    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
 +    int              vdwioffset0;
 +    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
 +    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
 +    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
 +    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
 +    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
 +    real             *charge;
 +    __m128i          gbitab;
 +    __m128           vgb,fgb,vgbsum,dvdasum,gbscale,gbtabscale,isaprod,gbqqfactor,gbinvepsdiff,gbeps,dvdatmp;
 +    __m128           minushalf = _mm_set1_ps(-0.5);
 +    real             *invsqrta,*dvda,*gbtab;
 +    int              nvdwtype;
 +    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
 +    int              *vdwtype;
 +    real             *vdwparam;
 +    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
 +    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
 +    __m128i          vfitab;
 +    __m128i          ifour       = _mm_set1_epi32(4);
 +    __m128           rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
 +    real             *vftab;
 +    __m128           dummy_mask,cutoff_mask;
 +    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
 +    __m128           one     = _mm_set1_ps(1.0);
 +    __m128           two     = _mm_set1_ps(2.0);
 +    x                = xx[0];
 +    f                = ff[0];
 +
 +    nri              = nlist->nri;
 +    iinr             = nlist->iinr;
 +    jindex           = nlist->jindex;
 +    jjnr             = nlist->jjnr;
 +    shiftidx         = nlist->shift;
 +    gid              = nlist->gid;
 +    shiftvec         = fr->shift_vec[0];
 +    fshift           = fr->fshift[0];
 +    facel            = _mm_set1_ps(fr->epsfac);
 +    charge           = mdatoms->chargeA;
 +    nvdwtype         = fr->ntype;
 +    vdwparam         = fr->nbfp;
 +    vdwtype          = mdatoms->typeA;
 +
 +    vftab            = kernel_data->table_vdw->data;
 +    vftabscale       = _mm_set1_ps(kernel_data->table_vdw->scale);
 +
 +    invsqrta         = fr->invsqrta;
 +    dvda             = fr->dvda;
 +    gbtabscale       = _mm_set1_ps(fr->gbtab.scale);
 +    gbtab            = fr->gbtab.data;
 +    gbinvepsdiff     = _mm_set1_ps((1.0/fr->epsilon_r) - (1.0/fr->gb_epsilon_solvent));
 +
 +    /* Avoid stupid compiler warnings */
 +    jnrA = jnrB = jnrC = jnrD = 0;
 +    j_coord_offsetA = 0;
 +    j_coord_offsetB = 0;
 +    j_coord_offsetC = 0;
 +    j_coord_offsetD = 0;
 +
 +    outeriter        = 0;
 +    inneriter        = 0;
 +
 +    for(iidx=0;iidx<4*DIM;iidx++)
 +    {
 +        scratch[iidx] = 0.0;
 +    }  
 +
 +    /* Start outer loop over neighborlists */
 +    for(iidx=0; iidx<nri; iidx++)
 +    {
 +        /* Load shift vector for this list */
 +        i_shift_offset   = DIM*shiftidx[iidx];
 +
 +        /* Load limits for loop over neighbors */
 +        j_index_start    = jindex[iidx];
 +        j_index_end      = jindex[iidx+1];
 +
 +        /* Get outer coordinate index */
 +        inr              = iinr[iidx];
 +        i_coord_offset   = DIM*inr;
 +
 +        /* Load i particle coords and add shift vector */
 +        gmx_mm_load_shift_and_1rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
 +        
 +        fix0             = _mm_setzero_ps();
 +        fiy0             = _mm_setzero_ps();
 +        fiz0             = _mm_setzero_ps();
 +
 +        /* Load parameters for i particles */
 +        iq0              = _mm_mul_ps(facel,_mm_load1_ps(charge+inr+0));
 +        isai0            = _mm_load1_ps(invsqrta+inr+0);
 +        vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
 +
 +        dvdasum          = _mm_setzero_ps();
 +
 +        /* Start inner kernel loop */
 +        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
 +        {
 +
 +            /* Get j neighbor index, and coordinate index */
 +            jnrA             = jjnr[jidx];
 +            jnrB             = jjnr[jidx+1];
 +            jnrC             = jjnr[jidx+2];
 +            jnrD             = jjnr[jidx+3];
 +            j_coord_offsetA  = DIM*jnrA;
 +            j_coord_offsetB  = DIM*jnrB;
 +            j_coord_offsetC  = DIM*jnrC;
 +            j_coord_offsetD  = DIM*jnrD;
 +
 +            /* load j atom coordinates */
 +            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
 +                                              x+j_coord_offsetC,x+j_coord_offsetD,
 +                                              &jx0,&jy0,&jz0);
 +
 +            /* Calculate displacement vector */
 +            dx00             = _mm_sub_ps(ix0,jx0);
 +            dy00             = _mm_sub_ps(iy0,jy0);
 +            dz00             = _mm_sub_ps(iz0,jz0);
 +
 +            /* Calculate squared distance and things based on it */
 +            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
 +
 +            rinv00           = gmx_mm_invsqrt_ps(rsq00);
 +
 +            /* Load parameters for j particles */
 +            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
 +                                                              charge+jnrC+0,charge+jnrD+0);
 +            isaj0            = gmx_mm_load_4real_swizzle_ps(invsqrta+jnrA+0,invsqrta+jnrB+0,
 +                                                              invsqrta+jnrC+0,invsqrta+jnrD+0);
 +            vdwjidx0A        = 2*vdwtype[jnrA+0];
 +            vdwjidx0B        = 2*vdwtype[jnrB+0];
 +            vdwjidx0C        = 2*vdwtype[jnrC+0];
 +            vdwjidx0D        = 2*vdwtype[jnrD+0];
 +
 +            /**************************
 +             * CALCULATE INTERACTIONS *
 +             **************************/
 +
 +            r00              = _mm_mul_ps(rsq00,rinv00);
 +
 +            /* Compute parameters for interactions between i and j atoms */
 +            qq00             = _mm_mul_ps(iq0,jq0);
 +            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
 +                                         vdwparam+vdwioffset0+vdwjidx0B,
 +                                         vdwparam+vdwioffset0+vdwjidx0C,
 +                                         vdwparam+vdwioffset0+vdwjidx0D,
 +                                         &c6_00,&c12_00);
 +
 +            /* Calculate table index by multiplying r with table scale and truncate to integer */
 +            rt               = _mm_mul_ps(r00,vftabscale);
 +            vfitab           = _mm_cvttps_epi32(rt);
 +            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
 +            vfitab           = _mm_slli_epi32(vfitab,3);
 +
 +            /* GENERALIZED BORN AND COULOMB ELECTROSTATICS */
 +            isaprod          = _mm_mul_ps(isai0,isaj0);
 +            gbqqfactor       = _mm_xor_ps(signbit,_mm_mul_ps(qq00,_mm_mul_ps(isaprod,gbinvepsdiff)));
 +            gbscale          = _mm_mul_ps(isaprod,gbtabscale);
 +
 +            /* Calculate generalized born table index - this is a separate table from the normal one,
 +             * but we use the same procedure by multiplying r with scale and truncating to integer.
 +             */
 +            rt               = _mm_mul_ps(r00,gbscale);
 +            gbitab           = _mm_cvttps_epi32(rt);
 +            gbeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(gbitab));
 +            gbitab           = _mm_slli_epi32(gbitab,2);
 +
 +            Y                = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,0) );
 +            F                = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,1) );
 +            G                = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,2) );
 +            H                = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,3) );
 +            _MM_TRANSPOSE4_PS(Y,F,G,H);
 +            Heps             = _mm_mul_ps(gbeps,H);
 +            Fp               = _mm_add_ps(F,_mm_mul_ps(gbeps,_mm_add_ps(G,Heps)));
 +            VV               = _mm_add_ps(Y,_mm_mul_ps(gbeps,Fp));
 +            vgb              = _mm_mul_ps(gbqqfactor,VV);
 +
 +            FF               = _mm_add_ps(Fp,_mm_mul_ps(gbeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
 +            fgb              = _mm_mul_ps(gbqqfactor,_mm_mul_ps(FF,gbscale));
 +            dvdatmp          = _mm_mul_ps(minushalf,_mm_add_ps(vgb,_mm_mul_ps(fgb,r00)));
 +            dvdasum          = _mm_add_ps(dvdasum,dvdatmp);
 +            fjptrA           = dvda+jnrA;
 +            fjptrB           = dvda+jnrB;
 +            fjptrC           = dvda+jnrC;
 +            fjptrD           = dvda+jnrD;
 +            gmx_mm_increment_4real_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,_mm_mul_ps(dvdatmp,_mm_mul_ps(isaj0,isaj0)));
 +            velec            = _mm_mul_ps(qq00,rinv00);
 +            felec            = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(velec,rinv00),fgb),rinv00);
 +
 +            /* CUBIC SPLINE TABLE DISPERSION */
 +            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
 +            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
 +            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
 +            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
 +            _MM_TRANSPOSE4_PS(Y,F,G,H);
 +            Heps             = _mm_mul_ps(vfeps,H);
 +            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
 +            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
 +            fvdw6            = _mm_mul_ps(c6_00,FF);
 +
 +            /* CUBIC SPLINE TABLE REPULSION */
 +            vfitab           = _mm_add_epi32(vfitab,ifour);
 +            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
 +            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
 +            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
 +            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
 +            _MM_TRANSPOSE4_PS(Y,F,G,H);
 +            Heps             = _mm_mul_ps(vfeps,H);
 +            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
 +            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
 +            fvdw12           = _mm_mul_ps(c12_00,FF);
 +            fvdw             = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
 +
 +            fscal            = _mm_add_ps(felec,fvdw);
 +
 +            /* Calculate temporary vectorial force */
 +            tx               = _mm_mul_ps(fscal,dx00);
 +            ty               = _mm_mul_ps(fscal,dy00);
 +            tz               = _mm_mul_ps(fscal,dz00);
 +
 +            /* Update vectorial force */
 +            fix0             = _mm_add_ps(fix0,tx);
 +            fiy0             = _mm_add_ps(fiy0,ty);
 +            fiz0             = _mm_add_ps(fiz0,tz);
 +
 +            fjptrA             = f+j_coord_offsetA;
 +            fjptrB             = f+j_coord_offsetB;
 +            fjptrC             = f+j_coord_offsetC;
 +            fjptrD             = f+j_coord_offsetD;
 +            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
 +            
 +            /* Inner loop uses 82 flops */
 +        }
 +
 +        if(jidx<j_index_end)
 +        {
 +
 +            /* Get j neighbor index, and coordinate index */
 +            jnrlistA         = jjnr[jidx];
 +            jnrlistB         = jjnr[jidx+1];
 +            jnrlistC         = jjnr[jidx+2];
 +            jnrlistD         = jjnr[jidx+3];
 +            /* Sign of each element will be negative for non-real atoms.
 +             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
 +             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
 +             */
 +            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 +            jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
 +            jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
 +            jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
 +            jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
 +            j_coord_offsetA  = DIM*jnrA;
 +            j_coord_offsetB  = DIM*jnrB;
 +            j_coord_offsetC  = DIM*jnrC;
 +            j_coord_offsetD  = DIM*jnrD;
 +
 +            /* load j atom coordinates */
 +            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
 +                                              x+j_coord_offsetC,x+j_coord_offsetD,
 +                                              &jx0,&jy0,&jz0);
 +
 +            /* Calculate displacement vector */
 +            dx00             = _mm_sub_ps(ix0,jx0);
 +            dy00             = _mm_sub_ps(iy0,jy0);
 +            dz00             = _mm_sub_ps(iz0,jz0);
 +
 +            /* Calculate squared distance and things based on it */
 +            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
 +
 +            rinv00           = gmx_mm_invsqrt_ps(rsq00);
 +
 +            /* Load parameters for j particles */
 +            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
 +                                                              charge+jnrC+0,charge+jnrD+0);
 +            isaj0            = gmx_mm_load_4real_swizzle_ps(invsqrta+jnrA+0,invsqrta+jnrB+0,
 +                                                              invsqrta+jnrC+0,invsqrta+jnrD+0);
 +            vdwjidx0A        = 2*vdwtype[jnrA+0];
 +            vdwjidx0B        = 2*vdwtype[jnrB+0];
 +            vdwjidx0C        = 2*vdwtype[jnrC+0];
 +            vdwjidx0D        = 2*vdwtype[jnrD+0];
 +
 +            /**************************
 +             * CALCULATE INTERACTIONS *
 +             **************************/
 +
 +            r00              = _mm_mul_ps(rsq00,rinv00);
 +            r00              = _mm_andnot_ps(dummy_mask,r00);
 +
 +            /* Compute parameters for interactions between i and j atoms */
 +            qq00             = _mm_mul_ps(iq0,jq0);
 +            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
 +                                         vdwparam+vdwioffset0+vdwjidx0B,
 +                                         vdwparam+vdwioffset0+vdwjidx0C,
 +                                         vdwparam+vdwioffset0+vdwjidx0D,
 +                                         &c6_00,&c12_00);
 +
 +            /* Calculate table index by multiplying r with table scale and truncate to integer */
 +            rt               = _mm_mul_ps(r00,vftabscale);
 +            vfitab           = _mm_cvttps_epi32(rt);
 +            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
 +            vfitab           = _mm_slli_epi32(vfitab,3);
 +
 +            /* GENERALIZED BORN AND COULOMB ELECTROSTATICS */
 +            isaprod          = _mm_mul_ps(isai0,isaj0);
 +            gbqqfactor       = _mm_xor_ps(signbit,_mm_mul_ps(qq00,_mm_mul_ps(isaprod,gbinvepsdiff)));
 +            gbscale          = _mm_mul_ps(isaprod,gbtabscale);
 +
 +            /* Calculate generalized born table index - this is a separate table from the normal one,
 +             * but we use the same procedure by multiplying r with scale and truncating to integer.
 +             */
 +            rt               = _mm_mul_ps(r00,gbscale);
 +            gbitab           = _mm_cvttps_epi32(rt);
 +            gbeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(gbitab));
 +            gbitab           = _mm_slli_epi32(gbitab,2);
 +
 +            Y                = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,0) );
 +            F                = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,1) );
 +            G                = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,2) );
 +            H                = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,3) );
 +            _MM_TRANSPOSE4_PS(Y,F,G,H);
 +            Heps             = _mm_mul_ps(gbeps,H);
 +            Fp               = _mm_add_ps(F,_mm_mul_ps(gbeps,_mm_add_ps(G,Heps)));
 +            VV               = _mm_add_ps(Y,_mm_mul_ps(gbeps,Fp));
 +            vgb              = _mm_mul_ps(gbqqfactor,VV);
 +
 +            FF               = _mm_add_ps(Fp,_mm_mul_ps(gbeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
 +            fgb              = _mm_mul_ps(gbqqfactor,_mm_mul_ps(FF,gbscale));
 +            dvdatmp          = _mm_mul_ps(minushalf,_mm_add_ps(vgb,_mm_mul_ps(fgb,r00)));
++            dvdatmp          = _mm_andnot_ps(dummy_mask,dvdatmp);
 +            dvdasum          = _mm_add_ps(dvdasum,dvdatmp);
 +            /* The pointers to scratch make sure that compilers which take gmx_restrict seriously (e.g. icc 13) cannot miscompile this code. */
 +            fjptrA             = (jnrlistA>=0) ? dvda+jnrA : scratch;
 +            fjptrB             = (jnrlistB>=0) ? dvda+jnrB : scratch;
 +            fjptrC             = (jnrlistC>=0) ? dvda+jnrC : scratch;
 +            fjptrD             = (jnrlistD>=0) ? dvda+jnrD : scratch;
 +            gmx_mm_increment_4real_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,_mm_mul_ps(dvdatmp,_mm_mul_ps(isaj0,isaj0)));
 +            velec            = _mm_mul_ps(qq00,rinv00);
 +            felec            = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(velec,rinv00),fgb),rinv00);
 +
 +            /* CUBIC SPLINE TABLE DISPERSION */
 +            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
 +            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
 +            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
 +            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
 +            _MM_TRANSPOSE4_PS(Y,F,G,H);
 +            Heps             = _mm_mul_ps(vfeps,H);
 +            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
 +            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
 +            fvdw6            = _mm_mul_ps(c6_00,FF);
 +
 +            /* CUBIC SPLINE TABLE REPULSION */
 +            vfitab           = _mm_add_epi32(vfitab,ifour);
 +            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
 +            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
 +            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
 +            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
 +            _MM_TRANSPOSE4_PS(Y,F,G,H);
 +            Heps             = _mm_mul_ps(vfeps,H);
 +            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
 +            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
 +            fvdw12           = _mm_mul_ps(c12_00,FF);
 +            fvdw             = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
 +
 +            fscal            = _mm_add_ps(felec,fvdw);
 +
 +            fscal            = _mm_andnot_ps(dummy_mask,fscal);
 +
 +            /* Calculate temporary vectorial force */
 +            tx               = _mm_mul_ps(fscal,dx00);
 +            ty               = _mm_mul_ps(fscal,dy00);
 +            tz               = _mm_mul_ps(fscal,dz00);
 +
 +            /* Update vectorial force */
 +            fix0             = _mm_add_ps(fix0,tx);
 +            fiy0             = _mm_add_ps(fiy0,ty);
 +            fiz0             = _mm_add_ps(fiz0,tz);
 +
 +            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
 +            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
 +            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
 +            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
 +            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
 +            
 +            /* Inner loop uses 83 flops */
 +        }
 +
 +        /* End of innermost loop */
 +
 +        gmx_mm_update_iforce_1atom_swizzle_ps(fix0,fiy0,fiz0,
 +                                              f+i_coord_offset,fshift+i_shift_offset);
 +
 +        dvdasum = _mm_mul_ps(dvdasum, _mm_mul_ps(isai0,isai0));
 +        gmx_mm_update_1pot_ps(dvdasum,dvda+inr);
 +
 +        /* Increment number of inner iterations */
 +        inneriter                  += j_index_end - j_index_start;
 +
 +        /* Outer loop uses 7 flops */
 +    }
 +
 +    /* Increment number of outer iterations */
 +    outeriter        += nri;
 +
 +    /* Update outer/inner flops */
 +
 +    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_F,outeriter*7 + inneriter*83);
 +}
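Both kernels in this file treat the neighbor-list remainder (fewer than four j atoms left) with the same padding pattern: jjnr is padded with negative indices, the indices are clamped to 0 so the coordinate and parameter loads stay in bounds, per-lane stores for padded entries are redirected to the scratch buffer, and a sign-based lane mask clears the padded lanes before anything is accumulated. A self-contained sketch of the mask step, assuming an SSE2 target; the helper name is hypothetical:

    #include <emmintrin.h>

    /* All-ones in lanes whose neighbor index is negative (padding),
     * zero in real lanes; then clear the padded lanes of val, which is
     * exactly the dummy_mask / _mm_andnot_ps pattern used above.
     */
    static __m128 clear_dummy_lanes(const int *jjnr, int jidx, __m128 val)
    {
        __m128i idx  = _mm_loadu_si128((const __m128i *)(jjnr + jidx));
        __m128  mask = _mm_castsi128_ps(
                           _mm_cmplt_epi32(idx, _mm_setzero_si128()));
        return _mm_andnot_ps(mask, val);
    }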
index 4b8297188884489b5c5e86fc747833e0d359a150,0000000000000000000000000000000000000000..b2c7e57c5ffd6dc20b31b5a6c8023bbf09a535b8
mode 100644,000000..100644
--- /dev/null
@@@ -1,841 -1,0 +1,843 @@@
 +/*
 + * Note: this file was generated by the Gromacs sse2_single kernel generator.
 + *
 + *                This source code is part of
 + *
 + *                 G   R   O   M   A   C   S
 + *
 + * Copyright (c) 2001-2012, The GROMACS Development Team
 + *
 + * Gromacs is a library for molecular simulation and trajectory analysis,
 + * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
 + * a full list of developers and information, check out http://www.gromacs.org
 + *
 + * This program is free software; you can redistribute it and/or modify it under
 + * the terms of the GNU Lesser General Public License as published by the Free
 + * Software Foundation; either version 2 of the License, or (at your option) any
 + * later version.
 + *
 + * To help fund GROMACS development, we humbly ask that you cite
 + * the papers people have written on it - you can find them on the website.
 + */
 +#ifdef HAVE_CONFIG_H
 +#include <config.h>
 +#endif
 +
 +#include <math.h>
 +
 +#include "../nb_kernel.h"
 +#include "types/simple.h"
 +#include "vec.h"
 +#include "nrnb.h"
 +
 +#include "gmx_math_x86_sse2_single.h"
 +#include "kernelutil_x86_sse2_single.h"
 +
 +/*
 + * Gromacs nonbonded kernel:   nb_kernel_ElecGB_VdwLJ_GeomP1P1_VF_sse2_single
 + * Electrostatics interaction: GeneralizedBorn
 + * VdW interaction:            LennardJones
 + * Geometry:                   Particle-Particle
 + * Calculate force/pot:        PotentialAndForce
 + */
 +void
 +nb_kernel_ElecGB_VdwLJ_GeomP1P1_VF_sse2_single
 +                    (t_nblist * gmx_restrict                nlist,
 +                     rvec * gmx_restrict                    xx,
 +                     rvec * gmx_restrict                    ff,
 +                     t_forcerec * gmx_restrict              fr,
 +                     t_mdatoms * gmx_restrict               mdatoms,
 +                     nb_kernel_data_t * gmx_restrict        kernel_data,
 +                     t_nrnb * gmx_restrict                  nrnb)
 +{
 +    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
 +     * just 0 for non-waters.
 +     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
 +     * jnr indices corresponding to data put in the four positions in the SIMD register.
 +     */
 +    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
 +    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
 +    int              jnrA,jnrB,jnrC,jnrD;
 +    int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
 +    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
 +    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
 +    real             rcutoff_scalar;
 +    real             *shiftvec,*fshift,*x,*f;
 +    real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
 +    real             scratch[4*DIM];
 +    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
 +    int              vdwioffset0;
 +    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
 +    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
 +    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
 +    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
 +    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
 +    real             *charge;
 +    __m128i          gbitab;
 +    __m128           vgb,fgb,vgbsum,dvdasum,gbscale,gbtabscale,isaprod,gbqqfactor,gbinvepsdiff,gbeps,dvdatmp;
 +    __m128           minushalf = _mm_set1_ps(-0.5);
 +    real             *invsqrta,*dvda,*gbtab;
 +    int              nvdwtype;
 +    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
 +    int              *vdwtype;
 +    real             *vdwparam;
 +    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
 +    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
 +    __m128i          vfitab;
 +    __m128i          ifour       = _mm_set1_epi32(4);
 +    __m128           rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
 +    real             *vftab;
 +    __m128           dummy_mask,cutoff_mask;
 +    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
 +    __m128           one     = _mm_set1_ps(1.0);
 +    __m128           two     = _mm_set1_ps(2.0);
 +    x                = xx[0];
 +    f                = ff[0];
 +
 +    nri              = nlist->nri;
 +    iinr             = nlist->iinr;
 +    jindex           = nlist->jindex;
 +    jjnr             = nlist->jjnr;
 +    shiftidx         = nlist->shift;
 +    gid              = nlist->gid;
 +    shiftvec         = fr->shift_vec[0];
 +    fshift           = fr->fshift[0];
 +    facel            = _mm_set1_ps(fr->epsfac);
 +    charge           = mdatoms->chargeA;
 +    nvdwtype         = fr->ntype;
 +    vdwparam         = fr->nbfp;
 +    vdwtype          = mdatoms->typeA;
 +
 +    invsqrta         = fr->invsqrta;
 +    dvda             = fr->dvda;
 +    gbtabscale       = _mm_set1_ps(fr->gbtab.scale);
 +    gbtab            = fr->gbtab.data;
 +    gbinvepsdiff     = _mm_set1_ps((1.0/fr->epsilon_r) - (1.0/fr->gb_epsilon_solvent));
 +
 +    /* Avoid stupid compiler warnings */
 +    jnrA = jnrB = jnrC = jnrD = 0;
 +    j_coord_offsetA = 0;
 +    j_coord_offsetB = 0;
 +    j_coord_offsetC = 0;
 +    j_coord_offsetD = 0;
 +
 +    outeriter        = 0;
 +    inneriter        = 0;
 +
 +    for(iidx=0;iidx<4*DIM;iidx++)
 +    {
 +        scratch[iidx] = 0.0;
 +    }  
 +
 +    /* Start outer loop over neighborlists */
 +    for(iidx=0; iidx<nri; iidx++)
 +    {
 +        /* Load shift vector for this list */
 +        i_shift_offset   = DIM*shiftidx[iidx];
 +
 +        /* Load limits for loop over neighbors */
 +        j_index_start    = jindex[iidx];
 +        j_index_end      = jindex[iidx+1];
 +
 +        /* Get outer coordinate index */
 +        inr              = iinr[iidx];
 +        i_coord_offset   = DIM*inr;
 +
 +        /* Load i particle coords and add shift vector */
 +        gmx_mm_load_shift_and_1rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
 +        
 +        fix0             = _mm_setzero_ps();
 +        fiy0             = _mm_setzero_ps();
 +        fiz0             = _mm_setzero_ps();
 +
 +        /* Load parameters for i particles */
 +        iq0              = _mm_mul_ps(facel,_mm_load1_ps(charge+inr+0));
 +        isai0            = _mm_load1_ps(invsqrta+inr+0);
 +        vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
 +
 +        /* Reset potential sums */
 +        velecsum         = _mm_setzero_ps();
 +        vgbsum           = _mm_setzero_ps();
 +        vvdwsum          = _mm_setzero_ps();
 +        dvdasum          = _mm_setzero_ps();
 +
 +        /* Start inner kernel loop */
 +        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
 +        {
 +
 +            /* Get j neighbor index, and coordinate index */
 +            jnrA             = jjnr[jidx];
 +            jnrB             = jjnr[jidx+1];
 +            jnrC             = jjnr[jidx+2];
 +            jnrD             = jjnr[jidx+3];
 +            j_coord_offsetA  = DIM*jnrA;
 +            j_coord_offsetB  = DIM*jnrB;
 +            j_coord_offsetC  = DIM*jnrC;
 +            j_coord_offsetD  = DIM*jnrD;
 +
 +            /* load j atom coordinates */
 +            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
 +                                              x+j_coord_offsetC,x+j_coord_offsetD,
 +                                              &jx0,&jy0,&jz0);
 +
 +            /* Calculate displacement vector */
 +            dx00             = _mm_sub_ps(ix0,jx0);
 +            dy00             = _mm_sub_ps(iy0,jy0);
 +            dz00             = _mm_sub_ps(iz0,jz0);
 +
 +            /* Calculate squared distance and things based on it */
 +            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
 +
 +            rinv00           = gmx_mm_invsqrt_ps(rsq00);
 +
 +            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
 +
 +            /* Load parameters for j particles */
 +            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
 +                                                              charge+jnrC+0,charge+jnrD+0);
 +            isaj0            = gmx_mm_load_4real_swizzle_ps(invsqrta+jnrA+0,invsqrta+jnrB+0,
 +                                                              invsqrta+jnrC+0,invsqrta+jnrD+0);
 +            vdwjidx0A        = 2*vdwtype[jnrA+0];
 +            vdwjidx0B        = 2*vdwtype[jnrB+0];
 +            vdwjidx0C        = 2*vdwtype[jnrC+0];
 +            vdwjidx0D        = 2*vdwtype[jnrD+0];
 +
 +            /**************************
 +             * CALCULATE INTERACTIONS *
 +             **************************/
 +
 +            r00              = _mm_mul_ps(rsq00,rinv00);
 +
 +            /* Compute parameters for interactions between i and j atoms */
 +            qq00             = _mm_mul_ps(iq0,jq0);
 +            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
 +                                         vdwparam+vdwioffset0+vdwjidx0B,
 +                                         vdwparam+vdwioffset0+vdwjidx0C,
 +                                         vdwparam+vdwioffset0+vdwjidx0D,
 +                                         &c6_00,&c12_00);
 +
 +            /* GENERALIZED BORN AND COULOMB ELECTROSTATICS */
 +            isaprod          = _mm_mul_ps(isai0,isaj0);
 +            gbqqfactor       = _mm_xor_ps(signbit,_mm_mul_ps(qq00,_mm_mul_ps(isaprod,gbinvepsdiff)));
 +            gbscale          = _mm_mul_ps(isaprod,gbtabscale);
 +
 +            /* Calculate generalized born table index - this is a separate table from the normal one,
 +             * but we use the same procedure by multiplying r with scale and truncating to integer.
 +             */
 +            rt               = _mm_mul_ps(r00,gbscale);
 +            gbitab           = _mm_cvttps_epi32(rt);
 +            gbeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(gbitab));
 +            gbitab           = _mm_slli_epi32(gbitab,2);
 +
 +            Y                = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,0) );
 +            F                = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,1) );
 +            G                = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,2) );
 +            H                = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,3) );
 +            _MM_TRANSPOSE4_PS(Y,F,G,H);
 +            Heps             = _mm_mul_ps(gbeps,H);
 +            Fp               = _mm_add_ps(F,_mm_mul_ps(gbeps,_mm_add_ps(G,Heps)));
 +            VV               = _mm_add_ps(Y,_mm_mul_ps(gbeps,Fp));
 +            vgb              = _mm_mul_ps(gbqqfactor,VV);
 +
 +            FF               = _mm_add_ps(Fp,_mm_mul_ps(gbeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
 +            fgb              = _mm_mul_ps(gbqqfactor,_mm_mul_ps(FF,gbscale));
 +            dvdatmp          = _mm_mul_ps(minushalf,_mm_add_ps(vgb,_mm_mul_ps(fgb,r00)));
 +            dvdasum          = _mm_add_ps(dvdasum,dvdatmp);
 +            fjptrA           = dvda+jnrA;
 +            fjptrB           = dvda+jnrB;
 +            fjptrC           = dvda+jnrC;
 +            fjptrD           = dvda+jnrD;
 +            gmx_mm_increment_4real_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,_mm_mul_ps(dvdatmp,_mm_mul_ps(isaj0,isaj0)));
 +            velec            = _mm_mul_ps(qq00,rinv00);
 +            felec            = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(velec,rinv00),fgb),rinv00);
 +
 +            /* LENNARD-JONES DISPERSION/REPULSION */
 +
 +            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
 +            vvdw6            = _mm_mul_ps(c6_00,rinvsix);
 +            vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
 +            vvdw             = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
 +            fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
 +
 +            /* Update potential sum for this i atom from the interaction with this j atom. */
 +            velecsum         = _mm_add_ps(velecsum,velec);
 +            vgbsum           = _mm_add_ps(vgbsum,vgb);
 +            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
 +
 +            fscal            = _mm_add_ps(felec,fvdw);
 +
 +            /* Calculate temporary vectorial force */
 +            tx               = _mm_mul_ps(fscal,dx00);
 +            ty               = _mm_mul_ps(fscal,dy00);
 +            tz               = _mm_mul_ps(fscal,dz00);
 +
 +            /* Update vectorial force */
 +            fix0             = _mm_add_ps(fix0,tx);
 +            fiy0             = _mm_add_ps(fiy0,ty);
 +            fiz0             = _mm_add_ps(fiz0,tz);
 +
 +            fjptrA             = f+j_coord_offsetA;
 +            fjptrB             = f+j_coord_offsetB;
 +            fjptrC             = f+j_coord_offsetC;
 +            fjptrD             = f+j_coord_offsetD;
 +            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
 +            
 +            /* Inner loop uses 71 flops */
 +        }
 +
 +        if(jidx<j_index_end)
 +        {
 +
 +            /* Get j neighbor index, and coordinate index */
 +            jnrlistA         = jjnr[jidx];
 +            jnrlistB         = jjnr[jidx+1];
 +            jnrlistC         = jjnr[jidx+2];
 +            jnrlistD         = jjnr[jidx+3];
 +            /* Sign of each element will be negative for non-real atoms.
 +             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
 +             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
 +             */
 +            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 +            jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
 +            jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
 +            jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
 +            jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
 +            j_coord_offsetA  = DIM*jnrA;
 +            j_coord_offsetB  = DIM*jnrB;
 +            j_coord_offsetC  = DIM*jnrC;
 +            j_coord_offsetD  = DIM*jnrD;
 +
 +            /* load j atom coordinates */
 +            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
 +                                              x+j_coord_offsetC,x+j_coord_offsetD,
 +                                              &jx0,&jy0,&jz0);
 +
 +            /* Calculate displacement vector */
 +            dx00             = _mm_sub_ps(ix0,jx0);
 +            dy00             = _mm_sub_ps(iy0,jy0);
 +            dz00             = _mm_sub_ps(iz0,jz0);
 +
 +            /* Calculate squared distance and things based on it */
 +            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
 +
 +            rinv00           = gmx_mm_invsqrt_ps(rsq00);
 +
 +            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
 +
 +            /* Load parameters for j particles */
 +            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
 +                                                              charge+jnrC+0,charge+jnrD+0);
 +            isaj0            = gmx_mm_load_4real_swizzle_ps(invsqrta+jnrA+0,invsqrta+jnrB+0,
 +                                                              invsqrta+jnrC+0,invsqrta+jnrD+0);
 +            vdwjidx0A        = 2*vdwtype[jnrA+0];
 +            vdwjidx0B        = 2*vdwtype[jnrB+0];
 +            vdwjidx0C        = 2*vdwtype[jnrC+0];
 +            vdwjidx0D        = 2*vdwtype[jnrD+0];
 +
 +            /**************************
 +             * CALCULATE INTERACTIONS *
 +             **************************/
 +
 +            r00              = _mm_mul_ps(rsq00,rinv00);
 +            r00              = _mm_andnot_ps(dummy_mask,r00);
 +
 +            /* Compute parameters for interactions between i and j atoms */
 +            qq00             = _mm_mul_ps(iq0,jq0);
 +            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
 +                                         vdwparam+vdwioffset0+vdwjidx0B,
 +                                         vdwparam+vdwioffset0+vdwjidx0C,
 +                                         vdwparam+vdwioffset0+vdwjidx0D,
 +                                         &c6_00,&c12_00);
 +
 +            /* GENERALIZED BORN AND COULOMB ELECTROSTATICS */
 +            isaprod          = _mm_mul_ps(isai0,isaj0);
 +            gbqqfactor       = _mm_xor_ps(signbit,_mm_mul_ps(qq00,_mm_mul_ps(isaprod,gbinvepsdiff)));
 +            gbscale          = _mm_mul_ps(isaprod,gbtabscale);
 +
 +            /* Calculate generalized born table index - this is a separate table from the normal one,
 +             * but we use the same procedure by multiplying r with scale and truncating to integer.
 +             */
 +            rt               = _mm_mul_ps(r00,gbscale);
 +            gbitab           = _mm_cvttps_epi32(rt);
 +            gbeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(gbitab));
 +            gbitab           = _mm_slli_epi32(gbitab,2);
 +
 +            Y                = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,0) );
 +            F                = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,1) );
 +            G                = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,2) );
 +            H                = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,3) );
 +            _MM_TRANSPOSE4_PS(Y,F,G,H);
 +            Heps             = _mm_mul_ps(gbeps,H);
 +            Fp               = _mm_add_ps(F,_mm_mul_ps(gbeps,_mm_add_ps(G,Heps)));
 +            VV               = _mm_add_ps(Y,_mm_mul_ps(gbeps,Fp));
 +            vgb              = _mm_mul_ps(gbqqfactor,VV);
 +
 +            FF               = _mm_add_ps(Fp,_mm_mul_ps(gbeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
 +            fgb              = _mm_mul_ps(gbqqfactor,_mm_mul_ps(FF,gbscale));
 +            dvdatmp          = _mm_mul_ps(minushalf,_mm_add_ps(vgb,_mm_mul_ps(fgb,r00)));
++            dvdatmp          = _mm_andnot_ps(dummy_mask,dvdatmp);
 +            dvdasum          = _mm_add_ps(dvdasum,dvdatmp);
 +            /* The pointers to scratch make sure that compilers which take gmx_restrict seriously (e.g. icc 13) cannot miscompile this code. */
 +            fjptrA             = (jnrlistA>=0) ? dvda+jnrA : scratch;
 +            fjptrB             = (jnrlistB>=0) ? dvda+jnrB : scratch;
 +            fjptrC             = (jnrlistC>=0) ? dvda+jnrC : scratch;
 +            fjptrD             = (jnrlistD>=0) ? dvda+jnrD : scratch;
 +            gmx_mm_increment_4real_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,_mm_mul_ps(dvdatmp,_mm_mul_ps(isaj0,isaj0)));
 +            velec            = _mm_mul_ps(qq00,rinv00);
 +            felec            = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(velec,rinv00),fgb),rinv00);
 +
 +            /* LENNARD-JONES DISPERSION/REPULSION */
 +
 +            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
 +            vvdw6            = _mm_mul_ps(c6_00,rinvsix);
 +            vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
 +            vvdw             = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
 +            fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
 +
 +            /* Update potential sum for this i atom from the interaction with this j atom. */
 +            velec            = _mm_andnot_ps(dummy_mask,velec);
 +            velecsum         = _mm_add_ps(velecsum,velec);
 +            vgb              = _mm_andnot_ps(dummy_mask,vgb);
 +            vgbsum           = _mm_add_ps(vgbsum,vgb);
 +            vvdw             = _mm_andnot_ps(dummy_mask,vvdw);
 +            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
 +
 +            fscal            = _mm_add_ps(felec,fvdw);
 +
 +            fscal            = _mm_andnot_ps(dummy_mask,fscal);
 +
 +            /* Calculate temporary vectorial force */
 +            tx               = _mm_mul_ps(fscal,dx00);
 +            ty               = _mm_mul_ps(fscal,dy00);
 +            tz               = _mm_mul_ps(fscal,dz00);
 +
 +            /* Update vectorial force */
 +            fix0             = _mm_add_ps(fix0,tx);
 +            fiy0             = _mm_add_ps(fiy0,ty);
 +            fiz0             = _mm_add_ps(fiz0,tz);
 +
 +            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
 +            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
 +            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
 +            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
 +            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
 +            
 +            /* Inner loop uses 72 flops */
 +        }
 +
 +        /* End of innermost loop */
 +
 +        gmx_mm_update_iforce_1atom_swizzle_ps(fix0,fiy0,fiz0,
 +                                              f+i_coord_offset,fshift+i_shift_offset);
 +
 +        ggid                        = gid[iidx];
 +        /* Update potential energies */
 +        gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
 +        gmx_mm_update_1pot_ps(vgbsum,kernel_data->energygrp_polarization+ggid);
 +        gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
 +        dvdasum = _mm_mul_ps(dvdasum, _mm_mul_ps(isai0,isai0));
 +        gmx_mm_update_1pot_ps(dvdasum,dvda+inr);
 +
 +        /* Increment number of inner iterations */
 +        inneriter                  += j_index_end - j_index_start;
 +
 +        /* Outer loop uses 10 flops */
 +    }
 +
 +    /* Increment number of outer iterations */
 +    outeriter        += nri;
 +
 +    /* Update outer/inner flops */
 +
 +    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_VF,outeriter*10 + inneriter*72);
 +}
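In the Lennard-Jones branch above, vvdw is assembled as vvdw12/12 - vvdw6/6 while fvdw uses (vvdw12 - vvdw6)*rinvsq; both match V = C12*r^-12 - C6*r^-6 only if the c6/c12 pairs loaded from vdwparam are premultiplied by 6 and 12, which is how these kernels appear to store them. For reference, a scalar sketch with plain C6/C12 and the factors written out explicitly (hypothetical helper name):

    /* Plain 12-6 Lennard-Jones: potential and the scalar factor that
     * multiplies the displacement vector (fscal = -dV/dr * 1/r).
     */
    static void lj_12_6(float c6, float c12, float rinvsq,
                        float *vvdw, float *fscal)
    {
        float rinvsix = rinvsq*rinvsq*rinvsq;    /* r^-6      */
        float v6      = c6*rinvsix;              /* C6*r^-6   */
        float v12     = c12*rinvsix*rinvsix;     /* C12*r^-12 */
        *vvdw  = v12 - v6;
        *fscal = (12.0f*v12 - 6.0f*v6)*rinvsq;
    }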
 +/*
 + * Gromacs nonbonded kernel:   nb_kernel_ElecGB_VdwLJ_GeomP1P1_F_sse2_single
 + * Electrostatics interaction: GeneralizedBorn
 + * VdW interaction:            LennardJones
 + * Geometry:                   Particle-Particle
 + * Calculate force/pot:        Force
 + */
 +void
 +nb_kernel_ElecGB_VdwLJ_GeomP1P1_F_sse2_single
 +                    (t_nblist * gmx_restrict                nlist,
 +                     rvec * gmx_restrict                    xx,
 +                     rvec * gmx_restrict                    ff,
 +                     t_forcerec * gmx_restrict              fr,
 +                     t_mdatoms * gmx_restrict               mdatoms,
 +                     nb_kernel_data_t * gmx_restrict        kernel_data,
 +                     t_nrnb * gmx_restrict                  nrnb)
 +{
 +    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
 +     * just 0 for non-waters.
 +     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
 +     * jnr indices corresponding to data put in the four positions in the SIMD register.
 +     */
 +    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
 +    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
 +    int              jnrA,jnrB,jnrC,jnrD;
 +    int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
 +    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
 +    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
 +    real             rcutoff_scalar;
 +    real             *shiftvec,*fshift,*x,*f;
 +    real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
 +    real             scratch[4*DIM];
 +    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
 +    int              vdwioffset0;
 +    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
 +    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
 +    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
 +    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
 +    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
 +    real             *charge;
 +    __m128i          gbitab;
 +    __m128           vgb,fgb,vgbsum,dvdasum,gbscale,gbtabscale,isaprod,gbqqfactor,gbinvepsdiff,gbeps,dvdatmp;
 +    __m128           minushalf = _mm_set1_ps(-0.5);
 +    real             *invsqrta,*dvda,*gbtab;
 +    int              nvdwtype;
 +    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
 +    int              *vdwtype;
 +    real             *vdwparam;
 +    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
 +    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
 +    __m128i          vfitab;
 +    __m128i          ifour       = _mm_set1_epi32(4);
 +    __m128           rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
 +    real             *vftab;
 +    __m128           dummy_mask,cutoff_mask;
 +    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
 +    __m128           one     = _mm_set1_ps(1.0);
 +    __m128           two     = _mm_set1_ps(2.0);
 +    x                = xx[0];
 +    f                = ff[0];
 +
 +    nri              = nlist->nri;
 +    iinr             = nlist->iinr;
 +    jindex           = nlist->jindex;
 +    jjnr             = nlist->jjnr;
 +    shiftidx         = nlist->shift;
 +    gid              = nlist->gid;
 +    shiftvec         = fr->shift_vec[0];
 +    fshift           = fr->fshift[0];
 +    facel            = _mm_set1_ps(fr->epsfac);
 +    charge           = mdatoms->chargeA;
 +    nvdwtype         = fr->ntype;
 +    vdwparam         = fr->nbfp;
 +    vdwtype          = mdatoms->typeA;
 +
 +    invsqrta         = fr->invsqrta;
 +    dvda             = fr->dvda;
 +    gbtabscale       = _mm_set1_ps(fr->gbtab.scale);
 +    gbtab            = fr->gbtab.data;
 +    gbinvepsdiff     = _mm_set1_ps((1.0/fr->epsilon_r) - (1.0/fr->gb_epsilon_solvent));
 +
 +    /* Avoid stupid compiler warnings */
 +    jnrA = jnrB = jnrC = jnrD = 0;
 +    j_coord_offsetA = 0;
 +    j_coord_offsetB = 0;
 +    j_coord_offsetC = 0;
 +    j_coord_offsetD = 0;
 +
 +    outeriter        = 0;
 +    inneriter        = 0;
 +
 +    for(iidx=0;iidx<4*DIM;iidx++)
 +    {
 +        scratch[iidx] = 0.0;
 +    }  
 +
 +    /* Start outer loop over neighborlists */
 +    for(iidx=0; iidx<nri; iidx++)
 +    {
 +        /* Load shift vector for this list */
 +        i_shift_offset   = DIM*shiftidx[iidx];
 +
 +        /* Load limits for loop over neighbors */
 +        j_index_start    = jindex[iidx];
 +        j_index_end      = jindex[iidx+1];
 +
 +        /* Get outer coordinate index */
 +        inr              = iinr[iidx];
 +        i_coord_offset   = DIM*inr;
 +
 +        /* Load i particle coords and add shift vector */
 +        gmx_mm_load_shift_and_1rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
 +        
 +        fix0             = _mm_setzero_ps();
 +        fiy0             = _mm_setzero_ps();
 +        fiz0             = _mm_setzero_ps();
 +
 +        /* Load parameters for i particles */
 +        iq0              = _mm_mul_ps(facel,_mm_load1_ps(charge+inr+0));
 +        isai0            = _mm_load1_ps(invsqrta+inr+0);
 +        vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
 +
 +        dvdasum          = _mm_setzero_ps();
 +
 +        /* Start inner kernel loop */
 +        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
 +        {
 +
 +            /* Get j neighbor index, and coordinate index */
 +            jnrA             = jjnr[jidx];
 +            jnrB             = jjnr[jidx+1];
 +            jnrC             = jjnr[jidx+2];
 +            jnrD             = jjnr[jidx+3];
 +            j_coord_offsetA  = DIM*jnrA;
 +            j_coord_offsetB  = DIM*jnrB;
 +            j_coord_offsetC  = DIM*jnrC;
 +            j_coord_offsetD  = DIM*jnrD;
 +
 +            /* load j atom coordinates */
 +            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
 +                                              x+j_coord_offsetC,x+j_coord_offsetD,
 +                                              &jx0,&jy0,&jz0);
 +
 +            /* Calculate displacement vector */
 +            dx00             = _mm_sub_ps(ix0,jx0);
 +            dy00             = _mm_sub_ps(iy0,jy0);
 +            dz00             = _mm_sub_ps(iz0,jz0);
 +
 +            /* Calculate squared distance and things based on it */
 +            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
 +
 +            rinv00           = gmx_mm_invsqrt_ps(rsq00);
 +
 +            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
 +
 +            /* Load parameters for j particles */
 +            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
 +                                                              charge+jnrC+0,charge+jnrD+0);
 +            isaj0            = gmx_mm_load_4real_swizzle_ps(invsqrta+jnrA+0,invsqrta+jnrB+0,
 +                                                              invsqrta+jnrC+0,invsqrta+jnrD+0);
 +            vdwjidx0A        = 2*vdwtype[jnrA+0];
 +            vdwjidx0B        = 2*vdwtype[jnrB+0];
 +            vdwjidx0C        = 2*vdwtype[jnrC+0];
 +            vdwjidx0D        = 2*vdwtype[jnrD+0];
 +
 +            /**************************
 +             * CALCULATE INTERACTIONS *
 +             **************************/
 +
 +            r00              = _mm_mul_ps(rsq00,rinv00);
 +
 +            /* Compute parameters for interactions between i and j atoms */
 +            qq00             = _mm_mul_ps(iq0,jq0);
 +            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
 +                                         vdwparam+vdwioffset0+vdwjidx0B,
 +                                         vdwparam+vdwioffset0+vdwjidx0C,
 +                                         vdwparam+vdwioffset0+vdwjidx0D,
 +                                         &c6_00,&c12_00);
 +
 +            /* GENERALIZED BORN AND COULOMB ELECTROSTATICS */
 +            isaprod          = _mm_mul_ps(isai0,isaj0);
 +            gbqqfactor       = _mm_xor_ps(signbit,_mm_mul_ps(qq00,_mm_mul_ps(isaprod,gbinvepsdiff)));
 +            gbscale          = _mm_mul_ps(isaprod,gbtabscale);
 +
 +            /* Calculate generalized born table index - this is a separate table from the normal one,
 +             * but we use the same procedure by multiplying r by scale and truncating to an integer.
 +             */
 +            rt               = _mm_mul_ps(r00,gbscale);
 +            gbitab           = _mm_cvttps_epi32(rt);
 +            gbeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(gbitab));
 +            gbitab           = _mm_slli_epi32(gbitab,2);
 +
 +            Y                = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,0) );
 +            F                = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,1) );
 +            G                = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,2) );
 +            H                = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,3) );
 +            _MM_TRANSPOSE4_PS(Y,F,G,H);
 +            Heps             = _mm_mul_ps(gbeps,H);
 +            Fp               = _mm_add_ps(F,_mm_mul_ps(gbeps,_mm_add_ps(G,Heps)));
 +            VV               = _mm_add_ps(Y,_mm_mul_ps(gbeps,Fp));
 +            vgb              = _mm_mul_ps(gbqqfactor,VV);
 +
 +            FF               = _mm_add_ps(Fp,_mm_mul_ps(gbeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
 +            fgb              = _mm_mul_ps(gbqqfactor,_mm_mul_ps(FF,gbscale));
 +            dvdatmp          = _mm_mul_ps(minushalf,_mm_add_ps(vgb,_mm_mul_ps(fgb,r00)));
 +            dvdasum          = _mm_add_ps(dvdasum,dvdatmp);
 +            fjptrA           = dvda+jnrA;
 +            fjptrB           = dvda+jnrB;
 +            fjptrC           = dvda+jnrC;
 +            fjptrD           = dvda+jnrD;
 +            gmx_mm_increment_4real_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,_mm_mul_ps(dvdatmp,_mm_mul_ps(isaj0,isaj0)));
 +            velec            = _mm_mul_ps(qq00,rinv00);
 +            felec            = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(velec,rinv00),fgb),rinv00);
 +
 +            /* LENNARD-JONES DISPERSION/REPULSION */
 +
 +            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
 +            fvdw             = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
 +
 +            fscal            = _mm_add_ps(felec,fvdw);
 +
 +            /* Calculate temporary vectorial force */
 +            tx               = _mm_mul_ps(fscal,dx00);
 +            ty               = _mm_mul_ps(fscal,dy00);
 +            tz               = _mm_mul_ps(fscal,dz00);
 +
 +            /* Update vectorial force */
 +            fix0             = _mm_add_ps(fix0,tx);
 +            fiy0             = _mm_add_ps(fiy0,ty);
 +            fiz0             = _mm_add_ps(fiz0,tz);
 +
 +            fjptrA             = f+j_coord_offsetA;
 +            fjptrB             = f+j_coord_offsetB;
 +            fjptrC             = f+j_coord_offsetC;
 +            fjptrD             = f+j_coord_offsetD;
 +            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
 +            
 +            /* Inner loop uses 64 flops */
 +        }
 +
 +        if(jidx<j_index_end)
 +        {
 +
 +            /* Get j neighbor index, and coordinate index */
 +            jnrlistA         = jjnr[jidx];
 +            jnrlistB         = jjnr[jidx+1];
 +            jnrlistC         = jjnr[jidx+2];
 +            jnrlistD         = jjnr[jidx+3];
 +            /* Sign of each element will be negative for non-real atoms.
 +             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
 +             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
 +             */
 +            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 +            jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
 +            jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
 +            jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
 +            jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
 +            j_coord_offsetA  = DIM*jnrA;
 +            j_coord_offsetB  = DIM*jnrB;
 +            j_coord_offsetC  = DIM*jnrC;
 +            j_coord_offsetD  = DIM*jnrD;
 +
 +            /* load j atom coordinates */
 +            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
 +                                              x+j_coord_offsetC,x+j_coord_offsetD,
 +                                              &jx0,&jy0,&jz0);
 +
 +            /* Calculate displacement vector */
 +            dx00             = _mm_sub_ps(ix0,jx0);
 +            dy00             = _mm_sub_ps(iy0,jy0);
 +            dz00             = _mm_sub_ps(iz0,jz0);
 +
 +            /* Calculate squared distance and things based on it */
 +            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
 +
 +            rinv00           = gmx_mm_invsqrt_ps(rsq00);
 +
 +            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
 +
 +            /* Load parameters for j particles */
 +            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
 +                                                              charge+jnrC+0,charge+jnrD+0);
 +            isaj0            = gmx_mm_load_4real_swizzle_ps(invsqrta+jnrA+0,invsqrta+jnrB+0,
 +                                                              invsqrta+jnrC+0,invsqrta+jnrD+0);
 +            vdwjidx0A        = 2*vdwtype[jnrA+0];
 +            vdwjidx0B        = 2*vdwtype[jnrB+0];
 +            vdwjidx0C        = 2*vdwtype[jnrC+0];
 +            vdwjidx0D        = 2*vdwtype[jnrD+0];
 +
 +            /**************************
 +             * CALCULATE INTERACTIONS *
 +             **************************/
 +
 +            r00              = _mm_mul_ps(rsq00,rinv00);
 +            r00              = _mm_andnot_ps(dummy_mask,r00);
 +
 +            /* Compute parameters for interactions between i and j atoms */
 +            qq00             = _mm_mul_ps(iq0,jq0);
 +            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
 +                                         vdwparam+vdwioffset0+vdwjidx0B,
 +                                         vdwparam+vdwioffset0+vdwjidx0C,
 +                                         vdwparam+vdwioffset0+vdwjidx0D,
 +                                         &c6_00,&c12_00);
 +
 +            /* GENERALIZED BORN AND COULOMB ELECTROSTATICS */
 +            isaprod          = _mm_mul_ps(isai0,isaj0);
 +            gbqqfactor       = _mm_xor_ps(signbit,_mm_mul_ps(qq00,_mm_mul_ps(isaprod,gbinvepsdiff)));
 +            gbscale          = _mm_mul_ps(isaprod,gbtabscale);
 +
 +            /* Calculate generalized born table index - this is a separate table from the normal one,
 +             * but we use the same procedure by multiplying r by scale and truncating to an integer.
 +             */
 +            rt               = _mm_mul_ps(r00,gbscale);
 +            gbitab           = _mm_cvttps_epi32(rt);
 +            gbeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(gbitab));
 +            gbitab           = _mm_slli_epi32(gbitab,2);
 +
 +            Y                = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,0) );
 +            F                = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,1) );
 +            G                = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,2) );
 +            H                = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,3) );
 +            _MM_TRANSPOSE4_PS(Y,F,G,H);
 +            Heps             = _mm_mul_ps(gbeps,H);
 +            Fp               = _mm_add_ps(F,_mm_mul_ps(gbeps,_mm_add_ps(G,Heps)));
 +            VV               = _mm_add_ps(Y,_mm_mul_ps(gbeps,Fp));
 +            vgb              = _mm_mul_ps(gbqqfactor,VV);
 +
 +            FF               = _mm_add_ps(Fp,_mm_mul_ps(gbeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
 +            fgb              = _mm_mul_ps(gbqqfactor,_mm_mul_ps(FF,gbscale));
 +            dvdatmp          = _mm_mul_ps(minushalf,_mm_add_ps(vgb,_mm_mul_ps(fgb,r00)));
++            dvdatmp          = _mm_andnot_ps(dummy_mask,dvdatmp);
 +            dvdasum          = _mm_add_ps(dvdasum,dvdatmp);
 +            /* The stores through the scratch pointers keep the gmx_restrict promises honest, so even compilers that exploit gmx_restrict aggressively (e.g. icc 13) cannot miscompile this code. */
 +            fjptrA             = (jnrlistA>=0) ? dvda+jnrA : scratch;
 +            fjptrB             = (jnrlistB>=0) ? dvda+jnrB : scratch;
 +            fjptrC             = (jnrlistC>=0) ? dvda+jnrC : scratch;
 +            fjptrD             = (jnrlistD>=0) ? dvda+jnrD : scratch;
 +            gmx_mm_increment_4real_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,_mm_mul_ps(dvdatmp,_mm_mul_ps(isaj0,isaj0)));
 +            velec            = _mm_mul_ps(qq00,rinv00);
 +            felec            = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(velec,rinv00),fgb),rinv00);
 +
 +            /* LENNARD-JONES DISPERSION/REPULSION */
 +
 +            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
 +            fvdw             = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
 +
 +            fscal            = _mm_add_ps(felec,fvdw);
 +
 +            fscal            = _mm_andnot_ps(dummy_mask,fscal);
 +
 +            /* Calculate temporary vectorial force */
 +            tx               = _mm_mul_ps(fscal,dx00);
 +            ty               = _mm_mul_ps(fscal,dy00);
 +            tz               = _mm_mul_ps(fscal,dz00);
 +
 +            /* Update vectorial force */
 +            fix0             = _mm_add_ps(fix0,tx);
 +            fiy0             = _mm_add_ps(fiy0,ty);
 +            fiz0             = _mm_add_ps(fiz0,tz);
 +
 +            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
 +            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
 +            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
 +            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
 +            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
 +            
 +            /* Inner loop uses 65 flops */
 +        }
 +
 +        /* End of innermost loop */
 +
 +        gmx_mm_update_iforce_1atom_swizzle_ps(fix0,fiy0,fiz0,
 +                                              f+i_coord_offset,fshift+i_shift_offset);
 +
 +        dvdasum = _mm_mul_ps(dvdasum, _mm_mul_ps(isai0,isai0));
 +        gmx_mm_update_1pot_ps(dvdasum,dvda+inr);
 +
 +        /* Increment number of inner iterations */
 +        inneriter                  += j_index_end - j_index_start;
 +
 +        /* Outer loop uses 7 flops */
 +    }
 +
 +    /* Increment number of outer iterations */
 +    outeriter        += nri;
 +
 +    /* Update outer/inner flops */
 +
 +    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_F,outeriter*7 + inneriter*65);
 +}
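
The force-only kernel above folds the whole Lennard-Jones interaction into two multiplies and a subtract: fvdw = (c12*rinv6 - c6)*rinv6*rinvsq. No explicit factors 6 and 12 appear, which, together with the one_sixth/one_twelfth constants declared for the potential path, suggests that the c6/c12 values in fr->nbfp arrive premultiplied. A scalar sketch under that assumption (hypothetical helper, not GROMACS API):

/* F(r)/r for Lennard-Jones, mirroring the fvdw line in the kernel above.
 * Assumes c6 = 6*C6 and c12 = 12*C12, as a premultiplied parameter
 * table would provide them.
 */
static float
lj_fscal(float rinvsq, float c6, float c12)
{
    float rinvsix = rinvsq*rinvsq*rinvsq;      /* r^-6                           */
    return (c12*rinvsix - c6)*rinvsix*rinvsq;  /* (12*C12*r^-12 - 6*C6*r^-6)/r^2 */
}

The resulting fscal multiplies the displacement components dx,dy,dz; the products are added to the i-atom force and subtracted from the four j atoms, which is Newton's third law expressed by the decrement call.
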
index cdc49ea118d005ee215e4a18c7c5e8df8c114c82,0000000000000000000000000000000000000000..dd490338e1381a2bbeda3add3de48b257e799643
mode 100644,000000..100644
--- /dev/null
@@@ -1,746 -1,0 +1,748 @@@
 +/*
 + * Note: this file was generated by the Gromacs sse2_single kernel generator.
 + *
 + *                This source code is part of
 + *
 + *                 G   R   O   M   A   C   S
 + *
 + * Copyright (c) 2001-2012, The GROMACS Development Team
 + *
 + * Gromacs is a library for molecular simulation and trajectory analysis,
 + * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
 + * a full list of developers and information, check out http://www.gromacs.org
 + *
 + * This program is free software; you can redistribute it and/or modify it under
 + * the terms of the GNU Lesser General Public License as published by the Free
 + * Software Foundation; either version 2 of the License, or (at your option) any
 + * later version.
 + *
 + * To help fund GROMACS development, we humbly ask that you cite
 + * the papers people have written on it - you can find them on the website.
 + */
 +#ifdef HAVE_CONFIG_H
 +#include <config.h>
 +#endif
 +
 +#include <math.h>
 +
 +#include "../nb_kernel.h"
 +#include "types/simple.h"
 +#include "vec.h"
 +#include "nrnb.h"
 +
 +#include "gmx_math_x86_sse2_single.h"
 +#include "kernelutil_x86_sse2_single.h"
 +
 +/*
 + * Gromacs nonbonded kernel:   nb_kernel_ElecGB_VdwNone_GeomP1P1_VF_sse2_single
 + * Electrostatics interaction: GeneralizedBorn
 + * VdW interaction:            None
 + * Geometry:                   Particle-Particle
 + * Calculate force/pot:        PotentialAndForce
 + */
 +void
 +nb_kernel_ElecGB_VdwNone_GeomP1P1_VF_sse2_single
 +                    (t_nblist * gmx_restrict                nlist,
 +                     rvec * gmx_restrict                    xx,
 +                     rvec * gmx_restrict                    ff,
 +                     t_forcerec * gmx_restrict              fr,
 +                     t_mdatoms * gmx_restrict               mdatoms,
 +                     nb_kernel_data_t * gmx_restrict        kernel_data,
 +                     t_nrnb * gmx_restrict                  nrnb)
 +{
 +    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
 +     * just 0 for non-waters.
 +     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
 +     * jnr indices corresponding to data put in the four positions in the SIMD register.
 +     */
 +    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
 +    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
 +    int              jnrA,jnrB,jnrC,jnrD;
 +    int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
 +    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
 +    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
 +    real             rcutoff_scalar;
 +    real             *shiftvec,*fshift,*x,*f;
 +    real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
 +    real             scratch[4*DIM];
 +    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
 +    int              vdwioffset0;
 +    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
 +    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
 +    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
 +    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
 +    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
 +    real             *charge;
 +    __m128i          gbitab;
 +    __m128           vgb,fgb,vgbsum,dvdasum,gbscale,gbtabscale,isaprod,gbqqfactor,gbinvepsdiff,gbeps,dvdatmp;
 +    __m128           minushalf = _mm_set1_ps(-0.5);
 +    real             *invsqrta,*dvda,*gbtab;
 +    __m128i          vfitab;
 +    __m128i          ifour       = _mm_set1_epi32(4);
 +    __m128           rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
 +    real             *vftab;
 +    __m128           dummy_mask,cutoff_mask;
 +    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
 +    __m128           one     = _mm_set1_ps(1.0);
 +    __m128           two     = _mm_set1_ps(2.0);
 +    x                = xx[0];
 +    f                = ff[0];
 +
 +    nri              = nlist->nri;
 +    iinr             = nlist->iinr;
 +    jindex           = nlist->jindex;
 +    jjnr             = nlist->jjnr;
 +    shiftidx         = nlist->shift;
 +    gid              = nlist->gid;
 +    shiftvec         = fr->shift_vec[0];
 +    fshift           = fr->fshift[0];
 +    facel            = _mm_set1_ps(fr->epsfac);
 +    charge           = mdatoms->chargeA;
 +
 +    invsqrta         = fr->invsqrta;
 +    dvda             = fr->dvda;
 +    gbtabscale       = _mm_set1_ps(fr->gbtab.scale);
 +    gbtab            = fr->gbtab.data;
 +    gbinvepsdiff     = _mm_set1_ps((1.0/fr->epsilon_r) - (1.0/fr->gb_epsilon_solvent));
 +
 +    /* Avoid stupid compiler warnings */
 +    jnrA = jnrB = jnrC = jnrD = 0;
 +    j_coord_offsetA = 0;
 +    j_coord_offsetB = 0;
 +    j_coord_offsetC = 0;
 +    j_coord_offsetD = 0;
 +
 +    outeriter        = 0;
 +    inneriter        = 0;
 +
 +    for(iidx=0;iidx<4*DIM;iidx++)
 +    {
 +        scratch[iidx] = 0.0;
 +    }  
 +
 +    /* Start outer loop over neighborlists */
 +    for(iidx=0; iidx<nri; iidx++)
 +    {
 +        /* Load shift vector for this list */
 +        i_shift_offset   = DIM*shiftidx[iidx];
 +
 +        /* Load limits for loop over neighbors */
 +        j_index_start    = jindex[iidx];
 +        j_index_end      = jindex[iidx+1];
 +
 +        /* Get outer coordinate index */
 +        inr              = iinr[iidx];
 +        i_coord_offset   = DIM*inr;
 +
 +        /* Load i particle coords and add shift vector */
 +        gmx_mm_load_shift_and_1rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
 +        
 +        fix0             = _mm_setzero_ps();
 +        fiy0             = _mm_setzero_ps();
 +        fiz0             = _mm_setzero_ps();
 +
 +        /* Load parameters for i particles */
 +        iq0              = _mm_mul_ps(facel,_mm_load1_ps(charge+inr+0));
 +        isai0            = _mm_load1_ps(invsqrta+inr+0);
 +
 +        /* Reset potential sums */
 +        velecsum         = _mm_setzero_ps();
 +        vgbsum           = _mm_setzero_ps();
 +        dvdasum          = _mm_setzero_ps();
 +
 +        /* Start inner kernel loop */
 +        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
 +        {
 +
 +            /* Get j neighbor index, and coordinate index */
 +            jnrA             = jjnr[jidx];
 +            jnrB             = jjnr[jidx+1];
 +            jnrC             = jjnr[jidx+2];
 +            jnrD             = jjnr[jidx+3];
 +            j_coord_offsetA  = DIM*jnrA;
 +            j_coord_offsetB  = DIM*jnrB;
 +            j_coord_offsetC  = DIM*jnrC;
 +            j_coord_offsetD  = DIM*jnrD;
 +
 +            /* load j atom coordinates */
 +            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
 +                                              x+j_coord_offsetC,x+j_coord_offsetD,
 +                                              &jx0,&jy0,&jz0);
 +
 +            /* Calculate displacement vector */
 +            dx00             = _mm_sub_ps(ix0,jx0);
 +            dy00             = _mm_sub_ps(iy0,jy0);
 +            dz00             = _mm_sub_ps(iz0,jz0);
 +
 +            /* Calculate squared distance and things based on it */
 +            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
 +
 +            rinv00           = gmx_mm_invsqrt_ps(rsq00);
 +
 +            /* Load parameters for j particles */
 +            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
 +                                                              charge+jnrC+0,charge+jnrD+0);
 +            isaj0            = gmx_mm_load_4real_swizzle_ps(invsqrta+jnrA+0,invsqrta+jnrB+0,
 +                                                              invsqrta+jnrC+0,invsqrta+jnrD+0);
 +
 +            /**************************
 +             * CALCULATE INTERACTIONS *
 +             **************************/
 +
 +            r00              = _mm_mul_ps(rsq00,rinv00);
 +
 +            /* Compute parameters for interactions between i and j atoms */
 +            qq00             = _mm_mul_ps(iq0,jq0);
 +
 +            /* GENERALIZED BORN AND COULOMB ELECTROSTATICS */
 +            isaprod          = _mm_mul_ps(isai0,isaj0);
 +            gbqqfactor       = _mm_xor_ps(signbit,_mm_mul_ps(qq00,_mm_mul_ps(isaprod,gbinvepsdiff)));
 +            gbscale          = _mm_mul_ps(isaprod,gbtabscale);
 +
 +            /* Calculate generalized born table index - this is a separate table from the normal one,
 +             * but we use the same procedure by multiplying r by scale and truncating to an integer.
 +             */
 +            rt               = _mm_mul_ps(r00,gbscale);
 +            gbitab           = _mm_cvttps_epi32(rt);
 +            gbeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(gbitab));
 +            gbitab           = _mm_slli_epi32(gbitab,2);
 +
 +            Y                = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,0) );
 +            F                = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,1) );
 +            G                = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,2) );
 +            H                = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,3) );
 +            _MM_TRANSPOSE4_PS(Y,F,G,H);
 +            Heps             = _mm_mul_ps(gbeps,H);
 +            Fp               = _mm_add_ps(F,_mm_mul_ps(gbeps,_mm_add_ps(G,Heps)));
 +            VV               = _mm_add_ps(Y,_mm_mul_ps(gbeps,Fp));
 +            vgb              = _mm_mul_ps(gbqqfactor,VV);
 +
 +            FF               = _mm_add_ps(Fp,_mm_mul_ps(gbeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
 +            fgb              = _mm_mul_ps(gbqqfactor,_mm_mul_ps(FF,gbscale));
 +            dvdatmp          = _mm_mul_ps(minushalf,_mm_add_ps(vgb,_mm_mul_ps(fgb,r00)));
 +            dvdasum          = _mm_add_ps(dvdasum,dvdatmp);
 +            fjptrA           = dvda+jnrA;
 +            fjptrB           = dvda+jnrB;
 +            fjptrC           = dvda+jnrC;
 +            fjptrD           = dvda+jnrD;
 +            gmx_mm_increment_4real_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,_mm_mul_ps(dvdatmp,_mm_mul_ps(isaj0,isaj0)));
 +            velec            = _mm_mul_ps(qq00,rinv00);
 +            felec            = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(velec,rinv00),fgb),rinv00);
 +
 +            /* Update potential sum for this i atom from the interaction with this j atom. */
 +            velecsum         = _mm_add_ps(velecsum,velec);
 +            vgbsum           = _mm_add_ps(vgbsum,vgb);
 +
 +            fscal            = felec;
 +
 +            /* Calculate temporary vectorial force */
 +            tx               = _mm_mul_ps(fscal,dx00);
 +            ty               = _mm_mul_ps(fscal,dy00);
 +            tz               = _mm_mul_ps(fscal,dz00);
 +
 +            /* Update vectorial force */
 +            fix0             = _mm_add_ps(fix0,tx);
 +            fiy0             = _mm_add_ps(fiy0,ty);
 +            fiz0             = _mm_add_ps(fiz0,tz);
 +
 +            fjptrA             = f+j_coord_offsetA;
 +            fjptrB             = f+j_coord_offsetB;
 +            fjptrC             = f+j_coord_offsetC;
 +            fjptrD             = f+j_coord_offsetD;
 +            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
 +            
 +            /* Inner loop uses 58 flops */
 +        }
 +
 +        if(jidx<j_index_end)
 +        {
 +
 +            /* Get j neighbor index, and coordinate index */
 +            jnrlistA         = jjnr[jidx];
 +            jnrlistB         = jjnr[jidx+1];
 +            jnrlistC         = jjnr[jidx+2];
 +            jnrlistD         = jjnr[jidx+3];
 +            /* Sign of each element will be negative for non-real atoms.
 +             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
 +             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
 +             */
 +            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 +            jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
 +            jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
 +            jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
 +            jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
 +            j_coord_offsetA  = DIM*jnrA;
 +            j_coord_offsetB  = DIM*jnrB;
 +            j_coord_offsetC  = DIM*jnrC;
 +            j_coord_offsetD  = DIM*jnrD;
 +
 +            /* load j atom coordinates */
 +            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
 +                                              x+j_coord_offsetC,x+j_coord_offsetD,
 +                                              &jx0,&jy0,&jz0);
 +
 +            /* Calculate displacement vector */
 +            dx00             = _mm_sub_ps(ix0,jx0);
 +            dy00             = _mm_sub_ps(iy0,jy0);
 +            dz00             = _mm_sub_ps(iz0,jz0);
 +
 +            /* Calculate squared distance and things based on it */
 +            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
 +
 +            rinv00           = gmx_mm_invsqrt_ps(rsq00);
 +
 +            /* Load parameters for j particles */
 +            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
 +                                                              charge+jnrC+0,charge+jnrD+0);
 +            isaj0            = gmx_mm_load_4real_swizzle_ps(invsqrta+jnrA+0,invsqrta+jnrB+0,
 +                                                              invsqrta+jnrC+0,invsqrta+jnrD+0);
 +
 +            /**************************
 +             * CALCULATE INTERACTIONS *
 +             **************************/
 +
 +            r00              = _mm_mul_ps(rsq00,rinv00);
 +            r00              = _mm_andnot_ps(dummy_mask,r00);
 +
 +            /* Compute parameters for interactions between i and j atoms */
 +            qq00             = _mm_mul_ps(iq0,jq0);
 +
 +            /* GENERALIZED BORN AND COULOMB ELECTROSTATICS */
 +            isaprod          = _mm_mul_ps(isai0,isaj0);
 +            gbqqfactor       = _mm_xor_ps(signbit,_mm_mul_ps(qq00,_mm_mul_ps(isaprod,gbinvepsdiff)));
 +            gbscale          = _mm_mul_ps(isaprod,gbtabscale);
 +
 +            /* Calculate generalized born table index - this is a separate table from the normal one,
 +             * but we use the same procedure by multiplying r by scale and truncating to an integer.
 +             */
 +            rt               = _mm_mul_ps(r00,gbscale);
 +            gbitab           = _mm_cvttps_epi32(rt);
 +            gbeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(gbitab));
 +            gbitab           = _mm_slli_epi32(gbitab,2);
 +
 +            Y                = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,0) );
 +            F                = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,1) );
 +            G                = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,2) );
 +            H                = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,3) );
 +            _MM_TRANSPOSE4_PS(Y,F,G,H);
 +            Heps             = _mm_mul_ps(gbeps,H);
 +            Fp               = _mm_add_ps(F,_mm_mul_ps(gbeps,_mm_add_ps(G,Heps)));
 +            VV               = _mm_add_ps(Y,_mm_mul_ps(gbeps,Fp));
 +            vgb              = _mm_mul_ps(gbqqfactor,VV);
 +
 +            FF               = _mm_add_ps(Fp,_mm_mul_ps(gbeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
 +            fgb              = _mm_mul_ps(gbqqfactor,_mm_mul_ps(FF,gbscale));
 +            dvdatmp          = _mm_mul_ps(minushalf,_mm_add_ps(vgb,_mm_mul_ps(fgb,r00)));
++            dvdatmp          = _mm_andnot_ps(dummy_mask,dvdatmp);
 +            dvdasum          = _mm_add_ps(dvdasum,dvdatmp);
 +            /* The stores through the scratch pointers keep the gmx_restrict promises honest, so even compilers that exploit gmx_restrict aggressively (e.g. icc 13) cannot miscompile this code. */
 +            fjptrA             = (jnrlistA>=0) ? dvda+jnrA : scratch;
 +            fjptrB             = (jnrlistB>=0) ? dvda+jnrB : scratch;
 +            fjptrC             = (jnrlistC>=0) ? dvda+jnrC : scratch;
 +            fjptrD             = (jnrlistD>=0) ? dvda+jnrD : scratch;
 +            gmx_mm_increment_4real_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,_mm_mul_ps(dvdatmp,_mm_mul_ps(isaj0,isaj0)));
 +            velec            = _mm_mul_ps(qq00,rinv00);
 +            felec            = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(velec,rinv00),fgb),rinv00);
 +
 +            /* Update potential sum for this i atom from the interaction with this j atom. */
 +            velec            = _mm_andnot_ps(dummy_mask,velec);
 +            velecsum         = _mm_add_ps(velecsum,velec);
 +            vgb              = _mm_andnot_ps(dummy_mask,vgb);
 +            vgbsum           = _mm_add_ps(vgbsum,vgb);
 +
 +            fscal            = felec;
 +
 +            fscal            = _mm_andnot_ps(dummy_mask,fscal);
 +
 +            /* Calculate temporary vectorial force */
 +            tx               = _mm_mul_ps(fscal,dx00);
 +            ty               = _mm_mul_ps(fscal,dy00);
 +            tz               = _mm_mul_ps(fscal,dz00);
 +
 +            /* Update vectorial force */
 +            fix0             = _mm_add_ps(fix0,tx);
 +            fiy0             = _mm_add_ps(fiy0,ty);
 +            fiz0             = _mm_add_ps(fiz0,tz);
 +
 +            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
 +            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
 +            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
 +            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
 +            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
 +            
 +            /* Inner loop uses 59 flops */
 +        }
 +
 +        /* End of innermost loop */
 +
 +        gmx_mm_update_iforce_1atom_swizzle_ps(fix0,fiy0,fiz0,
 +                                              f+i_coord_offset,fshift+i_shift_offset);
 +
 +        ggid                        = gid[iidx];
 +        /* Update potential energies */
 +        gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
 +        gmx_mm_update_1pot_ps(vgbsum,kernel_data->energygrp_polarization+ggid);
 +        dvdasum = _mm_mul_ps(dvdasum, _mm_mul_ps(isai0,isai0));
 +        gmx_mm_update_1pot_ps(dvdasum,dvda+inr);
 +
 +        /* Increment number of inner iterations */
 +        inneriter                  += j_index_end - j_index_start;
 +
 +        /* Outer loop uses 9 flops */
 +    }
 +
 +    /* Increment number of outer iterations */
 +    outeriter        += nri;
 +
 +    /* Update outer/inner flops */
 +
 +    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VF,outeriter*9 + inneriter*59);
 +}
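
With VdW disabled, the inner loop above is pure Generalized-Born plus Coulomb. A scalar sketch of one pair interaction, mirroring the sequence from gbqqfactor to felec (illustrative helper; vtab and ftab stand for the spline value and its d/dr from the table lookup at scale isaprod*gbtabscale):

/* One GB/Coulomb pair; outputs feed velecsum/vgbsum and the dvda arrays. */
static void
gb_coulomb_pair(float qq, float r, float rinv, float isai, float isaj,
                float gbinvepsdiff, float vtab, float ftab,
                float *velec, float *vgb, float *felec,
                float *dvdasum_i, float *dvda_j)
{
    float isaprod    = isai*isaj;
    float gbqqfactor = -qq*isaprod*gbinvepsdiff;  /* the signbit XOR above    */
    float fgb, dvdatmp;
    *vgb        = gbqqfactor*vtab;                /* polarization energy      */
    fgb         = gbqqfactor*ftab;
    dvdatmp     = -0.5f*(*vgb + fgb*r);
    *dvdasum_i += dvdatmp;              /* scaled by isai*isai after the loop */
    *dvda_j    += dvdatmp*isaj*isaj;    /* the increment_4real store          */
    *velec      = qq*rinv;              /* plain Coulomb part                 */
    *felec      = (*velec*rinv - fgb)*rinv;  /* combined scalar force over r  */
}
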
 +/*
 + * Gromacs nonbonded kernel:   nb_kernel_ElecGB_VdwNone_GeomP1P1_F_sse2_single
 + * Electrostatics interaction: GeneralizedBorn
 + * VdW interaction:            None
 + * Geometry:                   Particle-Particle
 + * Calculate force/pot:        Force
 + */
 +void
 +nb_kernel_ElecGB_VdwNone_GeomP1P1_F_sse2_single
 +                    (t_nblist * gmx_restrict                nlist,
 +                     rvec * gmx_restrict                    xx,
 +                     rvec * gmx_restrict                    ff,
 +                     t_forcerec * gmx_restrict              fr,
 +                     t_mdatoms * gmx_restrict               mdatoms,
 +                     nb_kernel_data_t * gmx_restrict        kernel_data,
 +                     t_nrnb * gmx_restrict                  nrnb)
 +{
 +    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
 +     * just 0 for non-waters.
 +     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
 +     * jnr indices corresponding to data put in the four positions in the SIMD register.
 +     */
 +    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
 +    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
 +    int              jnrA,jnrB,jnrC,jnrD;
 +    int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
 +    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
 +    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
 +    real             rcutoff_scalar;
 +    real             *shiftvec,*fshift,*x,*f;
 +    real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
 +    real             scratch[4*DIM];
 +    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
 +    int              vdwioffset0;
 +    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
 +    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
 +    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
 +    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
 +    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
 +    real             *charge;
 +    __m128i          gbitab;
 +    __m128           vgb,fgb,vgbsum,dvdasum,gbscale,gbtabscale,isaprod,gbqqfactor,gbinvepsdiff,gbeps,dvdatmp;
 +    __m128           minushalf = _mm_set1_ps(-0.5);
 +    real             *invsqrta,*dvda,*gbtab;
 +    __m128i          vfitab;
 +    __m128i          ifour       = _mm_set1_epi32(4);
 +    __m128           rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
 +    real             *vftab;
 +    __m128           dummy_mask,cutoff_mask;
 +    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
 +    __m128           one     = _mm_set1_ps(1.0);
 +    __m128           two     = _mm_set1_ps(2.0);
 +    x                = xx[0];
 +    f                = ff[0];
 +
 +    nri              = nlist->nri;
 +    iinr             = nlist->iinr;
 +    jindex           = nlist->jindex;
 +    jjnr             = nlist->jjnr;
 +    shiftidx         = nlist->shift;
 +    gid              = nlist->gid;
 +    shiftvec         = fr->shift_vec[0];
 +    fshift           = fr->fshift[0];
 +    facel            = _mm_set1_ps(fr->epsfac);
 +    charge           = mdatoms->chargeA;
 +
 +    invsqrta         = fr->invsqrta;
 +    dvda             = fr->dvda;
 +    gbtabscale       = _mm_set1_ps(fr->gbtab.scale);
 +    gbtab            = fr->gbtab.data;
 +    gbinvepsdiff     = _mm_set1_ps((1.0/fr->epsilon_r) - (1.0/fr->gb_epsilon_solvent));
 +
 +    /* Avoid stupid compiler warnings */
 +    jnrA = jnrB = jnrC = jnrD = 0;
 +    j_coord_offsetA = 0;
 +    j_coord_offsetB = 0;
 +    j_coord_offsetC = 0;
 +    j_coord_offsetD = 0;
 +
 +    outeriter        = 0;
 +    inneriter        = 0;
 +
 +    for(iidx=0;iidx<4*DIM;iidx++)
 +    {
 +        scratch[iidx] = 0.0;
 +    }  
 +
 +    /* Start outer loop over neighborlists */
 +    for(iidx=0; iidx<nri; iidx++)
 +    {
 +        /* Load shift vector for this list */
 +        i_shift_offset   = DIM*shiftidx[iidx];
 +
 +        /* Load limits for loop over neighbors */
 +        j_index_start    = jindex[iidx];
 +        j_index_end      = jindex[iidx+1];
 +
 +        /* Get outer coordinate index */
 +        inr              = iinr[iidx];
 +        i_coord_offset   = DIM*inr;
 +
 +        /* Load i particle coords and add shift vector */
 +        gmx_mm_load_shift_and_1rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
 +        
 +        fix0             = _mm_setzero_ps();
 +        fiy0             = _mm_setzero_ps();
 +        fiz0             = _mm_setzero_ps();
 +
 +        /* Load parameters for i particles */
 +        iq0              = _mm_mul_ps(facel,_mm_load1_ps(charge+inr+0));
 +        isai0            = _mm_load1_ps(invsqrta+inr+0);
 +
 +        dvdasum          = _mm_setzero_ps();
 +
 +        /* Start inner kernel loop */
 +        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
 +        {
 +
 +            /* Get j neighbor index, and coordinate index */
 +            jnrA             = jjnr[jidx];
 +            jnrB             = jjnr[jidx+1];
 +            jnrC             = jjnr[jidx+2];
 +            jnrD             = jjnr[jidx+3];
 +            j_coord_offsetA  = DIM*jnrA;
 +            j_coord_offsetB  = DIM*jnrB;
 +            j_coord_offsetC  = DIM*jnrC;
 +            j_coord_offsetD  = DIM*jnrD;
 +
 +            /* load j atom coordinates */
 +            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
 +                                              x+j_coord_offsetC,x+j_coord_offsetD,
 +                                              &jx0,&jy0,&jz0);
 +
 +            /* Calculate displacement vector */
 +            dx00             = _mm_sub_ps(ix0,jx0);
 +            dy00             = _mm_sub_ps(iy0,jy0);
 +            dz00             = _mm_sub_ps(iz0,jz0);
 +
 +            /* Calculate squared distance and things based on it */
 +            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
 +
 +            rinv00           = gmx_mm_invsqrt_ps(rsq00);
 +
 +            /* Load parameters for j particles */
 +            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
 +                                                              charge+jnrC+0,charge+jnrD+0);
 +            isaj0            = gmx_mm_load_4real_swizzle_ps(invsqrta+jnrA+0,invsqrta+jnrB+0,
 +                                                              invsqrta+jnrC+0,invsqrta+jnrD+0);
 +
 +            /**************************
 +             * CALCULATE INTERACTIONS *
 +             **************************/
 +
 +            r00              = _mm_mul_ps(rsq00,rinv00);
 +
 +            /* Compute parameters for interactions between i and j atoms */
 +            qq00             = _mm_mul_ps(iq0,jq0);
 +
 +            /* GENERALIZED BORN AND COULOMB ELECTROSTATICS */
 +            isaprod          = _mm_mul_ps(isai0,isaj0);
 +            gbqqfactor       = _mm_xor_ps(signbit,_mm_mul_ps(qq00,_mm_mul_ps(isaprod,gbinvepsdiff)));
 +            gbscale          = _mm_mul_ps(isaprod,gbtabscale);
 +
 +            /* Calculate generalized born table index - this is a separate table from the normal one,
 +             * but we use the same procedure by multiplying r by scale and truncating to an integer.
 +             */
 +            rt               = _mm_mul_ps(r00,gbscale);
 +            gbitab           = _mm_cvttps_epi32(rt);
 +            gbeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(gbitab));
 +            gbitab           = _mm_slli_epi32(gbitab,2);
 +
 +            Y                = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,0) );
 +            F                = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,1) );
 +            G                = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,2) );
 +            H                = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,3) );
 +            _MM_TRANSPOSE4_PS(Y,F,G,H);
 +            Heps             = _mm_mul_ps(gbeps,H);
 +            Fp               = _mm_add_ps(F,_mm_mul_ps(gbeps,_mm_add_ps(G,Heps)));
 +            VV               = _mm_add_ps(Y,_mm_mul_ps(gbeps,Fp));
 +            vgb              = _mm_mul_ps(gbqqfactor,VV);
 +
 +            FF               = _mm_add_ps(Fp,_mm_mul_ps(gbeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
 +            fgb              = _mm_mul_ps(gbqqfactor,_mm_mul_ps(FF,gbscale));
 +            dvdatmp          = _mm_mul_ps(minushalf,_mm_add_ps(vgb,_mm_mul_ps(fgb,r00)));
 +            dvdasum          = _mm_add_ps(dvdasum,dvdatmp);
 +            fjptrA           = dvda+jnrA;
 +            fjptrB           = dvda+jnrB;
 +            fjptrC           = dvda+jnrC;
 +            fjptrD           = dvda+jnrD;
 +            gmx_mm_increment_4real_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,_mm_mul_ps(dvdatmp,_mm_mul_ps(isaj0,isaj0)));
 +            velec            = _mm_mul_ps(qq00,rinv00);
 +            felec            = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(velec,rinv00),fgb),rinv00);
 +
 +            fscal            = felec;
 +
 +            /* Calculate temporary vectorial force */
 +            tx               = _mm_mul_ps(fscal,dx00);
 +            ty               = _mm_mul_ps(fscal,dy00);
 +            tz               = _mm_mul_ps(fscal,dz00);
 +
 +            /* Update vectorial force */
 +            fix0             = _mm_add_ps(fix0,tx);
 +            fiy0             = _mm_add_ps(fiy0,ty);
 +            fiz0             = _mm_add_ps(fiz0,tz);
 +
 +            fjptrA             = f+j_coord_offsetA;
 +            fjptrB             = f+j_coord_offsetB;
 +            fjptrC             = f+j_coord_offsetC;
 +            fjptrD             = f+j_coord_offsetD;
 +            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
 +            
 +            /* Inner loop uses 56 flops */
 +        }
 +
 +        if(jidx<j_index_end)
 +        {
 +
 +            /* Get j neighbor index, and coordinate index */
 +            jnrlistA         = jjnr[jidx];
 +            jnrlistB         = jjnr[jidx+1];
 +            jnrlistC         = jjnr[jidx+2];
 +            jnrlistD         = jjnr[jidx+3];
 +            /* Sign of each element will be negative for non-real atoms.
 +             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
 +             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
 +             */
 +            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 +            jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
 +            jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
 +            jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
 +            jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
 +            j_coord_offsetA  = DIM*jnrA;
 +            j_coord_offsetB  = DIM*jnrB;
 +            j_coord_offsetC  = DIM*jnrC;
 +            j_coord_offsetD  = DIM*jnrD;
 +
 +            /* load j atom coordinates */
 +            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
 +                                              x+j_coord_offsetC,x+j_coord_offsetD,
 +                                              &jx0,&jy0,&jz0);
 +
 +            /* Calculate displacement vector */
 +            dx00             = _mm_sub_ps(ix0,jx0);
 +            dy00             = _mm_sub_ps(iy0,jy0);
 +            dz00             = _mm_sub_ps(iz0,jz0);
 +
 +            /* Calculate squared distance and things based on it */
 +            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
 +
 +            rinv00           = gmx_mm_invsqrt_ps(rsq00);
 +
 +            /* Load parameters for j particles */
 +            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
 +                                                              charge+jnrC+0,charge+jnrD+0);
 +            isaj0            = gmx_mm_load_4real_swizzle_ps(invsqrta+jnrA+0,invsqrta+jnrB+0,
 +                                                              invsqrta+jnrC+0,invsqrta+jnrD+0);
 +
 +            /**************************
 +             * CALCULATE INTERACTIONS *
 +             **************************/
 +
 +            r00              = _mm_mul_ps(rsq00,rinv00);
 +            r00              = _mm_andnot_ps(dummy_mask,r00);
 +
 +            /* Compute parameters for interactions between i and j atoms */
 +            qq00             = _mm_mul_ps(iq0,jq0);
 +
 +            /* GENERALIZED BORN AND COULOMB ELECTROSTATICS */
 +            isaprod          = _mm_mul_ps(isai0,isaj0);
 +            gbqqfactor       = _mm_xor_ps(signbit,_mm_mul_ps(qq00,_mm_mul_ps(isaprod,gbinvepsdiff)));
 +            gbscale          = _mm_mul_ps(isaprod,gbtabscale);
 +
 +            /* Calculate generalized born table index - this is a separate table from the normal one,
 +             * but we use the same procedure by multiplying r by scale and truncating to an integer.
 +             */
 +            rt               = _mm_mul_ps(r00,gbscale);
 +            gbitab           = _mm_cvttps_epi32(rt);
 +            gbeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(gbitab));
 +            gbitab           = _mm_slli_epi32(gbitab,2);
 +
 +            Y                = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,0) );
 +            F                = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,1) );
 +            G                = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,2) );
 +            H                = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,3) );
 +            _MM_TRANSPOSE4_PS(Y,F,G,H);
 +            Heps             = _mm_mul_ps(gbeps,H);
 +            Fp               = _mm_add_ps(F,_mm_mul_ps(gbeps,_mm_add_ps(G,Heps)));
 +            VV               = _mm_add_ps(Y,_mm_mul_ps(gbeps,Fp));
 +            vgb              = _mm_mul_ps(gbqqfactor,VV);
 +
 +            FF               = _mm_add_ps(Fp,_mm_mul_ps(gbeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
 +            fgb              = _mm_mul_ps(gbqqfactor,_mm_mul_ps(FF,gbscale));
 +            dvdatmp          = _mm_mul_ps(minushalf,_mm_add_ps(vgb,_mm_mul_ps(fgb,r00)));
++            dvdatmp          = _mm_andnot_ps(dummy_mask,dvdatmp);
 +            dvdasum          = _mm_add_ps(dvdasum,dvdatmp);
 +            /* The stores through the scratch pointers keep the gmx_restrict promises honest, so even compilers that exploit gmx_restrict aggressively (e.g. icc 13) cannot miscompile this code. */
 +            fjptrA             = (jnrlistA>=0) ? dvda+jnrA : scratch;
 +            fjptrB             = (jnrlistB>=0) ? dvda+jnrB : scratch;
 +            fjptrC             = (jnrlistC>=0) ? dvda+jnrC : scratch;
 +            fjptrD             = (jnrlistD>=0) ? dvda+jnrD : scratch;
 +            gmx_mm_increment_4real_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,_mm_mul_ps(dvdatmp,_mm_mul_ps(isaj0,isaj0)));
 +            velec            = _mm_mul_ps(qq00,rinv00);
 +            felec            = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(velec,rinv00),fgb),rinv00);
 +
 +            fscal            = felec;
 +
 +            fscal            = _mm_andnot_ps(dummy_mask,fscal);
 +
 +            /* Calculate temporary vectorial force */
 +            tx               = _mm_mul_ps(fscal,dx00);
 +            ty               = _mm_mul_ps(fscal,dy00);
 +            tz               = _mm_mul_ps(fscal,dz00);
 +
 +            /* Update vectorial force */
 +            fix0             = _mm_add_ps(fix0,tx);
 +            fiy0             = _mm_add_ps(fiy0,ty);
 +            fiz0             = _mm_add_ps(fiz0,tz);
 +
 +            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
 +            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
 +            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
 +            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
 +            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
 +            
 +            /* Inner loop uses 57 flops */
 +        }
 +
 +        /* End of innermost loop */
 +
 +        gmx_mm_update_iforce_1atom_swizzle_ps(fix0,fiy0,fiz0,
 +                                              f+i_coord_offset,fshift+i_shift_offset);
 +
 +        dvdasum = _mm_mul_ps(dvdasum, _mm_mul_ps(isai0,isai0));
 +        gmx_mm_update_1pot_ps(dvdasum,dvda+inr);
 +
 +        /* Increment number of inner iterations */
 +        inneriter                  += j_index_end - j_index_start;
 +
 +        /* Outer loop uses 7 flops */
 +    }
 +
 +    /* Increment number of outer iterations */
 +    outeriter        += nri;
 +
 +    /* Update outer/inner flops */
 +
 +    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_F,outeriter*7 + inneriter*57);
 +}
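
All four kernels in this diff share one epilogue trick for neighbor lists whose length is not a multiple of four: the list is padded with negative jjnr entries, a per-lane mask is built from their sign, and dummy contributions are either cleared with andnot or stored into the local scratch array. A minimal sketch of the mask construction, using the raw SSE2 intrinsics that the gmx_mm_* wrappers map to (illustrative helper, not a GROMACS function):

#include <emmintrin.h>   /* SSE2 */

/* Zero the SIMD lanes whose neighbor-list entry is negative (padding).
 * jjnr_tail points at the last, partially filled group of four indices.
 */
static __m128
clear_padding_lanes(const int *jjnr_tail, __m128 val)
{
    __m128i jnr  = _mm_loadu_si128((const __m128i *)jjnr_tail);
    __m128  mask = _mm_castsi128_ps(_mm_cmplt_epi32(jnr, _mm_setzero_si128()));
    return _mm_andnot_ps(mask, val);    /* all-ones lanes become 0.0f */
}

On the store side, the fjptrA..D pointers of dummy lanes are redirected to scratch, so every scatter target stays valid and the gmx_restrict promises stay honest.
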
index edec7a374548afbae499dca7d10df43e3460f47f,0000000000000000000000000000000000000000..00ff0ca56afb01b3239295fba3114ee6143ee30f
mode 100644,000000..100644
--- /dev/null
@@@ -1,1034 -1,0 +1,1037 @@@
 +/* #if 0 */
 +#error This file must be processed with the Gromacs pre-preprocessor
 +/* #endif */
 +/* #if INCLUDE_HEADER */
 +#ifdef HAVE_CONFIG_H
 +#include <config.h>
 +#endif
 +
 +#include <math.h>
 +
 +#include "../nb_kernel.h"
 +#include "types/simple.h"
 +#include "vec.h"
 +#include "nrnb.h"
 +
 +#include "gmx_math_x86_sse2_single.h"
 +#include "kernelutil_x86_sse2_single.h"
 +/* #endif */
 +
 +/* ## List of variables set by the generating script:                                    */
 +/* ##                                                                                    */
 +/* ## Settings that apply to the entire kernel:                                          */
 +/* ## KERNEL_ELEC:           String, choice for electrostatic interactions               */
 +/* ## KERNEL_VDW:            String, choice for van der Waals interactions               */
 +/* ## KERNEL_NAME:           String, name of this kernel                                 */
 +/* ## KERNEL_VF:             String telling if we calculate potential, force, or both    */
 +/* ## GEOMETRY_I/GEOMETRY_J: String, name of each geometry, e.g. 'Water3' or '1Particle' */
 +/* ##                                                                                    */
 +/* ## Settings that apply to particles in the outer (I) or inner (J) loops:              */
 +/* ## PARTICLES_I[]/         Arrays with lists of i/j particles to use in kernel. It is  */
 +/* ## PARTICLES_J[]:         just [0] for particle geometry, but can be longer for water */
 +/* ## PARTICLES_ELEC_I[]/    Arrays with lists of i/j particles that have electrostatics */
 +/* ## PARTICLES_ELEC_J[]:    interactions that should be calculated in this kernel.      */
 +/* ## PARTICLES_VDW_I[]/     Arrays with lists of i/j particles that have VdW            */
 +/* ## PARTICLES_VDW_J[]:     interactions that should be calculated in this kernel.      */
 +/* ##                                                                                    */
 +/* ## Settings for pairs of interactions (e.g. 2nd i particle against 1st j particle)    */
 +/* ## PAIRS_IJ[]:            Array with (i,j) tuples of pairs for which interactions     */
 +/* ##                        should be calculated in this kernel. Zero-charge particles  */
 +/* ##                        do not have interactions with particles without vdw, and    */
 +/* ##                        Vdw-only interactions are not evaluated in a no-vdw-kernel. */
 +/* ## INTERACTION_FLAGS[][]: 2D matrix, dimension e.g. 3*3 for water-water interactions. */
 +/* ##                        For each i-j pair, the element [I][J] is a list of strings  */
 +/* ##                        defining properties/flags of this interaction. Examples     */
 +/* ##                        include 'electrostatics'/'vdw' if that type of interaction  */
 +/* ##                        should be evaluated, 'rsq'/'rinv'/'rinvsq' if those values  */
 +/* ##                        are needed, and 'exactcutoff' or 'shift','switch' to        */
 +/* ##                        decide if the force/potential should be modified. This way  */
 +/* ##                        we only calculate values absolutely needed for each case.   */
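
The kernels earlier in this diff are the output of exactly these directives, so the expansion is easy to see. For the plain particle geometry PARTICLES_I is just [0], and a #for block such as the i-particle declarations below

/* #for I in PARTICLES_I */
int              vdwioffset{I};
__m128           ix{I},iy{I},iz{I},fix{I},fiy{I},fiz{I},iq{I},isai{I};
/* #endfor */

expands to a single copy with {I} substituted by 0:

int              vdwioffset0;
__m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;

For water geometries the same block simply repeats once per particle index in the list.
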
 +
 +/* ## Calculate the size and offset for (merged/interleaved) table data */
 +
 +/*
 + * Gromacs nonbonded kernel:   {KERNEL_NAME}
 + * Electrostatics interaction: {KERNEL_ELEC}
 + * VdW interaction:            {KERNEL_VDW}
 + * Geometry:                   {GEOMETRY_I}-{GEOMETRY_J}
 + * Calculate force/pot:        {KERNEL_VF}
 + */
 +void
 +{KERNEL_NAME}
 +                    (t_nblist * gmx_restrict                nlist,
 +                     rvec * gmx_restrict                    xx,
 +                     rvec * gmx_restrict                    ff,
 +                     t_forcerec * gmx_restrict              fr,
 +                     t_mdatoms * gmx_restrict               mdatoms,
 +                     nb_kernel_data_t * gmx_restrict        kernel_data,
 +                     t_nrnb * gmx_restrict                  nrnb)
 +{
 +    /* ## Not all variables are used for all kernels, but any optimizing compiler fixes that, */
 +    /* ## so there is no point in going to extremes to exclude variables that are not needed. */
 +    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
 +     * just 0 for non-waters.
 +     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
 +     * jnr indices corresponding to data put in the four positions in the SIMD register.
 +     */
 +    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
 +    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
 +    int              jnrA,jnrB,jnrC,jnrD;
 +    int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
 +    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
 +    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
 +    real             rcutoff_scalar;
 +    real             *shiftvec,*fshift,*x,*f;
 +    real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
 +    real             scratch[4*DIM];
 +    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
 +    /* #for I in PARTICLES_I */
 +    int              vdwioffset{I};
 +    __m128           ix{I},iy{I},iz{I},fix{I},fiy{I},fiz{I},iq{I},isai{I};
 +    /* #endfor */
 +    /* #for J in PARTICLES_J */
 +    int              vdwjidx{J}A,vdwjidx{J}B,vdwjidx{J}C,vdwjidx{J}D;
 +    __m128           jx{J},jy{J},jz{J},fjx{J},fjy{J},fjz{J},jq{J},isaj{J};
 +    /* #endfor */
 +    /* #for I,J in PAIRS_IJ */
 +    __m128           dx{I}{J},dy{I}{J},dz{I}{J},rsq{I}{J},rinv{I}{J},rinvsq{I}{J},r{I}{J},qq{I}{J},c6_{I}{J},c12_{I}{J};
 +    /* #endfor */
 +    /* #if KERNEL_ELEC != 'None' */
 +    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
 +    real             *charge;
 +    /* #endif */
 +    /* #if 'GeneralizedBorn' in KERNEL_ELEC */
 +    __m128i          gbitab;
 +    __m128           vgb,fgb,vgbsum,dvdasum,gbscale,gbtabscale,isaprod,gbqqfactor,gbinvepsdiff,gbeps,dvdatmp;
 +    __m128           minushalf = _mm_set1_ps(-0.5);
 +    real             *invsqrta,*dvda,*gbtab;
 +    /* #endif */
 +    /* #if KERNEL_VDW != 'None' */
 +    int              nvdwtype;
 +    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
 +    int              *vdwtype;
 +    real             *vdwparam;
 +    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
 +    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
 +    /* #endif */
 +    /* #if 'Table' in KERNEL_ELEC or 'GeneralizedBorn' in KERNEL_ELEC or 'Table' in KERNEL_VDW */
 +    __m128i          vfitab;
 +    __m128i          ifour       = _mm_set1_epi32(4);
 +    __m128           rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
 +    real             *vftab;
 +    /* #endif */
 +    /* #if 'Ewald' in KERNEL_ELEC */
 +    __m128i          ewitab;
 +    __m128           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
 +    real             *ewtab;
 +    /* #endif */
 +    /* #if 'PotentialSwitch' in [KERNEL_MOD_ELEC,KERNEL_MOD_VDW] */
 +    __m128           rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
 +    real             rswitch_scalar,d_scalar;
 +    /* #endif */
 +    __m128           dummy_mask,cutoff_mask;
 +    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
 +    __m128           one     = _mm_set1_ps(1.0);
 +    __m128           two     = _mm_set1_ps(2.0);
 +    x                = xx[0];
 +    f                = ff[0];
 +
 +    nri              = nlist->nri;
 +    iinr             = nlist->iinr;
 +    jindex           = nlist->jindex;
 +    jjnr             = nlist->jjnr;
 +    shiftidx         = nlist->shift;
 +    gid              = nlist->gid;
 +    shiftvec         = fr->shift_vec[0];
 +    fshift           = fr->fshift[0];
 +    /* #if KERNEL_ELEC != 'None' */
 +    facel            = _mm_set1_ps(fr->epsfac);
 +    charge           = mdatoms->chargeA;
 +    /*     #if 'ReactionField' in KERNEL_ELEC */
 +    krf              = _mm_set1_ps(fr->ic->k_rf);
 +    krf2             = _mm_set1_ps(fr->ic->k_rf*2.0);
 +    crf              = _mm_set1_ps(fr->ic->c_rf);
 +    /*     #endif */
 +    /* #endif */
 +    /* #if KERNEL_VDW != 'None' */
 +    nvdwtype         = fr->ntype;
 +    vdwparam         = fr->nbfp;
 +    vdwtype          = mdatoms->typeA;
 +    /* #endif */
 +
 +    /* #if 'Table' in KERNEL_ELEC and 'Table' in KERNEL_VDW */
 +    vftab            = kernel_data->table_elec_vdw->data;
 +    vftabscale       = _mm_set1_ps(kernel_data->table_elec_vdw->scale);
 +    /* #elif 'Table' in KERNEL_ELEC */
 +    vftab            = kernel_data->table_elec->data;
 +    vftabscale       = _mm_set1_ps(kernel_data->table_elec->scale);
 +    /* #elif 'Table' in KERNEL_VDW */
 +    vftab            = kernel_data->table_vdw->data;
 +    vftabscale       = _mm_set1_ps(kernel_data->table_vdw->scale);
 +    /* #endif */
 +
 +    /* #if 'Ewald' in KERNEL_ELEC */
 +    sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
 +    /*     #if KERNEL_VF=='Force' and KERNEL_MOD_ELEC!='PotentialSwitch' */
 +    ewtab            = fr->ic->tabq_coul_F;
 +    ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
 +    ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
 +    /*     #else */
 +    ewtab            = fr->ic->tabq_coul_FDV0;
 +    ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
 +    ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
 +    /*     #endif */
 +    /* #endif */
 +
 +    /* #if KERNEL_ELEC=='GeneralizedBorn' */
 +    invsqrta         = fr->invsqrta;
 +    dvda             = fr->dvda;
 +    gbtabscale       = _mm_set1_ps(fr->gbtab.scale);
 +    gbtab            = fr->gbtab.data;
 +    gbinvepsdiff     = _mm_set1_ps((1.0/fr->epsilon_r) - (1.0/fr->gb_epsilon_solvent));
 +    /* #endif */
 +
 +    /* #if 'Water' in GEOMETRY_I */
 +    /* Setup water-specific parameters */
 +    inr              = nlist->iinr[0];
 +    /*     #for I in PARTICLES_ELEC_I */
 +    iq{I}              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+{I}]));
 +    /*     #endfor */
 +    /*     #for I in PARTICLES_VDW_I */
 +    vdwioffset{I}      = 2*nvdwtype*vdwtype[inr+{I}];
 +    /*     #endfor */
 +    /* #endif */
 +
 +    /* #if 'Water' in GEOMETRY_J */
 +    /*     #for J in PARTICLES_ELEC_J */
 +    jq{J}              = _mm_set1_ps(charge[inr+{J}]);
 +    /*     #endfor */
 +    /*     #for J in PARTICLES_VDW_J */
 +    vdwjidx{J}A        = 2*vdwtype[inr+{J}];
 +    /*     #endfor */
 +    /*     #for I,J in PAIRS_IJ */
 +    /*         #if 'electrostatics' in INTERACTION_FLAGS[I][J] */
 +    qq{I}{J}             = _mm_mul_ps(iq{I},jq{J});
 +    /*         #endif */
 +    /*         #if 'vdw' in INTERACTION_FLAGS[I][J] */
 +    c6_{I}{J}            = _mm_set1_ps(vdwparam[vdwioffset{I}+vdwjidx{J}A]);
 +    c12_{I}{J}           = _mm_set1_ps(vdwparam[vdwioffset{I}+vdwjidx{J}A+1]);
 +    /*         #endif */
 +    /*     #endfor */
 +    /* #endif */
 +
 +    /* #if KERNEL_MOD_ELEC!='None' or KERNEL_MOD_VDW!='None' */
 +    /*     #if KERNEL_ELEC!='None' */
 +    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
 +    rcutoff_scalar   = fr->rcoulomb;
 +    /*     #else */
 +    rcutoff_scalar   = fr->rvdw;
 +    /*     #endif */
 +    rcutoff          = _mm_set1_ps(rcutoff_scalar);
 +    rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
 +    /* #endif */
 +
 +    /* #if KERNEL_MOD_VDW=='PotentialShift' */
 +    sh_vdw_invrcut6  = _mm_set1_ps(fr->ic->sh_invrc6);
 +    rvdw             = _mm_set1_ps(fr->rvdw);
 +    /* #endif */
 +
 +    /* #if 'PotentialSwitch' in [KERNEL_MOD_ELEC,KERNEL_MOD_VDW] */
 +    /*     #if KERNEL_MOD_ELEC=='PotentialSwitch'  */
 +    rswitch_scalar   = fr->rcoulomb_switch;
 +    rswitch          = _mm_set1_ps(rswitch_scalar);
 +    /*     #else */
 +    rswitch_scalar   = fr->rvdw_switch;
 +    rswitch          = _mm_set1_ps(rswitch_scalar);
 +    /*     #endif */
 +    /* Setup switch parameters */
 +    d_scalar         = rcutoff_scalar-rswitch_scalar;
 +    d                = _mm_set1_ps(d_scalar);
 +    swV3             = _mm_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
 +    swV4             = _mm_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
 +    swV5             = _mm_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
 +    /*     #if 'Force' in KERNEL_VF */
 +    swF2             = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
 +    swF3             = _mm_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
 +    swF4             = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
 +    /*     #endif */
 +    /* #endif */
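 +    /* ## A quick check of the switch coefficients above: sw(d) = 1 + swV3*d^3 + swV4*d^4 + swV5*d^5 */
 +    /* ## satisfies sw=1, sw'=sw''=0 at d=0 and sw=sw'=sw''=0 at d=rcutoff-rswitch, and the force    */
 +    /* ## coefficients are simply its derivative: swF2=3*swV3, swF3=4*swV4, swF4=5*swV5, so that     */
 +    /* ## dsw(d) = d^2*(swF2 + swF3*d + swF4*d^2).                                                   */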
 +
 +    /* Avoid stupid compiler warnings */
 +    jnrA = jnrB = jnrC = jnrD = 0;
 +    j_coord_offsetA = 0;
 +    j_coord_offsetB = 0;
 +    j_coord_offsetC = 0;
 +    j_coord_offsetD = 0;
 +
 +    /* ## Keep track of the floating point operations we issue for reporting! */
 +    /* #define OUTERFLOPS 0 */
 +    outeriter        = 0;
 +    inneriter        = 0;
 +
 +    for(iidx=0;iidx<4*DIM;iidx++)
 +    {
 +        scratch[iidx] = 0.0;
 +    }  
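 +    /* ## The scratch buffer is a safe dump target for the epilogue: force and dvda */
 +    /* ## contributions from padded (dummy) j entries are written here instead of   */
 +    /* ## to real atom data, avoiding conditional stores in the SIMD lanes.         */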
 +
 +    /* Start outer loop over neighborlists */
 +    for(iidx=0; iidx<nri; iidx++)
 +    {
 +        /* Load shift vector for this list */
 +        i_shift_offset   = DIM*shiftidx[iidx];
 +
 +        /* Load limits for loop over neighbors */
 +        j_index_start    = jindex[iidx];
 +        j_index_end      = jindex[iidx+1];
 +
 +        /* Get outer coordinate index */
 +        inr              = iinr[iidx];
 +        i_coord_offset   = DIM*inr;
 +
 +        /* Load i particle coords and add shift vector */
 +        /* #if GEOMETRY_I == 'Particle' */
 +        gmx_mm_load_shift_and_1rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
 +        /* #elif GEOMETRY_I == 'Water3' */
 +        gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
 +                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
 +        /* #elif GEOMETRY_I == 'Water4' */
 +        /*     #if 0 in PARTICLES_I                 */
 +        gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
 +                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
 +        /*     #else                                */
 +        gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
 +                                                 &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
 +        /*     #endif                               */
 +        /* #endif                                   */
 +        
 +        /* #if 'Force' in KERNEL_VF */
 +        /*     #for I in PARTICLES_I */
 +        fix{I}             = _mm_setzero_ps();
 +        fiy{I}             = _mm_setzero_ps();
 +        fiz{I}             = _mm_setzero_ps();
 +        /*     #endfor */
 +        /* #endif */
 +
 +        /* ## For water we already preloaded parameters at the start of the kernel */
 +        /* #if not 'Water' in GEOMETRY_I */
 +        /* Load parameters for i particles */
 +        /*     #for I in PARTICLES_ELEC_I */
 +        iq{I}              = _mm_mul_ps(facel,_mm_load1_ps(charge+inr+{I}));
 +        /*         #define OUTERFLOPS OUTERFLOPS+1 */
 +        /*         #if KERNEL_ELEC=='GeneralizedBorn' */
 +        isai{I}            = _mm_load1_ps(invsqrta+inr+{I});
 +        /*         #endif */
 +        /*     #endfor */
 +        /*     #for I in PARTICLES_VDW_I */
 +        vdwioffset{I}      = 2*nvdwtype*vdwtype[inr+{I}];
 +        /*     #endfor */
 +        /* #endif */
 +
 +        /* #if 'Potential' in KERNEL_VF */
 +        /* Reset potential sums */
 +        /*     #if KERNEL_ELEC != 'None' */
 +        velecsum         = _mm_setzero_ps();
 +        /*     #endif */
 +        /*     #if 'GeneralizedBorn' in KERNEL_ELEC */
 +        vgbsum           = _mm_setzero_ps();
 +        /*     #endif */
 +        /*     #if KERNEL_VDW != 'None' */
 +        vvdwsum          = _mm_setzero_ps();
 +        /*     #endif */
 +        /* #endif */
 +        /*     #if 'GeneralizedBorn' in KERNEL_ELEC and 'Force' in KERNEL_VF */
 +        dvdasum          = _mm_setzero_ps();
 +        /*     #endif */
 +
 +        /* #for ROUND in ['Loop','Epilogue'] */
 +
 +        /* #if ROUND =='Loop' */
 +        /* Start inner kernel loop */
 +        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
 +        {
 +        /* ## First round is normal loop (next statement resets indentation) */
 +        /*     #if 0 */
 +        }
 +        /*     #endif */
 +        /* #else */
 +        if(jidx<j_index_end)
 +        {
 +        /* ## Second round is epilogue */
 +        /* #endif */
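 +        /* ## The body below is thus emitted twice: first as the main loop consuming  */
 +        /* ## full groups of four j entries, then as an 'if' epilogue for the final   */
 +        /* ## partial group, where the neighborlist is padded with negative (dummy)   */
 +        /* ## indices that are masked out of the results.                             */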
 +        /* #define INNERFLOPS 0 */
 +
 +            /* Get j neighbor index, and coordinate index */
 +            /* #if ROUND =='Loop' */
 +            jnrA             = jjnr[jidx];
 +            jnrB             = jjnr[jidx+1];
 +            jnrC             = jjnr[jidx+2];
 +            jnrD             = jjnr[jidx+3];
 +            /* #else */
 +            jnrlistA         = jjnr[jidx];
 +            jnrlistB         = jjnr[jidx+1];
 +            jnrlistC         = jjnr[jidx+2];
 +            jnrlistD         = jjnr[jidx+3];
 +            /* Sign of each element will be negative for non-real atoms.
 +             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
 +             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
 +             */
 +            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 +            jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
 +            jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
 +            jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
 +            jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
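 +            /* ## Worked example (illustrative): if jjnr[jidx..jidx+3] = {12,57,-1,-1}, */
 +            /* ## dummy_mask gets {0x0,0x0,0xFFFFFFFF,0xFFFFFFFF} and jnrC/jnrD are     */
 +            /* ## clamped to 0, so lanes C and D load valid (but unused) data that the  */
 +            /* ## mask clears again via _mm_andnot_ps().                                */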
 +            /* #endif */
 +            j_coord_offsetA  = DIM*jnrA;
 +            j_coord_offsetB  = DIM*jnrB;
 +            j_coord_offsetC  = DIM*jnrC;
 +            j_coord_offsetD  = DIM*jnrD;
 +
 +            /* load j atom coordinates */
 +            /* #if GEOMETRY_J == 'Particle'             */
 +            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
 +                                              x+j_coord_offsetC,x+j_coord_offsetD,
 +                                              &jx0,&jy0,&jz0);
 +            /* #elif GEOMETRY_J == 'Water3'             */
 +            gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
 +                                              x+j_coord_offsetC,x+j_coord_offsetD,
 +                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
 +            /* #elif GEOMETRY_J == 'Water4'             */
 +            /*     #if 0 in PARTICLES_J                 */
 +            gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
 +                                              x+j_coord_offsetC,x+j_coord_offsetD,
 +                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
 +                                              &jy2,&jz2,&jx3,&jy3,&jz3);
 +            /*     #else                                */
 +            gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
 +                                              x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
 +                                              &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
 +            /*     #endif                               */
 +            /* #endif                                   */
 +
 +            /* Calculate displacement vector */
 +            /* #for I,J in PAIRS_IJ */
 +            dx{I}{J}             = _mm_sub_ps(ix{I},jx{J});
 +            dy{I}{J}             = _mm_sub_ps(iy{I},jy{J});
 +            dz{I}{J}             = _mm_sub_ps(iz{I},jz{J});
 +            /*     #define INNERFLOPS INNERFLOPS+3 */
 +            /* #endfor */
 +
 +            /* Calculate squared distance and things based on it */
 +            /* #for I,J in PAIRS_IJ */
 +            rsq{I}{J}            = gmx_mm_calc_rsq_ps(dx{I}{J},dy{I}{J},dz{I}{J});
 +            /*     #define INNERFLOPS INNERFLOPS+5 */
 +            /* #endfor */
 +
 +            /* #for I,J in PAIRS_IJ */
 +            /*     #if 'rinv' in INTERACTION_FLAGS[I][J] */
 +            rinv{I}{J}           = gmx_mm_invsqrt_ps(rsq{I}{J});
 +            /*         #define INNERFLOPS INNERFLOPS+5 */
 +            /*     #endif */
 +            /* #endfor */
 +
 +            /* #for I,J in PAIRS_IJ */
 +            /*     #if 'rinvsq' in INTERACTION_FLAGS[I][J] */
 +            /*         # if 'rinv' not in INTERACTION_FLAGS[I][J] */
 +            rinvsq{I}{J}         = gmx_mm_inv_ps(rsq{I}{J});
 +            /*             #define INNERFLOPS INNERFLOPS+4 */
 +            /*         #else */
 +            rinvsq{I}{J}         = _mm_mul_ps(rinv{I}{J},rinv{I}{J});
 +            /*             #define INNERFLOPS INNERFLOPS+1 */
 +            /*         #endif */
 +            /*     #endif */
 +            /* #endfor */
 +
 +            /* #if not 'Water' in GEOMETRY_J */
 +            /* Load parameters for j particles */
 +            /*     #for J in PARTICLES_ELEC_J */
 +            jq{J}              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+{J},charge+jnrB+{J},
 +                                                              charge+jnrC+{J},charge+jnrD+{J});
 +            /*         #if KERNEL_ELEC=='GeneralizedBorn' */
 +            isaj{J}            = gmx_mm_load_4real_swizzle_ps(invsqrta+jnrA+{J},invsqrta+jnrB+{J},
 +                                                              invsqrta+jnrC+{J},invsqrta+jnrD+{J});
 +            /*         #endif */
 +            /*     #endfor */
 +            /*     #for J in PARTICLES_VDW_J */
 +            vdwjidx{J}A        = 2*vdwtype[jnrA+{J}];
 +            vdwjidx{J}B        = 2*vdwtype[jnrB+{J}];
 +            vdwjidx{J}C        = 2*vdwtype[jnrC+{J}];
 +            vdwjidx{J}D        = 2*vdwtype[jnrD+{J}];
 +            /*     #endfor */
 +            /* #endif */
 +
 +            /* #if 'Force' in KERNEL_VF and not 'Particle' in GEOMETRY_I */
 +            /*     #for J in PARTICLES_J */
 +            fjx{J}             = _mm_setzero_ps();
 +            fjy{J}             = _mm_setzero_ps();
 +            fjz{J}             = _mm_setzero_ps();
 +            /*     #endfor */
 +            /* #endif */
 +
 +            /* #for I,J in PAIRS_IJ */
 +
 +            /**************************
 +             * CALCULATE INTERACTIONS *
 +             **************************/
 +
 +            /*     ## Note special check for TIP4P-TIP4P. Since we are cutting off all hydrogen interactions we also cut the LJ-only O-O interaction */
 +            /*     #if 'exactcutoff' in INTERACTION_FLAGS[I][J] or (GEOMETRY_I=='Water4' and GEOMETRY_J=='Water4' and 'exactcutoff' in INTERACTION_FLAGS[1][1]) */
 +            /*         ## We always calculate rinv/rinvsq above to enable pipelining in compilers (performance tested on x86) */
 +            if (gmx_mm_any_lt(rsq{I}{J},rcutoff2))
 +            {
 +                /*     #if 0    ## this and the next two lines are a hack to maintain auto-indentation in the template file */
 +            }
 +            /*         #endif */
 +            /*         #define INNERFLOPS INNERFLOPS+1 */
 +            /*     #endif */
 +
 +            /*     #if 'r' in INTERACTION_FLAGS[I][J] */
 +            r{I}{J}              = _mm_mul_ps(rsq{I}{J},rinv{I}{J});
 +            /*         #if ROUND == 'Epilogue' */
 +            r{I}{J}              = _mm_andnot_ps(dummy_mask,r{I}{J});
 +            /*             #define INNERFLOPS INNERFLOPS+1 */
 +            /*         #endif */
 +            /*         #define INNERFLOPS INNERFLOPS+1 */
 +            /*     #endif */
 +
 +            /*     ## For water geometries we already loaded parameters at the start of the kernel */
 +            /*     #if not 'Water' in GEOMETRY_J */
 +            /* Compute parameters for interactions between i and j atoms */
 +            /*         #if 'electrostatics' in INTERACTION_FLAGS[I][J] */
 +            qq{I}{J}             = _mm_mul_ps(iq{I},jq{J});
 +            /*             #define INNERFLOPS INNERFLOPS+1 */
 +            /*         #endif */
 +            /*         #if 'vdw' in INTERACTION_FLAGS[I][J] */
 +            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset{I}+vdwjidx{J}A,
 +                                         vdwparam+vdwioffset{I}+vdwjidx{J}B,
 +                                         vdwparam+vdwioffset{I}+vdwjidx{J}C,
 +                                         vdwparam+vdwioffset{I}+vdwjidx{J}D,
 +                                         &c6_{I}{J},&c12_{I}{J});
 +            /*         #endif */
 +            /*     #endif */
 +
 +            /*     #if 'table' in INTERACTION_FLAGS[I][J] */
 +            /* Calculate table index by multiplying r with table scale and truncate to integer */
 +            rt               = _mm_mul_ps(r{I}{J},vftabscale);
 +            vfitab           = _mm_cvttps_epi32(rt);
 +            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
 +            /*         #define INNERFLOPS INNERFLOPS+4                          */
 +            /*         #if 'Table' in KERNEL_ELEC and 'Table' in KERNEL_VDW     */
 +            /*             ## 3 tables, 4 entries per point: multiply index by 12 */
 +            vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
 +            /*         #elif 'Table' in KERNEL_ELEC                             */
 +            /*             ## 1 table, 4 entries per point: multiply index by 4  */
 +            vfitab           = _mm_slli_epi32(vfitab,2);
 +            /*         #elif 'Table' in KERNEL_VDW                              */
 +            /*             ## 2 tables, 4 entries per point: multiply index by 8 */
 +            vfitab           = _mm_slli_epi32(vfitab,3);
 +            /*         #endif                                                   */
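 +            /*         ## SSE2 lacks a packed 32-bit integer multiply (_mm_mullo_epi32  */
 +            /*         ## is SSE4.1), so the strides are formed with shifts and adds,   */
 +            /*         ## e.g. idx*12 == ((idx + (idx<<1)) << 2). The stride counts     */
 +            /*         ## table entries: each point stores a (Y,F,G,H) quadruplet per table. */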
 +            /*     #endif */
 +
 +            /*     ## ELECTROSTATIC INTERACTIONS */
 +            /*     #if 'electrostatics' in INTERACTION_FLAGS[I][J] */
 +
 +            /*         #if KERNEL_ELEC=='Coulomb' */
 +
 +            /* COULOMB ELECTROSTATICS */
 +            velec            = _mm_mul_ps(qq{I}{J},rinv{I}{J});
 +            /*             #define INNERFLOPS INNERFLOPS+1 */
 +            /*             #if 'Force' in KERNEL_VF */
 +            felec            = _mm_mul_ps(velec,rinvsq{I}{J});
 +            /*                 #define INNERFLOPS INNERFLOPS+2 */
 +            /*             #endif */
 +
 +            /*         #elif KERNEL_ELEC=='ReactionField' */
 +
 +            /* REACTION-FIELD ELECTROSTATICS */
 +            /*             #if 'Potential' in KERNEL_VF */
 +            velec            = _mm_mul_ps(qq{I}{J},_mm_sub_ps(_mm_add_ps(rinv{I}{J},_mm_mul_ps(krf,rsq{I}{J})),crf));
 +            /*                 #define INNERFLOPS INNERFLOPS+4 */
 +            /*             #endif */
 +            /*             #if 'Force' in KERNEL_VF */
 +            felec            = _mm_mul_ps(qq{I}{J},_mm_sub_ps(_mm_mul_ps(rinv{I}{J},rinvsq{I}{J}),krf2));
 +            /*                 #define INNERFLOPS INNERFLOPS+3 */
 +            /*             #endif */
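 +            /*             ## In scalar form: velec = qq*(1/r + k_rf*r^2 - c_rf) and    */
 +            /*             ## felec = qq*(1/r^3 - 2*k_rf), i.e. the standard            */
 +            /*             ## reaction-field potential and force-over-r.                */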
 +
 +            /*         #elif KERNEL_ELEC=='GeneralizedBorn' */
 +
 +            /* GENERALIZED BORN AND COULOMB ELECTROSTATICS */
 +            isaprod          = _mm_mul_ps(isai{I},isaj{J});
 +            gbqqfactor       = _mm_xor_ps(signbit,_mm_mul_ps(qq{I}{J},_mm_mul_ps(isaprod,gbinvepsdiff)));
 +            gbscale          = _mm_mul_ps(isaprod,gbtabscale);
 +            /*             #define INNERFLOPS INNERFLOPS+5 */
 +
 +            /* Calculate generalized born table index - this is a separate table from the normal one,
 +             * but we use the same procedure by multiplying r with scale and truncating to integer.
 +             */
 +            rt               = _mm_mul_ps(r{I}{J},gbscale);
 +            gbitab           = _mm_cvttps_epi32(rt);
 +            gbeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(gbitab));
 +            gbitab           = _mm_slli_epi32(gbitab,2);
 +
 +            Y                = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,0) );
 +            F                = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,1) );
 +            G                = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,2) );
 +            H                = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,3) );
 +            _MM_TRANSPOSE4_PS(Y,F,G,H);
 +            Heps             = _mm_mul_ps(gbeps,H);
 +            Fp               = _mm_add_ps(F,_mm_mul_ps(gbeps,_mm_add_ps(G,Heps)));
 +            VV               = _mm_add_ps(Y,_mm_mul_ps(gbeps,Fp));
 +            vgb              = _mm_mul_ps(gbqqfactor,VV);
 +            /*             #define INNERFLOPS INNERFLOPS+10 */
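 +            /*             ## Cubic spline evaluation: each table point stores (Y,F,G,H), */
 +            /*             ## so V(eps) = Y + eps*(F + eps*(G + eps*H)) on the interval.   */
 +            /*             ## Hence Fp = F + eps*(G + Heps), VV = Y + eps*Fp, and the      */
 +            /*             ## derivative FF = dV/deps = Fp + eps*(G + 2*Heps).             */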
 +
 +            /*             #if 'Force' in KERNEL_VF */
 +            FF               = _mm_add_ps(Fp,_mm_mul_ps(gbeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
 +            fgb              = _mm_mul_ps(gbqqfactor,_mm_mul_ps(FF,gbscale));
 +            dvdatmp          = _mm_mul_ps(minushalf,_mm_add_ps(vgb,_mm_mul_ps(fgb,r{I}{J})));
++            /*                 #if ROUND == 'Epilogue' */
++            dvdatmp          = _mm_andnot_ps(dummy_mask,dvdatmp);
++            /*                 #endif */
 +            dvdasum          = _mm_add_ps(dvdasum,dvdatmp);
 +            /*                 #if ROUND == 'Loop' */
 +            fjptrA           = dvda+jnrA;
 +            fjptrB           = dvda+jnrB;
 +            fjptrC           = dvda+jnrC;
 +            fjptrD           = dvda+jnrD;
 +            /*                 #else */
 +            /* The pointers to scratch make sure that this code cannot go wrong even with compilers that take gmx_restrict seriously (e.g. icc 13). */
 +            fjptrA             = (jnrlistA>=0) ? dvda+jnrA : scratch;
 +            fjptrB             = (jnrlistB>=0) ? dvda+jnrB : scratch;
 +            fjptrC             = (jnrlistC>=0) ? dvda+jnrC : scratch;
 +            fjptrD             = (jnrlistD>=0) ? dvda+jnrD : scratch;
 +            /*                 #endif */
 +            gmx_mm_increment_4real_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,_mm_mul_ps(dvdatmp,_mm_mul_ps(isaj{J},isaj{J})));
 +            /*                 #define INNERFLOPS INNERFLOPS+13 */
 +            /*             #endif */
 +            velec            = _mm_mul_ps(qq{I}{J},rinv{I}{J});
 +            /*                 #define INNERFLOPS INNERFLOPS+1 */
 +            /*             #if 'Force' in KERNEL_VF */
 +            felec            = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(velec,rinv{I}{J}),fgb),rinv{I}{J});
 +            /*                 #define INNERFLOPS INNERFLOPS+3 */
 +            /*             #endif */
 +
 +            /*         #elif KERNEL_ELEC=='Ewald' */
 +            /* EWALD ELECTROSTATICS */
 +
 +            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
 +            ewrt             = _mm_mul_ps(r{I}{J},ewtabscale);
 +            ewitab           = _mm_cvttps_epi32(ewrt);
 +            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
 +            /*             #define INNERFLOPS INNERFLOPS+4 */
 +            /*             #if 'Potential' in KERNEL_VF or KERNEL_MOD_ELEC=='PotentialSwitch' */
 +            ewitab           = _mm_slli_epi32(ewitab,2);
 +            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
 +            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
 +            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
 +            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
 +            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
 +            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
 +            /*                 #define INNERFLOPS INNERFLOPS+2 */
 +            /*                 #if KERNEL_MOD_ELEC=='PotentialShift' */
 +            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
 +            velec            = _mm_mul_ps(qq{I}{J},_mm_sub_ps(_mm_sub_ps(rinv{I}{J},sh_ewald),velec));
 +            /*                     #define INNERFLOPS INNERFLOPS+7 */
 +            /*                 #else */
 +            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
 +            velec            = _mm_mul_ps(qq{I}{J},_mm_sub_ps(rinv{I}{J},velec));
 +            /*                     #define INNERFLOPS INNERFLOPS+6 */
 +            /*                 #endif */
 +            /*                 #if 'Force' in KERNEL_VF */
 +            felec            = _mm_mul_ps(_mm_mul_ps(qq{I}{J},rinv{I}{J}),_mm_sub_ps(rinvsq{I}{J},felec));
 +            /*                      #define INNERFLOPS INNERFLOPS+3 */
 +            /*                 #endif */
 +            /*             #elif KERNEL_VF=='Force' */
 +            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
 +                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
 +                                         &ewtabF,&ewtabFn);
 +            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
 +            felec            = _mm_mul_ps(_mm_mul_ps(qq{I}{J},rinv{I}{J}),_mm_sub_ps(rinvsq{I}{J},felec));
 +            /*                 #define INNERFLOPS INNERFLOPS+7 */
 +            /*             #endif */
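 +            /*             ## From the names and loads here, tabq_coul_FDV0 evidently   */
 +            /*             ## packs (F,D,V,0) quadruplets, so one transposed load gives */
 +            /*             ## F_i, the increment D_i to the next point, and V_i.        */
 +            /*             ## Force-only kernels instead interpolate linearly in the    */
 +            /*             ## plain F table: (1-eps)*F_i + eps*F_{i+1}.                  */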
 +
 +            /*         #elif KERNEL_ELEC=='CubicSplineTable' */
 +
 +            /* CUBIC SPLINE TABLE ELECTROSTATICS */
 +            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
 +            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
 +            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
 +            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
 +            _MM_TRANSPOSE4_PS(Y,F,G,H);
 +            Heps             = _mm_mul_ps(vfeps,H);
 +            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
 +            /*             #define INNERFLOPS INNERFLOPS+4 */
 +            /*             #if 'Potential' in KERNEL_VF */
 +            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
 +            velec            = _mm_mul_ps(qq{I}{J},VV);
 +            /*                 #define INNERFLOPS INNERFLOPS+3 */
 +            /*             #endif */
 +            /*             #if 'Force' in KERNEL_VF */
 +            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
 +            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq{I}{J},FF),_mm_mul_ps(vftabscale,rinv{I}{J})));
 +            /*                 #define INNERFLOPS INNERFLOPS+7 */
 +            /*             #endif */
 +            /*         #endif */
 +            /*         ## End of check for electrostatics interaction forms */
 +            /*     #endif */
 +            /*     ## END OF ELECTROSTATIC INTERACTION CHECK FOR PAIR I-J */
 +
 +            /*     #if 'vdw' in INTERACTION_FLAGS[I][J] */
 +
 +            /*         #if KERNEL_VDW=='LennardJones' */
 +
 +            /* LENNARD-JONES DISPERSION/REPULSION */
 +
 +            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq{I}{J},rinvsq{I}{J}),rinvsq{I}{J});
 +            /*             #define INNERFLOPS INNERFLOPS+2 */
 +            /*             #if 'Potential' in KERNEL_VF or KERNEL_MOD_VDW=='PotentialSwitch' */
 +            vvdw6            = _mm_mul_ps(c6_{I}{J},rinvsix);
 +            vvdw12           = _mm_mul_ps(c12_{I}{J},_mm_mul_ps(rinvsix,rinvsix));
 +            /*                 #define INNERFLOPS INNERFLOPS+3 */
 +            /*                 #if KERNEL_MOD_VDW=='PotentialShift' */
 +            vvdw             = _mm_sub_ps(_mm_mul_ps( _mm_sub_ps(vvdw12 , _mm_mul_ps(c12_{I}{J},_mm_mul_ps(sh_vdw_invrcut6,sh_vdw_invrcut6))), one_twelfth) ,
 +                                          _mm_mul_ps( _mm_sub_ps(vvdw6,_mm_mul_ps(c6_{I}{J},sh_vdw_invrcut6)),one_sixth));
 +            /*                     #define INNERFLOPS INNERFLOPS+8 */
 +            /*                 #else */
 +            vvdw             = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
 +            /*                     #define INNERFLOPS INNERFLOPS+3 */
 +            /*                 #endif */
 +            /*                 ## Check for force inside potential check, i.e. this means we already did the potential part */
 +            /*                 #if 'Force' in KERNEL_VF */
 +            fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq{I}{J});
 +            /*                     #define INNERFLOPS INNERFLOPS+2 */
 +            /*                 #endif */
 +            /*             #elif KERNEL_VF=='Force' */
 +            /*                 ## Force-only LennardJones makes it possible to save 1 flop (they do add up...) */
 +            fvdw             = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_{I}{J},rinvsix),c6_{I}{J}),_mm_mul_ps(rinvsix,rinvsq{I}{J}));
 +            /*                 #define INNERFLOPS INNERFLOPS+4 */
 +            /*             #endif */
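 +            /*             ## Note: fvdw = (vvdw12 - vvdw6)*rinvsq only gives the true  */
 +            /*             ## LJ force if the stored parameters are premultiplied,      */
 +            /*             ## i.e. vdwparam (fr->nbfp) holds 6*C6 and 12*C12; the       */
 +            /*             ## one_sixth/one_twelfth factors above then recover the      */
 +            /*             ## plain potential.                                          */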
 +
 +            /*         #elif KERNEL_VDW=='CubicSplineTable' */
 +
 +            /* CUBIC SPLINE TABLE DISPERSION */
 +            /*             #if 'Table' in KERNEL_ELEC */
 +            vfitab           = _mm_add_epi32(vfitab,ifour);
 +            /*             #endif                     */
 +            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
 +            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
 +            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
 +            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
 +            _MM_TRANSPOSE4_PS(Y,F,G,H);
 +            Heps             = _mm_mul_ps(vfeps,H);
 +            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
 +            /*             #define INNERFLOPS INNERFLOPS+4 */
 +            /*             #if 'Potential' in KERNEL_VF */
 +            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
 +            vvdw6            = _mm_mul_ps(c6_{I}{J},VV);
 +            /*                 #define INNERFLOPS INNERFLOPS+3 */
 +            /*             #endif */
 +            /*             #if 'Force' in KERNEL_VF */
 +            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
 +            fvdw6            = _mm_mul_ps(c6_{I}{J},FF);
 +            /*                 #define INNERFLOPS INNERFLOPS+4 */
 +            /*             #endif */
 +
 +            /* CUBIC SPLINE TABLE REPULSION */
 +            vfitab           = _mm_add_epi32(vfitab,ifour);
 +            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
 +            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
 +            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
 +            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
 +            _MM_TRANSPOSE4_PS(Y,F,G,H);
 +            Heps             = _mm_mul_ps(vfeps,H);
 +            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
 +            /*             #define INNERFLOPS INNERFLOPS+4 */
 +            /*             #if 'Potential' in KERNEL_VF */
 +            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
 +            vvdw12           = _mm_mul_ps(c12_{I}{J},VV);
 +            /*                 #define INNERFLOPS INNERFLOPS+3 */
 +            /*             #endif */
 +            /*             #if 'Force' in KERNEL_VF */
 +            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
 +            fvdw12           = _mm_mul_ps(c12_{I}{J},FF);
 +            /*                 #define INNERFLOPS INNERFLOPS+5 */
 +            /*             #endif */
 +            /*             #if 'Potential' in KERNEL_VF */
 +            vvdw             = _mm_add_ps(vvdw12,vvdw6);
 +            /*                 #define INNERFLOPS INNERFLOPS+1 */
 +            /*             #endif */
 +            /*             #if 'Force' in KERNEL_VF */
 +            fvdw             = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv{I}{J})));
 +            /*                 #define INNERFLOPS INNERFLOPS+4 */
 +            /*             #endif */
 +            /*         #endif */
 +            /*         ## End of check for vdw interaction forms */
 +            /*     #endif */
 +            /*     ## END OF VDW INTERACTION CHECK FOR PAIR I-J */
 +
 +            /*     #if 'switch' in INTERACTION_FLAGS[I][J] */
 +            d                = _mm_sub_ps(r{I}{J},rswitch);
 +            d                = _mm_max_ps(d,_mm_setzero_ps());
 +            d2               = _mm_mul_ps(d,d);
 +            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
 +            /*         #define INNERFLOPS INNERFLOPS+10 */
 +
 +            /*         #if 'Force' in KERNEL_VF */
 +            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
 +            /*             #define INNERFLOPS INNERFLOPS+5 */
 +            /*         #endif */
 +
 +            /* Evaluate switch function */
 +            /*         #if 'Force' in KERNEL_VF */
 +            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
 +            /*             #if 'electrostatics' in INTERACTION_FLAGS[I][J] and KERNEL_MOD_ELEC=='PotentialSwitch' */
 +            felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv{I}{J},_mm_mul_ps(velec,dsw)) );
 +            /*                 #define INNERFLOPS INNERFLOPS+4 */
 +            /*             #endif */
 +            /*             #if 'vdw' in INTERACTION_FLAGS[I][J] and KERNEL_MOD_VDW=='PotentialSwitch' */
 +            fvdw             = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv{I}{J},_mm_mul_ps(vvdw,dsw)) );
 +            /*                 #define INNERFLOPS INNERFLOPS+4 */
 +            /*             #endif */
 +            /*         #endif */
 +            /*         #if 'Potential' in KERNEL_VF */
 +            /*             #if 'electrostatics' in INTERACTION_FLAGS[I][J] and KERNEL_MOD_ELEC=='PotentialSwitch' */
 +            velec            = _mm_mul_ps(velec,sw);
 +            /*                 #define INNERFLOPS INNERFLOPS+1 */
 +            /*             #endif */
 +            /*             #if 'vdw' in INTERACTION_FLAGS[I][J] and KERNEL_MOD_VDW=='PotentialSwitch' */
 +            vvdw             = _mm_mul_ps(vvdw,sw);
 +            /*                 #define INNERFLOPS INNERFLOPS+1 */
 +            /*             #endif */
 +            /*         #endif */
 +            /*     #endif */
 +            /*     ## Note special check for TIP4P-TIP4P. Since we are cutting off all hydrogen interactions we also cut the LJ-only O-O interaction */
 +            /*     #if 'exactcutoff' in INTERACTION_FLAGS[I][J] or (GEOMETRY_I=='Water4' and GEOMETRY_J=='Water4' and 'exactcutoff' in INTERACTION_FLAGS[1][1]) */
 +            cutoff_mask      = _mm_cmplt_ps(rsq{I}{J},rcutoff2);
 +            /*         #define INNERFLOPS INNERFLOPS+1 */
 +            /*     #endif */
 +
 +            /*     #if 'Potential' in KERNEL_VF */
 +            /* Update potential sum for this i atom from the interaction with this j atom. */
 +            /*         #if 'electrostatics' in INTERACTION_FLAGS[I][J] */
 +            /*             #if 'exactcutoff' in INTERACTION_FLAGS[I][J] */
 +            velec            = _mm_and_ps(velec,cutoff_mask);
 +            /*                 #define INNERFLOPS INNERFLOPS+1 */
 +            /*             #endif                                       */
 +            /*             #if ROUND == 'Epilogue' */
 +            velec            = _mm_andnot_ps(dummy_mask,velec);
 +            /*             #endif */
 +            velecsum         = _mm_add_ps(velecsum,velec);
 +            /*             #define INNERFLOPS INNERFLOPS+1 */
 +            /*             #if KERNEL_ELEC=='GeneralizedBorn' */
 +            /*             #if 'exactcutoff' in INTERACTION_FLAGS[I][J] */
 +            vgb              = _mm_and_ps(vgb,cutoff_mask);
 +            /*                 #define INNERFLOPS INNERFLOPS+1 */
 +            /*             #endif                                       */
 +            /*             #if ROUND == 'Epilogue' */
 +            vgb              = _mm_andnot_ps(dummy_mask,vgb);
 +            /*             #endif */
 +            vgbsum           = _mm_add_ps(vgbsum,vgb);
 +            /*                 #define INNERFLOPS INNERFLOPS+1 */
 +            /*             #endif */
 +            /*         #endif */
 +            /*         #if 'vdw' in INTERACTION_FLAGS[I][J] */
 +            /*     ## Note special check for TIP4P-TIP4P. Since we are cutting off all hydrogen interactions we also cut the LJ-only O-O interaction */
 +            /*     #if 'exactcutoff' in INTERACTION_FLAGS[I][J] or (GEOMETRY_I=='Water4' and GEOMETRY_J=='Water4' and 'exactcutoff' in INTERACTION_FLAGS[1][1]) */
 +            vvdw             = _mm_and_ps(vvdw,cutoff_mask);
 +            /*                 #define INNERFLOPS INNERFLOPS+1 */
 +            /*             #endif                                       */
 +            /*             #if ROUND == 'Epilogue' */
 +            vvdw             = _mm_andnot_ps(dummy_mask,vvdw);
 +            /*             #endif */
 +            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
 +            /*             #define INNERFLOPS INNERFLOPS+1 */
 +            /*         #endif */
 +            /*     #endif */
 +
 +            /*     #if 'Force' in KERNEL_VF */
 +
 +            /*         #if 'electrostatics' in INTERACTION_FLAGS[I][J] and 'vdw' in INTERACTION_FLAGS[I][J] */
 +            fscal            = _mm_add_ps(felec,fvdw);
 +            /*             #define INNERFLOPS INNERFLOPS+1 */
 +            /*         #elif 'electrostatics' in INTERACTION_FLAGS[I][J] */
 +            fscal            = felec;
 +            /*         #elif 'vdw' in INTERACTION_FLAGS[I][J] */
 +            fscal            = fvdw;
 +            /*        #endif */
 +
 +            /*        ## Note special check for TIP4P-TIP4P. Since we are cutting off all hydrogen interactions we also cut the LJ-only O-O interaction */
 +            /*        #if 'exactcutoff' in INTERACTION_FLAGS[I][J] or (GEOMETRY_I=='Water4' and GEOMETRY_J=='Water4' and 'exactcutoff' in INTERACTION_FLAGS[1][1]) */
 +            fscal            = _mm_and_ps(fscal,cutoff_mask);
 +            /*             #define INNERFLOPS INNERFLOPS+1 */
 +            /*         #endif                                       */
 +
 +            /*             #if ROUND == 'Epilogue' */
 +            fscal            = _mm_andnot_ps(dummy_mask,fscal);
 +            /*             #endif */
 +
 +            /* Calculate temporary vectorial force */
 +            tx               = _mm_mul_ps(fscal,dx{I}{J});
 +            ty               = _mm_mul_ps(fscal,dy{I}{J});
 +            tz               = _mm_mul_ps(fscal,dz{I}{J});
 +
 +            /* Update vectorial force */
 +            fix{I}             = _mm_add_ps(fix{I},tx);
 +            fiy{I}             = _mm_add_ps(fiy{I},ty);
 +            fiz{I}             = _mm_add_ps(fiz{I},tz);
 +            /*             #define INNERFLOPS INNERFLOPS+6 */
 +
 +            /* #if GEOMETRY_I == 'Particle'             */
 +            /*     #if ROUND == 'Loop' */
 +            fjptrA             = f+j_coord_offsetA;
 +            fjptrB             = f+j_coord_offsetB;
 +            fjptrC             = f+j_coord_offsetC;
 +            fjptrD             = f+j_coord_offsetD;
 +            /*     #else */
 +            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
 +            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
 +            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
 +            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
 +            /*     #endif */
 +            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
 +            /*     #define INNERFLOPS INNERFLOPS+3      */
 +            /* #else                                    */
 +            fjx{J}             = _mm_add_ps(fjx{J},tx);
 +            fjy{J}             = _mm_add_ps(fjy{J},ty);
 +            fjz{J}             = _mm_add_ps(fjz{J},tz);
 +            /*     #define INNERFLOPS INNERFLOPS+3      */
 +            /* #endif                                   */
 +            
 +            /*     #endif */
 +
 +            /*     ## Note special check for TIP4P-TIP4P. Since we are cutting off all hydrogen interactions we also cut the LJ-only O-O interaction */
 +            /*     #if 'exactcutoff' in INTERACTION_FLAGS[I][J] or (GEOMETRY_I=='Water4' and GEOMETRY_J=='Water4' and 'exactcutoff' in INTERACTION_FLAGS[1][1]) */
 +            /*         #if 0    ## This and the next two lines are a hack to maintain indentation in the template file */
 +            {
 +                /*     #endif */
 +            }
 +            /*     #endif */
 +            /*    ## End of check for the interaction being outside the cutoff */
 +
 +            /* #endfor */
 +            /* ## End of loop over i-j interaction pairs */
 +
 +            /* #if GEOMETRY_I != 'Particle' */
 +            /*     #if ROUND == 'Loop' */
 +            fjptrA             = f+j_coord_offsetA;
 +            fjptrB             = f+j_coord_offsetB;
 +            fjptrC             = f+j_coord_offsetC;
 +            fjptrD             = f+j_coord_offsetD;
 +            /*     #else */
 +            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
 +            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
 +            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
 +            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
 +            /*     #endif */
 +            /* #endif */
 +
 +            /* #if 'Water' in GEOMETRY_I and GEOMETRY_J == 'Particle' */
 +            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 +            /* #elif GEOMETRY_J == 'Water3'             */
 +            gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
 +                                                   fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
 +            /*     #define INNERFLOPS INNERFLOPS+9      */
 +            /* #elif GEOMETRY_J == 'Water4'             */
 +            /*     #if 0 in PARTICLES_J                 */
 +            gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
 +                                                   fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
 +                                                   fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
 +            /*     #define INNERFLOPS INNERFLOPS+12     */
 +            /*     #else                                */
 +            gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA+DIM,fjptrB+DIM,fjptrC+DIM,fjptrD+DIM,
 +                                                   fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
 +            /*     #define INNERFLOPS INNERFLOPS+9      */
 +            /*     #endif                               */
 +            /* #endif                                   */
 +
 +            /* Inner loop uses {INNERFLOPS} flops */
 +        }
 +
 +        /* #endfor */
 +
 +        /* End of innermost loop */
 +
 +        /* #if 'Force' in KERNEL_VF */
 +        /*     #if GEOMETRY_I == 'Particle'            */
 +        gmx_mm_update_iforce_1atom_swizzle_ps(fix0,fiy0,fiz0,
 +                                              f+i_coord_offset,fshift+i_shift_offset);
 +        /*         #define OUTERFLOPS OUTERFLOPS+6     */
 +        /*     #elif GEOMETRY_I == 'Water3'            */
 +        gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
 +                                              f+i_coord_offset,fshift+i_shift_offset);
 +        /*         #define OUTERFLOPS OUTERFLOPS+18    */
 +        /*     #elif GEOMETRY_I == 'Water4'            */
 +        /*         #if 0 in PARTICLES_I                */
 +        gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
 +                                              f+i_coord_offset,fshift+i_shift_offset);
 +        /*             #define OUTERFLOPS OUTERFLOPS+24    */
 +        /*         #else                               */
 +        gmx_mm_update_iforce_3atom_swizzle_ps(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
 +                                              f+i_coord_offset+DIM,fshift+i_shift_offset);
 +        /*             #define OUTERFLOPS OUTERFLOPS+18    */
 +        /*         #endif                              */
 +        /*     #endif                                  */
 +        /* #endif                                      */
 +
 +        /* #if 'Potential' in KERNEL_VF */
 +        ggid                        = gid[iidx];
 +        /* Update potential energies */
 +        /*     #if KERNEL_ELEC != 'None' */
 +        gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
 +        /*         #define OUTERFLOPS OUTERFLOPS+1 */
 +        /*     #endif */
 +        /*     #if 'GeneralizedBorn' in KERNEL_ELEC */
 +        gmx_mm_update_1pot_ps(vgbsum,kernel_data->energygrp_polarization+ggid);
 +        /*         #define OUTERFLOPS OUTERFLOPS+1 */
 +        /*     #endif */
 +        /*     #if KERNEL_VDW != 'None' */
 +        gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
 +        /*         #define OUTERFLOPS OUTERFLOPS+1 */
 +        /*     #endif */
 +        /* #endif */
 +        /*     #if 'GeneralizedBorn' in KERNEL_ELEC and 'Force' in KERNEL_VF */
 +        dvdasum = _mm_mul_ps(dvdasum, _mm_mul_ps(isai{I},isai{I}));
 +        gmx_mm_update_1pot_ps(dvdasum,dvda+inr);
 +        /*     #endif */
 +
 +        /* Increment number of inner iterations */
 +        inneriter                  += j_index_end - j_index_start;
 +
 +        /* Outer loop uses {OUTERFLOPS} flops */
 +    }
 +
 +    /* Increment number of outer iterations */
 +    outeriter        += nri;
 +
 +    /* Update outer/inner flops */
 +    /* ## NB: This is not important, it just affects the flopcount. However, since our preprocessor is */
 +    /* ## primitive and replaces aggressively even in strings inside these directives, we need to      */
 +    /* ## assemble the main part of the name (containing KERNEL/ELEC/VDW) directly in the source.      */
 +    /* #if GEOMETRY_I == 'Water3'            */
 +    /*     #define ISUFFIX '_W3'             */
 +    /* #elif GEOMETRY_I == 'Water4'          */
 +    /*     #define ISUFFIX '_W4'             */
 +    /* #else                                 */
 +    /*     #define ISUFFIX ''                */
 +    /* #endif                                */
 +    /* #if GEOMETRY_J == 'Water3'            */
 +    /*     #define JSUFFIX 'W3'              */
 +    /* #elif GEOMETRY_J == 'Water4'          */
 +    /*     #define JSUFFIX 'W4'              */
 +    /* #else                                 */
 +    /*     #define JSUFFIX ''                */
 +    /* #endif                                */
 +    /* #if 'PotentialAndForce' in KERNEL_VF  */
 +    /*     #define VFSUFFIX  '_VF'           */
 +    /* #elif 'Potential' in KERNEL_VF        */
 +    /*     #define VFSUFFIX '_V'             */
 +    /* #else                                 */
 +    /*     #define VFSUFFIX '_F'             */
 +    /* #endif                                */
 +
 +    /* #if KERNEL_ELEC != 'None' and KERNEL_VDW != 'None' */
 +    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW{ISUFFIX}{JSUFFIX}{VFSUFFIX},outeriter*{OUTERFLOPS} + inneriter*{INNERFLOPS});
 +    /* #elif KERNEL_ELEC != 'None' */
 +    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC{ISUFFIX}{JSUFFIX}{VFSUFFIX},outeriter*{OUTERFLOPS} + inneriter*{INNERFLOPS});
 +    /* #else */
 +    inc_nrnb(nrnb,eNR_NBKERNEL_VDW{ISUFFIX}{JSUFFIX}{VFSUFFIX},outeriter*{OUTERFLOPS} + inneriter*{INNERFLOPS});
 +    /* #endif  */
 +}
index c3e898e2a9222f838c60556a2bfdfa5740db07fb,0000000000000000000000000000000000000000..ac0dee2a255ec4292787efe72ebf5c5f0ba99452
mode 100644,000000..100644
--- /dev/null
@@@ -1,823 -1,0 +1,825 @@@
 +/*
 + * Note: this file was generated by the Gromacs sse4_1_double kernel generator.
 + *
 + *                This source code is part of
 + *
 + *                 G   R   O   M   A   C   S
 + *
 + * Copyright (c) 2001-2012, The GROMACS Development Team
 + *
 + * Gromacs is a library for molecular simulation and trajectory analysis,
 + * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
 + * a full list of developers and information, check out http://www.gromacs.org
 + *
 + * This program is free software; you can redistribute it and/or modify it under
 + * the terms of the GNU Lesser General Public License as published by the Free
 + * Software Foundation; either version 2 of the License, or (at your option) any
 + * later version.
 + *
 + * To help fund GROMACS development, we humbly ask that you cite
 + * the papers people have written on it - you can find them on the website.
 + */
 +#ifdef HAVE_CONFIG_H
 +#include <config.h>
 +#endif
 +
 +#include <math.h>
 +
 +#include "../nb_kernel.h"
 +#include "types/simple.h"
 +#include "vec.h"
 +#include "nrnb.h"
 +
 +#include "gmx_math_x86_sse4_1_double.h"
 +#include "kernelutil_x86_sse4_1_double.h"
 +
 +/*
 + * Gromacs nonbonded kernel:   nb_kernel_ElecGB_VdwCSTab_GeomP1P1_VF_sse4_1_double
 + * Electrostatics interaction: GeneralizedBorn
 + * VdW interaction:            CubicSplineTable
 + * Geometry:                   Particle-Particle
 + * Calculate force/pot:        PotentialAndForce
 + */
 +void
 +nb_kernel_ElecGB_VdwCSTab_GeomP1P1_VF_sse4_1_double
 +                    (t_nblist * gmx_restrict                nlist,
 +                     rvec * gmx_restrict                    xx,
 +                     rvec * gmx_restrict                    ff,
 +                     t_forcerec * gmx_restrict              fr,
 +                     t_mdatoms * gmx_restrict               mdatoms,
 +                     nb_kernel_data_t * gmx_restrict        kernel_data,
 +                     t_nrnb * gmx_restrict                  nrnb)
 +{
 +    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
 +     * just 0 for non-waters.
 +     * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
 +     * jnr indices corresponding to data put in the two positions in the SIMD register.
 +     */
 +    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
 +    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
 +    int              jnrA,jnrB;
 +    int              j_coord_offsetA,j_coord_offsetB;
 +    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
 +    real             rcutoff_scalar;
 +    real             *shiftvec,*fshift,*x,*f;
 +    __m128d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
 +    int              vdwioffset0;
 +    __m128d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
 +    int              vdwjidx0A,vdwjidx0B;
 +    __m128d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
 +    __m128d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
 +    __m128d          velec,felec,velecsum,facel,crf,krf,krf2;
 +    real             *charge;
 +    __m128i          gbitab;
 +    __m128d          vgb,fgb,vgbsum,dvdasum,gbscale,gbtabscale,isaprod,gbqqfactor,gbinvepsdiff,dvdaj,gbeps,dvdatmp;
 +    __m128d          minushalf = _mm_set1_pd(-0.5);
 +    real             *invsqrta,*dvda,*gbtab;
 +    int              nvdwtype;
 +    __m128d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
 +    int              *vdwtype;
 +    real             *vdwparam;
 +    __m128d          one_sixth   = _mm_set1_pd(1.0/6.0);
 +    __m128d          one_twelfth = _mm_set1_pd(1.0/12.0);
 +    __m128i          vfitab;
 +    __m128i          ifour       = _mm_set1_epi32(4);
 +    __m128d          rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
 +    real             *vftab;
 +    __m128d          dummy_mask,cutoff_mask;
 +    __m128d          signbit   = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
 +    __m128d          one     = _mm_set1_pd(1.0);
 +    __m128d          two     = _mm_set1_pd(2.0);
 +    x                = xx[0];
 +    f                = ff[0];
 +
 +    nri              = nlist->nri;
 +    iinr             = nlist->iinr;
 +    jindex           = nlist->jindex;
 +    jjnr             = nlist->jjnr;
 +    shiftidx         = nlist->shift;
 +    gid              = nlist->gid;
 +    shiftvec         = fr->shift_vec[0];
 +    fshift           = fr->fshift[0];
 +    facel            = _mm_set1_pd(fr->epsfac);
 +    charge           = mdatoms->chargeA;
 +    nvdwtype         = fr->ntype;
 +    vdwparam         = fr->nbfp;
 +    vdwtype          = mdatoms->typeA;
 +
 +    vftab            = kernel_data->table_vdw->data;
 +    vftabscale       = _mm_set1_pd(kernel_data->table_vdw->scale);
 +
 +    invsqrta         = fr->invsqrta;
 +    dvda             = fr->dvda;
 +    gbtabscale       = _mm_set1_pd(fr->gbtab.scale);
 +    gbtab            = fr->gbtab.data;
 +    gbinvepsdiff     = _mm_set1_pd((1.0/fr->epsilon_r) - (1.0/fr->gb_epsilon_solvent));
 +
 +    /* Avoid stupid compiler warnings */
 +    jnrA = jnrB = 0;
 +    j_coord_offsetA = 0;
 +    j_coord_offsetB = 0;
 +
 +    outeriter        = 0;
 +    inneriter        = 0;
 +
 +    /* Start outer loop over neighborlists */
 +    for(iidx=0; iidx<nri; iidx++)
 +    {
 +        /* Load shift vector for this list */
 +        i_shift_offset   = DIM*shiftidx[iidx];
 +
 +        /* Load limits for loop over neighbors */
 +        j_index_start    = jindex[iidx];
 +        j_index_end      = jindex[iidx+1];
 +
 +        /* Get outer coordinate index */
 +        inr              = iinr[iidx];
 +        i_coord_offset   = DIM*inr;
 +
 +        /* Load i particle coords and add shift vector */
 +        gmx_mm_load_shift_and_1rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
 +
 +        fix0             = _mm_setzero_pd();
 +        fiy0             = _mm_setzero_pd();
 +        fiz0             = _mm_setzero_pd();
 +
 +        /* Load parameters for i particles */
 +        iq0              = _mm_mul_pd(facel,_mm_load1_pd(charge+inr+0));
 +        isai0            = _mm_load1_pd(invsqrta+inr+0);
 +        vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
 +
 +        /* Reset potential sums */
 +        velecsum         = _mm_setzero_pd();
 +        vgbsum           = _mm_setzero_pd();
 +        vvdwsum          = _mm_setzero_pd();
 +        dvdasum          = _mm_setzero_pd();
 +
 +        /* Start inner kernel loop */
 +        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
 +        {
 +
 +            /* Get j neighbor index, and coordinate index */
 +            jnrA             = jjnr[jidx];
 +            jnrB             = jjnr[jidx+1];
 +            j_coord_offsetA  = DIM*jnrA;
 +            j_coord_offsetB  = DIM*jnrB;
 +
 +            /* load j atom coordinates */
 +            gmx_mm_load_1rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
 +                                              &jx0,&jy0,&jz0);
 +
 +            /* Calculate displacement vector */
 +            dx00             = _mm_sub_pd(ix0,jx0);
 +            dy00             = _mm_sub_pd(iy0,jy0);
 +            dz00             = _mm_sub_pd(iz0,jz0);
 +
 +            /* Calculate squared distance and things based on it */
 +            rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
 +
 +            rinv00           = gmx_mm_invsqrt_pd(rsq00);
 +
 +            /* Load parameters for j particles */
 +            jq0              = gmx_mm_load_2real_swizzle_pd(charge+jnrA+0,charge+jnrB+0);
 +            isaj0            = gmx_mm_load_2real_swizzle_pd(invsqrta+jnrA+0,invsqrta+jnrB+0);
 +            vdwjidx0A        = 2*vdwtype[jnrA+0];
 +            vdwjidx0B        = 2*vdwtype[jnrB+0];
 +
 +            /**************************
 +             * CALCULATE INTERACTIONS *
 +             **************************/
 +
 +            r00              = _mm_mul_pd(rsq00,rinv00);
 +
 +            /* Compute parameters for interactions between i and j atoms */
 +            qq00             = _mm_mul_pd(iq0,jq0);
 +            gmx_mm_load_2pair_swizzle_pd(vdwparam+vdwioffset0+vdwjidx0A,
 +                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
 +
 +            /* Calculate table index by multiplying r with table scale and truncate to integer */
 +            rt               = _mm_mul_pd(r00,vftabscale);
 +            vfitab           = _mm_cvttpd_epi32(rt);
 +            vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
 +            vfitab           = _mm_slli_epi32(vfitab,3);
 +
 +            /* GENERALIZED BORN AND COULOMB ELECTROSTATICS */
 +            isaprod          = _mm_mul_pd(isai0,isaj0);
 +            gbqqfactor       = _mm_xor_pd(signbit,_mm_mul_pd(qq00,_mm_mul_pd(isaprod,gbinvepsdiff)));
 +            gbscale          = _mm_mul_pd(isaprod,gbtabscale);
 +
 +            /* Calculate generalized born table index - this is a separate table from the normal one,
 +             * but we use the same procedure by multiplying r with scale and truncating to integer.
 +             */
 +            rt               = _mm_mul_pd(r00,gbscale);
 +            gbitab           = _mm_cvttpd_epi32(rt);
 +            gbeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
 +            gbitab           = _mm_slli_epi32(gbitab,2);
 +
 +            Y                = _mm_load_pd( gbtab + gmx_mm_extract_epi32(gbitab,0) );
 +            F                = _mm_load_pd( gbtab + gmx_mm_extract_epi32(gbitab,1) );
 +            GMX_MM_TRANSPOSE2_PD(Y,F);
 +            G                = _mm_load_pd( gbtab + gmx_mm_extract_epi32(gbitab,0) +2);
 +            H                = _mm_load_pd( gbtab + gmx_mm_extract_epi32(gbitab,1) +2);
 +            GMX_MM_TRANSPOSE2_PD(G,H);
 +            Heps             = _mm_mul_pd(gbeps,H);
 +            Fp               = _mm_add_pd(F,_mm_mul_pd(gbeps,_mm_add_pd(G,Heps)));
 +            VV               = _mm_add_pd(Y,_mm_mul_pd(gbeps,Fp));
 +            vgb              = _mm_mul_pd(gbqqfactor,VV);
 +
 +            FF               = _mm_add_pd(Fp,_mm_mul_pd(gbeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
 +            fgb              = _mm_mul_pd(gbqqfactor,_mm_mul_pd(FF,gbscale));
 +            dvdatmp          = _mm_mul_pd(minushalf,_mm_add_pd(vgb,_mm_mul_pd(fgb,r00)));
 +            dvdasum          = _mm_add_pd(dvdasum,dvdatmp);
 +            gmx_mm_increment_2real_swizzle_pd(dvda+jnrA,dvda+jnrB,_mm_mul_pd(dvdatmp,_mm_mul_pd(isaj0,isaj0)));
 +            velec            = _mm_mul_pd(qq00,rinv00);
 +            felec            = _mm_mul_pd(_mm_sub_pd(_mm_mul_pd(velec,rinv00),fgb),rinv00);
 +
 +            /* CUBIC SPLINE TABLE DISPERSION */
 +            Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
 +            F                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
 +            GMX_MM_TRANSPOSE2_PD(Y,F);
 +            G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
 +            H                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
 +            GMX_MM_TRANSPOSE2_PD(G,H);
 +            Heps             = _mm_mul_pd(vfeps,H);
 +            Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
 +            VV               = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
 +            vvdw6            = _mm_mul_pd(c6_00,VV);
 +            FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
 +            fvdw6            = _mm_mul_pd(c6_00,FF);
 +
 +            /* CUBIC SPLINE TABLE REPULSION */
 +            vfitab           = _mm_add_epi32(vfitab,ifour);
 +            Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
 +            F                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
 +            GMX_MM_TRANSPOSE2_PD(Y,F);
 +            G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
 +            H                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
 +            GMX_MM_TRANSPOSE2_PD(G,H);
 +            Heps             = _mm_mul_pd(vfeps,H);
 +            Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
 +            VV               = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
 +            vvdw12           = _mm_mul_pd(c12_00,VV);
 +            FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
 +            fvdw12           = _mm_mul_pd(c12_00,FF);
 +            vvdw             = _mm_add_pd(vvdw12,vvdw6);
 +            fvdw             = _mm_xor_pd(signbit,_mm_mul_pd(_mm_add_pd(fvdw6,fvdw12),_mm_mul_pd(vftabscale,rinv00)));
 +
 +            /* Update potential sum for this i atom from the interaction with this j atom. */
 +            velecsum         = _mm_add_pd(velecsum,velec);
 +            vgbsum           = _mm_add_pd(vgbsum,vgb);
 +            vvdwsum          = _mm_add_pd(vvdwsum,vvdw);
 +
 +            fscal            = _mm_add_pd(felec,fvdw);
 +
 +            /* Calculate temporary vectorial force */
 +            tx               = _mm_mul_pd(fscal,dx00);
 +            ty               = _mm_mul_pd(fscal,dy00);
 +            tz               = _mm_mul_pd(fscal,dz00);
 +
 +            /* Update vectorial force */
 +            fix0             = _mm_add_pd(fix0,tx);
 +            fiy0             = _mm_add_pd(fiy0,ty);
 +            fiz0             = _mm_add_pd(fiz0,tz);
 +
 +            gmx_mm_decrement_1rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,tx,ty,tz);
 +
 +            /* Inner loop uses 92 flops */
 +        }
 +
 +        if(jidx<j_index_end)
 +        {
 +
 +            jnrA             = jjnr[jidx];
 +            j_coord_offsetA  = DIM*jnrA;
 +
 +            /* load j atom coordinates */
 +            gmx_mm_load_1rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
 +                                              &jx0,&jy0,&jz0);
 +
 +            /* Calculate displacement vector */
 +            dx00             = _mm_sub_pd(ix0,jx0);
 +            dy00             = _mm_sub_pd(iy0,jy0);
 +            dz00             = _mm_sub_pd(iz0,jz0);
 +
 +            /* Calculate squared distance and things based on it */
 +            rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
 +
 +            rinv00           = gmx_mm_invsqrt_pd(rsq00);
 +
 +            /* Load parameters for j particles */
 +            jq0              = _mm_load_sd(charge+jnrA+0);
 +            isaj0            = _mm_load_sd(invsqrta+jnrA+0);
 +            vdwjidx0A        = 2*vdwtype[jnrA+0];
 +
 +            /**************************
 +             * CALCULATE INTERACTIONS *
 +             **************************/
 +
 +            r00              = _mm_mul_pd(rsq00,rinv00);
 +
 +            /* Compute parameters for interactions between i and j atoms */
 +            qq00             = _mm_mul_pd(iq0,jq0);
 +            gmx_mm_load_1pair_swizzle_pd(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
 +
 +            /* Calculate table index by multiplying r with table scale and truncate to integer */
 +            rt               = _mm_mul_pd(r00,vftabscale);
 +            vfitab           = _mm_cvttpd_epi32(rt);
 +            vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
 +            vfitab           = _mm_slli_epi32(vfitab,3);
 +
 +            /* GENERALIZED BORN AND COULOMB ELECTROSTATICS */
 +            isaprod          = _mm_mul_pd(isai0,isaj0);
 +            gbqqfactor       = _mm_xor_pd(signbit,_mm_mul_pd(qq00,_mm_mul_pd(isaprod,gbinvepsdiff)));
 +            gbscale          = _mm_mul_pd(isaprod,gbtabscale);
 +
 +            /* Calculate generalized born table index - this is a separate table from the normal one,
 +             * but we use the same procedure by multiplying r with scale and truncating to integer.
 +             */
 +            rt               = _mm_mul_pd(r00,gbscale);
 +            gbitab           = _mm_cvttpd_epi32(rt);
 +            gbeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
 +            gbitab           = _mm_slli_epi32(gbitab,2);
 +
 +            Y                = _mm_load_pd( gbtab + gmx_mm_extract_epi32(gbitab,0) );
 +            F                = _mm_setzero_pd();
 +            GMX_MM_TRANSPOSE2_PD(Y,F);
 +            G                = _mm_load_pd( gbtab + gmx_mm_extract_epi32(gbitab,0) +2);
 +            H                = _mm_setzero_pd();
 +            GMX_MM_TRANSPOSE2_PD(G,H);
 +            Heps             = _mm_mul_pd(gbeps,H);
 +            Fp               = _mm_add_pd(F,_mm_mul_pd(gbeps,_mm_add_pd(G,Heps)));
 +            VV               = _mm_add_pd(Y,_mm_mul_pd(gbeps,Fp));
 +            vgb              = _mm_mul_pd(gbqqfactor,VV);
 +
 +            FF               = _mm_add_pd(Fp,_mm_mul_pd(gbeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
 +            fgb              = _mm_mul_pd(gbqqfactor,_mm_mul_pd(FF,gbscale));
 +            dvdatmp          = _mm_mul_pd(minushalf,_mm_add_pd(vgb,_mm_mul_pd(fgb,r00)));
++            dvdatmp          = _mm_unpacklo_pd(dvdatmp,_mm_setzero_pd());
 +            dvdasum          = _mm_add_pd(dvdasum,dvdatmp);
 +            gmx_mm_increment_1real_pd(dvda+jnrA,_mm_mul_pd(dvdatmp,_mm_mul_pd(isaj0,isaj0)));
 +            velec            = _mm_mul_pd(qq00,rinv00);
 +            felec            = _mm_mul_pd(_mm_sub_pd(_mm_mul_pd(velec,rinv00),fgb),rinv00);
 +
 +            /* CUBIC SPLINE TABLE DISPERSION */
 +            Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
 +            F                = _mm_setzero_pd();
 +            GMX_MM_TRANSPOSE2_PD(Y,F);
 +            G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
 +            H                = _mm_setzero_pd();
 +            GMX_MM_TRANSPOSE2_PD(G,H);
 +            Heps             = _mm_mul_pd(vfeps,H);
 +            Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
 +            VV               = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
 +            vvdw6            = _mm_mul_pd(c6_00,VV);
 +            FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
 +            fvdw6            = _mm_mul_pd(c6_00,FF);
 +
 +            /* CUBIC SPLINE TABLE REPULSION */
 +            vfitab           = _mm_add_epi32(vfitab,ifour);
 +            Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
 +            F                = _mm_setzero_pd();
 +            GMX_MM_TRANSPOSE2_PD(Y,F);
 +            G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
 +            H                = _mm_setzero_pd();
 +            GMX_MM_TRANSPOSE2_PD(G,H);
 +            Heps             = _mm_mul_pd(vfeps,H);
 +            Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
 +            VV               = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
 +            vvdw12           = _mm_mul_pd(c12_00,VV);
 +            FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
 +            fvdw12           = _mm_mul_pd(c12_00,FF);
 +            vvdw             = _mm_add_pd(vvdw12,vvdw6);
 +            fvdw             = _mm_xor_pd(signbit,_mm_mul_pd(_mm_add_pd(fvdw6,fvdw12),_mm_mul_pd(vftabscale,rinv00)));
 +
 +            /* Update potential sum for this i atom from the interaction with this j atom. */
 +            velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
 +            velecsum         = _mm_add_pd(velecsum,velec);
 +            vgb              = _mm_unpacklo_pd(vgb,_mm_setzero_pd());
 +            vgbsum           = _mm_add_pd(vgbsum,vgb);
 +            vvdw             = _mm_unpacklo_pd(vvdw,_mm_setzero_pd());
 +            vvdwsum          = _mm_add_pd(vvdwsum,vvdw);
 +
 +            fscal            = _mm_add_pd(felec,fvdw);
 +
 +            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
 +
 +            /* Calculate temporary vectorial force */
 +            tx               = _mm_mul_pd(fscal,dx00);
 +            ty               = _mm_mul_pd(fscal,dy00);
 +            tz               = _mm_mul_pd(fscal,dz00);
 +
 +            /* Update vectorial force */
 +            fix0             = _mm_add_pd(fix0,tx);
 +            fiy0             = _mm_add_pd(fiy0,ty);
 +            fiz0             = _mm_add_pd(fiz0,tz);
 +
 +            gmx_mm_decrement_1rvec_1ptr_swizzle_pd(f+j_coord_offsetA,tx,ty,tz);
 +
 +            /* Inner loop uses 92 flops */
 +        }
 +
 +        /* End of innermost loop */
 +
 +        gmx_mm_update_iforce_1atom_swizzle_pd(fix0,fiy0,fiz0,
 +                                              f+i_coord_offset,fshift+i_shift_offset);
 +
 +        ggid                        = gid[iidx];
 +        /* Update potential energies */
 +        gmx_mm_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
 +        gmx_mm_update_1pot_pd(vgbsum,kernel_data->energygrp_polarization+ggid);
 +        gmx_mm_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
 +        dvdasum = _mm_mul_pd(dvdasum, _mm_mul_pd(isai0,isai0));
 +        gmx_mm_update_1pot_pd(dvdasum,dvda+inr);
 +
 +        /* Increment number of inner iterations */
 +        inneriter                  += j_index_end - j_index_start;
 +
 +        /* Outer loop uses 10 flops */
 +    }
 +
 +    /* Increment number of outer iterations */
 +    outeriter        += nri;
 +
 +    /* Update outer/inner flops */
 +
 +    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_VF,outeriter*10 + inneriter*92);
 +}
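Both table lookups in the kernel above follow the same cubic-spline scheme: each table point stores a quadruplet (Y,F,G,H), the scaled distance rt = r*scale is split into an integer bin index and a fractional offset eps, and value and derivative come out of a Horner evaluation. The VdW table interleaves dispersion and repulsion quadruplets per point, which is why vfitab is shifted left by 3 (stride 8) and the repulsion pass adds ifour, while the GB table holds one quadruplet per point (shift by 2). A scalar sketch of what the SIMD code computes per lane (the function name is illustrative):

    /* Scalar equivalent of one Y/F/G/H spline evaluation above. */
    static void cubic_spline_eval(const double *tab, double scale, double r,
                                  double *V, double *dVdr)
    {
        double rt   = r*scale;
        int    bin  = (int)rt;              /* truncation, as _mm_cvttpd_epi32 */
        double eps  = rt - (double)bin;     /* fractional offset within the bin */
        const double *p = tab + 4*bin;      /* one (Y,F,G,H) quadruplet */
        double Y = p[0], F = p[1], G = p[2], H = p[3];
        double Heps = eps*H;
        double Fp   = F + eps*(G + Heps);
        *V    = Y + eps*Fp;                        /* VV in the kernel */
        *dVdr = (Fp + eps*(G + 2.0*Heps))*scale;   /* FF*scale; chain rule r -> rt */
    }

In the kernel the remaining factor rinv00 turns dV/dr into the scalar that multiplies the displacement vector, and the XOR with signbit supplies the minus sign of force = -dV/dr. For the GB part the same value/derivative pair is scaled by gbqqfactor = -qq00*isai0*isaj0*(1/epsilon_r - 1/gb_epsilon_solvent) before use.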
 +/*
 + * Gromacs nonbonded kernel:   nb_kernel_ElecGB_VdwCSTab_GeomP1P1_F_sse4_1_double
 + * Electrostatics interaction: GeneralizedBorn
 + * VdW interaction:            CubicSplineTable
 + * Geometry:                   Particle-Particle
 + * Calculate force/pot:        Force
 + */
 +void
 +nb_kernel_ElecGB_VdwCSTab_GeomP1P1_F_sse4_1_double
 +                    (t_nblist * gmx_restrict                nlist,
 +                     rvec * gmx_restrict                    xx,
 +                     rvec * gmx_restrict                    ff,
 +                     t_forcerec * gmx_restrict              fr,
 +                     t_mdatoms * gmx_restrict               mdatoms,
 +                     nb_kernel_data_t * gmx_restrict        kernel_data,
 +                     t_nrnb * gmx_restrict                  nrnb)
 +{
 +    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
 +     * just 0 for non-waters.
 +     * Suffixes A,B refer to j loop unrolling done with SSE double precision, i.e. the two different
 +     * jnr indices corresponding to data put in the two positions of the SIMD register.
 +     */
 +    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
 +    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
 +    int              jnrA,jnrB;
 +    int              j_coord_offsetA,j_coord_offsetB;
 +    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
 +    real             rcutoff_scalar;
 +    real             *shiftvec,*fshift,*x,*f;
 +    __m128d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
 +    int              vdwioffset0;
 +    __m128d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
 +    int              vdwjidx0A,vdwjidx0B;
 +    __m128d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
 +    __m128d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
 +    __m128d          velec,felec,velecsum,facel,crf,krf,krf2;
 +    real             *charge;
 +    __m128i          gbitab;
 +    __m128d          vgb,fgb,vgbsum,dvdasum,gbscale,gbtabscale,isaprod,gbqqfactor,gbinvepsdiff,dvdaj,gbeps,dvdatmp;
 +    __m128d          minushalf = _mm_set1_pd(-0.5);
 +    real             *invsqrta,*dvda,*gbtab;
 +    int              nvdwtype;
 +    __m128d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
 +    int              *vdwtype;
 +    real             *vdwparam;
 +    __m128d          one_sixth   = _mm_set1_pd(1.0/6.0);
 +    __m128d          one_twelfth = _mm_set1_pd(1.0/12.0);
 +    __m128i          vfitab;
 +    __m128i          ifour       = _mm_set1_epi32(4);
 +    __m128d          rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
 +    real             *vftab;
 +    __m128d          dummy_mask,cutoff_mask;
 +    __m128d          signbit   = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
 +    __m128d          one     = _mm_set1_pd(1.0);
 +    __m128d          two     = _mm_set1_pd(2.0);
 +    x                = xx[0];
 +    f                = ff[0];
 +
 +    nri              = nlist->nri;
 +    iinr             = nlist->iinr;
 +    jindex           = nlist->jindex;
 +    jjnr             = nlist->jjnr;
 +    shiftidx         = nlist->shift;
 +    gid              = nlist->gid;
 +    shiftvec         = fr->shift_vec[0];
 +    fshift           = fr->fshift[0];
 +    facel            = _mm_set1_pd(fr->epsfac);
 +    charge           = mdatoms->chargeA;
 +    nvdwtype         = fr->ntype;
 +    vdwparam         = fr->nbfp;
 +    vdwtype          = mdatoms->typeA;
 +
 +    vftab            = kernel_data->table_vdw->data;
 +    vftabscale       = _mm_set1_pd(kernel_data->table_vdw->scale);
 +
 +    invsqrta         = fr->invsqrta;
 +    dvda             = fr->dvda;
 +    gbtabscale       = _mm_set1_pd(fr->gbtab.scale);
 +    gbtab            = fr->gbtab.data;
 +    gbinvepsdiff     = _mm_set1_pd((1.0/fr->epsilon_r) - (1.0/fr->gb_epsilon_solvent));
 +
 +    /* Avoid stupid compiler warnings */
 +    jnrA = jnrB = 0;
 +    j_coord_offsetA = 0;
 +    j_coord_offsetB = 0;
 +
 +    outeriter        = 0;
 +    inneriter        = 0;
 +
 +    /* Start outer loop over neighborlists */
 +    for(iidx=0; iidx<nri; iidx++)
 +    {
 +        /* Load shift vector for this list */
 +        i_shift_offset   = DIM*shiftidx[iidx];
 +
 +        /* Load limits for loop over neighbors */
 +        j_index_start    = jindex[iidx];
 +        j_index_end      = jindex[iidx+1];
 +
 +        /* Get outer coordinate index */
 +        inr              = iinr[iidx];
 +        i_coord_offset   = DIM*inr;
 +
 +        /* Load i particle coords and add shift vector */
 +        gmx_mm_load_shift_and_1rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
 +
 +        fix0             = _mm_setzero_pd();
 +        fiy0             = _mm_setzero_pd();
 +        fiz0             = _mm_setzero_pd();
 +
 +        /* Load parameters for i particles */
 +        iq0              = _mm_mul_pd(facel,_mm_load1_pd(charge+inr+0));
 +        isai0            = _mm_load1_pd(invsqrta+inr+0);
 +        vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
 +
 +        dvdasum          = _mm_setzero_pd();
 +
 +        /* Start inner kernel loop */
 +        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
 +        {
 +
 +            /* Get j neighbor index, and coordinate index */
 +            jnrA             = jjnr[jidx];
 +            jnrB             = jjnr[jidx+1];
 +            j_coord_offsetA  = DIM*jnrA;
 +            j_coord_offsetB  = DIM*jnrB;
 +
 +            /* load j atom coordinates */
 +            gmx_mm_load_1rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
 +                                              &jx0,&jy0,&jz0);
 +
 +            /* Calculate displacement vector */
 +            dx00             = _mm_sub_pd(ix0,jx0);
 +            dy00             = _mm_sub_pd(iy0,jy0);
 +            dz00             = _mm_sub_pd(iz0,jz0);
 +
 +            /* Calculate squared distance and things based on it */
 +            rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
 +
 +            rinv00           = gmx_mm_invsqrt_pd(rsq00);
 +
 +            /* Load parameters for j particles */
 +            jq0              = gmx_mm_load_2real_swizzle_pd(charge+jnrA+0,charge+jnrB+0);
 +            isaj0            = gmx_mm_load_2real_swizzle_pd(invsqrta+jnrA+0,invsqrta+jnrB+0);
 +            vdwjidx0A        = 2*vdwtype[jnrA+0];
 +            vdwjidx0B        = 2*vdwtype[jnrB+0];
 +
 +            /**************************
 +             * CALCULATE INTERACTIONS *
 +             **************************/
 +
 +            r00              = _mm_mul_pd(rsq00,rinv00);
 +
 +            /* Compute parameters for interactions between i and j atoms */
 +            qq00             = _mm_mul_pd(iq0,jq0);
 +            gmx_mm_load_2pair_swizzle_pd(vdwparam+vdwioffset0+vdwjidx0A,
 +                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
 +
 +            /* Calculate table index by multiplying r with table scale and truncate to integer */
 +            rt               = _mm_mul_pd(r00,vftabscale);
 +            vfitab           = _mm_cvttpd_epi32(rt);
 +            vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
 +            vfitab           = _mm_slli_epi32(vfitab,3);
 +
 +            /* GENERALIZED BORN AND COULOMB ELECTROSTATICS */
 +            isaprod          = _mm_mul_pd(isai0,isaj0);
 +            gbqqfactor       = _mm_xor_pd(signbit,_mm_mul_pd(qq00,_mm_mul_pd(isaprod,gbinvepsdiff)));
 +            gbscale          = _mm_mul_pd(isaprod,gbtabscale);
 +
 +            /* Calculate generalized born table index - this is a separate table from the normal one,
 +             * but we use the same procedure by multiplying r with scale and truncating to integer.
 +             */
 +            rt               = _mm_mul_pd(r00,gbscale);
 +            gbitab           = _mm_cvttpd_epi32(rt);
 +            gbeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
 +            gbitab           = _mm_slli_epi32(gbitab,2);
 +
 +            Y                = _mm_load_pd( gbtab + gmx_mm_extract_epi32(gbitab,0) );
 +            F                = _mm_load_pd( gbtab + gmx_mm_extract_epi32(gbitab,1) );
 +            GMX_MM_TRANSPOSE2_PD(Y,F);
 +            G                = _mm_load_pd( gbtab + gmx_mm_extract_epi32(gbitab,0) +2);
 +            H                = _mm_load_pd( gbtab + gmx_mm_extract_epi32(gbitab,1) +2);
 +            GMX_MM_TRANSPOSE2_PD(G,H);
 +            Heps             = _mm_mul_pd(gbeps,H);
 +            Fp               = _mm_add_pd(F,_mm_mul_pd(gbeps,_mm_add_pd(G,Heps)));
 +            VV               = _mm_add_pd(Y,_mm_mul_pd(gbeps,Fp));
 +            vgb              = _mm_mul_pd(gbqqfactor,VV);
 +
 +            FF               = _mm_add_pd(Fp,_mm_mul_pd(gbeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
 +            fgb              = _mm_mul_pd(gbqqfactor,_mm_mul_pd(FF,gbscale));
 +            dvdatmp          = _mm_mul_pd(minushalf,_mm_add_pd(vgb,_mm_mul_pd(fgb,r00)));
 +            dvdasum          = _mm_add_pd(dvdasum,dvdatmp);
 +            gmx_mm_increment_2real_swizzle_pd(dvda+jnrA,dvda+jnrB,_mm_mul_pd(dvdatmp,_mm_mul_pd(isaj0,isaj0)));
 +            velec            = _mm_mul_pd(qq00,rinv00);
 +            felec            = _mm_mul_pd(_mm_sub_pd(_mm_mul_pd(velec,rinv00),fgb),rinv00);
 +
 +            /* CUBIC SPLINE TABLE DISPERSION */
 +            Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
 +            F                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
 +            GMX_MM_TRANSPOSE2_PD(Y,F);
 +            G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
 +            H                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
 +            GMX_MM_TRANSPOSE2_PD(G,H);
 +            Heps             = _mm_mul_pd(vfeps,H);
 +            Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
 +            FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
 +            fvdw6            = _mm_mul_pd(c6_00,FF);
 +
 +            /* CUBIC SPLINE TABLE REPULSION */
 +            vfitab           = _mm_add_epi32(vfitab,ifour);
 +            Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
 +            F                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
 +            GMX_MM_TRANSPOSE2_PD(Y,F);
 +            G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
 +            H                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
 +            GMX_MM_TRANSPOSE2_PD(G,H);
 +            Heps             = _mm_mul_pd(vfeps,H);
 +            Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
 +            FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
 +            fvdw12           = _mm_mul_pd(c12_00,FF);
 +            fvdw             = _mm_xor_pd(signbit,_mm_mul_pd(_mm_add_pd(fvdw6,fvdw12),_mm_mul_pd(vftabscale,rinv00)));
 +
 +            fscal            = _mm_add_pd(felec,fvdw);
 +
 +            /* Calculate temporary vectorial force */
 +            tx               = _mm_mul_pd(fscal,dx00);
 +            ty               = _mm_mul_pd(fscal,dy00);
 +            tz               = _mm_mul_pd(fscal,dz00);
 +
 +            /* Update vectorial force */
 +            fix0             = _mm_add_pd(fix0,tx);
 +            fiy0             = _mm_add_pd(fiy0,ty);
 +            fiz0             = _mm_add_pd(fiz0,tz);
 +
 +            gmx_mm_decrement_1rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,tx,ty,tz);
 +
 +            /* Inner loop uses 82 flops */
 +        }
 +
 +        if(jidx<j_index_end)
 +        {
 +
 +            jnrA             = jjnr[jidx];
 +            j_coord_offsetA  = DIM*jnrA;
 +
 +            /* load j atom coordinates */
 +            gmx_mm_load_1rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
 +                                              &jx0,&jy0,&jz0);
 +
 +            /* Calculate displacement vector */
 +            dx00             = _mm_sub_pd(ix0,jx0);
 +            dy00             = _mm_sub_pd(iy0,jy0);
 +            dz00             = _mm_sub_pd(iz0,jz0);
 +
 +            /* Calculate squared distance and things based on it */
 +            rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
 +
 +            rinv00           = gmx_mm_invsqrt_pd(rsq00);
 +
 +            /* Load parameters for j particles */
 +            jq0              = _mm_load_sd(charge+jnrA+0);
 +            isaj0            = _mm_load_sd(invsqrta+jnrA+0);
 +            vdwjidx0A        = 2*vdwtype[jnrA+0];
 +
 +            /**************************
 +             * CALCULATE INTERACTIONS *
 +             **************************/
 +
 +            r00              = _mm_mul_pd(rsq00,rinv00);
 +
 +            /* Compute parameters for interactions between i and j atoms */
 +            qq00             = _mm_mul_pd(iq0,jq0);
 +            gmx_mm_load_1pair_swizzle_pd(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
 +
 +            /* Calculate table index by multiplying r with table scale and truncate to integer */
 +            rt               = _mm_mul_pd(r00,vftabscale);
 +            vfitab           = _mm_cvttpd_epi32(rt);
 +            vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
 +            vfitab           = _mm_slli_epi32(vfitab,3);
 +
 +            /* GENERALIZED BORN AND COULOMB ELECTROSTATICS */
 +            isaprod          = _mm_mul_pd(isai0,isaj0);
 +            gbqqfactor       = _mm_xor_pd(signbit,_mm_mul_pd(qq00,_mm_mul_pd(isaprod,gbinvepsdiff)));
 +            gbscale          = _mm_mul_pd(isaprod,gbtabscale);
 +
 +            /* Calculate generalized born table index - this is a separate table from the normal one,
 +             * but we use the same procedure by multiplying r with scale and truncating to integer.
 +             */
 +            rt               = _mm_mul_pd(r00,gbscale);
 +            gbitab           = _mm_cvttpd_epi32(rt);
 +            gbeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
 +            gbitab           = _mm_slli_epi32(gbitab,2);
 +
 +            Y                = _mm_load_pd( gbtab + gmx_mm_extract_epi32(gbitab,0) );
 +            F                = _mm_setzero_pd();
 +            GMX_MM_TRANSPOSE2_PD(Y,F);
 +            G                = _mm_load_pd( gbtab + gmx_mm_extract_epi32(gbitab,0) +2);
 +            H                = _mm_setzero_pd();
 +            GMX_MM_TRANSPOSE2_PD(G,H);
 +            Heps             = _mm_mul_pd(gbeps,H);
 +            Fp               = _mm_add_pd(F,_mm_mul_pd(gbeps,_mm_add_pd(G,Heps)));
 +            VV               = _mm_add_pd(Y,_mm_mul_pd(gbeps,Fp));
 +            vgb              = _mm_mul_pd(gbqqfactor,VV);
 +
 +            FF               = _mm_add_pd(Fp,_mm_mul_pd(gbeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
 +            fgb              = _mm_mul_pd(gbqqfactor,_mm_mul_pd(FF,gbscale));
 +            dvdatmp          = _mm_mul_pd(minushalf,_mm_add_pd(vgb,_mm_mul_pd(fgb,r00)));
++            dvdatmp          = _mm_unpacklo_pd(dvdatmp,_mm_setzero_pd());
 +            dvdasum          = _mm_add_pd(dvdasum,dvdatmp);
 +            gmx_mm_increment_1real_pd(dvda+jnrA,_mm_mul_pd(dvdatmp,_mm_mul_pd(isaj0,isaj0)));
 +            velec            = _mm_mul_pd(qq00,rinv00);
 +            felec            = _mm_mul_pd(_mm_sub_pd(_mm_mul_pd(velec,rinv00),fgb),rinv00);
 +
 +            /* CUBIC SPLINE TABLE DISPERSION */
 +            Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
 +            F                = _mm_setzero_pd();
 +            GMX_MM_TRANSPOSE2_PD(Y,F);
 +            G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
 +            H                = _mm_setzero_pd();
 +            GMX_MM_TRANSPOSE2_PD(G,H);
 +            Heps             = _mm_mul_pd(vfeps,H);
 +            Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
 +            FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
 +            fvdw6            = _mm_mul_pd(c6_00,FF);
 +
 +            /* CUBIC SPLINE TABLE REPULSION */
 +            vfitab           = _mm_add_epi32(vfitab,ifour);
 +            Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
 +            F                = _mm_setzero_pd();
 +            GMX_MM_TRANSPOSE2_PD(Y,F);
 +            G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
 +            H                = _mm_setzero_pd();
 +            GMX_MM_TRANSPOSE2_PD(G,H);
 +            Heps             = _mm_mul_pd(vfeps,H);
 +            Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
 +            FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
 +            fvdw12           = _mm_mul_pd(c12_00,FF);
 +            fvdw             = _mm_xor_pd(signbit,_mm_mul_pd(_mm_add_pd(fvdw6,fvdw12),_mm_mul_pd(vftabscale,rinv00)));
 +
 +            fscal            = _mm_add_pd(felec,fvdw);
 +
 +            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
 +
 +            /* Calculate temporary vectorial force */
 +            tx               = _mm_mul_pd(fscal,dx00);
 +            ty               = _mm_mul_pd(fscal,dy00);
 +            tz               = _mm_mul_pd(fscal,dz00);
 +
 +            /* Update vectorial force */
 +            fix0             = _mm_add_pd(fix0,tx);
 +            fiy0             = _mm_add_pd(fiy0,ty);
 +            fiz0             = _mm_add_pd(fiz0,tz);
 +
 +            gmx_mm_decrement_1rvec_1ptr_swizzle_pd(f+j_coord_offsetA,tx,ty,tz);
 +
 +            /* Inner loop uses 82 flops */
 +        }
 +
 +        /* End of innermost loop */
 +
 +        gmx_mm_update_iforce_1atom_swizzle_pd(fix0,fiy0,fiz0,
 +                                              f+i_coord_offset,fshift+i_shift_offset);
 +
 +        dvdasum = _mm_mul_pd(dvdasum, _mm_mul_pd(isai0,isai0));
 +        gmx_mm_update_1pot_pd(dvdasum,dvda+inr);
 +
 +        /* Increment number of inner iterations */
 +        inneriter                  += j_index_end - j_index_start;
 +
 +        /* Outer loop uses 7 flops */
 +    }
 +
 +    /* Increment number of outer iterations */
 +    outeriter        += nri;
 +
 +    /* Update outer/inner flops */
 +
 +    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_F,outeriter*7 + inneriter*82);
 +}
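The lines marked ++ in these hunks are the substance of the merge for this file: in the single-j epilogue (taken when a neighbor list has an odd number of entries) the arithmetic still runs two lanes wide, with whatever data happens to occupy the upper lane, so any quantity that feeds an accumulator must have that lane cleared first. velec, vgb, vvdw and fscal already used the idiom; the fix applies it to dvdatmp as well, before the add into dvdasum and the scatter into dvda. The idiom itself, as a self-contained sketch:

    #include <emmintrin.h>

    /* {v0, v1} -> {v0, 0.0}: keep lane 0, zero lane 1, so garbage in the
     * unused lane cannot leak into a two-lane accumulator. */
    static inline __m128d clear_upper_lane(__m128d v)
    {
        return _mm_unpacklo_pd(v, _mm_setzero_pd());
    }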
index 2e9e29e174b43456611613acd556c5db72f13164,0000000000000000000000000000000000000000..8f91a0222666a83fb9871220755abd95d1b65160
mode 100644,000000..100644
--- /dev/null
@@@ -1,713 -1,0 +1,715 @@@
 +/*
 + * Note: this file was generated by the Gromacs sse4_1_double kernel generator.
 + *
 + *                This source code is part of
 + *
 + *                 G   R   O   M   A   C   S
 + *
 + * Copyright (c) 2001-2012, The GROMACS Development Team
 + *
 + * Gromacs is a library for molecular simulation and trajectory analysis,
 + * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
 + * a full list of developers and information, check out http://www.gromacs.org
 + *
 + * This program is free software; you can redistribute it and/or modify it under
 + * the terms of the GNU Lesser General Public License as published by the Free
 + * Software Foundation; either version 2 of the License, or (at your option) any
 + * later version.
 + *
 + * To help fund GROMACS development, we humbly ask that you cite
 + * the papers people have written on it - you can find them on the website.
 + */
 +#ifdef HAVE_CONFIG_H
 +#include <config.h>
 +#endif
 +
 +#include <math.h>
 +
 +#include "../nb_kernel.h"
 +#include "types/simple.h"
 +#include "vec.h"
 +#include "nrnb.h"
 +
 +#include "gmx_math_x86_sse4_1_double.h"
 +#include "kernelutil_x86_sse4_1_double.h"
 +
 +/*
 + * Gromacs nonbonded kernel:   nb_kernel_ElecGB_VdwLJ_GeomP1P1_VF_sse4_1_double
 + * Electrostatics interaction: GeneralizedBorn
 + * VdW interaction:            LennardJones
 + * Geometry:                   Particle-Particle
 + * Calculate force/pot:        PotentialAndForce
 + */
 +void
 +nb_kernel_ElecGB_VdwLJ_GeomP1P1_VF_sse4_1_double
 +                    (t_nblist * gmx_restrict                nlist,
 +                     rvec * gmx_restrict                    xx,
 +                     rvec * gmx_restrict                    ff,
 +                     t_forcerec * gmx_restrict              fr,
 +                     t_mdatoms * gmx_restrict               mdatoms,
 +                     nb_kernel_data_t * gmx_restrict        kernel_data,
 +                     t_nrnb * gmx_restrict                  nrnb)
 +{
 +    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
 +     * just 0 for non-waters.
 +     * Suffixes A,B refer to j loop unrolling done with SSE double precision, i.e. the two different
 +     * jnr indices corresponding to data put in the two positions of the SIMD register.
 +     */
 +    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
 +    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
 +    int              jnrA,jnrB;
 +    int              j_coord_offsetA,j_coord_offsetB;
 +    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
 +    real             rcutoff_scalar;
 +    real             *shiftvec,*fshift,*x,*f;
 +    __m128d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
 +    int              vdwioffset0;
 +    __m128d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
 +    int              vdwjidx0A,vdwjidx0B;
 +    __m128d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
 +    __m128d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
 +    __m128d          velec,felec,velecsum,facel,crf,krf,krf2;
 +    real             *charge;
 +    __m128i          gbitab;
 +    __m128d          vgb,fgb,vgbsum,dvdasum,gbscale,gbtabscale,isaprod,gbqqfactor,gbinvepsdiff,dvdaj,gbeps,dvdatmp;
 +    __m128d          minushalf = _mm_set1_pd(-0.5);
 +    real             *invsqrta,*dvda,*gbtab;
 +    int              nvdwtype;
 +    __m128d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
 +    int              *vdwtype;
 +    real             *vdwparam;
 +    __m128d          one_sixth   = _mm_set1_pd(1.0/6.0);
 +    __m128d          one_twelfth = _mm_set1_pd(1.0/12.0);
 +    __m128i          vfitab;
 +    __m128i          ifour       = _mm_set1_epi32(4);
 +    __m128d          rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
 +    real             *vftab;
 +    __m128d          dummy_mask,cutoff_mask;
 +    __m128d          signbit   = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
 +    __m128d          one     = _mm_set1_pd(1.0);
 +    __m128d          two     = _mm_set1_pd(2.0);
 +    x                = xx[0];
 +    f                = ff[0];
 +
 +    nri              = nlist->nri;
 +    iinr             = nlist->iinr;
 +    jindex           = nlist->jindex;
 +    jjnr             = nlist->jjnr;
 +    shiftidx         = nlist->shift;
 +    gid              = nlist->gid;
 +    shiftvec         = fr->shift_vec[0];
 +    fshift           = fr->fshift[0];
 +    facel            = _mm_set1_pd(fr->epsfac);
 +    charge           = mdatoms->chargeA;
 +    nvdwtype         = fr->ntype;
 +    vdwparam         = fr->nbfp;
 +    vdwtype          = mdatoms->typeA;
 +
 +    invsqrta         = fr->invsqrta;
 +    dvda             = fr->dvda;
 +    gbtabscale       = _mm_set1_pd(fr->gbtab.scale);
 +    gbtab            = fr->gbtab.data;
 +    gbinvepsdiff     = _mm_set1_pd((1.0/fr->epsilon_r) - (1.0/fr->gb_epsilon_solvent));
 +
 +    /* Avoid stupid compiler warnings */
 +    jnrA = jnrB = 0;
 +    j_coord_offsetA = 0;
 +    j_coord_offsetB = 0;
 +
 +    outeriter        = 0;
 +    inneriter        = 0;
 +
 +    /* Start outer loop over neighborlists */
 +    for(iidx=0; iidx<nri; iidx++)
 +    {
 +        /* Load shift vector for this list */
 +        i_shift_offset   = DIM*shiftidx[iidx];
 +
 +        /* Load limits for loop over neighbors */
 +        j_index_start    = jindex[iidx];
 +        j_index_end      = jindex[iidx+1];
 +
 +        /* Get outer coordinate index */
 +        inr              = iinr[iidx];
 +        i_coord_offset   = DIM*inr;
 +
 +        /* Load i particle coords and add shift vector */
 +        gmx_mm_load_shift_and_1rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
 +
 +        fix0             = _mm_setzero_pd();
 +        fiy0             = _mm_setzero_pd();
 +        fiz0             = _mm_setzero_pd();
 +
 +        /* Load parameters for i particles */
 +        iq0              = _mm_mul_pd(facel,_mm_load1_pd(charge+inr+0));
 +        isai0            = _mm_load1_pd(invsqrta+inr+0);
 +        vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
 +
 +        /* Reset potential sums */
 +        velecsum         = _mm_setzero_pd();
 +        vgbsum           = _mm_setzero_pd();
 +        vvdwsum          = _mm_setzero_pd();
 +        dvdasum          = _mm_setzero_pd();
 +
 +        /* Start inner kernel loop */
 +        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
 +        {
 +
 +            /* Get j neighbor index, and coordinate index */
 +            jnrA             = jjnr[jidx];
 +            jnrB             = jjnr[jidx+1];
 +            j_coord_offsetA  = DIM*jnrA;
 +            j_coord_offsetB  = DIM*jnrB;
 +
 +            /* load j atom coordinates */
 +            gmx_mm_load_1rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
 +                                              &jx0,&jy0,&jz0);
 +
 +            /* Calculate displacement vector */
 +            dx00             = _mm_sub_pd(ix0,jx0);
 +            dy00             = _mm_sub_pd(iy0,jy0);
 +            dz00             = _mm_sub_pd(iz0,jz0);
 +
 +            /* Calculate squared distance and things based on it */
 +            rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
 +
 +            rinv00           = gmx_mm_invsqrt_pd(rsq00);
 +
 +            rinvsq00         = _mm_mul_pd(rinv00,rinv00);
 +
 +            /* Load parameters for j particles */
 +            jq0              = gmx_mm_load_2real_swizzle_pd(charge+jnrA+0,charge+jnrB+0);
 +            isaj0            = gmx_mm_load_2real_swizzle_pd(invsqrta+jnrA+0,invsqrta+jnrB+0);
 +            vdwjidx0A        = 2*vdwtype[jnrA+0];
 +            vdwjidx0B        = 2*vdwtype[jnrB+0];
 +
 +            /**************************
 +             * CALCULATE INTERACTIONS *
 +             **************************/
 +
 +            r00              = _mm_mul_pd(rsq00,rinv00);
 +
 +            /* Compute parameters for interactions between i and j atoms */
 +            qq00             = _mm_mul_pd(iq0,jq0);
 +            gmx_mm_load_2pair_swizzle_pd(vdwparam+vdwioffset0+vdwjidx0A,
 +                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
 +
 +            /* GENERALIZED BORN AND COULOMB ELECTROSTATICS */
 +            isaprod          = _mm_mul_pd(isai0,isaj0);
 +            gbqqfactor       = _mm_xor_pd(signbit,_mm_mul_pd(qq00,_mm_mul_pd(isaprod,gbinvepsdiff)));
 +            gbscale          = _mm_mul_pd(isaprod,gbtabscale);
 +
 +            /* Calculate generalized born table index - this is a separate table from the normal one,
 +             * but we use the same procedure by multiplying r with scale and truncating to integer.
 +             */
 +            rt               = _mm_mul_pd(r00,gbscale);
 +            gbitab           = _mm_cvttpd_epi32(rt);
 +            gbeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
 +            gbitab           = _mm_slli_epi32(gbitab,2);
 +
 +            Y                = _mm_load_pd( gbtab + gmx_mm_extract_epi32(gbitab,0) );
 +            F                = _mm_load_pd( gbtab + gmx_mm_extract_epi32(gbitab,1) );
 +            GMX_MM_TRANSPOSE2_PD(Y,F);
 +            G                = _mm_load_pd( gbtab + gmx_mm_extract_epi32(gbitab,0) +2);
 +            H                = _mm_load_pd( gbtab + gmx_mm_extract_epi32(gbitab,1) +2);
 +            GMX_MM_TRANSPOSE2_PD(G,H);
 +            Heps             = _mm_mul_pd(gbeps,H);
 +            Fp               = _mm_add_pd(F,_mm_mul_pd(gbeps,_mm_add_pd(G,Heps)));
 +            VV               = _mm_add_pd(Y,_mm_mul_pd(gbeps,Fp));
 +            vgb              = _mm_mul_pd(gbqqfactor,VV);
 +
 +            FF               = _mm_add_pd(Fp,_mm_mul_pd(gbeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
 +            fgb              = _mm_mul_pd(gbqqfactor,_mm_mul_pd(FF,gbscale));
 +            dvdatmp          = _mm_mul_pd(minushalf,_mm_add_pd(vgb,_mm_mul_pd(fgb,r00)));
 +            dvdasum          = _mm_add_pd(dvdasum,dvdatmp);
 +            gmx_mm_increment_2real_swizzle_pd(dvda+jnrA,dvda+jnrB,_mm_mul_pd(dvdatmp,_mm_mul_pd(isaj0,isaj0)));
 +            velec            = _mm_mul_pd(qq00,rinv00);
 +            felec            = _mm_mul_pd(_mm_sub_pd(_mm_mul_pd(velec,rinv00),fgb),rinv00);
 +
 +            /* LENNARD-JONES DISPERSION/REPULSION */
 +
 +            rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
 +            vvdw6            = _mm_mul_pd(c6_00,rinvsix);
 +            vvdw12           = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
 +            vvdw             = _mm_sub_pd( _mm_mul_pd(vvdw12,one_twelfth) , _mm_mul_pd(vvdw6,one_sixth) );
 +            fvdw             = _mm_mul_pd(_mm_sub_pd(vvdw12,vvdw6),rinvsq00);
 +
 +            /* Update potential sum for this i atom from the interaction with this j atom. */
 +            velecsum         = _mm_add_pd(velecsum,velec);
 +            vgbsum           = _mm_add_pd(vgbsum,vgb);
 +            vvdwsum          = _mm_add_pd(vvdwsum,vvdw);
 +
 +            fscal            = _mm_add_pd(felec,fvdw);
 +
 +            /* Calculate temporary vectorial force */
 +            tx               = _mm_mul_pd(fscal,dx00);
 +            ty               = _mm_mul_pd(fscal,dy00);
 +            tz               = _mm_mul_pd(fscal,dz00);
 +
 +            /* Update vectorial force */
 +            fix0             = _mm_add_pd(fix0,tx);
 +            fiy0             = _mm_add_pd(fiy0,ty);
 +            fiz0             = _mm_add_pd(fiz0,tz);
 +
 +            gmx_mm_decrement_1rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,tx,ty,tz);
 +
 +            /* Inner loop uses 71 flops */
 +        }
 +
 +        if(jidx<j_index_end)
 +        {
 +
 +            jnrA             = jjnr[jidx];
 +            j_coord_offsetA  = DIM*jnrA;
 +
 +            /* load j atom coordinates */
 +            gmx_mm_load_1rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
 +                                              &jx0,&jy0,&jz0);
 +
 +            /* Calculate displacement vector */
 +            dx00             = _mm_sub_pd(ix0,jx0);
 +            dy00             = _mm_sub_pd(iy0,jy0);
 +            dz00             = _mm_sub_pd(iz0,jz0);
 +
 +            /* Calculate squared distance and things based on it */
 +            rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
 +
 +            rinv00           = gmx_mm_invsqrt_pd(rsq00);
 +
 +            rinvsq00         = _mm_mul_pd(rinv00,rinv00);
 +
 +            /* Load parameters for j particles */
 +            jq0              = _mm_load_sd(charge+jnrA+0);
 +            isaj0            = _mm_load_sd(invsqrta+jnrA+0);
 +            vdwjidx0A        = 2*vdwtype[jnrA+0];
 +
 +            /**************************
 +             * CALCULATE INTERACTIONS *
 +             **************************/
 +
 +            r00              = _mm_mul_pd(rsq00,rinv00);
 +
 +            /* Compute parameters for interactions between i and j atoms */
 +            qq00             = _mm_mul_pd(iq0,jq0);
 +            gmx_mm_load_1pair_swizzle_pd(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
 +
 +            /* GENERALIZED BORN AND COULOMB ELECTROSTATICS */
 +            isaprod          = _mm_mul_pd(isai0,isaj0);
 +            gbqqfactor       = _mm_xor_pd(signbit,_mm_mul_pd(qq00,_mm_mul_pd(isaprod,gbinvepsdiff)));
 +            gbscale          = _mm_mul_pd(isaprod,gbtabscale);
 +
 +            /* Calculate generalized born table index - this is a separate table from the normal one,
 +             * but we use the same procedure by multiplying r with scale and truncating to integer.
 +             */
 +            rt               = _mm_mul_pd(r00,gbscale);
 +            gbitab           = _mm_cvttpd_epi32(rt);
 +            gbeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
 +            gbitab           = _mm_slli_epi32(gbitab,2);
 +
 +            Y                = _mm_load_pd( gbtab + gmx_mm_extract_epi32(gbitab,0) );
 +            F                = _mm_setzero_pd();
 +            GMX_MM_TRANSPOSE2_PD(Y,F);
 +            G                = _mm_load_pd( gbtab + gmx_mm_extract_epi32(gbitab,0) +2);
 +            H                = _mm_setzero_pd();
 +            GMX_MM_TRANSPOSE2_PD(G,H);
 +            Heps             = _mm_mul_pd(gbeps,H);
 +            Fp               = _mm_add_pd(F,_mm_mul_pd(gbeps,_mm_add_pd(G,Heps)));
 +            VV               = _mm_add_pd(Y,_mm_mul_pd(gbeps,Fp));
 +            vgb              = _mm_mul_pd(gbqqfactor,VV);
 +
 +            FF               = _mm_add_pd(Fp,_mm_mul_pd(gbeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
 +            fgb              = _mm_mul_pd(gbqqfactor,_mm_mul_pd(FF,gbscale));
 +            dvdatmp          = _mm_mul_pd(minushalf,_mm_add_pd(vgb,_mm_mul_pd(fgb,r00)));
++            dvdatmp          = _mm_unpacklo_pd(dvdatmp,_mm_setzero_pd());
 +            dvdasum          = _mm_add_pd(dvdasum,dvdatmp);
 +            gmx_mm_increment_1real_pd(dvda+jnrA,_mm_mul_pd(dvdatmp,_mm_mul_pd(isaj0,isaj0)));
 +            velec            = _mm_mul_pd(qq00,rinv00);
 +            felec            = _mm_mul_pd(_mm_sub_pd(_mm_mul_pd(velec,rinv00),fgb),rinv00);
 +
 +            /* LENNARD-JONES DISPERSION/REPULSION */
 +
 +            rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
 +            vvdw6            = _mm_mul_pd(c6_00,rinvsix);
 +            vvdw12           = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
 +            vvdw             = _mm_sub_pd( _mm_mul_pd(vvdw12,one_twelfth) , _mm_mul_pd(vvdw6,one_sixth) );
 +            fvdw             = _mm_mul_pd(_mm_sub_pd(vvdw12,vvdw6),rinvsq00);
 +
 +            /* Update potential sum for this i atom from the interaction with this j atom. */
 +            velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
 +            velecsum         = _mm_add_pd(velecsum,velec);
 +            vgb              = _mm_unpacklo_pd(vgb,_mm_setzero_pd());
 +            vgbsum           = _mm_add_pd(vgbsum,vgb);
 +            vvdw             = _mm_unpacklo_pd(vvdw,_mm_setzero_pd());
 +            vvdwsum          = _mm_add_pd(vvdwsum,vvdw);
 +
 +            fscal            = _mm_add_pd(felec,fvdw);
 +
 +            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
 +
 +            /* Calculate temporary vectorial force */
 +            tx               = _mm_mul_pd(fscal,dx00);
 +            ty               = _mm_mul_pd(fscal,dy00);
 +            tz               = _mm_mul_pd(fscal,dz00);
 +
 +            /* Update vectorial force */
 +            fix0             = _mm_add_pd(fix0,tx);
 +            fiy0             = _mm_add_pd(fiy0,ty);
 +            fiz0             = _mm_add_pd(fiz0,tz);
 +
 +            gmx_mm_decrement_1rvec_1ptr_swizzle_pd(f+j_coord_offsetA,tx,ty,tz);
 +
 +            /* Inner loop uses 71 flops */
 +        }
 +
 +        /* End of innermost loop */
 +
 +        gmx_mm_update_iforce_1atom_swizzle_pd(fix0,fiy0,fiz0,
 +                                              f+i_coord_offset,fshift+i_shift_offset);
 +
 +        ggid                        = gid[iidx];
 +        /* Update potential energies */
 +        gmx_mm_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
 +        gmx_mm_update_1pot_pd(vgbsum,kernel_data->energygrp_polarization+ggid);
 +        gmx_mm_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
 +        dvdasum = _mm_mul_pd(dvdasum, _mm_mul_pd(isai0,isai0));
 +        gmx_mm_update_1pot_pd(dvdasum,dvda+inr);
 +
 +        /* Increment number of inner iterations */
 +        inneriter                  += j_index_end - j_index_start;
 +
 +        /* Outer loop uses 10 flops */
 +    }
 +
 +    /* Increment number of outer iterations */
 +    outeriter        += nri;
 +
 +    /* Update outer/inner flops */
 +
 +    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_VF,outeriter*10 + inneriter*71);
 +}
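The Lennard-Jones branch above needs no table: everything derives from rinvsq, with rinvsix = (1/r^2)^3. The one_sixth and one_twelfth factors in the potential, absent from the force, indicate that the stored pair parameters carry premultipliers (c6 = 6*C6, c12 = 12*C12); this is stated as an assumption read off the arithmetic, not taken from the parameter setup code. A scalar sketch under that assumption:

    /* Scalar sketch of the LJ block, assuming c6 = 6*C6 and c12 = 12*C12. */
    static void lj_eval(double c6, double c12, double rinvsq,
                        double *vvdw, double *fscal)
    {
        double rinvsix = rinvsq*rinvsq*rinvsq;
        double v6  = c6*rinvsix;                 /* 6*C6/r^6    */
        double v12 = c12*rinvsix*rinvsix;        /* 12*C12/r^12 */
        *vvdw  = v12*(1.0/12.0) - v6*(1.0/6.0);  /* C12/r^12 - C6/r^6 */
        *fscal = (v12 - v6)*rinvsq;              /* -(1/r)*dV/dr; multiplies (dx,dy,dz) */
    }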
 +/*
 + * Gromacs nonbonded kernel:   nb_kernel_ElecGB_VdwLJ_GeomP1P1_F_sse4_1_double
 + * Electrostatics interaction: GeneralizedBorn
 + * VdW interaction:            LennardJones
 + * Geometry:                   Particle-Particle
 + * Calculate force/pot:        Force
 + */
 +void
 +nb_kernel_ElecGB_VdwLJ_GeomP1P1_F_sse4_1_double
 +                    (t_nblist * gmx_restrict                nlist,
 +                     rvec * gmx_restrict                    xx,
 +                     rvec * gmx_restrict                    ff,
 +                     t_forcerec * gmx_restrict              fr,
 +                     t_mdatoms * gmx_restrict               mdatoms,
 +                     nb_kernel_data_t * gmx_restrict        kernel_data,
 +                     t_nrnb * gmx_restrict                  nrnb)
 +{
 +    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
 +     * just 0 for non-waters.
 +     * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
 +     * jnr indices corresponding to data put in the two positions in the SIMD register.
 +     */
 +    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
 +    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
 +    int              jnrA,jnrB;
 +    int              j_coord_offsetA,j_coord_offsetB;
 +    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
 +    real             rcutoff_scalar;
 +    real             *shiftvec,*fshift,*x,*f;
 +    __m128d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
 +    int              vdwioffset0;
 +    __m128d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
 +    int              vdwjidx0A,vdwjidx0B;
 +    __m128d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
 +    __m128d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
 +    __m128d          velec,felec,velecsum,facel,crf,krf,krf2;
 +    real             *charge;
 +    __m128i          gbitab;
 +    __m128d          vgb,fgb,vgbsum,dvdasum,gbscale,gbtabscale,isaprod,gbqqfactor,gbinvepsdiff,dvdaj,gbeps,dvdatmp;
 +    __m128d          minushalf = _mm_set1_pd(-0.5);
 +    real             *invsqrta,*dvda,*gbtab;
 +    int              nvdwtype;
 +    __m128d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
 +    int              *vdwtype;
 +    real             *vdwparam;
 +    __m128d          one_sixth   = _mm_set1_pd(1.0/6.0);
 +    __m128d          one_twelfth = _mm_set1_pd(1.0/12.0);
 +    __m128i          vfitab;
 +    __m128i          ifour       = _mm_set1_epi32(4);
 +    __m128d          rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
 +    real             *vftab;
 +    __m128d          dummy_mask,cutoff_mask;
 +    __m128d          signbit   = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
 +    __m128d          one     = _mm_set1_pd(1.0);
 +    __m128d          two     = _mm_set1_pd(2.0);
 +    x                = xx[0];
 +    f                = ff[0];
 +
 +    nri              = nlist->nri;
 +    iinr             = nlist->iinr;
 +    jindex           = nlist->jindex;
 +    jjnr             = nlist->jjnr;
 +    shiftidx         = nlist->shift;
 +    gid              = nlist->gid;
 +    shiftvec         = fr->shift_vec[0];
 +    fshift           = fr->fshift[0];
 +    facel            = _mm_set1_pd(fr->epsfac);
 +    charge           = mdatoms->chargeA;
 +    nvdwtype         = fr->ntype;
 +    vdwparam         = fr->nbfp;
 +    vdwtype          = mdatoms->typeA;
 +
 +    invsqrta         = fr->invsqrta;
 +    dvda             = fr->dvda;
 +    gbtabscale       = _mm_set1_pd(fr->gbtab.scale);
 +    gbtab            = fr->gbtab.data;
 +    gbinvepsdiff     = _mm_set1_pd((1.0/fr->epsilon_r) - (1.0/fr->gb_epsilon_solvent));
 +
 +    /* Avoid stupid compiler warnings */
 +    jnrA = jnrB = 0;
 +    j_coord_offsetA = 0;
 +    j_coord_offsetB = 0;
 +
 +    outeriter        = 0;
 +    inneriter        = 0;
 +
 +    /* Start outer loop over neighborlists */
 +    for(iidx=0; iidx<nri; iidx++)
 +    {
 +        /* Load shift vector for this list */
 +        i_shift_offset   = DIM*shiftidx[iidx];
 +
 +        /* Load limits for loop over neighbors */
 +        j_index_start    = jindex[iidx];
 +        j_index_end      = jindex[iidx+1];
 +
 +        /* Get outer coordinate index */
 +        inr              = iinr[iidx];
 +        i_coord_offset   = DIM*inr;
 +
 +        /* Load i particle coords and add shift vector */
 +        gmx_mm_load_shift_and_1rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
 +
 +        fix0             = _mm_setzero_pd();
 +        fiy0             = _mm_setzero_pd();
 +        fiz0             = _mm_setzero_pd();
 +
 +        /* Load parameters for i particles */
 +        iq0              = _mm_mul_pd(facel,_mm_load1_pd(charge+inr+0));
 +        isai0            = _mm_load1_pd(invsqrta+inr+0);
 +        vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
 +
 +        dvdasum          = _mm_setzero_pd();
 +
 +        /* Start inner kernel loop */
 +        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
 +        {
 +
 +            /* Get j neighbor index, and coordinate index */
 +            jnrA             = jjnr[jidx];
 +            jnrB             = jjnr[jidx+1];
 +            j_coord_offsetA  = DIM*jnrA;
 +            j_coord_offsetB  = DIM*jnrB;
 +
 +            /* load j atom coordinates */
 +            gmx_mm_load_1rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
 +                                              &jx0,&jy0,&jz0);
 +
 +            /* Calculate displacement vector */
 +            dx00             = _mm_sub_pd(ix0,jx0);
 +            dy00             = _mm_sub_pd(iy0,jy0);
 +            dz00             = _mm_sub_pd(iz0,jz0);
 +
 +            /* Calculate squared distance and things based on it */
 +            rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
 +
 +            rinv00           = gmx_mm_invsqrt_pd(rsq00);
 +
 +            rinvsq00         = _mm_mul_pd(rinv00,rinv00);
 +
 +            /* Load parameters for j particles */
 +            jq0              = gmx_mm_load_2real_swizzle_pd(charge+jnrA+0,charge+jnrB+0);
 +            isaj0            = gmx_mm_load_2real_swizzle_pd(invsqrta+jnrA+0,invsqrta+jnrB+0);
 +            vdwjidx0A        = 2*vdwtype[jnrA+0];
 +            vdwjidx0B        = 2*vdwtype[jnrB+0];
 +
 +            /**************************
 +             * CALCULATE INTERACTIONS *
 +             **************************/
 +
 +            r00              = _mm_mul_pd(rsq00,rinv00);
 +
 +            /* Compute parameters for interactions between i and j atoms */
 +            qq00             = _mm_mul_pd(iq0,jq0);
 +            gmx_mm_load_2pair_swizzle_pd(vdwparam+vdwioffset0+vdwjidx0A,
 +                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
 +
 +            /* GENERALIZED BORN AND COULOMB ELECTROSTATICS */
 +            isaprod          = _mm_mul_pd(isai0,isaj0);
 +            gbqqfactor       = _mm_xor_pd(signbit,_mm_mul_pd(qq00,_mm_mul_pd(isaprod,gbinvepsdiff)));
 +            gbscale          = _mm_mul_pd(isaprod,gbtabscale);
 +
 +            /* Calculate generalized born table index - this is a separate table from the normal one,
 +             * but we use the same procedure: multiply r by the table scale and truncate to an integer.
 +             */
 +            rt               = _mm_mul_pd(r00,gbscale);
 +            gbitab           = _mm_cvttpd_epi32(rt);
 +            gbeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
 +            gbitab           = _mm_slli_epi32(gbitab,2);
 +
 +            Y                = _mm_load_pd( gbtab + gmx_mm_extract_epi32(gbitab,0) );
 +            F                = _mm_load_pd( gbtab + gmx_mm_extract_epi32(gbitab,1) );
 +            GMX_MM_TRANSPOSE2_PD(Y,F);
 +            G                = _mm_load_pd( gbtab + gmx_mm_extract_epi32(gbitab,0) +2);
 +            H                = _mm_load_pd( gbtab + gmx_mm_extract_epi32(gbitab,1) +2);
 +            GMX_MM_TRANSPOSE2_PD(G,H);
 +            Heps             = _mm_mul_pd(gbeps,H);
 +            Fp               = _mm_add_pd(F,_mm_mul_pd(gbeps,_mm_add_pd(G,Heps)));
 +            VV               = _mm_add_pd(Y,_mm_mul_pd(gbeps,Fp));
 +            vgb              = _mm_mul_pd(gbqqfactor,VV);
 +
 +            FF               = _mm_add_pd(Fp,_mm_mul_pd(gbeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
 +            fgb              = _mm_mul_pd(gbqqfactor,_mm_mul_pd(FF,gbscale));
 +            dvdatmp          = _mm_mul_pd(minushalf,_mm_add_pd(vgb,_mm_mul_pd(fgb,r00)));
 +            dvdasum          = _mm_add_pd(dvdasum,dvdatmp);
 +            gmx_mm_increment_2real_swizzle_pd(dvda+jnrA,dvda+jnrB,_mm_mul_pd(dvdatmp,_mm_mul_pd(isaj0,isaj0)));
 +            velec            = _mm_mul_pd(qq00,rinv00);
 +            felec            = _mm_mul_pd(_mm_sub_pd(_mm_mul_pd(velec,rinv00),fgb),rinv00);
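 +            /* Per SIMD lane, the table lookup above is equivalent to this
 +             * scalar sketch (assuming gbtab stores Y,F,G,H quadruplets per
 +             * point, as the stride-4 indexing via _mm_slli_epi32(gbitab,2)
 +             * implies; cast-truncation matches the floor for rt >= 0):
 +             *     int    n   = (int)(r*scale);
 +             *     double eps = r*scale - n;
 +             *     const double *p = gbtab + 4*n;
 +             *     VV = p[0] + eps*(p[1] + eps*(p[2] + eps*p[3]));
 +             *     FF = p[1] + eps*(2.0*p[2] + 3.0*eps*p[3]);    (= dVV/deps)
 +             */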
 +
 +            /* LENNARD-JONES DISPERSION/REPULSION */
 +
 +            rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
 +            fvdw             = _mm_mul_pd(_mm_sub_pd(_mm_mul_pd(c12_00,rinvsix),c6_00),_mm_mul_pd(rinvsix,rinvsq00));
 +
 +            fscal            = _mm_add_pd(felec,fvdw);
 +
 +            /* Calculate temporary vectorial force */
 +            tx               = _mm_mul_pd(fscal,dx00);
 +            ty               = _mm_mul_pd(fscal,dy00);
 +            tz               = _mm_mul_pd(fscal,dz00);
 +
 +            /* Update vectorial force */
 +            fix0             = _mm_add_pd(fix0,tx);
 +            fiy0             = _mm_add_pd(fiy0,ty);
 +            fiz0             = _mm_add_pd(fiz0,tz);
 +
 +            gmx_mm_decrement_1rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,tx,ty,tz);
 +
 +            /* Inner loop uses 64 flops */
 +        }
 +
 +        if(jidx<j_index_end)
 +        {
 +
 +            jnrA             = jjnr[jidx];
 +            j_coord_offsetA  = DIM*jnrA;
 +
 +            /* load j atom coordinates */
 +            gmx_mm_load_1rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
 +                                              &jx0,&jy0,&jz0);
 +
 +            /* Calculate displacement vector */
 +            dx00             = _mm_sub_pd(ix0,jx0);
 +            dy00             = _mm_sub_pd(iy0,jy0);
 +            dz00             = _mm_sub_pd(iz0,jz0);
 +
 +            /* Calculate squared distance and things based on it */
 +            rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
 +
 +            rinv00           = gmx_mm_invsqrt_pd(rsq00);
 +
 +            rinvsq00         = _mm_mul_pd(rinv00,rinv00);
 +
 +            /* Load parameters for j particles */
 +            jq0              = _mm_load_sd(charge+jnrA+0);
 +            isaj0            = _mm_load_sd(invsqrta+jnrA+0);
 +            vdwjidx0A        = 2*vdwtype[jnrA+0];
 +
 +            /**************************
 +             * CALCULATE INTERACTIONS *
 +             **************************/
 +
 +            r00              = _mm_mul_pd(rsq00,rinv00);
 +
 +            /* Compute parameters for interactions between i and j atoms */
 +            qq00             = _mm_mul_pd(iq0,jq0);
 +            gmx_mm_load_1pair_swizzle_pd(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
 +
 +            /* GENERALIZED BORN AND COULOMB ELECTROSTATICS */
 +            isaprod          = _mm_mul_pd(isai0,isaj0);
 +            gbqqfactor       = _mm_xor_pd(signbit,_mm_mul_pd(qq00,_mm_mul_pd(isaprod,gbinvepsdiff)));
 +            gbscale          = _mm_mul_pd(isaprod,gbtabscale);
 +
 +            /* Calculate generalized born table index - this is a separate table from the normal one,
 +             * but we use the same procedure: multiply r by the table scale and truncate to an integer.
 +             */
 +            rt               = _mm_mul_pd(r00,gbscale);
 +            gbitab           = _mm_cvttpd_epi32(rt);
 +            gbeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
 +            gbitab           = _mm_slli_epi32(gbitab,2);
 +
 +            Y                = _mm_load_pd( gbtab + gmx_mm_extract_epi32(gbitab,0) );
 +            F                = _mm_setzero_pd();
 +            GMX_MM_TRANSPOSE2_PD(Y,F);
 +            G                = _mm_load_pd( gbtab + gmx_mm_extract_epi32(gbitab,0) +2);
 +            H                = _mm_setzero_pd();
 +            GMX_MM_TRANSPOSE2_PD(G,H);
 +            Heps             = _mm_mul_pd(gbeps,H);
 +            Fp               = _mm_add_pd(F,_mm_mul_pd(gbeps,_mm_add_pd(G,Heps)));
 +            VV               = _mm_add_pd(Y,_mm_mul_pd(gbeps,Fp));
 +            vgb              = _mm_mul_pd(gbqqfactor,VV);
 +
 +            FF               = _mm_add_pd(Fp,_mm_mul_pd(gbeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
 +            fgb              = _mm_mul_pd(gbqqfactor,_mm_mul_pd(FF,gbscale));
 +            dvdatmp          = _mm_mul_pd(minushalf,_mm_add_pd(vgb,_mm_mul_pd(fgb,r00)));
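 +            /* Tail iteration: only lane 0 holds a real j atom, so zero the
 +             * upper lane before it can leak into the dvda accumulators.    */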
++            dvdatmp          = _mm_unpacklo_pd(dvdatmp,_mm_setzero_pd());
 +            dvdasum          = _mm_add_pd(dvdasum,dvdatmp);
 +            gmx_mm_increment_1real_pd(dvda+jnrA,_mm_mul_pd(dvdatmp,_mm_mul_pd(isaj0,isaj0)));
 +            velec            = _mm_mul_pd(qq00,rinv00);
 +            felec            = _mm_mul_pd(_mm_sub_pd(_mm_mul_pd(velec,rinv00),fgb),rinv00);
 +
 +            /* LENNARD-JONES DISPERSION/REPULSION */
 +
 +            rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
 +            fvdw             = _mm_mul_pd(_mm_sub_pd(_mm_mul_pd(c12_00,rinvsix),c6_00),_mm_mul_pd(rinvsix,rinvsq00));
 +
 +            fscal            = _mm_add_pd(felec,fvdw);
 +
 +            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
 +
 +            /* Calculate temporary vectorial force */
 +            tx               = _mm_mul_pd(fscal,dx00);
 +            ty               = _mm_mul_pd(fscal,dy00);
 +            tz               = _mm_mul_pd(fscal,dz00);
 +
 +            /* Update vectorial force */
 +            fix0             = _mm_add_pd(fix0,tx);
 +            fiy0             = _mm_add_pd(fiy0,ty);
 +            fiz0             = _mm_add_pd(fiz0,tz);
 +
 +            gmx_mm_decrement_1rvec_1ptr_swizzle_pd(f+j_coord_offsetA,tx,ty,tz);
 +
 +            /* Inner loop uses 64 flops */
 +        }
 +
 +        /* End of innermost loop */
 +
 +        gmx_mm_update_iforce_1atom_swizzle_pd(fix0,fiy0,fiz0,
 +                                              f+i_coord_offset,fshift+i_shift_offset);
 +
 +        dvdasum = _mm_mul_pd(dvdasum, _mm_mul_pd(isai0,isai0));
 +        gmx_mm_update_1pot_pd(dvdasum,dvda+inr);
 +
 +        /* Increment number of inner iterations */
 +        inneriter                  += j_index_end - j_index_start;
 +
 +        /* Outer loop uses 7 flops */
 +    }
 +
 +    /* Increment number of outer iterations */
 +    outeriter        += nri;
 +
 +    /* Update outer/inner flops */
 +
 +    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_F,outeriter*7 + inneriter*64);
 +}
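Every single-atom epilogue in these kernels relies on the same masking idiom: only lane 0 of the __m128d is loaded with real data, and _mm_unpacklo_pd(v, _mm_setzero_pd()) rebuilds the register as [v0, 0] so the stale upper lane cannot contaminate an accumulator. A minimal self-contained demonstration (plain SSE2, outside GROMACS):

    #include <stdio.h>
    #include <emmintrin.h>

    int main(void)
    {
        /* lane 1 = 99.0 is stale garbage, lane 0 = 3.5 is the real value */
        __m128d v      = _mm_set_pd(99.0, 3.5);
        __m128d masked = _mm_unpacklo_pd(v, _mm_setzero_pd());
        double  out[2];
        _mm_storeu_pd(out, masked);
        printf("%g %g\n", out[0], out[1]);   /* prints: 3.5 0 */
        return 0;
    }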
index be01ba33b0e6fa3689a2b4895ca982d23c5bcdc9,0000000000000000000000000000000000000000..99d358b661093c506b5d4c5b4a3ac55e028c7299
mode 100644,000000..100644
--- /dev/null
@@@ -1,642 -1,0 +1,644 @@@
 +/*
 + * Note: this file was generated by the Gromacs sse4_1_double kernel generator.
 + *
 + *                This source code is part of
 + *
 + *                 G   R   O   M   A   C   S
 + *
 + * Copyright (c) 2001-2012, The GROMACS Development Team
 + *
 + * Gromacs is a library for molecular simulation and trajectory analysis,
 + * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
 + * a full list of developers and information, check out http://www.gromacs.org
 + *
 + * This program is free software; you can redistribute it and/or modify it under
 + * the terms of the GNU Lesser General Public License as published by the Free
 + * Software Foundation; either version 2 of the License, or (at your option) any
 + * later version.
 + *
 + * To help fund GROMACS development, we humbly ask that you cite
 + * the papers people have written on it - you can find them on the website.
 + */
 +#ifdef HAVE_CONFIG_H
 +#include <config.h>
 +#endif
 +
 +#include <math.h>
 +
 +#include "../nb_kernel.h"
 +#include "types/simple.h"
 +#include "vec.h"
 +#include "nrnb.h"
 +
 +#include "gmx_math_x86_sse4_1_double.h"
 +#include "kernelutil_x86_sse4_1_double.h"
 +
 +/*
 + * Gromacs nonbonded kernel:   nb_kernel_ElecGB_VdwNone_GeomP1P1_VF_sse4_1_double
 + * Electrostatics interaction: GeneralizedBorn
 + * VdW interaction:            None
 + * Geometry:                   Particle-Particle
 + * Calculate force/pot:        PotentialAndForce
 + */
 +void
 +nb_kernel_ElecGB_VdwNone_GeomP1P1_VF_sse4_1_double
 +                    (t_nblist * gmx_restrict                nlist,
 +                     rvec * gmx_restrict                    xx,
 +                     rvec * gmx_restrict                    ff,
 +                     t_forcerec * gmx_restrict              fr,
 +                     t_mdatoms * gmx_restrict               mdatoms,
 +                     nb_kernel_data_t * gmx_restrict        kernel_data,
 +                     t_nrnb * gmx_restrict                  nrnb)
 +{
 +    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
 +     * just 0 for non-waters.
 +     * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
 +     * jnr indices corresponding to data put in the two positions in the SIMD register.
 +     */
 +    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
 +    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
 +    int              jnrA,jnrB;
 +    int              j_coord_offsetA,j_coord_offsetB;
 +    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
 +    real             rcutoff_scalar;
 +    real             *shiftvec,*fshift,*x,*f;
 +    __m128d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
 +    int              vdwioffset0;
 +    __m128d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
 +    int              vdwjidx0A,vdwjidx0B;
 +    __m128d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
 +    __m128d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
 +    __m128d          velec,felec,velecsum,facel,crf,krf,krf2;
 +    real             *charge;
 +    __m128i          gbitab;
 +    __m128d          vgb,fgb,vgbsum,dvdasum,gbscale,gbtabscale,isaprod,gbqqfactor,gbinvepsdiff,dvdaj,gbeps,dvdatmp;
 +    __m128d          minushalf = _mm_set1_pd(-0.5);
 +    real             *invsqrta,*dvda,*gbtab;
 +    __m128i          vfitab;
 +    __m128i          ifour       = _mm_set1_epi32(4);
 +    __m128d          rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
 +    real             *vftab;
 +    __m128d          dummy_mask,cutoff_mask;
 +    __m128d          signbit   = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
 +    __m128d          one     = _mm_set1_pd(1.0);
 +    __m128d          two     = _mm_set1_pd(2.0);
 +    x                = xx[0];
 +    f                = ff[0];
 +
 +    nri              = nlist->nri;
 +    iinr             = nlist->iinr;
 +    jindex           = nlist->jindex;
 +    jjnr             = nlist->jjnr;
 +    shiftidx         = nlist->shift;
 +    gid              = nlist->gid;
 +    shiftvec         = fr->shift_vec[0];
 +    fshift           = fr->fshift[0];
 +    facel            = _mm_set1_pd(fr->epsfac);
 +    charge           = mdatoms->chargeA;
 +
 +    invsqrta         = fr->invsqrta;
 +    dvda             = fr->dvda;
 +    gbtabscale       = _mm_set1_pd(fr->gbtab.scale);
 +    gbtab            = fr->gbtab.data;
 +    gbinvepsdiff     = _mm_set1_pd((1.0/fr->epsilon_r) - (1.0/fr->gb_epsilon_solvent));
 +
 +    /* Avoid stupid compiler warnings */
 +    jnrA = jnrB = 0;
 +    j_coord_offsetA = 0;
 +    j_coord_offsetB = 0;
 +
 +    outeriter        = 0;
 +    inneriter        = 0;
 +
 +    /* Start outer loop over neighborlists */
 +    for(iidx=0; iidx<nri; iidx++)
 +    {
 +        /* Load shift vector for this list */
 +        i_shift_offset   = DIM*shiftidx[iidx];
 +
 +        /* Load limits for loop over neighbors */
 +        j_index_start    = jindex[iidx];
 +        j_index_end      = jindex[iidx+1];
 +
 +        /* Get outer coordinate index */
 +        inr              = iinr[iidx];
 +        i_coord_offset   = DIM*inr;
 +
 +        /* Load i particle coords and add shift vector */
 +        gmx_mm_load_shift_and_1rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
 +
 +        fix0             = _mm_setzero_pd();
 +        fiy0             = _mm_setzero_pd();
 +        fiz0             = _mm_setzero_pd();
 +
 +        /* Load parameters for i particles */
 +        iq0              = _mm_mul_pd(facel,_mm_load1_pd(charge+inr+0));
 +        isai0            = _mm_load1_pd(invsqrta+inr+0);
 +
 +        /* Reset potential sums */
 +        velecsum         = _mm_setzero_pd();
 +        vgbsum           = _mm_setzero_pd();
 +        dvdasum          = _mm_setzero_pd();
 +
 +        /* Start inner kernel loop */
 +        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
 +        {
 +
 +            /* Get j neighbor index, and coordinate index */
 +            jnrA             = jjnr[jidx];
 +            jnrB             = jjnr[jidx+1];
 +            j_coord_offsetA  = DIM*jnrA;
 +            j_coord_offsetB  = DIM*jnrB;
 +
 +            /* load j atom coordinates */
 +            gmx_mm_load_1rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
 +                                              &jx0,&jy0,&jz0);
 +
 +            /* Calculate displacement vector */
 +            dx00             = _mm_sub_pd(ix0,jx0);
 +            dy00             = _mm_sub_pd(iy0,jy0);
 +            dz00             = _mm_sub_pd(iz0,jz0);
 +
 +            /* Calculate squared distance and things based on it */
 +            rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
 +
 +            rinv00           = gmx_mm_invsqrt_pd(rsq00);
 +
 +            /* Load parameters for j particles */
 +            jq0              = gmx_mm_load_2real_swizzle_pd(charge+jnrA+0,charge+jnrB+0);
 +            isaj0            = gmx_mm_load_2real_swizzle_pd(invsqrta+jnrA+0,invsqrta+jnrB+0);
 +
 +            /**************************
 +             * CALCULATE INTERACTIONS *
 +             **************************/
 +
 +            r00              = _mm_mul_pd(rsq00,rinv00);
 +
 +            /* Compute parameters for interactions between i and j atoms */
 +            qq00             = _mm_mul_pd(iq0,jq0);
 +
 +            /* GENERALIZED BORN AND COULOMB ELECTROSTATICS */
 +            isaprod          = _mm_mul_pd(isai0,isaj0);
 +            gbqqfactor       = _mm_xor_pd(signbit,_mm_mul_pd(qq00,_mm_mul_pd(isaprod,gbinvepsdiff)));
 +            gbscale          = _mm_mul_pd(isaprod,gbtabscale);
 +
 +            /* Calculate generalized born table index - this is a separate table from the normal one,
 +             * but we use the same procedure: multiply r by the table scale and truncate to an integer.
 +             */
 +            rt               = _mm_mul_pd(r00,gbscale);
 +            gbitab           = _mm_cvttpd_epi32(rt);
 +            gbeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
 +            gbitab           = _mm_slli_epi32(gbitab,2);
 +
 +            Y                = _mm_load_pd( gbtab + gmx_mm_extract_epi32(gbitab,0) );
 +            F                = _mm_load_pd( gbtab + gmx_mm_extract_epi32(gbitab,1) );
 +            GMX_MM_TRANSPOSE2_PD(Y,F);
 +            G                = _mm_load_pd( gbtab + gmx_mm_extract_epi32(gbitab,0) +2);
 +            H                = _mm_load_pd( gbtab + gmx_mm_extract_epi32(gbitab,1) +2);
 +            GMX_MM_TRANSPOSE2_PD(G,H);
 +            Heps             = _mm_mul_pd(gbeps,H);
 +            Fp               = _mm_add_pd(F,_mm_mul_pd(gbeps,_mm_add_pd(G,Heps)));
 +            VV               = _mm_add_pd(Y,_mm_mul_pd(gbeps,Fp));
 +            vgb              = _mm_mul_pd(gbqqfactor,VV);
 +
 +            FF               = _mm_add_pd(Fp,_mm_mul_pd(gbeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
 +            fgb              = _mm_mul_pd(gbqqfactor,_mm_mul_pd(FF,gbscale));
 +            dvdatmp          = _mm_mul_pd(minushalf,_mm_add_pd(vgb,_mm_mul_pd(fgb,r00)));
 +            dvdasum          = _mm_add_pd(dvdasum,dvdatmp);
 +            gmx_mm_increment_2real_swizzle_pd(dvda+jnrA,dvda+jnrB,_mm_mul_pd(dvdatmp,_mm_mul_pd(isaj0,isaj0)));
 +            velec            = _mm_mul_pd(qq00,rinv00);
 +            felec            = _mm_mul_pd(_mm_sub_pd(_mm_mul_pd(velec,rinv00),fgb),rinv00);
 +
 +            /* Update potential sum for this i atom from the interaction with this j atom. */
 +            velecsum         = _mm_add_pd(velecsum,velec);
 +            vgbsum           = _mm_add_pd(vgbsum,vgb);
 +
 +            fscal            = felec;
 +
 +            /* Calculate temporary vectorial force */
 +            tx               = _mm_mul_pd(fscal,dx00);
 +            ty               = _mm_mul_pd(fscal,dy00);
 +            tz               = _mm_mul_pd(fscal,dz00);
 +
 +            /* Update vectorial force */
 +            fix0             = _mm_add_pd(fix0,tx);
 +            fiy0             = _mm_add_pd(fiy0,ty);
 +            fiz0             = _mm_add_pd(fiz0,tz);
 +
 +            gmx_mm_decrement_1rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,tx,ty,tz);
 +
 +            /* Inner loop uses 58 flops */
 +        }
 +
 +        if(jidx<j_index_end)
 +        {
 +
 +            jnrA             = jjnr[jidx];
 +            j_coord_offsetA  = DIM*jnrA;
 +
 +            /* load j atom coordinates */
 +            gmx_mm_load_1rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
 +                                              &jx0,&jy0,&jz0);
 +
 +            /* Calculate displacement vector */
 +            dx00             = _mm_sub_pd(ix0,jx0);
 +            dy00             = _mm_sub_pd(iy0,jy0);
 +            dz00             = _mm_sub_pd(iz0,jz0);
 +
 +            /* Calculate squared distance and things based on it */
 +            rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
 +
 +            rinv00           = gmx_mm_invsqrt_pd(rsq00);
 +
 +            /* Load parameters for j particles */
 +            jq0              = _mm_load_sd(charge+jnrA+0);
 +            isaj0            = _mm_load_sd(invsqrta+jnrA+0);
 +
 +            /**************************
 +             * CALCULATE INTERACTIONS *
 +             **************************/
 +
 +            r00              = _mm_mul_pd(rsq00,rinv00);
 +
 +            /* Compute parameters for interactions between i and j atoms */
 +            qq00             = _mm_mul_pd(iq0,jq0);
 +
 +            /* GENERALIZED BORN AND COULOMB ELECTROSTATICS */
 +            isaprod          = _mm_mul_pd(isai0,isaj0);
 +            gbqqfactor       = _mm_xor_pd(signbit,_mm_mul_pd(qq00,_mm_mul_pd(isaprod,gbinvepsdiff)));
 +            gbscale          = _mm_mul_pd(isaprod,gbtabscale);
 +
 +            /* Calculate generalized born table index - this is a separate table from the normal one,
 +             * but we use the same procedure: multiply r by the table scale and truncate to an integer.
 +             */
 +            rt               = _mm_mul_pd(r00,gbscale);
 +            gbitab           = _mm_cvttpd_epi32(rt);
 +            gbeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
 +            gbitab           = _mm_slli_epi32(gbitab,2);
 +
 +            Y                = _mm_load_pd( gbtab + gmx_mm_extract_epi32(gbitab,0) );
 +            F                = _mm_setzero_pd();
 +            GMX_MM_TRANSPOSE2_PD(Y,F);
 +            G                = _mm_load_pd( gbtab + gmx_mm_extract_epi32(gbitab,0) +2);
 +            H                = _mm_setzero_pd();
 +            GMX_MM_TRANSPOSE2_PD(G,H);
 +            Heps             = _mm_mul_pd(gbeps,H);
 +            Fp               = _mm_add_pd(F,_mm_mul_pd(gbeps,_mm_add_pd(G,Heps)));
 +            VV               = _mm_add_pd(Y,_mm_mul_pd(gbeps,Fp));
 +            vgb              = _mm_mul_pd(gbqqfactor,VV);
 +
 +            FF               = _mm_add_pd(Fp,_mm_mul_pd(gbeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
 +            fgb              = _mm_mul_pd(gbqqfactor,_mm_mul_pd(FF,gbscale));
 +            dvdatmp          = _mm_mul_pd(minushalf,_mm_add_pd(vgb,_mm_mul_pd(fgb,r00)));
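 +            /* Tail iteration: only lane 0 holds a real j atom, so zero the
 +             * upper lane before it can leak into the dvda accumulators.    */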
++            dvdatmp          = _mm_unpacklo_pd(dvdatmp,_mm_setzero_pd());
 +            dvdasum          = _mm_add_pd(dvdasum,dvdatmp);
 +            gmx_mm_increment_1real_pd(dvda+jnrA,_mm_mul_pd(dvdatmp,_mm_mul_pd(isaj0,isaj0)));
 +            velec            = _mm_mul_pd(qq00,rinv00);
 +            felec            = _mm_mul_pd(_mm_sub_pd(_mm_mul_pd(velec,rinv00),fgb),rinv00);
 +
 +            /* Update potential sum for this i atom from the interaction with this j atom. */
 +            velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
 +            velecsum         = _mm_add_pd(velecsum,velec);
 +            vgb              = _mm_unpacklo_pd(vgb,_mm_setzero_pd());
 +            vgbsum           = _mm_add_pd(vgbsum,vgb);
 +
 +            fscal            = felec;
 +
 +            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
 +
 +            /* Calculate temporary vectorial force */
 +            tx               = _mm_mul_pd(fscal,dx00);
 +            ty               = _mm_mul_pd(fscal,dy00);
 +            tz               = _mm_mul_pd(fscal,dz00);
 +
 +            /* Update vectorial force */
 +            fix0             = _mm_add_pd(fix0,tx);
 +            fiy0             = _mm_add_pd(fiy0,ty);
 +            fiz0             = _mm_add_pd(fiz0,tz);
 +
 +            gmx_mm_decrement_1rvec_1ptr_swizzle_pd(f+j_coord_offsetA,tx,ty,tz);
 +
 +            /* Inner loop uses 58 flops */
 +        }
 +
 +        /* End of innermost loop */
 +
 +        gmx_mm_update_iforce_1atom_swizzle_pd(fix0,fiy0,fiz0,
 +                                              f+i_coord_offset,fshift+i_shift_offset);
 +
 +        ggid                        = gid[iidx];
 +        /* Update potential energies */
 +        gmx_mm_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
 +        gmx_mm_update_1pot_pd(vgbsum,kernel_data->energygrp_polarization+ggid);
 +        dvdasum = _mm_mul_pd(dvdasum, _mm_mul_pd(isai0,isai0));
 +        gmx_mm_update_1pot_pd(dvdasum,dvda+inr);
 +
 +        /* Increment number of inner iterations */
 +        inneriter                  += j_index_end - j_index_start;
 +
 +        /* Outer loop uses 9 flops */
 +    }
 +
 +    /* Increment number of outer iterations */
 +    outeriter        += nri;
 +
 +    /* Update outer/inner flops */
 +
 +    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VF,outeriter*9 + inneriter*58);
 +}
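The closing inc_nrnb call simply totals the per-iteration costs recorded in the loop comments:

    \text{flops} \approx 9\,N_i + 58\,N_{ij}

where N_i = nri (outer iterations) and N_{ij} = \sum_i (jindex[i+1] - jindex[i]) is the pair count accumulated in inneriter; the other kernels in this patch follow the same pattern with their own per-loop constants.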
 +/*
 + * Gromacs nonbonded kernel:   nb_kernel_ElecGB_VdwNone_GeomP1P1_F_sse4_1_double
 + * Electrostatics interaction: GeneralizedBorn
 + * VdW interaction:            None
 + * Geometry:                   Particle-Particle
 + * Calculate force/pot:        Force
 + */
 +void
 +nb_kernel_ElecGB_VdwNone_GeomP1P1_F_sse4_1_double
 +                    (t_nblist * gmx_restrict                nlist,
 +                     rvec * gmx_restrict                    xx,
 +                     rvec * gmx_restrict                    ff,
 +                     t_forcerec * gmx_restrict              fr,
 +                     t_mdatoms * gmx_restrict               mdatoms,
 +                     nb_kernel_data_t * gmx_restrict        kernel_data,
 +                     t_nrnb * gmx_restrict                  nrnb)
 +{
 +    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
 +     * just 0 for non-waters.
 +     * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
 +     * jnr indices corresponding to data put in the two positions in the SIMD register.
 +     */
 +    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
 +    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
 +    int              jnrA,jnrB;
 +    int              j_coord_offsetA,j_coord_offsetB;
 +    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
 +    real             rcutoff_scalar;
 +    real             *shiftvec,*fshift,*x,*f;
 +    __m128d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
 +    int              vdwioffset0;
 +    __m128d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
 +    int              vdwjidx0A,vdwjidx0B;
 +    __m128d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
 +    __m128d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
 +    __m128d          velec,felec,velecsum,facel,crf,krf,krf2;
 +    real             *charge;
 +    __m128i          gbitab;
 +    __m128d          vgb,fgb,vgbsum,dvdasum,gbscale,gbtabscale,isaprod,gbqqfactor,gbinvepsdiff,dvdaj,gbeps,dvdatmp;
 +    __m128d          minushalf = _mm_set1_pd(-0.5);
 +    real             *invsqrta,*dvda,*gbtab;
 +    __m128i          vfitab;
 +    __m128i          ifour       = _mm_set1_epi32(4);
 +    __m128d          rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
 +    real             *vftab;
 +    __m128d          dummy_mask,cutoff_mask;
 +    __m128d          signbit   = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
 +    __m128d          one     = _mm_set1_pd(1.0);
 +    __m128d          two     = _mm_set1_pd(2.0);
 +    x                = xx[0];
 +    f                = ff[0];
 +
 +    nri              = nlist->nri;
 +    iinr             = nlist->iinr;
 +    jindex           = nlist->jindex;
 +    jjnr             = nlist->jjnr;
 +    shiftidx         = nlist->shift;
 +    gid              = nlist->gid;
 +    shiftvec         = fr->shift_vec[0];
 +    fshift           = fr->fshift[0];
 +    facel            = _mm_set1_pd(fr->epsfac);
 +    charge           = mdatoms->chargeA;
 +
 +    invsqrta         = fr->invsqrta;
 +    dvda             = fr->dvda;
 +    gbtabscale       = _mm_set1_pd(fr->gbtab.scale);
 +    gbtab            = fr->gbtab.data;
 +    gbinvepsdiff     = _mm_set1_pd((1.0/fr->epsilon_r) - (1.0/fr->gb_epsilon_solvent));
 +
 +    /* Avoid stupid compiler warnings */
 +    jnrA = jnrB = 0;
 +    j_coord_offsetA = 0;
 +    j_coord_offsetB = 0;
 +
 +    outeriter        = 0;
 +    inneriter        = 0;
 +
 +    /* Start outer loop over neighborlists */
 +    for(iidx=0; iidx<nri; iidx++)
 +    {
 +        /* Load shift vector for this list */
 +        i_shift_offset   = DIM*shiftidx[iidx];
 +
 +        /* Load limits for loop over neighbors */
 +        j_index_start    = jindex[iidx];
 +        j_index_end      = jindex[iidx+1];
 +
 +        /* Get outer coordinate index */
 +        inr              = iinr[iidx];
 +        i_coord_offset   = DIM*inr;
 +
 +        /* Load i particle coords and add shift vector */
 +        gmx_mm_load_shift_and_1rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
 +
 +        fix0             = _mm_setzero_pd();
 +        fiy0             = _mm_setzero_pd();
 +        fiz0             = _mm_setzero_pd();
 +
 +        /* Load parameters for i particles */
 +        iq0              = _mm_mul_pd(facel,_mm_load1_pd(charge+inr+0));
 +        isai0            = _mm_load1_pd(invsqrta+inr+0);
 +
 +        dvdasum          = _mm_setzero_pd();
 +
 +        /* Start inner kernel loop */
 +        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
 +        {
 +
 +            /* Get j neighbor index, and coordinate index */
 +            jnrA             = jjnr[jidx];
 +            jnrB             = jjnr[jidx+1];
 +            j_coord_offsetA  = DIM*jnrA;
 +            j_coord_offsetB  = DIM*jnrB;
 +
 +            /* load j atom coordinates */
 +            gmx_mm_load_1rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
 +                                              &jx0,&jy0,&jz0);
 +
 +            /* Calculate displacement vector */
 +            dx00             = _mm_sub_pd(ix0,jx0);
 +            dy00             = _mm_sub_pd(iy0,jy0);
 +            dz00             = _mm_sub_pd(iz0,jz0);
 +
 +            /* Calculate squared distance and things based on it */
 +            rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
 +
 +            rinv00           = gmx_mm_invsqrt_pd(rsq00);
 +
 +            /* Load parameters for j particles */
 +            jq0              = gmx_mm_load_2real_swizzle_pd(charge+jnrA+0,charge+jnrB+0);
 +            isaj0            = gmx_mm_load_2real_swizzle_pd(invsqrta+jnrA+0,invsqrta+jnrB+0);
 +
 +            /**************************
 +             * CALCULATE INTERACTIONS *
 +             **************************/
 +
 +            r00              = _mm_mul_pd(rsq00,rinv00);
 +
 +            /* Compute parameters for interactions between i and j atoms */
 +            qq00             = _mm_mul_pd(iq0,jq0);
 +
 +            /* GENERALIZED BORN AND COULOMB ELECTROSTATICS */
 +            isaprod          = _mm_mul_pd(isai0,isaj0);
 +            gbqqfactor       = _mm_xor_pd(signbit,_mm_mul_pd(qq00,_mm_mul_pd(isaprod,gbinvepsdiff)));
 +            gbscale          = _mm_mul_pd(isaprod,gbtabscale);
 +
 +            /* Calculate generalized born table index - this is a separate table from the normal one,
 +             * but we use the same procedure: multiply r by the table scale and truncate to an integer.
 +             */
 +            rt               = _mm_mul_pd(r00,gbscale);
 +            gbitab           = _mm_cvttpd_epi32(rt);
 +            gbeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
 +            gbitab           = _mm_slli_epi32(gbitab,2);
 +
 +            Y                = _mm_load_pd( gbtab + gmx_mm_extract_epi32(gbitab,0) );
 +            F                = _mm_load_pd( gbtab + gmx_mm_extract_epi32(gbitab,1) );
 +            GMX_MM_TRANSPOSE2_PD(Y,F);
 +            G                = _mm_load_pd( gbtab + gmx_mm_extract_epi32(gbitab,0) +2);
 +            H                = _mm_load_pd( gbtab + gmx_mm_extract_epi32(gbitab,1) +2);
 +            GMX_MM_TRANSPOSE2_PD(G,H);
 +            Heps             = _mm_mul_pd(gbeps,H);
 +            Fp               = _mm_add_pd(F,_mm_mul_pd(gbeps,_mm_add_pd(G,Heps)));
 +            VV               = _mm_add_pd(Y,_mm_mul_pd(gbeps,Fp));
 +            vgb              = _mm_mul_pd(gbqqfactor,VV);
 +
 +            FF               = _mm_add_pd(Fp,_mm_mul_pd(gbeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
 +            fgb              = _mm_mul_pd(gbqqfactor,_mm_mul_pd(FF,gbscale));
 +            dvdatmp          = _mm_mul_pd(minushalf,_mm_add_pd(vgb,_mm_mul_pd(fgb,r00)));
 +            dvdasum          = _mm_add_pd(dvdasum,dvdatmp);
 +            gmx_mm_increment_2real_swizzle_pd(dvda+jnrA,dvda+jnrB,_mm_mul_pd(dvdatmp,_mm_mul_pd(isaj0,isaj0)));
 +            velec            = _mm_mul_pd(qq00,rinv00);
 +            felec            = _mm_mul_pd(_mm_sub_pd(_mm_mul_pd(velec,rinv00),fgb),rinv00);
 +
 +            fscal            = felec;
 +
 +            /* Calculate temporary vectorial force */
 +            tx               = _mm_mul_pd(fscal,dx00);
 +            ty               = _mm_mul_pd(fscal,dy00);
 +            tz               = _mm_mul_pd(fscal,dz00);
 +
 +            /* Update vectorial force */
 +            fix0             = _mm_add_pd(fix0,tx);
 +            fiy0             = _mm_add_pd(fiy0,ty);
 +            fiz0             = _mm_add_pd(fiz0,tz);
 +
 +            gmx_mm_decrement_1rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,tx,ty,tz);
 +
 +            /* Inner loop uses 56 flops */
 +        }
 +
 +        if(jidx<j_index_end)
 +        {
 +
 +            jnrA             = jjnr[jidx];
 +            j_coord_offsetA  = DIM*jnrA;
 +
 +            /* load j atom coordinates */
 +            gmx_mm_load_1rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
 +                                              &jx0,&jy0,&jz0);
 +
 +            /* Calculate displacement vector */
 +            dx00             = _mm_sub_pd(ix0,jx0);
 +            dy00             = _mm_sub_pd(iy0,jy0);
 +            dz00             = _mm_sub_pd(iz0,jz0);
 +
 +            /* Calculate squared distance and things based on it */
 +            rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
 +
 +            rinv00           = gmx_mm_invsqrt_pd(rsq00);
 +
 +            /* Load parameters for j particles */
 +            jq0              = _mm_load_sd(charge+jnrA+0);
 +            isaj0            = _mm_load_sd(invsqrta+jnrA+0);
 +
 +            /**************************
 +             * CALCULATE INTERACTIONS *
 +             **************************/
 +
 +            r00              = _mm_mul_pd(rsq00,rinv00);
 +
 +            /* Compute parameters for interactions between i and j atoms */
 +            qq00             = _mm_mul_pd(iq0,jq0);
 +
 +            /* GENERALIZED BORN AND COULOMB ELECTROSTATICS */
 +            isaprod          = _mm_mul_pd(isai0,isaj0);
 +            gbqqfactor       = _mm_xor_pd(signbit,_mm_mul_pd(qq00,_mm_mul_pd(isaprod,gbinvepsdiff)));
 +            gbscale          = _mm_mul_pd(isaprod,gbtabscale);
 +
 +            /* Calculate generalized born table index - this is a separate table from the normal one,
 +             * but we use the same procedure: multiply r by the table scale and truncate to an integer.
 +             */
 +            rt               = _mm_mul_pd(r00,gbscale);
 +            gbitab           = _mm_cvttpd_epi32(rt);
 +            gbeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
 +            gbitab           = _mm_slli_epi32(gbitab,2);
 +
 +            Y                = _mm_load_pd( gbtab + gmx_mm_extract_epi32(gbitab,0) );
 +            F                = _mm_setzero_pd();
 +            GMX_MM_TRANSPOSE2_PD(Y,F);
 +            G                = _mm_load_pd( gbtab + gmx_mm_extract_epi32(gbitab,0) +2);
 +            H                = _mm_setzero_pd();
 +            GMX_MM_TRANSPOSE2_PD(G,H);
 +            Heps             = _mm_mul_pd(gbeps,H);
 +            Fp               = _mm_add_pd(F,_mm_mul_pd(gbeps,_mm_add_pd(G,Heps)));
 +            VV               = _mm_add_pd(Y,_mm_mul_pd(gbeps,Fp));
 +            vgb              = _mm_mul_pd(gbqqfactor,VV);
 +
 +            FF               = _mm_add_pd(Fp,_mm_mul_pd(gbeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
 +            fgb              = _mm_mul_pd(gbqqfactor,_mm_mul_pd(FF,gbscale));
 +            dvdatmp          = _mm_mul_pd(minushalf,_mm_add_pd(vgb,_mm_mul_pd(fgb,r00)));
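 +            /* Tail iteration: only lane 0 holds a real j atom, so zero the
 +             * upper lane before it can leak into the dvda accumulators.    */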
++            dvdatmp          = _mm_unpacklo_pd(dvdatmp,_mm_setzero_pd());
 +            dvdasum          = _mm_add_pd(dvdasum,dvdatmp);
 +            gmx_mm_increment_1real_pd(dvda+jnrA,_mm_mul_pd(dvdatmp,_mm_mul_pd(isaj0,isaj0)));
 +            velec            = _mm_mul_pd(qq00,rinv00);
 +            felec            = _mm_mul_pd(_mm_sub_pd(_mm_mul_pd(velec,rinv00),fgb),rinv00);
 +
 +            fscal            = felec;
 +
 +            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
 +
 +            /* Calculate temporary vectorial force */
 +            tx               = _mm_mul_pd(fscal,dx00);
 +            ty               = _mm_mul_pd(fscal,dy00);
 +            tz               = _mm_mul_pd(fscal,dz00);
 +
 +            /* Update vectorial force */
 +            fix0             = _mm_add_pd(fix0,tx);
 +            fiy0             = _mm_add_pd(fiy0,ty);
 +            fiz0             = _mm_add_pd(fiz0,tz);
 +
 +            gmx_mm_decrement_1rvec_1ptr_swizzle_pd(f+j_coord_offsetA,tx,ty,tz);
 +
 +            /* Inner loop uses 56 flops */
 +        }
 +
 +        /* End of innermost loop */
 +
 +        gmx_mm_update_iforce_1atom_swizzle_pd(fix0,fiy0,fiz0,
 +                                              f+i_coord_offset,fshift+i_shift_offset);
 +
 +        dvdasum = _mm_mul_pd(dvdasum, _mm_mul_pd(isai0,isai0));
 +        gmx_mm_update_1pot_pd(dvdasum,dvda+inr);
 +
 +        /* Increment number of inner iterations */
 +        inneriter                  += j_index_end - j_index_start;
 +
 +        /* Outer loop uses 7 flops */
 +    }
 +
 +    /* Increment number of outer iterations */
 +    outeriter        += nri;
 +
 +    /* Update outer/inner flops */
 +
 +    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_F,outeriter*7 + inneriter*56);
 +}
index 744783556837baef451265ddcdc7c6edf259a6b7,0000000000000000000000000000000000000000..82a6f96d78ef9ea2efa119beb3cf502b973676fe
mode 100644,000000..100644
--- /dev/null
@@@ -1,1053 -1,0 +1,1056 @@@
 +/* #if 0 */
 +#error This file must be processed with the Gromacs pre-preprocessor
 +/* #endif */
 +/* #if INCLUDE_HEADER */
 +#ifdef HAVE_CONFIG_H
 +#include <config.h>
 +#endif
 +
 +#include <math.h>
 +
 +#include "../nb_kernel.h"
 +#include "types/simple.h"
 +#include "vec.h"
 +#include "nrnb.h"
 +
 +#include "gmx_math_x86_sse4_1_double.h"
 +#include "kernelutil_x86_sse4_1_double.h"
 +/* #endif */
 +
 +/* ## List of variables set by the generating script:                                    */
 +/* ##                                                                                    */
 +/* ## Settings that apply to the entire kernel:                                          */
 +/* ## KERNEL_ELEC:           String, choice for electrostatic interactions               */
 +/* ## KERNEL_VDW:            String, choice for van der Waals interactions               */
 +/* ## KERNEL_NAME:           String, name of this kernel                                 */
 +/* ## KERNEL_VF:             String telling if we calculate potential, force, or both    */
 +/* ## GEOMETRY_I/GEOMETRY_J: String, name of each geometry, e.g. 'Water3' or '1Particle' */
 +/* ##                                                                                    */
 +/* ## Settings that apply to particles in the outer (I) or inner (J) loops:              */
 +/* ## PARTICLES_I[]/         Arrays with lists of i/j particles to use in kernel. It is  */
 +/* ## PARTICLES_J[]:         just [0] for particle geometry, but can be longer for water */
 +/* ## PARTICLES_ELEC_I[]/    Arrays with lists of i/j particle that have electrostatics  */
 +/* ## PARTICLES_ELEC_J[]:    interactions that should be calculated in this kernel.      */
 +/* ## PARTICLES_VDW_I[]/     Arrays with the list of i/j particle that have VdW          */
 +/* ## PARTICLES_VDW_J[]:     interactions that should be calculated in this kernel.      */
 +/* ##                                                                                    */
 +/* ## Settings for pairs of interactions (e.g. 2nd i particle against 1st j particle)    */
 +/* ## PAIRS_IJ[]:            Array with (i,j) tuples of pairs for which interactions     */
 +/* ##                        should be calculated in this kernel. Zero-charge particles  */
 +/* ##                        do not have interactions with particles without vdw, and    */
 +/* ##                        Vdw-only interactions are not evaluated in a no-vdw-kernel. */
 +/* ## INTERACTION_FLAGS[][]: 2D matrix, dimension e.g. 3*3 for water-water interactions. */
 +/* ##                        For each i-j pair, the element [I][J] is a list of strings  */
 +/* ##                        defining properties/flags of this interaction. Examples     */
 +/* ##                        include 'electrostatics'/'vdw' if that type of interaction  */
 +/* ##                        should be evaluated, 'rsq'/'rinv'/'rinvsq' if those values  */
 +/* ##                        are needed, and 'exactcutoff' or 'shift','switch' to        */
 +/* ##                        decide if the force/potential should be modified. This way  */
 +/* ##                        we only calculate values absolutely needed for each case.   */
 +
 +/* ## Calculate the size and offset for (merged/interleaved) table data */
 +
 +/*
 + * Gromacs nonbonded kernel:   {KERNEL_NAME}
 + * Electrostatics interaction: {KERNEL_ELEC}
 + * VdW interaction:            {KERNEL_VDW}
 + * Geometry:                   {GEOMETRY_I}-{GEOMETRY_J}
 + * Calculate force/pot:        {KERNEL_VF}
 + */
 +void
 +{KERNEL_NAME}
 +                    (t_nblist * gmx_restrict                nlist,
 +                     rvec * gmx_restrict                    xx,
 +                     rvec * gmx_restrict                    ff,
 +                     t_forcerec * gmx_restrict              fr,
 +                     t_mdatoms * gmx_restrict               mdatoms,
 +                     nb_kernel_data_t * gmx_restrict        kernel_data,
 +                     t_nrnb * gmx_restrict                  nrnb)
 +{
 +    /* ## Not all variables are used for all kernels, but any optimizing compiler fixes that, */
 +    /* ## so there is no point in going to extremes to exclude variables that are not needed. */
 +    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
 +     * just 0 for non-waters.
 +     * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
 +     * jnr indices corresponding to data put in the two positions in the SIMD register.
 +     */
 +    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
 +    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
 +    int              jnrA,jnrB;
 +    int              j_coord_offsetA,j_coord_offsetB;
 +    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
 +    real             rcutoff_scalar;
 +    real             *shiftvec,*fshift,*x,*f;
 +    __m128d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
 +    /* #for I in PARTICLES_I */
 +    int              vdwioffset{I};
 +    __m128d          ix{I},iy{I},iz{I},fix{I},fiy{I},fiz{I},iq{I},isai{I};
 +    /* #endfor */
 +    /* #for J in PARTICLES_J */
 +    int              vdwjidx{J}A,vdwjidx{J}B;
 +    __m128d          jx{J},jy{J},jz{J},fjx{J},fjy{J},fjz{J},jq{J},isaj{J};
 +    /* #endfor */
 +    /* #for I,J in PAIRS_IJ */
 +    __m128d          dx{I}{J},dy{I}{J},dz{I}{J},rsq{I}{J},rinv{I}{J},rinvsq{I}{J},r{I}{J},qq{I}{J},c6_{I}{J},c12_{I}{J};
 +    /* #endfor */
 +    /* #if KERNEL_ELEC != 'None' */
 +    __m128d          velec,felec,velecsum,facel,crf,krf,krf2;
 +    real             *charge;
 +    /* #endif */
 +    /* #if 'GeneralizedBorn' in KERNEL_ELEC */
 +    __m128i          gbitab;
 +    __m128d          vgb,fgb,vgbsum,dvdasum,gbscale,gbtabscale,isaprod,gbqqfactor,gbinvepsdiff,dvdaj,gbeps,dvdatmp;
 +    __m128d          minushalf = _mm_set1_pd(-0.5);
 +    real             *invsqrta,*dvda,*gbtab;
 +    /* #endif */
 +    /* #if KERNEL_VDW != 'None' */
 +    int              nvdwtype;
 +    __m128d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
 +    int              *vdwtype;
 +    real             *vdwparam;
 +    __m128d          one_sixth   = _mm_set1_pd(1.0/6.0);
 +    __m128d          one_twelfth = _mm_set1_pd(1.0/12.0);
 +    /* #endif */
 +    /* #if 'Table' in KERNEL_ELEC or 'GeneralizedBorn' in KERNEL_ELEC or 'Table' in KERNEL_VDW */
 +    __m128i          vfitab;
 +    __m128i          ifour       = _mm_set1_epi32(4);
 +    __m128d          rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
 +    real             *vftab;
 +    /* #endif */
 +    /* #if 'Ewald' in KERNEL_ELEC */
 +    __m128i          ewitab;
 +    __m128d          ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
 +    real             *ewtab;
 +    /* #endif */
 +    /* #if 'PotentialSwitch' in [KERNEL_MOD_ELEC,KERNEL_MOD_VDW] */
 +    __m128d          rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
 +    real             rswitch_scalar,d_scalar;
 +    /* #endif */
 +    __m128d          dummy_mask,cutoff_mask;
 +    __m128d          signbit   = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
 +    __m128d          one     = _mm_set1_pd(1.0);
 +    __m128d          two     = _mm_set1_pd(2.0);
 +    x                = xx[0];
 +    f                = ff[0];
 +
 +    nri              = nlist->nri;
 +    iinr             = nlist->iinr;
 +    jindex           = nlist->jindex;
 +    jjnr             = nlist->jjnr;
 +    shiftidx         = nlist->shift;
 +    gid              = nlist->gid;
 +    shiftvec         = fr->shift_vec[0];
 +    fshift           = fr->fshift[0];
 +    /* #if KERNEL_ELEC != 'None' */
 +    facel            = _mm_set1_pd(fr->epsfac);
 +    charge           = mdatoms->chargeA;
 +    /*     #if 'ReactionField' in KERNEL_ELEC */
 +    krf              = _mm_set1_pd(fr->ic->k_rf);
 +    krf2             = _mm_set1_pd(fr->ic->k_rf*2.0);
 +    crf              = _mm_set1_pd(fr->ic->c_rf);
 +    /*     #endif */
 +    /* #endif */
 +    /* #if KERNEL_VDW != 'None' */
 +    nvdwtype         = fr->ntype;
 +    vdwparam         = fr->nbfp;
 +    vdwtype          = mdatoms->typeA;
 +    /* #endif */
 +
 +    /* #if 'Table' in KERNEL_ELEC and 'Table' in KERNEL_VDW */
 +    vftab            = kernel_data->table_elec_vdw->data;
 +    vftabscale       = _mm_set1_pd(kernel_data->table_elec_vdw->scale);
 +    /* #elif 'Table' in KERNEL_ELEC */
 +    vftab            = kernel_data->table_elec->data;
 +    vftabscale       = _mm_set1_pd(kernel_data->table_elec->scale);
 +    /* #elif 'Table' in KERNEL_VDW */
 +    vftab            = kernel_data->table_vdw->data;
 +    vftabscale       = _mm_set1_pd(kernel_data->table_vdw->scale);
 +    /* #endif */
 +
 +    /* #if 'Ewald' in KERNEL_ELEC */
 +    sh_ewald         = _mm_set1_pd(fr->ic->sh_ewald);
 +    /*     #if KERNEL_VF=='Force' and KERNEL_MOD_ELEC!='PotentialSwitch' */
 +    ewtab            = fr->ic->tabq_coul_F;
 +    ewtabscale       = _mm_set1_pd(fr->ic->tabq_scale);
 +    ewtabhalfspace   = _mm_set1_pd(0.5/fr->ic->tabq_scale);
 +    /*     #else */
 +    ewtab            = fr->ic->tabq_coul_FDV0;
 +    ewtabscale       = _mm_set1_pd(fr->ic->tabq_scale);
 +    ewtabhalfspace   = _mm_set1_pd(0.5/fr->ic->tabq_scale);
 +    /*     #endif */
 +    /* #endif */
 +
 +    /* #if KERNEL_ELEC=='GeneralizedBorn' */
 +    invsqrta         = fr->invsqrta;
 +    dvda             = fr->dvda;
 +    gbtabscale       = _mm_set1_pd(fr->gbtab.scale);
 +    gbtab            = fr->gbtab.data;
 +    gbinvepsdiff     = _mm_set1_pd((1.0/fr->epsilon_r) - (1.0/fr->gb_epsilon_solvent));
 +    /* #endif */
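This setup is consistent with a Still-type generalized Born pair term (a reading of the code, not a statement made by this file): the signbit flip and the gbinvepsdiff factor combine to

    V^{GB}_{ij} = -\left(\frac{1}{\epsilon_r} - \frac{1}{\epsilon_s}\right)\frac{q_i q_j}{f_{GB}(r_{ij})}

with the tabulated spline in gbtab supplying the 1/f_{GB} radial dependence and the isai*isaj products carrying the Born-radius scaling.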
 +
 +    /* #if 'Water' in GEOMETRY_I */
 +    /* Setup water-specific parameters */
 +    inr              = nlist->iinr[0];
 +    /*     #for I in PARTICLES_ELEC_I */
 +    iq{I}              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+{I}]));
 +    /*     #endfor */
 +    /*     #for I in PARTICLES_VDW_I */
 +    vdwioffset{I}      = 2*nvdwtype*vdwtype[inr+{I}];
 +    /*     #endfor */
 +    /* #endif */
 +
 +    /* #if 'Water' in GEOMETRY_J */
 +    /*     #for J in PARTICLES_ELEC_J */
 +    jq{J}              = _mm_set1_pd(charge[inr+{J}]);
 +    /*     #endfor */
 +    /*     #for J in PARTICLES_VDW_J */
 +    vdwjidx{J}A        = 2*vdwtype[inr+{J}];
 +    /*     #endfor */
 +    /*     #for I,J in PAIRS_IJ */
 +    /*         #if 'electrostatics' in INTERACTION_FLAGS[I][J] */
 +    qq{I}{J}             = _mm_mul_pd(iq{I},jq{J});
 +    /*         #endif */
 +    /*         #if 'vdw' in INTERACTION_FLAGS[I][J] */
 +    c6_{I}{J}            = _mm_set1_pd(vdwparam[vdwioffset{I}+vdwjidx{J}A]);
 +    c12_{I}{J}           = _mm_set1_pd(vdwparam[vdwioffset{I}+vdwjidx{J}A+1]);
 +    /*         #endif */
 +    /*     #endfor */
 +    /* #endif */
 +
 +    /* #if KERNEL_MOD_ELEC!='None' or KERNEL_MOD_VDW!='None' */
 +    /*     #if KERNEL_ELEC!='None' */
 +    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
 +    rcutoff_scalar   = fr->rcoulomb;
 +    /*     #else */
 +    rcutoff_scalar   = fr->rvdw;
 +    /*     #endif */
 +    rcutoff          = _mm_set1_pd(rcutoff_scalar);
 +    rcutoff2         = _mm_mul_pd(rcutoff,rcutoff);
 +    /* #endif */
 +
 +    /* #if KERNEL_MOD_VDW=='PotentialShift' */
 +    sh_vdw_invrcut6  = _mm_set1_pd(fr->ic->sh_invrc6);
 +    rvdw             = _mm_set1_pd(fr->rvdw);
 +    /* #endif */
 +
 +    /* #if 'PotentialSwitch' in [KERNEL_MOD_ELEC,KERNEL_MOD_VDW] */
 +    /*     #if KERNEL_MOD_ELEC=='PotentialSwitch'  */
 +    rswitch_scalar   = fr->rcoulomb_switch;
 +    rswitch          = _mm_set1_pd(rswitch_scalar);
 +    /*     #else */
 +    rswitch_scalar   = fr->rvdw_switch;
 +    rswitch          = _mm_set1_pd(rswitch_scalar);
 +    /*     #endif */
 +    /* Setup switch parameters */
 +    d_scalar         = rcutoff_scalar-rswitch_scalar;
 +    d                = _mm_set1_pd(d_scalar);
 +    swV3             = _mm_set1_pd(-10.0/(d_scalar*d_scalar*d_scalar));
 +    swV4             = _mm_set1_pd( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
 +    swV5             = _mm_set1_pd( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
 +    /*     #if 'Force' in KERNEL_VF */
 +    swF2             = _mm_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar));
 +    swF3             = _mm_set1_pd( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
 +    swF4             = _mm_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
 +    /*     #endif */
 +    /* #endif */
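The swV/swF values are the coefficients of the standard fifth-order switching polynomial. With d = r - rswitch_scalar and \Delta = rcutoff_scalar - rswitch_scalar (and assuming the generated kernels accumulate sw = 1 + d^3(swV3 + d(swV4 + d*swV5)), which is not shown in this template excerpt):

    S = 1 - 10\left(\tfrac{d}{\Delta}\right)^3 + 15\left(\tfrac{d}{\Delta}\right)^4 - 6\left(\tfrac{d}{\Delta}\right)^5,
    \qquad
    \frac{dS}{dr} = \frac{1}{\Delta}\left[-30\left(\tfrac{d}{\Delta}\right)^2 + 60\left(\tfrac{d}{\Delta}\right)^3 - 30\left(\tfrac{d}{\Delta}\right)^4\right]

so S falls smoothly from 1 at the switch radius to 0 at the cutoff, with zero slope at both ends.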
 +
 +    /* Avoid stupid compiler warnings */
 +    jnrA = jnrB = 0;
 +    j_coord_offsetA = 0;
 +    j_coord_offsetB = 0;
 +
 +    /* ## Keep track of the floating point operations we issue for reporting! */
 +    /* #define OUTERFLOPS 0 */
 +    outeriter        = 0;
 +    inneriter        = 0;
 +
 +    /* Start outer loop over neighborlists */
 +    for(iidx=0; iidx<nri; iidx++)
 +    {
 +        /* Load shift vector for this list */
 +        i_shift_offset   = DIM*shiftidx[iidx];
 +
 +        /* Load limits for loop over neighbors */
 +        j_index_start    = jindex[iidx];
 +        j_index_end      = jindex[iidx+1];
 +
 +        /* Get outer coordinate index */
 +        inr              = iinr[iidx];
 +        i_coord_offset   = DIM*inr;
 +
 +        /* Load i particle coords and add shift vector */
 +        /* #if GEOMETRY_I == 'Particle' */
 +        gmx_mm_load_shift_and_1rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
 +        /* #elif GEOMETRY_I == 'Water3' */
 +        gmx_mm_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
 +                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
 +        /* #elif GEOMETRY_I == 'Water4' */
 +        /*     #if 0 in PARTICLES_I                 */
 +        gmx_mm_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
 +                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
 +        /*     #else                                */
 +        gmx_mm_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
 +                                                 &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
 +        /*     #endif                               */
 +        /* #endif                                   */
 +
 +        /* #if 'Force' in KERNEL_VF */
 +        /*     #for I in PARTICLES_I */
 +        fix{I}             = _mm_setzero_pd();
 +        fiy{I}             = _mm_setzero_pd();
 +        fiz{I}             = _mm_setzero_pd();
 +        /*     #endfor */
 +        /* #endif */
 +
 +        /* ## For water we already preloaded parameters at the start of the kernel */
 +        /* #if not 'Water' in GEOMETRY_I */
 +        /* Load parameters for i particles */
 +        /*     #for I in PARTICLES_ELEC_I */
 +        iq{I}              = _mm_mul_pd(facel,_mm_load1_pd(charge+inr+{I}));
 +        /*         #define OUTERFLOPS OUTERFLOPS+1 */
 +        /*         #if KERNEL_ELEC=='GeneralizedBorn' */
 +        isai{I}            = _mm_load1_pd(invsqrta+inr+{I});
 +        /*         #endif */
 +        /*     #endfor */
 +        /*     #for I in PARTICLES_VDW_I */
 +        vdwioffset{I}      = 2*nvdwtype*vdwtype[inr+{I}];
 +        /*     #endfor */
 +        /* #endif */
 +
 +        /* #if 'Potential' in KERNEL_VF */
 +        /* Reset potential sums */
 +        /*     #if KERNEL_ELEC != 'None' */
 +        velecsum         = _mm_setzero_pd();
 +        /*     #endif */
 +        /*     #if 'GeneralizedBorn' in KERNEL_ELEC */
 +        vgbsum           = _mm_setzero_pd();
 +        /*     #endif */
 +        /*     #if KERNEL_VDW != 'None' */
 +        vvdwsum          = _mm_setzero_pd();
 +        /*     #endif */
 +        /* #endif */
 +        /*     #if 'GeneralizedBorn' in KERNEL_ELEC and 'Force' in KERNEL_VF */
 +        dvdasum          = _mm_setzero_pd();
 +        /*     #endif */
 +
 +        /* #for ROUND in ['Loop','Epilogue'] */
 +
 +        /* #if ROUND =='Loop' */
 +        /* Start inner kernel loop */
 +        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
 +        {
 +        /* ## First round is normal loop (next statement resets indentation) */
 +        /*     #if 0 */
 +        }
 +        /*     #endif */
 +        /* #else */
 +        if(jidx<j_index_end)
 +        {
 +        /* ## Second round is epilogue */
 +        /* #endif */
 +        /* #define INNERFLOPS 0 */
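 +        /* ## The double-precision SSE kernels process two j atoms per iteration,
 +         * ## so the ROUND loop above emits the inner body twice: once as the
 +         * ## two-way unrolled loop and once as an epilogue for a single leftover
 +         * ## j atom, in which the unused upper SIMD lane is cleared with
 +         * ## _mm_unpacklo_pd(x,_mm_setzero_pd()) before anything is accumulated.
 +         */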
 +
 +            /* #if ROUND =='Loop' */
 +            /* Get j neighbor index, and coordinate index */
 +            jnrA             = jjnr[jidx];
 +            jnrB             = jjnr[jidx+1];
 +            j_coord_offsetA  = DIM*jnrA;
 +            j_coord_offsetB  = DIM*jnrB;
 +
 +            /* load j atom coordinates */
 +            /*     #if GEOMETRY_J == 'Particle'             */
 +            gmx_mm_load_1rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
 +                                              &jx0,&jy0,&jz0);
 +            /*     #elif GEOMETRY_J == 'Water3'             */
 +            gmx_mm_load_3rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
 +                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
 +            /*     #elif GEOMETRY_J == 'Water4'             */
 +            /*         #if 0 in PARTICLES_J                 */
 +            gmx_mm_load_4rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
 +                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
 +                                              &jy2,&jz2,&jx3,&jy3,&jz3);
 +            /*         #else                                */
 +            gmx_mm_load_3rvec_2ptr_swizzle_pd(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
 +                                              &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
 +            /*         #endif                               */
 +            /*     #endif                                   */
 +            /* #else */
 +            jnrA             = jjnr[jidx];
 +            j_coord_offsetA  = DIM*jnrA;
 +
 +            /* load j atom coordinates */
 +            /*     #if GEOMETRY_J == 'Particle'             */
 +            gmx_mm_load_1rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
 +                                              &jx0,&jy0,&jz0);
 +            /*     #elif GEOMETRY_J == 'Water3'             */
 +            gmx_mm_load_3rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
 +                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
 +            /*     #elif GEOMETRY_J == 'Water4'             */
 +            /*         #if 0 in PARTICLES_J                 */
 +            gmx_mm_load_4rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
 +                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
 +                                              &jy2,&jz2,&jx3,&jy3,&jz3);
 +            /*         #else                                */
 +            gmx_mm_load_3rvec_1ptr_swizzle_pd(x+j_coord_offsetA+DIM,
 +                                              &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
 +            /*         #endif                               */
 +            /*     #endif                                   */
 +            /* #endif */
 +
 +            /* Calculate displacement vector */
 +            /* #for I,J in PAIRS_IJ */
 +            dx{I}{J}             = _mm_sub_pd(ix{I},jx{J});
 +            dy{I}{J}             = _mm_sub_pd(iy{I},jy{J});
 +            dz{I}{J}             = _mm_sub_pd(iz{I},jz{J});
 +            /*     #define INNERFLOPS INNERFLOPS+3 */
 +            /* #endfor */
 +
 +            /* Calculate squared distance and things based on it */
 +            /* #for I,J in PAIRS_IJ */
 +            rsq{I}{J}            = gmx_mm_calc_rsq_pd(dx{I}{J},dy{I}{J},dz{I}{J});
 +            /*     #define INNERFLOPS INNERFLOPS+5 */
 +            /* #endfor */
 +
 +            /* #for I,J in PAIRS_IJ */
 +            /*     #if 'rinv' in INTERACTION_FLAGS[I][J] */
 +            rinv{I}{J}           = gmx_mm_invsqrt_pd(rsq{I}{J});
 +            /*         #define INNERFLOPS INNERFLOPS+5 */
 +            /*     #endif */
 +            /* #endfor */
 +
 +            /* #for I,J in PAIRS_IJ */
 +            /*     #if 'rinvsq' in INTERACTION_FLAGS[I][J] */
 +            /*         # if 'rinv' not in INTERACTION_FLAGS[I][J] */
 +            rinvsq{I}{J}         = gmx_mm_inv_pd(rsq{I}{J});
 +            /*             #define INNERFLOPS INNERFLOPS+4 */
 +            /*         #else */
 +            rinvsq{I}{J}         = _mm_mul_pd(rinv{I}{J},rinv{I}{J});
 +            /*             #define INNERFLOPS INNERFLOPS+1 */
 +            /*         #endif */
 +            /*     #endif */
 +            /* #endfor */
 +
 +            /* #if not 'Water' in GEOMETRY_J */
 +            /* Load parameters for j particles */
 +            /*     #for J in PARTICLES_ELEC_J */
 +            /*         #if ROUND =='Loop' */
 +            jq{J}              = gmx_mm_load_2real_swizzle_pd(charge+jnrA+{J},charge+jnrB+{J});
 +            /*         #else */
 +            jq{J}              = _mm_load_sd(charge+jnrA+{J});
 +            /*         #endif */
 +            /*         #if KERNEL_ELEC=='GeneralizedBorn' */
 +            /*             #if ROUND =='Loop' */
 +            isaj{J}            = gmx_mm_load_2real_swizzle_pd(invsqrta+jnrA+{J},invsqrta+jnrB+{J});
 +            /*             #else */
 +            isaj{J}            = _mm_load_sd(invsqrta+jnrA+{J});
 +            /*             #endif */
 +            /*         #endif */
 +            /*     #endfor */
 +            /*     #for J in PARTICLES_VDW_J */
 +            vdwjidx{J}A        = 2*vdwtype[jnrA+{J}];
 +            /*         #if ROUND =='Loop' */
 +            vdwjidx{J}B        = 2*vdwtype[jnrB+{J}];
 +            /*         #endif */
 +            /*     #endfor */
 +            /* #endif */
 +
 +            /* #if 'Force' in KERNEL_VF and not 'Particle' in GEOMETRY_I */
 +            /*     #for J in PARTICLES_J */
 +            fjx{J}             = _mm_setzero_pd();
 +            fjy{J}             = _mm_setzero_pd();
 +            fjz{J}             = _mm_setzero_pd();
 +            /*     #endfor */
 +            /* #endif */
 +
 +            /* #for I,J in PAIRS_IJ */
 +
 +            /**************************
 +             * CALCULATE INTERACTIONS *
 +             **************************/
 +
 +            /*     #if 'exactcutoff' in INTERACTION_FLAGS[I][J] */
 +            /*         ## We always calculate rinv/rinvsq above to enable pipelining in compilers (performance tested on x86) */
 +            if (gmx_mm_any_lt(rsq{I}{J},rcutoff2))
 +            {
 +                /*     #if 0    ## this and the next two lines are a hack to maintain auto-indentation in the template file */
 +            }
 +            /*         #endif */
 +            /*         #define INNERFLOPS INNERFLOPS+1 */
 +            /*     #endif */
 +
 +            /*     #if 'r' in INTERACTION_FLAGS[I][J] */
 +            r{I}{J}              = _mm_mul_pd(rsq{I}{J},rinv{I}{J});
 +            /*         #define INNERFLOPS INNERFLOPS+1 */
 +            /*     #endif */
 +
 +            /*     ## For water geometries we already loaded parameters at the start of the kernel */
 +            /*     #if not 'Water' in GEOMETRY_J */
 +            /* Compute parameters for interactions between i and j atoms */
 +            /*         #if 'electrostatics' in INTERACTION_FLAGS[I][J] */
 +            qq{I}{J}             = _mm_mul_pd(iq{I},jq{J});
 +            /*             #define INNERFLOPS INNERFLOPS+1 */
 +            /*         #endif */
 +            /*         #if 'vdw' in INTERACTION_FLAGS[I][J] */
 +            /*             #if ROUND == 'Loop' */
 +            gmx_mm_load_2pair_swizzle_pd(vdwparam+vdwioffset{I}+vdwjidx{J}A,
 +                                         vdwparam+vdwioffset{I}+vdwjidx{J}B,&c6_{I}{J},&c12_{I}{J});
 +            /*             #else */
 +            gmx_mm_load_1pair_swizzle_pd(vdwparam+vdwioffset{I}+vdwjidx{J}A,&c6_{I}{J},&c12_{I}{J});
 +            /*             #endif */
 +            /*         #endif */
 +            /*     #endif */
 +
 +            /*     #if 'table' in INTERACTION_FLAGS[I][J] */
 +            /* Calculate table index by multiplying r by the table scale and truncating to an integer */
 +            rt               = _mm_mul_pd(r{I}{J},vftabscale);
 +            vfitab           = _mm_cvttpd_epi32(rt);
 +            vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
 +            /*         #define INNERFLOPS INNERFLOPS+4                          */
 +            /*         #if 'Table' in KERNEL_ELEC and 'Table' in KERNEL_VDW     */
 +            /*             ## 3 tables, 4 data per point: multiply index by 12 */
 +            vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
 +            /*         #elif 'Table' in KERNEL_ELEC                             */
 +            /*             ## 1 table, 4 data per point: multiply index by 4   */
 +            vfitab           = _mm_slli_epi32(vfitab,2);
 +            /*         #elif 'Table' in KERNEL_VDW                              */
 +            /*             ## 2 tables, 4 data per point: multiply index by 8  */
 +            vfitab           = _mm_slli_epi32(vfitab,3);
 +            /*         #endif                                                   */
 +            /*     #endif */
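 +            /* ## rt = r*scale expresses r in table-bin units; _mm_cvttpd_epi32
 +             * ## truncates toward zero, which for rt >= 0 matches the floor used
 +             * ## for vfeps, so vfeps is the fraction within the bin. Each table
 +             * ## point holds 4 values (Y,F,G,H), so the shifts above turn the bin
 +             * ## number into a float offset of 4, 8 or 12 per bin, depending on
 +             * ## how many 4-value tables are interleaved.
 +             */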
 +
 +            /*     ## ELECTROSTATIC INTERACTIONS */
 +            /*     #if 'electrostatics' in INTERACTION_FLAGS[I][J] */
 +
 +            /*         #if KERNEL_ELEC=='Coulomb' */
 +
 +            /* COULOMB ELECTROSTATICS */
 +            velec            = _mm_mul_pd(qq{I}{J},rinv{I}{J});
 +            /*             #define INNERFLOPS INNERFLOPS+1 */
 +            /*             #if 'Force' in KERNEL_VF */
 +            felec            = _mm_mul_pd(velec,rinvsq{I}{J});
 +            /*                 #define INNERFLOPS INNERFLOPS+2 */
 +            /*             #endif */
 +
 +            /*         #elif KERNEL_ELEC=='ReactionField' */
 +
 +            /* REACTION-FIELD ELECTROSTATICS */
 +            /*             #if 'Potential' in KERNEL_VF */
 +            velec            = _mm_mul_pd(qq{I}{J},_mm_sub_pd(_mm_add_pd(rinv{I}{J},_mm_mul_pd(krf,rsq{I}{J})),crf));
 +            /*                 #define INNERFLOPS INNERFLOPS+4 */
 +            /*             #endif */
 +            /*             #if 'Force' in KERNEL_VF */
 +            felec            = _mm_mul_pd(qq{I}{J},_mm_sub_pd(_mm_mul_pd(rinv{I}{J},rinvsq{I}{J}),krf2));
 +            /*                 #define INNERFLOPS INNERFLOPS+3 */
 +            /*             #endif */
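 +            /* ## This is the standard reaction-field form
 +             * ##   V(r)   = qq*(1/r + krf*r^2 - crf)
 +             * ##   F(r)/r = qq*(1/r^3 - 2*krf)
 +             * ## matching the code above, with krf2 evidently holding the
 +             * ## premultiplied value 2*krf.
 +             */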
 +
 +            /*         #elif KERNEL_ELEC=='GeneralizedBorn' */
 +
 +            /* GENERALIZED BORN AND COULOMB ELECTROSTATICS */
 +            isaprod          = _mm_mul_pd(isai{I},isaj{J});
 +            gbqqfactor       = _mm_xor_pd(signbit,_mm_mul_pd(qq{I}{J},_mm_mul_pd(isaprod,gbinvepsdiff)));
 +            gbscale          = _mm_mul_pd(isaprod,gbtabscale);
 +            /*             #define INNERFLOPS INNERFLOPS+5 */
 +
 +            /* Calculate the generalized Born table index - this is a separate table from the normal one,
 +             * but we use the same procedure: multiply r by the scale and truncate to an integer.
 +             */
 +            rt               = _mm_mul_pd(r{I}{J},gbscale);
 +            gbitab           = _mm_cvttpd_epi32(rt);
 +            gbeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
 +            gbitab           = _mm_slli_epi32(gbitab,2);
 +
 +            Y                = _mm_load_pd( gbtab + gmx_mm_extract_epi32(gbitab,0) );
 +            /*             #if ROUND == 'Loop' */
 +            F                = _mm_load_pd( gbtab + gmx_mm_extract_epi32(gbitab,1) );
 +            /*             #else */
 +            F                = _mm_setzero_pd();
 +            /*             #endif */
 +            GMX_MM_TRANSPOSE2_PD(Y,F);
 +            G                = _mm_load_pd( gbtab + gmx_mm_extract_epi32(gbitab,0) +2);
 +            /*             #if ROUND == 'Loop' */
 +            H                = _mm_load_pd( gbtab + gmx_mm_extract_epi32(gbitab,1) +2);
 +            /*             #else */
 +            H                = _mm_setzero_pd();
 +            /*             #endif */
 +            GMX_MM_TRANSPOSE2_PD(G,H);
 +            Heps             = _mm_mul_pd(gbeps,H);
 +            Fp               = _mm_add_pd(F,_mm_mul_pd(gbeps,_mm_add_pd(G,Heps)));
 +            VV               = _mm_add_pd(Y,_mm_mul_pd(gbeps,Fp));
 +            vgb              = _mm_mul_pd(gbqqfactor,VV);
 +            /*             #define INNERFLOPS INNERFLOPS+10 */
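 +            /* ## The Y/F/G/H quadruplet is a cubic spline table entry; with eps
 +             * ## the in-bin fraction, the code above evaluates, in Horner form,
 +             * ##   VV = Y + eps*(F + eps*(G + eps*H))
 +             * ##   FF = dVV/deps = F + eps*(2*G + 3*eps*H)
 +             * ## so the radial derivative of the tabulated function is FF*scale.
 +             */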
 +
 +            /*             #if 'Force' in KERNEL_VF */
 +            FF               = _mm_add_pd(Fp,_mm_mul_pd(gbeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
 +            fgb              = _mm_mul_pd(gbqqfactor,_mm_mul_pd(FF,gbscale));
 +            dvdatmp          = _mm_mul_pd(minushalf,_mm_add_pd(vgb,_mm_mul_pd(fgb,r{I}{J})));
++            /*                 #if ROUND == 'Epilogue' */
++            dvdatmp          = _mm_unpacklo_pd(dvdatmp,_mm_setzero_pd());
++            /*                 #endif */
 +            dvdasum          = _mm_add_pd(dvdasum,dvdatmp);
 +            /*             #if ROUND == 'Loop' */
 +            gmx_mm_increment_2real_swizzle_pd(dvda+jnrA,dvda+jnrB,_mm_mul_pd(dvdatmp,_mm_mul_pd(isaj{J},isaj{J})));
 +            /*             #else */
 +            gmx_mm_increment_1real_pd(dvda+jnrA,_mm_mul_pd(dvdatmp,_mm_mul_pd(isaj{J},isaj{J})));
 +            /*             #endif */
 +            /*                 #define INNERFLOPS INNERFLOPS+13 */
 +            /*             #endif */
 +            velec            = _mm_mul_pd(qq{I}{J},rinv{I}{J});
 +            /*                 #define INNERFLOPS INNERFLOPS+1 */
 +            /*             #if 'Force' in KERNEL_VF */
 +            felec            = _mm_mul_pd(_mm_sub_pd(_mm_mul_pd(velec,rinv{I}{J}),fgb),rinv{I}{J});
 +            /*                 #define INNERFLOPS INNERFLOPS+3 */
 +            /*             #endif */
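 +            /* ## Here vgb is the GB pair energy from the spline and fgb = dVgb/dr
 +             * ## (the gbscale factor converts the d/deps derivative to d/dr).
 +             * ## dvdatmp = -(vgb + fgb*r)/2 is accumulated into the Born-radii
 +             * ## derivative dvda: into dvdasum for the i atom (scaled by isai^2
 +             * ## after the j loop) and directly into dvda[j], scaled by isaj^2.
 +             * ## The felec update above folds the GB force into the Coulomb one:
 +             * ## fscal = (qq/r^2 - fgb)/r.
 +             */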
 +
 +            /*         #elif KERNEL_ELEC=='Ewald' */
 +            /* EWALD ELECTROSTATICS */
 +
 +            /* Calculate Ewald table index by multiplying r by the table scale and truncating to an integer */
 +            ewrt             = _mm_mul_pd(r{I}{J},ewtabscale);
 +            ewitab           = _mm_cvttpd_epi32(ewrt);
 +            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
 +            /*             #define INNERFLOPS INNERFLOPS+4 */
 +            /*             #if 'Potential' in KERNEL_VF or KERNEL_MOD_ELEC=='PotentialSwitch' */
 +            ewitab           = _mm_slli_epi32(ewitab,2);
 +            ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
 +            /*                 #if ROUND == 'Loop' */
 +            ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
 +            /*                 #else */
 +            ewtabD           = _mm_setzero_pd();
 +            /*                 #endif */
 +            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
 +            ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
 +            /*                 #if ROUND == 'Loop' */
 +            ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
 +            /*                 #else */
 +            ewtabFn          = _mm_setzero_pd();
 +            /*                 #endif */
 +            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
 +            felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
 +            /*                 #define INNERFLOPS INNERFLOPS+2 */
 +            /*                 #if KERNEL_MOD_ELEC=='PotentialShift' */
 +            velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
 +            velec            = _mm_mul_pd(qq{I}{J},_mm_sub_pd(_mm_sub_pd(rinv{I}{J},sh_ewald),velec));
 +            /*                     #define INNERFLOPS INNERFLOPS+7 */
 +            /*                 #else */
 +            velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
 +            velec            = _mm_mul_pd(qq{I}{J},_mm_sub_pd(rinv{I}{J},velec));
 +            /*                     #define INNERFLOPS INNERFLOPS+6 */
 +            /*                 #endif */
 +            /*                 #if 'Force' in KERNEL_VF */
 +            felec            = _mm_mul_pd(_mm_mul_pd(qq{I}{J},rinv{I}{J}),_mm_sub_pd(rinvsq{I}{J},felec));
 +            /*                      #define INNERFLOPS INNERFLOPS+3 */
 +            /*                 #endif */
 +            /*             #elif KERNEL_VF=='Force' */
 +            /*                 #if ROUND == 'Loop' */
 +            gmx_mm_load_2pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
 +                                         &ewtabF,&ewtabFn);
 +            /*                 #else */
 +            gmx_mm_load_1pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
 +            /*                 #endif */
 +            felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
 +            felec            = _mm_mul_pd(_mm_mul_pd(qq{I}{J},rinv{I}{J}),_mm_sub_pd(rinvsq{I}{J},felec));
 +            /*                 #define INNERFLOPS INNERFLOPS+7 */
 +            /*             #endif */
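 +            /* ## The FDV0 Ewald table evidently stores, per point, the force F,
 +             * ## the difference D to the next point and the potential V. Thus
 +             * ## felec = F + eps*D interpolates the force linearly (the force-only
 +             * ## branch above does the same thing explicitly as (1-eps)*F+eps*Fn),
 +             * ## and velec = V - (eps/(2*scale))*(F + felec) integrates the
 +             * ## interpolated force over the bin fraction (trapezoid rule). The
 +             * ## table holds the correction term, so the final real-space Ewald
 +             * ## contributions are qq*(1/r - Vtab) and qq*(1/r^2 - Ftab)/r.
 +             */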
 +
 +            /*         #elif KERNEL_ELEC=='CubicSplineTable' */
 +
 +            /* CUBIC SPLINE TABLE ELECTROSTATICS */
 +            Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
 +            /*             #if ROUND == 'Loop' */
 +            F                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
 +            /*             #else */
 +            F                = _mm_setzero_pd();
 +            /*             #endif */
 +            GMX_MM_TRANSPOSE2_PD(Y,F);
 +            G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
 +            /*             #if ROUND == 'Loop' */
 +            H                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
 +            /*             #else */
 +            H                = _mm_setzero_pd();
 +            /*             #endif */
 +            GMX_MM_TRANSPOSE2_PD(G,H);
 +            Heps             = _mm_mul_pd(vfeps,H);
 +            Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
 +            /*             #define INNERFLOPS INNERFLOPS+4 */
 +            /*             #if 'Potential' in KERNEL_VF */
 +            VV               = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
 +            velec            = _mm_mul_pd(qq{I}{J},VV);
 +            /*                 #define INNERFLOPS INNERFLOPS+3 */
 +            /*             #endif */
 +            /*             #if 'Force' in KERNEL_VF */
 +            FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
 +            felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq{I}{J},FF),_mm_mul_pd(vftabscale,rinv{I}{J})));
 +            /*                 #define INNERFLOPS INNERFLOPS+7 */
 +            /*             #endif */
 +            /*         #endif */
 +            /*         ## End of check for electrostatics interaction forms */
 +            /*     #endif */
 +            /*     ## END OF ELECTROSTATIC INTERACTION CHECK FOR PAIR I-J */
 +
 +            /*     #if 'vdw' in INTERACTION_FLAGS[I][J] */
 +
 +            /*         #if KERNEL_VDW=='LennardJones' */
 +
 +            /* LENNARD-JONES DISPERSION/REPULSION */
 +
 +            rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq{I}{J},rinvsq{I}{J}),rinvsq{I}{J});
 +            /*             #define INNERFLOPS INNERFLOPS+2 */
 +            /*             #if 'Potential' in KERNEL_VF or KERNEL_MOD_VDW=='PotentialSwitch' */
 +            vvdw6            = _mm_mul_pd(c6_{I}{J},rinvsix);
 +            vvdw12           = _mm_mul_pd(c12_{I}{J},_mm_mul_pd(rinvsix,rinvsix));
 +            /*                 #define INNERFLOPS INNERFLOPS+3 */
 +            /*                 #if KERNEL_MOD_VDW=='PotentialShift' */
 +            vvdw             = _mm_sub_pd(_mm_mul_pd( _mm_sub_pd(vvdw12 , _mm_mul_pd(c12_{I}{J},_mm_mul_pd(sh_vdw_invrcut6,sh_vdw_invrcut6))), one_twelfth) ,
 +                                          _mm_mul_pd( _mm_sub_pd(vvdw6,_mm_mul_pd(c6_{I}{J},sh_vdw_invrcut6)),one_sixth));
 +            /*                     #define INNERFLOPS INNERFLOPS+8 */
 +            /*                 #else */
 +            vvdw             = _mm_sub_pd( _mm_mul_pd(vvdw12,one_twelfth) , _mm_mul_pd(vvdw6,one_sixth) );
 +            /*                     #define INNERFLOPS INNERFLOPS+3 */
 +            /*                 #endif */
 +            /*                 ## The force branch is nested inside the potential check, i.e. the potential part has already been computed */
 +            /*                 #if 'Force' in KERNEL_VF */
 +            fvdw             = _mm_mul_pd(_mm_sub_pd(vvdw12,vvdw6),rinvsq{I}{J});
 +            /*                     #define INNERFLOPS INNERFLOPS+2 */
 +            /*                 #endif */
 +            /*             #elif KERNEL_VF=='Force' */
 +            /*                 ## Force-only LennardJones makes it possible to save 1 flop (they do add up...) */
 +            fvdw             = _mm_mul_pd(_mm_sub_pd(_mm_mul_pd(c12_{I}{J},rinvsix),c6_{I}{J}),_mm_mul_pd(rinvsix,rinvsq{I}{J}));
 +            /*                 #define INNERFLOPS INNERFLOPS+4 */
 +            /*             #endif */
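 +            /* ## Note the factors: the force is (c12*rinv^12 - c6*rinv^6)*rinv^2
 +             * ## with no 12 and 6, while the potential divides by 12 and 6, so
 +             * ## the parameters in vdwparam are evidently stored premultiplied by
 +             * ## 12 and 6. A scalar sketch under that reading (r2 = r*r):
 +             * ##   rinv6 = 1.0/(r2*r2*r2);
 +             * ##   fscal = (c12*rinv6*rinv6 - c6*rinv6)/r2;    (force along r)/r
 +             * ##   v     = c12*rinv6*rinv6/12.0 - c6*rinv6/6.0;
 +             */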
 +
 +            /*         #elif KERNEL_VDW=='CubicSplineTable' */
 +
 +            /* CUBIC SPLINE TABLE DISPERSION */
 +            /*             #if 'Table' in KERNEL_ELEC */
 +            vfitab           = _mm_add_epi32(vfitab,ifour);
 +            /*             #endif                     */
 +            Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
 +            /*             #if ROUND == 'Loop' */
 +            F                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
 +            /*             #else */
 +            F                = _mm_setzero_pd();
 +            /*             #endif */
 +            GMX_MM_TRANSPOSE2_PD(Y,F);
 +            G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
 +            /*             #if ROUND == 'Loop' */
 +            H                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
 +            /*             #else */
 +            H                = _mm_setzero_pd();
 +            /*             #endif */
 +            GMX_MM_TRANSPOSE2_PD(G,H);
 +            Heps             = _mm_mul_pd(vfeps,H);
 +            Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
 +            /*             #define INNERFLOPS INNERFLOPS+4 */
 +            /*             #if 'Potential' in KERNEL_VF */
 +            VV               = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
 +            vvdw6            = _mm_mul_pd(c6_{I}{J},VV);
 +            /*                 #define INNERFLOPS INNERFLOPS+3 */
 +            /*             #endif */
 +            /*             #if 'Force' in KERNEL_VF */
 +            FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
 +            fvdw6            = _mm_mul_pd(c6_{I}{J},FF);
 +            /*                 #define INNERFLOPS INNERFLOPS+4 */
 +            /*             #endif */
 +
 +            /* CUBIC SPLINE TABLE REPULSION */
 +            vfitab           = _mm_add_epi32(vfitab,ifour);
 +            Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
 +            /*             #if ROUND == 'Loop' */
 +            F                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
 +            /*             #else */
 +            F                = _mm_setzero_pd();
 +            /*             #endif */
 +            GMX_MM_TRANSPOSE2_PD(Y,F);
 +            G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
 +            /*             #if ROUND == 'Loop' */
 +            H                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
 +            /*             #else */
 +            H                = _mm_setzero_pd();
 +            /*             #endif */
 +            GMX_MM_TRANSPOSE2_PD(G,H);
 +            Heps             = _mm_mul_pd(vfeps,H);
 +            Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
 +            /*             #define INNERFLOPS INNERFLOPS+4 */
 +            /*             #if 'Potential' in KERNEL_VF */
 +            VV               = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
 +            vvdw12           = _mm_mul_pd(c12_{I}{J},VV);
 +            /*                 #define INNERFLOPS INNERFLOPS+3 */
 +            /*             #endif */
 +            /*             #if 'Force' in KERNEL_VF */
 +            FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
 +            fvdw12           = _mm_mul_pd(c12_{I}{J},FF);
 +            /*                 #define INNERFLOPS INNERFLOPS+5 */
 +            /*             #endif */
 +            /*             #if 'Potential' in KERNEL_VF */
 +            vvdw             = _mm_add_pd(vvdw12,vvdw6);
 +            /*                 #define INNERFLOPS INNERFLOPS+1 */
 +            /*             #endif */
 +            /*             #if 'Force' in KERNEL_VF */
 +            fvdw             = _mm_xor_pd(signbit,_mm_mul_pd(_mm_add_pd(fvdw6,fvdw12),_mm_mul_pd(vftabscale,rinv{I}{J})));
 +            /*                 #define INNERFLOPS INNERFLOPS+4 */
 +            /*             #endif */
 +            /*         #endif */
 +            /*         ## End of check for vdw interaction forms */
 +            /*     #endif */
 +            /*     ## END OF VDW INTERACTION CHECK FOR PAIR I-J */
 +
 +            /*     #if 'switch' in INTERACTION_FLAGS[I][J] */
 +            d                = _mm_sub_pd(r{I}{J},rswitch);
 +            d                = _mm_max_pd(d,_mm_setzero_pd());
 +            d2               = _mm_mul_pd(d,d);
 +            sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
 +            /*         #define INNERFLOPS INNERFLOPS+10 */
 +
 +            /*         #if 'Force' in KERNEL_VF */
 +            dsw              = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
 +            /*             #define INNERFLOPS INNERFLOPS+5 */
 +            /*         #endif */
 +
 +            /* Evaluate switch function */
 +            /*         #if 'Force' in KERNEL_VF */
 +            /* fscal' = f'/r = -(v*sw)'/r = -(v'*sw + v*dsw)/r = -v'*sw/r - v*dsw/r = fscal*sw - v*dsw/r */
 +            /*             #if 'electrostatics' in INTERACTION_FLAGS[I][J] and KERNEL_MOD_ELEC=='PotentialSwitch' */
 +            felec            = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv{I}{J},_mm_mul_pd(velec,dsw)) );
 +            /*                 #define INNERFLOPS INNERFLOPS+4 */
 +            /*             #endif */
 +            /*             #if 'vdw' in INTERACTION_FLAGS[I][J] and KERNEL_MOD_VDW=='PotentialSwitch' */
 +            fvdw             = _mm_sub_pd( _mm_mul_pd(fvdw,sw) , _mm_mul_pd(rinv{I}{J},_mm_mul_pd(vvdw,dsw)) );
 +            /*                 #define INNERFLOPS INNERFLOPS+4 */
 +            /*             #endif */
 +            /*         #endif */
 +            /*         #if 'Potential' in KERNEL_VF */
 +            /*             #if 'electrostatics' in INTERACTION_FLAGS[I][J] and KERNEL_MOD_ELEC=='PotentialSwitch' */
 +            velec            = _mm_mul_pd(velec,sw);
 +            /*                 #define INNERFLOPS INNERFLOPS+1 */
 +            /*             #endif */
 +            /*             #if 'vdw' in INTERACTION_FLAGS[I][J] and KERNEL_MOD_VDW=='PotentialSwitch' */
 +            vvdw             = _mm_mul_pd(vvdw,sw);
 +            /*                 #define INNERFLOPS INNERFLOPS+1 */
 +            /*             #endif */
 +            /*         #endif */
 +            /*     #endif */
 +            /*     #if 'exactcutoff' in INTERACTION_FLAGS[I][J] */
 +            cutoff_mask      = _mm_cmplt_pd(rsq{I}{J},rcutoff2);
 +            /*         #define INNERFLOPS INNERFLOPS+1 */
 +            /*     #endif */
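 +            /* ## _mm_cmplt_pd yields an all-ones lane where rsq < rcutoff2 and a
 +             * ## zero lane otherwise, so the _mm_and_pd operations below zero out
 +             * ## the energy and force contributions of pairs beyond the cutoff
 +             * ## without branching on individual lanes.
 +             */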
 +
 +            /*     #if 'Potential' in KERNEL_VF */
 +            /* Update potential sum for this i atom from the interaction with this j atom. */
 +            /*         #if 'electrostatics' in INTERACTION_FLAGS[I][J] */
 +            /*             #if 'exactcutoff' in INTERACTION_FLAGS[I][J] */
 +            velec            = _mm_and_pd(velec,cutoff_mask);
 +            /*                 #define INNERFLOPS INNERFLOPS+1 */
 +            /*             #endif                                       */
 +            /*             #if ROUND == 'Epilogue' */
 +            velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
 +            /*             #endif */
 +            velecsum         = _mm_add_pd(velecsum,velec);
 +            /*             #define INNERFLOPS INNERFLOPS+1 */
 +            /*             #if KERNEL_ELEC=='GeneralizedBorn' */
 +            /*             #if 'exactcutoff' in INTERACTION_FLAGS[I][J] */
 +            vgb              = _mm_and_pd(vgb,cutoff_mask);
 +            /*                 #define INNERFLOPS INNERFLOPS+1 */
 +            /*             #endif                                       */
 +            /*             #if ROUND == 'Epilogue' */
 +            vgb              = _mm_unpacklo_pd(vgb,_mm_setzero_pd());
 +            /*             #endif */
 +            vgbsum           = _mm_add_pd(vgbsum,vgb);
 +            /*                 #define INNERFLOPS INNERFLOPS+1 */
 +            /*             #endif */
 +            /*         #endif */
 +            /*         #if 'vdw' in INTERACTION_FLAGS[I][J] */
 +            /*             #if 'exactcutoff' in INTERACTION_FLAGS[I][J] */
 +            vvdw             = _mm_and_pd(vvdw,cutoff_mask);
 +            /*                 #define INNERFLOPS INNERFLOPS+1 */
 +            /*             #endif                                       */
 +            /*             #if ROUND == 'Epilogue' */
 +            vvdw             = _mm_unpacklo_pd(vvdw,_mm_setzero_pd());
 +            /*             #endif */
 +            vvdwsum          = _mm_add_pd(vvdwsum,vvdw);
 +            /*             #define INNERFLOPS INNERFLOPS+1 */
 +            /*         #endif */
 +            /*     #endif */
 +
 +            /*     #if 'Force' in KERNEL_VF */
 +
 +            /*         #if 'electrostatics' in INTERACTION_FLAGS[I][J] and 'vdw' in INTERACTION_FLAGS[I][J] */
 +            fscal            = _mm_add_pd(felec,fvdw);
 +            /*             #define INNERFLOPS INNERFLOPS+1 */
 +            /*         #elif 'electrostatics' in INTERACTION_FLAGS[I][J] */
 +            fscal            = felec;
 +            /*         #elif 'vdw' in INTERACTION_FLAGS[I][J] */
 +            fscal            = fvdw;
 +            /*        #endif */
 +
 +            /*             #if 'exactcutoff' in INTERACTION_FLAGS[I][J] */
 +            fscal            = _mm_and_pd(fscal,cutoff_mask);
 +            /*                 #define INNERFLOPS INNERFLOPS+1 */
 +            /*             #endif                                       */
 +
 +            /*             #if ROUND == 'Epilogue' */
 +            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
 +            /*             #endif */
 +
 +            /* Calculate temporary vectorial force */
 +            tx               = _mm_mul_pd(fscal,dx{I}{J});
 +            ty               = _mm_mul_pd(fscal,dy{I}{J});
 +            tz               = _mm_mul_pd(fscal,dz{I}{J});
 +
 +            /* Update vectorial force */
 +            fix{I}             = _mm_add_pd(fix{I},tx);
 +            fiy{I}             = _mm_add_pd(fiy{I},ty);
 +            fiz{I}             = _mm_add_pd(fiz{I},tz);
 +            /*             #define INNERFLOPS INNERFLOPS+6 */
 +
 +            /* #if GEOMETRY_I == 'Particle'             */
 +            /*     #if ROUND == 'Loop' */
 +            gmx_mm_decrement_1rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,tx,ty,tz);
 +            /*     #else */
 +            gmx_mm_decrement_1rvec_1ptr_swizzle_pd(f+j_coord_offsetA,tx,ty,tz);
 +            /*     #endif */
 +            /*     #define INNERFLOPS INNERFLOPS+3      */
 +            /* #else                                    */
 +            fjx{J}             = _mm_add_pd(fjx{J},tx);
 +            fjy{J}             = _mm_add_pd(fjy{J},ty);
 +            fjz{J}             = _mm_add_pd(fjz{J},tz);
 +            /*     #define INNERFLOPS INNERFLOPS+3      */
 +            /* #endif                                   */
 +
 +            /*     #endif */
 +
 +            /*     #if 'exactcutoff' in INTERACTION_FLAGS[I][J] */
 +            /*         #if 0    ## This and the next two lines are a hack to maintain indentation in the template file */
 +            {
 +                /*     #endif */
 +            }
 +            /*     #endif */
 +            /*     ## End of check for the interaction being outside the cutoff */
 +
 +            /* #endfor */
 +            /* ## End of loop over i-j interaction pairs */
 +
 +            /* #if 'Water' in GEOMETRY_I and GEOMETRY_J == 'Particle' */
 +            /*     #if ROUND == 'Loop' */
 +            gmx_mm_decrement_1rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
 +            /*     #else */
 +            gmx_mm_decrement_1rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0);
 +            /*     #endif */
 +            /*     #define INNERFLOPS INNERFLOPS+3      */
 +            /* #elif GEOMETRY_J == 'Water3'             */
 +            /*     #if ROUND == 'Loop' */
 +            gmx_mm_decrement_3rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
 +            /*     #else */
 +            gmx_mm_decrement_3rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
 +            /*     #endif */
 +            /*     #define INNERFLOPS INNERFLOPS+9      */
 +            /* #elif GEOMETRY_J == 'Water4'             */
 +            /*     #if 0 in PARTICLES_J                 */
 +            /*         #if ROUND == 'Loop' */
 +            gmx_mm_decrement_4rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
 +            /*         #else */
 +            gmx_mm_decrement_4rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
 +            /*         #endif */
 +            /*         #define INNERFLOPS INNERFLOPS+12 */
 +            /*     #else                                */
 +            /*         #if ROUND == 'Loop' */
 +            gmx_mm_decrement_3rvec_2ptr_swizzle_pd(f+j_coord_offsetA+DIM,f+j_coord_offsetB+DIM,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
 +            /*         #else */
 +            gmx_mm_decrement_3rvec_1ptr_swizzle_pd(f+j_coord_offsetA+DIM,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
 +            /*         #endif */
 +            /*         #define INNERFLOPS INNERFLOPS+9  */
 +            /*     #endif                               */
 +            /* #endif                                   */
 +
 +            /* Inner loop uses {INNERFLOPS} flops */
 +        }
 +
 +        /* #endfor */
 +
 +        /* End of innermost loop */
 +
 +        /* #if 'Force' in KERNEL_VF */
 +        /*     #if GEOMETRY_I == 'Particle'            */
 +        gmx_mm_update_iforce_1atom_swizzle_pd(fix0,fiy0,fiz0,
 +                                              f+i_coord_offset,fshift+i_shift_offset);
 +        /*         #define OUTERFLOPS OUTERFLOPS+6     */
 +        /*     #elif GEOMETRY_I == 'Water3'            */
 +        gmx_mm_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
 +                                              f+i_coord_offset,fshift+i_shift_offset);
 +        /*         #define OUTERFLOPS OUTERFLOPS+18    */
 +        /*     #elif GEOMETRY_I == 'Water4'            */
 +        /*         #if 0 in PARTICLES_I                */
 +        gmx_mm_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
 +                                              f+i_coord_offset,fshift+i_shift_offset);
 +        /*             #define OUTERFLOPS OUTERFLOPS+24    */
 +        /*         #else                               */
 +        gmx_mm_update_iforce_3atom_swizzle_pd(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
 +                                              f+i_coord_offset+DIM,fshift+i_shift_offset);
 +        /*             #define OUTERFLOPS OUTERFLOPS+18    */
 +        /*         #endif                              */
 +        /*     #endif                                  */
 +        /* #endif                                      */
 +
 +        /* #if 'Potential' in KERNEL_VF */
 +        ggid                        = gid[iidx];
 +        /* Update potential energies */
 +        /*     #if KERNEL_ELEC != 'None' */
 +        gmx_mm_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
 +        /*         #define OUTERFLOPS OUTERFLOPS+1 */
 +        /*     #endif */
 +        /*     #if 'GeneralizedBorn' in KERNEL_ELEC */
 +        gmx_mm_update_1pot_pd(vgbsum,kernel_data->energygrp_polarization+ggid);
 +        /*         #define OUTERFLOPS OUTERFLOPS+1 */
 +        /*     #endif */
 +        /*     #if KERNEL_VDW != 'None' */
 +        gmx_mm_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
 +        /*         #define OUTERFLOPS OUTERFLOPS+1 */
 +        /*     #endif */
 +        /* #endif */
 +        /*     #if 'GeneralizedBorn' in KERNEL_ELEC and 'Force' in KERNEL_VF */
 +        dvdasum = _mm_mul_pd(dvdasum, _mm_mul_pd(isai{I},isai{I}));
 +        gmx_mm_update_1pot_pd(dvdasum,dvda+inr);
 +        /*     #endif */
 +
 +        /* Increment number of inner iterations */
 +        inneriter                  += j_index_end - j_index_start;
 +
 +        /* Outer loop uses {OUTERFLOPS} flops */
 +    }
 +
 +    /* Increment number of outer iterations */
 +    outeriter        += nri;
 +
 +    /* Update outer/inner flops */
 +    /* ## NB: This is not important, it just affects the flopcount. However, since our preprocessor is */
 +    /* ## primitive and replaces aggressively even in strings inside these directives, we need to      */
 +    /* ## assemble the main part of the name (containing KERNEL/ELEC/VDW) directly in the source.      */
 +    /* #if GEOMETRY_I == 'Water3'            */
 +    /*     #define ISUFFIX '_W3'             */
 +    /* #elif GEOMETRY_I == 'Water4'          */
 +    /*     #define ISUFFIX '_W4'             */
 +    /* #else                                 */
 +    /*     #define ISUFFIX ''                */
 +    /* #endif                                */
 +    /* #if GEOMETRY_J == 'Water3'            */
 +    /*     #define JSUFFIX 'W3'              */
 +    /* #elif GEOMETRY_J == 'Water4'          */
 +    /*     #define JSUFFIX 'W4'              */
 +    /* #else                                 */
 +    /*     #define JSUFFIX ''                */
 +    /* #endif                                */
 +    /* #if 'PotentialAndForce' in KERNEL_VF  */
 +    /*     #define VFSUFFIX  '_VF'           */
 +    /* #elif 'Potential' in KERNEL_VF        */
 +    /*     #define VFSUFFIX '_V'             */
 +    /* #else                                 */
 +    /*     #define VFSUFFIX '_F'             */
 +    /* #endif                                */
 +
 +    /* #if KERNEL_ELEC != 'None' and KERNEL_VDW != 'None' */
 +    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW{ISUFFIX}{JSUFFIX}{VFSUFFIX},outeriter*{OUTERFLOPS} + inneriter*{INNERFLOPS});
 +    /* #elif KERNEL_ELEC != 'None' */
 +    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC{ISUFFIX}{JSUFFIX}{VFSUFFIX},outeriter*{OUTERFLOPS} + inneriter*{INNERFLOPS});
 +    /* #else */
 +    inc_nrnb(nrnb,eNR_NBKERNEL_VDW{ISUFFIX}{JSUFFIX}{VFSUFFIX},outeriter*{OUTERFLOPS} + inneriter*{INNERFLOPS});
 +    /* #endif  */
 +}
index 9d47405368111edc331dc0dc091dc9e07896e072,0000000000000000000000000000000000000000..81a6e97b776d8b2b4c85880385d4f319f00bbdc4
mode 100644,000000..100644
--- /dev/null
@@@ -1,939 -1,0 +1,941 @@@
 +/*
 + * Note: this file was generated by the Gromacs sse4_1_single kernel generator.
 + *
 + *                This source code is part of
 + *
 + *                 G   R   O   M   A   C   S
 + *
 + * Copyright (c) 2001-2012, The GROMACS Development Team
 + *
 + * Gromacs is a library for molecular simulation and trajectory analysis,
 + * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
 + * a full list of developers and information, check out http://www.gromacs.org
 + *
 + * This program is free software; you can redistribute it and/or modify it under
 + * the terms of the GNU Lesser General Public License as published by the Free
 + * Software Foundation; either version 2 of the License, or (at your option) any
 + * later version.
 + *
 + * To help fund GROMACS development, we humbly ask that you cite
 + * the papers people have written on it - you can find them on the website.
 + */
 +#ifdef HAVE_CONFIG_H
 +#include <config.h>
 +#endif
 +
 +#include <math.h>
 +
 +#include "../nb_kernel.h"
 +#include "types/simple.h"
 +#include "vec.h"
 +#include "nrnb.h"
 +
 +#include "gmx_math_x86_sse4_1_single.h"
 +#include "kernelutil_x86_sse4_1_single.h"
 +
 +/*
 + * Gromacs nonbonded kernel:   nb_kernel_ElecGB_VdwCSTab_GeomP1P1_VF_sse4_1_single
 + * Electrostatics interaction: GeneralizedBorn
 + * VdW interaction:            CubicSplineTable
 + * Geometry:                   Particle-Particle
 + * Calculate force/pot:        PotentialAndForce
 + */
 +void
 +nb_kernel_ElecGB_VdwCSTab_GeomP1P1_VF_sse4_1_single
 +                    (t_nblist * gmx_restrict                nlist,
 +                     rvec * gmx_restrict                    xx,
 +                     rvec * gmx_restrict                    ff,
 +                     t_forcerec * gmx_restrict              fr,
 +                     t_mdatoms * gmx_restrict               mdatoms,
 +                     nb_kernel_data_t * gmx_restrict        kernel_data,
 +                     t_nrnb * gmx_restrict                  nrnb)
 +{
 +    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
 +     * just 0 for non-waters.
 +     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
 +     * jnr indices corresponding to data put in the four positions in the SIMD register.
 +     */
 +    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
 +    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
 +    int              jnrA,jnrB,jnrC,jnrD;
 +    int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
 +    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
 +    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
 +    real             rcutoff_scalar;
 +    real             *shiftvec,*fshift,*x,*f;
 +    real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
 +    real             scratch[4*DIM];
 +    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
 +    int              vdwioffset0;
 +    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
 +    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
 +    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
 +    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
 +    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
 +    real             *charge;
 +    __m128i          gbitab;
 +    __m128           vgb,fgb,vgbsum,dvdasum,gbscale,gbtabscale,isaprod,gbqqfactor,gbinvepsdiff,gbeps,dvdatmp;
 +    __m128           minushalf = _mm_set1_ps(-0.5);
 +    real             *invsqrta,*dvda,*gbtab;
 +    int              nvdwtype;
 +    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
 +    int              *vdwtype;
 +    real             *vdwparam;
 +    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
 +    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
 +    __m128i          vfitab;
 +    __m128i          ifour       = _mm_set1_epi32(4);
 +    __m128           rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
 +    real             *vftab;
 +    __m128           dummy_mask,cutoff_mask;
 +    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
 +    __m128           one     = _mm_set1_ps(1.0);
 +    __m128           two     = _mm_set1_ps(2.0);
 +    x                = xx[0];
 +    f                = ff[0];
 +
 +    nri              = nlist->nri;
 +    iinr             = nlist->iinr;
 +    jindex           = nlist->jindex;
 +    jjnr             = nlist->jjnr;
 +    shiftidx         = nlist->shift;
 +    gid              = nlist->gid;
 +    shiftvec         = fr->shift_vec[0];
 +    fshift           = fr->fshift[0];
 +    facel            = _mm_set1_ps(fr->epsfac);
 +    charge           = mdatoms->chargeA;
 +    nvdwtype         = fr->ntype;
 +    vdwparam         = fr->nbfp;
 +    vdwtype          = mdatoms->typeA;
 +
 +    vftab            = kernel_data->table_vdw->data;
 +    vftabscale       = _mm_set1_ps(kernel_data->table_vdw->scale);
 +
 +    invsqrta         = fr->invsqrta;
 +    dvda             = fr->dvda;
 +    gbtabscale       = _mm_set1_ps(fr->gbtab.scale);
 +    gbtab            = fr->gbtab.data;
 +    gbinvepsdiff     = _mm_set1_ps((1.0/fr->epsilon_r) - (1.0/fr->gb_epsilon_solvent));
 +
 +    /* Avoid stupid compiler warnings */
 +    jnrA = jnrB = jnrC = jnrD = 0;
 +    j_coord_offsetA = 0;
 +    j_coord_offsetB = 0;
 +    j_coord_offsetC = 0;
 +    j_coord_offsetD = 0;
 +
 +    outeriter        = 0;
 +    inneriter        = 0;
 +
 +    for(iidx=0;iidx<4*DIM;iidx++)
 +    {
 +        scratch[iidx] = 0.0;
 +    }
 +
 +    /* Start outer loop over neighborlists */
 +    for(iidx=0; iidx<nri; iidx++)
 +    {
 +        /* Load shift vector for this list */
 +        i_shift_offset   = DIM*shiftidx[iidx];
 +
 +        /* Load limits for loop over neighbors */
 +        j_index_start    = jindex[iidx];
 +        j_index_end      = jindex[iidx+1];
 +
 +        /* Get outer coordinate index */
 +        inr              = iinr[iidx];
 +        i_coord_offset   = DIM*inr;
 +
 +        /* Load i particle coords and add shift vector */
 +        gmx_mm_load_shift_and_1rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
 +
 +        fix0             = _mm_setzero_ps();
 +        fiy0             = _mm_setzero_ps();
 +        fiz0             = _mm_setzero_ps();
 +
 +        /* Load parameters for i particles */
 +        iq0              = _mm_mul_ps(facel,_mm_load1_ps(charge+inr+0));
 +        isai0            = _mm_load1_ps(invsqrta+inr+0);
 +        vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
 +
 +        /* Reset potential sums */
 +        velecsum         = _mm_setzero_ps();
 +        vgbsum           = _mm_setzero_ps();
 +        vvdwsum          = _mm_setzero_ps();
 +        dvdasum          = _mm_setzero_ps();
 +
 +        /* Start inner kernel loop */
 +        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
 +        {
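 +            /* Note: the j list is padded with negative indices to a multiple of
 +             * four, so this loop runs only while the fourth index of the quartet
 +             * is a real atom; the epilogue below handles the padded tail and
 +             * clears the padding lanes with dummy_mask.
 +             */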
 +
 +            /* Get j neighbor index, and coordinate index */
 +            jnrA             = jjnr[jidx];
 +            jnrB             = jjnr[jidx+1];
 +            jnrC             = jjnr[jidx+2];
 +            jnrD             = jjnr[jidx+3];
 +            j_coord_offsetA  = DIM*jnrA;
 +            j_coord_offsetB  = DIM*jnrB;
 +            j_coord_offsetC  = DIM*jnrC;
 +            j_coord_offsetD  = DIM*jnrD;
 +
 +            /* load j atom coordinates */
 +            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
 +                                              x+j_coord_offsetC,x+j_coord_offsetD,
 +                                              &jx0,&jy0,&jz0);
 +
 +            /* Calculate displacement vector */
 +            dx00             = _mm_sub_ps(ix0,jx0);
 +            dy00             = _mm_sub_ps(iy0,jy0);
 +            dz00             = _mm_sub_ps(iz0,jz0);
 +
 +            /* Calculate squared distance and things based on it */
 +            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
 +
 +            rinv00           = gmx_mm_invsqrt_ps(rsq00);
 +
 +            /* Load parameters for j particles */
 +            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
 +                                                              charge+jnrC+0,charge+jnrD+0);
 +            isaj0            = gmx_mm_load_4real_swizzle_ps(invsqrta+jnrA+0,invsqrta+jnrB+0,
 +                                                              invsqrta+jnrC+0,invsqrta+jnrD+0);
 +            vdwjidx0A        = 2*vdwtype[jnrA+0];
 +            vdwjidx0B        = 2*vdwtype[jnrB+0];
 +            vdwjidx0C        = 2*vdwtype[jnrC+0];
 +            vdwjidx0D        = 2*vdwtype[jnrD+0];
 +
 +            /**************************
 +             * CALCULATE INTERACTIONS *
 +             **************************/
 +
 +            r00              = _mm_mul_ps(rsq00,rinv00);
 +
 +            /* Compute parameters for interactions between i and j atoms */
 +            qq00             = _mm_mul_ps(iq0,jq0);
 +            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
 +                                         vdwparam+vdwioffset0+vdwjidx0B,
 +                                         vdwparam+vdwioffset0+vdwjidx0C,
 +                                         vdwparam+vdwioffset0+vdwjidx0D,
 +                                         &c6_00,&c12_00);
 +
 +            /* Calculate table index by multiplying r by the table scale and truncating to an integer */
 +            rt               = _mm_mul_ps(r00,vftabscale);
 +            vfitab           = _mm_cvttps_epi32(rt);
 +            vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
 +            vfitab           = _mm_slli_epi32(vfitab,3);
 +
 +            /* GENERALIZED BORN AND COULOMB ELECTROSTATICS */
 +            isaprod          = _mm_mul_ps(isai0,isaj0);
 +            gbqqfactor       = _mm_xor_ps(signbit,_mm_mul_ps(qq00,_mm_mul_ps(isaprod,gbinvepsdiff)));
 +            gbscale          = _mm_mul_ps(isaprod,gbtabscale);
 +
 +            /* Calculate the generalized Born table index - this is a separate table from the normal one,
 +             * but we use the same procedure: multiply r by the scale and truncate to an integer.
 +             */
 +            rt               = _mm_mul_ps(r00,gbscale);
 +            gbitab           = _mm_cvttps_epi32(rt);
 +            gbeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
 +            gbitab           = _mm_slli_epi32(gbitab,2);
 +            Y                = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,0) );
 +            F                = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,1) );
 +            G                = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,2) );
 +            H                = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,3) );
 +            _MM_TRANSPOSE4_PS(Y,F,G,H);
 +            Heps             = _mm_mul_ps(gbeps,H);
 +            Fp               = _mm_add_ps(F,_mm_mul_ps(gbeps,_mm_add_ps(G,Heps)));
 +            VV               = _mm_add_ps(Y,_mm_mul_ps(gbeps,Fp));
 +            vgb              = _mm_mul_ps(gbqqfactor,VV);
 +
 +            FF               = _mm_add_ps(Fp,_mm_mul_ps(gbeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
 +            fgb              = _mm_mul_ps(gbqqfactor,_mm_mul_ps(FF,gbscale));
 +            dvdatmp          = _mm_mul_ps(minushalf,_mm_add_ps(vgb,_mm_mul_ps(fgb,r00)));
 +            dvdasum          = _mm_add_ps(dvdasum,dvdatmp);
 +            fjptrA           = dvda+jnrA;
 +            fjptrB           = dvda+jnrB;
 +            fjptrC           = dvda+jnrC;
 +            fjptrD           = dvda+jnrD;
 +            gmx_mm_increment_4real_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,_mm_mul_ps(dvdatmp,_mm_mul_ps(isaj0,isaj0)));
 +            velec            = _mm_mul_ps(qq00,rinv00);
 +            felec            = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(velec,rinv00),fgb),rinv00);
 +
 +            /* CUBIC SPLINE TABLE DISPERSION */
 +            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
 +            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
 +            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
 +            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
 +            _MM_TRANSPOSE4_PS(Y,F,G,H);
 +            Heps             = _mm_mul_ps(vfeps,H);
 +            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
 +            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
 +            vvdw6            = _mm_mul_ps(c6_00,VV);
 +            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
 +            fvdw6            = _mm_mul_ps(c6_00,FF);
 +
 +            /* CUBIC SPLINE TABLE REPULSION */
 +            vfitab           = _mm_add_epi32(vfitab,ifour);
 +            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
 +            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
 +            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
 +            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
 +            _MM_TRANSPOSE4_PS(Y,F,G,H);
 +            Heps             = _mm_mul_ps(vfeps,H);
 +            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
 +            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
 +            vvdw12           = _mm_mul_ps(c12_00,VV);
 +            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
 +            fvdw12           = _mm_mul_ps(c12_00,FF);
 +            vvdw             = _mm_add_ps(vvdw12,vvdw6);
 +            fvdw             = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
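 +            /* Sign convention (a sketch): dVvdw/dr = (fvdw6 + fvdw12)*vftabscale,
 +             * so flipping the sign bit and multiplying by rinv00 gives
 +             * fvdw = -(dVvdw/dr)/r, and fscal*(dx,dy,dz) below is the force on i.
 +             */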
 +
 +            /* Update potential sum for this i atom from the interaction with this j atom. */
 +            velecsum         = _mm_add_ps(velecsum,velec);
 +            vgbsum           = _mm_add_ps(vgbsum,vgb);
 +            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
 +
 +            fscal            = _mm_add_ps(felec,fvdw);
 +
 +            /* Calculate temporary vectorial force */
 +            tx               = _mm_mul_ps(fscal,dx00);
 +            ty               = _mm_mul_ps(fscal,dy00);
 +            tz               = _mm_mul_ps(fscal,dz00);
 +
 +            /* Update vectorial force */
 +            fix0             = _mm_add_ps(fix0,tx);
 +            fiy0             = _mm_add_ps(fiy0,ty);
 +            fiz0             = _mm_add_ps(fiz0,tz);
 +
 +            fjptrA             = f+j_coord_offsetA;
 +            fjptrB             = f+j_coord_offsetB;
 +            fjptrC             = f+j_coord_offsetC;
 +            fjptrD             = f+j_coord_offsetD;
 +            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
 +
 +            /* Inner loop uses 92 flops */
 +        }
 +
 +        if(jidx<j_index_end)
 +        {
 +
 +            /* Get j neighbor index and coordinate index */
 +            jnrlistA         = jjnr[jidx];
 +            jnrlistB         = jjnr[jidx+1];
 +            jnrlistC         = jjnr[jidx+2];
 +            jnrlistD         = jjnr[jidx+3];
 +            /* Sign of each element will be negative for non-real atoms.
 +             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
 +             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
 +             */
 +            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 +            jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
 +            jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
 +            jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
 +            jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
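 +            /* Scalar equivalent of the masking and clamping above (illustrative):
 +             *     for (k = 0; k < 4; k++) {
 +             *         mask[k] = (jjnr[jidx+k] < 0);
 +             *         jnr[k]  = mask[k] ? 0 : jjnr[jidx+k];
 +             *     }
 +             * Clamping to 0 keeps the loads below in bounds; the mask later
 +             * zeroes the dummy lanes' contributions.
 +             */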
 +            j_coord_offsetA  = DIM*jnrA;
 +            j_coord_offsetB  = DIM*jnrB;
 +            j_coord_offsetC  = DIM*jnrC;
 +            j_coord_offsetD  = DIM*jnrD;
 +
 +            /* load j atom coordinates */
 +            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
 +                                              x+j_coord_offsetC,x+j_coord_offsetD,
 +                                              &jx0,&jy0,&jz0);
 +
 +            /* Calculate displacement vector */
 +            dx00             = _mm_sub_ps(ix0,jx0);
 +            dy00             = _mm_sub_ps(iy0,jy0);
 +            dz00             = _mm_sub_ps(iz0,jz0);
 +
 +            /* Calculate squared distance and things based on it */
 +            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
 +
 +            rinv00           = gmx_mm_invsqrt_ps(rsq00);
 +
 +            /* Load parameters for j particles */
 +            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
 +                                                              charge+jnrC+0,charge+jnrD+0);
 +            isaj0            = gmx_mm_load_4real_swizzle_ps(invsqrta+jnrA+0,invsqrta+jnrB+0,
 +                                                              invsqrta+jnrC+0,invsqrta+jnrD+0);
 +            vdwjidx0A        = 2*vdwtype[jnrA+0];
 +            vdwjidx0B        = 2*vdwtype[jnrB+0];
 +            vdwjidx0C        = 2*vdwtype[jnrC+0];
 +            vdwjidx0D        = 2*vdwtype[jnrD+0];
 +
 +            /**************************
 +             * CALCULATE INTERACTIONS *
 +             **************************/
 +
 +            r00              = _mm_mul_ps(rsq00,rinv00);
 +            r00              = _mm_andnot_ps(dummy_mask,r00);
 +
 +            /* Compute parameters for interactions between i and j atoms */
 +            qq00             = _mm_mul_ps(iq0,jq0);
 +            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
 +                                         vdwparam+vdwioffset0+vdwjidx0B,
 +                                         vdwparam+vdwioffset0+vdwjidx0C,
 +                                         vdwparam+vdwioffset0+vdwjidx0D,
 +                                         &c6_00,&c12_00);
 +
 +            /* Calculate the table index by multiplying r by the table scale and truncating to integer */
 +            rt               = _mm_mul_ps(r00,vftabscale);
 +            vfitab           = _mm_cvttps_epi32(rt);
 +            vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
 +            vfitab           = _mm_slli_epi32(vfitab,3);
 +
 +            /* GENERALIZED BORN AND COULOMB ELECTROSTATICS */
 +            isaprod          = _mm_mul_ps(isai0,isaj0);
 +            gbqqfactor       = _mm_xor_ps(signbit,_mm_mul_ps(qq00,_mm_mul_ps(isaprod,gbinvepsdiff)));
 +            gbscale          = _mm_mul_ps(isaprod,gbtabscale);
 +
 +            /* Calculate generalized born table index - this is a separate table from the normal one,
 +             * but we use the same procedure by multiplying r with scale and truncating to integer.
 +             */
 +            rt               = _mm_mul_ps(r00,gbscale);
 +            gbitab           = _mm_cvttps_epi32(rt);
 +            gbeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
 +            gbitab           = _mm_slli_epi32(gbitab,2);
 +            Y                = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,0) );
 +            F                = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,1) );
 +            G                = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,2) );
 +            H                = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,3) );
 +            _MM_TRANSPOSE4_PS(Y,F,G,H);
 +            Heps             = _mm_mul_ps(gbeps,H);
 +            Fp               = _mm_add_ps(F,_mm_mul_ps(gbeps,_mm_add_ps(G,Heps)));
 +            VV               = _mm_add_ps(Y,_mm_mul_ps(gbeps,Fp));
 +            vgb              = _mm_mul_ps(gbqqfactor,VV);
 +
 +            FF               = _mm_add_ps(Fp,_mm_mul_ps(gbeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
 +            fgb              = _mm_mul_ps(gbqqfactor,_mm_mul_ps(FF,gbscale));
 +            dvdatmp          = _mm_mul_ps(minushalf,_mm_add_ps(vgb,_mm_mul_ps(fgb,r00)));
++            dvdatmp          = _mm_andnot_ps(dummy_mask,dvdatmp);
 +            dvdasum          = _mm_add_ps(dvdasum,dvdatmp);
 +            /* The pointers to scratch make sure that compilers that take gmx_restrict seriously (e.g. icc 13) cannot produce incorrect code here. */
 +            fjptrA             = (jnrlistA>=0) ? dvda+jnrA : scratch;
 +            fjptrB             = (jnrlistB>=0) ? dvda+jnrB : scratch;
 +            fjptrC             = (jnrlistC>=0) ? dvda+jnrC : scratch;
 +            fjptrD             = (jnrlistD>=0) ? dvda+jnrD : scratch;
 +            gmx_mm_increment_4real_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,_mm_mul_ps(dvdatmp,_mm_mul_ps(isaj0,isaj0)));
 +            velec            = _mm_mul_ps(qq00,rinv00);
 +            felec            = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(velec,rinv00),fgb),rinv00);
 +
 +            /* CUBIC SPLINE TABLE DISPERSION */
 +            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
 +            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
 +            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
 +            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
 +            _MM_TRANSPOSE4_PS(Y,F,G,H);
 +            Heps             = _mm_mul_ps(vfeps,H);
 +            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
 +            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
 +            vvdw6            = _mm_mul_ps(c6_00,VV);
 +            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
 +            fvdw6            = _mm_mul_ps(c6_00,FF);
 +
 +            /* CUBIC SPLINE TABLE REPULSION */
 +            vfitab           = _mm_add_epi32(vfitab,ifour);
 +            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
 +            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
 +            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
 +            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
 +            _MM_TRANSPOSE4_PS(Y,F,G,H);
 +            Heps             = _mm_mul_ps(vfeps,H);
 +            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
 +            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
 +            vvdw12           = _mm_mul_ps(c12_00,VV);
 +            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
 +            fvdw12           = _mm_mul_ps(c12_00,FF);
 +            vvdw             = _mm_add_ps(vvdw12,vvdw6);
 +            fvdw             = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
 +
 +            /* Update potential sum for this i atom from the interaction with this j atom. */
 +            velec            = _mm_andnot_ps(dummy_mask,velec);
 +            velecsum         = _mm_add_ps(velecsum,velec);
 +            vgb              = _mm_andnot_ps(dummy_mask,vgb);
 +            vgbsum           = _mm_add_ps(vgbsum,vgb);
 +            vvdw             = _mm_andnot_ps(dummy_mask,vvdw);
 +            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
 +
 +            fscal            = _mm_add_ps(felec,fvdw);
 +
 +            fscal            = _mm_andnot_ps(dummy_mask,fscal);
 +
 +            /* Calculate temporary vectorial force */
 +            tx               = _mm_mul_ps(fscal,dx00);
 +            ty               = _mm_mul_ps(fscal,dy00);
 +            tz               = _mm_mul_ps(fscal,dz00);
 +
 +            /* Update vectorial force */
 +            fix0             = _mm_add_ps(fix0,tx);
 +            fiy0             = _mm_add_ps(fiy0,ty);
 +            fiz0             = _mm_add_ps(fiz0,tz);
 +
 +            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
 +            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
 +            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
 +            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
 +            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
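 +            /* Since fscal was cleared for dummy lanes, the tx/ty/tz routed to
 +             * scratch are zero; redirecting those stores only keeps them legal. */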
 +
 +            /* Inner loop uses 93 flops */
 +        }
 +
 +        /* End of innermost loop */
 +
 +        gmx_mm_update_iforce_1atom_swizzle_ps(fix0,fiy0,fiz0,
 +                                              f+i_coord_offset,fshift+i_shift_offset);
 +
 +        ggid                        = gid[iidx];
 +        /* Update potential energies */
 +        gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
 +        gmx_mm_update_1pot_ps(vgbsum,kernel_data->energygrp_polarization+ggid);
 +        gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
 +        dvdasum = _mm_mul_ps(dvdasum, _mm_mul_ps(isai0,isai0));
 +        gmx_mm_update_1pot_ps(dvdasum,dvda+inr);
 +
 +        /* Increment number of inner iterations */
 +        inneriter                  += j_index_end - j_index_start;
 +
 +        /* Outer loop uses 10 flops */
 +    }
 +
 +    /* Increment number of outer iterations */
 +    outeriter        += nri;
 +
 +    /* Update outer/inner flops */
 +
 +    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_VF,outeriter*10 + inneriter*93);
 +}
 +/*
 + * Gromacs nonbonded kernel:   nb_kernel_ElecGB_VdwCSTab_GeomP1P1_F_sse4_1_single
 + * Electrostatics interaction: GeneralizedBorn
 + * VdW interaction:            CubicSplineTable
 + * Geometry:                   Particle-Particle
 + * Calculate force/pot:        Force
 + */
 +void
 +nb_kernel_ElecGB_VdwCSTab_GeomP1P1_F_sse4_1_single
 +                    (t_nblist * gmx_restrict                nlist,
 +                     rvec * gmx_restrict                    xx,
 +                     rvec * gmx_restrict                    ff,
 +                     t_forcerec * gmx_restrict              fr,
 +                     t_mdatoms * gmx_restrict               mdatoms,
 +                     nb_kernel_data_t * gmx_restrict        kernel_data,
 +                     t_nrnb * gmx_restrict                  nrnb)
 +{
 +    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
 +     * just 0 for non-waters.
 +     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
 +     * jnr indices corresponding to data put in the four positions in the SIMD register.
 +     */
 +    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
 +    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
 +    int              jnrA,jnrB,jnrC,jnrD;
 +    int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
 +    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
 +    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
 +    real             rcutoff_scalar;
 +    real             *shiftvec,*fshift,*x,*f;
 +    real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
 +    real             scratch[4*DIM];
 +    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
 +    int              vdwioffset0;
 +    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
 +    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
 +    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
 +    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
 +    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
 +    real             *charge;
 +    __m128i          gbitab;
 +    __m128           vgb,fgb,vgbsum,dvdasum,gbscale,gbtabscale,isaprod,gbqqfactor,gbinvepsdiff,gbeps,dvdatmp;
 +    __m128           minushalf = _mm_set1_ps(-0.5);
 +    real             *invsqrta,*dvda,*gbtab;
 +    int              nvdwtype;
 +    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
 +    int              *vdwtype;
 +    real             *vdwparam;
 +    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
 +    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
 +    __m128i          vfitab;
 +    __m128i          ifour       = _mm_set1_epi32(4);
 +    __m128           rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
 +    real             *vftab;
 +    __m128           dummy_mask,cutoff_mask;
 +    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
 +    __m128           one     = _mm_set1_ps(1.0);
 +    __m128           two     = _mm_set1_ps(2.0);
 +    x                = xx[0];
 +    f                = ff[0];
 +
 +    nri              = nlist->nri;
 +    iinr             = nlist->iinr;
 +    jindex           = nlist->jindex;
 +    jjnr             = nlist->jjnr;
 +    shiftidx         = nlist->shift;
 +    gid              = nlist->gid;
 +    shiftvec         = fr->shift_vec[0];
 +    fshift           = fr->fshift[0];
 +    facel            = _mm_set1_ps(fr->epsfac);
 +    charge           = mdatoms->chargeA;
 +    nvdwtype         = fr->ntype;
 +    vdwparam         = fr->nbfp;
 +    vdwtype          = mdatoms->typeA;
 +
 +    vftab            = kernel_data->table_vdw->data;
 +    vftabscale       = _mm_set1_ps(kernel_data->table_vdw->scale);
 +
 +    invsqrta         = fr->invsqrta;
 +    dvda             = fr->dvda;
 +    gbtabscale       = _mm_set1_ps(fr->gbtab.scale);
 +    gbtab            = fr->gbtab.data;
 +    gbinvepsdiff     = _mm_set1_ps((1.0/fr->epsilon_r) - (1.0/fr->gb_epsilon_solvent));
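 +    /* gbinvepsdiff is the generalized Born dielectric prefactor; combined with
 +     * qq00 and isaprod in the inner loops it gives
 +     *     gbqqfactor = -(1/eps_r - 1/eps_solvent)*qi*qj*isai*isaj.
 +     */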
 +
 +    /* Avoid stupid compiler warnings */
 +    jnrA = jnrB = jnrC = jnrD = 0;
 +    j_coord_offsetA = 0;
 +    j_coord_offsetB = 0;
 +    j_coord_offsetC = 0;
 +    j_coord_offsetD = 0;
 +
 +    outeriter        = 0;
 +    inneriter        = 0;
 +
 +    for(iidx=0;iidx<4*DIM;iidx++)
 +    {
 +        scratch[iidx] = 0.0;
 +    }
 +
 +    /* Start outer loop over neighborlists */
 +    for(iidx=0; iidx<nri; iidx++)
 +    {
 +        /* Load shift vector for this list */
 +        i_shift_offset   = DIM*shiftidx[iidx];
 +
 +        /* Load limits for loop over neighbors */
 +        j_index_start    = jindex[iidx];
 +        j_index_end      = jindex[iidx+1];
 +
 +        /* Get outer coordinate index */
 +        inr              = iinr[iidx];
 +        i_coord_offset   = DIM*inr;
 +
 +        /* Load i particle coords and add shift vector */
 +        gmx_mm_load_shift_and_1rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
 +
 +        fix0             = _mm_setzero_ps();
 +        fiy0             = _mm_setzero_ps();
 +        fiz0             = _mm_setzero_ps();
 +
 +        /* Load parameters for i particles */
 +        iq0              = _mm_mul_ps(facel,_mm_load1_ps(charge+inr+0));
 +        isai0            = _mm_load1_ps(invsqrta+inr+0);
 +        vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
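 +        /* vdwparam is laid out as an nvdwtype x nvdwtype matrix of (c6,c12)
 +         * pairs, so the i row starts at 2*nvdwtype*vdwtype[i] and the j column
 +         * adds 2*vdwtype[j] (see vdwjidx0A..D in the inner loop). */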
 +
 +        dvdasum          = _mm_setzero_ps();
 +
 +        /* Start inner kernel loop */
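 +        /* This loop handles full quadruplets of real j atoms (jjnr[jidx+3]>=0);
 +         * the epilogue below processes the padded tail, where negative jjnr
 +         * entries mark dummies to be masked out. */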
 +        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
 +        {
 +
 +            /* Get j neighbor index and coordinate index */
 +            jnrA             = jjnr[jidx];
 +            jnrB             = jjnr[jidx+1];
 +            jnrC             = jjnr[jidx+2];
 +            jnrD             = jjnr[jidx+3];
 +            j_coord_offsetA  = DIM*jnrA;
 +            j_coord_offsetB  = DIM*jnrB;
 +            j_coord_offsetC  = DIM*jnrC;
 +            j_coord_offsetD  = DIM*jnrD;
 +
 +            /* load j atom coordinates */
 +            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
 +                                              x+j_coord_offsetC,x+j_coord_offsetD,
 +                                              &jx0,&jy0,&jz0);
 +
 +            /* Calculate displacement vector */
 +            dx00             = _mm_sub_ps(ix0,jx0);
 +            dy00             = _mm_sub_ps(iy0,jy0);
 +            dz00             = _mm_sub_ps(iz0,jz0);
 +
 +            /* Calculate squared distance and things based on it */
 +            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
 +
 +            rinv00           = gmx_mm_invsqrt_ps(rsq00);
 +
 +            /* Load parameters for j particles */
 +            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
 +                                                              charge+jnrC+0,charge+jnrD+0);
 +            isaj0            = gmx_mm_load_4real_swizzle_ps(invsqrta+jnrA+0,invsqrta+jnrB+0,
 +                                                              invsqrta+jnrC+0,invsqrta+jnrD+0);
 +            vdwjidx0A        = 2*vdwtype[jnrA+0];
 +            vdwjidx0B        = 2*vdwtype[jnrB+0];
 +            vdwjidx0C        = 2*vdwtype[jnrC+0];
 +            vdwjidx0D        = 2*vdwtype[jnrD+0];
 +
 +            /**************************
 +             * CALCULATE INTERACTIONS *
 +             **************************/
 +
 +            r00              = _mm_mul_ps(rsq00,rinv00);
 +
 +            /* Compute parameters for interactions between i and j atoms */
 +            qq00             = _mm_mul_ps(iq0,jq0);
 +            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
 +                                         vdwparam+vdwioffset0+vdwjidx0B,
 +                                         vdwparam+vdwioffset0+vdwjidx0C,
 +                                         vdwparam+vdwioffset0+vdwjidx0D,
 +                                         &c6_00,&c12_00);
 +
 +            /* Calculate the table index by multiplying r by the table scale and truncating to integer */
 +            rt               = _mm_mul_ps(r00,vftabscale);
 +            vfitab           = _mm_cvttps_epi32(rt);
 +            vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
 +            vfitab           = _mm_slli_epi32(vfitab,3);
 +
 +            /* GENERALIZED BORN AND COULOMB ELECTROSTATICS */
 +            isaprod          = _mm_mul_ps(isai0,isaj0);
 +            gbqqfactor       = _mm_xor_ps(signbit,_mm_mul_ps(qq00,_mm_mul_ps(isaprod,gbinvepsdiff)));
 +            gbscale          = _mm_mul_ps(isaprod,gbtabscale);
 +
 +            /* Calculate generalized born table index - this is a separate table from the normal one,
 +             * but we use the same procedure by multiplying r with scale and truncating to integer.
 +             */
 +            rt               = _mm_mul_ps(r00,gbscale);
 +            gbitab           = _mm_cvttps_epi32(rt);
 +            gbeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
 +            gbitab           = _mm_slli_epi32(gbitab,2);
 +            Y                = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,0) );
 +            F                = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,1) );
 +            G                = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,2) );
 +            H                = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,3) );
 +            _MM_TRANSPOSE4_PS(Y,F,G,H);
 +            Heps             = _mm_mul_ps(gbeps,H);
 +            Fp               = _mm_add_ps(F,_mm_mul_ps(gbeps,_mm_add_ps(G,Heps)));
 +            VV               = _mm_add_ps(Y,_mm_mul_ps(gbeps,Fp));
 +            vgb              = _mm_mul_ps(gbqqfactor,VV);
 +
 +            FF               = _mm_add_ps(Fp,_mm_mul_ps(gbeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
 +            fgb              = _mm_mul_ps(gbqqfactor,_mm_mul_ps(FF,gbscale));
 +            dvdatmp          = _mm_mul_ps(minushalf,_mm_add_ps(vgb,_mm_mul_ps(fgb,r00)));
 +            dvdasum          = _mm_add_ps(dvdasum,dvdatmp);
 +            fjptrA           = dvda+jnrA;
 +            fjptrB           = dvda+jnrB;
 +            fjptrC           = dvda+jnrC;
 +            fjptrD           = dvda+jnrD;
 +            gmx_mm_increment_4real_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,_mm_mul_ps(dvdatmp,_mm_mul_ps(isaj0,isaj0)));
 +            velec            = _mm_mul_ps(qq00,rinv00);
 +            felec            = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(velec,rinv00),fgb),rinv00);
 +
 +            /* CUBIC SPLINE TABLE DISPERSION */
 +            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
 +            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
 +            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
 +            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
 +            _MM_TRANSPOSE4_PS(Y,F,G,H);
 +            Heps             = _mm_mul_ps(vfeps,H);
 +            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
 +            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
 +            fvdw6            = _mm_mul_ps(c6_00,FF);
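 +            /* Force-only kernel: the spline value VV is never used, so only
 +             * Fp and the derivative FF are formed here. */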
 +
 +            /* CUBIC SPLINE TABLE REPULSION */
 +            vfitab           = _mm_add_epi32(vfitab,ifour);
 +            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
 +            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
 +            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
 +            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
 +            _MM_TRANSPOSE4_PS(Y,F,G,H);
 +            Heps             = _mm_mul_ps(vfeps,H);
 +            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
 +            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
 +            fvdw12           = _mm_mul_ps(c12_00,FF);
 +            fvdw             = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
 +
 +            fscal            = _mm_add_ps(felec,fvdw);
 +
 +            /* Calculate temporary vectorial force */
 +            tx               = _mm_mul_ps(fscal,dx00);
 +            ty               = _mm_mul_ps(fscal,dy00);
 +            tz               = _mm_mul_ps(fscal,dz00);
 +
 +            /* Update vectorial force */
 +            fix0             = _mm_add_ps(fix0,tx);
 +            fiy0             = _mm_add_ps(fiy0,ty);
 +            fiz0             = _mm_add_ps(fiz0,tz);
 +
 +            fjptrA             = f+j_coord_offsetA;
 +            fjptrB             = f+j_coord_offsetB;
 +            fjptrC             = f+j_coord_offsetC;
 +            fjptrD             = f+j_coord_offsetD;
 +            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
 +
 +            /* Inner loop uses 82 flops */
 +        }
 +
 +        if(jidx<j_index_end)
 +        {
 +
 +            /* Get j neighbor index and coordinate index */
 +            jnrlistA         = jjnr[jidx];
 +            jnrlistB         = jjnr[jidx+1];
 +            jnrlistC         = jjnr[jidx+2];
 +            jnrlistD         = jjnr[jidx+3];
 +            /* Sign of each element will be negative for non-real atoms.
 +             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
 +             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
 +             */
 +            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 +            jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
 +            jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
 +            jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
 +            jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
 +            j_coord_offsetA  = DIM*jnrA;
 +            j_coord_offsetB  = DIM*jnrB;
 +            j_coord_offsetC  = DIM*jnrC;
 +            j_coord_offsetD  = DIM*jnrD;
 +
 +            /* load j atom coordinates */
 +            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
 +                                              x+j_coord_offsetC,x+j_coord_offsetD,
 +                                              &jx0,&jy0,&jz0);
 +
 +            /* Calculate displacement vector */
 +            dx00             = _mm_sub_ps(ix0,jx0);
 +            dy00             = _mm_sub_ps(iy0,jy0);
 +            dz00             = _mm_sub_ps(iz0,jz0);
 +
 +            /* Calculate squared distance and things based on it */
 +            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
 +
 +            rinv00           = gmx_mm_invsqrt_ps(rsq00);
 +
 +            /* Load parameters for j particles */
 +            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
 +                                                              charge+jnrC+0,charge+jnrD+0);
 +            isaj0            = gmx_mm_load_4real_swizzle_ps(invsqrta+jnrA+0,invsqrta+jnrB+0,
 +                                                              invsqrta+jnrC+0,invsqrta+jnrD+0);
 +            vdwjidx0A        = 2*vdwtype[jnrA+0];
 +            vdwjidx0B        = 2*vdwtype[jnrB+0];
 +            vdwjidx0C        = 2*vdwtype[jnrC+0];
 +            vdwjidx0D        = 2*vdwtype[jnrD+0];
 +
 +            /**************************
 +             * CALCULATE INTERACTIONS *
 +             **************************/
 +
 +            r00              = _mm_mul_ps(rsq00,rinv00);
 +            r00              = _mm_andnot_ps(dummy_mask,r00);
 +
 +            /* Compute parameters for interactions between i and j atoms */
 +            qq00             = _mm_mul_ps(iq0,jq0);
 +            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
 +                                         vdwparam+vdwioffset0+vdwjidx0B,
 +                                         vdwparam+vdwioffset0+vdwjidx0C,
 +                                         vdwparam+vdwioffset0+vdwjidx0D,
 +                                         &c6_00,&c12_00);
 +
 +            /* Calculate the table index by multiplying r by the table scale and truncating to integer */
 +            rt               = _mm_mul_ps(r00,vftabscale);
 +            vfitab           = _mm_cvttps_epi32(rt);
 +            vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
 +            vfitab           = _mm_slli_epi32(vfitab,3);
 +
 +            /* GENERALIZED BORN AND COULOMB ELECTROSTATICS */
 +            isaprod          = _mm_mul_ps(isai0,isaj0);
 +            gbqqfactor       = _mm_xor_ps(signbit,_mm_mul_ps(qq00,_mm_mul_ps(isaprod,gbinvepsdiff)));
 +            gbscale          = _mm_mul_ps(isaprod,gbtabscale);
 +
 +            /* Calculate generalized born table index - this is a separate table from the normal one,
 +             * but we use the same procedure by multiplying r with scale and truncating to integer.
 +             */
 +            rt               = _mm_mul_ps(r00,gbscale);
 +            gbitab           = _mm_cvttps_epi32(rt);
 +            gbeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
 +            gbitab           = _mm_slli_epi32(gbitab,2);
 +            Y                = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,0) );
 +            F                = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,1) );
 +            G                = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,2) );
 +            H                = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,3) );
 +            _MM_TRANSPOSE4_PS(Y,F,G,H);
 +            Heps             = _mm_mul_ps(gbeps,H);
 +            Fp               = _mm_add_ps(F,_mm_mul_ps(gbeps,_mm_add_ps(G,Heps)));
 +            VV               = _mm_add_ps(Y,_mm_mul_ps(gbeps,Fp));
 +            vgb              = _mm_mul_ps(gbqqfactor,VV);
 +
 +            FF               = _mm_add_ps(Fp,_mm_mul_ps(gbeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
 +            fgb              = _mm_mul_ps(gbqqfactor,_mm_mul_ps(FF,gbscale));
 +            dvdatmp          = _mm_mul_ps(minushalf,_mm_add_ps(vgb,_mm_mul_ps(fgb,r00)));
++            dvdatmp          = _mm_andnot_ps(dummy_mask,dvdatmp);
 +            dvdasum          = _mm_add_ps(dvdasum,dvdatmp);
 +            /* The pointers to scratch make sure that compilers that take gmx_restrict seriously (e.g. icc 13) cannot produce incorrect code here. */
 +            fjptrA             = (jnrlistA>=0) ? dvda+jnrA : scratch;
 +            fjptrB             = (jnrlistB>=0) ? dvda+jnrB : scratch;
 +            fjptrC             = (jnrlistC>=0) ? dvda+jnrC : scratch;
 +            fjptrD             = (jnrlistD>=0) ? dvda+jnrD : scratch;
 +            gmx_mm_increment_4real_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,_mm_mul_ps(dvdatmp,_mm_mul_ps(isaj0,isaj0)));
 +            velec            = _mm_mul_ps(qq00,rinv00);
 +            felec            = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(velec,rinv00),fgb),rinv00);
 +
 +            /* CUBIC SPLINE TABLE DISPERSION */
 +            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
 +            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
 +            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
 +            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
 +            _MM_TRANSPOSE4_PS(Y,F,G,H);
 +            Heps             = _mm_mul_ps(vfeps,H);
 +            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
 +            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
 +            fvdw6            = _mm_mul_ps(c6_00,FF);
 +
 +            /* CUBIC SPLINE TABLE REPULSION */
 +            vfitab           = _mm_add_epi32(vfitab,ifour);
 +            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
 +            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
 +            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
 +            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
 +            _MM_TRANSPOSE4_PS(Y,F,G,H);
 +            Heps             = _mm_mul_ps(vfeps,H);
 +            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
 +            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
 +            fvdw12           = _mm_mul_ps(c12_00,FF);
 +            fvdw             = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
 +
 +            fscal            = _mm_add_ps(felec,fvdw);
 +
 +            fscal            = _mm_andnot_ps(dummy_mask,fscal);
 +
 +            /* Calculate temporary vectorial force */
 +            tx               = _mm_mul_ps(fscal,dx00);
 +            ty               = _mm_mul_ps(fscal,dy00);
 +            tz               = _mm_mul_ps(fscal,dz00);
 +
 +            /* Update vectorial force */
 +            fix0             = _mm_add_ps(fix0,tx);
 +            fiy0             = _mm_add_ps(fiy0,ty);
 +            fiz0             = _mm_add_ps(fiz0,tz);
 +
 +            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
 +            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
 +            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
 +            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
 +            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
 +
 +            /* Inner loop uses 83 flops */
 +        }
 +
 +        /* End of innermost loop */
 +
 +        gmx_mm_update_iforce_1atom_swizzle_ps(fix0,fiy0,fiz0,
 +                                              f+i_coord_offset,fshift+i_shift_offset);
 +
 +        dvdasum = _mm_mul_ps(dvdasum, _mm_mul_ps(isai0,isai0));
 +        gmx_mm_update_1pot_ps(dvdasum,dvda+inr);
 +
 +        /* Increment number of inner iterations */
 +        inneriter                  += j_index_end - j_index_start;
 +
 +        /* Outer loop uses 7 flops */
 +    }
 +
 +    /* Increment number of outer iterations */
 +    outeriter        += nri;
 +
 +    /* Update outer/inner flops */
 +
 +    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_F,outeriter*7 + inneriter*83);
 +}
index 4f80354d9156d5f60e112b8e1e4185bee90fc57e,0000000000000000000000000000000000000000..e7800d7733b49328c77d91b79c21089288b16e11
mode 100644,000000..100644
--- /dev/null
@@@ -1,837 -1,0 +1,839 @@@
 +/*
 + * Note: this file was generated by the Gromacs sse4_1_single kernel generator.
 + *
 + *                This source code is part of
 + *
 + *                 G   R   O   M   A   C   S
 + *
 + * Copyright (c) 2001-2012, The GROMACS Development Team
 + *
 + * Gromacs is a library for molecular simulation and trajectory analysis,
 + * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
 + * a full list of developers and information, check out http://www.gromacs.org
 + *
 + * This program is free software; you can redistribute it and/or modify it under
 + * the terms of the GNU Lesser General Public License as published by the Free
 + * Software Foundation; either version 2 of the License, or (at your option) any
 + * later version.
 + *
 + * To help fund GROMACS development, we humbly ask that you cite
 + * the papers people have written on it - you can find them on the website.
 + */
 +#ifdef HAVE_CONFIG_H
 +#include <config.h>
 +#endif
 +
 +#include <math.h>
 +
 +#include "../nb_kernel.h"
 +#include "types/simple.h"
 +#include "vec.h"
 +#include "nrnb.h"
 +
 +#include "gmx_math_x86_sse4_1_single.h"
 +#include "kernelutil_x86_sse4_1_single.h"
 +
 +/*
 + * Gromacs nonbonded kernel:   nb_kernel_ElecGB_VdwLJ_GeomP1P1_VF_sse4_1_single
 + * Electrostatics interaction: GeneralizedBorn
 + * VdW interaction:            LennardJones
 + * Geometry:                   Particle-Particle
 + * Calculate force/pot:        PotentialAndForce
 + */
 +void
 +nb_kernel_ElecGB_VdwLJ_GeomP1P1_VF_sse4_1_single
 +                    (t_nblist * gmx_restrict                nlist,
 +                     rvec * gmx_restrict                    xx,
 +                     rvec * gmx_restrict                    ff,
 +                     t_forcerec * gmx_restrict              fr,
 +                     t_mdatoms * gmx_restrict               mdatoms,
 +                     nb_kernel_data_t * gmx_restrict        kernel_data,
 +                     t_nrnb * gmx_restrict                  nrnb)
 +{
 +    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
 +     * just 0 for non-waters.
 +     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
 +     * jnr indices corresponding to data put in the four positions in the SIMD register.
 +     */
 +    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
 +    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
 +    int              jnrA,jnrB,jnrC,jnrD;
 +    int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
 +    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
 +    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
 +    real             rcutoff_scalar;
 +    real             *shiftvec,*fshift,*x,*f;
 +    real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
 +    real             scratch[4*DIM];
 +    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
 +    int              vdwioffset0;
 +    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
 +    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
 +    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
 +    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
 +    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
 +    real             *charge;
 +    __m128i          gbitab;
 +    __m128           vgb,fgb,vgbsum,dvdasum,gbscale,gbtabscale,isaprod,gbqqfactor,gbinvepsdiff,gbeps,dvdatmp;
 +    __m128           minushalf = _mm_set1_ps(-0.5);
 +    real             *invsqrta,*dvda,*gbtab;
 +    int              nvdwtype;
 +    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
 +    int              *vdwtype;
 +    real             *vdwparam;
 +    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
 +    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
 +    __m128i          vfitab;
 +    __m128i          ifour       = _mm_set1_epi32(4);
 +    __m128           rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
 +    real             *vftab;
 +    __m128           dummy_mask,cutoff_mask;
 +    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
 +    __m128           one     = _mm_set1_ps(1.0);
 +    __m128           two     = _mm_set1_ps(2.0);
 +    x                = xx[0];
 +    f                = ff[0];
 +
 +    nri              = nlist->nri;
 +    iinr             = nlist->iinr;
 +    jindex           = nlist->jindex;
 +    jjnr             = nlist->jjnr;
 +    shiftidx         = nlist->shift;
 +    gid              = nlist->gid;
 +    shiftvec         = fr->shift_vec[0];
 +    fshift           = fr->fshift[0];
 +    facel            = _mm_set1_ps(fr->epsfac);
 +    charge           = mdatoms->chargeA;
 +    nvdwtype         = fr->ntype;
 +    vdwparam         = fr->nbfp;
 +    vdwtype          = mdatoms->typeA;
 +
 +    invsqrta         = fr->invsqrta;
 +    dvda             = fr->dvda;
 +    gbtabscale       = _mm_set1_ps(fr->gbtab.scale);
 +    gbtab            = fr->gbtab.data;
 +    gbinvepsdiff     = _mm_set1_ps((1.0/fr->epsilon_r) - (1.0/fr->gb_epsilon_solvent));
 +
 +    /* Avoid stupid compiler warnings */
 +    jnrA = jnrB = jnrC = jnrD = 0;
 +    j_coord_offsetA = 0;
 +    j_coord_offsetB = 0;
 +    j_coord_offsetC = 0;
 +    j_coord_offsetD = 0;
 +
 +    outeriter        = 0;
 +    inneriter        = 0;
 +
 +    for(iidx=0;iidx<4*DIM;iidx++)
 +    {
 +        scratch[iidx] = 0.0;
 +    }
 +
 +    /* Start outer loop over neighborlists */
 +    for(iidx=0; iidx<nri; iidx++)
 +    {
 +        /* Load shift vector for this list */
 +        i_shift_offset   = DIM*shiftidx[iidx];
 +
 +        /* Load limits for loop over neighbors */
 +        j_index_start    = jindex[iidx];
 +        j_index_end      = jindex[iidx+1];
 +
 +        /* Get outer coordinate index */
 +        inr              = iinr[iidx];
 +        i_coord_offset   = DIM*inr;
 +
 +        /* Load i particle coords and add shift vector */
 +        gmx_mm_load_shift_and_1rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
 +
 +        fix0             = _mm_setzero_ps();
 +        fiy0             = _mm_setzero_ps();
 +        fiz0             = _mm_setzero_ps();
 +
 +        /* Load parameters for i particles */
 +        iq0              = _mm_mul_ps(facel,_mm_load1_ps(charge+inr+0));
 +        isai0            = _mm_load1_ps(invsqrta+inr+0);
 +        vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
 +
 +        /* Reset potential sums */
 +        velecsum         = _mm_setzero_ps();
 +        vgbsum           = _mm_setzero_ps();
 +        vvdwsum          = _mm_setzero_ps();
 +        dvdasum          = _mm_setzero_ps();
 +
 +        /* Start inner kernel loop */
 +        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
 +        {
 +
 +            /* Get j neighbor index and coordinate index */
 +            jnrA             = jjnr[jidx];
 +            jnrB             = jjnr[jidx+1];
 +            jnrC             = jjnr[jidx+2];
 +            jnrD             = jjnr[jidx+3];
 +            j_coord_offsetA  = DIM*jnrA;
 +            j_coord_offsetB  = DIM*jnrB;
 +            j_coord_offsetC  = DIM*jnrC;
 +            j_coord_offsetD  = DIM*jnrD;
 +
 +            /* load j atom coordinates */
 +            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
 +                                              x+j_coord_offsetC,x+j_coord_offsetD,
 +                                              &jx0,&jy0,&jz0);
 +
 +            /* Calculate displacement vector */
 +            dx00             = _mm_sub_ps(ix0,jx0);
 +            dy00             = _mm_sub_ps(iy0,jy0);
 +            dz00             = _mm_sub_ps(iz0,jz0);
 +
 +            /* Calculate squared distance and things based on it */
 +            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
 +
 +            rinv00           = gmx_mm_invsqrt_ps(rsq00);
 +
 +            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
 +
 +            /* Load parameters for j particles */
 +            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
 +                                                              charge+jnrC+0,charge+jnrD+0);
 +            isaj0            = gmx_mm_load_4real_swizzle_ps(invsqrta+jnrA+0,invsqrta+jnrB+0,
 +                                                              invsqrta+jnrC+0,invsqrta+jnrD+0);
 +            vdwjidx0A        = 2*vdwtype[jnrA+0];
 +            vdwjidx0B        = 2*vdwtype[jnrB+0];
 +            vdwjidx0C        = 2*vdwtype[jnrC+0];
 +            vdwjidx0D        = 2*vdwtype[jnrD+0];
 +
 +            /**************************
 +             * CALCULATE INTERACTIONS *
 +             **************************/
 +
 +            r00              = _mm_mul_ps(rsq00,rinv00);
 +
 +            /* Compute parameters for interactions between i and j atoms */
 +            qq00             = _mm_mul_ps(iq0,jq0);
 +            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
 +                                         vdwparam+vdwioffset0+vdwjidx0B,
 +                                         vdwparam+vdwioffset0+vdwjidx0C,
 +                                         vdwparam+vdwioffset0+vdwjidx0D,
 +                                         &c6_00,&c12_00);
 +
 +            /* GENERALIZED BORN AND COULOMB ELECTROSTATICS */
 +            isaprod          = _mm_mul_ps(isai0,isaj0);
 +            gbqqfactor       = _mm_xor_ps(signbit,_mm_mul_ps(qq00,_mm_mul_ps(isaprod,gbinvepsdiff)));
 +            gbscale          = _mm_mul_ps(isaprod,gbtabscale);
 +
 +            /* Calculate generalized born table index - this is a separate table from the normal one,
 +             * but we use the same procedure by multiplying r with scale and truncating to integer.
 +             */
 +            rt               = _mm_mul_ps(r00,gbscale);
 +            gbitab           = _mm_cvttps_epi32(rt);
 +            gbeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
 +            gbitab           = _mm_slli_epi32(gbitab,2);
 +            Y                = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,0) );
 +            F                = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,1) );
 +            G                = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,2) );
 +            H                = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,3) );
 +            _MM_TRANSPOSE4_PS(Y,F,G,H);
 +            Heps             = _mm_mul_ps(gbeps,H);
 +            Fp               = _mm_add_ps(F,_mm_mul_ps(gbeps,_mm_add_ps(G,Heps)));
 +            VV               = _mm_add_ps(Y,_mm_mul_ps(gbeps,Fp));
 +            vgb              = _mm_mul_ps(gbqqfactor,VV);
 +
 +            FF               = _mm_add_ps(Fp,_mm_mul_ps(gbeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
 +            fgb              = _mm_mul_ps(gbqqfactor,_mm_mul_ps(FF,gbscale));
 +            dvdatmp          = _mm_mul_ps(minushalf,_mm_add_ps(vgb,_mm_mul_ps(fgb,r00)));
 +            dvdasum          = _mm_add_ps(dvdasum,dvdatmp);
 +            fjptrA           = dvda+jnrA;
 +            fjptrB           = dvda+jnrB;
 +            fjptrC           = dvda+jnrC;
 +            fjptrD           = dvda+jnrD;
 +            gmx_mm_increment_4real_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,_mm_mul_ps(dvdatmp,_mm_mul_ps(isaj0,isaj0)));
 +            velec            = _mm_mul_ps(qq00,rinv00);
 +            felec            = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(velec,rinv00),fgb),rinv00);
 +
 +            /* LENNARD-JONES DISPERSION/REPULSION */
 +
 +            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
 +            vvdw6            = _mm_mul_ps(c6_00,rinvsix);
 +            vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
 +            vvdw             = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
 +            fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
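 +            /* Consistency sketch for the two lines above: with A = c12_00 and
 +             * B = c6_00 as loaded,
 +             *     vvdw = A/12 * r^-12 - B/6 * r^-6
 +             * and -dvvdw/dr * 1/r = (A*r^-12 - B*r^-6)*r^-2, which is exactly
 +             * fvdw = (vvdw12 - vvdw6)*rinvsq00.
 +             */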
 +
 +            /* Update potential sum for this i atom from the interaction with this j atom. */
 +            velecsum         = _mm_add_ps(velecsum,velec);
 +            vgbsum           = _mm_add_ps(vgbsum,vgb);
 +            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
 +
 +            fscal            = _mm_add_ps(felec,fvdw);
 +
 +            /* Calculate temporary vectorial force */
 +            tx               = _mm_mul_ps(fscal,dx00);
 +            ty               = _mm_mul_ps(fscal,dy00);
 +            tz               = _mm_mul_ps(fscal,dz00);
 +
 +            /* Update vectorial force */
 +            fix0             = _mm_add_ps(fix0,tx);
 +            fiy0             = _mm_add_ps(fiy0,ty);
 +            fiz0             = _mm_add_ps(fiz0,tz);
 +
 +            fjptrA             = f+j_coord_offsetA;
 +            fjptrB             = f+j_coord_offsetB;
 +            fjptrC             = f+j_coord_offsetC;
 +            fjptrD             = f+j_coord_offsetD;
 +            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
 +
 +            /* Inner loop uses 71 flops */
 +        }
 +
 +        if(jidx<j_index_end)
 +        {
 +
 +            /* Get j neighbor index and coordinate index */
 +            jnrlistA         = jjnr[jidx];
 +            jnrlistB         = jjnr[jidx+1];
 +            jnrlistC         = jjnr[jidx+2];
 +            jnrlistD         = jjnr[jidx+3];
 +            /* Sign of each element will be negative for non-real atoms.
 +             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
 +             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
 +             */
 +            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 +            jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
 +            jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
 +            jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
 +            jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
 +            j_coord_offsetA  = DIM*jnrA;
 +            j_coord_offsetB  = DIM*jnrB;
 +            j_coord_offsetC  = DIM*jnrC;
 +            j_coord_offsetD  = DIM*jnrD;
 +
 +            /* load j atom coordinates */
 +            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
 +                                              x+j_coord_offsetC,x+j_coord_offsetD,
 +                                              &jx0,&jy0,&jz0);
 +
 +            /* Calculate displacement vector */
 +            dx00             = _mm_sub_ps(ix0,jx0);
 +            dy00             = _mm_sub_ps(iy0,jy0);
 +            dz00             = _mm_sub_ps(iz0,jz0);
 +
 +            /* Calculate squared distance and things based on it */
 +            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
 +
 +            rinv00           = gmx_mm_invsqrt_ps(rsq00);
 +
 +            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
 +
 +            /* Load parameters for j particles */
 +            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
 +                                                              charge+jnrC+0,charge+jnrD+0);
 +            isaj0            = gmx_mm_load_4real_swizzle_ps(invsqrta+jnrA+0,invsqrta+jnrB+0,
 +                                                              invsqrta+jnrC+0,invsqrta+jnrD+0);
 +            vdwjidx0A        = 2*vdwtype[jnrA+0];
 +            vdwjidx0B        = 2*vdwtype[jnrB+0];
 +            vdwjidx0C        = 2*vdwtype[jnrC+0];
 +            vdwjidx0D        = 2*vdwtype[jnrD+0];
 +
 +            /**************************
 +             * CALCULATE INTERACTIONS *
 +             **************************/
 +
 +            r00              = _mm_mul_ps(rsq00,rinv00);
 +            r00              = _mm_andnot_ps(dummy_mask,r00);
 +
 +            /* Compute parameters for interactions between i and j atoms */
 +            qq00             = _mm_mul_ps(iq0,jq0);
 +            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
 +                                         vdwparam+vdwioffset0+vdwjidx0B,
 +                                         vdwparam+vdwioffset0+vdwjidx0C,
 +                                         vdwparam+vdwioffset0+vdwjidx0D,
 +                                         &c6_00,&c12_00);
 +
 +            /* GENERALIZED BORN AND COULOMB ELECTROSTATICS */
 +            isaprod          = _mm_mul_ps(isai0,isaj0);
 +            gbqqfactor       = _mm_xor_ps(signbit,_mm_mul_ps(qq00,_mm_mul_ps(isaprod,gbinvepsdiff)));
 +            gbscale          = _mm_mul_ps(isaprod,gbtabscale);
 +
 +            /* Calculate generalized born table index - this is a separate table from the normal one,
 +             * but we use the same procedure by multiplying r with scale and truncating to integer.
 +             */
 +            rt               = _mm_mul_ps(r00,gbscale);
 +            gbitab           = _mm_cvttps_epi32(rt);
 +            gbeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
 +            gbitab           = _mm_slli_epi32(gbitab,2);
 +            Y                = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,0) );
 +            F                = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,1) );
 +            G                = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,2) );
 +            H                = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,3) );
 +            _MM_TRANSPOSE4_PS(Y,F,G,H);
 +            Heps             = _mm_mul_ps(gbeps,H);
 +            Fp               = _mm_add_ps(F,_mm_mul_ps(gbeps,_mm_add_ps(G,Heps)));
 +            VV               = _mm_add_ps(Y,_mm_mul_ps(gbeps,Fp));
 +            vgb              = _mm_mul_ps(gbqqfactor,VV);
 +
 +            FF               = _mm_add_ps(Fp,_mm_mul_ps(gbeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
 +            fgb              = _mm_mul_ps(gbqqfactor,_mm_mul_ps(FF,gbscale));
 +            dvdatmp          = _mm_mul_ps(minushalf,_mm_add_ps(vgb,_mm_mul_ps(fgb,r00)));
++            dvdatmp          = _mm_andnot_ps(dummy_mask,dvdatmp);
 +            dvdasum          = _mm_add_ps(dvdasum,dvdatmp);
 +            /* The pointers to scratch make sure that compilers that take gmx_restrict seriously (e.g. icc 13) cannot produce incorrect code here. */
 +            fjptrA             = (jnrlistA>=0) ? dvda+jnrA : scratch;
 +            fjptrB             = (jnrlistB>=0) ? dvda+jnrB : scratch;
 +            fjptrC             = (jnrlistC>=0) ? dvda+jnrC : scratch;
 +            fjptrD             = (jnrlistD>=0) ? dvda+jnrD : scratch;
 +            gmx_mm_increment_4real_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,_mm_mul_ps(dvdatmp,_mm_mul_ps(isaj0,isaj0)));
 +            velec            = _mm_mul_ps(qq00,rinv00);
 +            felec            = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(velec,rinv00),fgb),rinv00);
 +
 +            /* LENNARD-JONES DISPERSION/REPULSION */
 +
 +            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
 +            vvdw6            = _mm_mul_ps(c6_00,rinvsix);
 +            vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
 +            vvdw             = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
 +            fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
 +
 +            /* Update potential sum for this i atom from the interaction with this j atom. */
 +            velec            = _mm_andnot_ps(dummy_mask,velec);
 +            velecsum         = _mm_add_ps(velecsum,velec);
 +            vgb              = _mm_andnot_ps(dummy_mask,vgb);
 +            vgbsum           = _mm_add_ps(vgbsum,vgb);
 +            vvdw             = _mm_andnot_ps(dummy_mask,vvdw);
 +            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
 +
 +            fscal            = _mm_add_ps(felec,fvdw);
 +
 +            fscal            = _mm_andnot_ps(dummy_mask,fscal);
 +
 +            /* Calculate temporary vectorial force */
 +            tx               = _mm_mul_ps(fscal,dx00);
 +            ty               = _mm_mul_ps(fscal,dy00);
 +            tz               = _mm_mul_ps(fscal,dz00);
 +
 +            /* Update vectorial force */
 +            fix0             = _mm_add_ps(fix0,tx);
 +            fiy0             = _mm_add_ps(fiy0,ty);
 +            fiz0             = _mm_add_ps(fiz0,tz);
 +
 +            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
 +            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
 +            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
 +            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
 +            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
 +
 +            /* Inner loop uses 72 flops */
 +        }
 +
 +        /* End of innermost loop */
 +
 +        gmx_mm_update_iforce_1atom_swizzle_ps(fix0,fiy0,fiz0,
 +                                              f+i_coord_offset,fshift+i_shift_offset);
 +
 +        ggid                        = gid[iidx];
 +        /* Update potential energies */
 +        gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
 +        gmx_mm_update_1pot_ps(vgbsum,kernel_data->energygrp_polarization+ggid);
 +        gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
 +        dvdasum = _mm_mul_ps(dvdasum, _mm_mul_ps(isai0,isai0));
 +        gmx_mm_update_1pot_ps(dvdasum,dvda+inr);
 +
 +        /* Increment number of inner iterations */
 +        inneriter                  += j_index_end - j_index_start;
 +
 +        /* Outer loop uses 10 flops */
 +    }
 +
 +    /* Increment number of outer iterations */
 +    outeriter        += nri;
 +
 +    /* Update outer/inner flops */
 +
 +    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_VF,outeriter*10 + inneriter*72);
 +}
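
The Generalized Born loops above all share the same cubic-spline table lookup: r is scaled into table units, truncated to an integer index, and the fractional remainder eps is used to interpolate both the value and its derivative from four packed coefficients per table point. A minimal scalar sketch of that lookup, assuming the Y,F,G,H packing used by the loads above (gb_table_lookup is a hypothetical helper name, not a GROMACS function):

    /* Scalar equivalent of the vectorized GB table lookup above (hypothetical sketch). */
    static float gb_table_lookup(const float *gbtab, float gbscale, float r, float *ff)
    {
        float        rt  = r*gbscale;        /* distance in table units           */
        int          n   = (int)rt;          /* truncate to table index           */
        float        eps = rt - (float)n;    /* fractional offset within the bin  */
        const float *p   = gbtab + 4*n;      /* four spline coefficients per bin  */
        float        Y = p[0], F = p[1], G = p[2], H = p[3];
        float        Fp = F + eps*(G + eps*H);
        *ff = Fp + eps*(G + 2.0f*eps*H);     /* FF: derivative factor             */
        return Y + eps*Fp;                   /* VV: interpolated potential        */
    }

The kernels then form vgb = gbqqfactor*VV and fgb = gbqqfactor*FF*gbscale, which is exactly what the SIMD expressions above compute four lanes at a time.
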
 +/*
 + * Gromacs nonbonded kernel:   nb_kernel_ElecGB_VdwLJ_GeomP1P1_F_sse4_1_single
 + * Electrostatics interaction: GeneralizedBorn
 + * VdW interaction:            LennardJones
 + * Geometry:                   Particle-Particle
 + * Calculate force/pot:        Force
 + */
 +void
 +nb_kernel_ElecGB_VdwLJ_GeomP1P1_F_sse4_1_single
 +                    (t_nblist * gmx_restrict                nlist,
 +                     rvec * gmx_restrict                    xx,
 +                     rvec * gmx_restrict                    ff,
 +                     t_forcerec * gmx_restrict              fr,
 +                     t_mdatoms * gmx_restrict               mdatoms,
 +                     nb_kernel_data_t * gmx_restrict        kernel_data,
 +                     t_nrnb * gmx_restrict                  nrnb)
 +{
 +    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
 +     * just 0 for non-waters.
 +     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
 +     * jnr indices corresponding to data put in the four positions in the SIMD register.
 +     */
 +    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
 +    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
 +    int              jnrA,jnrB,jnrC,jnrD;
 +    int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
 +    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
 +    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
 +    real             rcutoff_scalar;
 +    real             *shiftvec,*fshift,*x,*f;
 +    real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
 +    real             scratch[4*DIM];
 +    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
 +    int              vdwioffset0;
 +    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
 +    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
 +    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
 +    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
 +    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
 +    real             *charge;
 +    __m128i          gbitab;
 +    __m128           vgb,fgb,vgbsum,dvdasum,gbscale,gbtabscale,isaprod,gbqqfactor,gbinvepsdiff,gbeps,dvdatmp;
 +    __m128           minushalf = _mm_set1_ps(-0.5);
 +    real             *invsqrta,*dvda,*gbtab;
 +    int              nvdwtype;
 +    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
 +    int              *vdwtype;
 +    real             *vdwparam;
 +    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
 +    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
 +    __m128i          vfitab;
 +    __m128i          ifour       = _mm_set1_epi32(4);
 +    __m128           rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
 +    real             *vftab;
 +    __m128           dummy_mask,cutoff_mask;
 +    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
 +    __m128           one     = _mm_set1_ps(1.0);
 +    __m128           two     = _mm_set1_ps(2.0);
 +    x                = xx[0];
 +    f                = ff[0];
 +
 +    nri              = nlist->nri;
 +    iinr             = nlist->iinr;
 +    jindex           = nlist->jindex;
 +    jjnr             = nlist->jjnr;
 +    shiftidx         = nlist->shift;
 +    gid              = nlist->gid;
 +    shiftvec         = fr->shift_vec[0];
 +    fshift           = fr->fshift[0];
 +    facel            = _mm_set1_ps(fr->epsfac);
 +    charge           = mdatoms->chargeA;
 +    nvdwtype         = fr->ntype;
 +    vdwparam         = fr->nbfp;
 +    vdwtype          = mdatoms->typeA;
 +
 +    invsqrta         = fr->invsqrta;
 +    dvda             = fr->dvda;
 +    gbtabscale       = _mm_set1_ps(fr->gbtab.scale);
 +    gbtab            = fr->gbtab.data;
 +    gbinvepsdiff     = _mm_set1_ps((1.0/fr->epsilon_r) - (1.0/fr->gb_epsilon_solvent));
 +
 +    /* Avoid stupid compiler warnings */
 +    jnrA = jnrB = jnrC = jnrD = 0;
 +    j_coord_offsetA = 0;
 +    j_coord_offsetB = 0;
 +    j_coord_offsetC = 0;
 +    j_coord_offsetD = 0;
 +
 +    outeriter        = 0;
 +    inneriter        = 0;
 +
 +    for(iidx=0;iidx<4*DIM;iidx++)
 +    {
 +        scratch[iidx] = 0.0;
 +    }
 +
 +    /* Start outer loop over neighborlists */
 +    for(iidx=0; iidx<nri; iidx++)
 +    {
 +        /* Load shift vector for this list */
 +        i_shift_offset   = DIM*shiftidx[iidx];
 +
 +        /* Load limits for loop over neighbors */
 +        j_index_start    = jindex[iidx];
 +        j_index_end      = jindex[iidx+1];
 +
 +        /* Get outer coordinate index */
 +        inr              = iinr[iidx];
 +        i_coord_offset   = DIM*inr;
 +
 +        /* Load i particle coords and add shift vector */
 +        gmx_mm_load_shift_and_1rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
 +
 +        fix0             = _mm_setzero_ps();
 +        fiy0             = _mm_setzero_ps();
 +        fiz0             = _mm_setzero_ps();
 +
 +        /* Load parameters for i particles */
 +        iq0              = _mm_mul_ps(facel,_mm_load1_ps(charge+inr+0));
 +        isai0            = _mm_load1_ps(invsqrta+inr+0);
 +        vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
 +
 +        dvdasum          = _mm_setzero_ps();
 +
 +        /* Start inner kernel loop */
 +        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
 +        {
 +
 +            /* Get j neighbor index, and coordinate index */
 +            jnrA             = jjnr[jidx];
 +            jnrB             = jjnr[jidx+1];
 +            jnrC             = jjnr[jidx+2];
 +            jnrD             = jjnr[jidx+3];
 +            j_coord_offsetA  = DIM*jnrA;
 +            j_coord_offsetB  = DIM*jnrB;
 +            j_coord_offsetC  = DIM*jnrC;
 +            j_coord_offsetD  = DIM*jnrD;
 +
 +            /* load j atom coordinates */
 +            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
 +                                              x+j_coord_offsetC,x+j_coord_offsetD,
 +                                              &jx0,&jy0,&jz0);
 +
 +            /* Calculate displacement vector */
 +            dx00             = _mm_sub_ps(ix0,jx0);
 +            dy00             = _mm_sub_ps(iy0,jy0);
 +            dz00             = _mm_sub_ps(iz0,jz0);
 +
 +            /* Calculate squared distance and things based on it */
 +            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
 +
 +            rinv00           = gmx_mm_invsqrt_ps(rsq00);
 +
 +            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
 +
 +            /* Load parameters for j particles */
 +            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
 +                                                              charge+jnrC+0,charge+jnrD+0);
 +            isaj0            = gmx_mm_load_4real_swizzle_ps(invsqrta+jnrA+0,invsqrta+jnrB+0,
 +                                                              invsqrta+jnrC+0,invsqrta+jnrD+0);
 +            vdwjidx0A        = 2*vdwtype[jnrA+0];
 +            vdwjidx0B        = 2*vdwtype[jnrB+0];
 +            vdwjidx0C        = 2*vdwtype[jnrC+0];
 +            vdwjidx0D        = 2*vdwtype[jnrD+0];
 +
 +            /**************************
 +             * CALCULATE INTERACTIONS *
 +             **************************/
 +
 +            r00              = _mm_mul_ps(rsq00,rinv00);
 +
 +            /* Compute parameters for interactions between i and j atoms */
 +            qq00             = _mm_mul_ps(iq0,jq0);
 +            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
 +                                         vdwparam+vdwioffset0+vdwjidx0B,
 +                                         vdwparam+vdwioffset0+vdwjidx0C,
 +                                         vdwparam+vdwioffset0+vdwjidx0D,
 +                                         &c6_00,&c12_00);
 +
 +            /* GENERALIZED BORN AND COULOMB ELECTROSTATICS */
 +            isaprod          = _mm_mul_ps(isai0,isaj0);
 +            gbqqfactor       = _mm_xor_ps(signbit,_mm_mul_ps(qq00,_mm_mul_ps(isaprod,gbinvepsdiff)));
 +            gbscale          = _mm_mul_ps(isaprod,gbtabscale);
 +
 +            /* Calculate generalized born table index - this is a separate table from the normal one,
 +             * but we use the same procedure by multiplying r with scale and truncating to integer.
 +             */
 +            rt               = _mm_mul_ps(r00,gbscale);
 +            gbitab           = _mm_cvttps_epi32(rt);
 +            gbeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
 +            gbitab           = _mm_slli_epi32(gbitab,2);
 +            Y                = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,0) );
 +            F                = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,1) );
 +            G                = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,2) );
 +            H                = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,3) );
 +            _MM_TRANSPOSE4_PS(Y,F,G,H);
 +            Heps             = _mm_mul_ps(gbeps,H);
 +            Fp               = _mm_add_ps(F,_mm_mul_ps(gbeps,_mm_add_ps(G,Heps)));
 +            VV               = _mm_add_ps(Y,_mm_mul_ps(gbeps,Fp));
 +            vgb              = _mm_mul_ps(gbqqfactor,VV);
 +
 +            FF               = _mm_add_ps(Fp,_mm_mul_ps(gbeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
 +            fgb              = _mm_mul_ps(gbqqfactor,_mm_mul_ps(FF,gbscale));
 +            dvdatmp          = _mm_mul_ps(minushalf,_mm_add_ps(vgb,_mm_mul_ps(fgb,r00)));
 +            dvdasum          = _mm_add_ps(dvdasum,dvdatmp);
 +            fjptrA           = dvda+jnrA;
 +            fjptrB           = dvda+jnrB;
 +            fjptrC           = dvda+jnrC;
 +            fjptrD           = dvda+jnrD;
 +            gmx_mm_increment_4real_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,_mm_mul_ps(dvdatmp,_mm_mul_ps(isaj0,isaj0)));
 +            velec            = _mm_mul_ps(qq00,rinv00);
 +            felec            = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(velec,rinv00),fgb),rinv00);
 +
 +            /* LENNARD-JONES DISPERSION/REPULSION */
 +
 +            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
 +            fvdw             = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
 +
 +            fscal            = _mm_add_ps(felec,fvdw);
 +
 +            /* Calculate temporary vectorial force */
 +            tx               = _mm_mul_ps(fscal,dx00);
 +            ty               = _mm_mul_ps(fscal,dy00);
 +            tz               = _mm_mul_ps(fscal,dz00);
 +
 +            /* Update vectorial force */
 +            fix0             = _mm_add_ps(fix0,tx);
 +            fiy0             = _mm_add_ps(fiy0,ty);
 +            fiz0             = _mm_add_ps(fiz0,tz);
 +
 +            fjptrA             = f+j_coord_offsetA;
 +            fjptrB             = f+j_coord_offsetB;
 +            fjptrC             = f+j_coord_offsetC;
 +            fjptrD             = f+j_coord_offsetD;
 +            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
 +
 +            /* Inner loop uses 64 flops */
 +        }
 +
 +        if(jidx<j_index_end)
 +        {
 +
 +            /* Get j neighbor index, and coordinate index */
 +            jnrlistA         = jjnr[jidx];
 +            jnrlistB         = jjnr[jidx+1];
 +            jnrlistC         = jjnr[jidx+2];
 +            jnrlistD         = jjnr[jidx+3];
 +            /* The j index of each non-real (padding) atom is negative.
 +             * The mask is therefore 0xFFFFFFFF for dummy entries and 0x0 for real ones,
 +             * so it can be used as val = _mm_andnot_ps(mask,val) to clear dummy entries.
 +             */
 +            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 +            jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
 +            jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
 +            jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
 +            jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
 +            j_coord_offsetA  = DIM*jnrA;
 +            j_coord_offsetB  = DIM*jnrB;
 +            j_coord_offsetC  = DIM*jnrC;
 +            j_coord_offsetD  = DIM*jnrD;
 +
 +            /* load j atom coordinates */
 +            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
 +                                              x+j_coord_offsetC,x+j_coord_offsetD,
 +                                              &jx0,&jy0,&jz0);
 +
 +            /* Calculate displacement vector */
 +            dx00             = _mm_sub_ps(ix0,jx0);
 +            dy00             = _mm_sub_ps(iy0,jy0);
 +            dz00             = _mm_sub_ps(iz0,jz0);
 +
 +            /* Calculate squared distance and things based on it */
 +            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
 +
 +            rinv00           = gmx_mm_invsqrt_ps(rsq00);
 +
 +            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
 +
 +            /* Load parameters for j particles */
 +            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
 +                                                              charge+jnrC+0,charge+jnrD+0);
 +            isaj0            = gmx_mm_load_4real_swizzle_ps(invsqrta+jnrA+0,invsqrta+jnrB+0,
 +                                                              invsqrta+jnrC+0,invsqrta+jnrD+0);
 +            vdwjidx0A        = 2*vdwtype[jnrA+0];
 +            vdwjidx0B        = 2*vdwtype[jnrB+0];
 +            vdwjidx0C        = 2*vdwtype[jnrC+0];
 +            vdwjidx0D        = 2*vdwtype[jnrD+0];
 +
 +            /**************************
 +             * CALCULATE INTERACTIONS *
 +             **************************/
 +
 +            r00              = _mm_mul_ps(rsq00,rinv00);
 +            r00              = _mm_andnot_ps(dummy_mask,r00);
 +
 +            /* Compute parameters for interactions between i and j atoms */
 +            qq00             = _mm_mul_ps(iq0,jq0);
 +            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
 +                                         vdwparam+vdwioffset0+vdwjidx0B,
 +                                         vdwparam+vdwioffset0+vdwjidx0C,
 +                                         vdwparam+vdwioffset0+vdwjidx0D,
 +                                         &c6_00,&c12_00);
 +
 +            /* GENERALIZED BORN AND COULOMB ELECTROSTATICS */
 +            isaprod          = _mm_mul_ps(isai0,isaj0);
 +            gbqqfactor       = _mm_xor_ps(signbit,_mm_mul_ps(qq00,_mm_mul_ps(isaprod,gbinvepsdiff)));
 +            gbscale          = _mm_mul_ps(isaprod,gbtabscale);
 +
 +            /* Calculate generalized born table index - this is a separate table from the normal one,
 +             * but we use the same procedure by multiplying r with scale and truncating to integer.
 +             */
 +            rt               = _mm_mul_ps(r00,gbscale);
 +            gbitab           = _mm_cvttps_epi32(rt);
 +            gbeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
 +            gbitab           = _mm_slli_epi32(gbitab,2);
 +            Y                = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,0) );
 +            F                = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,1) );
 +            G                = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,2) );
 +            H                = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,3) );
 +            _MM_TRANSPOSE4_PS(Y,F,G,H);
 +            Heps             = _mm_mul_ps(gbeps,H);
 +            Fp               = _mm_add_ps(F,_mm_mul_ps(gbeps,_mm_add_ps(G,Heps)));
 +            VV               = _mm_add_ps(Y,_mm_mul_ps(gbeps,Fp));
 +            vgb              = _mm_mul_ps(gbqqfactor,VV);
 +
 +            FF               = _mm_add_ps(Fp,_mm_mul_ps(gbeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
 +            fgb              = _mm_mul_ps(gbqqfactor,_mm_mul_ps(FF,gbscale));
 +            dvdatmp          = _mm_mul_ps(minushalf,_mm_add_ps(vgb,_mm_mul_ps(fgb,r00)));
++            dvdatmp          = _mm_andnot_ps(dummy_mask,dvdatmp);
 +            dvdasum          = _mm_add_ps(dvdasum,dvdatmp);
 +            /* The pointers to scratch make sure that this code cannot go wrong even with compilers that take gmx_restrict seriously (e.g. icc 13). */
 +            fjptrA             = (jnrlistA>=0) ? dvda+jnrA : scratch;
 +            fjptrB             = (jnrlistB>=0) ? dvda+jnrB : scratch;
 +            fjptrC             = (jnrlistC>=0) ? dvda+jnrC : scratch;
 +            fjptrD             = (jnrlistD>=0) ? dvda+jnrD : scratch;
 +            gmx_mm_increment_4real_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,_mm_mul_ps(dvdatmp,_mm_mul_ps(isaj0,isaj0)));
 +            velec            = _mm_mul_ps(qq00,rinv00);
 +            felec            = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(velec,rinv00),fgb),rinv00);
 +
 +            /* LENNARD-JONES DISPERSION/REPULSION */
 +
 +            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
 +            fvdw             = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
 +
 +            fscal            = _mm_add_ps(felec,fvdw);
 +
 +            fscal            = _mm_andnot_ps(dummy_mask,fscal);
 +
 +            /* Calculate temporary vectorial force */
 +            tx               = _mm_mul_ps(fscal,dx00);
 +            ty               = _mm_mul_ps(fscal,dy00);
 +            tz               = _mm_mul_ps(fscal,dz00);
 +
 +            /* Update vectorial force */
 +            fix0             = _mm_add_ps(fix0,tx);
 +            fiy0             = _mm_add_ps(fiy0,ty);
 +            fiz0             = _mm_add_ps(fiz0,tz);
 +
 +            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
 +            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
 +            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
 +            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
 +            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
 +
 +            /* Inner loop uses 65 flops */
 +        }
 +
 +        /* End of innermost loop */
 +
 +        gmx_mm_update_iforce_1atom_swizzle_ps(fix0,fiy0,fiz0,
 +                                              f+i_coord_offset,fshift+i_shift_offset);
 +
 +        dvdasum = _mm_mul_ps(dvdasum, _mm_mul_ps(isai0,isai0));
 +        gmx_mm_update_1pot_ps(dvdasum,dvda+inr);
 +
 +        /* Increment number of inner iterations */
 +        inneriter                  += j_index_end - j_index_start;
 +
 +        /* Outer loop uses 7 flops */
 +    }
 +
 +    /* Increment number of outer iterations */
 +    outeriter        += nri;
 +
 +    /* Update outer/inner flops */
 +
 +    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_F,outeriter*7 + inneriter*65);
 +}
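
Each kernel finishes with a masked epilogue for the last, partially filled quartet of j atoms: a lane mask is built from the sign of the j indices and used to zero every contribution coming from padding lanes (which is why the masked loop is counted at 65 flops versus 64 for the unmasked one above). A self-contained sketch of that idiom, requiring only SSE2 (clear_dummy_lanes is a hypothetical name, not a GROMACS function):

    #include <emmintrin.h>

    /* Zero the lanes of val whose neighbor index marks a padding entry.
     * Hypothetical illustration of the dummy_mask idiom used in the kernels. */
    static __m128 clear_dummy_lanes(const int *jjnr_tail, __m128 val)
    {
        __m128i idx  = _mm_loadu_si128((const __m128i *)jjnr_tail);
        /* all-ones in lanes with a negative index, zero in real lanes */
        __m128  mask = _mm_castsi128_ps(_mm_cmplt_epi32(idx, _mm_setzero_si128()));
        /* andnot(mask,val) keeps real lanes and clears dummy ones */
        return _mm_andnot_ps(mask, val);
    }
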
index 1a98520afb02bfbcd2f0162d9f5def87b579b1c5,0000000000000000000000000000000000000000..049ba10ab972bf47caa151a1ab714a72fcfe3499
mode 100644,000000..100644
--- /dev/null
@@@ -1,742 -1,0 +1,744 @@@
 +/*
 + * Note: this file was generated by the Gromacs sse4_1_single kernel generator.
 + *
 + *                This source code is part of
 + *
 + *                 G   R   O   M   A   C   S
 + *
 + * Copyright (c) 2001-2012, The GROMACS Development Team
 + *
 + * Gromacs is a library for molecular simulation and trajectory analysis,
 + * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
 + * a full list of developers and information, check out http://www.gromacs.org
 + *
 + * This program is free software; you can redistribute it and/or modify it under
 + * the terms of the GNU Lesser General Public License as published by the Free
 + * Software Foundation; either version 2 of the License, or (at your option) any
 + * later version.
 + *
 + * To help fund GROMACS development, we humbly ask that you cite
 + * the papers people have written on it - you can find them on the website.
 + */
 +#ifdef HAVE_CONFIG_H
 +#include <config.h>
 +#endif
 +
 +#include <math.h>
 +
 +#include "../nb_kernel.h"
 +#include "types/simple.h"
 +#include "vec.h"
 +#include "nrnb.h"
 +
 +#include "gmx_math_x86_sse4_1_single.h"
 +#include "kernelutil_x86_sse4_1_single.h"
 +
 +/*
 + * Gromacs nonbonded kernel:   nb_kernel_ElecGB_VdwNone_GeomP1P1_VF_sse4_1_single
 + * Electrostatics interaction: GeneralizedBorn
 + * VdW interaction:            None
 + * Geometry:                   Particle-Particle
 + * Calculate force/pot:        PotentialAndForce
 + */
 +void
 +nb_kernel_ElecGB_VdwNone_GeomP1P1_VF_sse4_1_single
 +                    (t_nblist * gmx_restrict                nlist,
 +                     rvec * gmx_restrict                    xx,
 +                     rvec * gmx_restrict                    ff,
 +                     t_forcerec * gmx_restrict              fr,
 +                     t_mdatoms * gmx_restrict               mdatoms,
 +                     nb_kernel_data_t * gmx_restrict        kernel_data,
 +                     t_nrnb * gmx_restrict                  nrnb)
 +{
 +    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
 +     * just 0 for non-waters.
 +     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
 +     * jnr indices corresponding to data put in the four positions in the SIMD register.
 +     */
 +    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
 +    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
 +    int              jnrA,jnrB,jnrC,jnrD;
 +    int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
 +    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
 +    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
 +    real             rcutoff_scalar;
 +    real             *shiftvec,*fshift,*x,*f;
 +    real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
 +    real             scratch[4*DIM];
 +    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
 +    int              vdwioffset0;
 +    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
 +    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
 +    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
 +    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
 +    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
 +    real             *charge;
 +    __m128i          gbitab;
 +    __m128           vgb,fgb,vgbsum,dvdasum,gbscale,gbtabscale,isaprod,gbqqfactor,gbinvepsdiff,gbeps,dvdatmp;
 +    __m128           minushalf = _mm_set1_ps(-0.5);
 +    real             *invsqrta,*dvda,*gbtab;
 +    __m128i          vfitab;
 +    __m128i          ifour       = _mm_set1_epi32(4);
 +    __m128           rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
 +    real             *vftab;
 +    __m128           dummy_mask,cutoff_mask;
 +    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
 +    __m128           one     = _mm_set1_ps(1.0);
 +    __m128           two     = _mm_set1_ps(2.0);
 +    x                = xx[0];
 +    f                = ff[0];
 +
 +    nri              = nlist->nri;
 +    iinr             = nlist->iinr;
 +    jindex           = nlist->jindex;
 +    jjnr             = nlist->jjnr;
 +    shiftidx         = nlist->shift;
 +    gid              = nlist->gid;
 +    shiftvec         = fr->shift_vec[0];
 +    fshift           = fr->fshift[0];
 +    facel            = _mm_set1_ps(fr->epsfac);
 +    charge           = mdatoms->chargeA;
 +
 +    invsqrta         = fr->invsqrta;
 +    dvda             = fr->dvda;
 +    gbtabscale       = _mm_set1_ps(fr->gbtab.scale);
 +    gbtab            = fr->gbtab.data;
 +    gbinvepsdiff     = _mm_set1_ps((1.0/fr->epsilon_r) - (1.0/fr->gb_epsilon_solvent));
 +
 +    /* Avoid stupid compiler warnings */
 +    jnrA = jnrB = jnrC = jnrD = 0;
 +    j_coord_offsetA = 0;
 +    j_coord_offsetB = 0;
 +    j_coord_offsetC = 0;
 +    j_coord_offsetD = 0;
 +
 +    outeriter        = 0;
 +    inneriter        = 0;
 +
 +    for(iidx=0;iidx<4*DIM;iidx++)
 +    {
 +        scratch[iidx] = 0.0;
 +    }
 +
 +    /* Start outer loop over neighborlists */
 +    for(iidx=0; iidx<nri; iidx++)
 +    {
 +        /* Load shift vector for this list */
 +        i_shift_offset   = DIM*shiftidx[iidx];
 +
 +        /* Load limits for loop over neighbors */
 +        j_index_start    = jindex[iidx];
 +        j_index_end      = jindex[iidx+1];
 +
 +        /* Get outer coordinate index */
 +        inr              = iinr[iidx];
 +        i_coord_offset   = DIM*inr;
 +
 +        /* Load i particle coords and add shift vector */
 +        gmx_mm_load_shift_and_1rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
 +
 +        fix0             = _mm_setzero_ps();
 +        fiy0             = _mm_setzero_ps();
 +        fiz0             = _mm_setzero_ps();
 +
 +        /* Load parameters for i particles */
 +        iq0              = _mm_mul_ps(facel,_mm_load1_ps(charge+inr+0));
 +        isai0            = _mm_load1_ps(invsqrta+inr+0);
 +
 +        /* Reset potential sums */
 +        velecsum         = _mm_setzero_ps();
 +        vgbsum           = _mm_setzero_ps();
 +        dvdasum          = _mm_setzero_ps();
 +
 +        /* Start inner kernel loop */
 +        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
 +        {
 +
 +            /* Get j neighbor index, and coordinate index */
 +            jnrA             = jjnr[jidx];
 +            jnrB             = jjnr[jidx+1];
 +            jnrC             = jjnr[jidx+2];
 +            jnrD             = jjnr[jidx+3];
 +            j_coord_offsetA  = DIM*jnrA;
 +            j_coord_offsetB  = DIM*jnrB;
 +            j_coord_offsetC  = DIM*jnrC;
 +            j_coord_offsetD  = DIM*jnrD;
 +
 +            /* load j atom coordinates */
 +            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
 +                                              x+j_coord_offsetC,x+j_coord_offsetD,
 +                                              &jx0,&jy0,&jz0);
 +
 +            /* Calculate displacement vector */
 +            dx00             = _mm_sub_ps(ix0,jx0);
 +            dy00             = _mm_sub_ps(iy0,jy0);
 +            dz00             = _mm_sub_ps(iz0,jz0);
 +
 +            /* Calculate squared distance and things based on it */
 +            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
 +
 +            rinv00           = gmx_mm_invsqrt_ps(rsq00);
 +
 +            /* Load parameters for j particles */
 +            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
 +                                                              charge+jnrC+0,charge+jnrD+0);
 +            isaj0            = gmx_mm_load_4real_swizzle_ps(invsqrta+jnrA+0,invsqrta+jnrB+0,
 +                                                              invsqrta+jnrC+0,invsqrta+jnrD+0);
 +
 +            /**************************
 +             * CALCULATE INTERACTIONS *
 +             **************************/
 +
 +            r00              = _mm_mul_ps(rsq00,rinv00);
 +
 +            /* Compute parameters for interactions between i and j atoms */
 +            qq00             = _mm_mul_ps(iq0,jq0);
 +
 +            /* GENERALIZED BORN AND COULOMB ELECTROSTATICS */
 +            isaprod          = _mm_mul_ps(isai0,isaj0);
 +            gbqqfactor       = _mm_xor_ps(signbit,_mm_mul_ps(qq00,_mm_mul_ps(isaprod,gbinvepsdiff)));
 +            gbscale          = _mm_mul_ps(isaprod,gbtabscale);
 +
 +            /* Calculate generalized born table index - this is a separate table from the normal one,
 +             * but we use the same procedure by multiplying r with scale and truncating to integer.
 +             */
 +            rt               = _mm_mul_ps(r00,gbscale);
 +            gbitab           = _mm_cvttps_epi32(rt);
 +            gbeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
 +            gbitab           = _mm_slli_epi32(gbitab,2);
 +            Y                = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,0) );
 +            F                = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,1) );
 +            G                = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,2) );
 +            H                = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,3) );
 +            _MM_TRANSPOSE4_PS(Y,F,G,H);
 +            Heps             = _mm_mul_ps(gbeps,H);
 +            Fp               = _mm_add_ps(F,_mm_mul_ps(gbeps,_mm_add_ps(G,Heps)));
 +            VV               = _mm_add_ps(Y,_mm_mul_ps(gbeps,Fp));
 +            vgb              = _mm_mul_ps(gbqqfactor,VV);
 +
 +            FF               = _mm_add_ps(Fp,_mm_mul_ps(gbeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
 +            fgb              = _mm_mul_ps(gbqqfactor,_mm_mul_ps(FF,gbscale));
 +            dvdatmp          = _mm_mul_ps(minushalf,_mm_add_ps(vgb,_mm_mul_ps(fgb,r00)));
 +            dvdasum          = _mm_add_ps(dvdasum,dvdatmp);
 +            fjptrA           = dvda+jnrA;
 +            fjptrB           = dvda+jnrB;
 +            fjptrC           = dvda+jnrC;
 +            fjptrD           = dvda+jnrD;
 +            gmx_mm_increment_4real_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,_mm_mul_ps(dvdatmp,_mm_mul_ps(isaj0,isaj0)));
 +            velec            = _mm_mul_ps(qq00,rinv00);
 +            felec            = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(velec,rinv00),fgb),rinv00);
 +
 +            /* Update potential sum for this i atom from the interaction with this j atom. */
 +            velecsum         = _mm_add_ps(velecsum,velec);
 +            vgbsum           = _mm_add_ps(vgbsum,vgb);
 +
 +            fscal            = felec;
 +
 +            /* Calculate temporary vectorial force */
 +            tx               = _mm_mul_ps(fscal,dx00);
 +            ty               = _mm_mul_ps(fscal,dy00);
 +            tz               = _mm_mul_ps(fscal,dz00);
 +
 +            /* Update vectorial force */
 +            fix0             = _mm_add_ps(fix0,tx);
 +            fiy0             = _mm_add_ps(fiy0,ty);
 +            fiz0             = _mm_add_ps(fiz0,tz);
 +
 +            fjptrA             = f+j_coord_offsetA;
 +            fjptrB             = f+j_coord_offsetB;
 +            fjptrC             = f+j_coord_offsetC;
 +            fjptrD             = f+j_coord_offsetD;
 +            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
 +
 +            /* Inner loop uses 58 flops */
 +        }
 +
 +        if(jidx<j_index_end)
 +        {
 +
 +            /* Get j neighbor index, and coordinate index */
 +            jnrlistA         = jjnr[jidx];
 +            jnrlistB         = jjnr[jidx+1];
 +            jnrlistC         = jjnr[jidx+2];
 +            jnrlistD         = jjnr[jidx+3];
 +            /* The j index of each non-real (padding) atom is negative.
 +             * The mask is therefore 0xFFFFFFFF for dummy entries and 0x0 for real ones,
 +             * so it can be used as val = _mm_andnot_ps(mask,val) to clear dummy entries.
 +             */
 +            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 +            jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
 +            jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
 +            jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
 +            jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
 +            j_coord_offsetA  = DIM*jnrA;
 +            j_coord_offsetB  = DIM*jnrB;
 +            j_coord_offsetC  = DIM*jnrC;
 +            j_coord_offsetD  = DIM*jnrD;
 +
 +            /* load j atom coordinates */
 +            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
 +                                              x+j_coord_offsetC,x+j_coord_offsetD,
 +                                              &jx0,&jy0,&jz0);
 +
 +            /* Calculate displacement vector */
 +            dx00             = _mm_sub_ps(ix0,jx0);
 +            dy00             = _mm_sub_ps(iy0,jy0);
 +            dz00             = _mm_sub_ps(iz0,jz0);
 +
 +            /* Calculate squared distance and things based on it */
 +            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
 +
 +            rinv00           = gmx_mm_invsqrt_ps(rsq00);
 +
 +            /* Load parameters for j particles */
 +            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
 +                                                              charge+jnrC+0,charge+jnrD+0);
 +            isaj0            = gmx_mm_load_4real_swizzle_ps(invsqrta+jnrA+0,invsqrta+jnrB+0,
 +                                                              invsqrta+jnrC+0,invsqrta+jnrD+0);
 +
 +            /**************************
 +             * CALCULATE INTERACTIONS *
 +             **************************/
 +
 +            r00              = _mm_mul_ps(rsq00,rinv00);
 +            r00              = _mm_andnot_ps(dummy_mask,r00);
 +
 +            /* Compute parameters for interactions between i and j atoms */
 +            qq00             = _mm_mul_ps(iq0,jq0);
 +
 +            /* GENERALIZED BORN AND COULOMB ELECTROSTATICS */
 +            isaprod          = _mm_mul_ps(isai0,isaj0);
 +            gbqqfactor       = _mm_xor_ps(signbit,_mm_mul_ps(qq00,_mm_mul_ps(isaprod,gbinvepsdiff)));
 +            gbscale          = _mm_mul_ps(isaprod,gbtabscale);
 +
 +            /* Calculate generalized born table index - this is a separate table from the normal one,
 +             * but we use the same procedure by multiplying r with scale and truncating to integer.
 +             */
 +            rt               = _mm_mul_ps(r00,gbscale);
 +            gbitab           = _mm_cvttps_epi32(rt);
 +            gbeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
 +            gbitab           = _mm_slli_epi32(gbitab,2);
 +            Y                = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,0) );
 +            F                = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,1) );
 +            G                = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,2) );
 +            H                = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,3) );
 +            _MM_TRANSPOSE4_PS(Y,F,G,H);
 +            Heps             = _mm_mul_ps(gbeps,H);
 +            Fp               = _mm_add_ps(F,_mm_mul_ps(gbeps,_mm_add_ps(G,Heps)));
 +            VV               = _mm_add_ps(Y,_mm_mul_ps(gbeps,Fp));
 +            vgb              = _mm_mul_ps(gbqqfactor,VV);
 +
 +            FF               = _mm_add_ps(Fp,_mm_mul_ps(gbeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
 +            fgb              = _mm_mul_ps(gbqqfactor,_mm_mul_ps(FF,gbscale));
 +            dvdatmp          = _mm_mul_ps(minushalf,_mm_add_ps(vgb,_mm_mul_ps(fgb,r00)));
++            dvdatmp          = _mm_andnot_ps(dummy_mask,dvdatmp);
 +            dvdasum          = _mm_add_ps(dvdasum,dvdatmp);
 +            /* The pointers to scratch make sure that this code cannot go wrong even with compilers that take gmx_restrict seriously (e.g. icc 13). */
 +            fjptrA             = (jnrlistA>=0) ? dvda+jnrA : scratch;
 +            fjptrB             = (jnrlistB>=0) ? dvda+jnrB : scratch;
 +            fjptrC             = (jnrlistC>=0) ? dvda+jnrC : scratch;
 +            fjptrD             = (jnrlistD>=0) ? dvda+jnrD : scratch;
 +            gmx_mm_increment_4real_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,_mm_mul_ps(dvdatmp,_mm_mul_ps(isaj0,isaj0)));
 +            velec            = _mm_mul_ps(qq00,rinv00);
 +            felec            = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(velec,rinv00),fgb),rinv00);
 +
 +            /* Update potential sum for this i atom from the interaction with this j atom. */
 +            velec            = _mm_andnot_ps(dummy_mask,velec);
 +            velecsum         = _mm_add_ps(velecsum,velec);
 +            vgb              = _mm_andnot_ps(dummy_mask,vgb);
 +            vgbsum           = _mm_add_ps(vgbsum,vgb);
 +
 +            fscal            = felec;
 +
 +            fscal            = _mm_andnot_ps(dummy_mask,fscal);
 +
 +            /* Calculate temporary vectorial force */
 +            tx               = _mm_mul_ps(fscal,dx00);
 +            ty               = _mm_mul_ps(fscal,dy00);
 +            tz               = _mm_mul_ps(fscal,dz00);
 +
 +            /* Update vectorial force */
 +            fix0             = _mm_add_ps(fix0,tx);
 +            fiy0             = _mm_add_ps(fiy0,ty);
 +            fiz0             = _mm_add_ps(fiz0,tz);
 +
 +            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
 +            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
 +            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
 +            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
 +            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
 +
 +            /* Inner loop uses 59 flops */
 +        }
 +
 +        /* End of innermost loop */
 +
 +        gmx_mm_update_iforce_1atom_swizzle_ps(fix0,fiy0,fiz0,
 +                                              f+i_coord_offset,fshift+i_shift_offset);
 +
 +        ggid                        = gid[iidx];
 +        /* Update potential energies */
 +        gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
 +        gmx_mm_update_1pot_ps(vgbsum,kernel_data->energygrp_polarization+ggid);
 +        dvdasum = _mm_mul_ps(dvdasum, _mm_mul_ps(isai0,isai0));
 +        gmx_mm_update_1pot_ps(dvdasum,dvda+inr);
 +
 +        /* Increment number of inner iterations */
 +        inneriter                  += j_index_end - j_index_start;
 +
 +        /* Outer loop uses 9 flops */
 +    }
 +
 +    /* Increment number of outer iterations */
 +    outeriter        += nri;
 +
 +    /* Update outer/inner flops */
 +
 +    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VF,outeriter*9 + inneriter*59);
 +}
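
Besides energies and forces, every GB kernel also accumulates the derivative of the polarization energy with respect to the Born radii: each pair contributes dvdatmp = -0.5*(vgb + fgb*r), scaled by isaj*isaj on the j side immediately and by isai*isai once per outer iteration. A scalar sketch of that bookkeeping (accumulate_dvda is a hypothetical helper):

    /* Hypothetical scalar sketch of the dvda accumulation in the kernels above. */
    static void accumulate_dvda(float vgb, float fgb, float r, float isaj,
                                float *dvdasum, float *dvda_j)
    {
        float dvdatmp = -0.5f*(vgb + fgb*r);
        *dvdasum += dvdatmp;             /* scaled by isai*isai after the j loop  */
        *dvda_j  += dvdatmp*isaj*isaj;   /* j-side contribution scaled right away */
    }
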
 +/*
 + * Gromacs nonbonded kernel:   nb_kernel_ElecGB_VdwNone_GeomP1P1_F_sse4_1_single
 + * Electrostatics interaction: GeneralizedBorn
 + * VdW interaction:            None
 + * Geometry:                   Particle-Particle
 + * Calculate force/pot:        Force
 + */
 +void
 +nb_kernel_ElecGB_VdwNone_GeomP1P1_F_sse4_1_single
 +                    (t_nblist * gmx_restrict                nlist,
 +                     rvec * gmx_restrict                    xx,
 +                     rvec * gmx_restrict                    ff,
 +                     t_forcerec * gmx_restrict              fr,
 +                     t_mdatoms * gmx_restrict               mdatoms,
 +                     nb_kernel_data_t * gmx_restrict        kernel_data,
 +                     t_nrnb * gmx_restrict                  nrnb)
 +{
 +    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
 +     * just 0 for non-waters.
 +     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
 +     * jnr indices corresponding to data put in the four positions in the SIMD register.
 +     */
 +    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
 +    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
 +    int              jnrA,jnrB,jnrC,jnrD;
 +    int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
 +    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
 +    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
 +    real             rcutoff_scalar;
 +    real             *shiftvec,*fshift,*x,*f;
 +    real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
 +    real             scratch[4*DIM];
 +    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
 +    int              vdwioffset0;
 +    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
 +    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
 +    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
 +    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
 +    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
 +    real             *charge;
 +    __m128i          gbitab;
 +    __m128           vgb,fgb,vgbsum,dvdasum,gbscale,gbtabscale,isaprod,gbqqfactor,gbinvepsdiff,gbeps,dvdatmp;
 +    __m128           minushalf = _mm_set1_ps(-0.5);
 +    real             *invsqrta,*dvda,*gbtab;
 +    __m128i          vfitab;
 +    __m128i          ifour       = _mm_set1_epi32(4);
 +    __m128           rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
 +    real             *vftab;
 +    __m128           dummy_mask,cutoff_mask;
 +    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
 +    __m128           one     = _mm_set1_ps(1.0);
 +    __m128           two     = _mm_set1_ps(2.0);
 +    x                = xx[0];
 +    f                = ff[0];
 +
 +    nri              = nlist->nri;
 +    iinr             = nlist->iinr;
 +    jindex           = nlist->jindex;
 +    jjnr             = nlist->jjnr;
 +    shiftidx         = nlist->shift;
 +    gid              = nlist->gid;
 +    shiftvec         = fr->shift_vec[0];
 +    fshift           = fr->fshift[0];
 +    facel            = _mm_set1_ps(fr->epsfac);
 +    charge           = mdatoms->chargeA;
 +
 +    invsqrta         = fr->invsqrta;
 +    dvda             = fr->dvda;
 +    gbtabscale       = _mm_set1_ps(fr->gbtab.scale);
 +    gbtab            = fr->gbtab.data;
 +    gbinvepsdiff     = _mm_set1_ps((1.0/fr->epsilon_r) - (1.0/fr->gb_epsilon_solvent));
 +
 +    /* Avoid stupid compiler warnings */
 +    jnrA = jnrB = jnrC = jnrD = 0;
 +    j_coord_offsetA = 0;
 +    j_coord_offsetB = 0;
 +    j_coord_offsetC = 0;
 +    j_coord_offsetD = 0;
 +
 +    outeriter        = 0;
 +    inneriter        = 0;
 +
 +    for(iidx=0;iidx<4*DIM;iidx++)
 +    {
 +        scratch[iidx] = 0.0;
 +    }
 +
 +    /* Start outer loop over neighborlists */
 +    for(iidx=0; iidx<nri; iidx++)
 +    {
 +        /* Load shift vector for this list */
 +        i_shift_offset   = DIM*shiftidx[iidx];
 +
 +        /* Load limits for loop over neighbors */
 +        j_index_start    = jindex[iidx];
 +        j_index_end      = jindex[iidx+1];
 +
 +        /* Get outer coordinate index */
 +        inr              = iinr[iidx];
 +        i_coord_offset   = DIM*inr;
 +
 +        /* Load i particle coords and add shift vector */
 +        gmx_mm_load_shift_and_1rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
 +
 +        fix0             = _mm_setzero_ps();
 +        fiy0             = _mm_setzero_ps();
 +        fiz0             = _mm_setzero_ps();
 +
 +        /* Load parameters for i particles */
 +        iq0              = _mm_mul_ps(facel,_mm_load1_ps(charge+inr+0));
 +        isai0            = _mm_load1_ps(invsqrta+inr+0);
 +
 +        dvdasum          = _mm_setzero_ps();
 +
 +        /* Start inner kernel loop */
 +        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
 +        {
 +
 +            /* Get j neighbor index, and coordinate index */
 +            jnrA             = jjnr[jidx];
 +            jnrB             = jjnr[jidx+1];
 +            jnrC             = jjnr[jidx+2];
 +            jnrD             = jjnr[jidx+3];
 +            j_coord_offsetA  = DIM*jnrA;
 +            j_coord_offsetB  = DIM*jnrB;
 +            j_coord_offsetC  = DIM*jnrC;
 +            j_coord_offsetD  = DIM*jnrD;
 +
 +            /* load j atom coordinates */
 +            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
 +                                              x+j_coord_offsetC,x+j_coord_offsetD,
 +                                              &jx0,&jy0,&jz0);
 +
 +            /* Calculate displacement vector */
 +            dx00             = _mm_sub_ps(ix0,jx0);
 +            dy00             = _mm_sub_ps(iy0,jy0);
 +            dz00             = _mm_sub_ps(iz0,jz0);
 +
 +            /* Calculate squared distance and things based on it */
 +            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
 +
 +            rinv00           = gmx_mm_invsqrt_ps(rsq00);
 +
 +            /* Load parameters for j particles */
 +            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
 +                                                              charge+jnrC+0,charge+jnrD+0);
 +            isaj0            = gmx_mm_load_4real_swizzle_ps(invsqrta+jnrA+0,invsqrta+jnrB+0,
 +                                                              invsqrta+jnrC+0,invsqrta+jnrD+0);
 +
 +            /**************************
 +             * CALCULATE INTERACTIONS *
 +             **************************/
 +
 +            r00              = _mm_mul_ps(rsq00,rinv00);
 +
 +            /* Compute parameters for interactions between i and j atoms */
 +            qq00             = _mm_mul_ps(iq0,jq0);
 +
 +            /* GENERALIZED BORN AND COULOMB ELECTROSTATICS */
 +            isaprod          = _mm_mul_ps(isai0,isaj0);
 +            gbqqfactor       = _mm_xor_ps(signbit,_mm_mul_ps(qq00,_mm_mul_ps(isaprod,gbinvepsdiff)));
 +            gbscale          = _mm_mul_ps(isaprod,gbtabscale);
 +
 +            /* Calculate generalized born table index - this is a separate table from the normal one,
 +             * but we use the same procedure by multiplying r with scale and truncating to integer.
 +             */
 +            rt               = _mm_mul_ps(r00,gbscale);
 +            gbitab           = _mm_cvttps_epi32(rt);
 +            gbeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
 +            gbitab           = _mm_slli_epi32(gbitab,2);
 +            Y                = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,0) );
 +            F                = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,1) );
 +            G                = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,2) );
 +            H                = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,3) );
 +            _MM_TRANSPOSE4_PS(Y,F,G,H);
 +            Heps             = _mm_mul_ps(gbeps,H);
 +            Fp               = _mm_add_ps(F,_mm_mul_ps(gbeps,_mm_add_ps(G,Heps)));
 +            VV               = _mm_add_ps(Y,_mm_mul_ps(gbeps,Fp));
 +            vgb              = _mm_mul_ps(gbqqfactor,VV);
 +
 +            FF               = _mm_add_ps(Fp,_mm_mul_ps(gbeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
 +            fgb              = _mm_mul_ps(gbqqfactor,_mm_mul_ps(FF,gbscale));
 +            dvdatmp          = _mm_mul_ps(minushalf,_mm_add_ps(vgb,_mm_mul_ps(fgb,r00)));
 +            dvdasum          = _mm_add_ps(dvdasum,dvdatmp);
 +            fjptrA           = dvda+jnrA;
 +            fjptrB           = dvda+jnrB;
 +            fjptrC           = dvda+jnrC;
 +            fjptrD           = dvda+jnrD;
 +            gmx_mm_increment_4real_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,_mm_mul_ps(dvdatmp,_mm_mul_ps(isaj0,isaj0)));
 +            velec            = _mm_mul_ps(qq00,rinv00);
 +            felec            = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(velec,rinv00),fgb),rinv00);
 +
 +            fscal            = felec;
 +
 +            /* Calculate temporary vectorial force */
 +            tx               = _mm_mul_ps(fscal,dx00);
 +            ty               = _mm_mul_ps(fscal,dy00);
 +            tz               = _mm_mul_ps(fscal,dz00);
 +
 +            /* Update vectorial force */
 +            fix0             = _mm_add_ps(fix0,tx);
 +            fiy0             = _mm_add_ps(fiy0,ty);
 +            fiz0             = _mm_add_ps(fiz0,tz);
 +
 +            fjptrA             = f+j_coord_offsetA;
 +            fjptrB             = f+j_coord_offsetB;
 +            fjptrC             = f+j_coord_offsetC;
 +            fjptrD             = f+j_coord_offsetD;
 +            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
 +
 +            /* Inner loop uses 56 flops */
 +        }
 +
 +        if(jidx<j_index_end)
 +        {
 +
 +            /* Get j neighbor index, and coordinate index */
 +            jnrlistA         = jjnr[jidx];
 +            jnrlistB         = jjnr[jidx+1];
 +            jnrlistC         = jjnr[jidx+2];
 +            jnrlistD         = jjnr[jidx+3];
 +            /* The j index of each non-real (padding) atom is negative.
 +             * The mask is therefore 0xFFFFFFFF for dummy entries and 0x0 for real ones,
 +             * so it can be used as val = _mm_andnot_ps(mask,val) to clear dummy entries.
 +             */
 +            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 +            jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
 +            jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
 +            jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
 +            jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
 +            j_coord_offsetA  = DIM*jnrA;
 +            j_coord_offsetB  = DIM*jnrB;
 +            j_coord_offsetC  = DIM*jnrC;
 +            j_coord_offsetD  = DIM*jnrD;
 +
 +            /* load j atom coordinates */
 +            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
 +                                              x+j_coord_offsetC,x+j_coord_offsetD,
 +                                              &jx0,&jy0,&jz0);
 +
 +            /* Calculate displacement vector */
 +            dx00             = _mm_sub_ps(ix0,jx0);
 +            dy00             = _mm_sub_ps(iy0,jy0);
 +            dz00             = _mm_sub_ps(iz0,jz0);
 +
 +            /* Calculate squared distance and things based on it */
 +            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
 +
 +            rinv00           = gmx_mm_invsqrt_ps(rsq00);
 +
 +            /* Load parameters for j particles */
 +            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
 +                                                              charge+jnrC+0,charge+jnrD+0);
 +            isaj0            = gmx_mm_load_4real_swizzle_ps(invsqrta+jnrA+0,invsqrta+jnrB+0,
 +                                                              invsqrta+jnrC+0,invsqrta+jnrD+0);
 +
 +            /**************************
 +             * CALCULATE INTERACTIONS *
 +             **************************/
 +
 +            r00              = _mm_mul_ps(rsq00,rinv00);
 +            r00              = _mm_andnot_ps(dummy_mask,r00);
 +
 +            /* Compute parameters for interactions between i and j atoms */
 +            qq00             = _mm_mul_ps(iq0,jq0);
 +
 +            /* GENERALIZED BORN AND COULOMB ELECTROSTATICS */
 +            isaprod          = _mm_mul_ps(isai0,isaj0);
 +            gbqqfactor       = _mm_xor_ps(signbit,_mm_mul_ps(qq00,_mm_mul_ps(isaprod,gbinvepsdiff)));
 +            gbscale          = _mm_mul_ps(isaprod,gbtabscale);
 +
 +            /* Calculate generalized born table index - this is a separate table from the normal one,
 +             * but we use the same procedure by multiplying r with scale and truncating to integer.
 +             */
 +            rt               = _mm_mul_ps(r00,gbscale);
 +            gbitab           = _mm_cvttps_epi32(rt);
 +            gbeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
 +            gbitab           = _mm_slli_epi32(gbitab,2);
 +            Y                = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,0) );
 +            F                = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,1) );
 +            G                = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,2) );
 +            H                = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,3) );
 +            _MM_TRANSPOSE4_PS(Y,F,G,H);
 +            Heps             = _mm_mul_ps(gbeps,H);
 +            Fp               = _mm_add_ps(F,_mm_mul_ps(gbeps,_mm_add_ps(G,Heps)));
 +            VV               = _mm_add_ps(Y,_mm_mul_ps(gbeps,Fp));
 +            vgb              = _mm_mul_ps(gbqqfactor,VV);
 +
 +            FF               = _mm_add_ps(Fp,_mm_mul_ps(gbeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
 +            fgb              = _mm_mul_ps(gbqqfactor,_mm_mul_ps(FF,gbscale));
 +            dvdatmp          = _mm_mul_ps(minushalf,_mm_add_ps(vgb,_mm_mul_ps(fgb,r00)));
++            dvdatmp          = _mm_andnot_ps(dummy_mask,dvdatmp);
 +            dvdasum          = _mm_add_ps(dvdasum,dvdatmp);
 +            /* The pointers to scratch make sure that this code cannot go wrong even with compilers that take gmx_restrict seriously (e.g. icc 13). */
 +            fjptrA             = (jnrlistA>=0) ? dvda+jnrA : scratch;
 +            fjptrB             = (jnrlistB>=0) ? dvda+jnrB : scratch;
 +            fjptrC             = (jnrlistC>=0) ? dvda+jnrC : scratch;
 +            fjptrD             = (jnrlistD>=0) ? dvda+jnrD : scratch;
 +            gmx_mm_increment_4real_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,_mm_mul_ps(dvdatmp,_mm_mul_ps(isaj0,isaj0)));
 +            velec            = _mm_mul_ps(qq00,rinv00);
 +            felec            = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(velec,rinv00),fgb),rinv00);
 +
 +            fscal            = felec;
 +
 +            fscal            = _mm_andnot_ps(dummy_mask,fscal);
 +
 +            /* Calculate temporary vectorial force */
 +            tx               = _mm_mul_ps(fscal,dx00);
 +            ty               = _mm_mul_ps(fscal,dy00);
 +            tz               = _mm_mul_ps(fscal,dz00);
 +
 +            /* Update vectorial force */
 +            fix0             = _mm_add_ps(fix0,tx);
 +            fiy0             = _mm_add_ps(fiy0,ty);
 +            fiz0             = _mm_add_ps(fiz0,tz);
 +
 +            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
 +            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
 +            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
 +            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
 +            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
 +
 +            /* Inner loop uses 57 flops */
 +        }
 +
 +        /* End of innermost loop */
 +
 +        gmx_mm_update_iforce_1atom_swizzle_ps(fix0,fiy0,fiz0,
 +                                              f+i_coord_offset,fshift+i_shift_offset);
 +
 +        dvdasum = _mm_mul_ps(dvdasum, _mm_mul_ps(isai0,isai0));
 +        gmx_mm_update_1pot_ps(dvdasum,dvda+inr);
 +
 +        /* Increment number of inner iterations */
 +        inneriter                  += j_index_end - j_index_start;
 +
 +        /* Outer loop uses 7 flops */
 +    }
 +
 +    /* Increment number of outer iterations */
 +    outeriter        += nri;
 +
 +    /* Update outer/inner flops */
 +
 +    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_F,outeriter*7 + inneriter*57);
 +}
index 9214f45ad8157e365ea8f91bdd1e2c72b6bf9ebe,0000000000000000000000000000000000000000..ee40d8317232e87c72b7c29b8794f4ad7fe2a005
mode 100644,000000..100644
--- /dev/null
@@@ -1,1033 -1,0 +1,1036 @@@
 +/* #if 0 */
 +#error This file must be processed with the Gromacs pre-preprocessor
 +/* #endif */
 +/* #if INCLUDE_HEADER */
 +#ifdef HAVE_CONFIG_H
 +#include <config.h>
 +#endif
 +
 +#include <math.h>
 +
 +#include "../nb_kernel.h"
 +#include "types/simple.h"
 +#include "vec.h"
 +#include "nrnb.h"
 +
 +#include "gmx_math_x86_sse4_1_single.h"
 +#include "kernelutil_x86_sse4_1_single.h"
 +/* #endif */
 +
 +/* ## List of variables set by the generating script:                                    */
 +/* ##                                                                                    */
 +/* ## Settings that apply to the entire kernel:                                          */
 +/* ## KERNEL_ELEC:           String, choice for electrostatic interactions               */
 +/* ## KERNEL_VDW:            String, choice for van der Waals interactions               */
 +/* ## KERNEL_NAME:           String, name of this kernel                                 */
 +/* ## KERNEL_VF:             String telling if we calculate potential, force, or both    */
 +/* ## GEOMETRY_I/GEOMETRY_J: String, name of each geometry, e.g. 'Water3' or '1Particle' */
 +/* ##                                                                                    */
 +/* ## Settings that apply to particles in the outer (I) or inner (J) loops:              */
 +/* ## PARTICLES_I[]/         Arrays with lists of i/j particles to use in kernel. It is  */
 +/* ## PARTICLES_J[]:         just [0] for particle geometry, but can be longer for water */
 +/* ## PARTICLES_ELEC_I[]/    Arrays with lists of i/j particles that have electrostatics */
 +/* ## PARTICLES_ELEC_J[]:    interactions that should be calculated in this kernel.      */
 +/* ## PARTICLES_VDW_I[]/     Arrays with the list of i/j particles that have VdW         */
 +/* ## PARTICLES_VDW_J[]:     interactions that should be calculated in this kernel.      */
 +/* ##                                                                                    */
 +/* ## Settings for pairs of interactions (e.g. 2nd i particle against 1st j particle)    */
 +/* ## PAIRS_IJ[]:            Array with (i,j) tuples of pairs for which interactions     */
 +/* ##                        should be calculated in this kernel. A zero-charge particle */
 +/* ##                        paired with a particle without vdw has no interaction, and  */
 +/* ##                        vdw-only interactions are not evaluated in a no-vdw kernel. */
 +/* ## INTERACTION_FLAGS[][]: 2D matrix, dimension e.g. 3*3 for water-water interactions. */
 +/* ##                        For each i-j pair, the element [I][J] is a list of strings  */
 +/* ##                        defining properties/flags of this interaction. Examples     */
 +/* ##                        include 'electrostatics'/'vdw' if that type of interaction  */
 +/* ##                        should be evaluated, 'rsq'/'rinv'/'rinvsq' if those values  */
 +/* ##                        are needed, and 'exactcutoff' or 'shift','switch' to        */
 +/* ##                        decide if the force/potential should be modified. This way  */
 +/* ##                        we only calculate values absolutely needed for each case.   */
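 +/* ##                                                                                    */
 +/* ## For illustration only (hypothetical values, not produced here): a Water3-Particle  */
 +/* ## force kernel might have PARTICLES_I=[0,1,2], PARTICLES_J=[0] and                   */
 +/* ## PAIRS_IJ=[(0,0),(1,0),(2,0)], with INTERACTION_FLAGS[I][0] listing e.g.            */
 +/* ## 'electrostatics' and 'rinv' for the charged sites.                                 */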
 +
 +/* ## Calculate the size and offset for (merged/interleaved) table data */
 +
 +/*
 + * Gromacs nonbonded kernel:   {KERNEL_NAME}
 + * Electrostatics interaction: {KERNEL_ELEC}
 + * VdW interaction:            {KERNEL_VDW}
 + * Geometry:                   {GEOMETRY_I}-{GEOMETRY_J}
 + * Calculate force/pot:        {KERNEL_VF}
 + */
 +void
 +{KERNEL_NAME}
 +                    (t_nblist * gmx_restrict                nlist,
 +                     rvec * gmx_restrict                    xx,
 +                     rvec * gmx_restrict                    ff,
 +                     t_forcerec * gmx_restrict              fr,
 +                     t_mdatoms * gmx_restrict               mdatoms,
 +                     nb_kernel_data_t * gmx_restrict        kernel_data,
 +                     t_nrnb * gmx_restrict                  nrnb)
 +{
 +    /* ## Not all variables are used for all kernels, but any optimizing compiler fixes that, */
 +    /* ## so there is no point in going to extremes to exclude variables that are not needed. */
 +    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
 +     * just 0 for non-waters.
 +     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
 +     * jnr indices corresponding to data put in the four positions in the SIMD register.
 +     */
 +    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
 +    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
 +    int              jnrA,jnrB,jnrC,jnrD;
 +    int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
 +    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
 +    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
 +    real             rcutoff_scalar;
 +    real             *shiftvec,*fshift,*x,*f;
 +    real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
 +    real             scratch[4*DIM];
 +    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
 +    /* #for I in PARTICLES_I */
 +    int              vdwioffset{I};
 +    __m128           ix{I},iy{I},iz{I},fix{I},fiy{I},fiz{I},iq{I},isai{I};
 +    /* #endfor */
 +    /* #for J in PARTICLES_J */
 +    int              vdwjidx{J}A,vdwjidx{J}B,vdwjidx{J}C,vdwjidx{J}D;
 +    __m128           jx{J},jy{J},jz{J},fjx{J},fjy{J},fjz{J},jq{J},isaj{J};
 +    /* #endfor */
 +    /* #for I,J in PAIRS_IJ */
 +    __m128           dx{I}{J},dy{I}{J},dz{I}{J},rsq{I}{J},rinv{I}{J},rinvsq{I}{J},r{I}{J},qq{I}{J},c6_{I}{J},c12_{I}{J};
 +    /* #endfor */
 +    /* #if KERNEL_ELEC != 'None' */
 +    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
 +    real             *charge;
 +    /* #endif */
 +    /* #if 'GeneralizedBorn' in KERNEL_ELEC */
 +    __m128i          gbitab;
 +    __m128           vgb,fgb,vgbsum,dvdasum,gbscale,gbtabscale,isaprod,gbqqfactor,gbinvepsdiff,gbeps,dvdatmp;
 +    __m128           minushalf = _mm_set1_ps(-0.5);
 +    real             *invsqrta,*dvda,*gbtab;
 +    /* #endif */
 +    /* #if KERNEL_VDW != 'None' */
 +    int              nvdwtype;
 +    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
 +    int              *vdwtype;
 +    real             *vdwparam;
 +    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
 +    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
 +    /* #endif */
 +    /* #if 'Table' in KERNEL_ELEC or 'GeneralizedBorn' in KERNEL_ELEC or 'Table' in KERNEL_VDW */
 +    __m128i          vfitab;
 +    __m128i          ifour       = _mm_set1_epi32(4);
 +    __m128           rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
 +    real             *vftab;
 +    /* #endif */
 +    /* #if 'Ewald' in KERNEL_ELEC */
 +    __m128i          ewitab;
 +    __m128           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
 +    real             *ewtab;
 +    /* #endif */
 +    /* #if 'PotentialSwitch' in [KERNEL_MOD_ELEC,KERNEL_MOD_VDW] */
 +    __m128           rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
 +    real             rswitch_scalar,d_scalar;
 +    /* #endif */
 +    __m128           dummy_mask,cutoff_mask;
 +    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
 +    __m128           one     = _mm_set1_ps(1.0);
 +    __m128           two     = _mm_set1_ps(2.0);
 +    x                = xx[0];
 +    f                = ff[0];
 +
 +    nri              = nlist->nri;
 +    iinr             = nlist->iinr;
 +    jindex           = nlist->jindex;
 +    jjnr             = nlist->jjnr;
 +    shiftidx         = nlist->shift;
 +    gid              = nlist->gid;
 +    shiftvec         = fr->shift_vec[0];
 +    fshift           = fr->fshift[0];
 +    /* #if KERNEL_ELEC != 'None' */
 +    facel            = _mm_set1_ps(fr->epsfac);
 +    charge           = mdatoms->chargeA;
 +    /*     #if 'ReactionField' in KERNEL_ELEC */
 +    krf              = _mm_set1_ps(fr->ic->k_rf);
 +    krf2             = _mm_set1_ps(fr->ic->k_rf*2.0);
 +    crf              = _mm_set1_ps(fr->ic->c_rf);
 +    /*     #endif */
 +    /* #endif */
 +    /* #if KERNEL_VDW != 'None' */
 +    nvdwtype         = fr->ntype;
 +    vdwparam         = fr->nbfp;
 +    vdwtype          = mdatoms->typeA;
 +    /* #endif */
 +
 +    /* #if 'Table' in KERNEL_ELEC and 'Table' in KERNEL_VDW */
 +    vftab            = kernel_data->table_elec_vdw->data;
 +    vftabscale       = _mm_set1_ps(kernel_data->table_elec_vdw->scale);
 +    /* #elif 'Table' in KERNEL_ELEC */
 +    vftab            = kernel_data->table_elec->data;
 +    vftabscale       = _mm_set1_ps(kernel_data->table_elec->scale);
 +    /* #elif 'Table' in KERNEL_VDW */
 +    vftab            = kernel_data->table_vdw->data;
 +    vftabscale       = _mm_set1_ps(kernel_data->table_vdw->scale);
 +    /* #endif */
 +
 +    /* #if 'Ewald' in KERNEL_ELEC */
 +    sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
 +    /*     #if KERNEL_VF=='Force' and KERNEL_MOD_ELEC!='PotentialSwitch' */
 +    ewtab            = fr->ic->tabq_coul_F;
 +    ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
 +    ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
 +    /*     #else */
 +    ewtab            = fr->ic->tabq_coul_FDV0;
 +    ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
 +    ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
 +    /*     #endif */
 +    /* #endif */
 +
 +    /* #if KERNEL_ELEC=='GeneralizedBorn' */
 +    invsqrta         = fr->invsqrta;
 +    dvda             = fr->dvda;
 +    gbtabscale       = _mm_set1_ps(fr->gbtab.scale);
 +    gbtab            = fr->gbtab.data;
 +    gbinvepsdiff     = _mm_set1_ps((1.0/fr->epsilon_r) - (1.0/fr->gb_epsilon_solvent));
 +    /* #endif */
 +
 +    /* #if 'Water' in GEOMETRY_I */
 +    /* Setup water-specific parameters */
 +    inr              = nlist->iinr[0];
 +    /*     #for I in PARTICLES_ELEC_I */
 +    iq{I}              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+{I}]));
 +    /*     #endfor */
 +    /*     #for I in PARTICLES_VDW_I */
 +    vdwioffset{I}      = 2*nvdwtype*vdwtype[inr+{I}];
 +    /*     #endfor */
 +    /* #endif */
 +
 +    /* #if 'Water' in GEOMETRY_J */
 +    /*     #for J in PARTICLES_ELEC_J */
 +    jq{J}              = _mm_set1_ps(charge[inr+{J}]);
 +    /*     #endfor */
 +    /*     #for J in PARTICLES_VDW_J */
 +    vdwjidx{J}A        = 2*vdwtype[inr+{J}];
 +    /*     #endfor */
 +    /*     #for I,J in PAIRS_IJ */
 +    /*         #if 'electrostatics' in INTERACTION_FLAGS[I][J] */
 +    qq{I}{J}             = _mm_mul_ps(iq{I},jq{J});
 +    /*         #endif */
 +    /*         #if 'vdw' in INTERACTION_FLAGS[I][J] */
 +    c6_{I}{J}            = _mm_set1_ps(vdwparam[vdwioffset{I}+vdwjidx{J}A]);
 +    c12_{I}{J}           = _mm_set1_ps(vdwparam[vdwioffset{I}+vdwjidx{J}A+1]);
 +    /*         #endif */
 +    /*     #endfor */
 +    /* #endif */
 +
 +    /* #if KERNEL_MOD_ELEC!='None' or KERNEL_MOD_VDW!='None' */
 +    /*     #if KERNEL_ELEC!='None' */
 +    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
 +    rcutoff_scalar   = fr->rcoulomb;
 +    /*     #else */
 +    rcutoff_scalar   = fr->rvdw;
 +    /*     #endif */
 +    rcutoff          = _mm_set1_ps(rcutoff_scalar);
 +    rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
 +    /* #endif */
 +
 +    /* #if KERNEL_MOD_VDW=='PotentialShift' */
 +    sh_vdw_invrcut6  = _mm_set1_ps(fr->ic->sh_invrc6);
 +    rvdw             = _mm_set1_ps(fr->rvdw);
 +    /* #endif */
 +
 +    /* #if 'PotentialSwitch' in [KERNEL_MOD_ELEC,KERNEL_MOD_VDW] */
 +    /*     #if KERNEL_MOD_ELEC=='PotentialSwitch'  */
 +    rswitch_scalar   = fr->rcoulomb_switch;
 +    rswitch          = _mm_set1_ps(rswitch_scalar);
 +    /*     #else */
 +    rswitch_scalar   = fr->rvdw_switch;
 +    rswitch          = _mm_set1_ps(rswitch_scalar);
 +    /*     #endif */
 +    /* Setup switch parameters */
 +    d_scalar         = rcutoff_scalar-rswitch_scalar;
 +    d                = _mm_set1_ps(d_scalar);
 +    swV3             = _mm_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
 +    swV4             = _mm_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
 +    swV5             = _mm_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
 +    /*     #if 'Force' in KERNEL_VF */
 +    swF2             = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
 +    swF3             = _mm_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
 +    swF4             = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
 +    /*     #endif */
 +    /* #endif */
 +
 +    /* Avoid stupid compiler warnings */
 +    jnrA = jnrB = jnrC = jnrD = 0;
 +    j_coord_offsetA = 0;
 +    j_coord_offsetB = 0;
 +    j_coord_offsetC = 0;
 +    j_coord_offsetD = 0;
 +
 +    /* ## Keep track of the floating point operations we issue for reporting! */
 +    /* #define OUTERFLOPS 0 */
 +    outeriter        = 0;
 +    inneriter        = 0;
 +
 +    for(iidx=0;iidx<4*DIM;iidx++)
 +    {
 +        scratch[iidx] = 0.0;
 +    }
 +
 +    /* Start outer loop over neighborlists */
 +    for(iidx=0; iidx<nri; iidx++)
 +    {
 +        /* Load shift vector for this list */
 +        i_shift_offset   = DIM*shiftidx[iidx];
 +
 +        /* Load limits for loop over neighbors */
 +        j_index_start    = jindex[iidx];
 +        j_index_end      = jindex[iidx+1];
 +
 +        /* Get outer coordinate index */
 +        inr              = iinr[iidx];
 +        i_coord_offset   = DIM*inr;
 +
 +        /* Load i particle coords and add shift vector */
 +        /* #if GEOMETRY_I == 'Particle' */
 +        gmx_mm_load_shift_and_1rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
 +        /* #elif GEOMETRY_I == 'Water3' */
 +        gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
 +                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
 +        /* #elif GEOMETRY_I == 'Water4' */
 +        /*     #if 0 in PARTICLES_I                 */
 +        gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
 +                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
 +        /*     #else                                */
 +        gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
 +                                                 &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
 +        /*     #endif                               */
 +        /* #endif                                   */
 +
 +        /* #if 'Force' in KERNEL_VF */
 +        /*     #for I in PARTICLES_I */
 +        fix{I}             = _mm_setzero_ps();
 +        fiy{I}             = _mm_setzero_ps();
 +        fiz{I}             = _mm_setzero_ps();
 +        /*     #endfor */
 +        /* #endif */
 +
 +        /* ## For water we already preloaded parameters at the start of the kernel */
 +        /* #if not 'Water' in GEOMETRY_I */
 +        /* Load parameters for i particles */
 +        /*     #for I in PARTICLES_ELEC_I */
 +        iq{I}              = _mm_mul_ps(facel,_mm_load1_ps(charge+inr+{I}));
 +        /*         #define OUTERFLOPS OUTERFLOPS+1 */
 +        /*         #if KERNEL_ELEC=='GeneralizedBorn' */
 +        isai{I}            = _mm_load1_ps(invsqrta+inr+{I});
 +        /*         #endif */
 +        /*     #endfor */
 +        /*     #for I in PARTICLES_VDW_I */
 +        vdwioffset{I}      = 2*nvdwtype*vdwtype[inr+{I}];
 +        /*     #endfor */
 +        /* #endif */
 +
 +        /* #if 'Potential' in KERNEL_VF */
 +        /* Reset potential sums */
 +        /*     #if KERNEL_ELEC != 'None' */
 +        velecsum         = _mm_setzero_ps();
 +        /*     #endif */
 +        /*     #if 'GeneralizedBorn' in KERNEL_ELEC */
 +        vgbsum           = _mm_setzero_ps();
 +        /*     #endif */
 +        /*     #if KERNEL_VDW != 'None' */
 +        vvdwsum          = _mm_setzero_ps();
 +        /*     #endif */
 +        /* #endif */
 +        /*     #if 'GeneralizedBorn' in KERNEL_ELEC and 'Force' in KERNEL_VF */
 +        dvdasum          = _mm_setzero_ps();
 +        /*     #endif */
 +
 +        /* #for ROUND in ['Loop','Epilogue'] */
 +
 +        /* #if ROUND =='Loop' */
 +        /* Start inner kernel loop */
 +        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
 +        {
 +        /* ## First round is normal loop (next statement resets indentation) */
 +        /*     #if 0 */
 +        }
 +        /*     #endif */
 +        /* #else */
 +        if(jidx<j_index_end)
 +        {
 +        /* ## Second round is epilogue */
 +        /* #endif */
 +        /* #define INNERFLOPS 0 */
 +
 +            /* Get j neighbor index, and coordinate index */
 +            /* #if ROUND =='Loop' */
 +            jnrA             = jjnr[jidx];
 +            jnrB             = jjnr[jidx+1];
 +            jnrC             = jjnr[jidx+2];
 +            jnrD             = jjnr[jidx+3];
 +            /* #else */
 +            jnrlistA         = jjnr[jidx];
 +            jnrlistB         = jjnr[jidx+1];
 +            jnrlistC         = jjnr[jidx+2];
 +            jnrlistD         = jjnr[jidx+3];
 +            /* Sign of each element will be negative for non-real atoms.
 +             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
 +             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
 +             */
 +            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 +            jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
 +            jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
 +            jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
 +            jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
 +            /* #endif */
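 +            /* ## Sketch of the masking idea above (illustration only): per SIMD lane,    */
 +            /* ##   mask = (jjnr[jidx+lane] < 0) ? 0xFFFFFFFF : 0x0;                       */
 +            /* ##   val  = val & ~mask;    -- i.e. _mm_andnot_ps(dummy_mask,val) --        */
 +            /* ## so padded epilogue entries contribute exactly zero to all accumulators.  */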
 +            j_coord_offsetA  = DIM*jnrA;
 +            j_coord_offsetB  = DIM*jnrB;
 +            j_coord_offsetC  = DIM*jnrC;
 +            j_coord_offsetD  = DIM*jnrD;
 +
 +            /* load j atom coordinates */
 +            /* #if GEOMETRY_J == 'Particle'             */
 +            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
 +                                              x+j_coord_offsetC,x+j_coord_offsetD,
 +                                              &jx0,&jy0,&jz0);
 +            /* #elif GEOMETRY_J == 'Water3'             */
 +            gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
 +                                              x+j_coord_offsetC,x+j_coord_offsetD,
 +                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
 +            /* #elif GEOMETRY_J == 'Water4'             */
 +            /*     #if 0 in PARTICLES_J                 */
 +            gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
 +                                              x+j_coord_offsetC,x+j_coord_offsetD,
 +                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
 +                                              &jy2,&jz2,&jx3,&jy3,&jz3);
 +            /*     #else                                */
 +            gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
 +                                              x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
 +                                              &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
 +            /*     #endif                               */
 +            /* #endif                                   */
 +
 +            /* Calculate displacement vector */
 +            /* #for I,J in PAIRS_IJ */
 +            dx{I}{J}             = _mm_sub_ps(ix{I},jx{J});
 +            dy{I}{J}             = _mm_sub_ps(iy{I},jy{J});
 +            dz{I}{J}             = _mm_sub_ps(iz{I},jz{J});
 +            /*     #define INNERFLOPS INNERFLOPS+3 */
 +            /* #endfor */
 +
 +            /* Calculate squared distance and things based on it */
 +            /* #for I,J in PAIRS_IJ */
 +            rsq{I}{J}            = gmx_mm_calc_rsq_ps(dx{I}{J},dy{I}{J},dz{I}{J});
 +            /*     #define INNERFLOPS INNERFLOPS+5 */
 +            /* #endfor */
 +
 +            /* #for I,J in PAIRS_IJ */
 +            /*     #if 'rinv' in INTERACTION_FLAGS[I][J] */
 +            rinv{I}{J}           = gmx_mm_invsqrt_ps(rsq{I}{J});
 +            /*         #define INNERFLOPS INNERFLOPS+5 */
 +            /*     #endif */
 +            /* #endfor */
 +
 +            /* #for I,J in PAIRS_IJ */
 +            /*     #if 'rinvsq' in INTERACTION_FLAGS[I][J] */
 +            /*         # if 'rinv' not in INTERACTION_FLAGS[I][J] */
 +            rinvsq{I}{J}         = gmx_mm_inv_ps(rsq{I}{J});
 +            /*             #define INNERFLOPS INNERFLOPS+4 */
 +            /*         #else */
 +            rinvsq{I}{J}         = _mm_mul_ps(rinv{I}{J},rinv{I}{J});
 +            /*             #define INNERFLOPS INNERFLOPS+1 */
 +            /*         #endif */
 +            /*     #endif */
 +            /* #endfor */
 +
 +            /* #if not 'Water' in GEOMETRY_J */
 +            /* Load parameters for j particles */
 +            /*     #for J in PARTICLES_ELEC_J */
 +            jq{J}              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+{J},charge+jnrB+{J},
 +                                                              charge+jnrC+{J},charge+jnrD+{J});
 +            /*         #if KERNEL_ELEC=='GeneralizedBorn' */
 +            isaj{J}            = gmx_mm_load_4real_swizzle_ps(invsqrta+jnrA+{J},invsqrta+jnrB+{J},
 +                                                              invsqrta+jnrC+{J},invsqrta+jnrD+{J});
 +            /*         #endif */
 +            /*     #endfor */
 +            /*     #for J in PARTICLES_VDW_J */
 +            vdwjidx{J}A        = 2*vdwtype[jnrA+{J}];
 +            vdwjidx{J}B        = 2*vdwtype[jnrB+{J}];
 +            vdwjidx{J}C        = 2*vdwtype[jnrC+{J}];
 +            vdwjidx{J}D        = 2*vdwtype[jnrD+{J}];
 +            /*     #endfor */
 +            /* #endif */
 +
 +            /* #if 'Force' in KERNEL_VF and not 'Particle' in GEOMETRY_I */
 +            /*     #for J in PARTICLES_J */
 +            fjx{J}             = _mm_setzero_ps();
 +            fjy{J}             = _mm_setzero_ps();
 +            fjz{J}             = _mm_setzero_ps();
 +            /*     #endfor */
 +            /* #endif */
 +
 +            /* #for I,J in PAIRS_IJ */
 +
 +            /**************************
 +             * CALCULATE INTERACTIONS *
 +             **************************/
 +
 +            /*     ## Note special check for TIP4P-TIP4P. Since we are cutting off all hydrogen interactions we also cut the LJ-only O-O interaction */
 +            /*     #if 'exactcutoff' in INTERACTION_FLAGS[I][J] or (GEOMETRY_I=='Water4' and GEOMETRY_J=='Water4' and 'exactcutoff' in INTERACTION_FLAGS[1][1]) */
 +            /*         ## We always calculate rinv/rinvsq above to enable pipelining in compilers (performance tested on x86) */
 +            if (gmx_mm_any_lt(rsq{I}{J},rcutoff2))
 +            {
 +                /*     #if 0    ## this and the next two lines are a hack to maintain auto-indentation in template file */
 +            }
 +            /*         #endif */
 +            /*         #define INNERFLOPS INNERFLOPS+1 */
 +            /*     #endif */
 +
 +            /*     #if 'r' in INTERACTION_FLAGS[I][J] */
 +            r{I}{J}              = _mm_mul_ps(rsq{I}{J},rinv{I}{J});
 +            /*         #if ROUND == 'Epilogue' */
 +            r{I}{J}              = _mm_andnot_ps(dummy_mask,r{I}{J});
 +            /*             #define INNERFLOPS INNERFLOPS+1 */
 +            /*         #endif */
 +            /*         #define INNERFLOPS INNERFLOPS+1 */
 +            /*     #endif */
 +
 +            /*     ## For water geometries we already loaded parameters at the start of the kernel */
 +            /*     #if not 'Water' in GEOMETRY_J */
 +            /* Compute parameters for interactions between i and j atoms */
 +            /*         #if 'electrostatics' in INTERACTION_FLAGS[I][J] */
 +            qq{I}{J}             = _mm_mul_ps(iq{I},jq{J});
 +            /*             #define INNERFLOPS INNERFLOPS+1 */
 +            /*         #endif */
 +            /*         #if 'vdw' in INTERACTION_FLAGS[I][J] */
 +            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset{I}+vdwjidx{J}A,
 +                                         vdwparam+vdwioffset{I}+vdwjidx{J}B,
 +                                         vdwparam+vdwioffset{I}+vdwjidx{J}C,
 +                                         vdwparam+vdwioffset{I}+vdwjidx{J}D,
 +                                         &c6_{I}{J},&c12_{I}{J});
 +            /*         #endif */
 +            /*     #endif */
 +
 +            /*     #if 'table' in INTERACTION_FLAGS[I][J] */
 +            /* Calculate table index by multiplying r with table scale and truncate to integer */
 +            rt               = _mm_mul_ps(r{I}{J},vftabscale);
 +            vfitab           = _mm_cvttps_epi32(rt);
 +            vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
 +            /*         #define INNERFLOPS INNERFLOPS+4                          */
 +            /*         #if 'Table' in KERNEL_ELEC and 'Table' in KERNEL_VDW     */
 +            /*             ## 3 tables, 4 floats per point: multiply index by 12 */
 +            vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
 +            /*         #elif 'Table' in KERNEL_ELEC                             */
 +            /*             ## 1 table, 4 floats per point: multiply index by 4  */
 +            vfitab           = _mm_slli_epi32(vfitab,2);
 +            /*         #elif 'Table' in KERNEL_VDW                              */
 +            /*             ## 2 tables, 4 floats per point: multiply index by 8 */
 +            vfitab           = _mm_slli_epi32(vfitab,3);
 +            /*         #endif                                                   */
 +            /*     #endif */
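 +            /* ## Sketch of the index arithmetic above (illustration only): each table    */
 +            /* ## point holds 4 floats (Y,F,G,H), so idx*12 = (idx+(idx<<1))<<2 for three  */
 +            /* ## interleaved tables, idx*4 = idx<<2 for one, and idx*8 = idx<<3 for two.  */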
 +
 +            /*     ## ELECTROSTATIC INTERACTIONS */
 +            /*     #if 'electrostatics' in INTERACTION_FLAGS[I][J] */
 +
 +            /*         #if KERNEL_ELEC=='Coulomb' */
 +
 +            /* COULOMB ELECTROSTATICS */
 +            velec            = _mm_mul_ps(qq{I}{J},rinv{I}{J});
 +            /*             #define INNERFLOPS INNERFLOPS+1 */
 +            /*             #if 'Force' in KERNEL_VF */
 +            felec            = _mm_mul_ps(velec,rinvsq{I}{J});
 +            /*                 #define INNERFLOPS INNERFLOPS+2 */
 +            /*             #endif */
 +
 +            /*         #elif KERNEL_ELEC=='ReactionField' */
 +
 +            /* REACTION-FIELD ELECTROSTATICS */
 +            /*             #if 'Potential' in KERNEL_VF */
 +            velec            = _mm_mul_ps(qq{I}{J},_mm_sub_ps(_mm_add_ps(rinv{I}{J},_mm_mul_ps(krf,rsq{I}{J})),crf));
 +            /*                 #define INNERFLOPS INNERFLOPS+4 */
 +            /*             #endif */
 +            /*             #if 'Force' in KERNEL_VF */
 +            felec            = _mm_mul_ps(qq{I}{J},_mm_sub_ps(_mm_mul_ps(rinv{I}{J},rinvsq{I}{J}),krf2));
 +            /*                 #define INNERFLOPS INNERFLOPS+3 */
 +            /*             #endif */
 +
 +            /*         #elif KERNEL_ELEC=='GeneralizedBorn' */
 +
 +            /* GENERALIZED BORN AND COULOMB ELECTROSTATICS */
 +            isaprod          = _mm_mul_ps(isai{I},isaj{J});
 +            gbqqfactor       = _mm_xor_ps(signbit,_mm_mul_ps(qq{I}{J},_mm_mul_ps(isaprod,gbinvepsdiff)));
 +            gbscale          = _mm_mul_ps(isaprod,gbtabscale);
 +            /*             #define INNERFLOPS INNERFLOPS+5 */
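 +            /* ## Sketch (illustration only): gbqqfactor is -qq*isai*isaj*(1/eps_r -      */
 +            /* ## 1/eps_gb); the _mm_xor_ps against signbit performs the sign flip, and    */
 +            /* ## gbscale maps r onto the per-pair spacing of the GB table.                */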
 +
 +            /* Calculate generalized born table index - this is a separate table from the normal one,
 +             * but we use the same procedure by multiplying r with scale and truncating to integer.
 +             */
 +            rt               = _mm_mul_ps(r{I}{J},gbscale);
 +            gbitab           = _mm_cvttps_epi32(rt);
 +            gbeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
 +            gbitab           = _mm_slli_epi32(gbitab,2);
 +            Y                = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,0) );
 +            F                = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,1) );
 +            G                = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,2) );
 +            H                = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,3) );
 +            _MM_TRANSPOSE4_PS(Y,F,G,H);
 +            Heps             = _mm_mul_ps(gbeps,H);
 +            Fp               = _mm_add_ps(F,_mm_mul_ps(gbeps,_mm_add_ps(G,Heps)));
 +            VV               = _mm_add_ps(Y,_mm_mul_ps(gbeps,Fp));
 +            vgb              = _mm_mul_ps(gbqqfactor,VV);
 +            /*             #define INNERFLOPS INNERFLOPS+10 */
 +
 +            /*             #if 'Force' in KERNEL_VF */
 +            FF               = _mm_add_ps(Fp,_mm_mul_ps(gbeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
 +            fgb              = _mm_mul_ps(gbqqfactor,_mm_mul_ps(FF,gbscale));
 +            dvdatmp          = _mm_mul_ps(minushalf,_mm_add_ps(vgb,_mm_mul_ps(fgb,r{I}{J})));
++            /*                 #if ROUND == 'Epilogue' */
++            dvdatmp          = _mm_andnot_ps(dummy_mask,dvdatmp);
++            /*                 #endif */
 +            dvdasum          = _mm_add_ps(dvdasum,dvdatmp);
 +            /*                 #if ROUND == 'Loop' */
 +            fjptrA           = dvda+jnrA;
 +            fjptrB           = dvda+jnrB;
 +            fjptrC           = dvda+jnrC;
 +            fjptrD           = dvda+jnrD;
 +            /*                 #else */
 +            /* The pointers to scratch make sure that compilers that take gmx_restrict seriously (e.g. icc 13) really can't screw things up here. */
 +            fjptrA             = (jnrlistA>=0) ? dvda+jnrA : scratch;
 +            fjptrB             = (jnrlistB>=0) ? dvda+jnrB : scratch;
 +            fjptrC             = (jnrlistC>=0) ? dvda+jnrC : scratch;
 +            fjptrD             = (jnrlistD>=0) ? dvda+jnrD : scratch;
 +            /*                 #endif */
 +            gmx_mm_increment_4real_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,_mm_mul_ps(dvdatmp,_mm_mul_ps(isaj{J},isaj{J})));
 +            /*                 #define INNERFLOPS INNERFLOPS+13 */
 +            /*             #endif */
 +            velec            = _mm_mul_ps(qq{I}{J},rinv{I}{J});
 +            /*                 #define INNERFLOPS INNERFLOPS+1 */
 +            /*             #if 'Force' in KERNEL_VF */
 +            felec            = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(velec,rinv{I}{J}),fgb),rinv{I}{J});
 +            /*                 #define INNERFLOPS INNERFLOPS+3 */
 +            /*             #endif */
 +
 +            /*         #elif KERNEL_ELEC=='Ewald' */
 +            /* EWALD ELECTROSTATICS */
 +
 +            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
 +            ewrt             = _mm_mul_ps(r{I}{J},ewtabscale);
 +            ewitab           = _mm_cvttps_epi32(ewrt);
 +            eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
 +            /*             #define INNERFLOPS INNERFLOPS+4 */
 +            /*             #if 'Potential' in KERNEL_VF or KERNEL_MOD_ELEC=='PotentialSwitch' */
 +            ewitab           = _mm_slli_epi32(ewitab,2);
 +            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
 +            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
 +            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
 +            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
 +            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
 +            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
 +            /*                 #define INNERFLOPS INNERFLOPS+2 */
 +            /*                 #if KERNEL_MOD_ELEC=='PotentialShift' */
 +            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
 +            velec            = _mm_mul_ps(qq{I}{J},_mm_sub_ps(_mm_sub_ps(rinv{I}{J},sh_ewald),velec));
 +            /*                     #define INNERFLOPS INNERFLOPS+7 */
 +            /*                 #else */
 +            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
 +            velec            = _mm_mul_ps(qq{I}{J},_mm_sub_ps(rinv{I}{J},velec));
 +            /*                     #define INNERFLOPS INNERFLOPS+6 */
 +            /*                 #endif */
 +            /*                 #if 'Force' in KERNEL_VF */
 +            felec            = _mm_mul_ps(_mm_mul_ps(qq{I}{J},rinv{I}{J}),_mm_sub_ps(rinvsq{I}{J},felec));
 +            /*                      #define INNERFLOPS INNERFLOPS+3 */
 +            /*                 #endif */
 +            /*             #elif KERNEL_VF=='Force' */
 +            gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
 +                                         ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
 +                                         &ewtabF,&ewtabFn);
 +            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
 +            felec            = _mm_mul_ps(_mm_mul_ps(qq{I}{J},rinv{I}{J}),_mm_sub_ps(rinvsq{I}{J},felec));
 +            /*                 #define INNERFLOPS INNERFLOPS+7 */
 +            /*             #endif */
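 +            /* ## Sketch (assumption about the table contents, matching the usual Ewald   */
 +            /* ## real-space form): the table holds the erf(beta*r)/r correction, so       */
 +            /* ## velec = qq*(1/r - V_tab) and felec = qq*rinv*(rinvsq - F_tab), with V    */
 +            /* ## rebuilt from F via the half-spacing (ewtabhalfspace) step above.         */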
 +
 +            /*         #elif KERNEL_ELEC=='CubicSplineTable' */
 +
 +            /* CUBIC SPLINE TABLE ELECTROSTATICS */
 +            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
 +            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
 +            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
 +            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
 +            _MM_TRANSPOSE4_PS(Y,F,G,H);
 +            Heps             = _mm_mul_ps(vfeps,H);
 +            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
 +            /*             #define INNERFLOPS INNERFLOPS+4 */
 +            /*             #if 'Potential' in KERNEL_VF */
 +            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
 +            velec            = _mm_mul_ps(qq{I}{J},VV);
 +            /*                 #define INNERFLOPS INNERFLOPS+3 */
 +            /*             #endif */
 +            /*             #if 'Force' in KERNEL_VF */
 +            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
 +            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq{I}{J},FF),_mm_mul_ps(vftabscale,rinv{I}{J})));
 +            /*                 #define INNERFLOPS INNERFLOPS+7 */
 +            /*             #endif */
 +            /*         #endif */
 +            /*         ## End of check for electrostatics interaction forms */
 +            /*     #endif */
 +            /*     ## END OF ELECTROSTATIC INTERACTION CHECK FOR PAIR I-J */
 +
 +            /*     #if 'vdw' in INTERACTION_FLAGS[I][J] */
 +
 +            /*         #if KERNEL_VDW=='LennardJones' */
 +
 +            /* LENNARD-JONES DISPERSION/REPULSION */
 +
 +            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq{I}{J},rinvsq{I}{J}),rinvsq{I}{J});
 +            /*             #define INNERFLOPS INNERFLOPS+2 */
 +            /*             #if 'Potential' in KERNEL_VF or KERNEL_MOD_VDW=='PotentialSwitch' */
 +            vvdw6            = _mm_mul_ps(c6_{I}{J},rinvsix);
 +            vvdw12           = _mm_mul_ps(c12_{I}{J},_mm_mul_ps(rinvsix,rinvsix));
 +            /*                 #define INNERFLOPS INNERFLOPS+3 */
 +            /*                 #if KERNEL_MOD_VDW=='PotentialShift' */
 +            vvdw             = _mm_sub_ps(_mm_mul_ps( _mm_sub_ps(vvdw12 , _mm_mul_ps(c12_{I}{J},_mm_mul_ps(sh_vdw_invrcut6,sh_vdw_invrcut6))), one_twelfth) ,
 +                                          _mm_mul_ps( _mm_sub_ps(vvdw6,_mm_mul_ps(c6_{I}{J},sh_vdw_invrcut6)),one_sixth));
 +            /*                     #define INNERFLOPS INNERFLOPS+8 */
 +            /*                 #else */
 +            vvdw             = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
 +            /*                     #define INNERFLOPS INNERFLOPS+3 */
 +            /*                 #endif */
 +            /*                 ## Check for force inside potential check, i.e. this means we already did the potential part */
 +            /*                 #if 'Force' in KERNEL_VF */
 +            fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq{I}{J});
 +            /*                     #define INNERFLOPS INNERFLOPS+2 */
 +            /*                 #endif */
 +            /*             #elif KERNEL_VF=='Force' */
 +            /*                 ## Force-only LennardJones makes it possible to save 1 flop (they do add up...) */
 +            fvdw             = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_{I}{J},rinvsix),c6_{I}{J}),_mm_mul_ps(rinvsix,rinvsq{I}{J}));
 +            /*                 #define INNERFLOPS INNERFLOPS+4 */
 +            /*             #endif */
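 +            /* ## Consistency sketch (illustration only): since vvdw = vvdw12/12 - vvdw6/6,*/
 +            /* ## the stored c6/c12 carry the factors 6 and 12, and -dV/dr * 1/r then      */
 +            /* ## reduces to (vvdw12 - vvdw6)*rinvsq, i.e. the fvdw expressions above.     */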
 +
 +            /*         #elif KERNEL_VDW=='CubicSplineTable' */
 +
 +            /* CUBIC SPLINE TABLE DISPERSION */
 +            /*             #if 'Table' in KERNEL_ELEC */
 +            vfitab           = _mm_add_epi32(vfitab,ifour);
 +            /*             #endif                     */
 +            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
 +            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
 +            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
 +            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
 +            _MM_TRANSPOSE4_PS(Y,F,G,H);
 +            Heps             = _mm_mul_ps(vfeps,H);
 +            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
 +            /*             #define INNERFLOPS INNERFLOPS+4 */
 +            /*             #if 'Potential' in KERNEL_VF */
 +            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
 +            vvdw6            = _mm_mul_ps(c6_{I}{J},VV);
 +            /*                 #define INNERFLOPS INNERFLOPS+3 */
 +            /*             #endif */
 +            /*             #if 'Force' in KERNEL_VF */
 +            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
 +            fvdw6            = _mm_mul_ps(c6_{I}{J},FF);
 +            /*                 #define INNERFLOPS INNERFLOPS+4 */
 +            /*             #endif */
 +
 +            /* CUBIC SPLINE TABLE REPULSION */
 +            vfitab           = _mm_add_epi32(vfitab,ifour);
 +            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
 +            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
 +            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
 +            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
 +            _MM_TRANSPOSE4_PS(Y,F,G,H);
 +            Heps             = _mm_mul_ps(vfeps,H);
 +            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
 +            /*             #define INNERFLOPS INNERFLOPS+4 */
 +            /*             #if 'Potential' in KERNEL_VF */
 +            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
 +            vvdw12           = _mm_mul_ps(c12_{I}{J},VV);
 +            /*                 #define INNERFLOPS INNERFLOPS+3 */
 +            /*             #endif */
 +            /*             #if 'Force' in KERNEL_VF */
 +            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
 +            fvdw12           = _mm_mul_ps(c12_{I}{J},FF);
 +            /*                 #define INNERFLOPS INNERFLOPS+5 */
 +            /*             #endif */
 +            /*             #if 'Potential' in KERNEL_VF */
 +            vvdw             = _mm_add_ps(vvdw12,vvdw6);
 +            /*                 #define INNERFLOPS INNERFLOPS+1 */
 +            /*             #endif */
 +            /*             #if 'Force' in KERNEL_VF */
 +            fvdw             = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv{I}{J})));
 +            /*                 #define INNERFLOPS INNERFLOPS+4 */
 +            /*             #endif */
 +            /*         #endif */
 +            /*         ## End of check for vdw interaction forms */
 +            /*     #endif */
 +            /*     ## END OF VDW INTERACTION CHECK FOR PAIR I-J */
 +
 +            /*     #if 'switch' in INTERACTION_FLAGS[I][J] */
 +            d                = _mm_sub_ps(r{I}{J},rswitch);
 +            d                = _mm_max_ps(d,_mm_setzero_ps());
 +            d2               = _mm_mul_ps(d,d);
 +            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
 +            /*         #define INNERFLOPS INNERFLOPS+10 */
 +
 +            /*         #if 'Force' in KERNEL_VF */
 +            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
 +            /*             #define INNERFLOPS INNERFLOPS+5 */
 +            /*         #endif */
 +
 +            /* Evaluate switch function */
 +            /*         #if 'Force' in KERNEL_VF */
 +            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
 +            /*             #if 'electrostatics' in INTERACTION_FLAGS[I][J] and KERNEL_MOD_ELEC=='PotentialSwitch' */
 +            felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv{I}{J},_mm_mul_ps(velec,dsw)) );
 +            /*                 #define INNERFLOPS INNERFLOPS+4 */
 +            /*             #endif */
 +            /*             #if 'vdw' in INTERACTION_FLAGS[I][J] and KERNEL_MOD_VDW=='PotentialSwitch' */
 +            fvdw             = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv{I}{J},_mm_mul_ps(vvdw,dsw)) );
 +            /*                 #define INNERFLOPS INNERFLOPS+4 */
 +            /*             #endif */
 +            /*         #endif */
 +            /*         #if 'Potential' in KERNEL_VF */
 +            /*             #if 'electrostatics' in INTERACTION_FLAGS[I][J] and KERNEL_MOD_ELEC=='PotentialSwitch' */
 +            velec            = _mm_mul_ps(velec,sw);
 +            /*                 #define INNERFLOPS INNERFLOPS+1 */
 +            /*             #endif */
 +            /*             #if 'vdw' in INTERACTION_FLAGS[I][J] and KERNEL_MOD_VDW=='PotentialSwitch' */
 +            vvdw             = _mm_mul_ps(vvdw,sw);
 +            /*                 #define INNERFLOPS INNERFLOPS+1 */
 +            /*             #endif */
 +            /*         #endif */
 +            /*     #endif */
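 +            /* ## Sketch (illustration only): with d = r-rswitch and D = rcutoff-rswitch,  */
 +            /* ## sw  = 1 - 10(d/D)^3 + 15(d/D)^4 - 6(d/D)^5   (sw(0)=1, sw(D)=0), and     */
 +            /* ## dsw = -30d^2/D^3 + 60d^3/D^4 - 30d^4/D^5 is its derivative, matching     */
 +            /* ## the swV3..swV5 and swF2..swF4 constants set up before the loops.         */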
 +            /*     ## Note special check for TIP4P-TIP4P. Since we are cutting off all hydrogen interactions we also cut the LJ-only O-O interaction */
 +            /*     #if 'exactcutoff' in INTERACTION_FLAGS[I][J] or (GEOMETRY_I=='Water4' and GEOMETRY_J=='Water4' and 'exactcutoff' in INTERACTION_FLAGS[1][1]) */
 +            cutoff_mask      = _mm_cmplt_ps(rsq{I}{J},rcutoff2);
 +            /*         #define INNERFLOPS INNERFLOPS+1 */
 +            /*     #endif */
 +
 +            /*     #if 'Potential' in KERNEL_VF */
 +            /* Update potential sum for this i atom from the interaction with this j atom. */
 +            /*         #if 'electrostatics' in INTERACTION_FLAGS[I][J] */
 +            /*             #if 'exactcutoff' in INTERACTION_FLAGS[I][J] */
 +            velec            = _mm_and_ps(velec,cutoff_mask);
 +            /*                 #define INNERFLOPS INNERFLOPS+1 */
 +            /*             #endif                                       */
 +            /*             #if ROUND == 'Epilogue' */
 +            velec            = _mm_andnot_ps(dummy_mask,velec);
 +            /*             #endif */
 +            velecsum         = _mm_add_ps(velecsum,velec);
 +            /*             #define INNERFLOPS INNERFLOPS+1 */
 +            /*             #if KERNEL_ELEC=='GeneralizedBorn' */
 +            /*             #if 'exactcutoff' in INTERACTION_FLAGS[I][J] */
 +            vgb              = _mm_and_ps(vgb,cutoff_mask);
 +            /*                 #define INNERFLOPS INNERFLOPS+1 */
 +            /*             #endif                                       */
 +            /*             #if ROUND == 'Epilogue' */
 +            vgb              = _mm_andnot_ps(dummy_mask,vgb);
 +            /*             #endif */
 +            vgbsum           = _mm_add_ps(vgbsum,vgb);
 +            /*                 #define INNERFLOPS INNERFLOPS+1 */
 +            /*             #endif */
 +            /*         #endif */
 +            /*         #if 'vdw' in INTERACTION_FLAGS[I][J] */
 +            /*     ## Note special check for TIP4P-TIP4P. Since we are cutting off all hydrogen interactions we also cut the LJ-only O-O interaction */
 +            /*     #if 'exactcutoff' in INTERACTION_FLAGS[I][J] or (GEOMETRY_I=='Water4' and GEOMETRY_J=='Water4' and 'exactcutoff' in INTERACTION_FLAGS[1][1]) */
 +            vvdw             = _mm_and_ps(vvdw,cutoff_mask);
 +            /*                 #define INNERFLOPS INNERFLOPS+1 */
 +            /*             #endif                                       */
 +            /*             #if ROUND == 'Epilogue' */
 +            vvdw             = _mm_andnot_ps(dummy_mask,vvdw);
 +            /*             #endif */
 +            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
 +            /*             #define INNERFLOPS INNERFLOPS+1 */
 +            /*         #endif */
 +            /*     #endif */
 +
 +            /*     #if 'Force' in KERNEL_VF */
 +
 +            /*         #if 'electrostatics' in INTERACTION_FLAGS[I][J] and 'vdw' in INTERACTION_FLAGS[I][J] */
 +            fscal            = _mm_add_ps(felec,fvdw);
 +            /*             #define INNERFLOPS INNERFLOPS+1 */
 +            /*         #elif 'electrostatics' in INTERACTION_FLAGS[I][J] */
 +            fscal            = felec;
 +            /*         #elif 'vdw' in INTERACTION_FLAGS[I][J] */
 +            fscal            = fvdw;
 +            /*        #endif */
 +
 +            /*     ## Note special check for TIP4P-TIP4P. Since we are cutting off all hydrogen interactions we also cut the LJ-only O-O interaction */
 +            /*     #if 'exactcutoff' in INTERACTION_FLAGS[I][J] or (GEOMETRY_I=='Water4' and GEOMETRY_J=='Water4' and 'exactcutoff' in INTERACTION_FLAGS[1][1]) */
 +            fscal            = _mm_and_ps(fscal,cutoff_mask);
 +            /*                 #define INNERFLOPS INNERFLOPS+1 */
 +            /*             #endif                                       */
 +
 +            /*             #if ROUND == 'Epilogue' */
 +            fscal            = _mm_andnot_ps(dummy_mask,fscal);
 +            /*             #endif */
 +
 +            /* Calculate temporary vectorial force */
 +            tx               = _mm_mul_ps(fscal,dx{I}{J});
 +            ty               = _mm_mul_ps(fscal,dy{I}{J});
 +            tz               = _mm_mul_ps(fscal,dz{I}{J});
 +
 +            /* Update vectorial force */
 +            fix{I}             = _mm_add_ps(fix{I},tx);
 +            fiy{I}             = _mm_add_ps(fiy{I},ty);
 +            fiz{I}             = _mm_add_ps(fiz{I},tz);
 +            /*             #define INNERFLOPS INNERFLOPS+6 */
 +
 +            /* #if GEOMETRY_I == 'Particle'             */
 +            /*     #if ROUND == 'Loop' */
 +            fjptrA             = f+j_coord_offsetA;
 +            fjptrB             = f+j_coord_offsetB;
 +            fjptrC             = f+j_coord_offsetC;
 +            fjptrD             = f+j_coord_offsetD;
 +            /*     #else */
 +            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
 +            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
 +            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
 +            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
 +            /*     #endif */
 +            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
 +            /*     #define INNERFLOPS INNERFLOPS+3      */
 +            /* #else                                    */
 +            fjx{J}             = _mm_add_ps(fjx{J},tx);
 +            fjy{J}             = _mm_add_ps(fjy{J},ty);
 +            fjz{J}             = _mm_add_ps(fjz{J},tz);
 +            /*     #define INNERFLOPS INNERFLOPS+3      */
 +            /* #endif                                   */
 +
 +            /*     #endif */
 +
 +            /*     ## Note special check for TIP4P-TIP4P. Since we are cutting off all hydrogen interactions we also cut the LJ-only O-O interaction */
 +            /*     #if 'exactcutoff' in INTERACTION_FLAGS[I][J] or (GEOMETRY_I=='Water4' and GEOMETRY_J=='Water4' and 'exactcutoff' in INTERACTION_FLAGS[1][1]) */
 +            /*         #if 0    ## This and the next two lines are a hack to maintain indentation in template file */
 +            {
 +                /*     #endif */
 +            }
 +            /*     #endif */
 +            /*    ## End of check for the interaction being outside the cutoff */
 +
 +            /* #endfor */
 +            /* ## End of loop over i-j interaction pairs */
 +
 +            /* #if GEOMETRY_I != 'Particle' */
 +            /*     #if ROUND == 'Loop' */
 +            fjptrA             = f+j_coord_offsetA;
 +            fjptrB             = f+j_coord_offsetB;
 +            fjptrC             = f+j_coord_offsetC;
 +            fjptrD             = f+j_coord_offsetD;
 +            /*     #else */
 +            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
 +            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
 +            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
 +            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
 +            /*     #endif */
 +            /* #endif */
 +
 +            /* #if 'Water' in GEOMETRY_I and GEOMETRY_J == 'Particle' */
 +            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 +            /* #elif GEOMETRY_J == 'Water3'               */
 +            gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
 +                                                   fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
 +            /*     #define INNERFLOPS INNERFLOPS+9      */
 +            /* #elif GEOMETRY_J == 'Water4'             */
 +            /*     #if 0 in PARTICLES_J                 */
 +            gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
 +                                                   fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
 +                                                   fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
 +            /*     #define INNERFLOPS INNERFLOPS+12     */
 +            /*     #else                                */
 +            gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA+DIM,fjptrB+DIM,fjptrC+DIM,fjptrD+DIM,
 +                                                   fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
 +            /*     #define INNERFLOPS INNERFLOPS+9      */
 +            /*     #endif                               */
 +            /* #endif                                   */
 +
 +            /* Inner loop uses {INNERFLOPS} flops */
 +        }
 +
 +        /* #endfor */
 +
 +        /* End of innermost loop */
 +
 +        /* #if 'Force' in KERNEL_VF */
 +        /*     #if GEOMETRY_I == 'Particle'            */
 +        gmx_mm_update_iforce_1atom_swizzle_ps(fix0,fiy0,fiz0,
 +                                              f+i_coord_offset,fshift+i_shift_offset);
 +        /*         #define OUTERFLOPS OUTERFLOPS+6     */
 +        /*     #elif GEOMETRY_I == 'Water3'            */
 +        gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
 +                                              f+i_coord_offset,fshift+i_shift_offset);
 +        /*         #define OUTERFLOPS OUTERFLOPS+18    */
 +        /*     #elif GEOMETRY_I == 'Water4'            */
 +        /*         #if 0 in PARTICLES_I                */
 +        gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
 +                                              f+i_coord_offset,fshift+i_shift_offset);
 +        /*             #define OUTERFLOPS OUTERFLOPS+24    */
 +        /*         #else                               */
 +        gmx_mm_update_iforce_3atom_swizzle_ps(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
 +                                              f+i_coord_offset+DIM,fshift+i_shift_offset);
 +        /*             #define OUTERFLOPS OUTERFLOPS+18    */
 +        /*         #endif                              */
 +        /*     #endif                                  */
 +        /* #endif                                      */
 +
 +        /* #if 'Potential' in KERNEL_VF */
 +        ggid                        = gid[iidx];
 +        /* Update potential energies */
 +        /*     #if KERNEL_ELEC != 'None' */
 +        gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
 +        /*         #define OUTERFLOPS OUTERFLOPS+1 */
 +        /*     #endif */
 +        /*     #if 'GeneralizedBorn' in KERNEL_ELEC */
 +        gmx_mm_update_1pot_ps(vgbsum,kernel_data->energygrp_polarization+ggid);
 +        /*         #define OUTERFLOPS OUTERFLOPS+1 */
 +        /*     #endif */
 +        /*     #if KERNEL_VDW != 'None' */
 +        gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
 +        /*         #define OUTERFLOPS OUTERFLOPS+1 */
 +        /*     #endif */
 +        /* #endif */
 +        /*     #if 'GeneralizedBorn' in KERNEL_ELEC and 'Force' in KERNEL_VF */
 +        dvdasum = _mm_mul_ps(dvdasum, _mm_mul_ps(isai{I},isai{I}));
 +        gmx_mm_update_1pot_ps(dvdasum,dvda+inr);
 +        /*     #endif */
 +
 +        /* Increment number of inner iterations */
 +        inneriter                  += j_index_end - j_index_start;
 +
 +        /* Outer loop uses {OUTERFLOPS} flops */
 +    }
 +
 +    /* Increment number of outer iterations */
 +    outeriter        += nri;
 +
 +    /* Update outer/inner flops */
 +    /* ## NB: This is not important, it just affects the flopcount. However, since our preprocessor is */
 +    /* ## primitive and replaces aggressively even in strings inside these directives, we need to      */
 +    /* ## assemble the main part of the name (containing KERNEL/ELEC/VDW) directly in the source.      */
 +    /* #if GEOMETRY_I == 'Water3'            */
 +    /*     #define ISUFFIX '_W3'             */
 +    /* #elif GEOMETRY_I == 'Water4'          */
 +    /*     #define ISUFFIX '_W4'             */
 +    /* #else                                 */
 +    /*     #define ISUFFIX ''                */
 +    /* #endif                                */
 +    /* #if GEOMETRY_J == 'Water3'            */
 +    /*     #define JSUFFIX 'W3'              */
 +    /* #elif GEOMETRY_J == 'Water4'          */
 +    /*     #define JSUFFIX 'W4'              */
 +    /* #else                                 */
 +    /*     #define JSUFFIX ''                */
 +    /* #endif                                */
 +    /* #if 'PotentialAndForce' in KERNEL_VF  */
 +    /*     #define VFSUFFIX  '_VF'           */
 +    /* #elif 'Potential' in KERNEL_VF        */
 +    /*     #define VFSUFFIX '_V'             */
 +    /* #else                                 */
 +    /*     #define VFSUFFIX '_F'             */
 +    /* #endif                                */
 +
 +    /* #if KERNEL_ELEC != 'None' and KERNEL_VDW != 'None' */
 +    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW{ISUFFIX}{JSUFFIX}{VFSUFFIX},outeriter*{OUTERFLOPS} + inneriter*{INNERFLOPS});
 +    /* #elif KERNEL_ELEC != 'None' */
 +    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC{ISUFFIX}{JSUFFIX}{VFSUFFIX},outeriter*{OUTERFLOPS} + inneriter*{INNERFLOPS});
 +    /* #else */
 +    inc_nrnb(nrnb,eNR_NBKERNEL_VDW{ISUFFIX}{JSUFFIX}{VFSUFFIX},outeriter*{OUTERFLOPS} + inneriter*{INNERFLOPS});
 +    /* #endif  */
 +}
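
The suffix macros above exist so that the expanded kernel charges its work to the correct flop counter. As a rough illustration only (the counter name and flop totals below are hypothetical, for a Water3 i-geometry, particle j-geometry, potential-and-force kernel; the generator computes the real values):

    /* Hypothetical expansion of the final template directive: */
    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_VF,outeriter*32 + inneriter*88);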
index d57d9ac7e85c3f72bb21549a300c1cfe077d74ec,0000000000000000000000000000000000000000..9fa8a05b486e5c9dc7c565957c76593b60925116
mode 100644,000000..100644
--- /dev/null
@@@ -1,713 -1,0 +1,715 @@@
-                         "the number of orientation restraints");
 +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-  
 + *
 + * 
 + *                This source code is part of
 + * 
 + *                 G   R   O   M   A   C   S
 + * 
 + *          GROningen MAchine for Chemical Simulations
 + * 
 + *                        VERSION 3.2.0
 + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
 + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 + * Copyright (c) 2001-2004, The GROMACS development team,
 + * check out http://www.gromacs.org for more information.
 +
 + * This program is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU General Public License
 + * as published by the Free Software Foundation; either version 2
 + * of the License, or (at your option) any later version.
 + * 
 + * If you want to redistribute modifications, please consider that
 + * scientific software is very special. Version control is crucial -
 + * bugs must be traceable. We will be happy to consider code for
 + * inclusion in the official distribution, but derived work must not
 + * be called official GROMACS. Details are found in the README & COPYING
 + * files - if they are missing, get the official version at www.gromacs.org.
 + * 
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the papers on the package - you can find them in the top README file.
 + * 
 + * For more info, check our website at http://www.gromacs.org
 + * 
 + * And Hey:
 + * GROningen Mixture of Alchemy and Childrens' Stories
 + */
 +#ifdef HAVE_CONFIG_H
 +#include <config.h>
 +#endif
 +
 +#include "typedefs.h"
 +#include "smalloc.h"
 +#include "vec.h"
 +#include "nrjac.h"
 +#include "network.h"
 +#include "orires.h"
 +#include "do_fit.h"
 +#include "main.h"
 +#include "copyrite.h"
 +#include "pbc.h"
 +#include "mtop_util.h"
 +
 +void init_orires(FILE *fplog,const gmx_mtop_t *mtop,
 +                 rvec xref[],
 +                 const t_inputrec *ir,
 +                 const gmx_multisim_t *ms,t_oriresdata *od,
 +                 t_state *state)
 +{
 +    int    i,j,d,ex,nmol,nr,*nr_ex;
 +    double mtot;
 +    rvec   com;
 +    gmx_mtop_ilistloop_t iloop;
 +    t_ilist *il;
 +    gmx_mtop_atomloop_all_t aloop;
 +    t_atom *atom;
 +
 +    od->fc  = ir->orires_fc;
 +    od->nex = 0;
 +    od->S   = NULL;
 +
 +    od->M=NULL;
 +    od->eig=NULL;
 +    od->v=NULL;
 +
 +    od->nr = gmx_mtop_ftype_count(mtop,F_ORIRES);
 +    if (od->nr == 0)
 +    {
 +        return;
 +    }
 +    
 +    nr_ex = NULL;
 +    
 +    iloop = gmx_mtop_ilistloop_init(mtop);
 +    while (gmx_mtop_ilistloop_next(iloop,&il,&nmol))
 +    {
 +        for(i=0; i<il[F_ORIRES].nr; i+=3)
 +        {
 +            ex = mtop->ffparams.iparams[il[F_ORIRES].iatoms[i]].orires.ex;
 +            if (ex >= od->nex)
 +            {
 +                srenew(nr_ex,ex+1);
 +                for(j=od->nex; j<ex+1; j++)
 +                {
 +                    nr_ex[j] = 0;
 +                }
 +                od->nex = ex+1;
 +            }
 +            nr_ex[ex]++;
 +        }
 +    }
 +    snew(od->S,od->nex);
 +    /* When not doing time averaging, the instantaneous and time-averaged data
 +     * are identical and the pointers can point to the same memory.
 +     */
 +    snew(od->Dinsl,od->nr);
 +    if (ms)
 +    {
 +        snew(od->Dins,od->nr);
 +    }
 +    else
 +    {
 +        od->Dins = od->Dinsl;
 +    }
 +
 +    if (ir->orires_tau == 0)
 +    {
 +        od->Dtav = od->Dins;
 +        od->edt  = 0.0;
 +        od->edt_1= 1.0;
 +    }
 +    else
 +    {
 +        snew(od->Dtav,od->nr);
 +        od->edt  = exp(-ir->delta_t/ir->orires_tau);
 +        od->edt_1= 1.0 - od->edt;
 +
 +        /* Extend the state with the orires history */
 +        state->flags |= (1<<estORIRE_INITF);
 +        state->hist.orire_initf = 1;
 +        state->flags |= (1<<estORIRE_DTAV);
 +        state->hist.norire_Dtav = od->nr*5;
 +        snew(state->hist.orire_Dtav,state->hist.norire_Dtav);
 +    }
 +
 +    snew(od->oinsl,od->nr);
 +    if (ms)
 +    {
 +        snew(od->oins,od->nr);
 +    }
 +    else
 +    {
 +        od->oins = od->oinsl;
 +    }
 +    if (ir->orires_tau == 0) {
 +        od->otav = od->oins;
 +    }
 +    else
 +    {
 +        snew(od->otav,od->nr);
 +    }
 +    snew(od->tmp,od->nex);
 +    snew(od->TMP,od->nex);
 +    for(ex=0; ex<od->nex; ex++)
 +    {
 +        snew(od->TMP[ex],5);
 +        for(i=0; i<5; i++)
 +        {
 +            snew(od->TMP[ex][i],5);
 +        }
 +    }
 +    
 +    od->nref = 0;
 +    for(i=0; i<mtop->natoms; i++)
 +    {
 +        if (ggrpnr(&mtop->groups,egcORFIT,i) == 0)
 +        {
 +            od->nref++;
 +        }
 +    }
 +    snew(od->mref,od->nref);
 +    snew(od->xref,od->nref);
 +    snew(od->xtmp,od->nref);
 +    
 +    snew(od->eig,od->nex*12);
 +    
 +    /* Determine the reference structure on the master node.
 +     * Copy it to the other nodes after checking multi compatibility,
 +     * so we are sure the subsystems match before copying.
 +     */
 +    clear_rvec(com);
 +    mtot = 0.0;
 +    j = 0;
 +    aloop = gmx_mtop_atomloop_all_init(mtop);
 +    while(gmx_mtop_atomloop_all_next(aloop,&i,&atom))
 +    {
 +        if (mtop->groups.grpnr[egcORFIT] == NULL ||
 +            mtop->groups.grpnr[egcORFIT][i] == 0)
 +        {
 +            /* Not correct for free-energy with changing masses */
 +            od->mref[j] = atom->m;
 +            if (ms==NULL || MASTERSIM(ms))
 +            {
 +                copy_rvec(xref[i],od->xref[j]);
 +                for(d=0; d<DIM; d++)
 +                {
 +                    com[d] += od->mref[j]*xref[i][d];
 +                }
 +            }
 +            mtot += od->mref[j];
 +            j++;
 +        }
 +    }
 +    svmul(1.0/mtot,com,com);
 +    if (ms==NULL || MASTERSIM(ms))
 +    {
 +        for(j=0; j<od->nref; j++)
 +        {
 +            rvec_dec(od->xref[j],com);
 +        }
 +    }
 +    
 +    fprintf(fplog,"Found %d orientation experiments\n",od->nex);
 +    for(i=0; i<od->nex; i++)
 +    {
 +        fprintf(fplog,"  experiment %d has %d restraints\n",i+1,nr_ex[i]);
 +    }
 +    
 +    sfree(nr_ex);
 +    
 +    fprintf(fplog,"  the fit group consists of %d atoms and has total mass %g\n",
 +            od->nref,mtot);
 +    
 +    if (ms)
 +    {
 +        fprintf(fplog,"  the orientation restraints are ensemble averaged over %d systems\n",ms->nsim);
 +        
 +        check_multi_int(fplog,ms,od->nr,
-                         "the number of fit atoms for orientation restraining");
-         check_multi_int(fplog,ms,ir->nsteps,"nsteps");
++                        "the number of orientation restraints",
++                        FALSE);
 +        check_multi_int(fplog,ms,od->nref,
++                        "the number of fit atoms for orientation restraining",
++                        FALSE);
++        check_multi_int(fplog,ms,ir->nsteps,"nsteps",FALSE);
 +        /* Copy the reference coordinates from the master to the other nodes */
 +        gmx_sum_sim(DIM*od->nref,od->xref[0],ms);
 +    }
 +    
 +    please_cite(fplog,"Hess2003");
 +}
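
For reference, the edt/edt_1 pair set up above implements an exponential moving average with time constant orires_tau; a minimal self-contained sketch (not GROMACS API) of the update later applied to Dtav in calc_orires_dev:

    #include <math.h>

    /* Sketch: exponential time average with time constant tau,
     * updated every delta_t; mirrors Dtav = edt*Dtav_old + edt_1*Dins. */
    static double update_time_average(double dtav_old, double dins,
                                      double delta_t, double tau)
    {
        double edt = exp(-delta_t/tau);        /* od->edt           */
        return edt*dtav_old + (1.0-edt)*dins;  /* od->edt_1 = 1-edt */
    }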
 +
 +void diagonalize_orires_tensors(t_oriresdata *od)
 +{
 +    int           ex,i,j,nrot,ord[DIM],t;
 +    matrix        S,TMP;
 +    
 +    if (od->M == NULL)
 +    {
 +        snew(od->M,DIM);
 +        for(i=0; i<DIM; i++)
 +        {
 +            snew(od->M[i],DIM);
 +        }
 +        snew(od->eig_diag,DIM);
 +        snew(od->v,DIM);
 +        for(i=0; i<DIM; i++)
 +        {
 +            snew(od->v[i],DIM);
 +        }
 +    }
 +
 +    for(ex=0; ex<od->nex; ex++)
 +    {
 +        /* Rotate the S tensor back to the reference frame */
 +        mmul(od->R,od->S[ex],TMP);
 +        mtmul(TMP,od->R,S);
 +        for(i=0; i<DIM; i++)
 +        {
 +            for(j=0; j<DIM; j++)
 +            {
 +                od->M[i][j] = S[i][j];
 +            }
 +        }
 +        
 +        jacobi(od->M,DIM,od->eig_diag,od->v,&nrot);
 +        
 +        for(i=0; i<DIM; i++)
 +        {
 +            ord[i] = i;
 +        }
 +        for(i=0; i<DIM; i++)
 +        {
 +            for(j=i+1; j<DIM; j++)
 +            {
 +                if (sqr(od->eig_diag[ord[j]]) > sqr(od->eig_diag[ord[i]]))
 +                {
 +                    t = ord[i];
 +                    ord[i] = ord[j];
 +                    ord[j] = t;
 +                }
 +            }
 +        }
 +            
 +        for(i=0; i<DIM; i++)
 +        {
 +            od->eig[ex*12 + i] = od->eig_diag[ord[i]];
 +        }
 +        for(i=0; i<DIM; i++)
 +        {
 +            for(j=0; j<DIM; j++)
 +            {
 +                od->eig[ex*12 + 3 + 3*i + j] = od->v[j][ord[i]];
 +            }
 +        }
 +    }
 +}
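
The layout written into od->eig above is 12 reals per experiment: the three eigenvalues sorted by decreasing magnitude, followed by the matching eigenvectors:

    /* od->eig layout for experiment ex:
     *   od->eig[ex*12 + i]           : i-th eigenvalue (largest |value| first)
     *   od->eig[ex*12 + 3 + 3*i + j] : j-th component of the i-th eigenvector
     */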
 +
 +void print_orires_log(FILE *log,t_oriresdata *od)
 +{
 +    int  ex,i;
 +    real *eig;      
 +    
 +    diagonalize_orires_tensors(od);
 +    
 +    for(ex=0; ex<od->nex; ex++)
 +    {
 +        eig = od->eig + ex*12;
 +        fprintf(log,"  Orientation experiment %d:\n",ex+1);
 +        fprintf(log,"    order parameter: %g\n",eig[0]);
 +        for(i=0; i<DIM; i++)
 +        {
 +            fprintf(log,"    eig: %6.3f   %6.3f %6.3f %6.3f\n",
 +                    (eig[0] != 0) ? eig[i]/eig[0] : eig[i],
 +                    eig[DIM+i*DIM+XX],
 +                    eig[DIM+i*DIM+YY],
 +                    eig[DIM+i*DIM+ZZ]);
 +        }
 +        fprintf(log,"\n");
 +    }
 +}
 +
 +real calc_orires_dev(const gmx_multisim_t *ms,
 +                     int nfa,const t_iatom forceatoms[],const t_iparams ip[],
 +                     const t_mdatoms *md,const rvec x[],const t_pbc *pbc,
 +                     t_fcdata *fcd,history_t *hist)
 +{
 +    int          fa,d,i,j,type,ex,nref;
 +    real         edt,edt_1,invn,pfac,r2,invr,corrfac,weight,wsv2,sw,dev;
 +    tensor       *S,R,TMP;
 +    rvec5        *Dinsl,*Dins,*Dtav,*rhs;
 +    real         *mref,***T;
 +    double       mtot;
 +    rvec         *xref,*xtmp,com,r_unrot,r;
 +    t_oriresdata *od;
 +    gmx_bool         bTAV;
 +    const real   two_thr=2.0/3.0;
 +    
 +    od = &(fcd->orires);
 +
 +    if (od->nr == 0)
 +    {
 +        /* This means that this is not the master node */
 +        gmx_fatal(FARGS,"Orientation restraints are only supported on the master node, use fewer processors");
 +    }
 +    
 +    bTAV = (od->edt != 0);
 +    edt  = od->edt;
 +    edt_1= od->edt_1;
 +    S    = od->S;
 +    Dinsl= od->Dinsl;
 +    Dins = od->Dins;
 +    Dtav = od->Dtav;
 +    T    = od->TMP;
 +    rhs  = od->tmp;
 +    nref = od->nref;
 +    mref = od->mref;
 +    xref = od->xref;
 +    xtmp = od->xtmp;
 +    
 +    if (bTAV)
 +    {
 +        od->exp_min_t_tau = hist->orire_initf*edt;
 +        
 +        /* Correction factor for the lack of history
 +         * at short times.
 +         */
 +        corrfac = 1.0/(1.0 - od->exp_min_t_tau);
 +    }
 +    else
 +    {
 +        corrfac = 1.0;
 +    }
 +
 +    if (ms)
 +    {
 +        invn = 1.0/ms->nsim;
 +    }
 +    else
 +    {
 +        invn = 1.0;
 +    }
 +    
 +    clear_rvec(com);
 +    mtot = 0;
 +    j=0;
 +    for(i=0; i<md->nr; i++)
 +    {
 +        if (md->cORF[i] == 0)
 +        {
 +            copy_rvec(x[i],xtmp[j]);
 +            mref[j] = md->massT[i];
 +            for(d=0; d<DIM; d++)
 +            {
 +                com[d] += mref[j]*xref[j][d];
 +            }
 +            mtot += mref[j];
 +            j++;
 +        }
 +    }
 +    svmul(1.0/mtot,com,com);
 +    for(j=0; j<nref; j++)
 +    {
 +        rvec_dec(xtmp[j],com);
 +    }
 +    /* Calculate the rotation matrix to rotate x to the reference orientation */
 +    calc_fit_R(DIM,nref,mref,xref,xtmp,R);
 +    copy_mat(R,od->R);
 +    
 +    d = 0;
 +    for(fa=0; fa<nfa; fa+=3)
 +    {
 +        type = forceatoms[fa];
 +        if (pbc)
 +        {
 +            pbc_dx_aiuc(pbc,x[forceatoms[fa+1]],x[forceatoms[fa+2]],r_unrot);
 +        }
 +        else
 +        {
 +            rvec_sub(x[forceatoms[fa+1]],x[forceatoms[fa+2]],r_unrot);
 +        }
 +        mvmul(R,r_unrot,r);
 +        r2   = norm2(r);
 +        invr = gmx_invsqrt(r2);
 +        /* Calculate the prefactor for the D tensor; this includes the factor 3. */
 +        pfac = ip[type].orires.c*invr*invr*3;
 +        for(i=0; i<ip[type].orires.power; i++)
 +        {
 +            pfac *= invr;
 +        }
 +        Dinsl[d][0] = pfac*(2*r[0]*r[0] + r[1]*r[1] - r2);
 +        Dinsl[d][1] = pfac*(2*r[0]*r[1]);
 +        Dinsl[d][2] = pfac*(2*r[0]*r[2]);
 +        Dinsl[d][3] = pfac*(2*r[1]*r[1] + r[0]*r[0] - r2);
 +        Dinsl[d][4] = pfac*(2*r[1]*r[2]);
 +        
 +        if (ms)
 +        {
 +            for(i=0; i<5; i++)
 +            {
 +                Dins[d][i] = Dinsl[d][i]*invn;
 +            }
 +        }
 +
 +        d++;
 +    }
 +  
 +    if (ms)
 +    {
 +        gmx_sum_sim(5*od->nr,Dins[0],ms);
 +    }
 +   
 +    /* Calculate the order tensor S for each experiment via optimization */
 +    for(ex=0; ex<od->nex; ex++)
 +    {
 +        for(i=0; i<5; i++)
 +        {
 +            rhs[ex][i] = 0;
 +            for(j=0; j<=i; j++)
 +            {
 +                T[ex][i][j] = 0;
 +            }
 +        }
 +    }
 +    d = 0;
 +    for(fa=0; fa<nfa; fa+=3)
 +    {
 +        if (bTAV)
 +        {
 +            /* Here we update Dtav in t_fcdata using the data in history_t.
 +             * Thus the results stay correct when this routine
 +             * is called multiple times.
 +             */
 +            for(i=0; i<5; i++)
 +            {
 +                Dtav[d][i] = edt*hist->orire_Dtav[d*5+i] + edt_1*Dins[d][i];
 +            }
 +        }
 +        
 +        type   = forceatoms[fa];
 +        ex     = ip[type].orires.ex;
 +        weight = ip[type].orires.kfac;
 +        /* Calculate the vector rhs and half the matrix T for the 5 equations */
 +        for(i=0; i<5; i++) {
 +            rhs[ex][i] += Dtav[d][i]*ip[type].orires.obs*weight;
 +            for(j=0; j<=i; j++)
 +            {
 +                T[ex][i][j] += Dtav[d][i]*Dtav[d][j]*weight;
 +            }
 +        }
 +        d++;
 +    }
 +    /* Now that we have all the data, we can calculate S */
 +    for(ex=0; ex<od->nex; ex++)
 +    {
 +        /* Apply corrfac and copy one half of T to the other half */
 +        for(i=0; i<5; i++)
 +        {
 +            rhs[ex][i]  *= corrfac;
 +            T[ex][i][i] *= sqr(corrfac);
 +            for(j=0; j<i; j++)
 +            {
 +                T[ex][i][j] *= sqr(corrfac);
 +                T[ex][j][i]  = T[ex][i][j];
 +            }
 +        }
 +        m_inv_gen(T[ex],5,T[ex]);
 +        /* Calculate the orientation tensor S for this experiment */
 +        S[ex][0][0] = 0;
 +        S[ex][0][1] = 0;
 +        S[ex][0][2] = 0;
 +        S[ex][1][1] = 0;
 +        S[ex][1][2] = 0;
 +        for(i=0; i<5; i++)
 +        {
 +            S[ex][0][0] += 1.5*T[ex][0][i]*rhs[ex][i];
 +            S[ex][0][1] += 1.5*T[ex][1][i]*rhs[ex][i];
 +            S[ex][0][2] += 1.5*T[ex][2][i]*rhs[ex][i];
 +            S[ex][1][1] += 1.5*T[ex][3][i]*rhs[ex][i];
 +            S[ex][1][2] += 1.5*T[ex][4][i]*rhs[ex][i];
 +        }
 +        S[ex][1][0] = S[ex][0][1];
 +        S[ex][2][0] = S[ex][0][2];
 +        S[ex][2][1] = S[ex][1][2];
 +        S[ex][2][2] = -S[ex][0][0] - S[ex][1][1];
 +    }
 +    
 +    wsv2 = 0;
 +    sw   = 0;
 +    
 +    d = 0;
 +    for(fa=0; fa<nfa; fa+=3)
 +    {
 +        type = forceatoms[fa];
 +        ex = ip[type].orires.ex;
 +        
 +        od->otav[d] = two_thr*
 +            corrfac*(S[ex][0][0]*Dtav[d][0] + S[ex][0][1]*Dtav[d][1] +
 +                     S[ex][0][2]*Dtav[d][2] + S[ex][1][1]*Dtav[d][3] +
 +                     S[ex][1][2]*Dtav[d][4]);
 +        if (bTAV)
 +        {
 +            od->oins[d] = two_thr*(S[ex][0][0]*Dins[d][0] + S[ex][0][1]*Dins[d][1] +
 +                                   S[ex][0][2]*Dins[d][2] + S[ex][1][1]*Dins[d][3] +
 +                                   S[ex][1][2]*Dins[d][4]);
 +        }
 +        if (ms)
 +        {
 +            /* When ensemble averaging is used, recalculate the local orientation
 +             * for output to the energy file.
 +             */
 +            od->oinsl[d] = two_thr*
 +                (S[ex][0][0]*Dinsl[d][0] + S[ex][0][1]*Dinsl[d][1] +
 +                 S[ex][0][2]*Dinsl[d][2] + S[ex][1][1]*Dinsl[d][3] +
 +                 S[ex][1][2]*Dinsl[d][4]);
 +        }
 +        
 +        dev = od->otav[d] - ip[type].orires.obs;
 +        
 +        wsv2 += ip[type].orires.kfac*sqr(dev);
 +        sw   += ip[type].orires.kfac;
 +        
 +        d++;
 +    }
 +    od->rmsdev = sqrt(wsv2/sw);
 +    
 +    /* Rotate the S matrices back, so we get the correct grad(tr(S D)) */
 +    for(ex=0; ex<od->nex; ex++)
 +    {
 +        tmmul(R,S[ex],TMP);
 +        mmul(TMP,R,S[ex]);
 +    }
 +    
 +    return od->rmsdev;
 +    
 +    /* Approx. 120*nfa/3 flops */
 +}
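
For each experiment, the accumulation loop above builds the weighted normal equations of a 5-parameter least-squares fit for the order tensor; as a summary of the code (with w_d the per-restraint kfac):

    T_{ij} = \sum_d w_d D_{d,i} D_{d,j}, \qquad
    b_i    = \sum_d w_d D_{d,i} D_d^{\mathrm{obs}}, \qquad
    s      = \tfrac{3}{2} T^{-1} b

The full 3x3 tensor is then completed by symmetry, with S_zz = -(S_xx + S_yy) enforcing tracelessness.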
 +
 +real orires(int nfa,const t_iatom forceatoms[],const t_iparams ip[],
 +            const rvec x[],rvec f[],rvec fshift[],
 +            const t_pbc *pbc,const t_graph *g,
 +            real lambda,real *dvdlambda,
 +            const t_mdatoms *md,t_fcdata *fcd,
 +            int *global_atom_index)
 +{
 +    atom_id      ai,aj;
 +    int          fa,d,i,type,ex,power,ki=CENTRAL;
 +    ivec         dt;
 +    real         r2,invr,invr2,fc,smooth_fc,dev,devins,pfac;
 +    rvec         r,Sr,fij;
 +    real         vtot;
 +    const t_oriresdata *od;
 +    gmx_bool         bTAV;
 +    
 +    vtot = 0;
 +    od = &(fcd->orires);
 +    
 +    if (od->fc != 0)
 +    {
 +        bTAV = (od->edt != 0);
 +
 +        smooth_fc = od->fc;
 +        if (bTAV)
 +        {
 +            /* Smoothly switch on the restraining when time averaging is used */
 +            smooth_fc *= (1.0 - od->exp_min_t_tau);
 +        }
 +        
 +        d = 0;
 +        for(fa=0; fa<nfa; fa+=3)
 +        {
 +            type  = forceatoms[fa];
 +            ai    = forceatoms[fa+1];
 +            aj    = forceatoms[fa+2];
 +            if (pbc)
 +            {
 +                ki = pbc_dx_aiuc(pbc,x[ai],x[aj],r);
 +            }
 +            else
 +            {
 +                rvec_sub(x[ai],x[aj],r);
 +            }
 +            r2    = norm2(r);
 +            invr  = gmx_invsqrt(r2);
 +            invr2 = invr*invr;
 +            ex    = ip[type].orires.ex;
 +            power = ip[type].orires.power;
 +            fc    = smooth_fc*ip[type].orires.kfac;
 +            dev   = od->otav[d] - ip[type].orires.obs;
 +            
 +            /* NOTE:
 +             * there is no real potential when time averaging is applied
 +             */
 +            vtot += 0.5*fc*sqr(dev);
 +            
 +            if (bTAV)
 +            {
 +                /* Calculate the force from the signed geometric mean of the
 +                 * time-averaged and instantaneous deviations */
 +                devins = od->oins[d] - ip[type].orires.obs;
 +                if (dev*devins <= 0)
 +                {
 +                    dev = 0;
 +                }
 +                else
 +                {
 +                    dev = sqrt(dev*devins);
 +                    if (devins < 0)
 +                    {
 +                        dev = -dev;
 +                    }
 +                }
 +            }
 +            
 +            pfac  = fc*ip[type].orires.c*invr2;
 +            for(i=0; i<power; i++)
 +            {
 +                pfac *= invr;
 +            }
 +            mvmul(od->S[ex],r,Sr);
 +            for(i=0; i<DIM; i++)
 +            {
 +                fij[i] =
 +                    -pfac*dev*(4*Sr[i] - 2*(2+power)*invr2*iprod(Sr,r)*r[i]);
 +            }
 +            
 +            if (g)
 +            {
 +                ivec_sub(SHIFT_IVEC(g,ai),SHIFT_IVEC(g,aj),dt);
 +                ki=IVEC2IS(dt);
 +            }
 +            
 +            for(i=0; i<DIM; i++)
 +            {
 +                f[ai][i]           += fij[i];
 +                f[aj][i]           -= fij[i];
 +                fshift[ki][i]      += fij[i];
 +                fshift[CENTRAL][i] -= fij[i];
 +            }
 +            d++;
 +        }
 +    }
 +    
 +    return vtot;
 +    
 +    /* Approx. 80*nfa/3 flops */
 +}
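
In equation form, the restraint energy accumulated in vtot above is

    V = \tfrac{1}{2} k^{\mathrm{fc}} \sum_d k_d \left(\theta_d - \theta_d^{\mathrm{exp}}\right)^2

where k^{fc} is od->fc (smoothly switched on by 1 - e^{-t/\tau} when time averaging is active) and the per-pair force in the loop is its negative gradient; under time averaging the deviation entering the force is the signed geometric mean of the time-averaged and instantaneous deviations computed in the bTAV branch. (This is a reading of the code above, not a citation of external documentation.)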
 +
 +void update_orires_history(t_fcdata *fcd,history_t *hist)
 +{
 +    t_oriresdata *od;
 +    int pair,i;
 +    
 +    od = &(fcd->orires);
 +    if (od->edt != 0)
 +    {
 +        /* Copy the new time averages that have been calculated
 +         *  in calc_orires_dev.
 +         */
 +        hist->orire_initf = od->exp_min_t_tau;
 +        for(pair=0; pair<od->nr; pair++)
 +        {
 +            for(i=0; i<5; i++)
 +            {
 +                hist->orire_Dtav[pair*5+i] = od->Dtav[pair][i];
 +            }
 +        }
 +    }
 +}
index dc6eebf60822c24d9f34aa7953ce415190c589a4,0000000000000000000000000000000000000000..bd7a9698afe4f265c61695a90ad992274d383814
mode 100644,000000..100644
--- /dev/null
@@@ -1,2944 -1,0 +1,2999 @@@
- static const int tpx_version = 91;
 +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
 + *
 + * 
 + *                This source code is part of
 + * 
 + *                 G   R   O   M   A   C   S
 + * 
 + *          GROningen MAchine for Chemical Simulations
 + * 
 + *                        VERSION 3.2.0
 + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
 + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 + * Copyright (c) 2001-2004, The GROMACS development team,
 + * check out http://www.gromacs.org for more information.
 +
 + * This program is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU General Public License
 + * as published by the Free Software Foundation; either version 2
 + * of the License, or (at your option) any later version.
 + * 
 + * If you want to redistribute modifications, please consider that
 + * scientific software is very special. Version control is crucial -
 + * bugs must be traceable. We will be happy to consider code for
 + * inclusion in the official distribution, but derived work must not
 + * be called official GROMACS. Details are found in the README & COPYING
 + * files - if they are missing, get the official version at www.gromacs.org.
 + * 
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the papers on the package - you can find them in the top README file.
 + * 
 + * For more info, check our website at http://www.gromacs.org
 + * 
 + * And Hey:
 + * GROningen Mixture of Alchemy and Childrens' Stories
 + */
 +#ifdef HAVE_CONFIG_H
 +#include <config.h>
 +#endif
 +
 +/* This file is completely threadsafe - keep it that way! */
 +#ifdef GMX_THREAD_MPI
 +#include <thread_mpi.h>
 +#endif
 +
 +
 +#include <ctype.h>
 +#include "sysstuff.h"
 +#include "smalloc.h"
 +#include "string2.h"
 +#include "gmx_fatal.h"
 +#include "macros.h"
 +#include "names.h"
 +#include "symtab.h"
 +#include "futil.h"
 +#include "filenm.h"
 +#include "gmxfio.h"
 +#include "topsort.h"
 +#include "tpxio.h"
 +#include "txtdump.h"
 +#include "confio.h"
 +#include "atomprop.h"
 +#include "copyrite.h"
 +#include "vec.h"
 +#include "mtop_util.h"
 +
 +#define TPX_TAG_RELEASE  "release"
 +
 +/* This is the tag string which is stored in the tpx file.
 + * Change this if you want to change the tpx format in a feature branch.
 + * This ensures that there will not be different tpx formats around
 + * that cannot be distinguished.
 + */
 +static const char *tpx_tag = TPX_TAG_RELEASE;
 +
 +/* This number should be increased whenever the file format changes! */
- static void do_expandedvals(t_fileio *fio,t_expanded *expand,int n_lambda, gmx_bool bRead, int file_version)
++static const int tpx_version = 92;
 +
 +/* This number should only be increased when you edit the TOPOLOGY section
 + * or the HEADER of the tpx format.
 + * This way we can also maintain forward compatibility for all analysis tools
 + * and/or external programs that only need to know the atom/residue names,
 + * charges, and bond connectivity.
 + *  
 + * It first appeared in tpx version 26, when I also moved the input record
 + * to the end of the tpx file, so we can just skip it if we only
 + * want the topology.
 + */
 +static const int tpx_generation = 25;
 +
 +/* This number should be the most recent backwards-incompatible version.
 + * I.e., if this number is 9, we cannot read tpx version 9 with this code.
 + */
 +static const int tpx_incompatible_version = 9;
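
Taken together, these constants gate what a reader accepts; a hedged summary of the logic (the actual check lives in the tpx header reading code, which is not part of this hunk):

    /* Sketch of the implied version gate, with fver/fgen the version and
     * generation read from the file:
     *   fver <= tpx_incompatible_version               -> fatal error
     *   fver >  tpx_version && fgen >  tpx_generation  -> fatal error
     *   fver >  tpx_version && fgen <= tpx_generation  -> topology readable
     *   otherwise                                      -> fully readable
     */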
 +
 +
 +
 +/* Struct used to maintain tpx compatibility when function types are added */
 +typedef struct {
 +  int fvnr; /* file version number in which the function type first appeared */
 +  int ftype; /* function type */
 +} t_ftupd;
 +
 +/* 
 + * The entries should be ordered in:
 + * 1. ascending file version number
 + * 2. ascending function type number
 + */
 +/*static const t_ftupd ftupd[] = {
 +  { 20, F_CUBICBONDS        },
 +  { 20, F_CONNBONDS         },
 +  { 20, F_HARMONIC          },
 +  { 20, F_EQM,              },
 +  { 22, F_DISRESVIOL        },
 +  { 22, F_ORIRES            },
 +  { 22, F_ORIRESDEV         },
 +  { 26, F_FOURDIHS          },
 +  { 26, F_PIDIHS            },
 +  { 26, F_DIHRES            },
 +  { 26, F_DIHRESVIOL        },
 +  { 30, F_CROSS_BOND_BONDS  },
 +  { 30, F_CROSS_BOND_ANGLES },
 +  { 30, F_UREY_BRADLEY      },
 +  { 30, F_POLARIZATION      },
 +  { 54, F_DHDL_CON          },
 +  };*/
 +/* 
 + * The entries should be ordered in:
 + * 1. ascending function type number
 + * 2. ascending file version number
 + */
 +/* question: what is the purpose of the commented-out code above? */
 +static const t_ftupd ftupd[] = {
 +  { 20, F_CUBICBONDS        },
 +  { 20, F_CONNBONDS         },
 +  { 20, F_HARMONIC          },
 +  { 34, F_FENEBONDS         },
 +  { 43, F_TABBONDS          },
 +  { 43, F_TABBONDSNC        },
 +  { 70, F_RESTRBONDS        },
 +  { 76, F_LINEAR_ANGLES     },
 +  { 30, F_CROSS_BOND_BONDS  },
 +  { 30, F_CROSS_BOND_ANGLES },
 +  { 30, F_UREY_BRADLEY      },
 +  { 34, F_QUARTIC_ANGLES    },
 +  { 43, F_TABANGLES         },
 +  { 26, F_FOURDIHS          },
 +  { 26, F_PIDIHS            },
 +  { 43, F_TABDIHS           },
 +  { 65, F_CMAP              },
 +  { 60, F_GB12              },
 +  { 61, F_GB13              },
 +  { 61, F_GB14              },        
 +  { 72, F_GBPOL             },
 +  { 72, F_NPSOLVATION       },
 +  { 41, F_LJC14_Q           },
 +  { 41, F_LJC_PAIRS_NB      },
 +  { 32, F_BHAM_LR           },
 +  { 32, F_RF_EXCL           },
 +  { 32, F_COUL_RECIP        },
 +  { 46, F_DPD               },
 +  { 30, F_POLARIZATION      },
 +  { 36, F_THOLE_POL         },
 +  { 80, F_FBPOSRES          },
 +  { 22, F_DISRESVIOL        },
 +  { 22, F_ORIRES            },
 +  { 22, F_ORIRESDEV         },
 +  { 26, F_DIHRES            },
 +  { 26, F_DIHRESVIOL        },
 +  { 49, F_VSITE4FDN         },
 +  { 50, F_VSITEN            },
 +  { 46, F_COM_PULL          },
 +  { 20, F_EQM               },
 +  { 46, F_ECONSERVED        },
 +  { 69, F_VTEMP_NOLONGERUSED},
 +  { 66, F_PDISPCORR         },
 +  { 54, F_DHDL_CON          },
 +  { 76, F_ANHARM_POL        },
 +  { 79, F_DVDL_COUL         },
 +  { 79, F_DVDL_VDW          },
 +  { 79, F_DVDL_BONDED       },
 +  { 79, F_DVDL_RESTRAINT    },
 +  { 79, F_DVDL_TEMPERATURE  }
 +};
 +#define NFTUPD asize(ftupd)
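
The ftupd table is how newer readers stay compatible with older files: an interaction type is only expected in a file whose version is at least the version in which that type first appeared. A hypothetical helper (not the actual GROMACS function) showing the intended lookup:

    /* Hypothetical illustration of consulting ftupd; the real reader walks
     * this table when deciding whether an ilist can be present at all. */
    static gmx_bool ftype_in_file(int ftype,int file_version)
    {
        int i;

        for(i=0; i<NFTUPD; i++)
        {
            if (ftupd[i].ftype == ftype)
            {
                return (file_version >= ftupd[i].fvnr);
            }
        }
        return TRUE; /* types absent from the table predate all listed versions */
    }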
 +
 +/* Needed for backward compatibility */
 +#define MAXNODES 256
 +
 +static void _do_section(t_fileio *fio,int key,gmx_bool bRead,const char *src,
 +                        int line)
 +{
 +  char buf[STRLEN];
 +  gmx_bool bDbg;
 +
 +  if (gmx_fio_getftp(fio) == efTPA) {
 +    if (!bRead) {
 +      gmx_fio_write_string(fio,itemstr[key]);
 +      bDbg       = gmx_fio_getdebug(fio);
 +      gmx_fio_setdebug(fio,FALSE);
 +      gmx_fio_write_string(fio,comment_str[key]);
 +      gmx_fio_setdebug(fio,bDbg);
 +    }
 +    else {
 +      if (gmx_fio_getdebug(fio))
 +      fprintf(stderr,"Looking for section %s (%s, %d)",
 +              itemstr[key],src,line);
 +      
 +      do {
 +      gmx_fio_do_string(fio,buf);
 +      } while ((gmx_strcasecmp(buf,itemstr[key]) != 0));
 +      
 +      if (gmx_strcasecmp(buf,itemstr[key]) != 0) 
 +      gmx_fatal(FARGS,"\nCould not find section heading %s",itemstr[key]);
 +      else if (gmx_fio_getdebug(fio))
 +      fprintf(stderr," and found it\n");
 +    }
 +  }
 +}
 +
 +#define do_section(fio,key,bRead) _do_section(fio,key,bRead,__FILE__,__LINE__)
 +
 +/**************************************************************
 + *
 + * Now the higher-level routines that do I/O of the structures and arrays
 + *
 + **************************************************************/
 +static void do_pullgrp(t_fileio *fio, t_pullgrp *pgrp, gmx_bool bRead, 
 +                       int file_version)
 +{
 +  gmx_bool bDum=TRUE;
 +  int  i;
 +
 +  gmx_fio_do_int(fio,pgrp->nat);
 +  if (bRead)
 +    snew(pgrp->ind,pgrp->nat);
 +  bDum=gmx_fio_ndo_int(fio,pgrp->ind,pgrp->nat);
 +  gmx_fio_do_int(fio,pgrp->nweight);
 +  if (bRead)
 +    snew(pgrp->weight,pgrp->nweight);
 +  bDum=gmx_fio_ndo_real(fio,pgrp->weight,pgrp->nweight);
 +  gmx_fio_do_int(fio,pgrp->pbcatom);
 +  gmx_fio_do_rvec(fio,pgrp->vec);
 +  gmx_fio_do_rvec(fio,pgrp->init);
 +  gmx_fio_do_real(fio,pgrp->rate);
 +  gmx_fio_do_real(fio,pgrp->k);
 +  if (file_version >= 56) {
 +    gmx_fio_do_real(fio,pgrp->kB);
 +  } else {
 +    pgrp->kB = pgrp->k;
 +  }
 +}
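
Note the serialization idiom used throughout this file: the gmx_fio_do_*/gmx_fio_ndo_* calls read or write depending on how fio was opened, so one routine covers both directions and bRead only gates allocation. Restating the pattern from do_pullgrp with comments:

    gmx_fio_do_int(fio,pgrp->nat);                 /* count: read or written */
    if (bRead)
    {
        snew(pgrp->ind,pgrp->nat);                 /* allocate only on read  */
    }
    bDum=gmx_fio_ndo_int(fio,pgrp->ind,pgrp->nat); /* array body             */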
 +
-       snew(fepvals->all_lambda,efptNR);
++static void do_expandedvals(t_fileio *fio,t_expanded *expand,t_lambda *fepvals, gmx_bool bRead, int file_version)
 +{
 +  /* i is used in the ndo_double macro */
 +  int i;
 +  real fv;
 +  gmx_bool bDum=TRUE;
 +  real rdum;
++  int n_lambda=fepvals->n_lambda;
 +
++  /* reset the lambda calculation window */
++  fepvals->lambda_start_n = 0;
++  fepvals->lambda_stop_n = n_lambda;
 +  if (file_version >= 79)
 +  {
 +      if (n_lambda>0)
 +      {
 +          if (bRead)
 +          {
 +              snew(expand->init_lambda_weights,n_lambda);
 +          }
 +          bDum=gmx_fio_ndo_real(fio,expand->init_lambda_weights,n_lambda);
 +          gmx_fio_do_gmx_bool(fio,expand->bInit_weights);
 +      }
 +
 +      gmx_fio_do_int(fio,expand->nstexpanded);
 +      gmx_fio_do_int(fio,expand->elmcmove);
 +      gmx_fio_do_int(fio,expand->elamstats);
 +      gmx_fio_do_int(fio,expand->lmc_repeats);
 +      gmx_fio_do_int(fio,expand->gibbsdeltalam);
 +      gmx_fio_do_int(fio,expand->lmc_forced_nstart);
 +      gmx_fio_do_int(fio,expand->lmc_seed);
 +      gmx_fio_do_real(fio,expand->mc_temp);
 +      gmx_fio_do_int(fio,expand->bSymmetrizedTMatrix);
 +      gmx_fio_do_int(fio,expand->nstTij);
 +      gmx_fio_do_int(fio,expand->minvarmin);
 +      gmx_fio_do_int(fio,expand->c_range);
 +      gmx_fio_do_real(fio,expand->wl_scale);
 +      gmx_fio_do_real(fio,expand->wl_ratio);
 +      gmx_fio_do_real(fio,expand->init_wl_delta);
 +      gmx_fio_do_gmx_bool(fio,expand->bWLoneovert);
 +      gmx_fio_do_int(fio,expand->elmceq);
 +      gmx_fio_do_int(fio,expand->equil_steps);
 +      gmx_fio_do_int(fio,expand->equil_samples);
 +      gmx_fio_do_int(fio,expand->equil_n_at_lam);
 +      gmx_fio_do_real(fio,expand->equil_wl_delta);
 +      gmx_fio_do_real(fio,expand->equil_ratio);
 +  }
 +}
 +
 +static void do_simtempvals(t_fileio *fio,t_simtemp *simtemp, int n_lambda, gmx_bool bRead, 
 +                           int file_version)
 +{
 +  gmx_bool bDum=TRUE;
 +
 +  if (file_version >= 79)
 +  {
 +      gmx_fio_do_int(fio,simtemp->eSimTempScale);
 +      gmx_fio_do_real(fio,simtemp->simtemp_high);
 +      gmx_fio_do_real(fio,simtemp->simtemp_low);
 +      if (n_lambda>0)
 +      {
 +          if (bRead)
 +          {
 +              snew(simtemp->temperatures,n_lambda);
 +          }
 +          bDum=gmx_fio_ndo_real(fio,simtemp->temperatures,n_lambda);
 +      }
 +  }
 +}
 +
 +static void do_fepvals(t_fileio *fio,t_lambda *fepvals,gmx_bool bRead, int file_version)
 +{
 +  /* i is defined in the ndo_double macro; use g to iterate. */
 +  int i,g;
 +  real fv;
 +  gmx_bool bDum=TRUE;
 +  real rdum;
 +
 +  /* free energy values */
++
 +  if (file_version >= 79)
 +  {
 +      gmx_fio_do_int(fio,fepvals->init_fep_state);
 +      gmx_fio_do_double(fio,fepvals->init_lambda);
 +      gmx_fio_do_double(fio,fepvals->delta_lambda);
 +  }
 +  else if (file_version >= 59) {
 +      gmx_fio_do_double(fio,fepvals->init_lambda);
 +      gmx_fio_do_double(fio,fepvals->delta_lambda);
 +  } else {
 +      gmx_fio_do_real(fio,rdum);
 +      fepvals->init_lambda = rdum;
 +      gmx_fio_do_real(fio,rdum);
 +      fepvals->delta_lambda = rdum;
 +  }
 +  if (file_version >= 79)
 +  {
 +      gmx_fio_do_int(fio,fepvals->n_lambda);
 +      if (bRead)
 +      {
 +          snew(fepvals->all_lambda,efptNR);
 +      }
 +      for (g=0;g<efptNR;g++)
 +      {
 +          if (fepvals->n_lambda > 0) {
 +              if (bRead)
 +              {
 +                  snew(fepvals->all_lambda[g],fepvals->n_lambda);
 +              }
 +              bDum=gmx_fio_ndo_double(fio,fepvals->all_lambda[g],fepvals->n_lambda);
 +              bDum=gmx_fio_ndo_int(fio,fepvals->separate_dvdl,efptNR);
 +          }
 +          else if (fepvals->init_lambda >= 0)
 +          {
 +              fepvals->separate_dvdl[efptFEP] = TRUE;
 +          }
 +      }
 +  }
 +  else if (file_version >= 64)
 +  {
 +      gmx_fio_do_int(fio,fepvals->n_lambda);
-           snew(fepvals->all_lambda[efptFEP],fepvals->n_lambda);
 +      if (bRead)
 +      {
-       bDum=gmx_fio_ndo_double(fio,fepvals->all_lambda[efptFEP],fepvals->n_lambda);
++          int g;
++
++          snew(fepvals->all_lambda,efptNR);
++          /* still allocate the all_lambda array's contents. */
++          for(g=0;g<efptNR;g++)
++          {
++              if (fepvals->n_lambda > 0) {
++                  snew(fepvals->all_lambda[g],fepvals->n_lambda);
++              }
++          }
 +      }
-       }
-       /* still allocate the all_lambda array's contents. */
-       for (g=0;g<efptNR;g++)
-       {
-           if (fepvals->n_lambda > 0) {
-               if (bRead)
++      bDum=gmx_fio_ndo_double(fio,fepvals->all_lambda[efptFEP],
++                              fepvals->n_lambda);
 +      if (fepvals->init_lambda >= 0)
 +      {
++          int g,h;
++
 +          fepvals->separate_dvdl[efptFEP] = TRUE;
-                   snew(fepvals->all_lambda[g],fepvals->n_lambda);
++
++          if (bRead)
++          {
++              /* copy the contents of the efptFEP lambda component to all
++                 the other components */
++              for(g=0;g<efptNR;g++)
 +              {
-         do_expandedvals(fio,ir->expandedvals,ir->fepvals->n_lambda,bRead,file_version);
++                  for(h=0;h<fepvals->n_lambda;h++)
++                  {
++                      if (g!=efptFEP)
++                      {
++                          fepvals->all_lambda[g][h] =
++                                    fepvals->all_lambda[efptFEP][h];
++                      }
++                  }
 +              }
 +          }
 +      }
 +  }
 +  else
 +  {
 +      fepvals->n_lambda = 0;
 +      fepvals->all_lambda   = NULL;
 +      if (fepvals->init_lambda >= 0)
 +      {
 +          fepvals->separate_dvdl[efptFEP] = TRUE;
 +      }
 +  }
 +  if (file_version >= 13)
 +  {
 +      gmx_fio_do_real(fio,fepvals->sc_alpha);
 +  }
 +  else
 +  {
 +      fepvals->sc_alpha = 0;
 +  }
 +  if (file_version >= 38)
 +  {
 +      gmx_fio_do_int(fio,fepvals->sc_power);
 +  }
 +  else
 +  {
 +      fepvals->sc_power = 2;
 +  }
 +  if (file_version >= 79)
 +  {
 +      gmx_fio_do_real(fio,fepvals->sc_r_power);
 +  }
 +  else
 +  {
 +      fepvals->sc_r_power = 6.0;
 +  }
 +  if (file_version >= 15)
 +  {
 +      gmx_fio_do_real(fio,fepvals->sc_sigma);
 +  }
 +  else
 +  {
 +      fepvals->sc_sigma = 0.3;
 +  }
 +  if (bRead)
 +  {
 +      if (file_version >= 71)
 +      {
 +          fepvals->sc_sigma_min = fepvals->sc_sigma;
 +      }
 +      else
 +      {
 +          fepvals->sc_sigma_min = 0;
 +      }
 +  }
 +  if (file_version >= 79)
 +  {
 +      gmx_fio_do_int(fio,fepvals->bScCoul);
 +  }
 +  else
 +  {
 +      fepvals->bScCoul = TRUE;
 +  }
 +  if (file_version >= 64) {
 +      gmx_fio_do_int(fio,fepvals->nstdhdl);
 +  } else {
 +      fepvals->nstdhdl = 1;
 +  }
 +
 +  if (file_version >= 73)
 +  {
 +      gmx_fio_do_int(fio, fepvals->separate_dhdl_file);
 +      gmx_fio_do_int(fio, fepvals->dhdl_derivatives);
 +  }
 +  else
 +  {
 +      fepvals->separate_dhdl_file = esepdhdlfileYES;
 +      fepvals->dhdl_derivatives = edhdlderivativesYES;
 +  }
 +  if (file_version >= 71)
 +  {
 +      gmx_fio_do_int(fio,fepvals->dh_hist_size);
 +      gmx_fio_do_double(fio,fepvals->dh_hist_spacing);
 +  }
 +  else
 +  {
 +      fepvals->dh_hist_size    = 0;
 +      fepvals->dh_hist_spacing = 0.1;
 +  }
 +  if (file_version >= 79)
 +  {
 +      gmx_fio_do_int(fio,fepvals->bPrintEnergy);
 +  }
 +  else
 +  {
 +      fepvals->bPrintEnergy = FALSE;
 +  }
++
++  /* handle lambda_neighbors */
++  if ((file_version >= 83 && file_version < 90) || file_version >= 92 )
++  {
++      gmx_fio_do_int(fio,fepvals->lambda_neighbors);
++      if ( (fepvals->lambda_neighbors >= 0) && (fepvals->init_fep_state>=0) &&
++           (fepvals->init_lambda < 0) )
++      {
++          fepvals->lambda_start_n = (fepvals->init_fep_state -
++                                     fepvals->lambda_neighbors);
++          fepvals->lambda_stop_n = (fepvals->init_fep_state +
++                                    fepvals->lambda_neighbors + 1);
++          if (fepvals->lambda_start_n < 0)
++          {
++              fepvals->lambda_start_n = 0;
++          }
++          if (fepvals->lambda_stop_n >= fepvals->n_lambda)
++          {
++              fepvals->lambda_stop_n = fepvals->n_lambda;
++          }
++      }
++      else
++      {
++          fepvals->lambda_start_n = 0;
++          fepvals->lambda_stop_n = fepvals->n_lambda;
++      }
++  }
++  else
++  {
++      fepvals->lambda_start_n = 0;
++      fepvals->lambda_stop_n = fepvals->n_lambda;
++  }
 +}
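
A worked example of the lambda_neighbors window above: with init_fep_state = 5, lambda_neighbors = 2, n_lambda = 10 and init_lambda < 0, the reader sets lambda_start_n = 3 and lambda_stop_n = 8, i.e. states [3,8) are evaluated; a negative lambda_neighbors, or a file version outside the 83-89/92+ ranges, falls back to the full window [0,n_lambda).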
 +
 +static void do_pull(t_fileio *fio, t_pull *pull,gmx_bool bRead, int file_version)
 +{
 +  int g;
 +
 +  gmx_fio_do_int(fio,pull->ngrp);
 +  gmx_fio_do_int(fio,pull->eGeom);
 +  gmx_fio_do_ivec(fio,pull->dim);
 +  gmx_fio_do_real(fio,pull->cyl_r1);
 +  gmx_fio_do_real(fio,pull->cyl_r0);
 +  gmx_fio_do_real(fio,pull->constr_tol);
 +  gmx_fio_do_int(fio,pull->nstxout);
 +  gmx_fio_do_int(fio,pull->nstfout);
 +  if (bRead)
 +    snew(pull->grp,pull->ngrp+1);
 +  for(g=0; g<pull->ngrp+1; g++)
 +    do_pullgrp(fio,&pull->grp[g],bRead,file_version);
 +}
 +
 +
 +static void do_rotgrp(t_fileio *fio, t_rotgrp *rotg,gmx_bool bRead, int file_version)
 +{
 +  gmx_bool bDum=TRUE;
 +  int  i;
 +
 +  gmx_fio_do_int(fio,rotg->eType);
 +  gmx_fio_do_int(fio,rotg->bMassW);
 +  gmx_fio_do_int(fio,rotg->nat);
 +  if (bRead)
 +    snew(rotg->ind,rotg->nat);
 +  gmx_fio_ndo_int(fio,rotg->ind,rotg->nat);
 +  if (bRead)
 +      snew(rotg->x_ref,rotg->nat);
 +  gmx_fio_ndo_rvec(fio,rotg->x_ref,rotg->nat);
 +  gmx_fio_do_rvec(fio,rotg->vec);
 +  gmx_fio_do_rvec(fio,rotg->pivot);
 +  gmx_fio_do_real(fio,rotg->rate);
 +  gmx_fio_do_real(fio,rotg->k);
 +  gmx_fio_do_real(fio,rotg->slab_dist);
 +  gmx_fio_do_real(fio,rotg->min_gaussian);
 +  gmx_fio_do_real(fio,rotg->eps);
 +  gmx_fio_do_int(fio,rotg->eFittype);
 +  gmx_fio_do_int(fio,rotg->PotAngle_nstep);
 +  gmx_fio_do_real(fio,rotg->PotAngle_step);
 +}
 +
 +static void do_rot(t_fileio *fio, t_rot *rot,gmx_bool bRead, int file_version)
 +{
 +  int g;
 +
 +  gmx_fio_do_int(fio,rot->ngrp);
 +  gmx_fio_do_int(fio,rot->nstrout);
 +  gmx_fio_do_int(fio,rot->nstsout);
 +  if (bRead)
 +    snew(rot->grp,rot->ngrp);
 +  for(g=0; g<rot->ngrp; g++)
 +    do_rotgrp(fio, &rot->grp[g],bRead,file_version);
 +}
 +
 +
 +static void do_inputrec(t_fileio *fio, t_inputrec *ir,gmx_bool bRead, 
 +                        int file_version, real *fudgeQQ)
 +{
 +    int  i,j,k,*tmp,idum=0; 
 +    gmx_bool bDum=TRUE;
 +    real rdum,bd_temp;
 +    rvec vdum;
 +    gmx_bool bSimAnn;
 +    real zerotemptime,finish_t,init_temp,finish_temp;
 +    
 +    if (file_version != tpx_version)
 +    {
 +        /* Give a warning about features that are not accessible */
 +        fprintf(stderr,"Note: file tpx version %d, software tpx version %d\n",
 +                file_version,tpx_version);
 +    }
 +
 +    if (bRead)
 +    {
 +        init_inputrec(ir);
 +    }
 +
 +    if (file_version == 0)
 +    {
 +        return;
 +    }
 +
 +    /* Basic inputrec stuff */  
 +    gmx_fio_do_int(fio,ir->eI); 
 +    if (file_version >= 62) {
 +      gmx_fio_do_gmx_large_int(fio, ir->nsteps);
 +    } else {
 +      gmx_fio_do_int(fio,idum);
 +      ir->nsteps = idum;
 +    }
 +    if(file_version > 25) {
 +      if (file_version >= 62) {
 +      gmx_fio_do_gmx_large_int(fio, ir->init_step);
 +      } else {
 +      gmx_fio_do_int(fio,idum);
 +      ir->init_step = idum;
 +      }
 +    }  else {
 +      ir->init_step=0;
 +    }
 +
 +      if(file_version >= 58)
 +        gmx_fio_do_int(fio,ir->simulation_part);
 +      else
 +        ir->simulation_part=1;
 +        
 +    if (file_version >= 67) {
 +      gmx_fio_do_int(fio,ir->nstcalcenergy);
 +    } else {
 +      ir->nstcalcenergy = 1;
 +    }
 +    if (file_version < 53) {
 +      /* The pbc info has been moved out of do_inputrec,
 +       * since we always want it, also without reading the inputrec.
 +       */
 +      gmx_fio_do_int(fio,ir->ePBC);
 +      if ((file_version <= 15) && (ir->ePBC == 2))
 +      ir->ePBC = epbcNONE;
 +      if (file_version >= 45) {
 +      gmx_fio_do_int(fio,ir->bPeriodicMols);
 +      } else {
 +      if (ir->ePBC == 2) {
 +        ir->ePBC = epbcXYZ;
 +        ir->bPeriodicMols = TRUE;
 +      } else {
 +      ir->bPeriodicMols = FALSE;
 +      }
 +      }
 +    }
 +    if (file_version >= 81)
 +    {
 +        gmx_fio_do_int(fio,ir->cutoff_scheme);
 +    }
 +    else
 +    {
 +        ir->cutoff_scheme = ecutsGROUP;
 +    }
 +    gmx_fio_do_int(fio,ir->ns_type);
 +    gmx_fio_do_int(fio,ir->nstlist);
 +    gmx_fio_do_int(fio,ir->ndelta);
 +    if (file_version < 41) {
 +      gmx_fio_do_int(fio,idum);
 +      gmx_fio_do_int(fio,idum);
 +    }
 +    if (file_version >= 45)
 +      gmx_fio_do_real(fio,ir->rtpi);
 +    else
 +      ir->rtpi = 0.05;
 +    gmx_fio_do_int(fio,ir->nstcomm); 
 +    if (file_version > 34)
 +      gmx_fio_do_int(fio,ir->comm_mode);
 +    else if (ir->nstcomm < 0) 
 +      ir->comm_mode = ecmANGULAR;
 +    else
 +      ir->comm_mode = ecmLINEAR;
 +    ir->nstcomm = abs(ir->nstcomm);
 +    
 +    if(file_version > 25)
 +      gmx_fio_do_int(fio,ir->nstcheckpoint);
 +    else
 +      ir->nstcheckpoint=0;
 +    
 +    gmx_fio_do_int(fio,ir->nstcgsteep); 
 +
 +    if(file_version>=30)
 +      gmx_fio_do_int(fio,ir->nbfgscorr); 
 +    else if (bRead)
 +      ir->nbfgscorr = 10;
 +
 +    gmx_fio_do_int(fio,ir->nstlog); 
 +    gmx_fio_do_int(fio,ir->nstxout); 
 +    gmx_fio_do_int(fio,ir->nstvout); 
 +    gmx_fio_do_int(fio,ir->nstfout); 
 +    gmx_fio_do_int(fio,ir->nstenergy); 
 +    gmx_fio_do_int(fio,ir->nstxtcout); 
 +    if (file_version >= 59) {
 +      gmx_fio_do_double(fio,ir->init_t);
 +      gmx_fio_do_double(fio,ir->delta_t);
 +    } else {
 +      gmx_fio_do_real(fio,rdum);
 +      ir->init_t = rdum;
 +      gmx_fio_do_real(fio,rdum);
 +      ir->delta_t = rdum;
 +    }
 +    gmx_fio_do_real(fio,ir->xtcprec); 
 +    if (file_version < 19) {
 +      gmx_fio_do_int(fio,idum); 
 +      gmx_fio_do_int(fio,idum);
 +    }
 +    if(file_version < 18)
 +      gmx_fio_do_int(fio,idum); 
 +    if (file_version >= 81) {
 +      gmx_fio_do_real(fio,ir->verletbuf_drift);
 +    } else {
 +      ir->verletbuf_drift = 0;
 +    }
 +    gmx_fio_do_real(fio,ir->rlist); 
 +    if (file_version >= 67) {
 +      gmx_fio_do_real(fio,ir->rlistlong);
 +    }
 +    if(file_version >= 82 && file_version != 90)
 +    {
 +        gmx_fio_do_int(fio,ir->nstcalclr);
 +    }
 +    else
 +    {
 +        /* Calculate at NS steps */
 +        ir->nstcalclr = ir->nstlist;
 +    }
 +    gmx_fio_do_int(fio,ir->coulombtype);
 +    if (file_version < 32 && ir->coulombtype == eelRF)
 +      ir->coulombtype = eelRF_NEC;      
 +    if (file_version >= 81)
 +    {
 +        gmx_fio_do_int(fio,ir->coulomb_modifier); 
 +    }
 +    else
 +    {
 +        ir->coulomb_modifier = (ir->cutoff_scheme == ecutsVERLET ? eintmodPOTSHIFT : eintmodNONE);
 +    }
 +    gmx_fio_do_real(fio,ir->rcoulomb_switch); 
 +    gmx_fio_do_real(fio,ir->rcoulomb); 
 +    gmx_fio_do_int(fio,ir->vdwtype);
 +    if (file_version >= 81)
 +    {
 +        gmx_fio_do_int(fio,ir->vdw_modifier); 
 +    }
 +    else
 +    {
 +        ir->vdw_modifier = (ir->cutoff_scheme == ecutsVERLET ? eintmodPOTSHIFT : eintmodNONE);
 +    }
 +    gmx_fio_do_real(fio,ir->rvdw_switch); 
 +    gmx_fio_do_real(fio,ir->rvdw); 
 +    if (file_version < 67) {
 +      ir->rlistlong = max_cutoff(ir->rlist,max_cutoff(ir->rvdw,ir->rcoulomb));
 +    }
 +    gmx_fio_do_int(fio,ir->eDispCorr); 
 +    gmx_fio_do_real(fio,ir->epsilon_r);
 +    if (file_version >= 37) {
 +      gmx_fio_do_real(fio,ir->epsilon_rf);
 +    } else {
 +      if (EEL_RF(ir->coulombtype)) {
 +      ir->epsilon_rf = ir->epsilon_r;
 +      ir->epsilon_r  = 1.0;
 +      } else {
 +      ir->epsilon_rf = 1.0;
 +      }
 +    }
 +    if (file_version >= 29)
 +      gmx_fio_do_real(fio,ir->tabext);
 +    else
 +      ir->tabext=1.0;
 + 
 +    if(file_version > 25) {
 +      gmx_fio_do_int(fio,ir->gb_algorithm);
 +      gmx_fio_do_int(fio,ir->nstgbradii);
 +      gmx_fio_do_real(fio,ir->rgbradii);
 +      gmx_fio_do_real(fio,ir->gb_saltconc);
 +      gmx_fio_do_int(fio,ir->implicit_solvent);
 +    } else {
 +      ir->gb_algorithm=egbSTILL;
 +      ir->nstgbradii=1;
 +      ir->rgbradii=1.0;
 +      ir->gb_saltconc=0;
 +      ir->implicit_solvent=eisNO;
 +    }
 +      if(file_version>=55)
 +      {
 +              gmx_fio_do_real(fio,ir->gb_epsilon_solvent);
 +              gmx_fio_do_real(fio,ir->gb_obc_alpha);
 +              gmx_fio_do_real(fio,ir->gb_obc_beta);
 +              gmx_fio_do_real(fio,ir->gb_obc_gamma);
 +              if(file_version>=60)
 +              {
 +                      gmx_fio_do_real(fio,ir->gb_dielectric_offset);
 +                      gmx_fio_do_int(fio,ir->sa_algorithm);
 +              }
 +              else
 +              {
 +                      ir->gb_dielectric_offset = 0.009;
 +                      ir->sa_algorithm = esaAPPROX;
 +              }
 +              gmx_fio_do_real(fio,ir->sa_surface_tension);
 +
 +    /* Override sa_surface_tension if it is not changed in the mdp-file */
 +    if(ir->sa_surface_tension<0)
 +    {
 +      if(ir->gb_algorithm==egbSTILL)
 +      {
 +        ir->sa_surface_tension = 0.0049 * 100 * CAL2JOULE;
 +      }
 +      else if(ir->gb_algorithm==egbHCT || ir->gb_algorithm==egbOBC)
 +      {
 +        ir->sa_surface_tension = 0.0054 * 100 * CAL2JOULE;
 +      }
 +    }
 +    
 +      }
 +      else
 +      {
 +              /* Better use sensible values than insane (0.0) ones... */
 +              ir->gb_epsilon_solvent = 80;
 +              ir->gb_obc_alpha       = 1.0;
 +              ir->gb_obc_beta        = 0.8;
 +              ir->gb_obc_gamma       = 4.85;
 +              ir->sa_surface_tension = 2.092;
 +      }
 +
 +       
 +    if (file_version >= 81)
 +    {
 +        gmx_fio_do_real(fio,ir->fourier_spacing); 
 +    }
 +    else
 +    {
 +        ir->fourier_spacing = 0.0;
 +    }
 +    gmx_fio_do_int(fio,ir->nkx); 
 +    gmx_fio_do_int(fio,ir->nky); 
 +    gmx_fio_do_int(fio,ir->nkz);
 +    gmx_fio_do_int(fio,ir->pme_order);
 +    gmx_fio_do_real(fio,ir->ewald_rtol);
 +
 +    if (file_version >=24) 
 +      gmx_fio_do_int(fio,ir->ewald_geometry);
 +    else
 +      ir->ewald_geometry=eewg3D;
 +
 +    if (file_version <=17) {
 +      ir->epsilon_surface=0;
 +      if (file_version==17)
 +      gmx_fio_do_int(fio,idum);
 +    } 
 +    else
 +      gmx_fio_do_real(fio,ir->epsilon_surface);
 +    
 +    gmx_fio_do_gmx_bool(fio,ir->bOptFFT);
 +
 +    gmx_fio_do_gmx_bool(fio,ir->bContinuation); 
 +    gmx_fio_do_int(fio,ir->etc);
 +    /* before version 18, ir->etc was a gmx_bool (ir->btc),
 +     * but the values 0 and 1 still mean no and
 +     * berendsen temperature coupling, respectively.
 +     */
 +    if (file_version >= 79) {
 +        gmx_fio_do_gmx_bool(fio,ir->bPrintNHChains);
 +    }
 +    if (file_version >= 71)
 +    {
 +        gmx_fio_do_int(fio,ir->nsttcouple);
 +    }
 +    else
 +    {
 +        ir->nsttcouple = ir->nstcalcenergy;
 +    }
 +    if (file_version <= 15)
 +    {
 +        gmx_fio_do_int(fio,idum);
 +    }
 +    if (file_version <=17)
 +    {
 +        gmx_fio_do_int(fio,ir->epct); 
 +        if (file_version <= 15)
 +        {
 +            if (ir->epct == 5)
 +            {
 +                ir->epct = epctSURFACETENSION;
 +            }
 +            gmx_fio_do_int(fio,idum);
 +        }
 +        ir->epct -= 1;
 +        /* we have removed the NO alternative at the beginning */
 +        if(ir->epct==-1)
 +        {
 +            ir->epc=epcNO;
 +            ir->epct=epctISOTROPIC;
 +        } 
 +        else
 +        {
 +            ir->epc=epcBERENDSEN;
 +        }
 +    } 
 +    else
 +    {
 +        gmx_fio_do_int(fio,ir->epc);
 +        gmx_fio_do_int(fio,ir->epct);
 +    }
 +    if (file_version >= 71)
 +    {
 +        gmx_fio_do_int(fio,ir->nstpcouple);
 +    }
 +    else
 +    {
 +        ir->nstpcouple = ir->nstcalcenergy;
 +    }
 +    gmx_fio_do_real(fio,ir->tau_p); 
 +    if (file_version <= 15) {
 +      gmx_fio_do_rvec(fio,vdum);
 +      clear_mat(ir->ref_p);
 +      for(i=0; i<DIM; i++)
 +      ir->ref_p[i][i] = vdum[i];
 +    } else {
 +      gmx_fio_do_rvec(fio,ir->ref_p[XX]);
 +      gmx_fio_do_rvec(fio,ir->ref_p[YY]);
 +      gmx_fio_do_rvec(fio,ir->ref_p[ZZ]);
 +    }
 +    if (file_version <= 15) {
 +      gmx_fio_do_rvec(fio,vdum);
 +      clear_mat(ir->compress);
 +      for(i=0; i<DIM; i++)
 +      ir->compress[i][i] = vdum[i];
 +    } 
 +    else {
 +      gmx_fio_do_rvec(fio,ir->compress[XX]);
 +      gmx_fio_do_rvec(fio,ir->compress[YY]);
 +      gmx_fio_do_rvec(fio,ir->compress[ZZ]);
 +    }
 +    if (file_version >= 47) {
 +      gmx_fio_do_int(fio,ir->refcoord_scaling);
 +      gmx_fio_do_rvec(fio,ir->posres_com);
 +      gmx_fio_do_rvec(fio,ir->posres_comB);
 +    } else {
 +      ir->refcoord_scaling = erscNO;
 +      clear_rvec(ir->posres_com);
 +      clear_rvec(ir->posres_comB);
 +    }
 +    if((file_version > 25) && (file_version < 79))
 +        gmx_fio_do_int(fio,ir->andersen_seed);
 +    else
 +        ir->andersen_seed=0;
 +    if(file_version < 26) {
 +      gmx_fio_do_gmx_bool(fio,bSimAnn); 
 +      gmx_fio_do_real(fio,zerotemptime);
 +    }
 +    
 +    if (file_version < 37)
 +      gmx_fio_do_real(fio,rdum); 
 +
 +    gmx_fio_do_real(fio,ir->shake_tol);
 +    if (file_version < 54)
 +      gmx_fio_do_real(fio,*fudgeQQ);
 +
 +    gmx_fio_do_int(fio,ir->efep);
 +    if (file_version <= 14 && ir->efep != efepNO)
 +    {
 +        ir->efep = efepYES;
 +    }
 +    do_fepvals(fio,ir->fepvals,bRead,file_version);
 +
 +    if (file_version >= 79)
 +    {
 +        gmx_fio_do_gmx_bool(fio,ir->bSimTemp);
 +        if (ir->bSimTemp) 
 +        {
 +            ir->bSimTemp = TRUE;
 +        }
 +    }
 +    else
 +    {
 +        ir->bSimTemp = FALSE;
 +    }
 +    if (ir->bSimTemp)
 +    {
 +        do_simtempvals(fio,ir->simtempvals,ir->fepvals->n_lambda,bRead,file_version);
 +    }
 +
 +    if (file_version >= 79)
 +    {
 +        gmx_fio_do_gmx_bool(fio,ir->bExpanded);
 +        if (ir->bExpanded)
 +        {
 +            ir->bExpanded = TRUE;
 +        }
 +        else
 +        {
 +            ir->bExpanded = FALSE;
 +        }
 +    }
 +    if (ir->bExpanded)
 +    {
++        do_expandedvals(fio,ir->expandedvals,ir->fepvals,bRead,file_version);
 +    }
 +    if (file_version >= 57) {
 +      gmx_fio_do_int(fio,ir->eDisre); 
 +    }
 +    gmx_fio_do_int(fio,ir->eDisreWeighting); 
 +    if (file_version < 22) {
 +      if (ir->eDisreWeighting == 0)
 +      ir->eDisreWeighting = edrwEqual;
 +      else
 +      ir->eDisreWeighting = edrwConservative;
 +    }
 +    gmx_fio_do_gmx_bool(fio,ir->bDisreMixed); 
 +    gmx_fio_do_real(fio,ir->dr_fc); 
 +    gmx_fio_do_real(fio,ir->dr_tau); 
 +    gmx_fio_do_int(fio,ir->nstdisreout);
 +    if (file_version >= 22) {
 +      gmx_fio_do_real(fio,ir->orires_fc);
 +      gmx_fio_do_real(fio,ir->orires_tau);
 +      gmx_fio_do_int(fio,ir->nstorireout);
 +    } else {
 +      ir->orires_fc = 0;
 +      ir->orires_tau = 0;
 +      ir->nstorireout = 0;
 +    }
 +    if(file_version >= 26 && file_version < 79) {
 +      gmx_fio_do_real(fio,ir->dihre_fc);
 +      if (file_version < 56) 
 +      {
 +          gmx_fio_do_real(fio,rdum);
 +          gmx_fio_do_int(fio,idum);
 +      }
 +    } else {
 +        ir->dihre_fc=0;
 +    }
 +
 +    gmx_fio_do_real(fio,ir->em_stepsize); 
 +    gmx_fio_do_real(fio,ir->em_tol); 
 +    if (file_version >= 22) 
 +      gmx_fio_do_gmx_bool(fio,ir->bShakeSOR);
 +    else if (bRead)
 +      ir->bShakeSOR = TRUE;
 +    if (file_version >= 11)
 +      gmx_fio_do_int(fio,ir->niter);
 +    else if (bRead) {
 +      ir->niter = 25;
 +      fprintf(stderr,"Note: niter not in run input file, setting it to %d\n",
 +            ir->niter);
 +    }
 +    if (file_version >= 21)
 +      gmx_fio_do_real(fio,ir->fc_stepsize);
 +    else
 +      ir->fc_stepsize = 0;
 +    gmx_fio_do_int(fio,ir->eConstrAlg);
 +    gmx_fio_do_int(fio,ir->nProjOrder);
 +    gmx_fio_do_real(fio,ir->LincsWarnAngle);
 +    if (file_version <= 14)
 +      gmx_fio_do_int(fio,idum);
 +    if (file_version >=26)
 +      gmx_fio_do_int(fio,ir->nLincsIter);
 +    else if (bRead) {
 +      ir->nLincsIter = 1;
 +      fprintf(stderr,"Note: nLincsIter not in run input file, setting it to %d\n",
 +            ir->nLincsIter);
 +    }
 +    if (file_version < 33)
 +      gmx_fio_do_real(fio,bd_temp);
 +    gmx_fio_do_real(fio,ir->bd_fric);
 +    gmx_fio_do_int(fio,ir->ld_seed);
 +    if (file_version >= 33) {
 +      for(i=0; i<DIM; i++)
 +      gmx_fio_do_rvec(fio,ir->deform[i]);
 +    } else {
 +      for(i=0; i<DIM; i++)
 +      clear_rvec(ir->deform[i]);
 +    }
 +    if (file_version >= 14)
 +      gmx_fio_do_real(fio,ir->cos_accel);
 +    else if (bRead)
 +      ir->cos_accel = 0;
 +    gmx_fio_do_int(fio,ir->userint1); 
 +    gmx_fio_do_int(fio,ir->userint2); 
 +    gmx_fio_do_int(fio,ir->userint3); 
 +    gmx_fio_do_int(fio,ir->userint4); 
 +    gmx_fio_do_real(fio,ir->userreal1); 
 +    gmx_fio_do_real(fio,ir->userreal2); 
 +    gmx_fio_do_real(fio,ir->userreal3); 
 +    gmx_fio_do_real(fio,ir->userreal4); 
 +    
 +    /* AdResS stuff */
 +    if (file_version >= 77) {
 +      gmx_fio_do_gmx_bool(fio,ir->bAdress);
 +      if(ir->bAdress){
 +          if (bRead) snew(ir->adress, 1);
 +          gmx_fio_do_int(fio,ir->adress->type);
 +          gmx_fio_do_real(fio,ir->adress->const_wf);
 +          gmx_fio_do_real(fio,ir->adress->ex_width);
 +          gmx_fio_do_real(fio,ir->adress->hy_width);
 +          gmx_fio_do_int(fio,ir->adress->icor);
 +          gmx_fio_do_int(fio,ir->adress->site);
 +          gmx_fio_do_rvec(fio,ir->adress->refs);
 +          gmx_fio_do_int(fio,ir->adress->n_tf_grps);
 +          gmx_fio_do_real(fio, ir->adress->ex_forcecap);
 +          gmx_fio_do_int(fio, ir->adress->n_energy_grps);
 +          gmx_fio_do_int(fio,ir->adress->do_hybridpairs);
 +
 +          if (bRead) snew(ir->adress->tf_table_index,ir->adress->n_tf_grps);
 +          if (ir->adress->n_tf_grps > 0) {
 +            bDum=gmx_fio_ndo_int(fio,ir->adress->tf_table_index,ir->adress->n_tf_grps);
 +          }
 +          if (bRead) snew(ir->adress->group_explicit,ir->adress->n_energy_grps);
 +          if (ir->adress->n_energy_grps > 0) {
 +            bDum=gmx_fio_ndo_int(fio, ir->adress->group_explicit,ir->adress->n_energy_grps);
 +          }
 +      }
 +    } else {
 +      ir->bAdress = FALSE;
 +    }
 +
 +    /* pull stuff */
 +    if (file_version >= 48) {
 +      gmx_fio_do_int(fio,ir->ePull);
 +      if (ir->ePull != epullNO) {
 +        if (bRead)
 +          snew(ir->pull,1);
 +        do_pull(fio, ir->pull,bRead,file_version);
 +      }
 +    } else {
 +      ir->ePull = epullNO;
 +    }
 +    
 +    /* Enforced rotation */
 +    if (file_version >= 74) {
 +        gmx_fio_do_int(fio,ir->bRot);
 +        if (ir->bRot) {
 +            if (bRead)
 +                snew(ir->rot,1);
 +            do_rot(fio, ir->rot,bRead,file_version);
 +        }
 +    } else {
 +        ir->bRot = FALSE;
 +    }
 +    
 +    /* grpopts stuff */
 +    gmx_fio_do_int(fio,ir->opts.ngtc); 
 +    if (file_version >= 69) {
 +      gmx_fio_do_int(fio,ir->opts.nhchainlength);
 +    } else {
 +      ir->opts.nhchainlength = 1;
 +    }
 +    gmx_fio_do_int(fio,ir->opts.ngacc); 
 +    gmx_fio_do_int(fio,ir->opts.ngfrz); 
 +    gmx_fio_do_int(fio,ir->opts.ngener);
 +    
 +    if (bRead) {
 +      snew(ir->opts.nrdf,   ir->opts.ngtc); 
 +      snew(ir->opts.ref_t,  ir->opts.ngtc); 
 +      snew(ir->opts.annealing, ir->opts.ngtc); 
 +      snew(ir->opts.anneal_npoints, ir->opts.ngtc); 
 +      snew(ir->opts.anneal_time, ir->opts.ngtc); 
 +      snew(ir->opts.anneal_temp, ir->opts.ngtc); 
 +      snew(ir->opts.tau_t,  ir->opts.ngtc); 
 +      snew(ir->opts.nFreeze,ir->opts.ngfrz); 
 +      snew(ir->opts.acc,    ir->opts.ngacc); 
 +      snew(ir->opts.egp_flags,ir->opts.ngener*ir->opts.ngener);
 +    } 
 +    if (ir->opts.ngtc > 0) {
 +      if (bRead && file_version<13) {
 +        snew(tmp,ir->opts.ngtc);
 +        bDum=gmx_fio_ndo_int(fio,tmp, ir->opts.ngtc);
 +        for(i=0; i<ir->opts.ngtc; i++)
 +          ir->opts.nrdf[i] = tmp[i];
 +        sfree(tmp);
 +      } else {
 +        bDum=gmx_fio_ndo_real(fio,ir->opts.nrdf, ir->opts.ngtc);
 +      }
 +      bDum=gmx_fio_ndo_real(fio,ir->opts.ref_t,ir->opts.ngtc);
 +      bDum=gmx_fio_ndo_real(fio,ir->opts.tau_t,ir->opts.ngtc);
 +      if (file_version<33 && ir->eI==eiBD) {
 +        for(i=0; i<ir->opts.ngtc; i++)
 +          ir->opts.tau_t[i] = bd_temp;
 +      }
 +    }
 +    if (ir->opts.ngfrz > 0) 
 +      bDum=gmx_fio_ndo_ivec(fio,ir->opts.nFreeze,ir->opts.ngfrz);
 +    if (ir->opts.ngacc > 0) 
 +      gmx_fio_ndo_rvec(fio,ir->opts.acc,ir->opts.ngacc); 
 +    if (file_version >= 12)
 +      bDum=gmx_fio_ndo_int(fio,ir->opts.egp_flags,
 +                           ir->opts.ngener*ir->opts.ngener);
 +
 +    if(bRead && file_version < 26) {
 +      for(i=0;i<ir->opts.ngtc;i++) {
 +        if(bSimAnn) {
 +          ir->opts.annealing[i] = eannSINGLE;
 +          ir->opts.anneal_npoints[i] = 2;
 +          snew(ir->opts.anneal_time[i],2);
 +          snew(ir->opts.anneal_temp[i],2);
 +          /* calculate the starting/ending temperatures from reft, zerotemptime, and nsteps */
 +          finish_t = ir->init_t + ir->nsteps * ir->delta_t;
 +          init_temp = ir->opts.ref_t[i]*(1-ir->init_t/zerotemptime);
 +          finish_temp = ir->opts.ref_t[i]*(1-finish_t/zerotemptime);
 +          ir->opts.anneal_time[i][0] = ir->init_t;
 +          ir->opts.anneal_time[i][1] = finish_t;
 +          ir->opts.anneal_temp[i][0] = init_temp;
 +          ir->opts.anneal_temp[i][1] = finish_temp;
 +        } else {
 +          ir->opts.annealing[i] = eannNO;
 +          ir->opts.anneal_npoints[i] = 0;
 +        }
 +      }
 +    } else {
 +      /* file version 26 or later */
 +      /* First read the lists with annealing and npoints for each group */
 +      bDum=gmx_fio_ndo_int(fio,ir->opts.annealing,ir->opts.ngtc);
 +      bDum=gmx_fio_ndo_int(fio,ir->opts.anneal_npoints,ir->opts.ngtc);
 +      for(j=0;j<(ir->opts.ngtc);j++) {
 +        k=ir->opts.anneal_npoints[j];
 +        if(bRead) {
 +          snew(ir->opts.anneal_time[j],k);
 +          snew(ir->opts.anneal_temp[j],k);
 +        }
 +        bDum=gmx_fio_ndo_real(fio,ir->opts.anneal_time[j],k);
 +        bDum=gmx_fio_ndo_real(fio,ir->opts.anneal_temp[j],k);
 +      }
 +    }
 +    /* Walls */
 +    if (file_version >= 45) {
 +      gmx_fio_do_int(fio,ir->nwall);
 +      gmx_fio_do_int(fio,ir->wall_type);
 +      if (file_version >= 50)
 +        gmx_fio_do_real(fio,ir->wall_r_linpot);
 +      else
 +        ir->wall_r_linpot = -1;
 +      gmx_fio_do_int(fio,ir->wall_atomtype[0]);
 +      gmx_fio_do_int(fio,ir->wall_atomtype[1]);
 +      gmx_fio_do_real(fio,ir->wall_density[0]);
 +      gmx_fio_do_real(fio,ir->wall_density[1]);
 +      gmx_fio_do_real(fio,ir->wall_ewald_zfac);
 +    } else {
 +      ir->nwall = 0;
 +      ir->wall_type = 0;
 +      ir->wall_atomtype[0] = -1;
 +      ir->wall_atomtype[1] = -1;
 +      ir->wall_density[0] = 0;
 +      ir->wall_density[1] = 0;
 +      ir->wall_ewald_zfac = 3;
 +    }
 +    /* Cosine stuff for electric fields */
 +    for(j=0; (j<DIM); j++) {
 +      gmx_fio_do_int(fio,ir->ex[j].n);
 +      gmx_fio_do_int(fio,ir->et[j].n);
 +      if (bRead) {
 +        snew(ir->ex[j].a,  ir->ex[j].n);
 +        snew(ir->ex[j].phi,ir->ex[j].n);
 +        snew(ir->et[j].a,  ir->et[j].n);
 +        snew(ir->et[j].phi,ir->et[j].n);
 +      }
 +      bDum=gmx_fio_ndo_real(fio,ir->ex[j].a,  ir->ex[j].n);
 +      bDum=gmx_fio_ndo_real(fio,ir->ex[j].phi,ir->ex[j].n);
 +      bDum=gmx_fio_ndo_real(fio,ir->et[j].a,  ir->et[j].n);
 +      bDum=gmx_fio_ndo_real(fio,ir->et[j].phi,ir->et[j].n);
 +    }
 +    
 +    /* QMMM stuff */
 +    if(file_version>=39){
 +      gmx_fio_do_gmx_bool(fio,ir->bQMMM);
 +      gmx_fio_do_int(fio,ir->QMMMscheme);
 +      gmx_fio_do_real(fio,ir->scalefactor);
 +      gmx_fio_do_int(fio,ir->opts.ngQM);
 +      if (bRead) {
 +        snew(ir->opts.QMmethod,    ir->opts.ngQM);
 +        snew(ir->opts.QMbasis,     ir->opts.ngQM);
 +        snew(ir->opts.QMcharge,    ir->opts.ngQM);
 +        snew(ir->opts.QMmult,      ir->opts.ngQM);
 +        snew(ir->opts.bSH,         ir->opts.ngQM);
 +        snew(ir->opts.CASorbitals, ir->opts.ngQM);
 +        snew(ir->opts.CASelectrons,ir->opts.ngQM);
 +        snew(ir->opts.SAon,        ir->opts.ngQM);
 +        snew(ir->opts.SAoff,       ir->opts.ngQM);
 +        snew(ir->opts.SAsteps,     ir->opts.ngQM);
 +        snew(ir->opts.bOPT,        ir->opts.ngQM);
 +        snew(ir->opts.bTS,         ir->opts.ngQM);
 +      }
 +      if (ir->opts.ngQM > 0) {
 +        bDum=gmx_fio_ndo_int(fio,ir->opts.QMmethod,ir->opts.ngQM);
 +        bDum=gmx_fio_ndo_int(fio,ir->opts.QMbasis,ir->opts.ngQM);
 +        bDum=gmx_fio_ndo_int(fio,ir->opts.QMcharge,ir->opts.ngQM);
 +        bDum=gmx_fio_ndo_int(fio,ir->opts.QMmult,ir->opts.ngQM);
 +        bDum=gmx_fio_ndo_gmx_bool(fio,ir->opts.bSH,ir->opts.ngQM);
 +        bDum=gmx_fio_ndo_int(fio,ir->opts.CASorbitals,ir->opts.ngQM);
 +        bDum=gmx_fio_ndo_int(fio,ir->opts.CASelectrons,ir->opts.ngQM);
 +        bDum=gmx_fio_ndo_real(fio,ir->opts.SAon,ir->opts.ngQM);
 +        bDum=gmx_fio_ndo_real(fio,ir->opts.SAoff,ir->opts.ngQM);
 +        bDum=gmx_fio_ndo_int(fio,ir->opts.SAsteps,ir->opts.ngQM);
 +        bDum=gmx_fio_ndo_gmx_bool(fio,ir->opts.bOPT,ir->opts.ngQM);
 +        bDum=gmx_fio_ndo_gmx_bool(fio,ir->opts.bTS,ir->opts.ngQM);
 +      }
 +      /* end of QMMM stuff */
 +    }    
 +}
 +
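 +/* A note on the reading pattern used throughout do_inputrec above: every
 + * field is gated on the tpx version that introduced it, and on read a
 + * backward-compatible default is substituted for older files. A minimal
 + * sketch of the idiom, for a hypothetical field "foo" added in a
 + * hypothetical version 42:
 + *
 + *   if (file_version >= 42)
 + *     gmx_fio_do_real(fio,ir->foo);   // serializes in both directions
 + *   else if (bRead)
 + *     ir->foo = 0;                    // default when reading old files
 + */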
 +
 +static void do_harm(t_fileio *fio, t_iparams *iparams,gmx_bool bRead)
 +{
 +  gmx_fio_do_real(fio,iparams->harmonic.rA);
 +  gmx_fio_do_real(fio,iparams->harmonic.krA);
 +  gmx_fio_do_real(fio,iparams->harmonic.rB);
 +  gmx_fio_do_real(fio,iparams->harmonic.krB);
 +}
 +
 +void do_iparams(t_fileio *fio, t_functype ftype,t_iparams *iparams,
 +                gmx_bool bRead, int file_version)
 +{
 +  int idum;
 +  gmx_bool bDum;
 +  real rdum;
 +  
 +  if (!bRead)
 +    gmx_fio_set_comment(fio, interaction_function[ftype].name);
 +  switch (ftype) {
 +  case F_ANGLES:
 +  case F_G96ANGLES:
 +  case F_BONDS:
 +  case F_G96BONDS:
 +  case F_HARMONIC:
 +  case F_IDIHS:
 +    do_harm(fio, iparams,bRead);
 +    /* Note: F_ANGRES and F_ANGRESZ are not among the case labels above
 +     * (they fall under the F_PDIHS case below), so this branch is
 +     * effectively dead code here. */
 +    if ((ftype == F_ANGRES || ftype == F_ANGRESZ) && bRead) {
 +      /* Correct incorrect storage of parameters */
 +      iparams->pdihs.phiB = iparams->pdihs.phiA;
 +      iparams->pdihs.cpB  = iparams->pdihs.cpA;
 +    }
 +    break;
 +  case F_LINEAR_ANGLES:
 +    gmx_fio_do_real(fio,iparams->linangle.klinA);
 +    gmx_fio_do_real(fio,iparams->linangle.aA);
 +    gmx_fio_do_real(fio,iparams->linangle.klinB);
 +    gmx_fio_do_real(fio,iparams->linangle.aB);
 +    break;
 +  case F_FENEBONDS:
 +    gmx_fio_do_real(fio,iparams->fene.bm);
 +    gmx_fio_do_real(fio,iparams->fene.kb);
 +    break;
 +  case F_RESTRBONDS:
 +    gmx_fio_do_real(fio,iparams->restraint.lowA);
 +    gmx_fio_do_real(fio,iparams->restraint.up1A);
 +    gmx_fio_do_real(fio,iparams->restraint.up2A);
 +    gmx_fio_do_real(fio,iparams->restraint.kA);
 +    gmx_fio_do_real(fio,iparams->restraint.lowB);
 +    gmx_fio_do_real(fio,iparams->restraint.up1B);
 +    gmx_fio_do_real(fio,iparams->restraint.up2B);
 +    gmx_fio_do_real(fio,iparams->restraint.kB);
 +    break;
 +  case F_TABBONDS:
 +  case F_TABBONDSNC:
 +  case F_TABANGLES:
 +  case F_TABDIHS:
 +    gmx_fio_do_real(fio,iparams->tab.kA);
 +    gmx_fio_do_int(fio,iparams->tab.table);
 +    gmx_fio_do_real(fio,iparams->tab.kB);
 +    break;
 +  case F_CROSS_BOND_BONDS:
 +    gmx_fio_do_real(fio,iparams->cross_bb.r1e);
 +    gmx_fio_do_real(fio,iparams->cross_bb.r2e);
 +    gmx_fio_do_real(fio,iparams->cross_bb.krr);
 +    break;
 +  case F_CROSS_BOND_ANGLES:
 +    gmx_fio_do_real(fio,iparams->cross_ba.r1e);
 +    gmx_fio_do_real(fio,iparams->cross_ba.r2e);
 +    gmx_fio_do_real(fio,iparams->cross_ba.r3e);
 +    gmx_fio_do_real(fio,iparams->cross_ba.krt);
 +    break;
 +  case F_UREY_BRADLEY:
 +    gmx_fio_do_real(fio,iparams->u_b.thetaA);
 +    gmx_fio_do_real(fio,iparams->u_b.kthetaA);
 +    gmx_fio_do_real(fio,iparams->u_b.r13A);
 +    gmx_fio_do_real(fio,iparams->u_b.kUBA);
 +    if (file_version >= 79) {
 +        gmx_fio_do_real(fio,iparams->u_b.thetaB);
 +        gmx_fio_do_real(fio,iparams->u_b.kthetaB);
 +        gmx_fio_do_real(fio,iparams->u_b.r13B);
 +        gmx_fio_do_real(fio,iparams->u_b.kUBB);
 +    } else {
 +        iparams->u_b.thetaB=iparams->u_b.thetaA;
 +        iparams->u_b.kthetaB=iparams->u_b.kthetaA;
 +        iparams->u_b.r13B=iparams->u_b.r13A;
 +        iparams->u_b.kUBB=iparams->u_b.kUBA;
 +    }
 +    break;
 +  case F_QUARTIC_ANGLES:
 +    gmx_fio_do_real(fio,iparams->qangle.theta);
 +    bDum=gmx_fio_ndo_real(fio,iparams->qangle.c,5);
 +    break;
 +  case F_BHAM:
 +    gmx_fio_do_real(fio,iparams->bham.a);
 +    gmx_fio_do_real(fio,iparams->bham.b);
 +    gmx_fio_do_real(fio,iparams->bham.c);
 +    break;
 +  case F_MORSE:
 +    gmx_fio_do_real(fio,iparams->morse.b0A);
 +    gmx_fio_do_real(fio,iparams->morse.cbA);
 +    gmx_fio_do_real(fio,iparams->morse.betaA);
 +    if (file_version >= 79) {
 +        gmx_fio_do_real(fio,iparams->morse.b0B);
 +        gmx_fio_do_real(fio,iparams->morse.cbB);
 +        gmx_fio_do_real(fio,iparams->morse.betaB);
 +    } else {
 +        iparams->morse.b0B = iparams->morse.b0A;
 +        iparams->morse.cbB = iparams->morse.cbA;
 +        iparams->morse.betaB = iparams->morse.betaA;
 +    }
 +    break;
 +  case F_CUBICBONDS:
 +    gmx_fio_do_real(fio,iparams->cubic.b0);
 +    gmx_fio_do_real(fio,iparams->cubic.kb);
 +    gmx_fio_do_real(fio,iparams->cubic.kcub);
 +    break;
 +  case F_CONNBONDS:
 +    break;
 +  case F_POLARIZATION:
 +    gmx_fio_do_real(fio,iparams->polarize.alpha);
 +    break;
 +  case F_ANHARM_POL:
 +    gmx_fio_do_real(fio,iparams->anharm_polarize.alpha);
 +    gmx_fio_do_real(fio,iparams->anharm_polarize.drcut);
 +    gmx_fio_do_real(fio,iparams->anharm_polarize.khyp);
 +    break;
 +  case F_WATER_POL:
 +    if (file_version < 31)
 +      gmx_fatal(FARGS,"Old tpr files with water_polarization are not supported. Make a new one.");
 +    gmx_fio_do_real(fio,iparams->wpol.al_x);
 +    gmx_fio_do_real(fio,iparams->wpol.al_y);
 +    gmx_fio_do_real(fio,iparams->wpol.al_z);
 +    gmx_fio_do_real(fio,iparams->wpol.rOH);
 +    gmx_fio_do_real(fio,iparams->wpol.rHH);
 +    gmx_fio_do_real(fio,iparams->wpol.rOD);
 +    break;
 +  case F_THOLE_POL:
 +    gmx_fio_do_real(fio,iparams->thole.a);
 +    gmx_fio_do_real(fio,iparams->thole.alpha1);
 +    gmx_fio_do_real(fio,iparams->thole.alpha2);
 +    gmx_fio_do_real(fio,iparams->thole.rfac);
 +    break;
 +  case F_LJ:
 +    gmx_fio_do_real(fio,iparams->lj.c6);
 +    gmx_fio_do_real(fio,iparams->lj.c12);
 +    break;
 +  case F_LJ14:
 +    gmx_fio_do_real(fio,iparams->lj14.c6A);
 +    gmx_fio_do_real(fio,iparams->lj14.c12A);
 +    gmx_fio_do_real(fio,iparams->lj14.c6B);
 +    gmx_fio_do_real(fio,iparams->lj14.c12B);
 +    break;
 +  case F_LJC14_Q:
 +    gmx_fio_do_real(fio,iparams->ljc14.fqq);
 +    gmx_fio_do_real(fio,iparams->ljc14.qi);
 +    gmx_fio_do_real(fio,iparams->ljc14.qj);
 +    gmx_fio_do_real(fio,iparams->ljc14.c6);
 +    gmx_fio_do_real(fio,iparams->ljc14.c12);
 +    break;
 +  case F_LJC_PAIRS_NB:
 +    gmx_fio_do_real(fio,iparams->ljcnb.qi);
 +    gmx_fio_do_real(fio,iparams->ljcnb.qj);
 +    gmx_fio_do_real(fio,iparams->ljcnb.c6);
 +    gmx_fio_do_real(fio,iparams->ljcnb.c12);
 +    break;
 +  case F_PDIHS:
 +  case F_PIDIHS:
 +  case F_ANGRES:
 +  case F_ANGRESZ:
 +    gmx_fio_do_real(fio,iparams->pdihs.phiA);
 +    gmx_fio_do_real(fio,iparams->pdihs.cpA);
 +    if ((ftype == F_ANGRES || ftype == F_ANGRESZ) && file_version < 42) {
 +      /* Read the incorrectly stored multiplicity */
 +      gmx_fio_do_real(fio,iparams->harmonic.rB);
 +      gmx_fio_do_real(fio,iparams->harmonic.krB);
 +      iparams->pdihs.phiB = iparams->pdihs.phiA;
 +      iparams->pdihs.cpB  = iparams->pdihs.cpA;
 +    } else {
 +      gmx_fio_do_real(fio,iparams->pdihs.phiB);
 +      gmx_fio_do_real(fio,iparams->pdihs.cpB);
 +      gmx_fio_do_int(fio,iparams->pdihs.mult);
 +    }
 +    break;
 +  case F_DISRES:
 +    gmx_fio_do_int(fio,iparams->disres.label);
 +    gmx_fio_do_int(fio,iparams->disres.type);
 +    gmx_fio_do_real(fio,iparams->disres.low);
 +    gmx_fio_do_real(fio,iparams->disres.up1);
 +    gmx_fio_do_real(fio,iparams->disres.up2);
 +    gmx_fio_do_real(fio,iparams->disres.kfac);
 +    break;
 +  case F_ORIRES:
 +    gmx_fio_do_int(fio,iparams->orires.ex);
 +    gmx_fio_do_int(fio,iparams->orires.label);
 +    gmx_fio_do_int(fio,iparams->orires.power);
 +    gmx_fio_do_real(fio,iparams->orires.c);
 +    gmx_fio_do_real(fio,iparams->orires.obs);
 +    gmx_fio_do_real(fio,iparams->orires.kfac);
 +    break;
 +  case F_DIHRES:
 +    if (file_version < 72) {
 +        gmx_fio_do_int(fio,idum);
 +        gmx_fio_do_int(fio,idum);
 +    }
 +    gmx_fio_do_real(fio,iparams->dihres.phiA);
 +    gmx_fio_do_real(fio,iparams->dihres.dphiA);
 +    gmx_fio_do_real(fio,iparams->dihres.kfacA);
 +    if (file_version >= 72) {
 +        gmx_fio_do_real(fio,iparams->dihres.phiB);
 +        gmx_fio_do_real(fio,iparams->dihres.dphiB);
 +        gmx_fio_do_real(fio,iparams->dihres.kfacB);
 +    } else {
 +        iparams->dihres.phiB=iparams->dihres.phiA;
 +        iparams->dihres.dphiB=iparams->dihres.dphiA;
 +        iparams->dihres.kfacB=iparams->dihres.kfacA;
 +    }
 +    break;
 +  case F_POSRES:
 +    gmx_fio_do_rvec(fio,iparams->posres.pos0A);
 +    gmx_fio_do_rvec(fio,iparams->posres.fcA);
 +    if (bRead && file_version < 27) {
 +      copy_rvec(iparams->posres.pos0A,iparams->posres.pos0B);
 +      copy_rvec(iparams->posres.fcA,iparams->posres.fcB);
 +    } else {
 +      gmx_fio_do_rvec(fio,iparams->posres.pos0B);
 +      gmx_fio_do_rvec(fio,iparams->posres.fcB);
 +    }
 +    break;
 +  case F_FBPOSRES:
 +      gmx_fio_do_int(fio,iparams->fbposres.geom);
 +      gmx_fio_do_rvec(fio,iparams->fbposres.pos0);
 +      gmx_fio_do_real(fio,iparams->fbposres.r);
 +      gmx_fio_do_real(fio,iparams->fbposres.k);
 +      break;
 +  case F_RBDIHS:
 +    bDum=gmx_fio_ndo_real(fio,iparams->rbdihs.rbcA,NR_RBDIHS);
 +    if(file_version>=25) 
 +      bDum=gmx_fio_ndo_real(fio,iparams->rbdihs.rbcB,NR_RBDIHS);
 +    break;
 +  case F_FOURDIHS:
 +    /* Fourier dihedrals are internally represented
 +     * as Ryckaert-Bellemans since those are faster to compute.
 +     */
 +     bDum=gmx_fio_ndo_real(fio,iparams->rbdihs.rbcA, NR_RBDIHS);
 +     bDum=gmx_fio_ndo_real(fio,iparams->rbdihs.rbcB, NR_RBDIHS);
 +    break;
 +  case F_CONSTR:
 +  case F_CONSTRNC:
 +    gmx_fio_do_real(fio,iparams->constr.dA);
 +    gmx_fio_do_real(fio,iparams->constr.dB);
 +    break;
 +  case F_SETTLE:
 +    gmx_fio_do_real(fio,iparams->settle.doh);
 +    gmx_fio_do_real(fio,iparams->settle.dhh);
 +    break;
 +  case F_VSITE2:
 +    gmx_fio_do_real(fio,iparams->vsite.a);
 +    break;
 +  case F_VSITE3:
 +  case F_VSITE3FD:
 +  case F_VSITE3FAD:
 +    gmx_fio_do_real(fio,iparams->vsite.a);
 +    gmx_fio_do_real(fio,iparams->vsite.b);
 +    break;
 +  case F_VSITE3OUT:
 +  case F_VSITE4FD: 
 +  case F_VSITE4FDN: 
 +    gmx_fio_do_real(fio,iparams->vsite.a);
 +    gmx_fio_do_real(fio,iparams->vsite.b);
 +    gmx_fio_do_real(fio,iparams->vsite.c);
 +    break;
 +  case F_VSITEN:
 +    gmx_fio_do_int(fio,iparams->vsiten.n);
 +    gmx_fio_do_real(fio,iparams->vsiten.a);
 +    break;
 +  case F_GB12:
 +  case F_GB13:
 +  case F_GB14:
 +    /* We got rid of some parameters in version 68 */
 +    if(bRead && file_version<68)
 +    {
 +        gmx_fio_do_real(fio,rdum);    
 +        gmx_fio_do_real(fio,rdum);    
 +        gmx_fio_do_real(fio,rdum);    
 +        gmx_fio_do_real(fio,rdum);    
 +    }
 +    gmx_fio_do_real(fio,iparams->gb.sar);
 +    gmx_fio_do_real(fio,iparams->gb.st);
 +    gmx_fio_do_real(fio,iparams->gb.pi);
 +    gmx_fio_do_real(fio,iparams->gb.gbr);
 +    gmx_fio_do_real(fio,iparams->gb.bmlt);
 +    break;
 +  case F_CMAP:
 +    gmx_fio_do_int(fio,iparams->cmap.cmapA);
 +    gmx_fio_do_int(fio,iparams->cmap.cmapB);
 +    break;
 +  default:
 +      gmx_fatal(FARGS,"unknown function type %d (%s) in %s line %d",
 +                ftype,interaction_function[ftype].name,__FILE__,__LINE__);
 +  }
 +  if (!bRead)
 +    gmx_fio_unset_comment(fio);
 +}
 +
 +static void do_ilist(t_fileio *fio, t_ilist *ilist,gmx_bool bRead,int file_version,
 +                   int ftype)
 +{
 +  int  i,k,idum;
 +  gmx_bool bDum=TRUE;
 +  
 +  if (!bRead) {
 +    gmx_fio_set_comment(fio, interaction_function[ftype].name);
 +  }
 +  if (file_version < 44) {
 +    for(i=0; i<MAXNODES; i++)
 +      gmx_fio_do_int(fio,idum);
 +  }
 +  gmx_fio_do_int(fio,ilist->nr);
 +  if (bRead)
 +    snew(ilist->iatoms,ilist->nr);
 +  bDum=gmx_fio_ndo_int(fio,ilist->iatoms,ilist->nr);
 +  if (!bRead)
 +    gmx_fio_unset_comment(fio);
 +}
 +
 +static void do_ffparams(t_fileio *fio, gmx_ffparams_t *ffparams,
 +                      gmx_bool bRead, int file_version)
 +{
 +  int  idum,i,j;
 +  gmx_bool bDum=TRUE;
 +  unsigned int k;
 +
 +  gmx_fio_do_int(fio,ffparams->atnr);
 +  if (file_version < 57) {
 +    gmx_fio_do_int(fio,idum);
 +  }
 +  gmx_fio_do_int(fio,ffparams->ntypes);
 +  if (bRead && debug)
 +    fprintf(debug,"ffparams->atnr = %d, ntypes = %d\n",
 +          ffparams->atnr,ffparams->ntypes);
 +  if (bRead) {
 +    snew(ffparams->functype,ffparams->ntypes);
 +    snew(ffparams->iparams,ffparams->ntypes);
 +  }
 +  /* Read/write all the function types */
 +  bDum=gmx_fio_ndo_int(fio,ffparams->functype,ffparams->ntypes);
 +  if (bRead && debug)
 +    pr_ivec(debug,0,"functype",ffparams->functype,ffparams->ntypes,TRUE);
 +
 +  if (file_version >= 66) {
 +    gmx_fio_do_double(fio,ffparams->reppow);
 +  } else {
 +    ffparams->reppow = 12.0;
 +  }
 +
 +  if (file_version >= 57) {
 +    gmx_fio_do_real(fio,ffparams->fudgeQQ);
 +  }
 +
 +  /* Check whether all these function types are supported by the code.
 +   * In practice the code is backwards compatible, which means that the
 +   * numbering may have to be altered from old numbering to new numbering
 +   */
 +  for (i=0; (i<ffparams->ntypes); i++) {
 +    if (bRead)
 +      /* Loop over file versions */
 +      for (k=0; (k<NFTUPD); k++)
 +        /* Compare the read file_version to the update table */
 +        if ((file_version < ftupd[k].fvnr) &&
 +            (ffparams->functype[i] >= ftupd[k].ftype)) {
 +          ffparams->functype[i] += 1;
 +          if (debug) {
 +            fprintf(debug,"Incrementing function type %d to %d (due to %s)\n",
 +                    i,ffparams->functype[i],
 +                    interaction_function[ftupd[k].ftype].longname);
 +            fflush(debug);
 +          }
 +        }
 +    
 +    do_iparams(fio, ffparams->functype[i],&ffparams->iparams[i],bRead,
 +               file_version);
 +    if (bRead && debug)
 +      pr_iparams(debug,ffparams->functype[i],&ffparams->iparams[i]);
 +  }
 +}
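 +
 +/* A worked example of the renumbering above, under the assumption that a
 + * function type was inserted at index 20 of the ftype enum in tpx version 70
 + * (the real pairs live in the ftupd table near the top of this file): when a
 + * pre-70 file is read, every stored functype >= 20 is shifted up by one, so
 + * indices written with the old enum keep pointing at the interaction they
 + * meant. */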
 +
 +static void add_settle_atoms(t_ilist *ilist)
 +{
 +    int i;
 +
 +    /* Settle used to only store the first atom: add the other two */
 +    srenew(ilist->iatoms,2*ilist->nr);
 +    for(i=ilist->nr/2-1; i>=0; i--)
 +    {
 +        ilist->iatoms[4*i+0] = ilist->iatoms[2*i+0];
 +        ilist->iatoms[4*i+1] = ilist->iatoms[2*i+1];
 +        ilist->iatoms[4*i+2] = ilist->iatoms[2*i+1] + 1;
 +        ilist->iatoms[4*i+3] = ilist->iatoms[2*i+1] + 2;
 +    }
 +    ilist->nr = 2*ilist->nr;
 +}
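 +
 +/* Illustration of the in-place expansion above: for two settles the old
 + * iatoms layout [type0,O0, type1,O1] becomes
 + * [type0,O0,O0+1,O0+2, type1,O1,O1+1,O1+2], relying on the convention that
 + * the two hydrogens directly follow their oxygen. Iterating backwards lets
 + * the array grow in place without clobbering unread entries. */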
 +
 +static void do_ilists(t_fileio *fio, t_ilist *ilist,gmx_bool bRead, 
 +                      int file_version)
 +{
 +  int i,j,renum[F_NRE];
 +  gmx_bool bDum=TRUE,bClear;
 +  unsigned int k;
 +  
 +  for(j=0; (j<F_NRE); j++) {
 +    bClear = FALSE;
 +    if (bRead)
 +      for (k=0; k<NFTUPD; k++)
 +        if ((file_version < ftupd[k].fvnr) && (j == ftupd[k].ftype)) 
 +          bClear = TRUE;
 +    if (bClear) {
 +      ilist[j].nr = 0;
 +      ilist[j].iatoms = NULL;
 +    } else {
 +      do_ilist(fio, &ilist[j],bRead,file_version,j);
 +      if (file_version < 78 && j == F_SETTLE && ilist[j].nr > 0)
 +      {
 +          add_settle_atoms(&ilist[j]);
 +      }
 +    }
 +    /*
 +    if (bRead && gmx_debug_at)
 +      pr_ilist(debug,0,interaction_function[j].longname,
 +             functype,&ilist[j],TRUE);
 +    */
 +  }
 +}
 +
 +static void do_idef(t_fileio *fio, gmx_ffparams_t *ffparams,gmx_moltype_t *molt,
 +                  gmx_bool bRead, int file_version)
 +{
 +  do_ffparams(fio, ffparams,bRead,file_version);
 +    
 +  if (file_version >= 54) {
 +    gmx_fio_do_real(fio,ffparams->fudgeQQ);
 +  }
 +
 +  do_ilists(fio, molt->ilist,bRead,file_version);
 +}
 +
 +static void do_block(t_fileio *fio, t_block *block,gmx_bool bRead,int file_version)
 +{
 +  int  i,idum,dum_nra,*dum_a;
 +  gmx_bool bDum=TRUE;
 +
 +  if (file_version < 44)
 +    for(i=0; i<MAXNODES; i++)
 +      gmx_fio_do_int(fio,idum);
 +  gmx_fio_do_int(fio,block->nr);
 +  if (file_version < 51)
 +    gmx_fio_do_int(fio,dum_nra);
 +  if (bRead) {
 +    block->nalloc_index = block->nr+1;
 +    snew(block->index,block->nalloc_index);
 +  }
 +  bDum=gmx_fio_ndo_int(fio,block->index,block->nr+1);
 +
 +  if (file_version < 51 && dum_nra > 0) {
 +    snew(dum_a,dum_nra);
 +    bDum=gmx_fio_ndo_int(fio,dum_a,dum_nra);
 +    sfree(dum_a);
 +  }
 +}
 +
 +static void do_blocka(t_fileio *fio, t_blocka *block,gmx_bool bRead,
 +                      int file_version)
 +{
 +  int  i,idum;
 +  gmx_bool bDum=TRUE;
 +
 +  if (file_version < 44)
 +    for(i=0; i<MAXNODES; i++)
 +      gmx_fio_do_int(fio,idum);
 +  gmx_fio_do_int(fio,block->nr);
 +  gmx_fio_do_int(fio,block->nra);
 +  if (bRead) {
 +    block->nalloc_index = block->nr+1;
 +    snew(block->index,block->nalloc_index);
 +    block->nalloc_a = block->nra;
 +    snew(block->a,block->nalloc_a);
 +  }
 +  bDum=gmx_fio_ndo_int(fio,block->index,block->nr+1);
 +  bDum=gmx_fio_ndo_int(fio,block->a,block->nra);
 +}
 +
 +static void do_atom(t_fileio *fio, t_atom *atom,int ngrp,gmx_bool bRead, 
 +                    int file_version, gmx_groups_t *groups,int atnr)
 +{ 
 +  int i,myngrp;
 +  
 +  gmx_fio_do_real(fio,atom->m);
 +  gmx_fio_do_real(fio,atom->q);
 +  gmx_fio_do_real(fio,atom->mB);
 +  gmx_fio_do_real(fio,atom->qB);
 +  gmx_fio_do_ushort(fio, atom->type);
 +  gmx_fio_do_ushort(fio, atom->typeB);
 +  gmx_fio_do_int(fio,atom->ptype);
 +  gmx_fio_do_int(fio,atom->resind);
 +  if (file_version >= 52)
 +    gmx_fio_do_int(fio,atom->atomnumber);
 +  else if (bRead)
 +    atom->atomnumber = NOTSET;
 +  if (file_version < 23) 
 +    myngrp = 8;
 +  else if (file_version < 39) 
 +    myngrp = 9;
 +  else
 +    myngrp = ngrp;
 +
 +  if (file_version < 57) {
 +    unsigned char uchar[egcNR];
 +    gmx_fio_ndo_uchar(fio,uchar,myngrp);
 +    for(i=myngrp; (i<ngrp); i++) {
 +      uchar[i] = 0;
 +    }
 +    /* Copy the old data format to the groups struct */
 +    for(i=0; i<ngrp; i++) {
 +      groups->grpnr[i][atnr] = uchar[i];
 +    }
 +  }
 +}
 +
 +static void do_grps(t_fileio *fio, int ngrp,t_grps grps[],gmx_bool bRead, 
 +                    int file_version)
 +{
 +  int i,j,myngrp;
 +  gmx_bool bDum=TRUE;
 +  
 +  if (file_version < 23) 
 +    myngrp = 8;
 +  else if (file_version < 39) 
 +    myngrp = 9;
 +  else
 +    myngrp = ngrp;
 +
 +  for(j=0; (j<ngrp); j++) {
 +    if (j<myngrp) {
 +      gmx_fio_do_int(fio,grps[j].nr);
 +      if (bRead)
 +        snew(grps[j].nm_ind,grps[j].nr);
 +      bDum=gmx_fio_ndo_int(fio,grps[j].nm_ind,grps[j].nr);
 +    }
 +    else {
 +      grps[j].nr = 1;
 +      snew(grps[j].nm_ind,grps[j].nr);
 +    }
 +  }
 +}
 +
 +static void do_symstr(t_fileio *fio, char ***nm,gmx_bool bRead,t_symtab *symtab)
 +{
 +  int ls;
 +  
 +  if (bRead) {
 +    gmx_fio_do_int(fio,ls);
 +    *nm = get_symtab_handle(symtab,ls);
 +  }
 +  else {
 +    ls = lookup_symtab(symtab,*nm);
 +    gmx_fio_do_int(fio,ls);
 +  }
 +}
 +
 +static void do_strstr(t_fileio *fio, int nstr,char ***nm,gmx_bool bRead,
 +                      t_symtab *symtab)
 +{
 +  int  j;
 +  
 +  for (j=0; (j<nstr); j++) 
 +    do_symstr(fio, &(nm[j]),bRead,symtab);
 +}
 +
 +static void do_resinfo(t_fileio *fio, int n,t_resinfo *ri,gmx_bool bRead,
 +                       t_symtab *symtab, int file_version)
 +{
 +  int  j;
 +  
 +  for (j=0; (j<n); j++) {
 +    do_symstr(fio, &(ri[j].name),bRead,symtab);
 +    if (file_version >= 63) {
 +      gmx_fio_do_int(fio,ri[j].nr);
 +      gmx_fio_do_uchar(fio, ri[j].ic);
 +    } else {
 +      ri[j].nr = j + 1;
 +      ri[j].ic = ' ';
 +    }
 +  }
 +}
 +
 +static void do_atoms(t_fileio *fio, t_atoms *atoms,gmx_bool bRead,t_symtab *symtab,
 +                   int file_version,
 +                   gmx_groups_t *groups)
 +{
 +  int i;
 +  
 +  gmx_fio_do_int(fio,atoms->nr);
 +  gmx_fio_do_int(fio,atoms->nres);
 +  if (file_version < 57) {
 +    gmx_fio_do_int(fio,groups->ngrpname);
 +    for(i=0; i<egcNR; i++) {
 +      groups->ngrpnr[i] = atoms->nr;
 +      snew(groups->grpnr[i],groups->ngrpnr[i]);
 +    }
 +  }
 +  if (bRead) {
 +    snew(atoms->atom,atoms->nr);
 +    snew(atoms->atomname,atoms->nr);
 +    snew(atoms->atomtype,atoms->nr);
 +    snew(atoms->atomtypeB,atoms->nr);
 +    snew(atoms->resinfo,atoms->nres);
 +    if (file_version < 57) {
 +      snew(groups->grpname,groups->ngrpname);
 +    }
 +    atoms->pdbinfo = NULL;
 +  }
 +  for(i=0; (i<atoms->nr); i++) {
 +    do_atom(fio, &atoms->atom[i],egcNR,bRead, file_version,groups,i);
 +  }
 +  do_strstr(fio, atoms->nr,atoms->atomname,bRead,symtab);
 +  if (bRead && (file_version <= 20)) {
 +    for(i=0; i<atoms->nr; i++) {
 +      atoms->atomtype[i]  = put_symtab(symtab,"?");
 +      atoms->atomtypeB[i] = put_symtab(symtab,"?");
 +    }
 +  } else {
 +    do_strstr(fio, atoms->nr,atoms->atomtype,bRead,symtab);
 +    do_strstr(fio, atoms->nr,atoms->atomtypeB,bRead,symtab);
 +  }
 +  do_resinfo(fio, atoms->nres,atoms->resinfo,bRead,symtab,file_version);
 +
 +  if (file_version < 57) {
 +    do_strstr(fio, groups->ngrpname,groups->grpname,bRead,symtab);
 +  
 +    do_grps(fio, egcNR,groups->grps,bRead,file_version);
 +  }
 +}
 +
 +static void do_groups(t_fileio *fio, gmx_groups_t *groups,
 +                    gmx_bool bRead,t_symtab *symtab,
 +                    int file_version)
 +{
 +  int  g,n,i;
 +  gmx_bool bDum=TRUE;
 +
 +  do_grps(fio, egcNR,groups->grps,bRead,file_version);
 +  gmx_fio_do_int(fio,groups->ngrpname);
 +  if (bRead) {
 +    snew(groups->grpname,groups->ngrpname);
 +  }
 +  do_strstr(fio, groups->ngrpname,groups->grpname,bRead,symtab);
 +  for(g=0; g<egcNR; g++) {
 +    gmx_fio_do_int(fio,groups->ngrpnr[g]);
 +    if (groups->ngrpnr[g] == 0) {
 +      if (bRead) {
 +      groups->grpnr[g] = NULL;
 +      }
 +    } else {
 +      if (bRead) {
 +      snew(groups->grpnr[g],groups->ngrpnr[g]);
 +      }
 +      bDum=gmx_fio_ndo_uchar(fio, groups->grpnr[g],groups->ngrpnr[g]);
 +    }
 +  }
 +}
 +
 +static void do_atomtypes(t_fileio *fio, t_atomtypes *atomtypes,gmx_bool bRead,
 +                       t_symtab *symtab,int file_version)
 +{
 +  int i,j;
 +  gmx_bool bDum = TRUE;
 +  
 +  if (file_version > 25) {
 +    gmx_fio_do_int(fio,atomtypes->nr);
 +    j=atomtypes->nr;
 +    if (bRead) {
 +      snew(atomtypes->radius,j);
 +      snew(atomtypes->vol,j);
 +      snew(atomtypes->surftens,j);
 +      snew(atomtypes->atomnumber,j);
 +      snew(atomtypes->gb_radius,j);
 +      snew(atomtypes->S_hct,j);
 +    }
 +    bDum=gmx_fio_ndo_real(fio,atomtypes->radius,j);
 +    bDum=gmx_fio_ndo_real(fio,atomtypes->vol,j);
 +    bDum=gmx_fio_ndo_real(fio,atomtypes->surftens,j);
 +    if(file_version >= 40)
 +    {
 +        bDum=gmx_fio_ndo_int(fio,atomtypes->atomnumber,j);
 +    }
 +    if(file_version >= 60)
 +    {
 +        bDum=gmx_fio_ndo_real(fio,atomtypes->gb_radius,j);
 +        bDum=gmx_fio_ndo_real(fio,atomtypes->S_hct,j);
 +    }
 +  } else {
 +    /* File versions prior to 26 cannot do GBSA,
 +     * so they don't use this structure
 +     */
 +    atomtypes->nr = 0;
 +    atomtypes->radius = NULL;
 +    atomtypes->vol = NULL;
 +    atomtypes->surftens = NULL;
 +    atomtypes->atomnumber = NULL;
 +    atomtypes->gb_radius = NULL;
 +    atomtypes->S_hct = NULL;
 +  }  
 +}
 +
 +static void do_symtab(t_fileio *fio, t_symtab *symtab,gmx_bool bRead)
 +{
 +  int i,nr;
 +  t_symbuf *symbuf;
 +  char buf[STRLEN];
 +  
 +  gmx_fio_do_int(fio,symtab->nr);
 +  nr     = symtab->nr;
 +  if (bRead) {
 +    snew(symtab->symbuf,1);
 +    symbuf = symtab->symbuf;
 +    symbuf->bufsize = nr;
 +    snew(symbuf->buf,nr);
 +    for (i=0; (i<nr); i++) {
 +      gmx_fio_do_string(fio,buf);
 +      symbuf->buf[i]=strdup(buf);
 +    }
 +  }
 +  else {
 +    symbuf = symtab->symbuf;
 +    while (symbuf!=NULL) {
 +      for (i=0; (i<symbuf->bufsize) && (i<nr); i++)
 +        gmx_fio_do_string(fio,symbuf->buf[i]);
 +      nr-=i;
 +      symbuf=symbuf->next;
 +    }
 +    if (nr != 0)
 +      gmx_fatal(FARGS,"nr of symtab strings left: %d",nr);
 +  }
 +}
 +
 +static void do_cmap(t_fileio *fio, gmx_cmap_t *cmap_grid, gmx_bool bRead)
 +{
 +      int i,j,ngrid,gs,nelem;
 +      
 +      gmx_fio_do_int(fio,cmap_grid->ngrid);
 +      gmx_fio_do_int(fio,cmap_grid->grid_spacing);
 +      
 +      ngrid = cmap_grid->ngrid;
 +      gs    = cmap_grid->grid_spacing;
 +      nelem = gs * gs;
 +      
 +      if(bRead)
 +      {
 +              snew(cmap_grid->cmapdata,ngrid);
 +              
 +              for(i=0;i<cmap_grid->ngrid;i++)
 +              {
 +                      snew(cmap_grid->cmapdata[i].cmap,4*nelem);
 +              }
 +      }
 +      
 +      for(i=0;i<cmap_grid->ngrid;i++)
 +      {
 +              for(j=0;j<nelem;j++)
 +              {
 +                      gmx_fio_do_real(fio,cmap_grid->cmapdata[i].cmap[j*4]);
 +                      gmx_fio_do_real(fio,cmap_grid->cmapdata[i].cmap[j*4+1]);
 +                      gmx_fio_do_real(fio,cmap_grid->cmapdata[i].cmap[j*4+2]);
 +                      gmx_fio_do_real(fio,cmap_grid->cmapdata[i].cmap[j*4+3]);
 +              }
 +      }       
 +}
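 +
 +/* Each cmap grid stores nelem = grid_spacing^2 points with four reals per
 + * point, hence the 4*nelem allocation and the j*4 stride above. (Reading the
 + * four reals as the value plus the derivatives needed for bicubic
 + * interpolation is an assumption from standard CMAP usage, not something
 + * this file states.) */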
 +
 +
 +void tpx_make_chain_identifiers(t_atoms *atoms,t_block *mols)
 +{
 +    int m,a,a0,a1,r;
 +    char c,chainid;
 +    int  chainnum;
 +    
 +    /* We always assign a new chain number, but save the chain id characters 
 +     * for larger molecules.
 +     */
 +#define CHAIN_MIN_ATOMS 15
 +    
 +    chainnum=0;
 +    chainid='A';
 +    for(m=0; m<mols->nr; m++) 
 +    {
 +        a0=mols->index[m];
 +        a1=mols->index[m+1];
 +        if ((a1-a0 >= CHAIN_MIN_ATOMS) && (chainid <= 'Z')) 
 +        {
 +            c=chainid;
 +            chainid++;
 +        } 
 +        else
 +        {
 +            c=' ';
 +        }
 +        for(a=a0; a<a1; a++) 
 +        {
 +            atoms->resinfo[atoms->atom[a].resind].chainnum = chainnum;
 +            atoms->resinfo[atoms->atom[a].resind].chainid  = c;
 +        }
 +        chainnum++;
 +    }
 +    
 +    /* Blank out the chain id if there was only one chain */
 +    if (chainid == 'B') 
 +    {
 +        for(r=0; r<atoms->nres; r++) 
 +        {
 +            atoms->resinfo[r].chainid = ' ';
 +        }
 +    }
 +}
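 +
 +/* Example of the heuristic above: for two 3000-atom proteins followed by 900
 + * 3-atom waters, the proteins receive chainids 'A' and 'B' (they meet
 + * CHAIN_MIN_ATOMS) while each water gets ' ', and every molecule still gets
 + * a unique chainnum. Had only one molecule been large, chainid would end at
 + * 'B' and the final loop would blank it again, since a lone chain needs no
 + * identifier. */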
 +  
 +static void do_moltype(t_fileio *fio, gmx_moltype_t *molt,gmx_bool bRead,
 +                       t_symtab *symtab, int file_version,
 +                     gmx_groups_t *groups)
 +{
 +  int i;
 +
 +  if (file_version >= 57) {
 +    do_symstr(fio, &(molt->name),bRead,symtab);
 +  }
 +
 +  do_atoms(fio, &molt->atoms, bRead, symtab, file_version, groups);
 +
 +  if (bRead && gmx_debug_at) {
 +    pr_atoms(debug,0,"atoms",&molt->atoms,TRUE);
 +  }
 +  
 +  if (file_version >= 57) {
 +    do_ilists(fio, molt->ilist,bRead,file_version);
 +
 +    do_block(fio, &molt->cgs,bRead,file_version);
 +    if (bRead && gmx_debug_at) {
 +      pr_block(debug,0,"cgs",&molt->cgs,TRUE);
 +    }
 +  }
 +
 +  /* This used to be in the atoms struct */
 +  do_blocka(fio, &molt->excls, bRead, file_version);
 +}
 +
 +static void do_molblock(t_fileio *fio, gmx_molblock_t *molb,gmx_bool bRead,
 +                        int file_version)
 +{
 +  int i;
 +
 +  gmx_fio_do_int(fio,molb->type);
 +  gmx_fio_do_int(fio,molb->nmol);
 +  gmx_fio_do_int(fio,molb->natoms_mol);
 +  /* Position restraint coordinates */
 +  gmx_fio_do_int(fio,molb->nposres_xA);
 +  if (molb->nposres_xA > 0) {
 +    if (bRead) {
 +      snew(molb->posres_xA,molb->nposres_xA);
 +    }
 +    gmx_fio_ndo_rvec(fio,molb->posres_xA,molb->nposres_xA);
 +  }
 +  gmx_fio_do_int(fio,molb->nposres_xB);
 +  if (molb->nposres_xB > 0) {
 +    if (bRead) {
 +      snew(molb->posres_xB,molb->nposres_xB);
 +    }
 +    gmx_fio_ndo_rvec(fio,molb->posres_xB,molb->nposres_xB);
 +  }
 +
 +}
 +
 +static t_block mtop_mols(gmx_mtop_t *mtop)
 +{
 +  int mb,m,a,mol;
 +  t_block mols;
 +
 +  mols.nr = 0;
 +  for(mb=0; mb<mtop->nmolblock; mb++) {
 +    mols.nr += mtop->molblock[mb].nmol;
 +  }
 +  mols.nalloc_index = mols.nr + 1;
 +  snew(mols.index,mols.nalloc_index);
 +
 +  a = 0;
 +  m = 0;
 +  mols.index[m] = a;
 +  for(mb=0; mb<mtop->nmolblock; mb++) {
 +    for(mol=0; mol<mtop->molblock[mb].nmol; mol++) {
 +      a += mtop->molblock[mb].natoms_mol;
 +      m++;
 +      mols.index[m] = a;
 +    }
 +  }
 +  
 +  return mols;
 +}
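 +
 +/* Worked example for mtop_mols(): two molblocks, the first with nmol=2 and
 + * natoms_mol=3, the second with nmol=1 and natoms_mol=5, give mols.nr = 3
 + * and mols.index = {0,3,6,11}; molecule m spans atoms
 + * [index[m], index[m+1]). */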
 +
 +static void add_posres_molblock(gmx_mtop_t *mtop)
 +{
 +  t_ilist *il,*ilfb;
 +  int am,i,mol,a;
 +  gmx_bool bFE;
 +  gmx_molblock_t *molb;
 +  t_iparams *ip;
 +
 +  /* posres reference positions are stored in ip->posres (if present) and
 +     in ip->fbposres (if present). If normal and flat-bottomed posres are present,
 +     posres.pos0A are identical to fbposres.pos0. */
 +  il = &mtop->moltype[0].ilist[F_POSRES];
 +  ilfb = &mtop->moltype[0].ilist[F_FBPOSRES];
 +  if (il->nr == 0 && ilfb->nr == 0) {
 +    return;
 +  }
 +  am = 0;
 +  bFE = FALSE;
 +  for(i=0; i<il->nr; i+=2) {
 +    ip = &mtop->ffparams.iparams[il->iatoms[i]];
 +    am = max(am,il->iatoms[i+1]);
 +    if (ip->posres.pos0B[XX] != ip->posres.pos0A[XX] ||
 +        ip->posres.pos0B[YY] != ip->posres.pos0A[YY] ||
 +        ip->posres.pos0B[ZZ] != ip->posres.pos0A[ZZ]) {
 +      bFE = TRUE;
 +    }
 +  }
 +  /* This loop is required if we have only flat-bottomed posres:
 +     - set am
 +     - bFE == FALSE (no B-state for flat-bottomed posres) */
 +  if (il->nr == 0)
 +  {
 +      for(i=0; i<ilfb->nr; i+=2) {
 +          ip = &mtop->ffparams.iparams[ilfb->iatoms[i]];
 +          am = max(am,ilfb->iatoms[i+1]);
 +      }
 +  }
 +  /* Make the posres coordinate block end at a molecule end */
 +  mol = 0;
 +  while(am >= mtop->mols.index[mol+1]) {
 +    mol++;
 +  }
 +  molb = &mtop->molblock[0];
 +  molb->nposres_xA = mtop->mols.index[mol+1];
 +  snew(molb->posres_xA,molb->nposres_xA);
 +  if (bFE) {
 +    molb->nposres_xB = molb->nposres_xA;
 +    snew(molb->posres_xB,molb->nposres_xB);
 +  } else {
 +    molb->nposres_xB = 0;
 +  }
 +  for(i=0; i<il->nr; i+=2) {
 +    ip = &mtop->ffparams.iparams[il->iatoms[i]];
 +    a  = il->iatoms[i+1];
 +    molb->posres_xA[a][XX] = ip->posres.pos0A[XX];
 +    molb->posres_xA[a][YY] = ip->posres.pos0A[YY];
 +    molb->posres_xA[a][ZZ] = ip->posres.pos0A[ZZ];
 +    if (bFE) {
 +      molb->posres_xB[a][XX] = ip->posres.pos0B[XX];
 +      molb->posres_xB[a][YY] = ip->posres.pos0B[YY];
 +      molb->posres_xB[a][ZZ] = ip->posres.pos0B[ZZ];
 +    }
 +  }
 +  if (il->nr == 0)
 +  {
 +      /* If only flat-bottomed posres are present, take reference pos from them.
 +         Here: bFE == FALSE      */
 +      for(i=0; i<ilfb->nr; i+=2)
 +      {
 +          ip = &mtop->ffparams.iparams[ilfb->iatoms[i]];
 +          a  = ilfb->iatoms[i+1];
 +          molb->posres_xA[a][XX] = ip->fbposres.pos0[XX];
 +          molb->posres_xA[a][YY] = ip->fbposres.pos0[YY];
 +          molb->posres_xA[a][ZZ] = ip->fbposres.pos0[ZZ];
 +      }
 +  }
 +}
 +
 +static void set_disres_npair(gmx_mtop_t *mtop)
 +{
 +  int mt,i,npair;
 +  t_iparams *ip;
 +  t_ilist *il;
 +  t_iatom *a;
 +
 +  ip = mtop->ffparams.iparams;
 +
 +  for(mt=0; mt<mtop->nmoltype; mt++) {
 +    il = &mtop->moltype[mt].ilist[F_DISRES];
 +    if (il->nr > 0) {
 +      a = il->iatoms;
 +      npair = 0;
 +      for(i=0; i<il->nr; i+=3) {
 +        npair++;
 +        if (i+3 == il->nr || ip[a[i]].disres.label != ip[a[i+3]].disres.label) {
 +          ip[a[i]].disres.npair = npair;
 +          npair = 0;
 +        }
 +      }
 +    }
 +  }
 +}
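 +
 +/* set_disres_npair above walks the F_DISRES iatoms in strides of 3
 + * (type,ai,aj) and counts how many consecutive pair entries share a
 + * restraint label; the count is recorded in the disres parameters so the
 + * restraint code knows how many distances belong to each restraint. */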
 +
 +static void do_mtop(t_fileio *fio, gmx_mtop_t *mtop,gmx_bool bRead, 
 +                    int file_version)
 +{
 +  int  mt,mb,i;
 +  t_blocka dumb;
 +
 +  if (bRead)
 +    init_mtop(mtop);
 +  do_symtab(fio, &(mtop->symtab),bRead);
 +  if (bRead && debug) 
 +    pr_symtab(debug,0,"symtab",&mtop->symtab);
 +  
 +  do_symstr(fio, &(mtop->name),bRead,&(mtop->symtab));
 +  
 +  if (file_version >= 57) {
 +    do_ffparams(fio, &mtop->ffparams,bRead,file_version);
 +
 +    gmx_fio_do_int(fio,mtop->nmoltype);
 +  } else {
 +    mtop->nmoltype = 1;
 +  }
 +  if (bRead) {
 +    snew(mtop->moltype,mtop->nmoltype);
 +    if (file_version < 57) {
 +      mtop->moltype[0].name = mtop->name;
 +    }
 +  }
 +  for(mt=0; mt<mtop->nmoltype; mt++) {
 +    do_moltype(fio, &mtop->moltype[mt],bRead,&mtop->symtab,file_version,
 +             &mtop->groups);
 +  }
 +
 +  if (file_version >= 57) {
 +    gmx_fio_do_int(fio,mtop->nmolblock);
 +  } else {
 +    mtop->nmolblock = 1;
 +  }
 +  if (bRead) {
 +    snew(mtop->molblock,mtop->nmolblock);
 +  }
 +  if (file_version >= 57) {
 +    for(mb=0; mb<mtop->nmolblock; mb++) {
 +      do_molblock(fio, &mtop->molblock[mb],bRead,file_version);
 +    }
 +    gmx_fio_do_int(fio,mtop->natoms);
 +  } else {
 +    mtop->molblock[0].type = 0;
 +    mtop->molblock[0].nmol = 1;
 +    mtop->molblock[0].natoms_mol = mtop->moltype[0].atoms.nr;
 +    mtop->molblock[0].nposres_xA = 0;
 +    mtop->molblock[0].nposres_xB = 0;
 +  }
 +
 +  do_atomtypes (fio, &(mtop->atomtypes),bRead,&(mtop->symtab), file_version);
 +  if (bRead && debug) 
 +    pr_atomtypes(debug,0,"atomtypes",&mtop->atomtypes,TRUE);
 +
 +  if (file_version < 57) {
 +    /* Debug statements are inside do_idef */    
 +    do_idef (fio, &mtop->ffparams,&mtop->moltype[0],bRead,file_version);
 +    mtop->natoms = mtop->moltype[0].atoms.nr;
 +  }
 +      
 +  if(file_version >= 65)
 +  {
 +      do_cmap(fio, &mtop->ffparams.cmap_grid,bRead);
 +  }
 +  else
 +  {
 +      mtop->ffparams.cmap_grid.ngrid        = 0;
 +      mtop->ffparams.cmap_grid.grid_spacing = 0;
 +      mtop->ffparams.cmap_grid.cmapdata     = NULL;
 +  }
 +        
 +  if (file_version >= 57) {
 +    do_groups(fio, &mtop->groups,bRead,&(mtop->symtab),file_version);
 +  }
 +
 +  if (file_version < 57) {
 +    do_block(fio, &mtop->moltype[0].cgs,bRead,file_version);
 +    if (bRead && gmx_debug_at) {
 +      pr_block(debug,0,"cgs",&mtop->moltype[0].cgs,TRUE);
 +    }
 +    do_block(fio, &mtop->mols,bRead,file_version);
 +    /* Add the posres coordinates to the molblock */
 +    add_posres_molblock(mtop);
 +  }
 +  if (bRead) {
 +    if (file_version >= 57) {
 +      mtop->mols = mtop_mols(mtop);
 +    }
 +    if (gmx_debug_at) { 
 +      pr_block(debug,0,"mols",&mtop->mols,TRUE);
 +    }
 +  }
 +
 +  if (file_version < 51) {
 +    /* Here used to be the shake blocks */
 +    do_blocka(fio, &dumb,bRead,file_version);
 +    if (dumb.nr > 0)
 +      sfree(dumb.index);
 +    if (dumb.nra > 0)
 +      sfree(dumb.a);
 +  }
 +
 +  if (bRead) {
 +    close_symtab(&(mtop->symtab));
 +  }
 +}
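 +
 +/* Note on the two layouts handled above: tpx >= 57 stores ffparams followed
 + * by explicit lists of moltypes and molblocks (with natoms written to the
 + * file), while older files carry a single implicit moltype/molblock whose
 + * atom count defines mtop->natoms. */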
 +
 +/* If TopOnlyOK is TRUE then we can read even future versions
 + * of tpx files, provided the file_generation hasn't changed.
 + * If it is FALSE, we need the inputrec too, and bail out
 + * if the file is newer than the program.
 + *
 + * The version and generation of the file (see the top of this file)
 + * are returned in the two last arguments.
 + *
 + * If possible, we will read the inputrec even when TopOnlyOK is TRUE.
 + */
 +static void do_tpxheader(t_fileio *fio,gmx_bool bRead,t_tpxheader *tpx, 
 +                         gmx_bool TopOnlyOK, int *file_version, 
 +                         int *file_generation)
 +{
 +  char  buf[STRLEN];
 +  char  file_tag[STRLEN];
 +  gmx_bool  bDouble;
 +  int   precision;
 +  int   fver,fgen;
 +  int   idum=0;
 +  real  rdum=0;
 +
 +  gmx_fio_checktype(fio);
 +  gmx_fio_setdebug(fio,bDebugMode());
 +  
 +  /* NEW! XDR tpb file */
 +  precision = sizeof(real);
 +  if (bRead) {
 +    gmx_fio_do_string(fio,buf);
 +    if (strncmp(buf,"VERSION",7))
 +      gmx_fatal(FARGS,"Can not read file %s,\n"
 +                "             this file is from a Gromacs version which is older than 2.0\n"
 +                "             Make a new one with grompp or use a gro or pdb file, if possible",
 +                gmx_fio_getname(fio));
 +    gmx_fio_do_int(fio,precision);
 +    bDouble = (precision == sizeof(double));
 +    if ((precision != sizeof(float)) && !bDouble)
 +      gmx_fatal(FARGS,"Unknown precision in file %s: real is %d bytes "
 +                "instead of %d or %d",
 +                gmx_fio_getname(fio),precision,(int)sizeof(float),(int)sizeof(double));
 +    gmx_fio_setprecision(fio,bDouble);
 +    fprintf(stderr,"Reading file %s, %s (%s precision)\n",
 +          gmx_fio_getname(fio),buf,bDouble ? "double" : "single");
 +  }
 +  else {
 +    gmx_fio_write_string(fio,GromacsVersion());
 +    bDouble = (precision == sizeof(double));
 +    gmx_fio_setprecision(fio,bDouble);
 +    gmx_fio_do_int(fio,precision);
 +    fver = tpx_version;
 +    sprintf(file_tag,"%s",tpx_tag);
 +    fgen = tpx_generation;
 +  }
 +  
 +    /* Check versions! */
 +    gmx_fio_do_int(fio,fver);
 +
 +    /* This is for backward compatibility with development versions 77-79
 +     * where the tag was, mistakenly, placed before the generation, which
 +     * would cause a segv instead of a proper error message when reading
 +     * only the topology from a tpx file with pre-77 code.
 +     */
 +    if (fver >= 77 && fver <= 79)
 +    {
 +        gmx_fio_do_string(fio,file_tag);
 +    }
 +  
 +    if (fver >= 26)
 +    {
 +        gmx_fio_do_int(fio,fgen);
 +    }
 +    else
 +    {
 +        fgen = 0;
 +    }
 + 
 +    if (fver >= 81)
 +    {
 +        gmx_fio_do_string(fio,file_tag);
 +    }
 +    if (bRead)
 +    {
 +        if (fver < 77)
 +        {
 +            /* Versions before 77 don't have the tag, set it to release */
 +            sprintf(file_tag,"%s",TPX_TAG_RELEASE);
 +        }
 +
 +        if (strcmp(file_tag,tpx_tag) != 0)
 +        {
 +            fprintf(stderr,"Note: file tpx tag '%s', software tpx tag '%s'\n",
 +                    file_tag,tpx_tag);
 +
 +            /* We only support reading tpx files with the same tag as the code
 +             * or tpx files with the release tag and with lower version number.
 +             */
 +            if (!(strcmp(file_tag,TPX_TAG_RELEASE) == 0 && fver < tpx_version))
 +            {
 +                gmx_fatal(FARGS,"tpx tag/version mismatch: reading tpx file (%s) version %d, tag '%s' with program for tpx version %d, tag '%s'",
 +                          gmx_fio_getname(fio),fver,file_tag,
 +                          tpx_version,tpx_tag);
 +            }
 +        }
 +    }
 +
 +    if (file_version != NULL)
 +    {
 +        *file_version = fver;
 +    }
 +    if (file_generation != NULL)
 +    {
 +        *file_generation = fgen;
 +    }
 +   
 +  
 +  if ((fver <= tpx_incompatible_version) ||
 +      ((fver > tpx_version) && !TopOnlyOK) ||
 +      (fgen > tpx_generation))
 +    gmx_fatal(FARGS,"reading tpx file (%s) version %d with version %d program",
 +              gmx_fio_getname(fio),fver,tpx_version);
 +  
 +  do_section(fio,eitemHEADER,bRead);
 +  gmx_fio_do_int(fio,tpx->natoms);
 +  if (fver >= 28)
 +    gmx_fio_do_int(fio,tpx->ngtc);
 +  else
 +    tpx->ngtc = 0;
 +  if (fver < 62) {
 +      gmx_fio_do_int(fio,idum);
 +      gmx_fio_do_real(fio,rdum);
 +  }
 +  /* A better decision will eventually (5.0 or later) need to be made
 +     on how to treat the alchemical state of the system, which can now
 +     vary through a simulation, and cannot be completely described
 +     through a single lambda variable, or even a single state
 +     index. Eventually, it should probably be a vector. MRS */
 +  if (fver >= 79) 
 +  {
 +      gmx_fio_do_int(fio,tpx->fep_state);
 +  }
 +  gmx_fio_do_real(fio,tpx->lambda);
 +  gmx_fio_do_int(fio,tpx->bIr);
 +  gmx_fio_do_int(fio,tpx->bTop);
 +  gmx_fio_do_int(fio,tpx->bX);
 +  gmx_fio_do_int(fio,tpx->bV);
 +  gmx_fio_do_int(fio,tpx->bF);
 +  gmx_fio_do_int(fio,tpx->bBox);
 +
 +  if (fgen > tpx_generation) {
 +    /* This can only happen if TopOnlyOK=TRUE */
 +    tpx->bIr=FALSE;
 +  }
 +}
 +
 +static int do_tpx(t_fileio *fio, gmx_bool bRead,
 +                t_inputrec *ir,t_state *state,rvec *f,gmx_mtop_t *mtop,
 +                gmx_bool bXVallocated)
 +{
 +  t_tpxheader tpx;
 +  t_inputrec  dum_ir;
 +  gmx_mtop_t  dum_top;
 +  gmx_bool        TopOnlyOK,bDum=TRUE;
 +  int         file_version,file_generation;
 +  int         i;
 +  rvec        *xptr,*vptr;
 +  int         ePBC;
 +  gmx_bool        bPeriodicMols;
 +
 +  if (!bRead) {
 +    tpx.natoms = state->natoms;
 +    tpx.ngtc   = state->ngtc;  /* need to add nnhpres here? */
 +    tpx.fep_state = state->fep_state;
 +    tpx.lambda = state->lambda[efptFEP];
 +    tpx.bIr  = (ir       != NULL);
 +    tpx.bTop = (mtop     != NULL);
 +    tpx.bX   = (state->x != NULL);
 +    tpx.bV   = (state->v != NULL);
 +    tpx.bF   = (f        != NULL);
 +    tpx.bBox = TRUE;
 +  }
 +  
 +  TopOnlyOK = (ir==NULL);
 +  
 +  do_tpxheader(fio,bRead,&tpx,TopOnlyOK,&file_version,&file_generation);
 +
 +  if (bRead) {
 +    state->flags  = 0;
 +    /* state->lambda = tpx.lambda;*/ /*remove this eventually? */
 +    /* The init_state calls initialize the Nose-Hoover xi integrals to zero */
 +    if (bXVallocated) {
 +      xptr = state->x;
 +      vptr = state->v;
 +      init_state(state,0,tpx.ngtc,0,0,0);  /* nose-hoover chains */ /* eventually, need to add nnhpres here? */
 +      state->natoms = tpx.natoms;
 +      state->nalloc = tpx.natoms;
 +      state->x = xptr;
 +      state->v = vptr;
 +    } else {
 +        init_state(state,tpx.natoms,tpx.ngtc,0,0,0);  /* nose-hoover chains */
 +    }
 +  }
 +
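 +/* do_test: on read, fail hard if the caller asked for a section (p != NULL)
 + * that the file does not provide (b is FALSE) */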
 +#define do_test(fio,b,p) if (bRead && (p!=NULL) && !b) gmx_fatal(FARGS,"No %s in %s",#p,gmx_fio_getname(fio)) 
 +
 +  do_test(fio,tpx.bBox,state->box);
 +  do_section(fio,eitemBOX,bRead);
 +  if (tpx.bBox) {
 +    gmx_fio_ndo_rvec(fio,state->box,DIM);
 +    if (file_version >= 51) {
 +      gmx_fio_ndo_rvec(fio,state->box_rel,DIM);
 +    } else {
 +      /* We initialize box_rel after reading the inputrec */
 +      clear_mat(state->box_rel);
 +    }
 +    if (file_version >= 28) {
 +      gmx_fio_ndo_rvec(fio,state->boxv,DIM);
 +      if (file_version < 56) {
 +      matrix mdum;
 +      gmx_fio_ndo_rvec(fio,mdum,DIM);
 +      }
 +    }
 +  }
 +  
 +  if (state->ngtc > 0 && file_version >= 28) {
 +    real *dumv;
 +    /*ndo_double(state->nosehoover_xi,state->ngtc,bDum);*/
 +    /*ndo_double(state->nosehoover_vxi,state->ngtc,bDum);*/
 +    /*ndo_double(state->therm_integral,state->ngtc,bDum);*/
 +    snew(dumv,state->ngtc);
 +    if (file_version < 69) {
 +      bDum=gmx_fio_ndo_real(fio,dumv,state->ngtc);
 +    }
 +    /* These used to be the Berendsen tcoupl_lambda's */
 +    bDum=gmx_fio_ndo_real(fio,dumv,state->ngtc);
 +    sfree(dumv);
 +  }
 +
 +  /* Prior to tpx version 26, the inputrec was here.
 +   * I moved it to enable partial forward-compatibility
 +   * for analysis/viewer programs.
 +   */
 +  if (file_version<26) {
 +    do_test(fio,tpx.bIr,ir);
 +    do_section(fio,eitemIR,bRead);
 +    if (tpx.bIr) {
 +      if (ir) {
 +        do_inputrec(fio, ir,bRead,file_version,
 +                    mtop ? &mtop->ffparams.fudgeQQ : NULL);
 +        if (bRead && debug)
 +          pr_inputrec(debug,0,"inputrec",ir,FALSE);
 +      }
 +      else {
 +        do_inputrec(fio, &dum_ir,bRead,file_version,
 +                    mtop ? &mtop->ffparams.fudgeQQ : NULL);
 +        if (bRead && debug)
 +          pr_inputrec(debug,0,"inputrec",&dum_ir,FALSE);
 +        done_inputrec(&dum_ir);
 +      }
 +    }
 +  }
 +  
 +  do_test(fio,tpx.bTop,mtop);
 +  do_section(fio,eitemTOP,bRead);
 +  if (tpx.bTop) {
 +    int mtop_file_version = file_version;
 +    /* Allow reading of GROMACS 4.6 files */
 +    if (mtop_file_version>80 && mtop_file_version<90)
 +    {
 +        mtop_file_version = 79;
 +    }
 +    if (mtop) {
 +      do_mtop(fio,mtop,bRead, mtop_file_version);
 +    } else {
 +      do_mtop(fio,&dum_top,bRead,mtop_file_version);
 +      done_mtop(&dum_top,TRUE);
 +    }
 +  }
 +  do_test(fio,tpx.bX,state->x);  
 +  do_section(fio,eitemX,bRead);
 +  if (tpx.bX) {
 +    if (bRead) {
 +      state->flags |= (1<<estX);
 +    }
 +    gmx_fio_ndo_rvec(fio,state->x,state->natoms);
 +  }
 +  
 +  do_test(fio,tpx.bV,state->v);
 +  do_section(fio,eitemV,bRead);
 +  if (tpx.bV) {
 +    if (bRead) {
 +      state->flags |= (1<<estV);
 +    }
 +    gmx_fio_ndo_rvec(fio,state->v,state->natoms);
 +  }
 +
 +  do_test(fio,tpx.bF,f);
 +  do_section(fio,eitemF,bRead);
 +  if (tpx.bF) gmx_fio_ndo_rvec(fio,f,state->natoms);
 +
 +  /* Starting with tpx version 26, we have the inputrec
 +   * at the end of the file, so we can ignore it
 +   * if the file is newer than the software (but still the
 +   * same generation - see comments at the top of this file).
 +   */
 +  ePBC = -1;
 +  bPeriodicMols = FALSE;
 +  if (file_version >= 26) {
 +    do_test(fio,tpx.bIr,ir);
 +    do_section(fio,eitemIR,bRead);
 +    if (tpx.bIr) {
 +      if (file_version >= 53) {
 +        /* Removed the pbc info from do_inputrec, since we always want it */
 +        if (!bRead) {
 +          ePBC          = ir->ePBC;
 +          bPeriodicMols = ir->bPeriodicMols;
 +        }
 +        gmx_fio_do_int(fio,ePBC);
 +        gmx_fio_do_gmx_bool(fio,bPeriodicMols);
 +      }
 +      if (file_generation <= tpx_generation && ir) {
 +        do_inputrec(fio, ir,bRead,file_version,mtop ? &mtop->ffparams.fudgeQQ : NULL);
 +        if (bRead && debug)
 +          pr_inputrec(debug,0,"inputrec",ir,FALSE);
 +        if (file_version < 51)
 +          set_box_rel(ir,state);
 +        if (file_version < 53) {
 +          ePBC          = ir->ePBC;
 +          bPeriodicMols = ir->bPeriodicMols;
 +        }
 +      }
 +      if (bRead && ir && file_version >= 53) {
 +        /* We need to do this after do_inputrec, since that initializes ir */
 +        ir->ePBC          = ePBC;
 +        ir->bPeriodicMols = bPeriodicMols;
 +      }
 +    }
 +  }
 +
 +    if (bRead)
 +    {
 +        if (tpx.bIr && ir)
 +        {
 +            if (state->ngtc == 0)
 +            {
 +                /* Reading old version without tcoupl state data: set it */
 +                init_gtc_state(state,ir->opts.ngtc,0,ir->opts.nhchainlength);
 +            }
 +            if (tpx.bTop && mtop)
 +            {
 +                if (file_version < 57)
 +                {
 +                    if (mtop->moltype[0].ilist[F_DISRES].nr > 0)
 +                    {
 +                        ir->eDisre = edrSimple;
 +                    }
 +                    else
 +                    {
 +                        ir->eDisre = edrNone;
 +                    }
 +                }
 +                set_disres_npair(mtop);
 +            }
 +        }
 +
 +        if (tpx.bTop && mtop)
 +        {
 +            gmx_mtop_finalize(mtop);
 +        }
 +
 +        if (file_version >= 57)
 +        {
 +            char *env;
 +            int  ienv;
 +            env = getenv("GMX_NOCHARGEGROUPS");
 +            if (env != NULL)
 +            {
 +                sscanf(env,"%d",&ienv);
 +                fprintf(stderr,"\nFound env.var. GMX_NOCHARGEGROUPS = %d\n",
 +                        ienv);
 +                if (ienv > 0)
 +                {
 +                    fprintf(stderr,
 +                            "Will make single atomic charge groups in non-solvent%s\n",
 +                            ienv > 1 ? " and solvent" : "");
 +                    gmx_mtop_make_atomic_charge_groups(mtop,ienv==1);
 +                }
 +                fprintf(stderr,"\n");
 +            }
 +        }
 +    }
 +
 +    return ePBC;
 +}
 +
 +/************************************************************
 + *
 + *  The following routines are the exported ones
 + *
 + ************************************************************/
 +
 +t_fileio *open_tpx(const char *fn,const char *mode)
 +{
 +  return gmx_fio_open(fn,mode);
 +}    
 + 
 +void close_tpx(t_fileio *fio)
 +{
 +  gmx_fio_close(fio);
 +}
 +
 +void read_tpxheader(const char *fn, t_tpxheader *tpx, gmx_bool TopOnlyOK,
 +                    int *file_version, int *file_generation)
 +{
 +  t_fileio *fio;
 +
 +  fio = open_tpx(fn,"r");
 +  do_tpxheader(fio,TRUE,tpx,TopOnlyOK,file_version,file_generation);
 +  close_tpx(fio);
 +}
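 +
 +/* A minimal caller sketch (file name hypothetical): probe a run input file
 + * before deciding what else to read. TopOnlyOK=TRUE accepts newer-version
 + * files as long as the generation matches.
 + *
 + *   t_tpxheader header;
 + *   int         fver,fgen;
 + *   read_tpxheader("topol.tpr",&header,TRUE,&fver,&fgen);
 + *   if (header.bIr) { ... the file carries an inputrec ... }
 + */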
 +
 +void write_tpx_state(const char *fn,
 +                   t_inputrec *ir,t_state *state,gmx_mtop_t *mtop)
 +{
 +  t_fileio *fio;
 +
 +  fio = open_tpx(fn,"w");
 +  do_tpx(fio,FALSE,ir,state,NULL,mtop,FALSE);
 +  close_tpx(fio);
 +}
 +
 +void read_tpx_state(const char *fn,
 +                  t_inputrec *ir,t_state *state,rvec *f,gmx_mtop_t *mtop)
 +{
 +  t_fileio *fio;
 +      
 +  fio = open_tpx(fn,"r");
 +  do_tpx(fio,TRUE,ir,state,f,mtop,FALSE);
 +  close_tpx(fio);
 +}
 +
 +int read_tpx(const char *fn,
 +           t_inputrec *ir, matrix box,int *natoms,
 +           rvec *x,rvec *v,rvec *f,gmx_mtop_t *mtop)
 +{
 +  t_fileio *fio;
 +  t_state state;
 +  int ePBC;
 +
 +  state.x = x;
 +  state.v = v;
 +  fio = open_tpx(fn,"r");
 +  ePBC = do_tpx(fio,TRUE,ir,&state,f,mtop,TRUE);
 +  close_tpx(fio);
 +  *natoms = state.natoms;
 +  if (box) 
 +    copy_mat(state.box,box);
 +  state.x = NULL;
 +  state.v = NULL;
 +  done_state(&state);
 +
 +  return ePBC;
 +}
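 +
 +/* A minimal caller sketch for read_tpx() (file name hypothetical): the
 + * coordinate buffer is caller-owned, so size it from the header first.
 + * Passing NULL for v and f mirrors what read_tps_conf() below does when the
 + * caller does not need them.
 + *
 + *   t_tpxheader h;
 + *   t_inputrec  ir;
 + *   gmx_mtop_t  mtop;
 + *   matrix      box;
 + *   rvec        *x;
 + *   int         ver,gen,natoms,ePBC;
 + *   read_tpxheader("topol.tpr",&h,TRUE,&ver,&gen);
 + *   snew(x,h.natoms);
 + *   ePBC = read_tpx("topol.tpr",&ir,box,&natoms,x,NULL,NULL,&mtop);
 + */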
 +
 +int read_tpx_top(const char *fn,
 +               t_inputrec *ir, matrix box,int *natoms,
 +               rvec *x,rvec *v,rvec *f,t_topology *top)
 +{
 +  gmx_mtop_t mtop;
 +  t_topology *ltop;
 +  int ePBC;
 +
 +  ePBC = read_tpx(fn,ir,box,natoms,x,v,f,&mtop);
 +  
 +  *top = gmx_mtop_t_to_t_topology(&mtop);
 +
 +  return ePBC;
 +}
 +
 +gmx_bool fn2bTPX(const char *file)
 +{
 +  switch (fn2ftp(file)) {
 +  case efTPR:
 +  case efTPB:
 +  case efTPA:
 +    return TRUE;
 +  default:
 +    return FALSE;
 +  }
 +}
 +
 +gmx_bool read_tps_conf(const char *infile,char *title,t_topology *top,int *ePBC,
 +                 rvec **x,rvec **v,matrix box,gmx_bool bMass)
 +{
 +  t_tpxheader  header;
 +  int          natoms,i,version,generation;
 +  gmx_bool         bTop,bXNULL=FALSE;
 +  gmx_mtop_t   *mtop;
 +  t_topology   *topconv;
 +  gmx_atomprop_t aps;
 +  
 +  bTop = fn2bTPX(infile);
 +  *ePBC = -1;
 +  if (bTop) {
 +    read_tpxheader(infile,&header,TRUE,&version,&generation);
 +    if (x)
 +      snew(*x,header.natoms);
 +    if (v)
 +      snew(*v,header.natoms);
 +    snew(mtop,1);
 +    *ePBC = read_tpx(infile,NULL,box,&natoms,
 +                   (x==NULL) ? NULL : *x,(v==NULL) ? NULL : *v,NULL,mtop);
 +    *top = gmx_mtop_t_to_t_topology(mtop);
 +    sfree(mtop);
 +    strcpy(title,*top->name);
 +    tpx_make_chain_identifiers(&top->atoms,&top->mols);
 +  }
 +  else {
 +    get_stx_coordnum(infile,&natoms);
 +    init_t_atoms(&top->atoms,natoms,(fn2ftp(infile) == efPDB));
 +    if (x == NULL)
 +    {
 +        snew(x,1);
 +        bXNULL = TRUE;
 +    }
 +    snew(*x,natoms);
 +    if (v)
 +      snew(*v,natoms);
 +    read_stx_conf(infile,title,&top->atoms,*x,(v==NULL) ? NULL : *v,ePBC,box);
 +    if (bXNULL)
 +    {
 +      sfree(*x);
 +      sfree(x);
 +    }
 +    if (bMass) {
 +      aps = gmx_atomprop_init();
 +      for(i=0; (i<natoms); i++)
 +        if (!gmx_atomprop_query(aps,epropMass,
 +                                *top->atoms.resinfo[top->atoms.atom[i].resind].name,
 +                                *top->atoms.atomname[i],
 +                                &(top->atoms.atom[i].m))) {
 +          if (debug)
 +            fprintf(debug,"Cannot find mass for atom %s %d %s, setting to 1\n",
 +                    *top->atoms.resinfo[top->atoms.atom[i].resind].name,
 +                    top->atoms.resinfo[top->atoms.atom[i].resind].nr,
 +                    *top->atoms.atomname[i]);
 +        }
 +      gmx_atomprop_destroy(aps);
 +    }
 +    top->idef.ntypes=-1;
 +  }
 +
 +  return bTop;
 +}
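 +
 +/* A minimal usage sketch of the exported routines above; "topol.tpr" is
 + * just an illustrative filename (mdrun's runner uses the same pattern,
 + * passing NULL for the forces):
 + *
 + *   t_inputrec ir;
 + *   t_state    state;
 + *   gmx_mtop_t mtop;
 + *
 + *   read_tpx_state("topol.tpr",&ir,&state,NULL,&mtop);
 + *   ...
 + *   write_tpx_state("topol.tpr",&ir,&state,&mtop);
 + */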
index 2f4218c6641586a0014e5f149fc9f3e17116f214,0000000000000000000000000000000000000000..82922dce52124ae79373b0906038ac4ed2817d58
mode 100644,000000..100644
--- /dev/null
@@@ -1,1601 -1,0 +1,1616 @@@
 +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
 + *
 + * 
 + *                This source code is part of
 + * 
 + *                 G   R   O   M   A   C   S
 + * 
 + *          GROningen MAchine for Chemical Simulations
 + * 
 + *                        VERSION 3.2.0
 + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
 + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 + * Copyright (c) 2001-2004, The GROMACS development team,
 + * check out http://www.gromacs.org for more information.
 +
 + * This program is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU General Public License
 + * as published by the Free Software Foundation; either version 2
 + * of the License, or (at your option) any later version.
 + * 
 + * If you want to redistribute modifications, please consider that
 + * scientific software is very special. Version control is crucial -
 + * bugs must be traceable. We will be happy to consider code for
 + * inclusion in the official distribution, but derived work must not
 + * be called official GROMACS. Details are found in the README & COPYING
 + * files - if they are missing, get the official version at www.gromacs.org.
 + * 
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the papers on the package - you can find them in the top README file.
 + * 
 + * For more info, check our website at http://www.gromacs.org
 + * 
 + * And Hey:
 + * GROningen Mixture of Alchemy and Childrens' Stories
 + */
 +#ifdef HAVE_CONFIG_H
 +#include <config.h>
 +#endif
 +
 +/* This file is completely threadsafe - please keep it that way! */
 +#ifdef GMX_THREAD_MPI
 +#include <thread_mpi.h>
 +#endif
 +
 +
 +#include <stdio.h>
 +#include "smalloc.h"
 +#include "typedefs.h"
 +#include "names.h"
 +#include "txtdump.h"
 +#include "string2.h"
 +#include "vec.h"
 +#include "macros.h"
 +
 +
 +int pr_indent(FILE *fp,int n)
 +{
 +  int i;
 +
 +  for (i=0; i<n; i++) (void) fprintf(fp," ");
 +  return n;
 +}
 +
 +int available(FILE *fp,void *p,int indent,const char *title)
 +{
 +  if (!p) {
 +    if (indent > 0)
 +      pr_indent(fp,indent);
 +    (void) fprintf(fp,"%s: not available\n",title);
 +  }
 +  return (p!=NULL);
 +}
 +
 +int pr_title(FILE *fp,int indent,const char *title)
 +{
 +  (void) pr_indent(fp,indent);
 +  (void) fprintf(fp,"%s:\n",title);
 +  return (indent+INDENT);
 +}
 +
 +int pr_title_n(FILE *fp,int indent,const char *title,int n)
 +{
 +  (void) pr_indent(fp,indent);
 +  (void) fprintf(fp,"%s (%d):\n",title,n);
 +  return (indent+INDENT);
 +}
 +
 +int pr_title_nxn(FILE *fp,int indent,const char *title,int n1,int n2)
 +{
 +  (void) pr_indent(fp,indent);
 +  (void) fprintf(fp,"%s (%dx%d):\n",title,n1,n2);
 +  return (indent+INDENT);
 +}
 +
 +void pr_ivec(FILE *fp,int indent,const char *title,int vec[],int n, gmx_bool bShowNumbers)
 +{
 +  int i;
 +
 +  if (available(fp,vec,indent,title))
 +    {
 +      indent=pr_title_n(fp,indent,title,n);
 +      for (i=0; i<n; i++)
 +        {
 +          (void) pr_indent(fp,indent);
 +          (void) fprintf(fp,"%s[%d]=%d\n",title,bShowNumbers?i:-1,vec[i]);
 +        }
 +    }
 +}
 +
 +void pr_ivec_block(FILE *fp,int indent,const char *title,int vec[],int n, gmx_bool bShowNumbers)
 +{
 +    int i,j;
 +    
 +    if (available(fp,vec,indent,title))
 +    {
 +        indent=pr_title_n(fp,indent,title,n);
 +        i = 0;
 +        while (i < n)
 +        {
 +            j = i+1;
 +            while (j < n && vec[j] == vec[j-1]+1)
 +            {
 +                j++;
 +            }
 +            /* Print consecutive groups of 3 or more as blocks */
 +            if (j - i < 3)
 +            {
 +                while(i < j)
 +                {
 +                    (void) pr_indent(fp,indent);
 +                    (void) fprintf(fp,"%s[%d]=%d\n",
 +                                   title,bShowNumbers?i:-1,vec[i]);
 +                    i++;
 +                }
 +            }
 +            else
 +            {
 +                (void) pr_indent(fp,indent);
 +                (void) fprintf(fp,"%s[%d,...,%d] = {%d,...,%d}\n",
 +                               title,
 +                               bShowNumbers?i:-1,
 +                               bShowNumbers?j-1:-1,
 +                               vec[i],vec[j-1]); 
 +                i = j;
 +            }
 +        }
 +    }
 +}
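 +
 +/* Sketch of the compression above (hypothetical input): with vec={0,1,2,5}
 + * and n=4, the run 0,1,2 is long enough (>= 3) to print as one block while
 + * the lone 5 prints individually:
 + *   title[0,...,2] = {0,...,2}
 + *   title[3]=5
 + */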
 +
 +void pr_bvec(FILE *fp,int indent,const char *title,gmx_bool vec[],int n, gmx_bool bShowNumbers)
 +{
 +  int i;
 +
 +  if (available(fp,vec,indent,title))
 +    {
 +      indent=pr_title_n(fp,indent,title,n);
 +      for (i=0; i<n; i++)
 +        {
 +          (void) pr_indent(fp,indent);
 +          (void) fprintf(fp,"%s[%d]=%s\n",title,bShowNumbers?i:-1,
 +                       EBOOL(vec[i]));
 +        }
 +    }
 +}
 +
 +void pr_ivecs(FILE *fp,int indent,const char *title,ivec vec[],int n, gmx_bool bShowNumbers)
 +{
 +  int i,j;
 +
 +  if (available(fp,vec,indent,title))
 +    {  
 +      indent=pr_title_nxn(fp,indent,title,n,DIM);
 +      for (i=0; i<n; i++)
 +        {
 +          (void) pr_indent(fp,indent);
 +          (void) fprintf(fp,"%s[%d]={",title,bShowNumbers?i:-1);
 +          for (j=0; j<DIM; j++)
 +            {
 +              if (j!=0) (void) fprintf(fp,", ");
 +              fprintf(fp,"%d",vec[i][j]);
 +            }
 +          (void) fprintf(fp,"}\n");
 +        }
 +    }
 +}
 +
 +void pr_rvec(FILE *fp,int indent,const char *title,real vec[],int n, gmx_bool bShowNumbers)
 +{
 +  int i;
 +
 +  if (available(fp,vec,indent,title))
 +    {  
 +      indent=pr_title_n(fp,indent,title,n);
 +      for (i=0; i<n; i++)
 +        {
 +          pr_indent(fp,indent);
 +          fprintf(fp,"%s[%d]=%12.5e\n",title,bShowNumbers?i:-1,vec[i]);
 +        }
 +    }
 +}
 +
 +void pr_dvec(FILE *fp,int indent,const char *title,double vec[],int n, gmx_bool bShowNumbers)
 +{
 +  int i;
 +
 +  if (available(fp,vec,indent,title))
 +    {
 +      indent=pr_title_n(fp,indent,title,n);
 +      for (i=0; i<n; i++)
 +        {
 +          pr_indent(fp,indent);
 +          fprintf(fp,"%s[%d]=%12.5e\n",title,bShowNumbers?i:-1,vec[i]);
 +        }
 +    }
 +}
 +
 +
 +/*
 +void pr_mat(FILE *fp,int indent,char *title,matrix m)
 +{
 +  int i;
 +
 +  if (available(fp,m,indent,title)) {
 +    indent=pr_title_n(fp,indent,title,DIM);
 +    for(i=0; i<DIM; i++) {
 +      pr_indent(fp,indent);
 +      fprintf(fp,"%s[%d]=%12.5e %12.5e %12.5e\n",
 +            title,i,m[i][XX],m[i][YY],m[i][ZZ]);
 +    }
 +  }
 +}
 +*/
 +
 +void pr_rvecs_len(FILE *fp,int indent,const char *title,rvec vec[],int n)
 +{
 +  int i,j;
 +
 +  if (available(fp,vec,indent,title)) {  
 +    indent=pr_title_nxn(fp,indent,title,n,DIM);
 +    for (i=0; i<n; i++) {
 +      (void) pr_indent(fp,indent);
 +      (void) fprintf(fp,"%s[%5d]={",title,i);
 +      for (j=0; j<DIM; j++) {
 +        if (j != 0)
 +          (void) fprintf(fp,", ");
 +        (void) fprintf(fp,"%12.5e",vec[i][j]);
 +      }
 +      (void) fprintf(fp,"} len=%12.5e\n",norm(vec[i]));
 +    }
 +  }
 +}
 +
 +void pr_rvecs(FILE *fp,int indent,const char *title,rvec vec[],int n)
 +{
 +  const char *fshort = "%12.5e";
 +  const char *flong  = "%15.8e";
 +  const char *format;
 +  int i,j;
 +
 +  if (getenv("LONGFORMAT") != NULL)
 +    format = flong;
 +  else
 +    format = fshort;
 +    
 +  if (available(fp,vec,indent,title)) {  
 +    indent=pr_title_nxn(fp,indent,title,n,DIM);
 +    for (i=0; i<n; i++) {
 +      (void) pr_indent(fp,indent);
 +      (void) fprintf(fp,"%s[%5d]={",title,i);
 +      for (j=0; j<DIM; j++) {
 +        if (j != 0)
 +          (void) fprintf(fp,", ");
 +        (void) fprintf(fp,format,vec[i][j]);
 +      }
 +      (void) fprintf(fp,"}\n");
 +    }
 +  }
 +}
 +
 +
 +void pr_reals(FILE *fp,int indent,const char *title,real *vec,int n)
 +{
 +  int i;
 +    
 +  if (available(fp,vec,indent,title)) {  
 +    (void) pr_indent(fp,indent);
 +    (void) fprintf(fp,"%s:\t",title);
 +    for(i=0; i<n; i++)
 +      fprintf(fp,"  %10g",vec[i]);
 +    (void) fprintf(fp,"\n");
 +  }
 +}
 +
 +void pr_doubles(FILE *fp,int indent,const char *title,double *vec,int n)
 +{
 +  int i;
 +    
 +  if (available(fp,vec,indent,title)) {  
 +    (void) pr_indent(fp,indent);
 +    (void) fprintf(fp,"%s:\t",title);
 +    for(i=0; i<n; i++)
 +      fprintf(fp,"  %10g",vec[i]);
 +    (void) fprintf(fp,"\n");
 +  }
 +}
 +
 +static void pr_int(FILE *fp,int indent,const char *title,int i)
 +{
 +  pr_indent(fp,indent);
 +  fprintf(fp,"%-20s = %d\n",title,i);
 +}
 +
 +static void pr_gmx_large_int(FILE *fp,int indent,const char *title,gmx_large_int_t i)
 +{
 +  char buf[STEPSTRSIZE];
 +
 +  pr_indent(fp,indent);
 +  fprintf(fp,"%-20s = %s\n",title,gmx_step_str(i,buf));
 +}
 +
 +static void pr_real(FILE *fp,int indent,const char *title,real r)
 +{
 +  pr_indent(fp,indent);
 +  fprintf(fp,"%-20s = %g\n",title,r);
 +}
 +
 +static void pr_double(FILE *fp,int indent,const char *title,double d)
 +{
 +  pr_indent(fp,indent);
 +  fprintf(fp,"%-20s = %g\n",title,d);
 +}
 +
 +static void pr_str(FILE *fp,int indent,const char *title,const char *s)
 +{
 +  pr_indent(fp,indent);
 +  fprintf(fp,"%-20s = %s\n",title,s);
 +}
 +
 +void pr_qm_opts(FILE *fp,int indent,const char *title,t_grpopts *opts)
 +{
 +  int i,m,j;
 +
 +  fprintf(fp,"%s:\n",title);
 +  
 +  pr_int(fp,indent,"ngQM",opts->ngQM);
 +  if (opts->ngQM > 0) {
 +    pr_ivec(fp,indent,"QMmethod",opts->QMmethod,opts->ngQM,FALSE);
 +    pr_ivec(fp,indent,"QMbasis",opts->QMbasis,opts->ngQM,FALSE);
 +    pr_ivec(fp,indent,"QMcharge",opts->QMcharge,opts->ngQM,FALSE);
 +    pr_ivec(fp,indent,"QMmult",opts->QMmult,opts->ngQM,FALSE);
 +    pr_bvec(fp,indent,"bSH",opts->bSH,opts->ngQM,FALSE);
 +    pr_ivec(fp,indent,"CASorbitals",opts->CASorbitals,opts->ngQM,FALSE);
 +    pr_ivec(fp,indent,"CASelectrons",opts->CASelectrons,opts->ngQM,FALSE);
 +    pr_rvec(fp,indent,"SAon",opts->SAon,opts->ngQM,FALSE);
 +    pr_rvec(fp,indent,"SAon",opts->SAon,opts->ngQM,FALSE);
 +    pr_ivec(fp,indent,"SAsteps",opts->SAsteps,opts->ngQM,FALSE);
 +    pr_bvec(fp,indent,"bOPT",opts->bOPT,opts->ngQM,FALSE);
 +    pr_bvec(fp,indent,"bTS",opts->bTS,opts->ngQM,FALSE);
 +  }
 +}
 +
 +static void pr_grp_opts(FILE *out,int indent,const char *title,t_grpopts *opts,
 +                      gmx_bool bMDPformat)
 +{
 +  int i,m,j;
 +
 +  if (!bMDPformat)
 +    fprintf(out,"%s:\n",title);
 +  
 +  pr_indent(out,indent);
 +  fprintf(out,"nrdf%s",bMDPformat ? " = " : ":");
 +  for(i=0; (i<opts->ngtc); i++)
 +    fprintf(out,"  %10g",opts->nrdf[i]);
 +  fprintf(out,"\n");
 +  
 +  pr_indent(out,indent);
 +  fprintf(out,"ref-t%s",bMDPformat ? " = " : ":");
 +  for(i=0; (i<opts->ngtc); i++)
 +    fprintf(out,"  %10g",opts->ref_t[i]);
 +  fprintf(out,"\n");
 +
 +  pr_indent(out,indent);
 +  fprintf(out,"tau-t%s",bMDPformat ? " = " : ":");
 +  for(i=0; (i<opts->ngtc); i++)
 +    fprintf(out,"  %10g",opts->tau_t[i]);
 +  fprintf(out,"\n");  
 +  
 +  /* Pretty-print the simulated annealing info */
 +  fprintf(out,"anneal%s",bMDPformat ? " = " : ":");
 +  for(i=0; (i<opts->ngtc); i++)
 +    fprintf(out,"  %10s",EANNEAL(opts->annealing[i]));
 +  fprintf(out,"\n");  
 + 
 +  fprintf(out,"ann-npoints%s",bMDPformat ? " = " : ":");
 +  for(i=0; (i<opts->ngtc); i++)
 +    fprintf(out,"  %10d",opts->anneal_npoints[i]);
 +  fprintf(out,"\n");  
 + 
 +  for(i=0; (i<opts->ngtc); i++) {
 +    if(opts->anneal_npoints[i]>0) {
 +      fprintf(out,"ann. times [%d]:\t",i);
 +      for(j=0; (j<opts->anneal_npoints[i]); j++)
 +        fprintf(out,"  %10.1f",opts->anneal_time[i][j]);
 +      fprintf(out,"\n");
 +      fprintf(out,"ann. temps [%d]:\t",i);
 +      for(j=0; (j<opts->anneal_npoints[i]); j++)
 +        fprintf(out,"  %10.1f",opts->anneal_temp[i][j]);
 +      fprintf(out,"\n");
 +    }
 +  }
 +  
 +  pr_indent(out,indent);
 +  fprintf(out,"acc:\t");
 +  for(i=0; (i<opts->ngacc); i++)
 +    for(m=0; (m<DIM); m++)
 +      fprintf(out,"  %10g",opts->acc[i][m]);
 +  fprintf(out,"\n");
 +
 +  pr_indent(out,indent);
 +  fprintf(out,"nfreeze:");
 +  for(i=0; (i<opts->ngfrz); i++)
 +    for(m=0; (m<DIM); m++)
 +      fprintf(out,"  %10s",opts->nFreeze[i][m] ? "Y" : "N");
 +  fprintf(out,"\n");
 +
 +
 +  for(i=0; (i<opts->ngener); i++) {
 +    pr_indent(out,indent);
 +    fprintf(out,"energygrp-flags[%3d]:",i);
 +    for(m=0; (m<opts->ngener); m++)
 +      fprintf(out," %d",opts->egp_flags[opts->ngener*i+m]);
 +    fprintf(out,"\n");
 +  }
 +
 +  fflush(out);
 +}
 +
 +static void pr_matrix(FILE *fp,int indent,const char *title,rvec *m,
 +                    gmx_bool bMDPformat)
 +{
 +  if (bMDPformat)
 +    fprintf(fp,"%-10s    = %g %g %g %g %g %g\n",title,
 +          m[XX][XX],m[YY][YY],m[ZZ][ZZ],m[XX][YY],m[XX][ZZ],m[YY][ZZ]);
 +  else
 +    pr_rvecs(fp,indent,title,m,DIM);
 +}
 +
 +static void pr_cosine(FILE *fp,int indent,const char *title,t_cosines *cos,
 +                    gmx_bool bMDPformat)
 +{
 +  int j;
 +  
 +  if (bMDPformat) {
 +    fprintf(fp,"%s = %d\n",title,cos->n);
 +  }
 +  else {
 +    indent=pr_title(fp,indent,title);
 +    (void) pr_indent(fp,indent);
 +    fprintf(fp,"n = %d\n",cos->n);
 +    if (cos->n > 0) {
 +      (void) pr_indent(fp,indent+2);
 +      fprintf(fp,"a =");
 +      for(j=0; (j<cos->n); j++)
 +        fprintf(fp," %e",cos->a[j]);
 +      fprintf(fp,"\n");
 +      (void) pr_indent(fp,indent+2);
 +      fprintf(fp,"phi =");
 +      for(j=0; (j<cos->n); j++)
 +        fprintf(fp," %e",cos->phi[j]);
 +      fprintf(fp,"\n");
 +    }
 +  }
 +}
 +
 +#define PS(t,s) pr_str(fp,indent,t,s)
 +#define PI(t,s) pr_int(fp,indent,t,s)
 +#define PSTEP(t,s) pr_gmx_large_int(fp,indent,t,s)
 +#define PR(t,s) pr_real(fp,indent,t,s)
 +#define PD(t,s) pr_double(fp,indent,t,s)
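 +
 +/* These shorthands assume a FILE *fp and an int indent in scope, e.g.
 + * PI("nsteps",n) expands to pr_int(fp,indent,"nsteps",n). */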
 +
 +static void pr_pullgrp(FILE *fp,int indent,int g,t_pullgrp *pg)
 +{
 +  pr_indent(fp,indent);
 +  fprintf(fp,"pull-group %d:\n",g);
 +  indent += 2;
 +  pr_ivec_block(fp,indent,"atom",pg->ind,pg->nat,TRUE);
 +  pr_rvec(fp,indent,"weight",pg->weight,pg->nweight,TRUE);
 +  PI("pbcatom",pg->pbcatom);
 +  pr_rvec(fp,indent,"vec",pg->vec,DIM,TRUE);
 +  pr_rvec(fp,indent,"init",pg->init,DIM,TRUE);
 +  PR("rate",pg->rate);
 +  PR("k",pg->k);
 +  PR("kB",pg->kB);
 +}
 +
 +static void pr_simtempvals(FILE *fp,int indent,t_simtemp *simtemp, int n_lambda, gmx_bool bMDPformat)
 +{
 +    PR("simtemp_low",simtemp->simtemp_low);
 +    PR("simtemp_high",simtemp->simtemp_high);
 +    PS("simulated-tempering-scaling",ESIMTEMP(simtemp->eSimTempScale));
 +    pr_rvec(fp,indent,"simulated tempering temperatures",simtemp->temperatures,n_lambda,TRUE);
 +}
 +
 +static void pr_expandedvals(FILE *fp,int indent,t_expanded *expand, int n_lambda, gmx_bool bMDPformat)
 +{
 +
 +    PI("nstexpanded", expand->nstexpanded);
 +    PS("lambda-stats", elamstats_names[expand->elamstats]);
 +    PS("lambda-mc-move", elmcmove_names[expand->elmcmove]);
 +    PI("lmc-repeats",expand->lmc_repeats);
 +    PI("lmc-gibbsdelta",expand->gibbsdeltalam);
 +    PI("lmc-nstart",expand->lmc_forced_nstart);
 +    PS("symmetrized-transition-matrix", EBOOL(expand->bSymmetrizedTMatrix));
 +    PI("nst-transition-matrix",expand->nstTij);
 +    PI("mininum-var-min",expand->minvarmin); /*default is reasonable */
 +    PI("weight-c-range",expand->c_range); /* default is just C=0 */
 +    PR("wl-scale",expand->wl_scale);
 +    PR("init-wl-delta",expand->init_wl_delta);
 +    PR("wl-ratio",expand->wl_ratio);
 +    PS("bWLoneovert",EBOOL(expand->bWLoneovert));
 +    PI("lmc-seed",expand->lmc_seed);
 +    PR("mc-temperature",expand->mc_temp);
 +    PS("lmc-weights-equil",elmceq_names[expand->elmceq]);
 +    if (expand->elmceq == elmceqNUMATLAM)
 +    {
 +        PI("weight-equil-number-all-lambda",expand->equil_n_at_lam);
 +    }
 +    if (expand->elmceq == elmceqSAMPLES)
 +    {
 +        PI("weight-equil-number-samples",expand->equil_samples);
 +    }
 +    if (expand->elmceq == elmceqSTEPS)
 +    {
 +        PI("weight-equil-number-steps",expand->equil_steps);
 +    }
 +    if (expand->elmceq == elmceqWLDELTA)
 +    {
 +        PR("weight-equil-wl-delta",expand->equil_wl_delta);
 +    }
 +    if (expand->elmceq == elmceqRATIO)
 +    {
 +        PR("weight-equil-count-ratio",expand->equil_ratio);
 +    }
 +
 +    pr_indent(fp,indent);
 +    pr_rvec(fp,indent,"init-lambda-weights",expand->init_lambda_weights,n_lambda,TRUE);
 +    PS("init-weights",EBOOL(expand->bInit_weights));
 +}
 +
 +static void pr_fepvals(FILE *fp,int indent,t_lambda *fep, gmx_bool bMDPformat)
 +{
 +    int i,j;
 +
 +    PI("nstdhdl",fep->nstdhdl);
 +    PI("init-lambda-state",fep->init_fep_state);
 +    PR("init-lambda",fep->init_lambda);
 +    PR("delta-lambda",fep->delta_lambda);
 +    if (!bMDPformat)
 +    {
 +        PI("n-lambdas",fep->n_lambda);
 +    }
 +    if (fep->n_lambda > 0)
 +    {
 +        pr_indent(fp,indent);
++        fprintf(fp,"separate-dvdl%s\n",bMDPformat ? " = " : ":");
++        for(i=0; i<efptNR; i++)
++        {
++            fprintf(fp,"%18s = ",efpt_names[i]);
++            if (fep->separate_dvdl[i])
++            {
++                fprintf(fp,"  TRUE");
++            }
++            else
++            {
++                fprintf(fp,"  FALSE");
++            }
++            fprintf(fp,"\n");
++        }
 +        fprintf(fp,"all-lambdas%s\n",bMDPformat ? " = " : ":");
 +        for(i=0; i<efptNR; i++) {
 +            fprintf(fp,"%18s = ",efpt_names[i]);
 +            for(j=0; j<fep->n_lambda; j++)
 +            {
 +                fprintf(fp,"  %10g",fep->all_lambda[i][j]);
 +            }
 +            fprintf(fp,"\n");
 +        }
 +    }
++    PI("calc-lambda-neighbors",fep->lambda_neighbors);
 +
 +    PR("sc-alpha",fep->sc_alpha);
 +    PS("bScCoul",EBOOL(fep->bScCoul));
 +    PS("bScPrintEnergy",EBOOL(fep->bPrintEnergy));
 +    PI("sc-power",fep->sc_power);
 +    PR("sc-r-power",fep->sc_r_power);
 +    PR("sc-sigma",fep->sc_sigma);
 +    PR("sc-sigma-min",fep->sc_sigma_min);
 +    PS("separate-dhdl-file", SEPDHDLFILETYPE(fep->separate_dhdl_file));
 +    PS("dhdl-derivatives", DHDLDERIVATIVESTYPE(fep->dhdl_derivatives));
 +    PI("dh-hist-size", fep->dh_hist_size);
 +    PD("dh-hist-spacing", fep->dh_hist_spacing);
 +}
 +
 +static void pr_pull(FILE *fp,int indent,t_pull *pull)
 +{
 +  int g;
 +
 +  PS("pull-geometry",EPULLGEOM(pull->eGeom));
 +  pr_ivec(fp,indent,"pull-dim",pull->dim,DIM,TRUE);
 +  PR("pull-r1",pull->cyl_r1);
 +  PR("pull-r0",pull->cyl_r0);
 +  PR("pull-constr-tol",pull->constr_tol);
 +  PI("pull-nstxout",pull->nstxout);
 +  PI("pull-nstfout",pull->nstfout);
 +  PI("pull-ngrp",pull->ngrp);
 +  for(g=0; g<pull->ngrp+1; g++)
 +    pr_pullgrp(fp,indent,g,&pull->grp[g]);
 +}
 +
 +static void pr_rotgrp(FILE *fp,int indent,int g,t_rotgrp *rotg)
 +{
 +  pr_indent(fp,indent);
 +  fprintf(fp,"rotation_group %d:\n",g);
 +  indent += 2;
 +  PS("type",EROTGEOM(rotg->eType));
 +  PS("massw",EBOOL(rotg->bMassW));
 +  pr_ivec_block(fp,indent,"atom",rotg->ind,rotg->nat,TRUE);
 +  pr_rvecs(fp,indent,"x_ref",rotg->x_ref,rotg->nat);
 +  pr_rvec(fp,indent,"vec",rotg->vec,DIM,TRUE);
 +  pr_rvec(fp,indent,"pivot",rotg->pivot,DIM,TRUE);
 +  PR("rate",rotg->rate);
 +  PR("k",rotg->k);
 +  PR("slab_dist",rotg->slab_dist);
 +  PR("min_gaussian",rotg->min_gaussian);
 +  PR("epsilon",rotg->eps);
 +  PS("fit_method",EROTFIT(rotg->eFittype));
 +  PI("potfitangle_nstep",rotg->PotAngle_nstep);
 +  PR("potfitangle_step",rotg->PotAngle_step);
 +}
 +
 +static void pr_rot(FILE *fp,int indent,t_rot *rot)
 +{
 +  int g;
 +
 +  PI("rot_nstrout",rot->nstrout);
 +  PI("rot_nstsout",rot->nstsout);
 +  PI("rot_ngrp",rot->ngrp);
 +  for(g=0; g<rot->ngrp; g++)
 +    pr_rotgrp(fp,indent,g,&rot->grp[g]);
 +}
 +
 +void pr_inputrec(FILE *fp,int indent,const char *title,t_inputrec *ir,
 +                 gmx_bool bMDPformat)
 +{
 +  const char *infbuf="inf";
 +  int  i;
 +  
 +  if (available(fp,ir,indent,title)) {
 +    if (!bMDPformat)
 +      indent=pr_title(fp,indent,title);
 +    PS("integrator",EI(ir->eI));
 +    PSTEP("nsteps",ir->nsteps);
 +    PSTEP("init-step",ir->init_step);
 +    PS("cutoff-scheme",ECUTSCHEME(ir->cutoff_scheme));
 +    PS("ns_type",ENS(ir->ns_type));
 +    PI("nstlist",ir->nstlist);
 +    PI("ndelta",ir->ndelta);
 +    PI("nstcomm",ir->nstcomm);
 +    PS("comm-mode",ECOM(ir->comm_mode));
 +    PI("nstlog",ir->nstlog);
 +    PI("nstxout",ir->nstxout);
 +    PI("nstvout",ir->nstvout);
 +    PI("nstfout",ir->nstfout);
 +    PI("nstcalcenergy",ir->nstcalcenergy);
 +    PI("nstenergy",ir->nstenergy);
 +    PI("nstxtcout",ir->nstxtcout);
 +    PR("init-t",ir->init_t);
 +    PR("delta-t",ir->delta_t);
 +    
 +    PR("xtcprec",ir->xtcprec);
 +    PR("fourierspacing",ir->fourier_spacing);
 +    PI("nkx",ir->nkx);
 +    PI("nky",ir->nky);
 +    PI("nkz",ir->nkz);
 +    PI("pme-order",ir->pme_order);
 +    PR("ewald-rtol",ir->ewald_rtol);
 +    PR("ewald-geometry",ir->ewald_geometry);
 +    PR("epsilon-surface",ir->epsilon_surface);
 +    PS("optimize-fft",EBOOL(ir->bOptFFT));
 +    PS("ePBC",EPBC(ir->ePBC));
 +    PS("bPeriodicMols",EBOOL(ir->bPeriodicMols));
 +    PS("bContinuation",EBOOL(ir->bContinuation));
 +    PS("bShakeSOR",EBOOL(ir->bShakeSOR));
 +    PS("etc",ETCOUPLTYPE(ir->etc));
 +    PS("bPrintNHChains",EBOOL(ir->bPrintNHChains));
 +    PI("nsttcouple",ir->nsttcouple);
 +    PS("epc",EPCOUPLTYPE(ir->epc));
 +    PS("epctype",EPCOUPLTYPETYPE(ir->epct));
 +    PI("nstpcouple",ir->nstpcouple);
 +    PR("tau-p",ir->tau_p);
 +    pr_matrix(fp,indent,"ref-p",ir->ref_p,bMDPformat);
 +    pr_matrix(fp,indent,"compress",ir->compress,bMDPformat);
 +    PS("refcoord-scaling",EREFSCALINGTYPE(ir->refcoord_scaling));
 +    if (bMDPformat)
 +      fprintf(fp,"posres-com  = %g %g %g\n",ir->posres_com[XX],
 +            ir->posres_com[YY],ir->posres_com[ZZ]);
 +    else
 +      pr_rvec(fp,indent,"posres-com",ir->posres_com,DIM,TRUE);
 +    if (bMDPformat)
 +      fprintf(fp,"posres-comB = %g %g %g\n",ir->posres_comB[XX],
 +            ir->posres_comB[YY],ir->posres_comB[ZZ]);
 +    else
 +      pr_rvec(fp,indent,"posres-comB",ir->posres_comB,DIM,TRUE);
 +    PR("verlet-buffer-drift",ir->verletbuf_drift);
 +    PR("rlist",ir->rlist);
 +    PR("rlistlong",ir->rlistlong);
 +    PR("nstcalclr",ir->nstcalclr);
 +    PR("rtpi",ir->rtpi);
 +    PS("coulombtype",EELTYPE(ir->coulombtype));
 +    PS("coulomb-modifier",INTMODIFIER(ir->coulomb_modifier));
 +    PR("rcoulomb-switch",ir->rcoulomb_switch);
 +    PR("rcoulomb",ir->rcoulomb);
 +    PS("vdwtype",EVDWTYPE(ir->vdwtype));
 +    PS("vdw-modifier",INTMODIFIER(ir->vdw_modifier));
 +    PR("rvdw-switch",ir->rvdw_switch);
 +    PR("rvdw",ir->rvdw);
 +    if (ir->epsilon_r != 0)
 +      PR("epsilon-r",ir->epsilon_r);
 +    else
 +      PS("epsilon-r",infbuf);
 +    if (ir->epsilon_rf != 0)
 +      PR("epsilon-rf",ir->epsilon_rf);
 +    else
 +      PS("epsilon-rf",infbuf);
 +    PR("tabext",ir->tabext);
 +    PS("implicit-solvent",EIMPLICITSOL(ir->implicit_solvent));
 +    PS("gb-algorithm",EGBALGORITHM(ir->gb_algorithm));
 +    PR("gb-epsilon-solvent",ir->gb_epsilon_solvent);
 +    PI("nstgbradii",ir->nstgbradii);
 +    PR("rgbradii",ir->rgbradii);
 +    PR("gb-saltconc",ir->gb_saltconc);
 +    PR("gb-obc-alpha",ir->gb_obc_alpha);
 +    PR("gb-obc-beta",ir->gb_obc_beta);
 +    PR("gb-obc-gamma",ir->gb_obc_gamma);
 +    PR("gb-dielectric-offset",ir->gb_dielectric_offset);
 +    PS("sa-algorithm",ESAALGORITHM(ir->gb_algorithm));
 +    PR("sa-surface-tension",ir->sa_surface_tension);
 +    PS("DispCorr",EDISPCORR(ir->eDispCorr));
 +    PS("bSimTemp",EBOOL(ir->bSimTemp));
 +    if (ir->bSimTemp) {
 +        pr_simtempvals(fp,indent,ir->simtempvals,ir->fepvals->n_lambda,bMDPformat);
 +    }
 +    PS("free-energy",EFEPTYPE(ir->efep));
 +    if (ir->efep != efepNO || ir->bSimTemp) {
 +        pr_fepvals(fp,indent,ir->fepvals,bMDPformat);
 +    }
 +    if (ir->bExpanded) {
 +        pr_expandedvals(fp,indent,ir->expandedvals,ir->fepvals->n_lambda,bMDPformat);
 +    }
 +
 +    PI("nwall",ir->nwall);
 +    PS("wall-type",EWALLTYPE(ir->wall_type));
 +    PI("wall-atomtype[0]",ir->wall_atomtype[0]);
 +    PI("wall-atomtype[1]",ir->wall_atomtype[1]);
 +    PR("wall-density[0]",ir->wall_density[0]);
 +    PR("wall-density[1]",ir->wall_density[1]);
 +    PR("wall-ewald-zfac",ir->wall_ewald_zfac);
 +
 +    PS("pull",EPULLTYPE(ir->ePull));
 +    if (ir->ePull != epullNO)
 +      pr_pull(fp,indent,ir->pull);
 +    
 +    PS("rotation",EBOOL(ir->bRot));
 +    if (ir->bRot)
 +      pr_rot(fp,indent,ir->rot);
 +
 +    PS("disre",EDISRETYPE(ir->eDisre));
 +    PS("disre-weighting",EDISREWEIGHTING(ir->eDisreWeighting));
 +    PS("disre-mixed",EBOOL(ir->bDisreMixed));
 +    PR("dr-fc",ir->dr_fc);
 +    PR("dr-tau",ir->dr_tau);
 +    PR("nstdisreout",ir->nstdisreout);
 +    PR("orires-fc",ir->orires_fc);
 +    PR("orires-tau",ir->orires_tau);
 +    PR("nstorireout",ir->nstorireout);
 +
 +    PR("dihre-fc",ir->dihre_fc);
 +    
 +    PR("em-stepsize",ir->em_stepsize);
 +    PR("em-tol",ir->em_tol);
 +    PI("niter",ir->niter);
 +    PR("fc-stepsize",ir->fc_stepsize);
 +    PI("nstcgsteep",ir->nstcgsteep);
 +    PI("nbfgscorr",ir->nbfgscorr);
 +
 +    PS("ConstAlg",ECONSTRTYPE(ir->eConstrAlg));
 +    PR("shake-tol",ir->shake_tol);
 +    PI("lincs-order",ir->nProjOrder);
 +    PR("lincs-warnangle",ir->LincsWarnAngle);
 +    PI("lincs-iter",ir->nLincsIter);
 +    PR("bd-fric",ir->bd_fric);
 +    PI("ld-seed",ir->ld_seed);
 +    PR("cos-accel",ir->cos_accel);
 +    pr_matrix(fp,indent,"deform",ir->deform,bMDPformat);
 +
 +    PS("adress",EBOOL(ir->bAdress));
 +    if (ir->bAdress){
 +        PS("adress_type",EADRESSTYPE(ir->adress->type));
 +        PR("adress_const_wf",ir->adress->const_wf);
 +        PR("adress_ex_width",ir->adress->ex_width);
 +        PR("adress_hy_width",ir->adress->hy_width);
 +        PS("adress_interface_correction",EADRESSICTYPE(ir->adress->icor));
 +        PS("adress_site",EADRESSSITETYPE(ir->adress->site));
 +        PR("adress_ex_force_cap",ir->adress->ex_forcecap);
 +        PS("adress_do_hybridpairs", EBOOL(ir->adress->do_hybridpairs));
 +
 +        pr_rvec(fp,indent,"adress_reference_coords",ir->adress->refs,DIM,TRUE);
 +    }
 +    PI("userint1",ir->userint1);
 +    PI("userint2",ir->userint2);
 +    PI("userint3",ir->userint3);
 +    PI("userint4",ir->userint4);
 +    PR("userreal1",ir->userreal1);
 +    PR("userreal2",ir->userreal2);
 +    PR("userreal3",ir->userreal3);
 +    PR("userreal4",ir->userreal4);
 +    pr_grp_opts(fp,indent,"grpopts",&(ir->opts),bMDPformat);
 +    pr_cosine(fp,indent,"efield-x",&(ir->ex[XX]),bMDPformat);
 +    pr_cosine(fp,indent,"efield-xt",&(ir->et[XX]),bMDPformat);
 +    pr_cosine(fp,indent,"efield-y",&(ir->ex[YY]),bMDPformat);
 +    pr_cosine(fp,indent,"efield-yt",&(ir->et[YY]),bMDPformat);
 +    pr_cosine(fp,indent,"efield-z",&(ir->ex[ZZ]),bMDPformat);
 +    pr_cosine(fp,indent,"efield-zt",&(ir->et[ZZ]),bMDPformat);
 +    PS("bQMMM",EBOOL(ir->bQMMM));
 +    PI("QMconstraints",ir->QMconstraints);
 +    PI("QMMMscheme",ir->QMMMscheme);
 +    PR("scalefactor",ir->scalefactor);
 +    pr_qm_opts(fp,indent,"qm-opts",&(ir->opts));
 +  }
 +}
 +#undef PS
 +#undef PR
 +#undef PI
 +
 +static void pr_harm(FILE *fp,t_iparams *iparams,const char *r,const char *kr)
 +{
 +  fprintf(fp,"%sA=%12.5e, %sA=%12.5e, %sB=%12.5e, %sB=%12.5e\n",
 +        r,iparams->harmonic.rA,kr,iparams->harmonic.krA,
 +        r,iparams->harmonic.rB,kr,iparams->harmonic.krB);
 +}
 +
 +void pr_iparams(FILE *fp,t_functype ftype,t_iparams *iparams)
 +{
 +  int i;
 +  real VA[4],VB[4],*rbcA,*rbcB;
 +
 +  switch (ftype) {
 +  case F_ANGLES:
 +  case F_G96ANGLES:
 +    pr_harm(fp,iparams,"th","ct");
 +    break;
 +  case F_CROSS_BOND_BONDS:
 +    fprintf(fp,"r1e=%15.8e, r2e=%15.8e, krr=%15.8e\n",
 +          iparams->cross_bb.r1e,iparams->cross_bb.r2e,
 +          iparams->cross_bb.krr);
 +    break;
 +  case F_CROSS_BOND_ANGLES:
 +    fprintf(fp,"r1e=%15.8e, r1e=%15.8e, r3e=%15.8e, krt=%15.8e\n",
 +          iparams->cross_ba.r1e,iparams->cross_ba.r2e,
 +          iparams->cross_ba.r3e,iparams->cross_ba.krt);
 +    break;
 +  case F_LINEAR_ANGLES:
 +    fprintf(fp,"klinA=%15.8e, aA=%15.8e, klinB=%15.8e, aB=%15.8e\n",
 +            iparams->linangle.klinA,iparams->linangle.aA,
 +            iparams->linangle.klinB,iparams->linangle.aB);
 +    break;
 +  case F_UREY_BRADLEY:
 +    fprintf(fp,"thetaA=%15.8e, kthetaA=%15.8e, r13A=%15.8e, kUBA=%15.8e, "
 +            "thetaB=%15.8e, kthetaB=%15.8e, r13B=%15.8e, kUBB=%15.8e\n",
 +            iparams->u_b.thetaA,iparams->u_b.kthetaA,iparams->u_b.r13A,
 +            iparams->u_b.kUBA,iparams->u_b.thetaB,iparams->u_b.kthetaB,
 +            iparams->u_b.r13B,iparams->u_b.kUBB);
 +    break;
 +  case F_QUARTIC_ANGLES:
 +    fprintf(fp,"theta=%15.8e",iparams->qangle.theta);
 +    for(i=0; i<5; i++)
 +      fprintf(fp,", c%c=%15.8e",'0'+i,iparams->qangle.c[i]);
 +    fprintf(fp,"\n");
 +    break;
 +  case F_BHAM:
 +    fprintf(fp,"a=%15.8e, b=%15.8e, c=%15.8e\n",
 +          iparams->bham.a,iparams->bham.b,iparams->bham.c);
 +    break;
 +  case F_BONDS:
 +  case F_G96BONDS:
 +  case F_HARMONIC:
 +    pr_harm(fp,iparams,"b0","cb");
 +    break;
 +  case F_IDIHS:
 +    pr_harm(fp,iparams,"xi","cx");
 +    break;
 +  case F_MORSE:
 +    fprintf(fp,"b0A=%15.8e, cbA=%15.8e, betaA=%15.8e, b0B=%15.8e, cbB=%15.8e, betaB=%15.8e\n",
 +            iparams->morse.b0A,iparams->morse.cbA,iparams->morse.betaA,
 +            iparams->morse.b0B,iparams->morse.cbB,iparams->morse.betaB);
 +    break;
 +  case F_CUBICBONDS:
 +    fprintf(fp,"b0=%15.8e, kb=%15.8e, kcub=%15.8e\n",
 +          iparams->cubic.b0,iparams->cubic.kb,iparams->cubic.kcub);
 +    break;
 +  case F_CONNBONDS:
 +    fprintf(fp,"\n");
 +    break;
 +  case F_FENEBONDS:
 +    fprintf(fp,"bm=%15.8e, kb=%15.8e\n",iparams->fene.bm,iparams->fene.kb);
 +    break;
 +  case F_RESTRBONDS:
 +      fprintf(fp,"lowA=%15.8e, up1A=%15.8e, up2A=%15.8e, kA=%15.8e, lowB=%15.8e, up1B=%15.8e, up2B=%15.8e, kB=%15.8e,\n",
 +              iparams->restraint.lowA,iparams->restraint.up1A,
 +              iparams->restraint.up2A,iparams->restraint.kA,
 +              iparams->restraint.lowB,iparams->restraint.up1B,
 +              iparams->restraint.up2B,iparams->restraint.kB);
 +      break;
 +  case F_TABBONDS:
 +  case F_TABBONDSNC:
 +  case F_TABANGLES:
 +  case F_TABDIHS:
 +    fprintf(fp,"tab=%d, kA=%15.8e, kB=%15.8e\n",
 +          iparams->tab.table,iparams->tab.kA,iparams->tab.kB);
 +    break;
 +  case F_POLARIZATION:
 +    fprintf(fp,"alpha=%15.8e\n",iparams->polarize.alpha);
 +    break;
 +  case F_ANHARM_POL:
 +    fprintf(fp,"alpha=%15.8e drcut=%15.8e khyp=%15.8e\n",
 +            iparams->anharm_polarize.alpha,
 +            iparams->anharm_polarize.drcut,
 +            iparams->anharm_polarize.khyp);
 +    break;
 +  case F_THOLE_POL:
 +    fprintf(fp,"a=%15.8e, alpha1=%15.8e, alpha2=%15.8e, rfac=%15.8e\n",
 +          iparams->thole.a,iparams->thole.alpha1,iparams->thole.alpha2,
 +          iparams->thole.rfac);
 +    break;
 +  case F_WATER_POL:
 +    fprintf(fp,"al_x=%15.8e, al_y=%15.8e, al_z=%15.8e, rOH=%9.6f, rHH=%9.6f, rOD=%9.6f\n",
 +          iparams->wpol.al_x,iparams->wpol.al_y,iparams->wpol.al_z,
 +          iparams->wpol.rOH,iparams->wpol.rHH,iparams->wpol.rOD);
 +    break;
 +  case F_LJ:
 +    fprintf(fp,"c6=%15.8e, c12=%15.8e\n",iparams->lj.c6,iparams->lj.c12);
 +    break;
 +  case F_LJ14:
 +    fprintf(fp,"c6A=%15.8e, c12A=%15.8e, c6B=%15.8e, c12B=%15.8e\n",
 +          iparams->lj14.c6A,iparams->lj14.c12A,
 +          iparams->lj14.c6B,iparams->lj14.c12B);
 +    break;
 +  case F_LJC14_Q:
 +    fprintf(fp,"fqq=%15.8e, qi=%15.8e, qj=%15.8e, c6=%15.8e, c12=%15.8e\n",
 +          iparams->ljc14.fqq,
 +          iparams->ljc14.qi,iparams->ljc14.qj,
 +          iparams->ljc14.c6,iparams->ljc14.c12);
 +    break;
 +  case F_LJC_PAIRS_NB:
 +    fprintf(fp,"qi=%15.8e, qj=%15.8e, c6=%15.8e, c12=%15.8e\n",
 +          iparams->ljcnb.qi,iparams->ljcnb.qj,
 +          iparams->ljcnb.c6,iparams->ljcnb.c12);
 +    break;
 +  case F_PDIHS:
 +  case F_PIDIHS:
 +  case F_ANGRES:
 +  case F_ANGRESZ:
 +    fprintf(fp,"phiA=%15.8e, cpA=%15.8e, phiB=%15.8e, cpB=%15.8e, mult=%d\n",
 +          iparams->pdihs.phiA,iparams->pdihs.cpA,
 +          iparams->pdihs.phiB,iparams->pdihs.cpB,
 +          iparams->pdihs.mult);
 +    break;
 +  case F_DISRES:
 +    fprintf(fp,"label=%4d, type=%1d, low=%15.8e, up1=%15.8e, up2=%15.8e, fac=%15.8e)\n",
 +          iparams->disres.label,iparams->disres.type,
 +          iparams->disres.low,iparams->disres.up1,
 +          iparams->disres.up2,iparams->disres.kfac);
 +    break;
 +  case F_ORIRES:
 +    fprintf(fp,"ex=%4d, label=%d, power=%4d, c=%15.8e, obs=%15.8e, kfac=%15.8e)\n",
 +          iparams->orires.ex,iparams->orires.label,iparams->orires.power,
 +          iparams->orires.c,iparams->orires.obs,iparams->orires.kfac);
 +    break;
 +  case F_DIHRES:
 +      fprintf(fp,"phiA=%15.8e, dphiA=%15.8e, kfacA=%15.8e, phiB=%15.8e, dphiB=%15.8e, kfacB=%15.8e\n",
 +              iparams->dihres.phiA,iparams->dihres.dphiA,iparams->dihres.kfacA,
 +              iparams->dihres.phiB,iparams->dihres.dphiB,iparams->dihres.kfacB);
 +    break;
 +  case F_POSRES:
 +    fprintf(fp,"pos0A=(%15.8e,%15.8e,%15.8e), fcA=(%15.8e,%15.8e,%15.8e), pos0B=(%15.8e,%15.8e,%15.8e), fcB=(%15.8e,%15.8e,%15.8e)\n",
 +          iparams->posres.pos0A[XX],iparams->posres.pos0A[YY],
 +          iparams->posres.pos0A[ZZ],iparams->posres.fcA[XX],
 +          iparams->posres.fcA[YY],iparams->posres.fcA[ZZ],
 +          iparams->posres.pos0B[XX],iparams->posres.pos0B[YY],
 +          iparams->posres.pos0B[ZZ],iparams->posres.fcB[XX],
 +          iparams->posres.fcB[YY],iparams->posres.fcB[ZZ]);
 +    break;
 +  case F_FBPOSRES:
 +    fprintf(fp,"pos0=(%15.8e,%15.8e,%15.8e), geometry=%d, r=%15.8e, k=%15.8e\n",
 +        iparams->fbposres.pos0[XX], iparams->fbposres.pos0[YY],
 +        iparams->fbposres.pos0[ZZ], iparams->fbposres.geom,
 +        iparams->fbposres.r,        iparams->fbposres.k);
 +    break;
 +  case F_RBDIHS:
 +    for (i=0; i<NR_RBDIHS; i++) 
 +      fprintf(fp,"%srbcA[%d]=%15.8e",i==0?"":", ",i,iparams->rbdihs.rbcA[i]);
 +    fprintf(fp,"\n");
 +    for (i=0; i<NR_RBDIHS; i++) 
 +      fprintf(fp,"%srbcB[%d]=%15.8e",i==0?"":", ",i,iparams->rbdihs.rbcB[i]);
 +    fprintf(fp,"\n");
 +    break;
 +  case F_FOURDIHS:
 +    /* Use the OPLS -> Ryckaert-Bellemans formula backwards to get the
 +     * OPLS potential constants back.
 +     */
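 +    /* With the usual OPLS/RB correspondence (c0=V2+(V1+V3)/2, c1=(-V1+3V3)/2,
 +     * c2=-V2+4V4, c3=-2V3, c4=-4V4, c5=0), inverting from the top down gives
 +     * V4=-c4/4, V3=-c3/2, V2=4V4-c2, V1=3V3-2c1, which is what the
 +     * assignments below compute (VA[i] holds V_{i+1}).
 +     */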
 +    rbcA = iparams->rbdihs.rbcA;
 +    rbcB = iparams->rbdihs.rbcB;
 +
 +    VA[3] = -0.25*rbcA[4];
 +    VA[2] = -0.5*rbcA[3];
 +    VA[1] = 4.0*VA[3]-rbcA[2];
 +    VA[0] = 3.0*VA[2]-2.0*rbcA[1];
 +
 +    VB[3] = -0.25*rbcB[4];
 +    VB[2] = -0.5*rbcB[3];
 +    VB[1] = 4.0*VB[3]-rbcB[2];
 +    VB[0] = 3.0*VB[2]-2.0*rbcB[1];
 +
 +    for (i=0; i<NR_FOURDIHS; i++) 
 +      fprintf(fp,"%sFourA[%d]=%15.8e",i==0?"":", ",i,VA[i]);
 +    fprintf(fp,"\n");
 +    for (i=0; i<NR_FOURDIHS; i++) 
 +      fprintf(fp,"%sFourB[%d]=%15.8e",i==0?"":", ",i,VB[i]);
 +    fprintf(fp,"\n");
 +    break;
 +   
 +  case F_CONSTR:
 +  case F_CONSTRNC:
 +    fprintf(fp,"dA=%15.8e, dB=%15.8e\n",iparams->constr.dA,iparams->constr.dB);
 +    break;
 +  case F_SETTLE:
 +    fprintf(fp,"doh=%15.8e, dhh=%15.8e\n",iparams->settle.doh,
 +          iparams->settle.dhh);
 +    break;
 +  case F_VSITE2:
 +    fprintf(fp,"a=%15.8e\n",iparams->vsite.a);
 +    break;
 +  case F_VSITE3:
 +  case F_VSITE3FD:
 +  case F_VSITE3FAD:
 +    fprintf(fp,"a=%15.8e, b=%15.8e\n",iparams->vsite.a,iparams->vsite.b);
 +    break;
 +  case F_VSITE3OUT:
 +  case F_VSITE4FD:
 +  case F_VSITE4FDN:
 +    fprintf(fp,"a=%15.8e, b=%15.8e, c=%15.8e\n",
 +          iparams->vsite.a,iparams->vsite.b,iparams->vsite.c);
 +    break;
 +  case F_VSITEN:
 +    fprintf(fp,"n=%2d, a=%15.8e\n",iparams->vsiten.n,iparams->vsiten.a);
 +    break;
 +  case F_GB12:
 +  case F_GB13:
 +  case F_GB14:
 +    fprintf(fp, "sar=%15.8e, st=%15.8e, pi=%15.8e, gbr=%15.8e, bmlt=%15.8e\n",iparams->gb.sar,iparams->gb.st,iparams->gb.pi,iparams->gb.gbr,iparams->gb.bmlt);
 +    break;              
 +  case F_CMAP:
 +    fprintf(fp, "cmapA=%1d, cmapB=%1d\n",iparams->cmap.cmapA, iparams->cmap.cmapB);
 +    break;              
 +  default:
 +    gmx_fatal(FARGS,"unknown function type %d (%s) in %s line %d",
 +            ftype,interaction_function[ftype].name,__FILE__,__LINE__);
 +  }
 +}
 +
 +void pr_ilist(FILE *fp,int indent,const char *title,
 +              t_functype *functype,t_ilist *ilist, gmx_bool bShowNumbers)
 +{
 +    int i,j,k,type,ftype;
 +    t_iatom *iatoms;
 +    
 +    if (available(fp,ilist,indent,title) && ilist->nr > 0)
 +    {  
 +        indent=pr_title(fp,indent,title);
 +        (void) pr_indent(fp,indent);
 +        fprintf(fp,"nr: %d\n",ilist->nr);
 +        if (ilist->nr > 0) {
 +            (void) pr_indent(fp,indent);
 +            fprintf(fp,"iatoms:\n");
 +            iatoms=ilist->iatoms;
 +            for (i=j=0; i<ilist->nr;) {
 +#ifndef DEBUG
 +                (void) pr_indent(fp,indent+INDENT);
 +                type=*(iatoms++);
 +                ftype=functype[type];
 +                (void) fprintf(fp,"%d type=%d (%s)",
 +                               bShowNumbers?j:-1,bShowNumbers?type:-1,
 +                               interaction_function[ftype].name);
 +                j++;
 +                for (k=0; k<interaction_function[ftype].nratoms; k++)
 +                    (void) fprintf(fp," %u",*(iatoms++));
 +                (void) fprintf(fp,"\n");
 +                i+=1+interaction_function[ftype].nratoms;
 +#else
 +                fprintf(fp,"%5d%5d\n",i,iatoms[i]);
 +                i++;
 +#endif
 +            }
 +        }
 +    }
 +}
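 +
 +/* Layout walked by pr_ilist above: iatoms is a flat stream of
 + * [type, a_1, ..., a_n] records, with n = interaction_function[ftype].nratoms
 + * for ftype = functype[type].  As a hypothetical example, a two-atom
 + * interaction stream {3, 0,1, 3, 1,2} prints two entries of functype 3.
 + */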
 +
 +static void pr_cmap(FILE *fp, int indent, const char *title,
 +                    gmx_cmap_t *cmap_grid, gmx_bool bShowNumbers)
 +{
 +    int i,j,nelem;
 +    real dx,idx;
 +      
 +    dx    = 360.0 / cmap_grid->grid_spacing;
 +    nelem = cmap_grid->grid_spacing*cmap_grid->grid_spacing;
 +      
 +    if(available(fp,cmap_grid,indent,title))
 +    {
 +        fprintf(fp,"%s\n",title);
 +              
 +        for(i=0;i<cmap_grid->ngrid;i++)
 +        {
 +            idx = -180.0;
 +            fprintf(fp,"%8s %8s %8s %8s\n","V","dVdx","dVdy","d2dV");
 +                      
 +            fprintf(fp,"grid[%3d]={\n",bShowNumbers?i:-1);
 +                      
 +            for(j=0;j<nelem;j++)
 +            {
 +                if( (j%cmap_grid->grid_spacing)==0)
 +                {
 +                    fprintf(fp,"%8.1f\n",idx);
 +                    idx+=dx;
 +                }
 +                              
 +                fprintf(fp,"%8.3f ",cmap_grid->cmapdata[i].cmap[j*4]);
 +                fprintf(fp,"%8.3f ",cmap_grid->cmapdata[i].cmap[j*4+1]);
 +                fprintf(fp,"%8.3f ",cmap_grid->cmapdata[i].cmap[j*4+2]);
 +                fprintf(fp,"%8.3f\n",cmap_grid->cmapdata[i].cmap[j*4+3]);
 +            }
 +            fprintf(fp,"\n");
 +        }
 +    }
 +      
 +}
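 +
 +/* Note on the cmap dump above: each of the ngrid grids stores
 + * grid_spacing*grid_spacing points with four values per point
 + * (cmap[4*j..4*j+3]), and the row headers step the angle from -180
 + * in increments of dx = 360/grid_spacing.
 + */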
 +
 +void pr_ffparams(FILE *fp,int indent,const char *title,
 +                 gmx_ffparams_t *ffparams,
 +                 gmx_bool bShowNumbers)
 +{
 +  int i,j;
 +  
 +  indent=pr_title(fp,indent,title);
 +  (void) pr_indent(fp,indent);
 +  (void) fprintf(fp,"atnr=%d\n",ffparams->atnr);
 +  (void) pr_indent(fp,indent);
 +  (void) fprintf(fp,"ntypes=%d\n",ffparams->ntypes);
 +  for (i=0; i<ffparams->ntypes; i++) {
 +      (void) pr_indent(fp,indent+INDENT);
 +      (void) fprintf(fp,"functype[%d]=%s, ",
 +                     bShowNumbers?i:-1,
 +                     interaction_function[ffparams->functype[i]].name);
 +      pr_iparams(fp,ffparams->functype[i],&ffparams->iparams[i]);
 +  }
 +  (void) pr_double(fp,indent,"reppow",ffparams->reppow);
 +  (void) pr_real(fp,indent,"fudgeQQ",ffparams->fudgeQQ);
 +  pr_cmap(fp,indent,"cmap",&ffparams->cmap_grid,bShowNumbers);
 +}
 +
 +void pr_idef(FILE *fp,int indent,const char *title,t_idef *idef, gmx_bool bShowNumbers)
 +{
 +  int i,j;
 +  
 +  if (available(fp,idef,indent,title)) {  
 +    indent=pr_title(fp,indent,title);
 +    (void) pr_indent(fp,indent);
 +    (void) fprintf(fp,"atnr=%d\n",idef->atnr);
 +    (void) pr_indent(fp,indent);
 +    (void) fprintf(fp,"ntypes=%d\n",idef->ntypes);
 +    for (i=0; i<idef->ntypes; i++) {
 +      (void) pr_indent(fp,indent+INDENT);
 +      (void) fprintf(fp,"functype[%d]=%s, ",
 +                   bShowNumbers?i:-1,
 +                   interaction_function[idef->functype[i]].name);
 +      pr_iparams(fp,idef->functype[i],&idef->iparams[i]);
 +    }
 +    (void) pr_real(fp,indent,"fudgeQQ",idef->fudgeQQ);
 +
 +    for(j=0; (j<F_NRE); j++)
 +      pr_ilist(fp,indent,interaction_function[j].longname,
 +               idef->functype,&idef->il[j],bShowNumbers);
 +  }
 +}
 +
 +static int pr_block_title(FILE *fp,int indent,const char *title,t_block *block)
 +{
 +  int i;
 +
 +  if (available(fp,block,indent,title))
 +    {
 +      indent=pr_title(fp,indent,title);
 +      (void) pr_indent(fp,indent);
 +      (void) fprintf(fp,"nr=%d\n",block->nr);
 +    }
 +  return indent;
 +}
 +
 +static int pr_blocka_title(FILE *fp,int indent,const char *title,t_blocka *block)
 +{
 +  int i;
 +
 +  if (available(fp,block,indent,title))
 +    {
 +      indent=pr_title(fp,indent,title);
 +      (void) pr_indent(fp,indent);
 +      (void) fprintf(fp,"nr=%d\n",block->nr);
 +      (void) pr_indent(fp,indent);
 +      (void) fprintf(fp,"nra=%d\n",block->nra);
 +    }
 +  return indent;
 +}
 +
 +static void low_pr_blocka(FILE *fp,int indent,const char *title,t_blocka *block, gmx_bool bShowNumbers)
 +{
 +  int i;
 +  
 +  if (available(fp,block,indent,title))
 +    {
 +      indent=pr_blocka_title(fp,indent,title,block);
 +      for (i=0; i<=block->nr; i++)
 +        {
 +          (void) pr_indent(fp,indent+INDENT);
 +          (void) fprintf(fp,"%s->index[%d]=%u\n",
 +                       title,bShowNumbers?i:-1,block->index[i]);
 +        }
 +      for (i=0; i<block->nra; i++)
 +        {
 +          (void) pr_indent(fp,indent+INDENT);
 +          (void) fprintf(fp,"%s->a[%d]=%u\n",
 +                       title,bShowNumbers?i:-1,block->a[i]);
 +        }
 +    }
 +}
 +
 +void pr_block(FILE *fp,int indent,const char *title,t_block *block,gmx_bool bShowNumbers)
 +{
 +  int i,j,ok,size,start,end;
 +  
 +  if (available(fp,block,indent,title))
 +    {
 +      indent=pr_block_title(fp,indent,title,block);
 +      start=0;
 +      end=start;
 +      if ((ok=(block->index[start]==0))==0)
 +        (void) fprintf(fp,"block->index[%d] should be 0\n",start);
 +      else
 +        for (i=0; i<block->nr; i++)
 +          {
 +            end=block->index[i+1];
 +            size=pr_indent(fp,indent);
 +            if (end<=start)
 +              size+=fprintf(fp,"%s[%d]={}\n",title,i);
 +            else
 +              size+=fprintf(fp,"%s[%d]={%d..%d}\n",
 +                          title,bShowNumbers?i:-1,
 +                          bShowNumbers?start:-1,bShowNumbers?end-1:-1);
 +            start=end;
 +          }
 +    }
 +}
 +
 +void pr_blocka(FILE *fp,int indent,const char *title,t_blocka *block,gmx_bool bShowNumbers)
 +{
 +  int i,j,ok,size,start,end;
 +  
 +  if (available(fp,block,indent,title))
 +    {
 +      indent=pr_blocka_title(fp,indent,title,block);
 +      start=0;
 +      end=start;
 +      if ((ok=(block->index[start]==0))==0)
 +        (void) fprintf(fp,"block->index[%d] should be 0\n",start);
 +      else
 +        for (i=0; i<block->nr; i++)
 +          {
 +            end=block->index[i+1];
 +            size=pr_indent(fp,indent);
 +            if (end<=start)
 +              size+=fprintf(fp,"%s[%d]={",title,i);
 +            else
 +              size+=fprintf(fp,"%s[%d][%d..%d]={",
 +                          title,bShowNumbers?i:-1,
 +                          bShowNumbers?start:-1,bShowNumbers?end-1:-1);
 +            for (j=start; j<end; j++)
 +              {
 +                if (j>start) size+=fprintf(fp,", ");
 +                if ((size)>(USE_WIDTH))
 +                  {
 +                    (void) fprintf(fp,"\n");
 +                    size=pr_indent(fp,indent+INDENT);
 +                  }
 +                size+=fprintf(fp,"%u",block->a[j]);
 +              }
 +            (void) fprintf(fp,"}\n");
 +            start=end;
 +          }
 +      if ((end!=block->nra)||(!ok)) 
 +        {
 +          (void) pr_indent(fp,indent);
 +          (void) fprintf(fp,"tables inconsistent, dumping complete tables:\n");
 +          low_pr_blocka(fp,indent,title,block,bShowNumbers);
 +        }
 +    }
 +}
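 +
 +/* Sketch of the t_blocka convention printed above (hypothetical values):
 + * index={0,3,5} with a={0,1,2,7,8} means block 0 covers a[0..2] and
 + * block 1 covers a[3..4], so pr_blocka emits
 + *   title[0][0..2]={0, 1, 2}
 + *   title[1][3..4]={7, 8}
 + */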
 +
 +static void pr_strings(FILE *fp,int indent,const char *title,char ***nm,int n, gmx_bool bShowNumbers)
 +{
 +  int i;
 +
 +  if (available(fp,nm,indent,title))
 +    {  
 +      indent=pr_title_n(fp,indent,title,n);
 +      for (i=0; i<n; i++)
 +        {
 +          (void) pr_indent(fp,indent);
 +          (void) fprintf(fp,"%s[%d]={name=\"%s\"}\n",
 +                       title,bShowNumbers?i:-1,*(nm[i]));
 +        }
 +    }
 +}
 +
 +static void pr_strings2(FILE *fp,int indent,const char *title,
 +                      char ***nm,char ***nmB,int n, gmx_bool bShowNumbers)
 +{
 +  int i;
 +
 +  if (available(fp,nm,indent,title))
 +    {  
 +      indent=pr_title_n(fp,indent,title,n);
 +      for (i=0; i<n; i++)
 +        {
 +          (void) pr_indent(fp,indent);
 +          (void) fprintf(fp,"%s[%d]={name=\"%s\",nameB=\"%s\"}\n",
 +                       title,bShowNumbers?i:-1,*(nm[i]),*(nmB[i]));
 +        }
 +    }
 +}
 +
 +static void pr_resinfo(FILE *fp,int indent,const char *title,t_resinfo *resinfo,int n, gmx_bool bShowNumbers)
 +{
 +    int i;
 +    
 +    if (available(fp,resinfo,indent,title))
 +    {  
 +        indent=pr_title_n(fp,indent,title,n);
 +        for (i=0; i<n; i++)
 +        {
 +            (void) pr_indent(fp,indent);
 +            (void) fprintf(fp,"%s[%d]={name=\"%s\", nr=%d, ic='%c'}\n",
 +                           title,bShowNumbers?i:-1,
 +                           *(resinfo[i].name),resinfo[i].nr,
 +                           (resinfo[i].ic == '\0') ? ' ' : resinfo[i].ic);
 +        }
 +    }
 +}
 +
 +static void pr_atom(FILE *fp,int indent,const char *title,t_atom *atom,int n)
 +{
 +  int i,j;
 +  
 +  if (available(fp,atom,indent,title)) {  
 +    indent=pr_title_n(fp,indent,title,n);
 +    for (i=0; i<n; i++) {
 +      (void) pr_indent(fp,indent);
 +      fprintf(fp,"%s[%6d]={type=%3d, typeB=%3d, ptype=%8s, m=%12.5e, "
 +              "q=%12.5e, mB=%12.5e, qB=%12.5e, resind=%5d, atomnumber=%3d}\n",
 +              title,i,atom[i].type,atom[i].typeB,ptype_str[atom[i].ptype],
 +              atom[i].m,atom[i].q,atom[i].mB,atom[i].qB,
 +              atom[i].resind,atom[i].atomnumber);
 +    }
 +  }
 +}
 +
 +static void pr_grps(FILE *fp,int indent,const char *title,t_grps grps[],
 +                  char **grpname[], gmx_bool bShowNumbers)
 +{
 +    int i,j;
 +
 +    for(i=0; (i<egcNR); i++)
 +    {
 +        fprintf(fp,"%s[%-12s] nr=%d, name=[",title,gtypes[i],grps[i].nr);
 +        for(j=0; (j<grps[i].nr); j++)
 +        {
 +            fprintf(fp," %s",*(grpname[grps[i].nm_ind[j]]));
 +        }
 +        fprintf(fp,"]\n");
 +    }
 +}
 +
 +static void pr_groups(FILE *fp,int indent,const char *title,
 +                      gmx_groups_t *groups,
 +                      gmx_bool bShowNumbers)
 +{
 +    int grpnr[egcNR];
 +    int nat_max,i,g;
 +
 +    pr_grps(fp,indent,"grp",groups->grps,groups->grpname,bShowNumbers);
 +    pr_strings(fp,indent,"grpname",groups->grpname,groups->ngrpname,bShowNumbers);
 +
 +    (void) pr_indent(fp,indent);
 +    fprintf(fp,"groups          ");
 +    for(g=0; g<egcNR; g++)
 +    {
 +        fprintf(fp," %5.5s",gtypes[g]);
 +    }
 +    fprintf(fp,"\n");
 +
 +    (void) pr_indent(fp,indent);
 +    fprintf(fp,"allocated       ");
 +    nat_max = 0;
 +    for(g=0; g<egcNR; g++)
 +    {
 +        fprintf(fp," %5d",groups->ngrpnr[g]);
 +        nat_max = max(nat_max,groups->ngrpnr[g]);
 +    }
 +    fprintf(fp,"\n");
 +
 +    if (nat_max == 0)
 +    {
 +        (void) pr_indent(fp,indent);
 +        fprintf(fp,"groupnr[%5s] =","*");
 +        for(g=0; g<egcNR; g++)
 +        {
 +            fprintf(fp,"  %3d ",0);
 +        }
 +        fprintf(fp,"\n");
 +    }
 +    else
 +    {
 +        for(i=0; i<nat_max; i++)
 +        {
 +            (void) pr_indent(fp,indent);
 +            fprintf(fp,"groupnr[%5d] =",i);
 +            for(g=0; g<egcNR; g++)
 +            {
 +                fprintf(fp,"  %3d ",
 +                        groups->grpnr[g] ? groups->grpnr[g][i] : 0);
 +            }
 +            fprintf(fp,"\n");
 +        }
 +    }
 +}
 +
 +void pr_atoms(FILE *fp,int indent,const char *title,t_atoms *atoms, 
 +            gmx_bool bShownumbers)
 +{
 +  if (available(fp,atoms,indent,title))
 +    {
 +      indent=pr_title(fp,indent,title);
 +      pr_atom(fp,indent,"atom",atoms->atom,atoms->nr);
 +      pr_strings(fp,indent,"atom",atoms->atomname,atoms->nr,bShownumbers);
 +      pr_strings2(fp,indent,"type",atoms->atomtype,atoms->atomtypeB,atoms->nr,bShownumbers);
 +      pr_resinfo(fp,indent,"residue",atoms->resinfo,atoms->nres,bShownumbers);
 +    }
 +}
 +
 +
 +void pr_atomtypes(FILE *fp,int indent,const char *title,t_atomtypes *atomtypes, 
 +                gmx_bool bShowNumbers)
 +{
 +  int i;
 +  if (available(fp,atomtypes,indent,title)) 
 +  {
 +    indent=pr_title(fp,indent,title);
 +    for(i=0;i<atomtypes->nr;i++) {
 +      pr_indent(fp,indent);
 +      fprintf(fp,
 +              "atomtype[%3d]={radius=%12.5e, volume=%12.5e, gb_radius=%12.5e, surftens=%12.5e, atomnumber=%4d, S_hct=%12.5e}\n",
 +              bShowNumbers?i:-1,atomtypes->radius[i],atomtypes->vol[i],
 +              atomtypes->gb_radius[i],
 +              atomtypes->surftens[i],atomtypes->atomnumber[i],atomtypes->S_hct[i]);
 +    }
 +  }
 +}
 +
 +static void pr_moltype(FILE *fp,int indent,const char *title,
 +                       gmx_moltype_t *molt,int n,
 +                       gmx_ffparams_t *ffparams,
 +                       gmx_bool bShowNumbers)
 +{
 +    int j;
 +
 +    indent = pr_title_n(fp,indent,title,n);
 +    (void) pr_indent(fp,indent);
 +    (void) fprintf(fp,"name=\"%s\"\n",*(molt->name));
 +    pr_atoms(fp,indent,"atoms",&(molt->atoms),bShowNumbers);
 +    pr_block(fp,indent,"cgs",&molt->cgs, bShowNumbers);
 +    pr_blocka(fp,indent,"excls",&molt->excls, bShowNumbers);
 +    for(j=0; (j<F_NRE); j++) {
 +        pr_ilist(fp,indent,interaction_function[j].longname,
 +                 ffparams->functype,&molt->ilist[j],bShowNumbers);
 +    }
 +}
 +
 +static void pr_molblock(FILE *fp,int indent,const char *title,
 +                        gmx_molblock_t *molb,int n,
 +                        gmx_moltype_t *molt,
 +                        gmx_bool bShowNumbers)
 +{
 +    indent = pr_title_n(fp,indent,title,n);
 +    (void) pr_indent(fp,indent);
 +    (void) fprintf(fp,"%-20s = %d \"%s\"\n",
 +                   "moltype",molb->type,*(molt[molb->type].name));
 +    pr_int(fp,indent,"#molecules",molb->nmol);
 +    pr_int(fp,indent,"#atoms_mol",molb->natoms_mol);
 +    pr_int(fp,indent,"#posres_xA",molb->nposres_xA);
 +    if (molb->nposres_xA > 0) {
 +        pr_rvecs(fp,indent,"posres_xA",molb->posres_xA,molb->nposres_xA);
 +    }
 +    pr_int(fp,indent,"#posres_xB",molb->nposres_xB);
 +    if (molb->nposres_xB > 0) {
 +        pr_rvecs(fp,indent,"posres_xB",molb->posres_xB,molb->nposres_xB);
 +    }
 +}
 +
 +void pr_mtop(FILE *fp,int indent,const char *title,gmx_mtop_t *mtop,
 +             gmx_bool bShowNumbers)
 +{
 +    int mt,mb;
 +
 +    if (available(fp,mtop,indent,title)) {
 +        indent=pr_title(fp,indent,title);
 +        (void) pr_indent(fp,indent);
 +        (void) fprintf(fp,"name=\"%s\"\n",*(mtop->name));
 +        pr_int(fp,indent,"#atoms",mtop->natoms);
 +        pr_int(fp,indent,"#molblock",mtop->nmolblock);
 +        for(mb=0; mb<mtop->nmolblock; mb++) {
 +            pr_molblock(fp,indent,"molblock",&mtop->molblock[mb],mb,
 +                        mtop->moltype,bShowNumbers);
 +        }
 +        pr_ffparams(fp,indent,"ffparams",&(mtop->ffparams),bShowNumbers);
 +        pr_atomtypes(fp,indent,"atomtypes",&(mtop->atomtypes),bShowNumbers);
 +        for(mt=0; mt<mtop->nmoltype; mt++) {
 +            pr_moltype(fp,indent,"moltype",&mtop->moltype[mt],mt,
 +                       &mtop->ffparams,bShowNumbers);
 +        }
 +        pr_groups(fp,indent,"groups",&mtop->groups,bShowNumbers);
 +    }
 +}
 +
 +void pr_top(FILE *fp,int indent,const char *title,t_topology *top, gmx_bool bShowNumbers)
 +{
 +  if (available(fp,top,indent,title)) {
 +    indent=pr_title(fp,indent,title);
 +    (void) pr_indent(fp,indent);
 +    (void) fprintf(fp,"name=\"%s\"\n",*(top->name));
 +    pr_atoms(fp,indent,"atoms",&(top->atoms),bShowNumbers);
 +    pr_atomtypes(fp,indent,"atomtypes",&(top->atomtypes),bShowNumbers);
 +    pr_block(fp,indent,"cgs",&top->cgs, bShowNumbers);
 +    pr_block(fp,indent,"mols",&top->mols, bShowNumbers);
 +    pr_blocka(fp,indent,"excls",&top->excls, bShowNumbers);
 +    pr_idef(fp,indent,"idef",&top->idef,bShowNumbers);
 +  }
 +}
 +
 +void pr_header(FILE *fp,int indent,const char *title,t_tpxheader *sh)
 +{
 +  char buf[22];
 +    
 +  if (available(fp,sh,indent,title))
 +    {
 +      indent=pr_title(fp,indent,title);
 +      pr_indent(fp,indent);
 +      fprintf(fp,"bIr    = %spresent\n",sh->bIr?"":"not ");
 +      pr_indent(fp,indent);
 +      fprintf(fp,"bBox   = %spresent\n",sh->bBox?"":"not ");
 +      pr_indent(fp,indent);
 +      fprintf(fp,"bTop   = %spresent\n",sh->bTop?"":"not ");
 +      pr_indent(fp,indent);
 +      fprintf(fp,"bX     = %spresent\n",sh->bX?"":"not ");
 +      pr_indent(fp,indent);
 +      fprintf(fp,"bV     = %spresent\n",sh->bV?"":"not ");
 +      pr_indent(fp,indent);
 +      fprintf(fp,"bF     = %spresent\n",sh->bF?"":"not ");
 +      
 +      pr_indent(fp,indent);
 +      fprintf(fp,"natoms = %d\n",sh->natoms);
 +      pr_indent(fp,indent);
 +      fprintf(fp,"lambda = %e\n",sh->lambda);
 +    }
 +}
 +
 +void pr_commrec(FILE *fp,int indent,t_commrec *cr)
 +{
 +  pr_indent(fp,indent);
 +  fprintf(fp,"commrec:\n");
 +  indent+=2;
 +  pr_indent(fp,indent);
 +  fprintf(fp,"nodeid    = %d\n",cr->nodeid);
 +  pr_indent(fp,indent);
 +  fprintf(fp,"nnodes    = %d\n",cr->nnodes);
 +  pr_indent(fp,indent);
 +  fprintf(fp,"npmenodes = %d\n",cr->npmenodes);
 +  /*
 +  pr_indent(fp,indent);
 +  fprintf(fp,"threadid  = %d\n",cr->threadid);
 +  pr_indent(fp,indent);
 +  fprintf(fp,"nthreads  = %d\n",cr->nthreads);
 +  */
 +}
index da20b6700cdffbc1eea3dc902f393f1119f5060e,0000000000000000000000000000000000000000..68f9ce294b8216a474414501f643cd7a4ea8de99
mode 100644,000000..100644
--- /dev/null
@@@ -1,3458 -1,0 +1,3516 @@@
-         else if (ir->nstenergy > 0 && ir->nstcalcenergy > ir->nstenergy)
 +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
 + *
 + * 
 + *                This source code is part of
 + * 
 + *                 G   R   O   M   A   C   S
 + * 
 + *          GROningen MAchine for Chemical Simulations
 + * 
 + *                        VERSION 3.2.0
 + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
 + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 + * Copyright (c) 2001-2004, The GROMACS development team,
 + * check out http://www.gromacs.org for more information.
 +
 + * This program is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU General Public License
 + * as published by the Free Software Foundation; either version 2
 + * of the License, or (at your option) any later version.
 + * 
 + * If you want to redistribute modifications, please consider that
 + * scientific software is very special. Version control is crucial -
 + * bugs must be traceable. We will be happy to consider code for
 + * inclusion in the official distribution, but derived work must not
 + * be called official GROMACS. Details are found in the README & COPYING
 + * files - if they are missing, get the official version at www.gromacs.org.
 + * 
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the papers on the package - you can find them in the top README file.
 + * 
 + * For more info, check our website at http://www.gromacs.org
 + * 
 + * And Hey:
 + * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon
 + */
 +#ifdef HAVE_CONFIG_H
 +#include <config.h>
 +#endif
 +
 +#include <ctype.h>
 +#include <stdlib.h>
 +#include <limits.h>
 +#include "sysstuff.h"
 +#include "smalloc.h"
 +#include "typedefs.h"
 +#include "physics.h"
 +#include "names.h"
 +#include "gmx_fatal.h"
 +#include "macros.h"
 +#include "index.h"
 +#include "symtab.h"
 +#include "string2.h"
 +#include "readinp.h"
 +#include "warninp.h"
 +#include "readir.h" 
 +#include "toputil.h"
 +#include "index.h"
 +#include "network.h"
 +#include "vec.h"
 +#include "pbc.h"
 +#include "mtop_util.h"
 +#include "chargegroup.h"
 +#include "inputrec.h"
 +
 +#define MAXPTR 254
 +#define NOGID  255
 +#define MAXLAMBDAS 1024
 +
 +/* Resource parameters 
 + * Do not change any of these until you have read the instructions
 + * in readinp.h. Some cpp's do not take spaces after the backslash
 + * (like the c-shell), which will give you a very weird compiler
 + * message.
 + */
 +
 +static char tcgrps[STRLEN],tau_t[STRLEN],ref_t[STRLEN],
 +  acc[STRLEN],accgrps[STRLEN],freeze[STRLEN],frdim[STRLEN],
 +  energy[STRLEN],user1[STRLEN],user2[STRLEN],vcm[STRLEN],xtc_grps[STRLEN],
 +  couple_moltype[STRLEN],orirefitgrp[STRLEN],egptable[STRLEN],egpexcl[STRLEN],
 +  wall_atomtype[STRLEN],wall_density[STRLEN],deform[STRLEN],QMMM[STRLEN];
 +static char fep_lambda[efptNR][STRLEN];
 +static char lambda_weights[STRLEN];
 +static char **pull_grp;
 +static char **rot_grp;
 +static char anneal[STRLEN],anneal_npoints[STRLEN],
 +  anneal_time[STRLEN],anneal_temp[STRLEN];
 +static char QMmethod[STRLEN],QMbasis[STRLEN],QMcharge[STRLEN],QMmult[STRLEN],
 +  bSH[STRLEN],CASorbitals[STRLEN], CASelectrons[STRLEN],SAon[STRLEN],
 +  SAoff[STRLEN],SAsteps[STRLEN],bTS[STRLEN],bOPT[STRLEN]; 
 +static char efield_x[STRLEN],efield_xt[STRLEN],efield_y[STRLEN],
 +  efield_yt[STRLEN],efield_z[STRLEN],efield_zt[STRLEN];
 +
 +enum {
 +    egrptpALL,         /* All particles have to be a member of a group.     */
 +    egrptpALL_GENREST, /* A rest group with name is generated for particles *
 +                        * that are not part of any group.                   */
 +    egrptpPART,        /* As egrptpALL_GENREST, but no name is generated    *
 +                        * for the rest group.                               */
 +    egrptpONE          /* Merge all selected groups into one group,         *
 +                        * make a rest group for the remaining particles.    */
 +};
 +
 +
 +void init_ir(t_inputrec *ir, t_gromppopts *opts)
 +{
 +  snew(opts->include,STRLEN); 
 +  snew(opts->define,STRLEN);
 +  snew(ir->fepvals,1);
 +  snew(ir->expandedvals,1);
 +  snew(ir->simtempvals,1);
 +}
 +
 +static void GetSimTemps(int ntemps, t_simtemp *simtemp, double *temperature_lambdas)
 +{
 +
 +    int i;
 +
 +    for (i=0;i<ntemps;i++)
 +    {
 +        /* simple linear scaling -- allows more control */
 +        if (simtemp->eSimTempScale == esimtempLINEAR)
 +        {
 +            simtemp->temperatures[i] = simtemp->simtemp_low + (simtemp->simtemp_high-simtemp->simtemp_low)*temperature_lambdas[i];
 +        }
 +        else if (simtemp->eSimTempScale == esimtempGEOMETRIC)  /* should give roughly equal acceptance for constant heat capacity . . . */
 +        {
 +            simtemp->temperatures[i] = simtemp->simtemp_low * pow(simtemp->simtemp_high/simtemp->simtemp_low,(1.0*i)/(ntemps-1));
 +        }
 +        else if (simtemp->eSimTempScale == esimtempEXPONENTIAL)
 +        {
 +            simtemp->temperatures[i] = simtemp->simtemp_low + (simtemp->simtemp_high-simtemp->simtemp_low)*((exp(temperature_lambdas[i])-1)/(exp(1.0)-1));
 +        }
 +        else
 +        {
 +            char errorstr[128];
 +            sprintf(errorstr,"eSimTempScale=%d not defined",simtemp->eSimTempScale);
 +            gmx_fatal(FARGS,"%s",errorstr);
 +        }
 +    }
 +}
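 +/* Worked example (values for illustration only): with simtemp_low = 300 K,
 + * simtemp_high = 400 K, ntemps = 5 and temperature_lambdas = {0,0.25,0.5,0.75,1},
 + * LINEAR gives {300, 325, 350, 375, 400} K, while GEOMETRIC gives
 + * T_i = 300*(400/300)^(i/4), i.e. {300.0, 322.4, 346.4, 372.2, 400.0} K.
 + */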
 +
 +
 +
 +static void _low_check(gmx_bool b,char *s,warninp_t wi)
 +{
 +    if (b)
 +    {
 +        warning_error(wi,s);
 +    }
 +}
 +
 +static void check_nst(const char *desc_nst,int nst,
 +                      const char *desc_p,int *p,
 +                      warninp_t wi)
 +{
 +    char buf[STRLEN];
 +
 +    if (*p > 0 && *p % nst != 0)
 +    {
 +        /* Round up to the next multiple of nst */
 +        *p = ((*p)/nst + 1)*nst;
 +        sprintf(buf,"%s should be a multiple of %s, changing %s to %d\n",
 +              desc_p,desc_nst,desc_p,*p);
 +        warning(wi,buf);
 +    }
 +}
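 +/* For illustration: check_nst("nstlist",10,"nstcalcenergy",&p,wi) with p = 25
 + * rounds p up to the next multiple of 10, i.e. ((25/10)+1)*10 = 30, and warns
 + * about the change; p <= 0 and exact multiples are left untouched.
 + */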
 +
 +static gmx_bool ir_NVE(const t_inputrec *ir)
 +{
 +    return ((ir->eI == eiMD || EI_VV(ir->eI)) && ir->etc == etcNO);
 +}
 +
 +static int lcd(int n1,int n2)
 +{
 +    int d,i;
 +    
 +    d = 1;
 +    for(i=2; (i<=n1 && i<=n2); i++)
 +    {
 +        if (n1 % i == 0 && n2 % i == 0)
 +        {
 +            d = i;
 +        }
 +    }
 +    
 +    return d;
 +}
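 +/* Note that, despite the name, lcd() returns the largest common divisor found
 + * by trial division, e.g. lcd(12,18) == 6 and lcd(7,5) == 1. */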
 +
 +static void process_interaction_modifier(const t_inputrec *ir,int *eintmod)
 +{
 +    if (*eintmod == eintmodPOTSHIFT_VERLET)
 +    {
 +        if (ir->cutoff_scheme == ecutsVERLET)
 +        {
 +            *eintmod = eintmodPOTSHIFT;
 +        }
 +        else
 +        {
 +            *eintmod = eintmodNONE;
 +        }
 +    }
 +}
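 +/* E.g. a potential-shift-verlet modifier from the mdp file resolves to
 + * potential-shift when cutoff-scheme = Verlet and to none otherwise. */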
 +
 +void check_ir(const char *mdparin,t_inputrec *ir, t_gromppopts *opts,
 +              warninp_t wi)
 +/* Check internal consistency */
 +{
 +    /* Strange macro: first one fills the err_buf, and then one can check 
 +     * the condition, which will print the message and increase the error
 +     * counter.
 +     */
 +#define CHECK(b) _low_check(b,err_buf,wi)
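 +    /* Typical use (illustrative):
 +     *   sprintf(err_buf,"rcoulomb should be >= 0");
 +     *   CHECK(ir->rcoulomb < 0);
 +     */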
 +    char err_buf[256],warn_buf[STRLEN];
 +    int i,j;
 +    int  ns_type=0;
 +    real dt_coupl=0;
 +    real dt_pcoupl;
 +    int  nstcmin;
 +    t_lambda *fep = ir->fepvals;
 +    t_expanded *expand = ir->expandedvals;
 +
 +  set_warning_line(wi,mdparin,-1);
 +
 +    /* BASIC CUT-OFF STUFF */
 +    if (ir->rcoulomb < 0)
 +    {
 +        warning_error(wi,"rcoulomb should be >= 0");
 +    }
 +    if (ir->rvdw < 0)
 +    {
 +        warning_error(wi,"rvdw should be >= 0");
 +    }
 +    if (ir->rlist < 0 &&
 +        !(ir->cutoff_scheme == ecutsVERLET && ir->verletbuf_drift > 0))
 +    {
 +        warning_error(wi,"rlist should be >= 0");
 +    }
 +
 +    process_interaction_modifier(ir,&ir->coulomb_modifier);
 +    process_interaction_modifier(ir,&ir->vdw_modifier);
 +
 +    if (ir->cutoff_scheme == ecutsGROUP)
 +    {
 +        /* BASIC CUT-OFF STUFF */
 +        if (ir->rlist == 0 ||
 +            !((EEL_MIGHT_BE_ZERO_AT_CUTOFF(ir->coulombtype) && ir->rcoulomb > ir->rlist) ||
 +              (EVDW_MIGHT_BE_ZERO_AT_CUTOFF(ir->vdwtype)    && ir->rvdw     > ir->rlist))) {
 +            /* No switched potential and/or no twin-range:
 +             * we can set the long-range cut-off to the maximum of the other cut-offs.
 +             */
 +            ir->rlistlong = max_cutoff(ir->rlist,max_cutoff(ir->rvdw,ir->rcoulomb));
 +        }
 +        else if (ir->rlistlong < 0)
 +        {
 +            ir->rlistlong = max_cutoff(ir->rlist,max_cutoff(ir->rvdw,ir->rcoulomb));
 +            sprintf(warn_buf,"rlistlong was not set, setting it to %g (no buffer)",
 +                    ir->rlistlong);
 +            warning(wi,warn_buf);
 +        }
 +        if (ir->rlistlong == 0 && ir->ePBC != epbcNONE)
 +        {
 +            warning_error(wi,"Can not have an infinite cut-off with PBC");
 +        }
 +        if (ir->rlistlong > 0 && (ir->rlist == 0 || ir->rlistlong < ir->rlist))
 +        {
 +            warning_error(wi,"rlistlong can not be shorter than rlist");
 +        }
 +        if (IR_TWINRANGE(*ir) && ir->nstlist <= 0)
 +        {
 +            warning_error(wi,"Can not have nstlist<=0 with twin-range interactions");
 +        }
 +    }
 +    
 +    if(ir->rlistlong == ir->rlist)
 +    {
 +        ir->nstcalclr = 0;
 +    }
 +    else if(ir->rlistlong>ir->rlist && ir->nstcalclr==0)
 +    {
 +        warning_error(wi,"With different cutoffs for electrostatics and VdW, nstcalclr must be -1 or a positive number");
 +    }
 +    
 +    if (ir->cutoff_scheme == ecutsVERLET)
 +    {
 +        real rc_max;
 +
 +        /* Normal Verlet type neighbor-list, currently only limited feature support */
 +        if (inputrec2nboundeddim(ir) < 3)
 +        {
 +            warning_error(wi,"With Verlet lists only full pbc or pbc=xy with walls is supported");
 +        }
 +        if (ir->rcoulomb != ir->rvdw)
 +        {
 +            warning_error(wi,"With Verlet lists rcoulomb!=rvdw is not supported");
 +        }
 +        if (ir->vdwtype != evdwCUT)
 +        {
 +            warning_error(wi,"With Verlet lists only cut-off LJ interactions are supported");
 +        }
 +        if (!(ir->coulombtype == eelCUT ||
 +              (EEL_RF(ir->coulombtype) && ir->coulombtype != eelRF_NEC) ||
 +              EEL_PME(ir->coulombtype) || ir->coulombtype == eelEWALD))
 +        {
 +            warning_error(wi,"With Verlet lists only cut-off, reaction-field, PME and Ewald electrostatics are supported");
 +        }
 +
 +        if (ir->nstlist <= 0)
 +        {
 +             warning_error(wi,"With Verlet lists nstlist should be larger than 0");
 +        }
 +
 +        if (ir->nstlist < 10)
 +        {
 +            warning_note(wi,"With Verlet lists the optimal nstlist is >= 10, with GPUs >= 20. Note that with the Verlet scheme, nstlist has no effect on the accuracy of your simulation.");
 +        }
 +
 +        rc_max = max(ir->rvdw,ir->rcoulomb);
 +
 +        if (ir->verletbuf_drift <= 0)
 +        {
 +            if (ir->verletbuf_drift == 0)
 +            {
 +                warning_error(wi,"Can not have an energy drift of exactly 0");
 +            }
 +
 +            if (ir->rlist < rc_max)
 +            {
 +                warning_error(wi,"With verlet lists rlist can not be smaller than rvdw or rcoulomb");
 +            }
 +            
 +            if (ir->rlist == rc_max && ir->nstlist > 1)
 +            {
 +                warning_note(wi,"rlist is equal to rvdw and/or rcoulomb: there is no explicit Verlet buffer. The cluster pair list does have a buffering effect, but choosing a larger rlist might be necessary for good energy conservation.");
 +            }
 +        }
 +        else
 +        {
 +            if (ir->rlist > rc_max)
 +            {
 +                warning_note(wi,"You have set rlist larger than the interaction cut-off, but you also have verlet-buffer-drift > 0. Will set rlist using verlet-buffer-drift.");
 +            }
 +
 +            if (ir->nstlist == 1)
 +            {
 +                /* No buffer required */
 +                ir->rlist = rc_max;
 +            }
 +            else
 +            {
 +                if (EI_DYNAMICS(ir->eI))
 +                {
 +                    if (EI_MD(ir->eI) && ir->etc == etcNO)
 +                    {
 +                        warning_error(wi,"Temperature coupling is required for calculating rlist using the energy drift with verlet-buffer-drift > 0. Either use temperature coupling or set rlist yourself together with verlet-buffer-drift = -1."); 
 +                    }
 +
 +                    if (inputrec2nboundeddim(ir) < 3)
 +                    {
 +                        warning_error(wi,"The box volume is required for calculating rlist from the energy drift with verlet-buffer-drift > 0. You are using at least one unbounded dimension, so no volume can be computed. Either use a finite box, or set rlist yourself together with verlet-buffer-drift = -1.");
 +                    }
 +                    /* Set rlist temporarily so we can continue processing */
 +                    ir->rlist = rc_max;
 +                }
 +                else
 +                {
 +                    /* Set the buffer to 5% of the cut-off */
 +                    ir->rlist = 1.05*rc_max;
 +                }
 +            }
 +        }
 +
 +        /* No twin-range calculations with Verlet lists */
 +        ir->rlistlong = ir->rlist;
 +    }
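 +    /* E.g. with verlet-buffer-drift > 0, nstlist > 1 and a non-dynamics
 +     * integrator, the branch above sets rlist = 1.05*max(rvdw,rcoulomb),
 +     * i.e. a fixed 5% buffer, since no drift estimate is made there. */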
 +
 +    if(ir->nstcalclr==-1)
 +    {
 +        /* if rlist=rlistlong, this will later be changed to nstcalclr=0 */
 +        ir->nstcalclr = ir->nstlist;
 +    }
 +    else if(ir->nstcalclr>0)
 +    {
 +        if(ir->nstlist>0 && (ir->nstlist % ir->nstcalclr != 0))
 +        {
 +            warning_error(wi,"nstlist must be evenly divisible by nstcalclr. Use nstcalclr = -1 to automatically follow nstlist");
 +        }
 +    }
 +    else if(ir->nstcalclr<-1)
 +    {
 +        warning_error(wi,"nstcalclr must be a positive number (divisor of nstcalclr), or -1 to follow nstlist.");
 +    }
 +    
 +    if(EEL_PME(ir->coulombtype) && ir->rcoulomb > ir->rvdw && ir->nstcalclr>1)
 +    {
 +        warning_error(wi,"When used with PME, the long-range component of twin-range interactions must be updated every step (nstcalclr)");
 +    }
 +       
 +    /* GENERAL INTEGRATOR STUFF */
 +    if (!(ir->eI == eiMD || EI_VV(ir->eI)))
 +    {
 +        ir->etc = etcNO;
 +    }
 +    if (ir->eI == eiVVAK) {
 +        sprintf(warn_buf,"Integrator method %s is implemented primarily for validation purposes; for molecular dynamics, you should probably be using %s or %s",ei_names[eiVVAK],ei_names[eiMD],ei_names[eiVV]);
 +        warning_note(wi,warn_buf);
 +    }
 +    if (!EI_DYNAMICS(ir->eI))
 +    {
 +        ir->epc = epcNO;
 +    }
 +    if (EI_DYNAMICS(ir->eI))
 +    {
 +        if (ir->nstcalcenergy < 0)
 +        {
 +            ir->nstcalcenergy = ir_optimal_nstcalcenergy(ir);
 +            if (ir->nstenergy != 0 && ir->nstenergy < ir->nstcalcenergy)
 +            {
 +                /* nstcalcenergy larger than nstenergy does not make sense.
 +                 * We ideally want nstcalcenergy=nstenergy.
 +                 */
 +                if (ir->nstlist > 0)
 +                {
 +                    ir->nstcalcenergy = lcd(ir->nstenergy,ir->nstlist);
 +                }
 +                else
 +                {
 +                    ir->nstcalcenergy = ir->nstenergy;
 +                }
 +            }
 +        }
-             sprintf(warn_buf,"Setting nstcalcenergy (%d) equal to nstenergy (%d)",ir->nstcalcenergy,ir->nstenergy);
-             ir->nstcalcenergy = ir->nstenergy;
++        else if ( (ir->nstenergy > 0 && ir->nstcalcenergy > ir->nstenergy) ||
++                  (ir->efep != efepNO && ir->fepvals->nstdhdl > 0 &&
++                   (ir->nstcalcenergy > ir->fepvals->nstdhdl) ) )
++
 +        {
++            const char *nsten="nstenergy";
++            const char *nstdh="nstdhdl";
++            const char *min_name=nsten;
++            int min_nst=ir->nstenergy;
++
++            /* find the smallest of ( nstenergy, nstdhdl ) */
++            if (ir->efep != efepNO && ir->fepvals->nstdhdl > 0 &&
++                (ir->fepvals->nstdhdl < ir->nstenergy) )
++            {
++                min_nst=ir->fepvals->nstdhdl;
++                min_name=nstdh;
++            }
 +            /* If the user sets nstenergy small, we should respect that */
-         if (ir->nstcalcenergy > 1)
++            sprintf(warn_buf,
++                    "Setting nstcalcenergy (%d) equal to %s (%d)",
++                    ir->nstcalcenergy,min_name, min_nst);
++            warning_note(wi,warn_buf);
++            ir->nstcalcenergy = min_nst;
 +        }
 +
 +        if (ir->epc != epcNO)
 +        {
 +            if (ir->nstpcouple < 0)
 +            {
 +                ir->nstpcouple = ir_optimal_nstpcouple(ir);
 +            }
 +        }
 +        if (IR_TWINRANGE(*ir))
 +        {
 +            check_nst("nstlist",ir->nstlist,
 +                      "nstcalcenergy",&ir->nstcalcenergy,wi);
 +            if (ir->epc != epcNO)
 +            {
 +                check_nst("nstlist",ir->nstlist,
 +                          "nstpcouple",&ir->nstpcouple,wi); 
 +            }
 +        }
 +
-             /* for storing exact averages nstenergy should be
-              * a multiple of nstcalcenergy
-              */
-             check_nst("nstcalcenergy",ir->nstcalcenergy,
-                       "nstenergy",&ir->nstenergy,wi);
++        if (ir->nstcalcenergy > 0)
 +        {
-                           "nstdhdl",&ir->expandedvals->nstexpanded,wi);
 +            if (ir->efep != efepNO)
 +            {
 +                /* nstdhdl should be a multiple of nstcalcenergy */
 +                check_nst("nstcalcenergy",ir->nstcalcenergy,
 +                          "nstdhdl",&ir->fepvals->nstdhdl,wi);
 +                /* nstexpanded should be a multiple of nstcalcenergy */
 +                check_nst("nstcalcenergy",ir->nstcalcenergy,
-       CHECK(fep->delta_lambda > 0 && ((fep->init_fep_state !=0) ||  (fep->init_lambda !=0)));
++                          "nstexpanded",&ir->expandedvals->nstexpanded,wi);
 +            }
++            /* for storing exact averages nstenergy should be
++             * a multiple of nstcalcenergy
++             */
++            check_nst("nstcalcenergy",ir->nstcalcenergy,
++                      "nstenergy",&ir->nstenergy,wi);
 +        }
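 +        /* E.g. nstcalcenergy = 100 with nstenergy = 250: the check_nst()
 +         * call above rounds nstenergy up to 300 so that exact averages
 +         * can be stored. */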
 +    }
 +
 +  /* LD STUFF */
 +  if ((EI_SD(ir->eI) || ir->eI == eiBD) &&
 +      ir->bContinuation && ir->ld_seed != -1) {
 +      warning_note(wi,"You are doing a continuation with SD or BD, make sure that ld_seed is different from the previous run (using ld_seed=-1 will ensure this)");
 +  }
 +
 +  /* TPI STUFF */
 +  if (EI_TPI(ir->eI)) {
 +    sprintf(err_buf,"TPI only works with pbc = %s",epbc_names[epbcXYZ]);
 +    CHECK(ir->ePBC != epbcXYZ);
 +    sprintf(err_buf,"TPI only works with ns = %s",ens_names[ensGRID]);
 +    CHECK(ir->ns_type != ensGRID);
 +    sprintf(err_buf,"with TPI nstlist should be larger than zero");
 +    CHECK(ir->nstlist <= 0);
 +    sprintf(err_buf,"TPI does not work with full electrostatics other than PME");
 +    CHECK(EEL_FULL(ir->coulombtype) && !EEL_PME(ir->coulombtype));
 +  }
 +
 +  /* SHAKE / LINCS */
 +  if ( (opts->nshake > 0) && (opts->bMorse) ) {
 +      sprintf(warn_buf,
 +              "Using morse bond-potentials while constraining bonds is useless");
 +      warning(wi,warn_buf);
 +  }
 +
 +  /* verify simulated tempering options */
 +
 +  if (ir->bSimTemp) {
 +      gmx_bool bAllTempZero = TRUE;
 +      for (i=0;i<fep->n_lambda;i++)
 +      {
 +          sprintf(err_buf,"Entry %d for %s must be between 0 and 1, instead is %g",i,efpt_names[efptTEMPERATURE],fep->all_lambda[efptTEMPERATURE][i]);
 +          CHECK((fep->all_lambda[efptTEMPERATURE][i] < 0) || (fep->all_lambda[efptTEMPERATURE][i] > 1));
 +          if (fep->all_lambda[efptTEMPERATURE][i] > 0)
 +          {
 +              bAllTempZero = FALSE;
 +          }
 +      }
 +      sprintf(err_buf,"if simulated tempering is on, temperature-lambdas may not be all zero");
 +      CHECK(bAllTempZero==TRUE);
 +
 +      sprintf(err_buf,"Simulated tempering is currently only compatible with md-vv");
 +      CHECK(ir->eI != eiVV);
 +
 +      /* check compatibility of the temperature coupling with simulated tempering */
 +
 +      if (ir->etc == etcNOSEHOOVER) {
 +          sprintf(warn_buf,"Nose-Hoover based temperature control such as [%s] my not be entirelyconsistent with simulated tempering",etcoupl_names[ir->etc]);
 +          warning_note(wi,warn_buf);
 +      }
 +
 +      /* check that the temperatures make sense */
 +
 +      sprintf(err_buf,"Higher simulated tempering temperature (%g) must be >= than the simulated tempering lower temperature (%g)",ir->simtempvals->simtemp_high,ir->simtempvals->simtemp_low);
 +      CHECK(ir->simtempvals->simtemp_high <= ir->simtempvals->simtemp_low);
 +
 +      sprintf(err_buf,"Higher simulated tempering temperature (%g) must be >= zero",ir->simtempvals->simtemp_high);
 +      CHECK(ir->simtempvals->simtemp_high <= 0);
 +
 +      sprintf(err_buf,"Lower simulated tempering temperature (%g) must be >= zero",ir->simtempvals->simtemp_low);
 +      CHECK(ir->simtempvals->simtemp_low <= 0);
 +  }
 +
 +  /* verify free energy options */
 +
 +  if (ir->efep != efepNO) {
 +      fep = ir->fepvals;
 +      sprintf(err_buf,"The soft-core power is %d and can only be 1 or 2",
 +              fep->sc_power);
 +      CHECK(fep->sc_alpha!=0 && fep->sc_power!=1 && fep->sc_power!=2);
 +
 +      sprintf(err_buf,"The soft-core sc-r-power is %d and can only be 6 or 48",
 +              (int)fep->sc_r_power);
 +      CHECK(fep->sc_alpha!=0 && fep->sc_r_power!=6.0 && fep->sc_r_power!=48.0);
 +
 +      /* check validity of options */
 +      if (fep->n_lambda > 0 && ir->rlist < max(ir->rvdw,ir->rcoulomb))
 +      {
 +          sprintf(warn_buf,
 +                  "For foreign lambda free energy differences it is assumed that the soft-core interactions have no effect beyond the neighborlist cut-off");
 +          warning(wi,warn_buf);
 +      }
 +
 +      sprintf(err_buf,"Can't use postive delta-lambda (%g) if initial state/lambda does not start at zero",fep->delta_lambda);
-       sprintf(err_buf,"initial thermodynamic state %d does not exist, only goes to %d",fep->init_fep_state,fep->n_lambda);
-       CHECK((fep->init_fep_state > fep->n_lambda));
++      CHECK(fep->delta_lambda > 0 && ((fep->init_fep_state > 0) ||  (fep->init_lambda > 0)));
 +
 +      sprintf(err_buf,"Can't use postive delta-lambda (%g) with expanded ensemble simulations",fep->delta_lambda);
 +      CHECK(fep->delta_lambda > 0 && (ir->efep == efepEXPANDED));
 +
 +      sprintf(err_buf,"Free-energy not implemented for Ewald");
 +      CHECK(ir->coulombtype==eelEWALD);
 +
 +      /* check validity of lambda inputs */
-     if ((nfep[efptFEP] == 0) && (fep->init_lambda >= 0) && (fep->init_lambda <= 1))
++      if (fep->n_lambda == 0)
++      {
++          /* Give a clear error message in case no lambda states are defined: */
++          sprintf(err_buf,"init-lambda-state set to %d: no lambda states are defined.",fep->init_fep_state);
++          CHECK((fep->init_fep_state>=0) && (fep->n_lambda==0));
++      }
++      else
++      {
++          sprintf(err_buf,"initial thermodynamic state %d does not exist, only goes to %d",fep->init_fep_state,fep->n_lambda-1);
++          CHECK((fep->init_fep_state >= fep->n_lambda));
++      }
++
++      sprintf(err_buf,"Lambda state must be set, either with init-lambda-state or with init-lambda");
++      CHECK((fep->init_fep_state < 0) && (fep->init_lambda <0));
++
++      sprintf(err_buf,"init-lambda=%g while init-lambda-state=%d. Lambda state must be set either with init-lambda-state or with init-lambda, but not both",
++              fep->init_lambda, fep->init_fep_state);
++      CHECK((fep->init_fep_state >= 0) && (fep->init_lambda >= 0));
++
++
++
++      if((fep->init_lambda >= 0) && (fep->delta_lambda == 0))
++      {
++          int n_lambda_terms;
++          n_lambda_terms=0;
++          for (i=0;i<efptNR;i++)
++          {
++              if (fep->separate_dvdl[i])
++              {
++                  n_lambda_terms++;
++              }
++          }
++          if (n_lambda_terms > 1)
++          {
++              sprintf(warn_buf,"If lambda vector states (fep-lambdas, coul-lambdas etc.) are set, don't use init-lambda to set lambda state (except for slow growth). Use init-lambda-state instead.");
++              warning(wi, warn_buf);
++          }
++
++          if (n_lambda_terms < 2 && fep->n_lambda > 0)
++          {
++              warning_note(wi,
++                           "init-lambda is deprecated for setting lambda state (except for slow growth). Use init-lambda-state instead.");
++          }
++      }
 +
 +      for (j=0;j<efptNR;j++)
 +      {
 +          for (i=0;i<fep->n_lambda;i++)
 +          {
 +              sprintf(err_buf,"Entry %d for %s must be between 0 and 1, instead is %g",i,efpt_names[j],fep->all_lambda[j][i]);
 +              CHECK((fep->all_lambda[j][i] < 0) || (fep->all_lambda[j][i] > 1));
 +          }
 +      }
 +
 +      if ((fep->sc_alpha>0) && (!fep->bScCoul))
 +      {
 +          for (i=0;i<fep->n_lambda;i++)
 +          {
 +              sprintf(err_buf,"For state %d, vdw-lambdas (%f) is changing with vdw softcore, while coul-lambdas (%f) is nonzero without coulomb softcore: this will lead to crashes, and is not supported.",i,fep->all_lambda[efptVDW][i],
 +                      fep->all_lambda[efptCOUL][i]);
 +              CHECK((fep->sc_alpha>0) &&
 +                    (((fep->all_lambda[efptCOUL][i] > 0.0) &&
 +                      (fep->all_lambda[efptCOUL][i] < 1.0)) &&
 +                     ((fep->all_lambda[efptVDW][i] > 0.0) &&
 +                      (fep->all_lambda[efptVDW][i] < 1.0))));
 +          }
 +      }
 +
 +      if ((fep->bScCoul) && (EEL_PME(ir->coulombtype)))
 +      {
 +          sprintf(warn_buf,"With coulomb soft core, the reciprocal space calculation will not necessarily cancel.  It may be necessary to decrease the reciprocal space energy, and increase the cutoff radius to get sufficiently close matches to energies with free energy turned off.");
 +          warning(wi, warn_buf);
 +      }
 +
 +      /*  Free Energy Checks -- In an ideal world, slow growth and FEP would
 +          be treated differently, but that's the next step */
 +
 +      for (i=0;i<efptNR;i++) {
 +          for (j=0;j<fep->n_lambda;j++) {
 +              sprintf(err_buf,"%s[%d] must be between 0 and 1",efpt_names[i],j);
 +              CHECK((fep->all_lambda[i][j] < 0) || (fep->all_lambda[i][j] > 1));
 +          }
 +      }
 +  }
 +
 +  if ((ir->bSimTemp) || (ir->efep == efepEXPANDED)) {
 +      fep = ir->fepvals;
 +      expand = ir->expandedvals;
 +
 +      /* checking equilibration of weights inputs for validity */
 +
 +      sprintf(err_buf,"weight-equil-number-all-lambda (%d) is ignored if lmc-weights-equil is not equal to %s",
 +              expand->equil_n_at_lam,elmceq_names[elmceqNUMATLAM]);
 +      CHECK((expand->equil_n_at_lam>0) && (expand->elmceq!=elmceqNUMATLAM));
 +
 +      sprintf(err_buf,"weight-equil-number-samples (%d) is ignored if lmc-weights-equil is not equal to %s",
 +              expand->equil_samples,elmceq_names[elmceqSAMPLES]);
 +      CHECK((expand->equil_samples>0) && (expand->elmceq!=elmceqSAMPLES));
 +
 +      sprintf(err_buf,"weight-equil-number-steps (%d) is ignored if lmc-weights-equil is not equal to %s",
 +              expand->equil_steps,elmceq_names[elmceqSTEPS]);
 +      CHECK((expand->equil_steps>0) && (expand->elmceq!=elmceqSTEPS));
 +
 +      sprintf(err_buf,"weight-equil-wl-delta (%d) is ignored if lmc-weights-equil is not equal to %s",
 +              expand->equil_samples,elmceq_names[elmceqWLDELTA]);
 +      CHECK((expand->equil_wl_delta>0) && (expand->elmceq!=elmceqWLDELTA));
 +
 +      sprintf(err_buf,"weight-equil-count-ratio (%f) is ignored if lmc-weights-equil is not equal to %s",
 +              expand->equil_ratio,elmceq_names[elmceqRATIO]);
 +      CHECK((expand->equil_ratio>0) && (expand->elmceq!=elmceqRATIO));
 +
 +      sprintf(err_buf,"weight-equil-number-all-lambda (%d) must be a positive integer if lmc-weights-equil=%s",
 +              expand->equil_n_at_lam,elmceq_names[elmceqNUMATLAM]);
 +      CHECK((expand->equil_n_at_lam<=0) && (expand->elmceq==elmceqNUMATLAM));
 +
 +      sprintf(err_buf,"weight-equil-number-samples (%d) must be a positive integer if lmc-weights-equil=%s",
 +              expand->equil_samples,elmceq_names[elmceqSAMPLES]);
 +      CHECK((expand->equil_samples<=0) && (expand->elmceq==elmceqSAMPLES));
 +
 +      sprintf(err_buf,"weight-equil-number-steps (%d) must be a positive integer if lmc-weights-equil=%s",
 +              expand->equil_steps,elmceq_names[elmceqSTEPS]);
 +      CHECK((expand->equil_steps<=0) && (expand->elmceq==elmceqSTEPS));
 +
 +      sprintf(err_buf,"weight-equil-wl-delta (%f) must be > 0 if lmc-weights-equil=%s",
 +              expand->equil_wl_delta,elmceq_names[elmceqWLDELTA]);
 +      CHECK((expand->equil_wl_delta<=0) && (expand->elmceq==elmceqWLDELTA));
 +
 +      sprintf(err_buf,"weight-equil-count-ratio (%f) must be > 0 if lmc-weights-equil=%s",
 +              expand->equil_ratio,elmceq_names[elmceqRATIO]);
 +      CHECK((expand->equil_ratio<=0) && (expand->elmceq==elmceqRATIO));
 +
 +      sprintf(err_buf,"lmc-weights-equil=%s only possible when lmc-stats = %s or lmc-stats %s",
 +              elmceq_names[elmceqWLDELTA],elamstats_names[elamstatsWL],elamstats_names[elamstatsWWL]);
 +      CHECK((expand->elmceq==elmceqWLDELTA) && (!EWL(expand->elamstats)));
 +
 +      sprintf(err_buf,"lmc-repeats (%d) must be greater than 0",expand->lmc_repeats);
 +      CHECK((expand->lmc_repeats <= 0));
 +      sprintf(err_buf,"minimum-var-min (%d) must be greater than 0",expand->minvarmin);
 +      CHECK((expand->minvarmin <= 0));
 +      sprintf(err_buf,"weight-c-range (%d) must be greater or equal to 0",expand->c_range);
 +      CHECK((expand->c_range < 0));
 +      sprintf(err_buf,"init-lambda-state (%d) must be zero if lmc-forced-nstart (%d)> 0 and lmc-move != 'no'",
 +              fep->init_fep_state, expand->lmc_forced_nstart);
 +      CHECK((fep->init_fep_state!=0) && (expand->lmc_forced_nstart>0) && (expand->elmcmove!=elmcmoveNO));
 +      sprintf(err_buf,"lmc-forced-nstart (%d) must not be negative",expand->lmc_forced_nstart);
 +      CHECK((expand->lmc_forced_nstart < 0));
 +      sprintf(err_buf,"init-lambda-state (%d) must be in the interval [0,number of lambdas)",fep->init_fep_state);
 +      CHECK((fep->init_fep_state < 0) || (fep->init_fep_state >= fep->n_lambda));
 +
 +      sprintf(err_buf,"init-wl-delta (%f) must be greater than or equal to 0",expand->init_wl_delta);
 +      CHECK((expand->init_wl_delta < 0));
 +      sprintf(err_buf,"wl-ratio (%f) must be between 0 and 1",expand->wl_ratio);
 +      CHECK((expand->wl_ratio <= 0) || (expand->wl_ratio >= 1));
 +      sprintf(err_buf,"wl-scale (%f) must be between 0 and 1",expand->wl_scale);
 +      CHECK((expand->wl_scale <= 0) || (expand->wl_scale >= 1));
 +
 +      /* if there is no temperature control, we need to specify an MC temperature */
 +      sprintf(err_buf,"If there is no temperature control, and lmc-mcmove!= 'no',mc_temperature must be set to a positive number");
 +      if (expand->nstTij > 0)
 +      {
 +          sprintf(err_buf,"nst-transition-matrix (%d) must be an integer multiple of nstlog (%d)",
 +                  expand->nstTij,ir->nstlog);
 +          CHECK((mod(expand->nstTij,ir->nstlog)!=0));
 +      }
 +  }
 +
 +  /* PBC/WALLS */
 +  sprintf(err_buf,"walls only work with pbc=%s",epbc_names[epbcXY]);
 +  CHECK(ir->nwall && ir->ePBC!=epbcXY);
 +
 +  /* VACUUM STUFF */
 +  if (ir->ePBC != epbcXYZ && ir->nwall != 2) {
 +    if (ir->ePBC == epbcNONE) {
 +      if (ir->epc != epcNO) {
 +          warning(wi,"Turning off pressure coupling for vacuum system");
 +          ir->epc = epcNO;
 +      }
 +    } else {
 +      sprintf(err_buf,"Can not have pressure coupling with pbc=%s",
 +            epbc_names[ir->ePBC]);
 +      CHECK(ir->epc != epcNO);
 +    }
 +    sprintf(err_buf,"Can not have Ewald with pbc=%s",epbc_names[ir->ePBC]);
 +    CHECK(EEL_FULL(ir->coulombtype));
 +
 +    sprintf(err_buf,"Can not have dispersion correction with pbc=%s",
 +          epbc_names[ir->ePBC]);
 +    CHECK(ir->eDispCorr != edispcNO);
 +  }
 +
 +  if (ir->rlist == 0.0) {
 +    sprintf(err_buf,"can only have neighborlist cut-off zero (=infinite)\n"
 +          "with coulombtype = %s or coulombtype = %s\n"
 +          "without periodic boundary conditions (pbc = %s) and\n"
 +          "rcoulomb and rvdw set to zero",
 +          eel_names[eelCUT],eel_names[eelUSER],epbc_names[epbcNONE]);
 +    CHECK(((ir->coulombtype != eelCUT) && (ir->coulombtype != eelUSER)) ||
 +        (ir->ePBC     != epbcNONE) ||
 +        (ir->rcoulomb != 0.0)      || (ir->rvdw != 0.0));
 +
 +    if (ir->nstlist < 0) {
 +        warning_error(wi,"Can not have heuristic neighborlist updates without cut-off");
 +    }
 +    if (ir->nstlist > 0) {
 +        warning_note(wi,"Simulating without cut-offs is usually (slightly) faster with nstlist=0, nstype=simple and particle decomposition");
 +    }
 +  }
 +
 +  /* COMM STUFF */
 +  if (ir->nstcomm == 0) {
 +    ir->comm_mode = ecmNO;
 +  }
 +  if (ir->comm_mode != ecmNO) {
 +    if (ir->nstcomm < 0) {
 +        warning(wi,"If you want to remove the rotation around the center of mass, you should set comm_mode = Angular instead of setting nstcomm < 0. nstcomm is modified to its absolute value");
 +      ir->nstcomm = abs(ir->nstcomm);
 +    }
 +
 +    if (ir->nstcalcenergy > 0 && ir->nstcomm < ir->nstcalcenergy) {
 +        warning_note(wi,"nstcomm < nstcalcenergy defeats the purpose of nstcalcenergy, setting nstcomm to nstcalcenergy");
 +        ir->nstcomm = ir->nstcalcenergy;
 +    }
 +
 +    if (ir->comm_mode == ecmANGULAR) {
 +      sprintf(err_buf,"Can not remove the rotation around the center of mass with periodic molecules");
 +      CHECK(ir->bPeriodicMols);
 +      if (ir->ePBC != epbcNONE)
 +          warning(wi,"Removing the rotation around the center of mass in a periodic system (this is not a problem when you have only one molecule).");
 +    }
 +  }
 +
 +  if (EI_STATE_VELOCITY(ir->eI) && ir->ePBC == epbcNONE && ir->comm_mode != ecmANGULAR) {
 +      warning_note(wi,"Tumbling and or flying ice-cubes: We are not removing rotation around center of mass in a non-periodic system. You should probably set comm_mode = ANGULAR.");
 +  }
 +  
 +  sprintf(err_buf,"Twin-range neighbour searching (NS) with simple NS"
 +        " algorithm not implemented");
 +  CHECK(((ir->rcoulomb > ir->rlist) || (ir->rvdw > ir->rlist))
 +      && (ir->ns_type == ensSIMPLE));
 +
 +  /* TEMPERATURE COUPLING */
 +  if (ir->etc == etcYES)
 +    {
 +        ir->etc = etcBERENDSEN;
 +        warning_note(wi,"Old option for temperature coupling given: "
 +                     "changing \"yes\" to \"Berendsen\"\n");
 +    }
 +
 +    if ((ir->etc == etcNOSEHOOVER) || (ir->epc == epcMTTK))
 +    {
 +        if (ir->opts.nhchainlength < 1)
 +        {
 +            sprintf(warn_buf,"number of Nose-Hoover chains (currently %d) cannot be less than 1,reset to 1\n",ir->opts.nhchainlength);
 +            ir->opts.nhchainlength =1;
 +            warning(wi,warn_buf);
 +        }
 +        
 +        if (ir->etc==etcNOSEHOOVER && !EI_VV(ir->eI) && ir->opts.nhchainlength > 1)
 +        {
 +            warning_note(wi,"leapfrog does not yet support Nose-Hoover chains, nhchainlength reset to 1");
 +            ir->opts.nhchainlength = 1;
 +        }
 +    }
 +    else
 +    {
 +        ir->opts.nhchainlength = 0;
 +    }
 +
 +    if (ir->eI == eiVVAK) {
 +        sprintf(err_buf,"%s implemented primarily for validation, and requires nsttcouple = 1 and nstpcouple = 1.",
 +                ei_names[eiVVAK]);
 +        CHECK((ir->nsttcouple != 1) || (ir->nstpcouple != 1));
 +    }
 +
 +    if (ETC_ANDERSEN(ir->etc))
 +    {
 +        sprintf(err_buf,"%s temperature control not supported for integrator %s.",etcoupl_names[ir->etc],ei_names[ir->eI]);
 +        CHECK(!(EI_VV(ir->eI)));
 +
 +        for (i=0;i<ir->opts.ngtc;i++)
 +        {
 +            sprintf(err_buf,"all tau_t must currently be equal using Andersen temperature control, violated for group %d",i);
 +            CHECK(ir->opts.tau_t[0] != ir->opts.tau_t[i]);
 +            sprintf(err_buf,"all tau_t must be postive using Andersen temperature control, tau_t[%d]=%10.6f",
 +                    i,ir->opts.tau_t[i]);
 +            CHECK(ir->opts.tau_t[i]<0);
 +        }
 +        if (ir->nstcomm > 0 && (ir->etc == etcANDERSEN)) {
 +            sprintf(warn_buf,"Center of mass removal not necessary for %s.  All velocities of coupled groups are rerandomized periodically, so flying ice cube errors will not occur.",etcoupl_names[ir->etc]);
 +            warning_note(wi,warn_buf);
 +        }
 +
 +        sprintf(err_buf,"nstcomm must be 1, not %d for %s, as velocities of atoms in coupled groups are randomized every time step",ir->nstcomm,etcoupl_names[ir->etc]);
 +        CHECK(ir->nstcomm > 1 && (ir->etc == etcANDERSEN));
 +
 +        for (i=0;i<ir->opts.ngtc;i++)
 +        {
 +            int nsteps = (int)(ir->opts.tau_t[i]/ir->delta_t);
 +            sprintf(err_buf,"tau_t/delta_t for group %d for temperature control method %s must be a multiple of nstcomm (%d), as velocities of atoms in coupled groups are randomized every time step. The input tau_t (%8.3f) leads to %d steps per randomization",i,etcoupl_names[ir->etc],ir->nstcomm,ir->opts.tau_t[i],nsteps);
 +            CHECK((nsteps % ir->nstcomm) && (ir->etc == etcANDERSENMASSIVE));
 +        }
 +    }
 +    if (ir->etc == etcBERENDSEN)
 +    {
 +        sprintf(warn_buf,"The %s thermostat does not generate the correct kinetic energy distribution. You might want to consider using the %s thermostat.",
 +                ETCOUPLTYPE(ir->etc),ETCOUPLTYPE(etcVRESCALE));
 +        warning_note(wi,warn_buf);
 +    }
 +
 +    if ((ir->etc==etcNOSEHOOVER || ETC_ANDERSEN(ir->etc))
 +        && ir->epc==epcBERENDSEN)
 +    {
 +        sprintf(warn_buf,"Using Berendsen pressure coupling invalidates the "
 +                "true ensemble for the thermostat");
 +        warning(wi,warn_buf);
 +    }
 +
 +    /* PRESSURE COUPLING */
 +    if (ir->epc == epcISOTROPIC)
 +    {
 +        ir->epc = epcBERENDSEN;
 +        warning_note(wi,"Old option for pressure coupling given: "
 +                     "changing \"Isotropic\" to \"Berendsen\"\n"); 
 +    }
 +
 +    if (ir->epc != epcNO)
 +    {
 +        dt_pcoupl = ir->nstpcouple*ir->delta_t;
 +
 +        sprintf(err_buf,"tau-p must be > 0 instead of %g\n",ir->tau_p);
 +        CHECK(ir->tau_p <= 0);
 +
 +        if (ir->tau_p/dt_pcoupl < pcouple_min_integration_steps(ir->epc))
 +        {
 +            sprintf(warn_buf,"For proper integration of the %s barostat, tau-p (%g) should be at least %d times larger than nstpcouple*dt (%g)",
 +                    EPCOUPLTYPE(ir->epc),ir->tau_p,pcouple_min_integration_steps(ir->epc),dt_pcoupl);
 +            warning(wi,warn_buf);
 +        }
 +
 +        sprintf(err_buf,"compressibility must be > 0 when using pressure"
 +                " coupling %s\n",EPCOUPLTYPE(ir->epc));
 +        CHECK(ir->compress[XX][XX] < 0 || ir->compress[YY][YY] < 0 ||
 +              ir->compress[ZZ][ZZ] < 0 ||
 +              (trace(ir->compress) == 0 && ir->compress[YY][XX] <= 0 &&
 +               ir->compress[ZZ][XX] <= 0 && ir->compress[ZZ][YY] <= 0));
 +        
 +        if (epcPARRINELLORAHMAN == ir->epc && opts->bGenVel)
 +        {
 +            sprintf(warn_buf,
 +                    "You are generating velocities so I am assuming you "
 +                    "are equilibrating a system. You are using "
 +                    "%s pressure coupling, but this can be "
 +                    "unstable for equilibration. If your system crashes, try "
 +                    "equilibrating first with Berendsen pressure coupling. If "
 +                    "you are not equilibrating the system, you can probably "
 +                    "ignore this warning.",
 +                    epcoupl_names[ir->epc]);
 +            warning(wi,warn_buf);
 +        }
 +    }
 +
 +    if (EI_VV(ir->eI))
 +    {
 +        if (ir->epc > epcNO)
 +        {
 +            if ((ir->epc!=epcBERENDSEN) && (ir->epc!=epcMTTK))
 +            {
 +                warning_error(wi,"for md-vv and md-vv-avek, can only use Berendsen and Martyna-Tuckerman-Tobias-Klein (MTTK) equations for pressure control; MTTK is equivalent to Parrinello-Rahman.");
 +            }
 +        }
 +    }
 +
 +  /* ELECTROSTATICS */
 +  /* More checks are in triple check (grompp.c) */
 +
 +  if (ir->coulombtype == eelSWITCH) {
 +    sprintf(warn_buf,"coulombtype = %s is only for testing purposes and can lead to serious "
 +            "artifacts, advice: use coulombtype = %s",
 +          eel_names[ir->coulombtype],
 +          eel_names[eelRF_ZERO]);
 +    warning(wi,warn_buf);
 +  }
 +
 +  if (ir->epsilon_r!=1 && ir->implicit_solvent==eisGBSA) {
 +    sprintf(warn_buf,"epsilon-r = %g with GB implicit solvent, will use this value for inner dielectric",ir->epsilon_r);
 +    warning_note(wi,warn_buf);
 +  }
 +
 +  if (EEL_RF(ir->coulombtype) && ir->epsilon_rf==1 && ir->epsilon_r!=1) {
 +    sprintf(warn_buf,"epsilon-r = %g and epsilon-rf = 1 with reaction field, proceeding assuming old format and exchanging epsilon-r and epsilon-rf",ir->epsilon_r);
 +    warning(wi,warn_buf);
 +    ir->epsilon_rf = ir->epsilon_r;
 +    ir->epsilon_r  = 1.0;
 +  }
 +
 +  if (getenv("GALACTIC_DYNAMICS") == NULL) {  
 +    sprintf(err_buf,"epsilon-r must be >= 0 instead of %g\n",ir->epsilon_r);
 +    CHECK(ir->epsilon_r < 0);
 +  }
 +  
 +  if (EEL_RF(ir->coulombtype)) {
 +    /* reaction field (at the cut-off) */
 +    
 +    if (ir->coulombtype == eelRF_ZERO) {
 +       sprintf(warn_buf,"With coulombtype = %s, epsilon-rf must be 0, assuming you meant epsilon_rf=0",
 +             eel_names[ir->coulombtype]);
 +        CHECK(ir->epsilon_rf != 0);
 +        ir->epsilon_rf = 0.0;
 +    }
 +
 +    sprintf(err_buf,"epsilon-rf must be >= epsilon-r");
 +    CHECK((ir->epsilon_rf < ir->epsilon_r && ir->epsilon_rf != 0) ||
 +        (ir->epsilon_r == 0));
 +    if (ir->epsilon_rf == ir->epsilon_r) {
 +      sprintf(warn_buf,"Using epsilon-rf = epsilon-r with %s does not make sense",
 +            eel_names[ir->coulombtype]);
 +      warning(wi,warn_buf);
 +    }
 +  }
 +  /* Allow rlist>rcoulomb for tabulated long range stuff. This just
 +   * means the interaction is zero outside rcoulomb, but it helps to
 +   * provide accurate energy conservation.
 +   */
 +  if (EEL_MIGHT_BE_ZERO_AT_CUTOFF(ir->coulombtype)) {
 +    if (EEL_SWITCHED(ir->coulombtype)) {
 +      sprintf(err_buf,
 +            "With coulombtype = %s rcoulomb_switch must be < rcoulomb. Or, better: Use the potential modifier options!",
 +            eel_names[ir->coulombtype]);
 +      CHECK(ir->rcoulomb_switch >= ir->rcoulomb);
 +    }
 +  } else if (ir->coulombtype == eelCUT || EEL_RF(ir->coulombtype)) {
 +      if (ir->cutoff_scheme == ecutsGROUP && ir->coulomb_modifier == eintmodNONE) {
 +          sprintf(err_buf,"With coulombtype = %s, rcoulomb should be >= rlist unless you use a potential modifier",
 +                  eel_names[ir->coulombtype]);
 +          CHECK(ir->rlist > ir->rcoulomb);
 +      }
 +  }
 +
 +  if(ir->coulombtype==eelSWITCH || ir->coulombtype==eelSHIFT ||
 +     ir->vdwtype==evdwSWITCH || ir->vdwtype==evdwSHIFT)
 +  {
 +      sprintf(warn_buf,
 +              "The switch/shift interaction settings are just for compatibility; you will get better"
 +              "performance from applying potential modifiers to your interactions!\n");
 +      warning_note(wi,warn_buf);
 +  }
 +
 +  if (EEL_FULL(ir->coulombtype))
 +  {
 +      if (ir->coulombtype==eelPMESWITCH || ir->coulombtype==eelPMEUSER ||
 +          ir->coulombtype==eelPMEUSERSWITCH)
 +      {
 +          sprintf(err_buf,"With coulombtype = %s, rcoulomb must be <= rlist",
 +                  eel_names[ir->coulombtype]);
 +          CHECK(ir->rcoulomb > ir->rlist);
 +      }
 +      else if (ir->cutoff_scheme == ecutsGROUP && ir->coulomb_modifier == eintmodNONE)
 +      {
 +          if (ir->coulombtype == eelPME || ir->coulombtype == eelP3M_AD)
 +          {
 +              sprintf(err_buf,
 +                      "With coulombtype = %s (without modifier), rcoulomb must be equal to rlist,\n"
 +                      "or rlistlong if nstcalclr=1. For optimal energy conservation,consider using\n"
 +                      "a potential modifier.",eel_names[ir->coulombtype]);
 +              if(ir->nstcalclr==1)
 +              {
 +                  CHECK(ir->rcoulomb != ir->rlist && ir->rcoulomb != ir->rlistlong);
 +              }
 +              else
 +              {
 +                  CHECK(ir->rcoulomb != ir->rlist);
 +              }
 +          }
 +      }
 +  }
 +
 +  if (EEL_PME(ir->coulombtype)) {
 +    if (ir->pme_order < 3) {
 +        warning_error(wi,"pme-order can not be smaller than 3");
 +    }
 +  }
 +
 +  if (ir->nwall==2 && EEL_FULL(ir->coulombtype)) {
 +    if (ir->ewald_geometry == eewg3D) {
 +      sprintf(warn_buf,"With pbc=%s you should use ewald-geometry=%s",
 +            epbc_names[ir->ePBC],eewg_names[eewg3DC]);
 +      warning(wi,warn_buf);
 +    }
 +    /* This check avoids extra pbc coding for exclusion corrections */
 +    sprintf(err_buf,"wall-ewald-zfac should be >= 2");
 +    CHECK(ir->wall_ewald_zfac < 2);
 +  }
 +
 +  if (EVDW_SWITCHED(ir->vdwtype)) {
 +    sprintf(err_buf,"With vdwtype = %s rvdw-switch must be < rvdw. Or, better - use a potential modifier.",
 +          evdw_names[ir->vdwtype]);
 +    CHECK(ir->rvdw_switch >= ir->rvdw);
 +  } else if (ir->vdwtype == evdwCUT) {
 +      if (ir->cutoff_scheme == ecutsGROUP && ir->vdw_modifier == eintmodNONE) {
 +          sprintf(err_buf,"With vdwtype = %s, rvdw must be >= rlist unless you use a potential modifier",evdw_names[ir->vdwtype]);
 +          CHECK(ir->rlist > ir->rvdw);
 +      }
 +  }
 +    if (ir->cutoff_scheme == ecutsGROUP)
 +    {
 +        if (EEL_IS_ZERO_AT_CUTOFF(ir->coulombtype)
 +            && (ir->rlistlong <= ir->rcoulomb))
 +        {
 +            sprintf(warn_buf,"For energy conservation with switch/shift potentials, %s should be 0.1 to 0.3 nm larger than rcoulomb.",
 +                    IR_TWINRANGE(*ir) ? "rlistlong" : "rlist");
 +            warning_note(wi,warn_buf);
 +        }
 +        if (EVDW_SWITCHED(ir->vdwtype) && (ir->rlistlong <= ir->rvdw))
 +        {
 +            sprintf(warn_buf,"For energy conservation with switch/shift potentials, %s should be 0.1 to 0.3 nm larger than rvdw.",
 +                    IR_TWINRANGE(*ir) ? "rlistlong" : "rlist");
 +            warning_note(wi,warn_buf);
 +        }
 +    }
 +
 +  if (ir->vdwtype == evdwUSER && ir->eDispCorr != edispcNO) {
 +      warning_note(wi,"You have selected user tables with dispersion correction, the dispersion will be corrected to -C6/r^6 beyond rvdw_switch (the tabulated interaction between rvdw_switch and rvdw will not be double counted). Make sure that you really want dispersion correction to -C6/r^6.");
 +  }
 +
 +  if (ir->nstlist == -1) {
 +    sprintf(err_buf,"With nstlist=-1 rvdw and rcoulomb should be smaller than rlist to account for diffusion and possibly charge-group radii");
 +    CHECK(ir->rvdw >= ir->rlist || ir->rcoulomb >= ir->rlist);
 +  }
 +  sprintf(err_buf,"nstlist can not be smaller than -1");
 +  CHECK(ir->nstlist < -1);
 +
 +  if (ir->eI == eiLBFGS && (ir->coulombtype==eelCUT || ir->vdwtype==evdwCUT)
 +     && ir->rvdw != 0) {
 +    warning(wi,"For efficient BFGS minimization, use switch/shift/pme instead of cut-off.");
 +  }
 +
 +  if (ir->eI == eiLBFGS && ir->nbfgscorr <= 0) {
 +    warning(wi,"Using L-BFGS with nbfgscorr<=0 just gets you steepest descent.");
 +  }
 +
 +    /* ENERGY CONSERVATION */
 +    if (ir_NVE(ir) && ir->cutoff_scheme == ecutsGROUP)
 +    {
 +        if (!EVDW_MIGHT_BE_ZERO_AT_CUTOFF(ir->vdwtype) && ir->rvdw > 0 && ir->vdw_modifier == eintmodNONE)
 +        {
 +            sprintf(warn_buf,"You are using a cut-off for VdW interactions with NVE, for good energy conservation use vdwtype = %s (possibly with DispCorr)",
 +                    evdw_names[evdwSHIFT]);
 +            warning_note(wi,warn_buf);
 +        }
 +        if (!EEL_MIGHT_BE_ZERO_AT_CUTOFF(ir->coulombtype) && ir->rcoulomb > 0 && ir->coulomb_modifier == eintmodNONE)
 +        {
 +            sprintf(warn_buf,"You are using a cut-off for electrostatics with NVE, for good energy conservation use coulombtype = %s or %s",
 +                    eel_names[eelPMESWITCH],eel_names[eelRF_ZERO]);
 +            warning_note(wi,warn_buf);
 +        }
 +    }
 +
 +  /* IMPLICIT SOLVENT */
 +  if(ir->coulombtype==eelGB_NOTUSED)
 +  {
 +    ir->coulombtype=eelCUT;
 +    ir->implicit_solvent=eisGBSA;
 +    fprintf(stderr,"Note: Old option for generalized born electrostatics given:\n"
 +          "Changing coulombtype from \"generalized-born\" to \"cut-off\" and instead\n"
 +            "setting implicit-solvent value to \"GBSA\" in input section.\n");
 +  }
 +
 +  if(ir->sa_algorithm==esaSTILL)
 +  {
 +    sprintf(err_buf,"Still SA algorithm not available yet, use %s or %s instead\n",esa_names[esaAPPROX],esa_names[esaNO]);
 +    CHECK(ir->sa_algorithm == esaSTILL);
 +  }
 +  
 +  if(ir->implicit_solvent==eisGBSA)
 +  {
 +    sprintf(err_buf,"With GBSA implicit solvent, rgbradii must be equal to rlist.");
 +    CHECK(ir->rgbradii != ir->rlist);
 +        
 +    if(ir->coulombtype!=eelCUT)
 +    {
 +      sprintf(err_buf,"With GBSA, coulombtype must be equal to %s\n",eel_names[eelCUT]);
 +      CHECK(ir->coulombtype!=eelCUT);
 +    }
 +    if(ir->vdwtype!=evdwCUT)
 +    {
 +      sprintf(err_buf,"With GBSA, vdw-type must be equal to %s\n",evdw_names[evdwCUT]);
 +      CHECK(ir->vdwtype!=evdwCUT);
 +    }
 +    if(ir->nstgbradii<1)
 +    {
 +      sprintf(warn_buf,"Using GBSA with nstgbradii<1, setting nstgbradii=1");
 +      warning_note(wi,warn_buf);
 +      ir->nstgbradii=1;
 +    }
 +    if(ir->sa_algorithm==esaNO)
 +    {
 +      sprintf(warn_buf,"No SA (non-polar) calculation requested together with GB. Are you sure this is what you want?\n");
 +      warning_note(wi,warn_buf);
 +    }
 +    if(ir->sa_surface_tension<0 && ir->sa_algorithm!=esaNO)
 +    {
 +      sprintf(warn_buf,"Value of sa_surface_tension is < 0. Changing it to 2.05016 or 2.25936 kJ/nm^2/mol for Still and HCT/OBC respectively\n");
 +      warning_note(wi,warn_buf);
 +      
 +      if(ir->gb_algorithm==egbSTILL)
 +      {
 +        ir->sa_surface_tension = 0.0049 * CAL2JOULE * 100;
 +      }
 +      else
 +      {
 +        ir->sa_surface_tension = 0.0054 * CAL2JOULE * 100;
 +      }
 +    }
 +    if(ir->sa_surface_tension==0 && ir->sa_algorithm!=esaNO)
 +    {
 +      sprintf(err_buf, "Surface tension set to 0 while SA-calculation requested\n");
 +      CHECK(ir->sa_surface_tension==0 && ir->sa_algorithm!=esaNO);
 +    }
 +    
 +  }
 +
 +    if (ir->bAdress)
 +    {
 +        if (ir->cutoff_scheme != ecutsGROUP)
 +        {
 +            warning_error(wi,"AdresS simulation supports only cutoff-scheme=group");
 +        }
 +        if (!EI_SD(ir->eI))
 +        {
 +            warning_error(wi,"AdresS simulation supports only stochastic dynamics");
 +        }
 +        if (ir->epc != epcNO)
 +        {
 +            warning_error(wi,"AdresS simulation does not support pressure coupling");
 +        }
 +        if (EEL_FULL(ir->coulombtype))
 +        {
 +            warning_error(wi,"AdresS simulation does not support long-range electrostatics");
 +        }
 +    }
 +}
 +
 +/* count the number of text elements separated by whitespace in a string.
 +    str = the input string
 +    maxptr = the maximum number of allowed elements
 +    ptr = the output array of pointers to the first character of each element
 +    returns: the number of elements. */
 +int str_nelem(const char *str,int maxptr,char *ptr[])
 +{
 +  int  np=0;
 +  char *copy0,*copy;
 +  
 +  copy0=strdup(str); 
 +  copy=copy0;
 +  ltrim(copy);
 +  while (*copy != '\0') {
 +    if (np >= maxptr)
 +      gmx_fatal(FARGS,"Too many groups on line: '%s' (max is %d)",
 +                str,maxptr);
 +    if (ptr) 
 +      ptr[np]=copy;
 +    np++;
 +    while ((*copy != '\0') && !isspace(*copy))
 +      copy++;
 +    if (*copy != '\0') {
 +      *copy='\0';
 +      copy++;
 +    }
 +    ltrim(copy);
 +  }
 +  if (ptr == NULL)
 +    sfree(copy0);
 +
 +  return np;
 +}
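 +/* Usage sketch (hypothetical input): with char *ptr[MAXPTR],
 + *   n = str_nelem("SOL NA CL",MAXPTR,ptr);
 + * yields n == 3 with ptr[0] = "SOL", ptr[1] = "NA", ptr[2] = "CL". The
 + * pointers refer into a strdup'd copy of str, which is freed here only
 + * when ptr == NULL is passed. */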
 +
 +/* interpret a number of doubles from a string and put them in an array,
 +   after allocating space for them.
 +   str = the input string
 +   n = the output number of doubles read
 +   r = the output array of doubles (allocated here). */
 +static void parse_n_real(char *str,int *n,real **r)
 +{
 +  char *ptr[MAXPTR];
 +  int  i;
 +
 +  *n = str_nelem(str,MAXPTR,ptr);
 +
 +  snew(*r,*n);
 +  for(i=0; i<*n; i++) {
 +    (*r)[i] = strtod(ptr[i],NULL);
 +  }
 +}
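 +/* For example (hypothetical input): with char buf[] = "0.0 0.5 1.0",
 + * parse_n_real(buf,&n,&r) sets n = 3 and allocates r with
 + * r[0] = 0.0, r[1] = 0.5, r[2] = 1.0. */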
 +
 +static void do_fep_params(t_inputrec *ir, char fep_lambda[][STRLEN],char weights[STRLEN]) {
 +
 +    int i,j,max_n_lambda,nweights,nfep[efptNR];
 +    t_lambda *fep = ir->fepvals;
 +    t_expanded *expand = ir->expandedvals;
 +    real **count_fep_lambdas;
 +    gmx_bool bOneLambda = TRUE;
 +
 +    snew(count_fep_lambdas,efptNR);
 +
 +    /* FEP input processing */
 +    /* first, identify the number of lambda values for each type.
 +       All that are nonzero must have the same number */
 +
 +    for (i=0;i<efptNR;i++)
 +    {
 +        parse_n_real(fep_lambda[i],&(nfep[i]),&(count_fep_lambdas[i]));
 +    }
 +
 +    /* now, determine the number of components.  All must be either zero, or equal. */
 +
 +    max_n_lambda = 0;
 +    for (i=0;i<efptNR;i++)
 +    {
 +        if (nfep[i] > max_n_lambda) {
 +            max_n_lambda = nfep[i];  /* here's a nonzero one.  All of them
 +                                        must have the same number if it's not zero. */
 +            break;
 +        }
 +    }
 +
 +    for (i=0;i<efptNR;i++)
 +    {
 +        if (nfep[i] == 0)
 +        {
 +            ir->fepvals->separate_dvdl[i] = FALSE;
 +        }
 +        else if (nfep[i] == max_n_lambda)
 +        {
 +            if (i!=efptTEMPERATURE)  /* we treat this differently -- not really a reason to compute the derivative with
 +                                        respect to the temperature currently */
 +            {
 +                ir->fepvals->separate_dvdl[i] = TRUE;
 +            }
 +        }
 +        else
 +        {
 +            gmx_fatal(FARGS,"Number of lambdas (%d) for FEP type %s not equal to number of other types (%d)",
 +                      nfep[i],efpt_names[i],max_n_lambda);
 +        }
 +    }
 +    /* we don't print out dhdl if the temperature is changing, since we can't correctly define dhdl in this case */
 +    ir->fepvals->separate_dvdl[efptTEMPERATURE] = FALSE;
 +
 +    /* the number of lambdas is the number we've read in, which is either zero
 +       or the same for all */
 +    fep->n_lambda = max_n_lambda;
 +
 +    /* allocate space for the array of lambda values */
 +    snew(fep->all_lambda,efptNR);
 +    /* if init_lambda is defined, we need to set lambda */
 +    if ((fep->init_lambda > 0) && (fep->n_lambda == 0))
 +    {
 +        ir->fepvals->separate_dvdl[efptFEP] = TRUE;
 +    }
 +    /* otherwise allocate the space for all of the lambdas, and transfer the data */
 +    for (i=0;i<efptNR;i++)
 +    {
 +        snew(fep->all_lambda[i],fep->n_lambda);
 +        if (nfep[i] > 0)  /* if it's zero, the count_fep_lambdas array
 +                             for this type was never filled */
 +        {
 +            for (j=0;j<fep->n_lambda;j++)
 +            {
 +                fep->all_lambda[i][j] = (double)count_fep_lambdas[i][j];
 +            }
 +            sfree(count_fep_lambdas[i]);
 +        }
 +    }
 +    sfree(count_fep_lambdas);
 +
 +    /* "fep-vals" is either zero or the full number. If zero, we'll need to define fep-lambdas for internal
 +       bookkeeping -- for now, init_lambda */
 +
-   ITYPE ("init-lambda-state", fep->init_fep_state,0);
++    if ((nfep[efptFEP] == 0) && (fep->init_lambda >= 0))
 +    {
 +        for (i=0;i<fep->n_lambda;i++)
 +        {
 +            fep->all_lambda[efptFEP][i] = fep->init_lambda;
 +        }
 +    }
 +
 +    /* check to see if only a single component lambda is defined, and soft core is defined.
 +       In this case, turn on coulomb soft core */
 +
 +    if (max_n_lambda == 0)
 +    {
 +        bOneLambda = TRUE;
 +    }
 +    else
 +    {
 +        for (i=0;i<efptNR;i++)
 +        {
 +            if ((nfep[i] != 0) && (i!=efptFEP))
 +            {
 +                bOneLambda = FALSE;
 +            }
 +        }
 +    }
 +    if ((bOneLambda) && (fep->sc_alpha > 0))
 +    {
 +        fep->bScCoul = TRUE;
 +    }
 +
 +    /* Fill in the others with the efptFEP if they are not explicitly
 +       specified (i.e. nfep[i] == 0).  This means if fep is not defined,
 +       they are all zero. */
 +
 +    for (i=0;i<efptNR;i++)
 +    {
 +        if ((nfep[i] == 0) && (i!=efptFEP))
 +        {
 +            for (j=0;j<fep->n_lambda;j++)
 +            {
 +                fep->all_lambda[i][j] = fep->all_lambda[efptFEP][j];
 +            }
 +        }
 +    }
 +
 +
 +    /* sc_r_power = 48 operates on a very different scale, so check that sc_alpha is correspondingly small. */
 +    if (fep->sc_r_power == 48)
 +    {
 +        if (fep->sc_alpha > 0.1)
 +        {
 +            gmx_fatal(FARGS,"sc_alpha (%f) for sc_r_power = 48 should usually be between 0.001 and 0.004", fep->sc_alpha);
 +        }
 +    }
 +
 +    expand = ir->expandedvals;
 +    /* now read in the weights */
 +    parse_n_real(weights,&nweights,&(expand->init_lambda_weights));
 +    if (nweights == 0)
 +    {
 +        expand->bInit_weights = FALSE;
 +        snew(expand->init_lambda_weights,fep->n_lambda); /* initialize to zero */
 +    }
 +    else if (nweights != fep->n_lambda)
 +    {
 +        gmx_fatal(FARGS,"Number of weights (%d) is not equal to number of lambda values (%d)",
 +                  nweights,fep->n_lambda);
 +    }
 +    else
 +    {
 +        expand->bInit_weights = TRUE;
 +    }
 +    if ((expand->nstexpanded < 0) && (ir->efep != efepNO)) {
 +        expand->nstexpanded = fep->nstdhdl;
 +        /* if you don't specify nstexpanded when doing expanded ensemble free energy calcs, it is set to nstdhdl */
 +    }
 +    if ((expand->nstexpanded < 0) && ir->bSimTemp) {
 +        expand->nstexpanded = 2*(int)(ir->opts.tau_t[0]/ir->delta_t);
 +        /* if you don't specify nstexpanded when doing expanded ensemble simulated tempering, it is set to
 +           2*tau_t just to be careful so it's not too frequent  */
 +    }
 +}
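 +/* For illustration, a hypothetical mdp fragment
 +     fep-lambdas  = 0.0 0.5 1.0
 +     coul-lambdas = 0.0 1.0 1.0
 +   gives n_lambda = 3; the two explicit arrays are kept as entered, and the
 +   fep-lambdas values are copied into every component with nfep[i] == 0
 +   (mass, vdw, bonded, restraint, temperature). */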
 +
 +
 +static void do_simtemp_params(t_inputrec *ir) {
 +
 +    snew(ir->simtempvals->temperatures,ir->fepvals->n_lambda);
 +    GetSimTemps(ir->fepvals->n_lambda,ir->simtempvals,ir->fepvals->all_lambda[efptTEMPERATURE]);
 +
 +    return;
 +}
 +
 +static void do_wall_params(t_inputrec *ir,
 +                           char *wall_atomtype, char *wall_density,
 +                           t_gromppopts *opts)
 +{
 +    int  nstr,i;
 +    char *names[MAXPTR];
 +    double dbl;
 +
 +    opts->wall_atomtype[0] = NULL;
 +    opts->wall_atomtype[1] = NULL;
 +
 +    ir->wall_atomtype[0] = -1;
 +    ir->wall_atomtype[1] = -1;
 +    ir->wall_density[0] = 0;
 +    ir->wall_density[1] = 0;
 +  
 +    if (ir->nwall > 0)
 +    {
 +        nstr = str_nelem(wall_atomtype,MAXPTR,names);
 +        if (nstr != ir->nwall)
 +        {
 +            gmx_fatal(FARGS,"Expected %d elements for wall_atomtype, found %d",
 +                      ir->nwall,nstr);
 +        }
 +        for(i=0; i<ir->nwall; i++)
 +        {
 +            opts->wall_atomtype[i] = strdup(names[i]);
 +        }
 +    
 +        if (ir->wall_type == ewt93 || ir->wall_type == ewt104) {
 +            nstr = str_nelem(wall_density,MAXPTR,names);
 +            if (nstr != ir->nwall)
 +            {
 +                gmx_fatal(FARGS,"Expected %d elements for wall-density, found %d",ir->nwall,nstr);
 +            }
 +            for(i=0; i<ir->nwall; i++)
 +            {
 +                sscanf(names[i],"%lf",&dbl);
 +                if (dbl <= 0)
 +                {
 +                    gmx_fatal(FARGS,"wall-density[%d] = %f\n",i,dbl);
 +                }
 +                ir->wall_density[i] = dbl;
 +            }
 +        }
 +    }
 +}
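 +/* For illustration, a hypothetical two-wall mdp setup:
 +     nwall          = 2
 +     wall-atomtype  = C C
 +     wall-density   = 5 5
 +   Each option string must hold exactly nwall elements, and for the 9-3 and
 +   10-4 wall types every density must be positive. */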
 +
 +static void add_wall_energrps(gmx_groups_t *groups,int nwall,t_symtab *symtab)
 +{
 +  int  i;
 +  t_grps *grps;
 +  char str[STRLEN];
 +  
 +  if (nwall > 0) {
 +    srenew(groups->grpname,groups->ngrpname+nwall);
 +    grps = &(groups->grps[egcENER]);
 +    srenew(grps->nm_ind,grps->nr+nwall);
 +    for(i=0; i<nwall; i++) {
 +      sprintf(str,"wall%d",i);
 +      groups->grpname[groups->ngrpname] = put_symtab(symtab,str);
 +      grps->nm_ind[grps->nr++] = groups->ngrpname++;
 +    }
 +  }
 +}
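 +/* For illustration: with nwall = 2, this appends energy groups named "wall0"
 +   and "wall1" to the existing list of energy-group names. */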
 +
 +void read_expandedparams(int *ninp_p,t_inpfile **inp_p,
 +                         t_expanded *expand,warninp_t wi)
 +{
 +  int  ninp,nerror=0;
 +  t_inpfile *inp;
 +
 +  ninp   = *ninp_p;
 +  inp    = *inp_p;
 +
 +  /* read expanded ensemble parameters */
 +  CCTYPE ("expanded ensemble variables");
 +  ITYPE ("nstexpanded",expand->nstexpanded,-1);
 +  EETYPE("lmc-stats", expand->elamstats, elamstats_names);
 +  EETYPE("lmc-move", expand->elmcmove, elmcmove_names);
 +  EETYPE("lmc-weights-equil",expand->elmceq,elmceq_names);
 +  ITYPE ("weight-equil-number-all-lambda",expand->equil_n_at_lam,-1);
 +  ITYPE ("weight-equil-number-samples",expand->equil_samples,-1);
 +  ITYPE ("weight-equil-number-steps",expand->equil_steps,-1);
 +  RTYPE ("weight-equil-wl-delta",expand->equil_wl_delta,-1);
 +  RTYPE ("weight-equil-count-ratio",expand->equil_ratio,-1);
 +  CCTYPE("Seed for Monte Carlo in lambda space");
 +  ITYPE ("lmc-seed",expand->lmc_seed,-1);
 +  RTYPE ("mc-temperature",expand->mc_temp,-1);
 +  ITYPE ("lmc-repeats",expand->lmc_repeats,1);
 +  ITYPE ("lmc-gibbsdelta",expand->gibbsdeltalam,-1);
 +  ITYPE ("lmc-forced-nstart",expand->lmc_forced_nstart,0);
 +  EETYPE("symmetrized-transition-matrix", expand->bSymmetrizedTMatrix, yesno_names);
 +  ITYPE("nst-transition-matrix", expand->nstTij, -1);
 +  ITYPE ("mininum-var-min",expand->minvarmin, 100); /*default is reasonable */
 +  ITYPE ("weight-c-range",expand->c_range, 0); /* default is just C=0 */
 +  RTYPE ("wl-scale",expand->wl_scale,0.8);
 +  RTYPE ("wl-ratio",expand->wl_ratio,0.8);
 +  RTYPE ("init-wl-delta",expand->init_wl_delta,1.0);
 +  EETYPE("wl-oneovert",expand->bWLoneovert,yesno_names);
 +
 +  *ninp_p   = ninp;
 +  *inp_p    = inp;
 +
 +  return;
 +}
 +
 +void get_ir(const char *mdparin,const char *mdparout,
 +            t_inputrec *ir,t_gromppopts *opts,
 +            warninp_t wi)
 +{
 +  char      *dumstr[2];
 +  double    dumdub[2][6];
 +  t_inpfile *inp;
 +  const char *tmp;
 +  int       i,j,m,ninp;
 +  char      warn_buf[STRLEN];
 +  t_lambda  *fep = ir->fepvals;
 +  t_expanded *expand = ir->expandedvals;
 +
 +  inp = read_inpfile(mdparin, &ninp, NULL, wi);
 +
 +  snew(dumstr[0],STRLEN);
 +  snew(dumstr[1],STRLEN);
 +
 +  /* remove the following deprecated commands */
 +  REM_TYPE("title");
 +  REM_TYPE("cpp");
 +  REM_TYPE("domain-decomposition");
 +  REM_TYPE("andersen-seed");
 +  REM_TYPE("dihre");
 +  REM_TYPE("dihre-fc");
 +  REM_TYPE("dihre-tau");
 +  REM_TYPE("nstdihreout");
 +  REM_TYPE("nstcheckpoint");
 +
 +  /* replace the following commands with the clearer new versions */
 +  REPL_TYPE("unconstrained-start","continuation");
 +  REPL_TYPE("foreign-lambda","fep-lambdas");
 +
 +  CCTYPE ("VARIOUS PREPROCESSING OPTIONS");
 +  CTYPE ("Preprocessor information: use cpp syntax.");
 +  CTYPE ("e.g.: -I/home/joe/doe -I/home/mary/roe");
 +  STYPE ("include",   opts->include,  NULL);
 +  CTYPE ("e.g.: -DPOSRES -DFLEXIBLE (note these variable names are case sensitive)");
 +  STYPE ("define",    opts->define,   NULL);
 +    
 +  CCTYPE ("RUN CONTROL PARAMETERS");
 +  EETYPE("integrator",  ir->eI,         ei_names);
 +  CTYPE ("Start time and timestep in ps");
 +  RTYPE ("tinit",     ir->init_t,     0.0);
 +  RTYPE ("dt",                ir->delta_t,    0.001);
 +  STEPTYPE ("nsteps",   ir->nsteps,     0);
 +  CTYPE ("For exact run continuation or redoing part of a run");
 +  STEPTYPE ("init-step",ir->init_step,  0);
 +  CTYPE ("Part index is updated automatically on checkpointing (keeps files separate)");
 +  ITYPE ("simulation-part", ir->simulation_part, 1);
 +  CTYPE ("mode for center of mass motion removal");
 +  EETYPE("comm-mode",   ir->comm_mode,  ecm_names);
 +  CTYPE ("number of steps for center of mass motion removal");
 +  ITYPE ("nstcomm",   ir->nstcomm,    100);
 +  CTYPE ("group(s) for center of mass motion removal");
 +  STYPE ("comm-grps",   vcm,            NULL);
 +  
 +  CCTYPE ("LANGEVIN DYNAMICS OPTIONS");
 +  CTYPE ("Friction coefficient (amu/ps) and random seed");
 +  RTYPE ("bd-fric",     ir->bd_fric,    0.0);
 +  ITYPE ("ld-seed",     ir->ld_seed,    1993);
 +  
 +  /* Em stuff */
 +  CCTYPE ("ENERGY MINIMIZATION OPTIONS");
 +  CTYPE ("Force tolerance and initial step-size");
 +  RTYPE ("emtol",       ir->em_tol,     10.0);
 +  RTYPE ("emstep",      ir->em_stepsize,0.01);
 +  CTYPE ("Max number of iterations in relax-shells");
 +  ITYPE ("niter",       ir->niter,      20);
 +  CTYPE ("Step size (ps^2) for minimization of flexible constraints");
 +  RTYPE ("fcstep",      ir->fc_stepsize, 0);
 +  CTYPE ("Frequency of steepest descents steps when doing CG");
 +  ITYPE ("nstcgsteep",        ir->nstcgsteep, 1000);
 +  ITYPE ("nbfgscorr",   ir->nbfgscorr,  10); 
 +
 +  CCTYPE ("TEST PARTICLE INSERTION OPTIONS");
 +  RTYPE ("rtpi",      ir->rtpi,       0.05);
 +
 +  /* Output options */
 +  CCTYPE ("OUTPUT CONTROL OPTIONS");
 +  CTYPE ("Output frequency for coords (x), velocities (v) and forces (f)");
 +  ITYPE ("nstxout",   ir->nstxout,    0);
 +  ITYPE ("nstvout",   ir->nstvout,    0);
 +  ITYPE ("nstfout",   ir->nstfout,    0);
 +  ir->nstcheckpoint = 1000;
 +  CTYPE ("Output frequency for energies to log file and energy file");
 +  ITYPE ("nstlog",    ir->nstlog,     1000);
 +  ITYPE ("nstcalcenergy",ir->nstcalcenergy,   100);
 +  ITYPE ("nstenergy",   ir->nstenergy,  1000);
 +  CTYPE ("Output frequency and precision for .xtc file");
 +  ITYPE ("nstxtcout",   ir->nstxtcout,  0);
 +  RTYPE ("xtc-precision",ir->xtcprec,   1000.0);
 +  CTYPE ("This selects the subset of atoms for the .xtc file. You can");
 +  CTYPE ("select multiple groups. By default all atoms will be written.");
 +  STYPE ("xtc-grps",    xtc_grps,       NULL);
 +  CTYPE ("Selection of energy groups");
 +  STYPE ("energygrps",  energy,         NULL);
 +
 +  /* Neighbor searching */  
 +  CCTYPE ("NEIGHBORSEARCHING PARAMETERS");
 +  CTYPE ("cut-off scheme (group: using charge groups, Verlet: particle based cut-offs)");
 +  EETYPE("cutoff-scheme",     ir->cutoff_scheme,    ecutscheme_names);
 +  CTYPE ("nblist update frequency");
 +  ITYPE ("nstlist",   ir->nstlist,    10);
 +  CTYPE ("ns algorithm (simple or grid)");
 +  EETYPE("ns-type",     ir->ns_type,    ens_names);
 +  /* set ndelta to the optimal value of 2 */
 +  ir->ndelta = 2;
 +  CTYPE ("Periodic boundary conditions: xyz, no, xy");
 +  EETYPE("pbc",         ir->ePBC,       epbc_names);
 +  EETYPE("periodic-molecules", ir->bPeriodicMols, yesno_names);
 +  CTYPE ("Allowed energy drift due to the Verlet buffer in kJ/mol/ps per atom,");
 +  CTYPE ("a value of -1 means: use rlist");
 +  RTYPE("verlet-buffer-drift", ir->verletbuf_drift,    0.005);
 +  CTYPE ("nblist cut-off");
 +  RTYPE ("rlist",     ir->rlist,      1.0);
 +  CTYPE ("long-range cut-off for switched potentials");
 +  RTYPE ("rlistlong", ir->rlistlong,  -1);
 +  ITYPE ("nstcalclr", ir->nstcalclr,  -1);
 +
 +  /* Electrostatics */
 +  CCTYPE ("OPTIONS FOR ELECTROSTATICS AND VDW");
 +  CTYPE ("Method for doing electrostatics");
 +  EETYPE("coulombtype",       ir->coulombtype,    eel_names);
 +  EETYPE("coulomb-modifier",  ir->coulomb_modifier,    eintmod_names);
 +  CTYPE ("cut-off lengths");
 +  RTYPE ("rcoulomb-switch",   ir->rcoulomb_switch,    0.0);
 +  RTYPE ("rcoulomb",  ir->rcoulomb,   1.0);
 +  CTYPE ("Relative dielectric constant for the medium and the reaction field");
 +  RTYPE ("epsilon-r",   ir->epsilon_r,  1.0);
 +  RTYPE ("epsilon-rf",  ir->epsilon_rf, 0.0);
 +  CTYPE ("Method for doing Van der Waals");
 +  EETYPE("vdw-type",  ir->vdwtype,    evdw_names);
 +  EETYPE("vdw-modifier",      ir->vdw_modifier,    eintmod_names);
 +  CTYPE ("cut-off lengths");
 +  RTYPE ("rvdw-switch",       ir->rvdw_switch,        0.0);
 +  RTYPE ("rvdw",      ir->rvdw,       1.0);
 +  CTYPE ("Apply long range dispersion corrections for Energy and Pressure");
 +  EETYPE("DispCorr",    ir->eDispCorr,  edispc_names);
 +  CTYPE ("Extension of the potential lookup tables beyond the cut-off");
 +  RTYPE ("table-extension", ir->tabext, 1.0);
 +  CTYPE ("Separate tables between energy group pairs");
 +  STYPE ("energygrp-table", egptable,   NULL);
 +  CTYPE ("Spacing for the PME/PPPM FFT grid");
 +  RTYPE ("fourierspacing", ir->fourier_spacing,0.12);
 +  CTYPE ("FFT grid size, when a value is 0 fourierspacing will be used");
 +  ITYPE ("fourier-nx",  ir->nkx,         0);
 +  ITYPE ("fourier-ny",  ir->nky,         0);
 +  ITYPE ("fourier-nz",  ir->nkz,         0);
 +  CTYPE ("EWALD/PME/PPPM parameters");
 +  ITYPE ("pme-order",   ir->pme_order,   4);
 +  RTYPE ("ewald-rtol",  ir->ewald_rtol, 0.00001);
 +  EETYPE("ewald-geometry", ir->ewald_geometry, eewg_names);
 +  RTYPE ("epsilon-surface", ir->epsilon_surface, 0.0);
 +  EETYPE("optimize-fft",ir->bOptFFT,  yesno_names);
 +
 +  CCTYPE("IMPLICIT SOLVENT ALGORITHM");
 +  EETYPE("implicit-solvent", ir->implicit_solvent, eis_names);
 +      
 +  CCTYPE ("GENERALIZED BORN ELECTROSTATICS"); 
 +  CTYPE ("Algorithm for calculating Born radii");
 +  EETYPE("gb-algorithm", ir->gb_algorithm, egb_names);
 +  CTYPE ("Frequency of calculating the Born radii inside rlist");
 +  ITYPE ("nstgbradii", ir->nstgbradii, 1);
 +  CTYPE ("Cutoff for Born radii calculation; the contribution from atoms");
 +  CTYPE ("between rlist and rgbradii is updated every nstlist steps");
 +  RTYPE ("rgbradii",  ir->rgbradii, 1.0);
 +  CTYPE ("Dielectric coefficient of the implicit solvent");
 +  RTYPE ("gb-epsilon-solvent",ir->gb_epsilon_solvent, 80.0);
 +  CTYPE ("Salt concentration in M for Generalized Born models");
 +  RTYPE ("gb-saltconc",  ir->gb_saltconc, 0.0);
 +  CTYPE ("Scaling factors used in the OBC GB model. Default values are OBC(II)");
 +  RTYPE ("gb-obc-alpha", ir->gb_obc_alpha, 1.0);
 +  RTYPE ("gb-obc-beta", ir->gb_obc_beta, 0.8);
 +  RTYPE ("gb-obc-gamma", ir->gb_obc_gamma, 4.85);
 +  RTYPE ("gb-dielectric-offset", ir->gb_dielectric_offset, 0.009);
 +  EETYPE("sa-algorithm", ir->sa_algorithm, esa_names);
 +  CTYPE ("Surface tension (kJ/mol/nm^2) for the SA (nonpolar surface) part of GBSA");
 +  CTYPE ("The value -1 will set default value for Still/HCT/OBC GB-models.");
 +  RTYPE ("sa-surface-tension", ir->sa_surface_tension, -1);
 +               
 +  /* Coupling stuff */
 +  CCTYPE ("OPTIONS FOR WEAK COUPLING ALGORITHMS");
 +  CTYPE ("Temperature coupling");
 +  EETYPE("tcoupl",    ir->etc,        etcoupl_names);
 +  ITYPE ("nsttcouple", ir->nsttcouple,  -1);
 +  ITYPE("nh-chain-length",     ir->opts.nhchainlength, NHCHAINLENGTH);
 +  EETYPE("print-nose-hoover-chain-variables", ir->bPrintNHChains, yesno_names);
 +  CTYPE ("Groups to couple separately");
 +  STYPE ("tc-grps",     tcgrps,         NULL);
 +  CTYPE ("Time constant (ps) and reference temperature (K)");
 +  STYPE ("tau-t",     tau_t,          NULL);
 +  STYPE ("ref-t",     ref_t,          NULL);
 +  CTYPE ("pressure coupling");
 +  EETYPE("pcoupl",    ir->epc,        epcoupl_names);
 +  EETYPE("pcoupltype",        ir->epct,       epcoupltype_names);
 +  ITYPE ("nstpcouple", ir->nstpcouple,  -1);
 +  CTYPE ("Time constant (ps), compressibility (1/bar) and reference P (bar)");
 +  RTYPE ("tau-p",     ir->tau_p,      1.0);
 +  STYPE ("compressibility",   dumstr[0],      NULL);
 +  STYPE ("ref-p",       dumstr[1],      NULL);
 +  CTYPE ("Scaling of reference coordinates, No, All or COM");
 +  EETYPE ("refcoord-scaling",ir->refcoord_scaling,erefscaling_names);
 +
 +  /* QMMM */
 +  CCTYPE ("OPTIONS FOR QMMM calculations");
 +  EETYPE("QMMM", ir->bQMMM, yesno_names);
 +  CTYPE ("Groups treated Quantum Mechanically");
 +  STYPE ("QMMM-grps",  QMMM,          NULL);
 +  CTYPE ("QM method");
 +  STYPE("QMmethod",     QMmethod, NULL);
 +  CTYPE ("QMMM scheme");
 +  EETYPE("QMMMscheme",  ir->QMMMscheme,    eQMMMscheme_names);
 +  CTYPE ("QM basisset");
 +  STYPE("QMbasis",      QMbasis, NULL);
 +  CTYPE ("QM charge");
 +  STYPE ("QMcharge",    QMcharge,NULL);
 +  CTYPE ("QM multiplicity");
 +  STYPE ("QMmult",      QMmult,NULL);
 +  CTYPE ("Surface Hopping");
 +  STYPE ("SH",          bSH, NULL);
 +  CTYPE ("CAS space options");
 +  STYPE ("CASorbitals",      CASorbitals,   NULL);
 +  STYPE ("CASelectrons",     CASelectrons,  NULL);
 +  STYPE ("SAon", SAon, NULL);
 +  STYPE ("SAoff",SAoff,NULL);
 +  STYPE ("SAsteps",  SAsteps, NULL);
 +  CTYPE ("Scale factor for MM charges");
 +  RTYPE ("MMChargeScaleFactor", ir->scalefactor, 1.0);
 +  CTYPE ("Optimization of QM subsystem");
 +  STYPE ("bOPT",          bOPT, NULL);
 +  STYPE ("bTS",          bTS, NULL);
 +
 +  /* Simulated annealing */
 +  CCTYPE("SIMULATED ANNEALING");
 +  CTYPE ("Type of annealing for each temperature group (no/single/periodic)");
 +  STYPE ("annealing",   anneal,      NULL);
 +  CTYPE ("Number of time points to use for specifying annealing in each group");
 +  STYPE ("annealing-npoints", anneal_npoints, NULL);
 +  CTYPE ("List of times at the annealing points for each group");
 +  STYPE ("annealing-time",       anneal_time,       NULL);
 +  CTYPE ("Temp. at each annealing point, for each group.");
 +  STYPE ("annealing-temp",  anneal_temp,  NULL);
 +  
 +  /* Startup run */
 +  CCTYPE ("GENERATE VELOCITIES FOR STARTUP RUN");
 +  EETYPE("gen-vel",     opts->bGenVel,  yesno_names);
 +  RTYPE ("gen-temp",    opts->tempi,    300.0);
 +  ITYPE ("gen-seed",    opts->seed,     173529);
 +  
 +  /* Shake stuff */
 +  CCTYPE ("OPTIONS FOR BONDS");
 +  EETYPE("constraints",       opts->nshake,   constraints);
 +  CTYPE ("Type of constraint algorithm");
 +  EETYPE("constraint-algorithm",  ir->eConstrAlg, econstr_names);
 +  CTYPE ("Do not constrain the start configuration");
 +  EETYPE("continuation", ir->bContinuation, yesno_names);
 +  CTYPE ("Use successive overrelaxation to reduce the number of shake iterations");
 +  EETYPE("Shake-SOR", ir->bShakeSOR, yesno_names);
 +  CTYPE ("Relative tolerance of shake");
 +  RTYPE ("shake-tol", ir->shake_tol, 0.0001);
 +  CTYPE ("Highest order in the expansion of the constraint coupling matrix");
 +  ITYPE ("lincs-order", ir->nProjOrder, 4);
 +  CTYPE ("Number of iterations in the final step of LINCS. 1 is fine for");
 +  CTYPE ("normal simulations, but use 2 to conserve energy in NVE runs.");
 +  CTYPE ("For energy minimization with constraints it should be 4 to 8.");
 +  ITYPE ("lincs-iter", ir->nLincsIter, 1);
 +  CTYPE ("Lincs will write a warning to the stderr if in one step a bond"); 
 +  CTYPE ("rotates over more degrees than");
 +  RTYPE ("lincs-warnangle", ir->LincsWarnAngle, 30.0);
 +  CTYPE ("Convert harmonic bonds to morse potentials");
 +  EETYPE("morse",       opts->bMorse,yesno_names);
 +
 +  /* Energy group exclusions */
 +  CCTYPE ("ENERGY GROUP EXCLUSIONS");
 +  CTYPE ("Pairs of energy groups for which all non-bonded interactions are excluded");
 +  STYPE ("energygrp-excl", egpexcl,     NULL);
 +  
 +  /* Walls */
 +  CCTYPE ("WALLS");
 +  CTYPE ("Number of walls, type, atom types, densities and box-z scale factor for Ewald");
 +  ITYPE ("nwall", ir->nwall, 0);
 +  EETYPE("wall-type",     ir->wall_type,   ewt_names);
 +  RTYPE ("wall-r-linpot", ir->wall_r_linpot, -1);
 +  STYPE ("wall-atomtype", wall_atomtype, NULL);
 +  STYPE ("wall-density",  wall_density,  NULL);
 +  RTYPE ("wall-ewald-zfac", ir->wall_ewald_zfac, 3);
 +  
 +  /* COM pulling */
 +  CCTYPE("COM PULLING");
 +  CTYPE("Pull type: no, umbrella, constraint or constant-force");
 +  EETYPE("pull",          ir->ePull, epull_names);
 +  if (ir->ePull != epullNO) {
 +    snew(ir->pull,1);
 +    pull_grp = read_pullparams(&ninp,&inp,ir->pull,&opts->pull_start,wi);
 +  }
 +  
 +  /* Enforced rotation */
 +  CCTYPE("ENFORCED ROTATION");
 +  CTYPE("Enforced rotation: No or Yes");
 +  EETYPE("rotation",       ir->bRot, yesno_names);
 +  if (ir->bRot) {
 +    snew(ir->rot,1);
 +    rot_grp = read_rotparams(&ninp,&inp,ir->rot,wi);
 +  }
 +
 +  /* Refinement */
 +  CCTYPE("NMR refinement stuff");
 +  CTYPE ("Distance restraints type: No, Simple or Ensemble");
 +  EETYPE("disre",       ir->eDisre,     edisre_names);
 +  CTYPE ("Force weighting of pairs in one distance restraint: Conservative or Equal");
 +  EETYPE("disre-weighting", ir->eDisreWeighting, edisreweighting_names);
 +  CTYPE ("Use sqrt of the time averaged times the instantaneous violation");
 +  EETYPE("disre-mixed", ir->bDisreMixed, yesno_names);
 +  RTYPE ("disre-fc",  ir->dr_fc,      1000.0);
 +  RTYPE ("disre-tau", ir->dr_tau,     0.0);
 +  CTYPE ("Output frequency for pair distances to energy file");
 +  ITYPE ("nstdisreout", ir->nstdisreout, 100);
 +  CTYPE ("Orientation restraints: No or Yes");
 +  EETYPE("orire",       opts->bOrire,   yesno_names);
 +  CTYPE ("Orientation restraints force constant and tau for time averaging");
 +  RTYPE ("orire-fc",  ir->orires_fc,  0.0);
 +  RTYPE ("orire-tau", ir->orires_tau, 0.0);
 +  STYPE ("orire-fitgrp",orirefitgrp,    NULL);
 +  CTYPE ("Output frequency for trace(SD) and S to energy file");
 +  ITYPE ("nstorireout", ir->nstorireout, 100);
 +
 +  /* free energy variables */
 +  CCTYPE ("Free energy variables");
 +  EETYPE("free-energy", ir->efep, efep_names);
 +  STYPE ("couple-moltype",  couple_moltype,  NULL);
 +  EETYPE("couple-lambda0", opts->couple_lam0, couple_lam);
 +  EETYPE("couple-lambda1", opts->couple_lam1, couple_lam);
 +  EETYPE("couple-intramol", opts->bCoupleIntra, yesno_names);
 +
 +  RTYPE ("init-lambda", fep->init_lambda,-1); /* start with -1 so
 +                                                 we can recognize if
 +                                                 it was not entered */
-   ITYPE ("nstdhdl",fep->nstdhdl, 100);
++  ITYPE ("init-lambda-state", fep->init_fep_state,-1);
 +  RTYPE ("delta-lambda",fep->delta_lambda,0.0);
++  ITYPE ("nstdhdl",fep->nstdhdl, 50);
 +  STYPE ("fep-lambdas", fep_lambda[efptFEP], NULL);
 +  STYPE ("mass-lambdas", fep_lambda[efptMASS], NULL);
 +  STYPE ("coul-lambdas", fep_lambda[efptCOUL], NULL);
 +  STYPE ("vdw-lambdas", fep_lambda[efptVDW], NULL);
 +  STYPE ("bonded-lambdas", fep_lambda[efptBONDED], NULL);
 +  STYPE ("restraint-lambdas", fep_lambda[efptRESTRAINT], NULL);
 +  STYPE ("temperature-lambdas", fep_lambda[efptTEMPERATURE], NULL);
++  ITYPE ("calc-lambda-neighbors",fep->lambda_neighbors, 1);
 +  STYPE ("init-lambda-weights",lambda_weights,NULL);
 +  EETYPE("dhdl-print-energy", fep->bPrintEnergy, yesno_names);
 +  RTYPE ("sc-alpha",fep->sc_alpha,0.0);
 +  ITYPE ("sc-power",fep->sc_power,1);
 +  RTYPE ("sc-r-power",fep->sc_r_power,6.0);
 +  RTYPE ("sc-sigma",fep->sc_sigma,0.3);
 +  EETYPE("sc-coul",fep->bScCoul,yesno_names);
 +  ITYPE ("dh_hist_size", fep->dh_hist_size, 0);
 +  RTYPE ("dh_hist_spacing", fep->dh_hist_spacing, 0.1);
 +  EETYPE("separate-dhdl-file", fep->separate_dhdl_file,
 +                               separate_dhdl_file_names);
 +  EETYPE("dhdl-derivatives", fep->dhdl_derivatives, dhdl_derivatives_names);
 +  ITYPE ("dh_hist_size", fep->dh_hist_size, 0);
 +  RTYPE ("dh_hist_spacing", fep->dh_hist_spacing, 0.1);
 +
 +  /* Non-equilibrium MD stuff */  
 +  CCTYPE("Non-equilibrium MD stuff");
 +  STYPE ("acc-grps",    accgrps,        NULL);
 +  STYPE ("accelerate",  acc,            NULL);
 +  STYPE ("freezegrps",  freeze,         NULL);
 +  STYPE ("freezedim",   frdim,          NULL);
 +  RTYPE ("cos-acceleration", ir->cos_accel, 0);
 +  STYPE ("deform",      deform,         NULL);
 +
 +  /* simulated tempering variables */
 +  CCTYPE("simulated tempering variables");
 +  EETYPE("simulated-tempering",ir->bSimTemp,yesno_names);
 +  EETYPE("simulated-tempering-scaling",ir->simtempvals->eSimTempScale,esimtemp_names);
 +  RTYPE("sim-temp-low",ir->simtempvals->simtemp_low,300.0);
 +  RTYPE("sim-temp-high",ir->simtempvals->simtemp_high,300.0);
 +
 +  /* expanded ensemble variables */
 +  if (ir->efep==efepEXPANDED || ir->bSimTemp)
 +  {
 +      read_expandedparams(&ninp,&inp,expand,wi);
 +  }
 +
 +  /* Electric fields */
 +  CCTYPE("Electric fields");
 +  CTYPE ("Format is number of terms (int) and for all terms an amplitude (real)");
 +  CTYPE ("and a phase angle (real)");
 +  STYPE ("E-x",       efield_x,       NULL);
 +  STYPE ("E-xt",      efield_xt,      NULL);
 +  STYPE ("E-y",       efield_y,       NULL);
 +  STYPE ("E-yt",      efield_yt,      NULL);
 +  STYPE ("E-z",       efield_z,       NULL);
 +  STYPE ("E-zt",      efield_zt,      NULL);
 +  
 +  /* AdResS defined thingies */
 +  CCTYPE ("AdResS parameters");
 +  EETYPE("adress",       ir->bAdress, yesno_names);
 +  if (ir->bAdress) {
 +    snew(ir->adress,1);
 +    read_adressparams(&ninp,&inp,ir->adress,wi);
 +  }
 +
 +  /* User defined thingies */
 +  CCTYPE ("User defined thingies");
 +  STYPE ("user1-grps",  user1,          NULL);
 +  STYPE ("user2-grps",  user2,          NULL);
 +  ITYPE ("userint1",    ir->userint1,   0);
 +  ITYPE ("userint2",    ir->userint2,   0);
 +  ITYPE ("userint3",    ir->userint3,   0);
 +  ITYPE ("userint4",    ir->userint4,   0);
 +  RTYPE ("userreal1",   ir->userreal1,  0);
 +  RTYPE ("userreal2",   ir->userreal2,  0);
 +  RTYPE ("userreal3",   ir->userreal3,  0);
 +  RTYPE ("userreal4",   ir->userreal4,  0);
 +#undef CTYPE
 +
 +  write_inpfile(mdparout,ninp,inp,FALSE,wi);
 +  for (i=0; (i<ninp); i++) {
 +    sfree(inp[i].name);
 +    sfree(inp[i].value);
 +  }
 +  sfree(inp);
 +
 +  /* Process options if necessary */
 +  for(m=0; m<2; m++) {
 +    for(i=0; i<2*DIM; i++)
 +      dumdub[m][i]=0.0;
 +    if(ir->epc) {
 +      switch (ir->epct) {
 +      case epctISOTROPIC:
 +      if (sscanf(dumstr[m],"%lf",&(dumdub[m][XX]))!=1) {
 +        warning_error(wi,"Pressure coupling not enough values (I need 1)");
 +      }
 +      dumdub[m][YY]=dumdub[m][ZZ]=dumdub[m][XX];
 +      break;
 +      case epctSEMIISOTROPIC:
 +      case epctSURFACETENSION:
 +      if (sscanf(dumstr[m],"%lf%lf",
 +                 &(dumdub[m][XX]),&(dumdub[m][ZZ]))!=2) {
 +        warning_error(wi,"Pressure coupling not enough values (I need 2)");
 +      }
 +      dumdub[m][YY]=dumdub[m][XX];
 +      break;
 +      case epctANISOTROPIC:
 +      if (sscanf(dumstr[m],"%lf%lf%lf%lf%lf%lf",
 +                 &(dumdub[m][XX]),&(dumdub[m][YY]),&(dumdub[m][ZZ]),
 +                 &(dumdub[m][3]),&(dumdub[m][4]),&(dumdub[m][5]))!=6) {
 +        warning_error(wi,"Pressure coupling not enough values (I need 6)");
 +      }
 +      break;
 +      default:
 +      gmx_fatal(FARGS,"Pressure coupling type %s not implemented yet",
 +                  epcoupltype_names[ir->epct]);
 +      }
 +    }
 +  }
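 +  /* For illustration: with pcoupltype = semiisotropic, the compressibility
 +     and ref-p strings read into dumstr[] must each hold two values, e.g.
 +     "4.5e-5 4.5e-5" and "1.0 1.0"; the sscanf above maps them to the xx=yy
 +     and zz components of dumdub, which are copied into ir->compress and
 +     ir->ref_p just below. */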
 +  clear_mat(ir->ref_p);
 +  clear_mat(ir->compress);
 +  for(i=0; i<DIM; i++) {
 +    ir->ref_p[i][i]    = dumdub[1][i];
 +    ir->compress[i][i] = dumdub[0][i];
 +  }
 +  if (ir->epct == epctANISOTROPIC) {
 +    ir->ref_p[XX][YY] = dumdub[1][3];
 +    ir->ref_p[XX][ZZ] = dumdub[1][4];
 +    ir->ref_p[YY][ZZ] = dumdub[1][5];
 +    if (ir->ref_p[XX][YY]!=0 && ir->ref_p[XX][ZZ]!=0 && ir->ref_p[YY][ZZ]!=0) {
 +      warning(wi,"All off-diagonal reference pressures are non-zero. Are you sure you want to apply a threefold shear stress?\n");
 +    }
 +    ir->compress[XX][YY] = dumdub[0][3];
 +    ir->compress[XX][ZZ] = dumdub[0][4];
 +    ir->compress[YY][ZZ] = dumdub[0][5];
 +    for(i=0; i<DIM; i++) {
 +      for(m=0; m<i; m++) {
 +      ir->ref_p[i][m] = ir->ref_p[m][i];
 +      ir->compress[i][m] = ir->compress[m][i];
 +      }
 +    }
 +  } 
 +  
 +  if (ir->comm_mode == ecmNO)
 +    ir->nstcomm = 0;
 +
 +  opts->couple_moltype = NULL;
 +  if (strlen(couple_moltype) > 0) 
 +  {
 +      if (ir->efep != efepNO) 
 +      {
 +          opts->couple_moltype = strdup(couple_moltype);
 +          if (opts->couple_lam0 == opts->couple_lam1)
 +          {
 +              warning(wi,"The lambda=0 and lambda=1 states for coupling are identical");
 +          }
 +          if (ir->eI == eiMD && (opts->couple_lam0 == ecouplamNONE ||
 +                                 opts->couple_lam1 == ecouplamNONE)) 
 +          {
 +              warning(wi,"For proper sampling of the (nearly) decoupled state, stochastic dynamics should be used");
 +          }
 +      }
 +      else
 +      {
 +          warning(wi,"Can not couple a molecule with free_energy = no");
 +      }
 +  }
 +  /* FREE ENERGY AND EXPANDED ENSEMBLE OPTIONS */
 +  if (ir->efep != efepNO) {
 +      if (fep->delta_lambda > 0) {
 +          ir->efep = efepSLOWGROWTH;
 +      }
 +  }
 +
 +  if (ir->bSimTemp) {
 +      fep->bPrintEnergy = TRUE;
 +      /* always print out the energy to dhdl if we are doing simulated tempering, since we need the total energy
 +         if the temperature is changing. */
 +  }
 +
 +  if ((ir->efep != efepNO) || ir->bSimTemp)
 +  {
 +      ir->bExpanded = FALSE;
 +      if ((ir->efep == efepEXPANDED) || ir->bSimTemp)
 +      {
 +          ir->bExpanded = TRUE;
 +      }
 +      do_fep_params(ir,fep_lambda,lambda_weights);
 +      if (ir->bSimTemp) { /* done after fep params */
 +          do_simtemp_params(ir);
 +      }
 +  }
 +  else
 +  {
 +      ir->fepvals->n_lambda = 0;
 +  }
 +
 +  /* WALL PARAMETERS */
 +
 +  do_wall_params(ir,wall_atomtype,wall_density,opts);
 +
 +  /* ORIENTATION RESTRAINT PARAMETERS */
 +  
 +  if (opts->bOrire && str_nelem(orirefitgrp,MAXPTR,NULL)!=1) {
 +      warning_error(wi,"ERROR: Need one orientation restraint fit group\n");
 +  }
 +
 +  /* DEFORMATION PARAMETERS */
 +
 +  clear_mat(ir->deform);
 +  for(i=0; i<6; i++)
 +  {
 +      dumdub[0][i] = 0;
 +  }
 +  m = sscanf(deform,"%lf %lf %lf %lf %lf %lf",
 +           &(dumdub[0][0]),&(dumdub[0][1]),&(dumdub[0][2]),
 +           &(dumdub[0][3]),&(dumdub[0][4]),&(dumdub[0][5]));
 +  for(i=0; i<3; i++)
 +  {
 +      ir->deform[i][i] = dumdub[0][i];
 +  }
 +  ir->deform[YY][XX] = dumdub[0][3];
 +  ir->deform[ZZ][XX] = dumdub[0][4];
 +  ir->deform[ZZ][YY] = dumdub[0][5];
 +  if (ir->epc != epcNO) {
 +    for(i=0; i<3; i++)
 +      for(j=0; j<=i; j++)
 +      if (ir->deform[i][j]!=0 && ir->compress[i][j]!=0) {
 +        warning_error(wi,"A box element has deform set and compressibility > 0");
 +      }
 +    for(i=0; i<3; i++)
 +      for(j=0; j<i; j++)
 +      if (ir->deform[i][j]!=0) {
 +        for(m=j; m<DIM; m++)
 +          if (ir->compress[m][j]!=0) {
 +            sprintf(warn_buf,"An off-diagonal box element has deform set while compressibility > 0 for the same component of another box vector, this might lead to spurious periodicity effects.");
 +            warning(wi,warn_buf);
 +          }
 +      }
 +  }
 +
 +  sfree(dumstr[0]);
 +  sfree(dumstr[1]);
 +}
 +
 +static int search_QMstring(char *s,int ng,const char *gn[])
 +{
 +  /* same as normal search_string, but this one searches QM strings */
 +  int i;
 +
 +  for(i=0; (i<ng); i++)
 +    if (gmx_strcasecmp(s,gn[i]) == 0)
 +      return i;
 +
 +  gmx_fatal(FARGS,"this QM method or basisset (%s) is not implemented\n!",s);
 +
 +  return -1;
 +
 +} /* search_QMstring */
 +
 +
 +int search_string(char *s,int ng,char *gn[])
 +{
 +  int i;
 +  
 +  for(i=0; (i<ng); i++)
 +  {
 +    if (gmx_strcasecmp(s,gn[i]) == 0)
 +    {
 +      return i;
 +    }
 +  }
 +    
 +  gmx_fatal(FARGS,
 +            "Group %s referenced in the .mdp file was not found in the index file.\n"
 +            "Group names must match either [moleculetype] names or custom index group\n"
 +            "names, in which case you must supply an index file to the '-n' option\n"
 +            "of grompp.",
 +            s);
 +  
 +  return -1;
 +}
 +
 +static gmx_bool do_numbering(int natoms,gmx_groups_t *groups,int ng,char *ptrs[],
 +                         t_blocka *block,char *gnames[],
 +                         int gtype,int restnm,
 +                         int grptp,gmx_bool bVerbose,
 +                         warninp_t wi)
 +{
 +    unsigned short *cbuf;
 +    t_grps *grps=&(groups->grps[gtype]);
 +    int    i,j,gid,aj,ognr,ntot=0;
 +    const char *title;
 +    gmx_bool   bRest;
 +    char   warn_buf[STRLEN];
 +
 +    if (debug)
 +    {
 +        fprintf(debug,"Starting numbering %d groups of type %d\n",ng,gtype);
 +    }
 +  
 +    title = gtypes[gtype];
 +    
 +    snew(cbuf,natoms);
 +    /* Mark all id's as not set */
 +    for(i=0; (i<natoms); i++)
 +    {
 +        cbuf[i] = NOGID;
 +    }
 +  
 +    snew(grps->nm_ind,ng+1); /* +1 for possible rest group */
 +    for(i=0; (i<ng); i++)
 +    {
 +        /* Lookup the group name in the block structure */
 +        gid = search_string(ptrs[i],block->nr,gnames);
 +        if ((grptp != egrptpONE) || (i == 0))
 +        {
 +            grps->nm_ind[grps->nr++]=gid;
 +        }
 +        if (debug) 
 +        {
 +            fprintf(debug,"Found gid %d for group %s\n",gid,ptrs[i]);
 +        }
 +    
 +        /* Now go over the atoms in the group */
 +        for(j=block->index[gid]; (j<block->index[gid+1]); j++)
 +        {
 +
 +            aj=block->a[j];
 +      
 +            /* Range checking */
 +            if ((aj < 0) || (aj >= natoms)) 
 +            {
 +                gmx_fatal(FARGS,"Invalid atom number %d in indexfile",aj);
 +            }
 +            /* Lookup up the old group number */
 +            ognr = cbuf[aj];
 +            if (ognr != NOGID)
 +            {
 +                gmx_fatal(FARGS,"Atom %d in multiple %s groups (%d and %d)",
 +                          aj+1,title,ognr+1,i+1);
 +            }
 +            else
 +            {
 +                /* Store the group number in buffer */
 +                if (grptp == egrptpONE)
 +                {
 +                    cbuf[aj] = 0;
 +                }
 +                else
 +                {
 +                    cbuf[aj] = i;
 +                }
 +                ntot++;
 +            }
 +        }
 +    }
 +    
 +    /* Now check whether we have done all atoms */
 +    bRest = FALSE;
 +    if (ntot != natoms)
 +    {
 +        if (grptp == egrptpALL)
 +        {
 +            gmx_fatal(FARGS,"%d atoms are not part of any of the %s groups",
 +                      natoms-ntot,title);
 +        }
 +        else if (grptp == egrptpPART)
 +        {
 +            sprintf(warn_buf,"%d atoms are not part of any of the %s groups",
 +                    natoms-ntot,title);
 +            warning_note(wi,warn_buf);
 +        }
 +        /* Assign all atoms currently unassigned to a rest group */
 +        for(j=0; (j<natoms); j++)
 +        {
 +            if (cbuf[j] == NOGID)
 +            {
 +                cbuf[j] = grps->nr;
 +                bRest = TRUE;
 +            }
 +        }
 +        if (grptp != egrptpPART)
 +        {
 +            if (bVerbose)
 +            {
 +                fprintf(stderr,
 +                        "Making dummy/rest group for %s containing %d elements\n",
 +                        title,natoms-ntot);
 +            }
 +            /* Add group name "rest" */ 
 +            grps->nm_ind[grps->nr] = restnm;
 +            
 +            /* Assign the rest name to all atoms not currently assigned to a group */
 +            for(j=0; (j<natoms); j++)
 +            {
 +                if (cbuf[j] == NOGID)
 +                {
 +                    cbuf[j] = grps->nr;
 +                }
 +            }
 +            grps->nr++;
 +        }
 +    }
 +    
 +    if (grps->nr == 1 && (ntot == 0 || ntot == natoms))
 +    {
 +        /* All atoms are part of one (or no) group, no index required */
 +        groups->ngrpnr[gtype] = 0;
 +        groups->grpnr[gtype]  = NULL;
 +    }
 +    else
 +    {
 +        groups->ngrpnr[gtype] = natoms;
 +        snew(groups->grpnr[gtype],natoms);
 +        for(j=0; (j<natoms); j++)
 +        {
 +            groups->grpnr[gtype][j] = cbuf[j];
 +        }
 +    }
 +    
 +    sfree(cbuf);
 +
 +    return (bRest && grptp == egrptpPART);
 +}
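 +/* For illustration: given natoms = 100 and one index group of 60 atoms in
 +   ptrs[0], those atoms get group id 0; with grptp = egrptpALL_GENREST the
 +   remaining 40 atoms are collected into an auto-generated "rest" group,
 +   whereas grptp = egrptpALL would make the uncovered atoms a fatal error. */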
 +
 +static void calc_nrdf(gmx_mtop_t *mtop,t_inputrec *ir,char **gnames)
 +{
 +  t_grpopts *opts;
 +  gmx_groups_t *groups;
 +  t_pull  *pull;
 +  int     natoms,ai,aj,i,j,d,g,imin,jmin,nc;
 +  t_iatom *ia;
 +  int     *nrdf2,*na_vcm,na_tot;
 +  double  *nrdf_tc,*nrdf_vcm,nrdf_uc,n_sub=0;
 +  gmx_mtop_atomloop_all_t aloop;
 +  t_atom  *atom;
 +  int     mb,mol,ftype,as;
 +  gmx_molblock_t *molb;
 +  gmx_moltype_t *molt;
 +
 +  /* Calculate nrdf. 
 +   * First calc 3xnr-atoms for each group
 +   * then subtract half a degree of freedom for each constraint
 +   *
 +   * Only atoms and nuclei contribute to the degrees of freedom...
 +   */
 +
 +  opts = &ir->opts;
 +  
 +  groups = &mtop->groups;
 +  natoms = mtop->natoms;
 +
 +  /* Allocate one more for a possible rest group */
 +  /* We need to sum degrees of freedom into doubles,
 +   * since floats give too low nrdf's above 3 million atoms.
 +   */
 +  snew(nrdf_tc,groups->grps[egcTC].nr+1);
 +  snew(nrdf_vcm,groups->grps[egcVCM].nr+1);
 +  snew(na_vcm,groups->grps[egcVCM].nr+1);
 +  
 +  for(i=0; i<groups->grps[egcTC].nr; i++)
 +    nrdf_tc[i] = 0;
 +  for(i=0; i<groups->grps[egcVCM].nr+1; i++)
 +    nrdf_vcm[i] = 0;
 +
 +  snew(nrdf2,natoms);
 +  aloop = gmx_mtop_atomloop_all_init(mtop);
 +  while (gmx_mtop_atomloop_all_next(aloop,&i,&atom)) {
 +    nrdf2[i] = 0;
 +    if (atom->ptype == eptAtom || atom->ptype == eptNucleus) {
 +      g = ggrpnr(groups,egcFREEZE,i);
 +      /* Double count nrdf for particle i */
 +      for(d=0; d<DIM; d++) {
 +      if (opts->nFreeze[g][d] == 0) {
 +        nrdf2[i] += 2;
 +      }
 +      }
 +      nrdf_tc [ggrpnr(groups,egcTC ,i)] += 0.5*nrdf2[i];
 +      nrdf_vcm[ggrpnr(groups,egcVCM,i)] += 0.5*nrdf2[i];
 +    }
 +  }
 +
 +  as = 0;
 +  for(mb=0; mb<mtop->nmolblock; mb++) {
 +    molb = &mtop->molblock[mb];
 +    molt = &mtop->moltype[molb->type];
 +    atom = molt->atoms.atom;
 +    for(mol=0; mol<molb->nmol; mol++) {
 +      for (ftype=F_CONSTR; ftype<=F_CONSTRNC; ftype++) {
 +      ia = molt->ilist[ftype].iatoms;
 +      for(i=0; i<molt->ilist[ftype].nr; ) {
 +        /* Subtract degrees of freedom for the constraints,
 +         * if the particles still have degrees of freedom left.
 +         * If one of the particles is a vsite or a shell, then all
 +         * constraint motion will go there, but since they do not
 +         * contribute to the constraints the degrees of freedom do not
 +         * change.
 +         */
 +        ai = as + ia[1];
 +        aj = as + ia[2];
 +        if (((atom[ia[1]].ptype == eptNucleus) ||
 +             (atom[ia[1]].ptype == eptAtom)) &&
 +            ((atom[ia[2]].ptype == eptNucleus) ||
 +             (atom[ia[2]].ptype == eptAtom))) {
 +          if (nrdf2[ai] > 0) 
 +            jmin = 1;
 +          else
 +            jmin = 2;
 +          if (nrdf2[aj] > 0)
 +            imin = 1;
 +          else
 +            imin = 2;
 +          imin = min(imin,nrdf2[ai]);
 +          jmin = min(jmin,nrdf2[aj]);
 +          nrdf2[ai] -= imin;
 +          nrdf2[aj] -= jmin;
 +          nrdf_tc [ggrpnr(groups,egcTC ,ai)] -= 0.5*imin;
 +          nrdf_tc [ggrpnr(groups,egcTC ,aj)] -= 0.5*jmin;
 +          nrdf_vcm[ggrpnr(groups,egcVCM,ai)] -= 0.5*imin;
 +          nrdf_vcm[ggrpnr(groups,egcVCM,aj)] -= 0.5*jmin;
 +        }
 +        ia += interaction_function[ftype].nratoms+1;
 +        i  += interaction_function[ftype].nratoms+1;
 +      }
 +      }
 +      ia = molt->ilist[F_SETTLE].iatoms;
 +      for(i=0; i<molt->ilist[F_SETTLE].nr; ) {
 +      /* Subtract 1 dof from every atom in the SETTLE */
 +      for(j=0; j<3; j++) {
 +      ai = as + ia[1+j];
 +        imin = min(2,nrdf2[ai]);
 +        nrdf2[ai] -= imin;
 +        nrdf_tc [ggrpnr(groups,egcTC ,ai)] -= 0.5*imin;
 +        nrdf_vcm[ggrpnr(groups,egcVCM,ai)] -= 0.5*imin;
 +      }
 +      ia += 4;
 +      i  += 4;
 +      }
 +      as += molt->atoms.nr;
 +    }
 +  }
 +
 +  if (ir->ePull == epullCONSTRAINT) {
 +    /* Correct nrdf for the COM constraints.
 +     * We correct using the TC and VCM group of the first atom
 +     * in the reference and pull group. If atoms in one pull group
 +     * belong to different TC or VCM groups it is anyhow difficult
 +     * to determine the optimal nrdf assignment.
 +     */
 +    pull = ir->pull;
 +    if (pull->eGeom == epullgPOS) {
 +      nc = 0;
 +      for(i=0; i<DIM; i++) {
 +      if (pull->dim[i])
 +        nc++;
 +      }
 +    } else {
 +      nc = 1;
 +    }
 +    for(i=0; i<pull->ngrp; i++) {
 +      imin = 2*nc;
 +      if (pull->grp[0].nat > 0) {
 +      /* Subtract 1/2 dof from the reference group */
 +      ai = pull->grp[0].ind[0];
 +      if (nrdf_tc[ggrpnr(groups,egcTC,ai)] > 1) {
 +        nrdf_tc [ggrpnr(groups,egcTC ,ai)] -= 0.5;
 +        nrdf_vcm[ggrpnr(groups,egcVCM,ai)] -= 0.5;
 +        imin--;
 +      }
 +      }
 +      /* Subtract 1/2 dof from the pulled group */
 +      ai = pull->grp[1+i].ind[0];
 +      nrdf_tc [ggrpnr(groups,egcTC ,ai)] -= 0.5*imin;
 +      nrdf_vcm[ggrpnr(groups,egcVCM,ai)] -= 0.5*imin;
 +      if (nrdf_tc[ggrpnr(groups,egcTC,ai)] < 0)
 +      gmx_fatal(FARGS,"Center of mass pulling constraints caused the number of degrees of freedom for temperature coupling group %s to be negative",gnames[groups->grps[egcTC].nm_ind[ggrpnr(groups,egcTC,ai)]]);
 +    }
 +  }
 +  
 +  if (ir->nstcomm != 0) {
 +    /* Subtract 3 from the number of degrees of freedom in each vcm group
 +     * when com translation is removed and 6 when rotation is removed
 +     * as well.
 +     */
 +    switch (ir->comm_mode) {
 +    case ecmLINEAR:
 +      n_sub = ndof_com(ir);
 +      break;
 +    case ecmANGULAR:
 +      n_sub = 6;
 +      break;
 +    default:
 +      n_sub = 0;
 +      gmx_incons("Checking comm_mode");
 +    }
 +    
 +    for(i=0; i<groups->grps[egcTC].nr; i++) {
 +      /* Count the number of atoms of TC group i for every VCM group */
 +      for(j=0; j<groups->grps[egcVCM].nr+1; j++)
 +      na_vcm[j] = 0;
 +      na_tot = 0;
 +      for(ai=0; ai<natoms; ai++)
 +      if (ggrpnr(groups,egcTC,ai) == i) {
 +        na_vcm[ggrpnr(groups,egcVCM,ai)]++;
 +        na_tot++;
 +      }
 +      /* Correct for VCM removal according to the fraction of each VCM
 +       * group present in this TC group.
 +       */
 +      nrdf_uc = nrdf_tc[i];
 +      if (debug) {
 +      fprintf(debug,"T-group[%d] nrdf_uc = %g, n_sub = %g\n",
 +              i,nrdf_uc,n_sub);
 +      }
 +      nrdf_tc[i] = 0;
 +      for(j=0; j<groups->grps[egcVCM].nr+1; j++) {
 +      if (nrdf_vcm[j] > n_sub) {
 +        nrdf_tc[i] += nrdf_uc*((double)na_vcm[j]/(double)na_tot)*
 +          (nrdf_vcm[j] - n_sub)/nrdf_vcm[j];
 +      }
 +      if (debug) {
 +        fprintf(debug,"  nrdf_vcm[%d] = %g, nrdf = %g\n",
 +                j,nrdf_vcm[j],nrdf_tc[i]);
 +      }
 +      }
 +    }
 +  }
 +  for(i=0; (i<groups->grps[egcTC].nr); i++) {
 +    opts->nrdf[i] = nrdf_tc[i];
 +    if (opts->nrdf[i] < 0)
 +      opts->nrdf[i] = 0;
 +    fprintf(stderr,
 +          "Number of degrees of freedom in T-Coupling group %s is %.2f\n",
 +          gnames[groups->grps[egcTC].nm_ind[i]],opts->nrdf[i]);
 +  }
 +  
 +  sfree(nrdf2);
 +  sfree(nrdf_tc);
 +  sfree(nrdf_vcm);
 +  sfree(na_vcm);
 +}
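 +/* Worked example, for illustration: a rigid 3-site water treated with SETTLE
 +   starts with 3x3 = 9 degrees of freedom; the SETTLE loop above removes 1 dof
 +   per atom for its 3 distance constraints, leaving 6 dof per molecule, and
 +   with comm-mode = Linear a further ndof_com(ir) (typically 3) dof are
 +   removed, distributed over each center of mass motion removal group. */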
 +
 +static void decode_cos(char *s,t_cosines *cosine,gmx_bool bTime)
 +{
 +  char   *t;
 +  char   format[STRLEN],f1[STRLEN];
 +  double a,phi;
 +  int    i;
 +  
 +  t=strdup(s);
 +  trim(t);
 +  
 +  cosine->n=0;
 +  cosine->a=NULL;
 +  cosine->phi=NULL;
 +  if (strlen(t)) {
 +    sscanf(t,"%d",&(cosine->n));
 +    if (cosine->n <= 0) {
 +      cosine->n=0;
 +    } else {
 +      snew(cosine->a,cosine->n);
 +      snew(cosine->phi,cosine->n);
 +      
 +      sprintf(format,"%%*d");
 +      for(i=0; (i<cosine->n); i++) {
 +      strcpy(f1,format);
 +      strcat(f1,"%lf%lf");
 +      if (sscanf(t,f1,&a,&phi) < 2)
 +        gmx_fatal(FARGS,"Invalid input for electric field shift: '%s'",t);
 +      cosine->a[i]=a;
 +      cosine->phi[i]=phi;
 +      strcat(format,"%*lf%*lf");
 +      }
 +    }
 +  }
 +  sfree(t);
 +}
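 +/* For illustration, matching the "Electric fields" format documented in
 +   get_ir above: the string "1 0.5 0" decodes to one cosine term with
 +   a[0] = 0.5 and phi[0] = 0, while "2 0.5 0 0.3 90" gives two terms. */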
 +
 +static gmx_bool do_egp_flag(t_inputrec *ir,gmx_groups_t *groups,
 +                      const char *option,const char *val,int flag)
 +{
 +  /* The maximum number of energy group pairs would be MAXPTR*(MAXPTR+1)/2.
 +   * But since this is much larger than STRLEN, such a line can not be parsed.
 +   * The real maximum is the number of names that fit in a string: STRLEN/2.
 +   */
 +#define EGP_MAX (STRLEN/2)
 +  int  nelem,i,j,k,nr;
 +  char *names[EGP_MAX];
 +  char ***gnames;
 +  gmx_bool bSet;
 +
 +  gnames = groups->grpname;
 +
 +  nelem = str_nelem(val,EGP_MAX,names);
 +  if (nelem % 2 != 0)
 +    gmx_fatal(FARGS,"The number of groups for %s is odd",option);
 +  nr = groups->grps[egcENER].nr;
 +  bSet = FALSE;
 +  for(i=0; i<nelem/2; i++) {
 +    j = 0;
 +    while ((j < nr) &&
 +         gmx_strcasecmp(names[2*i],*(gnames[groups->grps[egcENER].nm_ind[j]])))
 +      j++;
 +    if (j == nr)
 +      gmx_fatal(FARGS,"%s in %s is not an energy group\n",
 +                names[2*i],option);
 +    k = 0;
 +    while ((k < nr) &&
 +         gmx_strcasecmp(names[2*i+1],*(gnames[groups->grps[egcENER].nm_ind[k]])))
 +      k++;
 +    if (k==nr)
 +      gmx_fatal(FARGS,"%s in %s is not an energy group\n",
 +            names[2*i+1],option);
 +    if ((j < nr) && (k < nr)) {
 +      ir->opts.egp_flags[nr*j+k] |= flag;
 +      ir->opts.egp_flags[nr*k+j] |= flag;
 +      bSet = TRUE;
 +    }
 +  }
 +
 +  return bSet;
 +}
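 +/* For illustration: with energy groups Protein and SOL, the hypothetical line
 +   "energygrp-excl = Protein SOL SOL SOL" parses as the pairs (Protein,SOL)
 +   and (SOL,SOL), and sets the given flag symmetrically in
 +   ir->opts.egp_flags for each pair. */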
 +
 +void do_index(const char* mdparin, const char *ndx,
 +              gmx_mtop_t *mtop,
 +              gmx_bool bVerbose,
 +              t_inputrec *ir,rvec *v,
 +              warninp_t wi)
 +{
 +  t_blocka *grps;
 +  gmx_groups_t *groups;
 +  int     natoms;
 +  t_symtab *symtab;
 +  t_atoms atoms_all;
 +  char    warnbuf[STRLEN],**gnames;
 +  int     nr,ntcg,ntau_t,nref_t,nacc,nofg,nSA,nSA_points,nSA_time,nSA_temp;
 +  real    tau_min;
 +  int     nstcmin;
 +  int     nacg,nfreeze,nfrdim,nenergy,nvcm,nuser;
 +  char    *ptr1[MAXPTR],*ptr2[MAXPTR],*ptr3[MAXPTR];
 +  int     i,j,k,restnm;
 +  real    SAtime;
 +  gmx_bool    bExcl,bTable,bSetTCpar,bAnneal,bRest;
 +  int     nQMmethod,nQMbasis,nQMcharge,nQMmult,nbSH,nCASorb,nCASelec,
 +    nSAon,nSAoff,nSAsteps,nQMg,nbOPT,nbTS;
 +  char    warn_buf[STRLEN];
 +
 +  if (bVerbose)
 +    fprintf(stderr,"processing index file...\n");
 +  debug_gmx();
 +  if (ndx == NULL) {
 +    snew(grps,1);
 +    snew(grps->index,1);
 +    snew(gnames,1);
 +    atoms_all = gmx_mtop_global_atoms(mtop);
 +    analyse(&atoms_all,grps,&gnames,FALSE,TRUE);
 +    free_t_atoms(&atoms_all,FALSE);
 +  } else {
 +    grps = init_index(ndx,&gnames);
 +  }
 +
 +  groups = &mtop->groups;
 +  natoms = mtop->natoms;
 +  symtab = &mtop->symtab;
 +
 +  snew(groups->grpname,grps->nr+1);
 +  
 +  for(i=0; (i<grps->nr); i++) {
 +    groups->grpname[i] = put_symtab(symtab,gnames[i]);
 +  }
 +  groups->grpname[i] = put_symtab(symtab,"rest");
 +  restnm=i;
 +  srenew(gnames,grps->nr+1);
 +  gnames[restnm] = *(groups->grpname[i]);
 +  groups->ngrpname = grps->nr+1;
 +
 +  set_warning_line(wi,mdparin,-1);
 +
 +  ntau_t = str_nelem(tau_t,MAXPTR,ptr1);
 +  nref_t = str_nelem(ref_t,MAXPTR,ptr2);
 +  ntcg   = str_nelem(tcgrps,MAXPTR,ptr3);
 +  if ((ntau_t != ntcg) || (nref_t != ntcg)) {
 +    gmx_fatal(FARGS,"Invalid T coupling input: %d groups, %d ref-t values and "
 +                "%d tau-t values",ntcg,nref_t,ntau_t);
 +  }
 +
 +  bSetTCpar = (ir->etc || EI_SD(ir->eI) || ir->eI==eiBD || EI_TPI(ir->eI));
 +  do_numbering(natoms,groups,ntcg,ptr3,grps,gnames,egcTC,
 +               restnm,bSetTCpar ? egrptpALL : egrptpALL_GENREST,bVerbose,wi);
 +  nr = groups->grps[egcTC].nr;
 +  ir->opts.ngtc = nr;
 +  snew(ir->opts.nrdf,nr);
 +  snew(ir->opts.tau_t,nr);
 +  snew(ir->opts.ref_t,nr);
 +  if (ir->eI==eiBD && ir->bd_fric==0) {
 +    fprintf(stderr,"bd-fric=0, so tau-t will be used as the inverse friction constant(s)\n");
 +  }
 +
 +  if (bSetTCpar)
 +  {
 +      if (nr != nref_t)
 +      {
 +          gmx_fatal(FARGS,"Not enough ref-t and tau-t values!");
 +      }
 +      
 +      tau_min = 1e20;
 +      for(i=0; (i<nr); i++)
 +      {
 +          ir->opts.tau_t[i] = strtod(ptr1[i],NULL);
 +          if ((ir->eI == eiBD || ir->eI == eiSD2) && ir->opts.tau_t[i] <= 0)
 +          {
 +              sprintf(warn_buf,"With integrator %s tau-t should be larger than 0",ei_names[ir->eI]);
 +              warning_error(wi,warn_buf);
 +          }
 +
 +          if (ir->etc != etcVRESCALE && ir->opts.tau_t[i] == 0)
 +          {
 +              warning_note(wi,"tau-t = -1 is the value to signal that a group should not have temperature coupling. Treating your use of tau-t = 0 as if you used -1.");
 +          }
 +
 +          if (ir->opts.tau_t[i] >= 0)
 +          {
 +              tau_min = min(tau_min,ir->opts.tau_t[i]);
 +          }
 +      }
 +      if (ir->etc != etcNO && ir->nsttcouple == -1)
 +      {
 +            ir->nsttcouple = ir_optimal_nsttcouple(ir);
 +      }
 +
 +      if (EI_VV(ir->eI)) 
 +      {
 +          if ((ir->etc==etcNOSEHOOVER) && (ir->epc==epcBERENDSEN)) {
 +              gmx_fatal(FARGS,"Cannot do Nose-Hoover temperature with Berendsen pressure control with md-vv; use either vrescale temperature with berendsen pressure or Nose-Hoover temperature with MTTK pressure");
 +          }
 +          if ((ir->epc==epcMTTK) && (ir->etc>etcNO))
 +          {
 -               int mincouple;
 -               mincouple = ir->nsttcouple;
 -               if (ir->nstpcouple < mincouple)
 -                   mincouple = ir->nstpcouple;
++              if (ir->nstpcouple != ir->nsttcouple)
 +              {
-               ir->nstpcouple = mincouple;
-               ir->nsttcouple = mincouple;
-               sprintf(warn_buf,"for current Trotter decomposition methods with vv, nsttcouple and nstpcouple must be equal.  Both have been reset to min(nsttcouple,nstpcouple) = %d",mincouple);
-               warning_note(wi,warn_buf);
++                  int mincouple = min(ir->nstpcouple,ir->nsttcouple);
++                  ir->nstpcouple = ir->nsttcouple = mincouple;
++                  sprintf(warn_buf,"for current Trotter decomposition methods with vv, nsttcouple and nstpcouple must be equal.  Both have been reset to min(nsttcouple,nstpcouple) = %d",mincouple);
++                  warning_note(wi,warn_buf);
 +              }
 +          }
 +      }
 +      /* velocity verlet with averaged kinetic energy KE = 0.5*(v(t+1/2) - v(t-1/2)) is implemented
 +         primarily for testing purposes, and does not work with temperature coupling other than 1 */
 +
 +      if (ETC_ANDERSEN(ir->etc)) {
 +          if (ir->nsttcouple != 1) {
 +              ir->nsttcouple = 1;
 +              sprintf(warn_buf,"Andersen temperature control methods assume nsttcouple = 1; there is no need for larger nsttcouple > 1, since no global parameters are computed. nsttcouple has been reset to 1");
 +              warning_note(wi,warn_buf);
 +          }
 +      }
 +      nstcmin = tcouple_min_integration_steps(ir->etc);
 +      if (nstcmin > 1)
 +      {
 +          if (tau_min/(ir->delta_t*ir->nsttcouple) < nstcmin)
 +          {
 +              sprintf(warn_buf,"For proper integration of the %s thermostat, tau-t (%g) should be at least %d times larger than nsttcouple*dt (%g)",
 +                      ETCOUPLTYPE(ir->etc),
 +                      tau_min,nstcmin,
 +                      ir->nsttcouple*ir->delta_t);
 +              warning(wi,warn_buf);
 +          }
 +      }
 +      for(i=0; (i<nr); i++)
 +      {
 +          ir->opts.ref_t[i] = strtod(ptr2[i],NULL);
 +          if (ir->opts.ref_t[i] < 0)
 +          {
 +              gmx_fatal(FARGS,"ref-t for group %d negative",i);
 +          }
 +      }
 +      /* set the lambda mc temperature to the md integrator temperature (which should be defined
 +         if we are in this conditional) if mc_temp is negative */
 +      if (ir->expandedvals->mc_temp < 0)
 +      {
 +          ir->expandedvals->mc_temp = ir->opts.ref_t[0];  /*for now, set to the first reft */
 +      }
 +  }
 +
 +  /* Simulated annealing for each group. There are nr groups */
 +  nSA = str_nelem(anneal,MAXPTR,ptr1);
 +  if (nSA == 1 && (ptr1[0][0]=='n' || ptr1[0][0]=='N'))
 +     nSA = 0;
 +  if(nSA>0 && nSA != nr) 
 +    gmx_fatal(FARGS,"Not enough annealing values: %d (for %d groups)\n",nSA,nr);
 +  else {
 +    snew(ir->opts.annealing,nr);
 +    snew(ir->opts.anneal_npoints,nr);
 +    snew(ir->opts.anneal_time,nr);
 +    snew(ir->opts.anneal_temp,nr);
 +    for(i=0;i<nr;i++) {
 +      ir->opts.annealing[i]=eannNO;
 +      ir->opts.anneal_npoints[i]=0;
 +      ir->opts.anneal_time[i]=NULL;
 +      ir->opts.anneal_temp[i]=NULL;
 +    }
 +    if (nSA > 0) {
 +      bAnneal=FALSE;
 +      for(i=0;i<nr;i++) { 
 +      if(ptr1[i][0]=='n' || ptr1[i][0]=='N') {
 +        ir->opts.annealing[i]=eannNO;
 +      } else if(ptr1[i][0]=='s'|| ptr1[i][0]=='S') {
 +        ir->opts.annealing[i]=eannSINGLE;
 +        bAnneal=TRUE;
 +      } else if(ptr1[i][0]=='p'|| ptr1[i][0]=='P') {
 +        ir->opts.annealing[i]=eannPERIODIC;
 +        bAnneal=TRUE;
 +      } 
 +      } 
 +      if(bAnneal) {
 +      /* Read the other fields too */
 +      nSA_points = str_nelem(anneal_npoints,MAXPTR,ptr1);
 +      if(nSA_points!=nSA) 
 +          gmx_fatal(FARGS,"Found %d annealing-npoints values for %d groups\n",nSA_points,nSA);
 +      for(k=0,i=0;i<nr;i++) {
 +        ir->opts.anneal_npoints[i]=strtol(ptr1[i],NULL,10);
 +        if(ir->opts.anneal_npoints[i]==1)
 +          gmx_fatal(FARGS,"Please specify at least a start and an end point for annealing\n");
 +        snew(ir->opts.anneal_time[i],ir->opts.anneal_npoints[i]);
 +        snew(ir->opts.anneal_temp[i],ir->opts.anneal_npoints[i]);
 +        k += ir->opts.anneal_npoints[i];
 +      }
 +
 +      nSA_time = str_nelem(anneal_time,MAXPTR,ptr1);
 +      if(nSA_time!=k) 
 +          gmx_fatal(FARGS,"Found %d annealing-time values, wanter %d\n",nSA_time,k);
 +      nSA_temp = str_nelem(anneal_temp,MAXPTR,ptr2);
 +      if(nSA_temp!=k) 
 +          gmx_fatal(FARGS,"Found %d annealing-temp values, wanted %d\n",nSA_temp,k);
 +
 +      for(i=0,k=0;i<nr;i++) {
 +        
 +        for(j=0;j<ir->opts.anneal_npoints[i];j++) {
 +          ir->opts.anneal_time[i][j]=strtod(ptr1[k],NULL);
 +          ir->opts.anneal_temp[i][j]=strtod(ptr2[k],NULL);
 +          if(j==0) {
 +            if(ir->opts.anneal_time[i][0] > (ir->init_t+GMX_REAL_EPS))
 +              gmx_fatal(FARGS,"First time point for annealing > init_t.\n");      
 +          } else { 
 +            /* j>0 */
 +            if(ir->opts.anneal_time[i][j]<ir->opts.anneal_time[i][j-1])
 +              gmx_fatal(FARGS,"Annealing timepoints out of order: t=%f comes after t=%f\n",
 +                          ir->opts.anneal_time[i][j],ir->opts.anneal_time[i][j-1]);
 +          }
 +          if(ir->opts.anneal_temp[i][j]<0) 
 +            gmx_fatal(FARGS,"Found negative temperature in annealing: %f\n",ir->opts.anneal_temp[i][j]);    
 +          k++;
 +        }
 +      }
 +      /* Print out some summary information, to make sure we got it right */
 +      for(i=0,k=0;i<nr;i++) {
 +        if(ir->opts.annealing[i]!=eannNO) {
 +          j = groups->grps[egcTC].nm_ind[i];
 +          fprintf(stderr,"Simulated annealing for group %s: %s, %d timepoints\n",
 +                  *(groups->grpname[j]),eann_names[ir->opts.annealing[i]],
 +                  ir->opts.anneal_npoints[i]);
 +          fprintf(stderr,"Time (ps)   Temperature (K)\n");
 +          /* All terms except the last one */
 +          for(j=0;j<(ir->opts.anneal_npoints[i]-1);j++) 
 +              fprintf(stderr,"%9.1f      %5.1f\n",ir->opts.anneal_time[i][j],ir->opts.anneal_temp[i][j]);
 +          
 +          /* Finally the last one */
 +          j = ir->opts.anneal_npoints[i]-1;
 +          if(ir->opts.annealing[i]==eannSINGLE)
 +            fprintf(stderr,"%9.1f-     %5.1f\n",ir->opts.anneal_time[i][j],ir->opts.anneal_temp[i][j]);
 +          else {
 +            fprintf(stderr,"%9.1f      %5.1f\n",ir->opts.anneal_time[i][j],ir->opts.anneal_temp[i][j]);
 +            if(fabs(ir->opts.anneal_temp[i][j]-ir->opts.anneal_temp[i][0])>GMX_REAL_EPS)
 +              warning_note(wi,"There is a temperature jump when your annealing loops back.\n");
 +          }
 +        }
 +      } 
 +      }
 +    }
 +  }   
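 +  /* Illustrative mdp fragment matched by the parsing above (hypothetical
 +   * values, two temperature-coupling groups, assuming tinit = 0):
 +   *   annealing         = single periodic
 +   *   annealing-npoints = 2 3
 +   *   annealing-time    = 0 100  0 50 100
 +   *   annealing-temp    = 300 280  300 320 300
 +   * Group 1 ramps 300 K -> 280 K; group 2 loops 300 -> 320 -> 300 K, so
 +   * its last temperature matches its first and no jump is warned about.
 +   */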
 +
 +  if (ir->ePull != epullNO) {
 +    make_pull_groups(ir->pull,pull_grp,grps,gnames);
 +  }
 +  
 +  if (ir->bRot) {
 +    make_rotation_groups(ir->rot,rot_grp,grps,gnames);
 +  }
 +
 +  nacc = str_nelem(acc,MAXPTR,ptr1);
 +  nacg = str_nelem(accgrps,MAXPTR,ptr2);
 +  if (nacg*DIM != nacc)
 +    gmx_fatal(FARGS,"Invalid Acceleration input: %d groups and %d acc. values",
 +              nacg,nacc);
 +  do_numbering(natoms,groups,nacg,ptr2,grps,gnames,egcACC,
 +               restnm,egrptpALL_GENREST,bVerbose,wi);
 +  nr = groups->grps[egcACC].nr;
 +  snew(ir->opts.acc,nr);
 +  ir->opts.ngacc=nr;
 +  
 +  for(i=k=0; (i<nacg); i++)
 +    for(j=0; (j<DIM); j++,k++)
 +      ir->opts.acc[i][j]=strtod(ptr1[k],NULL);
 +  for( ;(i<nr); i++)
 +    for(j=0; (j<DIM); j++)
 +      ir->opts.acc[i][j]=0;
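 +  /* Example mdp input matched by the parsing above (hypothetical group):
 +   *   acc-grps   = Ions
 +   *   accelerate = 0.1 0.0 0.0
 +   * DIM values per group (nm/ps^2); here Ions is accelerated along x only
 +   * and any remaining groups default to zero acceleration.
 +   */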
 +  
 +  nfrdim  = str_nelem(frdim,MAXPTR,ptr1);
 +  nfreeze = str_nelem(freeze,MAXPTR,ptr2);
 +  if (nfrdim != DIM*nfreeze)
 +    gmx_fatal(FARGS,"Invalid Freezing input: %d groups and %d freeze values",
 +              nfreeze,nfrdim);
 +  do_numbering(natoms,groups,nfreeze,ptr2,grps,gnames,egcFREEZE,
 +               restnm,egrptpALL_GENREST,bVerbose,wi);
 +  nr = groups->grps[egcFREEZE].nr;
 +  ir->opts.ngfrz=nr;
 +  snew(ir->opts.nFreeze,nr);
 +  for(i=k=0; (i<nfreeze); i++)
 +    for(j=0; (j<DIM); j++,k++) {
 +      ir->opts.nFreeze[i][j]=(gmx_strncasecmp(ptr1[k],"Y",1)==0);
 +      if (!ir->opts.nFreeze[i][j]) {
 +        if (gmx_strncasecmp(ptr1[k],"N",1) != 0) {
 +          sprintf(warn_buf,"Please use Y(ES) or N(O) for freezedim only "
 +                  "(not %s)", ptr1[k]);
 +          warning(wi,warn_buf);
 +        }
 +      }
 +    }
 +  for( ; (i<nr); i++)
 +    for(j=0; (j<DIM); j++)
 +      ir->opts.nFreeze[i][j]=0;
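 +  /* Example (hypothetical): freeze a lipid group in x and y only:
 +   *   freezegrps = Lipids
 +   *   freezedim  = Y Y N
 +   * Each group supplies DIM Y/N flags, parsed case-insensitively above;
 +   * groups not listed stay unfrozen.
 +   */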
 +  
 +  nenergy=str_nelem(energy,MAXPTR,ptr1);
 +  do_numbering(natoms,groups,nenergy,ptr1,grps,gnames,egcENER,
 +               restnm,egrptpALL_GENREST,bVerbose,wi);
 +  add_wall_energrps(groups,ir->nwall,symtab);
 +  ir->opts.ngener = groups->grps[egcENER].nr;
 +  nvcm=str_nelem(vcm,MAXPTR,ptr1);
 +  bRest =
 +    do_numbering(natoms,groups,nvcm,ptr1,grps,gnames,egcVCM,
 +                 restnm,nvcm==0 ? egrptpALL_GENREST : egrptpPART,bVerbose,wi);
 +  if (bRest) {
 +    warning(wi,"Some atoms are not part of any center of mass motion removal group.\n"
 +          "This may lead to artifacts.\n"
 +          "In most cases one should use one group for the whole system.");
 +  }
 +
 +  /* Now we have filled the freeze struct, so we can calculate NRDF */ 
 +  calc_nrdf(mtop,ir,gnames);
 +
 +  if (FALSE && v != NULL) { /* always false: scaling stays disabled until the check below is done per group */
 +    real fac,ntot=0;
 +    
 +    /* Must check per group! */
 +    for(i=0; (i<ir->opts.ngtc); i++) 
 +      ntot += ir->opts.nrdf[i];
 +    if (ntot != (DIM*natoms)) {
 +      fac = sqrt(ntot/(DIM*natoms));
 +      if (bVerbose)
 +        fprintf(stderr,"Scaling velocities by a factor of %.3f to account for constraints\n"
 +                "and removal of center of mass motion\n",fac);
 +      for(i=0; (i<natoms); i++)
 +        svmul(fac,v[i],v[i]);
 +    }
 +  }
 +  
 +  nuser=str_nelem(user1,MAXPTR,ptr1);
 +  do_numbering(natoms,groups,nuser,ptr1,grps,gnames,egcUser1,
 +               restnm,egrptpALL_GENREST,bVerbose,wi);
 +  nuser=str_nelem(user2,MAXPTR,ptr1);
 +  do_numbering(natoms,groups,nuser,ptr1,grps,gnames,egcUser2,
 +               restnm,egrptpALL_GENREST,bVerbose,wi);
 +  nuser=str_nelem(xtc_grps,MAXPTR,ptr1);
 +  do_numbering(natoms,groups,nuser,ptr1,grps,gnames,egcXTC,
 +               restnm,egrptpONE,bVerbose,wi);
 +  nofg = str_nelem(orirefitgrp,MAXPTR,ptr1);
 +  do_numbering(natoms,groups,nofg,ptr1,grps,gnames,egcORFIT,
 +               restnm,egrptpALL_GENREST,bVerbose,wi);
 +
 +  /* QMMM input processing */
 +  nQMg          = str_nelem(QMMM,MAXPTR,ptr1);
 +  nQMmethod     = str_nelem(QMmethod,MAXPTR,ptr2);
 +  nQMbasis      = str_nelem(QMbasis,MAXPTR,ptr3);
 +  if((nQMmethod != nQMg)||(nQMbasis != nQMg)){
 +    gmx_fatal(FARGS,"Invalid QMMM input: %d groups, %d basis sets"
 +            " and %d methods\n",nQMg,nQMbasis,nQMmethod);
 +  }
 +  /* group rest, if any, is always MM! */
 +  do_numbering(natoms,groups,nQMg,ptr1,grps,gnames,egcQMMM,
 +               restnm,egrptpALL_GENREST,bVerbose,wi);
 +  nr = nQMg; /*atoms->grps[egcQMMM].nr;*/
 +  ir->opts.ngQM = nQMg;
 +  snew(ir->opts.QMmethod,nr);
 +  snew(ir->opts.QMbasis,nr);
 +  for(i=0;i<nr;i++){
 +    /* input consists of strings: RHF CASSCF PM3 .. These need to be
 +     * converted to the corresponding enum in names.c
 +     */
 +    ir->opts.QMmethod[i] = search_QMstring(ptr2[i],eQMmethodNR,
 +                                           eQMmethod_names);
 +    ir->opts.QMbasis[i]  = search_QMstring(ptr3[i],eQMbasisNR,
 +                                           eQMbasis_names);
 +
 +  }
 +  nQMmult   = str_nelem(QMmult,MAXPTR,ptr1);
 +  nQMcharge = str_nelem(QMcharge,MAXPTR,ptr2);
 +  nbSH      = str_nelem(bSH,MAXPTR,ptr3);
 +  snew(ir->opts.QMmult,nr);
 +  snew(ir->opts.QMcharge,nr);
 +  snew(ir->opts.bSH,nr);
 +
 +  for(i=0;i<nr;i++){
 +    ir->opts.QMmult[i]   = strtol(ptr1[i],NULL,10);
 +    ir->opts.QMcharge[i] = strtol(ptr2[i],NULL,10);
 +    ir->opts.bSH[i]      = (gmx_strncasecmp(ptr3[i],"Y",1)==0);
 +  }
 +
 +  nCASelec  = str_nelem(CASelectrons,MAXPTR,ptr1);
 +  nCASorb   = str_nelem(CASorbitals,MAXPTR,ptr2);
 +  snew(ir->opts.CASelectrons,nr);
 +  snew(ir->opts.CASorbitals,nr);
 +  for(i=0;i<nr;i++){
 +    ir->opts.CASelectrons[i]= strtol(ptr1[i],NULL,10);
 +    ir->opts.CASorbitals[i] = strtol(ptr2[i],NULL,10);
 +  }
 +  /* special optimization options */
 +
 +  nbOPT = str_nelem(bOPT,MAXPTR,ptr1);
 +  nbTS = str_nelem(bTS,MAXPTR,ptr2);
 +  snew(ir->opts.bOPT,nr);
 +  snew(ir->opts.bTS,nr);
 +  for(i=0;i<nr;i++){
 +    ir->opts.bOPT[i] = (gmx_strncasecmp(ptr1[i],"Y",1)==0);
 +    ir->opts.bTS[i]  = (gmx_strncasecmp(ptr2[i],"Y",1)==0);
 +  }
 +  nSAon     = str_nelem(SAon,MAXPTR,ptr1);
 +  nSAoff    = str_nelem(SAoff,MAXPTR,ptr2);
 +  nSAsteps  = str_nelem(SAsteps,MAXPTR,ptr3);
 +  snew(ir->opts.SAon,nr);
 +  snew(ir->opts.SAoff,nr);
 +  snew(ir->opts.SAsteps,nr);
 +
 +  for(i=0;i<nr;i++){
 +    ir->opts.SAon[i]    = strtod(ptr1[i],NULL);
 +    ir->opts.SAoff[i]   = strtod(ptr2[i],NULL);
 +    ir->opts.SAsteps[i] = strtol(ptr3[i],NULL,10);
 +  }
 +  /* end of QMMM input */
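 +  /* Minimal illustrative QM/MM mdp fragment for the fields read above
 +   * (hypothetical group name and settings):
 +   *   QMMM-grps = QMpart
 +   *   QMmethod  = RHF
 +   *   QMbasis   = STO-3G
 +   *   QMcharge  = 0
 +   *   QMmult    = 1
 +   *   SH        = no
 +   */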
 +
 +  if (bVerbose)
 +    for(i=0; (i<egcNR); i++) {
 +      fprintf(stderr,"%-16s has %d element(s):",gtypes[i],groups->grps[i].nr); 
 +      for(j=0; (j<groups->grps[i].nr); j++)
 +        fprintf(stderr," %s",*(groups->grpname[groups->grps[i].nm_ind[j]]));
 +      fprintf(stderr,"\n");
 +    }
 +
 +  nr = groups->grps[egcENER].nr;
 +  snew(ir->opts.egp_flags,nr*nr);
 +
 +  bExcl = do_egp_flag(ir,groups,"energygrp-excl",egpexcl,EGP_EXCL);
 +    if (bExcl && ir->cutoff_scheme == ecutsVERLET) 
 +    {
 +        warning_error(wi,"Energy group exclusions are not (yet) implemented for the Verlet scheme");
 +    } 
 +  if (bExcl && EEL_FULL(ir->coulombtype))
 +    warning(wi,"Can not exclude the lattice Coulomb energy between energy groups");
 +
 +  bTable = do_egp_flag(ir,groups,"energygrp-table",egptable,EGP_TABLE);
 +  if (bTable && !(ir->vdwtype == evdwUSER) && 
 +      !(ir->coulombtype == eelUSER) && !(ir->coulombtype == eelPMEUSER) &&
 +      !(ir->coulombtype == eelPMEUSERSWITCH))
 +    gmx_fatal(FARGS,"Can only have energy group pair tables in combination with user tables for VdW and/or Coulomb");
 +
 +  decode_cos(efield_x,&(ir->ex[XX]),FALSE);
 +  decode_cos(efield_xt,&(ir->et[XX]),TRUE);
 +  decode_cos(efield_y,&(ir->ex[YY]),FALSE);
 +  decode_cos(efield_yt,&(ir->et[YY]),TRUE);
 +  decode_cos(efield_z,&(ir->ex[ZZ]),FALSE);
 +  decode_cos(efield_zt,&(ir->et[ZZ]),TRUE);
 +
 +  if (ir->bAdress)
 +    do_adress_index(ir->adress,groups,gnames,&(ir->opts),wi);
 +
 +  for(i=0; (i<grps->nr); i++)
 +    sfree(gnames[i]);
 +  sfree(gnames);
 +  done_blocka(grps);
 +  sfree(grps);
 +
 +}
 +
 +
 +
 +static void check_disre(gmx_mtop_t *mtop)
 +{
 +  gmx_ffparams_t *ffparams;
 +  t_functype *functype;
 +  t_iparams  *ip;
 +  int i,ndouble,ftype;
 +  int label,old_label;
 +  
 +  if (gmx_mtop_ftype_count(mtop,F_DISRES) > 0) {
 +    ffparams  = &mtop->ffparams;
 +    functype  = ffparams->functype;
 +    ip        = ffparams->iparams;
 +    ndouble   = 0;
 +    old_label = -1;
 +    for(i=0; i<ffparams->ntypes; i++) {
 +      ftype = functype[i];
 +      if (ftype == F_DISRES) {
 +        label = ip[i].disres.label;
 +        if (label == old_label) {
 +          fprintf(stderr,"Distance restraint index %d occurs twice\n",label);
 +          ndouble++;
 +        }
 +        old_label = label;
 +      }
 +    }
 +    if (ndouble>0)
 +      gmx_fatal(FARGS,"Found %d duplicate distance restraint indices,\n"
 +              "probably the parameters for multiple pairs in one restraint "
 +              "are not identical\n",ndouble);
 +  }
 +}
 +
 +static gmx_bool absolute_reference(t_inputrec *ir,gmx_mtop_t *sys,
 +                                   gmx_bool posres_only,
 +                                   ivec AbsRef)
 +{
 +    int d,g,i;
 +    gmx_mtop_ilistloop_t iloop;
 +    t_ilist *ilist;
 +    int nmol;
 +    t_iparams *pr;
 +
 +    clear_ivec(AbsRef);
 +
 +    if (!posres_only)
 +    {
 +        /* Check the COM */
 +        for(d=0; d<DIM; d++)
 +        {
 +            AbsRef[d] = (d < ndof_com(ir) ? 0 : 1);
 +        }
 +        /* Check for freeze groups */
 +        for(g=0; g<ir->opts.ngfrz; g++)
 +        {
 +            for(d=0; d<DIM; d++)
 +            {
 +                if (ir->opts.nFreeze[g][d] != 0)
 +                {
 +                    AbsRef[d] = 1;
 +                }
 +            }
 +        }
 +    }
 +
 +    /* Check for position restraints */
 +    iloop = gmx_mtop_ilistloop_init(sys);
 +    while (gmx_mtop_ilistloop_next(iloop,&ilist,&nmol))
 +    {
 +        if (nmol > 0 &&
 +            (AbsRef[XX] == 0 || AbsRef[YY] == 0 || AbsRef[ZZ] == 0))
 +        {
 +            for(i=0; i<ilist[F_POSRES].nr; i+=2)
 +            {
 +                pr = &sys->ffparams.iparams[ilist[F_POSRES].iatoms[i]];
 +                for(d=0; d<DIM; d++)
 +                {
 +                    if (pr->posres.fcA[d] != 0)
 +                    {
 +                        AbsRef[d] = 1;
 +                    }
 +                }
 +            }
 +            for(i=0; i<ilist[F_FBPOSRES].nr; i+=2)
 +            {
 +                /* Check for flat-bottom posres */
 +                pr = &sys->ffparams.iparams[ilist[F_FBPOSRES].iatoms[i]];
 +                if (pr->fbposres.k != 0)
 +                {
 +                    switch(pr->fbposres.geom)
 +                    {
 +                    case efbposresSPHERE:
 +                        AbsRef[XX] = AbsRef[YY] = AbsRef[ZZ] = 1;
 +                        break;
 +                    case efbposresCYLINDER:
 +                        AbsRef[XX] = AbsRef[YY] = 1;
 +                        break;
 +                    case efbposresX: /* d=XX */
 +                    case efbposresY: /* d=YY */
 +                    case efbposresZ: /* d=ZZ */
 +                        d = pr->fbposres.geom - efbposresX;
 +                        AbsRef[d] = 1;
 +                        break;
 +                    default:
 +                        gmx_fatal(FARGS," Invalid geometry for flat-bottom position restraint.\n"
 +                                  "Expected nr between 1 and %d. Found %d\n", efbposresNR-1,
 +                                  pr->fbposres.geom);
 +                    }
 +                }
 +            }
 +        }
 +    }
 +
 +    return (AbsRef[XX] != 0 && AbsRef[YY] != 0 && AbsRef[ZZ] != 0);
 +}
 +
 +void triple_check(const char *mdparin,t_inputrec *ir,gmx_mtop_t *sys,
 +                  warninp_t wi)
 +{
 +  char err_buf[256];
 +  int  i,m,g,nmol,npct;
 +  gmx_bool bCharge,bAcc;
 +  real gdt_max,*mgrp,mt;
 +  rvec acc;
 +  gmx_mtop_atomloop_block_t aloopb;
 +  gmx_mtop_atomloop_all_t aloop;
 +  t_atom *atom;
 +  ivec AbsRef;
 +  char warn_buf[STRLEN];
 +
 +  set_warning_line(wi,mdparin,-1);
 +
 +  if (EI_DYNAMICS(ir->eI) && !EI_SD(ir->eI) && ir->eI != eiBD &&
 +      ir->comm_mode == ecmNO &&
 +      !(absolute_reference(ir,sys,FALSE,AbsRef) || ir->nsteps <= 10)) {
 +    warning(wi,"You are not using center of mass motion removal (mdp option comm-mode), numerical rounding errors can lead to build up of kinetic energy of the center of mass");
 +  }
 +
 +    /* Check for pressure coupling with absolute position restraints */
 +    if (ir->epc != epcNO && ir->refcoord_scaling == erscNO)
 +    {
 +        absolute_reference(ir,sys,TRUE,AbsRef);
 +        for(m=0; m<DIM; m++)
 +        {
 +            if (AbsRef[m] && norm2(ir->compress[m]) > 0)
 +            {
 +                warning(wi,"You are using pressure coupling with absolute position restraints; this will give artifacts. Use the refcoord_scaling option.");
 +                break;
 +            }
 +        }
 +    }
 +
 +  bCharge = FALSE;
 +  aloopb = gmx_mtop_atomloop_block_init(sys);
 +  while (gmx_mtop_atomloop_block_next(aloopb,&atom,&nmol)) {
 +    if (atom->q != 0 || atom->qB != 0) {
 +      bCharge = TRUE;
 +    }
 +  }
 +  
 +  if (!bCharge) {
 +    if (EEL_FULL(ir->coulombtype)) {
 +      sprintf(err_buf,
 +            "You are using full electrostatics treatment %s for a system without charges.\n"
 +            "This costs a lot of performance for just processing zeros, consider using %s instead.\n",
 +            EELTYPE(ir->coulombtype),EELTYPE(eelCUT));
 +      warning(wi,err_buf);
 +    }
 +  } else {
 +    if (ir->coulombtype == eelCUT && ir->rcoulomb > 0 && !ir->implicit_solvent) {
 +      sprintf(err_buf,
 +            "You are using a plain Coulomb cut-off, which might produce artifacts.\n"
 +            "You might want to consider using %s electrostatics.\n",
 +            EELTYPE(eelPME));
 +      warning_note(wi,err_buf);
 +    }
 +  }
 +
 +  /* Generalized reaction field */  
 +  if (ir->opts.ngtc == 0) {
 +    sprintf(err_buf,"No temperature coupling while using coulombtype %s",
 +          eel_names[eelGRF]);
 +    CHECK(ir->coulombtype == eelGRF);
 +  }
 +  else {
 +    sprintf(err_buf,"When using coulombtype = %s"
 +          " ref-t for temperature coupling should be > 0",
 +          eel_names[eelGRF]);
 +    CHECK((ir->coulombtype == eelGRF) && (ir->opts.ref_t[0] <= 0));
 +  }
 +
 +    if (ir->eI == eiSD1 &&
 +        (gmx_mtop_ftype_count(sys,F_CONSTR) > 0 ||
 +         gmx_mtop_ftype_count(sys,F_SETTLE) > 0))
 +    {
 +        sprintf(warn_buf,"With constraints integrator %s is less accurate, consider using %s instead",ei_names[ir->eI],ei_names[eiSD2]);
 +        warning_note(wi,warn_buf);
 +    }
 +    
 +  bAcc = FALSE;
 +  for(i=0; (i<sys->groups.grps[egcACC].nr); i++) {
 +    for(m=0; (m<DIM); m++) {
 +      if (fabs(ir->opts.acc[i][m]) > 1e-6) {
 +        bAcc = TRUE;
 +      }
 +    }
 +  }
 +  if (bAcc) {
 +    clear_rvec(acc);
 +    snew(mgrp,sys->groups.grps[egcACC].nr);
 +    aloop = gmx_mtop_atomloop_all_init(sys);
 +    while (gmx_mtop_atomloop_all_next(aloop,&i,&atom)) {
 +      mgrp[ggrpnr(&sys->groups,egcACC,i)] += atom->m;
 +    }
 +    mt = 0.0;
 +    for(i=0; (i<sys->groups.grps[egcACC].nr); i++) {
 +      for(m=0; (m<DIM); m++)
 +        acc[m] += ir->opts.acc[i][m]*mgrp[i];
 +      mt += mgrp[i];
 +    }
 +    for(m=0; (m<DIM); m++) {
 +      if (fabs(acc[m]) > 1e-6) {
 +        const char *dim[DIM] = { "X", "Y", "Z" };
 +        fprintf(stderr,
 +                "Net acceleration in %s direction, will%s be corrected\n",
 +                dim[m],ir->nstcomm != 0 ? "" : " not");
 +        if (ir->nstcomm != 0 && m < ndof_com(ir)) {
 +          acc[m] /= mt;
 +          for (i=0; (i<sys->groups.grps[egcACC].nr); i++)
 +            ir->opts.acc[i][m] -= acc[m];
 +        }
 +      }
 +    }
 +    sfree(mgrp);
 +  }
 +
 +  if (ir->efep != efepNO && ir->fepvals->sc_alpha != 0 &&
 +      !gmx_within_tol(sys->ffparams.reppow,12.0,10*GMX_DOUBLE_EPS)) {
 +    gmx_fatal(FARGS,"Soft-core interactions are only supported with VdW repulsion power 12");
 +  }
 +
 +  if (ir->ePull != epullNO) {
 +    if (ir->pull->grp[0].nat == 0) {
 +      absolute_reference(ir,sys,FALSE,AbsRef);
 +      for(m=0; m<DIM; m++) {
 +        if (ir->pull->dim[m] && !AbsRef[m]) {
 +          warning(wi,"You are using an absolute reference for pulling, but the rest of the system does not have an absolute reference. This will lead to artifacts.");
 +          break;
 +        }
 +      }
 +    }
 +
 +    if (ir->pull->eGeom == epullgDIRPBC) {
 +      for(i=0; i<3; i++) {
 +        for(m=0; m<=i; m++) {
 +          if ((ir->epc != epcNO && ir->compress[i][m] != 0) ||
 +              ir->deform[i][m] != 0) {
 +            for(g=1; g<ir->pull->ngrp; g++) {
 +              if (ir->pull->grp[g].vec[m] != 0) {
 +                gmx_fatal(FARGS,"Can not have dynamic box while using pull geometry '%s' (dim %c)",EPULLGEOM(ir->pull->eGeom),'x'+m);
 +              }
 +            }
 +          }
 +        }
 +      }
 +    }
 +  }
 +
 +  check_disre(sys);
 +}
 +
 +void double_check(t_inputrec *ir,matrix box,gmx_bool bConstr,warninp_t wi)
 +{
 +  real min_size;
 +  gmx_bool bTWIN;
 +  char warn_buf[STRLEN];
 +  const char *ptr;
 +  
 +  ptr = check_box(ir->ePBC,box);
 +  if (ptr) {
 +      warning_error(wi,ptr);
 +  }  
 +
 +  if (bConstr && ir->eConstrAlg == econtSHAKE) {
 +    if (ir->shake_tol <= 0.0) {
 +      sprintf(warn_buf,"ERROR: shake-tol must be > 0 instead of %g\n",
 +              ir->shake_tol);
 +      warning_error(wi,warn_buf);
 +    }
 +
 +    if (IR_TWINRANGE(*ir) && ir->nstlist > 1) {
 +      sprintf(warn_buf,"With twin-range cut-offs and SHAKE the virial and the pressure are incorrect.");
 +      if (ir->epc == epcNO) {
 +        warning(wi,warn_buf);
 +      } else {
 +        warning_error(wi,warn_buf);
 +      }
 +    }
 +  }
 +
 +  if( (ir->eConstrAlg == econtLINCS) && bConstr) {
 +    /* If we have Lincs constraints: */
 +    if(ir->eI==eiMD && ir->etc==etcNO &&
 +       ir->eConstrAlg==econtLINCS && ir->nLincsIter==1) {
 +      sprintf(warn_buf,"For energy conservation with LINCS, lincs_iter should be 2 or larger.\n");
 +      warning_note(wi,warn_buf);
 +    }
 +    
 +    if ((ir->eI == eiCG || ir->eI == eiLBFGS) && (ir->nProjOrder<8)) {
 +      sprintf(warn_buf,"For accurate %s with LINCS constraints, lincs-order should be 8 or more.",ei_names[ir->eI]);
 +      warning_note(wi,warn_buf);
 +    }
 +    if (ir->epc==epcMTTK) {
 +        warning_error(wi,"MTTK not compatible with lincs -- use shake instead.");
 +    }
 +  }
 +
 +  if (ir->LincsWarnAngle > 90.0) {
 +    sprintf(warn_buf,"lincs-warnangle can not be larger than 90 degrees, setting it to 90.\n");
 +    warning(wi,warn_buf);
 +    ir->LincsWarnAngle = 90.0;
 +  }
 +
 +  if (ir->ePBC != epbcNONE) {
 +    if (ir->nstlist == 0) {
 +      warning(wi,"With nstlist=0 atoms are only put into the box at step 0, therefore drifting atoms might cause the simulation to crash.");
 +    }
 +    bTWIN = (ir->rlistlong > ir->rlist);
 +    if (ir->ns_type == ensGRID) {
 +      if (sqr(ir->rlistlong) >= max_cutoff2(ir->ePBC,box)) {
 +          sprintf(warn_buf,"ERROR: The cut-off length is longer than half the shortest box vector or longer than the smallest box diagonal element. Increase the box size or decrease %s.\n",
 +              bTWIN ? (ir->rcoulomb==ir->rlistlong ? "rcoulomb" : "rvdw"):"rlist");
 +          warning_error(wi,warn_buf);
 +      }
 +    } else {
 +      min_size = min(box[XX][XX],min(box[YY][YY],box[ZZ][ZZ]));
 +      if (2*ir->rlistlong >= min_size) {
 +          sprintf(warn_buf,"ERROR: One of the box lengths is smaller than twice the cut-off length. Increase the box size or decrease rlist.");
 +          warning_error(wi,warn_buf);
 +          if (TRICLINIC(box))
 +            fprintf(stderr,"Grid search might allow larger cut-offs than simple search with triclinic boxes.\n");
 +      }
 +    }
 +  }
 +}
 +
 +void check_chargegroup_radii(const gmx_mtop_t *mtop,const t_inputrec *ir,
 +                             rvec *x,
 +                             warninp_t wi)
 +{
 +    real rvdw1,rvdw2,rcoul1,rcoul2;
 +    char warn_buf[STRLEN];
 +
 +    calc_chargegroup_radii(mtop,x,&rvdw1,&rvdw2,&rcoul1,&rcoul2);
 +
 +    if (rvdw1 > 0)
 +    {
 +        printf("Largest charge group radii for Van der Waals: %5.3f, %5.3f nm\n",
 +               rvdw1,rvdw2);
 +    }
 +    if (rcoul1 > 0)
 +    {
 +        printf("Largest charge group radii for Coulomb:       %5.3f, %5.3f nm\n",
 +               rcoul1,rcoul2);
 +    }
 +
 +    if (ir->rlist > 0)
 +    {
 +        if (rvdw1  + rvdw2  > ir->rlist ||
 +            rcoul1 + rcoul2 > ir->rlist)
 +        {
 +            sprintf(warn_buf,"The sum of the two largest charge group radii (%f) is larger than rlist (%f)\n",max(rvdw1+rvdw2,rcoul1+rcoul2),ir->rlist);
 +            warning(wi,warn_buf);
 +        }
 +        else
 +        {
 +            /* Here we do not use the zero at cut-off macro,
 +             * since user defined interactions might purposely
 +             * not be zero at the cut-off.
 +             */
 +            if (EVDW_IS_ZERO_AT_CUTOFF(ir->vdwtype) &&
 +                rvdw1 + rvdw2 > ir->rlist - ir->rvdw)
 +            {
 +                sprintf(warn_buf,"The sum of the two largest charge group radii (%f) is larger than rlist (%f) - rvdw (%f)\n",
 +                        rvdw1+rvdw2,
 +                        ir->rlist,ir->rvdw);
 +                if (ir_NVE(ir))
 +                {
 +                    warning(wi,warn_buf);
 +                }
 +                else
 +                {
 +                    warning_note(wi,warn_buf);
 +                }
 +            }
 +            if (EEL_IS_ZERO_AT_CUTOFF(ir->coulombtype) &&
 +                rcoul1 + rcoul2 > ir->rlistlong - ir->rcoulomb)
 +            {
 +                sprintf(warn_buf,"The sum of the two largest charge group radii (%f) is larger than %s (%f) - rcoulomb (%f)\n",
 +                        rcoul1+rcoul2,
 +                        ir->rlistlong > ir->rlist ? "rlistlong" : "rlist",
 +                        ir->rlistlong,ir->rcoulomb);
 +                if (ir_NVE(ir))
 +                {
 +                    warning(wi,warn_buf);
 +                }
 +                else
 +                {
 +                    warning_note(wi,warn_buf);
 +                }
 +            }
 +        }
 +    }
 +}
index 9648f68e653b3fbfcee2171b48ec5962ce33754d,0000000000000000000000000000000000000000..646c7d1830b8bce42b83f9582aafbf33e6a96908
mode 100644,000000..100644
--- /dev/null
@@@ -1,74 -1,0 +1,76 @@@
-                       t_inputrec *ir,const t_commrec *cr,gmx_bool bPartDecomp,
-                       t_fcdata *fcd,t_state *state);
 +
 +/*
 + * 
 + *                This source code is part of
 + * 
 + *                 G   R   O   M   A   C   S
 + * 
 + *          GROningen MAchine for Chemical Simulations
 + * 
 + *                        VERSION 3.2.0
 + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
 + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 + * Copyright (c) 2001-2004, The GROMACS development team,
 + * check out http://www.gromacs.org for more information.
 +
 + * This program is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU General Public License
 + * as published by the Free Software Foundation; either version 2
 + * of the License, or (at your option) any later version.
 + * 
 + * If you want to redistribute modifications, please consider that
 + * scientific software is very special. Version control is crucial -
 + * bugs must be traceable. We will be happy to consider code for
 + * inclusion in the official distribution, but derived work must not
 + * be called official GROMACS. Details are found in the README & COPYING
 + * files - if they are missing, get the official version at www.gromacs.org.
 + * 
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the papers on the package - you can find them in the top README file.
 + * 
 + * For more info, check our website at http://www.gromacs.org
 + * 
 + * And Hey:
 + * Gromacs Runs On Most of All Computer Systems
 + */
 +
 +#ifndef _disre_h
 +#define _disre_h
 +
 +#include "sysstuff.h"
 +#include "typedefs.h"
 +
 +#ifdef __cplusplus
 +extern "C" {
 +#endif
 +
 +void init_disres(FILE *fplog,const gmx_mtop_t *mtop,
++                 t_inputrec *ir,const t_commrec *cr,gmx_bool bPartDecomp,
++                 t_fcdata *fcd,t_state *state, gmx_bool bIsREMD);
 +/* Initiate *fcd data; must be called once. nbonds is the number
 + * of iatoms in the ilist of the idef struct.
 + * When time averaging is used, the history is initialized in state,
 + * unless it was read before from a checkpoint file.
++ * The implementation of distance restraints with -multi
++ * must differ according to whether REMD is active.
 + */
 +
 +void calc_disres_R_6(const gmx_multisim_t *ms,
 +                          int nfa,const t_iatom *fa,const t_iparams ip[],
 +                          const rvec *x,const t_pbc *pbc,
 +                          t_fcdata *fcd,history_t *hist);
 +/* Calculates r and r^-3 (inst. and time averaged) for all pairs
 + * and the ensemble averaged r^-6 (inst. and time averaged) for all restraints
 + */
 +
 +t_ifunc ta_disres;
 +/* Calculate the distance restraint forces, return the potential */
 +
 +void update_disres_history(t_fcdata *fcd,history_t *hist);
 +/* Copy the new time averages that have been calculated in calc_disres_R_6 */
 +
 +#ifdef __cplusplus
 +}
 +#endif
 +
 +#endif        /* _disre_h */
index 6e785fe840b923811366d1a2b2c03b5246edc5c2,0000000000000000000000000000000000000000..87ff33ea657f2e1432d0e88c356e6c6f91d4f6d4
mode 100644,000000..100644
--- /dev/null
@@@ -1,297 -1,0 +1,304 @@@
 +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
 + *
 + * 
 + * This file is part of Gromacs        Copyright (c) 1991-2008
 + * David van der Spoel, Erik Lindahl, Berk Hess, University of Groningen.
 + *
 + * This program is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU General Public License
 + * as published by the Free Software Foundation; either version 2
 + * of the License, or (at your option) any later version.
 + *
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the research papers on the package. Check out http://www.gromacs.org
 + * 
 + * And Hey:
 + * Gnomes, ROck Monsters And Chili Sauce
 + */
 +
 +#ifndef _domdec_h
 +#define _domdec_h
 +
 +#include "typedefs.h"
 +#include "types/commrec.h"
 +#include "vsite.h"
 +#include "genborn.h"
 +
 +#ifdef __cplusplus
 +extern "C" {
 +#endif
 +
 +int ddglatnr(gmx_domdec_t *dd,int i);
 +/* Returns the global topology atom number belonging to local atom index i.
 + * This function is intended for writing ascii output
 + * and returns atom numbers starting at 1.
 + * When dd=NULL returns i+1.
 + */
 +
 +t_block *dd_charge_groups_global(gmx_domdec_t *dd);
 +/* Return a block struct for the charge groups of the whole system */
 +
 +gmx_bool dd_filled_nsgrid_home(gmx_domdec_t *dd);
 +/* Is the ns grid already filled with the home particles? */
 +
 +void dd_store_state(gmx_domdec_t *dd,t_state *state);
 +/* Store the global cg indices of the home cgs in state,
 + * so it can be reset, even after a new DD partitioning.
 + */
 +
 +gmx_domdec_zones_t *domdec_zones(gmx_domdec_t *dd);
 +
 +void dd_get_ns_ranges(gmx_domdec_t *dd,int icg,
 +                             int *jcg0,int *jcg1,ivec shift0,ivec shift1);
 +
 +int dd_natoms_vsite(gmx_domdec_t *dd);
 +
 +void dd_get_constraint_range(gmx_domdec_t *dd,
 +                                  int *at_start,int *at_end);
 +
 +real dd_cutoff_mbody(gmx_domdec_t *dd);
 +
 +real dd_cutoff_twobody(gmx_domdec_t *dd);
 +
 +gmx_bool gmx_pmeonlynode(t_commrec *cr,int nodeid);
 +/* Return whether nodeid in cr->mpi_comm_mysim is a PME-only node */
 +
 +void get_pme_ddnodes(t_commrec *cr,int pmenodeid,
 +                          int *nmy_ddnodes,int **my_ddnodes,int *node_peer);
 +/* Returns the set of DD nodes that communicate with pme node cr->nodeid */
 +
 +int dd_pme_maxshift_x(gmx_domdec_t *dd);
 +/* Returns the maximum shift for coordinate communication in PME, dim x */
 +
 +int dd_pme_maxshift_y(gmx_domdec_t *dd);
 +/* Returns the maximum shift for coordinate communication in PME, dim y */
 +
 +void make_dd_communicators(FILE *fplog,t_commrec *cr,int dd_node_order);
 +
 +gmx_domdec_t *
 +init_domain_decomposition(FILE *fplog,
 +                          t_commrec *cr,
 +                          unsigned long Flags,
 +                          ivec nc,
 +                          real comm_distance_min,real rconstr,
 +                          const char *dlb_opt,real dlb_scale,
 +                          const char *sizex,const char *sizey,const char *sizez,
 +                          gmx_mtop_t *mtop,t_inputrec *ir,
 +                          matrix box,rvec *x,
 +                          gmx_ddbox_t *ddbox,
 +                          int *npme_x, int *npme_y);
 +
 +void dd_init_bondeds(FILE *fplog,
 +                            gmx_domdec_t *dd,gmx_mtop_t *mtop,
 +                            gmx_vsite_t *vsite,gmx_constr_t constr,
 +                            t_inputrec *ir,gmx_bool bBCheck,cginfo_mb_t *cginfo_mb);
 +/* Initialize data structures for bonded interactions */
 +
 +gmx_bool dd_bonded_molpbc(gmx_domdec_t *dd,int ePBC);
 +/* Returns whether we need to do pbc for calculating bonded interactions */
 +
 +void set_dd_parameters(FILE *fplog,gmx_domdec_t *dd,real dlb_scale,
 +                              t_inputrec *ir,t_forcerec *fr,
 +                              gmx_ddbox_t *ddbox);
 +/* Set DD grid dimensions and limits,
 + * should be called after calling dd_init_bondeds.
 + */
 +
 +gmx_bool change_dd_cutoff(t_commrec *cr,t_state *state,t_inputrec *ir,
 +                          real cutoff_req );
 +/* Change the DD non-bonded communication cut-off.
 + * This could fail when trying to increase the cut-off;
 + * in that case FALSE is returned and the cut-off is not modified.
 + */
 +
++void change_dd_dlb_cutoff_limit(t_commrec *cr);
++/* Domain boundary changes due to the DD dynamic load balancing can limit
++ * the cut-off distance that can be set in change_dd_cutoff. This function
++ * limits the DLB such that using the currently set cut-off should still be
++ * possible after subsequently setting a shorter cut-off with change_dd_cutoff.
++ */
++
 +void setup_dd_grid(FILE *fplog,gmx_domdec_t *dd);
 +
 +void dd_collect_vec(gmx_domdec_t *dd,
 +                           t_state *state_local,rvec *lv,rvec *v);
 +
 +void dd_collect_state(gmx_domdec_t *dd,
 +                             t_state *state_local,t_state *state);
 +
 +enum { ddCyclStep, ddCyclPPduringPME, ddCyclF, ddCyclPME, ddCyclNr };
 +
 +void dd_cycles_add(gmx_domdec_t *dd,float cycles,int ddCycl);
 +/* Add the wallcycle count to the DD counter */
 +
 +void dd_force_flop_start(gmx_domdec_t *dd,t_nrnb *nrnb);
 +/* Start the force flop count */
 +
 +void dd_force_flop_stop(gmx_domdec_t *dd,t_nrnb *nrnb);
 +/* Stop the force flop count */
 +
 +float dd_pme_f_ratio(gmx_domdec_t *dd);
 +/* Return the PME/PP force load ratio, or -1 if nothing was measured.
 + * Should only be called on the DD master node.
 + */
 +
 +void dd_move_x(gmx_domdec_t *dd,matrix box,rvec x[]);
 +/* Communicate the coordinates to the neighboring cells and do pbc. */
 +
 +void dd_move_f(gmx_domdec_t *dd,rvec f[],rvec *fshift);
 +/* Sum the forces over the neighboring cells.
 + * When fshift!=NULL the shift forces are updated to obtain
 + * the correct virial from the single sum including f.
 + */
 +
 +void dd_atom_spread_real(gmx_domdec_t *dd,real v[]);
 +/* Communicate a real for each atom to the neighboring cells. */
 +
 +void dd_atom_sum_real(gmx_domdec_t *dd,real v[]);
 +/* Sum the contributions to a real for each atom over the neighboring cells. */
 +
 +void dd_partition_system(FILE            *fplog,
 +                                gmx_large_int_t      step,
 +                                t_commrec       *cr,
 +                                gmx_bool            bMasterState,
 +                                int             nstglobalcomm,
 +                                t_state         *state_global,
 +                                gmx_mtop_t      *top_global,
 +                                t_inputrec      *ir,
 +                                t_state         *state_local,
 +                                rvec            **f,
 +                                t_mdatoms       *mdatoms,
 +                                gmx_localtop_t  *top_local,
 +                                t_forcerec      *fr,
 +                                gmx_vsite_t     *vsite,
 +                                gmx_shellfc_t   shellfc,
 +                                gmx_constr_t    constr,
 +                                t_nrnb          *nrnb,
 +                                gmx_wallcycle_t wcycle,
 +                                gmx_bool            bVerbose);
 +/* Partition the system over the nodes.
 + * step is only used for printing error messages.
 + * If bMasterState==TRUE then state_global from the master node is used,
 + * else state_local is redistributed between the nodes.
 + * When f!=NULL, *f will be reallocated to the size of state_local.
 + */
 +
 +void reset_dd_statistics_counters(gmx_domdec_t *dd);
 +/* Reset all the statistics and counters for total run counting */
 +
 +void print_dd_statistics(t_commrec *cr,t_inputrec *ir,FILE *fplog);
 +
 +/* In domdec_con.c */
 +
 +void dd_move_f_vsites(gmx_domdec_t *dd,rvec *f,rvec *fshift);
 +
 +void dd_clear_f_vsites(gmx_domdec_t *dd,rvec *f);
 +
 +void dd_move_x_constraints(gmx_domdec_t *dd,matrix box,
 +                                rvec *x0,rvec *x1);
 +/* Move x0 and also x1 if x1!=NULL */
 +
 +void dd_move_x_vsites(gmx_domdec_t *dd,matrix box,rvec *x);
 +
 +int *dd_constraints_nlocalatoms(gmx_domdec_t *dd);
 +
 +void dd_clear_local_constraint_indices(gmx_domdec_t *dd);
 +
 +void dd_clear_local_vsite_indices(gmx_domdec_t *dd);
 +
 +int dd_make_local_vsites(gmx_domdec_t *dd,int at_start,t_ilist *lil);
 +
 +int dd_make_local_constraints(gmx_domdec_t *dd,int at_start,
 +                              const gmx_mtop_t *mtop,
 +                              const int *cginfo,
 +                              gmx_constr_t constr,int nrec,
 +                              t_ilist *il_local);
 +
 +void init_domdec_constraints(gmx_domdec_t *dd,
 +                             gmx_mtop_t *mtop,
 +                             gmx_constr_t constr);
 +
 +void init_domdec_vsites(gmx_domdec_t *dd,int n_intercg_vsite);
 +
 +
 +/* In domdec_top.c */
 +
 +void dd_print_missing_interactions(FILE *fplog,t_commrec *cr,
 +                                          int local_count,  gmx_mtop_t *top_global, t_state *state_local);
 +
 +void dd_make_reverse_top(FILE *fplog,
 +                                gmx_domdec_t *dd,gmx_mtop_t *mtop,
 +                                gmx_vsite_t *vsite,gmx_constr_t constr,
 +                                t_inputrec *ir,gmx_bool bBCheck);
 +
 +void dd_make_local_cgs(gmx_domdec_t *dd,t_block *lcgs);
 +
 +void dd_make_local_top(FILE *fplog,
 +                       gmx_domdec_t *dd,gmx_domdec_zones_t *zones,
 +                       int npbcdim,matrix box,
 +                       rvec cellsize_min,ivec npulse,
 +                       t_forcerec *fr,
 +                       rvec *cgcm_or_x,
 +                       gmx_vsite_t *vsite,
 +                       gmx_mtop_t *top,gmx_localtop_t *ltop);
 +
 +void dd_sort_local_top(gmx_domdec_t *dd,t_mdatoms *mdatoms,
 +                              gmx_localtop_t *ltop);
 +/* Sort ltop->ilist when we are doing free energy. */
 +
 +gmx_localtop_t *dd_init_local_top(gmx_mtop_t *top_global);
 +
 +void dd_init_local_state(gmx_domdec_t *dd,
 +                                t_state *state_global,t_state *local_state);
 +
 +t_blocka *make_charge_group_links(gmx_mtop_t *mtop,gmx_domdec_t *dd,
 +                                         cginfo_mb_t *cginfo_mb);
 +
 +void dd_bonded_cg_distance(FILE *fplog,
 +                                  gmx_domdec_t *dd,gmx_mtop_t *mtop,
 +                                  t_inputrec *ir,rvec *x,matrix box,
 +                                  gmx_bool bBCheck,
 +                                  real *r_2b,real *r_mb);
 +
 +void write_dd_pdb(const char *fn,gmx_large_int_t step,const char *title,
 +                         gmx_mtop_t *mtop,
 +                         t_commrec *cr,
 +                         int natoms,rvec x[],matrix box);
 +/* Dump a pdb file with the current DD home + communicated atoms.
 + * When natoms=-1, dump all known atoms.
 + */
 +
 +
 +/* In domdec_setup.c */
 +
 +real comm_box_frac(ivec dd_nc,real cutoff,gmx_ddbox_t *ddbox);
 +/* Returns the volume fraction of the system that is communicated */
 +
 +real dd_choose_grid(FILE *fplog,
 +                           t_commrec *cr,gmx_domdec_t *dd,t_inputrec *ir,
 +                           gmx_mtop_t *mtop,matrix box,gmx_ddbox_t *ddbox,
 +                           gmx_bool bDynLoadBal,real dlb_scale,
 +                           real cellsize_limit,real cutoff_dd,
 +                           gmx_bool bInterCGBondeds,gmx_bool bInterCGMultiBody);
 +/* Determines the optimal DD cell setup dd->nc and possibly npmenodes
 + * for the system.
 + * On the master node returns the actual cellsize limit used.
 + */
 +
 +
 +/* In domdec_box.c */
 +
 +void set_ddbox(gmx_domdec_t *dd,gmx_bool bMasterState,t_commrec *cr_sum,
 +                      t_inputrec *ir,matrix box,
 +                      gmx_bool bCalcUnboundedSize,t_block *cgs,rvec *x,
 +                      gmx_ddbox_t *ddbox);
 +
 +void set_ddbox_cr(t_commrec *cr,ivec *dd_nc,
 +                         t_inputrec *ir,matrix box,t_block *cgs,rvec *x,
 +                         gmx_ddbox_t *ddbox);
 +
 +#ifdef __cplusplus
 +}
 +#endif
 +
 +#endif        /* _domdec_h */
index bbd6e612cbdc3b63dfc8bcefbbc523d71afc266c,0000000000000000000000000000000000000000..b256573aabd99502b58f1c070f2493a165d8a735
mode 100644,000000..100644
--- /dev/null
@@@ -1,68 -1,0 +1,69 @@@
- void gmx_omp_nthreads_read_env(int *nthreads_omp);
 +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
 + *
 + *
 + *                This source code is part of
 + *
 + *                 G   R   O   M   A   C   S
 + *
 + *          GROningen MAchine for Chemical Simulations
 + *
 + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
 + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 + * Copyright (c) 2001-2012, The GROMACS development team,
 + * check out http://www.gromacs.org for more information.
 + *
 + * This program is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU General Public License
 + * as published by the Free Software Foundation; either version 2
 + * of the License, or (at your option) any later version.
 + *
 + * If you want to redistribute modifications, please consider that
 + * scientific software is very special. Version control is crucial -
 + * bugs must be traceable. We will be happy to consider code for
 + * inclusion in the official distribution, but derived work must not
 + * be called official GROMACS. Details are found in the README & COPYING
 + * files - if they are missing, get the official version at www.gromacs.org.
 + *
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the papers on the package - you can find them in the top README file.
 + *
 + * For more info, check our website at http://www.gromacs.org
 + *
 + * And Hey:
 + * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon
 + */
 +
 +#ifndef GMX_OMP_NTHREADS
 +#define GMX_OMP_NTHREADS
 +
 +#include "types/commrec.h"
 +
 +
 +/*! Enum values corresponding to multithreaded algorithmic modules. */
 +typedef enum module_nth
 +{
 +    /* Default is meant to be used in OMP regions outside the named
 +     * algorithmic modules listed below. */
 +    emntDefault, emntDomdec, emntPairsearch, emntNonbonded,
 +    emntBonded, emntPME,  emntUpdate, emntVSITE, emntLINCS, emntSETTLE,
 +    emntNR
 +} module_nth_t;
 +
 +/*! Initializes the per-module thread count. It is compatible with tMPI;
 +    thread-safety is ensured (for the features available with tMPI).
 +    This function should be called only once during the initialization of mdrun. */
 +void gmx_omp_nthreads_init(FILE *fplog, t_commrec *cr,
 +                           int nthreads_hw_avail,
 +                           int omp_nthreads_req,
 +                           int omp_nthreads_pme_req,
 +                           gmx_bool bCurrNodePMEOnly,
 +                           gmx_bool bFullOmpSupport);
 +
 +/*! Returns the number of threads to be used in the given module mod. */
 +int gmx_omp_nthreads_get(int mod);
 +
 +/*! Read the OMP_NUM_THREADS env. var. and check against the value set on the command line. */
++void gmx_omp_nthreads_read_env(int *nthreads_omp,
++                               gmx_bool bIsSimMaster);
 +
 +#endif /* GMX_OMP_NTHREADS */
index 068e34960f7707ccdefe05822e59001f8315afa2,0000000000000000000000000000000000000000..b2c96c46fe2de2f2068755964464abf90c9d60dc
mode 100644,000000..100644
--- /dev/null
@@@ -1,105 -1,0 +1,102 @@@
- FUNC_QUALIFIER
- gmx_bool is_gmx_openmm_supported_gpu(int dev_id, char *gpu_name) FUNC_TERM_INT
 +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
 + *
 + * 
 + *                This source code is part of
 + * 
 + *                 G   R   O   M   A   C   S
 + * 
 + *          GROningen MAchine for Chemical Simulations
 + * 
 + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
 + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 + * Copyright (c) 2001-2010, The GROMACS development team,
 + * check out http://www.gromacs.org for more information.
 +
 + * This program is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU General Public License
 + * as published by the Free Software Foundation; either version 2
 + * of the License, or (at your option) any later version.
 + * 
 + * If you want to redistribute modifications, please consider that
 + * scientific software is very special. Version control is crucial -
 + * bugs must be traceable. We will be happy to consider code for
 + * inclusion in the official distribution, but derived work must not
 + * be called official GROMACS. Details are found in the README & COPYING
 + * files - if they are missing, get the official version at www.gromacs.org.
 + * 
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the papers on the package - you can find them in the top README file.
 + * 
 + * For more info, check our website at http://www.gromacs.org
 + * 
 + * And Hey:
 + * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon
 + */
 +
 +#ifndef _GPU_UTILS_H_
 +#define _GPU_UTILS_H_
 +
 +#include "types/simple.h"
 +#include "types/hw_info.h"
 +
 +#ifdef GMX_GPU
 +#define FUNC_TERM_INT ;
 +#define FUNC_TERM_VOID ;
 +#define FUNC_QUALIFIER
 +#else
 +#define FUNC_TERM_INT {return -1;}
 +#define FUNC_TERM_VOID {}
 +#define FUNC_QUALIFIER static
 +#endif
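 +/* The FUNC_QUALIFIER/FUNC_TERM_* pair makes each declaration below double
 + * as a static no-op stub in non-GPU builds: without GMX_GPU, e.g.
 + * "FUNC_QUALIFIER int do_quick_memtest(int dev_id) FUNC_TERM_INT" expands
 + * to "static int do_quick_memtest(int dev_id) {return -1;}", so callers
 + * compile and link unchanged without CUDA.
 + */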
 +
 +#ifdef __cplusplus
 +extern "C" {
 +#endif
 +
 +FUNC_QUALIFIER
 +int do_quick_memtest(int dev_id) FUNC_TERM_INT
 +
 +FUNC_QUALIFIER
 +int do_full_memtest(int dev_id) FUNC_TERM_INT
 +
 +FUNC_QUALIFIER
 +int do_timed_memtest(int dev_id, int time_limit) FUNC_TERM_INT
 +
 +FUNC_QUALIFIER
 +int detect_cuda_gpus(gmx_gpu_info_t *gpu_info, char *err_str) FUNC_TERM_INT
 +
 +FUNC_QUALIFIER
 +void pick_compatible_gpus(gmx_gpu_info_t *gpu_info) FUNC_TERM_VOID
 +
 +FUNC_QUALIFIER
 +gmx_bool check_select_cuda_gpus(int *checkres, gmx_gpu_info_t *gpu_info,
 +                                const int *requested_devs, int count) FUNC_TERM_INT
 +
 +FUNC_QUALIFIER
 +void free_gpu_info(const gmx_gpu_info_t *gpu_info) FUNC_TERM_VOID
 +
 +FUNC_QUALIFIER
 +gmx_bool init_gpu(int mygpu, char *result_str, const gmx_gpu_info_t *gpu_info) FUNC_TERM_INT
 +
 +FUNC_QUALIFIER
 +gmx_bool free_gpu(char *result_str) FUNC_TERM_INT
 +
 +/*! \brief Returns the device ID of the GPU currently in use.*/
 +FUNC_QUALIFIER
 +int get_current_gpu_device_id(void) FUNC_TERM_INT
 +
 +FUNC_QUALIFIER
 +int get_gpu_device_id(const gmx_gpu_info_t *gpu_info, int index) FUNC_TERM_INT
 +
 +FUNC_QUALIFIER
 +void get_gpu_device_info_string(char *s, const gmx_gpu_info_t *gpu_info, int index) FUNC_TERM_VOID
 +
 +#ifdef __cplusplus
 +}
 +#endif
 +
 +#undef FUNC_TERM_INT
 +#undef FUNC_TERM_VOID
 +#undef FUNC_QUALIFIER
 +
 +#endif /* _GPU_UTILS_H_ */
index 8b9ed28563c02c7a8febea5cb18d41738c221ef0,0000000000000000000000000000000000000000..5b73b885d3d91533811fb7aea155a613005b08b1
mode 100644,000000..100644
--- /dev/null
@@@ -1,97 -1,0 +1,101 @@@
-                           int val,const char *name);
 +/*
 + * 
 + *                This source code is part of
 + * 
 + *                 G   R   O   M   A   C   S
 + * 
 + *          GROningen MAchine for Chemical Simulations
 + * 
 + *                        VERSION 3.2.0
 + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
 + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 + * Copyright (c) 2001-2004, The GROMACS development team,
 + * check out http://www.gromacs.org for more information.
 +
 + * This program is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU General Public License
 + * as published by the Free Software Foundation; either version 2
 + * of the License, or (at your option) any later version.
 + * 
 + * If you want to redistribute modifications, please consider that
 + * scientific software is very special. Version control is crucial -
 + * bugs must be traceable. We will be happy to consider code for
 + * inclusion in the official distribution, but derived work must not
 + * be called official GROMACS. Details are found in the README & COPYING
 + * files - if they are missing, get the official version at www.gromacs.org.
 + * 
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the papers on the package - you can find them in the top README file.
 + * 
 + * For more info, check our website at http://www.gromacs.org
 + * 
 + * And Hey:
 + * Gromacs Runs On Most of All Computer Systems
 + */
 +
 +#ifndef _main_h
 +#define _main_h
 +
 +
 +#include <stdio.h>
 +#include "network.h"
 +
 +#ifdef __cplusplus
 +extern "C" {
 +#endif
 +
 +char *gmx_gethostname(char *name, size_t len);
 +/* Sets the hostname to the value given by gethostname, if available,
 + * and to "unknown" otherwise. name should have at least size len.
 + * Returns name.
 + */
 +
 +void gmx_log_open(const char *fn,const t_commrec *cr,
 +                          gmx_bool bMasterOnly, gmx_bool bAppendFiles, FILE**);
 +/* Open the log file; if necessary (nprocs > 1) the logfile name is
 + * communicated around the ring.
 + */
 +
 +void gmx_log_close(FILE *fp);
 +/* Close the log file */
 +
 +void check_multi_int(FILE *log,const gmx_multisim_t *ms,
-                            gmx_large_int_t val,const char *name);
++                     int val,const char *name,
++                     gmx_bool bQuiet);
 +void check_multi_large_int(FILE *log,const gmx_multisim_t *ms,
-  * if the val's don't match.
++                           gmx_large_int_t val,const char *name,
++                           gmx_bool bQuiet);
 +/* Check if val is the same on all processors for a mdrun -multi run
 + * The string name is used to print to the log file and in a fatal error
++ * if the values don't match. If bQuiet is true and the check passes,
++ * no output is written.
 + */
++
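++/* Call sketch (hypothetical caller): verify that all ranks of an
++ * "mdrun -multi" run agree on the step count, staying silent on success:
++ *   check_multi_large_int(fplog, ms, ir->nsteps, "nsteps", TRUE);
++ */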
 +void init_multisystem(t_commrec *cr, int nsim, char **multidirs,
 +                      int nfile, const t_filenm fnm[], gmx_bool bParFn);
 +/* Splits the communication into nsim separate simulations
 + * and creates a communication structure between the masters of
 + * these simulations.
 + * If bParFn is set, the nodeid is appended to the tpx and each output file.
 + */
 +
 +t_commrec *init_par(int *argc,char ***argv_ptr);
 +/* Initiate the parallel computer. Return the communication record
 + * (see network.h). The command line arguments are communicated so that they can be
 + * parsed on each processor.
 + * Arguments are the number of command line arguments, and a pointer to the
 + * array of argument strings. Both are allowed to be NULL.
 + */
 +
 +t_commrec *init_par_threads(const t_commrec *cro);
 +/* Initialize communication records for thread-parallel simulations. 
 +   Must be called on all threads before any communication takes place by 
 +   the individual threads. Copies the original commrec to 
 +   thread-local versions (a small memory leak results because we don't 
 +   deallocate the old shared version).  */
 +
 +#ifdef __cplusplus
 +}
 +#endif
 +
 +#endif        /* _main_h */
index 3471fd8f644107b0045b639746fc8c34cb863cfb,0000000000000000000000000000000000000000..8f9661d98888b963c7795076af6f7d241da3d37c
mode 100644,000000..100644
--- /dev/null
@@@ -1,157 -1,0 +1,173 @@@
 +/*
 + * 
 + *                This source code is part of
 + * 
 + *                 G   R   O   M   A   C   S
 + * 
 + *          GROningen MAchine for Chemical Simulations
 + * 
 + *                        VERSION 3.2.0
 + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
 + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 + * Copyright (c) 2001-2004, The GROMACS development team,
 + * check out http://www.gromacs.org for more information.
 +
 + * This program is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU General Public License
 + * as published by the Free Software Foundation; either version 2
 + * of the License, or (at your option) any later version.
 + * 
 + * If you want to redistribute modifications, please consider that
 + * scientific software is very special. Version control is crucial -
 + * bugs must be traceable. We will be happy to consider code for
 + * inclusion in the official distribution, but derived work must not
 + * be called official GROMACS. Details are found in the README & COPYING
 + * files - if they are missing, get the official version at www.gromacs.org.
 + * 
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the papers on the package - you can find them in the top README file.
 + * 
 + * For more info, check our website at http://www.gromacs.org
 + * 
 + * And Hey:
 + * Gromacs Runs On Most of All Computer Systems
 + */
 +
 +#ifndef _mdebin_h
 +#define _mdebin_h
 +
 +#include "typedefs.h"
 +#include "sysstuff.h"
 +#include "ebin.h"
 +#include "enxio.h"
 +#include "types/state.h"
 +
 +#ifdef __cplusplus
 +extern "C" {
 +#endif
 +
 +/* The functions & data structures here determine the content for outputting  
 +   the .edr file; the file format and actual writing is done with functions
 +   defined in enxio.h */
 +
 +/* forward declaration */
 +typedef struct t_mde_delta_h_coll t_mde_delta_h_coll;
 +
++
 +/* This is the collection of energy averages accumulated during mdrun,
 +   to be written out to the .edr file. */
 +typedef struct {
 +  double delta_t;
 +  t_ebin *ebin;
 +  int    ie,iconrmsd,ib,ivol,idens,ipv,ienthalpy;
 +  int    isvir,ifvir,ipres,ivir,isurft,ipc,itemp,itc,itcb,iu,imu;
 +  int    ivcos,ivisc;
 +  int    nE,nEg,nEc,nTC,nTCP,nU,nNHC;
 +  int    *igrp;
 +  char   **grpnms;
 +  int    mde_n,mdeb_n;
 +  real   *tmp_r;
 +  rvec   *tmp_v;
 +  gmx_bool   bConstr;
 +  gmx_bool   bConstrVir;
 +  gmx_bool   bTricl;
 +  gmx_bool   bDynBox;
 +  gmx_bool   bNHC_trotter;
 +  gmx_bool   bPrintNHChains;
 +  gmx_bool   bMTTK;
 +  gmx_bool   bMu; /* true if dipole is calculated */
 +  gmx_bool   bDiagPres;
 +  gmx_bool   bVir;
 +  gmx_bool   bPress;
 +  gmx_bool   bSurft;
 +  int    f_nre;
 +  int    epc;
 +  real   ref_p;
 +  int  etc;
 +  int    nCrmsd;
 +  gmx_bool   bEner[F_NRE];
 +  gmx_bool   bEInd[egNR];
 +  char   **print_grpnms;
 +
 +  FILE   *fp_dhdl; /* the dhdl.xvg output file */
++  double *dE; /* energy components for dhdl.xvg output */
 +  t_mde_delta_h_coll *dhc; /* the delta U components (raw data + histogram) */
 +  real *temperatures;
 +} t_mdebin;
 +
++
++/* delta_h block type enum: the kinds of energies written out. */
++enum
++{
++    dhbtDH=0,    /* delta H BAR energy difference*/
++    dhbtDHDL=1,  /* dH/dlambda derivative */
++    dhbtEN,      /* System energy */
++    dhbtPV,      /* pV term */
++    dhbtEXPANDED, /* expanded ensemble statistics */
++    dhbtNR
++};
++
++
++
 +t_mdebin *init_mdebin(ener_file_t fp_ene,
 +                             const gmx_mtop_t *mtop,
 +                             const t_inputrec *ir,
 +                             FILE *fp_dhdl);
 +/* Initiate MD energy bin and write header to energy file. */
 +
 +FILE *open_dhdl(const char *filename,const t_inputrec *ir,
 +                     const output_env_t oenv);
 +/* Open the dhdl file for output */
 +
 +/* update the averaging structures. Called every time 
 +   the energies are evaluated. */
 +void upd_mdebin(t_mdebin *md, 
 +        gmx_bool bDoDHDL,
 +        gmx_bool bSum,
 +        double time,
 +        real tmass,
 +        gmx_enerdata_t *enerd,
 +        t_state *state,
 +        t_lambda *fep,
 +        t_expanded *expand,
 +        matrix  lastbox,
 +        tensor svir,
 +        tensor fvir,
 +        tensor vir,
 +        tensor pres,
 +        gmx_ekindata_t *ekind,
 +        rvec mu_tot,
 +        gmx_constr_t constr);
 +
 +void upd_mdebin_step(t_mdebin *md);
 +/* Updates only the step count in md */
 +  
 +void print_ebin_header(FILE *log,gmx_large_int_t steps,double time,real lamb);
 +
 +void print_ebin(ener_file_t fp_ene,gmx_bool bEne,gmx_bool bDR,gmx_bool bOR,
 +                     FILE *log,
 +                     gmx_large_int_t step,double time,
 +                     int mode,gmx_bool bCompact,
 +                     t_mdebin *md,t_fcdata *fcd,
 +                     gmx_groups_t *groups,t_grpopts *opts);
 +
 +
 +
 +/* Between .edr writes, the averages are history dependent,
 +   and that history needs to be retained in checkpoints. 
 +   These functions set/read the energyhistory_t structure
 +   that is written to checkpoints in checkpoint.c */
 +
 +/* Set the energyhistory_t data structure from a mdebin structure */
 +void update_energyhistory(energyhistory_t * enerhist,t_mdebin * mdebin);
 +
 +/* Read the energyhistory_t data structure into a mdebin structure */
 +void restore_energyhistory_from_state(t_mdebin * mdebin,
 +                                             energyhistory_t * enerhist);
 +
 +#ifdef __cplusplus
 +}
 +#endif
 +
 +#endif        /* _mdebin_h */
 +
index 5914c054d2bdc0d40f2a56d7dca0018ec93db522,0000000000000000000000000000000000000000..3c802d57f311bb1bf7cad063f2f64f45784a11ee
mode 100644,000000..100644
--- /dev/null
@@@ -1,209 -1,0 +1,206 @@@
- gmx_integrator_t do_md_openmm;
 +/*
 + * 
 + *                This source code is part of
 + * 
 + *                 G   R   O   M   A   C   S
 + * 
 + *          GROningen MAchine for Chemical Simulations
 + * 
 + *                        VERSION 3.2.0
 + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
 + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 + * Copyright (c) 2001-2004, The GROMACS development team,
 + * check out http://www.gromacs.org for more information.
 +
 + * This program is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU General Public License
 + * as published by the Free Software Foundation; either version 2
 + * of the License, or (at your option) any later version.
 + * 
 + * If you want to redistribute modifications, please consider that
 + * scientific software is very special. Version control is crucial -
 + * bugs must be traceable. We will be happy to consider code for
 + * inclusion in the official distribution, but derived work must not
 + * be called official GROMACS. Details are found in the README & COPYING
 + * files - if they are missing, get the official version at www.gromacs.org.
 + * 
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the papers on the package - you can find them in the top README file.
 + * 
 + * For more info, check our website at http://www.gromacs.org
 + * 
 + * And Hey:
 + * Gromacs Runs On Most of All Computer Systems
 + */
 +
 +#ifndef _mdrun_h
 +#define _mdrun_h
 +
 +#include <stdio.h>
 +#include <time.h>
 +#include "typedefs.h"
 +#include "network.h"
 +#include "sim_util.h"
 +#include "tgroup.h"
 +#include "filenm.h"
 +#include "mshift.h"
 +#include "force.h"
 +#include "edsam.h"
 +#include "mdebin.h"
 +#include "vcm.h"
 +#include "vsite.h"
 +#include "pull.h"
 +#include "update.h"
 +#include "types/membedt.h"
 +#include "types/globsig.h"
 +
 +
 +#ifdef GMX_THREAD_MPI
 +#include "thread_mpi/threads.h"
 +#endif
 +
 +#ifdef __cplusplus
 +extern "C" {
 +#endif
 +
 +#define MD_POLARISE       (1<<2)
 +#define MD_IONIZE         (1<<3)
 +#define MD_RERUN          (1<<4)
 +#define MD_RERUN_VSITE    (1<<5)
 +#define MD_FFSCAN         (1<<6)
 +#define MD_SEPPOT         (1<<7)
 +#define MD_PARTDEC        (1<<9)
 +#define MD_DDBONDCHECK    (1<<10)
 +#define MD_DDBONDCOMM     (1<<11)
 +#define MD_CONFOUT        (1<<12)
 +#define MD_REPRODUCIBLE   (1<<13)
 +#define MD_READ_RNG       (1<<14)
 +#define MD_APPENDFILES    (1<<15)
 +#define MD_APPENDFILESSET (1<<21)
 +#define MD_KEEPANDNUMCPT  (1<<16)
 +#define MD_READ_EKIN      (1<<17)
 +#define MD_STARTFROMCPT   (1<<18)
 +#define MD_RESETCOUNTERSHALFWAY (1<<19)
 +#define MD_TUNEPME        (1<<20)
 +#define MD_TESTVERLET     (1<<22)
 +
 +enum {
 +  ddnoSEL, ddnoINTERLEAVE, ddnoPP_PME, ddnoCARTESIAN, ddnoNR
 +};
 +
 +typedef struct {
 +    int      nthreads_tot;        /* Total number of threads requested (TMPI) */
 +    int      nthreads_tmpi;       /* Number of TMPI threads requested         */
 +    int      nthreads_omp;        /* Number of OpenMP threads requested       */
 +    int      nthreads_omp_pme;    /* As nthreads_omp, but for PME only nodes  */
 +    gmx_bool bThreadPinning;      /* Pin OpenMP threads to cores?             */
 +    gmx_bool bPinHyperthreading;  /* Pin pairs of threads to physical cores   */
 +    int      core_pinning_offset; /* Physical core pinning offset             */
 +    char    *gpu_id;              /* GPU id's to use, each specified as chars */
 +} gmx_hw_opt_t;
 +
 +/* Variables for temporary use with the deform option,
 + * used in runner.c and md.c.
 + * (These variables should be stored in the tpx file.)
 + */
 +extern gmx_large_int_t     deform_init_init_step_tpx;
 +extern matrix              deform_init_box_tpx;
 +#ifdef GMX_THREAD_MPI
 +extern tMPI_Thread_mutex_t deform_init_box_mutex;
 +
 +/* The minimum number of atoms per tMPI thread. With fewer atoms than this,
 + * the number of threads will be reduced.
 + */
 +#define MIN_ATOMS_PER_MPI_THREAD    90
 +#define MIN_ATOMS_PER_GPU           900
 +#endif
 +
 +
 +typedef double gmx_integrator_t(FILE *log,t_commrec *cr,
 +                              int nfile,const t_filenm fnm[],
 +                              const output_env_t oenv, gmx_bool bVerbose,
 +                                gmx_bool bCompact, int nstglobalcomm,
 +                              gmx_vsite_t *vsite,gmx_constr_t constr,
 +                              int stepout,
 +                              t_inputrec *inputrec,
 +                              gmx_mtop_t *mtop,t_fcdata *fcd,
 +                              t_state *state,
 +                              t_mdatoms *mdatoms,
 +                              t_nrnb *nrnb,gmx_wallcycle_t wcycle,
 +                              gmx_edsam_t ed, 
 +                              t_forcerec *fr,
 +                              int repl_ex_nst, int repl_ex_nex, int repl_ex_seed,
 +                                gmx_membed_t membed,
 +                              real cpt_period,real max_hours,
 +                              const char *deviceOptions,
 +                              unsigned long Flags,
 +                              gmx_runtime_t *runtime);
 +
 +/* ROUTINES from md.c */
 +
 +gmx_integrator_t do_md;
 +
 +
 +/* ROUTINES from minimize.c */
 +
 +gmx_integrator_t do_steep;
 +/* Do steepest descents EM */
 +
 +gmx_integrator_t do_cg;
 +/* Do conjugate gradient EM */
 +
 +gmx_integrator_t do_lbfgs;
 +/* Do L-BFGS quasi-Newton energy minimization */
 +
 +gmx_integrator_t do_nm;
 +/* Do normal mode analysis */
 +
 +/* ROUTINES from tpi.c */
 +
 +gmx_integrator_t do_tpi;
 +/* Do test particle insertion */
 +
 +void init_npt_masses(t_inputrec *ir, t_state *state, t_extmass *MassQ, gmx_bool bInit);
 +
 +int ExpandedEnsembleDynamics(FILE *log,t_inputrec *ir, gmx_enerdata_t *enerd,
 +                             t_state *state, t_extmass *MassQ, df_history_t *dfhist,
 +                             gmx_large_int_t step, gmx_rng_t mcrng,
 +                             rvec *v, t_mdatoms *mdatoms);
 +
 +void PrintFreeEnergyInfoToFile(FILE *outfile, t_lambda *fep, t_expanded *expand, t_simtemp *simtemp, df_history_t *dfhist,
 +                               int nlam, int frequency, gmx_large_int_t step);
 +
 +void get_mc_state(gmx_rng_t rng,t_state *state);
 +
 +void set_mc_state(gmx_rng_t rng,t_state *state);
 +
 +/* Check inputrec settings read from old .tpx file versions */
 +void check_ir_old_tpx_versions(t_commrec *cr,FILE *fplog,
 +                               t_inputrec *ir,gmx_mtop_t *mtop);
 +
 +/* Allocate and initialize node-local state entries. */
 +void set_state_entries(t_state *state,const t_inputrec *ir,int nnodes);
 +
 +/* Broadcast the data for a simulation, and allocate node-specific settings
 +   such as rng generators. */
 +void init_parallel(FILE *log, t_commrec *cr, t_inputrec *inputrec,
 +                          gmx_mtop_t *mtop);
 +
 +int mdrunner(gmx_hw_opt_t *hw_opt,
 +           FILE *fplog,t_commrec *cr,int nfile,
 +             const t_filenm fnm[], const output_env_t oenv, gmx_bool bVerbose,
 +             gmx_bool bCompact, int nstglobalcomm, ivec ddxyz,int dd_node_order,
 +             real rdd, real rconstr, const char *dddlb_opt,real dlb_scale,
 +           const char *ddcsx,const char *ddcsy,const char *ddcsz,
 +           const char *nbpu_opt,
 +           int nsteps_cmdline, int nstepout, int resetstep,
 +           int nmultisim, int repl_ex_nst, int repl_ex_nex,
 +             int repl_ex_seed, real pforce,real cpt_period,real max_hours,
 +           const char *deviceOptions, unsigned long Flags);
 +/* Driver routine, that calls the different methods */
 +
 +#ifdef __cplusplus
 +}
 +#endif
 +
 +#endif        /* _mdrun_h */
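The MD_* values above are independent bit flags that get OR-ed into the unsigned long Flags argument of mdrunner() and the integrators; a small illustrative sketch (bRerun/bAppend are hypothetical local booleans, not from this diff):

    unsigned long Flags = 0;
    Flags |= MD_CONFOUT;                      /* write the final configuration */
    if (bRerun)  { Flags |= MD_RERUN; }
    if (bAppend) { Flags |= MD_APPENDFILES; }

    /* inside an integrator, each option is tested with a bit mask: */
    if (Flags & MD_RERUN)
    {
        /* ... reprocess trajectory frames instead of integrating ... */
    }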
index 7b2e56aa2641f22fa2273c874b06d8a25f8c83b4,0000000000000000000000000000000000000000..8ed17680aefc986dccb74a290d0b2c318d5873fd
mode 100644,000000..100644
--- /dev/null
@@@ -1,156 -1,0 +1,157 @@@
 +/*
 + * 
 + *                This source code is part of
 + * 
 + *                 G   R   O   M   A   C   S
 + * 
 + *          GROningen MAchine for Chemical Simulations
 + * 
 + *                        VERSION 3.2.0
 + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
 + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 + * Copyright (c) 2001-2004, The GROMACS development team,
 + * check out http://www.gromacs.org for more information.
 +
 + * This program is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU General Public License
 + * as published by the Free Software Foundation; either version 2
 + * of the License, or (at your option) any later version.
 + * 
 + * If you want to redistribute modifications, please consider that
 + * scientific software is very special. Version control is crucial -
 + * bugs must be traceable. We will be happy to consider code for
 + * inclusion in the official distribution, but derived work must not
 + * be called official GROMACS. Details are found in the README & COPYING
 + * files - if they are missing, get the official version at www.gromacs.org.
 + * 
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the papers on the package - you can find them in the top README file.
 + * 
 + * For more info, check our website at http://www.gromacs.org
 + * 
 + * And Hey:
 + * Gromacs Runs On Most of All Computer Systems
 + */
 +
 +#ifndef _names_h
 +#define _names_h
 +
 +
 +#include "typedefs.h"
 +
 +#ifdef __cplusplus
 +extern "C" {
 +#endif
 +
 +/* All string arrays are NULL-terminated, and therefore have an
 + * extra element (the +1);
 + * these should correspond to names.c and include/types/enums.h
 + */
 +extern const char *epbc_names[epbcNR+1];
 +extern const char *etcoupl_names[etcNR+1];
 +extern const char *epcoupl_names[epcNR+1];
 +extern const char *epcoupltype_names[epctNR+1];
 +extern const char *erefscaling_names[erscNR+1];
 +extern const char *ecutscheme_names[ecutsNR+1];
 +extern const char *ens_names[ensNR+1];
 +extern const char *ei_names[eiNR+1];
 +extern const char *yesno_names[BOOL_NR+1];
 +extern const char *bool_names[BOOL_NR+1];
 +extern const char *eintmod_names[eintmodNR+1];
 +extern const char *eel_names[eelNR+1];
 +extern const char *eewg_names[eewgNR+1];
 +extern const char *evdw_names[evdwNR+1];
 +extern const char *econstr_names[econtNR+1];
 +extern const char *ptype_str[eptNR+1];
 +extern const char *egrp_nm[egNR+1];
 +extern const char *edisre_names[edrNR+1];
 +extern const char *edisreweighting_names[edrwNR+1];
 +extern const char *enbf_names[eNBF_NR+1];
 +extern const char *ecomb_names[eCOMB_NR+1];
 +extern const char *gtypes[egcNR+1];
 +extern const char *esimtemp_names[esimtempNR+1];
 +extern const char *efep_names[efepNR+1];
 +extern const char *efpt_names[efptNR+1];
++extern const char *efpt_singular_names[efptNR+1];
 +extern const char *elamstats_names[elamstatsNR+1];
 +extern const char *elmcmove_names[elmcmoveNR+1];
 +extern const char *elmceq_names[elmceqNR+1];
 +extern const char *separate_dhdl_file_names[esepdhdlfileNR+1];
 +extern const char *dhdl_derivatives_names[edhdlderivativesNR+1];
 +extern const char *esol_names[esolNR+1];
 +extern const char *edispc_names[edispcNR+1];
 +extern const char *ecm_names[ecmNR+1];
 +extern const char *eann_names[eannNR+1];
 +extern const char *egb_names[egbNR+1];
 +extern const char *eis_names[eisNR+1];
 +extern const char *esa_names[esaNR+1];
 +extern const char *ewt_names[ewtNR+1];
 +extern const char *epull_names[epullNR+1];
 +extern const char *epullg_names[epullgNR+1];
 +extern const char *erotg_names[erotgNR+1];
 +extern const char *erotg_originnames[erotgNR+1];
 +extern const char *erotg_fitnames[erotgFitNR+1];
 +extern const char *eQMmethod_names[eQMmethodNR+1];
 +extern const char *eQMbasis_names[eQMbasisNR+1];
 +extern const char *eQMMMscheme_names[eQMMMschemeNR+1];
 +extern const char *eMultentOpt_names[eMultentOptNR+1];
 +extern const char *eAdresstype_names[eAdressNR+1];
 +extern const char *eAdressICtype_names[eAdressICNR+1];
 +extern const char *eAdressSITEtype_names[eAdressSITENR+1];
 +extern const char *gmx_nblist_geometry_names[GMX_NBLIST_GEOMETRY_NR+1];
 +extern const char *gmx_nbkernel_elec_names[GMX_NBKERNEL_ELEC_NR+1];
 +extern const char *gmx_nbkernel_vdw_names[GMX_NBKERNEL_VDW_NR+1];
 +
 +#define       UNDEFINED               "UNDEFINED"
 +#define ENUM_NAME(e,max,names)        ((((e)<0)||((e)>=(max)))?UNDEFINED:(names)[e])
 +
 +#define EBOOL(e)       ENUM_NAME(e,BOOL_NR,bool_names)
 +#define ECUTSCHEME(e)  ENUM_NAME(e,ecutsNR,ecutscheme_names)
 +#define ENS(e)         ENUM_NAME(e,ensNR,ens_names)
 +#define EI(e)          ENUM_NAME(e,eiNR,ei_names)
 +#define EPBC(e)        ENUM_NAME(e,epbcNR,epbc_names)
 +#define ETCOUPLTYPE(e) ENUM_NAME(e,etcNR,etcoupl_names)
 +#define EPCOUPLTYPE(e) ENUM_NAME(e,epcNR,epcoupl_names)
 +#define EPCOUPLTYPETYPE(e) ENUM_NAME(e,epctNR,epcoupltype_names)
 +#define EREFSCALINGTYPE(e) ENUM_NAME(e,erscNR,erefscaling_names)
 +#define EBLOCKS(e)     ENUM_NAME(e,ebNR,eblock_names)
 +#define EPARAM(e)      ENUM_NAME(e,epNR,eparam_names)
 +#define INTMODIFIER(e) ENUM_NAME(e,eintmodNR,eintmod_names)
 +#define EELTYPE(e)     ENUM_NAME(e,eelNR,eel_names)
 +#define EVDWTYPE(e)    ENUM_NAME(e,evdwNR,evdw_names)
 +#define ECONSTRTYPE(e) ENUM_NAME(e,econtNR,econstr_names)
 +#define EDISRETYPE(e)  ENUM_NAME(e,edrNR,edisre_names)
 +#define EDISREWEIGHTING(e)  ENUM_NAME(e,edrwNR,edisreweighting_names)
 +#define ENBFNAME(e)    ENUM_NAME(e,eNBF_NR,enbf_names)
 +#define ECOMBNAME(e)   ENUM_NAME(e,eCOMB_NR,ecomb_names)
 +#define ESIMTEMP(e)    ENUM_NAME(e,esimtempNR,esimtemp_names)
 +#define EFEPTYPE(e)    ENUM_NAME(e,efepNR,efep_names)
 +#define SEPDHDLFILETYPE(e) ENUM_NAME(e,esepdhdlfileNR,separate_dhdl_file_names)
 +#define DHDLDERIVATIVESTYPE(e) ENUM_NAME(e,edhdlderivativesNR,dhdl_derivatives_names)
 +#define ESOLTYPE(e)    ENUM_NAME(e,esolNR,esol_names)
 +#define ENLISTTYPE(e)  ENUM_NAME(e,enlistNR,enlist_names)
 +#define EDISPCORR(e)   ENUM_NAME(e,edispcNR,edispc_names)
 +#define ECOM(e)        ENUM_NAME(e,ecmNR,ecm_names)
 +#define EANNEAL(e)      ENUM_NAME(e,eannNR,eann_names)
 +#define EGBALGORITHM(e) ENUM_NAME(e,egbNR,egb_names)
 +#define ESAALGORITHM(e) ENUM_NAME(e,esaNR,esa_names)
 +#define EIMPLICITSOL(e) ENUM_NAME(e,eisNR,eis_names)
 +#define EWALLTYPE(e)   ENUM_NAME(e,ewtNR,ewt_names)
 +#define EPULLTYPE(e)   ENUM_NAME(e,epullNR,epull_names)
 +#define EPULLGEOM(e)   ENUM_NAME(e,epullgNR,epullg_names)
 +#define EROTGEOM(e)    ENUM_NAME(e,erotgNR,erotg_names)
 +#define EROTORIGIN(e)  ENUM_NAME(e,erotgOriginNR,erotg_originnames)
 +#define EROTFIT(e)     ENUM_NAME(e,erotgFitNR,erotg_fitnames)
 +#define EQMMETHOD(e)   ENUM_NAME(e,eQMmethodNR,eQMmethod_names)
 +#define EQMBASIS(e)    ENUM_NAME(e,eQMbasisNR,eQMbasis_names)
 +#define EQMMMSCHEME(e) ENUM_NAME(e,eQMMMschemeNR,eQMMMscheme_names)
 +#define EMULTENTOPT(e) ENUM_NAME(e,eMultentOptNR,eMultentOpt_names)
 +#define EADRESSTYPE(e) ENUM_NAME(e,eAdressNR,eAdresstype_names)
 +#define EADRESSICTYPE(e) ENUM_NAME(e,eAdressICNR,eAdressICtype_names)
 +#define EADRESSSITETYPE(e) ENUM_NAME(e,eAdressSITENR,eAdressSITEtype_names)
 +
 +#ifdef __cplusplus
 +}
 +#endif
 +
 +#endif        /* _names_h */
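ENUM_NAME range-checks the value before indexing the name table, which is why each wrapper passes its *NR sentinel; a quick illustration (assumed values, not part of this diff):

    /* EI(e) expands to ENUM_NAME(e, eiNR, ei_names): an in-range integrator
     * enum yields its name, anything out of range yields "UNDEFINED". */
    fprintf(stderr, "integrator: %s\n", EI(ir->eI));
    fprintf(stderr, "bogus:      %s\n", EI(-1));    /* prints UNDEFINED */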
index 179b669beea0fb51e17ba7633e31b53ddf47dc92,0000000000000000000000000000000000000000..d450be34e84ab4c852be2c25019acabfd5bc8a8c
mode 100644,000000..100644
--- /dev/null
@@@ -1,83 -1,0 +1,85 @@@
 +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
 + *
 + * 
 + * This file is part of GROMACS.
 + * Copyright (c) 2012-  
 + *
 + * Written by the Gromacs development team under coordination of
 + * David van der Spoel, Berk Hess, and Erik Lindahl.
 + *
 + * This library is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU Lesser General Public License
 + * as published by the Free Software Foundation; either version 2
 + * of the License, or (at your option) any later version.
 + *
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the research papers on the package. Check out http://www.gromacs.org
 + * 
 + * And Hey:
 + * Gromacs Runs On Most of All Computer Systems
 + */
 +
 +#ifndef HWINFO_H
 +#define HWINFO_H
 +
 +#include "simple.h"
 +#include "nbnxn_cuda_types_ext.h"
 +#include "../gmx_cpuid.h"
 +
 +#ifdef __cplusplus
 +extern "C" {
 +#endif
 +#if 0
 +} /* fixes auto-indentation problems */
 +#endif
 +
 +/* Possible results of the GPU detection/check.
 + *
 + * The egpuInsane value means that during the sanity checks an error
 + * occurred that indicates malfunctioning of the device, driver, or
 + * incompatible driver/runtime. */
 +typedef enum
 +{
 +    egpuCompatible = 0,  egpuNonexistent,  egpuIncompatible, egpuInsane
 +} e_gpu_detect_res_t;
 +
 +/* Textual names of the GPU detection/check results (see e_gpu_detect_res_t). */
 +static const char * const gpu_detect_res_str[] =
 +{
 +    "compatible", "inexistent", "incompatible", "insane"
 +};
 +
 +/* GPU device information -- for now with only CUDA devices.
 + * The gmx_hardware_detect module initializes it. */
 +typedef struct 
 +{
 +    gmx_bool            bUserSet;       /* true if the GPUs in cuda_dev_use are manually provided by the user */
++    gmx_bool            bDevShare;      /* true if any of the devices is shared by
++                                           (t)MPI ranks; with auto-detection this is always FALSE */
 +
 +    int                 ncuda_dev_use;  /* number of devices selected to be used */
 +    int                 *cuda_dev_use;  /* index of the devices selected to be used */
 +    int                 ncuda_dev;      /* total number of devices detected */
 +    cuda_dev_info_ptr_t cuda_dev;       /* devices detected in the system (per node) */
 +} gmx_gpu_info_t;
 +
 +/* Hardware information structure with CPU and GPU information.
 + * It is initialized by gmx_detect_hardware(). */
 +typedef struct
 +{
 +    gmx_bool        bCanUseGPU; /* True if compatible GPUs are detected during hardware detection */
 +    gmx_gpu_info_t  gpu_info;   /* Information about GPUs detected in the system */
 +
 +    gmx_cpuid_t     cpuid_info; /* CPUID information about CPU detected;
 +                                   NOTE: this will only detect the CPU that
 +                                   thread 0 of the current process runs on. */
 +    int             nthreads_hw_avail; /* Number of hardware threads available; this number
 +                                         is based on the number of CPUs reported as available
 +                                         by the OS at the time of detection. */
 +} gmx_hw_info_t;
 +
 +#ifdef __cplusplus
 +}
 +#endif
 +
 +#endif /* HWINFO_H */
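Because gpu_detect_res_str[] is indexed directly by e_gpu_detect_res_t, reporting a detection result is a plain table lookup; a hedged sketch (the stat and dev_id values are illustrative, normally filled in by the detection code):

    e_gpu_detect_res_t stat   = egpuIncompatible; /* assumed check result */
    int                dev_id = 0;                /* illustrative device index */
    fprintf(stderr, "GPU #%d: %s\n", dev_id, gpu_detect_res_str[stat]);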
index a3d9c6fd7fd7c327eb62a2810ec4ee3124c6bddb,0000000000000000000000000000000000000000..0a05ddff51ed165967b93ab141e01d2a00440c78
mode 100644,000000..100644
--- /dev/null
@@@ -1,435 -1,0 +1,443 @@@
-   double init_lambda;    /* fractional value of lambda (usually will use init_fep_state, this will only be for slow growth, and for legacy free energy code)   */
 +/*
 + * 
 + *                This source code is part of
 + * 
 + *                 G   R   O   M   A   C   S
 + * 
 + *          GROningen MAchine for Chemical Simulations
 + * 
 + *                        VERSION 3.2.0
 + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
 + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 + * Copyright (c) 2001-2004, The GROMACS development team,
 + * check out http://www.gromacs.org for more information.
 +
 + * This program is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU General Public License
 + * as published by the Free Software Foundation; either version 2
 + * of the License, or (at your option) any later version.
 + * 
 + * If you want to redistribute modifications, please consider that
 + * scientific software is very special. Version control is crucial -
 + * bugs must be traceable. We will be happy to consider code for
 + * inclusion in the official distribution, but derived work must not
 + * be called official GROMACS. Details are found in the README & COPYING
 + * files - if they are missing, get the official version at www.gromacs.org.
 + * 
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the papers on the package - you can find them in the top README file.
 + * 
 + * For more info, check our website at http://www.gromacs.org
 + * 
 + * And Hey:
 + * GRoups of Organic Molecules in ACtion for Science
 + */
 +#ifndef _inputrec_h_
 +#define _inputrec_h_
 +
 +
 +#include "simple.h"
 +#include "../sysstuff.h"
 +
 +#ifdef __cplusplus
 +extern "C" {
 +#endif
 +
 +
 +typedef struct {
 +  int  n;             /* Number of terms                              */
 +  real *a;            /* Coefficients (V / nm)                        */
 +  real *phi;          /* Phase angles                                 */
 +} t_cosines;
 +
 +typedef struct {
 +  real E0;              /* Field strength (V/nm)                        */
 +  real omega;           /* Frequency (1/ps)                             */
 +  real t0;              /* Centre of the Gaussian pulse (ps)            */
 +  real sigma;           /* Width of the Gaussian pulse (FWHM) (ps)      */
 +} t_efield;
 +
 +#define EGP_EXCL  (1<<0)
 +#define EGP_TABLE (1<<1)
 +
 +typedef struct {
 +  int     ngtc;                  /* # T-Coupl groups                        */
 +  int     nhchainlength;         /* # of nose-hoover chains per group       */
 +  int     ngacc;                 /* # Accelerate groups                     */
 +  int     ngfrz;                 /* # Freeze groups                         */
 +  int     ngener;              /* # Ener groups                           */
 +  real    *nrdf;               /* Nr of degrees of freedom in a group     */
 +  real    *ref_t;              /* Coupling temperature        per group   */
 +  int     *annealing;            /* No/simple/periodic SA for each group    */
 +  int     *anneal_npoints;       /* Number of annealing time points per grp */    
 +  real    **anneal_time;         /* For ea. group: Time points              */
 +  real    **anneal_temp;         /* For ea. grp: Temperature at these times */
 +                                 /* Final temp after all intervals is ref_t */ 
 +  real    *tau_t;              /* Tau coupling time                       */
 +  rvec    *acc;                        /* Acceleration per group                  */
 +  ivec    *nFreeze;            /* Freeze the group in each direction ?    */
 +  int     *egp_flags;            /* Exclusions/tables of energy group pairs */
 +
 +  /* QMMM stuff */
 +  int     ngQM;         /* nr of QM groups                              */
 +  int     *QMmethod;    /* Level of theory in the QM calculation        */
 +  int     *QMbasis;     /* Basisset in the QM calculation               */
 +  int     *QMcharge;    /* Total charge in the QM region                */
 +  int     *QMmult;      /* Spin multiplicity in the QM region           */
 +  gmx_bool    *bSH;         /* surface hopping (diabatic hop only)          */
 +  int     *CASorbitals; /* number of orbitals in the active space       */
 +  int     *CASelectrons;/* number of electrons in the active space      */
 +  real    *SAon;        /* at which gap (A.U.) the SA is switched on    */
 +  real    *SAoff;
 +  int     *SAsteps;     /* in how many steps SA goes from 1-1 to 0.5-0.5*/
 +  gmx_bool    *bOPT;
 +  gmx_bool    *bTS;
 +} t_grpopts;
 +
 +enum { epgrppbcNONE, epgrppbcREFAT, epgrppbcCOS };
 +
 +typedef struct {
 +  int        nat;      /* Number of atoms in the pull group */
 +  atom_id    *ind;     /* The global atoms numbers */
 +  int        nat_loc;  /* Number of local pull atoms */
 +  int        nalloc_loc; /* Allocation size for ind_loc and weight_loc */ 
 +  atom_id    *ind_loc; /* Local pull indices */
 +  int        nweight;  /* The number of weights (0 or nat) */
 +  real       *weight;  /* Weights (use all 1 when weight==NULL) */
 +  real       *weight_loc; /* Weights for the local indices */
 +  int        epgrppbc; /* The type of pbc for this pull group, see enum above */
 +  atom_id    pbcatom;  /* The reference atom for pbc (global number) */
 +  rvec       vec;      /* The pull vector, direction or position */
 +  rvec       init;     /* Initial reference displacement */
 +  real       rate;     /* Rate of motion (nm/ps) */
 +  real       k;        /* force constant */
 +  real       kB;       /* force constant for state B */
 +  real       wscale;   /* scaling factor for the weights: sum w m/sum w w m */
 +  real       invtm;    /* inverse total mass of the group: 1/wscale sum w m */
 +  dvec       x;        /* center of mass before update */
 +  dvec       xp;       /* center of mass after update before constraining */
 +  dvec       dr;       /* The distance from the reference group */
 +  double     f_scal;   /* Scalar force for directional pulling */
 +  dvec       f;        /* force due to the pulling/constraining */
 +} t_pullgrp; 
 +
 +typedef struct {
 +  int  eSimTempScale;   /* simulated temperature scaling; linear or exponential */
 +  real simtemp_low;     /* the low temperature for simulated tempering  */
 +  real simtemp_high;    /* the high temperature for simulated tempering */
 +  real *temperatures;   /* the range of temperatures used for simulated tempering */
 +} t_simtemp;
 +
 +typedef struct {
 +  int  nstdhdl;          /* The frequency for calculating dhdl           */
++  double init_lambda;    /* fractional value of lambda (usually will use
++                            init_fep_state, this will only be for slow growth,
++                            and for legacy free energy code. Only has a
++                            valid value if positive)   */
 +  int init_fep_state;    /* the initial number of the state                   */
 +  double delta_lambda;         /* change of lambda per time step (fraction of the (0,1) interval) */
 +  gmx_bool bPrintEnergy; /* Whether to print the energy in the dhdl file */
 +  int  n_lambda;         /* The number of foreign lambda points               */
 +  double **all_lambda;   /* The array of all lambda values                    */
++  int lambda_neighbors;  /* The number of neighboring lambda states to
++                            calculate the energy for in up and down directions
++                            (-1 for all) */
++  int lambda_start_n;    /* The first lambda to calculate energies for */
++  int lambda_stop_n;     /* The last lambda +1 to calculate energies for */
 +  real sc_alpha;         /* free energy soft-core parameter                   */
 +  int  sc_power;         /* lambda power for soft-core interactions           */
 +  real sc_r_power;          /* r power for soft-core interactions                */
 +  real sc_sigma;         /* free energy soft-core sigma when c6 or c12=0      */
 +  real sc_sigma_min;     /* free energy soft-core sigma for ?????             */
 +  gmx_bool bScCoul;      /* use softcore for the coulomb portion as well (default FALSE) */
 +  gmx_bool separate_dvdl[efptNR]; /* whether to print the dvdl term associated with
 +                                     this term; if it is not specified as separate,
 +                                     it is lumped with the FEP term */
 +  int separate_dhdl_file;    /* whether to write a separate dhdl.xvg file
 +                                note: NOT a gmx_bool, but an enum */
 +  int  dhdl_derivatives;     /* whether to calculate+write dhdl derivatives
 +                                note: NOT a gmx_bool, but an enum */
 +  int dh_hist_size;         /* The maximum table size for the dH histogram */
 +  double dh_hist_spacing;   /* The spacing for the dH histogram */
 +} t_lambda;
 +
 +typedef struct {
 +  int nstexpanded;           /* The frequency of expanded ensemble state changes */
 +  int elamstats;             /* which type of weight updating we use for lambda Monte Carlo (or none) */
 +  int elmcmove;              /* which move set we will be using for state space moves */
 +  int elmceq;                /* the method we use to decide if we have equilibrated the weights */
 +  int equil_n_at_lam;        /* the minimum number of samples at each lambda for deciding whether we have reached a minimum */
 +  real equil_wl_delta;       /* WL delta at which we stop equilibrating weights */
 +  real equil_ratio;          /* use the ratio of weights (ratio of minimum to maximum) to decide when to stop equilibrating */
 +  int equil_steps;           /* after equil_steps steps we stop equilibrating the weights */
 +  int equil_samples;         /* after equil_samples total samples (steps/nstfep), we stop equilibrating the weights */
 +  int lmc_seed;               /* random number seed for lambda mc switches */
 +  gmx_bool minvar;               /* whether to use minimum variance weighting */
 +  int minvarmin;             /* the number of samples needed before kicking into minvar routine */
 +  real minvar_const;         /* the offset for the variance in MinVar */
 +  int c_range;               /* range of c values used for BAR */
 +  gmx_bool bSymmetrizedTMatrix;  /* whether to print symmetrized matrices */
 +  int  nstTij;                /* How frequently to print the transition matrices */
 +  int  lmc_repeats;          /* number of repetitions in the MC lambda jumps */  /*MRS -- VERIFY THIS */
 +  int  lmc_forced_nstart;    /* minimum number of samples for each state before free sampling */ /* MRS -- VERIFY THIS! */
 +  int  gibbsdeltalam;        /* distance in lambda space for the gibbs interval */
 +  real  wl_scale;            /* scaling factor for wang-landau */
 +  real  wl_ratio;            /* ratio between largest and smallest number for freezing the weights */
 +  real  init_wl_delta;       /* starting delta for wang-landau */
 +  gmx_bool bWLoneovert;      /* use one-over-t convergence for Wang-Landau when the delta gets sufficiently small */
 +  gmx_bool bInit_weights;    /* did we initialize the weights? */
 +  real mc_temp;              /* To override the main temperature, or define it if it's not defined */
 +  real *init_lambda_weights; /* user-specified initial weights to start with  */
 +} t_expanded;
 +
 +typedef struct {
 +  int        ngrp;        /* number of groups */
 +  int        eGeom;       /* pull geometry */
 +  ivec       dim;         /* used to select components for constraint */
 +  real       cyl_r1;      /* radius of cylinder for dynamic COM */
 +  real       cyl_r0;      /* radius of cylinder including switch length */
 +  real       constr_tol;  /* absolute tolerance for constraints (nm) */
 +  int        nstxout;     /* Output frequency for pull x */
 +  int        nstfout;     /* Output frequency for pull f */
 +  int        ePBC;        /* the boundary conditions */
 +  int        npbcdim;     /* do pbc in dims 0 <= dim < npbcdim */
 +  gmx_bool       bRefAt;      /* do we need reference atoms for a group COM ? */
 +  int        cosdim;      /* dimension for cosine weighting, -1 if none */
 +  gmx_bool       bVirial;     /* do we need to add the pull virial? */
 +  t_pullgrp  *grp;        /* groups to pull/restrain/etc. */
 +  t_pullgrp  *dyna;       /* dynamic groups for use with local constraints */
 +  rvec       *rbuf;       /* COM calculation buffer */
 +  dvec       *dbuf;       /* COM calculation buffer */
 +  double     *dbuf_cyl;   /* cylinder ref. groups COM calculation buffer */
 +
 +  FILE       *out_x;      /* output file for pull data */
 +  FILE       *out_f;      /* output file for pull data */
 +} t_pull;
 +
 +
 +/* Abstract types for enforced rotation only defined in pull_rotation.c       */
 +typedef struct gmx_enfrot *gmx_enfrot_t;
 +typedef struct gmx_enfrotgrp *gmx_enfrotgrp_t;
 +
 +typedef struct {
 +  int        eType;          /* Rotation type for this group                  */
 +  int        bMassW;         /* Use mass-weighed positions?                   */
 +  int        nat;            /* Number of atoms in the group                  */
 +  atom_id    *ind;           /* The global atoms numbers                      */
 +  rvec       *x_ref;         /* The reference positions                       */
 +  rvec       vec;            /* The normalized rotation vector                */
 +  real       rate;           /* Rate of rotation (degree/ps)                  */
 +  real       k;              /* Force constant (kJ/(mol nm^2))                */
 +  rvec       pivot;          /* Pivot point of rotation axis (nm)             */
 +  int        eFittype;       /* Type of fit to determine actual group angle   */
 +  int        PotAngle_nstep; /* Number of angles around the reference angle
 +                                for which the rotation potential is also
 +                                evaluated (for fit type 'potential' only)     */
 +  real       PotAngle_step;  /* Distance between two angles in degrees (for
 +                                fit type 'potential' only)                    */
 +  real       slab_dist;      /* Slab distance (nm)                            */
 +  real       min_gaussian;   /* Minimum value the gaussian must have so that 
 +                                the force is actually evaluated               */
 +  real       eps;            /* Additive constant for radial motion2 and
 +                                flexible2 potentials (nm^2)                   */
 +  gmx_enfrotgrp_t enfrotgrp; /* Stores non-inputrec rotation data per group   */
 +} t_rotgrp;
 +
 +typedef struct {
 +  int        ngrp;           /* Number of rotation groups                     */
 +  int        nstrout;        /* Output frequency for main rotation outfile    */
 +  int        nstsout;        /* Output frequency for per-slab data            */
 +  t_rotgrp   *grp;           /* Groups to rotate                              */
 +  gmx_enfrot_t enfrot;       /* Stores non-inputrec enforced rotation data    */
 +} t_rot;
 +
 +
 +typedef struct {
 +  int  type;     /* type of AdResS simulation                    */
 +  gmx_bool bnew_wf;  /* enable new AdResS weighting function         */
 +  gmx_bool bchempot_dx;  /* true: interaction table input format is F=-dmu/dx; false: dmu_dwp */
 +  gmx_bool btf_full_box; /* true: apply thermodynamic force everywhere in the box according to the table; false: only in the hybrid region */
 +  real const_wf; /* value of weighting function for eAdressConst */
 +  real ex_width; /* center of the explicit zone                  */
 +  real hy_width; /* width of the hybrid zone                     */
 +  int  icor;     /* type of interface correction                 */
 +  int  site;     /* AdResS CG site location                      */
 +  rvec refs;     /* Coordinates for AdResS reference             */
 +  real ex_forcecap; /* in the hybrid zone, cap forces larger than this to adress_ex_forcecap */
 +  gmx_bool do_hybridpairs; /* if true, pair interaction forces are also scaled in the AdResS way */
 +
 +  int * tf_table_index; /* contains mapping of energy group index -> i-th adress tf table*/
 +  int n_tf_grps;
 +  int *group_explicit;
 +  int   n_energy_grps;
 +} t_adress;
 +
 +typedef struct {
 +  int  eI;              /* Integration method                                 */
 +  gmx_large_int_t nsteps;     /* number of steps to be taken                  */
 +  int  simulation_part; /* Used in checkpointing to separate chunks */
 +  gmx_large_int_t init_step;  /* start at a stepcount >0 (used w. tpbconv)    */
 +  int  nstcalcenergy; /* frequency of energy calc. and T/P coupl. upd.        */
 +  int  cutoff_scheme;   /* group or verlet cutoffs     */
 +  int  ns_type;               /* which ns method should we use?               */
 +  int  nstlist;               /* number of steps before pairlist is generated */
 +  int  ndelta;                /* number of cells per rlong                    */
 +  int  nstcomm;               /* number of steps after which center of mass   */
 +                        /* motion is removed                          */
 +  int  comm_mode;       /* Center of mass motion removal algorithm      */
 +  int nstcheckpoint;    /* checkpointing frequency                      */
 +  int nstlog;         /* number of steps after which print to logfile */
 +  int nstxout;                /* number of steps after which X is output      */
 +  int nstvout;                /* id. for V                                    */
 +  int nstfout;                /* id. for F                                    */
 +  int nstenergy;      /* number of steps after which energies printed */
 +  int nstxtcout;      /* id. for compressed trj (.xtc)                */
 +  double init_t;      /* initial time (ps)                            */
 +  double delta_t;     /* time step (ps)                               */
 +  real xtcprec;         /* precision of xtc file                        */
 +  real fourier_spacing; /* requested fourier_spacing, when nk? not set  */
 +  int  nkx,nky,nkz;     /* number of k vectors in each spatial dimension*/
 +                        /* for fourier methods for long range electrost.*/
 +  int  pme_order;       /* interpolation order for PME                  */
 +  real ewald_rtol;      /* Real space tolerance for Ewald, determines   */
 +                        /* the real/reciprocal space relative weight    */
 +  int  ewald_geometry;  /* normal/3d ewald, or pseudo-2d LR corrections */
 +  real epsilon_surface; /* Epsilon for PME dipole correction            */
 +  gmx_bool bOptFFT;         /* optimize the fft plan at start               */
 +  int  ePBC;          /* Type of periodic boundary conditions         */
 +  int  bPeriodicMols;   /* Periodic molecules                           */
 +  gmx_bool bContinuation;   /* Continuation run: starting state is correct    */
 +  int  etc;           /* temperature coupling                         */
 +  int  nsttcouple;      /* interval in steps for temperature coupling   */
 +  gmx_bool bPrintNHChains; /* whether to print nose-hoover chains        */
 +  int  epc;           /* pressure coupling                            */
 +  int  epct;          /* pressure coupling type                       */
 +  int  nstpcouple;      /* interval in steps for pressure coupling      */
 +  real tau_p;         /* pressure coupling time (ps)                  */
 +  tensor ref_p;               /* reference pressure (kJ/(mol nm^3))           */
 +  tensor compress;    /* compressibility ((mol nm^3)/kJ)              */
 +  int  refcoord_scaling;/* How to scale absolute reference coordinates  */
 +  rvec posres_com;      /* The COM of the posres atoms                  */
 +  rvec posres_comB;     /* The B-state COM of the posres atoms          */
 +  int  andersen_seed;   /* Random seed for Andersen thermostat (obsolete) */
 +  real verletbuf_drift; /* Max. drift (kJ/mol/ps/atom) for list buffer  */
 +  real rlist;             /* short range pairlist cut-off (nm)                */
 +  real rlistlong;         /* long range pairlist cut-off (nm)         */
 +  int  nstcalclr;       /* Frequency of evaluating direct space long-range interactions */
 +  real rtpi;            /* Radius for test particle insertion           */
 +  int  coulombtype;   /* Type of electrostatics treatment             */
 +  int  coulomb_modifier; /* Modify the Coulomb interaction              */
 +  real rcoulomb_switch; /* Coulomb switch range start (nm)            */
 +  real rcoulomb;        /* Coulomb cutoff (nm)                                */
 +  real epsilon_r;       /* relative dielectric constant                 */ 
 +  real epsilon_rf;      /* relative dielectric constant of the RF       */ 
 +  int  implicit_solvent;/* No (=explicit water), or GBSA solvent models */
 +  int  gb_algorithm;    /* Algorithm to use for calculation Born radii  */
 +  int  nstgbradii;      /* Frequency of updating Generalized Born radii */
 +  real rgbradii;        /* Cutoff for GB radii calculation              */
 +  real gb_saltconc;     /* Salt concentration (M) for GBSA models       */
 +  real gb_epsilon_solvent; /* dielectric coeff. of implicit solvent     */
 +  real gb_obc_alpha;    /* 1st scaling factor for Bashford-Case GB      */
 +  real gb_obc_beta;     /* 2nd scaling factor for Bashford-Case GB      */
 +  real gb_obc_gamma;    /* 3rd scaling factor for Bashford-Case GB      */
 +  real gb_dielectric_offset; /* Dielectric offset for Still/HCT/OBC     */
 +  int  sa_algorithm;    /* Algorithm for SA part of GBSA                */
 +  real sa_surface_tension; /* Energy factor for SA part of GBSA */
 +  int  vdwtype;         /* Type of Van der Waals treatment              */
 +  int  vdw_modifier;    /* Modify the VdW interaction                   */
 +  real rvdw_switch;     /* Van der Waals switch range start (nm)        */
 +  real rvdw;              /* Van der Waals cutoff (nm)                */
 +  int  eDispCorr;       /* Perform Long range dispersion corrections    */
 +  real tabext;          /* Extension of the table beyond the cut-off,   *
 +                       * as well as the table length for 1-4 interac. */
 +  real shake_tol;     /* tolerance for shake                          */
 +  int  efep;                  /* free energy calculations                     */ 
 +  t_lambda *fepvals;    /* Data for the FEP state                       */
 +  gmx_bool bSimTemp;    /* Whether to do simulated tempering            */
 +  t_simtemp *simtempvals;/* Variables for simulated tempering            */
 +  gmx_bool bExpanded;   /* Whether expanded ensembles are used          */
 +  t_expanded *expandedvals; /* Expanded ensemble parameters              */
 +  int  eDisre;          /* Type of distance restraining                 */
 +  real dr_fc;             /* force constant for ta_disre                      */
 +  int  eDisreWeighting; /* type of weighting of pairs in one restraints       */
 +  gmx_bool bDisreMixed;     /* Use comb of time averaged and instan. viol's   */
 +  int  nstdisreout;     /* frequency of writing pair distances to enx   */ 
 +  real dr_tau;                    /* time constant for memory function in disres      */
 +  real orires_fc;         /* force constant for orientational restraints  */
 +  real orires_tau;        /* time constant for memory function in orires      */
 +  int  nstorireout;     /* frequency of writing tr(SD) to enx           */ 
 +  real dihre_fc;        /* force constant for dihedral restraints (obsolete)  */
 +  real em_stepsize;       /* The stepsize for updating                        */
 +  real em_tol;                    /* The tolerance                            */
 +  int  niter;           /* Number of iterations for convergence of      */
 +                        /* steepest descent in relax_shells             */
 +  real fc_stepsize;     /* Stepsize for directional minimization        */
 +                        /* in relax_shells                              */
 +  int  nstcgsteep;      /* number of steps after which a steepest       */
 +                        /* descents step is done while doing cg         */
 +  int  nbfgscorr;       /* Number of corrections to the hessian to keep */
 +  int  eConstrAlg;      /* Type of constraint algorithm                 */
 +  int  nProjOrder;      /* Order of the LINCS Projection Algorithm      */
 +  real LincsWarnAngle;  /* If bond rotates more than %g degrees, warn   */
 +  int  nLincsIter;      /* Number of iterations in the final Lincs step */
 +  gmx_bool bShakeSOR;       /* Use successive overrelaxation for shake      */
 +  real bd_fric;         /* Friction coefficient for BD (amu/ps)         */
 +  int  ld_seed;         /* Random seed for SD and BD                    */
 +  int  nwall;           /* The number of walls                          */
 +  int  wall_type;       /* The type of walls                            */
 +  real wall_r_linpot;   /* The potential is linear for r<=wall_r_linpot */
 +  int  wall_atomtype[2];/* The atom type for walls                      */
 +  real wall_density[2]; /* Number density for walls                     */
 +  real wall_ewald_zfac; /* Scaling factor for the box for Ewald         */
 +  int  ePull;           /* Type of pulling: no, umbrella or constraint  */
 +  t_pull *pull;         /* The data for center of mass pulling          */
 +  gmx_bool bRot;        /* Calculate enforced rotation potential(s)?    */
 +  t_rot *rot;           /* The data for enforced rotation potentials    */
 +  real cos_accel;       /* Acceleration for viscosity calculation       */
 +  tensor deform;        /* Triclinic deformation velocities (nm/ps)     */
 +  int  userint1;        /* User determined parameters                   */
 +  int  userint2;
 +  int  userint3;
 +  int  userint4;
 +  real userreal1;
 +  real userreal2;
 +  real userreal3;
 +  real userreal4;
 +  t_grpopts opts;     /* Group options                                */
 +  t_cosines ex[DIM];  /* Electric field stuff (spatial part)          */
 +  t_cosines et[DIM];  /* Electric field stuff (time part)             */
 +  gmx_bool bQMMM;           /* QM/MM calculation                            */ 
 +  int  QMconstraints;   /* constraints on QM bonds                      */
 +  int  QMMMscheme;      /* Scheme: ONIOM or normal                      */
 +  real scalefactor;     /* factor for scaling the MM charges in QM calc.*/
 +                        /* parameter needed for AdResS simulation       */
 +  gmx_bool bAdress;     /* Is AdResS enabled ? */
 +  t_adress *adress;     /* The data for adress simulations */
 +} t_inputrec;
 +
 +#define DEFORM(ir) ((ir).deform[XX][XX]!=0 || (ir).deform[YY][YY]!=0 || (ir).deform[ZZ][ZZ]!=0 || (ir).deform[YY][XX]!=0 || (ir).deform[ZZ][XX]!=0 || (ir).deform[ZZ][YY]!=0)
 +
 +#define DYNAMIC_BOX(ir) ((ir).epc!=epcNO || (ir).eI==eiTPI || DEFORM(ir))
 +
 +#define PRESERVE_SHAPE(ir) ((ir).epc != epcNO && (ir).deform[XX][XX] == 0 && ((ir).epct == epctISOTROPIC || (ir).epct == epctSEMIISOTROPIC))
 +
 +#define NEED_MUTOT(ir) (((ir).coulombtype==eelEWALD || EEL_PME((ir).coulombtype)) && ((ir).ewald_geometry==eewg3DC || (ir).epsilon_surface!=0))
 +
 +#define IR_TWINRANGE(ir) ((ir).rlist > 0 && ((ir).rlistlong == 0 || (ir).rlistlong > (ir).rlist))
 +
 +#define IR_ELEC_FIELD(ir) ((ir).ex[XX].n > 0 || (ir).ex[YY].n > 0 || (ir).ex[ZZ].n > 0)
 +
 +#define IR_EXCL_FORCES(ir) (EEL_FULL((ir).coulombtype) || (EEL_RF((ir).coulombtype) && (ir).coulombtype != eelRF_NEC) || (ir).implicit_solvent != eisNO)
 +/* use pointer definitions of ir here, since that's what's usually used in the code */
 +#define IR_NPT_TROTTER(ir) ((((ir)->eI == eiVV) || ((ir)->eI == eiVVAK)) && (((ir)->epc == epcMTTK) && ((ir)->etc == etcNOSEHOOVER)))
 +
 +#define IR_NVT_TROTTER(ir) ((((ir)->eI == eiVV) || ((ir)->eI == eiVVAK)) && ((!((ir)->epc == epcMTTK)) && ((ir)->etc == etcNOSEHOOVER)))
 +
 +#define IR_NPH_TROTTER(ir) ((((ir)->eI == eiVV) || ((ir)->eI == eiVVAK)) && (((ir)->epc == epcMTTK) && (!(((ir)->etc == etcNOSEHOOVER)))))
 +
 +#ifdef __cplusplus
 +}
 +#endif
 +
 +
 +#endif
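Note the calling convention split among the macros above: DEFORM, DYNAMIC_BOX, etc. take the inputrec by value, while the *_TROTTER macros take a pointer, so call sites differ; a short illustrative sketch (assuming ir was filled in from the run input):

    t_inputrec ir;  /* assumed to be read from the .tpr file */
    if (DYNAMIC_BOX(ir))
    {
        /* box may change: pressure coupling, TPI or box deformation */
    }
    if (IR_NPT_TROTTER(&ir))
    {
        /* MTTK barostat + Nose-Hoover thermostat: Trotter NPT path */
    }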
index 501c2743ccba94262b05f78361f973e0d73b533a,0000000000000000000000000000000000000000..915b4de9fec54d48fc7f98dcbc41b35a352a83d0
mode 100644,000000..100644
--- /dev/null
@@@ -1,70 -1,0 +1,70 @@@
-     gmx_bool bIterate;
 +/*
 + * 
 + *                This source code is part of
 + * 
 + *                 G   R   O   M   A   C   S
 + * 
 + *          GROningen MAchine for Chemical Simulations
 + * 
 + *                        VERSION 3.2.0
 + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
 + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 + * Copyright (c) 2001-2004, The GROMACS development team,
 + * check out http://www.gromacs.org for more information.
 +
 + * This program is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU General Public License
 + * as published by the Free Software Foundation; either version 2
 + * of the License, or (at your option) any later version.
 + * 
 + * If you want to redistribute modifications, please consider that
 + * scientific software is very special. Version control is crucial -
 + * bugs must be traceable. We will be happy to consider code for
 + * inclusion in the official distribution, but derived work must not
 + * be called official GROMACS. Details are found in the README & COPYING
 + * files - if they are missing, get the official version at www.gromacs.org.
 + * 
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the papers on the package - you can find them in the top README file.
 + * 
 + * For more info, check our website at http://www.gromacs.org
 + * 
 + * And Hey:
 + * GRoups of Organic Molecules in ACtion for Science
 + */
 +#ifndef _iteratedconstraints_h
 +#define _iteratedconstraints_h
 +
 +#ifdef __cplusplus
 +extern "C" {
 +#endif
 +
 +#if 0
 +}
 +/* Hack to make automatic indenting work */
 +#endif
 +
 +/* Definitions for convergence of iterated constraints */
 +
 +/* iterate constraints up to 50 times  */
 +#define MAXITERCONST       50
 +
 +/* data type */
 +typedef struct
 +{
 +    real f,fprev,x,xprev;  
 +    int iter_i;
++    gmx_bool bIterationActive;
 +    real allrelerr[MAXITERCONST+2];
 +    int num_close; /* number of "close" violations, caused by limited precision. */
 +} gmx_iterate_t;
 +
 +void gmx_iterate_init(gmx_iterate_t *iterate,gmx_bool bIterate);
 +
 +gmx_bool done_iterating(const t_commrec *cr,FILE *fplog, int nsteps, gmx_iterate_t *iterate, gmx_bool bFirstIterate, real fom, real *newf);
 +
 +#ifdef __cplusplus
 +}
 +#endif
 +
 +#endif
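The intended usage is a do/while loop driven by done_iterating(), with gmx_iterate_init() deciding whether iteration is active at all; a minimal sketch under those assumptions (compute_figure_of_merit and bIterateConstraints are hypothetical stand-ins for the caller's converging quantity and flag):

    gmx_iterate_t iterate;
    gmx_bool      bFirstIterate = TRUE;
    real          fom, newf;

    gmx_iterate_init(&iterate, bIterateConstraints);
    do
    {
        fom = compute_figure_of_merit();  /* hypothetical helper */
        if (done_iterating(cr, fplog, step, &iterate, bFirstIterate, fom, &newf))
        {
            break;  /* converged, or iteration was not active */
        }
        bFirstIterate = FALSE;
    }
    while (iterate.iter_i < MAXITERCONST);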
index 932a732377b50a25cce47dedd6798b047e094dd9,0000000000000000000000000000000000000000..a867ba96b1506bedf68bd0cfe2c285cc7d2ab87f
mode 100644,000000..100644
--- /dev/null
@@@ -1,360 -1,0 +1,359 @@@
 +/*
 + * 
 + *                This source code is part of
 + * 
 + *                 G   R   O   M   A   C   S
 + * 
 + *          GROningen MAchine for Chemical Simulations
 + * 
 + *                        VERSION 3.2.0
 + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
 + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 + * Copyright (c) 2001-2004, The GROMACS development team,
 + * check out http://www.gromacs.org for more information.
 +
 + * This program is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU General Public License
 + * as published by the Free Software Foundation; either version 2
 + * of the License, or (at your option) any later version.
 + * 
 + * If you want to redistribute modifications, please consider that
 + * scientific software is very special. Version control is crucial -
 + * bugs must be traceable. We will be happy to consider code for
 + * inclusion in the official distribution, but derived work must not
 + * be called official GROMACS. Details are found in the README & COPYING
 + * files - if they are missing, get the official version at www.gromacs.org.
 + * 
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the papers on the package - you can find them in the top README file.
 + * 
 + * For more info, check our website at http://www.gromacs.org
 + * 
 + * And Hey:
 + * Green Red Orange Magenta Azure Cyan Skyblue
 + */
 +#include "eigensolver.h"
 +
 +#include "gromacs/legacyheaders/types/simple.h"
 +#include "gromacs/legacyheaders/gmx_fatal.h"
 +#include "gromacs/legacyheaders/smalloc.h"
 +
 +#include "gromacs/linearalgebra/sparsematrix.h"
 +#include "gmx_lapack.h"
 +#include "gmx_arpack.h"
 +
 +void
 +eigensolver(real *   a,
 +            int      n,
 +            int      index_lower,
 +            int      index_upper,
 +            real *   eigenvalues,
 +            real *   eigenvectors)
 +{
 +    int *   isuppz;
 +    int     lwork,liwork;
 +    int     il,iu,m,iw0,info;
 +    real    w0,abstol;
 +    int *   iwork;
 +    real *  work;
 +    real    vl,vu;
 +    const char *  jobz;
 +    
 +    if(index_lower<0)
 +        index_lower = 0;
 +    
 +    if(index_upper>=n)
 +        index_upper = n-1;
 +    
 +    /* Make jobz point to the character "V" if eigenvectors
 +     * should be calculated, otherwise "N" (only eigenvalues).
 +     */   
 +    jobz = (eigenvectors != NULL) ? "V" : "N";
 +
 +    /* allocate lapack stuff */
 +    snew(isuppz,2*n);
 +    vl = vu = 0;
 +    
 +    /* First time we ask the routine how much workspace it needs */
 +    lwork  = -1;
 +    liwork = -1;
 +    abstol = 0;
 +    
 +    /* Convert indices to fortran standard */
 +    index_lower++;
 +    index_upper++;
 +    
 +    /* Call LAPACK routine using fortran interface. Note that we use upper storage,
 +     * but this corresponds to lower storage ("L") in Fortran.
 +     */    
 +#ifdef GMX_DOUBLE
 +    F77_FUNC(dsyevr,DSYEVR)(jobz,"I","L",&n,a,&n,&vl,&vu,&index_lower,&index_upper,
 +                            &abstol,&m,eigenvalues,eigenvectors,&n,
 +                            isuppz,&w0,&lwork,&iw0,&liwork,&info);
 +#else
 +    F77_FUNC(ssyevr,SSYEVR)(jobz,"I","L",&n,a,&n,&vl,&vu,&index_lower,&index_upper,
 +                            &abstol,&m,eigenvalues,eigenvectors,&n,
 +                            isuppz,&w0,&lwork,&iw0,&liwork,&info);
 +#endif
 +
 +    if(info != 0)
 +    {
 +        sfree(isuppz);
 +        gmx_fatal(FARGS,"Internal error in LAPACK diagonalization.");
 +    }
 +    
 +    lwork = w0;
 +    liwork = iw0;
 +    
 +    snew(work,lwork);
 +    snew(iwork,liwork);
 +    
 +    abstol = 0;
 +    
 +#ifdef GMX_DOUBLE
 +    F77_FUNC(dsyevr,DSYEVR)(jobz,"I","L",&n,a,&n,&vl,&vu,&index_lower,&index_upper,
 +                            &abstol,&m,eigenvalues,eigenvectors,&n,
 +                            isuppz,work,&lwork,iwork,&liwork,&info);
 +#else
 +    F77_FUNC(ssyevr,SSYEVR)(jobz,"I","L",&n,a,&n,&vl,&vu,&index_lower,&index_upper,
 +                            &abstol,&m,eigenvalues,eigenvectors,&n,
 +                            isuppz,work,&lwork,iwork,&liwork,&info);
 +#endif
 +    
 +    sfree(isuppz);
 +    sfree(work);
 +    sfree(iwork);
 +    
 +    if(info != 0)
 +    {
 +        gmx_fatal(FARGS,"Internal error in LAPACK diagonalization.");
 +    }
 +    
 +}
 +
 +
 +#ifdef GMX_MPI_NOT
 +void 
 +sparse_parallel_eigensolver(gmx_sparsematrix_t *    A,
 +                                                      int                     neig,
 +                                                      real *                  eigenvalues,
 +                                                      real *                  eigenvectors,
 +                                                      int                     maxiter)
 +{
 +    int      iwork[80];
 +    int      iparam[11];
 +    int      ipntr[11];
 +    real *   resid;
 +    real *   workd;
 +    real *   workl;
 +    real *   v;
 +    int      n;
 +    int      ido,info,lworkl,i,ncv,dovec;
 +    real     abstol;
 +    int *    select;
 +    int      iter;
 +    int      nnodes,rank;
 +
 +      MPI_Comm_size( MPI_COMM_WORLD, &nnodes );
 +      MPI_Comm_rank( MPI_COMM_WORLD, &rank );
 +      
 +    if(eigenvectors != NULL)
 +        dovec = 1;
 +    else
 +        dovec = 0;
 +    
 +    n   = A->nrow;
 +    ncv = 2*neig;
 +    
 +    if(ncv>n)
 +        ncv=n;
 +    
 +    for(i=0;i<11;i++)
 +        iparam[i]=ipntr[i]=0;
 +      
 +      iparam[0] = 1;       /* Don't use explicit shifts */
 +      iparam[2] = maxiter; /* Max number of iterations */
 +      iparam[6] = 1;       /* Standard symmetric eigenproblem */
 +    
 +      lworkl = ncv*(8+ncv);
 +    snew(resid,n);
 +    snew(workd,(3*n+4));
 +    snew(workl,lworkl);
 +    snew(select,ncv);
 +    snew(v,n*ncv);
 +      
 +    /* Use machine tolerance - roughly 1e-16 in double precision */
 +    abstol = 0;
 +    
 +      ido = info = 0;
 +    fprintf(stderr,"Calculation Ritz values and Lanczos vectors, max %d iterations...\n",maxiter);
 +    
 +    iter = 1;
 +      do {
 +#ifdef GMX_DOUBLE
 +              F77_FUNC(pdsaupd,PDSAUPD)(&ido, "I", &n, "SA", &neig, &abstol, 
 +                                                                resid, &ncv, v, &n, iparam, ipntr, 
 +                                                                workd, iwork, workl, &lworkl, &info);
 +#else
 +              F77_FUNC(pssaupd,PSSAUPD)(&ido, "I", &n, "SA", &neig, &abstol, 
 +                                                                resid, &ncv, v, &n, iparam, ipntr, 
 +                                                                workd, iwork, workl, &lworkl, &info);
 +#endif
 +        if(ido==-1 || ido==1)
 +            gmx_sparsematrix_vector_multiply(A,workd+ipntr[0]-1, workd+ipntr[1]-1);
 +        
 +        fprintf(stderr,"\rIteration %4d: %3d out of %3d Ritz values converged.",iter++,iparam[4],neig);
 +      } while(info==0 && (ido==-1 || ido==1));
 +      
 +    fprintf(stderr,"\n");
 +      if(info==1)
 +    {
 +          gmx_fatal(FARGS,
 +                  "Maximum number of iterations (%d) reached in Arnoldi\n"
 +                  "diagonalization, but only %d of %d eigenvectors converged.\n",
 +                  maxiter,iparam[4],neig);
 +    }
 +      else if(info!=0)
 +    {
 +        gmx_fatal(FARGS,"Unspecified error from Arnoldi diagonalization:%d\n",info);
 +    }
 +      
 +      info = 0;
 +      /* Extract eigenvalues and vectors from data */
 +    fprintf(stderr,"Calculating eigenvalues and eigenvectors...\n");
 +    
 +#ifdef GMX_DOUBLE
 +    F77_FUNC(pdseupd,PDSEUPD)(&dovec, "A", select, eigenvalues, eigenvectors, 
 +                                                        &n, NULL, "I", &n, "SA", &neig, &abstol, 
 +                                                        resid, &ncv, v, &n, iparam, ipntr, 
 +                                                        workd, workl, &lworkl, &info);
 +#else
 +    F77_FUNC(psseupd,PSSEUPD)(&dovec, "A", select, eigenvalues, eigenvectors, 
 +                                                        &n, NULL, "I", &n, "SA", &neig, &abstol, 
 +                                                        resid, &ncv, v, &n, iparam, ipntr, 
 +                                                        workd, workl, &lworkl, &info);
 +#endif
 +      
 +    sfree(v);
 +    sfree(resid);
 +    sfree(workd);
 +    sfree(workl);  
 +    sfree(select);    
 +}
 +#endif
 +
 +
 +void 
 +sparse_eigensolver(gmx_sparsematrix_t *    A,
 +                   int                     neig,
 +                   real *                  eigenvalues,
 +                   real *                  eigenvectors,
 +                   int                     maxiter)
 +{
 +    int      iwork[80];
 +    int      iparam[11];
 +    int      ipntr[11];
 +    real *   resid;
 +    real *   workd;
 +    real *   workl;
 +    real *   v;
 +    int      n;
 +    int      ido,info,lworkl,i,ncv,dovec;
 +    real     abstol;
 +    int *    select;
 +    int      iter;
 +    
 +#ifdef GMX_MPI_NOT
 +      MPI_Comm_size( MPI_COMM_WORLD, &n );
 +      if(n > 1)
 +      {
 +              sparse_parallel_eigensolver(A,neig,eigenvalues,eigenvectors,maxiter);
 +              return;
 +      }
 +#endif
 +      
 +    if(eigenvectors != NULL)
 +        dovec = 1;
 +    else
 +        dovec = 0;
 +    
 +    n   = A->nrow;
 +    ncv = 2*neig;
 +    
 +    if(ncv>n)
 +        ncv=n;
 +    
 +    for(i=0;i<11;i++)
 +        iparam[i]=ipntr[i]=0;
 +      
 +      iparam[0] = 1;       /* Don't use explicit shifts */
 +      iparam[2] = maxiter; /* Max number of iterations */
 +      iparam[6] = 1;       /* Standard symmetric eigenproblem */
 +    
 +      lworkl = ncv*(8+ncv);
 +    snew(resid,n);
 +    snew(workd,(3*n+4));
 +    snew(workl,lworkl);
 +    snew(select,ncv);
 +    snew(v,n*ncv);
 +
 +    /* Use machine tolerance - roughly 1e-16 in double precision */
 +    abstol = 0;
 +    
 +      ido = info = 0;
 +    fprintf(stderr,"Calculation Ritz values and Lanczos vectors, max %d iterations...\n",maxiter);
 +    
 +    iter = 1;
 +      do {
 +#ifdef GMX_DOUBLE
 +            F77_FUNC(dsaupd,DSAUPD)(&ido, "I", &n, "SA", &neig, &abstol, 
 +                                    resid, &ncv, v, &n, iparam, ipntr, 
 +                                    workd, iwork, workl, &lworkl, &info);
 +#else
 +            F77_FUNC(ssaupd,SSAUPD)(&ido, "I", &n, "SA", &neig, &abstol, 
 +                                    resid, &ncv, v, &n, iparam, ipntr, 
 +                                    workd, iwork, workl, &lworkl, &info);
 +#endif
 +        if(ido==-1 || ido==1)
 +            gmx_sparsematrix_vector_multiply(A,workd+ipntr[0]-1, workd+ipntr[1]-1);
 +        
 +        fprintf(stderr,"\rIteration %4d: %3d out of %3d Ritz values converged.",iter++,iparam[4],neig);
 +      } while(info==0 && (ido==-1 || ido==1));
 +      
 +    fprintf(stderr,"\n");
 +      if(info==1)
 +    {
 +          gmx_fatal(FARGS,
 +                  "Maximum number of iterations (%d) reached in Arnoldi\n"
 +                  "diagonalization, but only %d of %d eigenvectors converged.\n",
 +                  maxiter,iparam[4],neig);
 +    }
 +      else if(info!=0)
 +    {
 +        gmx_fatal(FARGS,"Unspecified error from Arnoldi diagonalization:%d\n",info);
 +    }
 +      
 +      info = 0;
 +      /* Extract eigenvalues and vectors from data */
 +    fprintf(stderr,"Calculating eigenvalues and eigenvectors...\n");
 +    
 +#ifdef GMX_DOUBLE
 +    F77_FUNC(dseupd,DSEUPD)(&dovec, "A", select, eigenvalues, eigenvectors, 
 +                          &n, NULL, "I", &n, "SA", &neig, &abstol, 
 +                          resid, &ncv, v, &n, iparam, ipntr, 
 +                          workd, workl, &lworkl, &info);
 +#else
 +    F77_FUNC(sseupd,SSEUPD)(&dovec, "A", select, eigenvalues, eigenvectors, 
 +                          &n, NULL, "I", &n, "SA", &neig, &abstol, 
 +                          resid, &ncv, v, &n, iparam, ipntr, 
 +                          workd, workl, &lworkl, &info);
 +#endif
 +      
 +    sfree(v);
 +    sfree(resid);
 +    sfree(workd);
 +    sfree(workl);  
 +    sfree(select);    
 +}
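 +
 +/*
 + * Illustrative usage sketch (not part of this change). The matrix setup
 + * uses the gmx_sparsematrix interface from sparsematrix.h; the sizes and
 + * values are hypothetical.
 + *
 + *   gmx_sparsematrix_t *A = gmx_sparsematrix_init(1000);
 + *   real                eigval[10];
 + *
 + *   gmx_sparsematrix_increment_value(A, 0, 0, 2.0);   // fill in A ...
 + *
 + *   // 10 lowest eigenvalues ("SA" in the dsaupd call), values only
 + *   sparse_eigensolver(A, 10, eigval, NULL, 100000);
 + *   gmx_sparsematrix_destroy(A);
 + */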
 +
 +
index 0a8c68b678ac53b117b74a7a58f2dee4e2b4964e,0000000000000000000000000000000000000000..47c8031582a9ecb0b46c09e36132b5c83cc47bc8
mode 100644,000000..100644
--- /dev/null
@@@ -1,4951 -1,0 +1,4949 @@@
 +/* -*- mode: c; tab-width: 4; indent-tabs-mode: m; c-basic-offset: 4 -*- 
 + *
 + * 
 + * This file is part of Gromacs        Copyright (c) 1991-2004
 + * David van der Spoel, Erik Lindahl, University of Groningen.
 + *
 + * This file contains a subset of ARPACK functions to perform
 + * diagonalization and SVD for sparse matrices in Gromacs.
 + *
 + * The code has been translated to C to avoid being dependent on
 + * a Fortran compiler, and it has been made threadsafe by using
 + * additional workspace arrays to store data during reverse communication.
 + *
 + * You might prefer the original ARPACK library for general use, but
 + * in case you want to, this version can be redistributed freely, just
 + * like the original library. However, please make clear that it is the
 + * hacked version from Gromacs, so any bugs are blamed on us and not
 + * on the original authors. You should also be aware that the double
 + * precision work array workd needs to be of size (3*N+4) here
 + * (4 more than the general library), and there is an extra argument
 + * iwork, which should be an integer work array of length 80.
 + * 
 + * ARPACK was written by 
 + *
 + *     Danny Sorensen               Phuong Vu
 + *     Richard Lehoucq             CRPC / Rice University
 + *    Dept. of Computational &     Houston, Texas
 + *    Applied Mathematics
 + *    Rice University           
 + *    Houston, Texas            
 + */
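 +
 +/*
 + * A minimal sketch of the reverse-communication loop for the dsaupd/dseupd
 + * pair in this file, for a standard symmetric problem (bmat "I", smallest
 + * algebraic values "SA"). matvec() is a hypothetical user routine that
 + * computes y = A*x; iparam, ipntr and the workspace arrays are set up as
 + * in the callers in eigensolver.c. Note the extra iwork argument and the
 + * (3*n+4)-sized workd described above.
 + *
 + *   ido = info = 0;
 + *   do {
 + *       F77_FUNC(dsaupd,DSAUPD)(&ido, "I", &n, "SA", &nev, &tol, resid,
 + *                               &ncv, v, &n, iparam, ipntr, workd, iwork,
 + *                               workl, &lworkl, &info);
 + *       if (ido == -1 || ido == 1)
 + *           matvec(workd + ipntr[0] - 1, workd + ipntr[1] - 1);
 + *   } while (info == 0 && (ido == -1 || ido == 1));
 + */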
 +#include <math.h>
 +#include <string.h>
 +
 +#include "gromacs/legacyheaders/types/simple.h"
 +#include "gmx_arpack.h"
 +#include "gmx_blas.h"
 +#include "gmx_lapack.h"
 +static void 
 +F77_FUNC(dstqrb,DSTQRB)(int *      n, 
 +                        double *   d__, 
 +                        double *   e, 
 +                        double *   z__, 
 +                        double *   work, 
 +                        int *      info)
 +{
 +    int i__1, i__2;
 +    double d__1, d__2;
 +    int c__0 = 0;
 +    int c__1 = 1;
 +    double c_b31 = 1.;
 +
 +    double b, c__, f, g;
 +    int i__, j, k, l, m;
 +    double p, r__, s;
 +    int l1, ii, mm, lm1, mm1, nm1;
 +    double rt1, rt2, eps;
 +    int lsv;
 +    double tst, eps2;
 +    int lend, jtot, lendm1, lendp1, iscale;
 +
 +    int lendsv, nmaxit, icompz;
 +    double ssfmax, ssfmin,safmin,minval,safmax,anorm;
 +
 +
 +    --work;
 +    --z__;
 +    --e;
 +    --d__;
 +
 +    *info = 0;
 +
 +    icompz = 2;
 +
 +    if (*n == 0) {
 +      return;
 +    }
 +
 +    if (*n == 1) {
 +      if (icompz == 2) {
 +          z__[1] = 1.;
 +      }
 +      return;
 +    }
 +
 +    eps = GMX_DOUBLE_EPS;
 +
 +    d__1 = eps;
 +    eps2 = d__1 * d__1;
 +    minval = GMX_DOUBLE_MIN;
 +    safmin = minval / GMX_DOUBLE_EPS;
 +    safmax = 1. / safmin;
 +    ssfmax = sqrt(safmax) / 3.;
 +    ssfmin = sqrt(safmin) / eps2;
 +
 +    if (icompz == 2) {
 +      i__1 = *n - 1;
 +      for (j = 1; j <= i__1; ++j) {
 +          z__[j] = 0.;
 +
 +      }
 +      z__[*n] = 1.;
 +    }
 +
 +    nmaxit = *n * 30;
 +    jtot = 0;
 +
 +    l1 = 1;
 +    nm1 = *n - 1;
 +
 +L10:
 +    if (l1 > *n) {
 +      goto L160;
 +    }
 +    if (l1 > 1) {
 +      e[l1 - 1] = 0.;
 +    }
 +    if (l1 <= nm1) {
 +      i__1 = nm1;
 +      for (m = l1; m <= i__1; ++m) {
 +        tst = fabs(e[m]);
 +          if (tst == 0.) {
 +              goto L30;
 +          }
 +          if (tst <= sqrt(fabs(d__[m])) * sqrt(fabs(d__[m+1])) * eps) {
 +            e[m] = 0.;
 +            goto L30;
 +          }
 +      }
 +    }
 +    m = *n;
 +
 +L30:
 +    l = l1;
 +    lsv = l;
 +    lend = m;
 +    lendsv = lend;
 +    l1 = m + 1;
 +    if (lend == l) {
 +      goto L10;
 +    }
 +
 +    i__1 = lend - l + 1;
 +    anorm =F77_FUNC(dlanst,DLANST)("i", &i__1, &d__[l], &e[l]);
 +    iscale = 0;
 +    if (anorm == 0.) {
 +      goto L10;
 +    }
 +    if (anorm > ssfmax) {
 +      iscale = 1;
 +      i__1 = lend - l + 1;
 +      F77_FUNC(dlascl,DLASCL)("g", &c__0, &c__0, &anorm, &ssfmax, &i__1, &c__1, &d__[l], n, 
 +              info);
 +      i__1 = lend - l;
 +      F77_FUNC(dlascl,DLASCL)("g", &c__0, &c__0, &anorm, &ssfmax, &i__1, &c__1, &e[l], n, 
 +              info);
 +    } else if (anorm < ssfmin) {
 +      iscale = 2;
 +      i__1 = lend - l + 1;
 +      F77_FUNC(dlascl,DLASCL)("g", &c__0, &c__0, &anorm, &ssfmin, &i__1, &c__1, &d__[l], n, 
 +              info);
 +      i__1 = lend - l;
 +      F77_FUNC(dlascl,DLASCL)("g", &c__0, &c__0, &anorm, &ssfmin, &i__1, &c__1, &e[l], n, 
 +              info);
 +    }
 +
 +    if (fabs(d__[lend]) < fabs(d__[l])) {
 +      lend = lsv;
 +      l = lendsv;
 +    }
 +
 +    if (lend > l) {
 +
 +L40:
 +      if (l != lend) {
 +          lendm1 = lend - 1;
 +          i__1 = lendm1;
 +          for (m = l; m <= i__1; ++m) {
 +              d__2 = fabs(e[m]);
 +              tst = d__2 * d__2;
 +              if (tst <= eps2 * fabs(d__[m]) * fabs(d__[m + 1]) + safmin) {
 +                  goto L60;
 +              }
 +          }
 +      }
 +
 +      m = lend;
 +
 +L60:
 +      if (m < lend) {
 +          e[m] = 0.;
 +      }
 +      p = d__[l];
 +      if (m == l) {
 +          goto L80;
 +      }
 +
 +      if (m == l + 1) {
 +          if (icompz > 0) {
 +              F77_FUNC(dlaev2,DLAEV2)(&d__[l], &e[l], &d__[l + 1], &rt1, &rt2, &c__, &s);
 +              work[l] = c__;
 +              work[*n - 1 + l] = s;
 +
 +              tst = z__[l + 1];
 +              z__[l + 1] = c__ * tst - s * z__[l];
 +              z__[l] = s * tst + c__ * z__[l];
 +          } else {
 +              F77_FUNC(dlae2,DLAE2)(&d__[l], &e[l], &d__[l + 1], &rt1, &rt2);
 +          }
 +          d__[l] = rt1;
 +          d__[l + 1] = rt2;
 +          e[l] = 0.;
 +          l += 2;
 +          if (l <= lend) {
 +              goto L40;
 +          }
 +          goto L140;
 +      }
 +
 +      if (jtot == nmaxit) {
 +          goto L140;
 +      }
 +      ++jtot;
 +
 +      g = (d__[l + 1] - p) / (e[l] * 2.);
 +      r__ =F77_FUNC(dlapy2,DLAPY2)(&g, &c_b31);
 +      g = d__[m] - p + e[l] / (g + ((g>0) ? r__ : -r__ ));
 +
 +      s = 1.;
 +      c__ = 1.;
 +      p = 0.;
 +
 +      mm1 = m - 1;
 +      i__1 = l;
 +      for (i__ = mm1; i__ >= i__1; --i__) {
 +          f = s * e[i__];
 +          b = c__ * e[i__];
 +         F77_FUNC(dlartg,DLARTG)(&g, &f, &c__, &s, &r__);
 +          if (i__ != m - 1) {
 +              e[i__ + 1] = r__;
 +          }
 +          g = d__[i__ + 1] - p;
 +          r__ = (d__[i__] - g) * s + c__ * 2. * b;
 +          p = s * r__;
 +          d__[i__ + 1] = g + p;
 +          g = c__ * r__ - b;
 +
 +          if (icompz > 0) {
 +              work[i__] = c__;
 +              work[*n - 1 + i__] = -s;
 +          }
 +
 +      }
 +
 +      if (icompz > 0) {
 +          mm = m - l + 1;
 +
 +         F77_FUNC(dlasr,DLASR)("r", "v", "b", &c__1, &mm, &work[l], &work[*n - 1 + l], &
 +                  z__[l], &c__1);
 +      }
 +
 +      d__[l] -= p;
 +      e[l] = g;
 +      goto L40;
 +
 +L80:
 +      d__[l] = p;
 +
 +      ++l;
 +      if (l <= lend) {
 +          goto L40;
 +      }
 +      goto L140;
 +
 +    } else {
 +
 +L90:
 +      if (l != lend) {
 +          lendp1 = lend + 1;
 +          i__1 = lendp1;
 +          for (m = l; m >= i__1; --m) {
 +              d__2 = fabs(e[m - 1]);
 +              tst = d__2 * d__2;
 +              if (tst <= eps2 * fabs(d__[m]) * fabs(d__[m- 1]) + safmin) {
 +                  goto L110;
 +              }
 +          }
 +      }
 +
 +      m = lend;
 +
 +L110:
 +      if (m > lend) {
 +          e[m - 1] = 0.;
 +      }
 +      p = d__[l];
 +      if (m == l) {
 +          goto L130;
 +      }
 +
 +      if (m == l - 1) {
 +          if (icompz > 0) {
 +              F77_FUNC(dlaev2,DLAEV2)(&d__[l - 1], &e[l - 1], &d__[l], &rt1, &rt2, &c__, &s)
 +                      ;
 +
 +              tst = z__[l];
 +              z__[l] = c__ * tst - s * z__[l - 1];
 +              z__[l - 1] = s * tst + c__ * z__[l - 1];
 +          } else {
 +              F77_FUNC(dlae2,DLAE2)(&d__[l - 1], &e[l - 1], &d__[l], &rt1, &rt2);
 +          }
 +          d__[l - 1] = rt1;
 +          d__[l] = rt2;
 +          e[l - 1] = 0.;
 +          l += -2;
 +          if (l >= lend) {
 +              goto L90;
 +          }
 +          goto L140;
 +      }
 +
 +      if (jtot == nmaxit) {
 +          goto L140;
 +      }
 +      ++jtot;
 +
 +
 +      g = (d__[l - 1] - p) / (e[l - 1] * 2.);
 +      r__ =F77_FUNC(dlapy2,DLAPY2)(&g, &c_b31);
 +      g = d__[m] - p + e[l - 1] / (g + ((g>0) ? r__ : -r__ ));
 +
 +      s = 1.;
 +      c__ = 1.;
 +      p = 0.;
 +
 +      lm1 = l - 1;
 +      i__1 = lm1;
 +      for (i__ = m; i__ <= i__1; ++i__) {
 +          f = s * e[i__];
 +          b = c__ * e[i__];
 +         F77_FUNC(dlartg,DLARTG)(&g, &f, &c__, &s, &r__);
 +          if (i__ != m) {
 +              e[i__ - 1] = r__;
 +          }
 +          g = d__[i__] - p;
 +          r__ = (d__[i__ + 1] - g) * s + c__ * 2. * b;
 +          p = s * r__;
 +          d__[i__] = g + p;
 +          g = c__ * r__ - b;
 +
 +          if (icompz > 0) {
 +              work[i__] = c__;
 +              work[*n - 1 + i__] = s;
 +          }
 +
 +      }
 +
 +      if (icompz > 0) {
 +          mm = l - m + 1;
 +
 +         F77_FUNC(dlasr,DLASR)("r", "v", "f", &c__1, &mm, &work[m], &work[*n - 1 + m], &
 +                  z__[m], &c__1);
 +      }
 +
 +      d__[l] -= p;
 +      e[lm1] = g;
 +      goto L90;
 +
 +L130:
 +      d__[l] = p;
 +
 +      --l;
 +      if (l >= lend) {
 +          goto L90;
 +      }
 +      goto L140;
 +
 +    }
 +
 +L140:
 +    if (iscale == 1) {
 +      i__1 = lendsv - lsv + 1;
 +      F77_FUNC(dlascl,DLASCL)("g", &c__0, &c__0, &ssfmax, &anorm, &i__1, &c__1, &d__[lsv], 
 +              n, info);
 +      i__1 = lendsv - lsv;
 +      F77_FUNC(dlascl,DLASCL)("g", &c__0, &c__0, &ssfmax, &anorm, &i__1, &c__1, &e[lsv], n, 
 +              info);
 +    } else if (iscale == 2) {
 +      i__1 = lendsv - lsv + 1;
 +      F77_FUNC(dlascl,DLASCL)("g", &c__0, &c__0, &ssfmin, &anorm, &i__1, &c__1, &d__[lsv], 
 +              n, info);
 +      i__1 = lendsv - lsv;
 +      F77_FUNC(dlascl,DLASCL)("g", &c__0, &c__0, &ssfmin, &anorm, &i__1, &c__1, &e[lsv], n, 
 +              info);
 +    }
 +
 +    if (jtot < nmaxit) {
 +      goto L10;
 +    }
 +    i__1 = *n - 1;
 +    for (i__ = 1; i__ <= i__1; ++i__) {
 +      if (e[i__] != 0.) {
 +          ++(*info);
 +      }
 +    }
 +    goto L190;
 +
 +L160:
 +    if (icompz == 0) {
 +
 +      F77_FUNC(dlasrt,DLASRT)("i", n, &d__[1], info);
 +
 +    } else {
 +
 +      i__1 = *n;
 +      for (ii = 2; ii <= i__1; ++ii) {
 +          i__ = ii - 1;
 +          k = i__;
 +          p = d__[i__];
 +          i__2 = *n;
 +          for (j = ii; j <= i__2; ++j) {
 +              if (d__[j] < p) {
 +                  k = j;
 +                  p = d__[j];
 +              }
 +          }
 +          if (k != i__) {
 +              d__[k] = d__[i__];
 +              d__[i__] = p;
 +
 +              p = z__[k];
 +              z__[k] = z__[i__];
 +              z__[i__] = p;
 +          }
 +      }
 +    }
 +
 +L190:
 +    return;
 +
 +}
 +
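 +/* dgetv0: generates a (possibly random) initial residual vector for the
 + * Lanczos process and, for j > 1, orthogonalizes it against the current
 + * basis via reverse communication.
 + */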
 +static void 
 +F77_FUNC(dgetv0,DGETV0)(int *     ido, 
 +                        const char *    bmat, 
 +                        int *     itry, 
 +                        int *     initv, 
 +                        int *     n, 
 +                        int *     j, 
 +                        double *  v, 
 +                        int *     ldv, 
 +                        double *  resid, 
 +                        double *  rnorm, 
 +                        int *     ipntr, 
 +                        double *  workd, 
 +                        int *     iwork, 
 +                        int *     ierr)
 +{
 +    int c__1 = 1;
 +    double c_b22 = 1.;
 +    double c_b24 = 0.;
 +    double c_b27 = -1.;
 +    int v_dim1, v_offset, i__1;
 +
 +    int jj;
 +    int idist;
 +
 +    --workd;
 +    --resid;
 +    v_dim1 = *ldv;
 +    v_offset = 1 + v_dim1;
 +    v -= v_offset;
 +    --ipntr;
 +    --iwork;
 +
 +    if (*ido == 0) {
 +
 +      *ierr = 0;
 +      iwork[7] = 0;
 +      iwork[5] = 0;
 +      iwork[6] = 0;
 +
 +      if (! (*initv)) {
 +          idist = 2;
 +         F77_FUNC(dlarnv,DLARNV)(&idist, &iwork[1], n, &resid[1]);
 +      }
 +
 +      if (*bmat == 'G') {
 +          ipntr[1] = 1;
 +          ipntr[2] = *n + 1;
 +         F77_FUNC(dcopy,DCOPY)(n, &resid[1], &c__1, &workd[1], &c__1);
 +          *ido = -1;
 +          goto L9000;
 +      }
 +    }
 +
 +    if (iwork[5] == 1) {
 +      goto L20;
 +    }
 +
 +    if (iwork[6] == 1) {
 +      goto L40;
 +    }
 +
 +    iwork[5] = 1;
 +    if (*bmat == 'G') {
 +      F77_FUNC(dcopy,DCOPY)(n, &workd[*n + 1], &c__1, &resid[1], &c__1);
 +      ipntr[1] = *n + 1;
 +      ipntr[2] = 1;
 +      *ido = 2;
 +      goto L9000;
 +    } else if (*bmat == 'I') {
 +      F77_FUNC(dcopy,DCOPY)(n, &resid[1], &c__1, &workd[1], &c__1);
 +    }
 +
 +L20:
 +
 +
 +    iwork[5] = 0;
 +    if (*bmat == 'G') {
 +      workd[*n * 3 + 4] =F77_FUNC(ddot,DDOT)(n, &resid[1], &c__1, &workd[1], &c__1);
 +      workd[*n * 3 + 4] = sqrt(fabs(workd[*n * 3 + 4]));
 +    } else if (*bmat == 'I') {
 +      workd[*n * 3 + 4] =F77_FUNC(dnrm2,DNRM2)(n, &resid[1], &c__1);
 +    }
 +    *rnorm = workd[*n * 3 + 4];
 +
 +    if (*j == 1) {
 +      goto L50;
 +    }
 +    iwork[6] = 1;
 +L30:
 +
 +    i__1 = *j - 1;
 +   F77_FUNC(dgemv,DGEMV)("T", n, &i__1, &c_b22, &v[v_offset], ldv, &workd[1], &c__1, &c_b24,
 +           &workd[*n + 1], &c__1);
 +    i__1 = *j - 1;
 +   F77_FUNC(dgemv,DGEMV)("N", n, &i__1, &c_b27, &v[v_offset], ldv, &workd[*n + 1], &c__1, &
 +          c_b22, &resid[1], &c__1);
 +
 +    if (*bmat == 'G') {
 +      F77_FUNC(dcopy,DCOPY)(n, &resid[1], &c__1, &workd[*n + 1], &c__1);
 +      ipntr[1] = *n + 1;
 +      ipntr[2] = 1;
 +      *ido = 2;
 +      goto L9000;
 +    } else if (*bmat == 'I') {
 +      F77_FUNC(dcopy,DCOPY)(n, &resid[1], &c__1, &workd[1], &c__1);
 +    }
 +
 +L40:
 +
 +    if (*bmat == 'G') {
 +      *rnorm =F77_FUNC(ddot,DDOT)(n, &resid[1], &c__1, &workd[1], &c__1);
 +      *rnorm = sqrt(fabs(*rnorm));
 +    } else if (*bmat == 'I') {
 +      *rnorm =F77_FUNC(dnrm2,DNRM2)(n, &resid[1], &c__1);
 +    }
 +
 +    if (*rnorm > workd[*n * 3 + 4] * .717f) {
 +      goto L50;
 +    }
 +
 +    ++iwork[7];
 +    if (iwork[7] <= 1) {
 +
 +      workd[*n * 3 + 4] = *rnorm;
 +      goto L30;
 +    } else {
 +
 +      i__1 = *n;
 +      for (jj = 1; jj <= i__1; ++jj) {
 +          resid[jj] = 0.;
 +      }
 +      *rnorm = 0.;
 +      *ierr = -1;
 +    }
 +
 +L50:
 +
 +    *ido = 99;
 +
 +L9000:
 +    return;
 +}
 +
 +
 +
 +
 +
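 +/* dsapps: applies the np implicit shifts to the Lanczos factorization
 + * through a sequence of Givens rotations, compressing it back to
 + * length kev.
 + */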
 +static void 
 +F77_FUNC(dsapps,DSAPPS)(int *     n, 
 +                        int *     kev, 
 +                        int *     np, 
 +                        double *  shift, 
 +                        double *  v, 
 +                        int *     ldv, 
 +                        double *  h__, 
 +                        int *     ldh, 
 +                        double *  resid, 
 +                        double *  q, 
 +                        int *     ldq, 
 +                        double *  workd)
 +{
 +    double c_b4 = 0.;
 +    double c_b5 = 1.;
 +    double c_b14 = -1.;
 +    int c__1 = 1;
 +    int h_dim1, h_offset, q_dim1, q_offset, v_dim1, v_offset, i__1, i__2, 
 +          i__3, i__4;
 +    double c__, f, g;
 +    int i__, j;
 +    double r__, s, a1, a2, a3, a4;
 +    int jj;
 +    double big;
 +    int iend, itop;
 +    double epsmch;
 +    int istart, kplusp;
 +
 +    --workd;
 +    --resid;
 +    --shift;
 +    v_dim1 = *ldv;
 +    v_offset = 1 + v_dim1;
 +    v -= v_offset;
 +    h_dim1 = *ldh;
 +    h_offset = 1 + h_dim1;
 +    h__ -= h_offset;
 +    q_dim1 = *ldq;
 +    q_offset = 1 + q_dim1;
 +    q -= q_offset;
 +
 +    epsmch = GMX_DOUBLE_EPS;
 +    itop = 1;
 +
 +
 +    kplusp = *kev + *np;
 +
 +   F77_FUNC(dlaset,DLASET)("All", &kplusp, &kplusp, &c_b4, &c_b5, &q[q_offset], ldq);
 +
 +    if (*np == 0) {
 +      goto L9000;
 +    }
 +
 +    i__1 = *np;
 +    for (jj = 1; jj <= i__1; ++jj) {
 +
 +      istart = itop;
 +
 +L20:
 +
 +      i__2 = kplusp - 1;
 +      for (i__ = istart; i__ <= i__2; ++i__) {
 +        big = fabs(h__[i__ + (h_dim1*2)]) + fabs(h__[i__ + 1 + (h_dim1*2)]);
 +        if (h__[i__ + 1 + h_dim1] <= epsmch * big) {
 +          h__[i__ + 1 + h_dim1] = 0.;
 +          iend = i__;
 +          goto L40;
 +        }
 +      }
 +      iend = kplusp;
 +L40:
 +
 +      if (istart < iend) {
 +
 +          f = h__[istart + (h_dim1 << 1)] - shift[jj];
 +          g = h__[istart + 1 + h_dim1];
 +         F77_FUNC(dlartg,DLARTG)(&f, &g, &c__, &s, &r__);
 +
 +          a1 = c__ * h__[istart + (h_dim1 << 1)] + s * h__[istart + 1 + 
 +                  h_dim1];
 +          a2 = c__ * h__[istart + 1 + h_dim1] + s * h__[istart + 1 + (
 +                  h_dim1 << 1)];
 +          a4 = c__ * h__[istart + 1 + (h_dim1 << 1)] - s * h__[istart + 1 + 
 +                  h_dim1];
 +          a3 = c__ * h__[istart + 1 + h_dim1] - s * h__[istart + (h_dim1 << 
 +                  1)];
 +          h__[istart + (h_dim1 << 1)] = c__ * a1 + s * a2;
 +          h__[istart + 1 + (h_dim1 << 1)] = c__ * a4 - s * a3;
 +          h__[istart + 1 + h_dim1] = c__ * a3 + s * a4;
 +
 +          i__3 = istart + jj;
 +          i__2 = (i__3<kplusp) ? i__3 : kplusp;
 +          for (j = 1; j <= i__2; ++j) {
 +              a1 = c__ * q[j + istart * q_dim1] + s * q[j + (istart + 1) * 
 +                      q_dim1];
 +              q[j + (istart + 1) * q_dim1] = -s * q[j + istart * q_dim1] + 
 +                      c__ * q[j + (istart + 1) * q_dim1];
 +              q[j + istart * q_dim1] = a1;
 +
 +          }
 +
 +          i__2 = iend - 1;
 +          for (i__ = istart + 1; i__ <= i__2; ++i__) {
 +
 +              f = h__[i__ + h_dim1];
 +              g = s * h__[i__ + 1 + h_dim1];
 +
 +              h__[i__ + 1 + h_dim1] = c__ * h__[i__ + 1 + h_dim1];
 +              F77_FUNC(dlartg,DLARTG)(&f, &g, &c__, &s, &r__);
 +
 +              if (r__ < 0.) {
 +                  r__ = -r__;
 +                  c__ = -c__;
 +                  s = -s;
 +              }
 +
 +              h__[i__ + h_dim1] = r__;
 +
 +              a1 = c__ * h__[i__ + (h_dim1 << 1)] + s * h__[i__ + 1 + 
 +                      h_dim1];
 +              a2 = c__ * h__[i__ + 1 + h_dim1] + s * h__[i__ + 1 + (h_dim1 
 +                      << 1)];
 +              a3 = c__ * h__[i__ + 1 + h_dim1] - s * h__[i__ + (h_dim1 << 1)
 +                      ];
 +              a4 = c__ * h__[i__ + 1 + (h_dim1 << 1)] - s * h__[i__ + 1 + 
 +                      h_dim1];
 +
 +              h__[i__ + (h_dim1 << 1)] = c__ * a1 + s * a2;
 +              h__[i__ + 1 + (h_dim1 << 1)] = c__ * a4 - s * a3;
 +              h__[i__ + 1 + h_dim1] = c__ * a3 + s * a4;
 +
 +              i__4 = j + jj;
 +              i__3 = (i__4<kplusp) ? i__4 : kplusp;
 +              for (j = 1; j <= i__3; ++j) {
 +                  a1 = c__ * q[j + i__ * q_dim1] + s * q[j + (i__ + 1) * 
 +                          q_dim1];
 +                  q[j + (i__ + 1) * q_dim1] = -s * q[j + i__ * q_dim1] + 
 +                          c__ * q[j + (i__ + 1) * q_dim1];
 +                  q[j + i__ * q_dim1] = a1;
 +              }
 +
 +          }
 +
 +      }
 +
 +      istart = iend + 1;
 +
 +      if (h__[iend + h_dim1] < 0.) {
 +          h__[iend + h_dim1] = -h__[iend + h_dim1];
 +         F77_FUNC(dscal,DSCAL)(&kplusp, &c_b14, &q[iend * q_dim1 + 1], &c__1);
 +      }
 +
 +      if (iend < kplusp) {
 +          goto L20;
 +      }
 +
 +      i__2 = kplusp - 1;
 +      for (i__ = itop; i__ <= i__2; ++i__) {
 +          if (h__[i__ + 1 + h_dim1] > 0.) {
 +              goto L90;
 +          }
 +          ++itop;
 +      }
 +
 +L90:
 +      ;
 +    }
 +
 +    i__1 = kplusp - 1;
 +    for (i__ = itop; i__ <= i__1; ++i__) {
 +      big = fabs(h__[i__ + (h_dim1*2)]) + fabs(h__[i__+ 1 + (h_dim1*2)]);
 +      if (h__[i__ + 1 + h_dim1] <= epsmch * big) {
 +          h__[i__ + 1 + h_dim1] = 0.;
 +      }
 +
 +    }
 +
 +    if (h__[*kev + 1 + h_dim1] > 0.) {
 +      F77_FUNC(dgemv,DGEMV)("N", n, &kplusp, &c_b5, &v[v_offset], ldv, &q[(*kev + 1) * 
 +              q_dim1 + 1], &c__1, &c_b4, &workd[*n + 1], &c__1);
 +    }
 +
 +    i__1 = *kev;
 +    for (i__ = 1; i__ <= i__1; ++i__) {
 +      i__2 = kplusp - i__ + 1;
 +      F77_FUNC(dgemv,DGEMV)("N", n, &i__2, &c_b5, &v[v_offset], ldv, &q[(*kev - i__ + 1) * 
 +              q_dim1 + 1], &c__1, &c_b4, &workd[1], &c__1);
 +      F77_FUNC(dcopy,DCOPY)(n, &workd[1], &c__1, &v[(kplusp - i__ + 1) * v_dim1 + 1], &
 +              c__1);
 +
 +    }
 +
 +   F77_FUNC(dlacpy,DLACPY)("All", n, kev, &v[(*np + 1) * v_dim1 + 1], ldv, &v[v_offset], ldv);
 +
 +    if (h__[*kev + 1 + h_dim1] > 0.) {
 +      F77_FUNC(dcopy,DCOPY)(n, &workd[*n + 1], &c__1, &v[(*kev + 1) * v_dim1 + 1], &c__1);
 +    }
 +
 +   F77_FUNC(dscal,DSCAL)(n, &q[kplusp + *kev * q_dim1], &resid[1], &c__1);
 +    if (h__[*kev + 1 + h_dim1] > 0.) {
 +      F77_FUNC(daxpy,DAXPY)(n, &h__[*kev + 1 + h_dim1], &v[(*kev + 1) * v_dim1 + 1], &c__1,
 +               &resid[1], &c__1);
 +    }
 +
 +
 +
 +L9000:
 +    return;
 +
 +}
 +
 +
 +
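 +/* dsortr: shell sort of x1 (and, if apply is set, the companion array x2)
 + * in the order selected by which: "LA"/"LM" give increasing algebraic/
 + * absolute order, "SA"/"SM" decreasing, so the wanted end of the spectrum
 + * ends up last.
 + */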
 +static void 
 +F77_FUNC(dsortr,DSORTR)(const char *    which, 
 +                        int *     apply, 
 +                        int *     n, 
 +                        double *  x1, 
 +                        double *  x2)
 +{
 +    int i__1;
 +
 +    int i__, j, igap;
 +    double temp;
 +
 +
 +
 +    igap = *n / 2;
 +
 +    if (!strncmp(which, "SA", 2)) {
 +
 +L10:
 +      if (igap == 0) {
 +          goto L9000;
 +      }
 +      i__1 = *n - 1;
 +      for (i__ = igap; i__ <= i__1; ++i__) {
 +          j = i__ - igap;
 +L20:
 +
 +          if (j < 0) {
 +              goto L30;
 +          }
 +
 +          if (x1[j] < x1[j + igap]) {
 +              temp = x1[j];
 +              x1[j] = x1[j + igap]; 
 +              x1[j + igap] = temp;
 +              if (*apply) {
 +                  temp = x2[j];
 +                  x2[j] = x2[j + igap];
 +                  x2[j + igap] = temp;
 +              }
 +          } else {
 +              goto L30;
 +          }
 +          j -= igap;
 +          goto L20;
 +L30:
 +          ;
 +      }
 +      igap /= 2;
 +      goto L10;
 +
 +    } else if (!strncmp(which, "SM", 2)) {
 +
 +L40:
 +      if (igap == 0) {
 +          goto L9000;
 +      }
 +      i__1 = *n - 1;
 +      for (i__ = igap; i__ <= i__1; ++i__) {
 +          j = i__ - igap;
 +L50:
 +
 +          if (j < 0) {
 +              goto L60;
 +          }
 +
 +          if (fabs(x1[j]) < fabs(x1[j + igap])) {
 +              temp = x1[j];
 +              x1[j] = x1[j + igap];
 +              x1[j + igap] = temp;
 +              if (*apply) {
 +                  temp = x2[j];
 +                  x2[j] = x2[j + igap];
 +                  x2[j + igap] = temp;
 +              }
 +          } else {
 +              goto L60;
 +          }
 +          j -= igap;
 +          goto L50;
 +L60:
 +          ;
 +      }
 +      igap /= 2;
 +      goto L40;
 +
 +    } else if (!strncmp(which, "LA", 2)) {
 +
 +L70:
 +      if (igap == 0) {
 +          goto L9000;
 +      }
 +      i__1 = *n - 1;
 +      for (i__ = igap; i__ <= i__1; ++i__) {
 +          j = i__ - igap;
 +L80:
 +
 +          if (j < 0) {
 +              goto L90;
 +          }
 +
 +          if (x1[j] > x1[j + igap]) {
 +              temp = x1[j];
 +              x1[j] = x1[j + igap];
 +              x1[j + igap] = temp;
 +              if (*apply) {
 +                  temp = x2[j];
 +                  x2[j] = x2[j + igap];
 +                  x2[j + igap] = temp;
 +              }
 +          } else {
 +              goto L90;
 +          }
 +          j -= igap;
 +          goto L80;
 +L90:
 +          ;
 +      }
 +      igap /= 2;
 +      goto L70;
 +
 +    } else if (!strncmp(which, "LM", 2)) {
 +
 +
 +L100:
 +      if (igap == 0) {
 +          goto L9000;
 +      }
 +      i__1 = *n - 1;
 +      for (i__ = igap; i__ <= i__1; ++i__) {
 +          j = i__ - igap;
 +L110:
 +
 +          if (j < 0) {
 +              goto L120;
 +          }
 +
 +          if (fabs(x1[j]) > fabs(x1[j + igap])) {
 +              temp = x1[j];
 +              x1[j] = x1[j + igap];
 +              x1[j + igap] = temp;
 +              if (*apply) {
 +                  temp = x2[j];
 +                  x2[j] = x2[j + igap];
 +                  x2[j + igap] = temp;
 +              }
 +          } else {
 +              goto L120;
 +          }
 +          j -= igap;
 +          goto L110;
 +L120:
 +          ;
 +      }
 +      igap /= 2;
 +      goto L100;
 +    }
 +
 +L9000:
 +    return;
 +
 +}
 +
 +
 +
 +
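 +/* dsesrt: like dsortr, but applies the accompanying permutation to the
 + * na columns of the matrix a (used to reorder Ritz vectors).
 + */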
 +static void 
 +F77_FUNC(dsesrt,DSESRT)(const char *    which, 
 +                        int *     apply, 
 +                        int *     n, 
 +                        double *  x, 
 +                        int *     na, 
 +                        double *  a, 
 +                        int *     lda)
 +{
 +    int a_dim1, a_offset, i__1;
 +    int c__1 = 1;
 +
 +    int i__, j, igap;
 +    double temp;
 +
 +    a_dim1 = *lda;
 +    a_offset = 1 + a_dim1 * 0;
 +    a -= a_offset;
 +
 +    igap = *n / 2;
 +
 +    if (!strncmp(which, "SA", 2)) {
 +
 +L10:
 +      if (igap == 0) {
 +          goto L9000;
 +      }
 +      i__1 = *n - 1;
 +      for (i__ = igap; i__ <= i__1; ++i__) {
 +          j = i__ - igap;
 +L20:
 +
 +          if (j < 0) {
 +              goto L30;
 +          }
 +
 +          if (x[j] < x[j + igap]) {
 +              temp = x[j];
 +              x[j] = x[j + igap];
 +              x[j + igap] = temp;
 +              if (*apply) {
 +                 F77_FUNC(dswap,DSWAP)(na, &a[j * a_dim1 + 1], &c__1, &a[(j + igap) * 
 +                          a_dim1 + 1], &c__1);
 +              }
 +          } else {
 +              goto L30;
 +          }
 +          j -= igap;
 +          goto L20;
 +L30:
 +          ;
 +      }
 +      igap /= 2;
 +      goto L10;
 +
 +    } else if (!strncmp(which, "SM", 2)) {
 +
 +L40:
 +      if (igap == 0) {
 +          goto L9000;
 +      }
 +      i__1 = *n - 1;
 +      for (i__ = igap; i__ <= i__1; ++i__) {
 +          j = i__ - igap;
 +L50:
 +
 +          if (j < 0) {
 +              goto L60;
 +          }
 +
 +          if (fabs(x[j]) < fabs(x[j + igap])) {
 +              temp = x[j];
 +              x[j] = x[j + igap];
 +              x[j + igap] = temp;
 +              if (*apply) {
 +                 F77_FUNC(dswap,DSWAP)(na, &a[j * a_dim1 + 1], &c__1, &a[(j + igap) * 
 +                          a_dim1 + 1], &c__1);
 +              }
 +          } else {
 +              goto L60;
 +          }
 +          j -= igap;
 +          goto L50;
 +L60:
 +          ;
 +      }
 +      igap /= 2;
 +      goto L40;
 +
 +    } else if (!strncmp(which, "LA", 2)) {
 +
 +L70:
 +      if (igap == 0) {
 +          goto L9000;
 +      }
 +      i__1 = *n - 1;
 +      for (i__ = igap; i__ <= i__1; ++i__) {
 +          j = i__ - igap;
 +L80:
 +
 +          if (j < 0) {
 +              goto L90;
 +          }
 +
 +          if (x[j] > x[j + igap]) {
 +              temp = x[j];
 +              x[j] = x[j + igap];
 +              x[j + igap] = temp;
 +              if (*apply) {
 +                 F77_FUNC(dswap,DSWAP)(na, &a[j * a_dim1 + 1], &c__1, &a[(j + igap) * 
 +                          a_dim1 + 1], &c__1);
 +              }
 +          } else {
 +              goto L90;
 +          }
 +          j -= igap;
 +          goto L80;
 +L90:
 +          ;
 +      }
 +      igap /= 2;
 +      goto L70;
 +
 +    } else if (!strncmp(which, "LM", 2)) {
 +
 +L100:
 +      if (igap == 0) {
 +          goto L9000;
 +      }
 +      i__1 = *n - 1;
 +      for (i__ = igap; i__ <= i__1; ++i__) {
 +          j = i__ - igap;
 +L110:
 +
 +          if (j < 0) {
 +              goto L120;
 +          }
 +
 +          if (fabs(x[j]) > fabs(x[j + igap])) {
 +              temp = x[j];
 +              x[j] = x[j + igap];
 +              x[j + igap] = temp;
 +              if (*apply) {
 +                 F77_FUNC(dswap,DSWAP)(na, &a[j * a_dim1 + 1], &c__1, &a[(j + igap) * 
 +                          a_dim1 + 1], &c__1);
 +              }
 +          } else {
 +              goto L120;
 +          }
 +          j -= igap;
 +          goto L110;
 +L120:
 +          ;
 +      }
 +      igap /= 2;
 +      goto L100;
 +    }
 +
 +L9000:
 +    return;
 +
 +}
 +
 +
 +
 +
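 +/* dsgets: sorts the Ritz values so the nev wanted ones are last, and with
 + * ishift == 1 returns the np unwanted ones (ordered by their error bounds)
 + * as the shifts for the implicit restart.
 + */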
 +static void
 +F77_FUNC(dsgets,DSGETS)(int *     ishift, 
 +                        const char *    which, 
 +                        int *     kev, 
 +                        int *     np, 
 +                        double *  ritz, 
 +                        double *  bounds, 
 +                        double *  shifts)
 +{
 +    int c__1 = 1;
 +    int i__1, i__2;
 +    int kevd2;
 +
 +    --shifts;
 +    --bounds;
 +    --ritz;
 +
 +    if (!strncmp(which, "BE", 2)) {
 +      i__1 = *kev + *np;
 +      F77_FUNC(dsortr,DSORTR)("LA", &c__1, &i__1, &ritz[1], &bounds[1]);
 +      kevd2 = *kev / 2;
 +      if (*kev > 1) {
 +        i__1 = (kevd2<*np) ? kevd2 : *np;
 +        i__2 = (kevd2>*np) ? kevd2 : *np;
 +        F77_FUNC(dswap,DSWAP)(&i__1, &ritz[1], &c__1, 
 +                              &ritz[i__2 + 1], &c__1);
 +        i__1 = (kevd2<*np) ? kevd2 : *np;
 +        i__2 = (kevd2>*np) ? kevd2 : *np;
 +        F77_FUNC(dswap,DSWAP)(&i__1, &bounds[1], &c__1, 
 +                              &bounds[i__2 + 1], &c__1);
 +      }
 +
 +    } else {
 +      i__1 = *kev + *np;
 +      F77_FUNC(dsortr,DSORTR)(which, &c__1, &i__1, &ritz[1], &bounds[1]);
 +    }
 +
 +    if (*ishift == 1 && *np > 0) {
 +
 +      F77_FUNC(dsortr,DSORTR)("SM", &c__1, np, &bounds[1], &ritz[1]);
 +      F77_FUNC(dcopy,DCOPY)(np, &ritz[1], &c__1, &shifts[1], &c__1);
 +    }
 +
 +
 +    return;
 +} 
 +
 +
 +
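 +/* dsconv: counts the Ritz values that satisfy the convergence test
 + * bounds[i] <= tol * max(eps^(2/3), |ritz[i]|).
 + */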
 +static void 
 +F77_FUNC(dsconv,DSCONV)(int *     n, 
 +                        double *  ritz, 
 +                        double *  bounds,
 +                        double *  tol, 
 +                        int *     nconv)
 +{
 +    double c_b3 = 2.0/3.0; /* real exponent for eps^(2/3); 2/3 in integer arithmetic would be 0 */
 +    int i__1;
 +    double d__2, d__3;
 +
 +    int i__;
 +    double eps23, temp;
 + 
 +    --bounds;
 +    --ritz;
 +
 +    eps23 = GMX_DOUBLE_EPS;
 +    eps23 = pow(eps23, c_b3);
 +
 +    *nconv = 0;
 +    i__1 = *n;
 +    for (i__ = 1; i__ <= i__1; ++i__) {
 +
 +      d__2 = eps23;
 +      d__3 = fabs(ritz[i__]);
 +      temp = (d__2 > d__3) ? d__2 : d__3;
 +      if (bounds[i__] <= *tol * temp) {
 +      ++(*nconv);
 +      }
 +    }
 +
 +    return;
 +}
 +
 +
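 +/* dseigt: computes the eigenvalues of the current symmetric tridiagonal
 + * matrix and their error bounds rnorm*|last eigenvector component|,
 + * using dstqrb above.
 + */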
 +static void 
 +F77_FUNC(dseigt,DSEIGT)(double *  rnorm,
 +                        int *     n, 
 +                        double *  h__, 
 +                        int *     ldh, 
 +                        double *  eig, 
 +                        double *  bounds, 
 +                        double *  workl, 
 +                        int *     ierr)
 +{
 +    int c__1 = 1;
 +    int h_dim1, h_offset, i__1;
 +
 +    int k;
 +
 +
 +    --workl;
 +    --bounds;
 +    --eig;
 +    h_dim1 = *ldh;
 +    h_offset = 1 + h_dim1;
 +    h__ -= h_offset;
 +
 +   F77_FUNC(dcopy,DCOPY)(n, &h__[(h_dim1 << 1) + 1], &c__1, &eig[1], &c__1);
 +    i__1 = *n - 1;
 +   F77_FUNC(dcopy,DCOPY)(&i__1, &h__[h_dim1 + 2], &c__1, &workl[1], &c__1);
 +   F77_FUNC(dstqrb,DSTQRB)(n, &eig[1], &workl[1], &bounds[1], &workl[*n + 1], ierr);
 +   if (*ierr != 0) {
 +       goto L9000;
 +   }
 +   
 +   i__1 = *n;
 +   for (k = 1; k <= i__1; ++k) {
 +       bounds[k] = *rnorm * fabs(bounds[k]);
 +       
 +   }
 +   
 +   
 +L9000:
 +       return;
 +}
 +
 +
 +
 +
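 +/* dsaitr: extends the Lanczos factorization by np steps via reverse
 + * communication, with one step of iterative refinement of the
 + * orthogonalization when needed (the 0.717 re-orthogonalization test).
 + */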
 +static void 
 +F77_FUNC(dsaitr,DSAITR)(int *     ido, 
 +                        const char *    bmat, 
 +                        int *     n, 
 +                        int *     k,
 +                        int *     np, 
 +                        int *     mode, 
 +                        double *  resid, 
 +                        double *  rnorm, 
 +                        double *  v, 
 +                        int *     ldv, 
 +                        double *  h__, 
 +                        int *     ldh, 
 +                        int *     ipntr, 
 +                        double *  workd, 
 +                        int *     iwork, 
 +                        int *     info)
 +{
 +
 +    int c__0 = 0;
 +    int c__1 = 1;
 +    double c_b18 = 1.;
 +    double c_b42 = 0.;
 +    double c_b50 = -1.;
 +
 +    int h_dim1, h_offset, v_dim1, v_offset, i__1;
 +    int i__, jj;
 +    double temp1;
 +    int infol;
 +    double safmin,minval;
 +
 +
 +    --workd;
 +    --resid;
 +    v_dim1 = *ldv;
 +    v_offset = 1 + v_dim1;
 +    v -= v_offset;
 +    h_dim1 = *ldh;
 +    h_offset = 1 + h_dim1;
 +    h__ -= h_offset;
 +    --ipntr;
 +    --iwork;
 +    minval = GMX_DOUBLE_MIN;
 +    safmin = minval / GMX_DOUBLE_EPS;
 +
 +    if (*ido == 0) {
 +      *info = 0;
 +      iwork[5] = 0;
 +      iwork[6] = 0;
 +      iwork[4] = 0;
 +      iwork[2] = 0;
 +      iwork[3] = 0;
 +
 +      iwork[12] = *k + 1;
 +
 +      iwork[8] = 1;
 +      iwork[9] = iwork[8] + *n;
 +      iwork[10] = iwork[9] + *n;
 +    }
 +
 +    if (iwork[5] == 1) {
 +      goto L50;
 +    }
 +    if (iwork[6] == 1) {
 +      goto L60;
 +    }
 +    if (iwork[2] == 1) {
 +      goto L70;
 +    }
 +    if (iwork[3] == 1) {
 +      goto L90;
 +    }
 +    if (iwork[4] == 1) {
 +      goto L30;
 +    }
 +
 +L1000:
 +
 +
 +    if (*rnorm > 0.) {
 +      goto L40;
 +    }
 +
 +    iwork[11] = 1;
 +L20:
 +    iwork[4] = 1;
 +    *ido = 0;
 +L30:
 +
 +    F77_FUNC(dgetv0,DGETV0)(ido, bmat, &iwork[11], &c__0, n, &iwork[12], &v[v_offset], ldv,
 +                            &resid[1], rnorm, &ipntr[1], &workd[1], &iwork[21], &iwork[7]);
 +    if (*ido != 99) {
 +      goto L9000;
 +    }
 +    if (iwork[7] < 0) {
 +      ++iwork[11];
 +      if (iwork[11] <= 3) {
 +          goto L20;
 +      }
 +
 +      *info = iwork[12] - 1;
 +      *ido = 99;
 +      goto L9000;
 +    }
 +
 +L40:
 +
 +   F77_FUNC(dcopy,DCOPY)(n, &resid[1], &c__1, &v[iwork[12] * v_dim1 + 1], &c__1);
 +    if (*rnorm >= safmin) {
 +      temp1 = 1. / *rnorm;
 +      F77_FUNC(dscal,DSCAL)(n, &temp1, &v[iwork[12] * v_dim1 + 1], &c__1);
 +      F77_FUNC(dscal,DSCAL)(n, &temp1, &workd[iwork[8]], &c__1);
 +    } else {
 +
 +      F77_FUNC(dlascl,DLASCL)("General", &i__, &i__, rnorm, &c_b18, n, &c__1, &v[iwork[12] *
 +               v_dim1 + 1], n, &infol);
 +      F77_FUNC(dlascl,DLASCL)("General", &i__, &i__, rnorm, &c_b18, n, &c__1, &workd[iwork[
 +              8]], n, &infol);
 +    }
 +
 +    iwork[5] = 1;
 +   F77_FUNC(dcopy,DCOPY)(n, &v[iwork[12] * v_dim1 + 1], &c__1, &workd[iwork[10]], &c__1);
 +    ipntr[1] = iwork[10];
 +    ipntr[2] = iwork[9];
 +    ipntr[3] = iwork[8];
 +    *ido = 1;
 +
 +    goto L9000;
 +L50:
 +
 +
 +    iwork[5] = 0;
 +
 +   F77_FUNC(dcopy,DCOPY)(n, &workd[iwork[9]], &c__1, &resid[1], &c__1);
 +
 +    if (*mode == 2) {
 +      goto L65;
 +    }
 +    if (*bmat == 'G') {
 +      iwork[6] = 1;
 +      ipntr[1] = iwork[9];
 +      ipntr[2] = iwork[8];
 +      *ido = 2;
 +
 +      goto L9000;
 +    } else if (*bmat == 'I') {
 +      F77_FUNC(dcopy,DCOPY)(n, &resid[1], &c__1, &workd[iwork[8]], &c__1);
 +    }
 +L60:
 +
 +    iwork[6] = 0;
 +
 +L65:
 +    if (*mode == 2) {
 +
 +      workd[*n * 3 + 3] =F77_FUNC(ddot,DDOT)(n, &resid[1], &c__1, &workd[iwork[10]], &
 +              c__1);
 +      workd[*n * 3 + 3] = sqrt(fabs(workd[*n * 3 + 3]));
 +    } else if (*bmat == 'G') {
 +      workd[*n * 3 + 3] =F77_FUNC(ddot,DDOT)(n, &resid[1], &c__1, &workd[iwork[8]], &
 +              c__1);
 +      workd[*n * 3 + 3] = sqrt(fabs(workd[*n * 3 + 3]));
 +    } else if (*bmat == 'I') {
 +      workd[*n * 3 + 3] =F77_FUNC(dnrm2,DNRM2)(n, &resid[1], &c__1);
 +    }
 +
 +    if (*mode != 2) {
 +      F77_FUNC(dgemv,DGEMV)("T", n, &iwork[12], &c_b18, &v[v_offset], ldv, &workd[iwork[8]]
 +              , &c__1, &c_b42, &workd[iwork[9]], &c__1);
 +    } else if (*mode == 2) {
 +      F77_FUNC(dgemv,DGEMV)("T", n, &iwork[12], &c_b18, &v[v_offset], ldv, &workd[iwork[10]
 +              ], &c__1, &c_b42, &workd[iwork[9]], &c__1);
 +    }
 +
 +   F77_FUNC(dgemv,DGEMV)("N", n, &iwork[12], &c_b50, &v[v_offset], ldv, &workd[iwork[9]], &
 +          c__1, &c_b18, &resid[1], &c__1);
 +
 +    h__[iwork[12] + (h_dim1 << 1)] = workd[iwork[9] + iwork[12] - 1];
 +    if (iwork[12] == 1 || iwork[4] == 1) {
 +      h__[iwork[12] + h_dim1] = 0.;
 +    } else {
 +      h__[iwork[12] + h_dim1] = *rnorm;
 +    }
 +
 +    iwork[2] = 1;
 +    iwork[1] = 0;
 +
 +    if (*bmat == 'G') {
 +      F77_FUNC(dcopy,DCOPY)(n, &resid[1], &c__1, &workd[iwork[9]], &c__1);
 +      ipntr[1] = iwork[9];
 +      ipntr[2] = iwork[8];
 +      *ido = 2;
 +
 +      goto L9000;
 +    } else if (*bmat == 'I') {
 +      F77_FUNC(dcopy,DCOPY)(n, &resid[1], &c__1, &workd[iwork[8]], &c__1);
 +    }
 +L70:
 +
 +    iwork[2] = 0;
 +
 +    if (*bmat == 'G') {
 +      *rnorm =F77_FUNC(ddot,DDOT)(n, &resid[1], &c__1, &workd[iwork[8]], &c__1);
 +      *rnorm = sqrt(fabs(*rnorm));
 +    } else if (*bmat == 'I') {
 +      *rnorm =F77_FUNC(dnrm2,DNRM2)(n, &resid[1], &c__1);
 +    }
 +
 +    if (*rnorm > workd[*n * 3 + 3] * .717f) {
 +      goto L100;
 +    }
 +
 +L80:
 +
 +   F77_FUNC(dgemv,DGEMV)("T", n, &iwork[12], &c_b18, &v[v_offset], ldv, &workd[iwork[8]], &
 +          c__1, &c_b42, &workd[iwork[9]], &c__1);
 +
 +   F77_FUNC(dgemv,DGEMV)("N", n, &iwork[12], &c_b50, &v[v_offset], ldv, &workd[iwork[9]], &
 +          c__1, &c_b18, &resid[1], &c__1);
 +
 +    if (iwork[12] == 1 || iwork[4] == 1) {
 +      h__[iwork[12] + h_dim1] = 0.;
 +    }
 +    h__[iwork[12] + (h_dim1 << 1)] += workd[iwork[9] + iwork[12] - 1];
 +
 +    iwork[3] = 1;
 +    if (*bmat == 'G') {
 +      F77_FUNC(dcopy,DCOPY)(n, &resid[1], &c__1, &workd[iwork[9]], &c__1);
 +      ipntr[1] = iwork[9];
 +      ipntr[2] = iwork[8];
 +      *ido = 2;
 +
 +      goto L9000;
 +    } else if (*bmat == 'I') {
 +      F77_FUNC(dcopy,DCOPY)(n, &resid[1], &c__1, &workd[iwork[8]], &c__1);
 +    }
 +L90:
 +
 +
 +    if (*bmat == 'G') {
 +      workd[*n * 3 + 2] =F77_FUNC(ddot,DDOT)(n, &resid[1], &c__1, &workd[iwork[8]], &
 +              c__1);
 +      workd[*n * 3 + 2] = sqrt(fabs(workd[*n * 3 + 2]));
 +    } else if (*bmat == 'I') {
 +      workd[*n * 3 + 2] =F77_FUNC(dnrm2,DNRM2)(n, &resid[1], &c__1);
 +    }
 +
 +
 +    if (workd[*n * 3 + 2] > *rnorm * .717f) {
 +
 +      *rnorm = workd[*n * 3 + 2];
 +
 +    } else {
 +
 +      *rnorm = workd[*n * 3 + 2];
 +      ++iwork[1];
 +      if (iwork[1] <= 1) {
 +          goto L80;
 +      }
 +
 +      i__1 = *n;
 +      for (jj = 1; jj <= i__1; ++jj) {
 +          resid[jj] = 0.;
 +      }
 +      *rnorm = 0.;
 +    }
 +
 +L100:
 +
 +    iwork[4] = 0;
 +    iwork[3] = 0;
 +
 +    if (h__[iwork[12] + h_dim1] < 0.) {
 +      h__[iwork[12] + h_dim1] = -h__[iwork[12] + h_dim1];
 +      if (iwork[12] < *k + *np) {
 +         F77_FUNC(dscal,DSCAL)(n, &c_b50, &v[(iwork[12] + 1) * v_dim1 + 1], &c__1);
 +      } else {
 +         F77_FUNC(dscal,DSCAL)(n, &c_b50, &resid[1], &c__1);
 +      }
 +    }
 +
 +    ++iwork[12];
 +    if (iwork[12] > *k + *np) {
 +      *ido = 99;
 +
 +
 +      goto L9000;
 +    }
 +
 +    goto L1000;
 +
 +L9000:
 +    return;
 +}
 +
 +
 +
 +
 +
 +
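 +/* dsaup2: the intermediate-level driver implementing the implicitly
 + * restarted Lanczos iteration: extend the factorization (dsaitr),
 + * compute Ritz values and bounds (dseigt), test convergence (dsconv),
 + * select shifts (dsgets) and apply them (dsapps) until nev values
 + * converge or mxiter is reached.
 + */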
 +static void 
 +F77_FUNC(dsaup2,DSAUP2)(int *     ido, 
 +                        const char *    bmat,
 +                        int *     n,
 +                        const char *    which, 
 +                        int *     nev, 
 +                        int *     np,
 +                        double *  tol, 
 +                        double *  resid, 
 +                        int *     mode, 
 +                        int *     iupd, 
 +                        int *     ishift, 
 +                        int *     mxiter, 
 +                        double *  v,
 +                        int *     ldv, 
 +                        double *  h__, 
 +                        int *     ldh, 
 +                        double *  ritz,
 +                        double *  bounds, 
 +                        double *  q, 
 +                        int *     ldq, 
 +                        double *  workl,
 +                        int *     ipntr, 
 +                        double *  workd, 
 +                        int *     iwork, 
 +                        int *     info)
 +{
 +    double c_b3 = 2.0/3.0; /* real exponent for eps^(2/3); avoid integer division */
 +    int c__1 = 1;
 +    int c__0 = 0;
 +    
 +    int h_dim1, h_offset, q_dim1, q_offset, v_dim1, v_offset, i__1, i__2, 
 +          i__3;
 +    double d__2, d__3;
 +    int j;
 +    double eps23;
 +    int ierr;
 +    double temp;
 +    int nevd2;
 +    int nevm2;
 +    int nevbef;
 +    char wprime[2];
 +    int nptemp;
 +
 +    --workd;
 +    --resid;
 +    --workl;
 +    --bounds;
 +    --ritz;
 +    v_dim1 = *ldv;
 +    v_offset = 1 + v_dim1;
 +    v -= v_offset;
 +    h_dim1 = *ldh;
 +    h_offset = 1 + h_dim1;
 +    h__ -= h_offset;
 +    q_dim1 = *ldq;
 +    q_offset = 1 + q_dim1;
 +    q -= q_offset;
 +    --ipntr;
 +    --iwork;
 +    eps23 = GMX_DOUBLE_EPS;
 +    eps23 = pow(eps23, c_b3);
 +
 +    if (*ido == 0) {
 +
 +      iwork[41] = 1;
 +      iwork[42] = 3;
 +      iwork[43] = 5;
 +      iwork[44] = 7;
 +
 +      iwork[9] = *nev;
 +      iwork[10] = *np;
 +
 +      iwork[7] = iwork[9] + iwork[10];
 +      iwork[8] = 0;
 +      iwork[6] = 0;
 +
 +      iwork[2] = 1;
 +      iwork[4] = 0;
 +      iwork[5] = 0;
 +      iwork[1] = 0;
 +
 +      if (*info != 0) {
 +
 +          iwork[3] = 1;
 +          *info = 0;
 +      } else {
 +          iwork[3] = 0;
 +      }
 +    }
 +
 +    if (iwork[2] == 1) {
 +      F77_FUNC(dgetv0,DGETV0)(ido, bmat, &c__1, &iwork[3], n, &c__1, &v[v_offset], ldv, &
 +                            resid[1], &workd[*n * 3 + 1], &ipntr[1], &workd[1], &iwork[41]
 +              , info);
 +
 +      if (*ido != 99) {
 +          goto L9000;
 +      }
 +
 +      if (workd[*n * 3 + 1] == 0.) {
 +
 +          *info = -9;
 +          goto L1200;
 +      }
 +      iwork[2] = 0;
 +      *ido = 0;
 +    }
 +
 +    if (iwork[4] == 1) {
 +      goto L20;
 +    }
 +
 +    if (iwork[5] == 1) {
 +      goto L50;
 +    }
 +
 +    if (iwork[1] == 1) {
 +      goto L100;
 +    }
 +
 +    F77_FUNC(dsaitr,DSAITR)(ido, bmat, n, &c__0, &iwork[9], mode, &resid[1], &workd[*n * 3 + 
 +          1], &v[v_offset], ldv, &h__[h_offset], ldh, &ipntr[1], &workd[1], 
 +          &iwork[21], info);
 +
 +    if (*ido != 99) {
 +      goto L9000;
 +    }
 +
 +    if (*info > 0) {
 +
 +      *np = *info;
 +      *mxiter = iwork[6];
 +      *info = -9999;
 +      goto L1200;
 +    }
 +
 +L1000:
 +
 +    ++iwork[6];
 +
 +
 +    *ido = 0;
 +L20:
 +    iwork[4] = 1;
 +
 +    F77_FUNC(dsaitr,DSAITR)(ido, bmat, n, nev, np, mode, &resid[1], &workd[*n * 3 + 1], 
 +                            &v[v_offset], ldv, &h__[h_offset], ldh, &ipntr[1], &workd[1], 
 +                            &iwork[21], info);
 +
 +    if (*ido != 99) {
 +      goto L9000;
 +    }
 +
 +    if (*info > 0) {
 +
 +      *np = *info;
 +      *mxiter = iwork[6];
 +      *info = -9999;
 +      goto L1200;
 +    }
 +    iwork[4] = 0;
 +
 +    F77_FUNC(dseigt,DSEIGT)(&workd[*n * 3 + 1], &iwork[7], &h__[h_offset], ldh, &ritz[1], &
 +                            bounds[1], &workl[1], &ierr);
 +
 +    if (ierr != 0) {
 +      *info = -8;
 +      goto L1200;
 +    }
 +
 +   F77_FUNC(dcopy,DCOPY)(&iwork[7], &ritz[1], &c__1, &workl[iwork[7] + 1], &c__1);
 +   F77_FUNC(dcopy,DCOPY)(&iwork[7], &bounds[1], &c__1, &workl[(iwork[7] << 1) + 1], &c__1);
 +
 +   *nev = iwork[9];
 +   *np = iwork[10];
 +   F77_FUNC(dsgets,DSGETS)(ishift, which, nev, np, &ritz[1], &bounds[1], &workl[1]);
 +
 +   F77_FUNC(dcopy,DCOPY)(nev, &bounds[*np + 1], &c__1, &workl[*np + 1], &c__1);
 +   F77_FUNC(dsconv,DSCONV)(nev, &ritz[*np + 1], &workl[*np + 1], tol, &iwork[8]);
 +
 +    nptemp = *np;
 +    i__1 = nptemp;
 +    for (j = 1; j <= i__1; ++j) {
 +      if (bounds[j] == 0.) {
 +          --(*np);
 +          ++(*nev);
 +      }
 +    }
 +
 +    if (iwork[8] >= iwork[9] || iwork[6] > *mxiter || *np == 0) {
 +
 +      if (!strncmp(which, "BE", 2)) {
 +
 +      strncpy(wprime, "SA",2);
 +          F77_FUNC(dsortr,DSORTR)(wprime, &c__1, &iwork[7], &ritz[1], &bounds[1]);
 +          nevd2 = *nev / 2;
 +          nevm2 = *nev - nevd2;
 +          if (*nev > 1) {
 +            i__1 = (nevd2 < *np) ? nevd2 : *np;
 +            i__2 = iwork[7] - nevd2 + 1, i__3 = iwork[7] - *np + 1;
 +           F77_FUNC(dswap,DSWAP)(&i__1, &ritz[nevm2 + 1], &c__1, 
 +                   &ritz[((i__2>i__3) ? i__2 : i__3)], 
 +                   &c__1);
 +            i__1 = (nevd2 < *np) ? nevd2 : *np;
 +            i__2 = iwork[7] - nevd2 + 1, i__3 = iwork[7] - *np;
 +           F77_FUNC(dswap,DSWAP)(&i__1, &bounds[nevm2 + 1], &c__1, 
 +                   &bounds[((i__2>i__3) ? i__2 : i__3) + 1], 
 +                   &c__1);
 +          }
 +
 +      } else {
 +
 +      if (!strncmp(which, "LM", 2)) {
 +        strncpy(wprime, "SM", 2);
 +      }
 +      if (!strncmp(which, "SM", 2)) {
 +        strncpy(wprime, "LM", 2);
 +      }
 +      if (!strncmp(which, "LA", 2)) {
 +        strncpy(wprime, "SA", 2);
 +      }
 +      if (!strncmp(which, "SA", 2)) {
 +        strncpy(wprime, "LA", 2);
 +      }
 +      
 +      F77_FUNC(dsortr,DSORTR)(wprime, &c__1, &iwork[7], &ritz[1], &bounds[1]);
 +
 +      }
 +
 +      i__1 = iwork[9];
 +      for (j = 1; j <= i__1; ++j) {
 +        d__2 = eps23;
 +        d__3 = fabs(ritz[j]);
 +          temp = (d__2>d__3) ? d__2 : d__3;
 +          bounds[j] /= temp;
 +      }
 +
 +      strncpy(wprime, "LA",2);
 +      F77_FUNC(dsortr,DSORTR)(wprime, &c__1, &iwork[9], &bounds[1], &ritz[1]);
 +
 +      i__1 = iwork[9];
 +      for (j = 1; j <= i__1; ++j) {
 +        d__2 = eps23;
 +        d__3 = fabs(ritz[j]);
 +          temp = (d__2>d__3) ? d__2 : d__3;
 +          bounds[j] *= temp;
 +      }
 +
 +      if (!strncmp(which, "BE", 2)) {
 +
 +          strncpy(wprime, "LA", 2);
 +          F77_FUNC(dsortr,DSORTR)(wprime, &c__1, &iwork[8], &ritz[1], &bounds[1]);
 +
 +      } else {
 +        F77_FUNC(dsortr,DSORTR)(which, &c__1, &iwork[8], &ritz[1], &bounds[1]);
 +      
 +      }
 +
 +      h__[h_dim1 + 1] = workd[*n * 3 + 1];
 +
 +
 +      if (iwork[6] > *mxiter && iwork[8] < *nev) {
 +          *info = 1;
 +      }
 +      if (*np == 0 && iwork[8] < iwork[9]) {
 +          *info = 2;
 +      }
 +
 +      *np = iwork[8];
 +      goto L1100;
 +
 +    } else if (iwork[8] < *nev && *ishift == 1) {
 +      nevbef = *nev;
 +      i__1 = iwork[8], i__2 = *np / 2;
 +      *nev += (i__1 < i__2) ? i__1 : i__2;
 +      if (*nev == 1 && iwork[7] >= 6) {
 +          *nev = iwork[7] / 2;
 +      } else if (*nev == 1 && iwork[7] > 2) {
 +          *nev = 2;
 +      }
 +      *np = iwork[7] - *nev;
 +
 +
 +      if (nevbef < *nev) {
 +          F77_FUNC(dsgets,DSGETS)(ishift, which, nev, np, &ritz[1], &bounds[1], &workl[1]);
 +      }
 +
 +    }
 +
 +
 +    if (*ishift == 0) {
 +
 +      iwork[5] = 1;
 +      *ido = 3;
 +      goto L9000;
 +    }
 +
 +L50:
 +
 +    iwork[5] = 0;
 +
 +    if (*ishift == 0) {
 +      F77_FUNC(dcopy,DCOPY)(np, &workl[1], &c__1, &ritz[1], &c__1);
 +    }
 +
 +    F77_FUNC(dsapps,DSAPPS)(n, nev, np, &ritz[1], &v[v_offset], ldv, &h__[h_offset], ldh, &
 +          resid[1], &q[q_offset], ldq, &workd[1]);
 +
 +    iwork[1] = 1;
 +    if (*bmat == 'G') {
 +      F77_FUNC(dcopy,DCOPY)(n, &resid[1], &c__1, &workd[*n + 1], &c__1);
 +      ipntr[1] = *n + 1;
 +      ipntr[2] = 1;
 +      *ido = 2;
 +
 +      goto L9000;
 +    } else if (*bmat == 'I') {
 +      F77_FUNC(dcopy,DCOPY)(n, &resid[1], &c__1, &workd[1], &c__1);
 +    }
 +
 +L100:
 +
 +    if (*bmat == 'G') {
 +      workd[*n * 3 + 1] =F77_FUNC(ddot,DDOT)(n, &resid[1], &c__1, &workd[1], &c__1);
 +      workd[*n * 3 + 1] = sqrt(fabs(workd[*n * 3 + 1]));
 +    } else if (*bmat == 'I') {
 +      workd[*n * 3 + 1] =F77_FUNC(dnrm2,DNRM2)(n, &resid[1], &c__1);
 +    }
 +    iwork[1] = 0;
 +
 +    goto L1000;
 +
 +L1100:
 +
 +    *mxiter = iwork[6];
 +    *nev = iwork[8];
 +
 +L1200:
 +    *ido = 99;
 +
 +L9000:
 +    return;
 +
 +}
 +
 +
 +
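 +/* dsaupd: the top-level reverse-communication interface for the symmetric
 + * eigenproblem; checks the arguments, partitions workl, and delegates the
 + * iteration to dsaup2. See the header comment for the differences from
 + * stock ARPACK (workd of size 3*n+4 and the extra iwork argument).
 + */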
 +void 
 +F77_FUNC(dsaupd,DSAUPD)(int *     ido, 
 +                        const char *    bmat, 
 +                        int *     n, 
 +                        const char *    which, 
 +                        int *     nev, 
 +                        double *  tol, 
 +                        double *  resid, 
 +                        int *     ncv,
 +                        double *  v, 
 +                        int *     ldv, 
 +                        int *     iparam,
 +                        int *     ipntr, 
 +                        double *  workd, 
 +                        int *     iwork,
 +                        double *  workl, 
 +                        int *     lworkl,
 +                        int *     info)
 +{
 +    int v_dim1, v_offset, i__1, i__2;
 +    int j;
 +
 +    --workd;
 +    --resid;
 +    v_dim1 = *ldv;
 +    v_offset = 1 + v_dim1;
 +    v -= v_offset;
 +    --iparam;
 +    --ipntr;
 +    --iwork;
 +    --workl;
 +
 +    if (*ido == 0) {
 +
 +
 +      iwork[2] = 0;
 +      iwork[5] = iparam[1];
 +      iwork[10] = iparam[3];
 +      iwork[12] = iparam[4];
 +
 +      iwork[6] = 1;
 +      iwork[11] = iparam[7];
 +
 +
 +      if (*n <= 0) {
 +          iwork[2] = -1;
 +      } else if (*nev <= 0) {
 +          iwork[2] = -2;
 +      } else if (*ncv <= *nev || *ncv > *n) {
 +          iwork[2] = -3;
 +      }
 +
 +
 +      iwork[15] = *ncv - *nev;
 +
 +      if (iwork[10] <= 0) {
 +          iwork[2] = -4;
 +      }
 +      if (strncmp(which,"LM",2) && strncmp(which,"SM",2) && 
 +          strncmp(which,"LA",2) && strncmp(which,"SA",2) && 
 +          strncmp(which,"BE",2)) {
 +        iwork[2] = -5;
 +      }
 +      if (*bmat != 'I' && *bmat != 'G') {
 +          iwork[2] = -6;
 +      }
 +
 +      i__1 = *ncv;
 +      if (*lworkl < i__1 * i__1 + (*ncv << 3)) {
 +          iwork[2] = -7;
 +      }
 +      if (iwork[11] < 1 || iwork[11] > 5) {
 +          iwork[2] = -10;
 +      } else if (iwork[11] == 1 && *bmat == 'G') {
 +          iwork[2] = -11;
 +      } else if (iwork[5] < 0 || iwork[5] > 1) {
 +          iwork[2] = -12;
 +      } else if (*nev == 1 && !strncmp(which, "BE", 2)) {
 +          iwork[2] = -13;
 +      }
 +
 +      if (iwork[2] != 0) {
 +          *info = iwork[2];
 +          *ido = 99;
 +          goto L9000;
 +      }
 +
 +      if (iwork[12] <= 0) {
 +          iwork[12] = 1;
 +      }
 +      if (*tol <= 0.) {
 +        *tol = GMX_DOUBLE_EPS;
 +      }
 +
 +      iwork[15] = *ncv - *nev;
 +      iwork[13] = *nev;
 +      i__2 = *ncv;
 +      i__1 = i__2 * i__2 + (*ncv << 3);
 +      for (j = 1; j <= i__1; ++j) {
 +          workl[j] = 0.;
 +      }
 +
 +      iwork[8] = *ncv;
 +      iwork[9] = *ncv;
 +      iwork[3] = 1;
 +      iwork[16] = iwork[3] + (iwork[8] << 1);
 +      iwork[1] = iwork[16] + *ncv;
 +      iwork[4] = iwork[1] + *ncv;
 +      i__1 = *ncv;
 +      iwork[7] = iwork[4] + i__1 * i__1;
 +      iwork[14] = iwork[7] + *ncv * 3;
 +
 +      ipntr[4] = iwork[14];
 +      ipntr[5] = iwork[3];
 +      ipntr[6] = iwork[16];
 +      ipntr[7] = iwork[1];
 +      ipntr[11] = iwork[7];
 +    }
 +
 +    F77_FUNC(dsaup2,DSAUP2)(ido, bmat, n, which, &iwork[13], &iwork[15], tol, &resid[1], &
 +          iwork[11], &iwork[6], &iwork[5], &iwork[10], &v[v_offset], ldv, &
 +          workl[iwork[3]], &iwork[8], &workl[iwork[16]], &workl[iwork[1]], &
 +          workl[iwork[4]], &iwork[9], &workl[iwork[7]], &ipntr[1], &workd[1]
 +          , &iwork[21], info);
 +
 +    if (*ido == 3) {
 +      iparam[8] = iwork[15];
 +    }
 +    if (*ido != 99) {
 +      goto L9000;
 +    }
 +
 +    iparam[3] = iwork[10];
 +    iparam[5] = iwork[15];
 +
 +    if (*info < 0) {
 +      goto L9000;
 +    }
 +    if (*info == 2) {
 +      *info = 3;
 +    }
 +
 +L9000:
 +
 +    return;
 +
 +}
 +
 +
 +
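 +/* Post-processing routine: given the state left behind by dsaupd, extract
 +   the converged Ritz values (and, when rvec is set, the corresponding Ritz
 +   vectors) and undo the spectral transformation for the shift-invert
 +   ("SHIFTI"), Buckling ("BUCKLE") and Cayley ("CAYLEY") modes. */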
 +void
 +F77_FUNC(dseupd,DSEUPD)(int *     rvec, 
 +                        const char *    howmny, 
 +                        int *     select, 
 +                        double *  d__, 
 +                        double *  z__, 
 +                        int *     ldz, 
 +                        double *  sigma, 
 +                        const char *    bmat, 
 +                        int *     n, 
 +                        const char *    which, 
 +                        int *     nev, 
 +                        double *  tol, 
 +                        double *  resid, 
 +                        int *     ncv, 
 +                        double *  v,
 +                        int *     ldv, 
 +                        int *     iparam, 
 +                        int *     ipntr, 
 +                        double *  workd, 
 +                        double *  workl, 
 +                        int *     lworkl, 
 +                        int *     info)
 +{
 +    double c_b21 = 2.0/3.0;  /* exponent for eps23 = eps^(2/3); plain 2/3 is integer division (== 0) */
 +    int c__1 = 1;
 +    double c_b102 = 1.;
 +    int v_dim1, v_offset, z_dim1, z_offset, i__1;
 +    double d__1, d__2, d__3;
 +
 +    int j, k, ih, iq, iw, ibd, ihb, ihd, ldh, ilg, ldq, ism, irz;
 +    int mode;
 +    double eps23;
 +    int ierr;
 +    double temp;
 +    int next;
 +    char type__[6];
 +    int ritz;
 +    int reord;
 +    int nconv;
 +    double rnorm;
 +    double bnorm2;
 +    double thres1=0, thres2=0;
 +    int bounds;
 +    int ktrord;
 +    double tempbnd;
 +    int leftptr, rghtptr;
 +
 +
 +    --workd;
 +    --resid;
 +    z_dim1 = *ldz;
 +    z_offset = 1 + z_dim1;
 +    z__ -= z_offset;
 +    --d__;
 +    --select;
 +    v_dim1 = *ldv;
 +    v_offset = 1 + v_dim1;
 +    v -= v_offset;
 +    --iparam;
 +    --ipntr;
 +    --workl;
 +
 +    mode = iparam[7];
 +    nconv = iparam[5];
 +    *info = 0;
 +
 +    if (nconv == 0) {
 +      goto L9000;
 +    }
 +    ierr = 0;
 +
 +    if (nconv <= 0) {
 +      ierr = -14;
 +    }
 +    if (*n <= 0) {
 +      ierr = -1;
 +    }
 +    if (*nev <= 0) {
 +      ierr = -2;
 +    }
 +    if (*ncv <= *nev || *ncv > *n) {
 +      ierr = -3;
 +    }
 +    if (strncmp(which,"LM",2) && strncmp(which,"SM",2) && 
 +      strncmp(which,"LA",2) && strncmp(which,"SA",2) && 
 +      strncmp(which,"BE",2)) {
 +      ierr = -5;
 +    }
 +    if (*bmat != 'I' && *bmat != 'G') {
 +      ierr = -6;
 +    }
 +    if (*howmny != 'A' && *howmny != 'P' && 
 +          *howmny != 'S' && *rvec) {
 +      ierr = -15;
 +    }
 +    if (*rvec && *howmny == 'S') {
 +      ierr = -16;
 +    }
 +    i__1 = *ncv;
 +    if (*rvec && *lworkl < i__1 * i__1 + (*ncv << 3)) {
 +      ierr = -7;
 +    }
 +
 +    if (mode == 1 || mode == 2) {
 +      strncpy(type__, "REGULR",6);
 +    } else if (mode == 3) {
 +      strncpy(type__, "SHIFTI",6);
 +    } else if (mode == 4) {
 +      strncpy(type__, "BUCKLE",6);
 +    } else if (mode == 5) {
 +      strncpy(type__, "CAYLEY",6);
 +    } else {
 +      ierr = -10;
 +    }
 +    if (mode == 1 && *bmat == 'G') {
 +      ierr = -11;
 +    }
 +    if (*nev == 1 && !strncmp(which, "BE",2)) {
 +      ierr = -12;
 +    }
 +
 +    if (ierr != 0) {
 +      *info = ierr;
 +      goto L9000;
 +    }
 +
 +    ih = ipntr[5];
 +    ritz = ipntr[6];
 +    bounds = ipntr[7];
 +    ldh = *ncv;
 +    ldq = *ncv;
 +    ihd = bounds + ldh;
 +    ihb = ihd + ldh;
 +    iq = ihb + ldh;
 +    iw = iq + ldh * *ncv;
 +    next = iw + (*ncv << 1);
 +    ipntr[4] = next;
 +    ipntr[8] = ihd;
 +    ipntr[9] = ihb;
 +    ipntr[10] = iq;
 +
 +    irz = ipntr[11] + *ncv;
 +    ibd = irz + *ncv;
 +
 +
 +    eps23 = GMX_DOUBLE_EPS;
 +    eps23 = pow(eps23, c_b21);
 +
 +    rnorm = workl[ih];
 +    if (*bmat == 'I') {
 +      bnorm2 = rnorm;
 +    } else if (*bmat == 'G') {
 +      bnorm2 = F77_FUNC(dnrm2,DNRM2)(n, &workd[1], &c__1);
 +    }
 +
 +    if (*rvec) {
 +
 +        if (!strncmp(which,"LM",2) || !strncmp(which,"SM",2) ||
 +            !strncmp(which,"LA",2) || !strncmp(which,"SA",2)) {
 + 
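 +          /* intentionally empty: thres1/thres2 retain their zero
 +             initializers for these orderings */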
 +      } else if (!strncmp(which,"BE",2)) {
 +
 +
 +        ism = (*nev>nconv) ? *nev : nconv;
 +        ism /= 2;
 +        ilg = ism + 1;
 +        thres1 = workl[ism];
 +        thres2 = workl[ilg];
 +
 +
 +      }
 +
 +      reord = 0;
 +      ktrord = 0;
 +      i__1 = *ncv - 1;
 +      for (j = 0; j <= i__1; ++j) {
 +          select[j + 1] = 0;
 +          if (!strncmp(which,"LM",2)) {
 +              if (fabs(workl[irz + j]) >= fabs(thres1)) {
 +                d__2 = eps23;
 +                d__3 = fabs(workl[irz + j]);
 +                tempbnd = (d__2>d__3) ? d__2 : d__3;
 +                if (workl[ibd + j] <= *tol * tempbnd) {
 +                  select[j + 1] = 1;
 +                }
 +              }
 +          } else if (!strncmp(which,"SM",2)) {
 +              if (fabs(workl[irz + j]) <= fabs(thres1)) {
 +                d__2 = eps23;
 +                d__3 = fabs(workl[irz + j]);
 +                  tempbnd = (d__2>d__3) ? d__2 : d__3;
 +                  if (workl[ibd + j] <= *tol * tempbnd) {
 +                      select[j + 1] = 1;
 +                  }
 +              }
 +          } else if (!strncmp(which,"LA",2)) {
 +              if (workl[irz + j] >= thres1) {
 +                d__2 = eps23;
 +                d__3 = fabs(workl[irz + j]);
 +                  tempbnd = (d__2>d__3) ? d__2 : d__3;
 +                  if (workl[ibd + j] <= *tol * tempbnd) {
 +                      select[j + 1] = 1;
 +                  }
 +              }
 +          } else if (!strncmp(which,"SA",2)) {
 +              if (workl[irz + j] <= thres1) {
 +                d__2 = eps23;
 +                d__3 = fabs(workl[irz + j]);
 +                  tempbnd = (d__2>d__3) ? d__2 : d__3;
 +                  if (workl[ibd + j] <= *tol * tempbnd) {
 +                      select[j + 1] = 1;
 +                  }
 +              }
 +          } else if (!strncmp(which,"BE",2)) {
 +              if (workl[irz + j] <= thres1 || workl[irz + j] >= thres2) {
 +                d__2 = eps23;
 +                d__3 = fabs(workl[irz + j]);
 +                  tempbnd = (d__2>d__3) ? d__2 : d__3;
 +                  if (workl[ibd + j] <= *tol * tempbnd) {
 +                      select[j + 1] = 1;
 +                  }
 +              }
 +          }
 +          if (j + 1 > nconv) {
 +              reord = select[j + 1] || reord;
 +          }
 +          if (select[j + 1]) {
 +              ++ktrord;
 +          }
 +      }
 +
 +      i__1 = *ncv - 1;
 +      F77_FUNC(dcopy,DCOPY)(&i__1, &workl[ih + 1], &c__1, &workl[ihb], &c__1);
 +      F77_FUNC(dcopy,DCOPY)(ncv, &workl[ih + ldh], &c__1, &workl[ihd], &c__1);
 +
 +      F77_FUNC(dsteqr,DSTEQR)("Identity", ncv, &workl[ihd], &workl[ihb], &workl[iq], &ldq, &
 +              workl[iw], &ierr);
 +
 +      if (ierr != 0) {
 +          *info = -8;
 +          goto L9000;
 +      }
 +
 +
 +      if (reord) {
 +
 +          leftptr = 1;
 +          rghtptr = *ncv;
 +
 +          if (*ncv == 1) {
 +              goto L30;
 +          }
 +
 +L20:
 +          if (select[leftptr]) {
 +
 +              ++leftptr;
 +
 +          } else if (! select[rghtptr]) {
 +
 +              --rghtptr;
 +
 +          } else {
 +
 +              temp = workl[ihd + leftptr - 1];
 +              workl[ihd + leftptr - 1] = workl[ihd + rghtptr - 1];
 +              workl[ihd + rghtptr - 1] = temp;
 +              F77_FUNC(dcopy,DCOPY)(ncv, &workl[iq + *ncv * (leftptr - 1)], &c__1, &workl[
 +                      iw], &c__1);
 +              F77_FUNC(dcopy,DCOPY)(ncv, &workl[iq + *ncv * (rghtptr - 1)], &c__1, &workl[
 +                      iq + *ncv * (leftptr - 1)], &c__1);
 +              F77_FUNC(dcopy,DCOPY)(ncv, &workl[iw], &c__1, &workl[iq + *ncv * (rghtptr - 
 +                      1)], &c__1);
 +              ++leftptr;
 +              --rghtptr;
 +
 +          }
 +
 +          if (leftptr < rghtptr) {
 +              goto L20;
 +          }
 +
 +L30:
 +          ;
 +      }
 +
 +      F77_FUNC(dcopy,DCOPY)(&nconv, &workl[ihd], &c__1, &d__[1], &c__1);
 +
 +    } else {
 +
 +      F77_FUNC(dcopy,DCOPY)(&nconv, &workl[ritz], &c__1, &d__[1], &c__1);
 +      F77_FUNC(dcopy,DCOPY)(ncv, &workl[ritz], &c__1, &workl[ihd], &c__1);
 +
 +    }
 +    if (!strncmp(type__, "REGULR",6)) {
 +
 +      if (*rvec) {
 +          F77_FUNC(dsesrt,DSESRT)("LA", rvec, &nconv, &d__[1], ncv, &workl[iq], &ldq);
 +      } else {
 +         F77_FUNC(dcopy,DCOPY)(ncv, &workl[bounds], &c__1, &workl[ihb], &c__1);
 +      }
 +
 +    } else {
 +
 +      F77_FUNC(dcopy,DCOPY)(ncv, &workl[ihd], &c__1, &workl[iw], &c__1);
 +      if (!strncmp(type__, "SHIFTI", 6)) {
 +          i__1 = *ncv;
 +          for (k = 1; k <= i__1; ++k) {
 +              workl[ihd + k - 1] = 1. / workl[ihd + k - 1] + *sigma;
 +          }
 +      } else if (!strncmp(type__, "BUCKLE",6)) {
 +          i__1 = *ncv;
 +          for (k = 1; k <= i__1; ++k) {
 +              workl[ihd + k - 1] = *sigma * workl[ihd + k - 1] / (workl[ihd 
 +                      + k - 1] - 1.);
 +          }
 +      } else if (!strncmp(type__, "CAYLEY",6)) {
 +          i__1 = *ncv;
 +          for (k = 1; k <= i__1; ++k) {
 +              workl[ihd + k - 1] = *sigma * (workl[ihd + k - 1] + 1.) / (
 +                      workl[ihd + k - 1] - 1.);
 +          }
 +      }
 +
 +      F77_FUNC(dcopy,DCOPY)(&nconv, &workl[ihd], &c__1, &d__[1], &c__1);
 +      F77_FUNC(dsortr,DSORTR)("LA", &c__1, &nconv, &workl[ihd], &workl[iw]);
 +      if (*rvec) {
 +          F77_FUNC(dsesrt,DSESRT)("LA", rvec, &nconv, &d__[1], ncv, &workl[iq], &ldq);
 +      } else {
 +         F77_FUNC(dcopy,DCOPY)(ncv, &workl[bounds], &c__1, &workl[ihb], &c__1);
 +          d__1 = bnorm2 / rnorm;
 +         F77_FUNC(dscal,DSCAL)(ncv, &d__1, &workl[ihb], &c__1);
 +          F77_FUNC(dsortr,DSORTR)("LA", &c__1, &nconv, &d__[1], &workl[ihb]);
 +      }
 +
 +    }
 +
 +    if (*rvec && *howmny == 'A') {
 +
 +      F77_FUNC(dgeqr2,DGEQR2)(ncv, &nconv, &workl[iq], &ldq, &workl[iw + *ncv], &workl[ihb],
 +               &ierr);
 +
 +      F77_FUNC(dorm2r,DORM2R)("Right", "Notranspose", n, ncv, &nconv, &workl[iq], &ldq, &
 +              workl[iw + *ncv], &v[v_offset], ldv, &workd[*n + 1], &ierr);
 +      F77_FUNC(dlacpy,DLACPY)("All", n, &nconv, &v[v_offset], ldv, &z__[z_offset], ldz);
 +
 +      i__1 = *ncv - 1;
 +      for (j = 1; j <= i__1; ++j) {
 +          workl[ihb + j - 1] = 0.;
 +      }
 +      workl[ihb + *ncv - 1] = 1.;
 +      F77_FUNC(dorm2r,DORM2R)("Left", "Transpose", ncv, &c__1, &nconv, &workl[iq], &ldq, &
 +              workl[iw + *ncv], &workl[ihb], ncv, &temp, &ierr);
 +
 +    } else if (*rvec && *howmny == 'S') {
 +
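 +      /* not reached: rvec with howmny == 'S' is rejected above (ierr = -16) */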
 +    }
 +
 +    if (!strncmp(type__, "REGULR",6) && *rvec) {
 +
 +      i__1 = *ncv;
 +      for (j = 1; j <= i__1; ++j) {
 +          workl[ihb + j - 1] = rnorm * fabs(workl[ihb + j - 1]);
 +      }
 +
 +    } else if (strncmp(type__, "REGULR",6) && *rvec) {
 +
 +      F77_FUNC(dscal,DSCAL)(ncv, &bnorm2, &workl[ihb], &c__1);
 +      if (!strncmp(type__, "SHIFTI",6)) {
 +
 +          i__1 = *ncv;
 +          for (k = 1; k <= i__1; ++k) {
 +              d__2 = workl[iw + k - 1];
 +              workl[ihb + k - 1] = fabs(workl[ihb + k - 1])/(d__2 * d__2);
 +          }
 +
 +      } else if (!strncmp(type__, "BUCKLE",6)) {
 +
 +          i__1 = *ncv;
 +          for (k = 1; k <= i__1; ++k) {
 +              d__2 = workl[iw + k - 1] - 1.;
 +              workl[ihb + k - 1] = *sigma * fabs(workl[ihb + k - 1])/(d__2 * d__2);
 +          }
 +
 +      } else if (!strncmp(type__, "CAYLEY",6)) {
 +
 +          i__1 = *ncv;
 +          for (k = 1; k <= i__1; ++k) {
 +            workl[ihb + k - 1] = fabs(workl[ihb + k - 1] / workl[iw + k - 1] * (workl[iw + k - 1] - 1.));
 +            
 +          }
 +
 +      }
 +
 +    }
 +
 +    if (*rvec && (!strncmp(type__, "SHIFTI",6) || !strncmp(type__, "CAYLEY",6))) {
 +
 +      i__1 = nconv - 1;
 +      for (k = 0; k <= i__1; ++k) {
 +          workl[iw + k] = workl[iq + k * ldq + *ncv - 1] / workl[iw + k];
 +      }
 +
 +    } else if (*rvec && !strncmp(type__, "BUCKLE", 6)) {
 +
 +      i__1 = nconv - 1;
 +      for (k = 0; k <= i__1; ++k) {
 +          workl[iw + k] = workl[iq + k * ldq + *ncv - 1] / (workl[iw + k] - 
 +                  1.);
 +      }
 +
 +    }
 +
 +    if (strncmp(type__, "REGULR",6)) {
 +      F77_FUNC(dger,DGER)(n, &nconv, &c_b102, &resid[1], &c__1, &workl[iw], &c__1, &z__[
 +              z_offset], ldz);
 +    }
 +
 +L9000:
 +
 +    return;
 +
 +}
 +
 +
 +
 +
 +
 +/* Selected single precision arpack routines */
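 +/* A minimal calling sketch (not part of this file) for the reverse
 + * communication interface below. The matrix-vector product my_av() and the
 + * workspace sizing/allocation are the caller's responsibility; iparam[0]=1
 + * (exact shifts), iparam[2]=max iterations and iparam[6]=1 (mode 1) are
 + * assumed to have been set beforehand:
 + *
 + *     int ido = 0, info = 0;
 + *     do {
 + *         F77_FUNC(ssaupd,SSAUPD)(&ido, "I", &n, "SA", &nev, &tol, resid,
 + *                                 &ncv, v, &ldv, iparam, ipntr, workd,
 + *                                 iwork, workl, &lworkl, &info);
 + *         if (ido == -1 || ido == 1)
 + *             my_av(n, workd + ipntr[0] - 1, workd + ipntr[1] - 1);
 + *     } while (ido != 99);
 + *
 + * followed by a call to the single-precision counterpart of dseupd to
 + * recover the eigenvalues and eigenvectors.
 + */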
 +
 +
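 +/* Single-precision counterpart of dstqrb: computes all eigenvalues of the
 +   symmetric tridiagonal matrix (diagonal d__, off-diagonal e) and the last
 +   component of each eigenvector, returned in z__. */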
 +static void 
 +F77_FUNC(sstqrb,SSTQRB)(int *      n, 
 +                        float *   d__, 
 +                        float *   e, 
 +                        float *   z__, 
 +                        float *   work, 
 +                        int *      info)
 +{
 +    int i__1, i__2;
 +    float d__1, d__2;
 +    int c__0 = 0;
 +    int c__1 = 1;
 +    float c_b31 = 1.;
 +
 +    float b, c__, f, g;
 +    int i__, j, k, l, m;
 +    float p, r__, s;
 +    int l1, ii, mm, lm1, mm1, nm1;
 +    float rt1, rt2, eps;
 +    int lsv;
 +    float tst, eps2;
 +    int lend, jtot, lendm1, lendp1, iscale;
 +
 +    int lendsv, nmaxit, icompz;
 +    float ssfmax, ssfmin,safmin,minval,safmax,anorm;
 +
 +
 +    --work;
 +    --z__;
 +    --e;
 +    --d__;
 +
 +    *info = 0;
 +
 +    icompz = 2;
 +
 +    if (*n == 0) {
 +      return;
 +    }
 +
 +    if (*n == 1) {
 +      if (icompz == 2) {
 +          z__[1] = 1.;
 +      }
 +      return;
 +    }
 +
 +    eps = GMX_FLOAT_EPS;
 +
 +    d__1 = eps;
 +    eps2 = d__1 * d__1;
 +    minval = GMX_FLOAT_MIN;
 +    safmin = minval / GMX_FLOAT_EPS;
 +    safmax = 1. / safmin;
 +    ssfmax = sqrt(safmax) / 3.;
 +    ssfmin = sqrt(safmin) / eps2;
 +
 +    if (icompz == 2) {
 +      i__1 = *n - 1;
 +      for (j = 1; j <= i__1; ++j) {
 +          z__[j] = 0.;
 +
 +      }
 +      z__[*n] = 1.;
 +    }
 +
 +    nmaxit = *n * 30;
 +    jtot = 0;
 +
 +    l1 = 1;
 +    nm1 = *n - 1;
 +
 +L10:
 +    if (l1 > *n) {
 +      goto L160;
 +    }
 +    if (l1 > 1) {
 +      e[l1 - 1] = 0.;
 +    }
 +    if (l1 <= nm1) {
 +      i__1 = nm1;
 +      for (m = l1; m <= i__1; ++m) {
 +        tst = fabs(e[m]);
 +          if (tst == 0.) {
 +              goto L30;
 +          }
 +          if (tst <= sqrt(fabs(d__[m])) * sqrt(fabs(d__[m+1])) * eps) {
 +            e[m] = 0.;
 +            goto L30;
 +          }
 +      }
 +    }
 +    m = *n;
 +
 +L30:
 +    l = l1;
 +    lsv = l;
 +    lend = m;
 +    lendsv = lend;
 +    l1 = m + 1;
 +    if (lend == l) {
 +      goto L10;
 +    }
 +
 +    i__1 = lend - l + 1;
 +    anorm = F77_FUNC(slanst,SLANST)("i", &i__1, &d__[l], &e[l]);
 +    iscale = 0;
 +    if (anorm == 0.) {
 +      goto L10;
 +    }
 +    if (anorm > ssfmax) {
 +      iscale = 1;
 +      i__1 = lend - l + 1;
 +      F77_FUNC(slascl,SLASCL)("g", &c__0, &c__0, &anorm, &ssfmax, &i__1, &c__1, &d__[l], n, 
 +              info);
 +      i__1 = lend - l;
 +      F77_FUNC(slascl,SLASCL)("g", &c__0, &c__0, &anorm, &ssfmax, &i__1, &c__1, &e[l], n, 
 +              info);
 +    } else if (anorm < ssfmin) {
 +      iscale = 2;
 +      i__1 = lend - l + 1;
 +      F77_FUNC(slascl,SLASCL)("g", &c__0, &c__0, &anorm, &ssfmin, &i__1, &c__1, &d__[l], n, 
 +              info);
 +      i__1 = lend - l;
 +      F77_FUNC(slascl,SLASCL)("g", &c__0, &c__0, &anorm, &ssfmin, &i__1, &c__1, &e[l], n, 
 +              info);
 +    }
 +
 +    if (fabs(d__[lend]) < fabs(d__[l])) {
 +      lend = lsv;
 +      l = lendsv;
 +    }
 +
 +    if (lend > l) {
 +
 +L40:
 +      if (l != lend) {
 +          lendm1 = lend - 1;
 +          i__1 = lendm1;
 +          for (m = l; m <= i__1; ++m) {
 +              d__2 = fabs(e[m]);
 +              tst = d__2 * d__2;
 +              if (tst <= eps2 * fabs(d__[m]) * fabs(d__[m + 1]) + safmin) {
 +                  goto L60;
 +              }
 +          }
 +      }
 +
 +      m = lend;
 +
 +L60:
 +      if (m < lend) {
 +          e[m] = 0.;
 +      }
 +      p = d__[l];
 +      if (m == l) {
 +          goto L80;
 +      }
 +
 +      if (m == l + 1) {
 +          if (icompz > 0) {
 +              F77_FUNC(slaev2,SLAEV2)(&d__[l], &e[l], &d__[l + 1], &rt1, &rt2, &c__, &s);
 +              work[l] = c__;
 +              work[*n - 1 + l] = s;
 +
 +              tst = z__[l + 1];
 +              z__[l + 1] = c__ * tst - s * z__[l];
 +              z__[l] = s * tst + c__ * z__[l];
 +          } else {
 +              F77_FUNC(slae2,SLAE2)(&d__[l], &e[l], &d__[l + 1], &rt1, &rt2);
 +          }
 +          d__[l] = rt1;
 +          d__[l + 1] = rt2;
 +          e[l] = 0.;
 +          l += 2;
 +          if (l <= lend) {
 +              goto L40;
 +          }
 +          goto L140;
 +      }
 +
 +      if (jtot == nmaxit) {
 +          goto L140;
 +      }
 +      ++jtot;
 +
 +      g = (d__[l + 1] - p) / (e[l] * 2.);
 +      r__ = F77_FUNC(slapy2,SLAPY2)(&g, &c_b31);
 +      g = d__[m] - p + e[l] / (g + ((g>0) ? r__ : -r__ ));
 +
 +      s = 1.;
 +      c__ = 1.;
 +      p = 0.;
 +
 +      mm1 = m - 1;
 +      i__1 = l;
 +      for (i__ = mm1; i__ >= i__1; --i__) {
 +          f = s * e[i__];
 +          b = c__ * e[i__];
 +         F77_FUNC(slartg,SLARTG)(&g, &f, &c__, &s, &r__);
 +          if (i__ != m - 1) {
 +              e[i__ + 1] = r__;
 +          }
 +          g = d__[i__ + 1] - p;
 +          r__ = (d__[i__] - g) * s + c__ * 2. * b;
 +          p = s * r__;
 +          d__[i__ + 1] = g + p;
 +          g = c__ * r__ - b;
 +
 +          if (icompz > 0) {
 +              work[i__] = c__;
 +              work[*n - 1 + i__] = -s;
 +          }
 +
 +      }
 +
 +      if (icompz > 0) {
 +          mm = m - l + 1;
 +
 +         F77_FUNC(slasr,SLASR)("r", "v", "b", &c__1, &mm, &work[l], &work[*n - 1 + l], &
 +                  z__[l], &c__1);
 +      }
 +
 +      d__[l] -= p;
 +      e[l] = g;
 +      goto L40;
 +
 +L80:
 +      d__[l] = p;
 +
 +      ++l;
 +      if (l <= lend) {
 +          goto L40;
 +      }
 +      goto L140;
 +
 +    } else {
 +
 +L90:
 +      if (l != lend) {
 +          lendp1 = lend + 1;
 +          i__1 = lendp1;
 +          for (m = l; m >= i__1; --m) {
 +              d__2 = fabs(e[m - 1]);
 +              tst = d__2 * d__2;
 +              if (tst <= eps2 * fabs(d__[m]) * fabs(d__[m- 1]) + safmin) {
 +                  goto L110;
 +              }
 +          }
 +      }
 +
 +      m = lend;
 +
 +L110:
 +      if (m > lend) {
 +          e[m - 1] = 0.;
 +      }
 +      p = d__[l];
 +      if (m == l) {
 +          goto L130;
 +      }
 +
 +      if (m == l - 1) {
 +          if (icompz > 0) {
 +              F77_FUNC(slaev2,SLAEV2)(&d__[l - 1], &e[l - 1], &d__[l], &rt1, &rt2, &c__, &s);
 +
 +              tst = z__[l];
 +              z__[l] = c__ * tst - s * z__[l - 1];
 +              z__[l - 1] = s * tst + c__ * z__[l - 1];
 +          } else {
 +              F77_FUNC(slae2,SLAE2)(&d__[l - 1], &e[l - 1], &d__[l], &rt1, &rt2);
 +          }
 +          d__[l - 1] = rt1;
 +          d__[l] = rt2;
 +          e[l - 1] = 0.;
 +          l += -2;
 +          if (l >= lend) {
 +              goto L90;
 +          }
 +          goto L140;
 +      }
 +
 +      if (jtot == nmaxit) {
 +          goto L140;
 +      }
 +      ++jtot;
 +
 +
 +      g = (d__[l - 1] - p) / (e[l - 1] * 2.);
 +      r__ = F77_FUNC(slapy2,SLAPY2)(&g, &c_b31);
 +      g = d__[m] - p + e[l - 1] / (g + ((g>0) ? r__ : -r__ ));
 +
 +      s = 1.;
 +      c__ = 1.;
 +      p = 0.;
 +
 +      lm1 = l - 1;
 +      i__1 = lm1;
 +      for (i__ = m; i__ <= i__1; ++i__) {
 +          f = s * e[i__];
 +          b = c__ * e[i__];
 +         F77_FUNC(slartg,SLARTG)(&g, &f, &c__, &s, &r__);
 +          if (i__ != m) {
 +              e[i__ - 1] = r__;
 +          }
 +          g = d__[i__] - p;
 +          r__ = (d__[i__ + 1] - g) * s + c__ * 2. * b;
 +          p = s * r__;
 +          d__[i__] = g + p;
 +          g = c__ * r__ - b;
 +
 +          if (icompz > 0) {
 +              work[i__] = c__;
 +              work[*n - 1 + i__] = s;
 +          }
 +
 +      }
 +
 +      if (icompz > 0) {
 +          mm = l - m + 1;
 +
 +         F77_FUNC(slasr,SLASR)("r", "v", "f", &c__1, &mm, &work[m], &work[*n - 1 + m], &
 +                  z__[m], &c__1);
 +      }
 +
 +      d__[l] -= p;
 +      e[lm1] = g;
 +      goto L90;
 +
 +L130:
 +      d__[l] = p;
 +
 +      --l;
 +      if (l >= lend) {
 +          goto L90;
 +      }
 +      goto L140;
 +
 +    }
 +
 +L140:
 +    if (iscale == 1) {
 +      i__1 = lendsv - lsv + 1;
 +      F77_FUNC(slascl,SLASCL)("g", &c__0, &c__0, &ssfmax, &anorm, &i__1, &c__1, &d__[lsv], 
 +              n, info);
 +      i__1 = lendsv - lsv;
 +      F77_FUNC(slascl,SLASCL)("g", &c__0, &c__0, &ssfmax, &anorm, &i__1, &c__1, &e[lsv], n, 
 +              info);
 +    } else if (iscale == 2) {
 +      i__1 = lendsv - lsv + 1;
 +      F77_FUNC(slascl,SLASCL)("g", &c__0, &c__0, &ssfmin, &anorm, &i__1, &c__1, &d__[lsv], 
 +              n, info);
 +      i__1 = lendsv - lsv;
 +      F77_FUNC(slascl,SLASCL)("g", &c__0, &c__0, &ssfmin, &anorm, &i__1, &c__1, &e[lsv], n, 
 +              info);
 +    }
 +
 +    if (jtot < nmaxit) {
 +      goto L10;
 +    }
 +    i__1 = *n - 1;
 +    for (i__ = 1; i__ <= i__1; ++i__) {
 +      if (e[i__] != 0.) {
 +          ++(*info);
 +      }
 +    }
 +    goto L190;
 +
 +L160:
 +    if (icompz == 0) {
 +
 +      F77_FUNC(slasrt,SLASRT)("i", n, &d__[1], info);
 +
 +    } else {
 +
 +      i__1 = *n;
 +      for (ii = 2; ii <= i__1; ++ii) {
 +          i__ = ii - 1;
 +          k = i__;
 +          p = d__[i__];
 +          i__2 = *n;
 +          for (j = ii; j <= i__2; ++j) {
 +              if (d__[j] < p) {
 +                  k = j;
 +                  p = d__[j];
 +              }
 +          }
 +          if (k != i__) {
 +              d__[k] = d__[i__];
 +              d__[i__] = p;
 +
 +              p = z__[k];
 +              z__[k] = z__[i__];
 +              z__[i__] = p;
 +          }
 +      }
 +    }
 +
 +L190:
 +    return;
 +
 +}
 +
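 +/* Generates the starting residual vector for the Lanczos process (random
 +   unless initv is set). For generalized problems (bmat == 'G') the vector
 +   is forced into the range of the operator via reverse communication, and
 +   on restarts (j > 1) it is orthogonalized against the current basis. */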
 +static void 
 +F77_FUNC(sgetv0,SGETV0)(int *     ido, 
 +                        const char *    bmat, 
 +                        int *     itry, 
 +                        int *     initv, 
 +                        int *     n, 
 +                        int *     j, 
 +                        float *  v, 
 +                        int *     ldv, 
 +                        float *  resid, 
 +                        float *  rnorm, 
 +                        int *     ipntr, 
 +                        float *  workd, 
 +                        int *     iwork, 
 +                        int *     ierr)
 +{
 +    int c__1 = 1;
 +    float c_b22 = 1.;
 +    float c_b24 = 0.;
 +    float c_b27 = -1.;
 +    int v_dim1, v_offset, i__1;
 +
 +    int jj;
 +    int idist;
 +
 +    --workd;
 +    --resid;
 +    v_dim1 = *ldv;
 +    v_offset = 1 + v_dim1;
 +    v -= v_offset;
 +    --ipntr;
 +    --iwork;
 +
 +    if (*ido == 0) {
 +
 +      *ierr = 0;
 +      iwork[7] = 0;
 +      iwork[5] = 0;
 +      iwork[6] = 0;
 +
 +      if (! (*initv)) {
 +          idist = 2;
 +        F77_FUNC(slarnv,SLARNV)(&idist, &iwork[1], n, &resid[1]);
 +      }
 +
 +      if (*bmat == 'G') {
 +          ipntr[1] = 1;
 +          ipntr[2] = *n + 1;
 +        F77_FUNC(scopy,SCOPY)(n, &resid[1], &c__1, &workd[1], &c__1);
 +          *ido = -1;
 +          goto L9000;
 +      }
 +    }
 +
 +    if (iwork[5] == 1) {
 +      goto L20;
 +    }
 +
 +    if (iwork[6] == 1) {
 +      goto L40;
 +    }
 +
 +    iwork[5] = 1;
 +    if (*bmat == 'G') {
 +      F77_FUNC(scopy,SCOPY)(n, &workd[*n + 1], &c__1, &resid[1], &c__1);
 +      ipntr[1] = *n + 1;
 +      ipntr[2] = 1;
 +      *ido = 2;
 +      goto L9000;
 +    } else if (*bmat == 'I') {
 +      F77_FUNC(scopy,SCOPY)(n, &resid[1], &c__1, &workd[1], &c__1);
 +    }
 +
 +L20:
 +
 +
 +    iwork[5] = 0;
 +    if (*bmat == 'G') {
 +      workd[*n * 3 + 4] = F77_FUNC(sdot,SDOT)(n, &resid[1], &c__1, &workd[1], &c__1);
 +      workd[*n * 3 + 4] = sqrt(fabs(workd[*n * 3 + 4]));
 +    } else if (*bmat == 'I') {
 +      workd[*n * 3 + 4] = F77_FUNC(snrm2,SNRM2)(n, &resid[1], &c__1);
 +    }
 +    *rnorm = workd[*n * 3 + 4];
 +
 +    if (*j == 1) {
 +      goto L50;
 +    }
 +    iwork[6] = 1;
 +L30:
 +
 +    i__1 = *j - 1;
 +    F77_FUNC(sgemv,SGEMV)("T", n, &i__1, &c_b22, &v[v_offset], ldv, &workd[1], &c__1, &c_b24,
 +                          &workd[*n + 1], &c__1);
 +    i__1 = *j - 1;
 +    F77_FUNC(sgemv,SGEMV)("N", n, &i__1, &c_b27, &v[v_offset], ldv, &workd[*n + 1], &c__1, &
 +                          c_b22, &resid[1], &c__1);
 +
 +    if (*bmat == 'G') {
 +      F77_FUNC(scopy,SCOPY)(n, &resid[1], &c__1, &workd[*n + 1], &c__1);
 +      ipntr[1] = *n + 1;
 +      ipntr[2] = 1;
 +      *ido = 2;
 +      goto L9000;
 +    } else if (*bmat == 'I') {
 +      F77_FUNC(scopy,SCOPY)(n, &resid[1], &c__1, &workd[1], &c__1);
 +    }
 +
 +L40:
 +
 +    if (*bmat == 'G') {
 +      *rnorm = F77_FUNC(sdot,SDOT)(n, &resid[1], &c__1, &workd[1], &c__1);
 +      *rnorm = sqrt(fabs(*rnorm));
 +    } else if (*bmat == 'I') {
 +      *rnorm = F77_FUNC(snrm2,SNRM2)(n, &resid[1], &c__1);
 +    }
 +
 +    if (*rnorm > workd[*n * 3 + 4] * .717f) {
 +      goto L50;
 +    }
 +
 +    ++iwork[7];
 +    if (iwork[7] <= 1) {
 +
 +      workd[*n * 3 + 4] = *rnorm;
 +      goto L30;
 +    } else {
 +
 +      i__1 = *n;
 +      for (jj = 1; jj <= i__1; ++jj) {
 +          resid[jj] = 0.;
 +      }
 +      *rnorm = 0.;
 +      *ierr = -1;
 +    }
 +
 +L50:
 +
 +    *ido = 99;
 +
 +L9000:
 +    return;
 +}
 +
 +
 +
 +
 +
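 +/* Applies the np implicit shifts in shift[] to the (kev+np)-step Lanczos
 +   factorization with a bulge-chasing sweep of Givens rotations, compressing
 +   it to a kev-step factorization. */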
 +static void 
 +F77_FUNC(ssapps,SSAPPS)(int *     n, 
 +                        int *     kev, 
 +                        int *     np, 
 +                        float *  shift, 
 +                        float *  v, 
 +                        int *     ldv, 
 +                        float *  h__, 
 +                        int *     ldh, 
 +                        float *  resid, 
 +                        float *  q, 
 +                        int *     ldq, 
 +                        float *  workd)
 +{
 +    float c_b4 = 0.;
 +    float c_b5 = 1.;
 +    float c_b14 = -1.;
 +    int c__1 = 1;
 +    int h_dim1, h_offset, q_dim1, q_offset, v_dim1, v_offset, i__1, i__2, 
 +          i__3, i__4;
 +    float c__, f, g;
 +    int i__, j;
 +    float r__, s, a1, a2, a3, a4;
 +    int jj;
 +    float big;
 +    int iend, itop;
 +    float epsmch;
 +    int istart, kplusp;
 +
 +    --workd;
 +    --resid;
 +    --shift;
 +    v_dim1 = *ldv;
 +    v_offset = 1 + v_dim1;
 +    v -= v_offset;
 +    h_dim1 = *ldh;
 +    h_offset = 1 + h_dim1;
 +    h__ -= h_offset;
 +    q_dim1 = *ldq;
 +    q_offset = 1 + q_dim1;
 +    q -= q_offset;
 +
 +    epsmch = GMX_FLOAT_EPS;
 +    itop = 1;
 +
 +
 +    kplusp = *kev + *np;
 +
 +   F77_FUNC(slaset,SLASET)("All", &kplusp, &kplusp, &c_b4, &c_b5, &q[q_offset], ldq);
 +
 +    if (*np == 0) {
 +      goto L9000;
 +    }
 +
 +    i__1 = *np;
 +    for (jj = 1; jj <= i__1; ++jj) {
 +
 +      istart = itop;
 +
 +L20:
 +
 +      i__2 = kplusp - 1;
 +      for (i__ = istart; i__ <= i__2; ++i__) {
 +        big = fabs(h__[i__ + (h_dim1*2)]) + fabs(h__[i__ + 1 + (h_dim1*2)]);
 +        if (h__[i__ + 1 + h_dim1] <= epsmch * big) {
 +          h__[i__ + 1 + h_dim1] = 0.;
 +          iend = i__;
 +          goto L40;
 +        }
 +      }
 +      iend = kplusp;
 +L40:
 +
 +      if (istart < iend) {
 +
 +          f = h__[istart + (h_dim1 << 1)] - shift[jj];
 +          g = h__[istart + 1 + h_dim1];
 +         F77_FUNC(slartg,SLARTG)(&f, &g, &c__, &s, &r__);
 +
 +          a1 = c__ * h__[istart + (h_dim1 << 1)] + s * h__[istart + 1 + 
 +                  h_dim1];
 +          a2 = c__ * h__[istart + 1 + h_dim1] + s * h__[istart + 1 + (
 +                  h_dim1 << 1)];
 +          a4 = c__ * h__[istart + 1 + (h_dim1 << 1)] - s * h__[istart + 1 + 
 +                  h_dim1];
 +          a3 = c__ * h__[istart + 1 + h_dim1] - s * h__[istart + (h_dim1 << 
 +                  1)];
 +          h__[istart + (h_dim1 << 1)] = c__ * a1 + s * a2;
 +          h__[istart + 1 + (h_dim1 << 1)] = c__ * a4 - s * a3;
 +          h__[istart + 1 + h_dim1] = c__ * a3 + s * a4;
 +
 +          i__3 = istart + jj;
 +          i__2 = (i__3<kplusp) ? i__3 : kplusp;
 +          for (j = 1; j <= i__2; ++j) {
 +              a1 = c__ * q[j + istart * q_dim1] + s * q[j + (istart + 1) * 
 +                      q_dim1];
 +              q[j + (istart + 1) * q_dim1] = -s * q[j + istart * q_dim1] + 
 +                      c__ * q[j + (istart + 1) * q_dim1];
 +              q[j + istart * q_dim1] = a1;
 +
 +          }
 +
 +          i__2 = iend - 1;
 +          for (i__ = istart + 1; i__ <= i__2; ++i__) {
 +
 +              f = h__[i__ + h_dim1];
 +              g = s * h__[i__ + 1 + h_dim1];
 +
 +              h__[i__ + 1 + h_dim1] = c__ * h__[i__ + 1 + h_dim1];
 +              F77_FUNC(slartg,SLARTG)(&f, &g, &c__, &s, &r__);
 +
 +              if (r__ < 0.) {
 +                  r__ = -r__;
 +                  c__ = -c__;
 +                  s = -s;
 +              }
 +
 +              h__[i__ + h_dim1] = r__;
 +
 +              a1 = c__ * h__[i__ + (h_dim1 << 1)] + s * h__[i__ + 1 + 
 +                      h_dim1];
 +              a2 = c__ * h__[i__ + 1 + h_dim1] + s * h__[i__ + 1 + (h_dim1 
 +                      << 1)];
 +              a3 = c__ * h__[i__ + 1 + h_dim1] - s * h__[i__ + (h_dim1 << 1)
 +                      ];
 +              a4 = c__ * h__[i__ + 1 + (h_dim1 << 1)] - s * h__[i__ + 1 + 
 +                      h_dim1];
 +
 +              h__[i__ + (h_dim1 << 1)] = c__ * a1 + s * a2;
 +              h__[i__ + 1 + (h_dim1 << 1)] = c__ * a4 - s * a3;
 +              h__[i__ + 1 + h_dim1] = c__ * a3 + s * a4;
 +
 +              i__4 = i__ + jj; /* bound is min(i+jj, kplusp); j is (re)set by the loop below */
 +              i__3 = (i__4<kplusp) ? i__4 : kplusp;
 +              for (j = 1; j <= i__3; ++j) {
 +                  a1 = c__ * q[j + i__ * q_dim1] + s * q[j + (i__ + 1) * 
 +                          q_dim1];
 +                  q[j + (i__ + 1) * q_dim1] = -s * q[j + i__ * q_dim1] + 
 +                          c__ * q[j + (i__ + 1) * q_dim1];
 +                  q[j + i__ * q_dim1] = a1;
 +              }
 +
 +          }
 +
 +      }
 +
 +      istart = iend + 1;
 +
 +      if (h__[iend + h_dim1] < 0.) {
 +          h__[iend + h_dim1] = -h__[iend + h_dim1];
 +         F77_FUNC(sscal,SSCAL)(&kplusp, &c_b14, &q[iend * q_dim1 + 1], &c__1);
 +      }
 +
 +      if (iend < kplusp) {
 +          goto L20;
 +      }
 +
 +      i__2 = kplusp - 1;
 +      for (i__ = itop; i__ <= i__2; ++i__) {
 +          if (h__[i__ + 1 + h_dim1] > 0.) {
 +              goto L90;
 +          }
 +          ++itop;
 +      }
 +
 +L90:
 +      ;
 +    }
 +
 +    i__1 = kplusp - 1;
 +    for (i__ = itop; i__ <= i__1; ++i__) {
 +      big = fabs(h__[i__ + (h_dim1*2)]) + fabs(h__[i__+ 1 + (h_dim1*2)]);
 +      if (h__[i__ + 1 + h_dim1] <= epsmch * big) {
 +          h__[i__ + 1 + h_dim1] = 0.;
 +      }
 +
 +    }
 +
 +    if (h__[*kev + 1 + h_dim1] > 0.) {
 +      F77_FUNC(sgemv,SGEMV)("N", n, &kplusp, &c_b5, &v[v_offset], ldv, &q[(*kev + 1) * 
 +              q_dim1 + 1], &c__1, &c_b4, &workd[*n + 1], &c__1);
 +    }
 +
 +    i__1 = *kev;
 +    for (i__ = 1; i__ <= i__1; ++i__) {
 +      i__2 = kplusp - i__ + 1;
 +      F77_FUNC(sgemv,SGEMV)("N", n, &i__2, &c_b5, &v[v_offset], ldv, &q[(*kev - i__ + 1) * 
 +              q_dim1 + 1], &c__1, &c_b4, &workd[1], &c__1);
 +      F77_FUNC(scopy,SCOPY)(n, &workd[1], &c__1, &v[(kplusp - i__ + 1) * v_dim1 + 1], &
 +              c__1);
 +
 +    }
 +
 +   F77_FUNC(slacpy,SLACPY)("All", n, kev, &v[(*np + 1) * v_dim1 + 1], ldv, &v[v_offset], ldv);
 +
 +    if (h__[*kev + 1 + h_dim1] > 0.) {
 +      F77_FUNC(scopy,SCOPY)(n, &workd[*n + 1], &c__1, &v[(*kev + 1) * v_dim1 + 1], &c__1);
 +    }
 +
 +   F77_FUNC(sscal,SSCAL)(n, &q[kplusp + *kev * q_dim1], &resid[1], &c__1);
 +    if (h__[*kev + 1 + h_dim1] > 0.) {
 +      F77_FUNC(saxpy,SAXPY)(n, &h__[*kev + 1 + h_dim1], &v[(*kev + 1) * v_dim1 + 1], &c__1,
 +               &resid[1], &c__1);
 +    }
 +
 +
 +
 +L9000:
 +    return;
 +
 +}
 +
 +
 +
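 +/* In-place Shell sort of x1: ascending for "LA", descending for "SA",
 +   ascending by magnitude for "LM", descending by magnitude for "SM". The
 +   same interchanges are applied to x2 when apply is nonzero. */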
 +static void 
 +F77_FUNC(ssortr,SSORTR)(const char *    which, 
 +                        int *     apply, 
 +                        int *     n, 
 +                        float *  x1, 
 +                        float *  x2)
 +{
 +    int i__1;
 +
 +    int i__, j, igap;
 +    float temp;
 +
 +
 +
 +    igap = *n / 2;
 +
 +    if (!strncmp(which, "SA", 2)) {
 +
 +L10:
 +      if (igap == 0) {
 +          goto L9000;
 +      }
 +      i__1 = *n - 1;
 +      for (i__ = igap; i__ <= i__1; ++i__) {
 +          j = i__ - igap;
 +L20:
 +
 +          if (j < 0) {
 +              goto L30;
 +          }
 +
 +          if (x1[j] < x1[j + igap]) {
 +              temp = x1[j];
 +              x1[j] = x1[j + igap]; 
 +              x1[j + igap] = temp;
 +              if (*apply) {
 +                  temp = x2[j];
 +                  x2[j] = x2[j + igap];
 +                  x2[j + igap] = temp;
 +              }
 +          } else {
 +              goto L30;
 +          }
 +          j -= igap;
 +          goto L20;
 +L30:
 +          ;
 +      }
 +      igap /= 2;
 +      goto L10;
 +
 +    } else if (!strncmp(which, "SM", 2)) {
 +
 +L40:
 +      if (igap == 0) {
 +          goto L9000;
 +      }
 +      i__1 = *n - 1;
 +      for (i__ = igap; i__ <= i__1; ++i__) {
 +          j = i__ - igap;
 +L50:
 +
 +          if (j < 0) {
 +              goto L60;
 +          }
 +
 +          if (fabs(x1[j]) < fabs(x1[j + igap])) {
 +              temp = x1[j];
 +              x1[j] = x1[j + igap];
 +              x1[j + igap] = temp;
 +              if (*apply) {
 +                  temp = x2[j];
 +                  x2[j] = x2[j + igap];
 +                  x2[j + igap] = temp;
 +              }
 +          } else {
 +              goto L60;
 +          }
 +          j -= igap;
 +          goto L50;
 +L60:
 +          ;
 +      }
 +      igap /= 2;
 +      goto L40;
 +
 +    } else if (!strncmp(which, "LA", 2)) {
 +
 +L70:
 +      if (igap == 0) {
 +          goto L9000;
 +      }
 +      i__1 = *n - 1;
 +      for (i__ = igap; i__ <= i__1; ++i__) {
 +          j = i__ - igap;
 +L80:
 +
 +          if (j < 0) {
 +              goto L90;
 +          }
 +
 +          if (x1[j] > x1[j + igap]) {
 +              temp = x1[j];
 +              x1[j] = x1[j + igap];
 +              x1[j + igap] = temp;
 +              if (*apply) {
 +                  temp = x2[j];
 +                  x2[j] = x2[j + igap];
 +                  x2[j + igap] = temp;
 +              }
 +          } else {
 +              goto L90;
 +          }
 +          j -= igap;
 +          goto L80;
 +L90:
 +          ;
 +      }
 +      igap /= 2;
 +      goto L70;
 +
 +    } else if (!strncmp(which, "LM", 2)) {
 +
 +
 +L100:
 +      if (igap == 0) {
 +          goto L9000;
 +      }
 +      i__1 = *n - 1;
 +      for (i__ = igap; i__ <= i__1; ++i__) {
 +          j = i__ - igap;
 +L110:
 +
 +          if (j < 0) {
 +              goto L120;
 +          }
 +
 +          if (fabs(x1[j]) > fabs(x1[j + igap])) {
 +              temp = x1[j];
 +              x1[j] = x1[j + igap];
 +              x1[j + igap] = temp;
 +              if (*apply) {
 +                  temp = x2[j];
 +                  x2[j] = x2[j + igap];
 +                  x2[j + igap] = temp;
 +              }
 +          } else {
 +              goto L120;
 +          }
 +          j -= igap;
 +          goto L110;
 +L120:
 +          ;
 +      }
 +      igap /= 2;
 +      goto L100;
 +    }
 +
 +L9000:
 +    return;
 +
 +}
 +
 +
 +
 +
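 +/* Like ssortr, but the interchanges are applied to whole columns of the
 +   matrix a (leading dimension lda); used to keep Ritz vectors aligned with
 +   their sorted Ritz values. */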
 +static void 
 +F77_FUNC(ssesrt,SSESRT)(const char *    which, 
 +                        int *     apply, 
 +                        int *     n, 
 +                        float *  x, 
 +                        int *     na, 
 +                        float *  a, 
 +                        int *     lda)
 +{
 +    int a_dim1, a_offset, i__1;
 +    int c__1 = 1;
 +
 +    int i__, j, igap;
 +    float temp;
 +
 +    a_dim1 = *lda;
 +    a_offset = 1 + a_dim1 * 0;
 +    a -= a_offset;
 +
 +    igap = *n / 2;
 +
 +    if (!strncmp(which, "SA", 2)) {
 +
 +L10:
 +      if (igap == 0) {
 +          goto L9000;
 +      }
 +      i__1 = *n - 1;
 +      for (i__ = igap; i__ <= i__1; ++i__) {
 +          j = i__ - igap;
 +L20:
 +
 +          if (j < 0) {
 +              goto L30;
 +          }
 +
 +          if (x[j] < x[j + igap]) {
 +              temp = x[j];
 +              x[j] = x[j + igap];
 +              x[j + igap] = temp;
 +              if (*apply) {
 +                 F77_FUNC(sswap,SSWAP)(na, &a[j * a_dim1 + 1], &c__1, &a[(j + igap) * 
 +                          a_dim1 + 1], &c__1);
 +              }
 +          } else {
 +              goto L30;
 +          }
 +          j -= igap;
 +          goto L20;
 +L30:
 +          ;
 +      }
 +      igap /= 2;
 +      goto L10;
 +
 +    } else if (!strncmp(which, "SM", 2)) {
 +
 +L40:
 +      if (igap == 0) {
 +          goto L9000;
 +      }
 +      i__1 = *n - 1;
 +      for (i__ = igap; i__ <= i__1; ++i__) {
 +          j = i__ - igap;
 +L50:
 +
 +          if (j < 0) {
 +              goto L60;
 +          }
 +
 +          if (fabs(x[j]) < fabs(x[j + igap])) {
 +              temp = x[j];
 +              x[j] = x[j + igap];
 +              x[j + igap] = temp;
 +              if (*apply) {
 +                 F77_FUNC(sswap,SSWAP)(na, &a[j * a_dim1 + 1], &c__1, &a[(j + igap) * 
 +                          a_dim1 + 1], &c__1);
 +              }
 +          } else {
 +              goto L60;
 +          }
 +          j -= igap;
 +          goto L50;
 +L60:
 +          ;
 +      }
 +      igap /= 2;
 +      goto L40;
 +
 +    } else if (!strncmp(which, "LA", 2)) {
 +
 +L70:
 +      if (igap == 0) {
 +          goto L9000;
 +      }
 +      i__1 = *n - 1;
 +      for (i__ = igap; i__ <= i__1; ++i__) {
 +          j = i__ - igap;
 +L80:
 +
 +          if (j < 0) {
 +              goto L90;
 +          }
 +
 +          if (x[j] > x[j + igap]) {
 +              temp = x[j];
 +              x[j] = x[j + igap];
 +              x[j + igap] = temp;
 +              if (*apply) {
 +                 F77_FUNC(sswap,SSWAP)(na, &a[j * a_dim1 + 1], &c__1, &a[(j + igap) * 
 +                          a_dim1 + 1], &c__1);
 +              }
 +          } else {
 +              goto L90;
 +          }
 +          j -= igap;
 +          goto L80;
 +L90:
 +          ;
 +      }
 +      igap /= 2;
 +      goto L70;
 +
 +    } else if (!strncmp(which, "LM", 2)) {
 +
 +L100:
 +      if (igap == 0) {
 +          goto L9000;
 +      }
 +      i__1 = *n - 1;
 +      for (i__ = igap; i__ <= i__1; ++i__) {
 +          j = i__ - igap;
 +L110:
 +
 +          if (j < 0) {
 +              goto L120;
 +          }
 +
 +          if (fabs(x[j]) > fabs(x[j + igap])) {
 +              temp = x[j];
 +              x[j] = x[j + igap];
 +              x[j + igap] = temp;
 +              if (*apply) {
 +                 F77_FUNC(sswap,SSWAP)(na, &a[j * a_dim1 + 1], &c__1, &a[(j + igap) * 
 +                          a_dim1 + 1], &c__1);
 +              }
 +          } else {
 +              goto L120;
 +          }
 +          j -= igap;
 +          goto L110;
 +L120:
 +          ;
 +      }
 +      igap /= 2;
 +      goto L100;
 +    }
 +
 +L9000:
 +    return;
 +
 +}
 +
 +
 +
 +
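 +/* Sorts the kev+np Ritz values and bounds according to "which" (with the
 +   two-ended balancing for "BE") so the kev wanted values end up last; when
 +   ishift == 1 the np discarded values are copied into shifts[]. */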
 +static void
 +F77_FUNC(ssgets,SSGETS)(int *     ishift, 
 +                        const char *    which, 
 +                        int *     kev, 
 +                        int *     np, 
 +                        float *  ritz, 
 +                        float *  bounds, 
 +                        float *  shifts)
 +{
 +    int c__1 = 1;
 +    int i__1, i__2;
 +    int kevd2;
 +
 +    --shifts;
 +    --bounds;
 +    --ritz;
 +
 +    if (!strncmp(which, "BE", 2)) {
 +      i__1 = *kev + *np;
 +      F77_FUNC(ssortr,SSORTR)("LA", &c__1, &i__1, &ritz[1], &bounds[1]);
 +      kevd2 = *kev / 2;
 +      if (*kev > 1) {
 +        i__1 = (kevd2<*np) ? kevd2 : *np;
 +        i__2 = (kevd2>*np) ? kevd2 : *np;
 +       F77_FUNC(sswap,SSWAP)(&i__1, &ritz[1], &c__1, 
 +               &ritz[i__2 + 1], &c__1);
 +        i__1 = (kevd2<*np) ? kevd2 : *np;
 +        i__2 = (kevd2>*np) ? kevd2 : *np;
 +       F77_FUNC(sswap,SSWAP)(&i__1, &bounds[1], &c__1, 
 +               &bounds[i__2 + 1], &c__1);
 +      }
 +
 +    } else {
 +      i__1 = *kev + *np;
 +      F77_FUNC(ssortr,SSORTR)(which, &c__1, &i__1, &ritz[1], &bounds[1]);
 +    }
 +
 +    if (*ishift == 1 && *np > 0) {
 +
 +      F77_FUNC(ssortr,SSORTR)("SM", &c__1, np, &bounds[1], &ritz[1]);
 +      F77_FUNC(scopy,SCOPY)(np, &ritz[1], &c__1, &shifts[1], &c__1);
 +    }
 +
 +
 +    return;
 +} 
 +
 +
 +
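 +/* Convergence test: counts the Ritz values whose error bound satisfies
 +   bounds[i] <= tol * max(eps^(2/3), |ritz[i]|). */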
 +static void 
 +F77_FUNC(ssconv,SSCONV)(int *     n, 
 +                        float *  ritz, 
 +                        float *  bounds,
 +                        float *  tol, 
 +                        int *     nconv)
 +{
 +    float c_b3 = 2.0f/3.0f;  /* exponent for eps23 = eps^(2/3); plain 2/3 is integer division (== 0) */
 +    int i__1;
 +    float d__2, d__3;
 +
 +    int i__;
 +    float eps23, temp;
 + 
 +    --bounds;
 +    --ritz;
 +
 +    eps23 = GMX_FLOAT_EPS;
 +    eps23 = pow(eps23, c_b3);
 +
 +    *nconv = 0;
 +    i__1 = *n;
 +    for (i__ = 1; i__ <= i__1; ++i__) {
 +
 +      d__2 = eps23;
 +      d__3 = fabs(ritz[i__]);
 +      temp = (d__2 > d__3) ? d__2 : d__3;
 +      if (bounds[i__] <= *tol * temp) {
 +      ++(*nconv);
 +      }
 +    }
 +
 +    return;
 +}
 +
 +
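 +/* Computes the eigenvalues of the current symmetric tridiagonal matrix
 +   stored in h__ (via sstqrb) together with the error bounds
 +   rnorm * |last eigenvector component|. */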
 +static void 
 +F77_FUNC(sseigt,SSEIGT)(float *  rnorm,
 +                        int *     n, 
 +                        float *  h__, 
 +                        int *     ldh, 
 +                        float *  eig, 
 +                        float *  bounds, 
 +                        float *  workl, 
 +                        int *     ierr)
 +{
 +    int c__1 = 1;
 +    int h_dim1, h_offset, i__1;
 +
 +    int k;
 +
 +
 +    --workl;
 +    --bounds;
 +    --eig;
 +    h_dim1 = *ldh;
 +    h_offset = 1 + h_dim1;
 +    h__ -= h_offset;
 +
 +   F77_FUNC(scopy,SCOPY)(n, &h__[(h_dim1 << 1) + 1], &c__1, &eig[1], &c__1);
 +    i__1 = *n - 1;
 +   F77_FUNC(scopy,SCOPY)(&i__1, &h__[h_dim1 + 2], &c__1, &workl[1], &c__1);
 +    F77_FUNC(sstqrb,SSTQRB)(n, &eig[1], &workl[1], &bounds[1], &workl[*n + 1], ierr);
 +    if (*ierr != 0) {
 +      goto L9000;
 +    }
 +
 +    i__1 = *n;
 +    for (k = 1; k <= i__1; ++k) {
 +      bounds[k] = *rnorm * fabs(bounds[k]);
 +
 +    }
 +
 +
 +L9000:
 +    return;
 +}
 +
 +
 +
 +
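 +/* Reverse-communication Lanczos iteration: extends a k-step factorization
 +   by np steps, with one step of iterative refinement of the
 +   orthogonalization, restarting via sgetv0 if an invariant subspace is
 +   encountered. */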
 +static void 
 +F77_FUNC(ssaitr,SSAITR)(int *     ido, 
 +                        const char *    bmat, 
 +                        int *     n, 
 +                        int *     k,
 +                        int *     np, 
 +                        int *     mode, 
 +                        float *  resid, 
 +                        float *  rnorm, 
 +                        float *  v, 
 +                        int *     ldv, 
 +                        float *  h__, 
 +                        int *     ldh, 
 +                        int *     ipntr, 
 +                        float *  workd, 
 +                        int *     iwork, 
 +                        int *     info)
 +{
 +
 +    int c__0 = 0;
 +    int c__1 = 1;
 +    float c_b18 = 1.;
 +    float c_b42 = 0.;
 +    float c_b50 = -1.;
 +
 +    int h_dim1, h_offset, v_dim1, v_offset, i__1;
 +    int i__, jj;
 +    float temp1;
 +    int infol;
 +    float safmin,minval;
 +
 +
 +    --workd;
 +    --resid;
 +    v_dim1 = *ldv;
 +    v_offset = 1 + v_dim1;
 +    v -= v_offset;
 +    h_dim1 = *ldh;
 +    h_offset = 1 + h_dim1;
 +    h__ -= h_offset;
 +    --ipntr;
 +    --iwork;
 +    minval = GMX_FLOAT_MIN;
 +    safmin = minval / GMX_FLOAT_EPS;
 +
 +    if (*ido == 0) {
 +      *info = 0;
 +      iwork[5] = 0;
 +      iwork[6] = 0;
 +      iwork[4] = 0;
 +      iwork[2] = 0;
 +      iwork[3] = 0;
 +
 +      iwork[12] = *k + 1;
 +
 +      iwork[8] = 1;
 +      iwork[9] = iwork[8] + *n;
 +      iwork[10] = iwork[9] + *n;
 +    }
 +
 +    if (iwork[5] == 1) {
 +      goto L50;
 +    }
 +    if (iwork[6] == 1) {
 +      goto L60;
 +    }
 +    if (iwork[2] == 1) {
 +      goto L70;
 +    }
 +    if (iwork[3] == 1) {
 +      goto L90;
 +    }
 +    if (iwork[4] == 1) {
 +      goto L30;
 +    }
 +
 +L1000:
 +
 +
 +    if (*rnorm > 0.) {
 +      goto L40;
 +    }
 +
 +    iwork[11] = 1;
 +L20:
 +    iwork[4] = 1;
 +    *ido = 0;
 +L30:
 +
 +    F77_FUNC(sgetv0,SGETV0)(ido, bmat, &iwork[11], &c__0, n, &iwork[12], &v[v_offset], ldv,
 +                            &resid[1], rnorm, &ipntr[1], &workd[1], &iwork[21], &iwork[7]);
 +    if (*ido != 99) {
 +      goto L9000;
 +    }
 +    if (iwork[7] < 0) {
 +      ++iwork[11];
 +      if (iwork[11] <= 3) {
 +          goto L20;
 +      }
 +
 +      *info = iwork[12] - 1;
 +      *ido = 99;
 +      goto L9000;
 +    }
 +
 +L40:
 +
 +   F77_FUNC(scopy,SCOPY)(n, &resid[1], &c__1, &v[iwork[12] * v_dim1 + 1], &c__1);
 +    if (*rnorm >= safmin) {
 +      temp1 = 1. / *rnorm;
 +      F77_FUNC(sscal,SSCAL)(n, &temp1, &v[iwork[12] * v_dim1 + 1], &c__1);
 +      F77_FUNC(sscal,SSCAL)(n, &temp1, &workd[iwork[8]], &c__1);
 +    } else {
 +
 +      /* kl/ku are ignored by slascl for the "General" matrix type, so pass a
 +         defined constant rather than the uninitialized i__ */
 +      F77_FUNC(slascl,SLASCL)("General", &c__0, &c__0, rnorm, &c_b18, n, &c__1, &v[iwork[12] *
 +               v_dim1 + 1], n, &infol);
 +      F77_FUNC(slascl,SLASCL)("General", &c__0, &c__0, rnorm, &c_b18, n, &c__1, &workd[iwork[
 +              8]], n, &infol);
 +    }
 +
 +    iwork[5] = 1;
 +   F77_FUNC(scopy,SCOPY)(n, &v[iwork[12] * v_dim1 + 1], &c__1, &workd[iwork[10]], &c__1);
 +    ipntr[1] = iwork[10];
 +    ipntr[2] = iwork[9];
 +    ipntr[3] = iwork[8];
 +    *ido = 1;
 +
 +    goto L9000;
 +L50:
 +
 +
 +    iwork[5] = 0;
 +
 +   F77_FUNC(scopy,SCOPY)(n, &workd[iwork[9]], &c__1, &resid[1], &c__1);
 +
 +    if (*mode == 2) {
 +      goto L65;
 +    }
 +    if (*bmat == 'G') {
 +      iwork[6] = 1;
 +      ipntr[1] = iwork[9];
 +      ipntr[2] = iwork[8];
 +      *ido = 2;
 +
 +      goto L9000;
 +    } else if (*bmat == 'I') {
 +      F77_FUNC(scopy,SCOPY)(n, &resid[1], &c__1, &workd[iwork[8]], &c__1);
 +    }
 +L60:
 +
 +    iwork[6] = 0;
 +
 +L65:
 +    if (*mode == 2) {
 +
 +      workd[*n * 3 + 3] = F77_FUNC(sdot,SDOT)(n, &resid[1], &c__1, &workd[iwork[10]], &
 +              c__1);
 +      workd[*n * 3 + 3] = sqrt(fabs(workd[*n * 3 + 3]));
 +    } else if (*bmat == 'G') {
 +      workd[*n * 3 + 3] = F77_FUNC(sdot,SDOT)(n, &resid[1], &c__1, &workd[iwork[8]], &
 +              c__1);
 +      workd[*n * 3 + 3] = sqrt(fabs(workd[*n * 3 + 3]));
 +    } else if (*bmat == 'I') {
 +      workd[*n * 3 + 3] = F77_FUNC(snrm2,SNRM2)(n, &resid[1], &c__1);
 +    }
 +
 +    if (*mode != 2) {
 +      F77_FUNC(sgemv,SGEMV)("T", n, &iwork[12], &c_b18, &v[v_offset], ldv, &workd[iwork[8]]
 +              , &c__1, &c_b42, &workd[iwork[9]], &c__1);
 +    } else {
 +      F77_FUNC(sgemv,SGEMV)("T", n, &iwork[12], &c_b18, &v[v_offset], ldv, &workd[iwork[10]
 +              ], &c__1, &c_b42, &workd[iwork[9]], &c__1);
 +    }
 +
 +   F77_FUNC(sgemv,SGEMV)("N", n, &iwork[12], &c_b50, &v[v_offset], ldv, &workd[iwork[9]], &
 +          c__1, &c_b18, &resid[1], &c__1);
 +
 +    h__[iwork[12] + (h_dim1 << 1)] = workd[iwork[9] + iwork[12] - 1];
 +    if (iwork[12] == 1 || iwork[4] == 1) {
 +      h__[iwork[12] + h_dim1] = 0.;
 +    } else {
 +      h__[iwork[12] + h_dim1] = *rnorm;
 +    }
 +
 +    iwork[2] = 1;
 +    iwork[1] = 0;
 +
 +    if (*bmat == 'G') {
 +      F77_FUNC(scopy,SCOPY)(n, &resid[1], &c__1, &workd[iwork[9]], &c__1);
 +      ipntr[1] = iwork[9];
 +      ipntr[2] = iwork[8];
 +      *ido = 2;
 +
 +      goto L9000;
 +    } else if (*bmat == 'I') {
 +      F77_FUNC(scopy,SCOPY)(n, &resid[1], &c__1, &workd[iwork[8]], &c__1);
 +    }
 +L70:
 +
 +    iwork[2] = 0;
 +
 +    if (*bmat == 'G') {
 +      *rnorm = F77_FUNC(sdot,SDOT)(n, &resid[1], &c__1, &workd[iwork[8]], &c__1);
 +      *rnorm = sqrt(fabs(*rnorm));
 +    } else if (*bmat == 'I') {
 +      *rnorm = F77_FUNC(snrm2,SNRM2)(n, &resid[1], &c__1);
 +    }
 +
 +    if (*rnorm > workd[*n * 3 + 3] * .717f) {
 +      goto L100;
 +    }
 +
 +L80:
 +
 +   F77_FUNC(sgemv,SGEMV)("T", n, &iwork[12], &c_b18, &v[v_offset], ldv, &workd[iwork[8]], &
 +          c__1, &c_b42, &workd[iwork[9]], &c__1);
 +
 +   F77_FUNC(sgemv,SGEMV)("N", n, &iwork[12], &c_b50, &v[v_offset], ldv, &workd[iwork[9]], &
 +          c__1, &c_b18, &resid[1], &c__1);
 +
 +    if (iwork[12] == 1 || iwork[4] == 1) {
 +      h__[iwork[12] + h_dim1] = 0.;
 +    }
 +    h__[iwork[12] + (h_dim1 << 1)] += workd[iwork[9] + iwork[12] - 1];
 +
 +    iwork[3] = 1;
 +    if (*bmat == 'G') {
 +      F77_FUNC(scopy,SCOPY)(n, &resid[1], &c__1, &workd[iwork[9]], &c__1);
 +      ipntr[1] = iwork[9];
 +      ipntr[2] = iwork[8];
 +      *ido = 2;
 +
 +      goto L9000;
 +    } else if (*bmat == 'I') {
 +      F77_FUNC(scopy,SCOPY)(n, &resid[1], &c__1, &workd[iwork[8]], &c__1);
 +    }
 +L90:
 +
 +
 +    if (*bmat == 'G') {
 +      workd[*n * 3 + 2] = F77_FUNC(sdot,SDOT)(n, &resid[1], &c__1, &workd[iwork[8]], &
 +              c__1);
 +      workd[*n * 3 + 2] = sqrt(fabs(workd[*n * 3 + 2]));
 +    } else if (*bmat == 'I') {
 +      workd[*n * 3 + 2] = F77_FUNC(snrm2,SNRM2)(n, &resid[1], &c__1);
 +    }
 +
 +
 +    if (workd[*n * 3 + 2] > *rnorm * .717f) {
 +
 +      *rnorm = workd[*n * 3 + 2];
 +
 +    } else {
 +
 +      *rnorm = workd[*n * 3 + 2];
 +      ++iwork[1];
 +      if (iwork[1] <= 1) {
 +          goto L80;
 +      }
 +
 +      i__1 = *n;
 +      for (jj = 1; jj <= i__1; ++jj) {
 +          resid[jj] = 0.;
 +      }
 +      *rnorm = 0.;
 +    }
 +
 +L100:
 +
 +    iwork[4] = 0;
 +    iwork[3] = 0;
 +
 +    if (h__[iwork[12] + h_dim1] < 0.) {
 +      h__[iwork[12] + h_dim1] = -h__[iwork[12] + h_dim1];
 +      if (iwork[12] < *k + *np) {
 +         F77_FUNC(sscal,SSCAL)(n, &c_b50, &v[(iwork[12] + 1) * v_dim1 + 1], &c__1);
 +      } else {
 +         F77_FUNC(sscal,SSCAL)(n, &c_b50, &resid[1], &c__1);
 +      }
 +    }
 +
 +    ++iwork[12];
 +    if (iwork[12] > *k + *np) {
 +      *ido = 99;
 +
 +
 +      goto L9000;
 +    }
 +
 +    goto L1000;
 +
 +L9000:
 +    return;
 +}
 +
 +
 +
 +
 +
 +
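 +/* Intermediate-level driver of the implicitly restarted Lanczos method:
 +   repeatedly extends the factorization (ssaitr), computes Ritz values and
 +   bounds (sseigt), tests convergence (ssconv) and applies implicit shifts
 +   (ssgets/ssapps) until nev values have converged or mxiter is exceeded. */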
 +static void 
 +F77_FUNC(ssaup2,SSAUP2)(int *     ido, 
 +                        const char *    bmat,
 +                        int *     n,
 +                        const char *    which, 
 +                        int *     nev, 
 +                        int *     np,
 +                        float *  tol, 
 +                        float *  resid, 
 +                        int *     mode, 
 +                        int *     iupd, 
 +                        int *     ishift, 
 +                        int *     mxiter, 
 +                        float *  v,
 +                        int *     ldv, 
 +                        float *  h__, 
 +                        int *     ldh, 
 +                        float *  ritz,
 +                        float *  bounds, 
 +                        float *  q, 
 +                        int *     ldq, 
 +                        float *  workl,
 +                        int *     ipntr, 
 +                        float *  workd, 
 +                        int *     iwork, 
 +                        int *     info)
 +{
 +    float c_b3 = 2.0f/3.0f;  /* exponent for eps23 = eps^(2/3); plain 2/3 is integer division (== 0) */
 +    int c__1 = 1;
 +    int c__0 = 0;
 +    
 +    int h_dim1, h_offset, q_dim1, q_offset, v_dim1, v_offset, i__1, i__2, 
 +          i__3;
 +    float d__2, d__3;
 +    int j;
 +    float eps23;
 +    int ierr;
 +    float temp;
 +    int nevd2;
 +    int nevm2;
 +    int nevbef;
 +    char wprime[2];
 +    int nptemp;
 +
 +    --workd;
 +    --resid;
 +    --workl;
 +    --bounds;
 +    --ritz;
 +    v_dim1 = *ldv;
 +    v_offset = 1 + v_dim1;
 +    v -= v_offset;
 +    h_dim1 = *ldh;
 +    h_offset = 1 + h_dim1;
 +    h__ -= h_offset;
 +    q_dim1 = *ldq;
 +    q_offset = 1 + q_dim1;
 +    q -= q_offset;
 +    --ipntr;
 +    --iwork;
 +    eps23 = GMX_FLOAT_EPS;
 +    eps23 = pow(eps23, c_b3);
 +
 +    if (*ido == 0) {
 +
 +      iwork[41] = 1;
 +      iwork[42] = 3;
 +      iwork[43] = 5;
 +      iwork[44] = 7;
 +
 +      iwork[9] = *nev;
 +      iwork[10] = *np;
 +
 +      iwork[7] = iwork[9] + iwork[10];
 +      iwork[8] = 0;
 +      iwork[6] = 0;
 +
 +      iwork[2] = 1;
 +      iwork[4] = 0;
 +      iwork[5] = 0;
 +      iwork[1] = 0;
 +
 +      if (*info != 0) {
 +
 +          iwork[3] = 1;
 +          *info = 0;
 +      } else {
 +          iwork[3] = 0;
 +      }
 +    }
 +
 +    if (iwork[2] == 1) {
 +      F77_FUNC(sgetv0,SGETV0)(ido, bmat, &c__1, &iwork[3], n, &c__1, &v[v_offset], ldv, &
 +              resid[1], &workd[*n * 3 + 1], &ipntr[1], &workd[1], &iwork[41]
 +              , info);
 +
 +      if (*ido != 99) {
 +          goto L9000;
 +      }
 +
 +      if (workd[*n * 3 + 1] == 0.) {
 +
 +          *info = -9;
 +          goto L1200;
 +      }
 +      iwork[2] = 0;
 +      *ido = 0;
 +    }
 +
 +    if (iwork[4] == 1) {
 +      goto L20;
 +    }
 +
 +    if (iwork[5] == 1) {
 +      goto L50;
 +    }
 +
 +    if (iwork[1] == 1) {
 +      goto L100;
 +    }
 +
 +    F77_FUNC(ssaitr,SSAITR)(ido, bmat, n, &c__0, &iwork[9], mode, &resid[1], &workd[*n * 3 + 
 +          1], &v[v_offset], ldv, &h__[h_offset], ldh, &ipntr[1], &workd[1], 
 +          &iwork[21], info);
 +
 +    if (*ido != 99) {
 +      goto L9000;
 +    }
 +
 +    if (*info > 0) {
 +
 +      *np = *info;
 +      *mxiter = iwork[6];
 +      *info = -9999;
 +      goto L1200;
 +    }
 +
 +L1000:
 +
 +    ++iwork[6];
 +
 +
 +    *ido = 0;
 +L20:
 +    iwork[4] = 1;
 +
 +    F77_FUNC(ssaitr,SSAITR)(ido, bmat, n, nev, np, mode, &resid[1], &workd[*n * 3 + 1], &v[
 +          v_offset], ldv, &h__[h_offset], ldh, &ipntr[1], &workd[1], &iwork[
 +          21], info);
 +
 +    if (*ido != 99) {
 +      goto L9000;
 +    }
 +
 +    if (*info > 0) {
 +
 +      *np = *info;
 +      *mxiter = iwork[6];
 +      *info = -9999;
 +      goto L1200;
 +    }
 +    iwork[4] = 0;
 +
 +    F77_FUNC(sseigt,SSEIGT)(&workd[*n * 3 + 1], &iwork[7], &h__[h_offset], ldh, &ritz[1], &
 +          bounds[1], &workl[1], &ierr);
 +
 +    if (ierr != 0) {
 +      *info = -8;
 +      goto L1200;
 +    }
 +
 +   F77_FUNC(scopy,SCOPY)(&iwork[7], &ritz[1], &c__1, &workl[iwork[7] + 1], &c__1);
 +   F77_FUNC(scopy,SCOPY)(&iwork[7], &bounds[1], &c__1, &workl[(iwork[7] << 1) + 1], &c__1);
 +
 +    *nev = iwork[9];
 +    *np = iwork[10];
 +    F77_FUNC(ssgets,SSGETS)(ishift, which, nev, np, &ritz[1], &bounds[1], &workl[1]);
 +
 +   F77_FUNC(scopy,SCOPY)(nev, &bounds[*np + 1], &c__1, &workl[*np + 1], &c__1);
 +    F77_FUNC(ssconv,SSCONV)(nev, &ritz[*np + 1], &workl[*np + 1], tol, &iwork[8]);
 +
 +
 +    nptemp = *np;
 +    i__1 = nptemp;
 +    for (j = 1; j <= i__1; ++j) {
 +      if (bounds[j] == 0.) {
 +          --(*np);
 +          ++(*nev);
 +      }
 +    }
 +
 +    if (iwork[8] >= iwork[9] || iwork[6] > *mxiter || *np == 0) {
 +
 +      if (!strncmp(which, "BE", 2)) {
 +
 +      strncpy(wprime, "SA",2);
 +          F77_FUNC(ssortr,SSORTR)(wprime, &c__1, &iwork[7], &ritz[1], &bounds[1]);
 +          nevd2 = *nev / 2;
 +          nevm2 = *nev - nevd2;
 +          if (*nev > 1) {
 +            i__1 = (nevd2 < *np) ? nevd2 : *np;
 +            i__2 = iwork[7] - nevd2 + 1, i__3 = iwork[7] - *np + 1;
 +           F77_FUNC(sswap,SSWAP)(&i__1, &ritz[nevm2 + 1], &c__1, 
 +                   &ritz[((i__2>i__3) ? i__2 : i__3)], 
 +                   &c__1);
 +            i__1 = (nevd2 < *np) ? nevd2 : *np;
 +            i__2 = iwork[7] - nevd2 + 1, i__3 = iwork[7] - *np;
 +           F77_FUNC(sswap,SSWAP)(&i__1, &bounds[nevm2 + 1], &c__1, 
 +                   &bounds[((i__2>i__3) ? i__2 : i__3) + 1], 
 +                   &c__1);
 +          }
 +
 +      } else {
 +
 +      if (!strncmp(which, "LM", 2)) {
 +        strncpy(wprime, "SM", 2);
 +      }
 +      if (!strncmp(which, "SM", 2)) {
 +        strncpy(wprime, "LM", 2);
 +      }
 +      if (!strncmp(which, "LA", 2)) {
 +        strncpy(wprime, "SA", 2);
 +      }
 +      if (!strncmp(which, "SA", 2)) {
 +        strncpy(wprime, "LA", 2);
 +      }
 +      
 +      F77_FUNC(ssortr,SSORTR)(wprime, &c__1, &iwork[7], &ritz[1], &bounds[1]);
 +
 +      }
 +
 +      i__1 = iwork[9];
 +      for (j = 1; j <= i__1; ++j) {
 +        d__2 = eps23;
 +        d__3 = fabs(ritz[j]);
 +          temp = (d__2>d__3) ? d__2 : d__3;
 +          bounds[j] /= temp;
 +      }
 +
 +      strncpy(wprime, "LA",2);
 +      F77_FUNC(ssortr,SSORTR)(wprime, &c__1, &iwork[9], &bounds[1], &ritz[1]);
 +
 +      i__1 = iwork[9];
 +      for (j = 1; j <= i__1; ++j) {
 +        d__2 = eps23;
 +        d__3 = fabs(ritz[j]);
 +          temp = (d__2>d__3) ? d__2 : d__3;
 +          bounds[j] *= temp;
 +      }
 +
 +      if (!strncmp(which, "BE", 2)) {
 +
 +          strncpy(wprime, "LA", 2);
 +          F77_FUNC(ssortr,SSORTR)(wprime, &c__1, &iwork[8], &ritz[1], &bounds[1]);
 +
 +      } else {
 +        F77_FUNC(ssortr,SSORTR)(which, &c__1, &iwork[8], &ritz[1], &bounds[1]);
 +      
 +      }
 +
 +      h__[h_dim1 + 1] = workd[*n * 3 + 1];
 +
 +
 +      if (iwork[6] > *mxiter && iwork[8] < *nev) {
 +          *info = 1;
 +      }
 +      if (*np == 0 && iwork[8] < iwork[9]) {
 +          *info = 2;
 +      }
 +
 +      *np = iwork[8];
 +      goto L1100;
 +
 +    } else if (iwork[8] < *nev && *ishift == 1) {
 +      nevbef = *nev;
 +      i__1 = iwork[8], i__2 = *np / 2;
 +      *nev += (i__1 < i__2) ? i__1 : i__2;
 +      if (*nev == 1 && iwork[7] >= 6) {
 +          *nev = iwork[7] / 2;
 +      } else if (*nev == 1 && iwork[7] > 2) {
 +          *nev = 2;
 +      }
 +      *np = iwork[7] - *nev;
 +
 +
 +      if (nevbef < *nev) {
 +          F77_FUNC(ssgets,SSGETS)(ishift, which, nev, np, &ritz[1], &bounds[1], &workl[1]);
 +      }
 +
 +    }
 +
 +
 +    if (*ishift == 0) {
 +
 +      iwork[5] = 1;
 +      *ido = 3;
 +      goto L9000;
 +    }
 +
 +L50:
 +
 +    iwork[5] = 0;
 +
 +    if (*ishift == 0) {
 +      F77_FUNC(scopy,SCOPY)(np, &workl[1], &c__1, &ritz[1], &c__1);
 +    }
 +
 +    F77_FUNC(ssapps,SSAPPS)(n, nev, np, &ritz[1], &v[v_offset], ldv, &h__[h_offset], ldh, &
 +                            resid[1], &q[q_offset], ldq, &workd[1]);
 +
 +    iwork[1] = 1;
 +    if (*bmat == 'G') {
 +      F77_FUNC(scopy,SCOPY)(n, &resid[1], &c__1, &workd[*n + 1], &c__1);
 +      ipntr[1] = *n + 1;
 +      ipntr[2] = 1;
 +      *ido = 2;
 +
 +      goto L9000;
 +    } else if (*bmat == 'I') {
 +      F77_FUNC(scopy,SCOPY)(n, &resid[1], &c__1, &workd[1], &c__1);
 +    }
 +
 +L100:
 +
 +    if (*bmat == 'G') {
 +      workd[*n * 3 + 1] = F77_FUNC(sdot,SDOT)(n, &resid[1], &c__1, &workd[1], &c__1);
 +      workd[*n * 3 + 1] = sqrt(fabs(workd[*n * 3 + 1]));
 +    } else if (*bmat == 'I') {
 +      workd[*n * 3 + 1] = F77_FUNC(snrm2,SNRM2)(n, &resid[1], &c__1);
 +    }
 +    iwork[1] = 0;
 +
 +    goto L1000;
 +
 +L1100:
 +
 +    *mxiter = iwork[6];
 +    *nev = iwork[8];
 +
 +L1200:
 +    *ido = 99;
 +
 +L9000:
 +    return;
 +
 +}
 +
 +
 +
 +void 
 +F77_FUNC(ssaupd,SSAUPD)(int *     ido, 
 +                        const char *    bmat, 
 +                        int *     n, 
 +                        const char *    which, 
 +                        int *     nev, 
 +                        float *  tol, 
 +                        float *  resid, 
 +                        int *     ncv,
 +                        float *  v, 
 +                        int *     ldv, 
 +                        int *     iparam,
 +                        int *     ipntr, 
 +                        float *  workd, 
 +                        int *     iwork,
 +                        float *  workl, 
 +                        int *     lworkl,
 +                        int *     info)
 +{
 +    int v_dim1, v_offset, i__1, i__2;
 +    int j;
 +
 +    --workd;
 +    --resid;
 +    v_dim1 = *ldv;
 +    v_offset = 1 + v_dim1;
 +    v -= v_offset;
 +    --iparam;
 +    --ipntr;
 +    --iwork;
 +    --workl;
 +
 +    if (*ido == 0) {
 +
 +
 +      iwork[2] = 0;
 +      iwork[5] = iparam[1];
 +      iwork[10] = iparam[3];
 +      iwork[12] = iparam[4];
 +
 +      iwork[6] = 1;
 +      iwork[11] = iparam[7];
 +
 +
 +      if (*n <= 0) {
 +          iwork[2] = -1;
 +      } else if (*nev <= 0) {
 +          iwork[2] = -2;
 +      } else if (*ncv <= *nev || *ncv > *n) {
 +          iwork[2] = -3;
 +      }
 +
 +
 +      iwork[15] = *ncv - *nev;
 +
 +      if (iwork[10] <= 0) {
 +          iwork[2] = -4;
 +      }
 +      if (strncmp(which,"LM",2) && strncmp(which,"SM",2) && 
 +          strncmp(which,"LA",2) && strncmp(which,"SA",2) && 
 +          strncmp(which,"BE",2)) {
 +        iwork[2] = -5;
 +      }
 +      if (*bmat != 'I' && *bmat != 'G') {
 +          iwork[2] = -6;
 +      }
 +
 +      i__1 = *ncv;
 +      if (*lworkl < i__1 * i__1 + (*ncv << 3)) {
 +          iwork[2] = -7;
 +      }
 +      if (iwork[11] < 1 || iwork[11] > 5) {
 +          iwork[2] = -10;
 +      } else if (iwork[11] == 1 && *bmat == 'G') {
 +          iwork[2] = -11;
 +      } else if (iwork[5] < 0 || iwork[5] > 1) {
 +          iwork[2] = -12;
 +      } else if (*nev == 1 && !strncmp(which, "BE", 2)) {
 +          iwork[2] = -13;
 +      }
 +
 +      if (iwork[2] != 0) {
 +          *info = iwork[2];
 +          *ido = 99;
 +          goto L9000;
 +      }
 +
 +      if (iwork[12] <= 0) {
 +          iwork[12] = 1;
 +      }
 +      if (*tol <= 0.) {
 +        *tol = GMX_FLOAT_EPS;
 +      }
 +
 +      iwork[15] = *ncv - *nev;
 +      iwork[13] = *nev;
 +      i__2 = *ncv;
 +      i__1 = i__2 * i__2 + (*ncv << 3);
 +      for (j = 1; j <= i__1; ++j) {
 +          workl[j] = 0.;
 +      }
 +
 +      iwork[8] = *ncv;
 +      iwork[9] = *ncv;
 +      iwork[3] = 1;
 +      iwork[16] = iwork[3] + (iwork[8] << 1);
 +      iwork[1] = iwork[16] + *ncv;
 +      iwork[4] = iwork[1] + *ncv;
 +      i__1 = *ncv;
 +      iwork[7] = iwork[4] + i__1 * i__1;
 +      iwork[14] = iwork[7] + *ncv * 3;
 +
 +      ipntr[4] = iwork[14];
 +      ipntr[5] = iwork[3];
 +      ipntr[6] = iwork[16];
 +      ipntr[7] = iwork[1];
 +      ipntr[11] = iwork[7];
 +    }
 +
 +    F77_FUNC(ssaup2,SSAUP2)(ido, bmat, n, which, &iwork[13], &iwork[15], tol, &resid[1], &
 +                            iwork[11], &iwork[6], &iwork[5], &iwork[10], &v[v_offset], ldv, &
 +                            workl[iwork[3]], &iwork[8], &workl[iwork[16]], &workl[iwork[1]], &
 +                            workl[iwork[4]], &iwork[9], &workl[iwork[7]], &ipntr[1], &workd[1]
 +                            , &iwork[21], info);
 +
 +    if (*ido == 3) {
 +      iparam[8] = iwork[15];
 +    }
 +    if (*ido != 99) {
 +      goto L9000;
 +    }
 +
 +    iparam[3] = iwork[10];
 +    iparam[5] = iwork[15];
 +
 +    if (*info < 0) {
 +      goto L9000;
 +    }
 +    if (*info == 2) {
 +      *info = 3;
 +    }
 +
 +L9000:
 +
 +    return;
 +
 +}
 +
 +
 +
 +void
 +F77_FUNC(sseupd,SSEUPD)(int *     rvec, 
 +                        const char *    howmny, 
 +                        int *     select, 
 +                        float *  d__, 
 +                        float *  z__, 
 +                        int *     ldz, 
 +                        float *  sigma, 
 +                        const char *    bmat, 
 +                        int *     n, 
 +                        const char *    which, 
 +                        int *     nev, 
 +                        float *  tol, 
 +                        float *  resid, 
 +                        int *     ncv, 
 +                        float *  v,
 +                        int *     ldv, 
 +                        int *     iparam, 
 +                        int *     ipntr, 
 +                        float *  workd, 
 +                        float *  workl, 
 +                        int *     lworkl, 
 +                        int *     info)
 +{
 +    float c_b21 = 2.0f/3.0f;  /* exponent for eps23 = eps^(2/3); integer 2/3 would be 0 */
 +    int c__1 = 1;
 +    float c_b102 = 1.;
 +    int v_dim1, v_offset, z_dim1, z_offset, i__1;
 +    float d__1, d__2, d__3;
 +
 +    int j, k, ih, iq, iw, ibd, ihb, ihd, ldh, ilg, ldq, ism, irz;
 +    int mode;
 +    float eps23;
 +    int ierr;
 +    float temp;
 +    int next;
 +    char type__[6];
 +    int ritz;
 +    int reord;
 +    int nconv;
 +    float rnorm;
 +    float bnorm2;
 +    float thres1=0, thres2=0;
 +    int bounds;
 +    int ktrord;
 +    float tempbnd;
 +    int leftptr, rghtptr;
 +
 +
 +    --workd;
 +    --resid;
 +    z_dim1 = *ldz;
 +    z_offset = 1 + z_dim1;
 +    z__ -= z_offset;
 +    --d__;
 +    --select;
 +    v_dim1 = *ldv;
 +    v_offset = 1 + v_dim1;
 +    v -= v_offset;
 +    --iparam;
 +    --ipntr;
 +    --workl;
 +
 +    mode = iparam[7];
 +    nconv = iparam[5];
 +    *info = 0;
 +
 +    if (nconv == 0) {
 +      goto L9000;
 +    }
 +    ierr = 0;
 +
 +    if (nconv <= 0) {
 +      ierr = -14;
 +    }
 +    if (*n <= 0) {
 +      ierr = -1;
 +    }
 +    if (*nev <= 0) {
 +      ierr = -2;
 +    }
 +    if (*ncv <= *nev || *ncv > *n) {
 +      ierr = -3;
 +    }
 +    if (strncmp(which,"LM",2) && strncmp(which,"SM",2) && 
 +      strncmp(which,"LA",2) && strncmp(which,"SA",2) && 
 +      strncmp(which,"BE",2)) {
 +      ierr = -5;
 +    }
 +    if (*bmat != 'I' && *bmat != 'G') {
 +      ierr = -6;
 +    }
 +    if (*howmny != 'A' && *howmny != 'P' && 
 +          *howmny != 'S' && *rvec) {
 +      ierr = -15;
 +    }
 +    if (*rvec && *howmny == 'S') {
 +      ierr = -16;
 +    }
 +    i__1 = *ncv;
 +    if (*rvec && *lworkl < i__1 * i__1 + (*ncv << 3)) {
 +      ierr = -7;
 +    }
 +
 +    if (mode == 1 || mode == 2) {
 +      strncpy(type__, "REGULR",6);
 +    } else if (mode == 3) {
 +      strncpy(type__, "SHIFTI",6);
 +    } else if (mode == 4) {
 +      strncpy(type__, "BUCKLE",6);
 +    } else if (mode == 5) {
 +      strncpy(type__, "CAYLEY",6);
 +    } else {
 +      ierr = -10;
 +    }
 +    if (mode == 1 && *bmat == 'G') {
 +      ierr = -11;
 +    }
 +    if (*nev == 1 && !strncmp(which, "BE",2)) {
 +      ierr = -12;
 +    }
 +
 +    if (ierr != 0) {
 +      *info = ierr;
 +      goto L9000;
 +    }
 +
 +    ih = ipntr[5];
 +    ritz = ipntr[6];
 +    bounds = ipntr[7];
 +    ldh = *ncv;
 +    ldq = *ncv;
 +    ihd = bounds + ldh;
 +    ihb = ihd + ldh;
 +    iq = ihb + ldh;
 +    iw = iq + ldh * *ncv;
 +    next = iw + (*ncv << 1);
 +    ipntr[4] = next;
 +    ipntr[8] = ihd;
 +    ipntr[9] = ihb;
 +    ipntr[10] = iq;
 +
 +    irz = ipntr[11] + *ncv;
 +    ibd = irz + *ncv;
 +
 +
 +    eps23 = GMX_FLOAT_EPS;
 +    eps23 = pow(eps23, c_b21);
 +
 +    rnorm = workl[ih];
 +    if (*bmat == 'I') {
 +      bnorm2 = rnorm;
 +    } else if (*bmat == 'G') {
 +      bnorm2 = F77_FUNC(snrm2,SNRM2)(n, &workd[1], &c__1);
 +    }
 +
 +    if (*rvec) {
 +
 +        if (!strncmp(which,"LM",2) || !strncmp(which,"SM",2) ||
 +            !strncmp(which,"LA",2) || !strncmp(which,"SA",2)) {
 + 
 +      } else if (!strncmp(which,"BE",2)) {
 +
 +
 +        ism = (*nev>nconv) ? *nev : nconv;
 +        ism /= 2;
 +        ilg = ism + 1;
 +        thres1 = workl[ism];
 +        thres2 = workl[ilg];
 +
 +
 +      }
 +
 +      reord = 0;
 +      ktrord = 0;
 +      i__1 = *ncv - 1;
 +      for (j = 0; j <= i__1; ++j) {
 +          select[j + 1] = 0;
 +          if (!strncmp(which,"LM",2)) {
 +              if (fabs(workl[irz + j]) >= fabs(thres1)) {
 +                d__2 = eps23;
 +                d__3 = fabs(workl[irz + j]);
 +                tempbnd = (d__2>d__3) ? d__2 : d__3;
 +                if (workl[ibd + j] <= *tol * tempbnd) {
 +                  select[j + 1] = 1;
 +                }
 +              }
 +          } else if (!strncmp(which,"SM",2)) {
 +              if (fabs(workl[irz + j]) <= fabs(thres1)) {
 +                d__2 = eps23;
 +                d__3 = fabs(workl[irz + j]);
 +                  tempbnd = (d__2>d__3) ? d__2 : d__3;
 +                  if (workl[ibd + j] <= *tol * tempbnd) {
 +                      select[j + 1] = 1;
 +                  }
 +              }
 +          } else if (!strncmp(which,"LA",2)) {
 +              if (workl[irz + j] >= thres1) {
 +                d__2 = eps23;
 +                d__3 = fabs(workl[irz + j]);
 +                  tempbnd = (d__2>d__3) ? d__2 : d__3;
 +                  if (workl[ibd + j] <= *tol * tempbnd) {
 +                      select[j + 1] = 1;
 +                  }
 +              }
 +          } else if (!strncmp(which,"SA",2)) {
 +              if (workl[irz + j] <= thres1) {
 +                d__2 = eps23;
 +                d__3 = fabs(workl[irz + j]);
 +                  tempbnd = (d__2>d__3) ? d__2 : d__3;
 +                  if (workl[ibd + j] <= *tol * tempbnd) {
 +                      select[j + 1] = 1;
 +                  }
 +              }
 +          } else if (!strncmp(which,"BE",2)) {
 +              if (workl[irz + j] <= thres1 || workl[irz + j] >= thres2) {
 +                d__2 = eps23;
 +                d__3 = fabs(workl[irz + j]);
 +                  tempbnd = (d__2>d__3) ? d__2 : d__3;
 +                  if (workl[ibd + j] <= *tol * tempbnd) {
 +                      select[j + 1] = 1;
 +                  }
 +              }
 +          }
 +          if (j + 1 > nconv) {
 +              reord = select[j + 1] || reord;
 +          }
 +          if (select[j + 1]) {
 +              ++ktrord;
 +          }
 +      }
 +
 +      i__1 = *ncv - 1;
 +      F77_FUNC(scopy,SCOPY)(&i__1, &workl[ih + 1], &c__1, &workl[ihb], &c__1);
 +      F77_FUNC(scopy,SCOPY)(ncv, &workl[ih + ldh], &c__1, &workl[ihd], &c__1);
 +
 +      F77_FUNC(ssteqr,SSTEQR)("Identity", ncv, &workl[ihd], &workl[ihb], &workl[iq], &ldq, &
 +              workl[iw], &ierr);
 +
 +      if (ierr != 0) {
 +          *info = -8;
 +          goto L9000;
 +      }
 +
 +
 +      if (reord) {
 +
 +          leftptr = 1;
 +          rghtptr = *ncv;
 +
 +          if (*ncv == 1) {
 +              goto L30;
 +          }
 +
 +L20:
 +          if (select[leftptr]) {
 +
 +              ++leftptr;
 +
 +          } else if (! select[rghtptr]) {
 +
 +              --rghtptr;
 +
 +          } else {
 +
 +              temp = workl[ihd + leftptr - 1];
 +              workl[ihd + leftptr - 1] = workl[ihd + rghtptr - 1];
 +              workl[ihd + rghtptr - 1] = temp;
 +              F77_FUNC(scopy,SCOPY)(ncv, &workl[iq + *ncv * (leftptr - 1)], &c__1, &workl[
 +                      iw], &c__1);
 +              F77_FUNC(scopy,SCOPY)(ncv, &workl[iq + *ncv * (rghtptr - 1)], &c__1, &workl[
 +                      iq + *ncv * (leftptr - 1)], &c__1);
 +              F77_FUNC(scopy,SCOPY)(ncv, &workl[iw], &c__1, &workl[iq + *ncv * (rghtptr - 
 +                      1)], &c__1);
 +              ++leftptr;
 +              --rghtptr;
 +
 +          }
 +
 +          if (leftptr < rghtptr) {
 +              goto L20;
 +          }
 +
 +L30:
 +          ;
 +      }
 +
 +      F77_FUNC(scopy,SCOPY)(&nconv, &workl[ihd], &c__1, &d__[1], &c__1);
 +
 +    } else {
 +
 +      F77_FUNC(scopy,SCOPY)(&nconv, &workl[ritz], &c__1, &d__[1], &c__1);
 +      F77_FUNC(scopy,SCOPY)(ncv, &workl[ritz], &c__1, &workl[ihd], &c__1);
 +
 +    }
 +    if (!strncmp(type__, "REGULR",6)) {
 +
 +      if (*rvec) {
 +          F77_FUNC(ssesrt,SSESRT)("LA", rvec, &nconv, &d__[1], ncv, &workl[iq], &ldq);
 +      } else {
 +         F77_FUNC(scopy,SCOPY)(ncv, &workl[bounds], &c__1, &workl[ihb], &c__1);
 +      }
 +
 +    } else {
 +
 +      F77_FUNC(scopy,SCOPY)(ncv, &workl[ihd], &c__1, &workl[iw], &c__1);
 +      if (!strncmp(type__, "SHIFTI", 6)) {
 +          i__1 = *ncv;
 +          for (k = 1; k <= i__1; ++k) {
 +              workl[ihd + k - 1] = 1. / workl[ihd + k - 1] + *sigma;
 +          }
 +      } else if (!strncmp(type__, "BUCKLE",6)) {
 +          i__1 = *ncv;
 +          for (k = 1; k <= i__1; ++k) {
 +              workl[ihd + k - 1] = *sigma * workl[ihd + k - 1] / (workl[ihd 
 +                      + k - 1] - 1.);
 +          }
 +      } else if (!strncmp(type__, "CAYLEY",6)) {
 +          i__1 = *ncv;
 +          for (k = 1; k <= i__1; ++k) {
 +              workl[ihd + k - 1] = *sigma * (workl[ihd + k - 1] + 1.) / (
 +                      workl[ihd + k - 1] - 1.);
 +          }
 +      }
 +
 +      F77_FUNC(scopy,SCOPY)(&nconv, &workl[ihd], &c__1, &d__[1], &c__1);
 +      F77_FUNC(ssortr,SSORTR)("LA", &c__1, &nconv, &workl[ihd], &workl[iw]);
 +      if (*rvec) {
 +          F77_FUNC(ssesrt,SSESRT)("LA", rvec, &nconv, &d__[1], ncv, &workl[iq], &ldq);
 +      } else {
 +         F77_FUNC(scopy,SCOPY)(ncv, &workl[bounds], &c__1, &workl[ihb], &c__1);
 +          d__1 = bnorm2 / rnorm;
 +         F77_FUNC(sscal,SSCAL)(ncv, &d__1, &workl[ihb], &c__1);
 +          F77_FUNC(ssortr,SSORTR)("LA", &c__1, &nconv, &d__[1], &workl[ihb]);
 +      }
 +
 +    }
 +
 +    if (*rvec && *howmny == 'A') {
 +
 +      F77_FUNC(sgeqr2,SGEQR2)(ncv, &nconv, &workl[iq], &ldq, &workl[iw + *ncv], &workl[ihb],
 +               &ierr);
 +
 +      F77_FUNC(sorm2r,SORM2R)("Right", "Notranspose", n, ncv, &nconv, &workl[iq], &ldq, &
 +              workl[iw + *ncv], &v[v_offset], ldv, &workd[*n + 1], &ierr);
 +      F77_FUNC(slacpy,SLACPY)("All", n, &nconv, &v[v_offset], ldv, &z__[z_offset], ldz);
 +
 +      i__1 = *ncv - 1;
 +      for (j = 1; j <= i__1; ++j) {
 +          workl[ihb + j - 1] = 0.;
 +      }
 +      workl[ihb + *ncv - 1] = 1.;
 +      F77_FUNC(sorm2r,SORM2R)("Left", "Transpose", ncv, &c__1, &nconv, &workl[iq], &ldq, &
 +              workl[iw + *ncv], &workl[ihb], ncv, &temp, &ierr);
 +
 +    } else if (*rvec && *howmny == 'S') {
 +
 +    }
 +
 +    if (!strncmp(type__, "REGULR",6) && *rvec) {
 +
 +      i__1 = *ncv;
 +      for (j = 1; j <= i__1; ++j) {
 +          workl[ihb + j - 1] = rnorm * fabs(workl[ihb + j - 1]);
 +      }
 +
 +    } else if (strncmp(type__, "REGULR",6) && *rvec) {
 +
 +      F77_FUNC(sscal,SSCAL)(ncv, &bnorm2, &workl[ihb], &c__1);
 +      if (!strncmp(type__, "SHIFTI",6)) {
 +
 +          i__1 = *ncv;
 +          for (k = 1; k <= i__1; ++k) {
 +              d__2 = workl[iw + k - 1];
 +              workl[ihb + k - 1] = fabs(workl[ihb + k - 1])/(d__2 * d__2);
 +          }
 +
 +      } else if (!strncmp(type__, "BUCKLE",6)) {
 +
 +          i__1 = *ncv;
 +          for (k = 1; k <= i__1; ++k) {
 +              d__2 = workl[iw + k - 1] - 1.;
 +              workl[ihb + k - 1] = *sigma * fabs(workl[ihb + k - 1])/(d__2 * d__2);
 +          }
 +
 +      } else if (!strncmp(type__, "CAYLEY",6)) {
 +
 +          i__1 = *ncv;
 +          for (k = 1; k <= i__1; ++k) {
 +            workl[ihb + k - 1] = fabs(workl[ihb + k - 1] / workl[iw + k - 1] * (workl[iw + k - 1] - 1.));
 +            
 +          }
 +
 +      }
 +
 +    }
 +
 +    if (*rvec && (!strncmp(type__, "SHIFTI",6) || !strncmp(type__, "CAYLEY",6))) {
 +
 +      i__1 = nconv - 1;
 +      for (k = 0; k <= i__1; ++k) {
 +          workl[iw + k] = workl[iq + k * ldq + *ncv - 1] / workl[iw + k];
 +      }
 +
 +    } else if (*rvec && !strncmp(type__, "BUCKLE", 6)) {
 +
 +      i__1 = nconv - 1;
 +      for (k = 0; k <= i__1; ++k) {
 +          workl[iw + k] = workl[iq + k * ldq + *ncv - 1] / (workl[iw + k] - 
 +                  1.);
 +      }
 +
 +    }
 +
 +    if (strncmp(type__, "REGULR",6)) {
 +      F77_FUNC(sger,SGER)(n, &nconv, &c_b102, &resid[1], &c__1, &workl[iw], &c__1, &z__[
 +              z_offset], ldz);
 +    }
 +
 +L9000:
 +
 +    return;
 +
 +}
 +
 +
index 5b5fff88e7e54f559f5d5ca7847fc41f96db0346,0000000000000000000000000000000000000000..d7b170b591469dca01a6de578520946fd79f1f3e
mode 100644,000000..100644
--- /dev/null
@@@ -1,324 -1,0 +1,320 @@@
- #ifndef F77_FUNC
- #define F77_FUNC(name,NAME) name ## _
- #endif
 +/*
 + * 
 + * This file is part of Gromacs        Copyright (c) 1991-2004
 + * David van der Spoel, Erik Lindahl, University of Groningen.
 + *
 + * This file contains a subset of ARPACK functions to perform
 + * diagonalization and SVD for sparse matrices in Gromacs.
 + *
 + * The code has been translated to C to avoid a dependency on
 + * a Fortran compiler, and it has been made thread-safe by using
 + * additional workspace arrays to store data during reverse communication.
 + *
 + * You might prefer the original ARPACK library for general use, but
 + * if you want to, this version can be redistributed freely, just
 + * as the original library. However, please make clear that it is the
 + * hacked version from Gromacs so any bugs are blamed on us and not
 + * the original authors. You should also be aware that the double
 + * precision work array workd needs to be of size (3*N+4) here
 + * (4 more than the general library), and there is an extra argument
 + * iwork, which should be an integer work array of length 80.
 + * 
 + * ARPACK was written by 
 + *
 + *     Danny Sorensen               Phuong Vu
 + *    Richard Lehoucq              CRPC / Rice University
 + *    Dept. of Computational &     Houston, Texas
 + *    Applied Mathematics
 + *    Rice University           
 + *    Houston, Texas            
 + */
 +/*! \internal \file
 + * \brief
 + * Selected routines from ARPACK
 + *
 + * This file contains a subset of ARPACK functions to perform
 + * diagonalization and SVD for sparse matrices in Gromacs.
 + *
 + * Consult the main ARPACK site for detailed documentation:
 + * http://www.caam.rice.edu/software/ARPACK/
 + *
 + * Below, we just list the options and any specific differences
 + * from ARPACK. The code is essentially the same, but the routines
 + * have been made thread-safe by using extra workspace arrays.
 + */
 +#ifndef GMX_ARPACK_H
 +#define GMX_ARPACK_H
 +
 +#ifdef HAVE_CONFIG_H
 +#include "config.h"
 +#endif
 +#ifdef __cplusplus
 +extern "C" {
 +#endif
 +
 +/*! \brief Implicitly Restarted Arnoldi Iteration, double precision.
 + *
 + *  Reverse communication interface for the Implicitly Restarted Arnoldi 
 + *  Iteration.  For symmetric problems this reduces to a variant of the
 + *  Lanczos method. See the ARPACK site for details.
 + *
 + *  \param ido     Reverse communication flag. Set to 0 first time.
 + *                 Upon return with ido=-1 or ido=1 you should calculate
 + *                 Y=A*X and recall the routine. Return with ido=2 means
 + *                 Y=B*X should be calculated. ipntr[0] is the pointer in
 + *                 workd for X, ipntr[1] is the index for Y.
 + *                 Return with ido=99 means it finished.
 + *  \param bmat    'I' for standard eigenproblem, 'G' for generalized.
 + *  \param n       Order of eigenproblem.
 + *  \param which   Which eigenvalues to calculate. 'LA' for largest 
 + *                 algebraic, 'SA' for smallest algebraic, 'LM' for largest
 + *                 magnitude, 'SM' for smallest magnitude, and finally
 + *                 'BE' (both ends) to calculate half from each end of
 + *                 the spectrum.
 + *  \param nev     Number of eigenvalues to calculate. 0<nev<n.
 + *  \param tol     Tolerance; machine precision is used if it is 0.
 + *  \param resid   Optional starting residual vector at input if info=1,
 + *                 otherwise a random one is used. Final residual vector on 
 + *                 return.
 + *  \param ncv     Number of columns in matrix v.
 + *  \param v       N*NCV matrix. V contains the Lanczos basis vectors.
 + *  \param ldv     Leading dimension of v.
 + *  \param iparam  Integer array, size 11. Same contents as arpack.
 + *  \param ipntr   Integer array, size 11. Points to starting locations
 + *                 in the workd/workl arrays. Same contents as arpack.
 + *  \param workd   Double precision work array, length 3*n+4. 
 + *                 Provide the same array for all calls, and don't touch it.
 + *                 IMPORTANT: This is 4 units larger than standard ARPACK!
 + *  \param iwork   Integer work array, size 80. 
 + *                 Provide the same array for all calls, and don't touch it.
 + *                 IMPORTANT: New argument compared to standard ARPACK!
 + *  \param workl   Double precision work array, length lworkl.
 + *  \param lworkl  Length of the work array workl. Must be at least ncv*(ncv+8)
 + *  \param info    Set info to 0 to use random initial residual vector,
 + *                 or to 1 if you provide one. On output, info=0 means
 + *                 normal exit, 1 that max number of iterations was reached,
 + *                 and 3 that no shifts could be applied. Negative numbers
 + *                 correspond to errors in the arguments provided.
 + */
 +void
 +F77_FUNC(dsaupd,DSAUPD)(int *     ido, 
 +                        const char *    bmat, 
 +                        int *     n, 
 +                        const char *    which, 
 +                        int *     nev, 
 +                        double *  tol, 
 +                        double *  resid, 
 +                        int *     ncv,
 +                        double *  v, 
 +                        int *     ldv, 
 +                        int *     iparam,
 +                        int *     ipntr, 
 +                        double *  workd, 
 +                        int *     iwork,
 +                        double *  workl, 
 +                        int *     lworkl,
 +                        int *     info);
 +
 +
 +
 +/*! \brief Get eigenvalues/vectors after Arnoldi iteration, double prec.
 + *
 + *  See the ARPACK site for details. You must have finished the iterative
 + *  part with dsaupd() before calling this function.
 + *
 + *  \param rvec    1 if you want eigenvectors, 0 if not.
 + *  \param howmny  'A' if you want all nvec vectors, 'S' if you
 + *                 provide a subset selection in select[].
 + *  \param select  Integer array, dimension ncv. Indices of the
 + *                 eigenvectors to calculate. Fortran heritage means
 + *                 indexing starts at 1. This array must be given even if
 + *                 howmny is 'A'. (The ARPACK documentation is wrong on this.)
 + *  \param d       Double precision array, length nev. Eigenvalues.              
 + *  \param z       Double precision array, n*nev. Eigenvectors.           
 + *  \param ldz     Leading dimension of z. Normally n.
 + *  \param sigma   Shift if iparam[6] is 3,4, or 5. Ignored otherwise.
 + *  \param bmat    Provide the same argument as you did to dsaupd()
 + *  \param n       Provide the same argument as you did to dsaupd()
 + *  \param which   Provide the same argument as you did to dsaupd()
 + *  \param nev     Provide the same argument as you did to dsaupd()
 + *  \param tol     Provide the same argument as you did to dsaupd()
 + *  \param resid   Provide the same argument as you did to dsaupd()
 + *                 The array must not be touched between the two function calls!
 + *  \param ncv     Provide the same argument as you did to dsaupd()
 + *  \param v       Provide the same argument as you did to dsaupd()
 + *                 The array must not be touched between the two function calls!
 + *  \param ldv     Provide the same argument as you did to dsaupd()
 + *  \param iparam  Provide the same argument as you did to dsaupd()
 + *                 The array must not be touched between the two function calls!
 + *  \param ipntr   Provide the same argument as you did to dsaupd()
 + *                 The array must not be touched between the two function calls!
 + *  \param workd   Provide the same argument as you did to dsaupd()
 + *                 The array must not be touched between the two function calls!
 + *  \param workl   Double precision work array, length lworkl.
 + *                 The array must not be touched between the two function calls!
 + *  \param lworkl  Provide the same argument as you did to dsaupd()
 + *  \param info    Provide the same argument as you did to dsaupd()
 + */
 +void
 +F77_FUNC(dseupd,DSEUPD)(int *     rvec, 
 +                        const char *    howmny, 
 +                        int *     select, 
 +                        double *  d, 
 +                        double *  z, 
 +                        int *     ldz, 
 +                        double *  sigma, 
 +                        const char *    bmat, 
 +                        int *     n, 
 +                        const char *    which, 
 +                        int *     nev, 
 +                        double *  tol, 
 +                        double *  resid, 
 +                        int *     ncv, 
 +                        double *  v,
 +                        int *     ldv, 
 +                        int *     iparam, 
 +                        int *     ipntr, 
 +                        double *  workd, 
 +                        double *  workl, 
 +                        int *     lworkl, 
 +                        int *     info);
 +
 +
 +
 +
 +
 +/*! \brief Implicitly Restarted Arnoldi Iteration, single precision.
 + *
 + *  Reverse communication interface for the Implicitly Restarted Arnoldi 
 + *  Iteration.  For symmetric problems this reduces to a variant of the
 + *  Lanczos method. See the ARPACK site for details.
 + *
 + *  \param ido     Reverse communication flag. Set to 0 first time.
 + *                 Upon return with ido=-1 or ido=1 you should calculate
 + *                 Y=A*X and recall the routine. Return with ido=2 means
 + *                 Y=B*X should be calculated. ipntr[0] is the pointer in
 + *                 workd for X, ipntr[1] is the index for Y.
 + *                 Return with ido=99 means it finished.
 + *  \param bmat    'I' for standard eigenproblem, 'G' for generalized.
 + *  \param n       Order of eigenproblem.
 + *  \param which   Which eigenvalues to calculate. 'LA' for largest 
 + *                 algebraic, 'SA' for smallest algebraic, 'LM' for largest
 + *                 magnitude, 'SM' for smallest magnitude, and finally
 + *                 'BE' (both ends) to calculate half from each end of
 + *                 the spectrum.
 + *  \param nev     Number of eigenvalues to calculate. 0<nev<n.
 + *  \param tol     Tolerance; machine precision is used if it is 0.
 + *  \param resid   Optional starting residual vector at input if info=1,
 + *                 otherwise a random one is used. Final residual vector on 
 + *                 return.
 + *  \param ncv     Number of columns in matrix v.
 + *  \param v       N*NCV matrix. V contains the Lanczos basis vectors.
 + *  \param ldv     Leading dimension of v.
 + *  \param iparam  Integer array, size 11. Same contents as arpack.
 + *  \param ipntr   Integer array, size 11. Points to starting locations
 + *                 in the workd/workl arrays. Same contents as arpack.
 + *  \param workd   Single precision work array, length 3*n+4. 
 + *                 Provide the same array for all calls, and don't touch it.
 + *                 IMPORTANT: This is 4 units larger than standard ARPACK!
 + *  \param iwork   Integer work array, size 80. 
 + *                 Provide the same array for all calls, and don't touch it.
 + *                 IMPORTANT: New argument compared to standard ARPACK!
 + *  \param workl   Single precision work array, length lworkl.
 + *  \param lworkl  Length of the work array workl. Must be at least ncv*(ncv+8)
 + *  \param info    Set info to 0 to use random initial residual vector,
 + *                 or to 1 if you provide one. On output, info=0 means
 + *                 normal exit, 1 that max number of iterations was reached,
 + *                 and 3 that no shifts could be applied. Negative numbers
 + *                 correspond to errors in the arguments provided.
 + */
 +void 
 +F77_FUNC(ssaupd,SSAUPD)(int *     ido, 
 +                        const char *    bmat, 
 +                        int *     n, 
 +                        const char *    which, 
 +                        int *     nev, 
 +                        float *   tol, 
 +                        float *   resid, 
 +                        int *     ncv,
 +                        float *   v, 
 +                        int *     ldv, 
 +                        int *     iparam,
 +                        int *     ipntr, 
 +                        float *   workd, 
 +                        int *     iwork,
 +                        float *   workl, 
 +                        int *     lworkl,
 +                        int *     info);
 +
 +
 +
 +
 +
 +/*! \brief Get eigenvalues/vectors after Arnoldi iteration, single prec.
 + *
 + *  See the ARPACK site for details. You must have finished the iterative
 + *  part with ssaupd() before calling this function.
 + *
 + *  \param rvec    1 if you want eigenvectors, 0 if not.
 + *  \param howmny  'A' if you want all nvec vectors, 'S' if you
 + *                 provide a subset selection in select[].
 + *  \param select  Integer array, dimension ncv. Indices of the
 + *                 eigenvectors to calculate. Fortran heritage means
 + *                 indexing starts at 1. This array must be given even if
 + *                 howmny is 'A'. (The ARPACK documentation is wrong on this.)
 + *  \param d       Single precision array, length nev. Eigenvalues.              
 + *  \param z       Single precision array, n*nev. Eigenvectors.           
 + *  \param ldz     Leading dimension of z. Normally n.
 + *  \param sigma   Shift if iparam[6] is 3,4, or 5. Ignored otherwise.
 + *  \param bmat    Provide the same argument as you did to ssaupd()
 + *  \param n       Provide the same argument as you did to ssaupd()
 + *  \param which   Provide the same argument as you did to ssaupd()
 + *  \param nev     Provide the same argument as you did to ssaupd()
 + *  \param tol     Provide the same argument as you did to ssaupd()
 + *  \param resid   Provide the same argument as you did to ssaupd()
 + *                 The array must not be touched between the two function calls!
 + *  \param ncv     Provide the same argument as you did to ssaupd()
 + *  \param v       Provide the same argument as you did to ssaupd()
 + *                 The array must not be touched between the two function calls!
 + *  \param ldv     Provide the same argument as you did to ssaupd()
 + *  \param iparam  Provide the same argument as you did to ssaupd()
 + *                 The array must not be touched between the two function calls!
 + *  \param ipntr   Provide the same argument as you did to ssaupd()
 + *                 The array must not be touched between the two function calls!
 + *  \param workd   Provide the same argument as you did to ssaupd()
 + *                 The array must not be touched between the two function calls!
 + *  \param workl   Single precision work array, length lworkl.
 + *                 The array must not be touched between the two function calls!
 + *  \param lworkl  Provide the same argument as you did to ssaupd()
 + *  \param info    Provide the same argument as you did to ssaupd()
 + */
 +void
 +F77_FUNC(sseupd,SSEUPD)(int *     rvec, 
 +                        const char *    howmny, 
 +                        int *     select, 
 +                        float *   d, 
 +                        float *   z, 
 +                        int *     ldz, 
 +                        float *   sigma, 
 +                        const char *    bmat, 
 +                        int *     n, 
 +                        const char *    which, 
 +                        int *     nev, 
 +                        float *   tol, 
 +                        float *   resid, 
 +                        int *     ncv, 
 +                        float *   v,
 +                        int *     ldv, 
 +                        int *     iparam, 
 +                        int *     ipntr, 
 +                        float *   workd, 
 +                        float *   workl, 
 +                        int *     lworkl, 
 +                        int *     info);
 +
 +#ifdef __cplusplus
 +}
 +#endif
 +
 +#endif
 +
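To make the reverse-communication contract documented in this header concrete, here is a minimal driver sketch for the double-precision pair dsaupd()/dseupd(). It is a sketch under assumptions, not GROMACS code: the matrix-vector product av() and the sizes n, nev, ncv are hypothetical placeholders, and it solves a standard symmetric eigenproblem (bmat "I", mode 1, which "SA"). Note the two deviations from stock ARPACK called out above: workd must be 3*n+4 long, and the extra iwork array of length 80 must be passed unchanged on every call.

#include <stdlib.h>
#include "gmx_arpack.h"          /* assumed include path for this header */

void av(int n, const double *x, double *y);   /* hypothetical: y = A*x */

int smallest_eigenpairs(void)
{
    int     n = 100, nev = 4, ncv = 20;
    int     ldv = n, ldz = n;
    int     lworkl = ncv * (ncv + 8);        /* minimum allowed size */
    int     ido = 0, info = 0, rvec = 1;
    double  tol = 0.0, sigma = 0.0;          /* tol 0 => machine precision */
    int     iparam[11] = {0}, ipntr[11] = {0}, iwork[80] = {0};
    int    *select = calloc(ncv, sizeof(*select));
    double *resid  = calloc(n, sizeof(*resid));
    double *v      = calloc(n * ncv, sizeof(*v));
    double *workd  = calloc(3 * n + 4, sizeof(*workd));  /* note the +4 */
    double *workl  = calloc(lworkl, sizeof(*workl));
    double *d      = calloc(nev, sizeof(*d));
    double *z      = calloc(n * nev, sizeof(*z));

    iparam[0] = 1;      /* ishift=1: exact shifts */
    iparam[2] = 300;    /* maximum number of iterations */
    iparam[6] = 1;      /* mode 1: standard eigenproblem */

    do {
        F77_FUNC(dsaupd, DSAUPD)(&ido, "I", &n, "SA", &nev, &tol, resid,
                                 &ncv, v, &ldv, iparam, ipntr, workd,
                                 iwork, workl, &lworkl, &info);
        if (ido == -1 || ido == 1) {
            /* ipntr[] holds 1-based Fortran offsets into workd */
            av(n, workd + ipntr[0] - 1, workd + ipntr[1] - 1);
        }
    } while (ido != 99);

    if (info >= 0) {
        /* Extract the converged eigenvalues into d, vectors into z */
        F77_FUNC(dseupd, DSEUPD)(&rvec, "A", select, d, z, &ldz, &sigma,
                                 "I", &n, "SA", &nev, &tol, resid, &ncv,
                                 v, &ldv, iparam, ipntr, workd, workl,
                                 &lworkl, &info);
    }
    free(select); free(resid); free(v); free(workd);
    free(workl); free(d); free(z);
    return info;    /* 0 on normal exit */
}

For a generalized problem (bmat "G") the loop must also answer ido == 2 requests with y = B*x, as described in the \param ido documentation above.
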
index f345d45a9a93045129aa56fc89ef4ef2966a58fa,0000000000000000000000000000000000000000..e9ffbd72bd697535b314ddd4d3ded283aea6b21e
mode 100644,000000..100644
--- /dev/null
@@@ -1,218 -1,0 +1,215 @@@
- #ifndef F77_FUNC
- #define F77_FUNC(name,NAME) name ## _
- #endif
 +/*
 + * 
 + *                This source code is part of
 + * 
 + *                 G   R   O   M   A   C   S
 + * 
 + *          GROningen MAchine for Chemical Simulations
 + * 
 + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
 + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 + * Copyright (c) 2001-2008, The GROMACS development team,
 + * check out http://www.gromacs.org for more information.
 + 
 + * This program is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU General Public License
 + * as published by the Free Software Foundation; either version 2
 + * of the License, or (at your option) any later version.
 + * 
 + * If you want to redistribute modifications, please consider that
 + * scientific software is very special. Version control is crucial -
 + * bugs must be traceable. We will be happy to consider code for
 + * inclusion in the official distribution, but derived work must not
 + * be called official GROMACS. Details are found in the README & COPYING
 + * files - if they are missing, get the official version at www.gromacs.org.
 + * 
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the papers on the package - you can find them in the top README file.
 + * 
 + * For more info, check our website at http://www.gromacs.org
 + * 
 + * And Hey:
 + * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon
 + */
 +/*! \internal \file
 + * \brief
 + * Header definitions for the standard BLAS library.
 + *
 + * This is the subset of BLAS routines used for the
 + * linear algebra operations in Gromacs. 
 + * Do NOT use this for other purposes - we only provide this as a 
 + * simple fallback/reference implementation when no optimized BLAS 
 + * is present. If you need an implementation for your own code 
 + * there are several much faster versions out there.
 + *
 + * All routines are compatible with the BLAS reference implementation,
 + * meaning they assume Fortran-style (column-major) matrix organization.
 + *
 + * There is plenty of documentation for these routines available
 + * at http://www.netlib.org/blas , so there is no point in repeating
 + * it here.
 + */
 +#ifndef GMX_BLAS_H
 +#define GMX_BLAS_H
 +
 +/*! \cond */
 +
 +#ifdef HAVE_CONFIG_H
 +#include "config.h"
 +#endif
 +
 +/* Suppress Cygwin compiler warnings from using newlib version of
 + * ctype.h */
 +#ifdef GMX_CYGWIN
 +#undef toupper
 +#endif
 +
 +#ifdef __cplusplus
 +extern "C" {
 +#endif
 +#if 0
 +}
 +#endif
 +
 +/* Double precision versions */
 +double
 +F77_FUNC(dasum,DASUM)(int *n, double *dx, int *incx);
 +
 +void
 +F77_FUNC(daxpy,DAXPY)(int *n, double *da, double *dx, int *incx, double *dy, int *incy);
 +
 +void
 +F77_FUNC(dcopy,DCOPY)(int *n, double *dx, int *incx, double *dy, int *incy);
 +
 +double
 +F77_FUNC(ddot,DDOT)(int *n, double *dx, int *incx, double *dy, int *incy);
 +
 +void
 +F77_FUNC(dgemm,DGEMM)(const char *transa, const char *transb, int *m, int *n, int *k, 
 +       double *alpha, double *a, int *lda, double *b, int *ldb, 
 +       double *beta, double *c, int *ldc);
 +
 +void
 +F77_FUNC(dgemv,DGEMV)(const char *trans, int *m, int *n, double *alpha, double *a, int *lda,
 +       double *x, int *incx, double *beta, double *y, int *incy);
 +
 +void
 +F77_FUNC(dger,DGER)(int *m, int *n, double *alpha, double *x, int *incx, 
 +      double *y, int *incy, double *a, int *lda);
 +
 +double
 +F77_FUNC(dnrm2,DNRM2)(int  *n, double *x, int *incx);
 +
 +void
 +F77_FUNC(drot,DROT)(int *n, double *dx, int *incx, 
 +      double *dy, int *incy, double *c, double *s);
 +
 +void 
 +F77_FUNC(dscal,DSCAL)(int *n, double *fact, double *dx, int *incx);
 +
 +void
 +F77_FUNC(dswap,DSWAP)(int *n, double *dx, int *incx, double *dy, int *incy);
 +
 +void
 +F77_FUNC(dsymv,DSYMV)(const char *uplo, int *n, double *alpha, double *a, int *lda,
 +       double *x, int *incx, double *beta, double *y, int *incy);
 +
 +void
 +F77_FUNC(dsyr2,DSYR2)(const char *uplo, int *n, double *alpha, double *x, int *incx,
 +       double *y, int *incy, double *a, int *lda);
 +
 +void
 +F77_FUNC(dsyr2k,DSYR2K)(const char *uplo, const char *trans, int *n, int *k, double *alpha, double *a,
 +        int *lda, double *b, int *ldb, double *beta, double *c, int *ldc);
 +
 +void 
 +F77_FUNC(dtrmm,DTRMM)(const char *side, const char *uplo, const char *transa, const char *diag, int *m, int *n, 
 +       double *alpha, double *a, int *lda, double *b, int *ldb);
 +
 +void 
 +F77_FUNC(dtrmv,DTRMV)(const char *uplo, const char *trans, const char *diag, int *n, 
 +       double *a, int *lda, double *x, int *incx);
 +
 +void
 +F77_FUNC(dtrsm,DTRSM)(const char *side, const char *uplo, const char *transa, const char *diag, int *m, int *n,
 +       double *alpha, double *a,int *lda, double *b, int *ldb);
 +
 +int
 +F77_FUNC(idamax,IDAMAX)(int *n, double *dx, int *incx);
 +
 +
 +
 +/* Single precision versions */
 +float
 +F77_FUNC(sasum,SASUM)(int *n, float *dx, int *incx);
 +
 +void
 +F77_FUNC(saxpy,SAXPY)(int *n, float *da, float *dx, int *incx, float *dy, int *incy);
 +
 +void
 +F77_FUNC(scopy,SCOPY)(int *n, float *dx, int *incx, float *dy, int *incy);
 +
 +float
 +F77_FUNC(sdot,SDOT)(int *n, float *dx, int *incx, float *dy, int *incy);
 +
 +void
 +F77_FUNC(sgemm,SGEMM)(const char *transa, const char *transb, int *m, int *n, int *k, 
 +       float *alpha, float *a, int *lda, float *b, int *ldb, 
 +       float *beta, float *c, int *ldc);
 +
 +void
 +F77_FUNC(sgemv,SGEMV)(const char *trans, int *m, int *n, float *alpha, float *a, int *lda,
 +       float *x, int *incx, float *beta, float *y, int *incy);
 +
 +void
 +F77_FUNC(sger,SGER)(int *m, int *n, float *alpha, float *x, int *incx, 
 +      float *y, int *incy, float *a, int *lda);
 +
 +float
 +F77_FUNC(snrm2,SNRM2)(int  *n, float *x, int *incx);
 +
 +void
 +F77_FUNC(srot,SROT)(int *n, float *dx, int *incx, 
 +      float *dy, int *incy, float *c, float *s);
 +
 +void 
 +F77_FUNC(sscal,SSCAL)(int *n, float *fact, float *dx, int *incx);
 +
 +void
 +F77_FUNC(sswap,SSWAP)(int *n, float *dx, int *incx, float *dy, int *incy);
 +
 +void
 +F77_FUNC(ssymv,SSYMV)(const char *uplo, int *n, float *alpha, float *a, int *lda,
 +       float *x, int *incx, float *beta, float *y, int *incy);
 +
 +void
 +F77_FUNC(ssyr2,SSYR2)(const char *uplo, int *n, float *alpha, float *x, int *incx,
 +       float *y, int *incy, float *a, int *lda);
 +
 +void
 +F77_FUNC(ssyr2k,SSYR2K)(const char *uplo, const char *trans, int *n, int *k, float *alpha, float *a,
 +        int *lda, float *b, int *ldb, float *beta, float *c, int *ldc);
 +
 +void 
 +F77_FUNC(strmm,STRMM)(const char *side, const char *uplo, const char *transa, const char *diag, int *m, int *n, 
 +       float *alpha, float *a, int *lda, float *b, int *ldb);
 +
 +void 
 +F77_FUNC(strmv,STRMV)(const char *uplo, const char *trans, const char *diag, int *n, 
 +       float *a, int *lda, float *x, int *incx);
 +
 +void
 +F77_FUNC(strsm,STRSM)(const char *side, const char *uplo, const char *transa, const char *diag, int *m, int *n,
 +       float *alpha, float *a,int *lda, float *b, int *ldb);
 +
 +int
 +F77_FUNC(isamax,ISAMAX)(int *n, float *dx, int *incx);
 +
 +
 +#ifdef __cplusplus
 +}
 +#endif
 +
 +/*! \endcond */
 +
 +#endif /* GMX_BLAS_H */
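
Because these prototypes mirror the reference Fortran BLAS, every argument, scalars included, is passed by address, and matrices are stored column-major, as the comment above notes. A minimal sketch of a dgemm call under those conventions (the 2x2 values are made up for illustration):

#include "gmx_blas.h"    /* assumed include path for this header */

/* Sketch: C = alpha*A*B + beta*C through the Fortran-style interface.
 * All scalars go by pointer; arrays are column-major. */
void gemm_example(void)
{
    int    m = 2, n = 2, k = 2, lda = 2, ldb = 2, ldc = 2;
    double alpha = 1.0, beta = 0.0;
    double a[4] = { 1.0, 3.0,     /* first column of A  */
                    2.0, 4.0 };   /* second column of A */
    double b[4] = { 1.0, 0.0,     /* B is the identity  */
                    0.0, 1.0 };
    double c[4] = { 0.0 };

    F77_FUNC(dgemm, DGEMM)("N", "N", &m, &n, &k, &alpha, a, &lda,
                           b, &ldb, &beta, c, &ldc);
    /* c now holds A in column-major order: {1, 3, 2, 4} */
}
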
index 548acf40da9ec458530dc8ff7fbe0be97790a7cd,0000000000000000000000000000000000000000..db616395e84b078b4564f251da3f2751543b9c1c
mode 100644,000000..100644
--- /dev/null
@@@ -1,868 -1,0 +1,864 @@@
- #ifndef F77_FUNC
- #define F77_FUNC(name,NAME) name ## _
- #endif
 +/*
 + * 
 + *                This source code is part of
 + * 
 + *                 G   R   O   M   A   C   S
 + * 
 + *          GROningen MAchine for Chemical Simulations
 + * 
 + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
 + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 + * Copyright (c) 2001-2008, The GROMACS development team,
 + * check out http://www.gromacs.org for more information.
 + 
 + * This program is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU General Public License
 + * as published by the Free Software Foundation; either version 2
 + * of the License, or (at your option) any later version.
 + * 
 + * If you want to redistribute modifications, please consider that
 + * scientific software is very special. Version control is crucial -
 + * bugs must be traceable. We will be happy to consider code for
 + * inclusion in the official distribution, but derived work must not
 + * be called official GROMACS. Details are found in the README & COPYING
 + * files - if they are missing, get the official version at www.gromacs.org.
 + * 
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the papers on the package - you can find them in the top README file.
 + * 
 + * For more info, check our website at http://www.gromacs.org
 + * 
 + * And Hey:
 + * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon
 + */
 +/*! \internal \file
 + * \brief
 + * Header definitions for the standard LAPACK library.
 + *
 + * This is the subset of LAPACK routines used for the
 + * linear algebra operations in Gromacs. Most of the execution time
 + * will be spent in the BLAS routines, which you hopefully have an
 + * optimized version of. Gromacs includes reference implementations
 + * of both BLAS and LAPACK so it compiles everywhere, but you should
 + * really try to find a vendor or otherwise optimized version at least
 + * of BLAS for better performance.
 + *
 + * Do NOT use this code for other purposes - we only provide this as a 
 + * simple fallback/reference implementation when no optimized LAPACK
 + * is present. If you need an implementation for your own code 
 + * there are several much faster versions out there.
 + *
 + * All routines are compatible with the LAPACK/BLAS reference implementations,
 + * meaning they assume Fortran-style (column-major) matrix organization.
 + *
 + * There is plenty of documentation for these routines available
 + * at http://www.netlib.org/lapack , so there is no point in repeating
 + * it here.
 + */
 +#ifndef GMX_LAPACK_H
 +#define GMX_LAPACK_H
 +
 +/*! \cond */
 +
 +#ifdef HAVE_CONFIG_H
 +#include "config.h"
 +#endif
 +
 +/* Suppress Cygwin compiler warnings from using newlib version of
 + * ctype.h */
 +#ifdef GMX_CYGWIN
 +#undef toupper
 +#endif
 +
 +#ifdef __cplusplus
 +extern "C" {
 +#endif
 +#if 0
 +}
 +#endif
 +/* Double precision */
 +
 +void
 +F77_FUNC(dbdsdc,DBDSDC)(const char *uplo, const char *compq, int *n, double *d, double *e, double *u, 
 +      int *ldu, double *vt, int *ldvt, double *q, int *iq, double *work, 
 +      int *iwork, int *info);
 +
 +void
 +F77_FUNC(dgetf2,DGETF2)(int *m, int *n, double *a, int *lda, int *ipiv, int *info);
 +
 +void
 +F77_FUNC(dlamrg,DLAMRG)(int *n1, int *n2, double *a, int *dtrd1, int *dtrd2, int *index);
 +
 +void
 +F77_FUNC(dlarnv,DLARNV)(int *idist, int *iseed, int *n, double *x);
 +
 +void 
 +F77_FUNC(dlasd0,DLASD0)(int *n, int *sqre, double *d, double *e, double *u, 
 +      int *ldu, double *vt, int *ldvt, int *smlsiz, int *iwork, 
 +      double *work, int *info);
 +
 +void 
 +F77_FUNC(dlasda,DLASDA)(int *icompq, int *smlsiz, int *n, int *sqre, double *d, double *e, 
 +      double *u, int *ldu, double *vt, int *k, double *difl, double *difr, 
 +      double *z, double *poles, int *givptr, int *givcol, int *ldgcol, 
 +      int *perm, double *givnum, double *c, double *s, 
 +      double *work, int *iwork, int *info);
 +
 +void 
 +F77_FUNC(dlasq6,DLASQ6)(int *i0, int *n0, double *z, int *pp, double *dmin, double *dmin1, 
 +      double *dmin2, double *dn, double *dnm1, double *dnm2);
 +
 +void
 +F77_FUNC(dorgl2,DORGL2)(int *m,       int *n, int *k, double *a, int *lda, 
 +      double *tau, double *work, int *info);
 +
 +void
 +F77_FUNC(dbdsqr,DBDSQR)(const char *uplo, int *n, int *ncvt, int *nru, int *ncc, double *d, 
 +      double *e, double *vt, int *ldvt, double *u, int *ldu,
 +      double *c, int *ldc, double *work, int *info);
 +
 +void
 +F77_FUNC(dgetrf,DGETRF)(int *m,       int *n, double *a, int *lda, int *ipiv, int *info);
 +
 +void
 +F77_FUNC(dgetri,DGETRI)(int *n,       double *a, int *lda, int *ipiv, double *work, 
 +      int *lwork, int *info);
 +
 +void
 +F77_FUNC(dgetrs,DGETRS)(const char *trans, int *n, int *nrhs, double *a, int *lda, int *ipiv,
 +      double *b, int *ldb, int *info);
 +
 +void
 +F77_FUNC(dtrtri,DTRTRI)(const char *uplo, const char *diag, int *n, double *a, int *lda, int *info);
 +
 +void
 +F77_FUNC(dtrti2,DTRTI2)(const char *uplo, const char *diag, int *n, double *a, int *lda, int *info);
 +
 +double
 +F77_FUNC(dlange,DLANGE)(const char *norm, int *m, int *n, double *a, int *lda, double *work);
 +
 +void
 +F77_FUNC(dlarrbx,DLARRBX)(int *n, double *d, double *l, double *ld, double *lld, int *ifirst,
 +       int *ilast, double *rtol1, double *rtol2, int *offset, double *w,
 +       double *wgap, double *werr, double *work, int *iwork, int *info);
 +
 +void 
 +F77_FUNC(dlasd1,DLASD1)(int *nl, int *nr, int *sqre, double *d, double *alpha, double *beta, 
 +      double *u, int *ldu, double *vt, int *ldvt, int *idxq, int *iwork, 
 +      double *work, int *info);
 +
 +void
 +F77_FUNC(dlasdq,DLASDQ)(const char *uplo, int *sqre, int *n, int *ncvt, int *nru, int *ncc,
 +      double *d, double *e, double *vt, int *ldvt, double *u, int *ldu, 
 +      double *c, int *ldc, double *work, int *info);
 +
 +void 
 +F77_FUNC(dlasr,DLASR)(const char *side, const char *pivot, const char *direct, int *m, int *n, double *c, 
 +       double *s, double *a, int *lda);
 +
 +void 
 +F77_FUNC(dorglq,DORGLQ)(int *m, int *n, int *k, double *a, int *lda, 
 +      double *tau, double *work, int *lwork, int *info);
 +
 +void
 +F77_FUNC(dormtr,DORMTR)(const char *side, const char *uplo, const char *trans, int *m, int *n, double *a, 
 +      int *lda, double *tau, double *c, int *ldc,
 +      double *work, int *lwork, int *info);
 +
 +void
 +F77_FUNC(dgebd2,DGEBD2)(int *m, int *n, double *a, int *lda, double *d, double *e,
 +      double *tauq, double *taup, double *work, int *info);
 +
 +void 
 +F77_FUNC(dlabrd,DLABRD)(int *m, int *n, int *nb, double *a, int *lda, double *d,
 +      double *e, double *tauq, double *taup, double *x,
 +      int *ldx, double *y, int *ldy);
 +
 +double
 +F77_FUNC(dlanst,DLANST)(const char *norm, int *n, double *d, double *e);
 +
 +double
 +F77_FUNC(dlansy,DLANSY)(const char *norm, const char *uplo, int *n, double *a, int *lda, double *work);
 +
 +void
 +F77_FUNC(dlarrex,DLARREX)(const char *range, int *n, double *vl, double *vu, int *il, int *iu,
 +       double *d, double *e, double *tol, int *nsplit, 
 +       int *isplit, int *m, double *w, int *iblock, int *indexw,
 +       double *gersch, double *work, int *iwork, int *info);
 +
 +void 
 +F77_FUNC(dlasd2,DLASD2)(int *nl, int *nr, int *sqre, int *k, double *d, double *z, 
 +      double *alpha, double *beta, double *u, int *ldu, double *vt, 
 +      int *ldvt, double *dsigma, double *u2, int *ldu2, double *vt2, 
 +      int *ldvt2, int *idxp, int *idx, int *idxc, 
 +      int *idxq, int *coltyp, int *info);
 +
 +void
 +F77_FUNC(dlasdt,DLASDT)(int *n, int *lvl, int *nd, int *inode, int *ndiml, 
 +      int *ndimr, int *msub);
 +
 +void 
 +F77_FUNC(dlasrt,DLASRT)(const char *id, int *n, double *d, int *info);
 +
 +void
 +F77_FUNC(dlasrt2,DLASRT2)(const char *id, int *n, double *d, int *key, int *info);
 +
 +void
 +F77_FUNC(ilasrt2,ILASRT2)(const char *id, int *n, int *d, int *key, int *info);
 +
 +void 
 +F77_FUNC(dorgqr,DORGQR)(int *m, int *n, int *k, double *a, int *lda, double *tau, 
 +      double *work, int *lwork, int *info);
 +
 +void
 +F77_FUNC(dstebz,DSTEBZ)(const char *range, const char *order, int *n, double *vl, double *vu, 
 +      int *il, int *iu, double *abstol, double *d, double *e, 
 +      int *m, int *nsplit, double *w, int *iblock, int *isplit, 
 +      double *work, int *iwork, int *info);
 +
 +void
 +F77_FUNC(dsteqr,DSTEQR)(const char *compz, int *n, double *d__, double *e, 
 +        double *z__,  int *ldz, double *work, int *info);
 +
 +void
 +F77_FUNC(dgebrd,DGEBRD)(int *m, int *n, double *a, int *lda, double *d, double *e,
 +      double *tauq, double *taup, double *work, int *lwork, int *info);
 +
 +void
 +F77_FUNC(dlacpy,DLACPY)(const char *uplo, int *m, int *n, double *a, int *lda, double *b, int *ldb);
 +
 +double
 +F77_FUNC(dlapy2,DLAPY2)(double * x, double * y);
 +
 +
 +void
 +F77_FUNC(dlarrfx,DLARRFX)(int *n, double *d, double *l, double *ld, double *lld, int *ifirst,
 +        int *ilast, double *w, double *sigma, double *dplus, double *lplus,
 +        double *work, int *info);
 +
 +void 
 +F77_FUNC(dlasd3,DLASD3)(int *nl, int *nr, int *sqre, int *k, double *d, double *q, int *ldq, 
 +      double *dsigma, double *u, int *ldu, double *u2, int *ldu2, 
 +      double *vt, int *ldvt, double *vt2, int *ldvt2, int *idxc, 
 +      int *ctot, double *z, int *info);
 +
 +void
 +F77_FUNC(dlaset,DLASET)(const char *uplo, int *m, int *n, double *alpha, 
 +      double *beta, double *a, int *lda);
 +
 +void
 +F77_FUNC(dlassq,DLASSQ)(int *n, double *x, int *incx, double *scale, double *sumsq);
 +
 +void
 +F77_FUNC(dorm2l,DORM2L)(const char *side, const char *trans, int *m, int *n, int *k, double *a, int *lda, 
 +      double *tau, double *c, int *ldc, double *work, int *info);
 +
 +void
 +F77_FUNC(dstegr,DSTEGR)(const char *jobz, const char *range, int *n, double *d, double *e, double *vl, 
 +      double *vu, int *il, int *iu, double *abstol, int *m, double *w, 
 +      double *z, int *ldz, int *isuppz, double *work, 
 +      int *lwork, int *iwork, int *liwork, int *info);
 +
 +void
 +F77_FUNC(ssteqr,SSTEQR)(const char *compz, int *n, float *d__, float *e, 
 +        float *z__,  int *ldz, float *work, int *info);
 +
 +void
 +F77_FUNC(dgelq2,DGELQ2)(int *m, int *n, double *a, int *lda, double *tau, double *work, int *info);
 +
 +void
 +F77_FUNC(dlae2,DLAE2)(double *a, double *b, double *c, double *rt1, double *rt2);
 +
 +void
 +F77_FUNC(dlaev2,DLAEV2)(double *a, double *b, double *c, double *rt1, double *rt2,
 +      double *cs1, double *cs2);
 +
 +void
 +F77_FUNC(dlar1vx,DLAR1VX)(int *n, int *b1, int *bn, double *sigma, double *d, double *l, double *ld, 
 +                          double *lld, double *eval, double *gersch, double *z, double *ztz, double *mingma, 
 +                          int *r, int *isuppz, double *work);
 +
 +void
 +F77_FUNC(dlarrvx,DLARRVX)(int *n, double *d, double *l, int *isplit, int *m, double *w, 
 +       int *iblock, int *indexw, double *gersch, double *tol, double *z, int *ldz, 
 +       int *isuppz, double *work, int *iwork, int *info);
 +
 +void 
 +F77_FUNC(dlasd4,DLASD4)(int *n, int *i, double *d, double *z, double *delta, 
 +      double *rho, double *sigma, double *work, int *info);
 +
 +void
 +F77_FUNC(dlasq1,DLASQ1)(int *n,       double *d, double *e, double *work, int *info);
 +
 +
 +void 
 +F77_FUNC(dlasv2,DLASV2)(double *f, double *g, double *h, double *ssmin, double *ssmax, 
 +      double *snr, double *csr, double *snl, double *csl);
 +
 +void 
 +F77_FUNC(dorm2r,DORM2R)(const char *side, const char *trans, int *m, int *n, int *k, double *a, 
 +      int *lda, double *tau, double *c, int *ldc, double *work, int *info);
 +
 +void
 +F77_FUNC(dstein,DSTEIN)(int *n, double *d, double *e, int *m, double *w, int *iblock, int *isplit, 
 +      double *z, int *ldz, double *work, int *iwork, int *ifail, int *info);
 +
 +void
 +F77_FUNC(dgelqf,DGELQF)(int *m,       int *n, double *a, int *lda, double *tau,
 +      double *work, int *lwork, int *info);
 +
 +void
 +F77_FUNC(dlaebz,DLAEBZ)(int *ijob, int *nitmax, int *n, int *mmax, int *minp, int *nbmin,
 +      double *abstol, double *reltol, double *pivmin, double *d, double *e,
 +      double *e2, int *nval, double *ab, double *c, int *mout, int *nab,
 +      double *work, int *iwork, int *info);
 +
 +void
 +F77_FUNC(dlarf,DLARF)(const char *side, int *m, int *n, double *v, int *incv, double *tau,
 +       double *c, int *ldc, double *work);
 +
 +void
 +F77_FUNC(dlartg,DLARTG)(double *f, double *g, double *cs, double *sn, double *r);
 +
 +void 
 +F77_FUNC(dlasd5,DLASD5)(int *i, double *d, double *z, double *delta, 
 +      double *rho, double *dsigma, double *work);
 +
 +void 
 +F77_FUNC(dlasq2,DLASQ2)(int *n, double *z, int *info);
 +
 +void 
 +F77_FUNC(dlasq3,DLASQ3)(int *i0, int *n0, double *z, int *pp, double *dmin, 
 +      double *sigma, double *desig, double *qmax, int *nfail, 
 +      int *iter, int *ndiv, int *ieee);
 +
 +void
 +F77_FUNC(dlaswp,DLASWP)(int *n,       double *a, int *lda, int *k1, int *k2, int *ipiv, int *incx);
 +
 +void 
 +F77_FUNC(dormbr,DORMBR)(const char *vect, const char *side, const char *trans, int *m, int *n, int *k, 
 +      double *a, int *lda, double *tau, double *c, int *ldc, double *work,
 +      int *lwork, int *info);
 +
 +void
 +F77_FUNC(dsterf,DSTERF)(int *n, double *d, double *e, int *info);
 +
 +void
 +F77_FUNC(dgeqr2,DGEQR2)(int *m,       int *n, double *a, int *lda, double *tau, 
 +      double *work, int *info);
 +
 +void 
 +F77_FUNC(dlaed6,DLAED6)(int *kniter, int *orgati, double *rho, double *d, 
 +      double *z, double *finit, double *tau, int *info);
 +
 +void 
 +F77_FUNC(dlarfb,DLARFB)(const char *side, const char *trans, const char *direct, const char *storev, int *m, int *n, 
 +      int *k, double *v, int *ldv, double *t, int *ldt, double *c,
 +      int *ldc, double *work, int *ldwork);
 +
 +void
 +F77_FUNC(dlaruv,DLARUV)(int *iseed, int *n, double *x);
 +
 +void 
 +F77_FUNC(dlasd6,DLASD6)(int *icompq, int *nl, int *nr, int *sqre, double *d, double *vf, 
 +      double *vl, double *alpha, double *beta, int *idxq, int *perm, 
 +      int *givptr, int *givcol, int *ldgcol, double *givnum, int *ldgnum, 
 +      double *poles, double *difl, double *difr, double *z, int *k, 
 +      double *c, double *s, double *work, int *iwork, int *info);
 +
 +void
 +F77_FUNC(dlatrd,DLATRD)(const char *uplo, int *n, int *nb, double *a, int *lda, double *e, 
 +      double * tau, double *w, int *ldw);
 +
 +void
 +F77_FUNC(dorml2,DORML2)(const char *side, const char *trans, int *m, int *n, int *k, double *a,
 +      int *lda, double *tau, double *c, int *ldc, double *work, int *info);
 +
 +void
 +F77_FUNC(dstevr,DSTEVR)(const char *jobz, const char *range, int *n, double *d, double *e, double *vl, 
 +      double *vu, int *il, int *iu, double *abstol, int *m, double *w, 
 +      double *z, int *ldz, int *isuppz, double *work, 
 +      int *lwork, int *iwork, int *liwork, int *info);
 +
 +void
 +F77_FUNC(dsytrd,DSYTRD)(const char *uplo, int *n, double *a, int *lda, double *d,
 +      double *e, double *tau, double *work, int *lwork, int *info);
 +
 +void
 +F77_FUNC(dsyevr,DSYEVR)(const char *jobz, const char *range, const char *uplo, int *n, 
 +      double *a, int *lda, double *vl, double *vu, int *il,
 +      int *iu, double *abstol, int *m, double *w,
 +      double *z__, int *ldz, int *isuppz, double *work, 
 +      int *lwork, int *iwork, int *liwork, int *info);
 +
 +void
 +F77_FUNC(dormql,DORMQL)(const char *side, const char *trans, int *m, int *n, 
 +      int *k, double *a, int *lda, double *tau, double *c,
 +      int *ldc, double *work, int *lwork, int *info);
 +
 +void 
 +F77_FUNC(dormqr,DORMQR)(const char *side, const char *trans, int *m, int *n, int *k, double *a, 
 +        int *lda, double *tau, double *c, int *ldc, 
 +        double *work, int *lwork, int *info);
 +
 +void
 +F77_FUNC(dorgbr,DORGBR)(const char *vect, int *m, int *n, int *k, double *a, int *lda,
 +      double *tau, double *work, int *lwork, int *info);
 +
 +void
 +F77_FUNC(dlasq5,DLASQ5)(int *i0, int *n0, double *z, int *pp, double *tau, double *dmin, 
 +      double *dmin1, double *dmin2, double *dn, double *dnm1, 
 +      double *dnm2, int *ieee);
 +
 +void 
 +F77_FUNC(dlasd8,DLASD8)(int *icompq, int *k, double *d, double *z, double *vf, double *vl, 
 +      double *difl, double *difr, int *lddifr, double *dsigma, 
 +      double *work, int *info);
 +
 +void
 +F77_FUNC(dlascl,DLASCL)(const char *type, int *kl, int *ku, double *cfrom, double *cto, int *m, 
 +      int *n, double *a, int *lda, int *info);
 +
 +void 
 +F77_FUNC(dlarft,DLARFT)(const char *direct, const char *storev, int *n, int *k, double *v, 
 +      int *ldv, double *tau, double *t, int *ldt);
 +
 +void
 +F77_FUNC(dlagts,DLAGTS)(int *job, int *n, double *a, double *b, double *c, double *d, 
 +      int *in, double *y, double *tol, int *info);
 +
 +void 
 +F77_FUNC(dgesdd,DGESDD)(const char *jobz, int *m, int *n, double *a, int *lda, double *s, double *u, 
 +      int *ldu, double *vt, int *ldvt, double *work, int *lwork, 
 +      int *iwork, int *info);
 +
 +void
 +F77_FUNC(dsytd2,DSYTD2)(const char *uplo, int *n, double *a, int *lda, double *d, 
 +      double *e, double *tau, int *info);
 +
 +void 
 +F77_FUNC(dormlq,DORMLQ)(const char *side, const char *trans, int *m, int *n, int *k, double *a, int *lda, 
 +      double *tau, double *c, int *ldc, double *work, int *lwork, int *info);
 +
 +void
 +F77_FUNC(dorg2r,DORG2R)(int *m, int *n, int *k, double *a, int *lda, double *tau,
 +      double *work, int *info);
 +
 +void 
 +F77_FUNC(dlasq4,DLASQ4)(int *i0, int *n0, double *z, int *pp, int *n0in, double *dmin, 
 +      double *dmin1, double *dmin2, double *dn, double *dn1, 
 +      double *dn2, double *tau, int *ttype);
 +
 +void 
 +F77_FUNC(dlasd7,DLASD7)(int *icompq, int *nl, int *nr, int *sqre, int *k, double *d, double *z,
 +      double *zw, double *vf, double *vfw, double *vl, double *vlw,
 +      double *alpha, double *beta, double *dsigma, int *idx, int *idxp,
 +      int *idxq, int *perm, int *givptr, int *givcol, int *ldgcol, 
 +      double *givnum, int *ldgnum, double *c, double *s, int *info);
 +
 +void
 +F77_FUNC(dlas2,DLAS2)(double *f, double *g, double *h, double *ssmin, double *ssmax);
 +
 +void
 +F77_FUNC(dlarfg,DLARFG)(int *n, double *alpha, double *x, int *incx, double *tau);
 +
 +void
 +F77_FUNC(dlagtf,DLAGTF)(int *n, double *a, double *lambda, double *b, double *c, 
 +      double *tol, double *d, int *in, int *info);
 +
 +void 
 +F77_FUNC(dgeqrf,DGEQRF)(int *m, int *n, double *a, int *lda, double *tau,
 +      double *work, int *lwork, int *info);
 +
 +
 +
 +/* Single precision */
 +
 +void
 +F77_FUNC(sbdsdc,SBDSDC)(const char *uplo, const char *compq, int *n, float *d, float *e, float *u, 
 +      int *ldu, float *vt, int *ldvt, float *q, int *iq, float *work, 
 +      int *iwork, int *info);
 +
 +void
 +F77_FUNC(sgetf2,SGETF2)(int *m, int *n, float *a, int *lda, int *ipiv, int *info);
 +
 +void
 +F77_FUNC(slamrg,SLAMRG)(int *n1, int *n2, float *a, int *dtrd1, int *dtrd2, int *index);
 +
 +void
 +F77_FUNC(slarnv,SLARNV)(int *idist, int *iseed, int *n, float *x);
 +
 +void 
 +F77_FUNC(slasd0,SLASD0)(int *n, int *sqre, float *d, float *e, float *u, 
 +      int *ldu, float *vt, int *ldvt, int *smlsiz, int *iwork, 
 +      float *work, int *info);
 +
 +void 
 +F77_FUNC(slasda,SLASDA)(int *icompq, int *smlsiz, int *n, int *sqre, float *d, float *e, 
 +      float *u, int *ldu, float *vt, int *k, float *difl, float *difr, 
 +      float *z, float *poles, int *givptr, int *givcol, int *ldgcol, 
 +      int *perm, float *givnum, float *c, float *s, 
 +      float *work, int *iwork, int *info);
 +
 +void 
 +F77_FUNC(slasq6,SLASQ6)(int *i0, int *n0, float *z, int *pp, float *dmin, float *dmin1, 
 +      float *dmin2, float *dn, float *dnm1, float *dnm2);
 +
 +void
 +F77_FUNC(sorgl2,SORGL2)(int *m, int *n, int *k, float *a, int *lda,
 +      float *tau, float *work, int *info);
 +
 +void
 +F77_FUNC(sbdsqr,SBDSQR)(const char *uplo, int *n, int *ncvt, int *nru, int *ncc, float *d, 
 +      float *e, float *vt, int *ldvt, float *u, int *ldu,
 +      float *c, int *ldc, float *work, int *info);
 +
 +void
 +F77_FUNC(sgetrf,SGETRF)(int *m, int *n, float *a, int *lda, int *ipiv, int *info);
 +
 +void
 +F77_FUNC(sgetri,SGETRI)(int *n, float *a, int *lda, int *ipiv, float *work,
 +      int *lwork, int *info);
 +
 +void
 +F77_FUNC(sgetrs,SGETRS)(const char *trans, int *n, int *nrhs, float *a, int *lda, int *ipiv,
 +      float *b, int *ldb, int *info);
 +
 +void
 +F77_FUNC(strtri,STRTRI)(const char *uplo, const char *diag, int *n, float *a, int *lda, int *info);
 +
 +void
 +F77_FUNC(strti2,STRTI2)(const char *uplo, const char *diag, int *n, float *a, int *lda, int *info);
 +
 +float
 +F77_FUNC(slange,SLANGE)(const char *norm, int *m, int *n, float *a, int *lda, float *work);
 +
 +void
 +F77_FUNC(slarrbx,SLARRBX)(int *n, float *d, float *l, float *ld, float *lld, int *ifirst,
 +       int *ilast, float *rtol1, float *rtol2, int *offset, float *w,
 +       float *wgap, float *werr, float *work, int *iwork, int *info);
 +
 +void 
 +F77_FUNC(slasd1,SLASD1)(int *nl, int *nr, int *sqre, float *d, float *alpha, float *beta, 
 +      float *u, int *ldu, float *vt, int *ldvt, int *idxq, int *iwork, 
 +      float *work, int *info);
 +
 +void
 +F77_FUNC(slasdq,SLASDQ)(const char *uplo, int *sqre, int *n, int *ncvt, int *nru, int *ncc,
 +      float *d, float *e, float *vt, int *ldvt, float *u, int *ldu, 
 +      float *c, int *ldc, float *work, int *info);
 +
 +void 
 +F77_FUNC(slasr,SLASR)(const char *side, const char *pivot, const char *direct, int *m, int *n, float *c, 
 +       float *s, float *a, int *lda);
 +
 +void 
 +F77_FUNC(sorglq,SORGLQ)(int *m, int *n, int *k, float *a, int *lda, 
 +      float *tau, float *work, int *lwork, int *info);
 +
 +void
 +F77_FUNC(sormtr,SORMTR)(const char *side, const char *uplo, const char *trans, int *m, int *n, float *a, 
 +      int *lda, float *tau, float *c, int *ldc,
 +      float *work, int *lwork, int *info);
 +
 +void
 +F77_FUNC(sgebd2,SGEBD2)(int *m, int *n, float *a, int *lda, float *d, float *e,
 +      float *tauq, float *taup, float *work, int *info);
 +
 +void 
 +F77_FUNC(slabrd,SLABRD)(int *m, int *n, int *nb, float *a, int *lda, float *d,
 +      float *e, float *tauq, float *taup, float *x,
 +      int *ldx, float *y, int *ldy);
 +
 +float
 +F77_FUNC(slanst,SLANST)(const char *norm, int *n, float *d, float *e);
 +
 +float
 +F77_FUNC(slansy,SLANSY)(const char *norm, const char *uplo, int *n, float *a, int *lda, float *work);
 +
 +void
 +F77_FUNC(slarrex,SLARREX)(const char *range, int *n, float *vl, float *vu, int *il, int *iu,
 +       float *d, float *e, float *tol, int *nsplit, 
 +       int *isplit, int *m, float *w, int *iblock, int *indexw,
 +       float *gersch, float *work, int *iwork, int *info);
 +
 +void 
 +F77_FUNC(slasd2,SLASD2)(int *nl, int *nr, int *sqre, int *k, float *d, float *z, 
 +      float *alpha, float *beta, float *u, int *ldu, float *vt, 
 +      int *ldvt, float *dsigma, float *u2, int *ldu2, float *vt2, 
 +      int *ldvt2, int *idxp, int *idx, int *idxc, 
 +      int *idxq, int *coltyp, int *info);
 +
 +void
 +F77_FUNC(slasdt,SLASDT)(int *n, int *lvl, int *nd, int *inode, int *ndiml, 
 +      int *ndimr, int *msub);
 +
 +void 
 +F77_FUNC(slasrt,SLASRT)(const char *id, int *n, float *d, int *info);
 +
 +void
 +F77_FUNC(slasrt2,SLASRT2)(const char *id, int *n, float *d, int *key, int *info);
 +
 +void 
 +F77_FUNC(sorgqr,SORGQR)(int *m, int *n, int *k, float *a, int *lda, float *tau, 
 +      float *work, int *lwork, int *info);
 +
 +void
 +F77_FUNC(sstebz,SSTEBZ)(const char *range, const char *order, int *n, float *vl, float *vu, 
 +      int *il, int *iu, float *abstol, float *d, float *e, 
 +      int *m, int *nsplit, float *w, int *iblock, int *isplit, 
 +      float *work, int *iwork, int *info);
 +
 +void
 +F77_FUNC(sgebrd,SGEBRD)(int *m, int *n, float *a, int *lda, float *d, float *e,
 +      float *tauq, float *taup, float *work, int *lwork, int *info);
 +
 +void
 +F77_FUNC(slacpy,SLACPY)(const char *uplo, int *m, int *n, float *a, int *lda, float *b, int *ldb);
 +
 +float
 +F77_FUNC(slapy2,SLAPY2)(float * x, float * y);
 +
 +void
 +F77_FUNC(slarrfx,SLARRFX)(int *n, float *d, float *l, float *ld, float *lld, int *ifirst,
 +        int *ilast, float *w, float *sigma, float *dplus, float *lplus,
 +        float *work, int *info);
 +
 +void 
 +F77_FUNC(slasd3,SLASD3)(int *nl, int *nr, int *sqre, int *k, float *d, float *q, int *ldq, 
 +      float *dsigma, float *u, int *ldu, float *u2, int *ldu2, 
 +      float *vt, int *ldvt, float *vt2, int *ldvt2, int *idxc, 
 +      int *ctot, float *z, int *info);
 +
 +void
 +F77_FUNC(slaset,SLASET)(const char *uplo, int *m, int *n, float *alpha, 
 +      float *beta, float *a, int *lda);
 +
 +void
 +F77_FUNC(slassq,SLASSQ)(int *n, float *x, int *incx, float *scale, float *sumsq);
 +
 +void
 +F77_FUNC(sorm2l,SORM2L)(const char *side, const char *trans, int *m, int *n, int *k, float *a, int *lda, 
 +      float *tau, float *c, int *ldc, float *work, int *info);
 +
 +void
 +F77_FUNC(sstegr,SSTEGR)(const char *jobz, const char *range, int *n, float *d, float *e, float *vl, 
 +      float *vu, int *il, int *iu, float *abstol, int *m, float *w, 
 +      float *z, int *ldz, int *isuppz, float *work, 
 +      int *lwork, int *iwork, int *liwork, int *info);
 +
 +void
 +F77_FUNC(sgelq2,SGELQ2)(int *m, int *n, float *a, int *lda, float *tau, float *work, int *info);
 +
 +void
 +F77_FUNC(slae2,SLAE2)(float *a, float *b, float *c, float *rt1, float *rt2);
 +
 +void
 +F77_FUNC(slaev2,SLAEV2)(float *a, float *b, float *c, float *rt1, float *rt2,
 +        float *cs1, float *cs2);
 +
 +void
 +F77_FUNC(slar1vx,SLAR1VX)(int *n, int *b1, int *bn, float *sigma, float *d, float *l, float *ld, 
 +      float *lld, float *eval, float *gersch, float *z, float *ztz, float *mingma, 
 +      int *r, int *isuppz, float *work);
 +
 +void
 +F77_FUNC(slarrvx,SLARRVX)(int *n, float *d, float *l, int *isplit, int *m, float *w, 
 +       int *iblock, int *indexw, float *gersch, float *tol, float *z, int *ldz, 
 +       int *isuppz, float *work, int *iwork, int *info);
 +
 +void 
 +F77_FUNC(slasd4,SLASD4)(int *n, int *i, float *d, float *z, float *delta, 
 +      float *rho, float *sigma, float *work, int *info);
 +
 +void
 +F77_FUNC(slasq1,SLASQ1)(int *n, float *d, float *e, float *work, int *info);
 +
 +
 +void 
 +F77_FUNC(slasv2,SLASV2)(float *f, float *g, float *h, float *ssmin, float *ssmax, 
 +      float *snr, float *csr, float *snl, float *csl);
 +
 +void 
 +F77_FUNC(sorm2r,SORM2R)(const char *side, const char *trans, int *m, int *n, int *k, float *a, 
 +      int *lda, float *tau, float *c, int *ldc, float *work, int *info);
 +
 +void
 +F77_FUNC(sstein,SSTEIN)(int *n, float *d, float *e, int *m, float *w, int *iblock, int *isplit, 
 +      float *z, int *ldz, float *work, int *iwork, int *ifail, int *info);
 +
 +void
 +F77_FUNC(sgelqf,SGELQF)(int *m, int *n, float *a, int *lda, float *tau,
 +      float *work, int *lwork, int *info);
 +
 +void
 +F77_FUNC(slaebz,SLAEBZ)(int *ijob, int *nitmax, int *n, int *mmax, int *minp, int *nbmin,
 +      float *abstol, float *reltol, float *pivmin, float *d, float *e,
 +      float *e2, int *nval, float *ab, float *c, int *mout, int *nab,
 +      float *work, int *iwork, int *info);
 +
 +void
 +F77_FUNC(slarf,SLARF)(const char *side, int *m, int *n, float *v, int *incv, float *tau,
 +       float *c, int *ldc, float *work);
 +
 +void
 +F77_FUNC(slartg,SLARTG)(float *f, float *g, float *cs, float *sn, float *r);
 +
 +void 
 +F77_FUNC(slasd5,SLASD5)(int *i, float *d, float *z, float *delta, 
 +      float *rho, float *dsigma, float *work);
 +
 +void 
 +F77_FUNC(slasq2,SLASQ2)(int *n, float *z, int *info);
 +
 +void 
 +F77_FUNC(slasq3,SLASQ3)(int *i0, int *n0, float *z, int *pp, float *dmin, 
 +      float *sigma, float *desig, float *qmax, int *nfail, 
 +      int *iter, int *ndiv, int *ieee);
 +
 +void
 +F77_FUNC(slaswp,SLASWP)(int *n, float *a, int *lda, int *k1, int *k2, int *ipiv, int *incx);
 +
 +void 
 +F77_FUNC(sormbr,SORMBR)(const char *vect, const char *side, const char *trans, int *m, int *n, int *k, 
 +      float *a, int *lda, float *tau, float *c, int *ldc, float *work,
 +      int *lwork, int *info);
 +
 +void
 +F77_FUNC(ssterf,SSTERF)(int *n, float *d, float *e, int *info);
 +
 +void
 +F77_FUNC(sgeqr2,SGEQR2)(int *m, int *n, float *a, int *lda, float *tau,
 +      float *work, int *info);
 +
 +void 
 +F77_FUNC(slaed6,SLAED6)(int *kniter, int *orgati, float *rho, float *d, 
 +      float *z, float *finit, float *tau, int *info);
 +
 +void 
 +F77_FUNC(slarfb,SLARFB)(const char *side, const char *trans, const char *direct, const char *storev, int *m, int *n, 
 +      int *k, float *v, int *ldv, float *t, int *ldt, float *c,
 +      int *ldc, float *work, int *ldwork);
 +
 +void
 +F77_FUNC(slaruv,SLARUV)(int *iseed, int *n, float *x);
 +
 +void 
 +F77_FUNC(slasd6,SLASD6)(int *icompq, int *nl, int *nr, int *sqre, float *d, float *vf, 
 +      float *vl, float *alpha, float *beta, int *idxq, int *perm, 
 +      int *givptr, int *givcol, int *ldgcol, float *givnum, int *ldgnum, 
 +      float *poles, float *difl, float *difr, float *z, int *k, 
 +      float *c, float *s, float *work, int *iwork, int *info);
 +
 +void
 +F77_FUNC(slatrd,SLATRD)(const char *uplo, int *n, int *nb, float *a, int *lda, float *e, 
 +      float * tau, float *w, int *ldw);
 +
 +void
 +F77_FUNC(sorml2,SORML2)(const char *side, const char *trans, int *m, int *n, int *k, float *a,
 +      int *lda, float *tau, float *c, int *ldc, float *work, int *info);
 +
 +void
 +F77_FUNC(sstevr,SSTEVR)(const char *jobz, const char *range, int *n, float *d, float *e, float *vl, 
 +      float *vu, int *il, int *iu, float *abstol, int *m, float *w, 
 +      float *z, int *ldz, int *isuppz, float *work, 
 +      int *lwork, int *iwork, int *liwork, int *info);
 +
 +void
 +F77_FUNC(ssytrd,SSYTRD)(const char *uplo, int *n, float *a, int *lda, float *d,
 +      float *e, float *tau, float *work, int *lwork, int *info);
 +
 +void
 +F77_FUNC(ssyevr,SSYEVR)(const char *jobz, const char *range, const char *uplo, int *n, 
 +      float *a, int *lda, float *vl, float *vu, int *il,
 +      int *iu, float *abstol, int *m, float *w,
 +      float *z__, int *ldz, int *isuppz, float *work, 
 +      int *lwork, int *iwork, int *liwork, int *info);
 +
 +void
 +F77_FUNC(sormql,SORMQL)(const char *side, const char *trans, int *m, int *n, 
 +      int *k, float *a, int *lda, float *tau, float *c,
 +      int *ldc, float *work, int *lwork, int *info);
 +
 +void 
 +F77_FUNC(sormqr,SORMQR)(const char *side, const char *trans, int *m, int *n, int *k, float *a, 
 +        int *lda, float *tau, float *c, int *ldc, 
 +        float *work, int *lwork, int *info);
 +
 +void
 +F77_FUNC(sorgbr,SORGBR)(const char *vect, int *m, int *n, int *k, float *a, int *lda,
 +      float *tau, float *work, int *lwork, int *info);
 +
 +void
 +F77_FUNC(slasq5,SLASQ5)(int *i0, int *n0, float *z, int *pp, float *tau, float *dmin, 
 +      float *dmin1, float *dmin2, float *dn, float *dnm1, 
 +      float *dnm2, int *ieee);
 +
 +void 
 +F77_FUNC(slasd8,SLASD8)(int *icompq, int *k, float *d, float *z, float *vf, float *vl, 
 +      float *difl, float *difr, int *lddifr, float *dsigma, 
 +      float *work, int *info);
 +
 +void
 +F77_FUNC(slascl,SLASCL)(const char *type, int *kl, int *ku, float *cfrom, float *cto, int *m, 
 +      int *n, float *a, int *lda, int *info);
 +
 +void 
 +F77_FUNC(slarft,SLARFT)(const char *direct, const char *storev, int *n, int *k, float *v, 
 +      int *ldv, float *tau, float *t, int *ldt);
 +
 +void
 +F77_FUNC(slagts,SLAGTS)(int *job, int *n, float *a, float *b, float *c, float *d, 
 +      int *in, float *y, float *tol, int *info);
 +
 +void 
 +F77_FUNC(sgesdd,SGESDD)(const char *jobz, int *m, int *n, float *a, int *lda, float *s, float *u, 
 +      int *ldu, float *vt, int *ldvt, float *work, int *lwork, 
 +      int *iwork, int *info);
 +
 +void
 +F77_FUNC(ssytd2,SSYTD2)(const char *uplo, int *n, float *a, int *lda, float *d, 
 +      float *e, float *tau, int *info);
 +
 +void 
 +F77_FUNC(sormlq,SORMLQ)(const char *side, const char *trans, int *m, int *n, int *k, float *a, int *lda, 
 +      float *tau, float *c, int *ldc, float *work, int *lwork, int *info);
 +
 +void
 +F77_FUNC(sorg2r,SORG2R)(int *m, int *n, int *k, float *a, int *lda, float *tau,
 +      float *work, int *info);
 +      float *work, int *info);
 +
 +void 
 +F77_FUNC(slasq4,SLASQ4)(int *i0, int *n0, float *z, int *pp, int *n0in, float *dmin, 
 +      float *dmin1, float *dmin2, float *dn, float *dn1, 
 +      float *dn2, float *tau, int *ttype);
 +
 +void 
 +F77_FUNC(slasd7,SLASD7)(int *icompq, int *nl, int *nr, int *sqre, int *k, float *d, float *z,
 +      float *zw, float *vf, float *vfw, float *vl, float *vlw,
 +      float *alpha, float *beta, float *dsigma, int *idx, int *idxp,
 +      int *idxq, int *perm, int *givptr, int *givcol, int *ldgcol, 
 +      float *givnum, int *ldgnum, float *c, float *s, int *info);
 +
 +void
 +F77_FUNC(slas2,SLAS2)(float *f, float *g, float *h, float *ssmin, float *ssmax);
 +
 +void
 +F77_FUNC(slarfg,SLARFG)(int *n, float *alpha, float *x, int *incx, float *tau);
 +
 +void
 +F77_FUNC(slagtf,SLAGTF)(int *n, float *a, float *lambda, float *b, float *c, 
 +      float *tol, float *d, int *in, int *info);
 +
 +void 
 +F77_FUNC(sgeqrf,SGEQRF)(int *m, int *n, float *a, int *lda, float *tau,
 +      float *work, int *lwork, int *info);
 +
 +
 +#ifdef __cplusplus
 +}
 +#endif
 +
 +/*! \endcond */
 +
 +#endif /* GMX_LAPACK_H */
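These prototypes are consumed through the F77_FUNC(name,NAME) mangling macro whose real definition is generated per toolchain by cmake/FortranCInterface.cmake. As an illustration only (the hard-coded definition below is an assumption; the common GNU convention is lowercase plus a trailing underscore), a call passes everything by pointer, Fortran-style:

/* Assumed mangling for illustration; the real definition comes from the
 * CMake Fortran-C interface detection, not from this hard-coded form. */
#define F77_FUNC(name, NAME) name##_

/* Eigenvalues of a 3x3 symmetric tridiagonal matrix via dsterf. */
void dsterf_example(void)
{
    int    n    = 3;
    double d[3] = { 2.0, 2.0, 2.0 };  /* diagonal, overwritten with eigenvalues */
    double e[2] = { -1.0, -1.0 };     /* sub-diagonal, destroyed on exit */
    int    info = 0;

    F77_FUNC(dsterf, DSTERF)(&n, d, e, &info);
    /* info == 0 on success; d now holds the eigenvalues in ascending order */
}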
index b0fe96ea04934c74952588f8ffdcd4dae964295e,0000000000000000000000000000000000000000..1632690ebe01dd6e78482389744edc61459c3867
mode 100644,000000..100644
--- /dev/null
@@@ -1,1670 -1,0 +1,1668 @@@
-     /* signal we are returning if nothing is going to be done in this routine */
-     if ((trotter_seq[0] == etrtSKIPALL)  || !(bCouple))
 +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
 + * 
 + *                This source code is part of
 + * 
 + *                 G   R   O   M   A   C   S
 + * 
 + *          GROningen MAchine for Chemical Simulations
 + * 
 + *                        VERSION 3.2.0
 + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
 + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 + * Copyright (c) 2001-2004, The GROMACS development team,
 + * check out http://www.gromacs.org for more information.
 +
 + * This program is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU General Public License
 + * as published by the Free Software Foundation; either version 2
 + * of the License, or (at your option) any later version.
 + * 
 + * If you want to redistribute modifications, please consider that
 + * scientific software is very special. Version control is crucial -
 + * bugs must be traceable. We will be happy to consider code for
 + * inclusion in the official distribution, but derived work must not
 + * be called official GROMACS. Details are found in the README & COPYING
 + * files - if they are missing, get the official version at www.gromacs.org.
 + * 
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the papers on the package - you can find them in the top README file.
 + * 
 + * For more info, check our website at http://www.gromacs.org
 + * 
 + * And Hey:
 + * GROwing Monsters And Cloning Shrimps
 + */
 +#ifdef HAVE_CONFIG_H
 +#include <config.h>
 +#endif
 +#include <assert.h>
 +
 +#include "typedefs.h"
 +#include "smalloc.h"
 +#include "update.h"
 +#include "vec.h"
 +#include "macros.h"
 +#include "physics.h"
 +#include "names.h"
 +#include "gmx_fatal.h"
 +#include "txtdump.h"
 +#include "nrnb.h"
 +#include "gmx_random.h"
 +#include "update.h"
 +#include "mdrun.h"
 +
 +#define NTROTTERPARTS 3
 +
 +/* Suzuki-Yoshida Constants, for n=3 and n=5, for symplectic integration  */
 +/* for n=1, w0 = 1 */
 +/* for n=3, w0 = w2 = 1/(2-2^-(1/3)), w1 = 1-2*w0 */
 +/* for n=5, w0 = w1 = w3 = w4 = 1/(4-4^-(1/3)), w2 = 1-4*w0 */
 +
 +#define MAX_SUZUKI_YOSHIDA_NUM 5
 +#define SUZUKI_YOSHIDA_NUM  5
 +
 +static const double sy_const_1[] = { 1. };
 +static const double sy_const_3[] = { 0.828981543588751,-0.657963087177502,0.828981543588751 };
 +static const double sy_const_5[] = { 0.2967324292201065,0.2967324292201065,-0.186929716880426,0.2967324292201065,0.2967324292201065 };
 +
 +static const double* sy_const[] = {
 +    NULL,
 +    sy_const_1,
 +    NULL,
 +    sy_const_3,
 +    NULL,
 +    sy_const_5
 +};
 +
 +/*
 +static const double sy_const[MAX_SUZUKI_YOSHIDA_NUM+1][MAX_SUZUKI_YOSHIDA_NUM+1] = {
 +    {},
 +    {1},
 +    {},
 +    {0.828981543588751,-0.657963087177502,0.828981543588751},
 +    {},
 +    {0.2967324292201065,0.2967324292201065,-0.186929716880426,0.2967324292201065,0.2967324292201065}
 +};*/
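The Suzuki-Yoshida weights of a given order sum to exactly 1, which is what lets NHC_trotter below split the full step dtfull into ns*ns sub-steps of sy_const[ns][mj]*dtfull/ns and still integrate over precisely dtfull. A minimal self-check sketch (not part of the build, using only the constants above):

static void check_sy_weights(void)
{
    int    j;
    double sum = 0.0;

    /* sum the weights actually used: sy_const[SUZUKI_YOSHIDA_NUM] */
    for (j = 0; j < SUZUKI_YOSHIDA_NUM; j++)
    {
        sum += sy_const[SUZUKI_YOSHIDA_NUM][j];
    }
    /* each of the ns outer passes then advances by sum*dtfull/ns = dtfull/ns */
    assert(sum > 1.0 - 1e-12 && sum < 1.0 + 1e-12);
}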
 +
 +/* these integration routines are only referenced inside this file */
 +static void NHC_trotter(t_grpopts *opts,int nvar, gmx_ekindata_t *ekind,real dtfull,
 +                        double xi[],double vxi[], double scalefac[], real *veta, t_extmass *MassQ, gmx_bool bEkinAveVel)
 +
 +{
 +    /* general routine for both barostat and thermostat nose hoover chains */
 +
 +    int   i,j,mi,mj,jmax;
 +    double Ekin,Efac,reft,kT,nd;
 +    double dt;
 +    t_grp_tcstat *tcstat;
 +    double *ivxi,*ixi;
 +    double *iQinv;
 +    double *GQ;
 +    gmx_bool bBarostat;
 +    int mstepsi, mstepsj;
 +    int ns = SUZUKI_YOSHIDA_NUM;  /* set the degree of integration in the types/state.h file */
 +    int nh = opts->nhchainlength;
 +    
 +    snew(GQ,nh);
 +    mstepsi = mstepsj = ns;
 +
 +/* if scalefac is NULL, we are doing the NHC of the barostat */
 +    
 +    bBarostat = FALSE;
 +    if (scalefac == NULL) {
 +        bBarostat = TRUE;
 +    }
 +
 +    for (i=0; i<nvar; i++) 
 +    {
 +
 +        /* make it easier to iterate by selecting 
 +           out the sub-array that corresponds to this T group */
 +        
 +        ivxi = &vxi[i*nh];
 +        ixi = &xi[i*nh];
 +        if (bBarostat) {
 +            iQinv = &(MassQ->QPinv[i*nh]); 
 +            nd = 1.0; /* THIS WILL CHANGE IF NOT ISOTROPIC */
 +            reft = max(0.0,opts->ref_t[0]);
 +            Ekin = sqr(*veta)/MassQ->Winv;
 +        } else {
 +            iQinv = &(MassQ->Qinv[i*nh]);  
 +            tcstat = &ekind->tcstat[i];
 +            nd = opts->nrdf[i];
 +            reft = max(0.0,opts->ref_t[i]);
 +            if (bEkinAveVel) 
 +            {
 +                Ekin = 2*trace(tcstat->ekinf)*tcstat->ekinscalef_nhc;
 +            } else {
 +                Ekin = 2*trace(tcstat->ekinh)*tcstat->ekinscaleh_nhc;
 +            }
 +        }
 +        kT = BOLTZ*reft;
 +
 +        for(mi=0;mi<mstepsi;mi++) 
 +        {
 +            for(mj=0;mj<mstepsj;mj++)
 +            { 
 +                /* weighting for this step using Suzuki-Yoshida integration - fixed at 5 */
 +                dt = sy_const[ns][mj] * dtfull / mstepsi;
 +                
 +                /* compute the thermal forces */
 +                GQ[0] = iQinv[0]*(Ekin - nd*kT);
 +                
 +                for (j=0;j<nh-1;j++) 
 +                {     
 +                    if (iQinv[j+1] > 0) {
 +                        /* we actually don't need to update here if we save the 
 +                           state of the GQ, but it's easier to just recompute*/
 +                        GQ[j+1] = iQinv[j+1]*((sqr(ivxi[j])/iQinv[j])-kT);      
 +                    } else {
 +                        GQ[j+1] = 0;
 +                    }
 +                }
 +                
 +                ivxi[nh-1] += 0.25*dt*GQ[nh-1];
 +                for (j=nh-1;j>0;j--) 
 +                { 
 +                    Efac = exp(-0.125*dt*ivxi[j]);
 +                    ivxi[j-1] = Efac*(ivxi[j-1]*Efac + 0.25*dt*GQ[j-1]);
 +                }
 +                
 +                Efac = exp(-0.5*dt*ivxi[0]);
 +                if (bBarostat) {
 +                    *veta *= Efac;                
 +                } else {
 +                    scalefac[i] *= Efac;
 +                }
 +                Ekin *= (Efac*Efac);
 +                
 +                /* Issue - if the KE is an average of the last and the current temperatures, then we might not be
 +                   able to scale the kinetic energy directly with this factor.  Might take more bookkeeping -- have to
 +                   think about this a bit more . . . */
 +
 +                GQ[0] = iQinv[0]*(Ekin - nd*kT);
 +                
 +                /* update thermostat positions */
 +                for (j=0;j<nh;j++) 
 +                { 
 +                    ixi[j] += 0.5*dt*ivxi[j];
 +                }
 +                
 +                for (j=0;j<nh-1;j++) 
 +                { 
 +                    Efac = exp(-0.125*dt*ivxi[j+1]);
 +                    ivxi[j] = Efac*(ivxi[j]*Efac + 0.25*dt*GQ[j]);
 +                    if (iQinv[j+1] > 0) {
 +                        GQ[j+1] = iQinv[j+1]*((sqr(ivxi[j])/iQinv[j])-kT);  
 +                    } else {
 +                        GQ[j+1] = 0;
 +                    }
 +                }
 +                ivxi[nh-1] += 0.25*dt*GQ[nh-1];
 +            }
 +        }
 +    }
 +    sfree(GQ);
 +}
 +
 +static void boxv_trotter(t_inputrec *ir, real *veta, real dt, tensor box, 
 +                         gmx_ekindata_t *ekind, tensor vir, real pcorr, t_extmass *MassQ)
 +{
 +
 +    real  pscal;
 +    double alpha;
 +    int   i,j,d,n,nwall;
 +    real  T,GW,vol;
 +    tensor Winvm,ekinmod,localpres;
 +    
 +    /* The heat bath is coupled to a separate barostat, the last temperature group.  In the 
 +       2006 Tuckerman et al. paper, the order is iL_{T_baro} iL_{T_part}
 +    */
 +    
 +    if (ir->epct==epctSEMIISOTROPIC) 
 +    {
 +        nwall = 2;
 +    } 
 +    else 
 +    {
 +        nwall = 3;
 +    }
 +
 +    /* eta is in pure units.  veta is in units of ps^-1. GW is in 
 +       units of ps^-2.  However, eta has a reference of 1 nm^3, so care must be 
 +       taken to use only RATIOS of eta in updating the volume. */
 +    
 +    /* we take the partial pressure tensors, modify the 
 +       kinetic energy tensor, and reconvert to pressure */
 +    
 +    if (ir->opts.nrdf[0]==0) 
 +    { 
 +        gmx_fatal(FARGS,"Barostat is coupled to a T-group with no degrees of freedom\n");    
 +    } 
 +    /* alpha factor for phase space volume, then multiply by the ekin scaling factor.  */
 +    alpha = 1.0 + DIM/((double)ir->opts.nrdf[0]);
 +    alpha *= ekind->tcstat[0].ekinscalef_nhc;
 +    msmul(ekind->ekin,alpha,ekinmod);  
 +    /* for now, we use Elr = 0, because if you want to get it right, you
 +       really should be using PME. Maybe print a warning? */
 +
 +    pscal   = calc_pres(ir->ePBC,nwall,box,ekinmod,vir,localpres)+pcorr;
 +
 +    vol = det(box);
 +    GW = (vol*(MassQ->Winv/PRESFAC))*(DIM*pscal - trace(ir->ref_p));   /* W is in ps^2 * bar * nm^3 */
 +
 +    *veta += 0.5*dt*GW;   
 +}
 +
 +/*
 + * This file implements temperature and pressure coupling algorithms:
 + * weak (Berendsen) coupling, Nose-Hoover (chain) thermostats, Andersen
 + * randomization, and the Parrinello-Rahman and MTTK barostats.
 + *
 + * Furthermore, computation of pressure and temperature is done here.
 + */
 +
 +real calc_pres(int ePBC,int nwall,matrix box,tensor ekin,tensor vir,
 +               tensor pres)
 +{
 +    int  n,m;
 +    real fac;
 +    
 +    if (ePBC==epbcNONE || (ePBC==epbcXY && nwall!=2))
 +        clear_mat(pres);
 +    else {
 +        /* Figure out which ekin applies here, see Evans & Morriss.
 +         * Probably the pressure tensor has to be corrected for the
 +         * net flow in the system...
 +         */
 +        
 +        fac=PRESFAC*2.0/det(box);
 +        for(n=0; (n<DIM); n++)
 +            for(m=0; (m<DIM); m++)
 +                pres[n][m] = (ekin[n][m] - vir[n][m])*fac;
 +        
 +        if (debug) {
 +            pr_rvecs(debug,0,"PC: pres",pres,DIM);
 +            pr_rvecs(debug,0,"PC: ekin",ekin,DIM);
 +            pr_rvecs(debug,0,"PC: vir ",vir, DIM);
 +            pr_rvecs(debug,0,"PC: box ",box, DIM);
 +        }
 +    }
 +    return trace(pres)/DIM;
 +}
 +
 +real calc_temp(real ekin,real nrdf)
 +{
 +    if (nrdf > 0)
 +        return (2.0*ekin)/(nrdf*BOLTZ);
 +    else
 +        return 0;
 +}
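calc_pres and calc_temp are the standard virial and equipartition expressions: the pressure tensor is (ekin - vir)*2*PRESFAC/det(box) with the scalar pressure its trace over DIM, and T = 2*Ekin/(Ndf*kB). A usage sketch with hand-filled numbers (assumed values; real callers get the tensors from the kinetic-energy and virial accumulation, and epbcXYZ/clear_mat/trace are the usual GROMACS helpers):

static void pres_temp_sketch(matrix box)
{
    tensor ekin, vir, pres;
    real   P, T;

    clear_mat(ekin);
    clear_mat(vir);
    /* pretend 100 kJ/mol of kinetic energy per Cartesian direction */
    ekin[XX][XX] = ekin[YY][YY] = ekin[ZZ][ZZ] = 100.0;

    /* scalar pressure in bar; the full tensor is returned in pres */
    P = calc_pres(epbcXYZ, 3, box, ekin, vir, pres);

    /* 300 kJ/mol total kinetic energy, 3000 degrees of freedom */
    T = calc_temp(trace(ekin), 3000.0);
}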
 +
 +void parrinellorahman_pcoupl(FILE *fplog,gmx_large_int_t step,
 +                           t_inputrec *ir,real dt,tensor pres,
 +                           tensor box,tensor box_rel,tensor boxv,
 +                           tensor M,matrix mu,gmx_bool bFirstStep)
 +{
 +  /* This doesn't do any coordinate updating. It just
 +   * integrates the box vector equations from the calculated
 +   * acceleration due to pressure difference. We also compute
 +   * the tensor M which is used in update to couple the particle
 +   * coordinates to the box vectors.
 +   *
 +   * In Nose and Klein (Mol. Phys. 50 (1983) no. 5, p. 1055) this is
 +   * given as
 +   *            -1    .           .     -1
 +   * M_nk = (h')   * (h' * h + h' h) * h
 +   *
 +   * with the dots denoting time derivatives and h is the transformation from
 +   * the scaled frame to the real frame, i.e. the TRANSPOSE of the box. 
 +   * This also goes for the pressure and M tensors - they are transposed relative
 +   * to ours. Our equation thus becomes:
 +   *
 +   *                  -1       .    .           -1
 +   * M_gmx = M_nk' = b  * (b * b' + b * b') * b'
 +   * 
 +   * where b is the gromacs box matrix.                       
 +   * Our box accelerations are given by
 +   *   ..                                    ..
 +   *   b = vol/W inv(box') * (P-ref_P)     (=h')
 +   */
 +  
 +  int    d,n;
 +  tensor winv;
 +  real   vol=box[XX][XX]*box[YY][YY]*box[ZZ][ZZ];
 +  real   atot,arel,change,maxchange,xy_pressure;
 +  tensor invbox,pdiff,t1,t2;
 +
 +  real maxl;
 +
 +  m_inv_ur0(box,invbox);
 +
 +  if (!bFirstStep) {
 +    /* Note that PRESFAC does not occur here.
 +     * The pressure and compressibility always occur as a product,
 +     * therefore the pressure unit drops out.
 +     */
 +    maxl=max(box[XX][XX],box[YY][YY]);
 +    maxl=max(maxl,box[ZZ][ZZ]);
 +    for(d=0;d<DIM;d++)
 +      for(n=0;n<DIM;n++)
 +        winv[d][n]=
 +          (4*M_PI*M_PI*ir->compress[d][n])/(3*ir->tau_p*ir->tau_p*maxl);
 +    
 +    m_sub(pres,ir->ref_p,pdiff);
 +    
 +    if(ir->epct==epctSURFACETENSION) {
 +      /* Unlike Berendsen coupling it might not be trivial to include a z
 +       * pressure correction here? On the other hand we don't scale the
 +       * box momentarily, but change accelerations, so it might not be crucial.
 +       */
 +      xy_pressure=0.5*(pres[XX][XX]+pres[YY][YY]);
 +      for(d=0;d<ZZ;d++)
 +        pdiff[d][d]=(xy_pressure-(pres[ZZ][ZZ]-ir->ref_p[d][d]/box[d][d]));
 +    }
 +    
 +    tmmul(invbox,pdiff,t1);
 +    /* Move the off-diagonal elements of the 'force' to one side to ensure
 +     * that we obey the box constraints.
 +     */
 +    for(d=0;d<DIM;d++) {
 +      for(n=0;n<d;n++) {
 +        t1[d][n] += t1[n][d];
 +        t1[n][d] = 0;
 +      }
 +    }
 +    
 +    switch (ir->epct) {
 +    case epctANISOTROPIC:
 +      for(d=0;d<DIM;d++)
 +        for(n=0;n<=d;n++)
 +          t1[d][n] *= winv[d][n]*vol;
 +      break;
 +    case epctISOTROPIC:
 +      /* calculate total volume acceleration */
 +      atot=box[XX][XX]*box[YY][YY]*t1[ZZ][ZZ]+
 +           box[XX][XX]*t1[YY][YY]*box[ZZ][ZZ]+
 +           t1[XX][XX]*box[YY][YY]*box[ZZ][ZZ];
 +      arel=atot/(3*vol);
 +      /* set all RELATIVE box accelerations equal, and maintain total V
 +       * change speed */
 +      for(d=0;d<DIM;d++)
 +        for(n=0;n<=d;n++)
 +          t1[d][n] = winv[0][0]*vol*arel*box[d][n];
 +      break;
 +    case epctSEMIISOTROPIC:
 +    case epctSURFACETENSION:
 +      /* Note the correction to pdiff above for surface-tension coupling */
 +
 +      /* calculate total XY volume acceleration */
 +      atot=box[XX][XX]*t1[YY][YY]+t1[XX][XX]*box[YY][YY];
 +      arel=atot/(2*box[XX][XX]*box[YY][YY]);
 +      /* set RELATIVE XY box accelerations equal, and maintain total V
 +       * change speed. Don't change the third box vector accelerations */
 +      for(d=0;d<ZZ;d++)
 +        for(n=0;n<=d;n++)
 +          t1[d][n] = winv[d][n]*vol*arel*box[d][n];
 +      for(n=0;n<DIM;n++)
 +        t1[ZZ][n] *= winv[ZZ][n]*vol; /* d equals ZZ after the loop above; use ZZ explicitly */
 +      break;
 +    default:
 +      gmx_fatal(FARGS,"Parrinello-Rahman pressure coupling type %s "
 +                "not supported yet\n",EPCOUPLTYPETYPE(ir->epct));
 +      break;
 +    }
 +    
 +    maxchange=0;
 +    for(d=0;d<DIM;d++)
 +      for(n=0;n<=d;n++) {
 +        boxv[d][n] += dt*t1[d][n];
 +
 +        /* We do NOT update the box vectors themselves here, since
 +         * we need them for shifting later. It is instead done last
 +         * in the update() routine.
 +         */
 +
 +        /* Calculate the change relative to diagonal elements -
 +           since it's perfectly ok for the off-diagonal ones to
 +           be zero it doesn't make sense to check the change relative
 +           to its current size.
 +        */
 +
 +        change=fabs(dt*boxv[d][n]/box[d][d]);
 +
 +        if (change>maxchange)
 +          maxchange=change;
 +      }
 +    
 +    if (maxchange > 0.01 && fplog) {
 +      char buf[22];
 +      fprintf(fplog,
 +              "\nStep %s  Warning: Pressure scaling more than 1%%. "
 +              "This may mean your system\n is not yet equilibrated. "
 +              "Use of Parrinello-Rahman pressure coupling during\n"
 +              "equilibration can lead to simulation instability, "
 +              "and is discouraged.\n",
 +            gmx_step_str(step,buf));
 +    }
 +  }
 +  
 +  preserve_box_shape(ir,box_rel,boxv);
 +
 +  mtmul(boxv,box,t1);       /* t1=boxv * b' */
 +  mmul(invbox,t1,t2);
 +  mtmul(t2,invbox,M);
 +
 +  /* Determine the scaling matrix mu for the coordinates */
 +  for(d=0;d<DIM;d++)
 +    for(n=0;n<=d;n++)
 +      t1[d][n] = box[d][n] + dt*boxv[d][n];
 +  preserve_box_shape(ir,box_rel,t1);
 +  /* t1 is the box at t+dt, determine mu as the relative change */
 +  mmul_ur0(invbox,t1,mu);
 +}
 +
 +void berendsen_pcoupl(FILE *fplog,gmx_large_int_t step, 
 +                    t_inputrec *ir,real dt, tensor pres,matrix box,
 +                    matrix mu)
 +{
 +  int    d,n;
 +  real   scalar_pressure, xy_pressure, p_corr_z;
 +  char   *ptr,buf[STRLEN];
 +
 +  /*
 +   *  Calculate the scaling matrix mu
 +   */
 +  scalar_pressure=0;
 +  xy_pressure=0;
 +  for(d=0; d<DIM; d++) {
 +    scalar_pressure += pres[d][d]/DIM;
 +    if (d != ZZ)
 +      xy_pressure += pres[d][d]/(DIM-1);
 +  }
 +  /* Pressure is now in bar, everywhere. */
 +#define factor(d,m) (ir->compress[d][m]*dt/ir->tau_p)
 +  
 +  /* mu has been changed from pow(1+...,1/3) to 1+.../3, since this is
 +   * necessary for triclinic scaling
 +   */
 +  clear_mat(mu);
 +  switch (ir->epct) {
 +  case epctISOTROPIC:
 +    for(d=0; d<DIM; d++)
 +      {
 +        mu[d][d] = 1.0 - factor(d,d)*(ir->ref_p[d][d] - scalar_pressure)/DIM;
 +      }
 +    break;
 +  case epctSEMIISOTROPIC:
 +    for(d=0; d<ZZ; d++)
 +      mu[d][d] = 1.0 - factor(d,d)*(ir->ref_p[d][d]-xy_pressure)/DIM;
 +    mu[ZZ][ZZ] =
 +      1.0 - factor(ZZ,ZZ)*(ir->ref_p[ZZ][ZZ] - pres[ZZ][ZZ])/DIM;
 +    break;
 +  case epctANISOTROPIC:
 +    for(d=0; d<DIM; d++)
 +      for(n=0; n<DIM; n++)
 +        mu[d][n] = (d==n ? 1.0 : 0.0)
 +          - factor(d,n)*(ir->ref_p[d][n] - pres[d][n])/DIM;
 +    break;
 +  case epctSURFACETENSION:
 +    /* ir->ref_p[0/1] is the reference surface-tension times *
 +     * the number of surfaces                                */
 +    if (ir->compress[ZZ][ZZ])
 +      p_corr_z = dt/ir->tau_p*(ir->ref_p[ZZ][ZZ] - pres[ZZ][ZZ]);
 +    else
 +      /* when the compressibility is zero, set the pressure correction   *
 +       * in the z-direction to zero to get the correct surface tension */
 +      p_corr_z = 0;
 +    mu[ZZ][ZZ] = 1.0 - ir->compress[ZZ][ZZ]*p_corr_z;
 +    for(d=0; d<DIM-1; d++)
 +      mu[d][d] = 1.0 + factor(d,d)*(ir->ref_p[d][d]/(mu[ZZ][ZZ]*box[ZZ][ZZ])
 +                                  - (pres[ZZ][ZZ]+p_corr_z - xy_pressure))/(DIM-1);
 +    break;
 +  default:
 +    gmx_fatal(FARGS,"Berendsen pressure coupling type %s not supported yet\n",
 +              EPCOUPLTYPETYPE(ir->epct));
 +    break;
 +  }
 +  /* To fulfill the orientation restrictions on triclinic boxes
 +   * we will set mu_yx, mu_zx and mu_zy to 0 and correct
 +   * the other elements of mu to first order.
 +   */
 +  mu[YY][XX] += mu[XX][YY];
 +  mu[ZZ][XX] += mu[XX][ZZ];
 +  mu[ZZ][YY] += mu[YY][ZZ];
 +  mu[XX][YY] = 0;
 +  mu[XX][ZZ] = 0;
 +  mu[YY][ZZ] = 0;
 +
 +  if (debug) {
 +    pr_rvecs(debug,0,"PC: pres ",pres,3);
 +    pr_rvecs(debug,0,"PC: mu   ",mu,3);
 +  }
 +  
 +  if (mu[XX][XX]<0.99 || mu[XX][XX]>1.01 ||
 +      mu[YY][YY]<0.99 || mu[YY][YY]>1.01 ||
 +      mu[ZZ][ZZ]<0.99 || mu[ZZ][ZZ]>1.01) {
 +    char buf2[22];
 +    sprintf(buf,"\nStep %s  Warning: pressure scaling more than 1%%, "
 +          "mu: %g %g %g\n",
 +          gmx_step_str(step,buf2),mu[XX][XX],mu[YY][YY],mu[ZZ][ZZ]);
 +    if (fplog)
 +      fprintf(fplog,"%s",buf);
 +    fprintf(stderr,"%s",buf);
 +  }
 +}
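The per-direction scaling follows mu = 1 - (dt/tau_p)*kappa*(P_ref - P)/DIM, the first-order form of the Berendsen cube-root factor noted in the comment above. A numeric sketch of the isotropic factor (all values assumed, not from the source):

static real berendsen_mu_sketch(void)
{
    real kappa = 4.5e-5;  /* assumed compressibility of water, bar^-1 */
    real dt    = 0.002;   /* ps */
    real tau_p = 1.0;     /* ps */
    real p_ref = 1.0;     /* bar */
    real p     = 101.0;   /* bar, 100 bar above reference */

    /* same first-order expression as the epctISOTROPIC branch above */
    return 1.0 - (kappa*dt/tau_p)*(p_ref - p)/DIM;  /* ~1.000003 */
}

Even 100 bar of excess pressure moves mu by only ~3e-6 per step, which is why the >1% warning above signals a seriously unequilibrated system.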
 +
 +void berendsen_pscale(t_inputrec *ir,matrix mu,
 +                    matrix box,matrix box_rel,
 +                    int start,int nr_atoms,
 +                    rvec x[],unsigned short cFREEZE[],
 +                    t_nrnb *nrnb)
 +{
 +  ivec   *nFreeze=ir->opts.nFreeze;
 +  int    n,d,g=0;
 +      
 +  /* Scale the positions */
 +  for (n=start; n<start+nr_atoms; n++) {
 +    if (cFREEZE)
 +      g = cFREEZE[n];
 +    
 +    if (!nFreeze[g][XX])
 +      x[n][XX] = mu[XX][XX]*x[n][XX]+mu[YY][XX]*x[n][YY]+mu[ZZ][XX]*x[n][ZZ];
 +    if (!nFreeze[g][YY])
 +      x[n][YY] = mu[YY][YY]*x[n][YY]+mu[ZZ][YY]*x[n][ZZ];
 +    if (!nFreeze[g][ZZ])
 +      x[n][ZZ] = mu[ZZ][ZZ]*x[n][ZZ];
 +  }
 +  /* compute final boxlengths */
 +  for (d=0; d<DIM; d++) {
 +    box[d][XX] = mu[XX][XX]*box[d][XX]+mu[YY][XX]*box[d][YY]+mu[ZZ][XX]*box[d][ZZ];
 +    box[d][YY] = mu[YY][YY]*box[d][YY]+mu[ZZ][YY]*box[d][ZZ];
 +    box[d][ZZ] = mu[ZZ][ZZ]*box[d][ZZ];
 +  }      
 +
 +  preserve_box_shape(ir,box_rel,box);
 +  
 +  /* (un)shifting should NOT be done after this,
 +   * since the box vectors might have changed
 +   */
 +  inc_nrnb(nrnb,eNR_PCOUPL,nr_atoms);
 +}
 +
 +void berendsen_tcoupl(t_inputrec *ir,gmx_ekindata_t *ekind,real dt)
 +{
 +    t_grpopts *opts;
 +    int    i;
 +    real   T,reft=0,lll;
 +
 +    opts = &ir->opts;
 +
 +    for(i=0; (i<opts->ngtc); i++)
 +    {
 +        if (ir->eI == eiVV)
 +        {
 +            T = ekind->tcstat[i].T;
 +        }
 +        else
 +        {
 +            T = ekind->tcstat[i].Th;
 +        }
 +
 +        if ((opts->tau_t[i] > 0) && (T > 0.0)) {  
 +            reft = max(0.0,opts->ref_t[i]);
 +            lll  = sqrt(1.0 + (dt/opts->tau_t[i])*(reft/T-1.0));
 +            ekind->tcstat[i].lambda = max(min(lll,1.25),0.8);
 +        }
 +        else {
 +            ekind->tcstat[i].lambda = 1.0;
 +        }
 +
 +        if (debug)
 +        {
 +            fprintf(debug,"TC: group %d: T: %g, Lambda: %g\n",
 +                    i,T,ekind->tcstat[i].lambda);
 +        }
 +    }
 +}
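Each group's velocities are scaled by lambda = sqrt(1 + (dt/tau_t)*(T_ref/T - 1)), clamped to [0.8, 1.25] so a badly off-temperature configuration cannot rescale itself too violently in one step. A numeric sketch with assumed values:

static real berendsen_lambda_sketch(void)
{
    real dt    = 0.002;  /* ps */
    real tau_t = 0.1;    /* ps */
    real T     = 310.0;  /* current group temperature, K */
    real reft  = 300.0;  /* reference temperature, K */
    real lll   = sqrt(1.0 + (dt/tau_t)*(reft/T - 1.0));

    return max(min(lll, 1.25), 0.8);  /* ~0.99968: mild cooling this step */
}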
 +
 +static int poisson_variate(real lambda,gmx_rng_t rng) {
 +
 +    real L;
 +    int k=0;
 +    real p=1.0;
 +
 +    L = exp(-lambda);
 +
 +    do
 +    {
 +        k = k+1;
 +        p *= gmx_rng_uniform_real(rng);
 +    } while (p>L);
 +
 +    return k-1;
 +}
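poisson_variate is Knuth's multiplication method: keep multiplying uniform variates into p until p drops below L = exp(-lambda); the number of factors minus one is Poisson(lambda)-distributed. Each draw consumes about lambda+1 uniforms, which is why the caller below only uses it at low randomization rates. A sanity sketch (assumed test values, not part of the build):

static void poisson_mean_sketch(gmx_rng_t rng)
{
    int  i, sum = 0, ndraw = 100000;
    real lambda = 0.5;

    for (i = 0; i < ndraw; i++)
    {
        sum += poisson_variate(lambda, rng);
    }
    /* the sample mean sum/(real)ndraw should approach lambda = 0.5 */
}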
 +
 +void andersen_tcoupl(t_inputrec *ir,t_mdatoms *md,t_state *state, gmx_rng_t rng, real rate, t_idef *idef, int nblocks, int *sblock,gmx_bool *randatom, int *randatom_list, gmx_bool *randomize, real *boltzfac)
 +{
 +    t_grpopts *opts;
 +    int    i,j,k,d,len,n,ngtc,gc=0;
 +    int    nshake, nsettle, nrandom, nrand_group;
 +    real   boltz,scal,reft,prand;
 +    t_iatom *iatoms;
 +
 +    /* convenience variables */
 +    opts = &ir->opts;
 +    ngtc = opts->ngtc;
 +
 +    /* idef is only passed in if it's chance-per-particle andersen, so
 +       it essentially serves as a boolean to determine which type of
 +       andersen is being used */
 +    if (idef) {
 +
 +        /* randomly select atoms to randomize.  However, all constraint
 +           groups have to have either all of the atoms or none of the
 +           atoms randomized.
 +
 +           Algorithm:
 +           1. Select whether or not to randomize each atom to get the correct probability.
 +           2. Cycle through the constraint groups.
 +              2a. for each constraint group, determine the fraction f of that constraint group that are
 +                  chosen to be randomized.
 +              2b. all atoms in the constraint group are randomized with probability f.
 +        */
 +
 +        nrandom = 0;
 +        if ((rate < 0.05) && (md->homenr > 50))
 +        {
 +            /* if the rate is relatively high, use the standard per-atom
 +             * method below; for a low rate, a Poisson distribution
 +             * approximation is more efficient because fewer random
 +             * numbers are required */
 +            nrandom = poisson_variate(md->homenr*rate,rng);  /* how many do we randomize? Use Poisson. */
 +            /* now we know how many, choose them randomly. No worries about
 +               repeats; at this rate they are negligible.  The worst that
 +               happens is that the true rate is lowered by a negligible amount */
 +            for (i=0;i<nrandom;i++)
 +            {
 +                randatom[(int)(gmx_rng_uniform_real(rng)*md->homenr)] = TRUE;
 +            }
 +        }
 +        else
 +        {
 +            for (i=0;i<md->homenr;i++)
 +            {
 +                if (gmx_rng_uniform_real(rng)<rate)
 +                {
 +                    randatom[i] = TRUE;
 +                    nrandom++;
 +                }
 +            }
 +        }
 +
 +        /* instead of looping over the constraint groups, if we had a
 +           list of which atoms were in which constraint groups, we
 +           could then loop over only the groups that are randomized
 +           now.  But that is not available now.  Create later after
 +           determining whether there actually is any slowing. */
 +
 +        /* first, loop through the settles to make sure all groups either entirely randomized, or not randomized. */
 +
 +        nsettle  = idef->il[F_SETTLE].nr/2;
 +        for (i=0;i<nsettle;i++)
 +        {
 +            iatoms = idef->il[F_SETTLE].iatoms;
 +            nrand_group = 0;
 +            for (k=0;k<3;k++)  /* settles are always 3 atoms, hardcoded */
 +            {
 +                if (randatom[iatoms[2*i+1]+k])
 +                {
 +                    nrand_group++;     /* count the number of atoms to be shaken in the settles group */
 +                    randatom[iatoms[2*i+1]+k] = FALSE;
 +                    nrandom--;
 +                }
 +            }
 +            if (nrand_group > 0)
 +            {
 +                prand = (nrand_group)/3.0;  /* use this fraction to compute the probability the
 +                                               whole group is randomized */
 +                if (gmx_rng_uniform_real(rng)<prand)
 +                {
 +                    for (k=0;k<3;k++)
 +                    {
 +                        randatom[iatoms[2*i+1]+k] = TRUE;   /* mark them all to be randomized */
 +                    }
 +                    nrandom+=3;
 +                }
 +            }
 +        }
 +
 +        /* now loop through the shake groups */
 +        nshake = nblocks;
 +        for (i=0;i<nshake;i++)
 +        {
 +            iatoms = &(idef->il[F_CONSTR].iatoms[sblock[i]]);
 +            len = sblock[i+1]-sblock[i];
 +            nrand_group = 0;
 +            for (k=0;k<len;k++)
 +            {
 +                if (k%3 != 0)
 +                {  /* only 2/3 of the sblock items are atoms, the others are labels */
 +                    if (randatom[iatoms[k]])
 +                    {
 +                        nrand_group++;
 +                        randatom[iatoms[k]] = FALSE;  /* need to mark it false here in case the atom is in more than
 +                                                         one group in the shake block */
 +                        nrandom--;
 +                    }
 +                }
 +            }
 +            if (nrand_group > 0)
 +            {
 +                prand = (nrand_group)/(1.0*(2*len/3));
 +                if (gmx_rng_uniform_real(rng)<prand)
 +                {
 +                    for (k=0;k<len;k++)
 +                    {
 +                        if (k%3 != 0)
 +                        {  /* only 2/3 of the sblock items are atoms, the others are labels */
 +                            randatom[iatoms[k]] = TRUE; /* randomize all of them */
 +                            nrandom++;
 +                        }
 +                    }
 +                }
 +            }
 +        }
 +        if (nrandom > 0)
 +        {
 +            n = 0;
 +            for (i=0;i<md->homenr;i++)  /* now loop over the list of atoms */
 +            {
 +                if (randatom[i])
 +                {
 +                    randatom_list[n] = i;
 +                    n++;
 +                }
 +            }
 +            nrandom = n;  /* there are some values of nrandom for which
 +                             this algorithm won't work; for example all
 +                             water molecules and nrandom not a multiple
 +                             of 3.  Better to recount and use this number
 +                             (which we calculate anyway); it will not
 +                             affect the average number of atoms accepted.
 +                          */
 +        }
 +    }
 +    else
 +    {
 +        /* if it's andersen-massive, then randomize all the atoms */
 +        nrandom = md->homenr;
 +        for (i=0;i<nrandom;i++)
 +        {
 +            randatom_list[i] = i;
 +        }
 +    }
 +
 +    /* randomize the velocities of the selected particles */
 +
 +    for (i=0;i<nrandom;i++)  /* now loop over the list of atoms */
 +    {
 +        n = randatom_list[i];
 +        if (md->cTC)
 +        {
 +            gc   = md->cTC[n];  /* assign the atom to a temperature group if there are more than one */
 +        }
 +        if (randomize[gc])
 +        {
 +            scal = sqrt(boltzfac[gc]*md->invmass[n]);
 +            for (d=0;d<DIM;d++)
 +            {
 +                state->v[n][d] = scal*gmx_rng_gaussian_table(rng);
 +            }
 +        }
 +        randatom[n] = FALSE; /* unmark this atom for randomization */
 +    }
 +}
 +
 +
 +void nosehoover_tcoupl(t_grpopts *opts,gmx_ekindata_t *ekind,real dt,
 +                       double xi[],double vxi[], t_extmass *MassQ)
 +{
 +    int   i;
 +    real  reft,oldvxi;
 +    
 +    /* note that this routine does not include Nose-Hoover chains yet. Should be easy to add. */
 +    
 +    for(i=0; (i<opts->ngtc); i++)
 +    {
 +        reft = max(0.0,opts->ref_t[i]);
 +        oldvxi = vxi[i];
 +        vxi[i]  += dt*MassQ->Qinv[i]*(ekind->tcstat[i].Th - reft);
 +        xi[i] += dt*(oldvxi + vxi[i])*0.5;
 +    }
 +}
 +
 +t_state *init_bufstate(const t_state *template_state)
 +{
 +    t_state *state;
 +    int nc = template_state->nhchainlength;
 +    snew(state,1);
 +    snew(state->nosehoover_xi,nc*template_state->ngtc);
 +    snew(state->nosehoover_vxi,nc*template_state->ngtc);
 +    snew(state->therm_integral,template_state->ngtc);
 +    snew(state->nhpres_xi,nc*template_state->nnhpres);
 +    snew(state->nhpres_vxi,nc*template_state->nnhpres);
 +
 +    return state;
 +}  
 +
 +void destroy_bufstate(t_state *state) 
 +{
 +    sfree(state->x);
 +    sfree(state->v);
 +    sfree(state->nosehoover_xi);
 +    sfree(state->nosehoover_vxi);
 +    sfree(state->therm_integral);
 +    sfree(state->nhpres_xi);
 +    sfree(state->nhpres_vxi);
 +    sfree(state);
 +}  
 +
 +void trotter_update(t_inputrec *ir,gmx_large_int_t step, gmx_ekindata_t *ekind, 
 +                    gmx_enerdata_t *enerd, t_state *state, 
 +                    tensor vir, t_mdatoms *md, 
 +                    t_extmass *MassQ, int **trotter_seqlist, int trotter_seqno) 
 +{
 +    
 +    int n,i,j,d,ntgrp,ngtc,gc=0;
 +    t_grp_tcstat *tcstat;
 +    t_grpopts *opts;
 +    gmx_large_int_t step_eff;
 +    real ecorr,pcorr,dvdlcorr;
 +    real bmass,qmass,reft,kT,dt,nd;
 +    tensor dumpres,dumvir;
 +    double *scalefac,dtc;
 +    int *trotter_seq;
 +    rvec sumv={0,0,0},consk;
 +    gmx_bool bCouple;
 +
 +    if (trotter_seqno <= ettTSEQ2)
 +    {
 +        step_eff = step-1;  /* the velocity verlet calls are out of order -- the first half step
 +                               is really the last half step of the previous step, so it
 +                               corresponds to step n-1 */
 +                               
 +    } else {
 +        step_eff = step;
 +    }
 +
 +    bCouple = (ir->nsttcouple == 1 ||
 +               do_per_step(step_eff+ir->nsttcouple,ir->nsttcouple));
 +
 +    trotter_seq = trotter_seqlist[trotter_seqno];
 +
-     dtc = ir->nsttcouple*ir->delta_t;
++    if ((trotter_seq[0] == etrtSKIPALL) || (!bCouple))
 +    {
 +        return;
 +    }
++    dtc = ir->nsttcouple*ir->delta_t;  /* This is OK for NPT, because nsttcouple == nstpcouple is enforced */
 +    opts = &(ir->opts); /* just for ease of referencing */
 +    ngtc = opts->ngtc;
 +    assert(ngtc>0);
 +    snew(scalefac,opts->ngtc);
 +    for (i=0;i<ngtc;i++) 
 +    {
 +        scalefac[i] = 1;
 +    }
 +    /* execute the series of trotter updates specified in the trotterpart array */
 +    
 +    for (i=0;i<NTROTTERPARTS;i++){
 +        /* allow for doubled integrators by doubling dt instead of making 2 calls */
 +        if ((trotter_seq[i] == etrtBAROV2) || (trotter_seq[i] == etrtBARONHC2) || (trotter_seq[i] == etrtNHC2))
 +        {
 +            dt = 2 * dtc;
 +        }
 +        else 
 +        {
 +            dt = dtc;
 +        }
 +
 +        switch (trotter_seq[i])
 +        {
 +        case etrtBAROV:
 +        case etrtBAROV2:
 +            boxv_trotter(ir,&(state->veta),dt,state->box,ekind,vir,
 +                         enerd->term[F_PDISPCORR],MassQ);
 +            break;
 +        case etrtBARONHC:
 +        case etrtBARONHC2:
 +            NHC_trotter(opts,state->nnhpres,ekind,dt,state->nhpres_xi,
 +                        state->nhpres_vxi,NULL,&(state->veta),MassQ,FALSE);      
 +            break;
 +        case etrtNHC:
 +        case etrtNHC2:
 +            NHC_trotter(opts,opts->ngtc,ekind,dt,state->nosehoover_xi,
 +                        state->nosehoover_vxi,scalefac,NULL,MassQ,(ir->eI==eiVV));
 +            /* need to rescale the kinetic energies and velocities here.  Could 
 +               scale the velocities later, but we need them scaled in order to 
 +               produce the correct outputs, so we'll scale them here. */
 +            
 +            for (j=0; j<ngtc; j++)  /* use j: reusing i here would clobber the outer NTROTTERPARTS loop index */
 +            {
 +                tcstat = &ekind->tcstat[j];
 +                tcstat->vscale_nhc = scalefac[j];
 +                tcstat->ekinscaleh_nhc *= (scalefac[j]*scalefac[j]);
 +                tcstat->ekinscalef_nhc *= (scalefac[j]*scalefac[j]);
 +            }
 +            /* now that we've scaled the groupwise velocities, we can add them up to get the total */
 +            /* but do we actually need the total? */
 +            
 +            /* modify the velocities as well */
 +            for (n=md->start;n<md->start+md->homenr;n++) 
 +            {
 +                if (md->cTC)   /* does this conditional need to be here? is this always true?*/
 +                { 
 +                    gc = md->cTC[n];
 +                }
 +                for (d=0;d<DIM;d++) 
 +                {
 +                    state->v[n][d] *= scalefac[gc];
 +                }
 +                
 +                if (debug) 
 +                {
 +                    for (d=0;d<DIM;d++) 
 +                    {
 +                        sumv[d] += (state->v[n][d])/md->invmass[n];
 +                    }
 +                }
 +            }          
 +            break;
 +        default:
 +            break;
 +        }
 +    }
 +    /* check for conserved momentum -- worth looking at this again eventually, but not working right now.*/  
 +#if 0
 +    if (debug) 
 +    {
 +        if (bFirstHalf) 
 +        {
 +            for (d=0;d<DIM;d++) 
 +            {
 +                consk[d] = sumv[d]*exp((1 + 1.0/opts->nrdf[0])*((1.0/DIM)*log(det(state->box)/state->vol0)) + state->nosehoover_xi[0]); 
 +            }
 +            fprintf(debug,"Conserved kappa: %15.8f %15.8f %15.8f\n",consk[0],consk[1],consk[2]);    
 +        }
 +    }
 +#endif
 +    sfree(scalefac);
 +}
 +
 +
 +extern void init_npt_masses(t_inputrec *ir, t_state *state, t_extmass *MassQ, gmx_bool bInit)
 +{
 +    int n,i,j,d,ntgrp,ngtc,nnhpres,nh,gc=0;
 +    t_grp_tcstat *tcstat;
 +    t_grpopts *opts;
 +    real ecorr,pcorr,dvdlcorr;
 +    real bmass,qmass,reft,kT,dt,ndj,nd;
 +    tensor dumpres,dumvir;
 +
 +    opts = &(ir->opts); /* just for ease of referencing */
 +    ngtc = ir->opts.ngtc;
 +    nnhpres = state->nnhpres;
 +    nh = state->nhchainlength; 
 +
 +    if (ir->eI == eiMD) {
 +        if (bInit)
 +        {
 +            snew(MassQ->Qinv,ngtc);
 +        }
 +        for(i=0; (i<ngtc); i++) 
 +        { 
 +            if ((opts->tau_t[i] > 0) && (opts->ref_t[i] > 0)) 
 +            {
 +                MassQ->Qinv[i]=1.0/(sqr(opts->tau_t[i]/M_2PI)*opts->ref_t[i]);
 +            } 
 +            else 
 +            {
 +                MassQ->Qinv[i]=0.0;     
 +            }
 +        }
 +    }
 +    else if (EI_VV(ir->eI))
 +    {
 +    /* Set pressure variables */
 +
 +        if (bInit)
 +        {
 +            if (state->vol0 == 0)
 +            {
 +                state->vol0 = det(state->box); 
 +                /* because we start by defining a fixed
 +                   compressibility, we need the volume at this
 +                   compressibility to solve the problem. */
 +            }
 +        }
 +
 +        /* units are nm^3 * ns^2 / (nm^3 * bar / kJ/mol) = kJ/mol  */
 +        /* Consider evaluating eventually if this is the right mass to use.  All are correct, some might be more stable  */
 +        MassQ->Winv = (PRESFAC*trace(ir->compress)*BOLTZ*opts->ref_t[0])/(DIM*state->vol0*sqr(ir->tau_p/M_2PI));
 +        /* An alternate mass definition, from Tuckerman et al. */ 
 +        /* MassQ->Winv = 1.0/(sqr(ir->tau_p/M_2PI)*(opts->nrdf[0]+DIM)*BOLTZ*opts->ref_t[0]); */
 +        for (d=0;d<DIM;d++) 
 +        {
 +            for (n=0;n<DIM;n++) 
 +            {
 +                MassQ->Winvm[d][n]= PRESFAC*ir->compress[d][n]/(state->vol0*sqr(ir->tau_p/M_2PI)); 
 +                /* not clear this is correct yet for the anisotropic case. Will need to reevaluate
 +                 before using MTTK for anisotropic states.*/
 +            }
 +        }
 +        /* Allocate space for thermostat variables */
 +        if (bInit)
 +        {
 +            snew(MassQ->Qinv,ngtc*nh);
 +        }
 +
 +        /* now, set temperature variables */
 +        for (i=0; i<ngtc; i++)
 +        {
 +            if ((opts->tau_t[i] > 0) && (opts->ref_t[i] > 0))
 +            {
 +                reft = max(0.0,opts->ref_t[i]);
 +                nd = opts->nrdf[i];
 +                kT = BOLTZ*reft;
 +                for (j=0;j<nh;j++)
 +                {
 +                    if (j==0)
 +                    {
 +                        ndj = nd;
 +                    }
 +                    else
 +                    {
 +                        ndj = 1;
 +                    }
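 +                    /* Standard Nose-Hoover chain masses (Martyna et al.):
 +                     * Q_1 = N_df*kB*T*(tau_t/2pi)^2 and, for j > 1,
 +                     * Q_j = kB*T*(tau_t/2pi)^2; stored here as inverses.
 +                     */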
 +                    MassQ->Qinv[i*nh+j]   = 1.0/(sqr(opts->tau_t[i]/M_2PI)*ndj*kT);
 +                }
 +            }
 +            else
 +            {
 +                reft=0.0;
 +                for (j=0;j<nh;j++)
 +                {
 +                    MassQ->Qinv[i*nh+j] = 0.0;
 +                }
 +            }
 +        }
 +    }
 +}
 +
 +int **init_npt_vars(t_inputrec *ir, t_state *state, t_extmass *MassQ, gmx_bool bTrotter)
 +{
 +    int n,i,j,d,ntgrp,ngtc,nnhpres,nh,gc=0;
 +    t_grp_tcstat *tcstat;
 +    t_grpopts *opts;
 +    real ecorr,pcorr,dvdlcorr;
 +    real bmass,qmass,reft,kT,dt,ndj,nd;
 +    tensor dumpres,dumvir;
 +    int **trotter_seq;
 +
 +    opts = &(ir->opts); /* just for ease of referencing */
 +    ngtc = state->ngtc;
 +    nnhpres = state->nnhpres;
 +    nh = state->nhchainlength;
 +
 +    init_npt_masses(ir, state, MassQ, TRUE);
 +    
 +    /* first, initialize and clear all the trotter calls */
 +    snew(trotter_seq,ettTSEQMAX);
 +    for (i=0;i<ettTSEQMAX;i++) 
 +    {
 +        snew(trotter_seq[i],NTROTTERPARTS);
 +        for (j=0;j<NTROTTERPARTS;j++) {
 +            trotter_seq[i][j] = etrtNONE;
 +        }
 +        trotter_seq[i][0] = etrtSKIPALL;
 +    }
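 +    /* The five sequence slots (ettTSEQMAX) are applied at fixed points in
 +     * the integrator loop: [0] an initial round to estimate veta, [1] and
 +     * [4] thermostat calls around the first and second half-step
 +     * velocities, [2] and [3] the first and second half Trotter updates.
 +     */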
 +    
 +    if (!bTrotter) 
 +    {
 +        /* No trotter calls, so we never use the values in the array.
 +         * We still access them (so we need to define them), but
 +         * ignore them. */
 +
 +        return trotter_seq;
 +    }
 +
 +    /* compute the kinetic energy by using the half step velocities or
 +     * the kinetic energies, depending on the order of the trotter calls */
 +
 +    if (ir->eI==eiVV)
 +    {
 +        if (IR_NPT_TROTTER(ir)) 
 +        {
 +            /* This is the complicated version - there are 4 possible calls, depending on ordering.
 +               We start with the initial one. */
 +            /* first, a round that estimates veta. */
 +            trotter_seq[0][0] = etrtBAROV;
 +
 +            /* trotter_seq[1] is etrtNHC for 1/2 step velocities - leave zero */
 +
 +            /* The first half trotter update */
 +            trotter_seq[2][0] = etrtBAROV;
 +            trotter_seq[2][1] = etrtNHC;
 +            trotter_seq[2][2] = etrtBARONHC;
 +
 +            /* The second half trotter update */
 +            trotter_seq[3][0] = etrtBARONHC;
 +            trotter_seq[3][1] = etrtNHC;
 +            trotter_seq[3][2] = etrtBAROV;
 +
 +            /* trotter_seq[4] is etrtNHC for second 1/2 step velocities - leave zero */
 +            
 +        } 
 +        else if (IR_NVT_TROTTER(ir))
 +        {
 +            /* This is the easy version - there are only two calls, both the same.
 +               Otherwise, even easier -- no calls  */
 +            trotter_seq[2][0] = etrtNHC;
 +            trotter_seq[3][0] = etrtNHC;
 +        }
 +        else if (IR_NPH_TROTTER(ir))
 +        {
 +            /* This is the complicated version - there are 4 possible calls, depending on ordering.
 +               We start with the initial one. */
 +            /* first, a round that estimates veta. */
 +            trotter_seq[0][0] = etrtBAROV;
 +
 +            /* trotter_seq[1] is etrtNHC for 1/2 step velocities - leave zero */
 +
 +            /* The first half trotter update */
 +            trotter_seq[2][0] = etrtBAROV;
 +            trotter_seq[2][1] = etrtBARONHC;
 +
 +            /* The second half trotter update */
 +            trotter_seq[3][0] = etrtBARONHC;
 +            trotter_seq[3][1] = etrtBAROV;
 +
 +            /* trotter_seq[4] is etrtNHC for second 1/2 step velocities - leave zero */
 +        }
 +    }
 +    else if (ir->eI==eiVVAK)
 +    {
 +        if (IR_NPT_TROTTER(ir))
 +        {
 +            /* This is the complicated version - there are 4 possible calls, depending on ordering.
 +               We start with the initial one. */
 +            /* first, a round that estimates veta. */
 +            trotter_seq[0][0] = etrtBAROV;
 +
 +            /* The first half trotter update, part 1 -- double update, because it commutes */
 +            trotter_seq[1][0] = etrtNHC;
 +
 +            /* The first half trotter update, part 2 */
 +            trotter_seq[2][0] = etrtBAROV;
 +            trotter_seq[2][1] = etrtBARONHC;
 +            
 +            /* The second half trotter update, part 1 */
 +            trotter_seq[3][0] = etrtBARONHC;
 +            trotter_seq[3][1] = etrtBAROV;
 +
 +            /* The second half trotter update */
 +            trotter_seq[4][0] = etrtNHC;
 +        } 
 +        else if (IR_NVT_TROTTER(ir))
 +        {
 +            /* This is the easy version - there are only two calls, both the same.
 +               Otherwise, even easier -- no calls  */
 +            trotter_seq[1][0] = etrtNHC;
 +            trotter_seq[4][0] = etrtNHC;
 +        }
 +        else if (IR_NPH_TROTTER(ir))
 +        {
 +            /* This is the complicated version - there are 4 possible calls, depending on ordering.
 +               We start with the initial one. */
 +            /* first, a round that estimates veta. */
 +            trotter_seq[0][0] = etrtBAROV; 
 +
 +            /* The first half trotter update, part 1 -- leave zero */
 +            trotter_seq[1][0] = etrtNHC;
 +
 +            /* The first half trotter update, part 2 */
 +            trotter_seq[2][0] = etrtBAROV;
 +            trotter_seq[2][1] = etrtBARONHC;
 +
 +            /* The second half trotter update, part 1 */
 +            trotter_seq[3][0] = etrtBARONHC;
 +            trotter_seq[3][1] = etrtBAROV;
 +
 +            /* The second half trotter update -- blank for now */
 +        }
 +    }
 +
 +    switch (ir->epct)
 +    {
 +    case epctISOTROPIC:  
 +    default:
 +        bmass = DIM*DIM;  /* recommended mass parameters for isotropic barostat */
 +    }    
 +
 +    snew(MassQ->QPinv,nnhpres*opts->nhchainlength);
 +
 +    /* barostat temperature */
 +    if ((ir->tau_p > 0) && (opts->ref_t[0] > 0))
 +    {
 +        reft = max(0.0,opts->ref_t[0]);
 +        kT = BOLTZ*reft;
 +        for (i=0;i<nnhpres;i++) {
 +            for (j=0;j<nh;j++) 
 +            {
 +                if (j==0) {
 +                    qmass = bmass;
 +                } 
 +                else 
 +                {
 +                    qmass = 1;
 +                }
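 +                /* Barostat chain masses: the first link uses the
 +                 * recommended bmass (DIM*DIM for the isotropic case),
 +                 * later links use 1, combined with the 'System'
 +                 * reference temperature and tau_t[0]; stored as inverses.
 +                 */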
 +                MassQ->QPinv[i*opts->nhchainlength+j]   = 1.0/(sqr(opts->tau_t[0]/M_2PI)*qmass*kT);
 +            }
 +        }
 +    }
 +    else 
 +    {
 +        for (i=0;i<nnhpres;i++) {
 +            for (j=0;j<nh;j++) 
 +            {
 +                MassQ->QPinv[i*nh+j]=0.0;
 +            }
 +        }
 +    }    
 +    return trotter_seq;
 +}
 +
 +real NPT_energy(t_inputrec *ir, t_state *state, t_extmass *MassQ)
 +{
 +    int  i,j,nd,ndj,bmass,qmass,ngtcall;
 +    real ener_npt,reft,eta,kT,tau;
 +    double *ivxi, *ixi;
 +    double *iQinv;
 +    real vol,dbaro,W,Q;
 +    int nh = state->nhchainlength;
 +
 +    ener_npt = 0;
 +    
 +    /* now we compute the contribution of the pressure to the conserved quantity */
 +    
 +    if (ir->epc==epcMTTK) 
 +    {
 +        /* find the volume, and the kinetic energy of the volume */
 +        
 +        switch (ir->epct) {
 +            
 +        case epctISOTROPIC:
 +            /* contribution from the pressure momenta */
 +            ener_npt += 0.5*sqr(state->veta)/MassQ->Winv;
 +            
 +            /* contribution from the PV term */
 +            vol = det(state->box);
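 +            /* trace(ref_p)/DIM is the scalar reference pressure in bar;
 +             * dividing by PRESFAC converts bar*nm^3 to kJ/mol.
 +             */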
 +            ener_npt += vol*trace(ir->ref_p)/(DIM*PRESFAC);
 +
 +            break;
 +        case epctANISOTROPIC:
 +            
 +            break;
 +            
 +        case epctSURFACETENSION:
 +            
 +            break;
 +        case epctSEMIISOTROPIC:
 +            
 +            break;
 +        default:
 +            break;
 +        }
 +    }
 +    
 +    if (IR_NPT_TROTTER(ir) || IR_NPH_TROTTER(ir))
 +    {
 +        /* add the energy from the barostat thermostat chain */
 +        for (i=0;i<state->nnhpres;i++) {
 +
 +            /* note -- assumes only one degree of freedom that is thermostatted in barostat */
 +            ivxi = &state->nhpres_vxi[i*nh];
 +            ixi = &state->nhpres_xi[i*nh];
 +            iQinv = &(MassQ->QPinv[i*nh]);
 +            reft = max(ir->opts.ref_t[0],0); /* using 'System' temperature */
 +            kT = BOLTZ * reft;
 +        
 +            for (j=0;j<nh;j++) 
 +            {
 +                if (iQinv[j] > 0)
 +                {
 +                    ener_npt += 0.5*sqr(ivxi[j])/iQinv[j];
 +                    /* contribution from the thermal variable of the NH chain */
 +                    ener_npt += ixi[j]*kT;
 +                }
 +                if (debug) 
 +                {
 +                    fprintf(debug,"P-T-group: %10d Chain %4d ThermV: %15.8f ThermX: %15.8f\n",i,j,ivxi[j],ixi[j]);
 +                }
 +            }
 +        }
 +    }
 +        
 +    if (ir->etc) 
 +    {
 +        for(i=0; i<ir->opts.ngtc; i++) 
 +        {
 +            ixi = &state->nosehoover_xi[i*nh];
 +            ivxi = &state->nosehoover_vxi[i*nh];
 +            iQinv = &(MassQ->Qinv[i*nh]);
 +            
 +            nd = ir->opts.nrdf[i];
 +            reft = max(ir->opts.ref_t[i],0);
 +            kT = BOLTZ * reft;
 +            
 +            if (nd > 0) 
 +            {
 +                if (IR_NVT_TROTTER(ir))
 +                {
 +                    /* contribution from the thermal momenta of the NH chain */
 +                    for (j=0;j<nh;j++) 
 +                    {
 +                        if (iQinv[j] > 0) {
 +                            ener_npt += 0.5*sqr(ivxi[j])/iQinv[j];
 +                            /* contribution from the thermal variable of the NH chain */
 +                            if (j==0) {
 +                                ndj = nd;
 +                            } 
 +                            else 
 +                            {
 +                                ndj = 1;
 +                            } 
 +                            ener_npt += ndj*ixi[j]*kT;
 +                        }
 +                    }
 +                }
 +                else  /* Other non-Trotter temperature Nose-Hoover control -- no chains yet. */
 +                { 
 +                    ener_npt += 0.5*BOLTZ*nd*sqr(ivxi[0])/iQinv[0];
 +                    ener_npt += nd*ixi[0]*kT;
 +                }
 +            }
 +        }
 +    }
 +    return ener_npt;
 +}
 +
 +static real vrescale_gamdev(int ia, gmx_rng_t rng)
 +/* Gamma distribution, adapted from numerical recipes */
 +{
 +    int j;
 +    real am,e,s,v1,v2,x,y;
 +
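 +    /* Two standard routes to a Gamma(ia,1) deviate (Numerical Recipes
 +     * gamdev): for ia < 6 the direct method, x = -log(product of ia
 +     * uniforms), since a sum of ia independent Exp(1) variables is
 +     * Gamma(ia,1); for ia >= 6 rejection sampling with a Cauchy-shaped
 +     * comparison function.
 +     */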
 +    if (ia < 6)
 +    {
 +        do
 +        {
 +            x = 1.0;
 +            for(j=1; j<=ia; j++)
 +            {
 +                x *= gmx_rng_uniform_real(rng);
 +            }
 +        }
 +        while (x == 0);
 +        x = -log(x);
 +    }
 +    else
 +    {
 +        do
 +        {
 +            do
 +            {
 +                do
 +                {
 +                    v1 = gmx_rng_uniform_real(rng);
 +                    v2 = 2.0*gmx_rng_uniform_real(rng)-1.0;
 +                }
 +                while (v1*v1 + v2*v2 > 1.0 ||
 +                       v1*v1*GMX_REAL_MAX < 3.0*ia);
 +                /* The last check above ensures that both x (3.0 > 2.0 in s)
 +                 * and the pre-factor for e do not go out of range.
 +                 */
 +                y = v2/v1;
 +                am = ia - 1;
 +                s = sqrt(2.0*am + 1.0);
 +                x = s*y + am;
 +            }
 +            while (x <= 0.0);
 +            e = (1.0 + y*y)*exp(am*log(x/am) - s*y);
 +        }
 +        while (gmx_rng_uniform_real(rng) > e);
 +    }
 +
 +    return x;
 +}
 +
 +static real vrescale_sumnoises(int nn,gmx_rng_t rng)
 +{
 +/*
 + * Returns the sum of n independent gaussian noises squared
 + * (i.e. equivalent to summing the square of the return values
 + * of nn calls to gmx_rng_gaussian_real).
 + */
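 +/*
 + * Implementation note: the sum of nn squared standard Gaussians is
 + * chi-squared distributed with nn degrees of freedom, which equals
 + * 2*Gamma(nn/2); for odd nn one explicit squared Gaussian is added.
 + */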
 +  real rr;
 +
 +  if (nn == 0) {
 +    return 0.0;
 +  } else if (nn == 1) {
 +    rr = gmx_rng_gaussian_real(rng);
 +    return rr*rr;
 +  } else if (nn % 2 == 0) {
 +    return 2.0*vrescale_gamdev(nn/2,rng);
 +  } else {
 +    rr = gmx_rng_gaussian_real(rng);
 +    return 2.0*vrescale_gamdev((nn-1)/2,rng) + rr*rr;
 +  }
 +}
 +
 +static real vrescale_resamplekin(real kk,real sigma, int ndeg, real taut,
 +                               gmx_rng_t rng)
 +{
 +/*
 + * Generates a new value for the kinetic energy,
 + * according to Bussi et al JCP (2007), Eq. (A7)
 + * kk:    present value of the kinetic energy of the atoms to be thermalized (in arbitrary units)
 + * sigma: target average value of the kinetic energy (ndeg k_b T/2)  (in the same units as kk)
 + * ndeg:  number of degrees of freedom of the atoms to be thermalized
 + * taut:  relaxation time of the thermostat, in units of 'how often this routine is called'
 + */
 +  real factor,rr;
 +
 +  if (taut > 0.1) {
 +    factor = exp(-1.0/taut);
 +  } else {
 +    factor = 0.0;
 +  }
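 +  /* For taut <= 0.1 the exponential factor is vanishingly small, so the
 +   * kinetic energy is in effect completely resampled from the canonical
 +   * distribution on every call.
 +   */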
 +  rr = gmx_rng_gaussian_real(rng);
 +  return
 +    kk +
 +    (1.0 - factor)*(sigma*(vrescale_sumnoises(ndeg-1,rng) + rr*rr)/ndeg - kk) +
 +    2.0*rr*sqrt(kk*sigma/ndeg*(1.0 - factor)*factor);
 +}
 +
 +void vrescale_tcoupl(t_inputrec *ir,gmx_ekindata_t *ekind,real dt,
 +                     double therm_integral[],gmx_rng_t rng)
 +{
 +    t_grpopts *opts;
 +    int    i;
 +    real   Ek,Ek_ref1,Ek_ref,Ek_new; 
 +    
 +    opts = &ir->opts;
 +
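 +    /* For each T-coupling group: draw a new kinetic energy from the
 +     * canonical distribution (Bussi et al.), express the change as a
 +     * velocity scaling factor lambda = sqrt(Ek_new/Ek), and accumulate
 +     * the energy difference in therm_integral, which enters the
 +     * conserved quantity via vrescale_energy().
 +     */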
 +    for(i=0; (i<opts->ngtc); i++)
 +    {
 +        if (ir->eI == eiVV)
 +        {
 +            Ek = trace(ekind->tcstat[i].ekinf);
 +        }
 +        else
 +        {
 +            Ek = trace(ekind->tcstat[i].ekinh);
 +        }
 +        
 +        if (opts->tau_t[i] > 0 && opts->nrdf[i] > 0 && Ek > 0)
 +        {
 +            Ek_ref1 = 0.5*opts->ref_t[i]*BOLTZ;
 +            Ek_ref  = Ek_ref1*opts->nrdf[i];
 +
 +            Ek_new  = vrescale_resamplekin(Ek,Ek_ref,opts->nrdf[i],
 +                                           opts->tau_t[i]/dt,rng);
 +
 +            /* Analytically Ek_new>=0, but we check for rounding errors */
 +            if (Ek_new <= 0)
 +            {
 +                ekind->tcstat[i].lambda = 0.0;
 +            }
 +            else
 +            {
 +                ekind->tcstat[i].lambda = sqrt(Ek_new/Ek);
 +            }
 +
 +            therm_integral[i] -= Ek_new - Ek;
 +
 +            if (debug)
 +            {
 +                fprintf(debug,"TC: group %d: Ekr %g, Ek %g, Ek_new %g, Lambda: %g\n",
 +                        i,Ek_ref,Ek,Ek_new,ekind->tcstat[i].lambda);
 +            }
 +        }
 +        else
 +        {
 +            ekind->tcstat[i].lambda = 1.0;
 +        }
 +    }
 +}
 +
 +real vrescale_energy(t_grpopts *opts,double therm_integral[])
 +{
 +  int i;
 +  real ener;
 +
 +  ener = 0;
 +  for(i=0; i<opts->ngtc; i++) {
 +    ener += therm_integral[i];
 +  }
 +  
 +  return ener;
 +}
 +
 +void rescale_velocities(gmx_ekindata_t *ekind,t_mdatoms *mdatoms,
 +                        int start,int end,rvec v[])
 +{
 +    t_grp_acc      *gstat;
 +    t_grp_tcstat   *tcstat;
 +    unsigned short *cACC,*cTC;
 +    int  ga,gt,n,d;
 +    real lg;
 +    rvec vrel;
 +
 +    tcstat = ekind->tcstat;
 +    cTC    = mdatoms->cTC;
 +
 +    if (ekind->bNEMD)
 +    {
 +        gstat  = ekind->grpstat;
 +        cACC   = mdatoms->cACC;
 +
 +        ga = 0;
 +        gt = 0;
 +        for(n=start; n<end; n++) 
 +        {
 +            if (cACC) 
 +            {
 +                ga   = cACC[n];
 +            }
 +            if (cTC)
 +            {
 +                gt   = cTC[n];
 +            }
 +            /* Only scale the velocity component relative to the COM velocity */
 +            rvec_sub(v[n],gstat[ga].u,vrel);
 +            lg = tcstat[gt].lambda;
 +            for(d=0; d<DIM; d++)
 +            {
 +                v[n][d] = gstat[ga].u[d] + lg*vrel[d];
 +            }
 +        }
 +    }
 +    else
 +    {
 +        gt = 0;
 +        for(n=start; n<end; n++) 
 +        {
 +            if (cTC)
 +            {
 +                gt   = cTC[n];
 +            }
 +            lg = tcstat[gt].lambda;
 +            for(d=0; d<DIM; d++)
 +            {
 +                v[n][d] *= lg;
 +            }
 +        }
 +    }
 +}
 +
 +
 +/* set target temperatures if we are annealing */
 +void update_annealing_target_temp(t_grpopts *opts,real t)
 +{
 +  int i,j,n,npoints;
 +  real pert,thist=0,x;
 +
 +  for(i=0;i<opts->ngtc;i++) {
 +    npoints = opts->anneal_npoints[i];
 +    switch (opts->annealing[i]) {
 +    case eannNO:
 +      continue;
 +    case  eannPERIODIC:
 +      /* calculate time modulo the period */
 +      pert  = opts->anneal_time[i][npoints-1];
 +      n     = t / pert;
 +      thist = t - n*pert; /* modulo time */
 +      /* Make sure rounding didn't get us outside the interval */
 +      if (fabs(thist-pert) < GMX_REAL_EPS*100)
 +      {
 +        thist = 0;
 +      }
 +      break;
 +    case eannSINGLE:
 +      thist = t;
 +      break;
 +    default:
 +      gmx_fatal(FARGS,"Death horror in update_annealing_target_temp (i=%d/%d npoints=%d)",i,opts->ngtc,npoints);
 +    }
 +    /* We are doing annealing for this group if we got here, 
 +     * and we have the (relative) time as thist.
 +     * calculate target temp */
 +    j=0;
 +    while ((j < npoints-1) && (thist>(opts->anneal_time[i][j+1])))
 +      j++;
 +    if (j < npoints-1) {
 +      /* Found our position between points j and j+1. 
 +       * Interpolate: x is the amount from j+1, (1-x) from point j 
 +       * First treat possible jumps in temperature as a special case.
 +       */
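 +      /* Illustrative example (hypothetical values): with anneal_time =
 +       * {0, 100} ps and anneal_temp = {300, 350} K, thist = 25 ps gives
 +       * x = 0.25 and ref_t = 0.25*350 + 0.75*300 = 312.5 K.
 +       */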
 +      if ((opts->anneal_time[i][j+1]-opts->anneal_time[i][j]) < GMX_REAL_EPS*100)
 +      {
 +        opts->ref_t[i] = opts->anneal_temp[i][j+1];
 +      }
 +      else
 +      {
 +        x = ((thist-opts->anneal_time[i][j])/
 +             (opts->anneal_time[i][j+1]-opts->anneal_time[i][j]));
 +        opts->ref_t[i] = x*opts->anneal_temp[i][j+1]+(1-x)*opts->anneal_temp[i][j];
 +      }
 +    }
 +    else {
 +      opts->ref_t[i] = opts->anneal_temp[i][npoints-1];
 +    }
 +  }
 +}
index e834fec52d3ab4136d37729a7eb85b5f8f240a52,0000000000000000000000000000000000000000..2cc197225168b48489a173ef51cfd552f575bfac
mode 100644,000000..100644
--- /dev/null
@@@ -1,9589 -1,0 +1,9643 @@@
-     cellsize_limit_f  = comm->cellsize_min[dim]/ddbox->box_size[dim];
 +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
 + *
 + * 
 + * This file is part of Gromacs        Copyright (c) 1991-2008
 + * David van der Spoel, Erik Lindahl, Berk Hess, University of Groningen.
 + *
 + * This program is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU General Public License
 + * as published by the Free Software Foundation; either version 2
 + * of the License, or (at your option) any later version.
 + *
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the research papers on the package. Check out http://www.gromacs.org
 + * 
 + * And Hey:
 + * Gnomes, ROck Monsters And Chili Sauce
 + */
 +
 +#ifdef HAVE_CONFIG_H
 +#include <config.h>
 +#endif
 +
 +#include <stdio.h>
 +#include <time.h>
 +#include <math.h>
 +#include <string.h>
 +#include <stdlib.h>
 +#include "typedefs.h"
 +#include "smalloc.h"
 +#include "gmx_fatal.h"
 +#include "gmx_fatal_collective.h"
 +#include "vec.h"
 +#include "domdec.h"
 +#include "domdec_network.h"
 +#include "nrnb.h"
 +#include "pbc.h"
 +#include "chargegroup.h"
 +#include "constr.h"
 +#include "mdatoms.h"
 +#include "names.h"
 +#include "pdbio.h"
 +#include "futil.h"
 +#include "force.h"
 +#include "pme.h"
 +#include "pull.h"
 +#include "pull_rotation.h"
 +#include "gmx_wallcycle.h"
 +#include "mdrun.h"
 +#include "nsgrid.h"
 +#include "shellfc.h"
 +#include "mtop_util.h"
 +#include "gmxfio.h"
 +#include "gmx_ga2la.h"
 +#include "gmx_sort.h"
 +#include "macros.h"
 +#include "nbnxn_search.h"
 +#include "bondf.h"
 +#include "gmx_omp_nthreads.h"
 +
 +#ifdef GMX_LIB_MPI
 +#include <mpi.h>
 +#endif
 +#ifdef GMX_THREAD_MPI
 +#include "tmpi.h"
 +#endif
 +
 +#define DDRANK(dd,rank)    (rank)
 +#define DDMASTERRANK(dd)   (dd->masterrank)
 +
 +typedef struct gmx_domdec_master
 +{
 +    /* The cell boundaries */
 +    real **cell_x;
 +    /* The global charge group division */
 +    int  *ncg;     /* Number of home charge groups for each node */
 +    int  *index;   /* Index (size nnodes+1) into cg */
 +    int  *cg;      /* Global charge group index */
 +    int  *nat;     /* Number of home atoms for each node. */
 +    int  *ibuf;    /* Buffer for communication */
 +    rvec *vbuf;    /* Buffer for state scattering and gathering */
 +} gmx_domdec_master_t;
 +
 +typedef struct
 +{
 +    /* The numbers of charge groups to send and receive for each cell
 +     * that requires communication, the last entry contains the total
 +     * number of atoms that needs to be communicated.
 +     */
 +    int nsend[DD_MAXIZONE+2];
 +    int nrecv[DD_MAXIZONE+2];
 +    /* The charge groups to send */
 +    int *index;
 +    int nalloc;
 +    /* The atom range for non-in-place communication */
 +    int cell2at0[DD_MAXIZONE];
 +    int cell2at1[DD_MAXIZONE];
 +} gmx_domdec_ind_t;
 +
 +typedef struct
 +{
 +    int  np;                   /* Number of grid pulses in this dimension */
 +    int  np_dlb;               /* For dlb, for use with edlbAUTO          */
 +    gmx_domdec_ind_t *ind;     /* The indices to communicate, size np     */
 +    int  np_nalloc;
 +    gmx_bool bInPlace;             /* Can we communicate in place?            */
 +} gmx_domdec_comm_dim_t;
 +
 +typedef struct
 +{
 +    gmx_bool *bCellMin;    /* Temp. var.: is this cell size at the limit     */
 +    real *cell_f;      /* State var.: cell boundaries, box relative      */
 +    real *old_cell_f;  /* Temp. var.: old cell size                      */
 +    real *cell_f_max0; /* State var.: max lower boundary, incl neighbors */
 +    real *cell_f_min1; /* State var.: min upper boundary, incl neighbors */
 +    real *bound_min;   /* Temp. var.: lower limit for cell boundary      */
 +    real *bound_max;   /* Temp. var.: upper limit for cell boundary      */
 +    gmx_bool bLimited;     /* State var.: is DLB limited in this dim and row */
 +    real *buf_ncd;     /* Temp. var.                                     */
 +} gmx_domdec_root_t;
 +
 +#define DD_NLOAD_MAX 9
 +
 +/* Here floats are accurate enough, since these variables
 + * only influence the load balancing, not the actual MD results.
 + */
 +typedef struct
 +{
 +    int  nload;
 +    float *load;
 +    float sum;
 +    float max;
 +    float sum_m;
 +    float cvol_min;
 +    float mdf;
 +    float pme;
 +    int   flags;
 +} gmx_domdec_load_t;
 +
 +typedef struct
 +{
 +    int  nsc;
 +    int  ind_gl;
 +    int  ind;
 +} gmx_cgsort_t;
 +
 +typedef struct
 +{
 +    gmx_cgsort_t *sort;
 +    gmx_cgsort_t *sort2;
 +    int  sort_nalloc;
 +    gmx_cgsort_t *sort_new;
 +    int  sort_new_nalloc;
 +    int  *ibuf;
 +    int  ibuf_nalloc;
 +} gmx_domdec_sort_t;
 +
 +typedef struct
 +{
 +    rvec *v;
 +    int  nalloc;
 +} vec_rvec_t;
 +
 +/* This enum determines the order of the coordinates.
 + * ddnatHOME and ddnatZONE should be first and second,
 + * the others can be ordered as wanted.
 + */
 +enum { ddnatHOME, ddnatZONE, ddnatVSITE, ddnatCON, ddnatNR };
 +
 +enum { edlbAUTO, edlbNO, edlbYES, edlbNR };
 +const char *edlb_names[edlbNR] = { "auto", "no", "yes" };
 +
 +typedef struct
 +{
 +    int  dim;      /* The dimension                                          */
 +    gmx_bool dim_match;/* Tells if DD and PME dims match                         */
 +    int  nslab;    /* The number of PME slabs in this dimension              */
 +    real *slb_dim_f; /* Cell sizes for determining the PME comm. with SLB    */
 +    int  *pp_min;  /* The minimum pp node location, size nslab               */
 +    int  *pp_max;  /* The maximum pp node location,size nslab                */
 +    int  maxshift; /* The maximum shift for coordinate redistribution in PME */
 +} gmx_ddpme_t;
 +
 +typedef struct
 +{
 +    real min0;    /* The minimum bottom of this zone                        */
 +    real max1;    /* The maximum top of this zone                           */
 +    real min1;    /* The minimum top of this zone                           */
 +    real mch0;    /* The maximum bottom communication height for this zone  */
 +    real mch1;    /* The maximum top communication height for this zone     */
 +    real p1_0;    /* The bottom value of the first cell in this zone        */
 +    real p1_1;    /* The top value of the first cell in this zone           */
 +} gmx_ddzone_t;
 +
 +typedef struct
 +{
 +    gmx_domdec_ind_t ind;
 +    int *ibuf;
 +    int ibuf_nalloc;
 +    vec_rvec_t vbuf;
 +    int nsend;
 +    int nat;
 +    int nsend_zone;
 +} dd_comm_setup_work_t;
 +
 +typedef struct gmx_domdec_comm
 +{
 +    /* All arrays are indexed with 0 to dd->ndim (not Cartesian indexing),
 +     * unless stated otherwise.
 +     */
 +
 +    /* The number of decomposition dimensions for PME, 0: no PME */
 +    int  npmedecompdim;
 +    /* The number of nodes doing PME (PP/PME or only PME) */
 +    int  npmenodes;
 +    int  npmenodes_x;
 +    int  npmenodes_y;
 +    /* The communication setup including the PME only nodes */
 +    gmx_bool bCartesianPP_PME;
 +    ivec ntot;
 +    int  cartpmedim;
 +    int  *pmenodes;          /* size npmenodes                         */
 +    int  *ddindex2simnodeid; /* size npmenodes, only with bCartesianPP
 +                              * but with bCartesianPP_PME              */
 +    gmx_ddpme_t ddpme[2];
 +    
 +    /* The DD particle-particle nodes only */
 +    gmx_bool bCartesianPP;
 +    int  *ddindex2ddnodeid; /* size npmenode, only with bCartesianPP_PME */
 +    
 +    /* The global charge groups */
 +    t_block cgs_gl;
 +
 +    /* Should we sort the cgs */
 +    int  nstSortCG;
 +    gmx_domdec_sort_t *sort;
 +    
 +    /* Are there charge groups? */
 +    gmx_bool bCGs;
 +
 +    /* Are there bonded and multi-body interactions between charge groups? */
 +    gmx_bool bInterCGBondeds;
 +    gmx_bool bInterCGMultiBody;
 +
 +    /* Data for the optional bonded interaction atom communication range */
 +    gmx_bool bBondComm;
 +    t_blocka *cglink;
 +    char *bLocalCG;
 +
 +    /* The DLB option */
 +    int  eDLB;
 +    /* Are we actually using DLB? */
 +    gmx_bool bDynLoadBal;
 +
 +    /* Cell sizes for static load balancing, first index cartesian */
 +    real **slb_frac;
 +
 +    /* The width of the communicated boundaries */
 +    real cutoff_mbody;
 +    real cutoff;
 +    /* The minimum cell size (including triclinic correction) */
 +    rvec cellsize_min;
 +    /* For dlb, for use with edlbAUTO */
 +    rvec cellsize_min_dlb;
 +    /* The lower limit for the DD cell size with DLB */
 +    real cellsize_limit;
 +    /* Effectively no NB cut-off limit with DLB for systems without PBC? */
 +    gmx_bool bVacDLBNoLimit;
 +
++    /* With PME load balancing we set limits on DLB */
++    gmx_bool bPMELoadBalDLBLimits;
++    /* DLB needs to take into account that we want to allow this maximum
++     * cut-off (for PME load balancing), this could limit cell boundaries.
++     */
++    real PMELoadBal_max_cutoff;
++
 +    /* tric_dir is only stored here because dd_get_ns_ranges needs it */
 +    ivec tric_dir;
 +    /* box0 and box_size are required with dim's without pbc and -gcom */
 +    rvec box0;
 +    rvec box_size;
 +    
 +    /* The cell boundaries */
 +    rvec cell_x0;
 +    rvec cell_x1;
 +
 +    /* The old location of the cell boundaries, to check cg displacements */
 +    rvec old_cell_x0;
 +    rvec old_cell_x1;
 +
 +    /* The communication setup and charge group boundaries for the zones */
 +    gmx_domdec_zones_t zones;
 +    
 +    /* The zone limits for DD dimensions 1 and 2 (not 0), determined from
 +     * cell boundaries of neighboring cells for dynamic load balancing.
 +     */
 +    gmx_ddzone_t zone_d1[2];
 +    gmx_ddzone_t zone_d2[2][2];
 +    
 +    /* The coordinate/force communication setup and indices */
 +    gmx_domdec_comm_dim_t cd[DIM];
 +    /* The maximum number of cells to communicate with in one dimension */
 +    int  maxpulse;
 +    
 +    /* Which cg distribution is stored on the master node */
 +    int master_cg_ddp_count;
 +    
 +    /* The number of cg's received from the direct neighbors */
 +    int  zone_ncg1[DD_MAXZONE];
 +    
 +    /* The atom counts, the range for each type t is nat[t-1] <= at < nat[t] */
 +    int  nat[ddnatNR];
 +
 +    /* Array for signalling if atoms have moved to another domain */
 +    int  *moved;
 +    int  moved_nalloc;
 +    
 +    /* Communication buffer for general use */
 +    int  *buf_int;
 +    int  nalloc_int;
 +
 +    /* Communication buffer for general use */
 +    vec_rvec_t vbuf;
 +
 +    /* Temporary storage for thread parallel communication setup */
 +    int nth;
 +    dd_comm_setup_work_t *dth;
 +
 +    /* Communication buffers only used with multiple grid pulses */
 +    int  *buf_int2;
 +    int  nalloc_int2;
 +    vec_rvec_t vbuf2;
 +    
 +    /* Communication buffers for local redistribution */
 +    int  **cggl_flag;
 +    int  cggl_flag_nalloc[DIM*2];
 +    rvec **cgcm_state;
 +    int  cgcm_state_nalloc[DIM*2];
 +    
 +    /* Cell sizes for dynamic load balancing */
 +    gmx_domdec_root_t **root;
 +    real *cell_f_row;
 +    real cell_f0[DIM];
 +    real cell_f1[DIM];
 +    real cell_f_max0[DIM];
 +    real cell_f_min1[DIM];
 +    
 +    /* Stuff for load communication */
 +    gmx_bool bRecordLoad;
 +    gmx_domdec_load_t *load;
 +#ifdef GMX_MPI
 +    MPI_Comm *mpi_comm_load;
 +#endif
 +
 +    /* Maximum DLB scaling per load balancing step in percent */
 +    int dlb_scale_lim;
 +
 +    /* Cycle counters */
 +    float cycl[ddCyclNr];
 +    int   cycl_n[ddCyclNr];
 +    float cycl_max[ddCyclNr];
 +    /* Flop counter (0=no, 1=yes, 2=with (eFlop-1)*5% noise) */
 +    int eFlop;
 +    double flop;
 +    int    flop_n;
 +    /* How often we have had load measurements */
 +    int    n_load_have;
 +    /* How often we have collected the load measurements */
 +    int    n_load_collect;
 +    
 +    /* Statistics */
 +    double sum_nat[ddnatNR-ddnatZONE];
 +    int    ndecomp;
 +    int    nload;
 +    double load_step;
 +    double load_sum;
 +    double load_max;
 +    ivec   load_lim;
 +    double load_mdf;
 +    double load_pme;
 +
 +    /* The last partition step */
 +    gmx_large_int_t partition_step;
 +
 +    /* Debugging */
 +    int  nstDDDump;
 +    int  nstDDDumpGrid;
 +    int  DD_debug;
 +} gmx_domdec_comm_t;
 +
 +/* The size per charge group of the cggl_flag buffer in gmx_domdec_comm_t */
 +#define DD_CGIBS 2
 +
 +/* The flags for the cggl_flag buffer in gmx_domdec_comm_t */
 +#define DD_FLAG_NRCG  65535
 +#define DD_FLAG_FW(d) (1<<(16+(d)*2))
 +#define DD_FLAG_BW(d) (1<<(16+(d)*2+1))
 +
 +/* Zone permutation required to obtain consecutive charge groups
 + * for neighbor searching.
 + */
 +static const int zone_perm[3][4] = { {0,0,0,0},{1,0,0,0},{3,0,1,2} };
 +
 +/* dd_zo and dd_zp3/dd_zp2 are set up such that i zones with non-zero
 + * components see only j zones with that component 0.
 + */
 +
 +/* The DD zone order */
 +static const ivec dd_zo[DD_MAXZONE] =
 +  {{0,0,0},{1,0,0},{1,1,0},{0,1,0},{0,1,1},{0,0,1},{1,0,1},{1,1,1}};
 +
 +/* The 3D setup */
 +#define dd_z3n  8
 +#define dd_zp3n 4
 +static const ivec dd_zp3[dd_zp3n] = {{0,0,8},{1,3,6},{2,5,6},{3,5,7}};
 +
 +/* The 2D setup */
 +#define dd_z2n  4
 +#define dd_zp2n 2
 +static const ivec dd_zp2[dd_zp2n] = {{0,0,4},{1,3,4}};
 +
 +/* The 1D setup */
 +#define dd_z1n  2
 +#define dd_zp1n 1
 +static const ivec dd_zp1[dd_zp1n] = {{0,0,2}};
 +
 +/* Factors used to avoid problems due to rounding issues */
 +#define DD_CELL_MARGIN       1.0001
 +#define DD_CELL_MARGIN2      1.00005
 +/* Factor to account for pressure scaling during nstlist steps */
 +#define DD_PRES_SCALE_MARGIN 1.02
 +
 +/* Allowed performance loss before we DLB or warn */
 +#define DD_PERF_LOSS 0.05
 +
 +#define DD_CELL_F_SIZE(dd,di) ((dd)->nc[(dd)->dim[(di)]]+1+(di)*2+1+(di))
 +
 +/* Use separate MPI send and receive commands
 + * when nnodes <= GMX_DD_NNODES_SENDRECV.
 + * This saves memory (and some copying for small nnodes).
 + * For high parallelization scatter and gather calls are used.
 + */
 +#define GMX_DD_NNODES_SENDRECV 4
 +
 +
 +/*
 +#define dd_index(n,i) ((((i)[ZZ]*(n)[YY] + (i)[YY])*(n)[XX]) + (i)[XX])
 +
 +static void index2xyz(ivec nc,int ind,ivec xyz)
 +{
 +  xyz[XX] = ind % nc[XX];
 +  xyz[YY] = (ind / nc[XX]) % nc[YY];
 +  xyz[ZZ] = ind / (nc[YY]*nc[XX]);
 +}
 +*/
 +
 +/* This order is required to minimize the coordinate communication in PME
 + * which uses decomposition in the x direction.
 + */
 +#define dd_index(n,i) ((((i)[XX]*(n)[YY] + (i)[YY])*(n)[ZZ]) + (i)[ZZ])
 +
 +static void ddindex2xyz(ivec nc,int ind,ivec xyz)
 +{
 +    xyz[XX] = ind / (nc[YY]*nc[ZZ]);
 +    xyz[YY] = (ind / nc[ZZ]) % nc[YY];
 +    xyz[ZZ] = ind % nc[ZZ];
 +}
 +
 +static int ddcoord2ddnodeid(gmx_domdec_t *dd,ivec c)
 +{
 +    int ddindex;
 +    int ddnodeid=-1;
 +    
 +    ddindex = dd_index(dd->nc,c);
 +    if (dd->comm->bCartesianPP_PME)
 +    {
 +        ddnodeid = dd->comm->ddindex2ddnodeid[ddindex];
 +    }
 +    else if (dd->comm->bCartesianPP)
 +    {
 +#ifdef GMX_MPI
 +        MPI_Cart_rank(dd->mpi_comm_all,c,&ddnodeid);
 +#endif
 +    }
 +    else
 +    {
 +        ddnodeid = ddindex;
 +    }
 +    
 +    return ddnodeid;
 +}
 +
 +static gmx_bool dynamic_dd_box(gmx_ddbox_t *ddbox,t_inputrec *ir)
 +{
 +    return (ddbox->nboundeddim < DIM || DYNAMIC_BOX(*ir));
 +}
 +
 +int ddglatnr(gmx_domdec_t *dd,int i)
 +{
 +    int atnr;
 +    
 +    if (dd == NULL)
 +    {
 +        atnr = i + 1;
 +    }
 +    else
 +    {
 +        if (i >= dd->comm->nat[ddnatNR-1])
 +        {
 +            gmx_fatal(FARGS,"glatnr called with %d, which is larger than the local number of atoms (%d)",i,dd->comm->nat[ddnatNR-1]);
 +        }
 +        atnr = dd->gatindex[i] + 1;
 +    }
 +    
 +    return atnr;
 +}
 +
 +t_block *dd_charge_groups_global(gmx_domdec_t *dd)
 +{
 +    return &dd->comm->cgs_gl;
 +}
 +
 +static void vec_rvec_init(vec_rvec_t *v)
 +{
 +    v->nalloc = 0;
 +    v->v      = NULL;
 +}
 +
 +static void vec_rvec_check_alloc(vec_rvec_t *v,int n)
 +{
 +    if (n > v->nalloc)
 +    {
 +        v->nalloc = over_alloc_dd(n);
 +        srenew(v->v,v->nalloc);
 +    }
 +}
 +
 +void dd_store_state(gmx_domdec_t *dd,t_state *state)
 +{
 +    int i;
 +    
 +    if (state->ddp_count != dd->ddp_count)
 +    {
 +        gmx_incons("The state does not match the domain decomposition state");
 +    }
 +    
 +    state->ncg_gl = dd->ncg_home;
 +    if (state->ncg_gl > state->cg_gl_nalloc)
 +    {
 +        state->cg_gl_nalloc = over_alloc_dd(state->ncg_gl);
 +        srenew(state->cg_gl,state->cg_gl_nalloc);
 +    }
 +    for(i=0; i<state->ncg_gl; i++)
 +    {
 +        state->cg_gl[i] = dd->index_gl[i];
 +    }
 +    
 +    state->ddp_count_cg_gl = dd->ddp_count;
 +}
 +
 +gmx_domdec_zones_t *domdec_zones(gmx_domdec_t *dd)
 +{
 +    return &dd->comm->zones;
 +}
 +
 +void dd_get_ns_ranges(gmx_domdec_t *dd,int icg,
 +                      int *jcg0,int *jcg1,ivec shift0,ivec shift1)
 +{
 +    gmx_domdec_zones_t *zones;
 +    int izone,d,dim;
 +
 +    zones = &dd->comm->zones;
 +
 +    izone = 0;
 +    while (icg >= zones->izone[izone].cg1)
 +    {
 +        izone++;
 +    }
 +    
 +    if (izone == 0)
 +    {
 +        *jcg0 = icg;
 +    }
 +    else if (izone < zones->nizone)
 +    {
 +        *jcg0 = zones->izone[izone].jcg0;
 +    }
 +    else
 +    {
 +        gmx_fatal(FARGS,"DD icg %d out of range: izone (%d) >= nizone (%d)",
 +                  icg,izone,zones->nizone);
 +    }
 +        
 +    *jcg1 = zones->izone[izone].jcg1;
 +    
 +    for(d=0; d<dd->ndim; d++)
 +    {
 +        dim = dd->dim[d];
 +        shift0[dim] = zones->izone[izone].shift0[dim];
 +        shift1[dim] = zones->izone[izone].shift1[dim];
 +        if (dd->comm->tric_dir[dim] || (dd->bGridJump && d > 0))
 +        {
 +            /* A conservative approach, this can be optimized */
 +            shift0[dim] -= 1;
 +            shift1[dim] += 1;
 +        }
 +    }
 +}
 +
 +int dd_natoms_vsite(gmx_domdec_t *dd)
 +{
 +    return dd->comm->nat[ddnatVSITE];
 +}
 +
 +void dd_get_constraint_range(gmx_domdec_t *dd,int *at_start,int *at_end)
 +{
 +    *at_start = dd->comm->nat[ddnatCON-1];
 +    *at_end   = dd->comm->nat[ddnatCON];
 +}
 +
 +void dd_move_x(gmx_domdec_t *dd,matrix box,rvec x[])
 +{
 +    int  nzone,nat_tot,n,d,p,i,j,at0,at1,zone;
 +    int  *index,*cgindex;
 +    gmx_domdec_comm_t *comm;
 +    gmx_domdec_comm_dim_t *cd;
 +    gmx_domdec_ind_t *ind;
 +    rvec shift={0,0,0},*buf,*rbuf;
 +    gmx_bool bPBC,bScrew;
 +    
 +    comm = dd->comm;
 +    
 +    cgindex = dd->cgindex;
 +    
 +    buf = comm->vbuf.v;
 +
 +    nzone = 1;
 +    nat_tot = dd->nat_home;
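 +    /* Loop forward over the decomposition dimensions; the number of
 +     * zones communicated doubles after each dimension (1 -> 2 -> 4 -> 8).
 +     */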
 +    for(d=0; d<dd->ndim; d++)
 +    {
 +        bPBC   = (dd->ci[dd->dim[d]] == 0);
 +        bScrew = (bPBC && dd->bScrewPBC && dd->dim[d] == XX);
 +        if (bPBC)
 +        {
 +            copy_rvec(box[dd->dim[d]],shift);
 +        }
 +        cd = &comm->cd[d];
 +        for(p=0; p<cd->np; p++)
 +        {
 +            ind = &cd->ind[p];
 +            index = ind->index;
 +            n = 0;
 +            if (!bPBC)
 +            {
 +                for(i=0; i<ind->nsend[nzone]; i++)
 +                {
 +                    at0 = cgindex[index[i]];
 +                    at1 = cgindex[index[i]+1];
 +                    for(j=at0; j<at1; j++)
 +                    {
 +                        copy_rvec(x[j],buf[n]);
 +                        n++;
 +                    }
 +                }
 +            }
 +            else if (!bScrew)
 +            {
 +                for(i=0; i<ind->nsend[nzone]; i++)
 +                {
 +                    at0 = cgindex[index[i]];
 +                    at1 = cgindex[index[i]+1];
 +                    for(j=at0; j<at1; j++)
 +                    {
 +                        /* We need to shift the coordinates */
 +                        rvec_add(x[j],shift,buf[n]);
 +                        n++;
 +                    }
 +                }
 +            }
 +            else
 +            {
 +                for(i=0; i<ind->nsend[nzone]; i++)
 +                {
 +                    at0 = cgindex[index[i]];
 +                    at1 = cgindex[index[i]+1];
 +                    for(j=at0; j<at1; j++)
 +                    {
 +                        /* Shift x */
 +                        buf[n][XX] = x[j][XX] + shift[XX];
 +                        /* Rotate y and z.
 +                         * This operation requires a special shift force
 +                         * treatment, which is performed in calc_vir.
 +                         */
 +                        buf[n][YY] = box[YY][YY] - x[j][YY];
 +                        buf[n][ZZ] = box[ZZ][ZZ] - x[j][ZZ];
 +                        n++;
 +                    }
 +                }
 +            }
 +            
 +            if (cd->bInPlace)
 +            {
 +                rbuf = x + nat_tot;
 +            }
 +            else
 +            {
 +                rbuf = comm->vbuf2.v;
 +            }
 +            /* Send and receive the coordinates */
 +            dd_sendrecv_rvec(dd, d, dddirBackward,
 +                             buf,  ind->nsend[nzone+1],
 +                             rbuf, ind->nrecv[nzone+1]);
 +            if (!cd->bInPlace)
 +            {
 +                j = 0;
 +                for(zone=0; zone<nzone; zone++)
 +                {
 +                    for(i=ind->cell2at0[zone]; i<ind->cell2at1[zone]; i++)
 +                    {
 +                        copy_rvec(rbuf[j],x[i]);
 +                        j++;
 +                    }
 +                }
 +            }
 +            nat_tot += ind->nrecv[nzone+1];
 +        }
 +        nzone += nzone;
 +    }
 +}
 +
 +void dd_move_f(gmx_domdec_t *dd,rvec f[],rvec *fshift)
 +{
 +    int  nzone,nat_tot,n,d,p,i,j,at0,at1,zone;
 +    int  *index,*cgindex;
 +    gmx_domdec_comm_t *comm;
 +    gmx_domdec_comm_dim_t *cd;
 +    gmx_domdec_ind_t *ind;
 +    rvec *buf,*sbuf;
 +    ivec vis;
 +    int  is;
 +    gmx_bool bPBC,bScrew;
 +    
 +    comm = dd->comm;
 +    
 +    cgindex = dd->cgindex;
 +
 +    buf = comm->vbuf.v;
 +
 +    n = 0;
 +    nzone = comm->zones.n/2;
 +    nat_tot = dd->nat_tot;
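 +    /* Force communication runs in the reverse order of dd_move_x:
 +     * backward over the dimensions, halving the zone count each time,
 +     * so the received forces are added back onto the atoms that
 +     * provided the coordinates.
 +     */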
 +    for(d=dd->ndim-1; d>=0; d--)
 +    {
 +        bPBC   = (dd->ci[dd->dim[d]] == 0);
 +        bScrew = (bPBC && dd->bScrewPBC && dd->dim[d] == XX);
 +        if (fshift == NULL && !bScrew)
 +        {
 +            bPBC = FALSE;
 +        }
 +        /* Determine which shift vector we need */
 +        clear_ivec(vis);
 +        vis[dd->dim[d]] = 1;
 +        is = IVEC2IS(vis);
 +        
 +        cd = &comm->cd[d];
 +        for(p=cd->np-1; p>=0; p--) {
 +            ind = &cd->ind[p];
 +            nat_tot -= ind->nrecv[nzone+1];
 +            if (cd->bInPlace)
 +            {
 +                sbuf = f + nat_tot;
 +            }
 +            else
 +            {
 +                sbuf = comm->vbuf2.v;
 +                j = 0;
 +                for(zone=0; zone<nzone; zone++)
 +                {
 +                    for(i=ind->cell2at0[zone]; i<ind->cell2at1[zone]; i++)
 +                    {
 +                        copy_rvec(f[i],sbuf[j]);
 +                        j++;
 +                    }
 +                }
 +            }
 +            /* Communicate the forces */
 +            dd_sendrecv_rvec(dd, d, dddirForward,
 +                             sbuf, ind->nrecv[nzone+1],
 +                             buf,  ind->nsend[nzone+1]);
 +            index = ind->index;
 +            /* Add the received forces */
 +            n = 0;
 +            if (!bPBC)
 +            {
 +                for(i=0; i<ind->nsend[nzone]; i++)
 +                {
 +                    at0 = cgindex[index[i]];
 +                    at1 = cgindex[index[i]+1];
 +                    for(j=at0; j<at1; j++)
 +                    {
 +                        rvec_inc(f[j],buf[n]);
 +                        n++;
 +                    }
 +                } 
 +            }
 +            else if (!bScrew)
 +            {
 +                for(i=0; i<ind->nsend[nzone]; i++)
 +                {
 +                    at0 = cgindex[index[i]];
 +                    at1 = cgindex[index[i]+1];
 +                    for(j=at0; j<at1; j++)
 +                    {
 +                        rvec_inc(f[j],buf[n]);
 +                        /* Add this force to the shift force */
 +                        rvec_inc(fshift[is],buf[n]);
 +                        n++;
 +                    }
 +                }
 +            }
 +            else
 +            {
 +                for(i=0; i<ind->nsend[nzone]; i++)
 +                {
 +                    at0 = cgindex[index[i]];
 +                    at1 = cgindex[index[i]+1];
 +                    for(j=at0; j<at1; j++)
 +                    {
 +                        /* Rotate the force */
 +                        f[j][XX] += buf[n][XX];
 +                        f[j][YY] -= buf[n][YY];
 +                        f[j][ZZ] -= buf[n][ZZ];
 +                        if (fshift)
 +                        {
 +                            /* Add this force to the shift force */
 +                            rvec_inc(fshift[is],buf[n]);
 +                        }
 +                        n++;
 +                    }
 +                }
 +            }
 +        }
 +        nzone /= 2;
 +    }
 +}
 +
 +void dd_atom_spread_real(gmx_domdec_t *dd,real v[])
 +{
 +    int  nzone,nat_tot,n,d,p,i,j,at0,at1,zone;
 +    int  *index,*cgindex;
 +    gmx_domdec_comm_t *comm;
 +    gmx_domdec_comm_dim_t *cd;
 +    gmx_domdec_ind_t *ind;
 +    real *buf,*rbuf;
 +    
 +    comm = dd->comm;
 +    
 +    cgindex = dd->cgindex;
 +    
 +    buf = &comm->vbuf.v[0][0];
 +
 +    nzone = 1;
 +    nat_tot = dd->nat_home;
 +    for(d=0; d<dd->ndim; d++)
 +    {
 +        cd = &comm->cd[d];
 +        for(p=0; p<cd->np; p++)
 +        {
 +            ind = &cd->ind[p];
 +            index = ind->index;
 +            n = 0;
 +            for(i=0; i<ind->nsend[nzone]; i++)
 +            {
 +                at0 = cgindex[index[i]];
 +                at1 = cgindex[index[i]+1];
 +                for(j=at0; j<at1; j++)
 +                {
 +                    buf[n] = v[j];
 +                    n++;
 +                }
 +            }
 +            
 +            if (cd->bInPlace)
 +            {
 +                rbuf = v + nat_tot;
 +            }
 +            else
 +            {
 +                rbuf = &comm->vbuf2.v[0][0];
 +            }
 +            /* Send and receive the coordinates */
 +            dd_sendrecv_real(dd, d, dddirBackward,
 +                             buf,  ind->nsend[nzone+1],
 +                             rbuf, ind->nrecv[nzone+1]);
 +            if (!cd->bInPlace)
 +            {
 +                j = 0;
 +                for(zone=0; zone<nzone; zone++)
 +                {
 +                    for(i=ind->cell2at0[zone]; i<ind->cell2at1[zone]; i++)
 +                    {
 +                        v[i] = rbuf[j];
 +                        j++;
 +                    }
 +                }
 +            }
 +            nat_tot += ind->nrecv[nzone+1];
 +        }
 +        nzone += nzone;
 +    }
 +}
 +
 +void dd_atom_sum_real(gmx_domdec_t *dd,real v[])
 +{
 +    int  nzone,nat_tot,n,d,p,i,j,at0,at1,zone;
 +    int  *index,*cgindex;
 +    gmx_domdec_comm_t *comm;
 +    gmx_domdec_comm_dim_t *cd;
 +    gmx_domdec_ind_t *ind;
 +    real *buf,*sbuf;
 +    
 +    comm = dd->comm;
 +    
 +    cgindex = dd->cgindex;
 +
 +    buf = &comm->vbuf.v[0][0];
 +
 +    n = 0;
 +    nzone = comm->zones.n/2;
 +    nat_tot = dd->nat_tot;
 +    for(d=dd->ndim-1; d>=0; d--)
 +    {
 +        cd = &comm->cd[d];
 +        for(p=cd->np-1; p>=0; p--) {
 +            ind = &cd->ind[p];
 +            nat_tot -= ind->nrecv[nzone+1];
 +            if (cd->bInPlace)
 +            {
 +                sbuf = v + nat_tot;
 +            }
 +            else
 +            {
 +                sbuf = &comm->vbuf2.v[0][0];
 +                j = 0;
 +                for(zone=0; zone<nzone; zone++)
 +                {
 +                    for(i=ind->cell2at0[zone]; i<ind->cell2at1[zone]; i++)
 +                    {
 +                        sbuf[j] = v[i];
 +                        j++;
 +                    }
 +                }
 +            }
 +            /* Communicate the forces */
 +            dd_sendrecv_real(dd, d, dddirForward,
 +                             sbuf, ind->nrecv[nzone+1],
 +                             buf,  ind->nsend[nzone+1]);
 +            index = ind->index;
 +            /* Add the received forces */
 +            n = 0;
 +            for(i=0; i<ind->nsend[nzone]; i++)
 +            {
 +                at0 = cgindex[index[i]];
 +                at1 = cgindex[index[i]+1];
 +                for(j=at0; j<at1; j++)
 +                {
 +                    v[j] += buf[n];
 +                    n++;
 +                }
 +            } 
 +        }
 +        nzone /= 2;
 +    }
 +}
 +
 +static void print_ddzone(FILE *fp,int d,int i,int j,gmx_ddzone_t *zone)
 +{
 +    fprintf(fp,"zone d0 %d d1 %d d2 %d  min0 %6.3f max1 %6.3f mch0 %6.3f mch1 %6.3f p1_0 %6.3f p1_1 %6.3f\n",
 +            d,i,j,
 +            zone->min0,zone->max1,
 +            zone->mch0,zone->mch1,
 +            zone->p1_0,zone->p1_1);
 +}
 +
 +
 +#define DDZONECOMM_MAXZONE  5
 +#define DDZONECOMM_BUFSIZE  3
 +
 +static void dd_sendrecv_ddzone(const gmx_domdec_t *dd,
 +                               int ddimind,int direction,
 +                               gmx_ddzone_t *buf_s,int n_s,
 +                               gmx_ddzone_t *buf_r,int n_r)
 +{
 +#define ZBS  DDZONECOMM_BUFSIZE
 +    rvec vbuf_s[DDZONECOMM_MAXZONE*ZBS];
 +    rvec vbuf_r[DDZONECOMM_MAXZONE*ZBS];
 +    int i;
 +
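 +    /* Pack the 7 reals of each gmx_ddzone_t into ZBS (= 3) rvecs
 +     * (9 reals, two unused) so the generic rvec send/receive routine
 +     * can be reused.
 +     */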
 +    for(i=0; i<n_s; i++)
 +    {
 +        vbuf_s[i*ZBS  ][0] = buf_s[i].min0;
 +        vbuf_s[i*ZBS  ][1] = buf_s[i].max1;
 +        vbuf_s[i*ZBS  ][2] = buf_s[i].min1;
 +        vbuf_s[i*ZBS+1][0] = buf_s[i].mch0;
 +        vbuf_s[i*ZBS+1][1] = buf_s[i].mch1;
 +        vbuf_s[i*ZBS+1][2] = 0;
 +        vbuf_s[i*ZBS+2][0] = buf_s[i].p1_0;
 +        vbuf_s[i*ZBS+2][1] = buf_s[i].p1_1;
 +        vbuf_s[i*ZBS+2][2] = 0;
 +    }
 +
 +    dd_sendrecv_rvec(dd, ddimind, direction,
 +                     vbuf_s, n_s*ZBS,
 +                     vbuf_r, n_r*ZBS);
 +
 +    for(i=0; i<n_r; i++)
 +    {
 +        buf_r[i].min0 = vbuf_r[i*ZBS  ][0];
 +        buf_r[i].max1 = vbuf_r[i*ZBS  ][1];
 +        buf_r[i].min1 = vbuf_r[i*ZBS  ][2];
 +        buf_r[i].mch0 = vbuf_r[i*ZBS+1][0];
 +        buf_r[i].mch1 = vbuf_r[i*ZBS+1][1];
 +        buf_r[i].p1_0 = vbuf_r[i*ZBS+2][0];
 +        buf_r[i].p1_1 = vbuf_r[i*ZBS+2][1];
 +    }
 +
 +#undef ZBS
 +}
 +
 +static void dd_move_cellx(gmx_domdec_t *dd,gmx_ddbox_t *ddbox,
 +                          rvec cell_ns_x0,rvec cell_ns_x1)
 +{
 +    int  d,d1,dim,dim1,pos,buf_size,i,j,k,p,npulse,npulse_min;
 +    gmx_ddzone_t *zp;
 +    gmx_ddzone_t buf_s[DDZONECOMM_MAXZONE];
 +    gmx_ddzone_t buf_r[DDZONECOMM_MAXZONE];
 +    gmx_ddzone_t buf_e[DDZONECOMM_MAXZONE];
 +    rvec extr_s[2],extr_r[2];
 +    rvec dh;
 +    real dist_d,c=0,det;
 +    gmx_domdec_comm_t *comm;
 +    gmx_bool bPBC,bUse;
 +
 +    comm = dd->comm;
 +
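 +    /* Communicate the zone boundary information and cell-fraction
 +     * extremes of neighboring cells along the decomposition dimensions;
 +     * the results update the neighbor-search ranges cell_ns_x0/x1 and
 +     * comm->cell_f_max0/min1 used for dynamic load balancing.
 +     */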
 +    for(d=1; d<dd->ndim; d++)
 +    {
 +        dim = dd->dim[d];
 +        zp = (d == 1) ? &comm->zone_d1[0] : &comm->zone_d2[0][0];
 +        zp->min0 = cell_ns_x0[dim];
 +        zp->max1 = cell_ns_x1[dim];
 +        zp->min1 = cell_ns_x1[dim];
 +        zp->mch0 = cell_ns_x0[dim];
 +        zp->mch1 = cell_ns_x1[dim];
 +        zp->p1_0 = cell_ns_x0[dim];
 +        zp->p1_1 = cell_ns_x1[dim];
 +    }
 +    
 +    for(d=dd->ndim-2; d>=0; d--)
 +    {
 +        dim  = dd->dim[d];
 +        bPBC = (dim < ddbox->npbcdim);
 +
 +        /* Use an rvec to store two reals */
 +        extr_s[d][0] = comm->cell_f0[d+1];
 +        extr_s[d][1] = comm->cell_f1[d+1];
 +        extr_s[d][2] = comm->cell_f1[d+1];
 +
 +        pos = 0;
 +        /* Store the extremes in the backward sending buffer,
 +         * so they get updated separately from the forward communication.
 +         */
 +        for(d1=d; d1<dd->ndim-1; d1++)
 +        {
 +            /* We invert the order to be able to use the same loop for buf_e */
 +            buf_s[pos].min0 = extr_s[d1][1];
 +            buf_s[pos].max1 = extr_s[d1][0];
 +            buf_s[pos].min1 = extr_s[d1][2];
 +            buf_s[pos].mch0 = 0;
 +            buf_s[pos].mch1 = 0;
 +            /* Store the cell corner of the dimension we communicate along */
 +            buf_s[pos].p1_0 = comm->cell_x0[dim];
 +            buf_s[pos].p1_1 = 0;
 +            pos++;
 +        }
 +
 +        buf_s[pos] = (dd->ndim == 2) ? comm->zone_d1[0] : comm->zone_d2[0][0];
 +        pos++;
 +
 +        if (dd->ndim == 3 && d == 0)
 +        {
 +            buf_s[pos] = comm->zone_d2[0][1];
 +            pos++;
 +            buf_s[pos] = comm->zone_d1[0];
 +            pos++;
 +        }
 +
 +        /* We only need to communicate the extremes
 +         * in the forward direction
 +         */
 +        npulse = comm->cd[d].np;
 +        if (bPBC)
 +        {
 +            /* Take the minimum to avoid double communication */
 +            npulse_min = min(npulse,dd->nc[dim]-1-npulse);
 +        }
 +        else
 +        {
 +            /* Without PBC we should really not communicate over
 +             * the boundaries, but implementing that complicates
 +             * the communication setup and therefore we simply
 +             * do all communication, but ignore some data.
 +             */
 +            npulse_min = npulse;
 +        }
 +        for(p=0; p<npulse_min; p++)
 +        {
 +            /* Communicate the extremes forward */
 +            bUse = (bPBC || dd->ci[dim] > 0);
 +
 +            dd_sendrecv_rvec(dd, d, dddirForward,
 +                             extr_s+d, dd->ndim-d-1,
 +                             extr_r+d, dd->ndim-d-1);
 +
 +            if (bUse)
 +            {
 +                for(d1=d; d1<dd->ndim-1; d1++)
 +                {
 +                    extr_s[d1][0] = max(extr_s[d1][0],extr_r[d1][0]);
 +                    extr_s[d1][1] = min(extr_s[d1][1],extr_r[d1][1]);
 +                    extr_s[d1][2] = min(extr_s[d1][2],extr_r[d1][2]);
 +                }
 +            }
 +        }
 +
 +        buf_size = pos;
 +        for(p=0; p<npulse; p++)
 +        {
 +            /* Communicate all the zone information backward */
 +            bUse = (bPBC || dd->ci[dim] < dd->nc[dim] - 1);
 +
 +            dd_sendrecv_ddzone(dd, d, dddirBackward,
 +                               buf_s, buf_size,
 +                               buf_r, buf_size);
 +
 +            clear_rvec(dh);
 +            if (p > 0)
 +            {
 +                for(d1=d+1; d1<dd->ndim; d1++)
 +                {
 +                    /* Determine the decrease of maximum required
 +                     * communication height along d1 due to the distance along d,
 +                     * this avoids a lot of useless atom communication.
 +                     */
 +                    dist_d = comm->cell_x1[dim] - buf_r[0].p1_0;
 +
 +                    if (ddbox->tric_dir[dim])
 +                    {
 +                        /* c is the off-diagonal coupling between the cell planes
 +                         * along directions d and d1.
 +                         */
 +                        c = ddbox->v[dim][dd->dim[d1]][dim];
 +                    }
 +                    else
 +                    {
 +                        c = 0;
 +                    }
 +                    det = (1 + c*c)*comm->cutoff*comm->cutoff - dist_d*dist_d;
 +                    if (det > 0)
 +                    {
 +                        dh[d1] = comm->cutoff - (c*dist_d + sqrt(det))/(1 + c*c);
 +                    }
 +                    else
 +                    {
 +                        /* A negative value signals out of range */
 +                        dh[d1] = -1;
 +                    }
 +                }
 +            }
 +
 +            /* Accumulate the extremes over all pulses */
 +            for(i=0; i<buf_size; i++)
 +            {
 +                if (p == 0)
 +                {
 +                    buf_e[i] = buf_r[i];
 +                }
 +                else
 +                {
 +                    if (bUse)
 +                    {
 +                        buf_e[i].min0 = min(buf_e[i].min0,buf_r[i].min0);
 +                        buf_e[i].max1 = max(buf_e[i].max1,buf_r[i].max1);
 +                        buf_e[i].min1 = min(buf_e[i].min1,buf_r[i].min1);
 +                    }
 +
 +                    if (dd->ndim == 3 && d == 0 && i == buf_size - 1)
 +                    {
 +                        d1 = 1;
 +                    }
 +                    else
 +                    {
 +                        d1 = d + 1;
 +                    }
 +                    if (bUse && dh[d1] >= 0)
 +                    {
 +                        buf_e[i].mch0 = max(buf_e[i].mch0,buf_r[i].mch0-dh[d1]);
 +                        buf_e[i].mch1 = max(buf_e[i].mch1,buf_r[i].mch1-dh[d1]);
 +                    }
 +                }
 +                /* Copy the received buffer to the send buffer,
 +                 * to pass the data through with the next pulse.
 +                 */
 +                buf_s[i] = buf_r[i];
 +            }
 +            if (((bPBC || dd->ci[dim]+npulse < dd->nc[dim]) && p == npulse-1) ||
 +                (!bPBC && dd->ci[dim]+1+p == dd->nc[dim]-1))
 +            {
 +                /* Store the extremes */ 
 +                pos = 0;
 +
 +                for(d1=d; d1<dd->ndim-1; d1++)
 +                {
 +                    extr_s[d1][1] = min(extr_s[d1][1],buf_e[pos].min0);
 +                    extr_s[d1][0] = max(extr_s[d1][0],buf_e[pos].max1);
 +                    extr_s[d1][2] = min(extr_s[d1][2],buf_e[pos].min1);
 +                    pos++;
 +                }
 +
 +                if (d == 1 || (d == 0 && dd->ndim == 3))
 +                {
 +                    for(i=d; i<2; i++)
 +                    {
 +                        comm->zone_d2[1-d][i] = buf_e[pos];
 +                        pos++;
 +                    }
 +                }
 +                if (d == 0)
 +                {
 +                    comm->zone_d1[1] = buf_e[pos];
 +                    pos++;
 +                }
 +            }
 +        }
 +    }
 +    
 +    if (dd->ndim >= 2)
 +    {
 +        dim = dd->dim[1];
 +        for(i=0; i<2; i++)
 +        {
 +            if (debug)
 +            {
 +                print_ddzone(debug,1,i,0,&comm->zone_d1[i]);
 +            }
 +            cell_ns_x0[dim] = min(cell_ns_x0[dim],comm->zone_d1[i].min0);
 +            cell_ns_x1[dim] = max(cell_ns_x1[dim],comm->zone_d1[i].max1);
 +        }
 +    }
 +    if (dd->ndim >= 3)
 +    {
 +        dim = dd->dim[2];
 +        for(i=0; i<2; i++)
 +        {
 +            for(j=0; j<2; j++)
 +            {
 +                if (debug)
 +                {
 +                    print_ddzone(debug,2,i,j,&comm->zone_d2[i][j]);
 +                }
 +                cell_ns_x0[dim] = min(cell_ns_x0[dim],comm->zone_d2[i][j].min0);
 +                cell_ns_x1[dim] = max(cell_ns_x1[dim],comm->zone_d2[i][j].max1);
 +            }
 +        }
 +    }
 +    for(d=1; d<dd->ndim; d++)
 +    {
 +        comm->cell_f_max0[d] = extr_s[d-1][0];
 +        comm->cell_f_min1[d] = extr_s[d-1][1];
 +        if (debug)
 +        {
 +            fprintf(debug,"Cell fraction d %d, max0 %f, min1 %f\n",
 +                    d,comm->cell_f_max0[d],comm->cell_f_min1[d]);
 +        }
 +    }
 +}
 +
 +static void dd_collect_cg(gmx_domdec_t *dd,
 +                          t_state *state_local)
 +{
 +    gmx_domdec_master_t *ma=NULL;
 +    int buf2[2],*ibuf,i,ncg_home=0,*cg=NULL,nat_home=0;
 +    t_block *cgs_gl;
 +
 +    if (state_local->ddp_count == dd->comm->master_cg_ddp_count)
 +    {
 +        /* The master has the correct distribution */
 +        return;
 +    }
 +    
 +    if (state_local->ddp_count == dd->ddp_count)
 +    {
 +        ncg_home = dd->ncg_home;
 +        cg       = dd->index_gl;
 +        nat_home = dd->nat_home;
 +    } 
 +    else if (state_local->ddp_count_cg_gl == state_local->ddp_count)
 +    {
 +        cgs_gl = &dd->comm->cgs_gl;
 +
 +        ncg_home = state_local->ncg_gl;
 +        cg       = state_local->cg_gl;
 +        nat_home = 0;
 +        for(i=0; i<ncg_home; i++)
 +        {
 +            nat_home += cgs_gl->index[cg[i]+1] - cgs_gl->index[cg[i]];
 +        }
 +    }
 +    else
 +    {
 +        gmx_incons("Attempted to collect a vector for a state for which the charge group distribution is unknown");
 +    }
 +    
 +    /* Use the counts determined above; when collecting for a state
 +     * read from checkpoint these can differ from dd->ncg_home/nat_home.
 +     */
 +    buf2[0] = ncg_home;
 +    buf2[1] = nat_home;
 +    if (DDMASTER(dd))
 +    {
 +        ma = dd->ma;
 +        ibuf = ma->ibuf;
 +    }
 +    else
 +    {
 +        ibuf = NULL;
 +    }
 +    /* Collect the charge group and atom counts on the master */
 +    dd_gather(dd,2*sizeof(int),buf2,ibuf);
 +    
 +    if (DDMASTER(dd))
 +    {
 +        ma->index[0] = 0;
 +        for(i=0; i<dd->nnodes; i++)
 +        {
 +            ma->ncg[i] = ma->ibuf[2*i];
 +            ma->nat[i] = ma->ibuf[2*i+1];
 +            ma->index[i+1] = ma->index[i] + ma->ncg[i];
 +            
 +        }
 +        /* Make byte counts and indices */
 +        for(i=0; i<dd->nnodes; i++)
 +        {
 +            ma->ibuf[i] = ma->ncg[i]*sizeof(int);
 +            ma->ibuf[dd->nnodes+i] = ma->index[i]*sizeof(int);
 +        }
 +        if (debug)
 +        {
 +            fprintf(debug,"Initial charge group distribution: ");
 +            for(i=0; i<dd->nnodes; i++)
 +                fprintf(debug," %d",ma->ncg[i]);
 +            fprintf(debug,"\n");
 +        }
 +    }
 +    
 +    /* Collect the charge group indices on the master */
 +    dd_gatherv(dd,
 +               ncg_home*sizeof(int),cg,
 +               DDMASTER(dd) ? ma->ibuf : NULL,
 +               DDMASTER(dd) ? ma->ibuf+dd->nnodes : NULL,
 +               DDMASTER(dd) ? ma->cg : NULL);
 +    
 +    dd->comm->master_cg_ddp_count = state_local->ddp_count;
 +}
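 +/* Note: master_cg_ddp_count records which distribution the master copy
 + * of the cg indices corresponds to, so the early return above skips the
 + * gather when collecting e.g. x and v for the same decomposition state.
 + */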
 +
 +static void dd_collect_vec_sendrecv(gmx_domdec_t *dd,
 +                                    rvec *lv,rvec *v)
 +{
 +    gmx_domdec_master_t *ma;
 +    int  n,i,c,a,nalloc=0;
 +    rvec *buf=NULL;
 +    t_block *cgs_gl;
 +
 +    ma = dd->ma;
 +    
 +    if (!DDMASTER(dd))
 +    {
 +#ifdef GMX_MPI
 +        MPI_Send(lv,dd->nat_home*sizeof(rvec),MPI_BYTE,DDMASTERRANK(dd),
 +                 dd->rank,dd->mpi_comm_all);
 +#endif
 +    } else {
 +        /* Copy the master coordinates to the global array */
 +        cgs_gl = &dd->comm->cgs_gl;
 +
 +        n = DDMASTERRANK(dd);
 +        a = 0;
 +        for(i=ma->index[n]; i<ma->index[n+1]; i++)
 +        {
 +            for(c=cgs_gl->index[ma->cg[i]]; c<cgs_gl->index[ma->cg[i]+1]; c++)
 +            {
 +                copy_rvec(lv[a++],v[c]);
 +            }
 +        }
 +        
 +        for(n=0; n<dd->nnodes; n++)
 +        {
 +            if (n != dd->rank)
 +            {
 +                if (ma->nat[n] > nalloc)
 +                {
 +                    nalloc = over_alloc_dd(ma->nat[n]);
 +                    srenew(buf,nalloc);
 +                }
 +#ifdef GMX_MPI
 +                MPI_Recv(buf,ma->nat[n]*sizeof(rvec),MPI_BYTE,DDRANK(dd,n),
 +                         n,dd->mpi_comm_all,MPI_STATUS_IGNORE);
 +#endif
 +                a = 0;
 +                for(i=ma->index[n]; i<ma->index[n+1]; i++)
 +                {
 +                    for(c=cgs_gl->index[ma->cg[i]]; c<cgs_gl->index[ma->cg[i]+1]; c++)
 +                    {
 +                        copy_rvec(buf[a++],v[c]);
 +                    }
 +                }
 +            }
 +        }
 +        sfree(buf);
 +    }
 +}
 +
 +static void get_commbuffer_counts(gmx_domdec_t *dd,
 +                                  int **counts,int **disps)
 +{
 +    gmx_domdec_master_t *ma;
 +    int n;
 +
 +    ma = dd->ma;
 +    
 +    /* Make the rvec count and displacement arrays */
 +    *counts  = ma->ibuf;
 +    *disps   = ma->ibuf + dd->nnodes;
 +    for(n=0; n<dd->nnodes; n++)
 +    {
 +        (*counts)[n] = ma->nat[n]*sizeof(rvec);
 +        (*disps)[n]  = (n == 0 ? 0 : (*disps)[n-1] + (*counts)[n-1]);
 +    }
 +}
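 +/* As an illustration: with nat = {100,120,90} on 3 nodes this produces
 + * byte counts {100,120,90}*sizeof(rvec) and displacements
 + * {0,100,220}*sizeof(rvec), the layout expected by the gatherv and
 + * scatterv style collectives used below.
 + */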
 +
 +static void dd_collect_vec_gatherv(gmx_domdec_t *dd,
 +                                   rvec *lv,rvec *v)
 +{
 +    gmx_domdec_master_t *ma;
 +    int  *rcounts=NULL,*disps=NULL;
 +    int  n,i,c,a;
 +    rvec *buf=NULL;
 +    t_block *cgs_gl;
 +    
 +    ma = dd->ma;
 +    
 +    if (DDMASTER(dd))
 +    {
 +        get_commbuffer_counts(dd,&rcounts,&disps);
 +
 +        buf = ma->vbuf;
 +    }
 +    
 +    dd_gatherv(dd,dd->nat_home*sizeof(rvec),lv,rcounts,disps,buf);
 +
 +    if (DDMASTER(dd))
 +    {
 +        cgs_gl = &dd->comm->cgs_gl;
 +
 +        a = 0;
 +        for(n=0; n<dd->nnodes; n++)
 +        {
 +            for(i=ma->index[n]; i<ma->index[n+1]; i++)
 +            {
 +                for(c=cgs_gl->index[ma->cg[i]]; c<cgs_gl->index[ma->cg[i]+1]; c++)
 +                {
 +                    copy_rvec(buf[a++],v[c]);
 +                }
 +            }
 +        }
 +    }
 +}
 +
 +void dd_collect_vec(gmx_domdec_t *dd,
 +                    t_state *state_local,rvec *lv,rvec *v)
 +{
 +    gmx_domdec_master_t *ma;
 +    int  n,i,c,a,nalloc=0;
 +    rvec *buf=NULL;
 +    
 +    dd_collect_cg(dd,state_local);
 +
 +    if (dd->nnodes <= GMX_DD_NNODES_SENDRECV)
 +    {
 +        dd_collect_vec_sendrecv(dd,lv,v);
 +    }
 +    else
 +    {
 +        dd_collect_vec_gatherv(dd,lv,v);
 +    }
 +}
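 +/* Design note: pairwise send/recv avoids collective overhead for small
 + * node counts, while a single gatherv scales better for large ones;
 + * GMX_DD_NNODES_SENDRECV is the heuristic crossover point.
 + */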
 +
 +
 +void dd_collect_state(gmx_domdec_t *dd,
 +                      t_state *state_local,t_state *state)
 +{
 +    int est,i,j,nh;
 +
 +    nh = state->nhchainlength;
 +
 +    if (DDMASTER(dd))
 +    {
 +        for (i=0;i<efptNR;i++) {
 +            state->lambda[i] = state_local->lambda[i];
 +        }
 +        state->fep_state = state_local->fep_state;
 +        state->veta = state_local->veta;
 +        state->vol0 = state_local->vol0;
 +        copy_mat(state_local->box,state->box);
 +        copy_mat(state_local->boxv,state->boxv);
 +        copy_mat(state_local->svir_prev,state->svir_prev);
 +        copy_mat(state_local->fvir_prev,state->fvir_prev);
 +        copy_mat(state_local->pres_prev,state->pres_prev);
 +
 +
 +        for(i=0; i<state_local->ngtc; i++)
 +        {
 +            for(j=0; j<nh; j++) {
 +                state->nosehoover_xi[i*nh+j]        = state_local->nosehoover_xi[i*nh+j];
 +                state->nosehoover_vxi[i*nh+j]       = state_local->nosehoover_vxi[i*nh+j];
 +            }
 +            state->therm_integral[i] = state_local->therm_integral[i];            
 +        }
 +        for(i=0; i<state_local->nnhpres; i++) 
 +        {
 +            for(j=0; j<nh; j++) {
 +                state->nhpres_xi[i*nh+j]        = state_local->nhpres_xi[i*nh+j];
 +                state->nhpres_vxi[i*nh+j]       = state_local->nhpres_vxi[i*nh+j];
 +            }
 +        }
 +    }
 +    for(est=0; est<estNR; est++)
 +    {
 +        if (EST_DISTR(est) && (state_local->flags & (1<<est)))
 +        {
 +            switch (est) {
 +            case estX:
 +                dd_collect_vec(dd,state_local,state_local->x,state->x);
 +                break;
 +            case estV:
 +                dd_collect_vec(dd,state_local,state_local->v,state->v);
 +                break;
 +            case estSDX:
 +                dd_collect_vec(dd,state_local,state_local->sd_X,state->sd_X);
 +                break;
 +            case estCGP:
 +                dd_collect_vec(dd,state_local,state_local->cg_p,state->cg_p);
 +                break;
 +            case estLD_RNG:
 +                if (state->nrngi == 1)
 +                {
 +                    if (DDMASTER(dd))
 +                    {
 +                        for(i=0; i<state_local->nrng; i++)
 +                        {
 +                            state->ld_rng[i] = state_local->ld_rng[i];
 +                        }
 +                    }
 +                }
 +                else
 +                {
 +                    dd_gather(dd,state_local->nrng*sizeof(state->ld_rng[0]),
 +                              state_local->ld_rng,state->ld_rng);
 +                }
 +                break;
 +            case estLD_RNGI:
 +                if (state->nrngi == 1)
 +                {
 +                   if (DDMASTER(dd))
 +                    {
 +                        state->ld_rngi[0] = state_local->ld_rngi[0];
 +                    } 
 +                }
 +                else
 +                {
 +                    dd_gather(dd,sizeof(state->ld_rngi[0]),
 +                              state_local->ld_rngi,state->ld_rngi);
 +                }
 +                break;
 +            case estDISRE_INITF:
 +            case estDISRE_RM3TAV:
 +            case estORIRE_INITF:
 +            case estORIRE_DTAV:
 +                break;
 +            default:
 +                gmx_incons("Unknown state entry encountered in dd_collect_state");
 +            }
 +        }
 +    }
 +}
 +
 +static void dd_realloc_state(t_state *state,rvec **f,int nalloc)
 +{
 +    int est;
 +
 +    if (debug)
 +    {
 +        fprintf(debug,"Reallocating state: currently %d, required %d, allocating %d\n",state->nalloc,nalloc,over_alloc_dd(nalloc));
 +    }
 +
 +    state->nalloc = over_alloc_dd(nalloc);
 +    
 +    for(est=0; est<estNR; est++)
 +    {
 +        if (EST_DISTR(est) && (state->flags & (1<<est)))
 +        {
 +            switch(est) {
 +            case estX:
 +                srenew(state->x,state->nalloc);
 +                break;
 +            case estV:
 +                srenew(state->v,state->nalloc);
 +                break;
 +            case estSDX:
 +                srenew(state->sd_X,state->nalloc);
 +                break;
 +            case estCGP:
 +                srenew(state->cg_p,state->nalloc);
 +                break;
 +            case estLD_RNG:
 +            case estLD_RNGI:
 +            case estDISRE_INITF:
 +            case estDISRE_RM3TAV:
 +            case estORIRE_INITF:
 +            case estORIRE_DTAV:
 +                /* No reallocation required */
 +                break;
 +            default:
 +                gmx_incons("Unknown state entry encountered in dd_realloc_state");            
 +            }
 +        }
 +    }
 +    
 +    if (f != NULL)
 +    {
 +        srenew(*f,state->nalloc);
 +    }
 +}
 +
 +static void dd_check_alloc_ncg(t_forcerec *fr,t_state *state,rvec **f,
 +                               int nalloc)
 +{
 +    if (nalloc > fr->cg_nalloc)
 +    {
 +        if (debug)
 +        {
 +            fprintf(debug,"Reallocating forcerec: currently %d, required %d, allocating %d\n",fr->cg_nalloc,nalloc,over_alloc_dd(nalloc));
 +        }
 +        fr->cg_nalloc = over_alloc_dd(nalloc);
 +        srenew(fr->cginfo,fr->cg_nalloc);
 +        if (fr->cutoff_scheme == ecutsGROUP)
 +        {
 +            srenew(fr->cg_cm,fr->cg_nalloc);
 +        }
 +    }
 +    if (fr->cutoff_scheme == ecutsVERLET && nalloc > state->nalloc)
 +    {
 +        /* We don't use charge groups; we use x in state to set up
 +         * the atom communication.
 +         */
 +        dd_realloc_state(state,f,nalloc);
 +    }
 +}
 +
 +static void dd_distribute_vec_sendrecv(gmx_domdec_t *dd,t_block *cgs,
 +                                       rvec *v,rvec *lv)
 +{
 +    gmx_domdec_master_t *ma;
 +    int  n,i,c,a,nalloc=0;
 +    rvec *buf=NULL;
 +    
 +    if (DDMASTER(dd))
 +    {
 +        ma  = dd->ma;
 +        
 +        for(n=0; n<dd->nnodes; n++)
 +        {
 +            if (n != dd->rank)
 +            {
 +                if (ma->nat[n] > nalloc)
 +                {
 +                    nalloc = over_alloc_dd(ma->nat[n]);
 +                    srenew(buf,nalloc);
 +                }
 +                /* Use lv as a temporary buffer */
 +                a = 0;
 +                for(i=ma->index[n]; i<ma->index[n+1]; i++)
 +                {
 +                    for(c=cgs->index[ma->cg[i]]; c<cgs->index[ma->cg[i]+1]; c++)
 +                    {
 +                        copy_rvec(v[c],buf[a++]);
 +                    }
 +                }
 +                if (a != ma->nat[n])
 +                {
 +                    gmx_fatal(FARGS,"Internal error a (%d) != nat (%d)",
 +                              a,ma->nat[n]);
 +                }
 +                
 +#ifdef GMX_MPI
 +                MPI_Send(buf,ma->nat[n]*sizeof(rvec),MPI_BYTE,
 +                         DDRANK(dd,n),n,dd->mpi_comm_all);
 +#endif
 +            }
 +        }
 +        sfree(buf);
 +        n = DDMASTERRANK(dd);
 +        a = 0;
 +        for(i=ma->index[n]; i<ma->index[n+1]; i++)
 +        {
 +            for(c=cgs->index[ma->cg[i]]; c<cgs->index[ma->cg[i]+1]; c++)
 +            {
 +                copy_rvec(v[c],lv[a++]);
 +            }
 +        }
 +    }
 +    else
 +    {
 +#ifdef GMX_MPI
 +        MPI_Recv(lv,dd->nat_home*sizeof(rvec),MPI_BYTE,DDMASTERRANK(dd),
 +                 MPI_ANY_TAG,dd->mpi_comm_all,MPI_STATUS_IGNORE);
 +#endif
 +    }
 +}
 +
 +static void dd_distribute_vec_scatterv(gmx_domdec_t *dd,t_block *cgs,
 +                                       rvec *v,rvec *lv)
 +{
 +    gmx_domdec_master_t *ma;
 +    int  *scounts=NULL,*disps=NULL;
 +    int  n,i,c,a,nalloc=0;
 +    rvec *buf=NULL;
 +    
 +    if (DDMASTER(dd))
 +    {
 +        ma  = dd->ma;
 +     
 +        get_commbuffer_counts(dd,&scounts,&disps);
 +
 +        buf = ma->vbuf;
 +        a = 0;
 +        for(n=0; n<dd->nnodes; n++)
 +        {
 +            for(i=ma->index[n]; i<ma->index[n+1]; i++)
 +            {
 +                for(c=cgs->index[ma->cg[i]]; c<cgs->index[ma->cg[i]+1]; c++)
 +                {
 +                    copy_rvec(v[c],buf[a++]);
 +                }
 +            }
 +        }
 +    }
 +
 +    dd_scatterv(dd,scounts,disps,buf,dd->nat_home*sizeof(rvec),lv);
 +}
 +
 +static void dd_distribute_vec(gmx_domdec_t *dd,t_block *cgs,rvec *v,rvec *lv)
 +{
 +    if (dd->nnodes <= GMX_DD_NNODES_SENDRECV)
 +    {
 +        dd_distribute_vec_sendrecv(dd,cgs,v,lv);
 +    }
 +    else
 +    {
 +        dd_distribute_vec_scatterv(dd,cgs,v,lv);
 +    }
 +}
 +
 +static void dd_distribute_state(gmx_domdec_t *dd,t_block *cgs,
 +                                t_state *state,t_state *state_local,
 +                                rvec **f)
 +{
 +    int  i,j,nh;
 +
 +    nh = state->nhchainlength;
 +
 +    if (DDMASTER(dd))
 +    {
 +        for(i=0;i<efptNR;i++)
 +        {
 +            state_local->lambda[i] = state->lambda[i];
 +        }
 +        state_local->fep_state = state->fep_state;
 +        state_local->veta   = state->veta;
 +        state_local->vol0   = state->vol0;
 +        copy_mat(state->box,state_local->box);
 +        copy_mat(state->box_rel,state_local->box_rel);
 +        copy_mat(state->boxv,state_local->boxv);
 +        copy_mat(state->svir_prev,state_local->svir_prev);
 +        copy_mat(state->fvir_prev,state_local->fvir_prev);
 +        for(i=0; i<state_local->ngtc; i++)
 +        {
 +            for(j=0; j<nh; j++) {
 +                state_local->nosehoover_xi[i*nh+j]        = state->nosehoover_xi[i*nh+j];
 +                state_local->nosehoover_vxi[i*nh+j]       = state->nosehoover_vxi[i*nh+j];
 +            }
 +            state_local->therm_integral[i] = state->therm_integral[i];
 +        }
 +        for(i=0; i<state_local->nnhpres; i++)
 +        {
 +            for(j=0; j<nh; j++) {
 +                state_local->nhpres_xi[i*nh+j]        = state->nhpres_xi[i*nh+j];
 +                state_local->nhpres_vxi[i*nh+j]       = state->nhpres_vxi[i*nh+j];
 +            }
 +        }
 +    }
 +    dd_bcast(dd,((efptNR)*sizeof(real)),state_local->lambda);
 +    dd_bcast(dd,sizeof(int),&state_local->fep_state);
 +    dd_bcast(dd,sizeof(real),&state_local->veta);
 +    dd_bcast(dd,sizeof(real),&state_local->vol0);
 +    dd_bcast(dd,sizeof(state_local->box),state_local->box);
 +    dd_bcast(dd,sizeof(state_local->box_rel),state_local->box_rel);
 +    dd_bcast(dd,sizeof(state_local->boxv),state_local->boxv);
 +    dd_bcast(dd,sizeof(state_local->svir_prev),state_local->svir_prev);
 +    dd_bcast(dd,sizeof(state_local->fvir_prev),state_local->fvir_prev);
 +    dd_bcast(dd,((state_local->ngtc*nh)*sizeof(double)),state_local->nosehoover_xi);
 +    dd_bcast(dd,((state_local->ngtc*nh)*sizeof(double)),state_local->nosehoover_vxi);
 +    dd_bcast(dd,state_local->ngtc*sizeof(double),state_local->therm_integral);
 +    dd_bcast(dd,((state_local->nnhpres*nh)*sizeof(double)),state_local->nhpres_xi);
 +    dd_bcast(dd,((state_local->nnhpres*nh)*sizeof(double)),state_local->nhpres_vxi);
 +
 +    if (dd->nat_home > state_local->nalloc)
 +    {
 +        dd_realloc_state(state_local,f,dd->nat_home);
 +    }
 +    for(i=0; i<estNR; i++)
 +    {
 +        if (EST_DISTR(i) && (state_local->flags & (1<<i)))
 +        {
 +            switch (i) {
 +            case estX:
 +                dd_distribute_vec(dd,cgs,state->x,state_local->x);
 +                break;
 +            case estV:
 +                dd_distribute_vec(dd,cgs,state->v,state_local->v);
 +                break;
 +            case estSDX:
 +                dd_distribute_vec(dd,cgs,state->sd_X,state_local->sd_X);
 +                break;
 +            case estCGP:
 +                dd_distribute_vec(dd,cgs,state->cg_p,state_local->cg_p);
 +                break;
 +            case estLD_RNG:
 +                if (state->nrngi == 1)
 +                {
 +                    dd_bcastc(dd,
 +                              state_local->nrng*sizeof(state_local->ld_rng[0]),
 +                              state->ld_rng,state_local->ld_rng);
 +                }
 +                else
 +                {
 +                    dd_scatter(dd,
 +                               state_local->nrng*sizeof(state_local->ld_rng[0]),
 +                               state->ld_rng,state_local->ld_rng);
 +                }
 +                break;
 +            case estLD_RNGI:
 +                if (state->nrngi == 1)
 +                {
 +                    dd_bcastc(dd,sizeof(state_local->ld_rngi[0]),
 +                              state->ld_rngi,state_local->ld_rngi);
 +                }
 +                else
 +                {
 +                     dd_scatter(dd,sizeof(state_local->ld_rngi[0]),
 +                               state->ld_rngi,state_local->ld_rngi);
 +                }   
 +                break;
 +            case estDISRE_INITF:
 +            case estDISRE_RM3TAV:
 +            case estORIRE_INITF:
 +            case estORIRE_DTAV:
 +                /* Not implemented yet */
 +                break;
 +            default:
 +                gmx_incons("Unknown state entry encountered in dd_distribute_state");
 +            }
 +        }
 +    }
 +}
 +
 +static char dim2char(int dim)
 +{
 +    char c='?';
 +    
 +    switch (dim)
 +    {
 +    case XX: c = 'X'; break;
 +    case YY: c = 'Y'; break;
 +    case ZZ: c = 'Z'; break;
 +    default: gmx_fatal(FARGS,"Unknown dim %d",dim);
 +    }
 +    
 +    return c;
 +}
 +
 +static void write_dd_grid_pdb(const char *fn,gmx_large_int_t step,
 +                              gmx_domdec_t *dd,matrix box,gmx_ddbox_t *ddbox)
 +{
 +    rvec grid_s[2],*grid_r=NULL,cx,r;
 +    char fname[STRLEN],format[STRLEN],buf[22];
 +    FILE *out;
 +    int  a,i,d,z,y,x;
 +    matrix tric;
 +    real vol;
 +
 +    copy_rvec(dd->comm->cell_x0,grid_s[0]);
 +    copy_rvec(dd->comm->cell_x1,grid_s[1]);
 +    
 +    if (DDMASTER(dd))
 +    {
 +        snew(grid_r,2*dd->nnodes);
 +    }
 +    
 +    dd_gather(dd,2*sizeof(rvec),grid_s[0],DDMASTER(dd) ? grid_r[0] : NULL);
 +    
 +    if (DDMASTER(dd))
 +    {
 +        for(d=0; d<DIM; d++)
 +        {
 +            for(i=0; i<DIM; i++)
 +            {
 +                if (d == i)
 +                {
 +                    tric[d][i] = 1;
 +                }
 +                else
 +                {
 +                    if (d < ddbox->npbcdim && dd->nc[d] > 1)
 +                    {
 +                        tric[d][i] = box[i][d]/box[i][i];
 +                    }
 +                    else
 +                    {
 +                        tric[d][i] = 0;
 +                    }
 +                }
 +            }
 +        }
 +        sprintf(fname,"%s_%s.pdb",fn,gmx_step_str(step,buf));
 +        sprintf(format,"%s%s\n",get_pdbformat(),"%6.2f%6.2f");
 +        out = gmx_fio_fopen(fname,"w");
 +        gmx_write_pdb_box(out,dd->bScrewPBC ? epbcSCREW : epbcXYZ,box);
 +        a = 1;
 +        for(i=0; i<dd->nnodes; i++)
 +        {
 +            vol = dd->nnodes/(box[XX][XX]*box[YY][YY]*box[ZZ][ZZ]);
 +            for(d=0; d<DIM; d++)
 +            {
 +                vol *= grid_r[i*2+1][d] - grid_r[i*2][d];
 +            }
 +            for(z=0; z<2; z++)
 +            {
 +                for(y=0; y<2; y++)
 +                {
 +                    for(x=0; x<2; x++)
 +                    {
 +                        cx[XX] = grid_r[i*2+x][XX];
 +                        cx[YY] = grid_r[i*2+y][YY];
 +                        cx[ZZ] = grid_r[i*2+z][ZZ];
 +                        mvmul(tric,cx,r);
 +                        fprintf(out,format,"ATOM",a++,"CA","GLY",' ',1+i,
 +                                10*r[XX],10*r[YY],10*r[ZZ],1.0,vol);
 +                    }
 +                }
 +            }
 +            for(d=0; d<DIM; d++)
 +            {
 +                for(x=0; x<4; x++)
 +                {
 +                    switch(d)
 +                    {
 +                    case 0: y = 1 + i*8 + 2*x; break;
 +                    case 1: y = 1 + i*8 + 2*x - (x % 2); break;
 +                    case 2: y = 1 + i*8 + x; break;
 +                    }
 +                    fprintf(out,"%6s%5d%5d\n","CONECT",y,y+(1<<d));
 +                }
 +            }
 +        }
 +        gmx_fio_fclose(out);
 +        sfree(grid_r);
 +    }
 +}
 +
 +void write_dd_pdb(const char *fn,gmx_large_int_t step,const char *title,
 +                  gmx_mtop_t *mtop,t_commrec *cr,
 +                  int natoms,rvec x[],matrix box)
 +{
 +    char fname[STRLEN],format[STRLEN],format4[STRLEN],buf[22];
 +    FILE *out;
 +    int  i,ii,resnr,c;
 +    char *atomname,*resname;
 +    real b;
 +    gmx_domdec_t *dd;
 +    
 +    dd = cr->dd;
 +    if (natoms == -1)
 +    {
 +        natoms = dd->comm->nat[ddnatVSITE];
 +    }
 +    
 +    sprintf(fname,"%s_%s_n%d.pdb",fn,gmx_step_str(step,buf),cr->sim_nodeid);
 +    
 +    sprintf(format,"%s%s\n",get_pdbformat(),"%6.2f%6.2f");
 +    sprintf(format4,"%s%s\n",get_pdbformat4(),"%6.2f%6.2f");
 +    
 +    out = gmx_fio_fopen(fname,"w");
 +    
 +    fprintf(out,"TITLE     %s\n",title);
 +    gmx_write_pdb_box(out,dd->bScrewPBC ? epbcSCREW : epbcXYZ,box);
 +    for(i=0; i<natoms; i++)
 +    {
 +        ii = dd->gatindex[i];
 +        gmx_mtop_atominfo_global(mtop,ii,&atomname,&resnr,&resname);
 +        if (i < dd->comm->nat[ddnatZONE])
 +        {
 +            c = 0;
 +            while (i >= dd->cgindex[dd->comm->zones.cg_range[c+1]])
 +            {
 +                c++;
 +            }
 +            b = c;
 +        }
 +        else if (i < dd->comm->nat[ddnatVSITE])
 +        {
 +            b = dd->comm->zones.n;
 +        }
 +        else
 +        {
 +            b = dd->comm->zones.n + 1;
 +        }
 +        fprintf(out,strlen(atomname)<4 ? format : format4,
 +                "ATOM",(ii+1)%100000,
 +                atomname,resname,' ',resnr%10000,' ',
 +                10*x[i][XX],10*x[i][YY],10*x[i][ZZ],1.0,b);
 +    }
 +    fprintf(out,"TER\n");
 +    
 +    gmx_fio_fclose(out);
 +}
 +
 +real dd_cutoff_mbody(gmx_domdec_t *dd)
 +{
 +    gmx_domdec_comm_t *comm;
 +    int  di;
 +    real r;
 +
 +    comm = dd->comm;
 +
 +    r = -1;
 +    if (comm->bInterCGBondeds)
 +    {
 +        if (comm->cutoff_mbody > 0)
 +        {
 +            r = comm->cutoff_mbody;
 +        }
 +        else
 +        {
 +            /* cutoff_mbody=0 means we do not have DLB */
 +            r = comm->cellsize_min[dd->dim[0]];
 +            for(di=1; di<dd->ndim; di++)
 +            {
 +                r = min(r,comm->cellsize_min[dd->dim[di]]);
 +            }
 +            if (comm->bBondComm)
 +            {
 +                r = max(r,comm->cutoff_mbody);
 +            }
 +            else
 +            {
 +                r = min(r,comm->cutoff);
 +            }
 +        }
 +    }
 +
 +    return r;
 +}
 +
 +real dd_cutoff_twobody(gmx_domdec_t *dd)
 +{
 +    real r_mb;
 +
 +    r_mb = dd_cutoff_mbody(dd);
 +
 +    return max(dd->comm->cutoff,r_mb);
 +}
 +
 +
 +static void dd_cart_coord2pmecoord(gmx_domdec_t *dd,ivec coord,ivec coord_pme)
 +{
 +    int nc,ntot;
 +    
 +    nc   = dd->nc[dd->comm->cartpmedim];
 +    ntot = dd->comm->ntot[dd->comm->cartpmedim];
 +    copy_ivec(coord,coord_pme);
 +    coord_pme[dd->comm->cartpmedim] =
 +        nc + (coord[dd->comm->cartpmedim]*(ntot - nc) + (ntot - nc)/2)/nc;
 +}
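 +/* Example: nc = 4 PP cells and ntot = 6 slots along cartpmedim leave
 + * 2 PME slots: PP coordinates 0,1 map to PME coordinate 4 and 2,3 to 5.
 + */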
 +
 +static int low_ddindex2pmeindex(int ndd,int npme,int ddindex)
 +{
 +    /* Here we assign a PME node to communicate with this DD node
 +     * by assuming that the major index of both is x.
 +     * We add cr->npmenodes/2 to obtain an even distribution.
 +     */
 +    return (ddindex*npme + npme/2)/ndd;
 +}
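 +/* Example: ndd = 6, npme = 2 gives (ddindex*2 + 1)/6, so DD indices
 + * 0-2 map to PME node 0 and 3-5 map to PME node 1.
 + */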
 +
 +static int ddindex2pmeindex(const gmx_domdec_t *dd,int ddindex)
 +{
 +    return low_ddindex2pmeindex(dd->nnodes,dd->comm->npmenodes,ddindex);
 +}
 +
 +static int cr_ddindex2pmeindex(const t_commrec *cr,int ddindex)
 +{
 +    return low_ddindex2pmeindex(cr->dd->nnodes,cr->npmenodes,ddindex);
 +}
 +
 +static int *dd_pmenodes(t_commrec *cr)
 +{
 +    int *pmenodes;
 +    int n,i,p0,p1;
 +    
 +    snew(pmenodes,cr->npmenodes);
 +    n = 0;
 +    for(i=0; i<cr->dd->nnodes; i++) {
 +        p0 = cr_ddindex2pmeindex(cr,i);
 +        p1 = cr_ddindex2pmeindex(cr,i+1);
 +        if (i+1 == cr->dd->nnodes || p1 > p0) {
 +            if (debug)
 +                fprintf(debug,"pmenode[%d] = %d\n",n,i+1+n);
 +            pmenodes[n] = i + 1 + n;
 +            n++;
 +        }
 +    }
 +
 +    return pmenodes;
 +}
 +
 +static int gmx_ddcoord2pmeindex(t_commrec *cr,int x,int y,int z)
 +{
 +    gmx_domdec_t *dd;
 +    ivec coords,coords_pme,nc;
 +    int  slab;
 +    
 +    dd = cr->dd;
 +    /*
 +      if (dd->comm->bCartesian) {
 +      gmx_ddindex2xyz(dd->nc,ddindex,coords);
 +      dd_coords2pmecoords(dd,coords,coords_pme);
 +      copy_ivec(dd->ntot,nc);
 +      nc[dd->cartpmedim]         -= dd->nc[dd->cartpmedim];
 +      coords_pme[dd->cartpmedim] -= dd->nc[dd->cartpmedim];
 +      
 +      slab = (coords_pme[XX]*nc[YY] + coords_pme[YY])*nc[ZZ] + coords_pme[ZZ];
 +      } else {
 +      slab = (ddindex*cr->npmenodes + cr->npmenodes/2)/dd->nnodes;
 +      }
 +    */
 +    coords[XX] = x;
 +    coords[YY] = y;
 +    coords[ZZ] = z;
 +    slab = ddindex2pmeindex(dd,dd_index(dd->nc,coords));
 +    
 +    return slab;
 +}
 +
 +static int ddcoord2simnodeid(t_commrec *cr,int x,int y,int z)
 +{
 +    gmx_domdec_comm_t *comm;
 +    ivec coords;
 +    int  ddindex,nodeid=-1;
 +    
 +    comm = cr->dd->comm;
 +    
 +    coords[XX] = x;
 +    coords[YY] = y;
 +    coords[ZZ] = z;
 +    if (comm->bCartesianPP_PME)
 +    {
 +#ifdef GMX_MPI
 +        MPI_Cart_rank(cr->mpi_comm_mysim,coords,&nodeid);
 +#endif
 +    }
 +    else
 +    {
 +        ddindex = dd_index(cr->dd->nc,coords);
 +        if (comm->bCartesianPP)
 +        {
 +            nodeid = comm->ddindex2simnodeid[ddindex];
 +        }
 +        else
 +        {
 +            if (comm->pmenodes)
 +            {
 +                nodeid = ddindex + gmx_ddcoord2pmeindex(cr,x,y,z);
 +            }
 +            else
 +            {
 +                nodeid = ddindex;
 +            }
 +        }
 +    }
 +  
 +    return nodeid;
 +}
 +
 +static int dd_simnode2pmenode(t_commrec *cr,int sim_nodeid)
 +{
 +    gmx_domdec_t *dd;
 +    gmx_domdec_comm_t *comm;
 +    ivec coord,coord_pme;
 +    int  i;
 +    int  pmenode=-1;
 +    
 +    dd = cr->dd;
 +    comm = dd->comm;
 +    
 +    /* This assumes a uniform x domain decomposition grid cell size */
 +    if (comm->bCartesianPP_PME)
 +    {
 +#ifdef GMX_MPI
 +        MPI_Cart_coords(cr->mpi_comm_mysim,sim_nodeid,DIM,coord);
 +        if (coord[comm->cartpmedim] < dd->nc[comm->cartpmedim])
 +        {
 +            /* This is a PP node */
 +            dd_cart_coord2pmecoord(dd,coord,coord_pme);
 +            MPI_Cart_rank(cr->mpi_comm_mysim,coord_pme,&pmenode);
 +        }
 +#endif
 +    }
 +    else if (comm->bCartesianPP)
 +    {
 +        if (sim_nodeid < dd->nnodes)
 +        {
 +            pmenode = dd->nnodes + ddindex2pmeindex(dd,sim_nodeid);
 +        }
 +    }
 +    else
 +    {
 +        /* This assumes DD cells with identical x coordinates
 +         * are numbered sequentially.
 +         */
 +        if (dd->comm->pmenodes == NULL)
 +        {
 +            if (sim_nodeid < dd->nnodes)
 +            {
 +                /* The DD index equals the nodeid */
 +                pmenode = dd->nnodes + ddindex2pmeindex(dd,sim_nodeid);
 +            }
 +        }
 +        else
 +        {
 +            i = 0;
 +            while (sim_nodeid > dd->comm->pmenodes[i])
 +            {
 +                i++;
 +            }
 +            if (sim_nodeid < dd->comm->pmenodes[i])
 +            {
 +                pmenode = dd->comm->pmenodes[i];
 +            }
 +        }
 +    }
 +    
 +    return pmenode;
 +}
 +
 +gmx_bool gmx_pmeonlynode(t_commrec *cr,int sim_nodeid)
 +{
 +    gmx_bool bPMEOnlyNode;
 +    
 +    if (DOMAINDECOMP(cr))
 +    {
 +        bPMEOnlyNode = (dd_simnode2pmenode(cr,sim_nodeid) == -1);
 +    }
 +    else
 +    {
 +        bPMEOnlyNode = FALSE;
 +    }
 +    
 +    return bPMEOnlyNode;
 +}
 +
 +void get_pme_ddnodes(t_commrec *cr,int pmenodeid,
 +                     int *nmy_ddnodes,int **my_ddnodes,int *node_peer)
 +{
 +    gmx_domdec_t *dd;
 +    int x,y,z;
 +    ivec coord,coord_pme;
 +    
 +    dd = cr->dd;
 +    
 +    snew(*my_ddnodes,(dd->nnodes+cr->npmenodes-1)/cr->npmenodes);
 +    
 +    *nmy_ddnodes = 0;
 +    for(x=0; x<dd->nc[XX]; x++)
 +    {
 +        for(y=0; y<dd->nc[YY]; y++)
 +        {
 +            for(z=0; z<dd->nc[ZZ]; z++)
 +            {
 +                if (dd->comm->bCartesianPP_PME)
 +                {
 +                    coord[XX] = x;
 +                    coord[YY] = y;
 +                    coord[ZZ] = z;
 +                    dd_cart_coord2pmecoord(dd,coord,coord_pme);
 +                    if (dd->ci[XX] == coord_pme[XX] &&
 +                        dd->ci[YY] == coord_pme[YY] &&
 +                        dd->ci[ZZ] == coord_pme[ZZ])
 +                        (*my_ddnodes)[(*nmy_ddnodes)++] = ddcoord2simnodeid(cr,x,y,z);
 +                }
 +                else
 +                {
 +                    /* The slab corresponds to the nodeid in the PME group */
 +                    if (gmx_ddcoord2pmeindex(cr,x,y,z) == pmenodeid)
 +                    {
 +                        (*my_ddnodes)[(*nmy_ddnodes)++] = ddcoord2simnodeid(cr,x,y,z);
 +                    }
 +                }
 +            }
 +        }
 +    }
 +    
 +    /* The last PP-only node is the peer node */
 +    *node_peer = (*my_ddnodes)[*nmy_ddnodes-1];
 +    
 +    if (debug)
 +    {
 +        fprintf(debug,"Receive coordinates from PP nodes:");
 +        for(x=0; x<*nmy_ddnodes; x++)
 +        {
 +            fprintf(debug," %d",(*my_ddnodes)[x]);
 +        }
 +        fprintf(debug,"\n");
 +    }
 +}
 +
 +static gmx_bool receive_vir_ener(t_commrec *cr)
 +{
 +    gmx_domdec_comm_t *comm;
 +    int  pmenode,coords[DIM],rank;
 +    gmx_bool bReceive;
 +    
 +    bReceive = TRUE;
 +    if (cr->npmenodes < cr->dd->nnodes)
 +    {
 +        comm = cr->dd->comm;
 +        if (comm->bCartesianPP_PME)
 +        {
 +            pmenode = dd_simnode2pmenode(cr,cr->sim_nodeid);
 +#ifdef GMX_MPI
 +            MPI_Cart_coords(cr->mpi_comm_mysim,cr->sim_nodeid,DIM,coords);
 +            coords[comm->cartpmedim]++;
 +            if (coords[comm->cartpmedim] < cr->dd->nc[comm->cartpmedim])
 +            {
 +                MPI_Cart_rank(cr->mpi_comm_mysim,coords,&rank);
 +                if (dd_simnode2pmenode(cr,rank) == pmenode)
 +                {
 +                    /* This is not the last PP node for pmenode */
 +                    bReceive = FALSE;
 +                }
 +            }
 +#endif  
 +        }
 +        else
 +        {
 +            pmenode = dd_simnode2pmenode(cr,cr->sim_nodeid);
 +            if (cr->sim_nodeid+1 < cr->nnodes &&
 +                dd_simnode2pmenode(cr,cr->sim_nodeid+1) == pmenode)
 +            {
 +                /* This is not the last PP node for pmenode */
 +                bReceive = FALSE;
 +            }
 +        }
 +    }
 +    
 +    return bReceive;
 +}
 +
 +static void set_zones_ncg_home(gmx_domdec_t *dd)
 +{
 +    gmx_domdec_zones_t *zones;
 +    int i;
 +
 +    zones = &dd->comm->zones;
 +
 +    zones->cg_range[0] = 0;
 +    for(i=1; i<zones->n+1; i++)
 +    {
 +        zones->cg_range[i] = dd->ncg_home;
 +    }
 +}
 +
 +static void rebuild_cgindex(gmx_domdec_t *dd,
 +                            const int *gcgs_index,t_state *state)
 +{
 +    int nat,i,*ind,*dd_cg_gl,*cgindex,cg_gl;
 +    
 +    ind = state->cg_gl;
 +    dd_cg_gl = dd->index_gl;
 +    cgindex  = dd->cgindex;
 +    nat = 0;
 +    cgindex[0] = nat;
 +    for(i=0; i<state->ncg_gl; i++)
 +    {
 +        cgindex[i] = nat;
 +        cg_gl = ind[i];
 +        dd_cg_gl[i] = cg_gl;
 +        nat += gcgs_index[cg_gl+1] - gcgs_index[cg_gl];
 +    }
 +    cgindex[i] = nat;
 +    
 +    dd->ncg_home = state->ncg_gl;
 +    dd->nat_home = nat;
 +
 +    set_zones_ncg_home(dd);
 +}
 +
 +static int ddcginfo(const cginfo_mb_t *cginfo_mb,int cg)
 +{
 +    while (cg >= cginfo_mb->cg_end)
 +    {
 +        cginfo_mb++;
 +    }
 +
 +    return cginfo_mb->cginfo[(cg - cginfo_mb->cg_start) % cginfo_mb->cg_mod];
 +}
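 +/* The cginfo is stored compressed per molecule block: a pattern of
 + * cg_mod entries repeats for all copies of the molecule, and the modulo
 + * above maps a global cg index back onto that pattern.
 + */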
 +
 +static void dd_set_cginfo(int *index_gl,int cg0,int cg1,
 +                          t_forcerec *fr,char *bLocalCG)
 +{
 +    cginfo_mb_t *cginfo_mb;
 +    int *cginfo;
 +    int cg;
 +
 +    if (fr != NULL)
 +    {
 +        cginfo_mb = fr->cginfo_mb;
 +        cginfo    = fr->cginfo;
 +
 +        for(cg=cg0; cg<cg1; cg++)
 +        {
 +            cginfo[cg] = ddcginfo(cginfo_mb,index_gl[cg]);
 +        }
 +    }
 +
 +    if (bLocalCG != NULL)
 +    {
 +        for(cg=cg0; cg<cg1; cg++)
 +        {
 +            bLocalCG[index_gl[cg]] = TRUE;
 +        }
 +    }
 +}
 +
 +static void make_dd_indices(gmx_domdec_t *dd,
 +                            const int *gcgs_index,int cg_start)
 +{
 +    int nzone,zone,zone1,cg0,cg1,cg1_p1,cg,cg_gl,a,a_gl;
 +    int *zone2cg,*zone_ncg1,*index_gl,*gatindex;
 +    gmx_ga2la_t *ga2la;
 +    char *bLocalCG;
 +    gmx_bool bCGs;
 +
 +    bLocalCG = dd->comm->bLocalCG;
 +
 +    if (dd->nat_tot > dd->gatindex_nalloc)
 +    {
 +        dd->gatindex_nalloc = over_alloc_dd(dd->nat_tot);
 +        srenew(dd->gatindex,dd->gatindex_nalloc);
 +    }
 +
 +    nzone      = dd->comm->zones.n;
 +    zone2cg    = dd->comm->zones.cg_range;
 +    zone_ncg1  = dd->comm->zone_ncg1;
 +    index_gl   = dd->index_gl;
 +    gatindex   = dd->gatindex;
 +    bCGs       = dd->comm->bCGs;
 +
 +    if (zone2cg[1] != dd->ncg_home)
 +    {
 +        gmx_incons("dd->ncg_zone is not up to date");
 +    }
 +    
 +    /* Make the local to global and global to local atom index */
 +    a = dd->cgindex[cg_start];
 +    for(zone=0; zone<nzone; zone++)
 +    {
 +        if (zone == 0)
 +        {
 +            cg0 = cg_start;
 +        }
 +        else
 +        {
 +            cg0 = zone2cg[zone];
 +        }
 +        cg1    = zone2cg[zone+1];
 +        cg1_p1 = cg0 + zone_ncg1[zone];
 +
 +        for(cg=cg0; cg<cg1; cg++)
 +        {
 +            zone1 = zone;
 +            if (cg >= cg1_p1)
 +            {
 +                /* Signal that this cg is from more than one pulse away */
 +                zone1 += nzone;
 +            }
 +            cg_gl = index_gl[cg];
 +            if (bCGs)
 +            {
 +                for(a_gl=gcgs_index[cg_gl]; a_gl<gcgs_index[cg_gl+1]; a_gl++)
 +                {
 +                    gatindex[a] = a_gl;
 +                    ga2la_set(dd->ga2la,a_gl,a,zone1);
 +                    a++;
 +                }
 +            }
 +            else
 +            {
 +                gatindex[a] = cg_gl;
 +                ga2la_set(dd->ga2la,cg_gl,a,zone1);
 +                a++;
 +            }
 +        }
 +    }
 +}
 +
 +static int check_bLocalCG(gmx_domdec_t *dd,int ncg_sys,const char *bLocalCG,
 +                          const char *where)
 +{
 +    int ncg,i,ngl,nerr;
 +
 +    nerr = 0;
 +    if (bLocalCG == NULL)
 +    {
 +        return nerr;
 +    }
 +    for(i=0; i<dd->ncg_tot; i++)
 +    {
 +        if (!bLocalCG[dd->index_gl[i]])
 +        {
 +            fprintf(stderr,
 +                    "DD node %d, %s: cg %d, global cg %d is not marked in bLocalCG (ncg_home %d)\n",dd->rank,where,i+1,dd->index_gl[i]+1,dd->ncg_home);
 +            nerr++;
 +        }
 +    }
 +    ngl = 0;
 +    for(i=0; i<ncg_sys; i++)
 +    {
 +        if (bLocalCG[i])
 +        {
 +            ngl++;
 +        }
 +    }
 +    if (ngl != dd->ncg_tot)
 +    {
 +        fprintf(stderr,"DD node %d, %s: In bLocalCG %d cgs are marked as local, whereas there are %d\n",dd->rank,where,ngl,dd->ncg_tot);
 +        nerr++;
 +    }
 +
 +    return nerr;
 +}
 +
 +static void check_index_consistency(gmx_domdec_t *dd,
 +                                    int natoms_sys,int ncg_sys,
 +                                    const char *where)
 +{
 +    int  nerr,ngl,i,a,cell;
 +    int  *have;
 +
 +    nerr = 0;
 +
 +    if (dd->comm->DD_debug > 1)
 +    {
 +        snew(have,natoms_sys);
 +        for(a=0; a<dd->nat_tot; a++)
 +        {
 +            if (have[dd->gatindex[a]] > 0)
 +            {
 +                fprintf(stderr,"DD node %d: global atom %d occurs twice: index %d and %d\n",dd->rank,dd->gatindex[a]+1,have[dd->gatindex[a]],a+1);
 +            }
 +            else
 +            {
 +                have[dd->gatindex[a]] = a + 1;
 +            }
 +        }
 +        sfree(have);
 +    }
 +
 +    snew(have,dd->nat_tot);
 +
 +    ngl  = 0;
 +    for(i=0; i<natoms_sys; i++)
 +    {
 +        if (ga2la_get(dd->ga2la,i,&a,&cell))
 +        {
 +            if (a >= dd->nat_tot)
 +            {
 +                fprintf(stderr,"DD node %d: global atom %d marked as local atom %d, which is larger than nat_tot (%d)\n",dd->rank,i+1,a+1,dd->nat_tot);
 +                nerr++;
 +            }
 +            else
 +            {
 +                have[a] = 1;
 +                if (dd->gatindex[a] != i)
 +                {
 +                    fprintf(stderr,"DD node %d: global atom %d marked as local atom %d, which has global atom index %d\n",dd->rank,i+1,a+1,dd->gatindex[a]+1);
 +                    nerr++;
 +                }
 +            }
 +            ngl++;
 +        }
 +    }
 +    if (ngl != dd->nat_tot)
 +    {
 +        fprintf(stderr,
 +                "DD node %d, %s: %d global atom indices, %d local atoms\n",
 +                dd->rank,where,ngl,dd->nat_tot);
 +    }
 +    for(a=0; a<dd->nat_tot; a++)
 +    {
 +        if (have[a] == 0)
 +        {
 +            fprintf(stderr,
 +                    "DD node %d, %s: local atom %d, global %d has no global index\n",
 +                    dd->rank,where,a+1,dd->gatindex[a]+1);
 +        }
 +    }
 +    sfree(have);
 +
 +    nerr += check_bLocalCG(dd,ncg_sys,dd->comm->bLocalCG,where);
 +
 +    if (nerr > 0) {
 +        gmx_fatal(FARGS,"DD node %d, %s: %d atom/cg index inconsistencies",
 +                  dd->rank,where,nerr);
 +    }
 +}
 +
 +static void clear_dd_indices(gmx_domdec_t *dd,int cg_start,int a_start)
 +{
 +    int  i;
 +    char *bLocalCG;
 +
 +    if (a_start == 0)
 +    {
 +        /* Clear the whole list without searching */
 +        ga2la_clear(dd->ga2la);
 +    }
 +    else
 +    {
 +        for(i=a_start; i<dd->nat_tot; i++)
 +        {
 +            ga2la_del(dd->ga2la,dd->gatindex[i]);
 +        }
 +    }
 +
 +    bLocalCG = dd->comm->bLocalCG;
 +    if (bLocalCG)
 +    {
 +        for(i=cg_start; i<dd->ncg_tot; i++)
 +        {
 +            bLocalCG[dd->index_gl[i]] = FALSE;
 +        }
 +    }
 +
 +    dd_clear_local_vsite_indices(dd);
 +    
 +    if (dd->constraints)
 +    {
 +        dd_clear_local_constraint_indices(dd);
 +    }
 +}
 +
++/* This function should be used when moving the domain boundaries during DLB,
++ * to obtain the minimum cell size. It checks the initially set limit
++ * comm->cellsize_min, for bonded and initial non-bonded cut-offs,
++ * and, possibly, a longer cut-off limit set for PME load balancing.
++ */
++static real cellsize_min_dlb(gmx_domdec_comm_t *comm,int dim_ind,int dim)
++{
++    real cellsize_min;
++
++    cellsize_min = comm->cellsize_min[dim];
++
++    if (!comm->bVacDLBNoLimit && comm->bPMELoadBalDLBLimits)
++    {
++        cellsize_min = max(cellsize_min,
++                           comm->PMELoadBal_max_cutoff/comm->cd[dim_ind].np_dlb);
++    }
++
++    return cellsize_min;
++}
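++/* Example: with PMELoadBal_max_cutoff = 1.2 nm and np_dlb = 2 pulses,
++ * cells must stay at least 0.6 nm wide along this dimension, since each
++ * communication pulse only reaches one cell further.
++ */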
++
 +static real grid_jump_limit(gmx_domdec_comm_t *comm,real cutoff,
 +                            int dim_ind)
 +{
 +    real grid_jump_limit;
 +
 +    /* The distance between the boundaries of cells at distance
 +     * x+-1,y+-1 or y+-1,z+-1 is limited by the cut-off restrictions
 +     * and by the fact that cells should not be shifted by more than
 +     * half their size, such that cg's only shift by one cell
 +     * at redecomposition.
 +     */
 +    grid_jump_limit = comm->cellsize_limit;
 +    if (!comm->bVacDLBNoLimit)
 +    {
++        if (comm->bPMELoadBalDLBLimits)
++        {
++            cutoff = max(cutoff,comm->PMELoadBal_max_cutoff);
++        }
 +        grid_jump_limit = max(grid_jump_limit,
 +                              cutoff/comm->cd[dim_ind].np);
 +    }
 +
 +    return grid_jump_limit;
 +}
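 +/* Roughly: with np communication pulses, neighboring cell rows must
 + * keep an overlap of at least cutoff/np (or cellsize_limit, if larger)
 + * between their boundaries, so that all pairs within the cut-off are
 + * still reached.
 + */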
 +
 +static gmx_bool check_grid_jump(gmx_large_int_t step,
 +                                gmx_domdec_t *dd,
 +                                real cutoff,
 +                                gmx_ddbox_t *ddbox,
 +                                gmx_bool bFatal)
 +{
 +    gmx_domdec_comm_t *comm;
 +    int  d,dim;
 +    real limit,bfac;
 +    gmx_bool bInvalid;
 +
 +    bInvalid = FALSE;
 +
 +    comm = dd->comm;
 +    
 +    for(d=1; d<dd->ndim; d++)
 +    {
 +        dim = dd->dim[d];
 +        limit = grid_jump_limit(comm,cutoff,d);
 +        bfac = ddbox->box_size[dim];
 +        if (ddbox->tric_dir[dim])
 +        {
 +            bfac *= ddbox->skew_fac[dim];
 +        }
 +        if ((comm->cell_f1[d] - comm->cell_f_max0[d])*bfac <  limit ||
 +            (comm->cell_f0[d] - comm->cell_f_min1[d])*bfac > -limit)
 +        {
 +            bInvalid = TRUE;
 +
 +            if (bFatal)
 +            {
 +                char buf[22];
 +
 +                /* This error should never be triggered under normal
 +                 * circumstances, but you never know ...
 +                 */
 +                gmx_fatal(FARGS,"Step %s: The domain decomposition grid has shifted too much in the %c-direction around cell %d %d %d. This should not have happened. Running with fewer nodes might avoid this issue.",
 +                          gmx_step_str(step,buf),
 +                          dim2char(dim),dd->ci[XX],dd->ci[YY],dd->ci[ZZ]);
 +            }
 +        }
 +    }
 +
 +    return bInvalid;
 +}
 +
 +static int dd_load_count(gmx_domdec_comm_t *comm)
 +{
 +    return (comm->eFlop ? comm->flop_n : comm->cycl_n[ddCyclF]);
 +}
 +
 +static float dd_force_load(gmx_domdec_comm_t *comm)
 +{
 +    float load;
 +    
 +    if (comm->eFlop)
 +    {
 +        load = comm->flop;
 +        if (comm->eFlop > 1)
 +        {
 +            load *= 1.0 + (comm->eFlop - 1)*(0.1*rand()/RAND_MAX - 0.05);
 +        }
 +    } 
 +    else
 +    {
 +        load = comm->cycl[ddCyclF];
 +        if (comm->cycl_n[ddCyclF] > 1)
 +        {
 +            /* Subtract the maximum of the last n cycle counts
 +             * to get rid of possible high counts due to other sources,
 +             * for instance system activity, that would otherwise
 +             * affect the dynamic load balancing.
 +             */
 +            load -= comm->cycl_max[ddCyclF];
 +        }
 +    }
 +    
 +    return load;
 +}
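 +/* With flop counting (eFlop > 1) a uniform perturbation of up to
 + * roughly +-5% per level is added on purpose, presumably to exercise
 + * the dynamic load balancing with an artificial load imbalance.
 + */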
 +
 +static void set_slb_pme_dim_f(gmx_domdec_t *dd,int dim,real **dim_f)
 +{
 +    gmx_domdec_comm_t *comm;
 +    int i;
 +    
 +    comm = dd->comm;
 +    
 +    snew(*dim_f,dd->nc[dim]+1);
 +    (*dim_f)[0] = 0;
 +    for(i=1; i<dd->nc[dim]; i++)
 +    {
 +        if (comm->slb_frac[dim])
 +        {
 +            (*dim_f)[i] = (*dim_f)[i-1] + comm->slb_frac[dim][i-1];
 +        }
 +        else
 +        {
 +            (*dim_f)[i] = (real)i/(real)dd->nc[dim];
 +        }
 +    }
 +    (*dim_f)[dd->nc[dim]] = 1;
 +}
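 +/* Example: nc = 4 with uniform cells gives dim_f = {0,0.25,0.5,0.75,1};
 + * with slb_frac = {0.4,0.3,0.2,0.1} it gives dim_f = {0,0.4,0.7,0.9,1}.
 + */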
 +
 +static void init_ddpme(gmx_domdec_t *dd,gmx_ddpme_t *ddpme,int dimind)
 +{
 +    int        pmeindex,slab,nso,i;
 +    ivec xyz;
 +    
 +    if (dimind == 0 && dd->dim[0] == YY && dd->comm->npmenodes_x == 1)
 +    {
 +        ddpme->dim = YY;
 +    }
 +    else
 +    {
 +        ddpme->dim = dimind;
 +    }
 +    ddpme->dim_match = (ddpme->dim == dd->dim[dimind]);
 +    
 +    ddpme->nslab = (ddpme->dim == 0 ?
 +                    dd->comm->npmenodes_x :
 +                    dd->comm->npmenodes_y);
 +
 +    if (ddpme->nslab <= 1)
 +    {
 +        return;
 +    }
 +
 +    nso = dd->comm->npmenodes/ddpme->nslab;
 +    /* Determine for each PME slab the PP location range for dimension dim */
 +    snew(ddpme->pp_min,ddpme->nslab);
 +    snew(ddpme->pp_max,ddpme->nslab);
 +    for(slab=0; slab<ddpme->nslab; slab++) {
 +        ddpme->pp_min[slab] = dd->nc[dd->dim[dimind]] - 1;
 +        ddpme->pp_max[slab] = 0;
 +    }
 +    for(i=0; i<dd->nnodes; i++) {
 +        ddindex2xyz(dd->nc,i,xyz);
 +        /* For y only use our y/z slab.
 +         * This assumes that the PME x grid size matches the DD grid size.
 +         */
 +        if (dimind == 0 || xyz[XX] == dd->ci[XX]) {
 +            pmeindex = ddindex2pmeindex(dd,i);
 +            if (dimind == 0) {
 +                slab = pmeindex/nso;
 +            } else {
 +                slab = pmeindex % ddpme->nslab;
 +            }
 +            ddpme->pp_min[slab] = min(ddpme->pp_min[slab],xyz[dimind]);
 +            ddpme->pp_max[slab] = max(ddpme->pp_max[slab],xyz[dimind]);
 +        }
 +    }
 +
 +    set_slb_pme_dim_f(dd,ddpme->dim,&ddpme->slb_dim_f);
 +}
 +
 +int dd_pme_maxshift_x(gmx_domdec_t *dd)
 +{
 +    if (dd->comm->ddpme[0].dim == XX)
 +    {
 +        return dd->comm->ddpme[0].maxshift;
 +    }
 +    else
 +    {
 +        return 0;
 +    }
 +}
 +
 +int dd_pme_maxshift_y(gmx_domdec_t *dd)
 +{
 +    if (dd->comm->ddpme[0].dim == YY)
 +    {
 +        return dd->comm->ddpme[0].maxshift;
 +    }
 +    else if (dd->comm->npmedecompdim >= 2 && dd->comm->ddpme[1].dim == YY)
 +    {
 +        return dd->comm->ddpme[1].maxshift;
 +    }
 +    else
 +    {
 +        return 0;
 +    }
 +}
 +
 +static void set_pme_maxshift(gmx_domdec_t *dd,gmx_ddpme_t *ddpme,
 +                             gmx_bool bUniform,gmx_ddbox_t *ddbox,real *cell_f)
 +{
 +    gmx_domdec_comm_t *comm;
 +    int  nc,ns,s;
 +    int  *xmin,*xmax;
 +    real range,pme_boundary;
 +    int  sh;
 +    
 +    comm = dd->comm;
 +    nc  = dd->nc[ddpme->dim];
 +    ns  = ddpme->nslab;
 +    
 +    if (!ddpme->dim_match)
 +    {
 +        /* PP decomposition is not along dim: the worst situation */
 +        sh = ns/2;
 +    }
 +    else if (ns <= 3 || (bUniform && ns == nc))
 +    {
 +        /* The optimal situation */
 +        sh = 1;
 +    }
 +    else
 +    {
 +        /* For each PME node we need to determine which PP nodes
 +         * it could possibly need to communicate with.
 +         */
 +        xmin = ddpme->pp_min;
 +        xmax = ddpme->pp_max;
 +        /* Allow for atoms to be maximally 2/3 times the cut-off
 +         * out of their DD cell. This is a reasonable balance
 +         * between performance and support for most charge-group/cut-off
 +         * combinations.
 +         */
 +        range  = 2.0/3.0*comm->cutoff/ddbox->box_size[ddpme->dim];
 +        /* Avoid extra communication when we are exactly at a boundary */
 +        range *= 0.999;
 +        
 +        sh = 1;
 +        for(s=0; s<ns; s++)
 +        {
 +            /* PME slab s spreads atoms between box frac. s/ns and (s+1)/ns */
 +            pme_boundary = (real)s/ns;
 +            while (sh+1 < ns &&
 +                   ((s-(sh+1) >= 0 &&
 +                     cell_f[xmax[s-(sh+1)   ]+1]     + range > pme_boundary) ||
 +                    (s-(sh+1) <  0 &&
 +                     cell_f[xmax[s-(sh+1)+ns]+1] - 1 + range > pme_boundary)))
 +            {
 +                sh++;
 +            }
 +            pme_boundary = (real)(s+1)/ns;
 +            while (sh+1 < ns &&
 +                   ((s+(sh+1) <  ns &&
 +                     cell_f[xmin[s+(sh+1)   ]  ]     - range < pme_boundary) ||
 +                    (s+(sh+1) >= ns &&
 +                     cell_f[xmin[s+(sh+1)-ns]  ] + 1 - range < pme_boundary)))
 +            {
 +                sh++;
 +            }
 +        }
 +    }
 +    
 +    ddpme->maxshift = sh;
 +    
 +    if (debug)
 +    {
 +        fprintf(debug,"PME slab communication range for dim %d is %d\n",
 +                ddpme->dim,ddpme->maxshift);
 +    }
 +}
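 +/* Example: cutoff = 1.0 nm in a 10 nm box gives range ~ 0.067 box
 + * fractions; any PP cell boundary that close to a PME slab boundary
 + * can increase the required shift sh by another slab.
 + */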
 +
 +static void check_box_size(gmx_domdec_t *dd,gmx_ddbox_t *ddbox)
 +{
 +    int d,dim;
 +    
 +    for(d=0; d<dd->ndim; d++)
 +    {
 +        dim = dd->dim[d];
 +        if (dim < ddbox->nboundeddim &&
 +            ddbox->box_size[dim]*ddbox->skew_fac[dim] <
 +            dd->nc[dim]*dd->comm->cellsize_limit*DD_CELL_MARGIN)
 +        {
 +            gmx_fatal(FARGS,"The %c-size of the box (%f) times the triclinic skew factor (%f) is smaller than the number of DD cells (%d) times the smallest allowed cell size (%f)\n",
 +                      dim2char(dim),ddbox->box_size[dim],ddbox->skew_fac[dim],
 +                      dd->nc[dim],dd->comm->cellsize_limit);
 +        }
 +    }
 +}
 +
 +static void set_dd_cell_sizes_slb(gmx_domdec_t *dd,gmx_ddbox_t *ddbox,
 +                                  gmx_bool bMaster,ivec npulse)
 +{
 +    gmx_domdec_comm_t *comm;
 +    int  d,j;
 +    rvec cellsize_min;
 +    real *cell_x,cell_dx,cellsize;
 +    
 +    comm = dd->comm;
 +    
 +    for(d=0; d<DIM; d++)
 +    {
 +        cellsize_min[d] = ddbox->box_size[d]*ddbox->skew_fac[d];
 +        npulse[d] = 1;
 +        if (dd->nc[d] == 1 || comm->slb_frac[d] == NULL)
 +        {
 +            /* Uniform grid */
 +            cell_dx = ddbox->box_size[d]/dd->nc[d];
 +            if (bMaster)
 +            {
 +                for(j=0; j<dd->nc[d]+1; j++)
 +                {
 +                    dd->ma->cell_x[d][j] = ddbox->box0[d] + j*cell_dx;
 +                }
 +            }
 +            else
 +            {
 +                comm->cell_x0[d] = ddbox->box0[d] + (dd->ci[d]  )*cell_dx;
 +                comm->cell_x1[d] = ddbox->box0[d] + (dd->ci[d]+1)*cell_dx;
 +            }
 +            cellsize = cell_dx*ddbox->skew_fac[d];
 +            while (cellsize*npulse[d] < comm->cutoff && npulse[d] < dd->nc[d]-1)
 +            {
 +                npulse[d]++;
 +            }
 +            cellsize_min[d] = cellsize;
 +        }
 +        else
 +        {
 +            /* Statically load balanced grid */
 +            /* Also when we are not doing a master distribution, we determine
 +             * all cell borders in a loop to obtain identical values
 +             * to the master distribution case and to determine npulse.
 +             */
 +            if (bMaster)
 +            {
 +                cell_x = dd->ma->cell_x[d];
 +            }
 +            else
 +            {
 +                snew(cell_x,dd->nc[d]+1);
 +            }
 +            cell_x[0] = ddbox->box0[d];
 +            for(j=0; j<dd->nc[d]; j++)
 +            {
 +                cell_dx = ddbox->box_size[d]*comm->slb_frac[d][j];
 +                cell_x[j+1] = cell_x[j] + cell_dx;
 +                cellsize = cell_dx*ddbox->skew_fac[d];
 +                while (cellsize*npulse[d] < comm->cutoff &&
 +                       npulse[d] < dd->nc[d]-1)
 +                {
 +                    npulse[d]++;
 +                }
 +                cellsize_min[d] = min(cellsize_min[d],cellsize);
 +            }
 +            if (!bMaster)
 +            {
 +                comm->cell_x0[d] = cell_x[dd->ci[d]];
 +                comm->cell_x1[d] = cell_x[dd->ci[d]+1];
 +                sfree(cell_x);
 +            }
 +        }
 +        /* The following limitation is to prevent a cell from receiving
 +         * some of its own home charge groups back over the periodic boundary.
 +         * Double charge groups cause trouble with the global indices.
 +         */
 +        if (d < ddbox->npbcdim &&
 +            dd->nc[d] > 1 && npulse[d] >= dd->nc[d])
 +        {
 +            gmx_fatal_collective(FARGS,NULL,dd,
 +                                 "The box size in direction %c (%f) times the triclinic skew factor (%f) is too small for a cut-off of %f with %d domain decomposition cells, use 1 or more than %d %s or increase the box size in this direction",
 +                                 dim2char(d),ddbox->box_size[d],ddbox->skew_fac[d],
 +                                 comm->cutoff,
 +                                 dd->nc[d],dd->nc[d],
 +                                 dd->nnodes > dd->nc[d] ? "cells" : "processors");
 +        }
 +    }
 +    
 +    if (!comm->bDynLoadBal)
 +    {
 +        copy_rvec(cellsize_min,comm->cellsize_min);
 +    }
 +   
 +    for(d=0; d<comm->npmedecompdim; d++)
 +    {
 +        set_pme_maxshift(dd,&comm->ddpme[d],
 +                         comm->slb_frac[dd->dim[d]]==NULL,ddbox,
 +                         comm->ddpme[d].slb_dim_f);
 +    }
 +}
 +
 +
 +static void dd_cell_sizes_dlb_root_enforce_limits(gmx_domdec_t *dd,
 +                                       int d,int dim,gmx_domdec_root_t *root,
 +                                       gmx_ddbox_t *ddbox,
 +                                       gmx_bool bUniform,gmx_large_int_t step, real cellsize_limit_f, int range[])
 +{
 +    gmx_domdec_comm_t *comm;
 +    int  ncd,i,j,nmin,nmin_old;
 +    gmx_bool bLimLo,bLimHi;
 +    real *cell_size;
 +    real fac,halfway,cellsize_limit_f_i,region_size;
 +    gmx_bool bPBC,bLastHi=FALSE;
 +    int nrange[]={range[0],range[1]};
 +
 +    region_size= root->cell_f[range[1]]-root->cell_f[range[0]];  
 +
 +    comm = dd->comm;
 +
 +    ncd = dd->nc[dim];
 +
 +    bPBC = (dim < ddbox->npbcdim);
 +
 +    cell_size = root->buf_ncd;
 +
 +    if (debug) 
 +    {
 +        fprintf(debug,"enforce_limits: %d %d\n",range[0],range[1]);
 +    }
 +
 +    /* First we need to check if the scaling does not make cells
 +     * smaller than the smallest allowed size.
 +     * We need to do this iteratively, since if a cell is too small,
 +     * it needs to be enlarged, which makes all the other cells smaller,
 +     * which could in turn make another cell smaller than allowed.
 +     */
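 +    /* Numerical sketch (illustration only, not from the original source):
 +     * with region_size = 1.0, cellsize_limit_f = 0.25 and relative sizes
 +     * {0.50, 0.30, 0.20}, the first pass clamps the last cell to 0.25
 +     * (nmin = 1); the second pass rescales the free cells by
 +     * fac = (1.0 - 0.25)/(0.50 + 0.30) = 0.9375 to {0.46875, 0.28125},
 +     * no new cell drops below the limit, and the do/while terminates.
 +     */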
 +    for(i=range[0]; i<range[1]; i++)
 +    {
 +        root->bCellMin[i] = FALSE;
 +    }
 +    nmin = 0;
 +    do
 +    {
 +        nmin_old = nmin;
 +        /* We need the total for normalization */
 +        fac = 0;
 +        for(i=range[0]; i<range[1]; i++)
 +        {
 +            if (root->bCellMin[i] == FALSE)
 +            {
 +                fac += cell_size[i];
 +            }
 +        }
 +        fac = (region_size - nmin*cellsize_limit_f)/fac; /* subtracting cells already set to cellsize_limit_f */
 +        /* Determine the cell boundaries */
 +        for(i=range[0]; i<range[1]; i++)
 +        {
 +            if (root->bCellMin[i] == FALSE)
 +            {
 +                cell_size[i] *= fac;
 +                if (!bPBC && (i == 0 || i == dd->nc[dim] -1))
 +                {
 +                    cellsize_limit_f_i = 0;
 +                }
 +                else
 +                {
 +                    cellsize_limit_f_i = cellsize_limit_f;
 +                }
 +                if (cell_size[i] < cellsize_limit_f_i)
 +                {
 +                    root->bCellMin[i] = TRUE;
 +                    cell_size[i] = cellsize_limit_f_i;
 +                    nmin++;
 +                }
 +            }
 +            root->cell_f[i+1] = root->cell_f[i] + cell_size[i];
 +        }
 +    }
 +    while (nmin > nmin_old);
 +    
 +    i=range[1]-1;
 +    cell_size[i] = root->cell_f[i+1] - root->cell_f[i];
 +    /* For this check we should not use DD_CELL_MARGIN,
 +     * but a slightly smaller factor,
 +     * since rounding could get us below the limit.
 +     */
 +    if (bPBC && cell_size[i] < cellsize_limit_f*DD_CELL_MARGIN2/DD_CELL_MARGIN)
 +    {
 +        char buf[22];
 +        gmx_fatal(FARGS,"Step %s: the dynamic load balancing could not balance dimension %c: box size %f, triclinic skew factor %f, #cells %d, minimum cell size %f\n",
 +                  gmx_step_str(step,buf),
 +                  dim2char(dim),ddbox->box_size[dim],ddbox->skew_fac[dim],
 +                  ncd,comm->cellsize_min[dim]);
 +    }
 +    
 +    root->bLimited = (nmin > 0) || (range[0]>0) || (range[1]<ncd);
 +    
 +    if (!bUniform)
 +    {
 +        /* Check if the boundary did not displace more than halfway
 +         * each of the cells it bounds, as this could cause problems,
 +         * especially when the differences between cell sizes are large.
 +         * If changes are applied, they will not make cells smaller
 +         * than the cut-off, as we check all the boundaries which
 +         * might be affected by a change and if the old state was ok,
 +         * the cells will at most be shrunk back to their old size.
 +         */
 +        for(i=range[0]+1; i<range[1]; i++)
 +        {
 +            halfway = 0.5*(root->old_cell_f[i] + root->old_cell_f[i-1]);
 +            if (root->cell_f[i] < halfway)
 +            {
 +                root->cell_f[i] = halfway;
 +                /* Check if the change also causes shifts of the next boundaries */
 +                for(j=i+1; j<range[1]; j++)
 +                {
 +                    if (root->cell_f[j] < root->cell_f[j-1] + cellsize_limit_f)
 +                        root->cell_f[j] =  root->cell_f[j-1] + cellsize_limit_f;
 +                }
 +            }
 +            halfway = 0.5*(root->old_cell_f[i] + root->old_cell_f[i+1]);
 +            if (root->cell_f[i] > halfway)
 +            {
 +                root->cell_f[i] = halfway;
 +                /* Check if the change also causes shifts of the next boundaries */
 +                for(j=i-1; j>=range[0]+1; j--)
 +                {
 +                    if (root->cell_f[j] > root->cell_f[j+1] - cellsize_limit_f)
 +                        root->cell_f[j] = root->cell_f[j+1] - cellsize_limit_f;
 +                }
 +            }
 +        }
 +    }
 +    
 +    /* nrange is defined as the half-open range [lower, upper) for a new call to enforce_limits */
 +    /* Find the highest violation of LimLo (a) and the first following violation
 +     * of LimHi (thus the lowest following one) (b), then call enforce_limits
 +     * for (oldb,a) and (a,b). In the next step: (b,nexta); oldb and nexta can
 +     * be the range boundaries. nrange is used to hold a and b. */
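 +    /* Schematic (illustration only):
 +     *
 +     *   range[0] ... a (LimLo) ... b (LimHi) ... range[1]
 +     *
 +     * enforce_limits recurses on [range[0],a) and [a,b), and the scan then
 +     * continues on [b,range[1]).
 +     */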
 +    if (d > 0)
 +    {
 +        /* Take care of the staggering of the cell boundaries */
 +        if (bUniform)
 +        {
 +            for(i=range[0]; i<range[1]; i++)
 +            {
 +                root->cell_f_max0[i] = root->cell_f[i];
 +                root->cell_f_min1[i] = root->cell_f[i+1];
 +            }
 +        }
 +        else
 +        {
 +            for(i=range[0]+1; i<range[1]; i++)
 +            {
 +                bLimLo = (root->cell_f[i] < root->bound_min[i]);
 +                bLimHi = (root->cell_f[i] > root->bound_max[i]);
 +                if (bLimLo && bLimHi)
 +                {
 +                    /* Both limits violated, try the best we can */
 +                    /* For this case we split the original range (range) in two parts and deal with the other limitations in the next iteration. */
 +                    root->cell_f[i] = 0.5*(root->bound_min[i] + root->bound_max[i]);
 +                    nrange[0]=range[0];
 +                    nrange[1]=i;
 +                    dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
 +
 +                    nrange[0]=i;
 +                    nrange[1]=range[1];
 +                    dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
 +
 +                    return;
 +                }
 +                else if (bLimLo)
 +                {
 +                    /* root->cell_f[i] = root->bound_min[i]; */
 +                    nrange[1]=i;  /* only store the violation location; there could be a following LimLo violation with a higher index */
 +                    bLastHi=FALSE;
 +                }
 +                else if (bLimHi && !bLastHi)
 +                {
 +                    bLastHi=TRUE;
 +                    if (nrange[1] < range[1])   /* found a LimLo before */
 +                    {
 +                        root->cell_f[nrange[1]] = root->bound_min[nrange[1]];
 +                        dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
 +                        nrange[0]=nrange[1];
 +                    }
 +                    root->cell_f[i] = root->bound_max[i];
 +                    nrange[1]=i; 
 +                    dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
 +                    nrange[0]=i;
 +                    nrange[1]=range[1];
 +                }
 +            }
 +            if (nrange[1] < range[1])   /* the last violation found was a LimLo */
 +            {
 +                root->cell_f[nrange[1]] = root->bound_min[nrange[1]];
 +                dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
 +                nrange[0]=nrange[1];
 +                nrange[1]=range[1];
 +                dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
 +            } 
 +            else if (nrange[0] > range[0]) /* found at least one LimHi */
 +            {
 +                dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
 +            }
 +        }
 +    }
 +}
 +
 +
 +static void set_dd_cell_sizes_dlb_root(gmx_domdec_t *dd,
 +                                       int d,int dim,gmx_domdec_root_t *root,
 +                                       gmx_ddbox_t *ddbox,gmx_bool bDynamicBox,
 +                                       gmx_bool bUniform,gmx_large_int_t step)
 +{
 +    gmx_domdec_comm_t *comm;
 +    int  ncd,d1,i,j,pos;
 +    real *cell_size;
 +    real load_aver,load_i,imbalance,change,change_max,sc;
 +    real cellsize_limit_f,dist_min_f,dist_min_f_hard,space;
 +    real change_limit;
 +    real relax = 0.5;
 +    gmx_bool bPBC;
 +    int range[] = { 0, 0 };
 +
 +    comm = dd->comm;
 +
 +    /* Convert the maximum change from the input percentage to a fraction */
 +    change_limit = comm->dlb_scale_lim*0.01;
 +
 +    ncd = dd->nc[dim];
 +
 +    bPBC = (dim < ddbox->npbcdim);
 +
 +    cell_size = root->buf_ncd;
 +
 +    /* Store the original boundaries */
 +    for(i=0; i<ncd+1; i++)
 +    {
 +        root->old_cell_f[i] = root->cell_f[i];
 +    }
 +    if (bUniform) {
 +        for(i=0; i<ncd; i++)
 +        {
 +            cell_size[i] = 1.0/ncd;
 +        }
 +    }
 +    else if (dd_load_count(comm))
 +    {
 +        load_aver = comm->load[d].sum_m/ncd;
 +        change_max = 0;
 +        for(i=0; i<ncd; i++)
 +        {
 +            /* Determine the relative imbalance of cell i */
 +            load_i = comm->load[d].load[i*comm->load[d].nload+2];
 +            imbalance = (load_i - load_aver)/(load_aver>0 ? load_aver : 1);
 +            /* Determine the change of the cell size using underrelaxation */
 +            change = -relax*imbalance;
 +            change_max = max(change_max,max(change,-change));
 +        }
 +        /* Limit the amount of scaling.
 +         * We need to use the same rescaling for all cells in one row,
 +         * otherwise the load balancing might not converge.
 +         */
 +        sc = relax;
 +        if (change_max > change_limit)
 +        {
 +            sc *= change_limit/change_max;
 +        }
 +        for(i=0; i<ncd; i++)
 +        {
 +            /* Determine the relative imbalance of cell i */
 +            load_i = comm->load[d].load[i*comm->load[d].nload+2];
 +            imbalance = (load_i - load_aver)/(load_aver>0 ? load_aver : 1);
 +            /* Determine the change of the cell size using underrelaxation */
 +            change = -sc*imbalance;
 +            cell_size[i] = (root->cell_f[i+1]-root->cell_f[i])*(1 + change);
 +        }
 +    }
 +    
-     if (DDMASTER(dd))
-     {
-         fprintf(stderr,"Making %dD domain decomposition %d x %d x %d\n",
-           dd->ndim,dd->nc[XX],dd->nc[YY],dd->nc[ZZ]);
-     }
++    cellsize_limit_f  = cellsize_min_dlb(comm,d,dim)/ddbox->box_size[dim];
 +    cellsize_limit_f *= DD_CELL_MARGIN;
 +    dist_min_f_hard   = grid_jump_limit(comm,comm->cutoff,d)/ddbox->box_size[dim];
 +    dist_min_f        = dist_min_f_hard * DD_CELL_MARGIN;
 +    if (ddbox->tric_dir[dim])
 +    {
 +        cellsize_limit_f /= ddbox->skew_fac[dim];
 +        dist_min_f       /= ddbox->skew_fac[dim];
 +    }
 +    if (bDynamicBox && d > 0)
 +    {
 +        dist_min_f *= DD_PRES_SCALE_MARGIN;
 +    }
 +    if (d > 0 && !bUniform)
 +    {
 +        /* Make sure that the grid is not shifted too much */
 +        for(i=1; i<ncd; i++) {
 +            if (root->cell_f_min1[i] - root->cell_f_max0[i-1] < 2 * dist_min_f_hard) 
 +            {
 +                gmx_incons("Inconsistent DD boundary staggering limits!");
 +            }
 +            root->bound_min[i] = root->cell_f_max0[i-1] + dist_min_f;
 +            space = root->cell_f[i] - (root->cell_f_max0[i-1] + dist_min_f);
 +            if (space > 0) {
 +                root->bound_min[i] += 0.5*space;
 +            }
 +            root->bound_max[i] = root->cell_f_min1[i] - dist_min_f;
 +            space = root->cell_f[i] - (root->cell_f_min1[i] - dist_min_f);
 +            if (space < 0) {
 +                root->bound_max[i] += 0.5*space;
 +            }
 +            if (debug)
 +            {
 +                fprintf(debug,
 +                        "dim %d boundary %d %.3f < %.3f < %.3f < %.3f < %.3f\n",
 +                        d,i,
 +                        root->cell_f_max0[i-1] + dist_min_f,
 +                        root->bound_min[i],root->cell_f[i],root->bound_max[i],
 +                        root->cell_f_min1[i] - dist_min_f);
 +            }
 +        }
 +    }
 +    range[1]=ncd;
 +    root->cell_f[0] = 0;
 +    root->cell_f[ncd] = 1;
 +    dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, range);
 +
 +
 +    /* After the checks above, the cells should obey the cut-off
 +     * restrictions, but it does not hurt to check.
 +     */
 +    for(i=0; i<ncd; i++)
 +    {
 +        if (debug)
 +        {
 +            fprintf(debug,"Relative bounds dim %d  cell %d: %f %f\n",
 +                    dim,i,root->cell_f[i],root->cell_f[i+1]);
 +        }
 +
 +        if ((bPBC || (i != 0 && i != dd->nc[dim]-1)) &&
 +            root->cell_f[i+1] - root->cell_f[i] <
 +            cellsize_limit_f/DD_CELL_MARGIN)
 +        {
 +            char buf[22];
 +            fprintf(stderr,
 +                    "\nWARNING step %s: direction %c, cell %d too small: %f\n",
 +                    gmx_step_str(step,buf),dim2char(dim),i,
 +                    (root->cell_f[i+1] - root->cell_f[i])
 +                    *ddbox->box_size[dim]*ddbox->skew_fac[dim]);
 +        }
 +    }
 +    
 +    pos = ncd + 1;
 +    /* Store the cell boundaries of the lower dimensions at the end */
 +    for(d1=0; d1<d; d1++)
 +    {
 +        root->cell_f[pos++] = comm->cell_f0[d1];
 +        root->cell_f[pos++] = comm->cell_f1[d1];
 +    }
 +    
 +    if (d < comm->npmedecompdim)
 +    {
 +        /* The master determines the maximum shift for
 +         * the coordinate communication between separate PME nodes.
 +         */
 +        set_pme_maxshift(dd,&comm->ddpme[d],bUniform,ddbox,root->cell_f);
 +    }
 +    root->cell_f[pos++] = comm->ddpme[0].maxshift;
 +    if (d >= 1)
 +    {
 +        root->cell_f[pos++] = comm->ddpme[1].maxshift;
 +    }
 +}    
 +
 +static void relative_to_absolute_cell_bounds(gmx_domdec_t *dd,
 +                                             gmx_ddbox_t *ddbox,int dimind)
 +{
 +    gmx_domdec_comm_t *comm;
 +    int dim;
 +
 +    comm = dd->comm;
 +
 +    /* Set the cell dimensions */
 +    dim = dd->dim[dimind];
 +    comm->cell_x0[dim] = comm->cell_f0[dimind]*ddbox->box_size[dim];
 +    comm->cell_x1[dim] = comm->cell_f1[dimind]*ddbox->box_size[dim];
 +    if (dim >= ddbox->nboundeddim)
 +    {
 +        comm->cell_x0[dim] += ddbox->box0[dim];
 +        comm->cell_x1[dim] += ddbox->box0[dim];
 +    }
 +}
 +
 +static void distribute_dd_cell_sizes_dlb(gmx_domdec_t *dd,
 +                                         int d,int dim,real *cell_f_row,
 +                                         gmx_ddbox_t *ddbox)
 +{
 +    gmx_domdec_comm_t *comm;
 +    int d1,dim1,pos;
 +
 +    comm = dd->comm;
 +
 +#ifdef GMX_MPI
 +    /* Each node would only need to know two fractions,
 +     * but it is probably cheaper to broadcast the whole array.
 +     */
 +    MPI_Bcast(cell_f_row,DD_CELL_F_SIZE(dd,d)*sizeof(real),MPI_BYTE,
 +              0,comm->mpi_comm_load[d]);
 +#endif
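 +    /* Layout of cell_f_row, as filled by set_dd_cell_sizes_dlb_root():
 +     * entries [0..nc] are the cell boundary fractions of this dimension,
 +     * followed by the (cell_f0, cell_f1) pairs of the lower dimensions,
 +     * followed by the PME maxshift value(s).
 +     */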
 +    /* Copy the fractions for this dimension from the buffer */
 +    comm->cell_f0[d] = cell_f_row[dd->ci[dim]  ];
 +    comm->cell_f1[d] = cell_f_row[dd->ci[dim]+1];
 +    /* The whole array was communicated, so set the buffer position */
 +    pos = dd->nc[dim] + 1;
 +    for(d1=0; d1<=d; d1++)
 +    {
 +        if (d1 < d)
 +        {
 +            /* Copy the cell fractions of the lower dimensions */
 +            comm->cell_f0[d1] = cell_f_row[pos++];
 +            comm->cell_f1[d1] = cell_f_row[pos++];
 +        }
 +        relative_to_absolute_cell_bounds(dd,ddbox,d1);
 +    }
 +    /* Convert the communicated shift from float to int */
 +    comm->ddpme[0].maxshift = (int)(cell_f_row[pos++] + 0.5);
 +    if (d >= 1)
 +    {
 +        comm->ddpme[1].maxshift = (int)(cell_f_row[pos++] + 0.5);
 +    }
 +}
 +
 +static void set_dd_cell_sizes_dlb_change(gmx_domdec_t *dd,
 +                                         gmx_ddbox_t *ddbox,gmx_bool bDynamicBox,
 +                                         gmx_bool bUniform,gmx_large_int_t step)
 +{
 +    gmx_domdec_comm_t *comm;
 +    int d,dim,d1;
 +    gmx_bool bRowMember,bRowRoot;
 +    real *cell_f_row;
 +    
 +    comm = dd->comm;
 +
 +    for(d=0; d<dd->ndim; d++)
 +    {
 +        dim = dd->dim[d];
 +        bRowMember = TRUE;
 +        bRowRoot = TRUE;
 +        for(d1=d; d1<dd->ndim; d1++)
 +        {
 +            if (dd->ci[dd->dim[d1]] > 0)
 +            {
 +                if (d1 > d)
 +                {
 +                    bRowMember = FALSE;
 +                }
 +                bRowRoot = FALSE;
 +            }
 +        }
 +        if (bRowMember)
 +        {
 +            if (bRowRoot)
 +            {
 +                set_dd_cell_sizes_dlb_root(dd,d,dim,comm->root[d],
 +                                           ddbox,bDynamicBox,bUniform,step);
 +                cell_f_row = comm->root[d]->cell_f;
 +            }
 +            else
 +            {
 +                cell_f_row = comm->cell_f_row;
 +            }
 +            distribute_dd_cell_sizes_dlb(dd,d,dim,cell_f_row,ddbox);
 +        }
 +    }
 +}    
 +
 +static void set_dd_cell_sizes_dlb_nochange(gmx_domdec_t *dd,gmx_ddbox_t *ddbox)
 +{
 +    int d;
 +
 +    /* This function assumes the box is static and should therefore
 +     * not be called when the box has changed since the last
 +     * call to dd_partition_system.
 +     */
 +    for(d=0; d<dd->ndim; d++)
 +    {
 +        relative_to_absolute_cell_bounds(dd,ddbox,d); 
 +    }
 +}
 +
 +
 +
 +static void set_dd_cell_sizes_dlb(gmx_domdec_t *dd,
 +                                  gmx_ddbox_t *ddbox,gmx_bool bDynamicBox,
 +                                  gmx_bool bUniform,gmx_bool bDoDLB,gmx_large_int_t step,
 +                                  gmx_wallcycle_t wcycle)
 +{
 +    gmx_domdec_comm_t *comm;
 +    int dim;
 +
 +    comm = dd->comm;
 +    
 +    if (bDoDLB)
 +    {
 +        wallcycle_start(wcycle,ewcDDCOMMBOUND);
 +        set_dd_cell_sizes_dlb_change(dd,ddbox,bDynamicBox,bUniform,step);
 +        wallcycle_stop(wcycle,ewcDDCOMMBOUND);
 +    }
 +    else if (bDynamicBox)
 +    {
 +        set_dd_cell_sizes_dlb_nochange(dd,ddbox);
 +    }
 +    
 +    /* Set the dimensions for which no DD is used */
 +    for(dim=0; dim<DIM; dim++) {
 +        if (dd->nc[dim] == 1) {
 +            comm->cell_x0[dim] = 0;
 +            comm->cell_x1[dim] = ddbox->box_size[dim];
 +            if (dim >= ddbox->nboundeddim)
 +            {
 +                comm->cell_x0[dim] += ddbox->box0[dim];
 +                comm->cell_x1[dim] += ddbox->box0[dim];
 +            }
 +        }
 +    }
 +}
 +
 +static void realloc_comm_ind(gmx_domdec_t *dd,ivec npulse)
 +{
 +    int d,np,i;
 +    gmx_domdec_comm_dim_t *cd;
 +    
 +    for(d=0; d<dd->ndim; d++)
 +    {
 +        cd = &dd->comm->cd[d];
 +        np = npulse[dd->dim[d]];
 +        if (np > cd->np_nalloc)
 +        {
 +            if (debug)
 +            {
 +                fprintf(debug,"(Re)allocing cd for %c to %d pulses\n",
 +                        dim2char(dd->dim[d]),np);
 +            }
 +            if (DDMASTER(dd) && cd->np_nalloc > 0)
 +            {
 +                fprintf(stderr,"\nIncreasing the number of cells to communicate in dimension %c to %d for the first time\n",dim2char(dd->dim[d]),np);
 +            }
 +            srenew(cd->ind,np);
 +            for(i=cd->np_nalloc; i<np; i++)
 +            {
 +                cd->ind[i].index  = NULL;
 +                cd->ind[i].nalloc = 0;
 +            }
 +            cd->np_nalloc = np;
 +        }
 +        cd->np = np;
 +    }
 +}
 +
 +
 +static void set_dd_cell_sizes(gmx_domdec_t *dd,
 +                              gmx_ddbox_t *ddbox,gmx_bool bDynamicBox,
 +                              gmx_bool bUniform,gmx_bool bDoDLB,gmx_large_int_t step,
 +                              gmx_wallcycle_t wcycle)
 +{
 +    gmx_domdec_comm_t *comm;
 +    int  d;
 +    ivec npulse;
 +    
 +    comm = dd->comm;
 +
 +    /* Copy the old cell boundaries for the cg displacement check */
 +    copy_rvec(comm->cell_x0,comm->old_cell_x0);
 +    copy_rvec(comm->cell_x1,comm->old_cell_x1);
 +    
 +    if (comm->bDynLoadBal)
 +    {
 +        if (DDMASTER(dd))
 +        {
 +            check_box_size(dd,ddbox);
 +        }
 +        set_dd_cell_sizes_dlb(dd,ddbox,bDynamicBox,bUniform,bDoDLB,step,wcycle);
 +    }
 +    else
 +    {
 +        set_dd_cell_sizes_slb(dd,ddbox,FALSE,npulse);
 +        realloc_comm_ind(dd,npulse);
 +    }
 +    
 +    if (debug)
 +    {
 +        for(d=0; d<DIM; d++)
 +        {
 +            fprintf(debug,"cell_x[%d] %f - %f skew_fac %f\n",
 +                    d,comm->cell_x0[d],comm->cell_x1[d],ddbox->skew_fac[d]);
 +        }
 +    }
 +}
 +
 +static void comm_dd_ns_cell_sizes(gmx_domdec_t *dd,
 +                                  gmx_ddbox_t *ddbox,
 +                                  rvec cell_ns_x0,rvec cell_ns_x1,
 +                                  gmx_large_int_t step)
 +{
 +    gmx_domdec_comm_t *comm;
 +    int dim_ind,dim;
 +    
 +    comm = dd->comm;
 +
 +    for(dim_ind=0; dim_ind<dd->ndim; dim_ind++)
 +    {
 +        dim = dd->dim[dim_ind];
 +        
 +        /* Without PBC we don't have restrictions on the outer cells */
 +        if (!(dim >= ddbox->npbcdim && 
 +              (dd->ci[dim] == 0 || dd->ci[dim] == dd->nc[dim] - 1)) &&
 +            comm->bDynLoadBal &&
 +            (comm->cell_x1[dim] - comm->cell_x0[dim])*ddbox->skew_fac[dim] <
 +            comm->cellsize_min[dim])
 +        {
 +            char buf[22];
 +            gmx_fatal(FARGS,"Step %s: The %c-size (%f) times the triclinic skew factor (%f) is smaller than the smallest allowed cell size (%f) for domain decomposition grid cell %d %d %d",
 +                      gmx_step_str(step,buf),dim2char(dim),
 +                      comm->cell_x1[dim] - comm->cell_x0[dim],
 +                      ddbox->skew_fac[dim],
 +                      dd->comm->cellsize_min[dim],
 +                      dd->ci[XX],dd->ci[YY],dd->ci[ZZ]);
 +        }
 +    }
 +    
 +    if ((dd->bGridJump && dd->ndim > 1) || ddbox->nboundeddim < DIM)
 +    {
 +        /* Communicate the boundaries and update cell_ns_x0/1 */
 +        dd_move_cellx(dd,ddbox,cell_ns_x0,cell_ns_x1);
 +        if (dd->bGridJump && dd->ndim > 1)
 +        {
 +            check_grid_jump(step,dd,dd->comm->cutoff,ddbox,TRUE);
 +        }
 +    }
 +}
 +
 +static void make_tric_corr_matrix(int npbcdim,matrix box,matrix tcm)
 +{
 +    if (YY < npbcdim)
 +    {
 +        tcm[YY][XX] = -box[YY][XX]/box[YY][YY];
 +    }
 +    else
 +    {
 +        tcm[YY][XX] = 0;
 +    }
 +    if (ZZ < npbcdim)
 +    {
 +        tcm[ZZ][XX] = -(box[ZZ][YY]*tcm[YY][XX] + box[ZZ][XX])/box[ZZ][ZZ];
 +        tcm[ZZ][YY] = -box[ZZ][YY]/box[ZZ][ZZ];
 +    }
 +    else
 +    {
 +        tcm[ZZ][XX] = 0;
 +        tcm[ZZ][YY] = 0;
 +    }
 +}
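 +/* Usage sketch for the correction matrix above: for a triclinic box the
 + * lattice coordinate along d follows from the Cartesian position as
 + * pos_d = x[d] + sum_{j>d} x[j]*tcm[j][d], as done in distribute_cg()
 + * and calc_cg_move() below.
 + */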
 +
 +static void check_screw_box(matrix box)
 +{
 +    /* Mathematical limitation */
 +    if (box[YY][XX] != 0 || box[ZZ][XX] != 0)
 +    {
 +        gmx_fatal(FARGS,"With screw pbc the unit cell can not have non-zero off-diagonal x-components");
 +    }
 +    
 +    /* Limitation due to the asymmetry of the eighth shell method */
 +    if (box[ZZ][YY] != 0)
 +    {
 +        gmx_fatal(FARGS,"pbc=screw with non-zero box_zy is not supported");
 +    }
 +}
 +
 +static void distribute_cg(FILE *fplog,gmx_large_int_t step,
 +                          matrix box,ivec tric_dir,t_block *cgs,rvec pos[],
 +                          gmx_domdec_t *dd)
 +{
 +    gmx_domdec_master_t *ma;
 +    int **tmp_ind=NULL,*tmp_nalloc=NULL;
 +    int  i,icg,j,k,k0,k1,d,npbcdim;
 +    matrix tcm;
 +    rvec box_size,cg_cm;
 +    ivec ind;
 +    real nrcg,inv_ncg,pos_d;
 +    atom_id *cgindex;
 +    gmx_bool bUnbounded,bScrew;
 +
 +    ma = dd->ma;
 +    
 +    if (tmp_ind == NULL)
 +    {
 +        snew(tmp_nalloc,dd->nnodes);
 +        snew(tmp_ind,dd->nnodes);
 +        for(i=0; i<dd->nnodes; i++)
 +        {
 +            tmp_nalloc[i] = over_alloc_large(cgs->nr/dd->nnodes+1);
 +            snew(tmp_ind[i],tmp_nalloc[i]);
 +        }
 +    }
 +    
 +    /* Clear the count */
 +    for(i=0; i<dd->nnodes; i++)
 +    {
 +        ma->ncg[i] = 0;
 +        ma->nat[i] = 0;
 +    }
 +    
 +    make_tric_corr_matrix(dd->npbcdim,box,tcm);
 +    
 +    cgindex = cgs->index;
 +    
 +    /* Compute the center of geometry for all charge groups */
 +    for(icg=0; icg<cgs->nr; icg++)
 +    {
 +        k0      = cgindex[icg];
 +        k1      = cgindex[icg+1];
 +        nrcg    = k1 - k0;
 +        if (nrcg == 1)
 +        {
 +            copy_rvec(pos[k0],cg_cm);
 +        }
 +        else
 +        {
 +            inv_ncg = 1.0/nrcg;
 +            
 +            clear_rvec(cg_cm);
 +            for(k=k0; (k<k1); k++)
 +            {
 +                rvec_inc(cg_cm,pos[k]);
 +            }
 +            for(d=0; (d<DIM); d++)
 +            {
 +                cg_cm[d] *= inv_ncg;
 +            }
 +        }
 +        /* Put the charge group in the box and determine the cell index */
 +        for(d=DIM-1; d>=0; d--) {
 +            pos_d = cg_cm[d];
 +            if (d < dd->npbcdim)
 +            {
 +                bScrew = (dd->bScrewPBC && d == XX);
 +                if (tric_dir[d] && dd->nc[d] > 1)
 +                {
 +                    /* Use triclinic coordinates for this dimension */
 +                    for(j=d+1; j<DIM; j++)
 +                    {
 +                        pos_d += cg_cm[j]*tcm[j][d];
 +                    }
 +                }
 +                while(pos_d >= box[d][d])
 +                {
 +                    pos_d -= box[d][d];
 +                    rvec_dec(cg_cm,box[d]);
 +                    if (bScrew)
 +                    {
 +                        cg_cm[YY] = box[YY][YY] - cg_cm[YY];
 +                        cg_cm[ZZ] = box[ZZ][ZZ] - cg_cm[ZZ];
 +                    }
 +                    for(k=k0; (k<k1); k++)
 +                    {
 +                        rvec_dec(pos[k],box[d]);
 +                        if (bScrew)
 +                        {
 +                            pos[k][YY] = box[YY][YY] - pos[k][YY];
 +                            pos[k][ZZ] = box[ZZ][ZZ] - pos[k][ZZ];
 +                        }
 +                    }
 +                }
 +                while(pos_d < 0)
 +                {
 +                    pos_d += box[d][d];
 +                    rvec_inc(cg_cm,box[d]);
 +                    if (bScrew)
 +                    {
 +                        cg_cm[YY] = box[YY][YY] - cg_cm[YY];
 +                        cg_cm[ZZ] = box[ZZ][ZZ] - cg_cm[ZZ];
 +                    }
 +                    for(k=k0; (k<k1); k++)
 +                    {
 +                        rvec_inc(pos[k],box[d]);
 +                        if (bScrew) {
 +                            pos[k][YY] = box[YY][YY] - pos[k][YY];
 +                            pos[k][ZZ] = box[ZZ][ZZ] - pos[k][ZZ];
 +                        }
 +                    }
 +                }
 +            }
 +            /* This could be done more efficiently */
 +            ind[d] = 0;
 +            while(ind[d]+1 < dd->nc[d] && pos_d >= ma->cell_x[d][ind[d]+1])
 +            {
 +                ind[d]++;
 +            }
 +        }
 +        i = dd_index(dd->nc,ind);
 +        if (ma->ncg[i] == tmp_nalloc[i])
 +        {
 +            tmp_nalloc[i] = over_alloc_large(ma->ncg[i]+1);
 +            srenew(tmp_ind[i],tmp_nalloc[i]);
 +        }
 +        tmp_ind[i][ma->ncg[i]] = icg;
 +        ma->ncg[i]++;
 +        ma->nat[i] += cgindex[icg+1] - cgindex[icg];
 +    }
 +    
 +    k1 = 0;
 +    for(i=0; i<dd->nnodes; i++)
 +    {
 +        ma->index[i] = k1;
 +        for(k=0; k<ma->ncg[i]; k++)
 +        {
 +            ma->cg[k1++] = tmp_ind[i][k];
 +        }
 +    }
 +    ma->index[dd->nnodes] = k1;
 +    
 +    for(i=0; i<dd->nnodes; i++)
 +    {
 +        sfree(tmp_ind[i]);
 +    }
 +    sfree(tmp_ind);
 +    sfree(tmp_nalloc);
 +    
 +    if (fplog)
 +    {
 +        char buf[22];
 +        fprintf(fplog,"Charge group distribution at step %s:",
 +                gmx_step_str(step,buf));
 +        for(i=0; i<dd->nnodes; i++)
 +        {
 +            fprintf(fplog," %d",ma->ncg[i]);
 +        }
 +        fprintf(fplog,"\n");
 +    }
 +}
 +
 +static void get_cg_distribution(FILE *fplog,gmx_large_int_t step,gmx_domdec_t *dd,
 +                                t_block *cgs,matrix box,gmx_ddbox_t *ddbox,
 +                                rvec pos[])
 +{
 +    gmx_domdec_master_t *ma=NULL;
 +    ivec npulse;
 +    int  i,cg_gl;
 +    int  *ibuf,buf2[2] = { 0, 0 };
 +    gmx_bool bMaster = DDMASTER(dd);
 +    if (bMaster)
 +    {
 +        ma = dd->ma;
 +        
 +        if (dd->bScrewPBC)
 +        {
 +            check_screw_box(box);
 +        }
 +    
 +        set_dd_cell_sizes_slb(dd,ddbox,TRUE,npulse);
 +    
 +        distribute_cg(fplog,step,box,ddbox->tric_dir,cgs,pos,dd);
 +        for(i=0; i<dd->nnodes; i++)
 +        {
 +            ma->ibuf[2*i]   = ma->ncg[i];
 +            ma->ibuf[2*i+1] = ma->nat[i];
 +        }
 +        ibuf = ma->ibuf;
 +    }
 +    else
 +    {
 +        ibuf = NULL;
 +    }
 +    dd_scatter(dd,2*sizeof(int),ibuf,buf2);
 +    
 +    dd->ncg_home = buf2[0];
 +    dd->nat_home = buf2[1];
 +    dd->ncg_tot  = dd->ncg_home;
 +    dd->nat_tot  = dd->nat_home;
 +    if (dd->ncg_home > dd->cg_nalloc || dd->cg_nalloc == 0)
 +    {
 +        dd->cg_nalloc = over_alloc_dd(dd->ncg_home);
 +        srenew(dd->index_gl,dd->cg_nalloc);
 +        srenew(dd->cgindex,dd->cg_nalloc+1);
 +    }
 +    if (bMaster)
 +    {
 +        for(i=0; i<dd->nnodes; i++)
 +        {
 +            ma->ibuf[i] = ma->ncg[i]*sizeof(int);
 +            ma->ibuf[dd->nnodes+i] = ma->index[i]*sizeof(int);
 +        }
 +    }
 +    
 +    dd_scatterv(dd,
 +                DDMASTER(dd) ? ma->ibuf : NULL,
 +                DDMASTER(dd) ? ma->ibuf+dd->nnodes : NULL,
 +                DDMASTER(dd) ? ma->cg : NULL,
 +                dd->ncg_home*sizeof(int),dd->index_gl);
 +    
 +    /* Determine the home charge group sizes */
 +    dd->cgindex[0] = 0;
 +    for(i=0; i<dd->ncg_home; i++)
 +    {
 +        cg_gl = dd->index_gl[i];
 +        dd->cgindex[i+1] =
 +            dd->cgindex[i] + cgs->index[cg_gl+1] - cgs->index[cg_gl];
 +    }
 +    
 +    if (debug)
 +    {
 +        fprintf(debug,"Home charge groups:\n");
 +        for(i=0; i<dd->ncg_home; i++)
 +        {
 +            fprintf(debug," %d",dd->index_gl[i]);
 +            if (i % 10 == 9) 
 +                fprintf(debug,"\n");
 +        }
 +        fprintf(debug,"\n");
 +    }
 +}
 +
 +static int compact_and_copy_vec_at(int ncg,int *move,
 +                                   int *cgindex,
 +                                   int nvec,int vec,
 +                                   rvec *src,gmx_domdec_comm_t *comm,
 +                                   gmx_bool bCompact)
 +{
 +    int m,icg,i,i0,i1,nrcg;
 +    int home_pos;
 +    int pos_vec[DIM*2];
 +    
 +    home_pos = 0;
 +
 +    for(m=0; m<DIM*2; m++)
 +    {
 +        pos_vec[m] = 0;
 +    }
 +    
 +    i0 = 0;
 +    for(icg=0; icg<ncg; icg++)
 +    {
 +        i1 = cgindex[icg+1];
 +        m = move[icg];
 +        if (m == -1)
 +        {
 +            if (bCompact)
 +            {
 +                /* Compact the home array in place */
 +                for(i=i0; i<i1; i++)
 +                {
 +                    copy_rvec(src[i],src[home_pos++]);
 +                }
 +            }
 +        }
 +        else
 +        {
 +            /* Copy to the communication buffer */
 +            nrcg = i1 - i0;
 +            pos_vec[m] += 1 + vec*nrcg;
 +            for(i=i0; i<i1; i++)
 +            {
 +                copy_rvec(src[i],comm->cgcm_state[m][pos_vec[m]++]);
 +            }
 +            pos_vec[m] += (nvec - vec - 1)*nrcg;
 +        }
 +        if (!bCompact)
 +        {
 +            home_pos += i1 - i0;
 +        }
 +        i0 = i1;
 +    }
 +    
 +    return home_pos;
 +}
 +
 +static int compact_and_copy_vec_cg(int ncg,int *move,
 +                                   int *cgindex,
 +                                   int nvec,rvec *src,gmx_domdec_comm_t *comm,
 +                                   gmx_bool bCompact)
 +{
 +    int m,icg,i0,i1,nrcg;
 +    int home_pos;
 +    int pos_vec[DIM*2];
 +    
 +    home_pos = 0;
 +    
 +    for(m=0; m<DIM*2; m++)
 +    {
 +        pos_vec[m] = 0;
 +    }
 +    
 +    i0 = 0;
 +    for(icg=0; icg<ncg; icg++)
 +    {
 +        i1 = cgindex[icg+1];
 +        m = move[icg];
 +        if (m == -1)
 +        {
 +            if (bCompact)
 +            {
 +                /* Compact the home array in place */
 +                copy_rvec(src[icg],src[home_pos++]);
 +            }
 +        }
 +        else
 +        {
 +            nrcg = i1 - i0;
 +            /* Copy to the communication buffer */
 +            copy_rvec(src[icg],comm->cgcm_state[m][pos_vec[m]]);
 +            pos_vec[m] += 1 + nrcg*nvec;
 +        }
 +        i0 = i1;
 +    }
 +    if (!bCompact)
 +    {
 +        home_pos = ncg;
 +    }
 +    
 +    return home_pos;
 +}
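 +/* Buffer layout note: for each moved charge group, cgcm_state[m] holds one
 + * rvec with the center of geometry followed by nvec blocks of nrcg rvecs
 + * (x, and optionally v, sd_X and cg_p), which is why the functions above
 + * advance pos_vec[m] by 1 + vec*nrcg before, and (nvec - vec - 1)*nrcg
 + * after, copying a block.
 + */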
 +
 +static int compact_ind(int ncg,int *move,
 +                       int *index_gl,int *cgindex,
 +                       int *gatindex,
 +                       gmx_ga2la_t ga2la,char *bLocalCG,
 +                       int *cginfo)
 +{
 +    int cg,nat,a0,a1,a,a_gl;
 +    int home_pos;
 +
 +    home_pos = 0;
 +    nat = 0;
 +    for(cg=0; cg<ncg; cg++)
 +    {
 +        a0 = cgindex[cg];
 +        a1 = cgindex[cg+1];
 +        if (move[cg] == -1)
 +        {
 +            /* Compact the home arrays in place.
 +             * Anything that can be done here avoids access to global arrays.
 +             */
 +            cgindex[home_pos] = nat;
 +            for(a=a0; a<a1; a++)
 +            {
 +                a_gl = gatindex[a];
 +                gatindex[nat] = a_gl;
 +                /* The cell number stays 0, so we don't need to set it */
 +                ga2la_change_la(ga2la,a_gl,nat);
 +                nat++;
 +            }
 +            index_gl[home_pos] = index_gl[cg];
 +            cginfo[home_pos]   = cginfo[cg];
 +            /* The charge group remains local, so bLocalCG does not change */
 +            home_pos++;
 +        }
 +        else
 +        {
 +            /* Clear the global indices */
 +            for(a=a0; a<a1; a++)
 +            {
 +                ga2la_del(ga2la,gatindex[a]);
 +            }
 +            if (bLocalCG)
 +            {
 +                bLocalCG[index_gl[cg]] = FALSE;
 +            }
 +        }
 +    }
 +    cgindex[home_pos] = nat;
 +    
 +    return home_pos;
 +}
 +
 +static void clear_and_mark_ind(int ncg,int *move,
 +                               int *index_gl,int *cgindex,int *gatindex,
 +                               gmx_ga2la_t ga2la,char *bLocalCG,
 +                               int *cell_index)
 +{
 +    int cg,a0,a1,a;
 +    
 +    for(cg=0; cg<ncg; cg++)
 +    {
 +        if (move[cg] >= 0)
 +        {
 +            a0 = cgindex[cg];
 +            a1 = cgindex[cg+1];
 +            /* Clear the global indices */
 +            for(a=a0; a<a1; a++)
 +            {
 +                ga2la_del(ga2la,gatindex[a]);
 +            }
 +            if (bLocalCG)
 +            {
 +                bLocalCG[index_gl[cg]] = FALSE;
 +            }
 +            /* Signal that this cg has moved using the ns cell index.
 +             * Here we set it to -1. fill_grid will change it
 +             * from -1 to NSGRID_SIGNAL_MOVED_FAC*grid->ncells.
 +             */
 +            cell_index[cg] = -1;
 +        }
 +    }
 +}
 +
 +static void print_cg_move(FILE *fplog,
 +                          gmx_domdec_t *dd,
 +                          gmx_large_int_t step,int cg,int dim,int dir,
 +                          gmx_bool bHaveLimitdAndCMOld,real limitd,
 +                          rvec cm_old,rvec cm_new,real pos_d)
 +{
 +    gmx_domdec_comm_t *comm;
 +    char buf[22];
 +
 +    comm = dd->comm;
 +
 +    fprintf(fplog,"\nStep %s:\n",gmx_step_str(step,buf));
 +    if (bHaveLimitdAndCMOld)
 +    {
 +        fprintf(fplog,"The charge group starting at atom %d moved more than the distance allowed by the domain decomposition (%f) in direction %c\n",
 +                ddglatnr(dd,dd->cgindex[cg]),limitd,dim2char(dim));
 +    }
 +    else
 +    {
 +        fprintf(fplog,"The charge group starting at atom %d moved more than the distance allowed by the domain decomposition in direction %c\n",
 +                ddglatnr(dd,dd->cgindex[cg]),dim2char(dim));
 +    }
 +    fprintf(fplog,"distance out of cell %f\n",
 +            dir==1 ? pos_d - comm->cell_x1[dim] : pos_d - comm->cell_x0[dim]);
 +    if (bHaveLimitdAndCMOld)
 +    {
 +        fprintf(fplog,"Old coordinates: %8.3f %8.3f %8.3f\n",
 +                cm_old[XX],cm_old[YY],cm_old[ZZ]);
 +    }
 +    fprintf(fplog,"New coordinates: %8.3f %8.3f %8.3f\n",
 +            cm_new[XX],cm_new[YY],cm_new[ZZ]);
 +    fprintf(fplog,"Old cell boundaries in direction %c: %8.3f %8.3f\n",
 +            dim2char(dim),
 +            comm->old_cell_x0[dim],comm->old_cell_x1[dim]);
 +    fprintf(fplog,"New cell boundaries in direction %c: %8.3f %8.3f\n",
 +            dim2char(dim),
 +            comm->cell_x0[dim],comm->cell_x1[dim]);
 +}
 +
 +static void cg_move_error(FILE *fplog,
 +                          gmx_domdec_t *dd,
 +                          gmx_large_int_t step,int cg,int dim,int dir,
 +                          gmx_bool bHaveLimitdAndCMOld,real limitd,
 +                          rvec cm_old,rvec cm_new,real pos_d)
 +{
 +    if (fplog)
 +    {
 +        print_cg_move(fplog, dd,step,cg,dim,dir,
 +                      bHaveLimitdAndCMOld,limitd,cm_old,cm_new,pos_d);
 +    }
 +    print_cg_move(stderr,dd,step,cg,dim,dir,
 +                  bHaveLimitdAndCMOld,limitd,cm_old,cm_new,pos_d);
 +    gmx_fatal(FARGS,
 +              "A charge group moved too far between two domain decomposition steps\n"
 +              "This usually means that your system is not well equilibrated");
 +}
 +
 +static void rotate_state_atom(t_state *state,int a)
 +{
 +    int est;
 +
 +    for(est=0; est<estNR; est++)
 +    {
 +        if (EST_DISTR(est) && (state->flags & (1<<est))) {
 +            switch (est) {
 +            case estX:
 +                /* Rotate the complete state; for a rectangular box only */
 +                state->x[a][YY] = state->box[YY][YY] - state->x[a][YY];
 +                state->x[a][ZZ] = state->box[ZZ][ZZ] - state->x[a][ZZ];
 +                break;
 +            case estV:
 +                state->v[a][YY] = -state->v[a][YY];
 +                state->v[a][ZZ] = -state->v[a][ZZ];
 +                break;
 +            case estSDX:
 +                state->sd_X[a][YY] = -state->sd_X[a][YY];
 +                state->sd_X[a][ZZ] = -state->sd_X[a][ZZ];
 +                break;
 +            case estCGP:
 +                state->cg_p[a][YY] = -state->cg_p[a][YY];
 +                state->cg_p[a][ZZ] = -state->cg_p[a][ZZ];
 +                break;
 +            case estDISRE_INITF:
 +            case estDISRE_RM3TAV:
 +            case estORIRE_INITF:
 +            case estORIRE_DTAV:
 +                /* These are distances, so not affected by rotation */
 +                break;
 +            default:
 +                gmx_incons("Unknown state entry encountered in rotate_state_atom");            
 +            }
 +        }
 +    }
 +}
 +
 +static int *get_moved(gmx_domdec_comm_t *comm,int natoms)
 +{
 +    if (natoms > comm->moved_nalloc)
 +    {
 +        /* Contents should be preserved here */
 +        comm->moved_nalloc = over_alloc_dd(natoms);
 +        srenew(comm->moved,comm->moved_nalloc);
 +    }
 +
 +    return comm->moved;
 +}
 +
 +static void calc_cg_move(FILE *fplog,gmx_large_int_t step,
 +                         gmx_domdec_t *dd,
 +                         t_state *state,
 +                         ivec tric_dir,matrix tcm,
 +                         rvec cell_x0,rvec cell_x1,
 +                         rvec limitd,rvec limit0,rvec limit1,
 +                         const int *cgindex,
 +                         int cg_start,int cg_end,
 +                         rvec *cg_cm,
 +                         int *move)
 +{
 +    int  npbcdim;
 +    int  c,i,cg,k,k0,k1,d,dim,dim2,dir,d2,d3,d4,cell_d;
 +    int  mc,cdd,nrcg,ncg_recv,nat_recv,nvs,nvr,nvec,vec;
 +    int  flag;
 +    gmx_bool bScrew;
 +    ivec dev;
 +    real inv_ncg,pos_d;
 +    rvec cm_new;
 +
 +    npbcdim = dd->npbcdim;
 +
 +    for(cg=cg_start; cg<cg_end; cg++)
 +    {
 +        k0   = cgindex[cg];
 +        k1   = cgindex[cg+1];
 +        nrcg = k1 - k0;
 +        if (nrcg == 1)
 +        {
 +            copy_rvec(state->x[k0],cm_new);
 +        }
 +        else
 +        {
 +            inv_ncg = 1.0/nrcg;
 +            
 +            clear_rvec(cm_new);
 +            for(k=k0; (k<k1); k++)
 +            {
 +                rvec_inc(cm_new,state->x[k]);
 +            }
 +            for(d=0; (d<DIM); d++)
 +            {
 +                cm_new[d] = inv_ncg*cm_new[d];
 +            }
 +        }
 +        
 +        clear_ivec(dev);
 +        /* Do pbc and check DD cell boundary crossings */
 +        for(d=DIM-1; d>=0; d--)
 +        {
 +            if (dd->nc[d] > 1)
 +            {
 +                bScrew = (dd->bScrewPBC && d == XX);
 +                /* Determine the location of this cg in lattice coordinates */
 +                pos_d = cm_new[d];
 +                if (tric_dir[d])
 +                {
 +                    for(d2=d+1; d2<DIM; d2++)
 +                    {
 +                        pos_d += cm_new[d2]*tcm[d2][d];
 +                    }
 +                }
 +                /* Put the charge group in the triclinic unit-cell */
 +                if (pos_d >= cell_x1[d])
 +                {
 +                    if (pos_d >= limit1[d])
 +                    {
 +                        cg_move_error(fplog,dd,step,cg,d,1,TRUE,limitd[d],
 +                                      cg_cm[cg],cm_new,pos_d);
 +                    }
 +                    dev[d] = 1;
 +                    if (dd->ci[d] == dd->nc[d] - 1)
 +                    {
 +                        rvec_dec(cm_new,state->box[d]);
 +                        if (bScrew)
 +                        {
 +                            cm_new[YY] = state->box[YY][YY] - cm_new[YY];
 +                            cm_new[ZZ] = state->box[ZZ][ZZ] - cm_new[ZZ];
 +                        }
 +                        for(k=k0; (k<k1); k++)
 +                        {
 +                            rvec_dec(state->x[k],state->box[d]);
 +                            if (bScrew)
 +                            {
 +                                rotate_state_atom(state,k);
 +                            }
 +                        }
 +                    }
 +                }
 +                else if (pos_d < cell_x0[d])
 +                {
 +                    if (pos_d < limit0[d])
 +                    {
 +                        cg_move_error(fplog,dd,step,cg,d,-1,TRUE,limitd[d],
 +                                      cg_cm[cg],cm_new,pos_d);
 +                    }
 +                    dev[d] = -1;
 +                    if (dd->ci[d] == 0)
 +                    {
 +                        rvec_inc(cm_new,state->box[d]);
 +                        if (bScrew)
 +                        {
 +                            cm_new[YY] = state->box[YY][YY] - cm_new[YY];
 +                            cm_new[ZZ] = state->box[ZZ][ZZ] - cm_new[ZZ];
 +                        }
 +                        for(k=k0; (k<k1); k++)
 +                        {
 +                            rvec_inc(state->x[k],state->box[d]);
 +                            if (bScrew)
 +                            {
 +                                rotate_state_atom(state,k);
 +                            }
 +                        }
 +                    }
 +                }
 +            }
 +            else if (d < npbcdim)
 +            {
 +                /* Put the charge group in the rectangular unit-cell */
 +                while (cm_new[d] >= state->box[d][d])
 +                {
 +                    rvec_dec(cm_new,state->box[d]);
 +                    for(k=k0; (k<k1); k++)
 +                    {
 +                        rvec_dec(state->x[k],state->box[d]);
 +                    }
 +                }
 +                while (cm_new[d] < 0)
 +                {
 +                    rvec_inc(cm_new,state->box[d]);
 +                    for(k=k0; (k<k1); k++)
 +                    {
 +                        rvec_inc(state->x[k],state->box[d]);
 +                    }
 +                }
 +            }
 +        }
 +    
 +        copy_rvec(cm_new,cg_cm[cg]);
 +        
 +        /* Determine where this cg should go */
 +        flag = 0;
 +        mc = -1;
 +        for(d=0; d<dd->ndim; d++)
 +        {
 +            dim = dd->dim[d];
 +            if (dev[dim] == 1)
 +            {
 +                flag |= DD_FLAG_FW(d);
 +                if (mc == -1)
 +                {
 +                    mc = d*2;
 +                }
 +            }
 +            else if (dev[dim] == -1)
 +            {
 +                flag |= DD_FLAG_BW(d);
 +                if (mc == -1) {
 +                    if (dd->nc[dim] > 2)
 +                    {
 +                        mc = d*2 + 1;
 +                    }
 +                    else
 +                    {
 +                        mc = d*2;
 +                    }
 +                }
 +            }
 +        }
 +        /* Temporarily store the flag in move */
 +        move[cg] = mc + flag;
 +    }
 +}
 +
 +static void dd_redistribute_cg(FILE *fplog,gmx_large_int_t step,
 +                               gmx_domdec_t *dd,ivec tric_dir,
 +                               t_state *state,rvec **f,
 +                               t_forcerec *fr,t_mdatoms *md,
 +                               gmx_bool bCompact,
 +                               t_nrnb *nrnb,
 +                               int *ncg_stay_home,
 +                               int *ncg_moved)
 +{
 +    int  *move;
 +    int  npbcdim;
 +    int  ncg[DIM*2],nat[DIM*2];
 +    int  c,i,cg,k,k0,k1,d,dim,dim2,dir,d2,d3,d4,cell_d;
 +    int  mc,cdd,nrcg,ncg_recv,nat_recv,nvs,nvr,nvec,vec;
 +    int  sbuf[2],rbuf[2];
 +    int  home_pos_cg,home_pos_at,buf_pos;
 +    int  flag;
 +    gmx_bool bV=FALSE,bSDX=FALSE,bCGP=FALSE;
 +    gmx_bool bScrew;
 +    ivec dev;
 +    real inv_ncg,pos_d;
 +    matrix tcm;
 +    rvec *cg_cm=NULL,cell_x0,cell_x1,limitd,limit0,limit1,cm_new;
 +    atom_id *cgindex;
 +    cginfo_mb_t *cginfo_mb;
 +    gmx_domdec_comm_t *comm;
 +    int  *moved;
 +    int  nthread,thread;
 +    
 +    if (dd->bScrewPBC)
 +    {
 +        check_screw_box(state->box);
 +    }
 +    
 +    comm  = dd->comm;
 +    if (fr->cutoff_scheme == ecutsGROUP)
 +    {
 +        cg_cm = fr->cg_cm;
 +    }
 +    
 +    for(i=0; i<estNR; i++)
 +    {
 +        if (EST_DISTR(i))
 +        {
 +            switch (i)
 +            {
 +            case estX:   /* Always present */            break;
 +            case estV:   bV   = (state->flags & (1<<i)); break;
 +            case estSDX: bSDX = (state->flags & (1<<i)); break;
 +            case estCGP: bCGP = (state->flags & (1<<i)); break;
 +            case estLD_RNG:
 +            case estLD_RNGI:
 +            case estDISRE_INITF:
 +            case estDISRE_RM3TAV:
 +            case estORIRE_INITF:
 +            case estORIRE_DTAV:
 +                /* No processing required */
 +                break;
 +            default:
 +            gmx_incons("Unknown state entry encountered in dd_redistribute_cg");
 +            }
 +        }
 +    }
 +    
 +    if (dd->ncg_tot > comm->nalloc_int)
 +    {
 +        comm->nalloc_int = over_alloc_dd(dd->ncg_tot);
 +        srenew(comm->buf_int,comm->nalloc_int);
 +    }
 +    move = comm->buf_int;
 +    
 +    /* Clear the count */
 +    for(c=0; c<dd->ndim*2; c++)
 +    {
 +        ncg[c] = 0;
 +        nat[c] = 0;
 +    }
 +
 +    npbcdim = dd->npbcdim;
 +
 +    for(d=0; (d<DIM); d++)
 +    {
 +        limitd[d] = dd->comm->cellsize_min[d];
 +        if (d >= npbcdim && dd->ci[d] == 0)
 +        {
 +            cell_x0[d] = -GMX_FLOAT_MAX;
 +        }
 +        else
 +        {
 +            cell_x0[d] = comm->cell_x0[d];
 +        }
 +        if (d >= npbcdim && dd->ci[d] == dd->nc[d] - 1)
 +        {
 +            cell_x1[d] = GMX_FLOAT_MAX;
 +        }
 +        else
 +        {
 +            cell_x1[d] = comm->cell_x1[d];
 +        }
 +        if (d < npbcdim)
 +        {
 +            limit0[d] = comm->old_cell_x0[d] - limitd[d];
 +            limit1[d] = comm->old_cell_x1[d] + limitd[d];
 +        }
 +        else
 +        {
 +            /* We check after communication if a charge group moved
 +             * more than one cell. Set the pre-comm check limit to float_max.
 +             */
 +            limit0[d] = -GMX_FLOAT_MAX;
 +            limit1[d] =  GMX_FLOAT_MAX;
 +        }
 +    }
 +    
 +    make_tric_corr_matrix(npbcdim,state->box,tcm);
 +    
 +    cgindex = dd->cgindex;
 +
 +    nthread = gmx_omp_nthreads_get(emntDomdec);
 +
 +    /* Compute the center of geometry for all home charge groups
 +     * and put them in the box and determine where they should go.
 +     */
 +#pragma omp parallel for num_threads(nthread) schedule(static)
 +    for(thread=0; thread<nthread; thread++)
 +    {
 +        calc_cg_move(fplog,step,dd,state,tric_dir,tcm,
 +                     cell_x0,cell_x1,limitd,limit0,limit1,
 +                     cgindex,
 +                     ( thread   *dd->ncg_home)/nthread,
 +                     ((thread+1)*dd->ncg_home)/nthread,
 +                     fr->cutoff_scheme==ecutsGROUP ? cg_cm : state->x,
 +                     move);
 +    }
 +
 +    for(cg=0; cg<dd->ncg_home; cg++)
 +    {
 +        if (move[cg] >= 0)
 +        {
 +            mc = move[cg];
 +            flag     = mc & ~DD_FLAG_NRCG;
 +            mc       = mc & DD_FLAG_NRCG;
 +            move[cg] = mc;
 +
 +            if (ncg[mc]+1 > comm->cggl_flag_nalloc[mc])
 +            {
 +                comm->cggl_flag_nalloc[mc] = over_alloc_dd(ncg[mc]+1);
 +                srenew(comm->cggl_flag[mc],comm->cggl_flag_nalloc[mc]*DD_CGIBS);
 +            }
 +            comm->cggl_flag[mc][ncg[mc]*DD_CGIBS  ] = dd->index_gl[cg];
 +            /* We store the cg size in the lower 16 bits
 +             * and the place where the charge group should go
 +             * in the next 6 bits. This saves some communication volume.
 +             */
 +            nrcg = cgindex[cg+1] - cgindex[cg];
 +            comm->cggl_flag[mc][ncg[mc]*DD_CGIBS+1] = nrcg | flag;
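 +            /* Packing example (assuming DD_FLAG_FW(0) == 1<<16, i.e. the
 +             * flag bits sit above the 16 size bits): a 3-atom charge group
 +             * moving forward along the first decomposition dimension is
 +             * stored as 3 | (1<<16).
 +             */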
 +            ncg[mc] += 1;
 +            nat[mc] += nrcg;
 +        }
 +    }
 +    
 +    inc_nrnb(nrnb,eNR_CGCM,dd->nat_home);
 +    inc_nrnb(nrnb,eNR_RESETX,dd->ncg_home);
 +
 +    *ncg_moved = 0;
 +    for(i=0; i<dd->ndim*2; i++)
 +    {
 +        *ncg_moved += ncg[i];
 +    }
 +    
 +    nvec = 1;
 +    if (bV)
 +    {
 +        nvec++;
 +    }
 +    if (bSDX)
 +    {
 +        nvec++;
 +    }
 +    if (bCGP)
 +    {
 +        nvec++;
 +    }
 +    
 +    /* Make sure the communication buffers are large enough */
 +    for(mc=0; mc<dd->ndim*2; mc++)
 +    {
 +        nvr = ncg[mc] + nat[mc]*nvec;
 +        if (nvr > comm->cgcm_state_nalloc[mc])
 +        {
 +            comm->cgcm_state_nalloc[mc] = over_alloc_dd(nvr);
 +            srenew(comm->cgcm_state[mc],comm->cgcm_state_nalloc[mc]);
 +        }
 +    }
 +    
 +    switch (fr->cutoff_scheme)
 +    {
 +    case ecutsGROUP:
 +        /* Recalculating cg_cm might be cheaper than communicating,
 +         * but that could give rise to rounding issues.
 +         */
 +        home_pos_cg =
 +            compact_and_copy_vec_cg(dd->ncg_home,move,cgindex,
 +                                    nvec,cg_cm,comm,bCompact);
 +        break;
 +    case ecutsVERLET:
 +        /* Without charge groups we send the moved atom coordinates
 +         * over twice. This is so the code below can be used without
 +         * many conditionals both with and without charge groups.
 +         */
 +        home_pos_cg =
 +            compact_and_copy_vec_cg(dd->ncg_home,move,cgindex,
 +                                    nvec,state->x,comm,FALSE);
 +        if (bCompact)
 +        {
 +            home_pos_cg -= *ncg_moved;
 +        }
 +        break;
 +    default:
 +        gmx_incons("unimplemented");
 +        home_pos_cg = 0;
 +    }
 +    
 +    vec = 0;
 +    home_pos_at =
 +        compact_and_copy_vec_at(dd->ncg_home,move,cgindex,
 +                                nvec,vec++,state->x,comm,bCompact);
 +    if (bV)
 +    {
 +        compact_and_copy_vec_at(dd->ncg_home,move,cgindex,
 +                                nvec,vec++,state->v,comm,bCompact);
 +    }
 +    if (bSDX)
 +    {
 +        compact_and_copy_vec_at(dd->ncg_home,move,cgindex,
 +                                nvec,vec++,state->sd_X,comm,bCompact);
 +    }
 +    if (bCGP)
 +    {
 +        compact_and_copy_vec_at(dd->ncg_home,move,cgindex,
 +                                nvec,vec++,state->cg_p,comm,bCompact);
 +    }
 +    
 +    if (bCompact)
 +    {
 +        compact_ind(dd->ncg_home,move,
 +                    dd->index_gl,dd->cgindex,dd->gatindex,
 +                    dd->ga2la,comm->bLocalCG,
 +                    fr->cginfo);
 +    }
 +    else
 +    {
 +        if (fr->cutoff_scheme == ecutsVERLET)
 +        {
 +            moved = get_moved(comm,dd->ncg_home);
 +
 +            for(k=0; k<dd->ncg_home; k++)
 +            {
 +                moved[k] = 0;
 +            }
 +        }
 +        else
 +        {
 +            moved = fr->ns.grid->cell_index;
 +        }
 +
 +        clear_and_mark_ind(dd->ncg_home,move,
 +                           dd->index_gl,dd->cgindex,dd->gatindex,
 +                           dd->ga2la,comm->bLocalCG,
 +                           moved);
 +    }
 +    
 +    cginfo_mb = fr->cginfo_mb;
 +
 +    *ncg_stay_home = home_pos_cg;
 +    for(d=0; d<dd->ndim; d++)
 +    {
 +        dim = dd->dim[d];
 +        ncg_recv = 0;
 +        nat_recv = 0;
 +        nvr      = 0;
 +        for(dir=0; dir<(dd->nc[dim]==2 ? 1 : 2); dir++)
 +        {
 +            cdd = d*2 + dir;
 +            /* Communicate the cg and atom counts */
 +            sbuf[0] = ncg[cdd];
 +            sbuf[1] = nat[cdd];
 +            if (debug)
 +            {
 +                fprintf(debug,"Sending ddim %d dir %d: ncg %d nat %d\n",
 +                        d,dir,sbuf[0],sbuf[1]);
 +            }
 +            dd_sendrecv_int(dd, d, dir, sbuf, 2, rbuf, 2);
 +            
 +            if ((ncg_recv+rbuf[0])*DD_CGIBS > comm->nalloc_int)
 +            {
 +                comm->nalloc_int = over_alloc_dd((ncg_recv+rbuf[0])*DD_CGIBS);
 +                srenew(comm->buf_int,comm->nalloc_int);
 +            }
 +            
 +            /* Communicate the charge group indices, sizes and flags */
 +            dd_sendrecv_int(dd, d, dir,
 +                            comm->cggl_flag[cdd], sbuf[0]*DD_CGIBS,
 +                            comm->buf_int+ncg_recv*DD_CGIBS, rbuf[0]*DD_CGIBS);
 +            
 +            nvs = ncg[cdd] + nat[cdd]*nvec;
 +            i   = rbuf[0]  + rbuf[1] *nvec;
 +            vec_rvec_check_alloc(&comm->vbuf,nvr+i);
 +            
 +            /* Communicate cgcm and state */
 +            dd_sendrecv_rvec(dd, d, dir,
 +                             comm->cgcm_state[cdd], nvs,
 +                             comm->vbuf.v+nvr, i);
 +            ncg_recv += rbuf[0];
 +            nat_recv += rbuf[1];
 +            nvr      += i;
 +        }
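 +
 +        /* Illustrative example (numbers not from the original source):
 +         * with nvec=2 (x and v) and two received charge groups of three
 +         * atoms each, the rvec buffer holds 2 + 6*2 = 14 entries, laid
 +         * out per charge group as [cg_cm][x of its atoms][v of its atoms].
 +         */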
 +        
 +        /* Process the received charge groups */
 +        buf_pos = 0;
 +        for(cg=0; cg<ncg_recv; cg++)
 +        {
 +            flag = comm->buf_int[cg*DD_CGIBS+1];
 +
 +            if (dim >= npbcdim && dd->nc[dim] > 2)
 +            {
 +                /* No pbc in this dim and more than one domain boundary.
 +                 * We do a separate check to ensure a charge group did not move too far.
 +                 */
 +                if (((flag & DD_FLAG_FW(d)) &&
 +                     comm->vbuf.v[buf_pos][dim] > cell_x1[dim]) ||
 +                    ((flag & DD_FLAG_BW(d)) &&
 +                     comm->vbuf.v[buf_pos][dim] < cell_x0[dim]))
 +                {
 +                    cg_move_error(fplog,dd,step,cg,dim,
 +                                  (flag & DD_FLAG_FW(d)) ? 1 : 0,
 +                                   FALSE,0,
 +                                   comm->vbuf.v[buf_pos],
 +                                   comm->vbuf.v[buf_pos],
 +                                   comm->vbuf.v[buf_pos][dim]);
 +                }
 +            }
 +
 +            mc = -1;
 +            if (d < dd->ndim-1)
 +            {
 +                /* Check which direction this cg should go */
 +                for(d2=d+1; (d2<dd->ndim && mc==-1); d2++)
 +                {
 +                    if (dd->bGridJump)
 +                    {
 +                        /* The cell boundaries for dimension d2 are not equal
 +                         * for each cell row of the lower dimension(s),
 +                         * therefore we might need to redetermine where
 +                         * this cg should go.
 +                         */
 +                        dim2 = dd->dim[d2];
 +                        /* If this cg crosses the box boundary in dimension d2
 +                         * we can use the communicated flag, so we do not
 +                         * have to worry about pbc.
 +                         */
 +                        if (!((dd->ci[dim2] == dd->nc[dim2]-1 &&
 +                               (flag & DD_FLAG_FW(d2))) ||
 +                              (dd->ci[dim2] == 0 &&
 +                               (flag & DD_FLAG_BW(d2)))))
 +                        {
 +                            /* Clear the two flags for this dimension */
 +                            flag &= ~(DD_FLAG_FW(d2) | DD_FLAG_BW(d2));
 +                            /* Determine the location of this cg
 +                             * in lattice coordinates
 +                             */
 +                            pos_d = comm->vbuf.v[buf_pos][dim2];
 +                            if (tric_dir[dim2])
 +                            {
 +                                for(d3=dim2+1; d3<DIM; d3++)
 +                                {
 +                                    pos_d +=
 +                                        comm->vbuf.v[buf_pos][d3]*tcm[d3][dim2];
 +                                }
 +                            }
 +                            /* Check if we are not at the box edge.
 +                             * pbc is only handled in the first step above,
 +                             * but this check could move over pbc while
 +                             * the first step did not due to different rounding.
 +                             */
 +                            if (pos_d >= cell_x1[dim2] &&
 +                                dd->ci[dim2] != dd->nc[dim2]-1)
 +                            {
 +                                flag |= DD_FLAG_FW(d2);
 +                            }
 +                            else if (pos_d < cell_x0[dim2] &&
 +                                     dd->ci[dim2] != 0)
 +                            {
 +                                flag |= DD_FLAG_BW(d2);
 +                            }
 +                            comm->buf_int[cg*DD_CGIBS+1] = flag;
 +                        }
 +                    }
 +                    /* Set to which neighboring cell this cg should go */
 +                    if (flag & DD_FLAG_FW(d2))
 +                    {
 +                        mc = d2*2;
 +                    }
 +                    else if (flag & DD_FLAG_BW(d2))
 +                    {
 +                        if (dd->nc[dd->dim[d2]] > 2)
 +                        {
 +                            mc = d2*2+1;
 +                        }
 +                        else
 +                        {
 +                            mc = d2*2;
 +                        }
 +                    }
 +                }
 +            }
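 +
 +            /* Recap of the mc encoding above (comment added for clarity):
 +             * mc = d2*2 selects the forward neighbor in DD dimension d2
 +             * and mc = d2*2+1 the backward one; with only two cells in a
 +             * dimension forward and backward are the same rank, so the
 +             * forward slot d2*2 is used for both.
 +             */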
 +            
 +            nrcg = flag & DD_FLAG_NRCG;
 +            if (mc == -1)
 +            {
 +                if (home_pos_cg+1 > dd->cg_nalloc)
 +                {
 +                    dd->cg_nalloc = over_alloc_dd(home_pos_cg+1);
 +                    srenew(dd->index_gl,dd->cg_nalloc);
 +                    srenew(dd->cgindex,dd->cg_nalloc+1);
 +                }
 +                /* Set the global charge group index and size */
 +                dd->index_gl[home_pos_cg] = comm->buf_int[cg*DD_CGIBS];
 +                dd->cgindex[home_pos_cg+1] = dd->cgindex[home_pos_cg] + nrcg;
 +                /* Copy the state from the buffer */
 +                dd_check_alloc_ncg(fr,state,f,home_pos_cg+1);
 +                if (fr->cutoff_scheme == ecutsGROUP)
 +                {
 +                    cg_cm = fr->cg_cm;
 +                    copy_rvec(comm->vbuf.v[buf_pos],cg_cm[home_pos_cg]);
 +                }
 +                buf_pos++;
 +
 +                /* Set the cginfo */
 +                fr->cginfo[home_pos_cg] = ddcginfo(cginfo_mb,
 +                                                   dd->index_gl[home_pos_cg]);
 +                if (comm->bLocalCG)
 +                {
 +                    comm->bLocalCG[dd->index_gl[home_pos_cg]] = TRUE;
 +                }
 +
 +                if (home_pos_at+nrcg > state->nalloc)
 +                {
 +                    dd_realloc_state(state,f,home_pos_at+nrcg);
 +                }
 +                for(i=0; i<nrcg; i++)
 +                {
 +                    copy_rvec(comm->vbuf.v[buf_pos++],
 +                              state->x[home_pos_at+i]);
 +                }
 +                if (bV)
 +                {
 +                    for(i=0; i<nrcg; i++)
 +                    {
 +                        copy_rvec(comm->vbuf.v[buf_pos++],
 +                                  state->v[home_pos_at+i]);
 +                    }
 +                }
 +                if (bSDX)
 +                {
 +                    for(i=0; i<nrcg; i++)
 +                    {
 +                        copy_rvec(comm->vbuf.v[buf_pos++],
 +                                  state->sd_X[home_pos_at+i]);
 +                    }
 +                }
 +                if (bCGP)
 +                {
 +                    for(i=0; i<nrcg; i++)
 +                    {
 +                        copy_rvec(comm->vbuf.v[buf_pos++],
 +                                  state->cg_p[home_pos_at+i]);
 +                    }
 +                }
 +                home_pos_cg += 1;
 +                home_pos_at += nrcg;
 +            }
 +            else
 +            {
 +                /* Reallocate the buffers if necessary  */
 +                if (ncg[mc]+1 > comm->cggl_flag_nalloc[mc])
 +                {
 +                    comm->cggl_flag_nalloc[mc] = over_alloc_dd(ncg[mc]+1);
 +                    srenew(comm->cggl_flag[mc],comm->cggl_flag_nalloc[mc]*DD_CGIBS);
 +                }
 +                nvr = ncg[mc] + nat[mc]*nvec;
 +                if (nvr + 1 + nrcg*nvec > comm->cgcm_state_nalloc[mc])
 +                {
 +                    comm->cgcm_state_nalloc[mc] = over_alloc_dd(nvr + 1 + nrcg*nvec);
 +                    srenew(comm->cgcm_state[mc],comm->cgcm_state_nalloc[mc]);
 +                }
 +                /* Copy from the receive to the send buffers */
 +                memcpy(comm->cggl_flag[mc] + ncg[mc]*DD_CGIBS,
 +                       comm->buf_int + cg*DD_CGIBS,
 +                       DD_CGIBS*sizeof(int));
 +                memcpy(comm->cgcm_state[mc][nvr],
 +                       comm->vbuf.v[buf_pos],
 +                       (1+nrcg*nvec)*sizeof(rvec));
 +                buf_pos += 1 + nrcg*nvec;
 +                ncg[mc] += 1;
 +                nat[mc] += nrcg;
 +            }
 +        }
 +    }
 +    
 +    /* With sorting (!bCompact) the indices are now only partially up to date
 +     * and ncg_home and nat_home are not the real count, since there are
 +     * "holes" in the arrays for the charge groups that moved to neighbors.
 +     */
 +    if (fr->cutoff_scheme == ecutsVERLET)
 +    {
 +        moved = get_moved(comm,home_pos_cg);
 +
 +        for(i=dd->ncg_home; i<home_pos_cg; i++)
 +        {
 +            moved[i] = 0;
 +        }
 +    }
 +    dd->ncg_home = home_pos_cg;
 +    dd->nat_home = home_pos_at;
 +
 +    if (debug)
 +    {
 +        fprintf(debug,
 +                "Finished repartitioning: cgs moved out %d, new home %d\n",
 +                *ncg_moved,dd->ncg_home-*ncg_moved);
 +    }
 +}
 +
 +void dd_cycles_add(gmx_domdec_t *dd,float cycles,int ddCycl)
 +{
 +    dd->comm->cycl[ddCycl] += cycles;
 +    dd->comm->cycl_n[ddCycl]++;
 +    if (cycles > dd->comm->cycl_max[ddCycl])
 +    {
 +        dd->comm->cycl_max[ddCycl] = cycles;
 +    }
 +}
 +
 +static double force_flop_count(t_nrnb *nrnb)
 +{
 +    int i;
 +    double sum;
 +    const char *name;
 +
 +    sum = 0;
 +    for(i=0; i<eNR_NBKERNEL_FREE_ENERGY; i++)
 +    {
 +        /* To get closer to the real timings, we halve the count
 +         * for the normal loops and halve it again for water loops.
 +         */
 +        name = nrnb_str(i);
 +        if (strstr(name,"W3") != NULL || strstr(name,"W4") != NULL)
 +        {
 +            sum += nrnb->n[i]*0.25*cost_nrnb(i);
 +        }
 +        else
 +        {
 +            sum += nrnb->n[i]*0.50*cost_nrnb(i);
 +        }
 +    }
 +    for(i=eNR_NBKERNEL_FREE_ENERGY; i<=eNR_NB14; i++)
 +    {
 +        name = nrnb_str(i);
 +        if (strstr(name,"W3") != NULL || strstr(name,"W4") != NULL)
 +        {
 +            sum += nrnb->n[i]*cost_nrnb(i);
 +        }
 +    }
 +    for(i=eNR_BONDS; i<=eNR_WALLS; i++)
 +    {
 +        sum += nrnb->n[i]*cost_nrnb(i);
 +    }
 +
 +    return sum;
 +}
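 +
 +/* Worked example of the weighting above (illustrative, not from the
 + * original source): a W3 water kernel invoked n = 1000 times with
 + * cost_nrnb(i) = 38 contributes 1000*0.25*38 = 9500 to the estimate,
 + * while a generic kernel contributes 1000*0.50*38 = 19000.
 + */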
 +
 +void dd_force_flop_start(gmx_domdec_t *dd,t_nrnb *nrnb)
 +{
 +    if (dd->comm->eFlop)
 +    {
 +        dd->comm->flop -= force_flop_count(nrnb);
 +    }
 +}
 +
 +void dd_force_flop_stop(gmx_domdec_t *dd,t_nrnb *nrnb)
 +{
 +    if (dd->comm->eFlop)
 +    {
 +        dd->comm->flop += force_flop_count(nrnb);
 +        dd->comm->flop_n++;
 +    }
 +}
 +
 +static void clear_dd_cycle_counts(gmx_domdec_t *dd)
 +{
 +    int i;
 +    
 +    for(i=0; i<ddCyclNr; i++)
 +    {
 +        dd->comm->cycl[i] = 0;
 +        dd->comm->cycl_n[i] = 0;
 +        dd->comm->cycl_max[i] = 0;
 +    }
 +    dd->comm->flop = 0;
 +    dd->comm->flop_n = 0;
 +}
 +
 +static void get_load_distribution(gmx_domdec_t *dd,gmx_wallcycle_t wcycle)
 +{
 +    gmx_domdec_comm_t *comm;
 +    gmx_domdec_load_t *load;
 +    gmx_domdec_root_t *root=NULL;
 +    int  d,dim,cid,i,pos;
 +    float cell_frac=0,sbuf[DD_NLOAD_MAX];
 +    gmx_bool bSepPME;
 +    
 +    if (debug)
 +    {
 +        fprintf(debug,"get_load_distribution start\n");
 +    }
 +
 +    wallcycle_start(wcycle,ewcDDCOMMLOAD);
 +    
 +    comm = dd->comm;
 +    
 +    bSepPME = (dd->pme_nodeid >= 0);
 +    
 +    for(d=dd->ndim-1; d>=0; d--)
 +    {
 +        dim = dd->dim[d];
 +        /* Check if we participate in the communication in this dimension */
 +        if (d == dd->ndim-1 || 
 +            (dd->ci[dd->dim[d+1]]==0 && dd->ci[dd->dim[dd->ndim-1]]==0))
 +        {
 +            load = &comm->load[d];
 +            if (dd->bGridJump)
 +            {
 +                cell_frac = comm->cell_f1[d] - comm->cell_f0[d];
 +            }
 +            pos = 0;
 +            if (d == dd->ndim-1)
 +            {
 +                sbuf[pos++] = dd_force_load(comm);
 +                sbuf[pos++] = sbuf[0];
 +                if (dd->bGridJump)
 +                {
 +                    sbuf[pos++] = sbuf[0];
 +                    sbuf[pos++] = cell_frac;
 +                    if (d > 0)
 +                    {
 +                        sbuf[pos++] = comm->cell_f_max0[d];
 +                        sbuf[pos++] = comm->cell_f_min1[d];
 +                    }
 +                }
 +                if (bSepPME)
 +                {
 +                    sbuf[pos++] = comm->cycl[ddCyclPPduringPME];
 +                    sbuf[pos++] = comm->cycl[ddCyclPME];
 +                }
 +            }
 +            else
 +            {
 +                sbuf[pos++] = comm->load[d+1].sum;
 +                sbuf[pos++] = comm->load[d+1].max;
 +                if (dd->bGridJump)
 +                {
 +                    sbuf[pos++] = comm->load[d+1].sum_m;
 +                    sbuf[pos++] = comm->load[d+1].cvol_min*cell_frac;
 +                    sbuf[pos++] = comm->load[d+1].flags;
 +                    if (d > 0)
 +                    {
 +                        sbuf[pos++] = comm->cell_f_max0[d];
 +                        sbuf[pos++] = comm->cell_f_min1[d];
 +                    }
 +                }
 +                if (bSepPME)
 +                {
 +                    sbuf[pos++] = comm->load[d+1].mdf;
 +                    sbuf[pos++] = comm->load[d+1].pme;
 +                }
 +            }
 +            load->nload = pos;
 +            /* Communicate a row in DD direction d.
 +             * The communicators are set up such that the root always has rank 0.
 +             */
 +#ifdef GMX_MPI
 +            MPI_Gather(sbuf      ,load->nload*sizeof(float),MPI_BYTE,
 +                       load->load,load->nload*sizeof(float),MPI_BYTE,
 +                       0,comm->mpi_comm_load[d]);
 +#endif
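 +            /* Each rank in this DD row contributes load->nload floats,
 +             * sent as raw bytes; the row root receives
 +             * dd->nc[dim]*load->nload floats into load->load, which was
 +             * allocated with DD_NLOAD_MAX entries per rank in
 +             * make_load_communicator.
 +             */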
 +            if (dd->ci[dim] == dd->master_ci[dim])
 +            {
 +                /* We are the root, process this row */
 +                if (comm->bDynLoadBal)
 +                {
 +                    root = comm->root[d];
 +                }
 +                load->sum = 0;
 +                load->max = 0;
 +                load->sum_m = 0;
 +                load->cvol_min = 1;
 +                load->flags = 0;
 +                load->mdf = 0;
 +                load->pme = 0;
 +                pos = 0;
 +                for(i=0; i<dd->nc[dim]; i++)
 +                {
 +                    load->sum += load->load[pos++];
 +                    load->max = max(load->max,load->load[pos]);
 +                    pos++;
 +                    if (dd->bGridJump)
 +                    {
 +                        if (root->bLimited)
 +                        {
 +                            /* This direction could not be load balanced properly,
 +                             * therefore we need to use the maximum instead of the average load.
 +                             */
 +                            load->sum_m = max(load->sum_m,load->load[pos]);
 +                        }
 +                        else
 +                        {
 +                            load->sum_m += load->load[pos];
 +                        }
 +                        pos++;
 +                        load->cvol_min = min(load->cvol_min,load->load[pos]);
 +                        pos++;
 +                        if (d < dd->ndim-1)
 +                        {
 +                            load->flags = (int)(load->load[pos++] + 0.5);
 +                        }
 +                        if (d > 0)
 +                        {
 +                            root->cell_f_max0[i] = load->load[pos++];
 +                            root->cell_f_min1[i] = load->load[pos++];
 +                        }
 +                    }
 +                    if (bSepPME)
 +                    {
 +                        load->mdf = max(load->mdf,load->load[pos]);
 +                        pos++;
 +                        load->pme = max(load->pme,load->load[pos]);
 +                        pos++;
 +                    }
 +                }
 +                if (comm->bDynLoadBal && root->bLimited)
 +                {
 +                    load->sum_m *= dd->nc[dim];
 +                    load->flags |= (1<<d);
 +                }
 +            }
 +        }
 +    }
 +
 +    if (DDMASTER(dd))
 +    {
 +        comm->nload      += dd_load_count(comm);
 +        comm->load_step  += comm->cycl[ddCyclStep];
 +        comm->load_sum   += comm->load[0].sum;
 +        comm->load_max   += comm->load[0].max;
 +        if (comm->bDynLoadBal)
 +        {
 +            for(d=0; d<dd->ndim; d++)
 +            {
 +                if (comm->load[0].flags & (1<<d))
 +                {
 +                    comm->load_lim[d]++;
 +                }
 +            }
 +        }
 +        if (bSepPME)
 +        {
 +            comm->load_mdf += comm->load[0].mdf;
 +            comm->load_pme += comm->load[0].pme;
 +        }
 +    }
 +
 +    wallcycle_stop(wcycle,ewcDDCOMMLOAD);
 +    
 +    if (debug)
 +    {
 +        fprintf(debug,"get_load_distribution finished\n");
 +    }
 +}
 +
 +static float dd_force_imb_perf_loss(gmx_domdec_t *dd)
 +{
 +    /* Return the relative performance loss on the total run time
 +     * due to the force calculation load imbalance.
 +     */
 +    if (dd->comm->nload > 0)
 +    {
 +        return
 +            (dd->comm->load_max*dd->nnodes - dd->comm->load_sum)/
 +            (dd->comm->load_step*dd->nnodes);
 +    }
 +    else
 +    {
 +        return 0;
 +    }
 +}
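 +
 +/* Illustrative numbers for the formula above (not from the original
 + * source): with 4 nodes, load_max = 30, load_sum = 100 and
 + * load_step = 40, the loss is (30*4 - 100)/(40*4) = 20/160 = 12.5%
 + * of the total run time.
 + */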
 +
 +static void print_dd_load_av(FILE *fplog,gmx_domdec_t *dd)
 +{
 +    char  buf[STRLEN];
 +    int   npp,npme,nnodes,d,limp;
 +    float imbal,pme_f_ratio,lossf,lossp=0;
 +    gmx_bool  bLim;
 +    gmx_domdec_comm_t *comm;
 +
 +    comm = dd->comm;
 +    if (DDMASTER(dd) && comm->nload > 0)
 +    {
 +        npp    = dd->nnodes;
 +        npme   = (dd->pme_nodeid >= 0) ? comm->npmenodes : 0;
 +        nnodes = npp + npme;
 +        imbal = comm->load_max*npp/comm->load_sum - 1;
 +        lossf = dd_force_imb_perf_loss(dd);
 +        sprintf(buf," Average load imbalance: %.1f %%\n",imbal*100);
 +        fprintf(fplog,"%s",buf);
 +        fprintf(stderr,"\n");
 +        fprintf(stderr,"%s",buf);
 +        sprintf(buf," Part of the total run time spent waiting due to load imbalance: %.1f %%\n",lossf*100);
 +        fprintf(fplog,"%s",buf);
 +        fprintf(stderr,"%s",buf);
 +        bLim = FALSE;
 +        if (comm->bDynLoadBal)
 +        {
 +            sprintf(buf," Steps where the load balancing was limited by -rdd, -rcon and/or -dds:");
 +            for(d=0; d<dd->ndim; d++)
 +            {
 +                limp = (200*comm->load_lim[d]+1)/(2*comm->nload);
 +                sprintf(buf+strlen(buf)," %c %d %%",dim2char(dd->dim[d]),limp);
 +                if (limp >= 50)
 +                {
 +                    bLim = TRUE;
 +                }
 +            }
 +            sprintf(buf+strlen(buf),"\n");
 +            fprintf(fplog,"%s",buf);
 +            fprintf(stderr,"%s",buf);
 +        }
 +        if (npme > 0)
 +        {
 +            pme_f_ratio = comm->load_pme/comm->load_mdf;
 +            lossp = (comm->load_pme - comm->load_mdf)/comm->load_step;
 +            if (lossp <= 0)
 +            {
 +                lossp *= (float)npme/(float)nnodes;
 +            }
 +            else
 +            {
 +                lossp *= (float)npp/(float)nnodes;
 +            }
 +            sprintf(buf," Average PME mesh/force load: %5.3f\n",pme_f_ratio);
 +            fprintf(fplog,"%s",buf);
 +            fprintf(stderr,"%s",buf);
 +            sprintf(buf," Part of the total run time spent waiting due to PP/PME imbalance: %.1f %%\n",fabs(lossp)*100);
 +            fprintf(fplog,"%s",buf);
 +            fprintf(stderr,"%s",buf);
 +        }
 +        fprintf(fplog,"\n");
 +        fprintf(stderr,"\n");
 +        
 +        if (lossf >= DD_PERF_LOSS)
 +        {
 +            sprintf(buf,
 +                    "NOTE: %.1f %% of the available CPU time was lost due to load imbalance\n"
 +                    "      in the domain decomposition.\n",lossf*100);
 +            if (!comm->bDynLoadBal)
 +            {
 +                sprintf(buf+strlen(buf),"      You might want to use dynamic load balancing (option -dlb).\n");
 +            }
 +            else if (bLim)
 +            {
 +                sprintf(buf+strlen(buf),"      You might want to decrease the cell size limit (options -rdd, -rcon and/or -dds).\n");
 +            }
 +            fprintf(fplog,"%s\n",buf);
 +            fprintf(stderr,"%s\n",buf);
 +        }
 +        if (npme > 0 && fabs(lossp) >= DD_PERF_LOSS)
 +        {
 +            sprintf(buf,
 +                    "NOTE: %.1f %% performance was lost because the PME nodes\n"
 +                    "      had %s work to do than the PP nodes.\n"
 +                    "      You might want to %s the number of PME nodes\n"
 +                    "      or %s the cut-off and the grid spacing.\n",
 +                    fabs(lossp*100),
 +                    (lossp < 0) ? "less"     : "more",
 +                    (lossp < 0) ? "decrease" : "increase",
 +                    (lossp < 0) ? "decrease" : "increase");
 +            fprintf(fplog,"%s\n",buf);
 +            fprintf(stderr,"%s\n",buf);
 +        }
 +    }
 +}
 +
 +static float dd_vol_min(gmx_domdec_t *dd)
 +{
 +    return dd->comm->load[0].cvol_min*dd->nnodes;
 +}
 +
 +static gmx_bool dd_load_flags(gmx_domdec_t *dd)
 +{
 +    return dd->comm->load[0].flags;
 +}
 +
 +static float dd_f_imbal(gmx_domdec_t *dd)
 +{
 +    return dd->comm->load[0].max*dd->nnodes/dd->comm->load[0].sum - 1;
 +}
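 +
 +/* Example (illustrative): load[0].max = 1.2 and load[0].sum = 4.0 on
 + * 4 nodes gives 1.2*4/4.0 - 1 = 0.2, i.e. a 20% force load imbalance.
 + */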
 +
 +float dd_pme_f_ratio(gmx_domdec_t *dd)
 +{
 +    if (dd->comm->cycl_n[ddCyclPME] > 0)
 +    {
 +        return dd->comm->load[0].pme/dd->comm->load[0].mdf;
 +    }
 +    else
 +    {
 +        return -1.0;
 +    }
 +}
 +
 +static void dd_print_load(FILE *fplog,gmx_domdec_t *dd,gmx_large_int_t step)
 +{
 +    int flags,d;
 +    char buf[22];
 +    
 +    flags = dd_load_flags(dd);
 +    if (flags)
 +    {
 +        fprintf(fplog,
 +                "DD  load balancing is limited by minimum cell size in dimension");
 +        for(d=0; d<dd->ndim; d++)
 +        {
 +            if (flags & (1<<d))
 +            {
 +                fprintf(fplog," %c",dim2char(dd->dim[d]));
 +            }
 +        }
 +        fprintf(fplog,"\n");
 +    }
 +    fprintf(fplog,"DD  step %s",gmx_step_str(step,buf));
 +    if (dd->comm->bDynLoadBal)
 +    {
 +        fprintf(fplog,"  vol min/aver %5.3f%c",
 +                dd_vol_min(dd),flags ? '!' : ' ');
 +    }
 +    fprintf(fplog," load imb.: force %4.1f%%",dd_f_imbal(dd)*100);
 +    if (dd->comm->cycl_n[ddCyclPME])
 +    {
 +        fprintf(fplog,"  pme mesh/force %5.3f",dd_pme_f_ratio(dd));
 +    }
 +    fprintf(fplog,"\n\n");
 +}
 +
 +static void dd_print_load_verbose(gmx_domdec_t *dd)
 +{
 +    if (dd->comm->bDynLoadBal)
 +    {
 +        fprintf(stderr,"vol %4.2f%c ",
 +                dd_vol_min(dd),dd_load_flags(dd) ? '!' : ' ');
 +    }
 +    fprintf(stderr,"imb F %2d%% ",(int)(dd_f_imbal(dd)*100+0.5));
 +    if (dd->comm->cycl_n[ddCyclPME])
 +    {
 +        fprintf(stderr,"pme/F %4.2f ",dd_pme_f_ratio(dd));
 +    }
 +}
 +
 +#ifdef GMX_MPI
 +static void make_load_communicator(gmx_domdec_t *dd, int dim_ind,ivec loc)
 +{
 +    MPI_Comm  c_row;
 +    int  dim, i, rank;
 +    ivec loc_c;
 +    gmx_domdec_root_t *root;
 +    gmx_bool bPartOfGroup = FALSE;
 +    
 +    dim = dd->dim[dim_ind];
 +    copy_ivec(loc,loc_c);
 +    for(i=0; i<dd->nc[dim]; i++)
 +    {
 +        loc_c[dim] = i;
 +        rank = dd_index(dd->nc,loc_c);
 +        if (rank == dd->rank)
 +        {
 +            /* This process is part of the group */
 +            bPartOfGroup = TRUE;
 +        }
 +    }
 +    MPI_Comm_split(dd->mpi_comm_all, bPartOfGroup?0:MPI_UNDEFINED, dd->rank,
 +                   &c_row);
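 +    /* Note on the MPI_Comm_split above (comment added for clarity):
 +     * all ranks that passed color 0, i.e. the ranks of this cell row,
 +     * end up together in c_row, while ranks passing MPI_UNDEFINED get
 +     * MPI_COMM_NULL; using dd->rank as the key preserves the original
 +     * rank order within the row.
 +     */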
 +    if (bPartOfGroup)
 +    {
 +        dd->comm->mpi_comm_load[dim_ind] = c_row;
 +        if (dd->comm->eDLB != edlbNO)
 +        {
 +            if (dd->ci[dim] == dd->master_ci[dim])
 +            {
 +                /* This is the root process of this row */
 +                snew(dd->comm->root[dim_ind],1);
 +                root = dd->comm->root[dim_ind];
 +                snew(root->cell_f,DD_CELL_F_SIZE(dd,dim_ind));
 +                snew(root->old_cell_f,dd->nc[dim]+1);
 +                snew(root->bCellMin,dd->nc[dim]);
 +                if (dim_ind > 0)
 +                {
 +                    snew(root->cell_f_max0,dd->nc[dim]);
 +                    snew(root->cell_f_min1,dd->nc[dim]);
 +                    snew(root->bound_min,dd->nc[dim]);
 +                    snew(root->bound_max,dd->nc[dim]);
 +                }
 +                snew(root->buf_ncd,dd->nc[dim]);
 +            }
 +            else
 +            {
 +                /* This is not a root process, we only need to receive cell_f */
 +                snew(dd->comm->cell_f_row,DD_CELL_F_SIZE(dd,dim_ind));
 +            }
 +        }
 +        if (dd->ci[dim] == dd->master_ci[dim])
 +        {
 +            snew(dd->comm->load[dim_ind].load,dd->nc[dim]*DD_NLOAD_MAX);
 +        }
 +    }
 +}
 +#endif
 +
 +static void make_load_communicators(gmx_domdec_t *dd)
 +{
 +#ifdef GMX_MPI
 +    int  dim0,dim1,i,j;
 +    ivec loc;
 +
 +    if (debug)
 +    {
 +        fprintf(debug,"Making load communicators\n");
 +    }
 +
 +    snew(dd->comm->load,dd->ndim);
 +    snew(dd->comm->mpi_comm_load,dd->ndim);
 +
 +    clear_ivec(loc);
 +    make_load_communicator(dd,0,loc);
 +    if (dd->ndim > 1)
 +    {
 +        dim0 = dd->dim[0];
 +        for(i=0; i<dd->nc[dim0]; i++)
 +        {
 +            loc[dim0] = i;
 +            make_load_communicator(dd,1,loc);
 +        }
 +    }
 +    if (dd->ndim > 2)
 +    {
 +        dim0 = dd->dim[0];
 +        for(i=0; i<dd->nc[dim0]; i++)
 +        {
 +            loc[dim0] = i;
 +            dim1 = dd->dim[1];
 +            for(j=0; j<dd->nc[dim1]; j++)
 +            {
 +                loc[dim1] = j;
 +                make_load_communicator(dd,2,loc);
 +            }
 +        }
 +    }
 +
 +    if (debug)
 +    {
 +        fprintf(debug,"Finished making load communicators\n");
 +    }
 +#endif
 +}
 +
 +void setup_dd_grid(FILE *fplog,gmx_domdec_t *dd)
 +{
 +    gmx_bool bZYX;
 +    int  d,dim,i,j,m;
 +    ivec tmp,s;
 +    int  nzone,nzonep;
 +    ivec dd_zp[DD_MAXIZONE];
 +    gmx_domdec_zones_t *zones;
 +    gmx_domdec_ns_ranges_t *izone;
 +    
 +    for(d=0; d<dd->ndim; d++)
 +    {
 +        dim = dd->dim[d];
 +        copy_ivec(dd->ci,tmp);
 +        tmp[dim] = (tmp[dim] + 1) % dd->nc[dim];
 +        dd->neighbor[d][0] = ddcoord2ddnodeid(dd,tmp);
 +        copy_ivec(dd->ci,tmp);
 +        tmp[dim] = (tmp[dim] - 1 + dd->nc[dim]) % dd->nc[dim];
 +        dd->neighbor[d][1] = ddcoord2ddnodeid(dd,tmp);
 +        if (debug)
 +        {
 +            fprintf(debug,"DD rank %d neighbor ranks in dir %d are + %d - %d\n",
 +                    dd->rank,dim,
 +                    dd->neighbor[d][0],
 +                    dd->neighbor[d][1]);
 +        }
 +    }
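 +
 +    /* Illustrative example: with dd->nc[dim] = 4 and dd->ci[dim] = 0 the
 +     * modulo arithmetic above gives forward neighbor (0+1)%4 = 1 and
 +     * backward neighbor (0-1+4)%4 = 3, i.e. neighbors wrap periodically.
 +     */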
 +    
-     
 +    if (fplog)
 +    {
 +        fprintf(fplog,"\nMaking %dD domain decomposition grid %d x %d x %d, home cell index %d %d %d\n\n",
 +                dd->ndim,
 +                dd->nc[XX],dd->nc[YY],dd->nc[ZZ],
 +                dd->ci[XX],dd->ci[YY],dd->ci[ZZ]);
 +    }
 +    switch (dd->ndim)
 +    {
 +    case 3:
 +        nzone  = dd_z3n;
 +        nzonep = dd_zp3n;
 +        for(i=0; i<nzonep; i++)
 +        {
 +            copy_ivec(dd_zp3[i],dd_zp[i]);
 +        }
 +        break;
 +    case 2:
 +        nzone  = dd_z2n;
 +        nzonep = dd_zp2n;
 +        for(i=0; i<nzonep; i++)
 +        {
 +            copy_ivec(dd_zp2[i],dd_zp[i]);
 +        }
 +        break;
 +    case 1:
 +        nzone  = dd_z1n;
 +        nzonep = dd_zp1n;
 +        for(i=0; i<nzonep; i++)
 +        {
 +            copy_ivec(dd_zp1[i],dd_zp[i]);
 +        }
 +        break;
 +    default:
 +        gmx_fatal(FARGS,"Can only do 1, 2 or 3D domain decomposition");
 +        nzone = 0;
 +        nzonep = 0;
 +    }
 +
 +    zones = &dd->comm->zones;
 +
 +    for(i=0; i<nzone; i++)
 +    {
 +        m = 0;
 +        clear_ivec(zones->shift[i]);
 +        for(d=0; d<dd->ndim; d++)
 +        {
 +            zones->shift[i][dd->dim[d]] = dd_zo[i][m++];
 +        }
 +    }
 +    
 +    zones->n = nzone;
 +    for(i=0; i<nzone; i++)
 +    {
 +        for(d=0; d<DIM; d++)
 +        {
 +            s[d] = dd->ci[d] - zones->shift[i][d];
 +            if (s[d] < 0)
 +            {
 +                s[d] += dd->nc[d];
 +            }
 +            else if (s[d] >= dd->nc[d])
 +            {
 +                s[d] -= dd->nc[d];
 +            }
 +        }
 +    }
 +    zones->nizone = nzonep;
 +    for(i=0; i<zones->nizone; i++)
 +    {
 +        if (dd_zp[i][0] != i)
 +        {
 +            gmx_fatal(FARGS,"Internal inconsistency in the dd grid setup");
 +        }
 +        izone = &zones->izone[i];
 +        izone->j0 = dd_zp[i][1];
 +        izone->j1 = dd_zp[i][2];
 +        for(dim=0; dim<DIM; dim++)
 +        {
 +            if (dd->nc[dim] == 1)
 +            {
 +                /* All shifts should be allowed */
 +                izone->shift0[dim] = -1;
 +                izone->shift1[dim] = 1;
 +            }
 +            else
 +            {
 +                /*
 +                  izone->shift0[d] = 0;
 +                  izone->shift1[d] = 0;
 +                  for(j=izone->j0; j<izone->j1; j++) {
 +                  if (dd->shift[j][d] > dd->shift[i][d])
 +                  izone->shift0[d] = -1;
 +                  if (dd->shift[j][d] < dd->shift[i][d])
 +                  izone->shift1[d] = 1;
 +                  }
 +                */
 +                
 +                int shift_diff;
 +                
 +                /* Assume the shifts are not more than 1 cell */
 +                izone->shift0[dim] = 1;
 +                izone->shift1[dim] = -1;
 +                for(j=izone->j0; j<izone->j1; j++)
 +                {
 +                    shift_diff = zones->shift[j][dim] - zones->shift[i][dim];
 +                    if (shift_diff < izone->shift0[dim])
 +                    {
 +                        izone->shift0[dim] = shift_diff;
 +                    }
 +                    if (shift_diff > izone->shift1[dim])
 +                    {
 +                        izone->shift1[dim] = shift_diff;
 +                    }
 +                }
 +            }
 +        }
 +    }
 +    
 +    if (dd->comm->eDLB != edlbNO)
 +    {
 +        snew(dd->comm->root,dd->ndim);
 +    }
 +    
 +    if (dd->comm->bRecordLoad)
 +    {
 +        make_load_communicators(dd);
 +    }
 +}
 +
 +static void make_pp_communicator(FILE *fplog,t_commrec *cr,int reorder)
 +{
 +    gmx_domdec_t *dd;
 +    gmx_domdec_comm_t *comm;
 +    int  i,rank,*buf;
 +    ivec periods;
 +#ifdef GMX_MPI
 +    MPI_Comm comm_cart;
 +#endif
 +    
 +    dd = cr->dd;
 +    comm = dd->comm;
 +    
 +#ifdef GMX_MPI
 +    if (comm->bCartesianPP)
 +    {
 +        /* Set up cartesian communication for the particle-particle part */
 +        if (fplog)
 +        {
 +            fprintf(fplog,"Will use a Cartesian communicator: %d x %d x %d\n",
 +                    dd->nc[XX],dd->nc[YY],dd->nc[ZZ]);
 +        }
 +        
 +        for(i=0; i<DIM; i++)
 +        {
 +            periods[i] = TRUE;
 +        }
 +        MPI_Cart_create(cr->mpi_comm_mygroup,DIM,dd->nc,periods,reorder,
 +                        &comm_cart);
 +        /* We overwrite the old communicator with the new cartesian one */
 +        cr->mpi_comm_mygroup = comm_cart;
 +    }
 +    
 +    dd->mpi_comm_all = cr->mpi_comm_mygroup;
 +    MPI_Comm_rank(dd->mpi_comm_all,&dd->rank);
 +    
 +    if (comm->bCartesianPP_PME)
 +    {
 +        /* Since we want to use the original Cartesian setup for the simulation,
 +         * and not the one after the split, we need to make an index.
 +         */
 +        snew(comm->ddindex2ddnodeid,dd->nnodes);
 +        comm->ddindex2ddnodeid[dd_index(dd->nc,dd->ci)] = dd->rank;
 +        gmx_sumi(dd->nnodes,comm->ddindex2ddnodeid,cr);
 +        /* Get the rank of the DD master;
 +         * above we made sure that the master node is a PP node.
 +         */
 +        if (MASTER(cr))
 +        {
 +            rank = dd->rank;
 +        }
 +        else
 +        {
 +            rank = 0;
 +        }
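 +        /* Exactly one rank (the master) contributes its DD rank below,
 +         * all others contribute 0, so the MPI_SUM reduction leaves every
 +         * rank holding the master's DD rank.
 +         */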
 +        MPI_Allreduce(&rank,&dd->masterrank,1,MPI_INT,MPI_SUM,dd->mpi_comm_all);
 +    }
 +    else if (comm->bCartesianPP)
 +    {
 +        if (cr->npmenodes == 0)
 +        {
 +            /* The PP communicator is also
 +             * the communicator for this simulation
 +             */
 +            cr->mpi_comm_mysim = cr->mpi_comm_mygroup;
 +        }
 +        cr->nodeid = dd->rank;
 +        
 +        MPI_Cart_coords(dd->mpi_comm_all,dd->rank,DIM,dd->ci);
 +        
 +        /* We need to make an index to go from the coordinates
 +         * to the nodeid of this simulation.
 +         */
 +        snew(comm->ddindex2simnodeid,dd->nnodes);
 +        snew(buf,dd->nnodes);
 +        if (cr->duty & DUTY_PP)
 +        {
 +            buf[dd_index(dd->nc,dd->ci)] = cr->sim_nodeid;
 +        }
 +        /* Communicate the ddindex to simulation nodeid index */
 +        MPI_Allreduce(buf,comm->ddindex2simnodeid,dd->nnodes,MPI_INT,MPI_SUM,
 +                      cr->mpi_comm_mysim);
 +        sfree(buf);
 +        
 +        /* Determine the master coordinates and rank.
 +         * The DD master should be the same node as the master of this sim.
 +         */
 +        for(i=0; i<dd->nnodes; i++)
 +        {
 +            if (comm->ddindex2simnodeid[i] == 0)
 +            {
 +                ddindex2xyz(dd->nc,i,dd->master_ci);
 +                MPI_Cart_rank(dd->mpi_comm_all,dd->master_ci,&dd->masterrank);
 +            }
 +        }
 +        if (debug)
 +        {
 +            fprintf(debug,"The master rank is %d\n",dd->masterrank);
 +        }
 +    }
 +    else
 +    {
 +        /* No Cartesian communicators */
 +        /* We use the rank in dd->comm->all as DD index */
 +        ddindex2xyz(dd->nc,dd->rank,dd->ci);
 +        /* The simulation master nodeid is 0, so the DD master rank is also 0 */
 +        dd->masterrank = 0;
 +        clear_ivec(dd->master_ci);
 +    }
 +#endif
 +  
 +    if (fplog)
 +    {
 +        fprintf(fplog,
 +                "Domain decomposition nodeid %d, coordinates %d %d %d\n\n",
 +                dd->rank,dd->ci[XX],dd->ci[YY],dd->ci[ZZ]);
 +    }
 +    if (debug)
 +    {
 +        fprintf(debug,
 +                "Domain decomposition nodeid %d, coordinates %d %d %d\n\n",
 +                dd->rank,dd->ci[XX],dd->ci[YY],dd->ci[ZZ]);
 +    }
 +}
 +
 +static void receive_ddindex2simnodeid(t_commrec *cr)
 +{
 +    gmx_domdec_t *dd;
 +    gmx_domdec_comm_t *comm;
 +    int  *buf;
 +    
 +    dd = cr->dd;
 +    comm = dd->comm;
 +    
 +#ifdef GMX_MPI
 +    if (!comm->bCartesianPP_PME && comm->bCartesianPP)
 +    {
 +        snew(comm->ddindex2simnodeid,dd->nnodes);
 +        snew(buf,dd->nnodes);
 +        if (cr->duty & DUTY_PP)
 +        {
 +            buf[dd_index(dd->nc,dd->ci)] = cr->sim_nodeid;
 +        }
 +        /* Communicate the ddindex to simulation nodeid index */
 +        MPI_Allreduce(buf,comm->ddindex2simnodeid,dd->nnodes,MPI_INT,MPI_SUM,
 +                      cr->mpi_comm_mysim);
 +        sfree(buf);
 +    }
 +#endif
 +}
 +
 +static gmx_domdec_master_t *init_gmx_domdec_master_t(gmx_domdec_t *dd,
 +                                                     int ncg,int natoms)
 +{
 +    gmx_domdec_master_t *ma;
 +    int i;
 +
 +    snew(ma,1);
 +    
 +    snew(ma->ncg,dd->nnodes);
 +    snew(ma->index,dd->nnodes+1);
 +    snew(ma->cg,ncg);
 +    snew(ma->nat,dd->nnodes);
 +    snew(ma->ibuf,dd->nnodes*2);
 +    snew(ma->cell_x,DIM);
 +    for(i=0; i<DIM; i++)
 +    {
 +        snew(ma->cell_x[i],dd->nc[i]+1);
 +    }
 +
 +    if (dd->nnodes <= GMX_DD_NNODES_SENDRECV)
 +    {
 +        ma->vbuf = NULL;
 +    }
 +    else
 +    {
 +        snew(ma->vbuf,natoms);
 +    }
 +
 +    return ma;
 +}
 +
 +static void split_communicator(FILE *fplog,t_commrec *cr,int dd_node_order,
 +                               int reorder)
 +{
 +    gmx_domdec_t *dd;
 +    gmx_domdec_comm_t *comm;
 +    int  i,rank;
 +    gmx_bool bDiv[DIM];
 +    ivec periods;
 +#ifdef GMX_MPI
 +    MPI_Comm comm_cart;
 +#endif
 +    
 +    dd = cr->dd;
 +    comm = dd->comm;
 +    
 +    if (comm->bCartesianPP)
 +    {
 +        for(i=1; i<DIM; i++)
 +        {
 +            bDiv[i] = ((cr->npmenodes*dd->nc[i]) % (dd->nnodes) == 0);
 +        }
 +        if (bDiv[YY] || bDiv[ZZ])
 +        {
 +            comm->bCartesianPP_PME = TRUE;
 +            /* If we have 2D PME decomposition, which is always in x+y,
 +             * we stack the PME-only nodes in z.
 +             * Otherwise we choose the direction that provides the thinnest slab
 +             * of PME-only nodes, as this will have the least effect
 +             * on the PP communication.
 +             * But for the PME communication the opposite might be better.
 +             */
 +            if (bDiv[ZZ] && (comm->npmenodes_y > 1 ||
 +                             !bDiv[YY] ||
 +                             dd->nc[YY] > dd->nc[ZZ]))
 +            {
 +                comm->cartpmedim = ZZ;
 +            }
 +            else
 +            {
 +                comm->cartpmedim = YY;
 +            }
 +            comm->ntot[comm->cartpmedim]
 +                += (cr->npmenodes*dd->nc[comm->cartpmedim])/dd->nnodes;
 +        }
 +        else if (fplog)
 +        {
 +            fprintf(fplog,"#pmenodes (%d) is not a multiple of nx*ny (%d*%d) or nx*nz (%d*%d)\n",cr->npmenodes,dd->nc[XX],dd->nc[YY],dd->nc[XX],dd->nc[ZZ]);
 +            fprintf(fplog,
 +                    "Will not use a Cartesian communicator for PP <-> PME\n\n");
 +        }
 +    }
 +    
 +#ifdef GMX_MPI
 +    if (comm->bCartesianPP_PME)
 +    {
 +        if (fplog)
 +        {
 +            fprintf(fplog,"Will use a Cartesian communicator for PP <-> PME: %d x %d x %d\n",comm->ntot[XX],comm->ntot[YY],comm->ntot[ZZ]);
 +        }
 +        
 +        for(i=0; i<DIM; i++)
 +        {
 +            periods[i] = TRUE;
 +        }
 +        MPI_Cart_create(cr->mpi_comm_mysim,DIM,comm->ntot,periods,reorder,
 +                        &comm_cart);
 +        
 +        MPI_Comm_rank(comm_cart,&rank);
 +        if (MASTERNODE(cr) && rank != 0)
 +        {
 +            gmx_fatal(FARGS,"MPI rank 0 was renumbered by MPI_Cart_create, we do not allow this");
 +        }
 +        
 +        /* With this assignment we lose the link to the original communicator,
 +         * which will usually be MPI_COMM_WORLD, unless we have multiple
 +         * simulations.
 +         */
 +        cr->mpi_comm_mysim = comm_cart;
 +        cr->sim_nodeid = rank;
 +        
 +        MPI_Cart_coords(cr->mpi_comm_mysim,cr->sim_nodeid,DIM,dd->ci);
 +        
 +        if (fplog)
 +        {
 +            fprintf(fplog,"Cartesian nodeid %d, coordinates %d %d %d\n\n",
 +                    cr->sim_nodeid,dd->ci[XX],dd->ci[YY],dd->ci[ZZ]);
 +        }
 +        
 +        if (dd->ci[comm->cartpmedim] < dd->nc[comm->cartpmedim])
 +        {
 +            cr->duty = DUTY_PP;
 +        }
 +        if (cr->npmenodes == 0 ||
 +            dd->ci[comm->cartpmedim] >= dd->nc[comm->cartpmedim])
 +        {
 +            cr->duty = DUTY_PME;
 +        }
 +        
 +        /* Split the sim communicator into PP-only and PME-only nodes */
 +        MPI_Comm_split(cr->mpi_comm_mysim,
 +                       cr->duty,
 +                       dd_index(comm->ntot,dd->ci),
 +                       &cr->mpi_comm_mygroup);
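 +        /* The color cr->duty separates DUTY_PP from DUTY_PME ranks, so
 +         * each rank ends up with its own group as cr->mpi_comm_mygroup;
 +         * the dd_index key orders the ranks within each group by their
 +         * Cartesian DD index.
 +         */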
 +    }
 +    else
 +    {
 +        switch (dd_node_order)
 +        {
 +        case ddnoPP_PME:
 +            if (fplog)
 +            {
 +                fprintf(fplog,"Order of the nodes: PP first, PME last\n");
 +            }
 +            break;
 +        case ddnoINTERLEAVE:
 +            /* Interleave the PP-only and PME-only nodes,
 +             * as on clusters with dual-core machines this will double
 +             * the communication bandwidth of the PME processes
 +             * and thus speed up the PP <-> PME and inter PME communication.
 +             */
 +            if (fplog)
 +            {
 +                fprintf(fplog,"Interleaving PP and PME nodes\n");
 +            }
 +            comm->pmenodes = dd_pmenodes(cr);
 +            break;
 +        case ddnoCARTESIAN:
 +            break;
 +        default:
 +            gmx_fatal(FARGS,"Unknown dd_node_order=%d",dd_node_order);
 +        }
 +    
 +        if (dd_simnode2pmenode(cr,cr->sim_nodeid) == -1)
 +        {
 +            cr->duty = DUTY_PME;
 +        }
 +        else
 +        {
 +            cr->duty = DUTY_PP;
 +        }
 +        
 +        /* Split the sim communicator into PP-only and PME-only nodes */
 +        MPI_Comm_split(cr->mpi_comm_mysim,
 +                       cr->duty,
 +                       cr->nodeid,
 +                       &cr->mpi_comm_mygroup);
 +        MPI_Comm_rank(cr->mpi_comm_mygroup,&cr->nodeid);
 +    }
 +#endif
 +
 +    if (fplog)
 +    {
 +        fprintf(fplog,"This is a %s only node\n\n",
 +                (cr->duty & DUTY_PP) ? "particle-particle" : "PME-mesh");
 +    }
 +}
 +
 +void make_dd_communicators(FILE *fplog,t_commrec *cr,int dd_node_order)
 +{
 +    gmx_domdec_t *dd;
 +    gmx_domdec_comm_t *comm;
 +    int CartReorder;
 +    
 +    dd = cr->dd;
 +    comm = dd->comm;
 +    
 +    copy_ivec(dd->nc,comm->ntot);
 +    
 +    comm->bCartesianPP = (dd_node_order == ddnoCARTESIAN);
 +    comm->bCartesianPP_PME = FALSE;
 +    
 +    /* Reorder the nodes by default. This might change the MPI ranks.
 +     * Real reordering is only supported on very few architectures;
 +     * Blue Gene is one of them.
 +     */
 +    CartReorder = (getenv("GMX_NO_CART_REORDER") == NULL);
 +    
 +    if (cr->npmenodes > 0)
 +    {
 +        /* Split the communicator into a PP and PME part */
 +        split_communicator(fplog,cr,dd_node_order,CartReorder);
 +        if (comm->bCartesianPP_PME)
 +        {
 +            /* We (possibly) reordered the nodes in split_communicator,
 +             * so it is no longer required in make_pp_communicator.
 +             */
 +            CartReorder = FALSE;
 +        }
 +    }
 +    else
 +    {
 +        /* All nodes do PP and PME */
 +#ifdef GMX_MPI    
 +        /* We do not require separate communicators */
 +        cr->mpi_comm_mygroup = cr->mpi_comm_mysim;
 +#endif
 +    }
 +    
 +    if (cr->duty & DUTY_PP)
 +    {
 +        /* Copy or make a new PP communicator */
 +        make_pp_communicator(fplog,cr,CartReorder);
 +    }
 +    else
 +    {
 +        receive_ddindex2simnodeid(cr);
 +    }
 +    
 +    if (!(cr->duty & DUTY_PME))
 +    {
 +        /* Set up the communication to our PME node */
 +        dd->pme_nodeid = dd_simnode2pmenode(cr,cr->sim_nodeid);
 +        dd->pme_receive_vir_ener = receive_vir_ener(cr);
 +        if (debug)
 +        {
 +            fprintf(debug,"My pme_nodeid %d receive ener %d\n",
 +                    dd->pme_nodeid,dd->pme_receive_vir_ener);
 +        }
 +    }
 +    else
 +    {
 +        dd->pme_nodeid = -1;
 +    }
 +
 +    if (DDMASTER(dd))
 +    {
 +        dd->ma = init_gmx_domdec_master_t(dd,
 +                                          comm->cgs_gl.nr,
 +                                          comm->cgs_gl.index[comm->cgs_gl.nr]);
 +    }
 +}
 +
 +static real *get_slb_frac(FILE *fplog,const char *dir,int nc,const char *size_string)
 +{
 +    real *slb_frac,tot;
 +    int  i,n;
 +    double dbl;
 +    
 +    slb_frac = NULL;
 +    if (nc > 1 && size_string != NULL)
 +    {
 +        if (fplog)
 +        {
 +            fprintf(fplog,"Using static load balancing for the %s direction\n",
 +                    dir);
 +        }
 +        snew(slb_frac,nc);
 +        tot = 0;
 +        for (i=0; i<nc; i++)
 +        {
 +            dbl = 0;
 +            sscanf(size_string,"%lf%n",&dbl,&n);
 +            if (dbl == 0)
 +            {
 +                gmx_fatal(FARGS,"Incorrect or not enough DD cell size entries for direction %s: '%s'",dir,size_string);
 +            }
 +            slb_frac[i] = dbl;
 +            size_string += n;
 +            tot += slb_frac[i];
 +        }
 +        /* Normalize */
 +        if (fplog)
 +        {
 +            fprintf(fplog,"Relative cell sizes:");
 +        }
 +        for (i=0; i<nc; i++)
 +        {
 +            slb_frac[i] /= tot;
 +            if (fplog)
 +            {
 +                fprintf(fplog," %5.3f",slb_frac[i]);
 +            }
 +        }
 +        if (fplog)
 +        {
 +            fprintf(fplog,"\n");
 +        }
 +    }
 +    
 +    return slb_frac;
 +}
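 +
 +/* Illustrative example (values not from the original source): for nc = 3
 + * and size_string "2 1 1" the parsed entries sum to 4, so the normalized
 + * relative cell sizes become 0.500 0.250 0.250.
 + */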
 +
 +static int multi_body_bondeds_count(gmx_mtop_t *mtop)
 +{
 +    int n,nmol,ftype;
 +    gmx_mtop_ilistloop_t iloop;
 +    t_ilist *il;
 +    
 +    n = 0;
 +    iloop = gmx_mtop_ilistloop_init(mtop);
 +    while (gmx_mtop_ilistloop_next(iloop,&il,&nmol))
 +    {
 +        for(ftype=0; ftype<F_NRE; ftype++)
 +        {
 +            if ((interaction_function[ftype].flags & IF_BOND) &&
 +                NRAL(ftype) > 2)
 +            {
 +                n += nmol*il[ftype].nr/(1 + NRAL(ftype));
 +            }
 +        }
 +    }
 +
 +    return n;
 +}
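 +
 +/* Example of the counting above (comment added for clarity): for angles
 + * NRAL = 3, so each interaction occupies 1 + 3 = 4 ints in
 + * il[ftype].iatoms (the type index plus three atom indices), and
 + * il[ftype].nr/4 interactions per molecule are counted, multiplied by
 + * the nmol molecule copies.
 + */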
 +
 +static int dd_nst_env(FILE *fplog,const char *env_var,int def)
 +{
 +    char *val;
 +    int  nst;
 +    
 +    nst = def;
 +    val = getenv(env_var);
 +    if (val)
 +    {
 +        if (sscanf(val,"%d",&nst) <= 0)
 +        {
 +            nst = 1;
 +        }
 +        if (fplog)
 +        {
 +            fprintf(fplog,"Found env.var. %s = %s, using value %d\n",
 +                    env_var,val,nst);
 +        }
 +    }
 +    
 +    return nst;
 +}
 +
 +static void dd_warning(t_commrec *cr,FILE *fplog,const char *warn_string)
 +{
 +    if (MASTER(cr))
 +    {
 +        fprintf(stderr,"\n%s\n",warn_string);
 +    }
 +    if (fplog)
 +    {
 +        fprintf(fplog,"\n%s\n",warn_string);
 +    }
 +}
 +
 +static void check_dd_restrictions(t_commrec *cr,gmx_domdec_t *dd,
 +                                  t_inputrec *ir,FILE *fplog)
 +{
 +    if (ir->ePBC == epbcSCREW &&
 +        (dd->nc[XX] == 1 || dd->nc[YY] > 1 || dd->nc[ZZ] > 1))
 +    {
 +        gmx_fatal(FARGS,"With pbc=%s can only do domain decomposition in the x-direction",epbc_names[ir->ePBC]);
 +    }
 +
 +    if (ir->ns_type == ensSIMPLE)
 +    {
 +        gmx_fatal(FARGS,"Domain decomposition does not support simple neighbor searching, use grid searching or use particle decomposition");
 +    }
 +
 +    if (ir->nstlist == 0)
 +    {
 +        gmx_fatal(FARGS,"Domain decomposition does not work with nstlist=0");
 +    }
 +
 +    if (ir->comm_mode == ecmANGULAR && ir->ePBC != epbcNONE)
 +    {
 +        dd_warning(cr,fplog,"comm-mode angular will give incorrect results when the comm group partially crosses a periodic boundary");
 +    }
 +}
 +
 +static real average_cellsize_min(gmx_domdec_t *dd,gmx_ddbox_t *ddbox)
 +{
 +    int  di,d;
 +    real r;
 +
 +    r = ddbox->box_size[XX];
 +    for(di=0; di<dd->ndim; di++)
 +    {
 +        d = dd->dim[di];
 +        /* Check using the initial average cell size */
 +        r = min(r,ddbox->box_size[d]*ddbox->skew_fac[d]/dd->nc[d]);
 +    }
 +
 +    return r;
 +}
 +
 +static int check_dlb_support(FILE *fplog,t_commrec *cr,
 +                             const char *dlb_opt,gmx_bool bRecordLoad,
 +                             unsigned long Flags,t_inputrec *ir)
 +{
 +    gmx_domdec_t *dd;
 +    int  eDLB=-1;
 +    char buf[STRLEN];
 +
 +    switch (dlb_opt[0])
 +    {
 +    case 'a': eDLB = edlbAUTO; break;
 +    case 'n': eDLB = edlbNO;   break;
 +    case 'y': eDLB = edlbYES;  break;
 +    default: gmx_incons("Unknown dlb_opt");
 +    }
 +
 +    if (Flags & MD_RERUN)
 +    {
 +        return edlbNO;
 +    }
 +
 +    if (!EI_DYNAMICS(ir->eI))
 +    {
 +        if (eDLB == edlbYES)
 +        {
 +            sprintf(buf,"NOTE: dynamic load balancing is only supported with dynamics, not with integrator '%s'\n",EI(ir->eI));
 +            dd_warning(cr,fplog,buf);
 +        }
 +            
 +        return edlbNO;
 +    }
 +
 +    if (!bRecordLoad)
 +    {
 +        dd_warning(cr,fplog,"NOTE: Cycle counting is not supported on this architecture, will not use dynamic load balancing\n");
 +
 +        return edlbNO;
 +    }
 +
 +    if (Flags & MD_REPRODUCIBLE)
 +    {
 +        switch (eDLB)
 +        {
 +        case edlbNO:
 +            break;
 +        case edlbAUTO:
 +            dd_warning(cr,fplog,"NOTE: reproducibility requested, will not use dynamic load balancing\n");
 +            eDLB = edlbNO;
 +            break;
 +        case edlbYES:
 +            dd_warning(cr,fplog,"WARNING: reproducibility requested with dynamic load balancing, the simulation will NOT be binary reproducible\n");
 +            break;
 +        default:
 +            gmx_fatal(FARGS,"Death horror: undefined case (%d) for load balancing choice",eDLB);
 +            break;
 +        }
 +    }
 +
 +    return eDLB;
 +}
 +
 +static void set_dd_dim(FILE *fplog,gmx_domdec_t *dd)
 +{
 +    int dim;
 +
 +    dd->ndim = 0;
 +    if (getenv("GMX_DD_ORDER_ZYX") != NULL)
 +    {
 +        /* Decomposition order z,y,x */
 +        if (fplog)
 +        {
 +            fprintf(fplog,"Using domain decomposition order z, y, x\n");
 +        }
 +        for(dim=DIM-1; dim>=0; dim--)
 +        {
 +            if (dd->nc[dim] > 1)
 +            {
 +                dd->dim[dd->ndim++] = dim;
 +            }
 +        }
 +    }
 +    else
 +    {
 +        /* Decomposition order x,y,z */
 +        for(dim=0; dim<DIM; dim++)
 +        {
 +            if (dd->nc[dim] > 1)
 +            {
 +                dd->dim[dd->ndim++] = dim;
 +            }
 +        }
 +    }
 +}
 +
 +static gmx_domdec_comm_t *init_dd_comm()
 +{
 +    gmx_domdec_comm_t *comm;
 +    int  i;
 +
 +    snew(comm,1);
 +    snew(comm->cggl_flag,DIM*2);
 +    snew(comm->cgcm_state,DIM*2);
 +    for(i=0; i<DIM*2; i++)
 +    {
 +        comm->cggl_flag_nalloc[i]  = 0;
 +        comm->cgcm_state_nalloc[i] = 0;
 +    }
 +    
 +    comm->nalloc_int = 0;
 +    comm->buf_int    = NULL;
 +
 +    vec_rvec_init(&comm->vbuf);
 +
 +    comm->n_load_have    = 0;
 +    comm->n_load_collect = 0;
 +
 +    for(i=0; i<ddnatNR-ddnatZONE; i++)
 +    {
 +        comm->sum_nat[i] = 0;
 +    }
 +    comm->ndecomp = 0;
 +    comm->nload   = 0;
 +    comm->load_step = 0;
 +    comm->load_sum  = 0;
 +    comm->load_max  = 0;
 +    clear_ivec(comm->load_lim);
 +    comm->load_mdf  = 0;
 +    comm->load_pme  = 0;
 +
 +    return comm;
 +}
 +
 +gmx_domdec_t *init_domain_decomposition(FILE *fplog,t_commrec *cr,
 +                                        unsigned long Flags,
 +                                        ivec nc,
 +                                        real comm_distance_min,real rconstr,
 +                                        const char *dlb_opt,real dlb_scale,
 +                                        const char *sizex,const char *sizey,const char *sizez,
 +                                        gmx_mtop_t *mtop,t_inputrec *ir,
 +                                        matrix box,rvec *x,
 +                                        gmx_ddbox_t *ddbox,
 +                                        int *npme_x,int *npme_y)
 +{
 +    gmx_domdec_t *dd;
 +    gmx_domdec_comm_t *comm;
 +    int  recload;
 +    int  d,i,j;
 +    real r_2b,r_mb,r_bonded=-1,r_bonded_limit=-1,limit,acs;
 +    gmx_bool bC;
 +    char buf[STRLEN];
 +    
 +    if (fplog)
 +    {
 +        fprintf(fplog,
 +                "\nInitializing Domain Decomposition on %d nodes\n",cr->nnodes);
 +    }
 +    
 +    snew(dd,1);
 +
 +    dd->comm = init_dd_comm();
 +    comm = dd->comm;
 +
 +    dd->npbcdim   = ePBC2npbcdim(ir->ePBC);
 +    dd->bScrewPBC = (ir->ePBC == epbcSCREW);
 +    
 +    dd->bSendRecv2      = dd_nst_env(fplog,"GMX_DD_SENDRECV2",0);
 +    comm->dlb_scale_lim = dd_nst_env(fplog,"GMX_DLB_MAX",10);
 +    comm->eFlop         = dd_nst_env(fplog,"GMX_DLB_FLOP",0);
 +    recload             = dd_nst_env(fplog,"GMX_DD_LOAD",1);
 +    comm->nstSortCG     = dd_nst_env(fplog,"GMX_DD_SORT",1);
 +    comm->nstDDDump     = dd_nst_env(fplog,"GMX_DD_DUMP",0);
 +    comm->nstDDDumpGrid = dd_nst_env(fplog,"GMX_DD_DUMP_GRID",0);
 +    comm->DD_debug      = dd_nst_env(fplog,"GMX_DD_DEBUG",0);
 +
 +    dd->pme_recv_f_alloc = 0;
 +    dd->pme_recv_f_buf = NULL;
 +
 +    if (dd->bSendRecv2 && fplog)
 +    {
 +        fprintf(fplog,"Will use two sequential MPI_Sendrecv calls instead of two simultaneous non-blocking MPI_Irecv and MPI_Isend pairs for constraint and vsite communication\n");
 +    }
 +    if (comm->eFlop)
 +    {
 +        if (fplog)
 +        {
 +            fprintf(fplog,"Will load balance based on FLOP count\n");
 +        }
 +        if (comm->eFlop > 1)
 +        {
 +            srand(1+cr->nodeid);
 +        }
 +        comm->bRecordLoad = TRUE;
 +    }
 +    else
 +    {
 +        comm->bRecordLoad = (wallcycle_have_counter() && recload > 0);
 +    }
 +    
 +    comm->eDLB = check_dlb_support(fplog,cr,dlb_opt,comm->bRecordLoad,Flags,ir);
 +    
 +    comm->bDynLoadBal = (comm->eDLB == edlbYES);
 +    if (fplog)
 +    {
 +        fprintf(fplog,"Dynamic load balancing: %s\n",edlb_names[comm->eDLB]);
 +    }
 +    dd->bGridJump = comm->bDynLoadBal;
++    comm->bPMELoadBalDLBLimits = FALSE;
 +    
 +    if (comm->nstSortCG)
 +    {
 +        if (fplog)
 +        {
 +            if (comm->nstSortCG == 1)
 +            {
 +                fprintf(fplog,"Will sort the charge groups at every domain (re)decomposition\n");
 +            }
 +            else
 +            {
 +                fprintf(fplog,"Will sort the charge groups every %d steps\n",
 +                        comm->nstSortCG);
 +            }
 +        }
 +        snew(comm->sort,1);
 +    }
 +    else
 +    {
 +        if (fplog)
 +        {
 +            fprintf(fplog,"Will not sort the charge groups\n");
 +        }
 +    }
 +
 +    comm->bCGs = (ncg_mtop(mtop) < mtop->natoms);
 +    
 +    comm->bInterCGBondeds = (ncg_mtop(mtop) > mtop->mols.nr);
 +    if (comm->bInterCGBondeds)
 +    {
 +        comm->bInterCGMultiBody = (multi_body_bondeds_count(mtop) > 0);
 +    }
 +    else
 +    {
 +        comm->bInterCGMultiBody = FALSE;
 +    }
 +    
 +    dd->bInterCGcons    = inter_charge_group_constraints(mtop);
 +    dd->bInterCGsettles = inter_charge_group_settles(mtop);
 +
 +    if (ir->rlistlong == 0)
 +    {
 +        /* Set the cut-off to some very large value,
 +         * so we don't need if statements everywhere in the code.
 +         * We use the square root of the largest representable value,
 +         * since the cut-off is squared in some places.
 +         */
 +        comm->cutoff   = GMX_CUTOFF_INF;
 +    }
 +    else
 +    {
 +        comm->cutoff   = ir->rlistlong;
 +    }
 +    comm->cutoff_mbody = 0;
 +    
 +    comm->cellsize_limit = 0;
 +    comm->bBondComm = FALSE;
 +
 +    if (comm->bInterCGBondeds)
 +    {
 +        if (comm_distance_min > 0)
 +        {
 +            comm->cutoff_mbody = comm_distance_min;
 +            if (Flags & MD_DDBONDCOMM)
 +            {
 +                comm->bBondComm = (comm->cutoff_mbody > comm->cutoff);
 +            }
 +            else
 +            {
 +                comm->cutoff = max(comm->cutoff,comm->cutoff_mbody);
 +            }
 +            r_bonded_limit = comm->cutoff_mbody;
 +        }
 +        else if (ir->bPeriodicMols)
 +        {
 +            /* Can not easily determine the required cut-off */
 +            dd_warning(cr,fplog,"NOTE: Periodic molecules are present in this system. Because of this, the domain decomposition algorithm cannot easily determine the minimum cell size that it requires for treating bonded interactions. Instead, domain decomposition will assume that half the non-bonded cut-off will be a suitable lower bound.\n");
 +            comm->cutoff_mbody = comm->cutoff/2;
 +            r_bonded_limit = comm->cutoff_mbody;
 +        }
 +        else
 +        {
 +            if (MASTER(cr))
 +            {
 +                dd_bonded_cg_distance(fplog,dd,mtop,ir,x,box,
 +                                      Flags & MD_DDBONDCHECK,&r_2b,&r_mb);
 +            }
 +            gmx_bcast(sizeof(r_2b),&r_2b,cr);
 +            gmx_bcast(sizeof(r_mb),&r_mb,cr);
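 +            /* r_2b and r_mb now hold on all nodes the maximum distances
 +             * for two-body and multi-body bonded interactions, respectively.
 +             */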
 +
 +            /* We use an initial margin of 10% for the minimum cell size,
 +             * except when we are just below the non-bonded cut-off.
 +             */
 +            if (Flags & MD_DDBONDCOMM)
 +            {
 +                if (max(r_2b,r_mb) > comm->cutoff)
 +                {
 +                    r_bonded       = max(r_2b,r_mb);
 +                    r_bonded_limit = 1.1*r_bonded;
 +                    comm->bBondComm = TRUE;
 +                }
 +                else
 +                {
 +                    r_bonded       = r_mb;
 +                    r_bonded_limit = min(1.1*r_bonded,comm->cutoff);
 +                }
 +                /* We determine cutoff_mbody later */
 +            }
 +            else
 +            {
 +                /* No special bonded communication,
 +                 * simply increase the DD cut-off.
 +                 */
 +                r_bonded_limit     = 1.1*max(r_2b,r_mb);
 +                comm->cutoff_mbody = r_bonded_limit;
 +                comm->cutoff       = max(comm->cutoff,comm->cutoff_mbody);
 +            }
 +        }
 +        comm->cellsize_limit = max(comm->cellsize_limit,r_bonded_limit);
 +        if (fplog)
 +        {
 +            fprintf(fplog,
 +                    "Minimum cell size due to bonded interactions: %.3f nm\n",
 +                    comm->cellsize_limit);
 +        }
 +    }
 +
 +    if (dd->bInterCGcons && rconstr <= 0)
 +    {
 +        /* There is a cell size limit due to the constraints (P-LINCS) */
 +        rconstr = constr_r_max(fplog,mtop,ir);
 +        if (fplog)
 +        {
 +            fprintf(fplog,
 +                    "Estimated maximum distance required for P-LINCS: %.3f nm\n",
 +                    rconstr);
 +            if (rconstr > comm->cellsize_limit)
 +            {
 +                fprintf(fplog,"This distance will limit the DD cell size, you can override this with -rcon\n");
 +            }
 +        }
 +    }
 +    else if (rconstr > 0 && fplog)
 +    {
 +        /* Here we do not check for dd->bInterCGcons,
 +         * because one can also set a cell size limit for virtual sites only
 +         * and at this point we don't know yet if there are intercg v-sites.
 +         */
 +        fprintf(fplog,
 +                "User supplied maximum distance required for P-LINCS: %.3f nm\n",
 +                rconstr);
 +    }
 +    comm->cellsize_limit = max(comm->cellsize_limit,rconstr);
 +
 +    comm->cgs_gl = gmx_mtop_global_cgs(mtop);
 +
 +    if (nc[XX] > 0)
 +    {
 +        copy_ivec(nc,dd->nc);
 +        set_dd_dim(fplog,dd);
 +        set_ddbox_cr(cr,&dd->nc,ir,box,&comm->cgs_gl,x,ddbox);
 +
 +        if (cr->npmenodes == -1)
 +        {
 +            cr->npmenodes = 0;
 +        }
 +        acs = average_cellsize_min(dd,ddbox);
 +        if (acs < comm->cellsize_limit)
 +        {
 +            if (fplog)
 +            {
 +                fprintf(fplog,"ERROR: The initial cell size (%f) is smaller than the cell size limit (%f)\n",acs,comm->cellsize_limit);
 +            }
 +            gmx_fatal_collective(FARGS,cr,NULL,
 +                                 "The initial cell size (%f) is smaller than the cell size limit (%f), change options -dd, -rdd or -rcon, see the log file for details",
 +                                 acs,comm->cellsize_limit);
 +        }
 +    }
 +    else
 +    {
 +        set_ddbox_cr(cr,NULL,ir,box,&comm->cgs_gl,x,ddbox);
 +
 +        /* We need to choose the optimal DD grid and possibly PME nodes */
 +        limit = dd_choose_grid(fplog,cr,dd,ir,mtop,box,ddbox,
 +                               comm->eDLB!=edlbNO,dlb_scale,
 +                               comm->cellsize_limit,comm->cutoff,
 +                               comm->bInterCGBondeds,comm->bInterCGMultiBody);
 +        
 +        if (dd->nc[XX] == 0)
 +        {
 +            bC = (dd->bInterCGcons && rconstr > r_bonded_limit);
 +            sprintf(buf,"Change the number of nodes or mdrun option %s%s%s",
 +                    !bC ? "-rdd" : "-rcon",
 +                    comm->eDLB!=edlbNO ? " or -dds" : "",
 +                    bC ? " or your LINCS settings" : "");
 +
 +            gmx_fatal_collective(FARGS,cr,NULL,
 +                                 "There is no domain decomposition for %d nodes that is compatible with the given box and a minimum cell size of %g nm\n"
 +                                 "%s\n"
 +                                 "Look in the log file for details on the domain decomposition",
 +                                 cr->nnodes-cr->npmenodes,limit,buf);
 +        }
 +        set_dd_dim(fplog,dd);
 +    }
 +
 +    if (fplog)
 +    {
 +        fprintf(fplog,
 +                "Domain decomposition grid %d x %d x %d, separate PME nodes %d\n",
 +                dd->nc[XX],dd->nc[YY],dd->nc[ZZ],cr->npmenodes);
 +    }
 +    
 +    dd->nnodes = dd->nc[XX]*dd->nc[YY]*dd->nc[ZZ];
 +    if (cr->nnodes - dd->nnodes != cr->npmenodes)
 +    {
 +        gmx_fatal_collective(FARGS,cr,NULL,
 +                             "The size of the domain decomposition grid (%d) does not match the number of nodes (%d). The total number of nodes is %d",
 +                             dd->nnodes,cr->nnodes - cr->npmenodes,cr->nnodes);
 +    }
 +    if (cr->npmenodes > dd->nnodes)
 +    {
 +        gmx_fatal_collective(FARGS,cr,NULL,
 +                             "The number of separate PME nodes (%d) is larger than the number of PP nodes (%d), this is not supported.",cr->npmenodes,dd->nnodes);
 +    }
 +    if (cr->npmenodes > 0)
 +    {
 +        comm->npmenodes = cr->npmenodes;
 +    }
 +    else
 +    {
 +        comm->npmenodes = dd->nnodes;
 +    }
 +
 +    if (EEL_PME(ir->coulombtype))
 +    {
 +        /* The following choices should match those
 +         * in comm_cost_est in domdec_setup.c.
 +         * Note that here the checks have to take into account
 +         * that the decomposition might occur in a different order than xyz
 +         * (for instance through the env.var. GMX_DD_ORDER_ZYX),
 +         * in which case they will not match those in comm_cost_est,
 +         * but since that is mainly for testing purposes that's fine.
 +         */
 +        if (dd->ndim >= 2 && dd->dim[0] == XX && dd->dim[1] == YY &&
 +            comm->npmenodes > dd->nc[XX] && comm->npmenodes % dd->nc[XX] == 0 &&
 +            getenv("GMX_PMEONEDD") == NULL)
 +        {
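 +            /* Use a 2D PME decomposition: the PME grid matches the DD grid
 +             * along x and the remaining PME nodes are stacked along y.
 +             */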
 +            comm->npmedecompdim = 2;
 +            comm->npmenodes_x   = dd->nc[XX];
 +            comm->npmenodes_y   = comm->npmenodes/comm->npmenodes_x;
 +        }
 +        else
 +        {
 +            /* In case nc is 1 in both x and y we could still choose to
 +             * decompose pme in y instead of x, but we use x for simplicity.
 +             */
 +            comm->npmedecompdim = 1;
 +            if (dd->dim[0] == YY)
 +            {
 +                comm->npmenodes_x = 1;
 +                comm->npmenodes_y = comm->npmenodes;
 +            }
 +            else
 +            {
 +                comm->npmenodes_x = comm->npmenodes;
 +                comm->npmenodes_y = 1;
 +            }
 +        }    
 +        if (fplog)
 +        {
 +            fprintf(fplog,"PME domain decomposition: %d x %d x %d\n",
 +                    comm->npmenodes_x,comm->npmenodes_y,1);
 +        }
 +    }
 +    else
 +    {
 +        comm->npmedecompdim = 0;
 +        comm->npmenodes_x   = 0;
 +        comm->npmenodes_y   = 0;
 +    }
 +    
 +    /* Technically we don't need both of these,
 +     * but it simplifies the code not to have to recalculate them.
 +     */
 +    *npme_x = comm->npmenodes_x;
 +    *npme_y = comm->npmenodes_y;
 +        
 +    snew(comm->slb_frac,DIM);
 +    if (comm->eDLB == edlbNO)
 +    {
 +        comm->slb_frac[XX] = get_slb_frac(fplog,"x",dd->nc[XX],sizex);
 +        comm->slb_frac[YY] = get_slb_frac(fplog,"y",dd->nc[YY],sizey);
 +        comm->slb_frac[ZZ] = get_slb_frac(fplog,"z",dd->nc[ZZ],sizez);
 +    }
 +
 +    if (comm->bInterCGBondeds && comm->cutoff_mbody == 0)
 +    {
 +        if (comm->bBondComm || comm->eDLB != edlbNO)
 +        {
 +            /* Set the bonded communication distance to halfway between
 +             * the minimum and the maximum,
 +             * since the extra communication cost is nearly zero.
 +             */
 +            acs = average_cellsize_min(dd,ddbox);
 +            comm->cutoff_mbody = 0.5*(r_bonded + acs);
 +            if (comm->eDLB != edlbNO)
 +            {
 +                /* Check if this does not limit the scaling */
 +                comm->cutoff_mbody = min(comm->cutoff_mbody,dlb_scale*acs);
 +            }
 +            if (!comm->bBondComm)
 +            {
 +                /* Without bBondComm do not go beyond the n.b. cut-off */
 +                comm->cutoff_mbody = min(comm->cutoff_mbody,comm->cutoff);
 +                if (comm->cellsize_limit >= comm->cutoff)
 +                {
 +                    /* We don't lose a lot of efficiency
 +                     * when increasing it to the n.b. cut-off.
 +                     * It can even be slightly faster, because we need
 +                     * less checks for the communication setup.
 +                     */
 +                    comm->cutoff_mbody = comm->cutoff;
 +                }
 +            }
 +            /* Check if we did not end up below our original limit */
 +            comm->cutoff_mbody = max(comm->cutoff_mbody,r_bonded_limit);
 +
 +            if (comm->cutoff_mbody > comm->cellsize_limit)
 +            {
 +                comm->cellsize_limit = comm->cutoff_mbody;
 +            }
 +        }
 +        /* Without DLB and cutoff_mbody<cutoff, cutoff_mbody is dynamic */
 +    }
 +
 +    if (debug)
 +    {
 +        fprintf(debug,"Bonded atom communication beyond the cut-off: %d\n"
 +                "cellsize limit %f\n",
 +                comm->bBondComm,comm->cellsize_limit);
 +    }
 +    
 +    if (MASTER(cr))
 +    {
 +        check_dd_restrictions(cr,dd,ir,fplog);
 +    }
 +
 +    comm->partition_step = INT_MIN;
 +    dd->ddp_count = 0;
 +
 +    clear_dd_cycle_counts(dd);
 +
 +    return dd;
 +}
 +
 +static void set_dlb_limits(gmx_domdec_t *dd)
 +{
 +    int d;
 +
 +    for(d=0; d<dd->ndim; d++)
 +    {
 +        dd->comm->cd[d].np = dd->comm->cd[d].np_dlb;
 +        dd->comm->cellsize_min[dd->dim[d]] =
 +            dd->comm->cellsize_min_dlb[dd->dim[d]];
 +    }
 +}
 +
 +
 +static void turn_on_dlb(FILE *fplog,t_commrec *cr,gmx_large_int_t step)
 +{
 +    gmx_domdec_t *dd;
 +    gmx_domdec_comm_t *comm;
 +    real cellsize_min;
 +    int  d,nc,i;
 +    char buf[STRLEN];
 +    
 +    dd = cr->dd;
 +    comm = dd->comm;
 +    
 +    if (fplog)
 +    {
 +        fprintf(fplog,"At step %s the performance loss due to force load imbalance is %.1f %%\n",gmx_step_str(step,buf),dd_force_imb_perf_loss(dd)*100);
 +    }
 +
 +    cellsize_min = comm->cellsize_min[dd->dim[0]];
 +    for(d=1; d<dd->ndim; d++)
 +    {
 +        cellsize_min = min(cellsize_min,comm->cellsize_min[dd->dim[d]]);
 +    }
 +
 +    if (cellsize_min < comm->cellsize_limit*1.05)
 +    {
 +        dd_warning(cr,fplog,"NOTE: the minimum cell size is smaller than 1.05 times the cell size limit, will not turn on dynamic load balancing\n");
 +
 +        /* Change DLB from "auto" to "no". */
 +        comm->eDLB = edlbNO;
 +
 +        return;
 +    }
 +
 +    dd_warning(cr,fplog,"NOTE: Turning on dynamic load balancing\n");
 +    comm->bDynLoadBal = TRUE;
 +    dd->bGridJump = TRUE;
- gmx_bool change_dd_cutoff(t_commrec *cr,t_state *state,t_inputrec *ir,
-                           real cutoff_req)
++
 +    set_dlb_limits(dd);
 +
 +    /* We can set the required cell size info here,
 +     * so we do not need to communicate this.
 +     * The grid is completely uniform.
 +     */
 +    for(d=0; d<dd->ndim; d++)
 +    {
 +        if (comm->root[d])
 +        {
 +            comm->load[d].sum_m = comm->load[d].sum;
 +
 +            nc = dd->nc[dd->dim[d]];
 +            for(i=0; i<nc; i++)
 +            {
 +                comm->root[d]->cell_f[i]    = i/(real)nc;
 +                if (d > 0)
 +                {
 +                    comm->root[d]->cell_f_max0[i] =  i   /(real)nc;
 +                    comm->root[d]->cell_f_min1[i] = (i+1)/(real)nc;
 +                }
 +            }
 +            comm->root[d]->cell_f[nc] = 1.0;
 +        }
 +    }
 +}
 +
 +static char *init_bLocalCG(gmx_mtop_t *mtop)
 +{
 +    int  ncg,cg;
 +    char *bLocalCG;
 +    
 +    ncg = ncg_mtop(mtop);
 +    snew(bLocalCG,ncg);
 +    for(cg=0; cg<ncg; cg++)
 +    {
 +        bLocalCG[cg] = FALSE;
 +    }
 +
 +    return bLocalCG;
 +}
 +
 +void dd_init_bondeds(FILE *fplog,
 +                     gmx_domdec_t *dd,gmx_mtop_t *mtop,
 +                     gmx_vsite_t *vsite,gmx_constr_t constr,
 +                     t_inputrec *ir,gmx_bool bBCheck,cginfo_mb_t *cginfo_mb)
 +{
 +    gmx_domdec_comm_t *comm;
 +    gmx_bool bBondComm;
 +    int  d;
 +
 +    dd_make_reverse_top(fplog,dd,mtop,vsite,constr,ir,bBCheck);
 +
 +    comm = dd->comm;
 +
 +    if (comm->bBondComm)
 +    {
 +        /* Communicate atoms beyond the cut-off for bonded interactions */
 +        comm = dd->comm;
 +
 +        comm->cglink = make_charge_group_links(mtop,dd,cginfo_mb);
 +
 +        comm->bLocalCG = init_bLocalCG(mtop);
 +    }
 +    else
 +    {
 +        /* Only communicate atoms based on cut-off */
 +        comm->cglink   = NULL;
 +        comm->bLocalCG = NULL;
 +    }
 +}
 +
 +static void print_dd_settings(FILE *fplog,gmx_domdec_t *dd,
 +                              t_inputrec *ir,
 +                              gmx_bool bDynLoadBal,real dlb_scale,
 +                              gmx_ddbox_t *ddbox)
 +{
 +    gmx_domdec_comm_t *comm;
 +    int  d;
 +    ivec np;
 +    real limit,shrink;
 +    char buf[64];
 +
 +    if (fplog == NULL)
 +    {
 +        return;
 +    }
 +
 +    comm = dd->comm;
 +
 +    if (bDynLoadBal)
 +    {
 +        fprintf(fplog,"The maximum number of communication pulses is:");
 +        for(d=0; d<dd->ndim; d++)
 +        {
 +            fprintf(fplog," %c %d",dim2char(dd->dim[d]),comm->cd[d].np_dlb);
 +        }
 +        fprintf(fplog,"\n");
 +        fprintf(fplog,"The minimum size for domain decomposition cells is %.3f nm\n",comm->cellsize_limit);
 +        fprintf(fplog,"The requested allowed shrink of DD cells (option -dds) is: %.2f\n",dlb_scale);
 +        fprintf(fplog,"The allowed shrink of domain decomposition cells is:");
 +        for(d=0; d<DIM; d++)
 +        {
 +            if (dd->nc[d] > 1)
 +            {
 +                if (d >= ddbox->npbcdim && dd->nc[d] == 2)
 +                {
 +                    shrink = 0;
 +                }
 +                else
 +                {
 +                    shrink =
 +                        comm->cellsize_min_dlb[d]/
 +                        (ddbox->box_size[d]*ddbox->skew_fac[d]/dd->nc[d]);
 +                }
 +                fprintf(fplog," %c %.2f",dim2char(d),shrink);
 +            }
 +        }
 +        fprintf(fplog,"\n");
 +    }
 +    else
 +    {
 +        set_dd_cell_sizes_slb(dd,ddbox,FALSE,np);
 +        fprintf(fplog,"The initial number of communication pulses is:");
 +        for(d=0; d<dd->ndim; d++)
 +        {
 +            fprintf(fplog," %c %d",dim2char(dd->dim[d]),np[dd->dim[d]]);
 +        }
 +        fprintf(fplog,"\n");
 +        fprintf(fplog,"The initial domain decomposition cell size is:");
 +        for(d=0; d<DIM; d++)
 +        {
 +            if (dd->nc[d] > 1)
 +            {
 +                fprintf(fplog," %c %.2f nm",
 +                        dim2char(d),dd->comm->cellsize_min[d]);
 +            }
 +        }
 +        fprintf(fplog,"\n\n");
 +    }
 +    
 +    if (comm->bInterCGBondeds || dd->vsite_comm || dd->constraint_comm)
 +    {
 +        fprintf(fplog,"The maximum allowed distance for charge groups involved in interactions is:\n");
 +        fprintf(fplog,"%40s  %-7s %6.3f nm\n",
 +                "non-bonded interactions","",comm->cutoff);
 +
 +        if (bDynLoadBal)
 +        {
 +            limit = dd->comm->cellsize_limit;
 +        }
 +        else
 +        {
 +            if (dynamic_dd_box(ddbox,ir))
 +            {
 +                fprintf(fplog,"(the following are initial values, they could change due to box deformation)\n");
 +            }
 +            limit = dd->comm->cellsize_min[XX];
 +            for(d=1; d<DIM; d++)
 +            {
 +                limit = min(limit,dd->comm->cellsize_min[d]);
 +            }
 +        }
 +
 +        if (comm->bInterCGBondeds)
 +        {
 +            fprintf(fplog,"%40s  %-7s %6.3f nm\n",
 +                    "two-body bonded interactions","(-rdd)",
 +                    max(comm->cutoff,comm->cutoff_mbody));
 +            fprintf(fplog,"%40s  %-7s %6.3f nm\n",
 +                    "multi-body bonded interactions","(-rdd)",
 +                    (comm->bBondComm || dd->bGridJump) ? comm->cutoff_mbody : min(comm->cutoff,limit));
 +        }
 +        if (dd->vsite_comm)
 +        {
 +            fprintf(fplog,"%40s  %-7s %6.3f nm\n",
 +                    "virtual site constructions","(-rcon)",limit);
 +        }
 +        if (dd->constraint_comm)
 +        {
 +            sprintf(buf,"atoms separated by up to %d constraints",
 +                    1+ir->nProjOrder);
 +            fprintf(fplog,"%40s  %-7s %6.3f nm\n",
 +                    buf,"(-rcon)",limit);
 +        }
 +        fprintf(fplog,"\n");
 +    }
 +    
 +    fflush(fplog);
 +}
 +
 +static void set_cell_limits_dlb(gmx_domdec_t *dd,
 +                                real dlb_scale,
 +                                const t_inputrec *ir,
 +                                const gmx_ddbox_t *ddbox)
 +{
 +    gmx_domdec_comm_t *comm;
 +    int  d,dim,npulse,npulse_d_max,npulse_d;
 +    gmx_bool bNoCutOff;
 +
 +    comm = dd->comm;
 +
 +    bNoCutOff = (ir->rvdw == 0 || ir->rcoulomb == 0);
 +
 +    /* Determine the maximum number of comm. pulses in one dimension */
 +        
 +    comm->cellsize_limit = max(comm->cellsize_limit,comm->cutoff_mbody);
 +        
 +    /* Determine the maximum required number of grid pulses */
 +    if (comm->cellsize_limit >= comm->cutoff)
 +    {
 +        /* Only a single pulse is required */
 +        npulse = 1;
 +    }
 +    else if (!bNoCutOff && comm->cellsize_limit > 0)
 +    {
 +        /* We round down slightly here to avoid overhead due to the latency
 +         * of extra communication calls when the cut-off
 +         * would be only slightly longer than the cell size.
 +         * Later cellsize_limit is redetermined,
 +         * so we can not miss interactions due to this rounding.
 +         */
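 +        /* Example: with cutoff/cellsize_limit = 2.03 this gives
 +         * npulse = (int)(0.96 + 2.03) = 2, not 3.
 +         */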
 +        npulse = (int)(0.96 + comm->cutoff/comm->cellsize_limit);
 +    }
 +    else
 +    {
 +        /* There is no cell size limit */
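 +        /* Without a limit, at most nc[d]-1 pulses are needed to reach
 +         * all cells along dimension d.
 +         */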
 +        npulse = max(dd->nc[XX]-1,max(dd->nc[YY]-1,dd->nc[ZZ]-1));
 +    }
 +
 +    if (!bNoCutOff && npulse > 1)
 +    {
 +        /* See if we can do with less pulses, based on dlb_scale */
 +        npulse_d_max = 0;
 +        for(d=0; d<dd->ndim; d++)
 +        {
 +            dim = dd->dim[d];
 +            npulse_d = (int)(1 + dd->nc[dim]*comm->cutoff
 +                             /(ddbox->box_size[dim]*ddbox->skew_fac[dim]*dlb_scale));
 +            npulse_d_max = max(npulse_d_max,npulse_d);
 +        }
 +        npulse = min(npulse,npulse_d_max);
 +    }
 +
 +    /* This env var can override npulse */
 +    d = dd_nst_env(debug,"GMX_DD_NPULSE",0);
 +    if (d > 0)
 +    {
 +        npulse = d;
 +    }
 +
 +    comm->maxpulse = 1;
 +    comm->bVacDLBNoLimit = (ir->ePBC == epbcNONE);
 +    for(d=0; d<dd->ndim; d++)
 +    {
 +        comm->cd[d].np_dlb = min(npulse,dd->nc[dd->dim[d]]-1);
 +        comm->cd[d].np_nalloc = comm->cd[d].np_dlb;
 +        snew(comm->cd[d].ind,comm->cd[d].np_nalloc);
 +        comm->maxpulse = max(comm->maxpulse,comm->cd[d].np_dlb);
 +        if (comm->cd[d].np_dlb < dd->nc[dd->dim[d]]-1)
 +        {
 +            comm->bVacDLBNoLimit = FALSE;
 +        }
 +    }
 +
 +    /* cellsize_limit is set for LINCS in init_domain_decomposition */
 +    if (!comm->bVacDLBNoLimit)
 +    {
 +        comm->cellsize_limit = max(comm->cellsize_limit,
 +                                   comm->cutoff/comm->maxpulse);
 +    }
 +    comm->cellsize_limit = max(comm->cellsize_limit,comm->cutoff_mbody);
 +    /* Set the minimum cell size for each DD dimension */
 +    for(d=0; d<dd->ndim; d++)
 +    {
 +        if (comm->bVacDLBNoLimit ||
 +            comm->cd[d].np_dlb*comm->cellsize_limit >= comm->cutoff)
 +        {
 +            comm->cellsize_min_dlb[dd->dim[d]] = comm->cellsize_limit;
 +        }
 +        else
 +        {
 +            comm->cellsize_min_dlb[dd->dim[d]] =
 +                comm->cutoff/comm->cd[d].np_dlb;
 +        }
 +    }
 +    if (comm->cutoff_mbody <= 0)
 +    {
 +        comm->cutoff_mbody = min(comm->cutoff,comm->cellsize_limit);
 +    }
 +    if (comm->bDynLoadBal)
 +    {
 +        set_dlb_limits(dd);
 +    }
 +}
 +
 +gmx_bool dd_bonded_molpbc(gmx_domdec_t *dd,int ePBC)
 +{
 +    /* If each molecule is a single charge group
 +     * or we use domain decomposition for each periodic dimension,
 +     * we do not need to take pbc into account for the bonded interactions.
 +     */
 +    return (ePBC != epbcNONE && dd->comm->bInterCGBondeds &&
 +            !(dd->nc[XX]>1 &&
 +              dd->nc[YY]>1 &&
 +              (dd->nc[ZZ]>1 || ePBC==epbcXY)));
 +}
 +
 +void set_dd_parameters(FILE *fplog,gmx_domdec_t *dd,real dlb_scale,
 +                       t_inputrec *ir,t_forcerec *fr,
 +                       gmx_ddbox_t *ddbox)
 +{
 +    gmx_domdec_comm_t *comm;
 +    int  natoms_tot;
 +    real vol_frac;
 +
 +    comm = dd->comm;
 +
 +    /* Initialize the thread data.
 +     * This can not be done in init_domain_decomposition,
 +     * as the number of threads is determined later.
 +     */
 +    comm->nth = gmx_omp_nthreads_get(emntDomdec);
 +    if (comm->nth > 1)
 +    {
 +        snew(comm->dth,comm->nth);
 +    }
 +
 +    if (EEL_PME(ir->coulombtype))
 +    {
 +        init_ddpme(dd,&comm->ddpme[0],0);
 +        if (comm->npmedecompdim >= 2)
 +        {
 +            init_ddpme(dd,&comm->ddpme[1],1);
 +        }
 +    }
 +    else
 +    {
 +        comm->npmenodes = 0;
 +        if (dd->pme_nodeid >= 0)
 +        {
 +            gmx_fatal_collective(FARGS,NULL,dd,
 +                                 "Can not have separate PME nodes without PME electrostatics");
 +        }
 +    }
 +        
 +    if (debug)
 +    {
 +        fprintf(debug,"The DD cut-off is %f\n",comm->cutoff);
 +    }
 +    if (comm->eDLB != edlbNO)
 +    {
 +        set_cell_limits_dlb(dd,dlb_scale,ir,ddbox);
 +    }
 +    
 +    print_dd_settings(fplog,dd,ir,comm->bDynLoadBal,dlb_scale,ddbox);
 +    if (comm->eDLB == edlbAUTO)
 +    {
 +        if (fplog)
 +        {
 +            fprintf(fplog,"When dynamic load balancing gets turned on, these settings will change to:\n");
 +        }
 +        print_dd_settings(fplog,dd,ir,TRUE,dlb_scale,ddbox);
 +    }
 +
 +    if (ir->ePBC == epbcNONE)
 +    {
 +        vol_frac = 1 - 1/(double)dd->nnodes;
 +    }
 +    else
 +    {
 +        vol_frac =
 +            (1 + comm_box_frac(dd->nc,comm->cutoff,ddbox))/(double)dd->nnodes;
 +    }
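 +    /* vol_frac estimates the fraction of the total volume that all zones
 +     * of one domain occupy; below it is used to size the global-to-local
 +     * atom lookup structure.
 +     */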
 +    if (debug)
 +    {
 +        fprintf(debug,"Volume fraction for all DD zones: %f\n",vol_frac);
 +    }
 +    natoms_tot = comm->cgs_gl.index[comm->cgs_gl.nr];
 +   
 +    dd->ga2la = ga2la_init(natoms_tot,vol_frac*natoms_tot);
 +}
 +
-     dd->comm->cutoff = cutoff_req;
++static gmx_bool test_dd_cutoff(t_commrec *cr,
++                               t_state *state,t_inputrec *ir,
++                               real cutoff_req)
 +{
 +    gmx_domdec_t *dd;
 +    gmx_ddbox_t ddbox;
 +    int d,dim,np;
 +    real inv_cell_size;
 +    int LocallyLimited;
 +
 +    dd = cr->dd;
 +
 +    set_ddbox(dd,FALSE,cr,ir,state->box,
 +              TRUE,&dd->comm->cgs_gl,state->x,&ddbox);
 +
 +    LocallyLimited = 0;
 +
 +    for(d=0; d<dd->ndim; d++)
 +    {
 +        dim = dd->dim[d];
 +
 +        inv_cell_size = DD_CELL_MARGIN*dd->nc[dim]/ddbox.box_size[dim];
 +        if (dynamic_dd_box(&ddbox,ir))
 +        {
 +            inv_cell_size *= DD_PRES_SCALE_MARGIN;
 +        }
 +
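 +        /* np is the number of communication pulses required along
 +         * this dimension for the requested cut-off.
 +         */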
 +        np = 1 + (int)(cutoff_req*inv_cell_size*ddbox.skew_fac[dim]);
 +
 +        if (dd->comm->eDLB != edlbNO && dim < ddbox.npbcdim &&
 +            dd->comm->cd[d].np_dlb > 0)
 +        {
 +            if (np > dd->comm->cd[d].np_dlb)
 +            {
 +                return FALSE;
 +            }
 +
 +            /* If the current local cell size is smaller than the requested
 +             * cut-off, we could still fix it, but this gets very complicated.
 +             * Without fixing here, we might actually need more checks.
 +             */
 +            if ((dd->comm->cell_x1[dim] - dd->comm->cell_x0[dim])*ddbox.skew_fac[dim]*dd->comm->cd[d].np_dlb < cutoff_req)
 +            {
 +                LocallyLimited = 1;
 +            }
 +        }
 +    }
 +
 +    if (dd->comm->eDLB != edlbNO)
 +    {
 +        /* If DLB is not active yet, we don't need to check the grid jumps.
 +         * Actually we shouldn't, because then the grid jump data is not set.
 +         */
 +        if (dd->comm->bDynLoadBal &&
 +            check_grid_jump(0,dd,cutoff_req,&ddbox,FALSE))
 +        {
 +            LocallyLimited = 1; 
 +        }
 +
 +        gmx_sumi(1,&LocallyLimited,cr);
 +
 +        if (LocallyLimited > 0)
 +        {
 +            return FALSE;
 +        }
 +    }
 +
 +    return TRUE;
 +}
 +
++gmx_bool change_dd_cutoff(t_commrec *cr,t_state *state,t_inputrec *ir,
++                          real cutoff_req)
++{
++    gmx_bool bCutoffAllowed;
++
++    bCutoffAllowed = test_dd_cutoff(cr,state,ir,cutoff_req);
++
++    if (bCutoffAllowed)
++    {
++        cr->dd->comm->cutoff = cutoff_req;
++    }
++
++    return bCutoffAllowed;
++}
++
++void change_dd_dlb_cutoff_limit(t_commrec *cr)
++{
++    gmx_domdec_comm_t *comm;
++
++    comm = cr->dd->comm;
++
++    /* Turn on the DLB limiting (might have been on already) */
++    comm->bPMELoadBalDLBLimits = TRUE;
++
++    /* Change the cut-off limit */
++    comm->PMELoadBal_max_cutoff = comm->cutoff;
++}
++
 +static void merge_cg_buffers(int ncell,
 +                             gmx_domdec_comm_dim_t *cd, int pulse,
 +                             int  *ncg_cell,
 +                             int  *index_gl, int  *recv_i,
 +                             rvec *cg_cm,    rvec *recv_vr,
 +                             int *cgindex,
 +                             cginfo_mb_t *cginfo_mb,int *cginfo)
 +{
 +    gmx_domdec_ind_t *ind,*ind_p;
 +    int p,cell,c,cg,cg0,cg1,cg_gl,nat;
 +    int shift,shift_at;
 +    
 +    ind = &cd->ind[pulse];
 +    
 +    /* First correct the already stored data */
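 +    /* ind->nrecv[ncell] is the total number of received charge groups;
 +     * below, shift becomes the number received for the cells below 'cell',
 +     * by which the stored entries of 'cell' must move up.
 +     */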
 +    shift = ind->nrecv[ncell];
 +    for(cell=ncell-1; cell>=0; cell--)
 +    {
 +        shift -= ind->nrecv[cell];
 +        if (shift > 0)
 +        {
 +            /* Move the cg's present from previous grid pulses */
 +            cg0 = ncg_cell[ncell+cell];
 +            cg1 = ncg_cell[ncell+cell+1];
 +            cgindex[cg1+shift] = cgindex[cg1];
 +            for(cg=cg1-1; cg>=cg0; cg--)
 +            {
 +                index_gl[cg+shift] = index_gl[cg];
 +                copy_rvec(cg_cm[cg],cg_cm[cg+shift]);
 +                cgindex[cg+shift] = cgindex[cg];
 +                cginfo[cg+shift] = cginfo[cg];
 +            }
 +            /* Correct the already stored send indices for the shift */
 +            for(p=1; p<=pulse; p++)
 +            {
 +                ind_p = &cd->ind[p];
 +                cg0 = 0;
 +                for(c=0; c<cell; c++)
 +                {
 +                    cg0 += ind_p->nsend[c];
 +                }
 +                cg1 = cg0 + ind_p->nsend[cell];
 +                for(cg=cg0; cg<cg1; cg++)
 +                {
 +                    ind_p->index[cg] += shift;
 +                }
 +            }
 +        }
 +    }
 +
 +    /* Merge in the communicated buffers */
 +    shift = 0;
 +    shift_at = 0;
 +    cg0 = 0;
 +    for(cell=0; cell<ncell; cell++)
 +    {
 +        cg1 = ncg_cell[ncell+cell+1] + shift;
 +        if (shift_at > 0)
 +        {
 +            /* Correct the old cg indices */
 +            for(cg=ncg_cell[ncell+cell]; cg<cg1; cg++)
 +            {
 +                cgindex[cg+1] += shift_at;
 +            }
 +        }
 +        for(cg=0; cg<ind->nrecv[cell]; cg++)
 +        {
 +            /* Copy this charge group from the buffer */
 +            index_gl[cg1] = recv_i[cg0];
 +            copy_rvec(recv_vr[cg0],cg_cm[cg1]);
 +            /* Add it to the cgindex */
 +            cg_gl = index_gl[cg1];
 +            cginfo[cg1] = ddcginfo(cginfo_mb,cg_gl);
 +            nat = GET_CGINFO_NATOMS(cginfo[cg1]);
 +            cgindex[cg1+1] = cgindex[cg1] + nat;
 +            cg0++;
 +            cg1++;
 +            shift_at += nat;
 +        }
 +        shift += ind->nrecv[cell];
 +        ncg_cell[ncell+cell+1] = cg1;
 +    }
 +}
 +
 +static void make_cell2at_index(gmx_domdec_comm_dim_t *cd,
 +                               int nzone,int cg0,const int *cgindex)
 +{
 +    int cg,zone,p;
 +    
 +    /* Store the atom block boundaries for easy copying of communication buffers
 +     */
 +    cg = cg0;
 +    for(zone=0; zone<nzone; zone++)
 +    {
 +        for(p=0; p<cd->np; p++)
 +        {
 +            cd->ind[p].cell2at0[zone] = cgindex[cg];
 +            cg += cd->ind[p].nrecv[zone];
 +            cd->ind[p].cell2at1[zone] = cgindex[cg];
 +        }
 +    }
 +}
 +
 +static gmx_bool missing_link(t_blocka *link,int cg_gl,char *bLocalCG)
 +{
 +    int  i;
 +    gmx_bool bMiss;
 +
 +    bMiss = FALSE;
 +    for(i=link->index[cg_gl]; i<link->index[cg_gl+1]; i++)
 +    {
 +        if (!bLocalCG[link->a[i]])
 +        {
 +            bMiss = TRUE;
 +        }
 +    }
 +
 +    return bMiss;
 +}
 +
 +/* Domain corners for communication, a maximum of 4 i-zones see a j domain */
 +typedef struct {
 +    real c[DIM][4]; /* the corners for the non-bonded communication */
 +    real cr0;       /* corner for rounding */
 +    real cr1[4];    /* corners for rounding */
 +    real bc[DIM];   /* corners for bounded communication */
 +    real bcr1;      /* corner for rounding for bonded communication */
 +} dd_corners_t;
 +
 +/* Determine the corners of the domain(s) we are communicating with */
 +static void
 +set_dd_corners(const gmx_domdec_t *dd,
 +               int dim0, int dim1, int dim2,
 +               gmx_bool bDistMB,
 +               dd_corners_t *c)
 +{
 +    const gmx_domdec_comm_t *comm;
 +    const gmx_domdec_zones_t *zones;
 +    int i,j;
 +
 +    comm = dd->comm;
 +
 +    zones = &comm->zones;
 +
 +    /* Keep the compiler happy */
 +    c->cr0  = 0;
 +    c->bcr1 = 0;
 +
 +    /* The first dimension is equal for all cells */
 +    c->c[0][0] = comm->cell_x0[dim0];
 +    if (bDistMB)
 +    {
 +        c->bc[0] = c->c[0][0];
 +    }
 +    if (dd->ndim >= 2)
 +    {
 +        dim1 = dd->dim[1];
 +        /* This cell row is only seen from the first row */
 +        c->c[1][0] = comm->cell_x0[dim1];
 +        /* All rows can see this row */
 +        c->c[1][1] = comm->cell_x0[dim1];
 +        if (dd->bGridJump)
 +        {
 +            c->c[1][1] = max(comm->cell_x0[dim1],comm->zone_d1[1].mch0);
 +            if (bDistMB)
 +            {
 +                /* For the multi-body distance we need the maximum */
 +                c->bc[1] = max(comm->cell_x0[dim1],comm->zone_d1[1].p1_0);
 +            }
 +        }
 +        /* Set the upper-right corner for rounding */
 +        c->cr0 = comm->cell_x1[dim0];
 +        
 +        if (dd->ndim >= 3)
 +        {
 +            dim2 = dd->dim[2];
 +            for(j=0; j<4; j++)
 +            {
 +                c->c[2][j] = comm->cell_x0[dim2];
 +            }
 +            if (dd->bGridJump)
 +            {
 +                /* Use the maximum of the i-cells that see a j-cell */
 +                for(i=0; i<zones->nizone; i++)
 +                {
 +                    for(j=zones->izone[i].j0; j<zones->izone[i].j1; j++)
 +                    {
 +                        if (j >= 4)
 +                        {
 +                            c->c[2][j-4] =
 +                                max(c->c[2][j-4],
 +                                    comm->zone_d2[zones->shift[i][dim0]][zones->shift[i][dim1]].mch0);
 +                        }
 +                    }
 +                }
 +                if (bDistMB)
 +                {
 +                    /* For the multi-body distance we need the maximum */
 +                    c->bc[2] = comm->cell_x0[dim2];
 +                    for(i=0; i<2; i++)
 +                    {
 +                        for(j=0; j<2; j++)
 +                        {
 +                            c->bc[2] = max(c->bc[2],comm->zone_d2[i][j].p1_0);
 +                        }
 +                    }
 +                }
 +            }
 +            
 +            /* Set the upper-right corner for rounding */
 +            /* Cell (0,0,0) and cell (1,0,0) can see cell 4 (0,1,1)
 +             * Only cell (0,0,0) can see cell 7 (1,1,1)
 +             */
 +            c->cr1[0] = comm->cell_x1[dim1];
 +            c->cr1[3] = comm->cell_x1[dim1];
 +            if (dd->bGridJump)
 +            {
 +                c->cr1[0] = max(comm->cell_x1[dim1],comm->zone_d1[1].mch1);
 +                if (bDistMB)
 +                {
 +                    /* For the multi-body distance we need the maximum */
 +                    c->bcr1 = max(comm->cell_x1[dim1],comm->zone_d1[1].p1_1);
 +                }
 +            }
 +        }
 +    }
 +}
 +
 +/* Determine which cg's we need to send in this pulse from this zone */
 +static void
 +get_zone_pulse_cgs(gmx_domdec_t *dd,
 +                   int zonei, int zone,
 +                   int cg0, int cg1,
 +                   const int *index_gl,
 +                   const int *cgindex,
 +                   int dim, int dim_ind,
 +                   int dim0, int dim1, int dim2,
 +                   real r_comm2, real r_bcomm2,
 +                   matrix box,
 +                   ivec tric_dist,
 +                   rvec *normal,
 +                   real skew_fac2_d, real skew_fac_01,
 +                   rvec *v_d, rvec *v_0, rvec *v_1,
 +                   const dd_corners_t *c,
 +                   rvec sf2_round,
 +                   gmx_bool bDistBonded,
 +                   gmx_bool bBondComm,
 +                   gmx_bool bDist2B,
 +                   gmx_bool bDistMB,
 +                   rvec *cg_cm,
 +                   int *cginfo,
 +                   gmx_domdec_ind_t *ind,
 +                   int **ibuf, int *ibuf_nalloc,
 +                   vec_rvec_t *vbuf,
 +                   int *nsend_ptr,
 +                   int *nat_ptr,
 +                   int *nsend_z_ptr)
 +{
 +    gmx_domdec_comm_t *comm;
 +    gmx_bool bScrew;
 +    gmx_bool bDistMB_pulse;
 +    int  cg,i;
 +    real r2,rb2,r,tric_sh;
 +    rvec rn,rb;
 +    int  dimd;
 +    int  nsend_z,nsend,nat;
 +
 +    comm = dd->comm;
 +
 +    bScrew = (dd->bScrewPBC && dim == XX);
 +
 +    bDistMB_pulse = (bDistMB && bDistBonded);
 +
 +    nsend_z = 0;
 +    nsend   = *nsend_ptr;
 +    nat     = *nat_ptr;
 +
 +    for(cg=cg0; cg<cg1; cg++)
 +    {
 +        r2  = 0;
 +        rb2 = 0;
 +        if (tric_dist[dim_ind] == 0)
 +        {
 +            /* Rectangular direction, easy */
 +            r = cg_cm[cg][dim] - c->c[dim_ind][zone];
 +            if (r > 0)
 +            {
 +                r2 += r*r;
 +            }
 +            if (bDistMB_pulse)
 +            {
 +                r = cg_cm[cg][dim] - c->bc[dim_ind];
 +                if (r > 0)
 +                {
 +                    rb2 += r*r;
 +                }
 +            }
 +            /* Rounding gives at most a 16% reduction
 +             * in communicated atoms
 +             */
 +            if (dim_ind >= 1 && (zonei == 1 || zonei == 2))
 +            {
 +                r = cg_cm[cg][dim0] - c->cr0;
 +                /* This is the first dimension, so always r >= 0 */
 +                r2 += r*r;
 +                if (bDistMB_pulse)
 +                {
 +                    rb2 += r*r;
 +                }
 +            }
 +            if (dim_ind == 2 && (zonei == 2 || zonei == 3))
 +            {
 +                r = cg_cm[cg][dim1] - c->cr1[zone];
 +                if (r > 0)
 +                {
 +                    r2 += r*r;
 +                }
 +                if (bDistMB_pulse)
 +                {
 +                    r = cg_cm[cg][dim1] - c->bcr1;
 +                    if (r > 0)
 +                    {
 +                        rb2 += r*r;
 +                    }
 +                }
 +            }
 +        }
 +        else
 +        {
 +            /* Triclinic direction, more complicated */
 +            clear_rvec(rn);
 +            clear_rvec(rb);
 +            /* Rounding, conservative as the skew_fac multiplication
 +             * will slightly underestimate the distance.
 +             */
 +            if (dim_ind >= 1 && (zonei == 1 || zonei == 2))
 +            {
 +                rn[dim0] = cg_cm[cg][dim0] - c->cr0;
 +                for(i=dim0+1; i<DIM; i++)
 +                {
 +                    rn[dim0] -= cg_cm[cg][i]*v_0[i][dim0];
 +                }
 +                r2 = rn[dim0]*rn[dim0]*sf2_round[dim0];
 +                if (bDistMB_pulse)
 +                {
 +                    rb[dim0] = rn[dim0];
 +                    rb2 = r2;
 +                }
 +                /* Take care that the cell planes along dim0 might not
 +                 * be orthogonal to those along dim1 and dim2.
 +                 */
 +                for(i=1; i<=dim_ind; i++)
 +                {
 +                    dimd = dd->dim[i];
 +                    if (normal[dim0][dimd] > 0)
 +                    {
 +                        rn[dimd] -= rn[dim0]*normal[dim0][dimd];
 +                        if (bDistMB_pulse)
 +                        {
 +                            rb[dimd] -= rb[dim0]*normal[dim0][dimd];
 +                        }
 +                    }
 +                }
 +            }
 +            if (dim_ind == 2 && (zonei == 2 || zonei == 3))
 +            {
 +                rn[dim1] += cg_cm[cg][dim1] - c->cr1[zone];
 +                tric_sh = 0;
 +                for(i=dim1+1; i<DIM; i++)
 +                {
 +                    tric_sh -= cg_cm[cg][i]*v_1[i][dim1];
 +                }
 +                rn[dim1] += tric_sh;
 +                if (rn[dim1] > 0)
 +                {
 +                    r2 += rn[dim1]*rn[dim1]*sf2_round[dim1];
 +                    /* Take care of coupling of the distances
 +                     * to the planes along dim0 and dim1 through dim2.
 +                     */
 +                    r2 -= rn[dim0]*rn[dim1]*skew_fac_01;
 +                    /* Take care that the cell planes along dim1
 +                     * might not be orthogonal to that along dim2.
 +                     */
 +                    if (normal[dim1][dim2] > 0)
 +                    {
 +                        rn[dim2] -= rn[dim1]*normal[dim1][dim2];
 +                    }
 +                }
 +                if (bDistMB_pulse)
 +                {
 +                    rb[dim1] +=
 +                        cg_cm[cg][dim1] - c->bcr1 + tric_sh;
 +                    if (rb[dim1] > 0)
 +                    {
 +                        rb2 += rb[dim1]*rb[dim1]*sf2_round[dim1];
 +                        /* Take care of coupling of the distances
 +                         * to the planes along dim0 and dim1 through dim2.
 +                         */
 +                        rb2 -= rb[dim0]*rb[dim1]*skew_fac_01;
 +                        /* Take care that the cell planes along dim1
 +                         * might not be orthogonal to that along dim2.
 +                         */
 +                        if (normal[dim1][dim2] > 0)
 +                        {
 +                            rb[dim2] -= rb[dim1]*normal[dim1][dim2];
 +                        }
 +                    }
 +                }
 +            }
 +            /* The distance along the communication direction */
 +            rn[dim] += cg_cm[cg][dim] - c->c[dim_ind][zone];
 +            tric_sh = 0;
 +            for(i=dim+1; i<DIM; i++)
 +            {
 +                tric_sh -= cg_cm[cg][i]*v_d[i][dim];
 +            }
 +            rn[dim] += tric_sh;
 +            if (rn[dim] > 0)
 +            {
 +                r2 += rn[dim]*rn[dim]*skew_fac2_d;
 +                /* Take care of coupling of the distances
 +                 * to the planes along dim0 and dim1 through dim2.
 +                 */
 +                if (dim_ind == 1 && zonei == 1)
 +                {
 +                    r2 -= rn[dim0]*rn[dim]*skew_fac_01;
 +                }
 +            }
 +            if (bDistMB_pulse)
 +            {
 +                clear_rvec(rb);
 +                rb[dim] += cg_cm[cg][dim] - c->bc[dim_ind] + tric_sh;
 +                if (rb[dim] > 0)
 +                {
 +                    rb2 += rb[dim]*rb[dim]*skew_fac2_d;
 +                    /* Take care of coupling of the distances
 +                     * to the planes along dim0 and dim1 through dim2.
 +                     */
 +                    if (dim_ind == 1 && zonei == 1)
 +                    {
 +                        rb2 -= rb[dim0]*rb[dim]*skew_fac_01;
 +                    }
 +                }
 +            }
 +        }
 +        
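 +        /* Send this charge group when it is within the non-bonded cut-off,
 +         * or, during the bonded-distance pulse, when it is within the bonded
 +         * cut-off and, with bBondComm, is involved in inter-cg bonded
 +         * interactions with a charge group that is not present locally.
 +         */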
 +        if (r2 < r_comm2 ||
 +            (bDistBonded &&
 +             ((bDistMB && rb2 < r_bcomm2) ||
 +              (bDist2B && r2  < r_bcomm2)) &&
 +             (!bBondComm ||
 +              (GET_CGINFO_BOND_INTER(cginfo[cg]) &&
 +               missing_link(comm->cglink,index_gl[cg],
 +                            comm->bLocalCG)))))
 +        {
 +            /* Make an index to the local charge groups */
 +            if (nsend+1 > ind->nalloc)
 +            {
 +                ind->nalloc = over_alloc_large(nsend+1);
 +                srenew(ind->index,ind->nalloc);
 +            }
 +            if (nsend+1 > *ibuf_nalloc)
 +            {
 +                *ibuf_nalloc = over_alloc_large(nsend+1);
 +                srenew(*ibuf,*ibuf_nalloc);
 +            }
 +            ind->index[nsend] = cg;
 +            (*ibuf)[nsend] = index_gl[cg];
 +            nsend_z++;
 +            vec_rvec_check_alloc(vbuf,nsend+1);
 +            
 +            if (dd->ci[dim] == 0)
 +            {
 +                /* Correct cg_cm for pbc */
 +                rvec_add(cg_cm[cg],box[dim],vbuf->v[nsend]);
 +                if (bScrew)
 +                {
 +                    vbuf->v[nsend][YY] = box[YY][YY] - vbuf->v[nsend][YY];
 +                    vbuf->v[nsend][ZZ] = box[ZZ][ZZ] - vbuf->v[nsend][ZZ];
 +                }
 +            }
 +            else
 +            {
 +                copy_rvec(cg_cm[cg],vbuf->v[nsend]);
 +            }
 +            nsend++;
 +            nat += cgindex[cg+1] - cgindex[cg];
 +        }
 +    }
 +
 +    *nsend_ptr   = nsend;
 +    *nat_ptr     = nat;
 +    *nsend_z_ptr = nsend_z;
 +}
 +
 +static void setup_dd_communication(gmx_domdec_t *dd,
 +                                   matrix box,gmx_ddbox_t *ddbox,
 +                                   t_forcerec *fr,t_state *state,rvec **f)
 +{
 +    int dim_ind,dim,dim0,dim1,dim2,dimd,p,nat_tot;
 +    int nzone,nzone_send,zone,zonei,cg0,cg1;
 +    int c,i,j,cg,cg_gl,nrcg;
 +    int *zone_cg_range,pos_cg,*index_gl,*cgindex,*recv_i;
 +    gmx_domdec_comm_t *comm;
 +    gmx_domdec_zones_t *zones;
 +    gmx_domdec_comm_dim_t *cd;
 +    gmx_domdec_ind_t *ind;
 +    cginfo_mb_t *cginfo_mb;
 +    gmx_bool bBondComm,bDist2B,bDistMB,bDistBonded;
 +    real r_mb,r_comm2,r_scomm2,r_bcomm2,r_0,r_1,r2inc,inv_ncg;
 +    dd_corners_t corners;
 +    ivec tric_dist;
 +    rvec *cg_cm,*normal,*v_d,*v_0=NULL,*v_1=NULL,*recv_vr;
 +    real skew_fac2_d,skew_fac_01;
 +    rvec sf2_round;
 +    int  nsend,nat;
 +    int  th;
 +    
 +    if (debug)
 +    {
 +        fprintf(debug,"Setting up DD communication\n");
 +    }
 +    
 +    comm  = dd->comm;
 +
 +    switch (fr->cutoff_scheme)
 +    {
 +    case ecutsGROUP:
 +        cg_cm = fr->cg_cm;
 +        break;
 +    case ecutsVERLET:
 +        cg_cm = state->x;
 +        break;
 +    default:
 +        gmx_incons("unimplemented");
 +        cg_cm = NULL;
 +    }
 +
 +    for(dim_ind=0; dim_ind<dd->ndim; dim_ind++)
 +    {
 +        dim = dd->dim[dim_ind];
 +
 +        /* Check if we need to use triclinic distances */
 +        tric_dist[dim_ind] = 0;
 +        for(i=0; i<=dim_ind; i++)
 +        {
 +            if (ddbox->tric_dir[dd->dim[i]])
 +            {
 +                tric_dist[dim_ind] = 1;
 +            }
 +        }
 +    }
 +
 +    bBondComm = comm->bBondComm;
 +
 +    /* Do we need to determine extra distances for multi-body bondeds? */
 +    bDistMB = (comm->bInterCGMultiBody && dd->bGridJump && dd->ndim > 1);
 +    
 +    /* Do we need to determine extra distances for only two-body bondeds? */
 +    bDist2B = (bBondComm && !bDistMB);
 +
 +    r_comm2  = sqr(comm->cutoff);
 +    r_bcomm2 = sqr(comm->cutoff_mbody);
 +
 +    if (debug)
 +    {
 +        fprintf(debug,"bBondComm %d, r_bc %f\n",bBondComm,sqrt(r_bcomm2));
 +    }
 +
 +    zones = &comm->zones;
 +    
 +    dim0 = dd->dim[0];
 +    dim1 = (dd->ndim >= 2 ? dd->dim[1] : -1);
 +    dim2 = (dd->ndim >= 3 ? dd->dim[2] : -1);
 +
 +    set_dd_corners(dd,dim0,dim1,dim2,bDistMB,&corners);
 +    
 +    /* Triclinic stuff */
 +    normal = ddbox->normal;
 +    skew_fac_01 = 0;
 +    if (dd->ndim >= 2)
 +    {
 +        v_0 = ddbox->v[dim0];
 +        if (ddbox->tric_dir[dim0] && ddbox->tric_dir[dim1])
 +        {
 +            /* Determine the coupling coefficient for the distances
 +             * to the cell planes along dim0 and dim1 through dim2.
 +             * This is required for correct rounding.
 +             */
 +            skew_fac_01 =
 +                ddbox->v[dim0][dim1+1][dim0]*ddbox->v[dim1][dim1+1][dim1];
 +            if (debug)
 +            {
 +                fprintf(debug,"\nskew_fac_01 %f\n",skew_fac_01);
 +            }
 +        }
 +    }
 +    if (dd->ndim >= 3)
 +    {
 +        v_1 = ddbox->v[dim1];
 +    }
 +    
 +    zone_cg_range = zones->cg_range;
 +    index_gl = dd->index_gl;
 +    cgindex  = dd->cgindex;
 +    cginfo_mb = fr->cginfo_mb;
 +    
 +    zone_cg_range[0]   = 0;
 +    zone_cg_range[1]   = dd->ncg_home;
 +    comm->zone_ncg1[0] = dd->ncg_home;
 +    pos_cg             = dd->ncg_home;
 +    
 +    nat_tot = dd->nat_home;
 +    nzone = 1;
 +    for(dim_ind=0; dim_ind<dd->ndim; dim_ind++)
 +    {
 +        dim = dd->dim[dim_ind];
 +        cd = &comm->cd[dim_ind];
 +        
 +        if (dim >= ddbox->npbcdim && dd->ci[dim] == 0)
 +        {
 +            /* No pbc in this dimension, the first node should not comm. */
 +            nzone_send = 0;
 +        }
 +        else
 +        {
 +            nzone_send = nzone;
 +        }
 +
 +        v_d = ddbox->v[dim];
 +        skew_fac2_d = sqr(ddbox->skew_fac[dim]);
 +
 +        cd->bInPlace = TRUE;
 +        for(p=0; p<cd->np; p++)
 +        {
 +            /* Only atoms communicated in the first pulse are used
 +             * for multi-body bonded interactions or for bBondComm.
 +             */
 +            bDistBonded = ((bDistMB || bDist2B) && p == 0);
 +
 +            ind = &cd->ind[p];
 +            nsend = 0;
 +            nat = 0;
 +            for(zone=0; zone<nzone_send; zone++)
 +            {
 +                if (tric_dist[dim_ind] && dim_ind > 0)
 +                {
 +                    /* Determine slightly more optimized skew_fac's
 +                     * for rounding.
 +                     * This reduces the number of communicated atoms
 +                     * by about 10% for 3D DD of rhombic dodecahedra.
 +                     */
 +                    for(dimd=0; dimd<dim; dimd++)
 +                    {
 +                        sf2_round[dimd] = 1;
 +                        if (ddbox->tric_dir[dimd])
 +                        {
 +                            for(i=dd->dim[dimd]+1; i<DIM; i++)
 +                            {
 +                                /* If we are shifted in dimension i
 +                                 * and the cell plane is tilted forward
 +                                 * in dimension i, skip this coupling.
 +                                 */
 +                                if (!(zones->shift[nzone+zone][i] &&
 +                                      ddbox->v[dimd][i][dimd] >= 0))
 +                                {
 +                                    sf2_round[dimd] +=
 +                                        sqr(ddbox->v[dimd][i][dimd]);
 +                                }
 +                            }
 +                            sf2_round[dimd] = 1/sf2_round[dimd];
 +                        }
 +                    }
 +                }
 +
 +                zonei = zone_perm[dim_ind][zone];
 +                if (p == 0)
 +                {
 +                    /* Here we permute the zones to obtain a convenient order
 +                     * for neighbor searching
 +                     */
 +                    cg0 = zone_cg_range[zonei];
 +                    cg1 = zone_cg_range[zonei+1];
 +                }
 +                else
 +                {
 +                    /* Look only at the cg's received in the previous grid pulse
 +                     */
 +                    cg1 = zone_cg_range[nzone+zone+1];
 +                    cg0 = cg1 - cd->ind[p-1].nrecv[zone];
 +                }
 +
 +#pragma omp parallel for num_threads(comm->nth) schedule(static)
 +                for(th=0; th<comm->nth; th++)
 +                {
 +                    gmx_domdec_ind_t *ind_p;
 +                    int **ibuf_p,*ibuf_nalloc_p;
 +                    vec_rvec_t *vbuf_p;
 +                    int *nsend_p,*nat_p;
 +                    int *nsend_zone_p;
 +                    int cg0_th,cg1_th;
 +
 +                    if (th == 0)
 +                    {
 +                        /* Thread 0 writes in the comm buffers */
 +                        ind_p         = ind;
 +                        ibuf_p        = &comm->buf_int;
 +                        ibuf_nalloc_p = &comm->nalloc_int;
 +                        vbuf_p        = &comm->vbuf;
 +                        nsend_p       = &nsend;
 +                        nat_p         = &nat;
 +                        nsend_zone_p  = &ind->nsend[zone];
 +                    }
 +                    else
 +                    {
 +                        /* Other threads write into temp buffers */
 +                        ind_p         = &comm->dth[th].ind;
 +                        ibuf_p        = &comm->dth[th].ibuf;
 +                        ibuf_nalloc_p = &comm->dth[th].ibuf_nalloc;
 +                        vbuf_p        = &comm->dth[th].vbuf;
 +                        nsend_p       = &comm->dth[th].nsend;
 +                        nat_p         = &comm->dth[th].nat;
 +                        nsend_zone_p  = &comm->dth[th].nsend_zone;
 +
 +                        comm->dth[th].nsend      = 0;
 +                        comm->dth[th].nat        = 0;
 +                        comm->dth[th].nsend_zone = 0;
 +                    }
 +
 +                    if (comm->nth == 1)
 +                    {
 +                        cg0_th = cg0;
 +                        cg1_th = cg1;
 +                    }
 +                    else
 +                    {
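 +                        /* Divide [cg0,cg1) statically over the threads;
 +                         * the integer arithmetic tiles the range exactly.
 +                         */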
 +                        cg0_th = cg0 + ((cg1 - cg0)* th   )/comm->nth;
 +                        cg1_th = cg0 + ((cg1 - cg0)*(th+1))/comm->nth;
 +                    }
 +                    
 +                    /* Get the cg's for this pulse in this zone */
 +                    get_zone_pulse_cgs(dd,zonei,zone,cg0_th,cg1_th,
 +                                       index_gl,cgindex,
 +                                       dim,dim_ind,dim0,dim1,dim2,
 +                                       r_comm2,r_bcomm2,
 +                                       box,tric_dist,
 +                                       normal,skew_fac2_d,skew_fac_01,
 +                                       v_d,v_0,v_1,&corners,sf2_round,
 +                                       bDistBonded,bBondComm,
 +                                       bDist2B,bDistMB,
 +                                       cg_cm,fr->cginfo,
 +                                       ind_p,
 +                                       ibuf_p,ibuf_nalloc_p,
 +                                       vbuf_p,
 +                                       nsend_p,nat_p,
 +                                       nsend_zone_p);
 +                }
 +
 +                /* Append data of threads>=1 to the communication buffers */
 +                for(th=1; th<comm->nth; th++)
 +                {
 +                    dd_comm_setup_work_t *dth;
 +                    int i,ns1;
 +
 +                    dth = &comm->dth[th];
 +
 +                    ns1 = nsend + dth->nsend_zone;
 +                    if (ns1 > ind->nalloc)
 +                    {
 +                        ind->nalloc = over_alloc_dd(ns1);
 +                        srenew(ind->index,ind->nalloc);
 +                    }
 +                    if (ns1 > comm->nalloc_int)
 +                    {
 +                        comm->nalloc_int = over_alloc_dd(ns1);
 +                        srenew(comm->buf_int,comm->nalloc_int);
 +                    }
 +                    if (ns1 > comm->vbuf.nalloc)
 +                    {
 +                        comm->vbuf.nalloc = over_alloc_dd(ns1);
 +                        srenew(comm->vbuf.v,comm->vbuf.nalloc);
 +                    }
 +
 +                    for(i=0; i<dth->nsend_zone; i++)
 +                    {
 +                        ind->index[nsend] = dth->ind.index[i];
 +                        comm->buf_int[nsend] = dth->ibuf[i];
 +                        copy_rvec(dth->vbuf.v[i],
 +                                  comm->vbuf.v[nsend]);
 +                        nsend++;
 +                    }
 +                    nat              += dth->nat;
 +                    ind->nsend[zone] += dth->nsend_zone;
 +                }
 +            }
 +            /* Clear the counts in case we do not have pbc */
 +            for(zone=nzone_send; zone<nzone; zone++)
 +            {
 +                ind->nsend[zone] = 0;
 +            }
 +            ind->nsend[nzone]   = nsend;
 +            ind->nsend[nzone+1] = nat;
 +            /* Communicate the number of cg's and atoms to receive */
 +            dd_sendrecv_int(dd, dim_ind, dddirBackward,
 +                            ind->nsend, nzone+2,
 +                            ind->nrecv, nzone+2);
 +            
 +            /* The rvec buffer is also required for atom buffers of size nsend
 +             * in dd_move_x and dd_move_f.
 +             */
 +            vec_rvec_check_alloc(&comm->vbuf,ind->nsend[nzone+1]);
 +
 +            if (p > 0)
 +            {
 +                /* We can receive in place if only the last zone is not empty */
 +                for(zone=0; zone<nzone-1; zone++)
 +                {
 +                    if (ind->nrecv[zone] > 0)
 +                    {
 +                        cd->bInPlace = FALSE;
 +                    }
 +                }
 +                if (!cd->bInPlace)
 +                {
 +                    /* The int buffer is only required here for the cg indices */
 +                    if (ind->nrecv[nzone] > comm->nalloc_int2)
 +                    {
 +                        comm->nalloc_int2 = over_alloc_dd(ind->nrecv[nzone]);
 +                        srenew(comm->buf_int2,comm->nalloc_int2);
 +                    }
 +                    /* The rvec buffer is also required for atom buffers
 +                     * of size nrecv in dd_move_x and dd_move_f.
 +                     */
 +                    i = max(cd->ind[0].nrecv[nzone+1],ind->nrecv[nzone+1]);
 +                    vec_rvec_check_alloc(&comm->vbuf2,i);
 +                }
 +            }
 +            
 +            /* Make space for the global cg indices */
 +            if (pos_cg + ind->nrecv[nzone] > dd->cg_nalloc
 +                || dd->cg_nalloc == 0)
 +            {
 +                dd->cg_nalloc = over_alloc_dd(pos_cg + ind->nrecv[nzone]);
 +                srenew(index_gl,dd->cg_nalloc);
 +                srenew(cgindex,dd->cg_nalloc+1);
 +            }
 +            /* Communicate the global cg indices */
 +            if (cd->bInPlace)
 +            {
 +                recv_i = index_gl + pos_cg;
 +            }
 +            else
 +            {
 +                recv_i = comm->buf_int2;
 +            }
 +            dd_sendrecv_int(dd, dim_ind, dddirBackward,
 +                            comm->buf_int, nsend,
 +                            recv_i,        ind->nrecv[nzone]);
 +
 +            /* Make space for cg_cm */
 +            dd_check_alloc_ncg(fr,state,f,pos_cg + ind->nrecv[nzone]);
 +            if (fr->cutoff_scheme == ecutsGROUP)
 +            {
 +                cg_cm = fr->cg_cm;
 +            }
 +            else
 +            {
 +                cg_cm = state->x;
 +            }
 +            /* Communicate cg_cm */
 +            if (cd->bInPlace)
 +            {
 +                recv_vr = cg_cm + pos_cg;
 +            }
 +            else
 +            {
 +                recv_vr = comm->vbuf2.v;
 +            }
 +            dd_sendrecv_rvec(dd, dim_ind, dddirBackward,
 +                             comm->vbuf.v, nsend,
 +                             recv_vr,      ind->nrecv[nzone]);
 +            
 +            /* Make the charge group index */
 +            if (cd->bInPlace)
 +            {
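 +                /* On the first pulse all zones receive new charge groups;
 +                 * on later pulses an in-place receive only adds to the
 +                 * last zone (the other zones were checked to be empty above).
 +                 */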
 +                zone = (p == 0 ? 0 : nzone - 1);
 +                while (zone < nzone)
 +                {
 +                    for(cg=0; cg<ind->nrecv[zone]; cg++)
 +                    {
 +                        cg_gl = index_gl[pos_cg];
 +                        fr->cginfo[pos_cg] = ddcginfo(cginfo_mb,cg_gl);
 +                        nrcg = GET_CGINFO_NATOMS(fr->cginfo[pos_cg]);
 +                        cgindex[pos_cg+1] = cgindex[pos_cg] + nrcg;
 +                        if (bBondComm)
 +                        {
 +                            /* Update the charge group presence,
 +                             * so we can use it in the next pass of the loop.
 +                             */
 +                            comm->bLocalCG[cg_gl] = TRUE;
 +                        }
 +                        pos_cg++;
 +                    }
 +                    if (p == 0)
 +                    {
 +                        comm->zone_ncg1[nzone+zone] = ind->nrecv[zone];
 +                    }
 +                    zone++;
 +                    zone_cg_range[nzone+zone] = pos_cg;
 +                }
 +            }
 +            else
 +            {
 +                /* This part of the code is never executed with bBondComm. */
 +                merge_cg_buffers(nzone,cd,p,zone_cg_range,
 +                                 index_gl,recv_i,cg_cm,recv_vr,
 +                                 cgindex,fr->cginfo_mb,fr->cginfo);
 +                pos_cg += ind->nrecv[nzone];
 +            }
 +            nat_tot += ind->nrecv[nzone+1];
 +        }
 +        if (!cd->bInPlace)
 +        {
 +            /* Store the atom block for easy copying of communication buffers */
 +            make_cell2at_index(cd,nzone,zone_cg_range[nzone],cgindex);
 +        }
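 +        /* The number of zones doubles with each decomposition dimension */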
 +        nzone += nzone;
 +    }
 +    dd->index_gl = index_gl;
 +    dd->cgindex  = cgindex;
 +    
 +    dd->ncg_tot = zone_cg_range[zones->n];
 +    dd->nat_tot = nat_tot;
 +    comm->nat[ddnatHOME] = dd->nat_home;
 +    for(i=ddnatZONE; i<ddnatNR; i++)
 +    {
 +        comm->nat[i] = dd->nat_tot;
 +    }
 +
 +    if (!bBondComm)
 +    {
 +        /* We don't need to update cginfo, since that was already done above.
 +         * So we pass NULL for the forcerec.
 +         */
 +        dd_set_cginfo(dd->index_gl,dd->ncg_home,dd->ncg_tot,
 +                      NULL,comm->bLocalCG);
 +    }
 +
 +    if (debug)
 +    {
 +        fprintf(debug,"Finished setting up DD communication, zones:");
 +        for(c=0; c<zones->n; c++)
 +        {
 +            fprintf(debug," %d",zones->cg_range[c+1]-zones->cg_range[c]);
 +        }
 +        fprintf(debug,"\n");
 +    }
 +}
 +
 +static void set_cg_boundaries(gmx_domdec_zones_t *zones)
 +{
 +    int c;
 +    
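 +    /* For each i-zone, record the end of its own cg range and the
 +     * cg range of the j-zones it interacts with, for use in
 +     * neighbor searching.
 +     */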
 +    for(c=0; c<zones->nizone; c++)
 +    {
 +        zones->izone[c].cg1  = zones->cg_range[c+1];
 +        zones->izone[c].jcg0 = zones->cg_range[zones->izone[c].j0];
 +        zones->izone[c].jcg1 = zones->cg_range[zones->izone[c].j1];
 +    }
 +}
 +
 +static void set_zones_size(gmx_domdec_t *dd,
 +                           matrix box,const gmx_ddbox_t *ddbox,
 +                           int zone_start,int zone_end)
 +{
 +    gmx_domdec_comm_t *comm;
 +    gmx_domdec_zones_t *zones;
 +    gmx_bool bDistMB;
 +    int  z,zi,zj0,zj1,d,dim;
 +    real rcs,rcmbs;
 +    int  i,j;
 +    real size_j,add_tric;
 +    real vol;
 +
 +    comm = dd->comm;
 +
 +    zones = &comm->zones;
 +
 +    /* Do we need to determine extra distances for multi-body bondeds? */
 +    bDistMB = (comm->bInterCGMultiBody && dd->bGridJump && dd->ndim > 1);
 +
 +    for(z=zone_start; z<zone_end; z++)
 +    {
 +        /* Copy cell limits to zone limits.
 +         * Valid for non-DD dims and non-shifted dims.
 +         */
 +        copy_rvec(comm->cell_x0,zones->size[z].x0);
 +        copy_rvec(comm->cell_x1,zones->size[z].x1);
 +    }
 +
 +    for(d=0; d<dd->ndim; d++)
 +    {
 +        dim = dd->dim[d];
 +
 +        for(z=0; z<zones->n; z++)
 +        {
 +            /* With a staggered grid we have different sizes
 +             * for non-shifted dimensions.
 +             */
 +            if (dd->bGridJump && zones->shift[z][dim] == 0)
 +            {
 +                if (d == 1)
 +                {
 +                    zones->size[z].x0[dim] = comm->zone_d1[zones->shift[z][dd->dim[d-1]]].min0;
 +                    zones->size[z].x1[dim] = comm->zone_d1[zones->shift[z][dd->dim[d-1]]].max1;
 +                }
 +                else if (d == 2)
 +                {
 +                    zones->size[z].x0[dim] = comm->zone_d2[zones->shift[z][dd->dim[d-2]]][zones->shift[z][dd->dim[d-1]]].min0;
 +                    zones->size[z].x1[dim] = comm->zone_d2[zones->shift[z][dd->dim[d-2]]][zones->shift[z][dd->dim[d-1]]].max1;
 +                }
 +            }
 +        }
 +
 +        rcs   = comm->cutoff;
 +        rcmbs = comm->cutoff_mbody;
 +        if (ddbox->tric_dir[dim])
 +        {
 +            rcs   /= ddbox->skew_fac[dim];
 +            rcmbs /= ddbox->skew_fac[dim];
 +        }
 +
 +        /* Set the lower limit for the shifted zone dimensions */
 +        for(z=zone_start; z<zone_end; z++)
 +        {
 +            if (zones->shift[z][dim] > 0)
 +            {
 +                dim = dd->dim[d];
 +                if (!dd->bGridJump || d == 0)
 +                {
 +                    zones->size[z].x0[dim] = comm->cell_x1[dim];
 +                    zones->size[z].x1[dim] = comm->cell_x1[dim] + rcs;
 +                }
 +                else
 +                {
 +                    /* Here we take the lower limit of the zone from
 +                     * the lowest domain of the zone below.
 +                     */
 +                    if (z < 4)
 +                    {
 +                        zones->size[z].x0[dim] =
 +                             comm->zone_d1[zones->shift[z][dd->dim[d-1]]].min1;
 +                    }
 +                    else
 +                    {
 +                        if (d == 1)
 +                        {
 +                            zones->size[z].x0[dim] =
 +                                zones->size[zone_perm[2][z-4]].x0[dim];
 +                        }
 +                        else
 +                        {
 +                            zones->size[z].x0[dim] =
 +                                comm->zone_d2[zones->shift[z][dd->dim[d-2]]][zones->shift[z][dd->dim[d-1]]].min1;
 +                        }
 +                    }
 +                    /* A temporary limit, is updated below */
 +                    zones->size[z].x1[dim] = zones->size[z].x0[dim];
 +
 +                    if (bDistMB)
 +                    {
 +                        for(zi=0; zi<zones->nizone; zi++)
 +                        {
 +                            if (zones->shift[zi][dim] == 0)
 +                            {
 +                                /* This takes the whole zone into account.
 +                                 * With multiple pulses this will lead
 +                                 * to a larger zone than strictly necessary.
 +                                 */
 +                                zones->size[z].x1[dim] = max(zones->size[z].x1[dim],
 +                                                             zones->size[zi].x1[dim]+rcmbs);
 +                            }
 +                        }
 +                    }
 +                }
 +            }
 +        }
 +
 +        /* Loop over the i-zones to set the upper limit of each
 +         * j-zone they see.
 +         */
 +        for(zi=0; zi<zones->nizone; zi++)
 +        {
 +            if (zones->shift[zi][dim] == 0)
 +            {
 +                for(z=zones->izone[zi].j0; z<zones->izone[zi].j1; z++)
 +                {
 +                    if (zones->shift[z][dim] > 0)
 +                    {
 +                        zones->size[z].x1[dim] = max(zones->size[z].x1[dim],
 +                                                     zones->size[zi].x1[dim]+rcs);
 +                    }
 +                }
 +            }
 +        }
 +    }
 +
 +    for(z=zone_start; z<zone_end; z++)
 +    {
 +        /* Initialization only required to keep the compiler happy */
 +        rvec corner_min={0,0,0},corner_max={0,0,0},corner;
 +        int  nc,c;
 +
 +        /* To determine the bounding box for a zone we need to find
 +         * the extremes over its 4, 2 or 1 corners.
 +         */
 +        nc = 1 << (ddbox->npbcdim - 1);
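 +        /* With npbcdim = 3 this enumerates 4 corners (all y/z combinations),
 +         * with npbcdim = 2 just 2, and with npbcdim = 1 a single corner;
 +         * the offset along x is added separately below.
 +         */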
 +
 +        for(c=0; c<nc; c++)
 +        {
 +            /* Set up a zone corner at x=0, ignoring triclinic couplings */
 +            corner[XX] = 0;
 +            if ((c & 1) == 0)
 +            {
 +                corner[YY] = zones->size[z].x0[YY];
 +            }
 +            else
 +            {
 +                corner[YY] = zones->size[z].x1[YY];
 +            }
 +            if ((c & 2) == 0)
 +            {
 +                corner[ZZ] = zones->size[z].x0[ZZ];
 +            }
 +            else
 +            {
 +                corner[ZZ] = zones->size[z].x1[ZZ];
 +            }
 +            if (dd->ndim == 1 && box[ZZ][YY] != 0)
 +            {
 +                /* With 1D domain decomposition the cg's are not in
 +                 * the triclinic box, but triclinic x-y and rectangular y-z.
 +                 * Shift y back, so it will later end up at 0.
 +                 */
 +                corner[YY] -= corner[ZZ]*box[ZZ][YY]/box[ZZ][ZZ];
 +            }
 +            /* Apply the triclinic couplings */
 +            for(i=YY; i<ddbox->npbcdim; i++)
 +            {
 +                for(j=XX; j<i; j++)
 +                {
 +                    corner[j] += corner[i]*box[i][j]/box[i][i];
 +                }
 +            }
 +            if (c == 0)
 +            {
 +                copy_rvec(corner,corner_min);
 +                copy_rvec(corner,corner_max);
 +            }
 +            else
 +            {
 +                for(i=0; i<DIM; i++)
 +                {
 +                    corner_min[i] = min(corner_min[i],corner[i]);
 +                    corner_max[i] = max(corner_max[i],corner[i]);
 +                }
 +            }
 +        }
 +        /* Copy the extreme corners without the offset along x */
 +        for(i=0; i<DIM; i++)
 +        {
 +            zones->size[z].bb_x0[i] = corner_min[i];
 +            zones->size[z].bb_x1[i] = corner_max[i];
 +        }
 +        /* Add the offset along x */
 +        zones->size[z].bb_x0[XX] += zones->size[z].x0[XX];
 +        zones->size[z].bb_x1[XX] += zones->size[z].x1[XX];
 +    }
 +
 +    if (zone_start == 0)
 +    {
 +        vol = 1;
 +        for(dim=0; dim<DIM; dim++)
 +        {
 +            vol *= zones->size[0].x1[dim] - zones->size[0].x0[dim];
 +        }
 +        zones->dens_zone0 = (zones->cg_range[1] - zones->cg_range[0])/vol;
 +    }
 +
 +    if (debug)
 +    {
 +        for(z=zone_start; z<zone_end; z++)
 +        {
 +            fprintf(debug,"zone %d    %6.3f - %6.3f  %6.3f - %6.3f  %6.3f - %6.3f\n",
 +                    z,
 +                    zones->size[z].x0[XX],zones->size[z].x1[XX],
 +                    zones->size[z].x0[YY],zones->size[z].x1[YY],
 +                    zones->size[z].x0[ZZ],zones->size[z].x1[ZZ]);
 +            fprintf(debug,"zone %d bb %6.3f - %6.3f  %6.3f - %6.3f  %6.3f - %6.3f\n",
 +                    z,
 +                    zones->size[z].bb_x0[XX],zones->size[z].bb_x1[XX],
 +                    zones->size[z].bb_x0[YY],zones->size[z].bb_x1[YY],
 +                    zones->size[z].bb_x0[ZZ],zones->size[z].bb_x1[ZZ]);
 +        }
 +    }
 +}
 +
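 +/* qsort comparator for charge groups: the primary key is the ns grid
 + * cell index, ties are broken on the global topology index, which
 + * makes the resulting order deterministic.
 + */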
 +static int comp_cgsort(const void *a,const void *b)
 +{
 +    int comp;
 +    
 +    gmx_cgsort_t *cga,*cgb;
 +    cga = (gmx_cgsort_t *)a;
 +    cgb = (gmx_cgsort_t *)b;
 +    
 +    comp = cga->nsc - cgb->nsc;
 +    if (comp == 0)
 +    {
 +        comp = cga->ind_gl - cgb->ind_gl;
 +    }
 +    
 +    return comp;
 +}
 +
 +static void order_int_cg(int n,const gmx_cgsort_t *sort,
 +                         int *a,int *buf)
 +{
 +    int i;
 +    
 +    /* Order the data */
 +    for(i=0; i<n; i++)
 +    {
 +        buf[i] = a[sort[i].ind];
 +    }
 +    
 +    /* Copy back to the original array */
 +    for(i=0; i<n; i++)
 +    {
 +        a[i] = buf[i];
 +    }
 +}
 +
 +static void order_vec_cg(int n,const gmx_cgsort_t *sort,
 +                         rvec *v,rvec *buf)
 +{
 +    int i;
 +    
 +    /* Order the data */
 +    for(i=0; i<n; i++)
 +    {
 +        copy_rvec(v[sort[i].ind],buf[i]);
 +    }
 +    
 +    /* Copy back to the original array */
 +    for(i=0; i<n; i++)
 +    {
 +        copy_rvec(buf[i],v[i]);
 +    }
 +}
 +
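 +/* Permute the per-atom vectors in v into the new charge group order
 + * given by sort, using buf as scratch space; cgindex maps charge
 + * groups to their atom ranges.
 + */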
 +static void order_vec_atom(int ncg,const int *cgindex,const gmx_cgsort_t *sort,
 +                           rvec *v,rvec *buf)
 +{
 +    int a,atot,cg,cg0,cg1,i;
 +    
 +    if (cgindex == NULL)
 +    {
 +        /* Avoid the useless loop of the atoms within a cg */
 +        order_vec_cg(ncg,sort,v,buf);
 +
 +        return;
 +    }
 +
 +    /* Order the data */
 +    a = 0;
 +    for(cg=0; cg<ncg; cg++)
 +    {
 +        cg0 = cgindex[sort[cg].ind];
 +        cg1 = cgindex[sort[cg].ind+1];
 +        for(i=cg0; i<cg1; i++)
 +        {
 +            copy_rvec(v[i],buf[a]);
 +            a++;
 +        }
 +    }
 +    atot = a;
 +    
 +    /* Copy back to the original array */
 +    for(a=0; a<atot; a++)
 +    {
 +        copy_rvec(buf[a],v[a]);
 +    }
 +}
 +
 +static void ordered_sort(int nsort2,gmx_cgsort_t *sort2,
 +                         int nsort_new,gmx_cgsort_t *sort_new,
 +                         gmx_cgsort_t *sort1)
 +{
 +    int i1,i2,i_new;
 +    
 +    /* The new indices are not very ordered, so we qsort them */
 +    qsort_threadsafe(sort_new,nsort_new,sizeof(sort_new[0]),comp_cgsort);
 +    
 +    /* sort2 is already ordered, so now we can merge the two arrays */
 +    i1 = 0;
 +    i2 = 0;
 +    i_new = 0;
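 +    /* Classic two-way merge: repeatedly take the smaller head element
 +     * of the two sorted inputs until both are exhausted.
 +     */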
 +    while(i2 < nsort2 || i_new < nsort_new)
 +    {
 +        if (i2 == nsort2)
 +        {
 +            sort1[i1++] = sort_new[i_new++];
 +        }
 +        else if (i_new == nsort_new)
 +        {
 +            sort1[i1++] = sort2[i2++];
 +        }
 +        else if (sort2[i2].nsc < sort_new[i_new].nsc ||
 +                 (sort2[i2].nsc == sort_new[i_new].nsc &&
 +                  sort2[i2].ind_gl < sort_new[i_new].ind_gl))
 +        {
 +            sort1[i1++] = sort2[i2++];
 +        }
 +        else
 +        {
 +            sort1[i1++] = sort_new[i_new++];
 +        }
 +    }
 +}
 +
 +static int dd_sort_order(gmx_domdec_t *dd,t_forcerec *fr,int ncg_home_old)
 +{
 +    gmx_domdec_sort_t *sort;
 +    gmx_cgsort_t *cgsort,*sort_i;
 +    int  ncg_new,nsort2,nsort_new,i,*a,moved,*ibuf;
 +    int  sort_last,sort_skip;
 +
 +    sort = dd->comm->sort;
 +
 +    a = fr->ns.grid->cell_index;
 +
 +    moved = NSGRID_SIGNAL_MOVED_FAC*fr->ns.grid->ncells;
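 +    /* Cell indices at or above 'moved' signal charge groups that have
 +     * left this node; those are filtered out of the sort lists below.
 +     */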
 +
 +    if (ncg_home_old >= 0)
 +    {
 +        /* The charge groups that remained in the same ns grid cell
 +         * are already completely ordered. So we can sort efficiently
 +         * by sorting only the charge groups that moved and merging them
 +         * into the stationary list.
 +         */
 +        ncg_new = 0;
 +        nsort2 = 0;
 +        nsort_new = 0;
 +        for(i=0; i<dd->ncg_home; i++)
 +        {
 +            /* Check if this cg did not move to another node */
 +            if (a[i] < moved)
 +            {
 +                if (i >= ncg_home_old || a[i] != sort->sort[i].nsc)
 +                {
 +                    /* This cg is new on this node or moved to another ns grid cell */
 +                    if (nsort_new >= sort->sort_new_nalloc)
 +                    {
 +                        sort->sort_new_nalloc = over_alloc_dd(nsort_new+1);
 +                        srenew(sort->sort_new,sort->sort_new_nalloc);
 +                    }
 +                    sort_i = &(sort->sort_new[nsort_new++]);
 +                }
 +                else
 +                {
 +                    /* This cg did not move */
 +                    sort_i = &(sort->sort2[nsort2++]);
 +                }
 +                /* Sort on the ns grid cell indices
 +                 * and the global topology index.
 +                 * index_gl is irrelevant with cell ns,
 +                 * but we set it here anyhow to avoid a conditional.
 +                 */
 +                sort_i->nsc    = a[i];
 +                sort_i->ind_gl = dd->index_gl[i];
 +                sort_i->ind    = i;
 +                ncg_new++;
 +            }
 +        }
 +        if (debug)
 +        {
 +            fprintf(debug,"ordered sort cgs: stationary %d moved %d\n",
 +                    nsort2,nsort_new);
 +        }
 +        /* Sort efficiently */
 +        ordered_sort(nsort2,sort->sort2,nsort_new,sort->sort_new,
 +                     sort->sort);
 +    }
 +    else
 +    {
 +        cgsort = sort->sort;
 +        ncg_new = 0;
 +        for(i=0; i<dd->ncg_home; i++)
 +        {
 +            /* Sort on the ns grid cell indices
 +             * and the global topology index
 +             */
 +            cgsort[i].nsc    = a[i];
 +            cgsort[i].ind_gl = dd->index_gl[i];
 +            cgsort[i].ind    = i;
 +            if (cgsort[i].nsc < moved)
 +            {
 +                ncg_new++;
 +            }
 +        }
 +        if (debug)
 +        {
 +            fprintf(debug,"qsort cgs: %d new home %d\n",dd->ncg_home,ncg_new);
 +        }
 +        /* Determine the order of the charge groups using qsort */
 +        qsort_threadsafe(cgsort,dd->ncg_home,sizeof(cgsort[0]),comp_cgsort);
 +    }
 +
 +    return ncg_new;
 +}
 +
 +static int dd_sort_order_nbnxn(gmx_domdec_t *dd,t_forcerec *fr)
 +{
 +    gmx_cgsort_t *sort;
 +    int  ncg_new,i,*a,na;
 +
 +    sort = dd->comm->sort->sort;
 +
 +    nbnxn_get_atomorder(fr->nbv->nbs,&a,&na);
 +
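 +    /* Only non-negative entries in the atom order refer to home
 +     * charge groups; negative entries are skipped.
 +     */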
 +    ncg_new = 0;
 +    for(i=0; i<na; i++)
 +    {
 +        if (a[i] >= 0)
 +        {
 +            sort[ncg_new].ind = a[i];
 +            ncg_new++;
 +        }
 +    }
 +
 +    return ncg_new;
 +}
 +
 +static void dd_sort_state(gmx_domdec_t *dd,int ePBC,
 +                          rvec *cgcm,t_forcerec *fr,t_state *state,
 +                          int ncg_home_old)
 +{
 +    gmx_domdec_sort_t *sort;
 +    gmx_cgsort_t *cgsort,*sort_i;
 +    int  *cgindex;
 +    int  ncg_new,i,*ibuf,cgsize;
 +    rvec *vbuf;
 +    
 +    sort = dd->comm->sort;
 +    
 +    if (dd->ncg_home > sort->sort_nalloc)
 +    {
 +        sort->sort_nalloc = over_alloc_dd(dd->ncg_home);
 +        srenew(sort->sort,sort->sort_nalloc);
 +        srenew(sort->sort2,sort->sort_nalloc);
 +    }
 +    cgsort = sort->sort;
 +
 +    switch (fr->cutoff_scheme)
 +    {
 +    case ecutsGROUP:
 +        ncg_new = dd_sort_order(dd,fr,ncg_home_old);
 +        break;
 +    case ecutsVERLET:
 +        ncg_new = dd_sort_order_nbnxn(dd,fr);
 +        break;
 +    default:
 +        gmx_incons("unimplemented");
 +        ncg_new = 0;
 +    }
 +
 +    /* We alloc with the old size, since cgindex is still old */
 +    vec_rvec_check_alloc(&dd->comm->vbuf,dd->cgindex[dd->ncg_home]);
 +    vbuf = dd->comm->vbuf.v;
 +    
 +    if (dd->comm->bCGs)
 +    {
 +        cgindex = dd->cgindex;
 +    }
 +    else
 +    {
 +        cgindex = NULL;
 +    }
 +
 +    /* Remove the charge groups which are no longer at home here */
 +    dd->ncg_home = ncg_new;
 +    if (debug)
 +    {
 +        fprintf(debug,"Set the new home charge group count to %d\n",
 +                dd->ncg_home);
 +    }
 +    
 +    /* Reorder the state */
 +    for(i=0; i<estNR; i++)
 +    {
 +        if (EST_DISTR(i) && (state->flags & (1<<i)))
 +        {
 +            switch (i)
 +            {
 +            case estX:
 +                order_vec_atom(dd->ncg_home,cgindex,cgsort,state->x,vbuf);
 +                break;
 +            case estV:
 +                order_vec_atom(dd->ncg_home,cgindex,cgsort,state->v,vbuf);
 +                break;
 +            case estSDX:
 +                order_vec_atom(dd->ncg_home,cgindex,cgsort,state->sd_X,vbuf);
 +                break;
 +            case estCGP:
 +                order_vec_atom(dd->ncg_home,cgindex,cgsort,state->cg_p,vbuf);
 +                break;
 +            case estLD_RNG:
 +            case estLD_RNGI:
 +            case estDISRE_INITF:
 +            case estDISRE_RM3TAV:
 +            case estORIRE_INITF:
 +            case estORIRE_DTAV:
 +                /* No ordering required */
 +                break;
 +            default:
 +                gmx_incons("Unknown state entry encountered in dd_sort_state");
 +                break;
 +            }
 +        }
 +    }
 +    if (fr->cutoff_scheme == ecutsGROUP)
 +    {
 +        /* Reorder cgcm */
 +        order_vec_cg(dd->ncg_home,cgsort,cgcm,vbuf);
 +    }
 +    
 +    if (dd->ncg_home+1 > sort->ibuf_nalloc)
 +    {
 +        sort->ibuf_nalloc = over_alloc_dd(dd->ncg_home+1);
 +        srenew(sort->ibuf,sort->ibuf_nalloc);
 +    }
 +    ibuf = sort->ibuf;
 +    /* Reorder the global cg index */
 +    order_int_cg(dd->ncg_home,cgsort,dd->index_gl,ibuf);
 +    /* Reorder the cginfo */
 +    order_int_cg(dd->ncg_home,cgsort,fr->cginfo,ibuf);
 +    /* Rebuild the local cg index */
 +    if (dd->comm->bCGs)
 +    {
 +        ibuf[0] = 0;
 +        for(i=0; i<dd->ncg_home; i++)
 +        {
 +            cgsize = dd->cgindex[cgsort[i].ind+1] - dd->cgindex[cgsort[i].ind];
 +            ibuf[i+1] = ibuf[i] + cgsize;
 +        }
 +        for(i=0; i<dd->ncg_home+1; i++)
 +        {
 +            dd->cgindex[i] = ibuf[i];
 +        }
 +    }
 +    else
 +    {
 +        for(i=0; i<dd->ncg_home+1; i++)
 +        {
 +            dd->cgindex[i] = i;
 +        }
 +    }
 +    /* Set the home atom number */
 +    dd->nat_home = dd->cgindex[dd->ncg_home];
 +
 +    if (fr->cutoff_scheme == ecutsVERLET)
 +    {
 +        /* The atoms are now exactly in grid order, update the grid order */
 +        nbnxn_set_atomorder(fr->nbv->nbs);
 +    }
 +    else
 +    {
 +        /* Copy the sorted ns cell indices back to the ns grid struct */
 +        for(i=0; i<dd->ncg_home; i++)
 +        {
 +            fr->ns.grid->cell_index[i] = cgsort[i].nsc;
 +        }
 +        fr->ns.grid->nr = dd->ncg_home;
 +    }
 +}
 +
 +static void add_dd_statistics(gmx_domdec_t *dd)
 +{
 +    gmx_domdec_comm_t *comm;
 +    int ddnat;
 +    
 +    comm = dd->comm;
 +    
 +    for(ddnat=ddnatZONE; ddnat<ddnatNR; ddnat++)
 +    {
 +        comm->sum_nat[ddnat-ddnatZONE] +=
 +            comm->nat[ddnat] - comm->nat[ddnat-1];
 +    }
 +    comm->ndecomp++;
 +}
 +
 +void reset_dd_statistics_counters(gmx_domdec_t *dd)
 +{
 +    gmx_domdec_comm_t *comm;
 +    int ddnat;
 +    
 +    comm = dd->comm;
 +
 +    /* Reset all the statistics and counters for total run counting */
 +    for(ddnat=ddnatZONE; ddnat<ddnatNR; ddnat++)
 +    {
 +        comm->sum_nat[ddnat-ddnatZONE] = 0;
 +    }
 +    comm->ndecomp = 0;
 +    comm->nload = 0;
 +    comm->load_step = 0;
 +    comm->load_sum = 0;
 +    comm->load_max = 0;
 +    clear_ivec(comm->load_lim);
 +    comm->load_mdf = 0;
 +    comm->load_pme = 0;
 +}
 +
 +void print_dd_statistics(t_commrec *cr,t_inputrec *ir,FILE *fplog)
 +{
 +    gmx_domdec_comm_t *comm;
 +    int ddnat;
 +    double av;
 +   
 +    comm = cr->dd->comm;
 +    
 +    gmx_sumd(ddnatNR-ddnatZONE,comm->sum_nat,cr);
 +    
 +    if (fplog == NULL)
 +    {
 +        return;
 +    }
 +    
 +    fprintf(fplog,"\n    D O M A I N   D E C O M P O S I T I O N   S T A T I S T I C S\n\n");
 +            
 +    for(ddnat=ddnatZONE; ddnat<ddnatNR; ddnat++)
 +    {
 +        av = comm->sum_nat[ddnat-ddnatZONE]/comm->ndecomp;
 +        switch(ddnat)
 +        {
 +        case ddnatZONE:
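 +            /* Zone atoms are communicated twice per step:
 +             * coordinates forward and forces back, hence the factor 2.
 +             */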
 +            fprintf(fplog,
 +                    " av. #atoms communicated per step for force:  %d x %.1f\n",
 +                    2,av);
 +            break;
 +        case ddnatVSITE:
 +            if (cr->dd->vsite_comm)
 +            {
 +                fprintf(fplog,
 +                        " av. #atoms communicated per step for vsites: %d x %.1f\n",
 +                        (EEL_PME(ir->coulombtype) || ir->coulombtype==eelEWALD) ? 3 : 2,
 +                        av);
 +            }
 +            break;
 +        case ddnatCON:
 +            if (cr->dd->constraint_comm)
 +            {
 +                fprintf(fplog,
 +                        " av. #atoms communicated per step for LINCS:  %d x %.1f\n",
 +                        1 + ir->nLincsIter,av);
 +            }
 +            break;
 +        default:
 +            gmx_incons(" Unknown type for DD statistics");
 +        }
 +    }
 +    fprintf(fplog,"\n");
 +    
 +    if (comm->bRecordLoad && EI_DYNAMICS(ir->eI))
 +    {
 +        print_dd_load_av(fplog,cr->dd);
 +    }
 +}
 +
 +void dd_partition_system(FILE            *fplog,
 +                         gmx_large_int_t      step,
 +                         t_commrec       *cr,
 +                         gmx_bool            bMasterState,
 +                         int             nstglobalcomm,
 +                         t_state         *state_global,
 +                         gmx_mtop_t      *top_global,
 +                         t_inputrec      *ir,
 +                         t_state         *state_local,
 +                         rvec            **f,
 +                         t_mdatoms       *mdatoms,
 +                         gmx_localtop_t  *top_local,
 +                         t_forcerec      *fr,
 +                         gmx_vsite_t     *vsite,
 +                         gmx_shellfc_t   shellfc,
 +                         gmx_constr_t    constr,
 +                         t_nrnb          *nrnb,
 +                         gmx_wallcycle_t wcycle,
 +                         gmx_bool            bVerbose)
 +{
 +    gmx_domdec_t *dd;
 +    gmx_domdec_comm_t *comm;
 +    gmx_ddbox_t ddbox={0};
 +    t_block *cgs_gl;
 +    gmx_large_int_t step_pcoupl;
 +    rvec cell_ns_x0,cell_ns_x1;
 +    int  i,j,n,cg0=0,ncg_home_old=-1,ncg_moved,nat_f_novirsum;
 +    gmx_bool bBoxChanged,bNStGlobalComm,bDoDLB,bCheckDLB,bTurnOnDLB,bLogLoad;
 +    gmx_bool bRedist,bSortCG,bResortAll;
 +    ivec ncells_old={0,0,0},ncells_new={0,0,0},np;
 +    real grid_density;
 +    char sbuf[22];
 +      
 +    dd = cr->dd;
 +    comm = dd->comm;
 +
 +    bBoxChanged = (bMasterState || DEFORM(*ir));
 +    if (ir->epc != epcNO)
 +    {
 +        /* With nstpcouple > 1, pressure coupling happens
 +         * one step after calculating the pressure.
 +         * Box scaling happens at the end of the MD step,
 +         * after the DD partitioning.
 +         * We therefore have to do DLB in the first partitioning
 +         * after an MD step where P-coupling occurred.
 +         * We need to determine the last step in which p-coupling occurred.
 +         * MRS -- need to validate this for vv?
 +         */
 +        n = ir->nstpcouple;
 +        if (n == 1)
 +        {
 +            step_pcoupl = step - 1;
 +        }
 +        else
 +        {
 +            step_pcoupl = ((step - 1)/n)*n + 1;
 +        }
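 +        /* Example: with nstpcouple = 5 and step = 13, the last step
 +         * at which P-coupling could have occurred is computed as
 +         * ((13-1)/5)*5 + 1 = 11 (integer division).
 +         */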
 +        if (step_pcoupl >= comm->partition_step)
 +        {
 +            bBoxChanged = TRUE;
 +        }
 +    }
 +
 +    bNStGlobalComm = (step % nstglobalcomm == 0);
 +
 +    if (!comm->bDynLoadBal)
 +    {
 +        bDoDLB = FALSE;
 +    }
 +    else
 +    {
 +        /* Should we do dynamic load balancing this step?
 +         * Since it requires (possibly expensive) global communication,
 +         * we might want to do DLB less frequently.
 +         */
 +        if (bBoxChanged || ir->epc != epcNO)
 +        {
 +            bDoDLB = bBoxChanged;
 +        }
 +        else
 +        {
 +            bDoDLB = bNStGlobalComm;
 +        }
 +    }
 +
 +    /* Check if we have recorded loads on the nodes */
 +    if (comm->bRecordLoad && dd_load_count(comm))
 +    {
 +        if (comm->eDLB == edlbAUTO && !comm->bDynLoadBal)
 +        {
 +            /* Check if we should use DLB at the second partitioning
 +             * and every 100 partitionings,
 +             * so the extra communication cost is negligible.
 +             */
 +            n = max(100,nstglobalcomm);
 +            bCheckDLB = (comm->n_load_collect == 0 ||
 +                         comm->n_load_have % n == n-1);
 +        }
 +        else
 +        {
 +            bCheckDLB = FALSE;
 +        }
 +        
 +        /* Print the load to the log file every nstlog steps,
 +         * and at the first and the last step.
 +         */
 +        bLogLoad = ((ir->nstlog > 0 && step % ir->nstlog == 0) ||
 +                    comm->n_load_collect == 0 ||
 +                    (ir->nsteps >= 0 &&
 +                     (step + ir->nstlist > ir->init_step + ir->nsteps)));
 +
 +        /* Avoid extra communication due to verbose screen output
 +         * when nstglobalcomm is set.
 +         */
 +        if (bDoDLB || bLogLoad || bCheckDLB ||
 +            (bVerbose && (ir->nstlist == 0 || nstglobalcomm <= ir->nstlist)))
 +        {
 +            get_load_distribution(dd,wcycle);
 +            if (DDMASTER(dd))
 +            {
 +                if (bLogLoad)
 +                {
 +                    dd_print_load(fplog,dd,step-1);
 +                }
 +                if (bVerbose)
 +                {
 +                    dd_print_load_verbose(dd);
 +                }
 +            }
 +            comm->n_load_collect++;
 +
 +            if (bCheckDLB)
 +            {
 +                /* Since the timings are node dependent, the master decides */
 +                if (DDMASTER(dd))
 +                {
 +                    bTurnOnDLB =
 +                        (dd_force_imb_perf_loss(dd) >= DD_PERF_LOSS);
 +                    if (debug)
 +                    {
 +                        fprintf(debug,"step %s, imb loss %f\n",
 +                                gmx_step_str(step,sbuf),
 +                                dd_force_imb_perf_loss(dd));
 +                    }
 +                }
 +                dd_bcast(dd,sizeof(bTurnOnDLB),&bTurnOnDLB);
 +                if (bTurnOnDLB)
 +                {
 +                    turn_on_dlb(fplog,cr,step);
 +                    bDoDLB = TRUE;
 +                }
 +            }
 +        }
 +        comm->n_load_have++;
 +    }
 +
 +    cgs_gl = &comm->cgs_gl;
 +
 +    bRedist = FALSE;
 +    if (bMasterState)
 +    {
 +        /* Clear the old state */
 +        clear_dd_indices(dd,0,0);
 +
 +        set_ddbox(dd,bMasterState,cr,ir,state_global->box,
 +                  TRUE,cgs_gl,state_global->x,&ddbox);
 +    
 +        get_cg_distribution(fplog,step,dd,cgs_gl,
 +                            state_global->box,&ddbox,state_global->x);
 +        
 +        dd_distribute_state(dd,cgs_gl,
 +                            state_global,state_local,f);
 +        
 +        dd_make_local_cgs(dd,&top_local->cgs);
 +        
 +        /* Ensure that we have space for the new distribution */
 +        dd_check_alloc_ncg(fr,state_local,f,dd->ncg_home);
 +
 +        if (fr->cutoff_scheme == ecutsGROUP)
 +        {
 +            calc_cgcm(fplog,0,dd->ncg_home,
 +                      &top_local->cgs,state_local->x,fr->cg_cm);
 +        }
 +        
 +        inc_nrnb(nrnb,eNR_CGCM,dd->nat_home);
 +        
 +        dd_set_cginfo(dd->index_gl,0,dd->ncg_home,fr,comm->bLocalCG);
 +
 +        cg0 = 0;
 +    }
 +    else if (state_local->ddp_count != dd->ddp_count)
 +    {
 +        if (state_local->ddp_count > dd->ddp_count)
 +        {
 +            gmx_fatal(FARGS,"Internal inconsistency state_local->ddp_count (%d) > dd->ddp_count (%d)",state_local->ddp_count,dd->ddp_count);
 +        }
 +        
 +        if (state_local->ddp_count_cg_gl != state_local->ddp_count)
 +        {
 +            gmx_fatal(FARGS,"Internal inconsistency state_local->ddp_count_cg_gl (%d) != state_local->ddp_count (%d)",state_local->ddp_count_cg_gl,state_local->ddp_count);
 +        }
 +        
 +        /* Clear the old state */
 +        clear_dd_indices(dd,0,0);
 +        
 +        /* Build the new indices */
 +        rebuild_cgindex(dd,cgs_gl->index,state_local);
 +        make_dd_indices(dd,cgs_gl->index,0);
 +
 +        if (fr->cutoff_scheme == ecutsGROUP)
 +        {
 +            /* Redetermine the cg COMs */
 +            calc_cgcm(fplog,0,dd->ncg_home,
 +                      &top_local->cgs,state_local->x,fr->cg_cm);
 +        }
 +        
 +        inc_nrnb(nrnb,eNR_CGCM,dd->nat_home);
 +
 +        dd_set_cginfo(dd->index_gl,0,dd->ncg_home,fr,comm->bLocalCG);
 +
 +        set_ddbox(dd,bMasterState,cr,ir,state_local->box,
 +                  TRUE,&top_local->cgs,state_local->x,&ddbox);
 +
 +        bRedist = comm->bDynLoadBal;
 +    }
 +    else
 +    {
 +        /* We have the full state, only redistribute the cgs */
 +
 +        /* Clear the non-home indices */
 +        clear_dd_indices(dd,dd->ncg_home,dd->nat_home);
 +
 +        /* Avoid global communication for dim's without pbc and -gcom */
 +        if (!bNStGlobalComm)
 +        {
 +            copy_rvec(comm->box0    ,ddbox.box0    );
 +            copy_rvec(comm->box_size,ddbox.box_size);
 +        }
 +        set_ddbox(dd,bMasterState,cr,ir,state_local->box,
 +                  bNStGlobalComm,&top_local->cgs,state_local->x,&ddbox);
 +
 +        bBoxChanged = TRUE;
 +        bRedist = TRUE;
 +    }
 +    /* For dim's without pbc and -gcom */
 +    copy_rvec(ddbox.box0    ,comm->box0    );
 +    copy_rvec(ddbox.box_size,comm->box_size);
 +    
 +    set_dd_cell_sizes(dd,&ddbox,dynamic_dd_box(&ddbox,ir),bMasterState,bDoDLB,
 +                      step,wcycle);
 +    
 +    if (comm->nstDDDumpGrid > 0 && step % comm->nstDDDumpGrid == 0)
 +    {
 +        write_dd_grid_pdb("dd_grid",step,dd,state_local->box,&ddbox);
 +    }
 +    
 +    /* Check if we should sort the charge groups */
 +    if (comm->nstSortCG > 0)
 +    {
 +        bSortCG = (bMasterState ||
 +                   (bRedist && (step % comm->nstSortCG == 0)));
 +    }
 +    else
 +    {
 +        bSortCG = FALSE;
 +    }
 +
 +    ncg_home_old = dd->ncg_home;
 +
 +    ncg_moved = 0;
 +    if (bRedist)
 +    {
 +        wallcycle_sub_start(wcycle,ewcsDD_REDIST);
 +
 +        dd_redistribute_cg(fplog,step,dd,ddbox.tric_dir,
 +                           state_local,f,fr,mdatoms,
 +                           !bSortCG,nrnb,&cg0,&ncg_moved);
 +
 +        wallcycle_sub_stop(wcycle,ewcsDD_REDIST);
 +    }
 +    
 +    get_nsgrid_boundaries(ddbox.nboundeddim,state_local->box,
 +                          dd,&ddbox,
 +                          &comm->cell_x0,&comm->cell_x1,
 +                          dd->ncg_home,fr->cg_cm,
 +                          cell_ns_x0,cell_ns_x1,&grid_density);
 +
 +    if (bBoxChanged)
 +    {
 +        comm_dd_ns_cell_sizes(dd,&ddbox,cell_ns_x0,cell_ns_x1,step);
 +    }
 +
 +    switch (fr->cutoff_scheme)
 +    {
 +    case ecutsGROUP:
 +        copy_ivec(fr->ns.grid->n,ncells_old);
 +        grid_first(fplog,fr->ns.grid,dd,&ddbox,fr->ePBC,
 +                   state_local->box,cell_ns_x0,cell_ns_x1,
 +                   fr->rlistlong,grid_density);
 +        break;
 +    case ecutsVERLET:
 +        nbnxn_get_ncells(fr->nbv->nbs,&ncells_old[XX],&ncells_old[YY]);
 +        break;
 +    default:
 +        gmx_incons("unimplemented");
 +    }
 +    /* We need to store tric_dir for dd_get_ns_ranges called from ns.c */
 +    copy_ivec(ddbox.tric_dir,comm->tric_dir);
 +
 +    if (bSortCG)
 +    {
 +        wallcycle_sub_start(wcycle,ewcsDD_GRID);
 +
 +        /* Sort the state on charge group position.
 +         * This enables exact restarts from this step.
 +         * It also improves performance by about 15% with larger numbers
 +         * of atoms per node.
 +         */
 +        
 +        /* Fill the ns grid with the home cell,
 +         * so we can sort with the indices.
 +         */
 +        set_zones_ncg_home(dd);
 +
 +        switch (fr->cutoff_scheme)
 +        {
 +        case ecutsVERLET:
 +            set_zones_size(dd,state_local->box,&ddbox,0,1);
 +
 +            nbnxn_put_on_grid(fr->nbv->nbs,fr->ePBC,state_local->box,
 +                              0,
 +                              comm->zones.size[0].bb_x0,
 +                              comm->zones.size[0].bb_x1,
 +                              0,dd->ncg_home,
 +                              comm->zones.dens_zone0,
 +                              fr->cginfo,
 +                              state_local->x,
 +                              ncg_moved,bRedist ? comm->moved : NULL,
 +                              fr->nbv->grp[eintLocal].kernel_type,
 +                              fr->nbv->grp[eintLocal].nbat);
 +
 +            nbnxn_get_ncells(fr->nbv->nbs,&ncells_new[XX],&ncells_new[YY]);
 +            break;
 +        case ecutsGROUP:
 +            fill_grid(fplog,&comm->zones,fr->ns.grid,dd->ncg_home,
 +                      0,dd->ncg_home,fr->cg_cm);
 +            
 +            copy_ivec(fr->ns.grid->n,ncells_new);
 +            break;
 +        default:
 +            gmx_incons("unimplemented");
 +        }
 +
 +        bResortAll = bMasterState;
 +   
 +        /* Check if we can use the old order and ns grid cell indices
 +         * of the charge groups to sort the charge groups efficiently.
 +         */
 +        if (ncells_new[XX] != ncells_old[XX] ||
 +            ncells_new[YY] != ncells_old[YY] ||
 +            ncells_new[ZZ] != ncells_old[ZZ])
 +        {
 +            bResortAll = TRUE;
 +        }
 +
 +        if (debug)
 +        {
 +            fprintf(debug,"Step %s, sorting the %d home charge groups\n",
 +                    gmx_step_str(step,sbuf),dd->ncg_home);
 +        }
 +        dd_sort_state(dd,ir->ePBC,fr->cg_cm,fr,state_local,
 +                      bResortAll ? -1 : ncg_home_old);
 +        /* Rebuild all the indices */
 +        cg0 = 0;
 +        ga2la_clear(dd->ga2la);
 +
 +        wallcycle_sub_stop(wcycle,ewcsDD_GRID);
 +    }
 +
 +    wallcycle_sub_start(wcycle,ewcsDD_SETUPCOMM);
 +    
 +    /* Set up the communication and communicate the coordinates */
 +    setup_dd_communication(dd,state_local->box,&ddbox,fr,state_local,f);
 +    
 +    /* Set the indices */
 +    make_dd_indices(dd,cgs_gl->index,cg0);
 +
 +    /* Set the charge group boundaries for neighbor searching */
 +    set_cg_boundaries(&comm->zones);
 +
 +    if (fr->cutoff_scheme == ecutsVERLET)
 +    {
 +        set_zones_size(dd,state_local->box,&ddbox,
 +                       bSortCG ? 1 : 0,comm->zones.n);
 +    }
 +
 +    wallcycle_sub_stop(wcycle,ewcsDD_SETUPCOMM);
 +
 +    /*
 +    write_dd_pdb("dd_home",step,"dump",top_global,cr,
 +                 -1,state_local->x,state_local->box);
 +    */
 +
 +    wallcycle_sub_start(wcycle,ewcsDD_MAKETOP);
 +    
 +    /* Extract a local topology from the global topology */
 +    for(i=0; i<dd->ndim; i++)
 +    {
 +        np[dd->dim[i]] = comm->cd[i].np;
 +    }
 +    dd_make_local_top(fplog,dd,&comm->zones,dd->npbcdim,state_local->box,
 +                      comm->cellsize_min,np,
 +                      fr,
 +                      fr->cutoff_scheme==ecutsGROUP ? fr->cg_cm : state_local->x,
 +                      vsite,top_global,top_local);
 +
 +    wallcycle_sub_stop(wcycle,ewcsDD_MAKETOP);
 +
 +    wallcycle_sub_start(wcycle,ewcsDD_MAKECONSTR);
 +    
 +    /* Set up the special atom communication */
 +    n = comm->nat[ddnatZONE];
 +    for(i=ddnatZONE+1; i<ddnatNR; i++)
 +    {
 +        switch(i)
 +        {
 +        case ddnatVSITE:
 +            if (vsite && vsite->n_intercg_vsite)
 +            {
 +                n = dd_make_local_vsites(dd,n,top_local->idef.il);
 +            }
 +            break;
 +        case ddnatCON:
 +            if (dd->bInterCGcons || dd->bInterCGsettles)
 +            {
 +                /* Only for inter-cg constraints we need special code */
 +                n = dd_make_local_constraints(dd,n,top_global,fr->cginfo,
 +                                              constr,ir->nProjOrder,
 +                                              top_local->idef.il);
 +            }
 +            break;
 +        default:
 +            gmx_incons("Unknown special atom type setup");
 +        }
 +        comm->nat[i] = n;
 +    }
 +
 +    wallcycle_sub_stop(wcycle,ewcsDD_MAKECONSTR);
 +
 +    wallcycle_sub_start(wcycle,ewcsDD_TOPOTHER);
 +
 +    /* Make space for the extra coordinates for virtual site
 +     * or constraint communication.
 +     */
 +    state_local->natoms = comm->nat[ddnatNR-1];
 +    if (state_local->natoms > state_local->nalloc)
 +    {
 +        dd_realloc_state(state_local,f,state_local->natoms);
 +    }
 +
 +    if (fr->bF_NoVirSum)
 +    {
 +        if (vsite && vsite->n_intercg_vsite)
 +        {
 +            nat_f_novirsum = comm->nat[ddnatVSITE];
 +        }
 +        else
 +        {
 +            if (EEL_FULL(ir->coulombtype) && dd->n_intercg_excl > 0)
 +            {
 +                nat_f_novirsum = dd->nat_tot;
 +            }
 +            else
 +            {
 +                nat_f_novirsum = dd->nat_home;
 +            }
 +        }
 +    }
 +    else
 +    {
 +        nat_f_novirsum = 0;
 +    }
 +
 +    /* Set the number of atoms required for the force calculation.
 +     * Forces need to be constrained when using a twin-range setup
 +     * or with energy minimization. For simple simulations we could
 +     * avoid some allocation, zeroing and copying, but this is
 +     * probably not worth the complications and checking.
 +     */
 +    forcerec_set_ranges(fr,dd->ncg_home,dd->ncg_tot,
 +                        dd->nat_tot,comm->nat[ddnatCON],nat_f_novirsum);
 +
 +    /* We make all the mdatoms up to nat_tot_con.
 +     * We could save some work by only setting invmass
 +     * between nat_tot and nat_tot_con.
 +     */
 +    /* This call also sets the new number of home particles to dd->nat_home */
 +    atoms2md(top_global,ir,
 +             comm->nat[ddnatCON],dd->gatindex,0,dd->nat_home,mdatoms);
 +
 +    /* Now we have the charges we can sort the FE interactions */
 +    dd_sort_local_top(dd,mdatoms,top_local);
 +
 +    if (vsite != NULL)
 +    {
 +        /* Now we have updated mdatoms, we can do the last vsite bookkeeping */
 +        split_vsites_over_threads(top_local->idef.il,mdatoms,FALSE,vsite);
 +    }
 +
 +    if (shellfc)
 +    {
 +        /* Make the local shell stuff, currently no communication is done */
 +        make_local_shells(cr,mdatoms,shellfc);
 +    }
 +    
 +    if (ir->implicit_solvent)
 +    {
 +        make_local_gb(cr,fr->born,ir->gb_algorithm);
 +    }
 +
 +    init_bonded_thread_force_reduction(fr,&top_local->idef);
 +
 +    if (!(cr->duty & DUTY_PME))
 +    {
 +        /* Send the charges to our PME only node */
 +        gmx_pme_send_q(cr,mdatoms->nChargePerturbed,
 +                       mdatoms->chargeA,mdatoms->chargeB,
 +                       dd_pme_maxshift_x(dd),dd_pme_maxshift_y(dd));
 +    }
 +    
 +    if (constr)
 +    {
 +        set_constraints(constr,top_local,ir,mdatoms,cr);
 +    }
 +    
 +    if (ir->ePull != epullNO)
 +    {
 +        /* Update the local pull groups */
 +        dd_make_local_pull_groups(dd,ir->pull,mdatoms);
 +    }
 +    
 +    if (ir->bRot)
 +    {
 +        /* Update the local rotation groups */
 +        dd_make_local_rotation_groups(dd,ir->rot);
 +    }
 +
 +
 +    add_dd_statistics(dd);
 +    
 +    /* Make sure we only count the cycles for this DD partitioning */
 +    clear_dd_cycle_counts(dd);
 +    
 +    /* Because the order of the atoms might have changed since
 +     * the last vsite construction, we need to communicate the constructing
 +     * atom coordinates again (for spreading the forces this MD step).
 +     */
 +    dd_move_x_vsites(dd,state_local->box,state_local->x);
 +
 +    wallcycle_sub_stop(wcycle,ewcsDD_TOPOTHER);
 +    
 +    if (comm->nstDDDump > 0 && step % comm->nstDDDump == 0)
 +    {
 +        dd_move_x(dd,state_local->box,state_local->x);
 +        write_dd_pdb("dd_dump",step,"dump",top_global,cr,
 +                     -1,state_local->x,state_local->box);
 +    }
 +
 +    /* Store the partitioning step */
 +    comm->partition_step = step;
 +    
 +    /* Increase the DD partitioning counter */
 +    dd->ddp_count++;
 +    /* The state currently matches this DD partitioning count, store it */
 +    state_local->ddp_count = dd->ddp_count;
 +    if (bMasterState)
 +    {
 +        /* The DD master node knows the complete cg distribution,
 +         * store the count so we can possibly skip the cg info communication.
 +         */
 +        comm->master_cg_ddp_count = (bSortCG ? 0 : dd->ddp_count);
 +    }
 +
 +    if (comm->DD_debug > 0)
 +    {
 +        /* Set the env var GMX_DD_DEBUG if you suspect corrupted indices */
 +        check_index_consistency(dd,top_global->natoms,ncg_mtop(top_global),
 +                                "after partitioning");
 +    }
 +}
index 8ed1a583a0c5ebc7c6febd83816b4d57c36691eb,0000000000000000000000000000000000000000..9eb75af90711e5a2704f98e89b042b125be7d18e
mode 100644,000000..100644
--- /dev/null
@@@ -1,2773 -1,0 +1,2784 @@@
-                               gmx_bool *bUseGPU,
 +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
 + *
 + * 
 + *                This source code is part of
 + * 
 + *                 G   R   O   M   A   C   S
 + * 
 + *          GROningen MAchine for Chemical Simulations
 + * 
 + *                        VERSION 3.2.0
 + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
 + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 + * Copyright (c) 2001-2004, The GROMACS development team,
 + * check out http://www.gromacs.org for more information.
 + *
 + * This program is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU General Public License
 + * as published by the Free Software Foundation; either version 2
 + * of the License, or (at your option) any later version.
 + * 
 + * If you want to redistribute modifications, please consider that
 + * scientific software is very special. Version control is crucial -
 + * bugs must be traceable. We will be happy to consider code for
 + * inclusion in the official distribution, but derived work must not
 + * be called official GROMACS. Details are found in the README & COPYING
 + * files - if they are missing, get the official version at www.gromacs.org.
 + * 
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the papers on the package - you can find them in the top README file.
 + *
 + * For more info, check our website at http://www.gromacs.org
 + * 
 + * And Hey:
 + * GROwing Monsters And Cloning Shrimps
 + */
 +#ifdef HAVE_CONFIG_H
 +#include <config.h>
 +#endif
 +
 +#include <math.h>
 +#include <string.h>
 +#include <assert.h>
 +#include "sysstuff.h"
 +#include "typedefs.h"
 +#include "vec.h"
 +#include "maths.h"
 +#include "macros.h"
 +#include "smalloc.h"
 +#include "macros.h"
 +#include "gmx_fatal.h"
 +#include "gmx_fatal_collective.h"
 +#include "physics.h"
 +#include "force.h"
 +#include "tables.h"
 +#include "nonbonded.h"
 +#include "invblock.h"
 +#include "names.h"
 +#include "network.h"
 +#include "pbc.h"
 +#include "ns.h"
 +#include "mshift.h"
 +#include "txtdump.h"
 +#include "coulomb.h"
 +#include "md_support.h"
 +#include "md_logging.h"
 +#include "domdec.h"
 +#include "partdec.h"
 +#include "qmmm.h"
 +#include "copyrite.h"
 +#include "mtop_util.h"
 +#include "nbnxn_search.h"
 +#include "nbnxn_atomdata.h"
 +#include "nbnxn_consts.h"
 +#include "statutil.h"
 +#include "gmx_omp_nthreads.h"
 +
 +#ifdef _MSC_VER
 +/* MSVC definition for __cpuid() */
 +#include <intrin.h>
 +#endif
 +
 +#include "types/nbnxn_cuda_types_ext.h"
 +#include "gpu_utils.h"
 +#include "nbnxn_cuda_data_mgmt.h"
 +#include "pmalloc_cuda.h"
 +
 +t_forcerec *mk_forcerec(void)
 +{
 +  t_forcerec *fr;
 +  
 +  snew(fr,1);
 +  
 +  return fr;
 +}
 +
 +#ifdef DEBUG
 +static void pr_nbfp(FILE *fp,real *nbfp,gmx_bool bBHAM,int atnr)
 +{
 +  int i,j;
 +  
 +  for(i=0; (i<atnr); i++) {
 +    for(j=0; (j<atnr); j++) {
 +      fprintf(fp,"%2d - %2d",i,j);
 +      if (bBHAM)
 +      fprintf(fp,"  a=%10g, b=%10g, c=%10g\n",BHAMA(nbfp,atnr,i,j),
 +              BHAMB(nbfp,atnr,i,j),BHAMC(nbfp,atnr,i,j)/6.0);
 +      else
 +      fprintf(fp,"  c6=%10g, c12=%10g\n",C6(nbfp,atnr,i,j)/6.0,
 +            C12(nbfp,atnr,i,j)/12.0);
 +    }
 +  }
 +}
 +#endif
 +
 +static real *mk_nbfp(const gmx_ffparams_t *idef,gmx_bool bBHAM)
 +{
 +  real *nbfp;
 +  int  i,j,k,atnr;
 +  
 +  atnr=idef->atnr;
 +  if (bBHAM) {
 +    snew(nbfp,3*atnr*atnr);
 +    for(i=k=0; (i<atnr); i++) {
 +      for(j=0; (j<atnr); j++,k++) {
 +          BHAMA(nbfp,atnr,i,j) = idef->iparams[k].bham.a;
 +          BHAMB(nbfp,atnr,i,j) = idef->iparams[k].bham.b;
 +          /* nbfp now includes the 6.0 derivative prefactor */
 +          BHAMC(nbfp,atnr,i,j) = idef->iparams[k].bham.c*6.0;
 +      }
 +    }
 +  }
 +  else {
 +    snew(nbfp,2*atnr*atnr);
 +    for(i=k=0; (i<atnr); i++) {
 +      for(j=0; (j<atnr); j++,k++) {
 +          /* nbfp now includes the 6.0/12.0 derivative prefactors */
 +          C6(nbfp,atnr,i,j)   = idef->iparams[k].lj.c6*6.0;
 +          C12(nbfp,atnr,i,j)  = idef->iparams[k].lj.c12*12.0;
 +      }
 +    }
 +  }
 +
 +  return nbfp;
 +}
 +
 +/* This routine sets fr->solvent_opt to the most common solvent in the 
 + * system, e.g. esolSPC or esolTIP4P. It will also mark each charge group in 
 + * the fr->solvent_type array with the correct type (or esolNO).
 + *
 + * Charge groups that fulfill the conditions but are not identical to the
 + * most common one will be marked as esolNO in the solvent_type array. 
 + *
 + * TIP3P is identical to SPC for these purposes, so we call it
 + * SPC in the arrays (Apologies to Bill Jorgensen ;-)
 + *
 + * NOTE: A QM particle should not become an optimized solvent,
 + * not even if there is only one charge group in the QM region.
 + */
 +
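 +/* Parameters identifying one candidate solvent model: the model type
 + * (e.g. esolSPC or esolTIP4P), how many molecules match it, and the
 + * per-atom VdW types and charges used for matching.
 + */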
 +typedef struct 
 +{
 +    int    model;          
 +    int    count;
 +    int    vdwtype[4];
 +    real   charge[4];
 +} solvent_parameters_t;
 +
 +static void
 +check_solvent_cg(const gmx_moltype_t   *molt,
 +                 int                   cg0,
 +                 int                   nmol,
 +                 const unsigned char   *qm_grpnr,
 +                 const t_grps          *qm_grps,
 +                 t_forcerec *          fr,
 +                 int                   *n_solvent_parameters,
 +                 solvent_parameters_t  **solvent_parameters_p,
 +                 int                   cginfo,
 +                 int                   *cg_sp)
 +{
 +    const t_blocka *  excl;
 +    t_atom            *atom;
 +    int               j,k;
 +    int               j0,j1,nj;
 +    gmx_bool              perturbed;
 +    gmx_bool              has_vdw[4];
 +    gmx_bool              match;
 +    real              tmp_charge[4];
 +    int               tmp_vdwtype[4];
 +    int               tjA;
 +    gmx_bool              qm;
 +    solvent_parameters_t *solvent_parameters;
 +
 +    /* We use a list with parameters for each solvent type. 
 +     * Every time we discover a new molecule that fulfills the basic 
 +     * conditions for a solvent we compare with the previous entries
 +     * in these lists. If the parameters are the same we just increment
 +     * the counter for that type, and otherwise we create a new type
 +     * based on the current molecule.
 +     *
 +     * Once we've finished going through all molecules we check which
 +     * solvent is most common, and mark all those molecules while we
 +     * clear the flag on all others.
 +     */   
 +
 +    solvent_parameters = *solvent_parameters_p;
 +
 +    /* Mark the cg first as non-optimized */
 +    *cg_sp = -1;
 +    
 +    /* Check that this cg has no exclusions with atoms in other charge
 +     * groups and that all atoms inside the charge group are mutually excluded.
 +     * We only have 3 or 4 atom solvent loops.
 +     */
 +    if (GET_CGINFO_EXCL_INTER(cginfo) ||
 +        !GET_CGINFO_EXCL_INTRA(cginfo))
 +    {
 +        return;
 +    }
 +
 +    /* Get the atom index range of this charge group */
 +    j0     = molt->cgs.index[cg0];
 +    j1     = molt->cgs.index[cg0+1];
 +    
 +    /* Number of atoms in our molecule */
 +    nj     = j1 - j0;
 +
 +    if (debug) {
 +        fprintf(debug,
 +                "Moltype '%s': there are %d atoms in this charge group\n",
 +                *molt->name,nj);
 +    }
 +    
 +    /* Check if it could be an SPC (3 atoms) or TIP4P (4 atoms) water,
 +     * otherwise skip it.
 +     */
 +    if (nj<3 || nj>4)
 +    {
 +        return;
 +    }
 +    
 +    /* Check if we are doing QM on this group */
 +    qm = FALSE; 
 +    if (qm_grpnr != NULL)
 +    {
 +        for(j=j0 ; j<j1 && !qm; j++)
 +        {
 +            qm = (qm_grpnr[j] < qm_grps->nr - 1);
 +        }
 +    }
 +    /* Cannot use solvent optimization with QM */
 +    if (qm)
 +    {
 +        return;
 +    }
 +    
 +    atom = molt->atoms.atom;
 +
 +    /* Still looks like a solvent, time to check parameters */
 +    
 +    /* If it is perturbed (free energy) we can't use the solvent loops,
 +     * so then we just skip to the next molecule.
 +     */   
 +    perturbed = FALSE; 
 +    
 +    for(j=j0; j<j1 && !perturbed; j++)
 +    {
 +        perturbed = PERTURBED(atom[j]);
 +    }
 +    
 +    if (perturbed)
 +    {
 +        return;
 +    }
 +    
 +    /* Now it's only a question whether the VdW and charge parameters
 +     * are OK. Before doing the check we compare and see if they are 
 +     * identical to a possible previous solvent type.
 +     * First we assign the current types and charges.    
 +     */
 +    for(j=0; j<nj; j++)
 +    {
 +        tmp_vdwtype[j] = atom[j0+j].type;
 +        tmp_charge[j]  = atom[j0+j].q;
 +    } 
 +    
 +    /* Does it match any previous solvent type? */
 +    for(k=0 ; k<*n_solvent_parameters; k++)
 +    {
 +        match = TRUE;
 +        
 +        
 +        /* We can only match SPC with 3 atoms and TIP4p with 4 atoms */
 +        if( (solvent_parameters[k].model==esolSPC   && nj!=3)  ||
 +            (solvent_parameters[k].model==esolTIP4P && nj!=4) )
 +            match = FALSE;
 +        
 +        /* Check that types & charges match for all atoms in molecule */
 +        for(j=0 ; j<nj && match==TRUE; j++)
 +        {                     
 +            if (tmp_vdwtype[j] != solvent_parameters[k].vdwtype[j])
 +            {
 +                match = FALSE;
 +            }
 +            if(tmp_charge[j] != solvent_parameters[k].charge[j])
 +            {
 +                match = FALSE;
 +            }
 +        }
 +        if (match == TRUE)
 +        {
 +            /* Congratulations! We have a matched solvent.
 +             * Flag it with this type for later processing.
 +             */
 +            *cg_sp = k;
 +            solvent_parameters[k].count += nmol;
 +
 +            /* We are done with this charge group */
 +            return;
 +        }
 +    }
 +    
 +    /* If we get here, we have a tentative new solvent type.
 +     * Before we add it we must check that it fulfills the requirements
 +     * of the solvent optimized loops. First determine which atoms have
 +     * VdW interactions.   
 +     */
 +    for(j=0; j<nj; j++) 
 +    {
 +        has_vdw[j] = FALSE;
 +        tjA        = tmp_vdwtype[j];
 +        
 +        /* Go through all other types and see if any have non-zero
 +         * VdW parameters when combined with this one.
 +         */   
 +        for(k=0; k<fr->ntype && (has_vdw[j]==FALSE); k++)
 +        {
 +            /* We already checked that the atoms weren't perturbed,
 +             * so we only need to check state A now.
 +             */ 
 +            if (fr->bBHAM) 
 +            {
 +                has_vdw[j] = (has_vdw[j] || 
 +                              (BHAMA(fr->nbfp,fr->ntype,tjA,k) != 0.0) ||
 +                              (BHAMB(fr->nbfp,fr->ntype,tjA,k) != 0.0) ||
 +                              (BHAMC(fr->nbfp,fr->ntype,tjA,k) != 0.0));
 +            }
 +            else
 +            {
 +                /* Standard LJ */
 +                has_vdw[j] = (has_vdw[j] || 
 +                              (C6(fr->nbfp,fr->ntype,tjA,k)  != 0.0) ||
 +                              (C12(fr->nbfp,fr->ntype,tjA,k) != 0.0));
 +            }
 +        }
 +    }
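 +    /* Summary of the acceptance tests below (illustrative note, not
 +     * original code), with atoms numbered from 1:
 +     *
 +     *   model  atoms  VdW allowed on  charge pattern
 +     *   SPC    3      atom 1 only     q1 != 0, q2 = q3 != 0
 +     *   TIP4P  4      atom 1 only     q1 = 0, q2 = q3 != 0, q4 != 0
 +     */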
 +    
 +    /* Now we know all we need to make the final check and assignment. */
 +    if (nj == 3)
 +    {
 +        /* So, is it an SPC?
 +         * For this we require that all atoms have charge,
 +         * the charges on atoms 2 & 3 should be the same, and only
 +         * atom 1 might have VdW.
 +         */
 +        if (has_vdw[1] == FALSE &&
 +            has_vdw[2] == FALSE &&
 +            tmp_charge[0]  != 0 &&
 +            tmp_charge[1]  != 0 &&
 +            tmp_charge[2]  == tmp_charge[1])
 +        {
 +            srenew(solvent_parameters,*n_solvent_parameters+1);
 +            solvent_parameters[*n_solvent_parameters].model = esolSPC;
 +            solvent_parameters[*n_solvent_parameters].count = nmol;
 +            for(k=0;k<3;k++)
 +            {
 +                solvent_parameters[*n_solvent_parameters].vdwtype[k] = tmp_vdwtype[k];
 +                solvent_parameters[*n_solvent_parameters].charge[k]  = tmp_charge[k];
 +            }
 +
 +            *cg_sp = *n_solvent_parameters;
 +            (*n_solvent_parameters)++;
 +        }
 +    }
 +    else if (nj==4)
 +    {
 +        /* Or could it be a TIP4P?
 +         * For this we require that atoms 2,3,4 have charge, but not atom 1.
 +         * Only atom 1 might have VdW.
 +         */
 +        if(has_vdw[1] == FALSE &&
 +           has_vdw[2] == FALSE &&
 +           has_vdw[3] == FALSE &&
 +           tmp_charge[0]  == 0 &&
 +           tmp_charge[1]  != 0 &&
 +           tmp_charge[2]  == tmp_charge[1] &&
 +           tmp_charge[3]  != 0)
 +        {
 +            srenew(solvent_parameters,*n_solvent_parameters+1);
 +            solvent_parameters[*n_solvent_parameters].model = esolTIP4P;
 +            solvent_parameters[*n_solvent_parameters].count = nmol;
 +            for(k=0;k<4;k++)
 +            {
 +                solvent_parameters[*n_solvent_parameters].vdwtype[k] = tmp_vdwtype[k];
 +                solvent_parameters[*n_solvent_parameters].charge[k]  = tmp_charge[k];
 +            }
 +            
 +            *cg_sp = *n_solvent_parameters;
 +            (*n_solvent_parameters)++;
 +        }
 +    }
 +
 +    *solvent_parameters_p = solvent_parameters;
 +}
 +
 +static void
 +check_solvent(FILE *                fp,
 +              const gmx_mtop_t *    mtop,
 +              t_forcerec *          fr,
 +              cginfo_mb_t           *cginfo_mb)
 +{
 +    const t_block *   cgs;
 +    const t_block *   mols;
 +    const gmx_moltype_t *molt;
 +    int               mb,mol,cg_mol,at_offset,cg_offset,am,cgm,i,nmol_ch,nmol;
 +    int               n_solvent_parameters;
 +    solvent_parameters_t *solvent_parameters;
 +    int               **cg_sp;
 +    int               bestsp,bestsol;
 +
 +    if (debug)
 +    {
 +        fprintf(debug,"Going to determine what solvent types we have.\n");
 +    }
 +
 +    mols = &mtop->mols;
 +
 +    n_solvent_parameters = 0;
 +    solvent_parameters = NULL;
 +    /* Allocate temporary array for solvent type */
 +    snew(cg_sp,mtop->nmolblock);
 +
 +    cg_offset = 0;
 +    at_offset = 0;
 +    for(mb=0; mb<mtop->nmolblock; mb++)
 +    {
 +        molt = &mtop->moltype[mtop->molblock[mb].type];
 +        cgs  = &molt->cgs;
 +        /* Here we have to loop over all individual molecules
 +         * because we need to check for QMMM particles.
 +         */
 +        snew(cg_sp[mb],cginfo_mb[mb].cg_mod);
 +        nmol_ch = cginfo_mb[mb].cg_mod/cgs->nr;
 +        nmol    = mtop->molblock[mb].nmol/nmol_ch;
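 +        /* Explanatory note (not original): nmol_ch is 1 when all molecules
 +         * in this block share one cginfo set, otherwise it equals the
 +         * number of molecules; nmol is then the number of molecules that
 +         * each cginfo entry represents. */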
 +        for(mol=0; mol<nmol_ch; mol++)
 +        {
 +            cgm = mol*cgs->nr;
 +            am  = mol*cgs->index[cgs->nr];
 +            for(cg_mol=0; cg_mol<cgs->nr; cg_mol++)
 +            {
 +                check_solvent_cg(molt,cg_mol,nmol,
 +                                 mtop->groups.grpnr[egcQMMM] ?
 +                                 mtop->groups.grpnr[egcQMMM]+at_offset+am : 0,
 +                                 &mtop->groups.grps[egcQMMM],
 +                                 fr,
 +                                 &n_solvent_parameters,&solvent_parameters,
 +                                 cginfo_mb[mb].cginfo[cgm+cg_mol],
 +                                 &cg_sp[mb][cgm+cg_mol]);
 +            }
 +        }
 +        cg_offset += cgs->nr;
 +        at_offset += cgs->index[cgs->nr];
 +    }
 +
 +    /* Puh! We finished going through all charge groups.
 +     * Now find the most common solvent model.
 +     */   
 +    
 +    /* Most common solvent so far */
 +    bestsp = -2;
 +    for(i=0;i<n_solvent_parameters;i++)
 +    {
 +        if (bestsp == -2 ||
 +            solvent_parameters[i].count > solvent_parameters[bestsp].count)
 +        {
 +            bestsp = i;
 +        }
 +    }
 +    
 +    if (bestsp >= 0)
 +    {
 +        bestsol = solvent_parameters[bestsp].model;
 +    }
 +    else
 +    {
 +        bestsol = esolNO;
 +    }
 +    
 +#ifdef DISABLE_WATER_NLIST
 +    bestsol = esolNO;
 +#endif
 +
 +    fr->nWatMol = 0;
 +    for(mb=0; mb<mtop->nmolblock; mb++)
 +    {
 +        cgs = &mtop->moltype[mtop->molblock[mb].type].cgs;
 +        nmol = (mtop->molblock[mb].nmol*cgs->nr)/cginfo_mb[mb].cg_mod;
 +        for(i=0; i<cginfo_mb[mb].cg_mod; i++)
 +        {
 +            if (cg_sp[mb][i] == bestsp)
 +            {
 +                SET_CGINFO_SOLOPT(cginfo_mb[mb].cginfo[i],bestsol);
 +                fr->nWatMol += nmol;
 +            }
 +            else
 +            {
 +                SET_CGINFO_SOLOPT(cginfo_mb[mb].cginfo[i],esolNO);
 +            }
 +        }
 +        sfree(cg_sp[mb]);
 +    }
 +    sfree(cg_sp);
 +    
 +    if (bestsol != esolNO && fp!=NULL)
 +    {
 +        fprintf(fp,"\nEnabling %s-like water optimization for %d molecules.\n\n",
 +                esol_names[bestsol],
 +                solvent_parameters[bestsp].count);
 +    }
 +
 +    sfree(solvent_parameters);
 +    fr->solvent_opt = bestsol;
 +}
 +
 +enum { acNONE=0, acCONSTRAINT, acSETTLE };
 +
 +static cginfo_mb_t *init_cginfo_mb(FILE *fplog,const gmx_mtop_t *mtop,
 +                                   t_forcerec *fr,gmx_bool bNoSolvOpt,
 +                                   gmx_bool *bExcl_IntraCGAll_InterCGNone)
 +{
 +    const t_block *cgs;
 +    const t_blocka *excl;
 +    const gmx_moltype_t *molt;
 +    const gmx_molblock_t *molb;
 +    cginfo_mb_t *cginfo_mb;
 +    gmx_bool *type_VDW;
 +    int  *cginfo;
 +    int  cg_offset,a_offset,cgm,am;
 +    int  mb,m,ncg_tot,cg,a0,a1,gid,ai,j,aj,excl_nalloc;
 +    int  *a_con;
 +    int  ftype;
 +    int  ia;
 +    gmx_bool bId,*bExcl,bExclIntraAll,bExclInter,bHaveVDW,bHaveQ;
 +
 +    ncg_tot = ncg_mtop(mtop);
 +    snew(cginfo_mb,mtop->nmolblock);
 +
 +    snew(type_VDW,fr->ntype);
 +    for(ai=0; ai<fr->ntype; ai++)
 +    {
 +        type_VDW[ai] = FALSE;
 +        for(j=0; j<fr->ntype; j++)
 +        {
 +            type_VDW[ai] = type_VDW[ai] ||
 +                fr->bBHAM ||
 +                C6(fr->nbfp,fr->ntype,ai,j) != 0 ||
 +                C12(fr->nbfp,fr->ntype,ai,j) != 0;
 +        }
 +    }
 +
 +    *bExcl_IntraCGAll_InterCGNone = TRUE;
 +
 +    excl_nalloc = 10;
 +    snew(bExcl,excl_nalloc);
 +    cg_offset = 0;
 +    a_offset  = 0;
 +    for(mb=0; mb<mtop->nmolblock; mb++)
 +    {
 +        molb = &mtop->molblock[mb];
 +        molt = &mtop->moltype[molb->type];
 +        cgs  = &molt->cgs;
 +        excl = &molt->excls;
 +
 +        /* Check if the cginfo is identical for all molecules in this block.
 +         * If so, we only need an array of the size of one molecule.
 +         * Otherwise we make an array of #mol times #cgs per molecule.
 +         */
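 +        /* E.g. (hypothetical numbers): a block of 1000 identical water
 +         * molecules with one charge group each then needs a single cginfo
 +         * entry instead of 1000. */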
 +        bId = TRUE;
 +        am = 0;
 +        for(m=0; m<molb->nmol; m++)
 +        {
 +            am = m*cgs->index[cgs->nr];
 +            for(cg=0; cg<cgs->nr; cg++)
 +            {
 +                a0 = cgs->index[cg];
 +                a1 = cgs->index[cg+1];
 +                if (ggrpnr(&mtop->groups,egcENER,a_offset+am+a0) !=
 +                    ggrpnr(&mtop->groups,egcENER,a_offset   +a0))
 +                {
 +                    bId = FALSE;
 +                }
 +                if (mtop->groups.grpnr[egcQMMM] != NULL)
 +                {
 +                    for(ai=a0; ai<a1; ai++)
 +                    {
 +                        if (mtop->groups.grpnr[egcQMMM][a_offset+am+ai] !=
 +                            mtop->groups.grpnr[egcQMMM][a_offset   +ai])
 +                        {
 +                            bId = FALSE;
 +                        }
 +                    }
 +                }
 +            }
 +        }
 +
 +        cginfo_mb[mb].cg_start = cg_offset;
 +        cginfo_mb[mb].cg_end   = cg_offset + molb->nmol*cgs->nr;
 +        cginfo_mb[mb].cg_mod   = (bId ? 1 : molb->nmol)*cgs->nr;
 +        snew(cginfo_mb[mb].cginfo,cginfo_mb[mb].cg_mod);
 +        cginfo = cginfo_mb[mb].cginfo;
 +
 +        /* Set constraints flags for constrained atoms */
 +        snew(a_con,molt->atoms.nr);
 +        for(ftype=0; ftype<F_NRE; ftype++)
 +        {
 +            if (interaction_function[ftype].flags & IF_CONSTRAINT)
 +            {
 +                int nral;
 +
 +                nral = NRAL(ftype);
 +                for(ia=0; ia<molt->ilist[ftype].nr; ia+=1+nral)
 +                {
 +                    int a;
 +
 +                    for(a=0; a<nral; a++)
 +                    {
 +                        a_con[molt->ilist[ftype].iatoms[ia+1+a]] =
 +                            (ftype == F_SETTLE ? acSETTLE : acCONSTRAINT);
 +                    }
 +                }
 +            }
 +        }
 +
 +        for(m=0; m<(bId ? 1 : molb->nmol); m++)
 +        {
 +            cgm = m*cgs->nr;
 +            am  = m*cgs->index[cgs->nr];
 +            for(cg=0; cg<cgs->nr; cg++)
 +            {
 +                a0 = cgs->index[cg];
 +                a1 = cgs->index[cg+1];
 +
 +                /* Store the energy group in cginfo */
 +                gid = ggrpnr(&mtop->groups,egcENER,a_offset+am+a0);
 +                SET_CGINFO_GID(cginfo[cgm+cg],gid);
 +                
 +                /* Check the intra/inter charge group exclusions */
 +                if (a1-a0 > excl_nalloc) {
 +                    excl_nalloc = a1 - a0;
 +                    srenew(bExcl,excl_nalloc);
 +                }
 +                /* bExclIntraAll: all intra cg interactions excluded
 +                 * bExclInter:    any inter cg interactions excluded
 +                 */
 +                bExclIntraAll = TRUE;
 +                bExclInter    = FALSE;
 +                bHaveVDW      = FALSE;
 +                bHaveQ        = FALSE;
 +                for(ai=a0; ai<a1; ai++)
 +                {
 +                    /* Check VDW and electrostatic interactions */
 +                    bHaveVDW = bHaveVDW || (type_VDW[molt->atoms.atom[ai].type] ||
 +                                            type_VDW[molt->atoms.atom[ai].typeB]);
 +                    bHaveQ  = bHaveQ    || (molt->atoms.atom[ai].q != 0 ||
 +                                            molt->atoms.atom[ai].qB != 0);
 +
 +                    /* Clear the exclusion list for atom ai */
 +                    for(aj=a0; aj<a1; aj++)
 +                    {
 +                        bExcl[aj-a0] = FALSE;
 +                    }
 +                    /* Loop over all the exclusions of atom ai */
 +                    for(j=excl->index[ai]; j<excl->index[ai+1]; j++)
 +                    {
 +                        aj = excl->a[j];
 +                        if (aj < a0 || aj >= a1)
 +                        {
 +                            bExclInter = TRUE;
 +                        }
 +                        else
 +                        {
 +                            bExcl[aj-a0] = TRUE;
 +                        }
 +                    }
 +                    /* Check whether ai excludes all of atoms a0 to a1-1 */
 +                    for(aj=a0; aj<a1; aj++)
 +                    {
 +                        if (!bExcl[aj-a0])
 +                        {
 +                            bExclIntraAll = FALSE;
 +                        }
 +                    }
 +
 +                    switch (a_con[ai])
 +                    {
 +                    case acCONSTRAINT:
 +                        SET_CGINFO_CONSTR(cginfo[cgm+cg]);
 +                        break;
 +                    case acSETTLE:
 +                        SET_CGINFO_SETTLE(cginfo[cgm+cg]);
 +                        break;
 +                    default:
 +                        break;
 +                    }
 +                }
 +                if (bExclIntraAll)
 +                {
 +                    SET_CGINFO_EXCL_INTRA(cginfo[cgm+cg]);
 +                }
 +                if (bExclInter)
 +                {
 +                    SET_CGINFO_EXCL_INTER(cginfo[cgm+cg]);
 +                }
 +                if (a1 - a0 > MAX_CHARGEGROUP_SIZE)
 +                {
 +                    /* The size in cginfo is currently only read with DD */
 +                    gmx_fatal(FARGS,"A charge group has size %d which is larger than the limit of %d atoms",a1-a0,MAX_CHARGEGROUP_SIZE);
 +                }
 +                if (bHaveVDW)
 +                {
 +                    SET_CGINFO_HAS_VDW(cginfo[cgm+cg]);
 +                }
 +                if (bHaveQ)
 +                {
 +                    SET_CGINFO_HAS_Q(cginfo[cgm+cg]);
 +                }
 +                /* Store the charge group size */
 +                SET_CGINFO_NATOMS(cginfo[cgm+cg],a1-a0);
 +
 +                if (!bExclIntraAll || bExclInter)
 +                {
 +                    *bExcl_IntraCGAll_InterCGNone = FALSE;
 +                }
 +            }
 +        }
 +
 +        sfree(a_con);
 +
 +        cg_offset += molb->nmol*cgs->nr;
 +        a_offset  += molb->nmol*cgs->index[cgs->nr];
 +    }
 +    sfree(bExcl);
 +    
 +    /* The solvent optimizer is called after the QM is initialized,
 +     * because we don't want the QM subsystem to become an
 +     * optimized solvent.
 +     */
 +
 +    check_solvent(fplog,mtop,fr,cginfo_mb);
 +    
 +    if (getenv("GMX_NO_SOLV_OPT"))
 +    {
 +        if (fplog)
 +        {
 +            fprintf(fplog,"Found environment variable GMX_NO_SOLV_OPT.\n"
 +                    "Disabling all solvent optimization\n");
 +        }
 +        fr->solvent_opt = esolNO;
 +    }
 +    if (bNoSolvOpt)
 +    {
 +        fr->solvent_opt = esolNO;
 +    }
 +    if (!fr->solvent_opt)
 +    {
 +        for(mb=0; mb<mtop->nmolblock; mb++)
 +        {
 +            for(cg=0; cg<cginfo_mb[mb].cg_mod; cg++)
 +            {
 +                SET_CGINFO_SOLOPT(cginfo_mb[mb].cginfo[cg],esolNO);
 +            }
 +        }
 +    }
 +    
 +    return cginfo_mb;
 +}
 +
 +static int *cginfo_expand(int nmb,cginfo_mb_t *cgi_mb)
 +{
 +    int ncg,mb,cg;
 +    int *cginfo;
 +
 +    ncg = cgi_mb[nmb-1].cg_end;
 +    snew(cginfo,ncg);
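 +    /* Worked example (hypothetical numbers): for a single molblock of 1000
 +     * identical molecules with 3 charge groups each, cg_start is 0, cg_end
 +     * is 3000 and cg_mod is 3, so global charge group 2998 maps to
 +     * cginfo[(2998 - 0) % 3] = cginfo[1], the second charge group of each
 +     * molecule. */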
 +    mb = 0;
 +    for(cg=0; cg<ncg; cg++)
 +    {
 +        while (cg >= cgi_mb[mb].cg_end)
 +        {
 +            mb++;
 +        }
 +        cginfo[cg] =
 +            cgi_mb[mb].cginfo[(cg - cgi_mb[mb].cg_start) % cgi_mb[mb].cg_mod];
 +    }
 +
 +    return cginfo;
 +}
 +
 +static void set_chargesum(FILE *log,t_forcerec *fr,const gmx_mtop_t *mtop)
 +{
 +    double qsum,q2sum,q;
 +    int    mb,nmol,i;
 +    const t_atoms *atoms;
 +    
 +    qsum  = 0;
 +    q2sum = 0;
 +    for(mb=0; mb<mtop->nmolblock; mb++)
 +    {
 +        nmol  = mtop->molblock[mb].nmol;
 +        atoms = &mtop->moltype[mtop->molblock[mb].type].atoms;
 +        for(i=0; i<atoms->nr; i++)
 +        {
 +            q = atoms->atom[i].q;
 +            qsum  += nmol*q;
 +            q2sum += nmol*q*q;
 +        }
 +    }
 +    fr->qsum[0]  = qsum;
 +    fr->q2sum[0] = q2sum;
 +    if (fr->efep != efepNO)
 +    {
 +        qsum  = 0;
 +        q2sum = 0;
 +        for(mb=0; mb<mtop->nmolblock; mb++)
 +        {
 +            nmol  = mtop->molblock[mb].nmol;
 +            atoms = &mtop->moltype[mtop->molblock[mb].type].atoms;
 +            for(i=0; i<atoms->nr; i++)
 +            {
 +                q = atoms->atom[i].qB;
 +                qsum  += nmol*q;
 +                q2sum += nmol*q*q;
 +            }
 +        }
 +        fr->qsum[1]  = qsum;
 +        fr->q2sum[1] = q2sum;
 +    }
 +    else
 +    {
 +        fr->qsum[1]  = fr->qsum[0];
 +        fr->q2sum[1] = fr->q2sum[0];
 +    }
 +    if (log) {
 +        if (fr->efep == efepNO)
 +            fprintf(log,"System total charge: %.3f\n",fr->qsum[0]);
 +        else
 +            fprintf(log,"System total charge, top. A: %.3f top. B: %.3f\n",
 +                    fr->qsum[0],fr->qsum[1]);
 +    }
 +}
 +
 +void update_forcerec(FILE *log,t_forcerec *fr,matrix box)
 +{
 +    if (fr->eeltype == eelGRF)
 +    {
 +        calc_rffac(NULL,fr->eeltype,fr->epsilon_r,fr->epsilon_rf,
 +                   fr->rcoulomb,fr->temp,fr->zsquare,box,
 +                   &fr->kappa,&fr->k_rf,&fr->c_rf);
 +    }
 +}
 +
 +void set_avcsixtwelve(FILE *fplog,t_forcerec *fr,const gmx_mtop_t *mtop)
 +{
 +    const t_atoms *atoms,*atoms_tpi;
 +    const t_blocka *excl;
 +    int    mb,nmol,nmolc,i,j,tpi,tpj,j1,j2,k,n,nexcl,q;
 +#if (defined SIZEOF_LONG_LONG_INT) && (SIZEOF_LONG_LONG_INT >= 8)    
 +    long long int  npair,npair_ij,tmpi,tmpj;
 +#else
 +    double npair, npair_ij,tmpi,tmpj;
 +#endif
 +    double csix,ctwelve;
 +    int    ntp,*typecount;
 +    gmx_bool   bBHAM;
 +    real   *nbfp;
 +
 +    ntp = fr->ntype;
 +    bBHAM = fr->bBHAM;
 +    nbfp = fr->nbfp;
 +    
 +    for(q=0; q<(fr->efep==efepNO ? 1 : 2); q++) {
 +        csix = 0;
 +        ctwelve = 0;
 +        npair = 0;
 +        nexcl = 0;
 +        if (!fr->n_tpi) {
 +            /* Count the types so we avoid natoms^2 operations */
 +            snew(typecount,ntp);
 +            for(mb=0; mb<mtop->nmolblock; mb++) {
 +                nmol  = mtop->molblock[mb].nmol;
 +                atoms = &mtop->moltype[mtop->molblock[mb].type].atoms;
 +                for(i=0; i<atoms->nr; i++) {
 +                    if (q == 0)
 +                    {
 +                        tpi = atoms->atom[i].type;
 +                    }
 +                    else
 +                    {
 +                        tpi = atoms->atom[i].typeB;
 +                    }
 +                    typecount[tpi] += nmol;
 +                }
 +            }
 +            for(tpi=0; tpi<ntp; tpi++) {
 +                for(tpj=tpi; tpj<ntp; tpj++) {
 +                    tmpi = typecount[tpi];
 +                    tmpj = typecount[tpj];
 +                    if (tpi != tpj)
 +                    {
 +                        npair_ij = tmpi*tmpj;
 +                    }
 +                    else
 +                    {
 +                        npair_ij = tmpi*(tmpi - 1)/2;
 +                    }
 +                    if (bBHAM) {
 +                        /* nbfp now includes the 6.0 derivative prefactor */
 +                        csix    += npair_ij*BHAMC(nbfp,ntp,tpi,tpj)/6.0;
 +                    } else {
 +                        /* nbfp now includes the 6.0/12.0 derivative prefactors */
 +                        csix    += npair_ij*   C6(nbfp,ntp,tpi,tpj)/6.0;
 +                        ctwelve += npair_ij*  C12(nbfp,ntp,tpi,tpj)/12.0;
 +                    }
 +                    npair += npair_ij;
 +                }
 +            }
 +            sfree(typecount);
 +            /* Subtract the excluded pairs.
 +             * The main reason for subtracting exclusions is that in some cases
 +             * some combinations might never occur and the parameters could have
 +             * any value. These unused values should not influence the dispersion
 +             * correction.
 +             */
 +            for(mb=0; mb<mtop->nmolblock; mb++) {
 +                nmol  = mtop->molblock[mb].nmol;
 +                atoms = &mtop->moltype[mtop->molblock[mb].type].atoms;
 +                excl  = &mtop->moltype[mtop->molblock[mb].type].excls;
 +                for(i=0; (i<atoms->nr); i++) {
 +                    if (q == 0)
 +                    {
 +                        tpi = atoms->atom[i].type;
 +                    }
 +                    else
 +                    {
 +                        tpi = atoms->atom[i].typeB;
 +                    }
 +                    j1  = excl->index[i];
 +                    j2  = excl->index[i+1];
 +                    for(j=j1; j<j2; j++) {
 +                        k = excl->a[j];
 +                        if (k > i)
 +                        {
 +                            if (q == 0)
 +                            {
 +                                tpj = atoms->atom[k].type;
 +                            }
 +                            else
 +                            {
 +                                tpj = atoms->atom[k].typeB;
 +                            }
 +                            if (bBHAM) {
 +                                /* nbfp now includes the 6.0 derivative prefactor */
 +                                csix    -= nmol*BHAMC(nbfp,ntp,tpi,tpj)/6.0;
 +                            } else {
 +                                /* nbfp now includes the 6.0/12.0 derivative prefactors */
 +                                csix    -= nmol*C6 (nbfp,ntp,tpi,tpj)/6.0;
 +                                ctwelve -= nmol*C12(nbfp,ntp,tpi,tpj)/12.0;
 +                            }
 +                            nexcl += nmol;
 +                        }
 +                    }
 +                }
 +            }
 +        } else {
 +            /* Only correct for the interaction of the test particle
 +             * with the rest of the system.
 +             */
 +            atoms_tpi =
 +                &mtop->moltype[mtop->molblock[mtop->nmolblock-1].type].atoms;
 +
 +            npair = 0;
 +            for(mb=0; mb<mtop->nmolblock; mb++) {
 +                nmol  = mtop->molblock[mb].nmol;
 +                atoms = &mtop->moltype[mtop->molblock[mb].type].atoms;
 +                for(j=0; j<atoms->nr; j++) {
 +                    nmolc = nmol;
 +                    /* Remove the interaction of the test charge group
 +                     * with itself.
 +                     */
 +                    if (mb == mtop->nmolblock-1)
 +                    {
 +                        nmolc--;
 +                        
 +                        if (mb == 0 && nmol == 1)
 +                        {
 +                            gmx_fatal(FARGS,"Old format tpr with TPI, please generate a new tpr file");
 +                        }
 +                    }
 +                    if (q == 0)
 +                    {
 +                        tpj = atoms->atom[j].type;
 +                    }
 +                    else
 +                    {
 +                        tpj = atoms->atom[j].typeB;
 +                    }
 +                    for(i=0; i<fr->n_tpi; i++)
 +                    {
 +                        if (q == 0)
 +                        {
 +                            tpi = atoms_tpi->atom[i].type;
 +                        }
 +                        else
 +                        {
 +                            tpi = atoms_tpi->atom[i].typeB;
 +                        }
 +                        if (bBHAM)
 +                        {
 +                            /* nbfp now includes the 6.0 derivative prefactor */
 +                            csix    += nmolc*BHAMC(nbfp,ntp,tpi,tpj)/6.0;
 +                        }
 +                        else
 +                        {
 +                            /* nbfp now includes the 6.0/12.0 derivative prefactors */
 +                            csix    += nmolc*C6 (nbfp,ntp,tpi,tpj)/6.0;
 +                            ctwelve += nmolc*C12(nbfp,ntp,tpi,tpj)/12.0;
 +                        }
 +                        npair += nmolc;
 +                    }
 +                }
 +            }
 +        }
 +        if (npair - nexcl <= 0) {
 +            /* Avoid dividing by a non-positive pair count; only warn
 +             * when we have a log file. */
 +            if (fplog) {
 +                fprintf(fplog,"\nWARNING: There are no atom pairs for dispersion correction\n\n");
 +            }
 +            csix     = 0;
 +            ctwelve  = 0;
 +        } else {
 +            csix    /= npair - nexcl;
 +            ctwelve /= npair - nexcl;
 +        }
 +        if (debug) {
 +            fprintf(debug,"Counted %d exclusions\n",nexcl);
 +            fprintf(debug,"Average C6 parameter is: %10g\n",(double)csix);
 +            fprintf(debug,"Average C12 parameter is: %10g\n",(double)ctwelve);
 +        }
 +        fr->avcsix[q]    = csix;
 +        fr->avctwelve[q] = ctwelve;
 +    }
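 +    /* Sanity note (illustrative, not original code): for plain LJ the loop
 +     * above yields
 +     *   avcsix[q]    = (sum over non-excluded pairs of C6(ti,tj))  / (npair - nexcl)
 +     *   avctwelve[q] = (sum over non-excluded pairs of C12(ti,tj)) / (npair - nexcl)
 +     * with the 6.0/12.0 derivative prefactors stored in nbfp divided out.
 +     */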
 +    if (fplog != NULL)
 +    {
 +        if (fr->eDispCorr == edispcAllEner ||
 +            fr->eDispCorr == edispcAllEnerPres)
 +        {
 +            fprintf(fplog,"Long Range LJ corr.: <C6> %10.4e, <C12> %10.4e\n",
 +                    fr->avcsix[0],fr->avctwelve[0]);
 +        }
 +        else
 +        {
 +            fprintf(fplog,"Long Range LJ corr.: <C6> %10.4e\n",fr->avcsix[0]);
 +        }
 +    }
 +}
 +
 +
 +static void set_bham_b_max(FILE *fplog,t_forcerec *fr,
 +                           const gmx_mtop_t *mtop)
 +{
 +    const t_atoms *at1,*at2;
 +    int  mt1,mt2,i,j,tpi,tpj,ntypes;
 +    real b,bmin;
 +    real *nbfp;
 +
 +    if (fplog)
 +    {
 +        fprintf(fplog,"Determining largest Buckingham b parameter for table\n");
 +    }
 +    nbfp   = fr->nbfp;
 +    ntypes = fr->ntype;
 +    
 +    bmin           = -1;
 +    fr->bham_b_max = 0;
 +    for(mt1=0; mt1<mtop->nmoltype; mt1++)
 +    {
 +        at1 = &mtop->moltype[mt1].atoms;
 +        for(i=0; (i<at1->nr); i++)
 +        {
 +            tpi = at1->atom[i].type;
 +            if (tpi >= ntypes)
 +            {
 +                gmx_fatal(FARGS,"Atomtype[%d] = %d, maximum = %d",i,tpi,ntypes);
 +            }
 +            
 +            for(mt2=mt1; mt2<mtop->nmoltype; mt2++)
 +            {
 +                at2 = &mtop->moltype[mt2].atoms;
 +                for(j=0; (j<at2->nr); j++) {
 +                    tpj = at2->atom[j].type;
 +                    if (tpj >= ntypes)
 +                    {
 +                        gmx_fatal(FARGS,"Atomtype[%d] = %d, maximum = %d",j,tpj,ntypes);
 +                    }
 +                    b = BHAMB(nbfp,ntypes,tpi,tpj);
 +                    if (b > fr->bham_b_max)
 +                    {
 +                        fr->bham_b_max = b;
 +                    }
 +                    if ((b < bmin) || (bmin==-1))
 +                    {
 +                        bmin = b;
 +                    }
 +                }
 +            }
 +        }
 +    }
 +    if (fplog)
 +    {
 +        fprintf(fplog,"Buckingham b parameters, min: %g, max: %g\n",
 +                bmin,fr->bham_b_max);
 +    }
 +}
 +
 +static void make_nbf_tables(FILE *fp,const output_env_t oenv,
 +                            t_forcerec *fr,real rtab,
 +                            const t_commrec *cr,
 +                            const char *tabfn,char *eg1,char *eg2,
 +                            t_nblists *nbl)
 +{
 +    char buf[STRLEN];
 +    int i,j;
 +
 +    if (tabfn == NULL) {
 +        if (debug)
 +            fprintf(debug,"No table file name passed, can not read table, can not do non-bonded interactions\n");
 +        return;
 +    }
 +
 +    sprintf(buf,"%s",tabfn);
 +    if (eg1 && eg2)
 +    {
 +        /* Append the two energy group names */
 +        sprintf(buf + strlen(tabfn) - strlen(ftp2ext(efXVG)) - 1,"_%s_%s.%s",
 +                eg1,eg2,ftp2ext(efXVG));
 +    }
 +    nbl->table_elec_vdw = make_tables(fp,oenv,fr,MASTER(cr),buf,rtab,0);
 +    /* Copy the contents of the table to separate coulomb and LJ tables too,
 +     * to improve cache performance.
 +     * For performance reasons we want the table data to be 32-byte aligned.
 +     * The pointers could be freed but currently aren't.
 +     */
 +    nbl->table_elec.interaction = GMX_TABLE_INTERACTION_ELEC;
 +    nbl->table_elec.format = nbl->table_elec_vdw.format;
 +    nbl->table_elec.r = nbl->table_elec_vdw.r;
 +    nbl->table_elec.n = nbl->table_elec_vdw.n;
 +    nbl->table_elec.scale = nbl->table_elec_vdw.scale;
 +    nbl->table_elec.scale_exp = nbl->table_elec_vdw.scale_exp;
 +    nbl->table_elec.formatsize = nbl->table_elec_vdw.formatsize;
 +    nbl->table_elec.ninteractions = 1;
 +    nbl->table_elec.stride = nbl->table_elec.formatsize * nbl->table_elec.ninteractions;
 +    snew_aligned(nbl->table_elec.data,nbl->table_elec.stride*(nbl->table_elec.n+1),32);
 +
 +    nbl->table_vdw.interaction = GMX_TABLE_INTERACTION_VDWREP_VDWDISP;
 +    nbl->table_vdw.format = nbl->table_elec_vdw.format;
 +    nbl->table_vdw.r = nbl->table_elec_vdw.r;
 +    nbl->table_vdw.n = nbl->table_elec_vdw.n;
 +    nbl->table_vdw.scale = nbl->table_elec_vdw.scale;
 +    nbl->table_vdw.scale_exp = nbl->table_elec_vdw.scale_exp;
 +    nbl->table_vdw.formatsize = nbl->table_elec_vdw.formatsize;
 +    nbl->table_vdw.ninteractions = 2;
 +    nbl->table_vdw.stride = nbl->table_vdw.formatsize * nbl->table_vdw.ninteractions;
 +    snew_aligned(nbl->table_vdw.data,nbl->table_vdw.stride*(nbl->table_vdw.n+1),32);
 +
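 +    /* Layout note (explanatory, not original): each point of the combined
 +     * elec+vdw table holds 12 values, 4 for electrostatics followed by
 +     * 2x4 for VdW repulsion and dispersion, so the loop below copies
 +     * data[12*i+0..3] into the electrostatics table and data[12*i+4..11]
 +     * into the VdW table.
 +     */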
 +    for(i=0; i<=nbl->table_elec_vdw.n; i++)
 +    {
 +        for(j=0; j<4; j++)
 +            nbl->table_elec.data[4*i+j] = nbl->table_elec_vdw.data[12*i+j];
 +        for(j=0; j<8; j++)
 +            nbl->table_vdw.data[8*i+j] = nbl->table_elec_vdw.data[12*i+4+j];
 +    }
 +}
 +
 +static void count_tables(int ftype1,int ftype2,const gmx_mtop_t *mtop,
 +                         int *ncount,int **count)
 +{
 +    const gmx_moltype_t *molt;
 +    const t_ilist *il;
 +    int mt,ftype,stride,i,j,tabnr;
 +    
 +    for(mt=0; mt<mtop->nmoltype; mt++)
 +    {
 +        molt = &mtop->moltype[mt];
 +        for(ftype=0; ftype<F_NRE; ftype++)
 +        {
 +            if (ftype == ftype1 || ftype == ftype2) {
 +                il = &molt->ilist[ftype];
 +                stride = 1 + NRAL(ftype);
 +                for(i=0; i<il->nr; i+=stride) {
 +                    tabnr = mtop->ffparams.iparams[il->iatoms[i]].tab.table;
 +                    if (tabnr < 0)
 +                        gmx_fatal(FARGS,"A bonded table number is smaller than 0: %d\n",tabnr);
 +                    if (tabnr >= *ncount) {
 +                        srenew(*count,tabnr+1);
 +                        for(j=*ncount; j<tabnr+1; j++)
 +                            (*count)[j] = 0;
 +                        *ncount = tabnr+1;
 +                    }
 +                    (*count)[tabnr]++;
 +                }
 +            }
 +        }
 +    }
 +}
 +
 +static bondedtable_t *make_bonded_tables(FILE *fplog,
 +                                         int ftype1,int ftype2,
 +                                         const gmx_mtop_t *mtop,
 +                                         const char *basefn,const char *tabext)
 +{
 +    int  i,ncount,*count;
 +    char tabfn[STRLEN];
 +    bondedtable_t *tab;
 +    
 +    tab = NULL;
 +    
 +    ncount = 0;
 +    count = NULL;
 +    count_tables(ftype1,ftype2,mtop,&ncount,&count);
 +    
 +    if (ncount > 0) {
 +        snew(tab,ncount);
 +        for(i=0; i<ncount; i++) {
 +            if (count[i] > 0) {
 +                sprintf(tabfn,"%s",basefn);
 +                sprintf(tabfn + strlen(basefn) - strlen(ftp2ext(efXVG)) - 1,"_%s%d.%s",
 +                        tabext,i,ftp2ext(efXVG));
 +                tab[i] = make_bonded_table(fplog,tabfn,NRAL(ftype1)-2);
 +            }
 +        }
 +        sfree(count);
 +    }
 +  
 +    return tab;
 +}
 +
 +void forcerec_set_ranges(t_forcerec *fr,
 +                         int ncg_home,int ncg_force,
 +                         int natoms_force,
 +                         int natoms_force_constr,int natoms_f_novirsum)
 +{
 +    fr->cg0 = 0;
 +    fr->hcg = ncg_home;
 +
 +    /* fr->ncg_force is unused in the standard code,
 +     * but it can be useful for modified code dealing with charge groups.
 +     */
 +    fr->ncg_force           = ncg_force;
 +    fr->natoms_force        = natoms_force;
 +    fr->natoms_force_constr = natoms_force_constr;
 +
 +    if (fr->natoms_force_constr > fr->nalloc_force)
 +    {
 +        fr->nalloc_force = over_alloc_dd(fr->natoms_force_constr);
 +
 +        if (fr->bTwinRange)
 +        {
 +            srenew(fr->f_twin,fr->nalloc_force);
 +        }
 +    }
 +
 +    if (fr->bF_NoVirSum)
 +    {
 +        fr->f_novirsum_n = natoms_f_novirsum;
 +        if (fr->f_novirsum_n > fr->f_novirsum_nalloc)
 +        {
 +            fr->f_novirsum_nalloc = over_alloc_dd(fr->f_novirsum_n);
 +            srenew(fr->f_novirsum_alloc,fr->f_novirsum_nalloc);
 +        }
 +    }
 +    else
 +    {
 +        fr->f_novirsum_n = 0;
 +    }
 +}
 +
 +static real cutoff_inf(real cutoff)
 +{
 +    if (cutoff == 0)
 +    {
 +        cutoff = GMX_CUTOFF_INF;
 +    }
 +
 +    return cutoff;
 +}
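 +/* Usage note (explanatory, not original): a cut-off of 0 in the input
 + * means "no cut-off", so cutoff_inf() above maps it to the large
 + * GMX_CUTOFF_INF value to keep later range comparisons simple. */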
 +
 +static void make_adress_tf_tables(FILE *fp,const output_env_t oenv,
 +                                  t_forcerec *fr,const t_inputrec *ir,
 +                                  const char *tabfn,const gmx_mtop_t *mtop,
 +                                  matrix box)
 +{
 +    char buf[STRLEN];
 +    int i,j;
 +
 +    if (tabfn == NULL) {
 +        gmx_fatal(FARGS,"No thermoforce table file given. Use -tabletf to specify a file\n");
 +    }
 +
 +    snew(fr->atf_tabs, ir->adress->n_tf_grps);
 +
 +    /* Start from the plain table file name, then splice the group name
 +     * in before the extension for each thermoforce group. */
 +    sprintf(buf,"%s",tabfn);
 +    for (i=0; i<ir->adress->n_tf_grps; i++) {
 +        j = ir->adress->tf_table_index[i]; /* get energy group index */
 +        sprintf(buf + strlen(tabfn) - strlen(ftp2ext(efXVG)) - 1,"tf_%s.%s",
 +                *(mtop->groups.grpname[mtop->groups.grps[egcENER].nm_ind[j]]),ftp2ext(efXVG));
 +        printf("loading tf table for energygrp index %d from %s\n", j, buf);
 +        fr->atf_tabs[i] = make_atf_table(fp,oenv,fr,buf,box);
 +    }
 +}
 +
 +gmx_bool can_use_allvsall(const t_inputrec *ir, const gmx_mtop_t *mtop,
 +                      gmx_bool bPrintNote,t_commrec *cr,FILE *fp)
 +{
 +    gmx_bool bAllvsAll;
 +
 +    bAllvsAll =
 +        (
 +         ir->rlist==0            &&
 +         ir->rcoulomb==0         &&
 +         ir->rvdw==0             &&
 +         ir->ePBC==epbcNONE      &&
 +         ir->vdwtype==evdwCUT    &&
 +         ir->coulombtype==eelCUT &&
 +         ir->efep==efepNO        &&
 +         (ir->implicit_solvent == eisNO || 
 +          (ir->implicit_solvent==eisGBSA && (ir->gb_algorithm==egbSTILL || 
 +                                             ir->gb_algorithm==egbHCT   || 
 +                                             ir->gb_algorithm==egbOBC))) &&
 +         getenv("GMX_NO_ALLVSALL") == NULL
 +            );
 +    
 +    if (bAllvsAll && ir->opts.ngener > 1)
 +    {
 +        const char *note="NOTE: Can not use all-vs-all force loops, because there are multiple energy monitor groups; you might get significantly higher performance when using only a single energy monitor group.\n";
 +
 +        if (bPrintNote)
 +        {
 +            if (MASTER(cr))
 +            {
 +                fprintf(stderr,"\n%s\n",note);
 +            }
 +            if (fp != NULL)
 +            {
 +                fprintf(fp,"\n%s\n",note);
 +            }
 +        }
 +        bAllvsAll = FALSE;
 +    }
 +
 +    if(bAllvsAll && fp && MASTER(cr))
 +    {
 +        fprintf(fp,"\nUsing accelerated all-vs-all kernels.\n\n");
 +    }
 +    
 +    return bAllvsAll;
 +}
 +
 +
 +static void init_forcerec_f_threads(t_forcerec *fr,int nenergrp)
 +{
 +    int t,i;
 +
 +    /* These thread local data structures are used for bondeds only */
 +    fr->nthreads = gmx_omp_nthreads_get(emntBonded);
 +
 +    if (fr->nthreads > 1)
 +    {
 +        snew(fr->f_t,fr->nthreads);
 +        /* Thread 0 uses the global force and energy arrays */
 +        for(t=1; t<fr->nthreads; t++)
 +        {
 +            fr->f_t[t].f = NULL;
 +            fr->f_t[t].f_nalloc = 0;
 +            snew(fr->f_t[t].fshift,SHIFTS);
 +            fr->f_t[t].grpp.nener = nenergrp*nenergrp;
 +            for(i=0; i<egNR; i++)
 +            {
 +                snew(fr->f_t[t].grpp.ener[i],fr->f_t[t].grpp.nener);
 +            }
 +        }
 +    }
 +}
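 +/* Explanatory note (not original): the per-thread buffers allocated above
 + * collect bonded force and energy contributions that are afterwards reduced
 + * into the global arrays; thread 0 avoids the extra copy by writing to the
 + * global arrays directly. */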
 +
 +
 +static void pick_nbnxn_kernel_cpu(FILE *fp,
 +                                  const t_commrec *cr,
 +                                  const gmx_cpuid_t cpuid_info,
 +                                  const t_inputrec *ir,
 +                                  int *kernel_type,
 +                                  int *ewald_excl)
 +{
 +    *kernel_type = nbnxnk4x4_PlainC;
 +    *ewald_excl  = ewaldexclTable;
 +
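 +    /* Selection order below (summary note, not original code): plain C is
 +     * the fallback; with SIMD support the 4xN and then the 2xNN kernels
 +     * take precedence, AVX-256 reaction-field/cut-off runs switch back to
 +     * 4xN, and the GMX_NBNXN_SIMD_4XN / GMX_NBNXN_SIMD_2XNN environment
 +     * variables override everything.
 +     */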
 +#ifdef GMX_NBNXN_SIMD
 +    {
 +#ifdef GMX_NBNXN_SIMD_4XN
 +        *kernel_type = nbnxnk4xN_SIMD_4xN;
 +#endif
 +#ifdef GMX_NBNXN_SIMD_2XNN
 +        /* We expect the 2xNN kernels to be faster in most cases */
 +        *kernel_type = nbnxnk4xN_SIMD_2xNN;
 +#endif
 +
 +#if defined GMX_NBNXN_SIMD_4XN && defined GMX_X86_AVX_256
 +        if (EEL_RF(ir->coulombtype) || ir->coulombtype == eelCUT)
 +        {
 +            /* The raw pair rate of the 4x8 kernel is higher than 2x(4+4),
 +             * 10% with HT, 50% without HT, but extra zero interactions
 +             * can compensate. As we currently don't detect the actual use
 +             * of HT, switch to 4x8 to avoid a potential performance hit.
 +             */
 +            *kernel_type = nbnxnk4xN_SIMD_4xN;
 +        }
 +#endif
 +        if (getenv("GMX_NBNXN_SIMD_4XN") != NULL)
 +        {
 +#ifdef GMX_NBNXN_SIMD_4XN
 +            *kernel_type = nbnxnk4xN_SIMD_4xN;
 +#else
 +            gmx_fatal(FARGS,"SIMD 4xN kernels requested, but Gromacs has been compiled without support for these kernels");
 +#endif
 +        }
 +        if (getenv("GMX_NBNXN_SIMD_2XNN") != NULL)
 +        {
 +#ifdef GMX_NBNXN_SIMD_2XNN
 +            *kernel_type = nbnxnk4xN_SIMD_2xNN;
 +#else
 +            gmx_fatal(FARGS,"SIMD 2x(N+N) kernels requested, but Gromacs has been compiled without support for these kernels");
 +#endif
 +        }
 +
 +        /* Analytical Ewald exclusion correction is only an option in the
 +         * x86 SIMD kernel. This is faster in single precision
 +         * on Bulldozer and slightly faster on Sandy Bridge.
 +         */
 +#if (defined GMX_X86_AVX_128_FMA || defined GMX_X86_AVX_256) && !defined GMX_DOUBLE
 +        *ewald_excl = ewaldexclAnalytical;
 +#endif
 +        if (getenv("GMX_NBNXN_EWALD_TABLE") != NULL)
 +        {
 +            *ewald_excl = ewaldexclTable;
 +        }
 +        if (getenv("GMX_NBNXN_EWALD_ANALYTICAL") != NULL)
 +        {
 +            *ewald_excl = ewaldexclAnalytical;
 +        }
 +
 +    }
 +#endif /* GMX_NBNXN_SIMD */
 +}
 +
 +
 +const char *lookup_nbnxn_kernel_name(int kernel_type)
 +{
 +    const char *returnvalue = NULL;
 +    switch(kernel_type)
 +    {
 +    case nbnxnkNotSet: returnvalue = "not set"; break;
 +    case nbnxnk4x4_PlainC: returnvalue = "plain C"; break;
 +#ifndef GMX_NBNXN_SIMD
 +    case nbnxnk4xN_SIMD_4xN: returnvalue = "not available"; break;
 +    case nbnxnk4xN_SIMD_2xNN: returnvalue = "not available"; break;
 +#else
 +#ifdef GMX_X86_SSE2
 +#if GMX_NBNXN_SIMD_BITWIDTH == 128
 +        /* x86 SIMD intrinsics can be converted to either SSE or AVX depending
 +         * on compiler flags. As we use nearly identical intrinsics, using an AVX
 +         * compiler flag without an AVX macro effectively results in AVX kernels.
 +         * For gcc we check for __AVX__
 +         * At least a check for icc should be added (if there is a macro)
 +         */
 +#if !(defined GMX_X86_AVX_128_FMA || defined __AVX__)
 +#ifndef GMX_X86_SSE4_1
 +    case nbnxnk4xN_SIMD_4xN: returnvalue = "SSE2"; break;
 +    case nbnxnk4xN_SIMD_2xNN: returnvalue = "SSE2"; break;
 +#else
 +    case nbnxnk4xN_SIMD_4xN: returnvalue = "SSE4.1"; break;
 +    case nbnxnk4xN_SIMD_2xNN: returnvalue = "SSE4.1"; break;
 +#endif
 +#else
 +    case nbnxnk4xN_SIMD_4xN: returnvalue = "AVX-128"; break;
 +    case nbnxnk4xN_SIMD_2xNN: returnvalue = "AVX-128"; break;
 +#endif
 +#endif
 +#if GMX_NBNXN_SIMD_BITWIDTH == 256
 +    case nbnxnk4xN_SIMD_4xN: returnvalue = "AVX-256"; break;
 +    case nbnxnk4xN_SIMD_2xNN: returnvalue = "AVX-256"; break;
 +#endif
 +#else /* not GMX_X86_SSE2 */
 +    case nbnxnk4xN_SIMD_4xN: returnvalue = "SIMD"; break;
 +    case nbnxnk4xN_SIMD_2xNN: returnvalue = "SIMD"; break;
 +#endif
 +#endif
 +    case nbnxnk8x8x8_CUDA: returnvalue = "CUDA"; break;
 +    case nbnxnk8x8x8_PlainC: returnvalue = "plain C"; break;
 +
 +    case nbnxnkNR:
 +    default:
 +        gmx_fatal(FARGS, "Illegal kernel type selected");
 +        returnvalue = NULL;
 +        break;
 +    }
 +    return returnvalue;
 +}
 +
 +static void pick_nbnxn_kernel(FILE *fp,
 +                              const t_commrec *cr,
 +                              const gmx_hw_info_t *hwinfo,
 +                              gmx_bool use_cpu_acceleration,
-     gmx_bool bEmulateGPU, bGPU, bEmulateGPUEnvVarSet;
-     char gpu_err_str[STRLEN];
++                              gmx_bool bUseGPU,
++                              gmx_bool bEmulateGPU,
 +                              const t_inputrec *ir,
 +                              int *kernel_type,
 +                              int *ewald_excl,
 +                              gmx_bool bDoNonbonded)
 +{
-     bEmulateGPUEnvVarSet = (getenv("GMX_EMULATE_GPU") != NULL);
-     /* if bUseGPU == NULL we don't want a GPU (e.g. hybrid mode kernel selection) */
-     bGPU = ((bUseGPU != NULL) && hwinfo->bCanUseGPU);
-     /* Run GPU emulation mode if GMX_EMULATE_GPU is defined. We will
-      * automatically switch to emulation if non-bonded calculations are
-      * turned off via GMX_NO_NONBONDED - this is the simple and elegant
-      * way to turn off GPU initialization, data movement, and cleanup. */
-     bEmulateGPU = (bEmulateGPUEnvVarSet || (!bDoNonbonded && bGPU));
-     /* Enable GPU mode when GPUs are available or GPU emulation is requested.
-      * The latter is useful to assess the performance one can expect by adding
-      * GPU(s) to the machine. The conditional below allows this even if mdrun
-      * is compiled without GPU acceleration support.
-      * Note that such a GPU acceleration performance assessment should be
-      * carried out by setting the GMX_EMULATE_GPU and GMX_NO_NONBONDED env. vars
-      * (and freezing the system as otherwise it would explode). */
-     if (bGPU || bEmulateGPUEnvVarSet)
-     {
-         if (bEmulateGPU)
-         {
-             bGPU = FALSE;
-         }
-         else
-         {
-             /* Each PP node will use the intra-node id-th device from the
-              * list of detected/selected GPUs. */
-             if (!init_gpu(cr->rank_pp_intranode, gpu_err_str, &hwinfo->gpu_info))
-             {
-                 /* At this point the init should never fail as we made sure that
-                  * we have all the GPUs we need. If it still does, we'll bail. */
-                 gmx_fatal(FARGS, "On node %d failed to initialize GPU #%d: %s",
-                           cr->nodeid,
-                           get_gpu_device_id(&hwinfo->gpu_info, cr->rank_pp_intranode),
-                           gpu_err_str);
-             }
-         }
-         *bUseGPU = bGPU;
-     }
 +    assert(kernel_type);
 +
 +    *kernel_type = nbnxnkNotSet;
 +    *ewald_excl  = ewaldexclTable;
 +
-     else if (bGPU)
 +    if (bEmulateGPU)
 +    {
 +        *kernel_type = nbnxnk8x8x8_PlainC;
 +
 +        if (bDoNonbonded)
 +        {
 +            md_print_warn(cr, fp, "Emulating a GPU run on the CPU (slow)");
 +        }
 +    }
-     gmx_bool bHybridGPURun = FALSE;
++    else if (bUseGPU)
 +    {
 +        *kernel_type = nbnxnk8x8x8_CUDA;
 +    }
 +
 +    if (*kernel_type == nbnxnkNotSet)
 +    {
 +        if (use_cpu_acceleration)
 +        {
 +            pick_nbnxn_kernel_cpu(fp,cr,hwinfo->cpuid_info,ir,
 +                                  kernel_type,ewald_excl);
 +        }
 +        else
 +        {
 +            *kernel_type = nbnxnk4x4_PlainC;
 +        }
 +    }
 +
 +    if (bDoNonbonded && fp != NULL)
 +    {
 +        fprintf(fp,"\nUsing %s %dx%d non-bonded kernels\n\n",
 +                lookup_nbnxn_kernel_name(*kernel_type),
 +                nbnxn_kernel_pairlist_simple(*kernel_type) ? NBNXN_CPU_CLUSTER_I_SIZE : NBNXN_GPU_CLUSTER_SIZE,
 +                nbnxn_kernel_to_cj_size(*kernel_type));
 +    }
 +}
 +
++static void pick_nbnxn_resources(FILE *fp,
++                                 const t_commrec *cr,
++                                 const gmx_hw_info_t *hwinfo,
++                                 gmx_bool bDoNonbonded,
++                                 gmx_bool *bUseGPU,
++                                 gmx_bool *bEmulateGPU)
++{
++    gmx_bool bEmulateGPUEnvVarSet;
++    char gpu_err_str[STRLEN];
++
++    *bUseGPU = FALSE;
++
++    bEmulateGPUEnvVarSet = (getenv("GMX_EMULATE_GPU") != NULL);
++
++    /* Run GPU emulation mode if GMX_EMULATE_GPU is defined. Because
++     * GPUs (currently) only handle non-bonded calculations, we will
++     * automatically switch to emulation if non-bonded calculations are
++     * turned off via GMX_NO_NONBONDED - this is the simple and elegant
++     * way to turn off GPU initialization, data movement, and cleanup.
++     *
++     * GPU emulation can be useful to assess the performance one can expect by
++     * adding GPU(s) to the machine. The conditional below allows this even
++     * if mdrun is compiled without GPU acceleration support.
++     * Note that you should freeze the system as otherwise it will explode.
++     */
++    *bEmulateGPU = (bEmulateGPUEnvVarSet ||
++                    (!bDoNonbonded && hwinfo->bCanUseGPU));
++
++    /* Enable GPU mode when GPUs are available and no GPU emulation is requested.
++     */
++    if (hwinfo->bCanUseGPU && !(*bEmulateGPU))
++    {
++        /* Each PP node will use the intra-node id-th device from the
++         * list of detected/selected GPUs. */
++        if (!init_gpu(cr->rank_pp_intranode, gpu_err_str, &hwinfo->gpu_info))
++        {
++            /* At this point the init should never fail as we made sure that
++             * we have all the GPUs we need. If it still does, we'll bail. */
++            gmx_fatal(FARGS, "On node %d failed to initialize GPU #%d: %s",
++                      cr->nodeid,
++                      get_gpu_device_id(&hwinfo->gpu_info, cr->rank_pp_intranode),
++                      gpu_err_str);
++        }
++
++        /* Here we actually turn on hardware GPU acceleration */
++        *bUseGPU = TRUE;
++    }
++}
++
 +gmx_bool uses_simple_tables(int cutoff_scheme,
 +                            nonbonded_verlet_t *nbv,
 +                            int group)
 +{
 +    gmx_bool bUsesSimpleTables = TRUE;
 +    int grp_index;
 +
 +    switch(cutoff_scheme)
 +    {
 +    case ecutsGROUP:
 +        bUsesSimpleTables = TRUE;
 +        break;
 +    case ecutsVERLET:
 +        assert(NULL != nbv && NULL != nbv->grp);
 +        grp_index = (group < 0) ? 0 : (nbv->ngrp - 1);
 +        bUsesSimpleTables = nbnxn_kernel_pairlist_simple(nbv->grp[grp_index].kernel_type);
 +        break;
 +    default:
 +        gmx_incons("unimplemented");
 +    }
 +    return bUsesSimpleTables;
 +}
 +
 +static void init_ewald_f_table(interaction_const_t *ic,
 +                               gmx_bool bUsesSimpleTables,
 +                               real rtab)
 +{
 +    real maxr;
 +
 +    if (bUsesSimpleTables)
 +    {
 +        /* With a spacing of 0.0005 we are at the force summation accuracy
 +         * for the SSE kernels for "normal" atomistic simulations.
 +         */
 +        ic->tabq_scale = ewald_spline3_table_scale(ic->ewaldcoeff,
 +                                                   ic->rcoulomb);
 +        
 +        maxr = (rtab>ic->rcoulomb) ? rtab : ic->rcoulomb;
 +        ic->tabq_size  = (int)(maxr*ic->tabq_scale) + 2;
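 +        /* Example (hypothetical numbers): with maxr = 1.0 nm and a scale
 +         * of 2000 points/nm this gives 2002 table points, i.e. the
 +         * 0.0005 nm spacing mentioned above. */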
 +    }
 +    else
 +    {
 +        ic->tabq_size = GPU_EWALD_COULOMB_FORCE_TABLE_SIZE;
 +        /* Subtract 2 instead of 1 to avoid access out of range due to rounding */
 +        ic->tabq_scale = (ic->tabq_size - 2)/ic->rcoulomb;
 +    }
 +
 +    sfree_aligned(ic->tabq_coul_FDV0);
 +    sfree_aligned(ic->tabq_coul_F);
 +    sfree_aligned(ic->tabq_coul_V);
 +
 +    /* Create the original table data in FDV0 */
 +    snew_aligned(ic->tabq_coul_FDV0,ic->tabq_size*4,32);
 +    snew_aligned(ic->tabq_coul_F,ic->tabq_size,32);
 +    snew_aligned(ic->tabq_coul_V,ic->tabq_size,32);
 +    table_spline3_fill_ewald_lr(ic->tabq_coul_F,ic->tabq_coul_V,ic->tabq_coul_FDV0,
 +                                ic->tabq_size,1/ic->tabq_scale,ic->ewaldcoeff);
 +}
 +
 +void init_interaction_const_tables(FILE *fp, 
 +                                   interaction_const_t *ic,
 +                                   gmx_bool bUsesSimpleTables,
 +                                   real rtab)
 +{
 +    if (ic->eeltype == eelEWALD || EEL_PME(ic->eeltype))
 +    {
 +        init_ewald_f_table(ic,bUsesSimpleTables,rtab);
 +
 +        if (fp != NULL)
 +        {
 +            fprintf(fp,"Initialized non-bonded Ewald correction tables, spacing: %.2e size: %d\n\n",
 +                    1/ic->tabq_scale,ic->tabq_size);
 +        }
 +    }
 +}
 +
 +void init_interaction_const(FILE *fp, 
 +                            interaction_const_t **interaction_const,
 +                            const t_forcerec *fr,
 +                            real  rtab)
 +{
 +    interaction_const_t *ic;
 +    gmx_bool bUsesSimpleTables = TRUE;
 +
 +    snew(ic, 1);
 +
 +    /* Just allocate something so we can free it */
 +    snew_aligned(ic->tabq_coul_FDV0,16,32);
 +    snew_aligned(ic->tabq_coul_F,16,32);
 +    snew_aligned(ic->tabq_coul_V,16,32);
 +
 +    ic->rlist       = fr->rlist;
 +    ic->rlistlong   = fr->rlistlong;
 +    
 +    /* Lennard-Jones */
 +    ic->rvdw        = fr->rvdw;
 +    if (fr->vdw_modifier==eintmodPOTSHIFT)
 +    {
 +        ic->sh_invrc6 = pow(ic->rvdw,-6.0);
 +    }
 +    else
 +    {
 +        ic->sh_invrc6 = 0;
 +    }
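 +    /* Explanatory note (not original): with the potential-shift modifier
 +     * the LJ potential is shifted by its value at the cut-off,
 +     * c12*rvdw^-12 - c6*rvdw^-6, so storing rvdw^-6 is sufficient; its
 +     * square gives the r^-12 part, as printed below. */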
 +
 +    /* Electrostatics */
 +    ic->eeltype     = fr->eeltype;
 +    ic->rcoulomb    = fr->rcoulomb;
 +    ic->epsilon_r   = fr->epsilon_r;
 +    ic->epsfac      = fr->epsfac;
 +
 +    /* Ewald */
 +    ic->ewaldcoeff  = fr->ewaldcoeff;
 +    if (fr->coulomb_modifier==eintmodPOTSHIFT)
 +    {
 +        ic->sh_ewald = gmx_erfc(ic->ewaldcoeff*ic->rcoulomb);
 +    }
 +    else
 +    {
 +        ic->sh_ewald = 0;
 +    }
 +
 +    /* Reaction-field */
 +    if (EEL_RF(ic->eeltype))
 +    {
 +        ic->epsilon_rf = fr->epsilon_rf;
 +        ic->k_rf       = fr->k_rf;
 +        ic->c_rf       = fr->c_rf;
 +    }
 +    else
 +    {
 +        /* For plain cut-off we might use the reaction-field kernels */
 +        ic->epsilon_rf = ic->epsilon_r;
 +        ic->k_rf       = 0;
 +        if (fr->coulomb_modifier==eintmodPOTSHIFT)
 +        {
 +            ic->c_rf   = 1/ic->rcoulomb;
 +        }
 +        else
 +        {
 +            ic->c_rf   = 0;
 +        }
 +    }
 +
 +    if (fp != NULL)
 +    {
 +        fprintf(fp,"Potential shift: LJ r^-12: %.3f r^-6: %.3f",
 +                sqr(ic->sh_invrc6),ic->sh_invrc6);
 +        if (ic->eeltype == eelCUT)
 +        {
 +            fprintf(fp,", Coulomb %.3f",ic->c_rf);
 +        }
 +        else if (EEL_PME(ic->eeltype))
 +        {
 +            fprintf(fp,", Ewald %.3e",ic->sh_ewald);
 +        }
 +        fprintf(fp,"\n");
 +    }
 +
 +    *interaction_const = ic;
 +
 +    if (fr->nbv != NULL && fr->nbv->bUseGPU)
 +    {
 +        nbnxn_cuda_init_const(fr->nbv->cu_nbv, ic, fr->nbv);
 +    }
 +
 +    bUsesSimpleTables = uses_simple_tables(fr->cutoff_scheme, fr->nbv, -1);
 +    init_interaction_const_tables(fp,ic,bUsesSimpleTables,rtab);
 +}
 +
 +static void init_nb_verlet(FILE *fp,
 +                           nonbonded_verlet_t **nb_verlet,
 +                           const t_inputrec *ir,
 +                           const t_forcerec *fr,
 +                           const t_commrec *cr,
 +                           const char *nbpu_opt)
 +{
 +    nonbonded_verlet_t *nbv;
 +    int  i;
 +    char *env;
-                               &nbv->bUseGPU,
++    gmx_bool bEmulateGPU, bHybridGPURun = FALSE;
 +
 +    nbnxn_alloc_t *nb_alloc;
 +    nbnxn_free_t  *nb_free;
 +
 +    snew(nbv, 1);
 +
++    pick_nbnxn_resources(fp, cr, fr->hwinfo,
++                         fr->bNonbonded,
++                         &nbv->bUseGPU,
++                         &bEmulateGPU);
++
 +    nbv->nbs = NULL;
 +
 +    nbv->ngrp = (DOMAINDECOMP(cr) ? 2 : 1);
 +    for(i=0; i<nbv->ngrp; i++)
 +    {
 +        nbv->grp[i].nbl_lists.nnbl = 0;
 +        nbv->grp[i].nbat           = NULL;
 +        nbv->grp[i].kernel_type    = nbnxnkNotSet;
 +
 +        if (i == 0) /* local */
 +        {
 +            pick_nbnxn_kernel(fp, cr, fr->hwinfo, fr->use_cpu_acceleration,
-                                   NULL,
++                              nbv->bUseGPU, bEmulateGPU,
 +                              ir,
 +                              &nbv->grp[i].kernel_type,
 +                              &nbv->grp[i].ewald_excl,
 +                              fr->bNonbonded);
 +        }
 +        else /* non-local */
 +        {
 +            if (nbpu_opt != NULL && strcmp(nbpu_opt,"gpu_cpu") == 0)
 +            {
 +                /* Use GPU for local, select a CPU kernel for non-local */
 +                pick_nbnxn_kernel(fp, cr, fr->hwinfo, fr->use_cpu_acceleration,
-             fr->nbkernel_elec_interaction = GMX_NBKERNEL_ELEC_COULOMB;
++                                  FALSE, FALSE,
 +                                  ir,
 +                                  &nbv->grp[i].kernel_type,
 +                                  &nbv->grp[i].ewald_excl,
 +                                  fr->bNonbonded);
 +
 +                bHybridGPURun = TRUE;
 +            }
 +            else
 +            {
 +                /* Use the same kernel for local and non-local interactions */
 +                nbv->grp[i].kernel_type = nbv->grp[0].kernel_type;
 +                nbv->grp[i].ewald_excl  = nbv->grp[0].ewald_excl;
 +            }
 +        }
 +    }
 +
 +    if (nbv->bUseGPU)
 +    {
 +        /* init the NxN GPU data; the last argument tells whether we'll have
 +         * both local and non-local NB calculation on GPU */
 +        nbnxn_cuda_init(fp, &nbv->cu_nbv,
 +                        &fr->hwinfo->gpu_info, cr->rank_pp_intranode,
 +                        (nbv->ngrp > 1) && !bHybridGPURun);
 +
 +        if ((env = getenv("GMX_NB_MIN_CI")) != NULL)
 +        {
 +            char *end;
 +
 +            nbv->min_ci_balanced = strtol(env, &end, 10);
 +            if (!end || (*end != 0) || nbv->min_ci_balanced <= 0)
 +            {
 +                gmx_fatal(FARGS, "Invalid value passed in GMX_NB_MIN_CI=%s, positive integer required", env);
 +            }
 +
 +            if (debug)
 +            {
 +                fprintf(debug, "Neighbor-list balancing parameter: %d (passed as env. var.)\n", 
 +                        nbv->min_ci_balanced);
 +            }
 +        }
 +        else
 +        {
 +            nbv->min_ci_balanced = nbnxn_cuda_min_ci_balanced(nbv->cu_nbv);
 +            if (debug)
 +            {
 +                fprintf(debug, "Neighbor-list balancing parameter: %d (auto-adjusted to the number of GPU multi-processors)\n",
 +                        nbv->min_ci_balanced);
 +            }
 +        }
 +    }
 +    else
 +    {
 +        nbv->min_ci_balanced = 0;
 +    }
 +
 +    *nb_verlet = nbv;
 +
 +    nbnxn_init_search(&nbv->nbs,
 +                      DOMAINDECOMP(cr) ? & cr->dd->nc : NULL,
 +                      DOMAINDECOMP(cr) ? domdec_zones(cr->dd) : NULL,
 +                      gmx_omp_nthreads_get(emntNonbonded));
 +
 +    for(i=0; i<nbv->ngrp; i++)
 +    {
 +        if (nbv->grp[0].kernel_type == nbnxnk8x8x8_CUDA)
 +        {
 +            nb_alloc = &pmalloc;
 +            nb_free  = &pfree;
 +        }
 +        else
 +        {
 +            nb_alloc = NULL;
 +            nb_free  = NULL;
 +        }
 +
 +        nbnxn_init_pairlist_set(&nbv->grp[i].nbl_lists,
 +                                nbnxn_kernel_pairlist_simple(nbv->grp[i].kernel_type),
 +                                /* 8x8x8 "non-simple" lists are at the moment always combined */
 +                                !nbnxn_kernel_pairlist_simple(nbv->grp[i].kernel_type),
 +                                nb_alloc, nb_free);
 +
 +        if (i == 0 ||
 +            nbv->grp[0].kernel_type != nbv->grp[i].kernel_type)
 +        {
 +            snew(nbv->grp[i].nbat,1);
 +            nbnxn_atomdata_init(fp,
 +                                nbv->grp[i].nbat,
 +                                nbv->grp[i].kernel_type,
 +                                fr->ntype,fr->nbfp,
 +                                ir->opts.ngener,
 +                                nbnxn_kernel_pairlist_simple(nbv->grp[i].kernel_type) ? gmx_omp_nthreads_get(emntNonbonded) : 1,
 +                                nb_alloc, nb_free);
 +        }
 +        else
 +        {
 +            nbv->grp[i].nbat = nbv->grp[0].nbat;
 +        }
 +    }
 +}
 +
 +void init_forcerec(FILE *fp,
 +                   const output_env_t oenv,
 +                   t_forcerec *fr,
 +                   t_fcdata   *fcd,
 +                   const t_inputrec *ir,
 +                   const gmx_mtop_t *mtop,
 +                   const t_commrec  *cr,
 +                   matrix     box,
 +                   gmx_bool       bMolEpot,
 +                   const char *tabfn,
 +                   const char *tabafn,
 +                   const char *tabpfn,
 +                   const char *tabbfn,
 +                   const char *nbpu_opt,
 +                   gmx_bool   bNoSolvOpt,
 +                   real       print_force)
 +{
 +    int     i,j,m,natoms,ngrp,negp_pp,negptable,egi,egj;
 +    real    rtab;
 +    char    *env;
 +    double  dbl;
 +    rvec    box_size;
 +    const t_block *cgs;
 +    gmx_bool    bGenericKernelOnly;
 +    gmx_bool    bTab,bSep14tab,bNormalnblists;
 +    t_nblists *nbl;
 +    int     *nm_ind,egp_flags;
 +    
 +    /* By default we turn acceleration on, but it might be turned off further down... */
 +    fr->use_cpu_acceleration = TRUE;
 +
 +    fr->bDomDec = DOMAINDECOMP(cr);
 +
 +    natoms = mtop->natoms;
 +
 +    if (check_box(ir->ePBC,box))
 +    {
 +        gmx_fatal(FARGS,"%s",check_box(ir->ePBC,box));
 +    }
 +    
 +    /* Test particle insertion ? */
 +    if (EI_TPI(ir->eI)) {
 +        /* Set to the size of the molecule to be inserted (the last one) */
 +        /* Because of old style topologies, we have to use the last cg
 +         * instead of the last molecule type.
 +         */
 +        cgs = &mtop->moltype[mtop->molblock[mtop->nmolblock-1].type].cgs;
 +        fr->n_tpi = cgs->index[cgs->nr] - cgs->index[cgs->nr-1];
 +        if (fr->n_tpi != mtop->mols.index[mtop->mols.nr] - mtop->mols.index[mtop->mols.nr-1]) {
 +            gmx_fatal(FARGS,"The molecule to insert can not consist of multiple charge groups.\nMake it a single charge group.");
 +        }
 +    } else {
 +        fr->n_tpi = 0;
 +    }
 +    
 +    /* Copy AdResS parameters */
 +    if (ir->bAdress) {
 +      fr->adress_type     = ir->adress->type;
 +      fr->adress_const_wf = ir->adress->const_wf;
 +      fr->adress_ex_width = ir->adress->ex_width;
 +      fr->adress_hy_width = ir->adress->hy_width;
 +      fr->adress_icor     = ir->adress->icor;
 +      fr->adress_site     = ir->adress->site;
 +      fr->adress_ex_forcecap = ir->adress->ex_forcecap;
 +      fr->adress_do_hybridpairs = ir->adress->do_hybridpairs;
 +
 +
 +      snew(fr->adress_group_explicit , ir->adress->n_energy_grps);
 +      for (i=0; i< ir->adress->n_energy_grps; i++){
 +          fr->adress_group_explicit[i]= ir->adress->group_explicit[i];
 +      }
 +
 +      fr->n_adress_tf_grps = ir->adress->n_tf_grps;
 +      snew(fr->adress_tf_table_index, fr->n_adress_tf_grps);
 +      for (i=0; i< fr->n_adress_tf_grps; i++){
 +          fr->adress_tf_table_index[i]= ir->adress->tf_table_index[i];
 +      }
 +      copy_rvec(ir->adress->refs,fr->adress_refs);
 +    } else {
 +      fr->adress_type = eAdressOff;
 +      fr->adress_do_hybridpairs = FALSE;
 +    }
 +    
 +    /* Copy the user determined parameters */
 +    fr->userint1 = ir->userint1;
 +    fr->userint2 = ir->userint2;
 +    fr->userint3 = ir->userint3;
 +    fr->userint4 = ir->userint4;
 +    fr->userreal1 = ir->userreal1;
 +    fr->userreal2 = ir->userreal2;
 +    fr->userreal3 = ir->userreal3;
 +    fr->userreal4 = ir->userreal4;
 +    
 +    /* Shell stuff */
 +    fr->fc_stepsize = ir->fc_stepsize;
 +    
 +    /* Free energy */
 +    fr->efep       = ir->efep;
 +    fr->sc_alphavdw = ir->fepvals->sc_alpha;
 +    if (ir->fepvals->bScCoul)
 +    {
 +        fr->sc_alphacoul = ir->fepvals->sc_alpha;
 +        fr->sc_sigma6_min = pow(ir->fepvals->sc_sigma_min,6);
 +    }
 +    else
 +    {
 +        fr->sc_alphacoul = 0;
 +        fr->sc_sigma6_min = 0; /* only needed when bScCoul is on */
 +    }
 +    fr->sc_power   = ir->fepvals->sc_power;
 +    fr->sc_r_power   = ir->fepvals->sc_r_power;
 +    fr->sc_sigma6_def = pow(ir->fepvals->sc_sigma,6);
 +
 +    env = getenv("GMX_SCSIGMA_MIN");
 +    if (env != NULL)
 +    {
 +        dbl = 0;
 +        sscanf(env,"%lf",&dbl);
 +        fr->sc_sigma6_min = pow(dbl,6);
 +        if (fp)
 +        {
 +            fprintf(fp,"Setting the minimum soft core sigma to %g nm\n",dbl);
 +        }
 +    }
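 +    /* Worked example: with sc-sigma = 0.3 nm (a common default),
 +     * sc_sigma6_def = 0.3^6 = 7.29e-4 nm^6; GMX_SCSIGMA_MIN=0.2 would give
 +     * sc_sigma6_min = 0.2^6 = 6.4e-5 nm^6.
 +     */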
 +
 +    fr->bNonbonded = TRUE;
 +    if (getenv("GMX_NO_NONBONDED") != NULL)
 +    {
 +        /* turn off non-bonded calculations */
 +        fr->bNonbonded = FALSE;
 +        md_print_warn(cr,fp,
 +                      "Found environment variable GMX_NO_NONBONDED.\n"
 +                      "Disabling nonbonded calculations.\n");
 +    }
 +
 +    bGenericKernelOnly = FALSE;
 +
 +    /* We now check in the NS code whether a particular combination of interactions
 +     * can be used with water optimization, and disable it if that is not the case.
 +     */
 +
 +    if (getenv("GMX_NB_GENERIC") != NULL)
 +    {
 +        if (fp != NULL)
 +        {
 +            fprintf(fp,
 +                    "Found environment variable GMX_NB_GENERIC.\n"
 +                    "Disabling all interaction-specific nonbonded kernels, will only\n"
 +                    "use the slow generic ones in src/gmxlib/nonbonded/nb_generic.c\n\n");
 +        }
 +        bGenericKernelOnly = TRUE;
 +    }
 +
 +    if (bGenericKernelOnly==TRUE)
 +    {
 +        bNoSolvOpt         = TRUE;
 +    }
 +
 +    if( (getenv("GMX_DISABLE_CPU_ACCELERATION") != NULL) || (getenv("GMX_NOOPTIMIZEDKERNELS") != NULL) )
 +    {
 +        fr->use_cpu_acceleration = FALSE;
 +        if (fp != NULL)
 +        {
 +            fprintf(fp,
 +                    "\nFound environment variable GMX_DISABLE_CPU_ACCELERATION (or GMX_NOOPTIMIZEDKERNELS).\n"
 +                    "Disabling all CPU architecture-specific (e.g. SSE2/SSE4/AVX) routines.\n\n");
 +        }
 +    }
 +
 +    fr->bBHAM = (mtop->ffparams.functype[0] == F_BHAM);
 +
 +    /* Check if we can/should do all-vs-all kernels */
 +    fr->bAllvsAll       = can_use_allvsall(ir,mtop,FALSE,NULL,NULL);
 +    fr->AllvsAll_work   = NULL;
 +    fr->AllvsAll_workgb = NULL;
 +
 +
 +    /* Neighbour searching stuff */
 +    fr->cutoff_scheme = ir->cutoff_scheme;
 +    fr->bGrid         = (ir->ns_type == ensGRID);
 +    fr->ePBC          = ir->ePBC;
 +
 +    /* Determine if we will do PBC for distances in bonded interactions */
 +    if (fr->ePBC == epbcNONE)
 +    {
 +        fr->bMolPBC = FALSE;
 +    }
 +    else
 +    {
 +        if (!DOMAINDECOMP(cr))
 +        {
 +            /* The group cut-off scheme and SHAKE assume charge groups
 +             * are whole, but not using molpbc is faster in most cases.
 +             */
 +            if (fr->cutoff_scheme == ecutsGROUP ||
 +                (ir->eConstrAlg == econtSHAKE &&
 +                 (gmx_mtop_ftype_count(mtop,F_CONSTR) > 0 ||
 +                  gmx_mtop_ftype_count(mtop,F_CONSTRNC) > 0)))
 +            {
 +                fr->bMolPBC = ir->bPeriodicMols;
 +            }
 +            else
 +            {
 +                fr->bMolPBC = TRUE;
 +                if (getenv("GMX_USE_GRAPH") != NULL)
 +                {
 +                    fr->bMolPBC = FALSE;
 +                    if (fp)
 +                    {
 +                        fprintf(fp,"\nGMX_USE_GRAPH is set, using the graph for bonded interactions\n\n");
 +                    }
 +                }
 +            }
 +        }
 +        else
 +        {
 +            fr->bMolPBC = dd_bonded_molpbc(cr->dd,fr->ePBC);
 +        }
 +    }
++    fr->bGB = (ir->implicit_solvent == eisGBSA);
 +
 +    fr->rc_scaling = ir->refcoord_scaling;
 +    copy_rvec(ir->posres_com,fr->posres_com);
 +    copy_rvec(ir->posres_comB,fr->posres_comB);
 +    fr->rlist      = cutoff_inf(ir->rlist);
 +    fr->rlistlong  = cutoff_inf(ir->rlistlong);
 +    fr->eeltype    = ir->coulombtype;
 +    fr->vdwtype    = ir->vdwtype;
 +
 +    fr->coulomb_modifier = ir->coulomb_modifier;
 +    fr->vdw_modifier     = ir->vdw_modifier;
 +
 +    /* Electrostatics: Translate from interaction-setting-in-mdp-file to kernel interaction format */
 +    switch(fr->eeltype)
 +    {
 +        case eelCUT:
-     fr->bGB = (ir->implicit_solvent == eisGBSA);
++            fr->nbkernel_elec_interaction = (fr->bGB) ? GMX_NBKERNEL_ELEC_GENERALIZEDBORN : GMX_NBKERNEL_ELEC_COULOMB;
 +            break;
 +
 +        case eelRF:
 +        case eelGRF:
 +        case eelRF_NEC:
 +            fr->nbkernel_elec_interaction = GMX_NBKERNEL_ELEC_REACTIONFIELD;
 +            break;
 +
 +        case eelRF_ZERO:
 +            fr->nbkernel_elec_interaction = GMX_NBKERNEL_ELEC_REACTIONFIELD;
 +            fr->coulomb_modifier          = eintmodEXACTCUTOFF;
 +            break;
 +
 +        case eelSWITCH:
 +        case eelSHIFT:
 +        case eelUSER:
 +        case eelENCADSHIFT:
 +        case eelPMESWITCH:
 +        case eelPMEUSER:
 +        case eelPMEUSERSWITCH:
 +            fr->nbkernel_elec_interaction = GMX_NBKERNEL_ELEC_CUBICSPLINETABLE;
 +            break;
 +
 +        case eelPME:
 +        case eelEWALD:
 +            fr->nbkernel_elec_interaction = GMX_NBKERNEL_ELEC_EWALD;
 +            break;
 +
 +        default:
 +            gmx_fatal(FARGS,"Unsupported electrostatic interaction: %s",eel_names[fr->eeltype]);
 +            break;
 +    }
 +
 +    /* Vdw: Translate from mdp settings to kernel format */
 +    switch(fr->vdwtype)
 +    {
 +        case evdwCUT:
 +            if(fr->bBHAM)
 +            {
 +                fr->nbkernel_vdw_interaction = GMX_NBKERNEL_VDW_BUCKINGHAM;
 +            }
 +            else
 +            {
 +                fr->nbkernel_vdw_interaction = GMX_NBKERNEL_VDW_LENNARDJONES;
 +            }
 +            break;
 +
 +        case evdwSWITCH:
 +        case evdwSHIFT:
 +        case evdwUSER:
 +        case evdwENCADSHIFT:
 +            fr->nbkernel_vdw_interaction = GMX_NBKERNEL_VDW_CUBICSPLINETABLE;
 +            break;
 +
 +        default:
 +            gmx_fatal(FARGS,"Unsupported vdw interaction: %s",evdw_names[fr->vdwtype]);
 +            break;
 +    }
 +
 +    /* These start out identical to ir, but might be altered if we e.g. tabulate the interaction in the kernel */
 +    fr->nbkernel_elec_modifier    = fr->coulomb_modifier;
 +    fr->nbkernel_vdw_modifier     = fr->vdw_modifier;
 +
 +    fr->bTwinRange = fr->rlistlong > fr->rlist;
 +    fr->bEwald     = (EEL_PME(fr->eeltype) || fr->eeltype==eelEWALD);
 +    
 +    fr->reppow     = mtop->ffparams.reppow;
 +
 +    if (ir->cutoff_scheme == ecutsGROUP)
 +    {
 +        fr->bvdwtab    = (fr->vdwtype != evdwCUT ||
 +                          !gmx_within_tol(fr->reppow,12.0,10*GMX_DOUBLE_EPS));
 +        /* We have special kernels for standard Ewald and PME, but the pme-switch ones are tabulated above */
 +        fr->bcoultab   = !(fr->eeltype == eelCUT ||
 +                           fr->eeltype == eelEWALD ||
 +                           fr->eeltype == eelPME ||
 +                           fr->eeltype == eelRF ||
 +                           fr->eeltype == eelRF_ZERO);
 +
 +        /* If the user absolutely wants different switch/shift settings for coul/vdw, it is likely
 +         * going to be faster to tabulate the interaction than calling the generic kernel.
 +         */
 +        if(fr->nbkernel_elec_modifier==eintmodPOTSWITCH && fr->nbkernel_vdw_modifier==eintmodPOTSWITCH)
 +        {
 +            if((fr->rcoulomb_switch != fr->rvdw_switch) || (fr->rcoulomb != fr->rvdw))
 +            {
 +                fr->bcoultab = TRUE;
 +            }
 +        }
 +        else if((fr->nbkernel_elec_modifier==eintmodPOTSHIFT && fr->nbkernel_vdw_modifier==eintmodPOTSHIFT) ||
 +                ((fr->nbkernel_elec_interaction == GMX_NBKERNEL_ELEC_REACTIONFIELD &&
 +                  fr->nbkernel_elec_modifier==eintmodEXACTCUTOFF &&
 +                  (fr->nbkernel_vdw_modifier==eintmodPOTSWITCH || fr->nbkernel_vdw_modifier==eintmodPOTSHIFT))))
 +        {
 +            if(fr->rcoulomb != fr->rvdw)
 +            {
 +                fr->bcoultab = TRUE;
 +            }
 +        }
 +
 +        if (getenv("GMX_REQUIRE_TABLES"))
 +        {
 +            fr->bvdwtab  = TRUE;
 +            fr->bcoultab = TRUE;
 +        }
 +
 +        if (fp)
 +        {
 +            fprintf(fp,"Table routines are used for coulomb: %s\n",bool_names[fr->bcoultab]);
 +            fprintf(fp,"Table routines are used for vdw:     %s\n",bool_names[fr->bvdwtab ]);
 +        }
 +
 +        if(fr->bvdwtab==TRUE)
 +        {
 +            fr->nbkernel_vdw_interaction = GMX_NBKERNEL_VDW_CUBICSPLINETABLE;
 +            fr->nbkernel_vdw_modifier    = eintmodNONE;
 +        }
 +        if(fr->bcoultab==TRUE)
 +        {
 +            fr->nbkernel_elec_interaction = GMX_NBKERNEL_ELEC_CUBICSPLINETABLE;
 +            fr->nbkernel_elec_modifier    = eintmodNONE;
 +        }
 +    }
 +
 +    if (ir->cutoff_scheme == ecutsVERLET)
 +    {
 +        if (!gmx_within_tol(fr->reppow,12.0,10*GMX_DOUBLE_EPS))
 +        {
 +            gmx_fatal(FARGS,"Cut-off scheme %S only supports LJ repulsion power 12",ecutscheme_names[ir->cutoff_scheme]);
 +        }
 +        fr->bvdwtab  = FALSE;
 +        fr->bcoultab = FALSE;
 +    }
 +    
 +    /* Tables are used for direct ewald sum */
 +    if(fr->bEwald)
 +    {
 +        if (EEL_PME(ir->coulombtype))
 +        {
 +            if (fp)
 +                fprintf(fp,"Will do PME sum in reciprocal space.\n");
 +            if (ir->coulombtype == eelP3M_AD)
 +            {
 +                please_cite(fp,"Hockney1988");
 +                please_cite(fp,"Ballenegger2012");
 +            }
 +            else
 +            {
 +                please_cite(fp,"Essmann95a");
 +            }
 +            
 +            if (ir->ewald_geometry == eewg3DC)
 +            {
 +                if (fp)
 +                {
 +                    fprintf(fp,"Using the Ewald3DC correction for systems with a slab geometry.\n");
 +                }
 +                please_cite(fp,"In-Chul99a");
 +            }
 +        }
 +        fr->ewaldcoeff=calc_ewaldcoeff(ir->rcoulomb, ir->ewald_rtol);
 +        init_ewald_tab(&(fr->ewald_table), cr, ir, fp);
 +        if (fp)
 +        {
 +            fprintf(fp,"Using a Gaussian width (1/beta) of %g nm for Ewald\n",
 +                    1/fr->ewaldcoeff);
 +        }
 +    }
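 +    /* Numerical sketch, assuming calc_ewaldcoeff solves
 +     * erfc(beta*rcoulomb) = ewald_rtol for beta: with rcoulomb = 1.0 nm and
 +     * ewald_rtol = 1e-5 this yields beta of roughly 3.12 nm^-1, i.e. the
 +     * Gaussian width 1/beta of about 0.32 nm reported above.
 +     */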
 +    
 +    /* Electrostatics */
 +    fr->epsilon_r  = ir->epsilon_r;
 +    fr->epsilon_rf = ir->epsilon_rf;
 +    fr->fudgeQQ    = mtop->ffparams.fudgeQQ;
 +    fr->rcoulomb_switch = ir->rcoulomb_switch;
 +    fr->rcoulomb        = cutoff_inf(ir->rcoulomb);
 +    
 +    /* Parameters for generalized RF */
 +    fr->zsquare = 0.0;
 +    fr->temp    = 0.0;
 +    
 +    if (fr->eeltype == eelGRF)
 +    {
 +        init_generalized_rf(fp,mtop,ir,fr);
 +    }
 +    else if (fr->eeltype == eelSHIFT)
 +    {
 +        for(m=0; (m<DIM); m++)
 +            box_size[m]=box[m][m];
 +        
 +        if (fr->rcoulomb > fr->rcoulomb_switch)
 +            set_shift_consts(fp,fr->rcoulomb_switch,fr->rcoulomb,box_size,fr);
 +    }
 +    
 +    fr->bF_NoVirSum = (EEL_FULL(fr->eeltype) ||
 +                       gmx_mtop_ftype_count(mtop,F_POSRES) > 0 ||
 +                       gmx_mtop_ftype_count(mtop,F_FBPOSRES) > 0 ||
 +                       IR_ELEC_FIELD(*ir) ||
 +                       (fr->adress_icor != eAdressICOff)
 +                      );
 +    
 +    if (fr->cutoff_scheme == ecutsGROUP &&
 +        ncg_mtop(mtop) > fr->cg_nalloc && !DOMAINDECOMP(cr)) {
 +        /* Count the total number of charge groups */
 +        fr->cg_nalloc = ncg_mtop(mtop);
 +        srenew(fr->cg_cm,fr->cg_nalloc);
 +    }
 +    if (fr->shift_vec == NULL)
 +        snew(fr->shift_vec,SHIFTS);
 +    
 +    if (fr->fshift == NULL)
 +        snew(fr->fshift,SHIFTS);
 +    
 +    if (fr->nbfp == NULL) {
 +        fr->ntype = mtop->ffparams.atnr;
 +        fr->nbfp  = mk_nbfp(&mtop->ffparams,fr->bBHAM);
 +    }
 +    
 +    /* Copy the energy group exclusions */
 +    fr->egp_flags = ir->opts.egp_flags;
 +    
 +    /* Van der Waals stuff */
 +    fr->rvdw        = cutoff_inf(ir->rvdw);
 +    fr->rvdw_switch = ir->rvdw_switch;
 +    if ((fr->vdwtype != evdwCUT) && (fr->vdwtype != evdwUSER) && !fr->bBHAM) {
 +        if (fr->rvdw_switch >= fr->rvdw)
 +            gmx_fatal(FARGS,"rvdw_switch (%f) must be < rvdw (%f)",
 +                      fr->rvdw_switch,fr->rvdw);
 +        if (fp)
 +            fprintf(fp,"Using %s Lennard-Jones, switch between %g and %g nm\n",
 +                    (fr->vdwtype==evdwSWITCH) ? "switched":"shifted",
 +                    fr->rvdw_switch,fr->rvdw);
 +    } 
 +    
 +    if (fr->bBHAM && (fr->vdwtype == evdwSHIFT || fr->vdwtype == evdwSWITCH))
 +        gmx_fatal(FARGS,"Switch/shift interaction not supported with Buckingham");
 +    
 +    if (fp)
 +        fprintf(fp,"Cut-off's:   NS: %g   Coulomb: %g   %s: %g\n",
 +                fr->rlist,fr->rcoulomb,fr->bBHAM ? "BHAM":"LJ",fr->rvdw);
 +    
 +    fr->eDispCorr = ir->eDispCorr;
 +    if (ir->eDispCorr != edispcNO)
 +    {
 +        set_avcsixtwelve(fp,fr,mtop);
 +    }
 +    
 +    if (fr->bBHAM)
 +    {
 +        set_bham_b_max(fp,fr,mtop);
 +    }
 +
 +    fr->gb_epsilon_solvent = ir->gb_epsilon_solvent;
 +
 +    /* Copy the GBSA data (radius, volume and surftens for each
 +     * atomtype) from the topology atomtype section to forcerec.
 +     */
 +    snew(fr->atype_radius,fr->ntype);
 +    snew(fr->atype_vol,fr->ntype);
 +    snew(fr->atype_surftens,fr->ntype);
 +    snew(fr->atype_gb_radius,fr->ntype);
 +    snew(fr->atype_S_hct,fr->ntype);
 +
 +    if (mtop->atomtypes.nr > 0)
 +    {
 +        for(i=0;i<fr->ntype;i++)
 +            fr->atype_radius[i] =mtop->atomtypes.radius[i];
 +        for(i=0;i<fr->ntype;i++)
 +            fr->atype_vol[i] = mtop->atomtypes.vol[i];
 +        for(i=0;i<fr->ntype;i++)
 +            fr->atype_surftens[i] = mtop->atomtypes.surftens[i];
 +        for(i=0;i<fr->ntype;i++)
 +            fr->atype_gb_radius[i] = mtop->atomtypes.gb_radius[i];
 +        for(i=0;i<fr->ntype;i++)
 +            fr->atype_S_hct[i] = mtop->atomtypes.S_hct[i];
 +    }  
 +
 +    /* Generate the GB table if needed */
 +    if(fr->bGB)
 +    {
 +#ifdef GMX_DOUBLE
 +        fr->gbtabscale=2000;
 +#else
 +        fr->gbtabscale=500;
 +#endif
 +
 +        fr->gbtabr=100;
 +        fr->gbtab=make_gb_table(fp,oenv,fr,tabpfn,fr->gbtabscale);
 +
 +        init_gb(&fr->born,cr,fr,ir,mtop,ir->rgbradii,ir->gb_algorithm);
 +
 +        /* Copy local gb data (for dd, this is done in dd_partition_system) */
 +        if (!DOMAINDECOMP(cr))
 +        {
 +            make_local_gb(cr,fr->born,ir->gb_algorithm);
 +        }
 +    }
 +
 +    /* Set the charge scaling */
 +    if (fr->epsilon_r != 0)
 +        fr->epsfac = ONE_4PI_EPS0/fr->epsilon_r;
 +    else
 +        /* eps = 0 is infinite dielectric: no Coulomb interactions */
 +        fr->epsfac = 0;
 +    
 +    /* Reaction field constants */
 +    if (EEL_RF(fr->eeltype))
 +        calc_rffac(fp,fr->eeltype,fr->epsilon_r,fr->epsilon_rf,
 +                   fr->rcoulomb,fr->temp,fr->zsquare,box,
 +                   &fr->kappa,&fr->k_rf,&fr->c_rf);
 +    
 +    set_chargesum(fp,fr,mtop);
 +    
 +    /* if we are using LR electrostatics, and they are tabulated,
 +     * the tables will contain modified coulomb interactions.
 +     * Since we want to use the non-shifted ones for 1-4
 +     * coulombic interactions, we must have an extra set of tables.
 +     */
 +    
 +    /* Construct tables.
 +     * A little unnecessary to make both vdw and coul tables sometimes,
 +     * but what the heck... */
 +    
 +    bTab = fr->bcoultab || fr->bvdwtab || fr->bEwald;
 +
 +    bSep14tab = ((!bTab || fr->eeltype!=eelCUT || fr->vdwtype!=evdwCUT ||
 +                  fr->bBHAM || fr->bEwald) &&
 +                 (gmx_mtop_ftype_count(mtop,F_LJ14) > 0 ||
 +                  gmx_mtop_ftype_count(mtop,F_LJC14_Q) > 0 ||
 +                  gmx_mtop_ftype_count(mtop,F_LJC_PAIRS_NB) > 0));
 +
 +    negp_pp = ir->opts.ngener - ir->nwall;
 +    negptable = 0;
 +    if (!bTab) {
 +        bNormalnblists = TRUE;
 +        fr->nnblists = 1;
 +    } else {
 +        bNormalnblists = (ir->eDispCorr != edispcNO);
 +        for(egi=0; egi<negp_pp; egi++) {
 +            for(egj=egi;  egj<negp_pp; egj++) {
 +                egp_flags = ir->opts.egp_flags[GID(egi,egj,ir->opts.ngener)];
 +                if (!(egp_flags & EGP_EXCL)) {
 +                    if (egp_flags & EGP_TABLE) {
 +                        negptable++;
 +                    } else {
 +                        bNormalnblists = TRUE;
 +                    }
 +                }
 +            }
 +        }
 +        if (bNormalnblists) {
 +            fr->nnblists = negptable + 1;
 +        } else {
 +            fr->nnblists = negptable;
 +        }
 +        if (fr->nnblists > 1)
 +            snew(fr->gid2nblists,ir->opts.ngener*ir->opts.ngener);
 +    }
 +
 +    if (ir->adress){
 +        fr->nnblists*=2;
 +    }
 +
 +    snew(fr->nblists,fr->nnblists);
 +    
 +    /* Without cut-offs this automatically gives a table length of tabext;
 +     * in that case grompp should already have checked that we do not need
 +     * normal tables, and we only generate tables for 1-4 interactions.
 +     */
 +    rtab = ir->rlistlong + ir->tabext;
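 +    /* Example: with rlistlong = 1.0 nm and a table extension of 1 nm (the
 +     * usual mdp default), the tables extend to rtab = 2.0 nm. */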
 +
 +    if (bTab) {
 +        /* make tables for ordinary interactions */
 +        if (bNormalnblists) {
 +            make_nbf_tables(fp,oenv,fr,rtab,cr,tabfn,NULL,NULL,&fr->nblists[0]);
 +            if (ir->adress){
 +                make_nbf_tables(fp,oenv,fr,rtab,cr,tabfn,NULL,NULL,&fr->nblists[fr->nnblists/2]);
 +            }
 +            if (!bSep14tab)
 +                fr->tab14 = fr->nblists[0].table_elec_vdw;
 +            m = 1;
 +        } else {
 +            m = 0;
 +        }
 +        if (negptable > 0) {
 +            /* Read the special tables for certain energy group pairs */
 +            nm_ind = mtop->groups.grps[egcENER].nm_ind;
 +            for(egi=0; egi<negp_pp; egi++) {
 +                for(egj=egi;  egj<negp_pp; egj++) {
 +                    egp_flags = ir->opts.egp_flags[GID(egi,egj,ir->opts.ngener)];
 +                    if ((egp_flags & EGP_TABLE) && !(egp_flags & EGP_EXCL)) {
 +                        nbl = &(fr->nblists[m]);
 +                        if (fr->nnblists > 1) {
 +                            fr->gid2nblists[GID(egi,egj,ir->opts.ngener)] = m;
 +                        }
 +                        /* Read the table file with the two energy groups names appended */
 +                        make_nbf_tables(fp,oenv,fr,rtab,cr,tabfn,
 +                                        *mtop->groups.grpname[nm_ind[egi]],
 +                                        *mtop->groups.grpname[nm_ind[egj]],
 +                                        &fr->nblists[m]);
 +                        if (ir->adress){
 +                             make_nbf_tables(fp,oenv,fr,rtab,cr,tabfn,
 +                                        *mtop->groups.grpname[nm_ind[egi]],
 +                                        *mtop->groups.grpname[nm_ind[egj]],
 +                                        &fr->nblists[fr->nnblists/2+m]);
 +                        }
 +                        m++;
 +                    } else if (fr->nnblists > 1) {
 +                        fr->gid2nblists[GID(egi,egj,ir->opts.ngener)] = 0;
 +                    }
 +                }
 +            }
 +        }
 +    }
 +    if (bSep14tab)
 +    {
 +        /* generate extra tables with plain Coulomb for 1-4 interactions only */
 +        fr->tab14 = make_tables(fp,oenv,fr,MASTER(cr),tabpfn,rtab,
 +                                GMX_MAKETABLES_14ONLY);
 +    }
 +
 +    /* Read AdResS Thermo Force table if needed */
 +    if(fr->adress_icor == eAdressICThermoForce)
 +    {
 +        /* old todo replace */ 
 +        
 +        if (ir->adress->n_tf_grps > 0){
 +            make_adress_tf_tables(fp,oenv,fr,ir,tabfn, mtop, box);
 +
 +        }else{
 +            /* load the default table */
 +            snew(fr->atf_tabs, 1);
 +            fr->atf_tabs[DEFAULT_TF_TABLE] = make_atf_table(fp,oenv,fr,tabafn, box);
 +        }
 +    }
 +    
 +    /* Wall stuff */
 +    fr->nwall = ir->nwall;
 +    if (ir->nwall && ir->wall_type==ewtTABLE)
 +    {
 +        make_wall_tables(fp,oenv,ir,tabfn,&mtop->groups,fr);
 +    }
 +    
 +    if (fcd && tabbfn) {
 +        fcd->bondtab  = make_bonded_tables(fp,
 +                                           F_TABBONDS,F_TABBONDSNC,
 +                                           mtop,tabbfn,"b");
 +        fcd->angletab = make_bonded_tables(fp,
 +                                           F_TABANGLES,-1,
 +                                           mtop,tabbfn,"a");
 +        fcd->dihtab   = make_bonded_tables(fp,
 +                                           F_TABDIHS,-1,
 +                                           mtop,tabbfn,"d");
 +    } else {
 +        if (debug)
 +            fprintf(debug,"No fcdata or table file name passed, can not read table, can not do bonded interactions\n");
 +    }
 +    
 +    /* QM/MM initialization if requested
 +     */
 +    if (ir->bQMMM)
 +    {
 +        fprintf(stderr,"QM/MM calculation requested.\n");
 +    }
 +    
 +    fr->bQMMM      = ir->bQMMM;   
 +    fr->qr         = mk_QMMMrec();
 +    
 +    /* Set all the static charge group info */
 +    fr->cginfo_mb = init_cginfo_mb(fp,mtop,fr,bNoSolvOpt,
 +                                   &fr->bExcl_IntraCGAll_InterCGNone);
 +    if (DOMAINDECOMP(cr)) {
 +        fr->cginfo = NULL;
 +    } else {
 +        fr->cginfo = cginfo_expand(mtop->nmolblock,fr->cginfo_mb);
 +    }
 +    
 +    if (!DOMAINDECOMP(cr))
 +    {
 +        /* When using particle decomposition, the effect of the second argument,
 +         * which sets fr->hcg, is corrected later in do_md and init_em.
 +         */
 +        forcerec_set_ranges(fr,ncg_mtop(mtop),ncg_mtop(mtop),
 +                            mtop->natoms,mtop->natoms,mtop->natoms);
 +    }
 +    
 +    fr->print_force = print_force;
 +
 +
 +    /* coarse load balancing vars */
 +    fr->t_fnbf=0.;
 +    fr->t_wait=0.;
 +    fr->timesteps=0;
 +    
 +    /* Initialize neighbor search */
 +    init_ns(fp,cr,&fr->ns,fr,mtop,box);
 +
 +    if (cr->duty & DUTY_PP)
 +    {
 +        gmx_nonbonded_setup(fp,fr,bGenericKernelOnly);
 +    /*
 +     if (ir->bAdress)
 +        {
 +            gmx_setup_adress_kernels(fp,bGenericKernelOnly);
 +        }
 +     */
 +    }
 +
 +    /* Initialize the thread working data for bonded interactions */
 +    init_forcerec_f_threads(fr,mtop->groups.grps[egcENER].nr);
 +    
 +    snew(fr->excl_load,fr->nthreads+1);
 +
 +    if (fr->cutoff_scheme == ecutsVERLET)
 +    {
 +        if (ir->rcoulomb != ir->rvdw)
 +        {
 +            gmx_fatal(FARGS,"With Verlet lists rcoulomb and rvdw should be identical");
 +        }
 +
 +        init_nb_verlet(fp, &fr->nbv, ir, fr, cr, nbpu_opt);
 +    }
 +
 +    /* fr->ic is used both by verlet and group kernels (to some extent) now */
 +    init_interaction_const(fp, &fr->ic, fr, rtab);
 +    if (ir->eDispCorr != edispcNO)
 +    {
 +        calc_enervirdiff(fp,ir->eDispCorr,fr);
 +    }
 +}
 +
 +#define pr_real(fp,r) fprintf(fp,"%s: %e\n",#r,r)
 +#define pr_int(fp,i)  fprintf((fp),"%s: %d\n",#i,i)
 +#define pr_bool(fp,b) fprintf((fp),"%s: %s\n",#b,bool_names[b])
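 +/* These macros rely on preprocessor stringification: pr_real(fp,fr->rlist)
 + * expands to fprintf(fp,"%s: %e\n","fr->rlist",fr->rlist), so every field
 + * below is printed together with its source-level name. */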
 +
 +void pr_forcerec(FILE *fp,t_forcerec *fr,t_commrec *cr)
 +{
 +  int i;
 +
 +  pr_real(fp,fr->rlist);
 +  pr_real(fp,fr->rcoulomb);
 +  pr_real(fp,fr->fudgeQQ);
 +  pr_bool(fp,fr->bGrid);
 +  pr_bool(fp,fr->bTwinRange);
 +  /*pr_int(fp,fr->cg0);
 +    pr_int(fp,fr->hcg);*/
 +  for(i=0; i<fr->nnblists; i++)
 +    pr_int(fp,fr->nblists[i].table_elec_vdw.n);
 +  pr_real(fp,fr->rcoulomb_switch);
 +  pr_real(fp,fr->rcoulomb);
 +  
 +  fflush(fp);
 +}
 +
 +void forcerec_set_excl_load(t_forcerec *fr,
 +                            const gmx_localtop_t *top,const t_commrec *cr)
 +{
 +    const int *ind,*a;
 +    int t,i,j,ntot,n,ntarget;
 +
 +    if (cr != NULL && PARTDECOMP(cr))
 +    {
 +        /* No OpenMP with particle decomposition */
 +        pd_at_range(cr,
 +                    &fr->excl_load[0],
 +                    &fr->excl_load[1]);
 +
 +        return;
 +    }
 +
 +    ind = top->excls.index;
 +    a   = top->excls.a;
 +
 +    ntot = 0;
 +    for(i=0; i<top->excls.nr; i++)
 +    {
 +        for(j=ind[i]; j<ind[i+1]; j++)
 +        {
 +            if (a[j] > i)
 +            {
 +                ntot++;
 +            }
 +        }
 +    }
 +
 +    fr->excl_load[0] = 0;
 +    n = 0;
 +    i = 0;
 +    for(t=1; t<=fr->nthreads; t++)
 +    {
 +        ntarget = (ntot*t)/fr->nthreads;
 +        while(i < top->excls.nr && n < ntarget)
 +        {
 +            for(j=ind[i]; j<ind[i+1]; j++)
 +            {
 +                if (a[j] > i)
 +                {
 +                    n++;
 +                }
 +            }
 +            i++;
 +        }
 +        fr->excl_load[t] = i;
 +    }
 +}
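 +/* Balancing sketch for the loop above: with ntot = 100 exclusions (counted
 + * once per pair via a[j] > i) and fr->nthreads = 4, ntarget becomes 25, 50,
 + * 75 and 100 in turn, so thread t is assigned the i-range
 + * [excl_load[t], excl_load[t+1]) holding roughly a quarter of the pairs. */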
 +
index 4a0905cbe3e355f24e9186c616c06993bb2d5540,0000000000000000000000000000000000000000..16dc7f5ad8d082eabd666ef8a5eeae451c85c3c6
mode 100644,000000..100644
--- /dev/null
@@@ -1,2063 -1,0 +1,2061 @@@
-     
 +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
 + *
 + * 
 + *                This source code is part of
 + * 
 + *                 G   R   O   M   A   C   S
 + * 
 + *          GROningen MAchine for Chemical Simulations
 + * 
 + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
 + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 + * Copyright (c) 2001-2008, The GROMACS development team,
 + * check out http://www.gromacs.org for more information.
 + *
 + * This program is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU General Public License
 + * as published by the Free Software Foundation; either version 2
 + * of the License, or (at your option) any later version.
 + * 
 + * If you want to redistribute modifications, please consider that
 + * scientific software is very special. Version control is crucial -
 + * bugs must be traceable. We will be happy to consider code for
 + * inclusion in the official distribution, but derived work must not
 + * be called official GROMACS. Details are found in the README & COPYING
 + * files - if they are missing, get the official version at www.gromacs.org.
 + * 
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the papers on the package - you can find them in the top README file.
 + * 
 + * For more info, check our website at http://www.gromacs.org
 + * 
 + * And Hey:
 + * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon
 + */
 +
 +#ifdef HAVE_CONFIG_H
 +#include <config.h>
 +#endif
 +
 +#include <math.h>
 +#include <string.h>
 +
 +#include "typedefs.h"
 +#include "smalloc.h"
 +#include "genborn.h"
 +#include "vec.h"
 +#include "grompp.h"
 +#include "pdbio.h"
 +#include "names.h"
 +#include "physics.h"
 +#include "partdec.h"
 +#include "domdec.h"
 +#include "network.h"
 +#include "gmx_fatal.h"
 +#include "mtop_util.h"
 +#include "pbc.h"
 +#include "nrnb.h"
 +#include "bondf.h"
 +
 +#ifdef GMX_LIB_MPI
 +#include <mpi.h>
 +#endif
 +#ifdef GMX_THREAD_MPI
 +#include "tmpi.h"
 +#endif
 +
 +#ifdef GMX_X86_SSE2
 +#  ifdef GMX_DOUBLE
 +#    include "genborn_sse2_double.h"
 +#    include "genborn_allvsall_sse2_double.h"
 +#  else
 +#    include "genborn_sse2_single.h"
 +#    include "genborn_allvsall_sse2_single.h"
 +#  endif /* GMX_DOUBLE */
 +#endif /* SSE or AVX present */
 +
 +#include "genborn_allvsall.h"
 +
 +/*#define DISABLE_SSE*/
 +
 +typedef struct {
 +    int shift;
 +    int naj;
 +    int *aj;
 +    int aj_nalloc;
 +} gbtmpnbl_t;
 +
 +typedef struct gbtmpnbls {
 +    int nlist;
 +    gbtmpnbl_t *list;
 +    int list_nalloc;
 +} t_gbtmpnbls;
 +
 +/* This function is exactly the same as the one in bondfree.c. The reason
 + * it is copied here is that the bonded gb-interactions are evaluated
 + * not in calc_bonds, but rather in calc_gb_forces
 + */
 +static int pbc_rvec_sub(const t_pbc *pbc,const rvec xi,const rvec xj,rvec dx)
 +{
 +    if (pbc) {
 +        return pbc_dx_aiuc(pbc,xi,xj,dx);
 +    }
 +    else {
 +        rvec_sub(xi,xj,dx);
 +        return CENTRAL;
 +    }
 +}
 +
 +int init_gb_nblist(int natoms, t_nblist *nl)
 +{
 +    nl->maxnri      = natoms*4;
 +    nl->maxnrj      = 0;
 +    nl->maxlen      = 0;
 +    nl->nri         = 0;
 +    nl->nrj         = 0;
 +    nl->iinr        = NULL;
 +    nl->gid         = NULL;
 +    nl->shift       = NULL;
 +    nl->jindex      = NULL;
 +    nl->jjnr        = NULL;
 +    /*nl->nltype      = nltype;*/
 +    
 +    srenew(nl->iinr,   nl->maxnri);
 +    srenew(nl->gid,    nl->maxnri);
 +    srenew(nl->shift,  nl->maxnri);
 +    srenew(nl->jindex, nl->maxnri+1);
 +    
 +    nl->jindex[0] = 0;
 +    
 +    return 0;
 +}
 +
 +void gb_pd_send(t_commrec *cr, real *send_data, int nr)
 +{
 +#ifdef GMX_MPI    
 +    int i,cur;
 +    int *index,*sendc,*disp;
 +    
 +    snew(sendc,cr->nnodes);
 +    snew(disp,cr->nnodes);
 +    
 +    index = pd_index(cr);
 +    cur   = cr->nodeid;
 +    
 +    /* Setup count/index arrays */
 +    for(i=0;i<cr->nnodes;i++)
 +    {
 +        sendc[i]  = index[i+1]-index[i];
 +        disp[i]   = index[i];    
 +    }
 +    
 +    /* Do communication */
 +    MPI_Gatherv(send_data+index[cur],sendc[cur],GMX_MPI_REAL,send_data,sendc,
 +                disp,GMX_MPI_REAL,0,cr->mpi_comm_mygroup);
 +    MPI_Bcast(send_data,nr,GMX_MPI_REAL,0,cr->mpi_comm_mygroup);
 +    
 +#endif
 +}
 +
 +
 +int init_gb_plist(t_params *p_list)
 +{
 +    p_list->nr    = 0;
 +    p_list->param = NULL;
 +    
 +    return 0;
 +}
 +
 +
 +
 +int init_gb_still(const t_commrec *cr, t_forcerec  *fr, 
 +                  const t_atomtypes *atype, t_idef *idef, t_atoms *atoms, 
 +                  gmx_genborn_t *born,int natoms)
 +{
 +    
 +    int i,j,i1,i2,k,m,nbond,nang,ia,ib,ic,id,nb,idx,idx2,at;
 +    int iam,ibm;
 +    int at0,at1;
 +    real length,angle;
 +    real r,ri,rj,ri2,ri3,rj2,r2,r3,r4,rk,ratio,term,h,doffset;
 +    real p1,p2,p3,factor,cosine,rab,rbc;
 +    
 +    real *vsol;
 +    real *gp;
 +    
 +    snew(vsol,natoms);
 +    snew(gp,natoms);
 +    snew(born->gpol_still_work,natoms+3);
 +    
 +    if(PAR(cr))
 +    {
 +        if(PARTDECOMP(cr))
 +        {
 +            pd_at_range(cr,&at0,&at1);
 +            
 +            for(i=0;i<natoms;i++)
 +            {
 +                vsol[i] = gp[i] = 0;
 +            }
 +        }
 +        else
 +        {
 +            at0 = 0;
 +            at1 = natoms;
 +        }
 +    }
 +    else
 +    {
 +        at0 = 0;
 +        at1 = natoms;
 +    }
 +    
 +    doffset = born->gb_doffset;
 +    
 +    for(i=0;i<natoms;i++)
 +    {
 +        born->gpol_globalindex[i]=born->vsolv_globalindex[i]=
 +            born->gb_radius_globalindex[i]=0;     
 +    }
 +    
 +    /* Compute atomic solvation volumes for Still method */
 +    for(i=0;i<natoms;i++)
 +    {    
 +        ri=atype->gb_radius[atoms->atom[i].type];
 +        born->gb_radius_globalindex[i] = ri;
 +        r3=ri*ri*ri;
 +        born->vsolv_globalindex[i]=(4*M_PI/3)*r3;        
 +    }
 +
 +    for(j=0;j<idef->il[F_GB12].nr;j+=3)
 +    {
 +        m=idef->il[F_GB12].iatoms[j];
 +        ia=idef->il[F_GB12].iatoms[j+1];
 +        ib=idef->il[F_GB12].iatoms[j+2];
 +        
 +        r=1.01*idef->iparams[m].gb.st;
 +        
 +        ri   = atype->gb_radius[atoms->atom[ia].type];
 +        rj   = atype->gb_radius[atoms->atom[ib].type];
 +        
 +        ri2  = ri*ri;
 +        ri3  = ri2*ri;
 +        rj2  = rj*rj;
 +        
 +        ratio  = (rj2-ri2-r*r)/(2*ri*r);
 +        h      = ri*(1+ratio);
 +        term   = (M_PI/3.0)*h*h*(3.0*ri-h);
 +
 +        if(PARTDECOMP(cr))
 +        {
 +            vsol[ia]+=term;
 +        }
 +        else
 +        {
 +            born->vsolv_globalindex[ia] -= term;
 +        }
 +        
 +        ratio  = (ri2-rj2-r*r)/(2*rj*r);
 +        h      = rj*(1+ratio);
 +        term   = (M_PI/3.0)*h*h*(3.0*rj-h);
 +        
 +        if(PARTDECOMP(cr))
 +        {
 +            vsol[ib]+=term;
 +        }
 +        else
 +        {
 +            born->vsolv_globalindex[ib] -= term;
 +        }        
 +    }
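 +    /* Geometry note: (M_PI/3.0)*h*h*(3.0*r - h) is the volume of a spherical
 +     * cap of height h cut from a sphere of radius r, so each 1-2 neighbour
 +     * removes its overlap cap from the (4*pi/3)*r^3 volume set above. For
 +     * example, ri = rj = 0.15 nm at bond length r = 0.15 nm gives
 +     * ratio = -0.5, h = 0.075 nm and term ~= 2.2e-3 nm^3.
 +     */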
 +    
 +    if(PARTDECOMP(cr))
 +    {
 +        gmx_sum(natoms,vsol,cr);
 +        
 +        for(i=0;i<natoms;i++)
 +        {
 +            born->vsolv_globalindex[i]=born->vsolv_globalindex[i]-vsol[i];
 +        }
 +    }
 +  
 +    /* Get the self-, 1-2 and 1-3 polarization energies for analytical Still 
 +       method */
 +    /* Self */
 +    for(j=0;j<natoms;j++)
 +    {
 +        if(born->use_globalindex[j]==1)
 +        {
 +            born->gpol_globalindex[j]=-0.5*ONE_4PI_EPS0/
 +                (atype->gb_radius[atoms->atom[j].type]-doffset+STILL_P1);
 +        }
 +    }
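 +    /* Born-like self term: for an effective cavity radius
 +     * R = gb_radius - doffset + STILL_P1 this stores -0.5*ONE_4PI_EPS0/R,
 +     * the self contribution that, together with the 1-2 and 1-3 terms
 +     * below, is later inverted into a Born radius in calc_gb_rad_still().
 +     */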
 +    
 +    /* 1-2 */
 +    for(j=0;j<idef->il[F_GB12].nr;j+=3)
 +    {
 +        m=idef->il[F_GB12].iatoms[j];
 +        ia=idef->il[F_GB12].iatoms[j+1];
 +        ib=idef->il[F_GB12].iatoms[j+2];
 +        
 +        r=idef->iparams[m].gb.st;
 +        
 +        r4=r*r*r*r;
 +
 +        if(PARTDECOMP(cr))
 +        {
 +            gp[ia]+=STILL_P2*born->vsolv_globalindex[ib]/r4;
 +            gp[ib]+=STILL_P2*born->vsolv_globalindex[ia]/r4;
 +        }
 +        else
 +        {
 +            born->gpol_globalindex[ia]=born->gpol_globalindex[ia]+
 +                STILL_P2*born->vsolv_globalindex[ib]/r4;
 +            born->gpol_globalindex[ib]=born->gpol_globalindex[ib]+
 +                STILL_P2*born->vsolv_globalindex[ia]/r4;
 +        }
 +    }
 +
 +    /* 1-3 */
 +    for(j=0;j<idef->il[F_GB13].nr;j+=3)
 +    {
 +        m=idef->il[F_GB13].iatoms[j];
 +        ia=idef->il[F_GB13].iatoms[j+1];
 +        ib=idef->il[F_GB13].iatoms[j+2];
 +    
 +        r=idef->iparams[m].gb.st;
 +        r4=r*r*r*r;
 +    
 +        if(PARTDECOMP(cr))
 +        {
 +            gp[ia]+=STILL_P3*born->vsolv_globalindex[ib]/r4;
 +            gp[ib]+=STILL_P3*born->vsolv_globalindex[ia]/r4;
 +        }
 +        else
 +        {
 +            born->gpol_globalindex[ia]=born->gpol_globalindex[ia]+
 +                STILL_P3*born->vsolv_globalindex[ib]/r4;
 +            born->gpol_globalindex[ib]=born->gpol_globalindex[ib]+
 +                STILL_P3*born->vsolv_globalindex[ia]/r4;
 +        }        
 +    }
 +    
 +    if(PARTDECOMP(cr))
 +    {
 +        gmx_sum(natoms,gp,cr);
 +        
 +        for(i=0;i<natoms;i++)
 +        {
 +            born->gpol_globalindex[i]=born->gpol_globalindex[i]+gp[i];
 +        }    
 +    }
 +    
 +    sfree(vsol);
 +    sfree(gp);
 +        
 +    return 0;
 +}
 +
 +/* Initialize all GB datastructs and compute polarization energies */
 +int init_gb(gmx_genborn_t **p_born,
 +            const t_commrec *cr, t_forcerec *fr, const t_inputrec *ir,
 +            const gmx_mtop_t *mtop, real rgbradii, int gb_algorithm)
 +{
 +    int i,j,m,ai,aj,jj,natoms,nalloc;
 +    real rai,sk,p,doffset;
 +    
 +    t_atoms        atoms;
 +    gmx_genborn_t  *born;
 +    gmx_localtop_t *localtop;
 +
 +    natoms   = mtop->natoms;
 +        
 +    atoms    = gmx_mtop_global_atoms(mtop);
 +    localtop = gmx_mtop_generate_local_top(mtop,ir);
 +    
 +    snew(born,1);
 +    *p_born = born;
 +
 +    born->nr  = natoms;
 +    
 +    snew(born->drobc, natoms);
 +    snew(born->bRad,  natoms);
 +    
 +    /* Allocate memory for the global data arrays */
 +    snew(born->param_globalindex, natoms+3);
 +    snew(born->gpol_globalindex,  natoms+3);
 +    snew(born->vsolv_globalindex, natoms+3);
 +    snew(born->gb_radius_globalindex, natoms+3);
 +    snew(born->use_globalindex,    natoms+3);
 +    
 +    snew(fr->invsqrta, natoms);
 +    snew(fr->dvda,     natoms);
 +    
 +    fr->dadx              = NULL;
 +    fr->dadx_rawptr       = NULL;
 +    fr->nalloc_dadx       = 0;
 +    born->gpol_still_work = NULL;
 +    born->gpol_hct_work   = NULL;
 +    
 +    /* snew(born->asurf,natoms); */
 +    /* snew(born->dasurf,natoms); */
 +
 +    /* Initialize the gb neighbourlist */
 +    init_gb_nblist(natoms,&(fr->gblist));
 +    
 +    /* Do the Vsites exclusions (if any) */
 +    for(i=0;i<natoms;i++)
 +    {
 +        jj = atoms.atom[i].type;
 +        if (mtop->atomtypes.gb_radius[atoms.atom[i].type] > 0)
 +        {
 +            born->use_globalindex[i] = 1;
 +        }
 +        else
 +        {
 +            born->use_globalindex[i] = 0;
 +        }
 +                
 +        /* If we have a Vsite, put use_globalindex[i]=0 */
 +        if (C6 (fr->nbfp,fr->ntype,jj,jj) == 0 &&
 +            C12(fr->nbfp,fr->ntype,jj,jj) == 0 &&
 +            atoms.atom[i].q == 0)
 +        {
 +            born->use_globalindex[i]=0;
 +        }
 +    }
 +    
 +    /* Copy algorithm parameters from inputrecord to local structure */
 +    born->obc_alpha  = ir->gb_obc_alpha;
 +    born->obc_beta   = ir->gb_obc_beta;
 +    born->obc_gamma  = ir->gb_obc_gamma;
 +    born->gb_doffset = ir->gb_dielectric_offset;
 +    born->gb_epsilon_solvent = ir->gb_epsilon_solvent;
 +    born->epsilon_r = ir->epsilon_r;
 +    
 +    doffset = born->gb_doffset;
 +  
 +    /* Set the surface tension */
 +    born->sa_surface_tension = ir->sa_surface_tension;
 +   
 +    /* If Still model, initialise the polarisation energies */
 +    if(gb_algorithm==egbSTILL)    
 +    {
 +        init_gb_still(cr, fr,&(mtop->atomtypes), &(localtop->idef), &atoms, 
 +                      born, natoms);    
 +    }
 +
 +    
 +    /* If HCT/OBC,  precalculate the sk*atype->S_hct factors */
 +    else if(gb_algorithm==egbHCT || gb_algorithm==egbOBC)
 +    {
 +        
 +        snew(born->gpol_hct_work, natoms+3);
 +        
 +        for(i=0;i<natoms;i++)
 +        {    
 +            if(born->use_globalindex[i]==1)
 +            {
 +                rai = mtop->atomtypes.gb_radius[atoms.atom[i].type]-doffset; 
 +                sk  = rai * mtop->atomtypes.S_hct[atoms.atom[i].type];
 +                born->param_globalindex[i] = sk;
 +                born->gb_radius_globalindex[i] = rai;
 +            }
 +            else
 +            {
 +                born->param_globalindex[i] = 0;
 +                born->gb_radius_globalindex[i] = 0;
 +            }
 +        }
 +    }
 +        
 +    /* Allocate memory for work arrays for temporary use */
 +    snew(born->work,natoms+4);
 +    snew(born->count,natoms);
 +    snew(born->nblist_work,natoms);
 +    
 +    /* Domain decomposition specific stuff */
 +    born->nalloc = 0;
 +    
 +    return 0;
 +}
 +
 +
 +
 +static int
 +calc_gb_rad_still(t_commrec *cr, t_forcerec *fr,int natoms, gmx_localtop_t *top,
 +                  const t_atomtypes *atype, rvec x[], t_nblist *nl, 
 +                  gmx_genborn_t *born,t_mdatoms *md)
 +{    
 +    int i,k,n,nj0,nj1,ai,aj,type;
 +    int shift;
 +    real shX,shY,shZ;
 +    real gpi,dr,dr2,dr4,idr4,rvdw,ratio,ccf,theta,term,rai,raj;
 +    real ix1,iy1,iz1,jx1,jy1,jz1,dx11,dy11,dz11;
 +    real rinv,idr2,idr6,vaj,dccf,cosq,sinq,prod,gpi2;
 +    real factor;
 +    real vai, prod_ai, icf4,icf6;
-     
++
 +    factor  = 0.5*ONE_4PI_EPS0;
 +    n       = 0;
-      
-       for(i=0;i<nl->nri;i++ )
++
 +    for(i=0;i<born->nr;i++)
 +    {
 +        born->gpol_still_work[i]=0;
 +    }
-         
++
++    for(i=0;i<nl->nri;i++ )
 +    {
 +        ai      = nl->iinr[i];
-     
++
 +        nj0     = nl->jindex[i];            
 +        nj1     = nl->jindex[i+1];
-                         
-         for(k=nj0;k<nj1;k++)
++
 +        /* Load shifts for this list */
 +        shift   = nl->shift[i];
 +        shX     = fr->shift_vec[shift][0];
 +        shY     = fr->shift_vec[shift][1];
 +        shZ     = fr->shift_vec[shift][2];
 +        
 +        gpi     = 0;
 +        
 +        rai     = top->atomtypes.gb_radius[md->typeA[ai]];
 +        vai     = born->vsolv[ai];
 +        prod_ai = STILL_P4*vai;
 +        
 +        /* Load atom i coordinates, add shift vectors */
 +        ix1     = shX + x[ai][0];
 +        iy1     = shY + x[ai][1];
 +        iz1     = shZ + x[ai][2];
-             if(ratio>STILL_P5INV) 
++
++        for(k=nj0;k<nj1 && nl->jjnr[k]>=0;k++)
 +        {
 +            aj    = nl->jjnr[k];
 +            jx1   = x[aj][0];
 +            jy1   = x[aj][1];
 +            jz1   = x[aj][2];
 +            
 +            dx11  = ix1-jx1;
 +            dy11  = iy1-jy1;
 +            dz11  = iz1-jz1;
 +            
 +            dr2   = dx11*dx11+dy11*dy11+dz11*dz11; 
 +            rinv  = gmx_invsqrt(dr2);
 +            idr2  = rinv*rinv;
 +            idr4  = idr2*idr2;
 +            idr6  = idr4*idr2;
 +            
 +            raj = top->atomtypes.gb_radius[md->typeA[aj]];
 +            
 +            rvdw  = rai + raj;
 +            
 +            ratio = dr2 / (rvdw * rvdw);
 +            vaj   = born->vsolv[aj];
 +            
++            if(ratio>STILL_P5INV)
 +            {
 +                ccf=1.0;
 +                dccf=0.0;
 +            }
 +            else
 +            {
 +                theta = ratio*STILL_PIP5;
 +                cosq  = cos(theta);
 +                term  = 0.5*(1.0-cosq);
 +                ccf   = term*term;
 +                sinq  = 1.0 - cosq*cosq;
 +                dccf  = 2.0*term*sinq*gmx_invsqrt(sinq)*theta;
 +            }
 +            
 +            prod          = STILL_P4*vaj;
 +            icf4          = ccf*idr4;
 +            icf6          = (4*ccf-dccf)*idr6;
-               
 +            born->gpol_still_work[aj] += prod_ai*icf4;
 +            gpi             = gpi+prod*icf4;
 +            
 +            /* Save ai->aj and aj->ai chain rule terms */
 +            fr->dadx[n++]   = prod*icf6;
 +            fr->dadx[n++]   = prod_ai*icf6;
 +        }
 +        born->gpol_still_work[ai]+=gpi;
 +    }
 +
 +    /* Parallel summations */
 +    if(PARTDECOMP(cr))
 +    {
 +        gmx_sum(natoms, born->gpol_still_work, cr);
 +    }
 +    else if(DOMAINDECOMP(cr))
 +    {
 +        dd_atom_sum_real(cr->dd, born->gpol_still_work);
 +    }
 +
 +    /* Calculate the radii */
 +    for(i=0;i<fr->natoms_force;i++) /* PELA born->nr */
 +    {
 +        if(born->use[i] != 0)
 +        {
-         for(k=nj0;k<nj1;k++)
 +            gpi  = born->gpol[i]+born->gpol_still_work[i];
 +            gpi2 = gpi * gpi;
 +            born->bRad[i]   = factor*gmx_invsqrt(gpi2);
 +            fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);
 +        }
 +    }
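 +    /* Inversion step: the accumulated gpi approximates -factor/R_i with
 +     * factor = 0.5*ONE_4PI_EPS0, so R_i = factor/|gpi|;
 +     * gmx_invsqrt(gpi2) is simply a fast 1/|gpi|. */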
 +
 +    /* Extra communication required for DD */
 +    if(DOMAINDECOMP(cr))
 +    {
 +        dd_atom_spread_real(cr->dd, born->bRad);
 +        dd_atom_spread_real(cr->dd, fr->invsqrta);
 +    }
 +    
 +    return 0;
 +    
 +}
 +    
 +
 +static int 
 +calc_gb_rad_hct(t_commrec *cr,t_forcerec *fr,int natoms, gmx_localtop_t *top,
 +                const t_atomtypes *atype, rvec x[], t_nblist *nl, 
 +                gmx_genborn_t *born,t_mdatoms *md)
 +{
 +    int i,k,n,ai,aj,nj0,nj1,at0,at1;
 +    int shift;
 +    real shX,shY,shZ;
 +    real rai,raj,gpi,dr2,dr,sk,sk_ai,sk2,sk2_ai,lij,uij,diff2,tmp,sum_ai;
 +    real rad,min_rad,rinv,rai_inv;
 +    real ix1,iy1,iz1,jx1,jy1,jz1,dx11,dy11,dz11;
 +    real lij2, uij2, lij3, uij3, t1,t2,t3;
 +    real lij_inv,dlij,duij,sk2_rinv,prod,log_term;
 +    real doffset,raj_inv,dadx_val;
 +    real *gb_radius;
 +    
 +    doffset = born->gb_doffset;
 +    gb_radius = born->gb_radius;
 +
 +    for(i=0;i<born->nr;i++)
 +    {
 +        born->gpol_hct_work[i] = 0;
 +    }
 +    
 +    /* Keep the compiler happy */
 +    n    = 0;
 +    prod = 0;
 +        
 +    for(i=0;i<nl->nri;i++)
 +    {
 +        ai     = nl->iinr[i];
 +            
 +        nj0    = nl->jindex[i];            
 +        nj1    = nl->jindex[i+1];
 +        
 +        /* Load shifts for this list */
 +        shift   = nl->shift[i];
 +        shX     = fr->shift_vec[shift][0];
 +        shY     = fr->shift_vec[shift][1];
 +        shZ     = fr->shift_vec[shift][2];
 +        
 +        rai     = gb_radius[ai];
 +        rai_inv = 1.0/rai;
 +        
 +        sk_ai   = born->param[ai];
 +        sk2_ai  = sk_ai*sk_ai;
 +        
 +        /* Load atom i coordinates, add shift vectors */
 +        ix1     = shX + x[ai][0];
 +        iy1     = shY + x[ai][1];
 +        iz1     = shZ + x[ai][2];
 +        
 +        sum_ai  = 0;
 +        
-         for(k=nj0;k<nj1;k++)
++        for(k=nj0;k<nj1 && nl->jjnr[k]>=0;k++)
 +        {
 +            aj    = nl->jjnr[k];
 +            
 +            jx1   = x[aj][0];
 +            jy1   = x[aj][1];
 +            jz1   = x[aj][2];
 +            
 +            dx11  = ix1 - jx1;
 +            dy11  = iy1 - jy1;
 +            dz11  = iz1 - jz1;
 +            
 +            dr2   = dx11*dx11+dy11*dy11+dz11*dz11;
 +            rinv  = gmx_invsqrt(dr2);
 +            dr    = rinv*dr2;
 +            
 +            sk    = born->param[aj];
 +            raj   = gb_radius[aj];
 +            
 +            /* aj -> ai interaction */
 +            if(rai < dr+sk)
 +            {
 +                lij     = 1.0/(dr-sk);
 +                dlij    = 1.0;
 +                
 +                if(rai>dr-sk) 
 +                {
 +                    lij  = rai_inv;
 +                    dlij = 0.0;
 +                }
 +                            
 +                lij2     = lij*lij;
 +                lij3     = lij2*lij;
 +                
 +                uij      = 1.0/(dr+sk);
 +                uij2     = uij*uij;
 +                uij3     = uij2*uij;
 +                
 +                diff2    = uij2-lij2;
 +                
 +                lij_inv  = gmx_invsqrt(lij2);
 +                sk2      = sk*sk;
 +                sk2_rinv = sk2*rinv;
 +                prod     = 0.25*sk2_rinv;
 +                
 +                log_term = log(uij*lij_inv);
 +                
 +                tmp      = lij-uij + 0.25*dr*diff2 + (0.5*rinv)*log_term +
 +                    prod*(-diff2);
 +                                
 +                if(rai<sk-dr)
 +                {
 +                    tmp = tmp + 2.0 * (rai_inv-lij);
 +                }
 +                    
 +                t1 = 0.5*lij2 + prod*lij3 - 0.25*(lij*rinv+lij3*dr);
 +                t2 = -0.5*uij2 - 0.25*sk2_rinv*uij3 + 0.25*(uij*rinv+uij3*dr);
 +                t3 = 0.125*(1.0+sk2_rinv*rinv)*(-diff2)+0.25*log_term*rinv*rinv;
 +                
 +                dadx_val = (dlij*t1+t2+t3)*rinv; /* rb2 is moved to chainrule */
 +                /* fr->dadx[n++] = (dlij*t1+duij*t2+t3)*rinv; */ 
 +                /* rb2 is moved to chainrule    */
 +
 +                sum_ai += 0.5*tmp;
 +            }
 +            else
 +            {
 +                dadx_val = 0.0;
 +            }
 +            fr->dadx[n++] = dadx_val;
 +
 +            
 +            /* ai -> aj interaction */
 +            if(raj < dr + sk_ai)
 +            {
 +                lij     = 1.0/(dr-sk_ai);
 +                dlij    = 1.0;
 +                raj_inv = 1.0/raj;
 +                
 +                if(raj>dr-sk_ai)
 +                {
 +                    lij = raj_inv;
 +                    dlij = 0.0;
 +                }
 +                
 +                lij2     = lij  * lij;
 +                lij3     = lij2 * lij;
 +                
 +                uij      = 1.0/(dr+sk_ai);
 +                uij2     = uij  * uij;
 +                uij3     = uij2 * uij;
 +                
 +                diff2    = uij2-lij2;
 +                
 +                lij_inv  = gmx_invsqrt(lij2);
 +                sk2      =  sk2_ai; /* sk2_ai = sk_ai * sk_ai in i loop above */
 +                sk2_rinv = sk2*rinv;
 +                prod     = 0.25 * sk2_rinv;
 +                
 +                /* log_term = table_log(uij*lij_inv,born->log_table,
 +                   LOG_TABLE_ACCURACY); */
 +                log_term = log(uij*lij_inv);
 +                
 +                tmp      = lij-uij + 0.25*dr*diff2 + (0.5*rinv)*log_term +
 +                           prod*(-diff2);
 +                
 +                if(raj<sk_ai-dr)
 +                {
 +                    tmp     = tmp + 2.0 * (raj_inv-lij);
 +                }
 +                
 +                /* duij = 1.0 */
 +                t1      = 0.5*lij2 + prod*lij3 - 0.25*(lij*rinv+lij3*dr);
 +                t2      = -0.5*uij2 - 0.25*sk2_rinv*uij3 + 0.25*(uij*rinv+uij3*dr);
 +                t3      = 0.125*(1.0+sk2_rinv*rinv)*(-diff2)+0.25*log_term*rinv*rinv;
 +                
 +                dadx_val = (dlij*t1+t2+t3)*rinv; /* rb2 is moved to chainrule    */
 +                /* fr->dadx[n++] = (dlij*t1+duij*t2+t3)*rinv; */ /* rb2 is moved to chainrule    */
 +                
 +                born->gpol_hct_work[aj] += 0.5*tmp;
 +            }
 +            else
 +            {
 +                dadx_val = 0.0;
 +            }
 +            fr->dadx[n++] = dadx_val;
 +        }
 +        
 +        born->gpol_hct_work[ai] += sum_ai;
 +    }
 +    
 +    /* Parallel summations */
 +    if(PARTDECOMP(cr))
 +    {
 +        gmx_sum(natoms, born->gpol_hct_work, cr);
 +    }
 +    else if(DOMAINDECOMP(cr))
 +    {
 +        dd_atom_sum_real(cr->dd, born->gpol_hct_work);
 +    }
 +    
 +    for(i=0;i<fr->natoms_force;i++) /* PELA born->nr */
 +    {
 +        if(born->use[i] != 0)
 +        {
 +            rai     = top->atomtypes.gb_radius[md->typeA[i]]-doffset; 
 +            sum_ai  = 1.0/rai - born->gpol_hct_work[i];
 +            min_rad = rai + doffset;
 +            rad     = 1.0/sum_ai; 
 +            
 +            born->bRad[i]   = rad > min_rad ? rad : min_rad;
 +            fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);
 +        }
 +    }
 +    
 +    /* Extra communication required for DD */
 +    if(DOMAINDECOMP(cr))
 +    {
 +        dd_atom_spread_real(cr->dd, born->bRad);
 +        dd_atom_spread_real(cr->dd, fr->invsqrta);
 +    }
 +    
 +    
 +    return 0;
 +}
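 +/* For reference: the quantity "tmp" in calc_gb_rad_hct() above (and again in
 + * calc_gb_rad_obc() below) evaluates a standard HCT-style pairwise
 + * descreening integral. With d = dr, s = sk, L = dr - sk and U = dr + sk
 + * (lij = 1/L, uij = 1/U):
 + *
 + *   I = 1/L - 1/U + (1/(2d))*ln(L/U) + (1/4)*(d - s*s/d)*(1/U^2 - 1/L^2)
 + *
 + * Each neighbour contributes 0.5*I to the sum that is finally inverted into
 + * the Born radius. */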
 +
 +static int 
 +calc_gb_rad_obc(t_commrec *cr, t_forcerec *fr, int natoms, gmx_localtop_t *top,
 +                    const t_atomtypes *atype, rvec x[], t_nblist *nl, gmx_genborn_t *born,t_mdatoms *md)
 +{
 +    int i,k,ai,aj,nj0,nj1,n,at0,at1;
 +    int shift;
 +    real shX,shY,shZ;
 +    real rai,raj,gpi,dr2,dr,sk,sk2,lij,uij,diff2,tmp,sum_ai;
 +    real rad, min_rad,sum_ai2,sum_ai3,tsum,tchain,rinv,rai_inv,lij_inv,rai_inv2;
 +    real log_term,prod,sk2_rinv,sk_ai,sk2_ai;
 +    real ix1,iy1,iz1,jx1,jy1,jz1,dx11,dy11,dz11;
 +    real lij2,uij2,lij3,uij3,dlij,duij,t1,t2,t3;
 +    real doffset,raj_inv,dadx_val;
 +    real *gb_radius;
 +    
 +    /* Keep the compiler happy */
 +    n    = 0;
 +    prod = 0;
 +    raj  = 0;
 +    
 +    doffset = born->gb_doffset;
 +    gb_radius = born->gb_radius;
 +    
 +    for(i=0;i<born->nr;i++)
 +    {
 +        born->gpol_hct_work[i] = 0;
 +    }
 +    
 +    for(i=0;i<nl->nri;i++)
 +    {
 +        ai      = nl->iinr[i];
 +    
 +        nj0     = nl->jindex[i];
 +        nj1     = nl->jindex[i+1];
 +        
 +        /* Load shifts for this list */
 +        shift   = nl->shift[i];
 +        shX     = fr->shift_vec[shift][0];
 +        shY     = fr->shift_vec[shift][1];
 +        shZ     = fr->shift_vec[shift][2];
 +        
 +        rai      = gb_radius[ai];
 +        rai_inv  = 1.0/rai;
 +        
 +        sk_ai    = born->param[ai];
 +        sk2_ai   = sk_ai*sk_ai;
 +        
 +        /* Load atom i coordinates, add shift vectors */
 +        ix1      = shX + x[ai][0];
 +        iy1      = shY + x[ai][1];
 +        iz1      = shZ + x[ai][2];
 +        
 +        sum_ai   = 0;
 +        
++        for(k=nj0;k<nj1 && nl->jjnr[k]>=0;k++)
 +        {
 +            aj    = nl->jjnr[k];
 +            
 +            jx1   = x[aj][0];
 +            jy1   = x[aj][1];
 +            jz1   = x[aj][2];
 +            
 +            dx11  = ix1 - jx1;
 +            dy11  = iy1 - jy1;
 +            dz11  = iz1 - jz1;
 +            
 +            dr2   = dx11*dx11+dy11*dy11+dz11*dz11;
 +            rinv  = gmx_invsqrt(dr2);
 +            dr    = dr2*rinv;
 +        
 +            /* sk is precalculated in init_gb() */
 +            sk    = born->param[aj];
 +            raj   = gb_radius[aj];
 +            
 +            /* aj -> ai interaction */
 +            if(rai < dr+sk)
 +            {
 +                lij       = 1.0/(dr-sk);
 +                dlij      = 1.0; 
 +                                
 +                if(rai>dr-sk)
 +                {
 +                    lij  = rai_inv;
 +                    dlij = 0.0;
 +                }
 +                
 +                uij      = 1.0/(dr+sk);
 +                lij2     = lij  * lij;
 +                lij3     = lij2 * lij;
 +                uij2     = uij  * uij;
 +                uij3     = uij2 * uij;
 +                
 +                diff2    = uij2-lij2;
 +                
 +                lij_inv  = gmx_invsqrt(lij2);
 +                sk2      = sk*sk;
 +                sk2_rinv = sk2*rinv;    
 +                prod     = 0.25*sk2_rinv;
 +                
 +                log_term = log(uij*lij_inv);
 +                
 +                tmp      = lij-uij + 0.25*dr*diff2 + (0.5*rinv)*log_term + prod*(-diff2);
 +                
 +                if(rai < sk-dr)
 +                {
 +                    tmp = tmp + 2.0 * (rai_inv-lij);
 +                }
 +                
 +                /* duij    = 1.0; */
 +                t1      = 0.5*lij2 + prod*lij3 - 0.25*(lij*rinv+lij3*dr); 
 +                t2      = -0.5*uij2 - 0.25*sk2_rinv*uij3 + 0.25*(uij*rinv+uij3*dr); 
 +                t3      = 0.125*(1.0+sk2_rinv*rinv)*(-diff2)+0.25*log_term*rinv*rinv; 
 +                    
 +                dadx_val = (dlij*t1+t2+t3)*rinv; /* rb2 is moved to chainrule    */
 +                
 +                sum_ai += 0.5*tmp;
 +            }
 +            else
 +            {
 +                dadx_val = 0.0;
 +            }
 +            fr->dadx[n++] = dadx_val;
 +          
 +            /* ai -> aj interaction */
 +            if(raj < dr + sk_ai)
 +            {
 +                lij     = 1.0/(dr-sk_ai);
 +                dlij    = 1.0;
 +                raj_inv = 1.0/raj;
 +                
 +                if(raj>dr-sk_ai)
 +                {
 +                    lij = raj_inv;
 +                    dlij = 0.0;
 +                }
 +                
 +                lij2     = lij  * lij;
 +                lij3     = lij2 * lij;
 +                
 +                uij      = 1.0/(dr+sk_ai);
 +                uij2     = uij  * uij;
 +                uij3     = uij2 * uij;
 +                
 +                diff2    = uij2-lij2;
 +                
 +                lij_inv  = gmx_invsqrt(lij2);
 +                sk2      =  sk2_ai; /* sk2_ai = sk_ai * sk_ai in i loop above */
 +                sk2_rinv = sk2*rinv;
 +                prod     = 0.25 * sk2_rinv;
 +                
 +                /* log_term = table_log(uij*lij_inv,born->log_table,LOG_TABLE_ACCURACY); */
 +                log_term = log(uij*lij_inv);
 +                
 +                tmp      = lij-uij + 0.25*dr*diff2 + (0.5*rinv)*log_term + prod*(-diff2);
 +                
 +                if(raj<sk_ai-dr)
 +                {
 +                    tmp     = tmp + 2.0 * (raj_inv-lij);
 +                }
 +                
 +                t1      = 0.5*lij2 + prod*lij3 - 0.25*(lij*rinv+lij3*dr);
 +                t2      = -0.5*uij2 - 0.25*sk2_rinv*uij3 + 0.25*(uij*rinv+uij3*dr);
 +                t3      = 0.125*(1.0+sk2_rinv*rinv)*(-diff2)+0.25*log_term*rinv*rinv;
 +                
 +                dadx_val = (dlij*t1+t2+t3)*rinv; /* rb2 is moved to chainrule    */
 +                
 +                born->gpol_hct_work[aj] += 0.5*tmp;
 +                
 +            }
 +            else
 +            {
 +                dadx_val = 0.0;
 +            }
 +            fr->dadx[n++] = dadx_val;
 +
 +        }        
 +        born->gpol_hct_work[ai] += sum_ai;
 +      
 +    }
 +    
 +    /* Parallel summations */
 +    if(PARTDECOMP(cr))
 +    {
 +        gmx_sum(natoms, born->gpol_hct_work, cr);
 +    }
 +    else if(DOMAINDECOMP(cr))
 +    {
 +        dd_atom_sum_real(cr->dd, born->gpol_hct_work);
 +    }
 +    
 +    for(i=0;i<fr->natoms_force;i++) /* PELA born->nr */
 +    {
 +        if(born->use[i] != 0)
 +        {
 +            rai        = top->atomtypes.gb_radius[md->typeA[i]];
 +            rai_inv2   = 1.0/rai;
 +            rai        = rai-doffset; 
 +            rai_inv    = 1.0/rai;
 +            sum_ai     = rai * born->gpol_hct_work[i];
 +            sum_ai2    = sum_ai  * sum_ai;
 +            sum_ai3    = sum_ai2 * sum_ai;
 +            
 +            tsum    = tanh(born->obc_alpha*sum_ai-born->obc_beta*sum_ai2+born->obc_gamma*sum_ai3);
 +            born->bRad[i] = rai_inv - tsum*rai_inv2;
 +            born->bRad[i] = 1.0 / born->bRad[i];
 +            
 +            fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);
 +            
 +            tchain  = rai * (born->obc_alpha-2*born->obc_beta*sum_ai+3*born->obc_gamma*sum_ai2);
 +            born->drobc[i] = (1.0-tsum*tsum)*tchain*rai_inv2;
 +        }
 +    }
 +    
 +    /* Extra (local) communication required for DD */
 +    if(DOMAINDECOMP(cr))
 +    {
 +        dd_atom_spread_real(cr->dd, born->bRad);
 +        dd_atom_spread_real(cr->dd, fr->invsqrta);
 +        dd_atom_spread_real(cr->dd, born->drobc);
 +    }
 +    
 +    return 0;
 +    
 +}
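 +
 +/* Summary of the OBC rescaling above: with rho_i the atomtype GB radius,
 + * rho~_i = rho_i - doffset and Psi = rho~_i * gpol_hct_work[i], the
 + * final loop computes
 + *     1/b_i = 1/rho~_i - tanh(alpha*Psi - beta*Psi^2 + gamma*Psi^3)/rho_i
 + * and stores in drobc[i] the derivative of the tanh term (over rho_i)
 + * with respect to the accumulated sum, for later use in the chain rule.
 + */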
 +
 +
 +
 +int calc_gb_rad(t_commrec *cr, t_forcerec *fr, t_inputrec *ir,gmx_localtop_t *top,
 +                const t_atomtypes *atype, rvec x[], t_nblist *nl, gmx_genborn_t *born,t_mdatoms *md,t_nrnb     *nrnb)
 +{    
 +    real *p;
 +    int   cnt;
 +    int ndadx;
-             calc_gb_rad_still(cr,fr,born->nr,top,atype,x,nl,born,md); 
++    
 +    if(fr->bAllvsAll && fr->dadx==NULL)
 +    {
 +        /* We might need up to 8 atoms of padding before and after, 
 +         * and another 4 units to guarantee SSE alignment.
 +         */
 +        fr->nalloc_dadx = 2*(md->homenr+12)*(md->nr/2+1+12);
 +        snew(fr->dadx_rawptr,fr->nalloc_dadx);
 +        fr->dadx = (real *) (((size_t) fr->dadx_rawptr + 16) & (~((size_t) 15)));
 +    }
 +    else
 +    {
 +        /* In the SSE-enabled gb-loops, when writing to dadx, we
 +         * always write 2*4 elements at a time, even in the case with only
 +         * 1-3 j particles, where we only really need to write 2*(1-3)
 +         * elements. This is because we want dadx to be aligned to a
 +         * 16-byte boundary so that we can use _mm_store_ps/_mm_load_ps.
 +         */
 +        ndadx = 2 * (nl->nrj + 3*nl->nri);
 +
 +        /* First, reallocate the dadx array, we need 3 extra for SSE */
 +        if (ndadx + 3 > fr->nalloc_dadx)
 +        {
 +            fr->nalloc_dadx = over_alloc_large(ndadx) + 3;
 +            srenew(fr->dadx_rawptr,fr->nalloc_dadx);
 +            fr->dadx = (real *) (((size_t) fr->dadx_rawptr + 16) & (~((size_t) 15)));            
 +        }
 +    }
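 +
 +    /* In both branches, ((size_t)p + 16) & ~15 rounds the raw pointer up
 +     * to the next 16-byte boundary: e.g. p = 0x1003 gives
 +     * (0x1003 + 16) & ~0xF = 0x1010. Adding 16 rather than the minimal 15
 +     * always advances the pointer, which the extra elements allocated
 +     * above leave room for.
 +     */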
 +
 +    if(fr->bAllvsAll)
 +    {
 +        cnt = md->homenr*(md->nr/2+1);
 +        
 +        if(ir->gb_algorithm==egbSTILL)
 +        {
 +#if 0 && defined (GMX_X86_SSE2)
 +            if(fr->use_acceleration)
 +            {
 +#  ifdef GMX_DOUBLE
 +                genborn_allvsall_calc_still_radii_sse2_double(fr,md,born,top,x[0],cr,&fr->AllvsAll_workgb);
 +#  else
 +                genborn_allvsall_calc_still_radii_sse2_single(fr,md,born,top,x[0],cr,&fr->AllvsAll_workgb);
 +#  endif
 +            }
 +            else
 +            {
 +                genborn_allvsall_calc_still_radii(fr,md,born,top,x[0],cr,&fr->AllvsAll_workgb);
 +            }
 +#else
 +            genborn_allvsall_calc_still_radii(fr,md,born,top,x[0],cr,&fr->AllvsAll_workgb);
 +#endif
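 +            /* Note: the "#if 0 &&" above (and in the analogous blocks
 +             * below) keeps the SSE2 kernels compiled out, so only the
 +             * generic C path is built here.
 +             */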
 +            /* 13 flops in outer loop, 47 flops in inner loop */
 +            inc_nrnb(nrnb,eNR_BORN_AVA_RADII_STILL,md->homenr*13+cnt*47);
 +        }
 +        else if(ir->gb_algorithm==egbHCT || ir->gb_algorithm==egbOBC)
 +        {
 +#if 0 && defined (GMX_X86_SSE2)
 +            if(fr->use_acceleration)
 +            {
 +#  ifdef GMX_DOUBLE
 +                genborn_allvsall_calc_hct_obc_radii_sse2_double(fr,md,born,ir->gb_algorithm,top,x[0],cr,&fr->AllvsAll_workgb);
 +#  else
 +                genborn_allvsall_calc_hct_obc_radii_sse2_single(fr,md,born,ir->gb_algorithm,top,x[0],cr,&fr->AllvsAll_workgb);
 +#  endif
 +            }
 +            else
 +            {
 +                genborn_allvsall_calc_hct_obc_radii(fr,md,born,ir->gb_algorithm,top,x[0],cr,&fr->AllvsAll_workgb);
 +            }
 +#else
 +            genborn_allvsall_calc_hct_obc_radii(fr,md,born,ir->gb_algorithm,top,x[0],cr,&fr->AllvsAll_workgb);
 +#endif
 +            /* 24 flops in outer loop, 183 in inner */
 +            inc_nrnb(nrnb,eNR_BORN_AVA_RADII_HCT_OBC,md->homenr*24+cnt*183);
 +        }
 +        else
 +        {
 +            gmx_fatal(FARGS,"Bad gb algorithm for all-vs-all interactions");
 +        }
 +        return 0;
 +    }
 +    
 +    /* Switch for determining which algorithm to use for Born radii calculation */
 +#ifdef GMX_DOUBLE
 +    
 +#if 0 && defined (GMX_X86_SSE2)
 +    /* x86 or x86-64 with GCC inline assembly and/or SSE intrinsics */
 +    switch(ir->gb_algorithm)
 +    {
 +        case egbSTILL:
 +            if(fr->use_acceleration)
 +            {            
 +                calc_gb_rad_still_sse2_double(cr,fr,born->nr,top, atype, x[0], nl, born);
 +            }
 +            else
 +            {
 +                calc_gb_rad_still(cr,fr,born->nr,top,atype,x,nl,born,md); 
 +            }   
 +            break;
 +        case egbHCT:
 +            if(fr->use_acceleration)
 +            {
 +                calc_gb_rad_hct_obc_sse2_double(cr,fr,born->nr,top, atype, x[0], nl, born, md, ir->gb_algorithm);
 +            }
 +            else
 +            {
 +                calc_gb_rad_hct(cr,fr,born->nr,top,atype,x,nl,born,md); 
 +            }
 +            break;
 +        case egbOBC:
 +            if(fr->use_acceleration)
 +            {
 +                calc_gb_rad_hct_obc_sse2_double(cr,fr,born->nr,top, atype, x[0], nl, born, md, ir->gb_algorithm);
 +            }
 +            else
 +            {
 +                calc_gb_rad_obc(cr,fr,born->nr,top,atype,x,nl,born,md); 
 +            }
 +            break;
 +            
 +        default:
 +            gmx_fatal(FARGS, "Unknown double precision sse-enabled algorithm for Born radii calculation: %d",ir->gb_algorithm);
 +    }
 +#else
 +    switch(ir->gb_algorithm)
 +    {
 +        case egbSTILL:
 +            calc_gb_rad_still(cr,fr,born->nr,top,atype,x,nl,born,md); 
 +            break;
 +        case egbHCT:
 +            calc_gb_rad_hct(cr,fr,born->nr,top,atype,x,nl,born,md); 
 +            break;
 +        case egbOBC:
 +            calc_gb_rad_obc(cr,fr,born->nr,top,atype,x,nl,born,md); 
 +            break;
 +            
 +        default:
 +            gmx_fatal(FARGS, "Unknown double precision algorithm for Born radii calculation: %d",ir->gb_algorithm);
 +    }
 +            
 +#endif
 +                        
 +#else                
 +            
 +#if 0 && defined (GMX_X86_SSE2)
 +    /* x86 or x86-64 with GCC inline assembly and/or SSE intrinsics */
 +    switch(ir->gb_algorithm)
 +    {
 +        case egbSTILL:
 +            if(fr->use_acceleration)
 +            {
 +            calc_gb_rad_still_sse2_single(cr,fr,born->nr,top, atype, x[0], nl, born);
 +            }
 +            else
 +            {
 +                calc_gb_rad_still(cr,fr,born->nr,top,atype,x,nl,born,md); 
 +            }
 +            break;
 +        case egbHCT:
 +                if(fr->use_acceleration)
 +                {
 +                    calc_gb_rad_hct_obc_sse2_single(cr,fr,born->nr,top, atype, x[0], nl, born, md, ir->gb_algorithm);
 +                }
 +                else
 +                {
 +                    calc_gb_rad_hct(cr,fr,born->nr,top,atype,x,nl,born,md); 
 +                }
 +            break;
 +            
 +        case egbOBC:
 +            if(fr->use_acceleration)
 +            {
 +                calc_gb_rad_hct_obc_sse2_single(cr,fr,born->nr,top, atype, x[0], nl, born, md, ir->gb_algorithm);
 +            }
 +            else
 +            {
 +                calc_gb_rad_obc(cr,fr,born->nr,top,atype,x,nl,born,md); 
 +            }
 +            break;
 +            
 +        default:
 +            gmx_fatal(FARGS, "Unknown sse-enabled algorithm for Born radii calculation: %d",ir->gb_algorithm);
 +    }
 +    
 +#else
 +    switch(ir->gb_algorithm)
 +    {
 +        case egbSTILL:
-         for(k=nj0;k<nj1;k++)
++            calc_gb_rad_still(cr,fr,born->nr,top,atype,x,nl,born,md);
 +            break;
 +        case egbHCT:
 +            calc_gb_rad_hct(cr,fr,born->nr,top,atype,x,nl,born,md); 
 +            break;
 +        case egbOBC:
 +            calc_gb_rad_obc(cr,fr,born->nr,top,atype,x,nl,born,md); 
 +            break;
 +            
 +        default:
 +            gmx_fatal(FARGS, "Unknown algorithm for Born radii calculation: %d",ir->gb_algorithm);
 +    }
 +    
 +#endif /* Single precision sse */
 +            
 +#endif /* Double or single precision */
 +    
 +    if(fr->bAllvsAll==FALSE)
 +    {
 +        switch(ir->gb_algorithm)
 +        {
 +            case egbSTILL:
 +                /* 17 flops per outer loop iteration, 47 flops per inner loop */
 +                inc_nrnb(nrnb,eNR_BORN_RADII_STILL,nl->nri*17+nl->nrj*47);
 +                break;
 +            case egbHCT:
 +            case egbOBC:
 +                /* 61 (assuming 10 for tanh) flops for outer loop iteration, 183 flops per inner loop */
 +                inc_nrnb(nrnb,eNR_BORN_RADII_HCT_OBC,nl->nri*61+nl->nrj*183);
 +                break;
 +                
 +            default:
 +                break;
 +        }
 +    }
 +    
 +    return 0;        
 +}
 +
 +
 +
 +real gb_bonds_tab(rvec x[], rvec f[], rvec fshift[], real *charge, real *p_gbtabscale,
 +                  real *invsqrta, real *dvda, real *GBtab, t_idef *idef, real epsilon_r,
 +                  real gb_epsilon_solvent, real facel, const t_pbc *pbc, const t_graph *graph)
 +{
 +    int i,j,n0,m,nnn,type,ai,aj;
 +    int ki;
 +
 +    real isai,isaj;
 +    real r,rsq11;
 +    real rinv11,iq;
 +    real isaprod,qq,gbscale,gbtabscale,Y,F,Geps,Heps2,Fp,VV,FF,rt,eps,eps2;
 +    real vgb,fgb,vcoul,fijC,dvdatmp,fscal,dvdaj;
 +    real vctot;
 +
 +    rvec dx;
 +    ivec dt;
 +
 +    t_iatom *forceatoms;
 +
 +    /* Scale the electrostatics by gb_epsilon_solvent */
 +    facel = facel * ((1.0/epsilon_r) - 1.0/gb_epsilon_solvent);
 +    
 +    gbtabscale=*p_gbtabscale;
 +    vctot = 0.0;
 +    
 +    for(j=F_GB12;j<=F_GB14;j++)
 +    {
 +        forceatoms = idef->il[j].iatoms;
 +        
 +        for(i=0;i<idef->il[j].nr; )
 +        {
 +            /* To avoid reading in the interaction type, we just increment i
 +             * to pass over the types in the forceatoms array; this saves
 +             * some memory accesses.
 +             */
 +            i++;
 +            ai            = forceatoms[i++];
 +            aj            = forceatoms[i++];
 +
 +            ki            = pbc_rvec_sub(pbc,x[ai],x[aj],dx);
 +            rsq11         = iprod(dx,dx);
 +
 +            isai          = invsqrta[ai];
 +            iq            = (-1)*facel*charge[ai];
 +
 +            rinv11        = gmx_invsqrt(rsq11);
 +            isaj          = invsqrta[aj];
 +            isaprod       = isai*isaj;
 +            qq            = isaprod*iq*charge[aj];
 +            gbscale       = isaprod*gbtabscale;
 +            r             = rsq11*rinv11;
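 +            /* Cubic-spline table lookup: rt is the scaled table coordinate,
 +             * n0 its integer part, eps the fraction. Each table point stores
 +             * (Y,F,G,H), giving VV = Y + eps*(F + eps*(G + eps*H)) and the
 +             * force factor FF = dVV/deps = F + 2*eps*G + 3*eps^2*H, as
 +             * expanded below.
 +             */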
 +            rt            = r*gbscale;
 +            n0            = rt;
 +            eps           = rt-n0;
 +            eps2          = eps*eps;
 +            nnn           = 4*n0;
 +            Y             = GBtab[nnn];
 +            F             = GBtab[nnn+1];
 +            Geps          = eps*GBtab[nnn+2];
 +            Heps2         = eps2*GBtab[nnn+3];
 +            Fp            = F+Geps+Heps2;
 +            VV            = Y+eps*Fp;
 +            FF            = Fp+Geps+2.0*Heps2;
 +            vgb           = qq*VV;
 +            fijC          = qq*FF*gbscale;
 +            dvdatmp       = -(vgb+fijC*r)*0.5;
 +            dvda[aj]      = dvda[aj] + dvdatmp*isaj*isaj;
 +            dvda[ai]      = dvda[ai] + dvdatmp*isai*isai;
 +            vctot         = vctot + vgb;
 +            fgb           = -(fijC)*rinv11;
 +
 +            if (graph) {
 +                ivec_sub(SHIFT_IVEC(graph,ai),SHIFT_IVEC(graph,aj),dt);
 +                ki=IVEC2IS(dt);
 +            }
 +
 +            for (m=0; (m<DIM); m++) {    /*  15  */
 +                fscal=fgb*dx[m];
 +                f[ai][m]+=fscal;
 +                f[aj][m]-=fscal;
 +                fshift[ki][m]+=fscal;
 +                fshift[CENTRAL][m]-=fscal;
 +            }
 +        }
 +    }
 +    
 +    return vctot;
 +}
 +
 +real calc_gb_selfcorrections(t_commrec *cr, int natoms, 
 +                 real *charge, gmx_genborn_t *born, real *dvda, t_mdatoms *md, double facel)
 +{    
 +    int i,ai,at0,at1;
 +    real rai,e,derb,q,q2,fi,rai_inv,vtot;
 +
 +    if(PARTDECOMP(cr))
 +    {
 +        pd_at_range(cr,&at0,&at1);
 +    }
 +    else if(DOMAINDECOMP(cr))
 +    {
 +        at0=0;
 +        at1=cr->dd->nat_home;
 +    }
 +    else
 +    {
 +        at0=0;
 +        at1=natoms;
 +        
 +    }
 +        
 +    /* Scale the electrostatics by gb_epsilon_solvent */
 +    facel = facel * ((1.0/born->epsilon_r) - 1.0/born->gb_epsilon_solvent);
 +    
 +    vtot=0.0;
 +    
 +    /* Apply self corrections */
 +    for(i=at0;i<at1;i++)
 +    {
 +        ai       = i;
 +        
 +        if(born->use[ai]==1)
 +        {
 +            rai      = born->bRad[ai];
 +            rai_inv  = 1.0/rai;
 +            q        = charge[ai];
 +            q2       = q*q;
 +            fi       = facel*q2;
 +            e        = fi*rai_inv;
 +            derb     = 0.5*e*rai_inv*rai_inv;
 +            dvda[ai] += derb*rai;
 +            vtot     -= 0.5*e;
 +        }
 +    }
 +    
 +    return vtot;
 +    
 +}
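 +
 +/* The loop above adds the standard GB self (i==i) term: with facel
 + * already rescaled by the solvent dielectric factor, each atom
 + * contributes
 + *     E_self(i) = -0.5 * facel * q_i^2 / b_i,
 + * and the lines computing derb add its radius derivative,
 + * 0.5*facel*q_i^2/b_i^2, to dvda[i].
 + */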
 +
 +real calc_gb_nonpolar(t_commrec *cr, t_forcerec *fr,int natoms,gmx_genborn_t *born, gmx_localtop_t *top, 
 +                      const t_atomtypes *atype, real *dvda,int gb_algorithm, t_mdatoms *md)
 +{
 +    int ai,i,at0,at1;
 +    real e,es,rai,rbi,term,probe,tmp,factor;
 +    real rbi_inv,rbi_inv2;
 +    
 +    /* To keep the compiler happy */
 +    factor=0;
 +    
 +    if(PARTDECOMP(cr))
 +    {
 +        pd_at_range(cr,&at0,&at1);
 +    }
 +    else if(DOMAINDECOMP(cr))
 +    {
 +        at0 = 0;
 +        at1 = cr->dd->nat_home;
 +    }
 +    else
 +    {
 +        at0=0;
 +        at1=natoms;
 +    }
 +    
 +    /* factor is the surface tension */
 +    factor = born->sa_surface_tension;
 +  /*
 +  
 +    // The surface tension factor is 0.0049 for Still model, 0.0054 for HCT/OBC
 +    if(gb_algorithm==egbSTILL)
 +    {
 +        factor=0.0049*100*CAL2JOULE;
 +    }
 +    else    
 +    {
 +        factor=0.0054*100*CAL2JOULE;    
 +    }
 +    */
 +    /* if(gb_algorithm==egbHCT || gb_algorithm==egbOBC) */
 +    
 +    es    = 0;
 +    probe = 0.14;
 +    term  = M_PI*4;
 +    
 +    for(i=at0;i<at1;i++)
 +    {
 +        ai        = i;
 +        
 +        if(born->use[ai]==1)
 +        {
 +            rai       = top->atomtypes.gb_radius[md->typeA[ai]];
 +            rbi_inv   = fr->invsqrta[ai];
 +            rbi_inv2  = rbi_inv * rbi_inv;
 +            tmp       = (rai*rbi_inv2)*(rai*rbi_inv2);
 +            tmp       = tmp*tmp*tmp;
 +            e         = factor*term*(rai+probe)*(rai+probe)*tmp;
 +            dvda[ai]  = dvda[ai] - 6*e*rbi_inv2;    
 +            es        = es + e;
 +        }
 +    }    
 +
 +    return es;
 +}
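 +
 +/* The loop above is the ACE-type surface-area estimate used by
 + * calc_gb_forces: per atom,
 + *     E_np(i) = factor * 4*pi * (r_i + probe)^2 * (r_i/b_i)^6,
 + * with r_i the atomtype GB radius, probe = 0.14 the solvent probe
 + * radius and factor the surface tension; the derivative
 + * dE/db_i = -6*E_np(i)/b_i is added to dvda[i].
 + */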
 +
 +
 +
 +real calc_gb_chainrule(int natoms, t_nblist *nl, real *dadx, real *dvda, rvec x[], rvec t[], rvec fshift[], 
 +                       rvec shift_vec[], int gb_algorithm, gmx_genborn_t *born, t_mdatoms *md)
 +{    
 +    int i,k,n,ai,aj,nj0,nj1,n0,n1;
 +    int shift;
 +    real shX,shY,shZ;
 +    real fgb,fij,rb2,rbi,fix1,fiy1,fiz1;
 +    real ix1,iy1,iz1,jx1,jy1,jz1,dx11,dy11,dz11,rsq11;
 +    real rinv11,tx,ty,tz,rbai,rbaj,fgb_ai;
 +    real *rb;
 +    volatile int idx;
 +        
 +    n  = 0;    
 +    rb = born->work;
 +        
 +    n0 = 0;
 +    n1 = natoms;
 +
 +    if(gb_algorithm==egbSTILL) 
 +    {
 +        for(i=n0;i<n1;i++)
 +        {
 +          rbi   = born->bRad[i];
 +          rb[i] = (2 * rbi * rbi * dvda[i])/ONE_4PI_EPS0;
 +        }
 +    }
 +    else if(gb_algorithm==egbHCT) 
 +    {
 +        for(i=n0;i<n1;i++)
 +        {
 +          rbi   = born->bRad[i];
 +          rb[i] = rbi * rbi * dvda[i];
 +        }
 +    }
 +    else if(gb_algorithm==egbOBC) 
 +    {
 +        for(i=n0;i<n1;i++)
 +        {
 +          rbi   = born->bRad[i];
 +          rb[i] = rbi * rbi * born->drobc[i] * dvda[i];
 +        }
 +    }
 +    
 +    for(i=0;i<nl->nri;i++)
 +    {
 +        ai   = nl->iinr[i];
 +        
 +        nj0  = nl->jindex[i];
 +        nj1  = nl->jindex[i+1];
 +        
 +        /* Load shifts for this list */
 +        shift   = nl->shift[i];
 +        shX     = shift_vec[shift][0];
 +        shY     = shift_vec[shift][1];
 +        shZ     = shift_vec[shift][2];
 +        
 +        /* Load atom i coordinates, add shift vectors */
 +        ix1  = shX + x[ai][0];
 +        iy1  = shY + x[ai][1];
 +        iz1  = shZ + x[ai][2];
 +        
 +        fix1 = 0;
 +        fiy1 = 0;
 +        fiz1 = 0;
 +        
 +        rbai = rb[ai];
 +        
++        for(k=nj0;k<nj1 && nl->jjnr[k]>=0;k++)
 +        {
 +            aj = nl->jjnr[k];
 +            
 +            jx1     = x[aj][0];
 +            jy1     = x[aj][1];
 +            jz1     = x[aj][2];
 +            
 +            dx11    = ix1 - jx1;
 +            dy11    = iy1 - jy1;
 +            dz11    = iz1 - jz1;
 +            
 +            rbaj    = rb[aj];
 +            
 +            fgb     = rbai*dadx[n++]; 
 +            fgb_ai  = rbaj*dadx[n++];
 +            
 +            /* Total force between ai and aj is the sum of ai->aj and aj->ai */
 +            fgb     = fgb + fgb_ai;
 +            
 +            tx      = fgb * dx11;
 +            ty      = fgb * dy11;
 +            tz      = fgb * dz11;
 +                        
 +            fix1    = fix1 + tx;
 +            fiy1    = fiy1 + ty;
 +            fiz1    = fiz1 + tz;
 +            
 +            /* Update force on atom aj */
 +            t[aj][0] = t[aj][0] - tx;
 +            t[aj][1] = t[aj][1] - ty;
 +            t[aj][2] = t[aj][2] - tz;
 +        }
 +                
 +        /* Update force and shift forces on atom ai */
 +        t[ai][0] = t[ai][0] + fix1;
 +        t[ai][1] = t[ai][1] + fiy1;
 +        t[ai][2] = t[ai][2] + fiz1;
 +        
 +        fshift[shift][0] = fshift[shift][0] + fix1;
 +        fshift[shift][1] = fshift[shift][1] + fiy1;
 +        fshift[shift][2] = fshift[shift][2] + fiz1;
 +        
 +    }
 +
 +    return 0;    
 +}
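 +
 +/* Bookkeeping for the chain rule above: rb[i] premultiplies dvda[i] by
 + * the model-specific factor (2*b_i^2/ONE_4PI_EPS0 for Still, b_i^2 for
 + * HCT, b_i^2*drobc[i] for OBC), and each stored dadx pair then gives
 + * the radial force
 + *     f_ij = (rb[ai]*dadx(i->j) + rb[aj]*dadx(j->i)) * (x_i - x_j),
 + * matching the two dadx values written per j-atom when the radii were
 + * computed.
 + */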
 +
 +
 +void
 +calc_gb_forces(t_commrec *cr, t_mdatoms *md, gmx_genborn_t *born, gmx_localtop_t *top, const t_atomtypes *atype, 
 +               rvec x[], rvec f[], t_forcerec *fr, t_idef *idef, int gb_algorithm, int sa_algorithm, t_nrnb *nrnb, gmx_bool bRad,
 +               const t_pbc *pbc, const t_graph *graph, gmx_enerdata_t *enerd)
 +{
 +    real v=0;
 +    int  cnt;
 +    int i;
 +    
 +    /* PBC or not? */
 +    const t_pbc *pbc_null;
 +
 +    if (fr->bMolPBC)
 +        pbc_null = pbc;
 +    else
 +        pbc_null = NULL;
 +
 +    if(sa_algorithm == esaAPPROX)
 +    {
 +        /* Do a simple ACE type approximation for the non-polar solvation */
 +        enerd->term[F_NPSOLVATION] += calc_gb_nonpolar(cr, fr,born->nr, born, top, atype, fr->dvda, gb_algorithm,md);
 +    }
 +
 +    /* Calculate the bonded GB-interactions using either table or analytical formula */
 +    enerd->term[F_GBPOL]       += gb_bonds_tab(x,f,fr->fshift, md->chargeA,&(fr->gbtabscale),
 +                                     fr->invsqrta,fr->dvda,fr->gbtab.data,idef,born->epsilon_r,born->gb_epsilon_solvent, fr->epsfac, pbc_null, graph);
 +    
 +    /* Calculate self corrections to the GB energies - currently only A state used! (FIXME) */
 +    enerd->term[F_GBPOL]       += calc_gb_selfcorrections(cr,born->nr,md->chargeA, born, fr->dvda, md, fr->epsfac);         
 +
 +    /* If parallel, sum the derivative of the potential w.r.t the born radii */
 +    if(PARTDECOMP(cr))
 +    {
 +        gmx_sum(md->nr,fr->dvda, cr);
 +    }
 +    else if(DOMAINDECOMP(cr))
 +    {
 +        dd_atom_sum_real(cr->dd,fr->dvda);
 +        dd_atom_spread_real(cr->dd,fr->dvda);
 +    }
 +
 +    if(fr->bAllvsAll)
 +    {
 +#if 0 && defined (GMX_X86_SSE2)
 +        if(fr->use_acceleration)
 +        {
 +#  ifdef GMX_DOUBLE
 +            genborn_allvsall_calc_chainrule_sse2_double(fr,md,born,x[0],f[0],gb_algorithm,fr->AllvsAll_workgb);
 +#  else
 +            genborn_allvsall_calc_chainrule_sse2_single(fr,md,born,x[0],f[0],gb_algorithm,fr->AllvsAll_workgb);
 +#  endif
 +        }
 +        else
 +        {
 +            genborn_allvsall_calc_chainrule(fr,md,born,x[0],f[0],gb_algorithm,fr->AllvsAll_workgb);
 +        }
 +#else
 +        genborn_allvsall_calc_chainrule(fr,md,born,x[0],f[0],gb_algorithm,fr->AllvsAll_workgb);
 +#endif
 +        cnt = md->homenr*(md->nr/2+1);
 +        /* 9 flops for outer loop, 15 for inner */
 +        inc_nrnb(nrnb,eNR_BORN_AVA_CHAINRULE,md->homenr*9+cnt*15);
 +        return;
 +    }
 +    
 +#if 0 && defined (GMX_X86_SSE2)
 +    if(fr->use_acceleration)
 +    {
 +#  ifdef GMX_DOUBLE
 +        calc_gb_chainrule_sse2_double(fr->natoms_force, &(fr->gblist),fr->dadx,fr->dvda,x[0], 
 +                                      f[0],fr->fshift[0],fr->shift_vec[0],gb_algorithm,born,md);
 +#  else
 +        calc_gb_chainrule_sse2_single(fr->natoms_force, &(fr->gblist),fr->dadx,fr->dvda,x[0], 
 +                                      f[0],fr->fshift[0],fr->shift_vec[0],gb_algorithm,born,md);
 +#  endif
 +    }
 +    else
 +    {
 +        calc_gb_chainrule(fr->natoms_force, &(fr->gblist), fr->dadx, fr->dvda, 
 +                          x, f, fr->fshift, fr->shift_vec, gb_algorithm, born, md);
 +    }
 +#else
 +    calc_gb_chainrule(fr->natoms_force, &(fr->gblist), fr->dadx, fr->dvda, 
 +                      x, f, fr->fshift, fr->shift_vec, gb_algorithm, born, md);
 +#endif
 +
 +    if(!fr->bAllvsAll)
 +    {
 +        /* 9 flops for outer loop, 15 for inner */
 +        inc_nrnb(nrnb,eNR_BORN_CHAINRULE,fr->gblist.nri*9+fr->gblist.nrj*15);
 +    }
 +}
 +
 +static void add_j_to_gblist(gbtmpnbl_t *list,int aj)
 +{
 +    if (list->naj >= list->aj_nalloc)
 +    {
 +        list->aj_nalloc = over_alloc_large(list->naj+1);
 +        srenew(list->aj,list->aj_nalloc);
 +    }
 +
 +    list->aj[list->naj++] = aj;
 +}
 +
 +static gbtmpnbl_t *find_gbtmplist(struct gbtmpnbls *lists,int shift)
 +{
 +    int ind,i;
 +
 +    /* Search the list with the same shift, if there is one */
 +    ind = 0;
 +    while (ind < lists->nlist && shift != lists->list[ind].shift)
 +    {
 +        ind++;
 +    }
 +    if (ind == lists->nlist)
 +    {
 +        if (lists->nlist == lists->list_nalloc)
 +        {
 +            lists->list_nalloc++;
 +            srenew(lists->list,lists->list_nalloc);
 +            for(i=lists->nlist; i<lists->list_nalloc; i++)
 +            {
 +                lists->list[i].aj        = NULL;
 +                lists->list[i].aj_nalloc = 0;
 +            }
 +
 +        }
 +        
 +        lists->list[lists->nlist].shift = shift;
 +        lists->list[lists->nlist].naj   = 0;
 +        lists->nlist++;
 +    }
 +
 +    return &lists->list[ind];
 +}
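 +
 +/* Each atom keeps one temporary neighbour list per PBC shift; the linear
 + * search above finds it or creates a new entry, so that j-atoms seen
 + * through different periodic images end up in separate gblist entries.
 + */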
 +
 +static void add_bondeds_to_gblist(t_ilist *il,
 +                                  gmx_bool bMolPBC,t_pbc *pbc,t_graph *g,rvec *x,
 +                                  struct gbtmpnbls *nls)
 +{
 +    int  ind,j,ai,aj,shift,found;
 +    rvec dx;
 +    ivec dt;
 +    gbtmpnbl_t *list;
 +
 +    shift = CENTRAL;
 +    for(ind=0; ind<il->nr; ind+=3)
 +    {
 +        ai = il->iatoms[ind+1];
 +        aj = il->iatoms[ind+2];
 +                
 +        shift = CENTRAL;
 +        if (g != NULL)
 +        {
 +          rvec_sub(x[ai],x[aj],dx);
 +          ivec_sub(SHIFT_IVEC(g,ai),SHIFT_IVEC(g,aj),dt);
 +          shift = IVEC2IS(dt);
 +        }
 +        else if (bMolPBC)
 +        {
 +          shift = pbc_dx_aiuc(pbc,x[ai],x[aj],dx);
 +        }
 +
 +        /* Find the list for this shift or create one */
 +        list = find_gbtmplist(&nls[ai],shift);
 +        
 +        found=0;
 +        
 +        /* Check that we do not add the same bond twice.
 +         * This happens with some constraints between 1-3 atoms
 +         * that are in the bond list but should not be in the GB nb-list. */
 +        for(j=0;j<list->naj;j++)
 +        {
 +            if (list->aj[j] == aj)
 +            {
 +                found = 1;
 +            }
 +        }    
 +        
 +        if (found == 0)
 +        {
 +            if(ai == aj)
 +            {
 +                gmx_incons("ai == aj");
 +            }
 +
 +            add_j_to_gblist(list,aj);
 +        }
 +    }
 +}
 +
 +static int
 +compare_int (const void * a, const void * b)
 +{
 +    return ( *(int*)a - *(int*)b );
 +}
 +
 +
 +
 +int make_gb_nblist(t_commrec *cr, int gb_algorithm, real gbcut,
 +                   rvec x[], matrix box,
 +                   t_forcerec *fr, t_idef *idef, t_graph *graph, gmx_genborn_t *born)
 +{
 +    int i,l,ii,j,k,n,nj0,nj1,ai,aj,at0,at1,found,shift,s;
 +    int apa;
 +    t_nblist *nblist;
 +    t_pbc pbc;
 +    
 +    struct gbtmpnbls *nls;
 +    gbtmpnbl_t *list =NULL;
 +    
 +    set_pbc(&pbc,fr->ePBC,box);
 +    nls   = born->nblist_work;
 +    
 +    for(i=0;i<born->nr;i++)
 +    {
 +        nls[i].nlist = 0;
 +    }
 +
 +    if (fr->bMolPBC)
 +    {
 +        set_pbc_dd(&pbc,fr->ePBC,cr->dd,TRUE,box);
 +    }
 +
 +    switch (gb_algorithm)
 +    {
 +    case egbHCT:
 +    case egbOBC:
 +        /* Loop over 1-2, 1-3 and 1-4 interactions */
 +        for(j=F_GB12;j<=F_GB14;j++)
 +        {
 +            add_bondeds_to_gblist(&idef->il[j],fr->bMolPBC,&pbc,graph,x,nls);
 +        }
 +        break;
 +    case egbSTILL:
 +        /* Loop over 1-4 interactions */
 +        add_bondeds_to_gblist(&idef->il[F_GB14],fr->bMolPBC,&pbc,graph,x,nls);
 +        break;
 +    default:
 +        gmx_incons("Unknown GB algorithm");
 +    }
 +    
 +    /* Loop over the VDWQQ and VDW nblists to set up the nonbonded part of the GB list */
 +    for(n=0; (n<fr->nnblists); n++)
 +    {
 +        for(i=0; (i<eNL_NR); i++)
 +        {
 +            nblist=&(fr->nblists[n].nlist_sr[i]);
 +            
 +            if (nblist->nri > 0 && (i==eNL_VDWQQ || i==eNL_QQ))
 +            {
 +                for(j=0;j<nblist->nri;j++)
 +                {
 +                    ai    = nblist->iinr[j];
 +                    shift = nblist->shift[j];
 +
 +                    /* Find the list for this shift or create one */
 +                    list = find_gbtmplist(&nls[ai],shift);
 +
 +                    nj0 = nblist->jindex[j];
 +                    nj1 = nblist->jindex[j+1];
 +                    
 +                    /* Add all the j-atoms in the non-bonded list to the GB list */
 +                    for(k=nj0;k<nj1;k++)
 +                    {
 +                        add_j_to_gblist(list,nblist->jjnr[k]);
 +                    }
 +                }
 +            }
 +        }
 +    }
 +        
 +    /* Zero out some counters */
 +    fr->gblist.nri=0;
 +    fr->gblist.nrj=0;
 +
 +    fr->gblist.jindex[0] = fr->gblist.nri;
 +
 +    for(i=0;i<fr->natoms_force;i++)
 +    {
 +        for(s=0; s<nls[i].nlist; s++)
 +        {
 +            list = &nls[i].list[s];
 +
 +            /* Only add those atoms that actually have neighbours */
 +            if (born->use[i] != 0)
 +            {
 +                fr->gblist.iinr[fr->gblist.nri]  = i;
 +                fr->gblist.shift[fr->gblist.nri] = list->shift;
 +                fr->gblist.nri++;
 +            
 +                for(k=0; k<list->naj; k++)
 +                {
 +                    /* Memory allocation for jjnr */
 +                    if(fr->gblist.nrj >= fr->gblist.maxnrj)
 +                    {
 +                        fr->gblist.maxnrj += over_alloc_large(fr->gblist.maxnrj);
 +                        
 +                        if (debug)
 +                        {
 +                            fprintf(debug,"Increasing GB neighbourlist j size to %d\n",fr->gblist.maxnrj);
 +                        }
 +                        
 +                        srenew(fr->gblist.jjnr,fr->gblist.maxnrj);
 +                    }
 +            
 +                    /* Put in list */
 +                    if(i == list->aj[k])
 +                    {
 +                        gmx_incons("i == list->aj[k]");
 +                    }
 +                    fr->gblist.jjnr[fr->gblist.nrj++] = list->aj[k];
 +                }
 +
 +                fr->gblist.jindex[fr->gblist.nri] = fr->gblist.nrj;
 +            }
 +        }
 +    }
 +
 +      
 +#ifdef SORT_GB_LIST
 +    for(i=0;i<fr->gblist.nri;i++)
 +    {
 +        nj0 = fr->gblist.jindex[i];
 +        nj1 = fr->gblist.jindex[i+1];
 +        ai  = fr->gblist.iinr[i];
 +        
 +        /* Temporary fix */
 +        for(j=nj0;j<nj1;j++)
 +        {
 +            if(fr->gblist.jjnr[j]<ai)
 +                fr->gblist.jjnr[j]+=fr->natoms_force;
 +        }
 +        qsort(fr->gblist.jjnr+nj0,nj1-nj0,sizeof(int),compare_int);
 +        /* Fix back */
 +        for(j=nj0;j<nj1;j++)
 +        {
 +            if(fr->gblist.jjnr[j]>=fr->natoms_force)
 +                fr->gblist.jjnr[j]-=fr->natoms_force;
 +        }
 +        
 +    }
 +#endif
 +      
 +    return 0;
 +}
 +
 +void make_local_gb(const t_commrec *cr, gmx_genborn_t *born, int gb_algorithm)
 +{
 +    int i,at0,at1;
 +    gmx_domdec_t *dd=NULL;
 +    
 +    if(DOMAINDECOMP(cr))
 +    {
 +        dd = cr->dd;
 +        at0 = 0;
 +        at1 = dd->nat_tot;
 +    }
 +    else
 +    {
 +        /* Single node or particle decomp (global==local), just copy pointers and return */
 +        if(gb_algorithm==egbSTILL)
 +        {
 +            born->gpol      = born->gpol_globalindex;
 +            born->vsolv     = born->vsolv_globalindex; 
 +            born->gb_radius = born->gb_radius_globalindex; 
 +        }
 +        else
 +        {
 +            born->param     = born->param_globalindex;
 +            born->gb_radius = born->gb_radius_globalindex; 
 +        }
 +        
 +        born->use = born->use_globalindex;
 +        
 +        return;
 +    }
 +    
 +    /* Reallocation of local arrays if necessary */
 +    /* fr->natoms_force is equal to dd->nat_tot */
 +    if (DOMAINDECOMP(cr) && dd->nat_tot > born->nalloc)
 +    {
 +        int nalloc;
 +
 +        nalloc = dd->nat_tot;
 +
 +        /* Arrays specific to different gb algorithms */
 +        if (gb_algorithm == egbSTILL)
 +        {
 +            srenew(born->gpol,  nalloc+3);
 +            srenew(born->vsolv, nalloc+3);
 +            srenew(born->gb_radius, nalloc+3);
 +            for(i=born->nalloc; (i<nalloc+3); i++) 
 +            {
 +                born->gpol[i] = 0;
 +                born->vsolv[i] = 0;
 +                born->gb_radius[i] = 0;
 +            }
 +        }
 +        else
 +        {
 +            srenew(born->param, nalloc+3);
 +            srenew(born->gb_radius, nalloc+3);
 +            for(i=born->nalloc; (i<nalloc+3); i++) 
 +            {
 +                born->param[i] = 0;
 +                born->gb_radius[i] = 0;
 +            }
 +        }
 +        
 +        /* All gb-algorithms use the array for vsites exclusions */
 +        srenew(born->use,    nalloc+3);
 +        for(i=born->nalloc; (i<nalloc+3); i++) 
 +        {
 +            born->use[i] = 0;
 +        }
 +
 +        born->nalloc = nalloc;
 +    }
 +    
 +    /* With dd, copy algorithm specific arrays */
 +    if(gb_algorithm==egbSTILL)
 +    {
 +        for(i=at0;i<at1;i++)
 +        {
 +            born->gpol[i]  = born->gpol_globalindex[dd->gatindex[i]];
 +            born->vsolv[i] = born->vsolv_globalindex[dd->gatindex[i]];
 +            born->gb_radius[i] = born->gb_radius_globalindex[dd->gatindex[i]];
 +            born->use[i]   = born->use_globalindex[dd->gatindex[i]];
 +        }
 +    }
 +    else
 +    {
 +        for(i=at0;i<at1;i++)
 +        {
 +            born->param[i]     = born->param_globalindex[dd->gatindex[i]];
 +            born->gb_radius[i] = born->gb_radius_globalindex[dd->gatindex[i]];
 +            born->use[i]       = born->use_globalindex[dd->gatindex[i]];
 +        }
 +    }
 +}
 +
index 33851e0bd310b978555137e61006b4e6dd6bb7df,0000000000000000000000000000000000000000..53b8439c03b93422c86b8c34067137020209c77a
mode 100644,000000..100644
--- /dev/null
@@@ -1,229 -1,0 +1,231 @@@
- void gmx_iterate_init(gmx_iterate_t *iterate,gmx_bool bIterate)
 +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
 + *
 + * 
 + *                This source code is part of
 + * 
 + *                 G   R   O   M   A   C   S
 + * 
 + *          GROningen MAchine for Chemical Simulations
 + * 
 + *                        VERSION 3.2.0
 + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
 + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 + * Copyright (c) 2001-2004, The GROMACS development team,
 + * check out http://www.gromacs.org for more information.
 +
 + * This program is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU General Public License
 + * as published by the Free Software Foundation; either version 2
 + * of the License, or (at your option) any later version.
 + * 
 + * If you want to redistribute modifications, please consider that
 + * scientific software is very special. Version control is crucial -
 + * bugs must be traceable. We will be happy to consider code for
 + * inclusion in the official distribution, but derived work must not
 + * be called official GROMACS. Details are found in the README & COPYING
 + * files - if they are missing, get the official version at www.gromacs.org.
 + * 
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the papers on the package - you can find them in the top README file.
 + * 
 + * For more info, check our website at http://www.gromacs.org
 + * 
 + * And Hey:
 + * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon
 + */
 +#ifdef HAVE_CONFIG_H
 +#include <config.h>
 +#endif
 +
 +#include <math.h>
 +#include "typedefs.h"
 +#include "gmx_fatal.h"
 +#include "mdrun.h"
 +#include "md_support.h"
 +#include "md_logging.h"
 +#include "types/iteratedconstraints.h"
 +
 +#ifdef GMX_DOUBLE
 +#define CONVERGEITER  0.000000001
 +#define CLOSE_ENOUGH  0.000001000
 +#else
 +#define CONVERGEITER  0.0001
 +#define CLOSE_ENOUGH  0.0050
 +#endif
 +
 +/* We want to keep track of the close calls: if there are too many, there
 +   might be some other issue. So we make sure that their number is either
 +   below some predetermined limit, or, beyond that, only a small fraction
 +   of the total number of steps. */
 +#define MAX_NUMBER_CLOSE        50
 +#define FRACTION_CLOSE       0.001
 +  
 +/* maximum length of cyclic traps to check for, arising from limited numerical precision */
 +#define CYCLEMAX            20
 +
-     iterate->bIterate = bIterate;
++void gmx_iterate_init(gmx_iterate_t *iterate,gmx_bool bSetIterationActive)
 +{
 +    int i;
 +
 +    iterate->iter_i = 0;
-             iterate->bIterate = FALSE;
++    iterate->bIterationActive = bSetIterationActive;
 +    iterate->num_close = 0;
 +    for (i=0;i<MAXITERCONST+2;i++) 
 +    {
 +        iterate->allrelerr[i] = 0;
 +    }
 +}
 +
 +gmx_bool done_iterating(const t_commrec *cr,FILE *fplog, int nsteps, gmx_iterate_t *iterate, gmx_bool bFirstIterate, real fom, real *newf) 
 +{    
 +    /* monitor convergence, and use a secant search to propose new
 +       values.  
 +                                                                  x_{i} - x_{i-1}
 +       The secant method computes x_{i+1} = x_{i} - f(x_{i}) * ---------------------
 +                                                                f(x_{i}) - f(x_{i-1})
 +       
 +       The function we are trying to zero is fom-x, where fom is the
 +       "figure of merit" which is the pressure (or the veta value) we
 +       would get by putting in an old value of the pressure or veta into
 +       the incrementor function for the step or half step.  I have
 +       verified that this gives the same answer as self-consistent
 +       iteration, usually in many fewer steps, especially for small tau_p.
 +       
 +       We could possibly eliminate an iteration with proper use
 +       of the value from the previous step, but that would take a bit
 +       more bookkeeping, especially for veta, since tests indicate the
 +       function of veta on the last step is not sufficiently close to
 +       guarantee convergence this step. This is
 +       good enough for now.  On my tests, I could use tau_p down to
 +       0.02, which is smaller than would ever be necessary in
 +       practice. Generally, 3-5 iterations will be sufficient */
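 +
 +    /* Equivalently: since f(x) = fom(x) - x, the root of f is the
 +     * self-consistent point fom(x) = x, and the secant update coded below is
 +     *     x_{i+1} = x_i - f(x_i)*(x_i - x_{i-1})/(f(x_i) - f(x_{i-1})),
 +     * applied once two previous iterates are available.
 +     */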
 +
 +    real relerr,err,xmin;
 +    int i;
 +    gmx_bool incycle;
 +    
 +    if (bFirstIterate) 
 +    {
 +        iterate->x = fom;
 +        iterate->f = fom-iterate->x;
 +        iterate->xprev = 0;
 +        iterate->fprev = 0;
 +        *newf = fom;
 +    } 
 +    else 
 +    {
 +        iterate->f = fom-iterate->x; /* we want to zero this difference */
 +        if ((iterate->iter_i > 1) && (iterate->iter_i < MAXITERCONST)) 
 +        {
 +            if (iterate->f==iterate->fprev) 
 +            {
 +                *newf = iterate->f;
 +            } 
 +            else 
 +            {
 +                *newf = iterate->x - (iterate->x-iterate->xprev)*(iterate->f)/(iterate->f-iterate->fprev); 
 +            }
 +        } 
 +        else 
 +        {
 +            /* just use self-consistent iteration the first step to initialize, or 
 +               if it's not converging (which happens occasionally -- need to investigate why) */
 +            *newf = fom; 
 +        }
 +    }
 +    /* Consider a slight shortcut allowing us to exit one iteration sooner -- we check the
 +       difference between the closest of x and xprev to the new
 +       value. To be 100% certain, we should check the difference between
 +       the last result, and the previous result, or
 +       
 +       relerr = (fabs((x-xprev)/fom));
 +       
 +       but this is pretty much never necessary under typical conditions.
 +       Checking numerically, it seems to lead to almost exactly the same
 +       trajectories, but there are small differences out a few decimal
 +       places in the pressure, and eventually in the v_eta, but it could
 +       save an iteration.
 +       
 +       if (fabs(*newf-x) < fabs(*newf - xprev)) { xmin = x;} else { xmin = xprev;}
 +       relerr = (fabs((*newf-xmin) / *newf));
 +    */
 +    
 +    err = fabs((iterate->f-iterate->fprev));
 +    relerr = fabs(err/fom);
 +
 +    iterate->allrelerr[iterate->iter_i] = relerr;
 +    
 +    if (iterate->iter_i > 0) 
 +    {
 +        if (debug) 
 +        {
 +            fprintf(debug,"Iterating NPT constraints: %6i %20.12f%14.6g%20.12f\n",
 +                    iterate->iter_i,fom,relerr,*newf);
 +        }
 +        
 +        if ((relerr < CONVERGEITER) || (err < CONVERGEITER) || (fom==0) || ((iterate->x == iterate->xprev) && iterate->iter_i > 1))
 +        {
++            iterate->bIterationActive = FALSE;
 +            if (debug) 
 +            {
 +                fprintf(debug,"Iterating NPT constraints: CONVERGED\n");
 +            }
 +            return TRUE;
 +        }
 +        if (iterate->iter_i > MAXITERCONST)
 +        {
 +            if (relerr < CLOSE_ENOUGH)
 +            {
 +                incycle = FALSE;
 +                for (i=1;i<CYCLEMAX;i++) {
 +                    if ((iterate->allrelerr[iterate->iter_i-(1+i)] == iterate->allrelerr[iterate->iter_i-1]) &&
 +                        (iterate->allrelerr[iterate->iter_i-(1+i)] == iterate->allrelerr[iterate->iter_i-(1+2*i)])) {
 +                        incycle = TRUE;
 +                        if (debug) 
 +                        {
 +                            fprintf(debug,"Exiting from an NPT iterating cycle of length %d\n",i);
 +                        }
 +                        break;
 +                    }
 +                }
 +                
 +                if (incycle) {
 +                    /* step 1: trapped in a numerical attractor */
 +                    /* we are trapped in a numerical attractor, and can't converge any more, and are close to the final result.
 +                       Better to give up convergence here than have the simulation die.
 +                    */
 +                    iterate->num_close++;
++                    iterate->bIterationActive = FALSE;
 +                    return TRUE;
 +                } 
 +                else 
 +                {
 +                    /* Step #2: test if we are reasonably close for other reasons, then monitor the number.  If not, die */
 +                    
 +                    /* how many close calls have we had?  If less than a few, we're OK */
 +                    if (iterate->num_close < MAX_NUMBER_CLOSE) 
 +                    {
 +                        md_print_warn(cr,fplog,"Slight numerical convergence deviation with NPT at step %d, relative error only %10.5g, likely not a problem, continuing\n",nsteps,relerr);
 +                        iterate->num_close++;
++                        iterate->bIterationActive = FALSE;
 +                        return TRUE;
 +                        /* if more than a few, check the total fraction.  If too high, die. */
 +                    } else if (iterate->num_close/(double)nsteps > FRACTION_CLOSE) {
 +                        gmx_fatal(FARGS,"Could not converge NPT constraints, too many exceptions (%.2f%% of steps)\n",100*iterate->num_close/(double)nsteps);
 +                    } 
 +                }
 +            }
 +            else 
 +            {
 +                gmx_fatal(FARGS,"Could not converge NPT constraints\n");
 +            }
 +        }
 +    }
 +    
 +    iterate->xprev = iterate->x;
 +    iterate->x = *newf;
 +    iterate->fprev = iterate->f;
 +    iterate->iter_i++;
 +    
 +    return FALSE;
 +}
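 +
 +/* A minimal, hypothetical caller sketch (evaluate_figure_of_merit() and
 + * apply() are placeholders, not GROMACS functions; the real callers live
 + * in the MD loop):
 + *
 + *     gmx_iterate_t it;
 + *     gmx_bool      bFirst = TRUE;
 + *     real          fom, newf;
 + *
 + *     gmx_iterate_init(&it, TRUE);
 + *     while (it.bIterationActive)
 + *     {
 + *         fom = evaluate_figure_of_merit();  // pressure or veta
 + *         if (done_iterating(cr, fplog, step, &it, bFirst, fom, &newf))
 + *             break;
 + *         apply(newf);
 + *         bFirst = FALSE;
 + *     }
 + */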
 +
index aa6cfad96f77e7aefa10760a013bd47e5c1e7a5b,0000000000000000000000000000000000000000..1f77c242d1b11d0c5fd69199464c81405be0996a
mode 100644,000000..100644
--- /dev/null
@@@ -1,1450 -1,0 +1,1557 @@@
-     t_lambda *fep;
 +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
 + *
 + *
 + *                This source code is part of
 + *
 + *                 G   R   O   M   A   C   S
 + *
 + *          GROningen MAchine for Chemical Simulations
 + *
 + *                        VERSION 3.2.0
 + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
 + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 + * Copyright (c) 2001-2004, The GROMACS development team,
 + * check out http://www.gromacs.org for more information.
 +
 + * This program is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU General Public License
 + * as published by the Free Software Foundation; either version 2
 + * of the License, or (at your option) any later version.
 + *
 + * If you want to redistribute modifications, please consider that
 + * scientific software is very special. Version control is crucial -
 + * bugs must be traceable. We will be happy to consider code for
 + * inclusion in the official distribution, but derived work must not
 + * be called official GROMACS. Details are found in the README & COPYING
 + * files - if they are missing, get the official version at www.gromacs.org.
 + *
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the papers on the package - you can find them in the top README file.
 + *
 + * For more info, check our website at http://www.gromacs.org
 + *
 + * And Hey:
 + * GROwing Monsters And Cloning Shrimps
 + */
 +#ifdef HAVE_CONFIG_H
 +#include <config.h>
 +#endif
 +
 +#include <string.h>
 +#include <float.h>
 +#include "typedefs.h"
 +#include "string2.h"
 +#include "mdebin.h"
 +#include "smalloc.h"
 +#include "physics.h"
 +#include "enxio.h"
 +#include "vec.h"
 +#include "disre.h"
 +#include "main.h"
 +#include "network.h"
 +#include "names.h"
 +#include "orires.h"
 +#include "constr.h"
 +#include "mtop_util.h"
 +#include "xvgr.h"
 +#include "gmxfio.h"
 +#include "macros.h"
 +#include "mdrun.h"
 +#include "mdebin_bar.h"
 +
 +
 +static const char *conrmsd_nm[] = { "Constr. rmsd", "Constr.2 rmsd" };
 +
 +static const char *boxs_nm[] = { "Box-X", "Box-Y", "Box-Z" };
 +
 +static const char *tricl_boxs_nm[] = {
 +    "Box-XX", "Box-YY", "Box-ZZ",
 +    "Box-YX", "Box-ZX", "Box-ZY"
 +};
 +
 +static const char *vol_nm[] = { "Volume" };
 +
 +static const char *dens_nm[] = {"Density" };
 +
 +static const char *pv_nm[] = {"pV" };
 +
 +static const char *enthalpy_nm[] = {"Enthalpy" };
 +
 +static const char *boxvel_nm[] = {
 +    "Box-Vel-XX", "Box-Vel-YY", "Box-Vel-ZZ",
 +    "Box-Vel-YX", "Box-Vel-ZX", "Box-Vel-ZY"
 +};
 +
 +#define NBOXS asize(boxs_nm)
 +#define NTRICLBOXS asize(tricl_boxs_nm)
 +
 +t_mdebin *init_mdebin(ener_file_t fp_ene,
 +                      const gmx_mtop_t *mtop,
 +                      const t_inputrec *ir,
 +                      FILE *fp_dhdl)
 +{
 +    const char *ener_nm[F_NRE];
 +    static const char *vir_nm[] = {
 +        "Vir-XX", "Vir-XY", "Vir-XZ",
 +        "Vir-YX", "Vir-YY", "Vir-YZ",
 +        "Vir-ZX", "Vir-ZY", "Vir-ZZ"
 +    };
 +    static const char *sv_nm[] = {
 +        "ShakeVir-XX", "ShakeVir-XY", "ShakeVir-XZ",
 +        "ShakeVir-YX", "ShakeVir-YY", "ShakeVir-YZ",
 +        "ShakeVir-ZX", "ShakeVir-ZY", "ShakeVir-ZZ"
 +    };
 +    static const char *fv_nm[] = {
 +        "ForceVir-XX", "ForceVir-XY", "ForceVir-XZ",
 +        "ForceVir-YX", "ForceVir-YY", "ForceVir-YZ",
 +        "ForceVir-ZX", "ForceVir-ZY", "ForceVir-ZZ"
 +    };
 +    static const char *pres_nm[] = {
 +        "Pres-XX","Pres-XY","Pres-XZ",
 +        "Pres-YX","Pres-YY","Pres-YZ",
 +        "Pres-ZX","Pres-ZY","Pres-ZZ"
 +    };
 +    static const char *surft_nm[] = {
 +        "#Surf*SurfTen"
 +    };
 +    static const char *mu_nm[] = {
 +        "Mu-X", "Mu-Y", "Mu-Z"
 +    };
 +    static const char *vcos_nm[] = {
 +        "2CosZ*Vel-X"
 +    };
 +    static const char *visc_nm[] = {
 +        "1/Viscosity"
 +    };
 +    static const char *baro_nm[] = {
 +        "Barostat"
 +    };
 +
 +    char     **grpnms;
 +    const gmx_groups_t *groups;
 +    char     **gnm;
 +    char     buf[256];
 +    const char     *bufi;
 +    t_mdebin *md;
 +    int      i,j,ni,nj,n,nh,k,kk,ncon,nset;
 +    gmx_bool     bBHAM,bNoseHoover,b14;
 +
 +    snew(md,1);
 +
 +    md->bVir=TRUE;
 +    md->bPress=TRUE;
 +    md->bSurft=TRUE;
 +    md->bMu=TRUE;
 +
 +    if (EI_DYNAMICS(ir->eI))
 +    {
 +        md->delta_t = ir->delta_t;
 +    }
 +    else
 +    {
 +        md->delta_t = 0;
 +    }
 +
 +    groups = &mtop->groups;
 +
 +    bBHAM = (mtop->ffparams.functype[0] == F_BHAM);
 +    b14   = (gmx_mtop_ftype_count(mtop,F_LJ14) > 0 ||
 +             gmx_mtop_ftype_count(mtop,F_LJC14_Q) > 0);
 +
 +    ncon = gmx_mtop_ftype_count(mtop,F_CONSTR);
 +    nset = gmx_mtop_ftype_count(mtop,F_SETTLE);
 +    md->bConstr    = (ncon > 0 || nset > 0);
 +    md->bConstrVir = FALSE;
 +    if (md->bConstr) {
 +        if (ncon > 0 && ir->eConstrAlg == econtLINCS) {
 +            if (ir->eI == eiSD2)
 +                md->nCrmsd = 2;
 +            else
 +                md->nCrmsd = 1;
 +        }
 +        md->bConstrVir = (getenv("GMX_CONSTRAINTVIR") != NULL);
 +    } else {
 +        md->nCrmsd = 0;
 +    }
 +
 +    /* Energy monitoring */
 +    for(i=0;i<egNR;i++)
 +    {
 +        md->bEInd[i]=FALSE;
 +    }
 +
++    /* Even though the OpenMM build has moved to contrib, it's not
++     * practical to move/remove this code fragment, because of the
++     * fundamental mess that is the GROMACS library structure. */
 +#ifndef GMX_OPENMM
 +    for(i=0; i<F_NRE; i++)
 +    {
 +        md->bEner[i] = FALSE;
 +        if (i == F_LJ)
 +            md->bEner[i] = !bBHAM;
 +        else if (i == F_BHAM)
 +            md->bEner[i] = bBHAM;
 +        else if (i == F_EQM)
 +            md->bEner[i] = ir->bQMMM;
 +        else if (i == F_COUL_LR)
 +            md->bEner[i] = (ir->rcoulomb > ir->rlist);
 +        else if (i == F_LJ_LR)
 +            md->bEner[i] = (!bBHAM && ir->rvdw > ir->rlist);
 +        else if (i == F_BHAM_LR)
 +            md->bEner[i] = (bBHAM && ir->rvdw > ir->rlist);
 +        else if (i == F_RF_EXCL)
 +            md->bEner[i] = (EEL_RF(ir->coulombtype) && ir->coulombtype != eelRF_NEC && ir->cutoff_scheme == ecutsGROUP);
 +        else if (i == F_COUL_RECIP)
 +            md->bEner[i] = EEL_FULL(ir->coulombtype);
 +        else if (i == F_LJ14)
 +            md->bEner[i] = b14;
 +        else if (i == F_COUL14)
 +            md->bEner[i] = b14;
 +        else if (i == F_LJC14_Q || i == F_LJC_PAIRS_NB)
 +            md->bEner[i] = FALSE;
 +        else if ((i == F_DVDL_COUL && ir->fepvals->separate_dvdl[efptCOUL]) ||
 +                 (i == F_DVDL_VDW  && ir->fepvals->separate_dvdl[efptVDW]) ||
 +                 (i == F_DVDL_BONDED && ir->fepvals->separate_dvdl[efptBONDED]) ||
 +                 (i == F_DVDL_RESTRAINT && ir->fepvals->separate_dvdl[efptRESTRAINT]) ||
 +                 (i == F_DKDL && ir->fepvals->separate_dvdl[efptMASS]) ||
 +                 (i == F_DVDL && ir->fepvals->separate_dvdl[efptFEP]))
 +            md->bEner[i] = (ir->efep != efepNO);
 +        else if ((interaction_function[i].flags & IF_VSITE) ||
 +                 (i == F_CONSTR) || (i == F_CONSTRNC) || (i == F_SETTLE))
 +            md->bEner[i] = FALSE;
 +        else if ((i == F_COUL_SR) || (i == F_EPOT) || (i == F_PRES)  || (i==F_EQM))
 +            md->bEner[i] = TRUE;
 +        else if ((i == F_GBPOL) && ir->implicit_solvent==eisGBSA)
 +            md->bEner[i] = TRUE;
 +        else if ((i == F_NPSOLVATION) && ir->implicit_solvent==eisGBSA && (ir->sa_algorithm != esaNO))
 +            md->bEner[i] = TRUE;
 +        else if ((i == F_GB12) || (i == F_GB13) || (i == F_GB14))
 +            md->bEner[i] = FALSE;
 +        else if ((i == F_ETOT) || (i == F_EKIN) || (i == F_TEMP))
 +            md->bEner[i] = EI_DYNAMICS(ir->eI);
 +        else if (i == F_DISPCORR || i == F_PDISPCORR)
 +            md->bEner[i] = (ir->eDispCorr != edispcNO);
 +        else if (i == F_DISRESVIOL)
 +            md->bEner[i] = (gmx_mtop_ftype_count(mtop,F_DISRES) > 0);
 +        else if (i == F_ORIRESDEV)
 +            md->bEner[i] = (gmx_mtop_ftype_count(mtop,F_ORIRES) > 0);
 +        else if (i == F_CONNBONDS)
 +            md->bEner[i] = FALSE;
 +        else if (i == F_COM_PULL)
 +            md->bEner[i] = (ir->ePull == epullUMBRELLA || ir->ePull == epullCONST_F || ir->bRot);
 +        else if (i == F_ECONSERVED)
 +            md->bEner[i] = ((ir->etc == etcNOSEHOOVER || ir->etc == etcVRESCALE) &&
 +                            (ir->epc == epcNO || ir->epc==epcMTTK));
 +        else
 +            md->bEner[i] = (gmx_mtop_ftype_count(mtop,i) > 0);
 +    }
 +#else
 +    /* OpenMM always produces only the following 4 energy terms */
 +    md->bEner[F_EPOT] = TRUE;
 +    md->bEner[F_EKIN] = TRUE;
 +    md->bEner[F_ETOT] = TRUE;
 +    md->bEner[F_TEMP] = TRUE;
 +#endif
 +
 +    /* For AdResS simulations, most energy terms are not meaningful, and thus disabled */
 +    if (ir->bAdress && !debug) {
 +        for (i = 0; i < F_NRE; i++) {
 +            md->bEner[i] = FALSE;
 +            if(i == F_EKIN){ md->bEner[i] = TRUE;}
 +            if(i == F_TEMP){ md->bEner[i] = TRUE;}
 +        }
 +        md->bVir=FALSE;
 +        md->bPress=FALSE;
 +        md->bSurft=FALSE;
 +        md->bMu=FALSE;
 +    }
 +
 +    md->f_nre=0;
 +    for(i=0; i<F_NRE; i++)
 +    {
 +        if (md->bEner[i])
 +        {
 +            ener_nm[md->f_nre]=interaction_function[i].longname;
 +            md->f_nre++;
 +        }
 +    }
 +
 +    md->epc = ir->epc;
 +    md->bDiagPres = !TRICLINIC(ir->ref_p);
 +    md->ref_p = (ir->ref_p[XX][XX]+ir->ref_p[YY][YY]+ir->ref_p[ZZ][ZZ])/DIM;
 +    md->bTricl = TRICLINIC(ir->compress) || TRICLINIC(ir->deform);
 +    md->bDynBox = DYNAMIC_BOX(*ir);
 +    md->etc = ir->etc;
 +    md->bNHC_trotter = IR_NVT_TROTTER(ir);
 +    md->bPrintNHChains = ir->bPrintNHChains;
 +    md->bMTTK = (IR_NPT_TROTTER(ir) || IR_NPH_TROTTER(ir));
 +    md->bMu = NEED_MUTOT(*ir);
 +
 +    md->ebin  = mk_ebin();
 +    /* Pass NULL for unit to let get_ebin_space determine the units
 +     * for interaction_function[i].longname
 +     */
 +    md->ie    = get_ebin_space(md->ebin,md->f_nre,ener_nm,NULL);
 +    if (md->nCrmsd)
 +    {
 +        /* This should be called directly after the call for md->ie,
 +         * such that md->iconrmsd follows directly in the list.
 +         */
 +        md->iconrmsd = get_ebin_space(md->ebin,md->nCrmsd,conrmsd_nm,"");
 +    }
 +    if (md->bDynBox)
 +    {
 +        md->ib    = get_ebin_space(md->ebin,
 +                                   md->bTricl ? NTRICLBOXS : NBOXS,
 +                                   md->bTricl ? tricl_boxs_nm : boxs_nm,
 +                                   unit_length);
 +        md->ivol  = get_ebin_space(md->ebin, 1, vol_nm,  unit_volume);
 +        md->idens = get_ebin_space(md->ebin, 1, dens_nm, unit_density_SI);
 +        if (md->bDiagPres)
 +        {
 +            md->ipv   = get_ebin_space(md->ebin, 1, pv_nm,   unit_energy);
 +            md->ienthalpy = get_ebin_space(md->ebin, 1, enthalpy_nm,   unit_energy);
 +        }
 +    }
 +    if (md->bConstrVir)
 +    {
 +        md->isvir = get_ebin_space(md->ebin,asize(sv_nm),sv_nm,unit_energy);
 +        md->ifvir = get_ebin_space(md->ebin,asize(fv_nm),fv_nm,unit_energy);
 +    }
 +    if (md->bVir)
 +        md->ivir   = get_ebin_space(md->ebin,asize(vir_nm),vir_nm,unit_energy);
 +    if (md->bPress)
 +        md->ipres  = get_ebin_space(md->ebin,asize(pres_nm),pres_nm,unit_pres_bar);
 +    if (md->bSurft)
 +        md->isurft = get_ebin_space(md->ebin,asize(surft_nm),surft_nm,
 +                                unit_surft_bar);
 +    if (md->epc == epcPARRINELLORAHMAN || md->epc == epcMTTK)
 +    {
 +        md->ipc = get_ebin_space(md->ebin,md->bTricl ? 6 : 3,
 +                                 boxvel_nm,unit_vel);
 +    }
 +    if (md->bMu)
 +    {
 +        md->imu    = get_ebin_space(md->ebin,asize(mu_nm),mu_nm,unit_dipole_D);
 +    }
 +    if (ir->cos_accel != 0)
 +    {
 +        md->ivcos = get_ebin_space(md->ebin,asize(vcos_nm),vcos_nm,unit_vel);
 +        md->ivisc = get_ebin_space(md->ebin,asize(visc_nm),visc_nm,
 +                                   unit_invvisc_SI);
 +    }
 +
 +    /* Energy monitoring */
 +    for(i=0;i<egNR;i++)
 +    {
 +        md->bEInd[i] = FALSE;
 +    }
 +    md->bEInd[egCOULSR] = TRUE;
 +    md->bEInd[egLJSR  ] = TRUE;
 +
 +    if (ir->rcoulomb > ir->rlist)
 +    {
 +        md->bEInd[egCOULLR] = TRUE;
 +    }
 +    if (!bBHAM)
 +    {
 +        if (ir->rvdw > ir->rlist)
 +        {
 +            md->bEInd[egLJLR]   = TRUE;
 +        }
 +    }
 +    else
 +    {
 +        md->bEInd[egLJSR]   = FALSE;
 +        md->bEInd[egBHAMSR] = TRUE;
 +        if (ir->rvdw > ir->rlist)
 +        {
 +            md->bEInd[egBHAMLR]   = TRUE;
 +        }
 +    }
 +    if (b14)
 +    {
 +        md->bEInd[egLJ14] = TRUE;
 +        md->bEInd[egCOUL14] = TRUE;
 +    }
 +    md->nEc=0;
 +    for(i=0; (i<egNR); i++)
 +    {
 +        if (md->bEInd[i])
 +        {
 +            md->nEc++;
 +        }
 +    }
 +
 +    n=groups->grps[egcENER].nr;
 +    /* For AdResS simulations, most energy terms are not meaningful, and thus disabled */
 +    if (!ir->bAdress){
 +        /*standard simulation*/
 +        md->nEg=n;
 +        md->nE=(n*(n+1))/2;
 +    }
 +    else if (!debug) {
 +        /*AdResS simulation*/
 +       md->nU=0;
 +       md->nEg=0;
 +       md->nE=0;
 +       md->nEc=0;
 +       md->isvir=FALSE;
 +    }
 +    snew(md->igrp,md->nE);
 +    if (md->nE > 1)
 +    {
 +        n=0;
 +        snew(gnm,md->nEc);
 +        for(k=0; (k<md->nEc); k++)
 +        {
 +            snew(gnm[k],STRLEN);
 +        }
 +        for(i=0; (i<groups->grps[egcENER].nr); i++)
 +        {
 +            ni=groups->grps[egcENER].nm_ind[i];
 +            for(j=i; (j<groups->grps[egcENER].nr); j++)
 +            {
 +                nj=groups->grps[egcENER].nm_ind[j];
 +                for(k=kk=0; (k<egNR); k++)
 +                {
 +                    if (md->bEInd[k])
 +                    {
 +                        sprintf(gnm[kk],"%s:%s-%s",egrp_nm[k],
 +                                *(groups->grpname[ni]),*(groups->grpname[nj]));
 +                        kk++;
 +                    }
 +                }
 +                md->igrp[n]=get_ebin_space(md->ebin,md->nEc,
 +                                           (const char **)gnm,unit_energy);
 +                n++;
 +            }
 +        }
 +        for(k=0; (k<md->nEc); k++)
 +        {
 +            sfree(gnm[k]);
 +        }
 +        sfree(gnm);
 +
 +        if (n != md->nE)
 +        {
 +            gmx_incons("Number of energy terms wrong");
 +        }
 +    }
 +
 +    md->nTC=groups->grps[egcTC].nr;
 +    md->nNHC = ir->opts.nhchainlength; /* shorthand for number of NH chains */
 +    if (md->bMTTK)
 +    {
 +        md->nTCP = 1;  /* assume only one possible coupling system for barostat
 +                          for now */
 +    }
 +    else
 +    {
 +        md->nTCP = 0;
 +    }
 +    if (md->etc == etcNOSEHOOVER)
 +    {
 +        if (md->bNHC_trotter)
 +        {
 +            md->mde_n = 2*md->nNHC*md->nTC;
 +        }
 +        else
 +        {
 +            md->mde_n = 2*md->nTC;
 +        }
 +        if (md->epc == epcMTTK)
 +        {
 +            md->mdeb_n = 2*md->nNHC*md->nTCP;
 +        }
 +    } else {
 +        md->mde_n = md->nTC;
 +        md->mdeb_n = 0;
 +    }
 +
 +    snew(md->tmp_r,md->mde_n);
 +    snew(md->tmp_v,md->mde_n);
 +    snew(md->grpnms,md->mde_n);
 +    grpnms = md->grpnms;
 +
 +    for(i=0; (i<md->nTC); i++)
 +    {
 +        ni=groups->grps[egcTC].nm_ind[i];
 +        sprintf(buf,"T-%s",*(groups->grpname[ni]));
 +        grpnms[i]=strdup(buf);
 +    }
 +    md->itemp=get_ebin_space(md->ebin,md->nTC,(const char **)grpnms,
 +                             unit_temp_K);
 +
 +    if (md->etc == etcNOSEHOOVER)
 +    {
 +        if (md->bPrintNHChains)
 +        {
 +            if (md->bNHC_trotter)
 +            {
 +                for(i=0; (i<md->nTC); i++)
 +                {
 +                    ni=groups->grps[egcTC].nm_ind[i];
 +                    bufi = *(groups->grpname[ni]);
 +                    for(j=0; (j<md->nNHC); j++)
 +                    {
 +                        sprintf(buf,"Xi-%d-%s",j,bufi);
 +                        grpnms[2*(i*md->nNHC+j)]=strdup(buf);
 +                        sprintf(buf,"vXi-%d-%s",j,bufi);
 +                        grpnms[2*(i*md->nNHC+j)+1]=strdup(buf);
 +                    }
 +                }
 +                md->itc=get_ebin_space(md->ebin,md->mde_n,
 +                                       (const char **)grpnms,unit_invtime);
 +                if (md->bMTTK)
 +                {
 +                    for(i=0; (i<md->nTCP); i++)
 +                    {
 +                        bufi = baro_nm[0];  /* All barostat DOF's together for now. */
 +                        for(j=0; (j<md->nNHC); j++)
 +                        {
 +                            sprintf(buf,"Xi-%d-%s",j,bufi);
 +                            grpnms[2*(i*md->nNHC+j)]=strdup(buf);
 +                            sprintf(buf,"vXi-%d-%s",j,bufi);
 +                            grpnms[2*(i*md->nNHC+j)+1]=strdup(buf);
 +                        }
 +                    }
 +                    md->itcb=get_ebin_space(md->ebin,md->mdeb_n,
 +                                            (const char **)grpnms,unit_invtime);
 +                }
 +            }
 +            else
 +            {
 +                for(i=0; (i<md->nTC); i++)
 +                {
 +                    ni=groups->grps[egcTC].nm_ind[i];
 +                    bufi = *(groups->grpname[ni]);
 +                    sprintf(buf,"Xi-%s",bufi);
 +                    grpnms[2*i]=strdup(buf);
 +                    sprintf(buf,"vXi-%s",bufi);
 +                    grpnms[2*i+1]=strdup(buf);
 +                }
 +                md->itc=get_ebin_space(md->ebin,md->mde_n,
 +                                       (const char **)grpnms,unit_invtime);
 +            }
 +        }
 +    }
 +    else if (md->etc == etcBERENDSEN || md->etc == etcYES ||
 +             md->etc == etcVRESCALE)
 +    {
 +        for(i=0; (i<md->nTC); i++)
 +        {
 +            ni=groups->grps[egcTC].nm_ind[i];
 +            sprintf(buf,"Lamb-%s",*(groups->grpname[ni]));
 +            grpnms[i]=strdup(buf);
 +        }
 +        md->itc=get_ebin_space(md->ebin,md->mde_n,(const char **)grpnms,"");
 +    }
 +
 +    sfree(grpnms);
 +
 +
 +    md->nU=groups->grps[egcACC].nr;
 +    if (md->nU > 1)
 +    {
 +        snew(grpnms,3*md->nU);
 +        for(i=0; (i<md->nU); i++)
 +        {
 +            ni=groups->grps[egcACC].nm_ind[i];
 +            sprintf(buf,"Ux-%s",*(groups->grpname[ni]));
 +            grpnms[3*i+XX]=strdup(buf);
 +            sprintf(buf,"Uy-%s",*(groups->grpname[ni]));
 +            grpnms[3*i+YY]=strdup(buf);
 +            sprintf(buf,"Uz-%s",*(groups->grpname[ni]));
 +            grpnms[3*i+ZZ]=strdup(buf);
 +        }
 +        md->iu=get_ebin_space(md->ebin,3*md->nU,(const char **)grpnms,unit_vel);
 +        sfree(grpnms);
 +    }
 +
 +    if ( fp_ene )
 +    {
 +        do_enxnms(fp_ene,&md->ebin->nener,&md->ebin->enm);
 +    }
 +
 +    md->print_grpnms=NULL;
 +
 +    /* check whether we're going to write dh histograms */
 +    md->dhc=NULL;
 +    if (ir->fepvals->separate_dhdl_file == esepdhdlfileNO )
 +    {
 +        /* Currently dh histograms are only written with dynamics */
 +        if (EI_DYNAMICS(ir->eI))
 +        {
 +            snew(md->dhc, 1);
 +
 +            mde_delta_h_coll_init(md->dhc, ir);
 +        }
 +        md->fp_dhdl = NULL;
++        snew(md->dE,ir->fepvals->n_lambda);
 +    }
 +    else
 +    {
 +        md->fp_dhdl = fp_dhdl;
++        snew(md->dE,ir->fepvals->n_lambda);
 +    }
 +    if (ir->bSimTemp) {
 +        int i;
 +        snew(md->temperatures,ir->fepvals->n_lambda);
 +        for (i=0;i<ir->fepvals->n_lambda;i++)
 +        {
 +            md->temperatures[i] = ir->simtempvals->temperatures[i];
 +        }
 +    }
 +    return md;
 +}
 +
++/* print a lambda vector to a string
++   fep = the inputrec's FEP input data
++   i = the index of the lambda vector
++   get_native_lambda = whether to print the native lambda
++   get_names = whether to print the names rather than the values
++   str = the pre-allocated string buffer to print to. */
++static void print_lambda_vector(t_lambda *fep, int i,
++                                gmx_bool get_native_lambda, gmx_bool get_names,
++                                char *str)
++{
++    size_t nps=0, np;
++    int j,k=0;
++    int Nsep=0;
++
++    for (j=0;j<efptNR;j++)
++    {
++        if (fep->separate_dvdl[j])
++            Nsep ++;
++    }
++    str[0]=0; /* reset the string */
++    if (Nsep > 1)
++    {
++        str += sprintf(str, "("); /* set the opening parenthesis*/
++    }
++    for (j=0;j<efptNR;j++)
++    {
++        if (fep->separate_dvdl[j])
++        {
++            double lam;
++            if (!get_names)
++            {
++                if (get_native_lambda && fep->init_lambda >= 0)
++                {
++                    str += sprintf(str,"%.4f", fep->init_lambda);
++                }
++                else
++                {
++                    str += sprintf(str,"%.4f", fep->all_lambda[j][i]);
++                }
++            }
++            else
++            {
++                str += sprintf(str,"%s", efpt_singular_names[j]);
++            }
++            /* print comma for the next item */
++            if (k<Nsep-1)
++            {
++                str += sprintf(str,", ");
++            }
++            k++;
++        }
++    }
++    if (Nsep > 1)
++    {
++        /* and add the closing parenthesis */
++        str += sprintf(str, ")");
++    }
++}
++
++
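/* Aside: a minimal standalone sketch of the sprintf-pointer idiom that
 * print_lambda_vector() above uses to build a "(v1, v2, ...)" string.
 * The component values are hypothetical; this is not GROMACS API. */
#include <stdio.h>

int main(void)
{
    const double values[] = {0.25, 0.75};  /* e.g. coul and vdw lambdas */
    const int    nsep     = 2;
    char         str[64], *p = str;
    int          k;

    p += sprintf(p, "(");                  /* opening parenthesis */
    for (k = 0; k < nsep; k++)
    {
        p += sprintf(p, "%.4f%s", values[k], k < nsep - 1 ? ", " : "");
    }
    sprintf(p, ")");                       /* closing parenthesis */
    printf("%s\n", str);                   /* prints: (0.2500, 0.7500) */
    return 0;
}
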
 +extern FILE *open_dhdl(const char *filename,const t_inputrec *ir,
 +                       const output_env_t oenv)
 +{
 +    FILE *fp;
 +    const char *dhdl="dH/d\\lambda",*deltag="\\DeltaH",*lambda="\\lambda",
 +        *lambdastate="\\lambda state",*remain="remaining";
 +    char title[STRLEN],label_x[STRLEN],label_y[STRLEN];
 +    int  i,np,nps,nsets,nsets_de,nsetsbegin;
-     char buf[STRLEN];
++    int n_lambda_terms=0;
++    t_lambda *fep=ir->fepvals; /* for simplicity */
++    t_expanded *expand=ir->expandedvals;
 +    char **setname;
-     /* for simplicity */
-     fep = ir->fepvals;
++    char buf[STRLEN], lambda_vec_str[STRLEN], lambda_name_str[STRLEN];
 +    int bufplace=0;
 +
 +    int nsets_dhdl = 0;
 +    int s = 0;
 +    int nsetsextend;
++    gmx_bool write_pV = FALSE;
 +
-         if (fep->n_lambda == 0)
++    /* count the number of different lambda terms */
++    for (i=0;i<efptNR;i++)
++    {
++        if (fep->separate_dvdl[i])
++        {
++            n_lambda_terms++;
++        }
++    }
 +
 +    if (fep->n_lambda == 0)
 +    {
 +        sprintf(title,"%s",dhdl);
 +        sprintf(label_x,"Time (ps)");
 +        sprintf(label_y,"%s (%s %s)",
 +                dhdl,unit_energy,"[\\lambda]\\S-1\\N");
 +    }
 +    else
 +    {
 +        sprintf(title,"%s and %s",dhdl,deltag);
 +        sprintf(label_x,"Time (ps)");
 +        sprintf(label_y,"%s and %s (%s %s)",
 +                dhdl,deltag,unit_energy,"[\\8l\\4]\\S-1\\N");
 +    }
 +    fp = gmx_fio_fopen(filename,"w+");
 +    xvgr_header(fp,title,label_x,label_y,exvggtXNY,oenv);
 +
 +    if (!(ir->bSimTemp))
 +    {
 +        bufplace = sprintf(buf,"T = %g (K) ",
 +                ir->opts.ref_t[0]);
 +    }
 +    if (ir->efep != efepSLOWGROWTH)
 +    {
-             sprintf(&(buf[bufplace]),"%s = %g",
-                     lambda,fep->init_lambda);
++        if ( (fep->init_lambda >= 0)  && (n_lambda_terms == 1 ))
 +        {
-             sprintf(&(buf[bufplace]),"%s = %d",
-                     lambdastate,fep->init_fep_state);
++            /* compatibility output */
++            sprintf(&(buf[bufplace]),"%s = %.4f", lambda,fep->init_lambda);
 +        }
 +        else
 +        {
-     for (i=0;i<efptNR;i++)
++            print_lambda_vector(fep, fep->init_fep_state, TRUE, FALSE,
++                                lambda_vec_str);
++            print_lambda_vector(fep, fep->init_fep_state, TRUE, TRUE,
++                                lambda_name_str);
++            sprintf(&(buf[bufplace]),"%s %d: %s = %s",
++                    lambdastate,fep->init_fep_state,
++                    lambda_name_str, lambda_vec_str);
 +        }
 +    }
 +    xvgr_subtitle(fp,buf,oenv);
 +
-         if (fep->separate_dvdl[i]) {nsets_dhdl++;}
++
++    nsets_dhdl=0;
++    if (fep->dhdl_derivatives == edhdlderivativesYES)
 +    {
++        nsets_dhdl = n_lambda_terms;
 +    }
-     nsets_de = fep->n_lambda;
 +    /* count the number of delta_g states */
-     if (fep->n_lambda>0 && ir->bExpanded)
++    nsets_de = fep->lambda_stop_n - fep->lambda_start_n;
 +
 +    nsets = nsets_dhdl + nsets_de; /* dhdl + fep differences */
 +
-     if ((ir->epc!=epcNO) && (fep->n_lambda>0))
++    if (fep->n_lambda>0 && (expand->elmcmove > elmcmoveNO))
 +    {
 +        nsets += 1;   /*add fep state for expanded ensemble */
 +    }
 +
 +    if (fep->bPrintEnergy)
 +    {
 +        nsets += 1;  /* add energy to the dhdl as well */
 +    }
 +
 +    nsetsextend = nsets;
-         nsetsextend += 1; /* for PV term, other terms possible if required for the reduced potential (only needed with foreign lambda) */
++    if ((ir->epc!=epcNO) && (fep->n_lambda>0) && (fep->init_lambda < 0))
 +    {
-     if (ir->bExpanded)
++        nsetsextend += 1; /* for PV term, other terms possible if required for
++                             the reduced potential (only needed with foreign
++                             lambda, and only output when init_lambda is not
++                             set in order to maintain compatibility of the
++                             dhdl.xvg file) */
++        write_pV = TRUE;
 +    }
 +    snew(setname,nsetsextend);
 +
-     for (i=0;i<efptNR;i++)
++    if (expand->elmcmove > elmcmoveNO)
 +    {
 +        /* state for the fep_vals, if we have alchemical sampling */
 +        sprintf(buf,"%s","Thermodynamic state");
 +        setname[s] = strdup(buf);
 +        s+=1;
 +    }
 +
 +    if (fep->bPrintEnergy)
 +    {
 +        sprintf(buf,"%s (%s)","Energy",unit_energy);
 +        setname[s] = strdup(buf);
 +        s+=1;
 +    }
 +
-         if (fep->separate_dvdl[i]) {
-             sprintf(buf,"%s (%s)",dhdl,efpt_names[i]);
-             setname[s] = strdup(buf);
-             s+=1;
++    if (fep->dhdl_derivatives == edhdlderivativesYES)
 +    {
-         if (ir->bExpanded) {
++        for (i=0;i<efptNR;i++)
++        {
++            if (fep->separate_dvdl[i]) {
++
++                if ( (fep->init_lambda >= 0)  && (n_lambda_terms == 1 ))
++                {
++                    /* compatibility output */
++                    sprintf(buf,"%s %s %.4f",dhdl,lambda, fep->init_lambda);
++                }
++                else
++                {
++                    double lam=fep->init_lambda;
++                    if (fep->init_lambda < 0)
++                    {
++                        lam=fep->all_lambda[i][fep->init_fep_state];
++                    }
++                    sprintf(buf,"%s %s = %.4f",dhdl, efpt_singular_names[i],
++                            lam);
++                }
++                setname[s] = strdup(buf);
++                s+=1;
++            }
 +        }
 +    }
 +
 +    if (fep->n_lambda > 0)
 +    {
 +        /* g_bar has to determine the lambda values used in this simulation
 +         * from this xvg legend.
 +         */
 +
-         for(s=nsetsbegin; s<nsets; s++)
++        if (expand->elmcmove > elmcmoveNO) {
 +            nsetsbegin = 1;  /* for including the expanded ensemble */
 +        } else {
 +            nsetsbegin = 0;
 +        }
 +
 +        if (fep->bPrintEnergy)
 +        {
 +            nsetsbegin += 1;
 +        }
 +        nsetsbegin += nsets_dhdl;
 +
-             nps = sprintf(buf,"%s %s (",deltag,lambda);
-             for (i=0;i<efptNR;i++)
++        for(i=fep->lambda_start_n; i<fep->lambda_stop_n; i++)
 +        {
-                 if (fep->separate_dvdl[i])
-                 {
-                     np = sprintf(&buf[nps],"%g,",fep->all_lambda[i][s-(nsetsbegin)]);
-                     nps += np;
-                 }
++            print_lambda_vector(fep, i, FALSE, FALSE, lambda_vec_str);
++            if ( (fep->init_lambda >= 0)  && (n_lambda_terms == 1 ))
 +            {
-             if (ir->bSimTemp)
++                /* for compatible dhdl.xvg files */
++                nps = sprintf(buf,"%s %s %s",deltag,lambda, lambda_vec_str);
 +            }
-                 /* print the temperature for this state if doing simulated annealing */
-                 sprintf(&buf[nps],"T = %g (%s))",ir->simtempvals->temperatures[s-(nsetsbegin)],unit_temp_K);
++            else
 +            {
-             else
++                nps = sprintf(buf,"%s %s to %s",deltag,lambda, lambda_vec_str);
 +            }
-                 sprintf(&buf[nps-1],")");  /* -1 to overwrite the last comma */
++
++            if (ir->bSimTemp)
 +            {
-         if (ir->epc!=epcNO) {
++                /* print the temperature for this state if doing simulated tempering */
++                sprintf(&buf[nps],"T = %g (%s)",
++                        ir->simtempvals->temperatures[s-(nsetsbegin)],
++                        unit_temp_K);
 +            }
 +            setname[s] = strdup(buf);
++            s++;
 +        }
-             setname[nsetsextend-1] = strdup(buf);  /* the first entry after nsets */
++        if (write_pV) {
 +            np = sprintf(buf,"pV (%s)",unit_energy);
-     double *dE=NULL;
++            setname[nsetsextend-1] = strdup(buf);  /* the first entry after
++                                                      nsets */
 +        }
 +
 +        xvgr_legend(fp,nsetsextend,(const char **)setname,oenv);
 +
 +        for(s=0; s<nsetsextend; s++)
 +        {
 +            sfree(setname[s]);
 +        }
 +        sfree(setname);
 +    }
 +
 +    return fp;
 +}
 +
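/* Aside: a sketch of the per-state legend labels that open_dhdl() above
 * writes for g_bar to parse from the xvg header. The state and lambda
 * values are hypothetical. */
#include <stdio.h>

int main(void)
{
    const char *deltag = "\\DeltaH", *lambda = "\\lambda";
    const char *lambda_vec_str = "(0.5000, 1.0000)"; /* as built by print_lambda_vector */
    char        buf[128];

    /* mirrors the non-compatibility branch in open_dhdl() above */
    sprintf(buf, "%s %s to %s", deltag, lambda, lambda_vec_str);
    printf("%s\n", buf);  /* prints: \DeltaH \lambda to (0.5000, 1.0000) */
    return 0;
}
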
 +static void copy_energy(t_mdebin *md, real e[],real ecpy[])
 +{
 +    int i,j;
 +
 +    for(i=j=0; (i<F_NRE); i++)
 +        if (md->bEner[i])
 +            ecpy[j++] = e[i];
 +    if (j != md->f_nre)
 +        gmx_incons("Number of energy terms wrong");
 +}
 +
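/* Aside: a sketch of the bEner-driven packing that copy_energy() above
 * performs -- only the enabled terms survive, preserving their order.
 * The flags and energies below are hypothetical. */
#include <stdio.h>

int main(void)
{
    int    bEner[5] = {1, 0, 1, 1, 0};
    double e[5]     = {10.0, 20.0, 30.0, 40.0, 50.0};
    double ecpy[5];
    int    i, j = 0;

    for (i = 0; i < 5; i++)
        if (bEner[i])
            ecpy[j++] = e[i];
    printf("%d terms kept: %g %g %g\n", j, ecpy[0], ecpy[1], ecpy[2]);
    return 0;
}
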
 +void upd_mdebin(t_mdebin *md,
 +                gmx_bool bDoDHDL,
 +                gmx_bool bSum,
 +                double time,
 +                real tmass,
 +                gmx_enerdata_t *enerd,
 +                t_state *state,
 +                t_lambda *fep,
 +                t_expanded *expand,
 +                matrix  box,
 +                tensor svir,
 +                tensor fvir,
 +                tensor vir,
 +                tensor pres,
 +                gmx_ekindata_t *ekind,
 +                rvec mu_tot,
 +                gmx_constr_t constr)
 +{
 +    int    i,j,k,kk,m,n,gid;
 +    real   crmsd[2],tmp6[6];
 +    real   bs[NTRICLBOXS],vol,dens,pv,enthalpy;
 +    real   eee[egNR];
 +    real   ecopy[F_NRE];
 +    double store_dhdl[efptNR];
-     if ((md->fp_dhdl || md->dhc) && bDoDHDL && (enerd->n_lambda > 0))
 +    real   store_energy=0;
 +    real   tmp;
 +
 +    /* Do NOT use the box in the state variable, but the separate box provided
 +     * as an argument. This is because we sometimes need to write the box from
 +     * the last timestep to match the trajectory frames.
 +     */
 +    copy_energy(md, enerd->term,ecopy);
 +    add_ebin(md->ebin,md->ie,md->f_nre,ecopy,bSum);
 +    if (md->nCrmsd)
 +    {
 +        crmsd[0] = constr_rmsd(constr,FALSE);
 +        if (md->nCrmsd > 1)
 +        {
 +            crmsd[1] = constr_rmsd(constr,TRUE);
 +        }
 +        add_ebin(md->ebin,md->iconrmsd,md->nCrmsd,crmsd,FALSE);
 +    }
 +    if (md->bDynBox)
 +    {
 +        int nboxs;
 +        if(md->bTricl)
 +        {
 +            bs[0] = box[XX][XX];
 +            bs[1] = box[YY][YY];
 +            bs[2] = box[ZZ][ZZ];
 +            bs[3] = box[YY][XX];
 +            bs[4] = box[ZZ][XX];
 +            bs[5] = box[ZZ][YY];
 +            nboxs=NTRICLBOXS;
 +        }
 +        else
 +        {
 +            bs[0] = box[XX][XX];
 +            bs[1] = box[YY][YY];
 +            bs[2] = box[ZZ][ZZ];
 +            nboxs=NBOXS;
 +        }
 +        vol  = box[XX][XX]*box[YY][YY]*box[ZZ][ZZ];
 +        dens = (tmass*AMU)/(vol*NANO*NANO*NANO);
 +        add_ebin(md->ebin,md->ib   ,nboxs,bs   ,bSum);
 +        add_ebin(md->ebin,md->ivol ,1    ,&vol ,bSum);
 +        add_ebin(md->ebin,md->idens,1    ,&dens,bSum);
 +
 +        if (md->bDiagPres)
 +        {
 +            /* This is pV (in kJ/mol).  The pressure is the reference pressure,
 +               not the instantaneous pressure */
 +            pv = vol*md->ref_p/PRESFAC;
 +
 +            add_ebin(md->ebin,md->ipv  ,1    ,&pv  ,bSum);
 +            enthalpy = pv + enerd->term[F_ETOT];
 +            add_ebin(md->ebin,md->ienthalpy  ,1    ,&enthalpy  ,bSum);
 +        }
 +    }
 +    if (md->bConstrVir)
 +    {
 +        add_ebin(md->ebin,md->isvir,9,svir[0],bSum);
 +        add_ebin(md->ebin,md->ifvir,9,fvir[0],bSum);
 +    }
 +    if (md->bVir)
 +        add_ebin(md->ebin,md->ivir,9,vir[0],bSum);
 +    if (md->bPress)
 +        add_ebin(md->ebin,md->ipres,9,pres[0],bSum);
 +    if (md->bSurft){
 +        tmp = (pres[ZZ][ZZ]-(pres[XX][XX]+pres[YY][YY])*0.5)*box[ZZ][ZZ];
 +        add_ebin(md->ebin,md->isurft,1,&tmp,bSum);
 +    }
 +    if (md->epc == epcPARRINELLORAHMAN || md->epc == epcMTTK)
 +    {
 +        tmp6[0] = state->boxv[XX][XX];
 +        tmp6[1] = state->boxv[YY][YY];
 +        tmp6[2] = state->boxv[ZZ][ZZ];
 +        tmp6[3] = state->boxv[YY][XX];
 +        tmp6[4] = state->boxv[ZZ][XX];
 +        tmp6[5] = state->boxv[ZZ][YY];
 +        add_ebin(md->ebin,md->ipc,md->bTricl ? 6 : 3,tmp6,bSum);
 +    }
 +    if (md->bMu)
 +    {
 +        add_ebin(md->ebin,md->imu,3,mu_tot,bSum);
 +    }
 +    if (ekind && ekind->cosacc.cos_accel != 0)
 +    {
 +        vol  = box[XX][XX]*box[YY][YY]*box[ZZ][ZZ];
 +        dens = (tmass*AMU)/(vol*NANO*NANO*NANO);
 +        add_ebin(md->ebin,md->ivcos,1,&(ekind->cosacc.vcos),bSum);
 +        /* 1/viscosity, unit 1/(kg m^-1 s^-1) */
 +        tmp = 1/(ekind->cosacc.cos_accel/(ekind->cosacc.vcos*PICO)
 +                 *dens*vol*sqr(box[ZZ][ZZ]*NANO/(2*M_PI)));
 +        add_ebin(md->ebin,md->ivisc,1,&tmp,bSum);
 +    }
 +    if (md->nE > 1)
 +    {
 +        n=0;
 +        for(i=0; (i<md->nEg); i++)
 +        {
 +            for(j=i; (j<md->nEg); j++)
 +            {
 +                gid=GID(i,j,md->nEg);
 +                for(k=kk=0; (k<egNR); k++)
 +                {
 +                    if (md->bEInd[k])
 +                    {
 +                        eee[kk++] = enerd->grpp.ener[k][gid];
 +                    }
 +                }
 +                add_ebin(md->ebin,md->igrp[n],md->nEc,eee,bSum);
 +                n++;
 +            }
 +        }
 +    }
 +
 +    if (ekind)
 +    {
 +        for(i=0; (i<md->nTC); i++)
 +        {
 +            md->tmp_r[i] = ekind->tcstat[i].T;
 +        }
 +        add_ebin(md->ebin,md->itemp,md->nTC,md->tmp_r,bSum);
 +
 +        if (md->etc == etcNOSEHOOVER)
 +        {
 +            /* whether to print Nose-Hoover chains: */
 +            if (md->bPrintNHChains)
 +            {
 +                if (md->bNHC_trotter)
 +                {
 +                    for(i=0; (i<md->nTC); i++)
 +                    {
 +                        for (j=0;j<md->nNHC;j++)
 +                        {
 +                            k = i*md->nNHC+j;
 +                            md->tmp_r[2*k] = state->nosehoover_xi[k];
 +                            md->tmp_r[2*k+1] = state->nosehoover_vxi[k];
 +                        }
 +                    }
 +                    add_ebin(md->ebin,md->itc,md->mde_n,md->tmp_r,bSum);
 +
 +                    if (md->bMTTK) {
 +                        for(i=0; (i<md->nTCP); i++)
 +                        {
 +                            for (j=0;j<md->nNHC;j++)
 +                            {
 +                                k = i*md->nNHC+j;
 +                                md->tmp_r[2*k] = state->nhpres_xi[k];
 +                                md->tmp_r[2*k+1] = state->nhpres_vxi[k];
 +                            }
 +                        }
 +                        add_ebin(md->ebin,md->itcb,md->mdeb_n,md->tmp_r,bSum);
 +                    }
 +                }
 +                else
 +                {
 +                    for(i=0; (i<md->nTC); i++)
 +                    {
 +                        md->tmp_r[2*i] = state->nosehoover_xi[i];
 +                        md->tmp_r[2*i+1] = state->nosehoover_vxi[i];
 +                    }
 +                    add_ebin(md->ebin,md->itc,md->mde_n,md->tmp_r,bSum);
 +                }
 +            }
 +        }
 +        else if (md->etc == etcBERENDSEN || md->etc == etcYES ||
 +                 md->etc == etcVRESCALE)
 +        {
 +            for(i=0; (i<md->nTC); i++)
 +            {
 +                md->tmp_r[i] = ekind->tcstat[i].lambda;
 +            }
 +            add_ebin(md->ebin,md->itc,md->nTC,md->tmp_r,bSum);
 +        }
 +    }
 +
 +    if (ekind && md->nU > 1)
 +    {
 +        for(i=0; (i<md->nU); i++)
 +        {
 +            copy_rvec(ekind->grpstat[i].u,md->tmp_v[i]);
 +        }
 +        add_ebin(md->ebin,md->iu,3*md->nU,md->tmp_v[0],bSum);
 +    }
 +
 +    ebin_increase_count(md->ebin,bSum);
 +
 +    /* BAR + thermodynamic integration values */
-         snew(dE,enerd->n_lambda-1);
++    if ((md->fp_dhdl || md->dhc) && bDoDHDL)
 +    {
-             dE[i] = enerd->enerpart_lambda[i+1]-enerd->enerpart_lambda[0];  /* zero for simulated tempering */
 +        for(i=0; i<enerd->n_lambda-1; i++) {
-                 dE[i] += (md->temperatures[i]/md->temperatures[state->fep_state]-1.0)*enerd->term[F_EKIN];
++            /* zero for simulated tempering */
++            md->dE[i] = enerd->enerpart_lambda[i+1]-enerd->enerpart_lambda[0];
 +            if (md->temperatures!=NULL)
 +            {
 +                /* MRS: is this right, given the way we have defined the exchange probabilities? */
 +                /* is this even useful to have at all? */
-     }
-     if (md->fp_dhdl && bDoDHDL)
-     {
-         fprintf(md->fp_dhdl,"%.4f",time);
-         /* the current free energy state */
++                md->dE[i] += (md->temperatures[i]/
++                          md->temperatures[state->fep_state]-1.0)*
++                            enerd->term[F_EKIN];
 +            }
 +        }
-         /* print the current state if we are doing expanded ensemble */
-         if (expand->elmcmove > elmcmoveNO) {
-             fprintf(md->fp_dhdl," %4d",state->fep_state);
-         }
-         /* total energy (for if the temperature changes */
-         if (fep->bPrintEnergy)
 +
-             store_energy = enerd->term[F_ETOT];
-             fprintf(md->fp_dhdl," %#.8g",store_energy);
-         }
++        if (md->fp_dhdl)
 +        {
-         for (i=0;i<efptNR;i++)
-         {
-             if (fep->separate_dvdl[i])
++            fprintf(md->fp_dhdl,"%.4f",time);
++            /* the current free energy state */
 +
-                 fprintf(md->fp_dhdl," %#.8g",enerd->term[F_DVDL+i]); /* assumes F_DVDL is first */
++            /* print the current state if we are doing expanded ensemble */
++            if (expand->elmcmove > elmcmoveNO) {
++                fprintf(md->fp_dhdl," %4d",state->fep_state);
++            }
++            /* total energy (useful if the temperature changes) */
++            if (fep->bPrintEnergy)
 +            {
-         }
-         for(i=1; i<enerd->n_lambda; i++)
-         {
-             fprintf(md->fp_dhdl," %#.8g",dE[i-1]);
++                store_energy = enerd->term[F_ETOT];
++                fprintf(md->fp_dhdl," %#.8g",store_energy);
 +            }
-         if ((md->epc!=epcNO)  && (enerd->n_lambda > 0))
-         {
-             fprintf(md->fp_dhdl," %#.8g",pv);   /* PV term only needed when there are alternate state lambda */
-         }
-         fprintf(md->fp_dhdl,"\n");
-         /* and the binary free energy output */
-     }
-     if (md->dhc && bDoDHDL)
-     {
-         int idhdl = 0;
-         for (i=0;i<efptNR;i++)
 +
++            if (fep->dhdl_derivatives == edhdlderivativesYES)
++            {
++                for (i=0;i<efptNR;i++)
++                {
++                    if (fep->separate_dvdl[i])
++                    {
++                        /* assumes F_DVDL is first */
++                        fprintf(md->fp_dhdl," %#.8g",enerd->term[F_DVDL+i]);
++                    }
++                }
++            }
++            for(i=fep->lambda_start_n;i<fep->lambda_stop_n;i++)
++            {
++                fprintf(md->fp_dhdl," %#.8g",md->dE[i]);
++            }
++            if ((md->epc!=epcNO)  && 
++                (enerd->n_lambda > 0) &&
++                (fep->init_lambda<0))
++            {
++                fprintf(md->fp_dhdl," %#.8g",pv);  /* PV term only needed when
++                                                      there are alternate
++                                                      lambda states and we're not in
++                                                      compatibility mode */
++            }
++            fprintf(md->fp_dhdl,"\n");
++            /* and the binary free energy output */
 +        }
-             if (fep->separate_dvdl[i])
++        if (md->dhc && bDoDHDL)
 +        {
-                 store_dhdl[idhdl] = enerd->term[F_DVDL+i]; /* assumes F_DVDL is first */
-                 idhdl+=1;
++            int idhdl = 0;
++            for (i=0;i<efptNR;i++)
 +            {
-         /* store_dh is dE */
-         mde_delta_h_coll_add_dh(md->dhc,
-                                 (double)state->fep_state,
-                                 store_energy,
-                                 pv,
-                                 (expand->elamstats>elamstatsNO),
-                                 (fep->bPrintEnergy),
-                                 (md->epc!=epcNO),
-                                 idhdl,
-                                 fep->n_lambda,
-                                 store_dhdl,
-                                 dE,
-                                 time);
-     }
-     if ((md->fp_dhdl || md->dhc) && bDoDHDL && (enerd->n_lambda >0))
-     {
-         sfree(dE);
++                if (fep->separate_dvdl[i])
++                {
++                    /* assumes F_DVDL is first */
++                    store_dhdl[idhdl] = enerd->term[F_DVDL+i];
++                    idhdl+=1;
++                }
 +            }
++            store_energy = enerd->term[F_ETOT];
++            /* store_dh is dE */
++            mde_delta_h_coll_add_dh(md->dhc,
++                                    (double)state->fep_state,
++                                    store_energy,
++                                    pv,
++                                    store_dhdl,
++                                    md->dE + fep->lambda_start_n,
++                                    time);
 +        }
 +    }
 +}
 +
 +
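/* Aside: a numeric sketch of the BAR/TI differences computed in
 * upd_mdebin() above: dE[i] = U(lambda_{i+1}) - U(lambda_0), plus a
 * kinetic-energy correction when per-state temperatures are set
 * (simulated tempering). All numbers are hypothetical. */
#include <stdio.h>

int main(void)
{
    double enerpart_lambda[3] = {-1204.7, -1198.2, -1190.5};
    double T[3]       = {300.0, 310.0, 320.0};
    double ekin       = 450.0;
    int    fep_state  = 0, i;

    for (i = 0; i < 2; i++)
    {
        double dE = enerpart_lambda[i+1] - enerpart_lambda[0];
        dE += (T[i]/T[fep_state] - 1.0)*ekin;  /* tempering term */
        printf("dE[%d] = %g\n", i, dE);        /* 6.5 and 29.2 */
    }
    return 0;
}
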
 +void upd_mdebin_step(t_mdebin *md)
 +{
 +    ebin_increase_count(md->ebin,FALSE);
 +}
 +
 +static void npr(FILE *log,int n,char c)
 +{
 +    for(; (n>0); n--) fprintf(log,"%c",c);
 +}
 +
 +static void pprint(FILE *log,const char *s,t_mdebin *md)
 +{
 +    char CHAR='#';
 +    int  slen;
 +    char buf1[22],buf2[22];
 +
 +    slen = strlen(s);
 +    fprintf(log,"\t<======  ");
 +    npr(log,slen,CHAR);
 +    fprintf(log,"  ==>\n");
 +    fprintf(log,"\t<====  %s  ====>\n",s);
 +    fprintf(log,"\t<==  ");
 +    npr(log,slen,CHAR);
 +    fprintf(log,"  ======>\n\n");
 +
 +    fprintf(log,"\tStatistics over %s steps using %s frames\n",
 +            gmx_step_str(md->ebin->nsteps_sim,buf1),
 +            gmx_step_str(md->ebin->nsum_sim,buf2));
 +    fprintf(log,"\n");
 +}
 +
 +void print_ebin_header(FILE *log,gmx_large_int_t steps,double time,real lambda)
 +{
 +    char buf[22];
 +
 +    fprintf(log,"   %12s   %12s   %12s\n"
 +            "   %12s   %12.5f   %12.5f\n\n",
 +            "Step","Time","Lambda",gmx_step_str(steps,buf),time,lambda);
 +}
 +
 +void print_ebin(ener_file_t fp_ene,gmx_bool bEne,gmx_bool bDR,gmx_bool bOR,
 +                FILE *log,
 +                gmx_large_int_t step,double time,
 +                int mode,gmx_bool bCompact,
 +                t_mdebin *md,t_fcdata *fcd,
 +                gmx_groups_t *groups,t_grpopts *opts)
 +{
 +    /*static char **grpnms=NULL;*/
 +    char        buf[246];
 +    int         i,j,n,ni,nj,ndr,nor,b;
 +    int         ndisre=0;
 +    real        *disre_rm3tav, *disre_rt;
 +
 +    /* these are for the old-style blocks (1 subblock, only reals), because
 +       there can be only one per ID for these */
 +    int         nr[enxNR];
 +    int         id[enxNR];
 +    real        *block[enxNR];
 +
 +    /* temporary arrays for the lambda values to write out */
 +    double      enxlambda_data[2];
 +
 +    t_enxframe  fr;
 +
 +    switch (mode)
 +    {
 +        case eprNORMAL:
 +            init_enxframe(&fr);
 +            fr.t            = time;
 +            fr.step         = step;
 +            fr.nsteps       = md->ebin->nsteps;
 +            fr.dt           = md->delta_t;
 +            fr.nsum         = md->ebin->nsum;
 +            fr.nre          = (bEne) ? md->ebin->nener : 0;
 +            fr.ener         = md->ebin->e;
 +            ndisre          = bDR ? fcd->disres.npair : 0;
 +            disre_rm3tav    = fcd->disres.rm3tav;
 +            disre_rt        = fcd->disres.rt;
 +            /* Optional additional old-style (real-only) blocks. */
 +            for(i=0; i<enxNR; i++)
 +            {
 +                nr[i] = 0;
 +            }
 +            if (fcd->orires.nr > 0 && bOR)
 +            {
 +                diagonalize_orires_tensors(&(fcd->orires));
 +                nr[enxOR]     = fcd->orires.nr;
 +                block[enxOR]  = fcd->orires.otav;
 +                id[enxOR]     = enxOR;
 +                nr[enxORI]    = (fcd->orires.oinsl != fcd->orires.otav) ?
 +                          fcd->orires.nr : 0;
 +                block[enxORI] = fcd->orires.oinsl;
 +                id[enxORI]    = enxORI;
 +                nr[enxORT]    = fcd->orires.nex*12;
 +                block[enxORT] = fcd->orires.eig;
 +                id[enxORT]    = enxORT;
 +            }
 +
 +            /* whether we are going to write anything out: */
 +            if (fr.nre || ndisre || nr[enxOR] || nr[enxORI])
 +            {
 +
 +                /* the old-style blocks go first */
 +                fr.nblock = 0;
 +                for(i=0; i<enxNR; i++)
 +                {
 +                    if (nr[i] > 0)
 +                    {
 +                        fr.nblock = i + 1;
 +                    }
 +                }
 +                add_blocks_enxframe(&fr, fr.nblock);
 +                for(b=0;b<fr.nblock;b++)
 +                {
 +                    add_subblocks_enxblock(&(fr.block[b]), 1);
 +                    fr.block[b].id=id[b];
 +                    fr.block[b].sub[0].nr = nr[b];
 +#ifndef GMX_DOUBLE
 +                    fr.block[b].sub[0].type = xdr_datatype_float;
 +                    fr.block[b].sub[0].fval = block[b];
 +#else
 +                    fr.block[b].sub[0].type = xdr_datatype_double;
 +                    fr.block[b].sub[0].dval = block[b];
 +#endif
 +                }
 +
 +                /* check for disre block & fill it. */
 +                if (ndisre>0)
 +                {
 +                    int db = fr.nblock;
 +                    fr.nblock+=1;
 +                    add_blocks_enxframe(&fr, fr.nblock);
 +
 +                    add_subblocks_enxblock(&(fr.block[db]), 2);
 +                    fr.block[db].id=enxDISRE;
 +                    fr.block[db].sub[0].nr=ndisre;
 +                    fr.block[db].sub[1].nr=ndisre;
 +#ifndef GMX_DOUBLE
 +                    fr.block[db].sub[0].type=xdr_datatype_float;
 +                    fr.block[db].sub[1].type=xdr_datatype_float;
 +                    fr.block[db].sub[0].fval=disre_rt;
 +                    fr.block[db].sub[1].fval=disre_rm3tav;
 +#else
 +                    fr.block[db].sub[0].type=xdr_datatype_double;
 +                    fr.block[db].sub[1].type=xdr_datatype_double;
 +                    fr.block[db].sub[0].dval=disre_rt;
 +                    fr.block[db].sub[1].dval=disre_rm3tav;
 +#endif
 +                }
 +                /* here we can put new-style blocks */
 +
 +                /* Free energy perturbation blocks */
 +                if (md->dhc)
 +                {
 +                    mde_delta_h_coll_handle_block(md->dhc, &fr, fr.nblock);
 +                }
 +
 +                /* we can now free & reset the data in the blocks */
 +                if (md->dhc)
 +                {
 +                    mde_delta_h_coll_reset(md->dhc);
 +                }
 +
 +                /* do the actual I/O */
 +                do_enx(fp_ene,&fr);
 +                gmx_fio_check_file_position(enx_file_pointer(fp_ene));
 +                if (fr.nre)
 +                {
 +                    /* We have stored the sums, so reset the sum history */
 +                    reset_ebin_sums(md->ebin);
 +                }
 +            }
 +            free_enxframe(&fr);
 +            break;
 +        case eprAVER:
 +            if (log)
 +            {
 +                pprint(log,"A V E R A G E S",md);
 +            }
 +            break;
 +        case eprRMS:
 +            if (log)
 +            {
 +                pprint(log,"R M S - F L U C T U A T I O N S",md);
 +            }
 +            break;
 +        default:
 +            gmx_fatal(FARGS,"Invalid print mode (%d)",mode);
 +    }
 +
 +    if (log)
 +    {
 +        for(i=0;i<opts->ngtc;i++)
 +        {
 +            if(opts->annealing[i]!=eannNO)
 +            {
 +                fprintf(log,"Current ref_t for group %s: %8.1f\n",
 +                        *(groups->grpname[groups->grps[egcTC].nm_ind[i]]),
 +                        opts->ref_t[i]);
 +            }
 +        }
 +        if (mode==eprNORMAL && fcd->orires.nr>0)
 +        {
 +            print_orires_log(log,&(fcd->orires));
 +        }
 +        fprintf(log,"   Energies (%s)\n",unit_energy);
 +        pr_ebin(log,md->ebin,md->ie,md->f_nre+md->nCrmsd,5,mode,TRUE);
 +        fprintf(log,"\n");
 +
 +        if (!bCompact)
 +        {
 +            if (md->bDynBox)
 +            {
 +                pr_ebin(log,md->ebin,md->ib, md->bTricl ? NTRICLBOXS : NBOXS,5,
 +                        mode,TRUE);
 +                fprintf(log,"\n");
 +            }
 +            if (md->bConstrVir)
 +            {
 +                fprintf(log,"   Constraint Virial (%s)\n",unit_energy);
 +                pr_ebin(log,md->ebin,md->isvir,9,3,mode,FALSE);
 +                fprintf(log,"\n");
 +                fprintf(log,"   Force Virial (%s)\n",unit_energy);
 +                pr_ebin(log,md->ebin,md->ifvir,9,3,mode,FALSE);
 +                fprintf(log,"\n");
 +            }
 +            if (md->bVir)
 +            {
 +                fprintf(log,"   Total Virial (%s)\n",unit_energy);
 +                pr_ebin(log,md->ebin,md->ivir,9,3,mode,FALSE);
 +                fprintf(log,"\n");
 +            }
 +            if (md->bPress)
 +            {
 +                fprintf(log,"   Pressure (%s)\n",unit_pres_bar);
 +                pr_ebin(log,md->ebin,md->ipres,9,3,mode,FALSE);
 +                fprintf(log,"\n");
 +            }
 +            if (md->bMu)
 +            {
 +                fprintf(log,"   Total Dipole (%s)\n",unit_dipole_D);
 +                pr_ebin(log,md->ebin,md->imu,3,3,mode,FALSE);
 +                fprintf(log,"\n");
 +            }
 +
 +            if (md->nE > 1)
 +            {
 +                if (md->print_grpnms==NULL)
 +                {
 +                    snew(md->print_grpnms,md->nE);
 +                    n=0;
 +                    for(i=0; (i<md->nEg); i++)
 +                    {
 +                        ni=groups->grps[egcENER].nm_ind[i];
 +                        for(j=i; (j<md->nEg); j++)
 +                        {
 +                            nj=groups->grps[egcENER].nm_ind[j];
 +                            sprintf(buf,"%s-%s",*(groups->grpname[ni]),
 +                                    *(groups->grpname[nj]));
 +                            md->print_grpnms[n++]=strdup(buf);
 +                        }
 +                    }
 +                }
 +                sprintf(buf,"Epot (%s)",unit_energy);
 +                fprintf(log,"%15s   ",buf);
 +                for(i=0; (i<egNR); i++)
 +                {
 +                    if (md->bEInd[i])
 +                    {
 +                        fprintf(log,"%12s   ",egrp_nm[i]);
 +                    }
 +                }
 +                fprintf(log,"\n");
 +                for(i=0; (i<md->nE); i++)
 +                {
 +                    fprintf(log,"%15s",md->print_grpnms[i]);
 +                    pr_ebin(log,md->ebin,md->igrp[i],md->nEc,md->nEc,mode,
 +                            FALSE);
 +                }
 +                fprintf(log,"\n");
 +            }
 +            if (md->nTC > 1)
 +            {
 +                pr_ebin(log,md->ebin,md->itemp,md->nTC,4,mode,TRUE);
 +                fprintf(log,"\n");
 +            }
 +            if (md->nU > 1)
 +            {
 +                fprintf(log,"%15s   %12s   %12s   %12s\n",
 +                        "Group","Ux","Uy","Uz");
 +                for(i=0; (i<md->nU); i++)
 +                {
 +                    ni=groups->grps[egcACC].nm_ind[i];
 +                    fprintf(log,"%15s",*groups->grpname[ni]);
 +                    pr_ebin(log,md->ebin,md->iu+3*i,3,3,mode,FALSE);
 +                }
 +                fprintf(log,"\n");
 +            }
 +        }
 +    }
 +
 +}
 +
 +void update_energyhistory(energyhistory_t * enerhist,t_mdebin * mdebin)
 +{
 +    int i;
 +
 +    enerhist->nsteps     = mdebin->ebin->nsteps;
 +    enerhist->nsum       = mdebin->ebin->nsum;
 +    enerhist->nsteps_sim = mdebin->ebin->nsteps_sim;
 +    enerhist->nsum_sim   = mdebin->ebin->nsum_sim;
 +    enerhist->nener      = mdebin->ebin->nener;
 +
 +    if (mdebin->ebin->nsum > 0)
 +    {
 +        /* Check if we need to allocate first */
 +        if(enerhist->ener_ave == NULL)
 +        {
 +            snew(enerhist->ener_ave,enerhist->nener);
 +            snew(enerhist->ener_sum,enerhist->nener);
 +        }
 +
 +        for(i=0;i<enerhist->nener;i++)
 +        {
 +            enerhist->ener_ave[i] = mdebin->ebin->e[i].eav;
 +            enerhist->ener_sum[i] = mdebin->ebin->e[i].esum;
 +        }
 +    }
 +
 +    if (mdebin->ebin->nsum_sim > 0)
 +    {
 +        /* Check if we need to allocate first */
 +        if(enerhist->ener_sum_sim == NULL)
 +        {
 +            snew(enerhist->ener_sum_sim,enerhist->nener);
 +        }
 +
 +        for(i=0;i<enerhist->nener;i++)
 +        {
 +            enerhist->ener_sum_sim[i] = mdebin->ebin->e_sim[i].esum;
 +        }
 +    }
 +    if (mdebin->dhc)
 +    {
 +        mde_delta_h_coll_update_energyhistory(mdebin->dhc, enerhist);
 +    }
 +}
 +
 +void restore_energyhistory_from_state(t_mdebin * mdebin,
 +                                      energyhistory_t * enerhist)
 +{
 +    int i;
 +
 +    if ((enerhist->nsum > 0 || enerhist->nsum_sim > 0) &&
 +        mdebin->ebin->nener != enerhist->nener)
 +    {
 +        gmx_fatal(FARGS,"Mismatch between number of energies in run input (%d) and checkpoint file (%d).",
 +                  mdebin->ebin->nener,enerhist->nener);
 +    }
 +
 +    mdebin->ebin->nsteps     = enerhist->nsteps;
 +    mdebin->ebin->nsum       = enerhist->nsum;
 +    mdebin->ebin->nsteps_sim = enerhist->nsteps_sim;
 +    mdebin->ebin->nsum_sim   = enerhist->nsum_sim;
 +
 +    for(i=0; i<mdebin->ebin->nener; i++)
 +    {
 +        mdebin->ebin->e[i].eav  =
 +                  (enerhist->nsum > 0 ? enerhist->ener_ave[i] : 0);
 +        mdebin->ebin->e[i].esum =
 +                  (enerhist->nsum > 0 ? enerhist->ener_sum[i] : 0);
 +        mdebin->ebin->e_sim[i].esum =
 +                  (enerhist->nsum_sim > 0 ? enerhist->ener_sum_sim[i] : 0);
 +    }
 +    if (mdebin->dhc)
 +    {
 +        mde_delta_h_coll_restore_energyhistory(mdebin->dhc, enerhist);
 +    }
 +}
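/* Aside: a round-trip sketch of update_energyhistory() and
 * restore_energyhistory_from_state() above -- per-term averages and
 * sums are copied out to the checkpoint history and back. Mock arrays
 * stand in for the real ebin/energyhistory_t structs. */
#include <stdio.h>

int main(void)
{
    double eav[2] = {1.5, -3.0}, esum[2] = {150.0, -300.0};
    double hist_ave[2], hist_sum[2];
    int    i, nsum = 100;

    for (i = 0; i < 2; i++)   /* save, as in update_energyhistory() */
    {
        hist_ave[i] = eav[i];
        hist_sum[i] = esum[i];
    }
    for (i = 0; i < 2; i++)   /* restore, guarded by nsum > 0 */
    {
        eav[i]  = (nsum > 0 ? hist_ave[i] : 0);
        esum[i] = (nsum > 0 ? hist_sum[i] : 0);
    }
    printf("restored eav[0]=%g esum[1]=%g\n", eav[0], esum[1]);
    return 0;
}
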
index f496f68b30e41d9d32a645beb5fed8c286ac4051,0000000000000000000000000000000000000000..7bd56bd9adec8b543a72215b0e617f70ca864904
mode 100644,000000..100644
--- /dev/null
@@@ -1,520 -1,0 +1,733 @@@
-                              double dx, unsigned int  ndhmax)
 +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
 + *
 + *
 + *                This source code is part of
 + *
 + *                 G   R   O   M   A   C   S
 + *
 + *          GROningen MAchine for Chemical Simulations
 + *
 + *                        VERSION 3.2.0
 + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
 + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 + * Copyright (c) 2001-2004, The GROMACS development team,
 + * check out http://www.gromacs.org for more information.
 +
 + * This program is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU General Public License
 + * as published by the Free Software Foundation; either version 2
 + * of the License, or (at your option) any later version.
 + *
 + * If you want to redistribute modifications, please consider that
 + * scientific software is very special. Version control is crucial -
 + * bugs must be traceable. We will be happy to consider code for
 + * inclusion in the official distribution, but derived work must not
 + * be called official GROMACS. Details are found in the README & COPYING
 + * files - if they are missing, get the official version at www.gromacs.org.
 + *
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the papers on the package - you can find them in the top README file.
 + *
 + * For more info, check our website at http://www.gromacs.org
 + *
 + * And Hey:
 + * GROwing Monsters And Cloning Shrimps
 + */
 +#ifdef HAVE_CONFIG_H
 +#include <config.h>
 +#endif
 +
 +#include <string.h>
 +#include <float.h>
 +#include <math.h>
 +#include "typedefs.h"
 +#include "string2.h"
 +#include "gmx_fatal.h"
 +#include "mdebin.h"
 +#include "smalloc.h"
 +#include "enxio.h"
 +#include "gmxfio.h"
 +#include "mdebin_bar.h"
 +
 +/* reset the delta_h list to prepare it for new values */
 +static void mde_delta_h_reset(t_mde_delta_h *dh)
 +{
 +    dh->ndh=0;
 +    dh->written=FALSE;
 +}
 +
 +/* initialize the delta_h list */
 +static void mde_delta_h_init(t_mde_delta_h *dh, int nbins,
-         unsigned int i;
++                             double dx, unsigned int  ndhmax,
++                             int type, int derivative, int nlambda,
++                             double *lambda)
 +{
 +    int i;
 +
++    dh->type=type;
++    dh->derivative=derivative;
++    dh->lambda=lambda;
++    dh->nlambda=nlambda;
++
++    snew(dh->lambda, nlambda);
++    for(i=0;i<nlambda;i++)
++    {
++        dh->lambda[i] = lambda[i];
++    }
++
++
++    snew(dh->subblock_meta_d, dh->nlambda+1);
++
 +    dh->ndhmax=ndhmax+2;
 +    for(i=0;i<2;i++)
 +    {
 +        dh->bin[i]=NULL;
 +    }
 +
 +    snew(dh->dh, dh->ndhmax);
 +    snew(dh->dhf,dh->ndhmax);
 +
 +    if ( nbins <= 0 || dx<GMX_REAL_EPS*10 )
 +    {
 +        dh->nhist=0;
 +    }
 +    else
 +    {
 +        int i;
 +        /* pre-allocate the histogram */
 +        dh->nhist=2; /* energies and derivatives histogram */
 +        dh->dx=dx;
 +        dh->nbins=nbins;
 +        for(i=0;i<dh->nhist;i++)
 +        {
 +            snew(dh->bin[i], dh->nbins);
 +        }
 +    }
 +    mde_delta_h_reset(dh);
 +}
 +
 +/* Add a value to the delta_h list */
 +static void mde_delta_h_add_dh(t_mde_delta_h *dh, double delta_h, double time)
 +{
 +    if (dh->ndh >= dh->ndhmax)
 +    {
 +        gmx_incons("delta_h array not big enough!");
 +    }
 +    dh->dh[dh->ndh]=delta_h;
 +    dh->ndh++;
 +}
 +
 +/* construct histogram with index hi */
 +static void mde_delta_h_make_hist(t_mde_delta_h *dh, int hi, gmx_bool invert)
 +{
 +    double min_dh = FLT_MAX;
 +    double max_dh = -FLT_MAX;
 +    unsigned int i;
 +    double max_dh_hist; /* maximum binnable dh value */
 +    double min_dh_hist; /* minimum binnable dh value */
 +    double dx=dh->dx;
 +    double f; /* energy mult. factor */
 +
 +    /* by applying a -1 scaling factor on the energies we get the same as
 +       having a negative dx, but we don't need to fix the min/max values
 +       beyond inverting x0 */
 +    f=invert ? -1 : 1;
 +
 +    /* first find min and max */
 +    for(i=0;i<dh->ndh;i++)
 +    {
 +        if (f*dh->dh[i] < min_dh)
 +            min_dh=f*dh->dh[i];
 +        if (f*dh->dh[i] > max_dh)
 +            max_dh=f*dh->dh[i];
 +    }
 +
 +    /* reset the histogram */
 +    for(i=0;i<dh->nbins;i++)
 +    {
 +        dh->bin[hi][i]=0;
 +    }
 +    dh->maxbin[hi]=0;
 +
 +    /* The starting point of the histogram is the lowest value found:
 +       that value has the highest contribution to the free energy.
 +
 +       Get this start value in number of histogram dxs from zero,
 +       as an integer.*/
 +
 +    dh->x0[hi] = (gmx_large_int_t)floor(min_dh/dx);
 +
 +    min_dh_hist=(dh->x0[hi])*dx;
 +    max_dh_hist=(dh->x0[hi] + dh->nbins + 1)*dx;
 +
 +    /* and fill the histogram*/
 +    for(i=0;i<dh->ndh;i++)
 +    {
 +        unsigned int bin;
 +
 +        /* Determine the bin number. If it doesn't fit into the histogram,
 +           add it to the last bin.
 +           We check the max_dh_hist range because converting to integers
 +           might lead to overflow with unpredictable results.*/
 +        if ( (f*dh->dh[i] >= min_dh_hist) && (f*dh->dh[i] <= max_dh_hist ) )
 +        {
 +            bin = (unsigned int)( (f*dh->dh[i] - min_dh_hist)/dx );
 +        }
 +        else
 +        {
 +            bin = dh->nbins-1;
 +        }
 +        /* double-check here because of possible round-off errors*/
 +        if (bin >= dh->nbins)
 +        {
 +            bin = dh->nbins-1;
 +        }
 +        if (bin > dh->maxbin[hi])
 +        {
 +            dh->maxbin[hi] = bin;
 +        }
 +
 +        dh->bin[hi][bin]++;
 +    }
 +
 +    /* make sure we include a bin with 0 if we didn't use the full
 +       histogram width. This can then be used as an indication that
 +       all the data was binned. */
 +    if (dh->maxbin[hi] < dh->nbins-1)
 +        dh->maxbin[hi] += 1;
 +}
 +
 +
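/* Aside: a numeric sketch of the binning in mde_delta_h_make_hist()
 * above: the origin x0 is the lowest sample in units of dx, and each
 * sample maps to an integer bin, clamped to the last bin. The data
 * below are hypothetical. */
#include <math.h>
#include <stdio.h>

int main(void)
{
    double       dh[4] = {-3.2, -1.1, 0.4, 7.9};
    double       dx    = 0.5, min_dh, min_hist;
    long         x0;
    unsigned int nbins = 10, i, bin;

    min_dh = dh[0];
    for (i = 1; i < 4; i++)
        if (dh[i] < min_dh)
            min_dh = dh[i];
    x0       = (long)floor(min_dh/dx);  /* histogram origin in dx units */
    min_hist = x0*dx;
    for (i = 0; i < 4; i++)
    {
        bin = (unsigned int)((dh[i] - min_hist)/dx);
        if (bin >= nbins)
            bin = nbins - 1;            /* clamp out-of-range values */
        printf("dh=%5.2f -> bin %u\n", dh[i], bin);
    }
    return 0;
}
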
 +void mde_delta_h_handle_block(t_mde_delta_h *dh, t_enxblock *blk)
 +{
 +    /* first check which type we should use: histogram or raw data */
 +    if (dh->nhist == 0)
 +    {
-            Raw data consists of 3 subblocks: a block with the
-            the foreign lambda, and the data itself */
++        int i;
 +
 +        /* We write raw data.
-         dh->subblock_i[0]=dh->derivative ? 1 : 0; /* derivative type */
-         blk->sub[0].nr=1;
++           Raw data consists of 3 subblocks: an int metadata block
++           with type and derivative index, a foreign lambda block
++           and the data itself */
 +        add_subblocks_enxblock(blk, 3);
 +
 +        blk->id=enxDH;
 +
 +        /* subblock 1 */
-         blk->sub[0].ival=dh->subblock_i;
++        dh->subblock_meta_i[0]=dh->type; /* block data type */
++        dh->subblock_meta_i[1]=dh->derivative; /* derivative direction if
++                                                  applicable (in indices
++                                                  starting from first coord in
++                                                  the main delta_h_coll) */
++        blk->sub[0].nr=2;
 +        blk->sub[0].type=xdr_datatype_int;
-         dh->subblock_d[0]=dh->lambda;
-         blk->sub[1].nr=1;
++        blk->sub[0].ival=dh->subblock_meta_i;
 +
 +        /* subblock 2 */
-         blk->sub[1].dval=dh->subblock_d;
++        for(i=0;i<dh->nlambda;i++)
++        {
++            dh->subblock_meta_d[i]=dh->lambda[i];
++        }
++        blk->sub[1].nr=dh->nlambda;
 +        blk->sub[1].type=xdr_datatype_double;
- #endif
++        blk->sub[1].dval=dh->subblock_meta_d;
 +
 +        /* subblock 3 */
 +        /* check if there's actual data to be written. */
 +        /*if (dh->ndh > 1)*/
 +        if (dh->ndh > 0)
 +        {
++            unsigned int i;
++
 +            blk->sub[2].nr=dh->ndh;
 +/* For F@H for now. */
 +#undef GMX_DOUBLE
 +#ifndef GMX_DOUBLE
 +            blk->sub[2].type=xdr_datatype_float;
 +            for(i=0;i<dh->ndh;i++)
 +            {
 +                dh->dhf[i] = (float)dh->dh[i];
 +            }
 +            blk->sub[2].fval=dh->dhf;
 +#else
 +            blk->sub[2].type=xdr_datatype_double;
 +            blk->sub[2].dval=dh->dh;
 +#endif
 +            dh->written=TRUE;
 +        }
 +        else
 +        {
 +            blk->sub[2].nr=0;
 +#ifndef GMX_DOUBLE
 +            blk->sub[2].type=xdr_datatype_float;
 +            blk->sub[2].fval=NULL;
 +#else
 +            blk->sub[2].type=xdr_datatype_double;
-         /* subblock 1: the foreign lambda value + the histogram spacing */
-         dh->subblock_d[0]=dh->lambda;
-         dh->subblock_d[1]=dh->dx;
-         blk->sub[0].nr=2;
 +            blk->sub[2].dval=NULL;
++#endif
 +        }
 +    }
 +    else
 +    {
 +        int nhist_written=0;
 +        int i;
++        int k;
 +
++        /* TODO histogram metadata */
 +        /* check if there's actual data to be written. */
 +        if (dh->ndh > 1)
 +        {
 +            gmx_bool prev_complete=FALSE;
 +            /* Make the histogram(s) */
 +            for(i=0;i<dh->nhist;i++)
 +            {
 +                if (!prev_complete)
 +                {
 +                    /* the first histogram is always normal, and the
 +                       second one is always reverse */
 +                    mde_delta_h_make_hist(dh, i, i==1);
 +                    nhist_written++;
 +                    /* check whether this histogram contains all data: if the
 +                       last bin is 0, it does */
 +                    if (dh->bin[i][dh->nbins-1] == 0)
 +                        prev_complete=TRUE;
 +                    if (!dh->derivative)
 +                        prev_complete=TRUE;
 +                }
 +            }
 +            dh->written=TRUE;
 +        }
 +
 +        /* A histogram consists of 2, 3 or 4 subblocks:
 +           the foreign lambda value + histogram spacing, the starting point,
 +           and the histogram data (0, 1 or 2 blocks). */
 +        add_subblocks_enxblock(blk, nhist_written+2);
 +        blk->id=enxDHHIST;
 +
-         blk->sub[0].dval=dh->subblock_d;
++        /* subblock 1: the lambda value + the histogram spacing */
++        if (dh->nlambda == 1)
++        {
++            /* for backward compatibility */
++            dh->subblock_meta_d[0]=dh->lambda[0];
++        }
++        else
++        {
++            dh->subblock_meta_d[0]=-1;
++            for(i=0;i<dh->nlambda;i++)
++            {
++                dh->subblock_meta_d[2+i]=dh->lambda[i];
++            }
++        }
++        dh->subblock_meta_d[1]=dh->dx;
++        blk->sub[0].nr = 2+ ((dh->nlambda>1) ? dh->nlambda : 0);
 +        blk->sub[0].type=xdr_datatype_double;
-         dh->subblock_l[0]=nhist_written;
-         dh->subblock_l[1]=dh->derivative ? 1 : 0;
++        blk->sub[0].dval=dh->subblock_meta_d;
 +
 +        /* subblock 2: the starting point(s) as a long integer */
-             dh->subblock_l[2+i]=dh->x0[i];
++        dh->subblock_meta_l[0]=nhist_written;
++        dh->subblock_meta_l[1]=dh->type; /*dh->derivative ? 1 : 0;*/
++        k=2;
 +        for(i=0;i<nhist_written;i++)
-         blk->sub[1].nr=nhist_written+2;
++            dh->subblock_meta_l[k++]=dh->x0[i];
++        /* append the derivative data */
++        dh->subblock_meta_l[k++]=dh->derivative;
 +
-         blk->sub[1].lval=dh->subblock_l;
++        blk->sub[1].nr=nhist_written+3;
 +        blk->sub[1].type=xdr_datatype_large_int;
-     int i;
++        blk->sub[1].lval=dh->subblock_meta_l;
 +
 +        /* subblock 3 + 4 : the histogram data */
 +        for(i=0;i<nhist_written;i++)
 +        {
 +            blk->sub[i+2].nr=dh->maxbin[i]+1; /* it's +1 because size=index+1
 +                                                 in C */
 +            blk->sub[i+2].type=xdr_datatype_int;
 +            blk->sub[i+2].ival=dh->bin[i];
 +        }
 +    }
 +}
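 +
 +/* Illustration only (hypothetical reader, not part of GROMACS): how the
 +   enxDH layout written above would be unpacked again. sub[0] holds two
 +   ints (block type, derivative index), sub[1] the foreign lambda vector,
 +   and sub[2] the raw samples as float or double. */
 +#if 0
 +static void read_dh_block(const t_enxblock *blk,
 +                          int *type, int *derivative,
 +                          const double **lambda, int *nlambda)
 +{
 +    *type       = blk->sub[0].ival[0];
 +    *derivative = blk->sub[0].ival[1];
 +    *lambda     = blk->sub[1].dval;
 +    *nlambda    = blk->sub[1].nr;
 +    /* blk->sub[2].nr samples follow in blk->sub[2].fval (float)
 +       or blk->sub[2].dval (double), depending on sub[2].type. */
 +}
 +#endif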
 +
 +/* initialize the collection*/
 +void mde_delta_h_coll_init(t_mde_delta_h_coll *dhc, const t_inputrec *ir)
 +{
-     /* for continuous change of lambda values */
++    int i,j,n;
 +    double lambda;
++    double *lambda_vec;
 +    int ndhmax=ir->nstenergy/ir->nstcalcenergy;
++    t_lambda *fep=ir->fepvals;
 +
 +    dhc->temperature=ir->opts.ref_t[0];  /* only store system temperature */
 +    dhc->start_time=0.;
 +    dhc->delta_time=ir->delta_t*ir->fepvals->nstdhdl;
 +    dhc->start_time_set=FALSE;
 +
-     /* total number of raw data points in the sample */
-     dhc->ndh = 0;
-     /* include one more for the specification of the state, by lambda or fep_state, store as double for now*/
-     if (ir->expandedvals->elamstats > elamstatsNO) {
-         dhc->ndh +=1;
++    /* this is the compatibility lambda value. If it is >=0, it is valid,
++       and there is either an old-style lambda or a slow growth simulation. */
 +    dhc->start_lambda=ir->fepvals->init_lambda;
++    /* for continuous change of lambda values */
 +    dhc->delta_lambda=ir->fepvals->delta_lambda*ir->fepvals->nstdhdl;
 +
-     /* whether to print energies */
-     if (ir->fepvals->bPrintEnergy) {
-         dhc->ndh += 1;
++    if (dhc->start_lambda < 0)
++    {
++        /* create the native lambda vectors */
++        dhc->lambda_index=fep->init_fep_state;
++        dhc->n_lambda_vec=0;
++        for(i=0;i<efptNR;i++)
++        {
++            if (fep->separate_dvdl[i])
++            {
++                dhc->n_lambda_vec++;
++            }
++        }
++        snew(dhc->native_lambda_vec, dhc->n_lambda_vec);
++        snew(dhc->native_lambda_components, dhc->n_lambda_vec);
++        j=0;
++        for(i=0;i<efptNR;i++)
++        {
++            if (fep->separate_dvdl[i])
++            {
++                dhc->native_lambda_components[j]=i;
++                if (fep->init_fep_state >=0 &&
++                    fep->init_fep_state < fep->n_lambda)
++                {
++                    dhc->native_lambda_vec[j]=
++                                fep->all_lambda[i][fep->init_fep_state];
++                }
++                else
++                {
++                    dhc->native_lambda_vec[j]=-1;
++                }
++                j++;
++            }
++        }
 +    }
-     /* add the dhdl's */
-     for (i=0;i<efptNR;i++)
++    else
++    {
++        /* don't allocate the meta-data subblocks for lambda vectors */
++        dhc->native_lambda_vec=NULL;
++        dhc->n_lambda_vec=0;
++        dhc->native_lambda_components=NULL;
++        dhc->lambda_index=-1;
 +    }
++    /* allocate metadata subblocks */
++    snew(dhc->subblock_d, 5 + dhc->n_lambda_vec);
++    snew(dhc->subblock_i, 1 + dhc->n_lambda_vec);
++
++    /* now decide which data to write out */
++    dhc->nlambda=0;
++    dhc->ndhdl=0;
++    dhc->dh_expanded=NULL;
++    dhc->dh_energy=NULL;
++    dhc->dh_pv=NULL;
++
++    /* total number of raw data point collections in the sample */
++    dhc->ndh = 0;
 +
-         if (ir->fepvals->separate_dvdl[i])
 +    {
-             dhc->ndh+=1;
++        gmx_bool bExpanded=FALSE;
++        gmx_bool bEnergy=FALSE;
++        gmx_bool bPV=FALSE;
++        int n_lambda_components=0;
++
++        /* first count the number of states */
++
++        /* add the dhdl's */
++        if (fep->dhdl_derivatives == edhdlderivativesYES)
 +        {
-     }
++            for (i=0;i<efptNR;i++)
++            {
++                if (ir->fepvals->separate_dvdl[i])
++                {
++                    dhc->ndh+=1;
++                    dhc->ndhdl+=1;
++                }
++            }
 +        }
-     /* add the lambdas */
-     dhc->ndh += ir->fepvals->n_lambda;
++        /* add the lambdas */
++        dhc->nlambda = ir->fepvals->lambda_stop_n - ir->fepvals->lambda_start_n;
++        dhc->ndh += dhc->nlambda;
++        /* another compatibility check */
++        if (dhc->start_lambda < 0)
++        {
++            /* include one more for the specification of the state, by lambda or
++               fep_state*/
++            if (ir->expandedvals->elmcmove > elmcmoveNO) {
++                dhc->ndh +=1;
++                bExpanded=TRUE;
++            }
++            /* whether to print energies */
++            if (ir->fepvals->bPrintEnergy) {
++                dhc->ndh += 1;
++                bEnergy=TRUE;
++            }
++            if (ir->epc > epcNO) {
++                dhc->ndh += 1;  /* include pressure-volume work */
++                bPV=TRUE;
++            }
++        }
++        /* allocate them */
++        snew(dhc->dh, dhc->ndh);
++
++        /* now initialize them */
++        /* the order, for now, must match that of the dhdl.xvg file because of
++           how g_energy -odh is implemented */
++        n=0;
++        if (bExpanded)
++        {
++            dhc->dh_expanded=dhc->dh+n;
++            mde_delta_h_init(dhc->dh+n, ir->fepvals->dh_hist_size,
++                             ir->fepvals->dh_hist_spacing, ndhmax,
++                             dhbtEXPANDED, 0, 0, NULL);
++            n++;
++        }
++        if (bEnergy)
++        {
++            dhc->dh_energy=dhc->dh+n;
++            mde_delta_h_init(dhc->dh+n, ir->fepvals->dh_hist_size,
++                             ir->fepvals->dh_hist_spacing, ndhmax,
++                             dhbtEN, 0, 0, NULL);
++            n++;
++        }
++        /* add the dhdl's */
++        n_lambda_components=0;
++        if (fep->dhdl_derivatives == edhdlderivativesYES)
++        {
++            dhc->dh_dhdl = dhc->dh + n;
++            for (i=0;i<efptNR;i++)
++            {
++                if (ir->fepvals->separate_dvdl[i])
++                {
++                    /* we give it init_lambda for compatibility */
++                    mde_delta_h_init(dhc->dh+n, ir->fepvals->dh_hist_size,
++                                     ir->fepvals->dh_hist_spacing, ndhmax,
++                                     dhbtDHDL, n_lambda_components, 1,
++                                     &(fep->init_lambda));
++                    n++;
++                    n_lambda_components++;
++                }
++            }
++        }
++        else
++        {
++            for (i=0;i<efptNR;i++)
++            {
++                if (ir->fepvals->separate_dvdl[i])
++                {
++                    n_lambda_components++; /* count the components */
++                }
++            }
 +
-     if (ir->epc > epcNO) {
-         dhc->ndh += 1;  /* include pressure-volume work */
-     }
++        }
++        /* add the lambdas */
++        dhc->dh_du = dhc->dh + n;
++        snew(lambda_vec, n_lambda_components);
++        for(i=ir->fepvals->lambda_start_n;i<ir->fepvals->lambda_stop_n;i++)
++        {
++            int k=0;
 +
-     snew(dhc->dh, dhc->ndh);
-     for(i=0;i<dhc->ndh;i++)
-     {
-         mde_delta_h_init(dhc->dh+i, ir->fepvals->dh_hist_size,
-                          ir->fepvals->dh_hist_spacing, ndhmax);
++            for(j=0;j<efptNR;j++)
++            {
++                if (ir->fepvals->separate_dvdl[j])
++                {
++                    lambda_vec[k++] = fep->all_lambda[j][i];
++                }
++            }
 +
-                              int bExpanded,
-                              int bPrintEnergy,
-                              int bPressure,
-                              int ndhdl,
-                              int nlambda,
++            mde_delta_h_init(dhc->dh+n, ir->fepvals->dh_hist_size,
++                             ir->fepvals->dh_hist_spacing, ndhmax,
++                             dhbtDH, 0, n_lambda_components, lambda_vec);
++            n++;
++        }
++        sfree(lambda_vec);
++        if (bPV)
++        {
++            dhc->dh_pv=dhc->dh+n;
++            mde_delta_h_init(dhc->dh+n, ir->fepvals->dh_hist_size,
++                             ir->fepvals->dh_hist_spacing, ndhmax,
++                             dhbtPV, 0, 0, NULL);
++            n++;
++        }
 +    }
 +}
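 +
 +/* Illustration only: the dh[] array built above is laid out in the order
 +   required by g_energy -odh, so the expected total is just the sum of the
 +   optional blocks (assumption: the booleans are 0 or 1): */
 +#if 0
 +static int expected_ndh(int bExpanded, int bEnergy,
 +                        int ndhdl, int nlambda, int bPV)
 +{
 +    /* [expanded][energy][dhdl x ndhdl][dU x nlambda][pV] */
 +    return bExpanded + bEnergy + ndhdl + nlambda + bPV;
 +}
 +#endif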
 +
 +/* add a bunch of samples - note fep_state is double to allow for better data storage */
 +void mde_delta_h_coll_add_dh(t_mde_delta_h_coll *dhc,
 +                             double fep_state,
 +                             double energy,
 +                             double pV,
-     int i,n;
 +                             double *dhdl,
 +                             double *foreign_dU,
 +                             double time)
 +{
-     n = 0;
-     if (bExpanded)
++    int i;
 +
 +    if (!dhc->start_time_set)
 +    {
 +        dhc->start_time_set=TRUE;
 +        dhc->start_time=time;
 +    }
 +
-         mde_delta_h_add_dh(dhc->dh+n,fep_state,time);
-         n++;
++    for (i=0;i<dhc->ndhdl;i++)
 +    {
-     if (bPrintEnergy)
++        mde_delta_h_add_dh(dhc->dh_dhdl+i, dhdl[i], time);
 +    }
-         mde_delta_h_add_dh(dhc->dh+n,energy,time);
-         n++;
++    for (i=0;i<dhc->nlambda;i++)
 +    {
-     for (i=0;i<ndhdl;i++)
++        mde_delta_h_add_dh(dhc->dh_du+i, foreign_dU[i], time);
 +    }
-         mde_delta_h_add_dh(dhc->dh+n, dhdl[i], time);
-         n++;
++    if (dhc->dh_pv != NULL)
 +    {
-     for (i=0;i<nlambda;i++)
++        mde_delta_h_add_dh(dhc->dh_pv, pV, time);
 +    }
-         mde_delta_h_add_dh(dhc->dh+n, foreign_dU[i], time);
-         n++;
++    if (dhc->dh_energy != NULL)
 +    {
-     if (bPressure)
++        mde_delta_h_add_dh(dhc->dh_energy,energy,time);
 +    }
-         mde_delta_h_add_dh(dhc->dh+n, pV, time);
-         n++;
++    if (dhc->dh_expanded != NULL)
 +    {
- }
++        mde_delta_h_add_dh(dhc->dh_expanded,fep_state,time);
 +    }
- /* write the data associated with all the du blocks, but not the blocks
-    themselves. Essentially, the metadata.  Or -- is this generated every time?*/
 +
-     add_subblocks_enxblock(blk, 1);
++}
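 +
 +/* Hypothetical per-step usage sketch; the local variable names here are
 +   assumptions, see mdebin for the real call site: */
 +#if 0
 +    mde_delta_h_coll_add_dh(dhc, (double)state->fep_state,
 +                            enerd->term[F_EPOT], pV,
 +                            dhdl, foreign_dU, t);
 +#endif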
 +
++/* write the metadata associated with all the du blocks, and call
++   handle_block to write out all the du blocks */
 +void mde_delta_h_coll_handle_block(t_mde_delta_h_coll *dhc,
 +                                   t_enxframe *fr, int nblock)
 +{
 +    int i;
 +    t_enxblock *blk;
 +
 +    /* add one block with one or two subblocks as the collection's own data */
 +    nblock++;
 +    add_blocks_enxframe(fr, nblock);
 +    blk=fr->block + (nblock-1);
 +
-     dhc->subblock_d[3] = dhc->start_lambda; /* lambda at starttime */
++    /* the lambda vector component subblock is only written when it exists;
++       otherwise the old single-subblock layout is kept for backward
++       compatibility */
++    if (dhc->native_lambda_components!=NULL)
++    {
++        add_subblocks_enxblock(blk, 2);
++    }
++    else
++    {
++        add_subblocks_enxblock(blk, 1);
++    }
 +
 +    dhc->subblock_d[0] = dhc->temperature; /* temperature */
 +    dhc->subblock_d[1] = dhc->start_time; /* time of first sample */
 +    dhc->subblock_d[2] = dhc->delta_time; /* time difference between samples */
-     blk->sub[0].nr=5;
++    dhc->subblock_d[3] = dhc->start_lambda; /* old-style lambda at starttime */
 +    dhc->subblock_d[4] = dhc->delta_lambda; /* lambda diff. between samples */
++    /* set the lambda vector components if they exist */
++    if (dhc->native_lambda_components!=NULL)
++    {
++        for(i=0;i<dhc->n_lambda_vec;i++)
++        {
++            dhc->subblock_d[5+i] = dhc->native_lambda_vec[i];
++        }
++    }
 +    blk->id=enxDHCOLL;
++    blk->sub[0].nr=5 + dhc->n_lambda_vec;
 +    blk->sub[0].type=xdr_datatype_double;
 +    blk->sub[0].dval=dhc->subblock_d;
 +
++    if (dhc->native_lambda_components != NULL)
++    {
++        dhc->subblock_i[0] = dhc->lambda_index;
++        /* set the lambda vector component IDs if they exist */
++        dhc->subblock_i[1] = dhc->n_lambda_vec;
++        for(i=0;i<dhc->n_lambda_vec;i++)
++        {
++            dhc->subblock_i[i+2] = dhc->native_lambda_components[i];
++        }
++        blk->sub[1].nr=2 + dhc->n_lambda_vec;
++        blk->sub[1].type=xdr_datatype_int;
++        blk->sub[1].ival=dhc->subblock_i;
++    }
++
 +    for(i=0;i<dhc->ndh;i++)
 +    {
 +        nblock++;
 +        add_blocks_enxframe(fr, nblock);
 +        blk=fr->block + (nblock-1);
 +
 +        mde_delta_h_handle_block(dhc->dh+i, blk);
 +    }
 +}
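 +
 +/* Illustration only (hypothetical reader): the enxDHCOLL metadata block
 +   written above as it would be unpacked. sub[0] holds five doubles plus
 +   the optional native lambda vector; the optional sub[1] holds the fep
 +   state index, the vector length and the component IDs. */
 +#if 0
 +static void read_dhcoll_block(const t_enxblock *blk,
 +                              double *T, double *t0, double *dt,
 +                              double *lambda0, double *dlambda)
 +{
 +    *T       = blk->sub[0].dval[0];
 +    *t0      = blk->sub[0].dval[1];
 +    *dt      = blk->sub[0].dval[2];
 +    *lambda0 = blk->sub[0].dval[3];
 +    *dlambda = blk->sub[0].dval[4];
 +    /* with blk->nsub == 2, sub[1].ival holds the fep state index,
 +       the lambda vector length, and the component IDs. */
 +}
 +#endif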
 +
 +/* reset the data for a new round */
 +void mde_delta_h_coll_reset(t_mde_delta_h_coll *dhc)
 +{
 +    int i;
 +    for(i=0;i<dhc->ndh;i++)
 +    {
 +        if (dhc->dh[i].written)
 +        {
 +            /* we can now throw away the data */
 +            mde_delta_h_reset(dhc->dh + i);
 +        }
 +    }
 +    dhc->start_time_set=FALSE;
 +}
 +
 +/* set the energyhistory variables to save state */
 +void mde_delta_h_coll_update_energyhistory(t_mde_delta_h_coll *dhc,
 +                                           energyhistory_t *enerhist)
 +{
 +    int i;
 +    if (!enerhist->dht)
 +    {
 +        snew(enerhist->dht, 1);
 +        snew(enerhist->dht->ndh, dhc->ndh);
 +        snew(enerhist->dht->dh, dhc->ndh);
 +        enerhist->dht->nndh=dhc->ndh;
 +    }
 +    else
 +    {
 +        if (enerhist->dht->nndh != dhc->ndh)
 +            gmx_incons("energy history number of delta_h histograms != inputrec's number");
 +    }
 +    for(i=0;i<dhc->ndh;i++)
 +    {
 +        enerhist->dht->dh[i] = dhc->dh[i].dh;
 +        enerhist->dht->ndh[i] = dhc->dh[i].ndh;
 +    }
 +    enerhist->dht->start_time=dhc->start_time;
 +    enerhist->dht->start_lambda=dhc->start_lambda;
 +}
 +
 +
 +
 +/* restore the variables from an energyhistory */
 +void mde_delta_h_coll_restore_energyhistory(t_mde_delta_h_coll *dhc,
 +                                            energyhistory_t *enerhist)
 +{
 +    int i;
 +    unsigned int j;
 +
 +    if (dhc && !enerhist->dht)
 +        gmx_incons("No delta_h histograms in energy history");
 +    if (enerhist->dht->nndh != dhc->ndh)
 +        gmx_incons("energy history number of delta_h histograms != inputrec's number");
 +
 +    for(i=0;i<enerhist->dht->nndh;i++)
 +    {
 +        dhc->dh[i].ndh=enerhist->dht->ndh[i];
 +        for(j=0;j<dhc->dh[i].ndh;j++)
 +        {
 +            dhc->dh[i].dh[j] = enerhist->dht->dh[i][j];
 +        }
 +    }
 +    dhc->start_time=enerhist->dht->start_time;
 +    if (enerhist->dht->start_lambda_set)
 +        dhc->start_lambda=enerhist->dht->start_lambda;
 +    if (dhc->dh[0].ndh > 0)
 +        dhc->start_time_set=TRUE;
 +    else
 +        dhc->start_time_set=FALSE;
 +}
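 +
 +/* Illustration only: update_energyhistory above stores pointers into the
 +   live dh buffers (shallow), while restore copies the saved values back
 +   element by element, so a checkpoint round-trip looks like: */
 +#if 0
 +    mde_delta_h_coll_update_energyhistory(dhc, enerhist);  /* before write */
 +    /* ... checkpoint written, run restarted, checkpoint read ... */
 +    mde_delta_h_coll_restore_energyhistory(dhc, enerhist); /* after read */
 +#endif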
 +
 +
 +
 +
index bb4ffd21ed607e4fac913ce3cd36d513a371ba97,0000000000000000000000000000000000000000..1db14b6d8361c3305edb84f6aed5a338a113a49f
mode 100644,000000..100644
--- /dev/null
@@@ -1,152 -1,0 +1,182 @@@
-     real *dh; /* the raw energy difference data -- actually, store more in here. */
 +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
 + *
 + *                This source code is part of
 + *
 + *                 G   R   O   M   A   C   S
 + *
 + *          GROningen MAchine for Chemical Simulations
 + *
 + *                        VERSION 3.2.0
 + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
 + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 + * Copyright (c) 2001-2004, The GROMACS development team,
 + * check out http://www.gromacs.org for more information.
 +
 + * This program is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU General Public License
 + * as published by the Free Software Foundation; either version 2
 + * of the License, or (at your option) any later version.
 + *
 + * If you want to redistribute modifications, please consider that
 + * scientific software is very special. Version control is crucial -
 + * bugs must be traceable. We will be happy to consider code for
 + * inclusion in the official distribution, but derived work must not
 + * be called official GROMACS. Details are found in the README & COPYING
 + * files - if they are missing, get the official version at www.gromacs.org.
 + *
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the papers on the package - you can find them in the top README file.
 + *
 + * For more info, check our website at http://www.gromacs.org
 + *
 + * And Hey:
 + * Gromacs Runs On Most of All Computer Systems
 + */
 +
 +#ifndef _mdebin_bar_h
 +#define _mdebin_bar_h
 +
 +#ifdef __cplusplus
 +extern "C" {
 +#endif
 +
++
 +/* The functions & data structures here describe writing
 +   energy differences (or their histograms) for use with g_bar */
 +
 +/* Data for one foreign lambda, or derivative. */
 +typedef struct
 +{
-     gmx_bool derivative; /* whether this delta_h contains derivatives */
-     double lambda; /* current lambda */
++    real *dh; /* the raw energy data. */
 +    float *dhf; /* raw difference data -- in floats, for storage. */
 +    unsigned int ndh; /* number of data points */
 +    unsigned int ndhmax; /* the maximum number of points */
 +
 +    int nhist; /* the number of histograms. There can either be
 +                  0 (for no histograms)
 +                  1 (for 'foreign lambda' histograms)
 +                  2 (for derivative histograms: there's
 +                     a 'forward' and 'backward' histogram
 +                     containing the minimum and maximum
 +                     values, respectively). */
 +    int *bin[2]; /* the histogram(s) */
 +    double dx; /* the histogram spacing in kJ/mol; it is the
 +                  same for both histograms */
 +    unsigned int nbins; /* the number of bins in the histograms*/
 +    gmx_large_int_t x0[2]; /* the starting point in units of spacing
 +                              of the histogram */
 +    unsigned int maxbin[2]; /* highest bin number with data */
 +
-     double subblock_d[4]; /* data for an mdebin subblock for I/O. */
-     gmx_large_int_t subblock_l[4]; /* data for an mdebin subblock for I/O.  */
-     int subblock_i[4]; /* data for an mdebin subblock for I/O.  */
++    int type;       /* the block type according to dhbtDH, etc. */
++    int derivative; /* The derivative direction (as an index in the lambda
++                       vector) if this delta_h contains derivatives */
++    double *lambda; /* lambda vector (or NULL if not applicable) */
++    int nlambda;    /* length of the lambda vector */
 +    gmx_bool written;    /* whether this data has already been written out */
 +
-     double subblock_d[5]; /* data for writing an mdebin subblock for I/O */
++    gmx_large_int_t subblock_meta_l[5]; /* metadata for an mdebin subblock for
++                                           I/O: for histogram counts, etc.*/
++    double *subblock_meta_d; /* metadata subblock for I/O, used for
++                                communicating doubles (i.e. the lambda
++                                vector) */
++    int subblock_meta_i[4]; /* metadata subblock for I/O, used for
++                               communicating ints (i.e. derivative indices,
++                               etc.) */
 +} t_mde_delta_h;
 +
 +/* the type definition is in mdebin_bar.h */
 +struct t_mde_delta_h_coll
 +{
 +    t_mde_delta_h *dh; /* the delta h data */
 +    int ndh; /* the number of delta_h structures */
++
++    int nlambda; /* number of bar dU delta_h structures */
++    t_mde_delta_h *dh_du; /* the delta h data (pointer into dh) */
++
++    int ndhdl; /* number of dH/dlambda delta_h structures */
++    t_mde_delta_h *dh_dhdl; /* the dhdl data (pointer into dh) */
++
++    t_mde_delta_h *dh_energy; /* energy output block (pointer into dh) */
++    t_mde_delta_h *dh_pv; /* pV output block (pointer into dh) */
++    t_mde_delta_h *dh_expanded; /* expanded ensemble output block (pointer 
++                                   into dh) */
++
 +    double start_time; /* start time of the current dh collection */
 +    double delta_time; /* time difference between samples */
 +    gmx_bool start_time_set; /* whether the start time has been set */
 +    double start_lambda; /* starting lambda for continuous motion of state*/
 +    double delta_lambda; /* delta lambda, for continuous motion of state */
 +    double temperature; /* the temperature of the samples*/
-                              int bExpanded,
-                              int bPrintEnergy,
-                              int bPressure,
-                              int ndhdl,
-                              int nlambda,
++
++    double *native_lambda_vec; /* The lambda vector describing the current
++                                  lambda state if it is set (NULL otherwise) */
++    int n_lambda_vec; /* the size of the native lambda vector */
++    int *native_lambda_components; /* the native lambda (and by extension,
++                                      foreign lambda) components in terms
++                                      of efptFEP, efptMASS, etc. */
++    int lambda_index; /* the current fep state index (lambda_fep_state) */
++
++    double *subblock_d; /* for writing a metadata mdebin subblock for I/O */
++    int *subblock_i; /* for writing a metadata mdebin subblock for I/O */
++
++    double *lambda_vec_subblock; /* native lambda vector data subblock for
++                                    I/O */
++    int *lambda_index_subblock; /* lambda vector index data subblock for I/O */
 +};
 +
 +
 +
 +/* initialize a collection of delta h histograms/sets
 +    dhc = the collection
 +    ir = the input record */
 +
 +void mde_delta_h_coll_init(t_mde_delta_h_coll *dhc,
 +                           const t_inputrec *ir);
 +
 +/* add a bunch of samples to the delta_h collection
 +    dhc = the collection
 +    fep_state = the current fep state (a double, to allow better data
 +                storage)
 +    energy = the current total energy
 +    pV = the current pressure-volume work
 +    dhdl = the hamiltonian derivatives
 +    foreign_dU = the energy differences to the foreign lambdas:
 +                 from enerd->enerpart_lambda
 +    time = the current simulation time */
 +void mde_delta_h_coll_add_dh(t_mde_delta_h_coll *dhc,
 +                             double fep_state,
 +                             double energy,
 +                             double pV,
 +                             double *dhdl,
 +                             double *foreign_dU,
 +                             double time);
 +
 +/* write the data associated with the du blocks collection as a collection
 +    of mdebin blocks.
 +    dhc = the collection
 +    fr = the enxio frame
 +    nblock = the current number of blocks */
 +void mde_delta_h_coll_handle_block(t_mde_delta_h_coll *dhc,
 +                                   t_enxframe *fr, int nblock);
 +
 +
 +/* reset the collection of delta_h buffers for a new round of
 +   data gathering */
 +void mde_delta_h_coll_reset(t_mde_delta_h_coll *dhc);
 +
 +
 +/* set the energyhistory variables to save state */
 +void mde_delta_h_coll_update_energyhistory(t_mde_delta_h_coll *dhc,
 +                                           energyhistory_t *enerhist);
 +
 +/* restore the variables from an energyhistory */
 +void mde_delta_h_coll_restore_energyhistory(t_mde_delta_h_coll *dhc,
 +                                            energyhistory_t *enerhist);
 +
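 +/* Hypothetical end-to-end usage of the API above (the object names are
 +   assumptions, not the real mdebin call sites):
 +
 +       t_mde_delta_h_coll dhc;
 +       mde_delta_h_coll_init(&dhc, ir);
 +       // every energy-calculation step:
 +       mde_delta_h_coll_add_dh(&dhc, fep_state, epot, pV,
 +                               dhdl, foreign_dU, t);
 +       // every energy-output step:
 +       mde_delta_h_coll_handle_block(&dhc, fr, nblock);
 +       mde_delta_h_coll_reset(&dhc);
 +*/
 +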
 +
 +#ifdef __cplusplus
 +}
 +#endif
 +
 +#endif        /* _mdebin_bar_h */
 +
index 196c439947268084a5befe6d5e45dc7b99f6856f,0000000000000000000000000000000000000000..ae828bb855d4dcac9557c32a00678cfef9c24331
mode 100644,000000..100644
--- /dev/null
@@@ -1,2544 -1,0 +1,2549 @@@
 +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
 + *
 + *
 + *                This source code is part of
 + *
 + *                 G   R   O   M   A   C   S
 + *
 + *          GROningen MAchine for Chemical Simulations
 + *
 + *                        VERSION 3.2.0
 + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
 + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 + * Copyright (c) 2001-2004, The GROMACS development team,
 + * check out http://www.gromacs.org for more information.
 +
 + * This program is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU General Public License
 + * as published by the Free Software Foundation; either version 2
 + * of the License, or (at your option) any later version.
 + *
 + * If you want to redistribute modifications, please consider that
 + * scientific software is very special. Version control is crucial -
 + * bugs must be traceable. We will be happy to consider code for
 + * inclusion in the official distribution, but derived work must not
 + * be called official GROMACS. Details are found in the README & COPYING
 + * files - if they are missing, get the official version at www.gromacs.org.
 + *
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the papers on the package - you can find them in the top README file.
 + *
 + * For more info, check our website at http://www.gromacs.org
 + *
 + * And Hey:
 + * GROwing Monsters And Cloning Shrimps
 + */
 +#ifdef HAVE_CONFIG_H
 +#include <config.h>
 +#endif
 +
 +#include <string.h>
 +#include <time.h>
 +#include <math.h>
 +#include "sysstuff.h"
 +#include "string2.h"
 +#include "network.h"
 +#include "confio.h"
 +#include "copyrite.h"
 +#include "smalloc.h"
 +#include "nrnb.h"
 +#include "main.h"
 +#include "force.h"
 +#include "macros.h"
 +#include "random.h"
 +#include "names.h"
 +#include "gmx_fatal.h"
 +#include "txtdump.h"
 +#include "typedefs.h"
 +#include "update.h"
 +#include "constr.h"
 +#include "vec.h"
 +#include "statutil.h"
 +#include "tgroup.h"
 +#include "mdebin.h"
 +#include "vsite.h"
 +#include "force.h"
 +#include "mdrun.h"
 +#include "md_support.h"
 +#include "domdec.h"
 +#include "partdec.h"
 +#include "trnio.h"
 +#include "mdatoms.h"
 +#include "ns.h"
 +#include "gmx_wallcycle.h"
 +#include "mtop_util.h"
 +#include "gmxfio.h"
 +#include "pme.h"
 +#include "bondf.h"
 +#include "gmx_omp_nthreads.h"
 +
 +
 +#include "gromacs/linearalgebra/mtxio.h"
 +#include "gromacs/linearalgebra/sparsematrix.h"
 +
 +typedef struct {
 +  t_state s;
 +  rvec    *f;
 +  real    epot;
 +  real    fnorm;
 +  real    fmax;
 +  int     a_fmax;
 +} em_state_t;
 +
 +static em_state_t *init_em_state()
 +{
 +  em_state_t *ems;
 +
 +  snew(ems,1);
 +
 +  /* does this need to be here?  Should the array be declared differently (statically) in the state definition? */
 +  snew(ems->s.lambda,efptNR);
 +
 +  return ems;
 +}
 +
 +static void print_em_start(FILE *fplog,t_commrec *cr,gmx_runtime_t *runtime,
 +                           gmx_wallcycle_t wcycle,
 +                           const char *name)
 +{
 +    char buf[STRLEN];
 +
 +    runtime_start(runtime);
 +
 +    sprintf(buf,"Started %s",name);
 +    print_date_and_time(fplog,cr->nodeid,buf,NULL);
 +
 +    wallcycle_start(wcycle,ewcRUN);
 +}
 +static void em_time_end(FILE *fplog,t_commrec *cr,gmx_runtime_t *runtime,
 +                        gmx_wallcycle_t wcycle)
 +{
 +    wallcycle_stop(wcycle,ewcRUN);
 +
 +    runtime_end(runtime);
 +}
 +
 +static void sp_header(FILE *out,const char *minimizer,real ftol,int nsteps)
 +{
 +    fprintf(out,"\n");
 +    fprintf(out,"%s:\n",minimizer);
 +    fprintf(out,"   Tolerance (Fmax)   = %12.5e\n",ftol);
 +    fprintf(out,"   Number of steps    = %12d\n",nsteps);
 +}
 +
 +static void warn_step(FILE *fp,real ftol,gmx_bool bLastStep,gmx_bool bConstrain)
 +{
 +    char buffer[2048];
 +    if (bLastStep)
 +    {
 +        sprintf(buffer,
 +                "\nEnergy minimization reached the maximum number "
 +                "of steps before the forces reached the requested "
 +                "precision Fmax < %g.\n",ftol);
 +    }
 +    else
 +    {
 +        sprintf(buffer,
 +                "\nEnergy minimization has stopped, but the forces have "
 +                "not converged to the requested precision Fmax < %g (which "
 +                "may not be possible for your system). It stopped "
 +                "because the algorithm tried to make a new step whose size "
 +                "was too small, or there was no change in the energy since "
 +                "last step. Either way, we regard the minimization as "
 +                "converged to within the available machine precision, "
 +                "given your starting configuration and EM parameters.\n%s%s",
 +                ftol,
 +                sizeof(real)<sizeof(double) ?
 +                "\nDouble precision normally gives you higher accuracy, but "
 +                "this is often not needed for preparing to run molecular "
 +                "dynamics.\n" :
 +                "",
 +                bConstrain ?
 +                "You might need to increase your constraint accuracy, or turn\n"
 +                "off constraints altogether (set constraints = none in mdp file)\n" :
 +                "");
 +    }
 +    fputs(wrap_lines(buffer, 78, 0, FALSE), fp);
 +}
 +
 +
 +
 +static void print_converged(FILE *fp,const char *alg,real ftol,
 +                          gmx_large_int_t count,gmx_bool bDone,gmx_large_int_t nsteps,
 +                          real epot,real fmax, int nfmax, real fnorm)
 +{
 +  char buf[STEPSTRSIZE];
 +
 +  if (bDone)
 +    fprintf(fp,"\n%s converged to Fmax < %g in %s steps\n",
 +          alg,ftol,gmx_step_str(count,buf));
 +  else if(count<nsteps)
 +    fprintf(fp,"\n%s converged to machine precision in %s steps,\n"
 +               "but did not reach the requested Fmax < %g.\n",
 +          alg,gmx_step_str(count,buf),ftol);
 +  else
 +    fprintf(fp,"\n%s did not converge to Fmax < %g in %s steps.\n",
 +          alg,ftol,gmx_step_str(count,buf));
 +
 +#ifdef GMX_DOUBLE
 +  fprintf(fp,"Potential Energy  = %21.14e\n",epot);
 +  fprintf(fp,"Maximum force     = %21.14e on atom %d\n",fmax,nfmax+1);
 +  fprintf(fp,"Norm of force     = %21.14e\n",fnorm);
 +#else
 +  fprintf(fp,"Potential Energy  = %14.7e\n",epot);
 +  fprintf(fp,"Maximum force     = %14.7e on atom %d\n",fmax,nfmax+1);
 +  fprintf(fp,"Norm of force     = %14.7e\n",fnorm);
 +#endif
 +}
 +
 +static void get_f_norm_max(t_commrec *cr,
 +                         t_grpopts *opts,t_mdatoms *mdatoms,rvec *f,
 +                         real *fnorm,real *fmax,int *a_fmax)
 +{
 +  double fnorm2,*sum;
 +  real fmax2,fmax2_0,fam;
 +  int  la_max,a_max,start,end,i,m,gf;
 +
 +  /* This routine finds the largest force and returns it.
 +   * On parallel machines the global max is taken.
 +   */
 +  fnorm2 = 0;
 +  fmax2 = 0;
 +  la_max = -1;
 +  gf = 0;
 +  start = mdatoms->start;
 +  end   = mdatoms->homenr + start;
 +  if (mdatoms->cFREEZE) {
 +    for(i=start; i<end; i++) {
 +      gf = mdatoms->cFREEZE[i];
 +      fam = 0;
 +      for(m=0; m<DIM; m++)
 +        if (!opts->nFreeze[gf][m])
 +          fam += sqr(f[i][m]);
 +      fnorm2 += fam;
 +      if (fam > fmax2) {
 +        fmax2  = fam;
 +        la_max = i;
 +      }
 +    }
 +  } else {
 +    for(i=start; i<end; i++) {
 +      fam = norm2(f[i]);
 +      fnorm2 += fam;
 +      if (fam > fmax2) {
 +        fmax2  = fam;
 +        la_max = i;
 +      }
 +    }
 +  }
 +
 +  if (la_max >= 0 && DOMAINDECOMP(cr)) {
 +    a_max = cr->dd->gatindex[la_max];
 +  } else {
 +    a_max = la_max;
 +  }
 +  if (PAR(cr)) {
 +    snew(sum,2*cr->nnodes+1);
 +    sum[2*cr->nodeid]   = fmax2;
 +    sum[2*cr->nodeid+1] = a_max;
 +    sum[2*cr->nnodes]   = fnorm2;
 +    gmx_sumd(2*cr->nnodes+1,sum,cr);
 +    fnorm2 = sum[2*cr->nnodes];
 +    /* Determine the global maximum */
 +    for(i=0; i<cr->nnodes; i++) {
 +      if (sum[2*i] > fmax2) {
 +        fmax2 = sum[2*i];
 +        a_max = (int)(sum[2*i+1] + 0.5);
 +      }
 +    }
 +    sfree(sum);
 +  }
 +
 +  if (fnorm)
 +    *fnorm = sqrt(fnorm2);
 +  if (fmax)
 +    *fmax  = sqrt(fmax2);
 +  if (a_fmax)
 +    *a_fmax = a_max;
 +}
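 +
 +/* Illustration only: the parallel reduction above packs each rank's
 +   (fmax^2, atom index) pair into distinct slots of one array, sums it
 +   across ranks with gmx_sumd, then every rank scans the pairs for the
 +   global maximum, starting from its own local maximum. A standalone
 +   sketch of that scan: */
 +#if 0
 +static void scan_global_fmax(const double *sum, int nnodes,
 +                             double *fmax2, int *a_max)
 +{
 +    int i;
 +    for (i = 0; i < nnodes; i++)
 +    {
 +        if (sum[2*i] > *fmax2)
 +        {
 +            *fmax2 = sum[2*i];
 +            *a_max = (int)(sum[2*i+1] + 0.5); /* index was stored as double */
 +        }
 +    }
 +}
 +#endif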
 +
 +static void get_state_f_norm_max(t_commrec *cr,
 +                         t_grpopts *opts,t_mdatoms *mdatoms,
 +                         em_state_t *ems)
 +{
 +  get_f_norm_max(cr,opts,mdatoms,ems->f,&ems->fnorm,&ems->fmax,&ems->a_fmax);
 +}
 +
 +void init_em(FILE *fplog,const char *title,
 +             t_commrec *cr,t_inputrec *ir,
 +             t_state *state_global,gmx_mtop_t *top_global,
 +             em_state_t *ems,gmx_localtop_t **top,
 +             rvec **f,rvec **f_global,
 +             t_nrnb *nrnb,rvec mu_tot,
 +             t_forcerec *fr,gmx_enerdata_t **enerd,
 +             t_graph **graph,t_mdatoms *mdatoms,gmx_global_stat_t *gstat,
 +             gmx_vsite_t *vsite,gmx_constr_t constr,
 +             int nfile,const t_filenm fnm[],
 +             gmx_mdoutf_t **outf,t_mdebin **mdebin)
 +{
 +    int  start,homenr,i;
 +    real dvdlambda;
 +
 +    if (fplog)
 +    {
 +        fprintf(fplog,"Initiating %s\n",title);
 +    }
 +
 +    state_global->ngtc = 0;
 +
 +    /* Initialize lambda variables */
 +    initialize_lambdas(fplog,ir,&(state_global->fep_state),state_global->lambda,NULL);
 +
 +    init_nrnb(nrnb);
 +
 +    if (DOMAINDECOMP(cr))
 +    {
 +        *top = dd_init_local_top(top_global);
 +
 +        dd_init_local_state(cr->dd,state_global,&ems->s);
 +
 +        *f = NULL;
 +
 +        /* Distribute the charge groups over the nodes from the master node */
 +        dd_partition_system(fplog,ir->init_step,cr,TRUE,1,
 +                            state_global,top_global,ir,
 +                            &ems->s,&ems->f,mdatoms,*top,
 +                            fr,vsite,NULL,constr,
 +                            nrnb,NULL,FALSE);
 +        dd_store_state(cr->dd,&ems->s);
 +
 +        if (ir->nstfout)
 +        {
 +            snew(*f_global,top_global->natoms);
 +        }
 +        else
 +        {
 +            *f_global = NULL;
 +        }
 +        *graph = NULL;
 +    }
 +    else
 +    {
 +        snew(*f,top_global->natoms);
 +
 +        /* Just copy the state */
 +        ems->s = *state_global;
 +        snew(ems->s.x,ems->s.nalloc);
 +        snew(ems->f,ems->s.nalloc);
 +        for(i=0; i<state_global->natoms; i++)
 +        {
 +            copy_rvec(state_global->x[i],ems->s.x[i]);
 +        }
 +        copy_mat(state_global->box,ems->s.box);
 +
 +        if (PAR(cr) && ir->eI != eiNM)
 +        {
 +            /* Initialize the particle decomposition and split the topology */
 +            *top = split_system(fplog,top_global,ir,cr);
 +
 +            pd_cg_range(cr,&fr->cg0,&fr->hcg);
 +        }
 +        else
 +        {
 +            *top = gmx_mtop_generate_local_top(top_global,ir);
 +        }
 +        *f_global = *f;
 +
 +        forcerec_set_excl_load(fr,*top,cr);
 +
 +        init_bonded_thread_force_reduction(fr,&(*top)->idef);      
 +        
 +        if (ir->ePBC != epbcNONE && !fr->bMolPBC)
 +        {
 +            *graph = mk_graph(fplog,&((*top)->idef),0,top_global->natoms,FALSE,FALSE);
 +        }
 +        else
 +        {
 +            *graph = NULL;
 +        }
 +
 +        if (PARTDECOMP(cr))
 +        {
 +            pd_at_range(cr,&start,&homenr);
 +            homenr -= start;
 +        }
 +        else
 +        {
 +            start  = 0;
 +            homenr = top_global->natoms;
 +        }
 +        atoms2md(top_global,ir,0,NULL,start,homenr,mdatoms);
 +        update_mdatoms(mdatoms,state_global->lambda[efptFEP]);
 +
 +        if (vsite)
 +        {
 +            set_vsite_top(vsite,*top,mdatoms,cr);
 +        }
 +    }
 +
 +    if (constr)
 +    {
 +        if (ir->eConstrAlg == econtSHAKE &&
 +            gmx_mtop_ftype_count(top_global,F_CONSTR) > 0)
 +        {
 +            gmx_fatal(FARGS,"Can not do energy minimization with %s, use %s\n",
 +                      econstr_names[econtSHAKE],econstr_names[econtLINCS]);
 +        }
 +
 +        if (!DOMAINDECOMP(cr))
 +        {
 +            set_constraints(constr,*top,ir,mdatoms,cr);
 +        }
 +
 +        if (!ir->bContinuation)
 +        {
 +            /* Constrain the starting coordinates */
 +            dvdlambda=0;
 +            constrain(PAR(cr) ? NULL : fplog,TRUE,TRUE,constr,&(*top)->idef,
 +                      ir,NULL,cr,-1,0,mdatoms,
 +                      ems->s.x,ems->s.x,NULL,fr->bMolPBC,ems->s.box,
 +                      ems->s.lambda[efptFEP],&dvdlambda,
 +                      NULL,NULL,nrnb,econqCoord,FALSE,0,0);
 +        }
 +    }
 +
 +    if (PAR(cr))
 +    {
 +        *gstat = global_stat_init(ir);
 +    }
 +
 +    *outf = init_mdoutf(nfile,fnm,0,cr,ir,NULL);
 +
 +    snew(*enerd,1);
 +    init_enerdata(top_global->groups.grps[egcENER].nr,ir->fepvals->n_lambda,
 +                  *enerd);
 +
 +    if (mdebin != NULL)
 +    {
 +        /* Init bin for energy stuff */
 +        *mdebin = init_mdebin((*outf)->fp_ene,top_global,ir,NULL);
 +    }
 +
 +    clear_rvec(mu_tot);
 +    calc_shifts(ems->s.box,fr->shift_vec);
 +}
 +
 +static void finish_em(FILE *fplog,t_commrec *cr,gmx_mdoutf_t *outf,
 +                      gmx_runtime_t *runtime,gmx_wallcycle_t wcycle)
 +{
 +  if (!(cr->duty & DUTY_PME)) {
 +    /* Tell the PME only node to finish */
 +    gmx_pme_send_finish(cr);
 +  }
 +
 +  done_mdoutf(outf);
 +
 +  em_time_end(fplog,cr,runtime,wcycle);
 +}
 +
 +static void swap_em_state(em_state_t *ems1,em_state_t *ems2)
 +{
 +  em_state_t tmp;
 +
 +  tmp   = *ems1;
 +  *ems1 = *ems2;
 +  *ems2 = tmp;
 +}
 +
 +static void copy_em_coords(em_state_t *ems,t_state *state)
 +{
 +    int i;
 +
 +    for(i=0; (i<state->natoms); i++)
 +    {
 +        copy_rvec(ems->s.x[i],state->x[i]);
 +    }
 +}
 +
 +static void write_em_traj(FILE *fplog,t_commrec *cr,
 +                          gmx_mdoutf_t *outf,
 +                          gmx_bool bX,gmx_bool bF,const char *confout,
 +                          gmx_mtop_t *top_global,
 +                          t_inputrec *ir,gmx_large_int_t step,
 +                          em_state_t *state,
 +                          t_state *state_global,rvec *f_global)
 +{
 +    int mdof_flags;
 +
 +    if ((bX || bF || confout != NULL) && !DOMAINDECOMP(cr))
 +    {
 +        copy_em_coords(state,state_global);
 +        f_global = state->f;
 +    }
 +
 +    mdof_flags = 0;
 +    if (bX) { mdof_flags |= MDOF_X; }
 +    if (bF) { mdof_flags |= MDOF_F; }
 +    write_traj(fplog,cr,outf,mdof_flags,
 +               top_global,step,(double)step,
 +               &state->s,state_global,state->f,f_global,NULL,NULL);
 +
 +    if (confout != NULL && MASTER(cr))
 +    {
 +        if (ir->ePBC != epbcNONE && !ir->bPeriodicMols && DOMAINDECOMP(cr))
 +        {
 +            /* Make molecules whole only for confout writing */
 +            do_pbc_mtop(fplog,ir->ePBC,state_global->box,top_global,
 +                        state_global->x);
 +        }
 +
 +        write_sto_conf_mtop(confout,
 +                            *top_global->name,top_global,
 +                            state_global->x,NULL,ir->ePBC,state_global->box);
 +    }
 +}
 +
 +static void do_em_step(t_commrec *cr,t_inputrec *ir,t_mdatoms *md,
 +                       gmx_bool bMolPBC,
 +                       em_state_t *ems1,real a,rvec *f,em_state_t *ems2,
 +                       gmx_constr_t constr,gmx_localtop_t *top,
 +                       t_nrnb *nrnb,gmx_wallcycle_t wcycle,
 +                       gmx_large_int_t count)
 +
 +{
 +    t_state *s1,*s2;
 +    int  i;
 +    int  start,end;
 +    rvec *x1,*x2;
 +    real dvdlambda;
 +
 +    s1 = &ems1->s;
 +    s2 = &ems2->s;
 +
 +    if (DOMAINDECOMP(cr) && s1->ddp_count != cr->dd->ddp_count)
 +    {
 +        gmx_incons("state mismatch in do_em_step");
 +    }
 +
 +    s2->flags = s1->flags;
 +
 +    if (s2->nalloc != s1->nalloc)
 +    {
 +        s2->nalloc = s1->nalloc;
 +        srenew(s2->x,s1->nalloc);
 +        srenew(ems2->f,  s1->nalloc);
 +        if (s2->flags & (1<<estCGP))
 +        {
 +            srenew(s2->cg_p,  s1->nalloc);
 +        }
 +    }
 +  
 +    s2->natoms = s1->natoms;
 +    copy_mat(s1->box,s2->box);
 +    /* Copy free energy state */
 +    for (i=0;i<efptNR;i++)
 +    {
 +        s2->lambda[i] = s1->lambda[i];
 +    }
 +    copy_mat(s1->box,s2->box);
 +
 +    start = md->start;
 +    end   = md->start + md->homenr;
 +
 +    x1 = s1->x;
 +    x2 = s2->x;
 +
 +#pragma omp parallel num_threads(gmx_omp_nthreads_get(emntUpdate))
 +    {
 +        int gf,i,m;
 +
 +        gf = 0;
 +#pragma omp for schedule(static) nowait
 +        for(i=start; i<end; i++)
 +        {
 +            if (md->cFREEZE)
 +            {
 +                gf = md->cFREEZE[i];
 +            }
 +            for(m=0; m<DIM; m++)
 +            {
 +                if (ir->opts.nFreeze[gf][m])
 +                {
 +                    x2[i][m] = x1[i][m];
 +                }
 +                else
 +                {
 +                    x2[i][m] = x1[i][m] + a*f[i][m];
 +                }
 +            }
 +        }
 +
 +        if (s2->flags & (1<<estCGP))
 +        {
 +            /* Copy the CG p vector */
 +            x1 = s1->cg_p;
 +            x2 = s2->cg_p;
 +#pragma omp for schedule(static) nowait
 +            for(i=start; i<end; i++)
 +            {
 +                copy_rvec(x1[i],x2[i]);
 +            }
 +        }
 +        
 +        if (DOMAINDECOMP(cr))
 +        {
 +            s2->ddp_count = s1->ddp_count;
 +            if (s2->cg_gl_nalloc < s1->cg_gl_nalloc)
 +            {
 +#pragma omp barrier
 +                s2->cg_gl_nalloc = s1->cg_gl_nalloc;
 +                srenew(s2->cg_gl,s2->cg_gl_nalloc);
 +#pragma omp barrier
 +            }
 +            s2->ncg_gl = s1->ncg_gl;
 +#pragma omp for schedule(static) nowait
 +            for(i=0; i<s2->ncg_gl; i++)
 +            {
 +                s2->cg_gl[i] = s1->cg_gl[i];
 +            }
 +            s2->ddp_count_cg_gl = s1->ddp_count_cg_gl;
 +        }
 +    }
 +    
 +    if (constr)
 +    {
 +        wallcycle_start(wcycle,ewcCONSTR);
 +        dvdlambda = 0;
 +        constrain(NULL,TRUE,TRUE,constr,&top->idef,   
 +                  ir,NULL,cr,count,0,md,
 +                  s1->x,s2->x,NULL,bMolPBC,s2->box,
 +                  s2->lambda[efptBONDED],&dvdlambda,
 +                  NULL,NULL,nrnb,econqCoord,FALSE,0,0);
 +        wallcycle_stop(wcycle,ewcCONSTR);
 +    }
 +}
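 +
 +/* Illustration only: the per-atom step rule applied above. A frozen
 +   dimension keeps its coordinate; otherwise the position moves a
 +   distance a along the force: */
 +#if 0
 +static void em_step_atom(const rvec x1, const rvec f, real a,
 +                         const ivec nFreeze, rvec x2)
 +{
 +    int m;
 +    for (m = 0; m < DIM; m++)
 +    {
 +        x2[m] = nFreeze[m] ? x1[m] : x1[m] + a*f[m];
 +    }
 +}
 +#endif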
 +
 +static void em_dd_partition_system(FILE *fplog,int step,t_commrec *cr,
 +                                   gmx_mtop_t *top_global,t_inputrec *ir,
 +                                   em_state_t *ems,gmx_localtop_t *top,
 +                                   t_mdatoms *mdatoms,t_forcerec *fr,
 +                                   gmx_vsite_t *vsite,gmx_constr_t constr,
 +                                   t_nrnb *nrnb,gmx_wallcycle_t wcycle)
 +{
 +    /* Repartition the domain decomposition */
 +    wallcycle_start(wcycle,ewcDOMDEC);
 +    dd_partition_system(fplog,step,cr,FALSE,1,
 +                        NULL,top_global,ir,
 +                        &ems->s,&ems->f,
 +                        mdatoms,top,fr,vsite,NULL,constr,
 +                        nrnb,wcycle,FALSE);
 +    dd_store_state(cr->dd,&ems->s);
 +    wallcycle_stop(wcycle,ewcDOMDEC);
 +}
 +
 +static void evaluate_energy(FILE *fplog,gmx_bool bVerbose,t_commrec *cr,
 +                            t_state *state_global,gmx_mtop_t *top_global,
 +                            em_state_t *ems,gmx_localtop_t *top,
 +                            t_inputrec *inputrec,
 +                            t_nrnb *nrnb,gmx_wallcycle_t wcycle,
 +                            gmx_global_stat_t gstat,
 +                            gmx_vsite_t *vsite,gmx_constr_t constr,
 +                            t_fcdata *fcd,
 +                            t_graph *graph,t_mdatoms *mdatoms,
 +                            t_forcerec *fr,rvec mu_tot,
 +                            gmx_enerdata_t *enerd,tensor vir,tensor pres,
 +                            gmx_large_int_t count,gmx_bool bFirst)
 +{
 +  real t;
 +  gmx_bool bNS;
 +  int  nabnsb;
 +  tensor force_vir,shake_vir,ekin;
 +  real dvdlambda,prescorr,enercorr,dvdlcorr;
 +  real terminate=0;
 +
 +  /* Set the time to the initial time, the time does not change during EM */
 +  t = inputrec->init_t;
 +
 +  if (bFirst ||
 +      (DOMAINDECOMP(cr) && ems->s.ddp_count < cr->dd->ddp_count)) {
 +    /* This is the first state, or an old state used before the last neighbor search */
 +    bNS = TRUE;
 +  } else {
 +    bNS = FALSE;
 +    if (inputrec->nstlist > 0) {
 +      bNS = TRUE;
 +    } else if (inputrec->nstlist == -1) {
 +      nabnsb = natoms_beyond_ns_buffer(inputrec,fr,&top->cgs,NULL,ems->s.x);
 +      if (PAR(cr))
 +        gmx_sumi(1,&nabnsb,cr);
 +      bNS = (nabnsb > 0);
 +    }
 +  }
 +
 +  if (vsite)
 +    construct_vsites(fplog,vsite,ems->s.x,nrnb,1,NULL,
 +                   top->idef.iparams,top->idef.il,
 +                   fr->ePBC,fr->bMolPBC,graph,cr,ems->s.box);
 +
 +  if (DOMAINDECOMP(cr)) {
 +    if (bNS) {
 +      /* Repartition the domain decomposition */
 +      em_dd_partition_system(fplog,count,cr,top_global,inputrec,
 +                           ems,top,mdatoms,fr,vsite,constr,
 +                           nrnb,wcycle);
 +    }
 +  }
 +
 +    /* Calc force & energy on new trial position  */
 +    /* do_force always puts the charge groups in the box and shifts again
 +     * We do not unshift, so molecules are always whole in congrad.c
 +     */
 +    do_force(fplog,cr,inputrec,
 +             count,nrnb,wcycle,top,top_global,&top_global->groups,
 +             ems->s.box,ems->s.x,&ems->s.hist,
 +             ems->f,force_vir,mdatoms,enerd,fcd,
 +             ems->s.lambda,graph,fr,vsite,mu_tot,t,NULL,NULL,TRUE,
 +             GMX_FORCE_STATECHANGED | GMX_FORCE_ALLFORCES |
 +             GMX_FORCE_VIRIAL | GMX_FORCE_ENERGY |
 +             (bNS ? GMX_FORCE_NS | GMX_FORCE_DO_LR : 0));
 +
 +    /* Clear the unused shake virial and pressure */
 +    clear_mat(shake_vir);
 +    clear_mat(pres);
 +
 +    /* Communicate stuff when parallel */
 +    if (PAR(cr) && inputrec->eI != eiNM)
 +    {
 +        wallcycle_start(wcycle,ewcMoveE);
 +
 +        global_stat(fplog,gstat,cr,enerd,force_vir,shake_vir,mu_tot,
 +                    inputrec,NULL,NULL,NULL,1,&terminate,
 +                    top_global,&ems->s,FALSE,
 +                    CGLO_ENERGY |
 +                    CGLO_PRESSURE |
 +                    CGLO_CONSTRAINT |
 +                    CGLO_FIRSTITERATE);
 +
 +        wallcycle_stop(wcycle,ewcMoveE);
 +    }
 +
 +    /* Calculate long range corrections to pressure and energy */
 +    calc_dispcorr(fplog,inputrec,fr,count,top_global->natoms,ems->s.box,ems->s.lambda[efptVDW],
 +                  pres,force_vir,&prescorr,&enercorr,&dvdlcorr);
 +    enerd->term[F_DISPCORR] = enercorr;
 +    enerd->term[F_EPOT] += enercorr;
 +    enerd->term[F_PRES] += prescorr;
 +    enerd->term[F_DVDL] += dvdlcorr;
 +
 +  ems->epot = enerd->term[F_EPOT];
 +
 +  if (constr) {
 +    /* Project out the constraint components of the force */
 +    wallcycle_start(wcycle,ewcCONSTR);
 +    dvdlambda = 0;
 +    constrain(NULL,FALSE,FALSE,constr,&top->idef,
 +              inputrec,NULL,cr,count,0,mdatoms,
 +              ems->s.x,ems->f,ems->f,fr->bMolPBC,ems->s.box,
 +              ems->s.lambda[efptBONDED],&dvdlambda,
 +              NULL,&shake_vir,nrnb,econqForceDispl,FALSE,0,0);
 +    if (fr->bSepDVDL && fplog)
 +      fprintf(fplog,sepdvdlformat,"Constraints",t,dvdlambda);
 +    enerd->term[F_DVDL_BONDED] += dvdlambda;
 +    m_add(force_vir,shake_vir,vir);
 +    wallcycle_stop(wcycle,ewcCONSTR);
 +  } else {
 +    copy_mat(force_vir,vir);
 +  }
 +
 +  clear_mat(ekin);
 +  enerd->term[F_PRES] =
 +    calc_pres(fr->ePBC,inputrec->nwall,ems->s.box,ekin,vir,pres);
 +
 +  sum_dhdl(enerd,ems->s.lambda,inputrec->fepvals);
 +
 +    if (EI_ENERGY_MINIMIZATION(inputrec->eI))
 +    {
 +        get_state_f_norm_max(cr,&(inputrec->opts),mdatoms,ems);
 +    }
 +}
 +
 +static double reorder_partsum(t_commrec *cr,t_grpopts *opts,t_mdatoms *mdatoms,
 +                            gmx_mtop_t *mtop,
 +                            em_state_t *s_min,em_state_t *s_b)
 +{
 +  rvec *fm,*fb,*fmg;
 +  t_block *cgs_gl;
 +  int ncg,*cg_gl,*index,c,cg,i,a0,a1,a,gf,m;
 +  double partsum;
 +  unsigned char *grpnrFREEZE;
 +
 +  if (debug)
 +    fprintf(debug,"Doing reorder_partsum\n");
 +
 +  fm = s_min->f;
 +  fb = s_b->f;
 +
 +  cgs_gl = dd_charge_groups_global(cr->dd);
 +  index = cgs_gl->index;
 +
 +  /* Collect fm in a global vector fmg.
 +   * This conflicts with the spirit of domain decomposition,
 +   * but to fully optimize this a much more complicated algorithm is required.
 +   */
 +  snew(fmg,mtop->natoms);
 +
 +  ncg   = s_min->s.ncg_gl;
 +  cg_gl = s_min->s.cg_gl;
 +  i = 0;
 +  for(c=0; c<ncg; c++) {
 +    cg = cg_gl[c];
 +    a0 = index[cg];
 +    a1 = index[cg+1];
 +    for(a=a0; a<a1; a++) {
 +      copy_rvec(fm[i],fmg[a]);
 +      i++;
 +    }
 +  }
 +  gmx_sum(mtop->natoms*3,fmg[0],cr);
 +
 +  /* Now we will determine the part of the sum for the cgs in state s_b */
 +  ncg   = s_b->s.ncg_gl;
 +  cg_gl = s_b->s.cg_gl;
 +  partsum = 0;
 +  i = 0;
 +  gf = 0;
 +  grpnrFREEZE = mtop->groups.grpnr[egcFREEZE];
 +  for(c=0; c<ncg; c++) {
 +    cg = cg_gl[c];
 +    a0 = index[cg];
 +    a1 = index[cg+1];
 +    for(a=a0; a<a1; a++) {
 +      if (mdatoms->cFREEZE && grpnrFREEZE) {
 +        gf = grpnrFREEZE[i];
 +      }
 +      for(m=0; m<DIM; m++) {
 +        if (!opts->nFreeze[gf][m]) {
 +          partsum += (fb[i][m] - fmg[a][m])*fb[i][m];
 +        }
 +      }
 +      i++;
 +    }
 +  }
 +
 +  sfree(fmg);
 +
 +  return partsum;
 +}
 +
 +static real pr_beta(t_commrec *cr,t_grpopts *opts,t_mdatoms *mdatoms,
 +                  gmx_mtop_t *mtop,
 +                  em_state_t *s_min,em_state_t *s_b)
 +{
 +  rvec *fm,*fb;
 +  double sum;
 +  int  gf,i,m;
 +
 +  /* This is just the classical Polak-Ribiere calculation of beta;
 +   * it looks a bit complicated since we take freeze groups into account,
 +   * and might have to sum it in parallel runs.
 +   */
 +
 +  if (!DOMAINDECOMP(cr) ||
 +      (s_min->s.ddp_count == cr->dd->ddp_count &&
 +       s_b->s.ddp_count   == cr->dd->ddp_count)) {
 +    fm = s_min->f;
 +    fb = s_b->f;
 +    sum = 0;
 +    gf = 0;
 +    /* This part of code can be incorrect with DD,
 +     * since the atom ordering in s_b and s_min might differ.
 +     */
 +    for(i=mdatoms->start; i<mdatoms->start+mdatoms->homenr; i++) {
 +      if (mdatoms->cFREEZE)
 +        gf = mdatoms->cFREEZE[i];
 +      for(m=0; m<DIM; m++)
 +        if (!opts->nFreeze[gf][m]) {
 +          sum += (fb[i][m] - fm[i][m])*fb[i][m];
 +        }
 +    }
 +  } else {
 +    /* We need to reorder cgs while summing */
 +    sum = reorder_partsum(cr,opts,mdatoms,mtop,s_min,s_b);
 +  }
 +  if (PAR(cr))
 +    gmx_sumd(1,&sum,cr);
 +
 +  return sum/sqr(s_min->fnorm);
 +}
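 +
 +/* Illustration only: pr_beta above is the textbook Polak-Ribiere formula
 +   beta = sum_i (f_new_i - f_old_i) . f_new_i / |f_old|^2, with the freeze
 +   groups and DD reordering stripped away: */
 +#if 0
 +static double pr_beta_plain(const rvec *f_old, const rvec *f_new,
 +                            int n, real fnorm_old)
 +{
 +    double sum = 0;
 +    int i, m;
 +    for (i = 0; i < n; i++)
 +    {
 +        for (m = 0; m < DIM; m++)
 +        {
 +            sum += (f_new[i][m] - f_old[i][m])*f_new[i][m];
 +        }
 +    }
 +    return sum/((double)fnorm_old*fnorm_old);
 +}
 +#endif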
 +
 +double do_cg(FILE *fplog,t_commrec *cr,
 +             int nfile,const t_filenm fnm[],
 +             const output_env_t oenv, gmx_bool bVerbose,gmx_bool bCompact,
 +             int nstglobalcomm,
 +             gmx_vsite_t *vsite,gmx_constr_t constr,
 +             int stepout,
 +             t_inputrec *inputrec,
 +             gmx_mtop_t *top_global,t_fcdata *fcd,
 +             t_state *state_global,
 +             t_mdatoms *mdatoms,
 +             t_nrnb *nrnb,gmx_wallcycle_t wcycle,
 +             gmx_edsam_t ed,
 +             t_forcerec *fr,
 +             int repl_ex_nst, int repl_ex_nex, int repl_ex_seed,
 +             gmx_membed_t membed,
 +             real cpt_period,real max_hours,
 +             const char *deviceOptions,
 +             unsigned long Flags,
 +             gmx_runtime_t *runtime)
 +{
 +  const char *CG="Polak-Ribiere Conjugate Gradients";
 +
 +  em_state_t *s_min,*s_a,*s_b,*s_c;
 +  gmx_localtop_t *top;
 +  gmx_enerdata_t *enerd;
 +  rvec   *f;
 +  gmx_global_stat_t gstat;
 +  t_graph    *graph;
 +  rvec   *f_global,*p,*sf,*sfm;
 +  double gpa,gpb,gpc,tmp,sum[2],minstep;
 +  real   fnormn;
 +  real   stepsize;
 +  real   a,b,c,beta=0.0;
 +  real   epot_repl=0;
 +  real   pnorm;
 +  t_mdebin   *mdebin;
 +  gmx_bool   converged,foundlower;
 +  rvec   mu_tot;
 +  gmx_bool   do_log=FALSE,do_ene=FALSE,do_x,do_f;
 +  tensor vir,pres;
 +  int    number_steps,neval=0,nstcg=inputrec->nstcgsteep;
 +  gmx_mdoutf_t *outf;
 +  int    i,m,gf,step,nminstep;
 +  real   terminate=0;
 +
 +  step=0;
 +
 +  s_min = init_em_state();
 +  s_a   = init_em_state();
 +  s_b   = init_em_state();
 +  s_c   = init_em_state();
 +
 +  /* Init em and store the local state in s_min */
 +  init_em(fplog,CG,cr,inputrec,
 +          state_global,top_global,s_min,&top,&f,&f_global,
 +          nrnb,mu_tot,fr,&enerd,&graph,mdatoms,&gstat,vsite,constr,
 +          nfile,fnm,&outf,&mdebin);
 +
 +  /* Print to log file */
 +  print_em_start(fplog,cr,runtime,wcycle,CG);
 +
 +  /* Max number of steps */
 +  number_steps=inputrec->nsteps;
 +
 +  if (MASTER(cr))
 +    sp_header(stderr,CG,inputrec->em_tol,number_steps);
 +  if (fplog)
 +    sp_header(fplog,CG,inputrec->em_tol,number_steps);
 +
 +  /* Call the force routine and some auxiliary (neighboursearching etc.) */
 +  /* do_force always puts the charge groups in the box and shifts again
 +   * We do not unshift, so molecules are always whole in congrad.c
 +   */
 +  evaluate_energy(fplog,bVerbose,cr,
 +                state_global,top_global,s_min,top,
 +                inputrec,nrnb,wcycle,gstat,
 +                vsite,constr,fcd,graph,mdatoms,fr,
 +                mu_tot,enerd,vir,pres,-1,TRUE);
 +  where();
 +
 +  if (MASTER(cr)) {
 +    /* Copy stuff to the energy bin for easy printing etc. */
 +    upd_mdebin(mdebin,FALSE,FALSE,(double)step,
 +               mdatoms->tmass,enerd,&s_min->s,inputrec->fepvals,inputrec->expandedvals,s_min->s.box,
 +               NULL,NULL,vir,pres,NULL,mu_tot,constr);
 +
 +    print_ebin_header(fplog,step,step,s_min->s.lambda[efptFEP]);
 +    print_ebin(outf->fp_ene,TRUE,FALSE,FALSE,fplog,step,step,eprNORMAL,
 +               TRUE,mdebin,fcd,&(top_global->groups),&(inputrec->opts));
 +  }
 +  where();
 +
 +  /* Estimate/guess the initial stepsize */
 +  stepsize = inputrec->em_stepsize/s_min->fnorm;
 +
 +  if (MASTER(cr)) {
 +    fprintf(stderr,"   F-max             = %12.5e on atom %d\n",
 +          s_min->fmax,s_min->a_fmax+1);
 +    fprintf(stderr,"   F-Norm            = %12.5e\n",
 +          s_min->fnorm/sqrt(state_global->natoms));
 +    fprintf(stderr,"\n");
 +    /* and copy to the log file too... */
 +    fprintf(fplog,"   F-max             = %12.5e on atom %d\n",
 +          s_min->fmax,s_min->a_fmax+1);
 +    fprintf(fplog,"   F-Norm            = %12.5e\n",
 +          s_min->fnorm/sqrt(state_global->natoms));
 +    fprintf(fplog,"\n");
 +  }
 +  /* Start the loop over CG steps.
 +   * Each successful step is counted, and we continue until
 +   * we either converge or reach the max number of steps.
 +   */
 +  converged = FALSE;
 +  for(step=0; (number_steps<0 || (number_steps>=0 && step<=number_steps)) && !converged;step++) {
 +
 +    /* start taking steps in a new direction
 +     * First time we enter the routine, beta=0, and the direction is
 +     * simply the negative gradient.
 +     */
 +
 +    /* Calculate the new direction in p, and the gradient in this direction, gpa */
 +    p  = s_min->s.cg_p;
 +    sf = s_min->f;
 +    gpa = 0;
 +    gf = 0;
 +    for(i=mdatoms->start; i<mdatoms->start+mdatoms->homenr; i++) {
 +      if (mdatoms->cFREEZE)
 +      gf = mdatoms->cFREEZE[i];
 +      for(m=0; m<DIM; m++) {
 +      if (!inputrec->opts.nFreeze[gf][m]) {
 +        p[i][m] = sf[i][m] + beta*p[i][m];
 +        gpa -= p[i][m]*sf[i][m];
 +        /* f is negative gradient, thus the sign */
 +      } else {
 +          p[i][m] = 0;
 +      }
 +      }
 +    }
 +
 +    /* Sum the gradient along the line across CPUs */
 +    if (PAR(cr))
 +      gmx_sumd(1,&gpa,cr);
 +
 +    /* Calculate the norm of the search vector */
 +    get_f_norm_max(cr,&(inputrec->opts),mdatoms,p,&pnorm,NULL,NULL);
 +
 +    /* Just in case stepsize reaches zero due to numerical precision... */
 +    if(stepsize<=0)
 +      stepsize = inputrec->em_stepsize/pnorm;
 +
 +    /*
 +     * Double check the value of the derivative in the search direction.
 +     * If it is positive it must be due to the old information in the
 +     * CG formula, so just remove that and start over with beta=0.
 +     * This corresponds to a steepest descent step.
 +     */
 +    if(gpa>0) {
 +      beta = 0;
 +      step--; /* Don't count this step since we are restarting */
 +      continue; /* Go back to the beginning of the big for-loop */
 +    }
 +
 +    /* Calculate minimum allowed stepsize, before the average (norm)
 +     * relative change in coordinate is smaller than precision
 +     */
 +    minstep=0;
 +    for (i=mdatoms->start; i<mdatoms->start+mdatoms->homenr; i++) {
 +      for(m=0; m<DIM; m++) {
 +      tmp = fabs(s_min->s.x[i][m]);
 +      if(tmp < 1.0)
 +        tmp = 1.0;
 +      tmp = p[i][m]/tmp;
 +      minstep += tmp*tmp;
 +      }
 +    }
 +    /* Add up from all CPUs */
 +    if(PAR(cr))
 +      gmx_sumd(1,&minstep,cr);
 +
 +    minstep = GMX_REAL_EPS/sqrt(minstep/(3*state_global->natoms));
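 +
 +    /* Derivation note (illustration, not in the original source): the loop
 +     * above accumulates sum = SUM_i (p_i/max(|x_i|,1))^2 over all 3N degrees
 +     * of freedom, so a step of length lambda changes the coordinates by an
 +     * RMS relative amount lambda*sqrt(sum/(3N)). Requiring that change to
 +     * stay above machine precision yields the smallest meaningful step,
 +     * minstep = GMX_REAL_EPS/sqrt(sum/(3N)), computed above.
 +     */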
 +
 +    if(stepsize<minstep) {
 +      converged=TRUE;
 +      break;
 +    }
 +
 +    /* Write coordinates if necessary */
 +    do_x = do_per_step(step,inputrec->nstxout);
 +    do_f = do_per_step(step,inputrec->nstfout);
 +
 +    write_em_traj(fplog,cr,outf,do_x,do_f,NULL,
 +                  top_global,inputrec,step,
 +                  s_min,state_global,f_global);
 +
 +    /* Take a step downhill.
 +     * In theory, we should minimize the function along this direction.
 +     * That is quite possible, but it turns out to take 5-10 function evaluations
 +     * for each line. However, we don't really need to find the exact minimum -
 +     * it is much better to start a new CG step in a modified direction as soon
 +     * as we are close to it. This will save a lot of energy evaluations.
 +     *
 +     * In practice, we just try to take a single step.
 +     * If it worked (i.e. lowered the energy), we increase the stepsize but
 +     * then continue straight to the next CG step without trying to find any minimum.
 +     * If it didn't work (higher energy), there must be a minimum somewhere between
 +     * the old position and the new one.
 +     *
 +     * Due to the finite numerical accuracy, it turns out that it is a good idea
 +     * to even accept a SMALL increase in energy, if the derivative is still downhill.
 +     * This leads to lower final energies in the tests I've done. / Erik
 +     */
 +    s_a->epot = s_min->epot;
 +    a = 0.0;
 +    c = a + stepsize; /* reference position along line is zero */
 +
 +    if (DOMAINDECOMP(cr) && s_min->s.ddp_count < cr->dd->ddp_count) {
 +      em_dd_partition_system(fplog,step,cr,top_global,inputrec,
 +                           s_min,top,mdatoms,fr,vsite,constr,
 +                           nrnb,wcycle);
 +    }
 +
 +    /* Take a trial step (new coords in s_c) */
 +    do_em_step(cr,inputrec,mdatoms,fr->bMolPBC,s_min,c,s_min->s.cg_p,s_c,
 +               constr,top,nrnb,wcycle,-1);
 +
 +    neval++;
 +    /* Calculate energy for the trial step */
 +    evaluate_energy(fplog,bVerbose,cr,
 +                  state_global,top_global,s_c,top,
 +                  inputrec,nrnb,wcycle,gstat,
 +                  vsite,constr,fcd,graph,mdatoms,fr,
 +                  mu_tot,enerd,vir,pres,-1,FALSE);
 +
 +    /* Calc derivative along line */
 +    p  = s_c->s.cg_p;
 +    sf = s_c->f;
 +    gpc=0;
 +    for(i=mdatoms->start; i<mdatoms->start+mdatoms->homenr; i++) {
 +      for(m=0; m<DIM; m++)
 +        gpc -= p[i][m]*sf[i][m];  /* f is negative gradient, thus the sign */
 +    }
 +    /* Sum the gradient along the line across CPUs */
 +    if (PAR(cr))
 +      gmx_sumd(1,&gpc,cr);
 +
 +    /* This is the max amount of increase in energy we tolerate */
 +    tmp=sqrt(GMX_REAL_EPS)*fabs(s_a->epot);
 +
 +    /* Accept the step if the energy is lower, or if it is not significantly higher
 +     * and the line derivative is still negative.
 +     */
 +    if (s_c->epot < s_a->epot || (gpc < 0 && s_c->epot < (s_a->epot + tmp))) {
 +      foundlower = TRUE;
 +      /* Great, we found a better energy. Increase step for next iteration
 +       * if we are still going down, decrease it otherwise
 +       */
 +      if(gpc<0)
 +      stepsize *= 1.618034;  /* The golden section */
 +      else
 +      stepsize *= 0.618034;  /* 1/golden section */
 +    } else {
 +      /* New energy is the same or higher. We will have to do some work
 +       * to find a smaller value in the interval. Take smaller step next time!
 +       */
 +      foundlower = FALSE;
 +      stepsize *= 0.618034;
 +    }
 +
 +    /* OK, if we didn't find a lower value we will have to locate one now - there must
 +     * be one in the interval [a=0,c].
 +     * The same thing is valid here, though: Don't spend dozens of iterations to find
 +     * the line minimum. We try to interpolate based on the derivative at the endpoints,
 +     * and only continue until we find a lower value. In most cases this means 1-2 iterations.
 +     *
 +     * I also have a safeguard for potentially really pathological functions so we never
 +     * take more than 20 steps before we give up ...
 +     *
 +     * If we already found a lower value we just skip this step and continue to the update.
 +     */
 +    if (!foundlower) {
 +      nminstep=0;
 +
 +      do {
 +      /* Select a new trial point.
 +       * If the derivatives at points a & c have different sign we interpolate to zero,
 +       * otherwise just do a bisection.
 +       */
 +      if(gpa<0 && gpc>0)
 +        b = a + gpa*(a-c)/(gpc-gpa);
 +      else
 +        b = 0.5*(a+c);
 +
 +      /* safeguard if interpolation close to machine accuracy causes errors:
 +       * never go outside the interval
 +       */
 +      if(b<=a || b>=c)
 +        b = 0.5*(a+c);
 +
 +      if (DOMAINDECOMP(cr) && s_min->s.ddp_count != cr->dd->ddp_count) {
 +        /* Reload the old state */
 +        em_dd_partition_system(fplog,-1,cr,top_global,inputrec,
 +                               s_min,top,mdatoms,fr,vsite,constr,
 +                               nrnb,wcycle);
 +      }
 +
 +      /* Take a trial step to this new point - new coords in s_b */
 +      do_em_step(cr,inputrec,mdatoms,fr->bMolPBC,s_min,b,s_min->s.cg_p,s_b,
 +               constr,top,nrnb,wcycle,-1);
 +
 +      neval++;
 +      /* Calculate energy for the trial step */
 +      evaluate_energy(fplog,bVerbose,cr,
 +                      state_global,top_global,s_b,top,
 +                      inputrec,nrnb,wcycle,gstat,
 +                      vsite,constr,fcd,graph,mdatoms,fr,
 +                      mu_tot,enerd,vir,pres,-1,FALSE);
 +
 +      /* p does not change within a step, but since the domain decomposition
 +       * might change, we have to use cg_p of s_b here.
 +       */
 +      p  = s_b->s.cg_p;
 +      sf = s_b->f;
 +      gpb=0;
 +      for(i=mdatoms->start; i<mdatoms->start+mdatoms->homenr; i++) {
 +        for(m=0; m<DIM; m++)
 +            gpb -= p[i][m]*sf[i][m];   /* f is negative gradient, thus the sign */
 +      }
 +      /* Sum the gradient along the line across CPUs */
 +      if (PAR(cr))
 +        gmx_sumd(1,&gpb,cr);
 +
 +      if (debug)
 +        fprintf(debug,"CGE: EpotA %f EpotB %f EpotC %f gpb %f\n",
 +                s_a->epot,s_b->epot,s_c->epot,gpb);
 +
 +      epot_repl = s_b->epot;
 +
 +      /* Keep one of the intervals based on the value of the derivative at the new point */
 +      if (gpb > 0) {
 +        /* Replace c endpoint with b */
 +        swap_em_state(s_b,s_c);
 +        c = b;
 +        gpc = gpb;
 +      } else {
 +        /* Replace a endpoint with b */
 +        swap_em_state(s_b,s_a);
 +        a = b;
 +        gpa = gpb;
 +      }
 +
 +      /*
 +       * Stop search as soon as we find a value smaller than the endpoints.
 +       * Never run more than 20 steps, no matter what.
 +       */
 +      nminstep++;
 +      } while ((epot_repl > s_a->epot || epot_repl > s_c->epot) &&
 +             (nminstep < 20));
 +
 +      if (fabs(epot_repl - s_min->epot) < fabs(s_min->epot)*GMX_REAL_EPS ||
 +        nminstep >= 20) {
 +      /* OK. We couldn't find a significantly lower energy.
 +       * If beta==0 this was steepest descent, and then we give up.
 +       * If not, set beta=0 and restart with steepest descent before quitting.
 +         */
 +      if (beta == 0.0) {
 +        /* Converged */
 +        converged = TRUE;
 +        break;
 +      } else {
 +        /* Reset memory before giving up */
 +        beta = 0.0;
 +        continue;
 +      }
 +      }
 +
 +      /* Select min energy state of A & C, put the best in B.
 +       */
 +      if (s_c->epot < s_a->epot) {
 +      if (debug)
 +        fprintf(debug,"CGE: C (%f) is lower than A (%f), moving C to B\n",
 +                s_c->epot,s_a->epot);
 +      swap_em_state(s_b,s_c);
 +      gpb = gpc;
 +      b = c;
 +      } else {
 +      if (debug)
 +        fprintf(debug,"CGE: A (%f) is lower than C (%f), moving A to B\n",
 +                s_a->epot,s_c->epot);
 +      swap_em_state(s_b,s_a);
 +      gpb = gpa;
 +      b = a;
 +      }
 +
 +    } else {
 +      if (debug)
 +      fprintf(debug,"CGE: Found a lower energy %f, moving C to B\n",
 +              s_c->epot);
 +      swap_em_state(s_b,s_c);
 +      gpb = gpc;
 +      b = c;
 +    }
 +
 +    /* new search direction */
 +    /* beta = 0 means forget all memory and restart with steepest descents. */
 +    if (nstcg && ((step % nstcg)==0))
 +      beta = 0.0;
 +    else {
 +      /* s_min->fnorm cannot be zero, because then we would have converged
 +       * and broken out.
 +       */
 +
 +      /* Polak-Ribiere update.
 +       * Change to fnorm2/fnorm2_old for Fletcher-Reeves
 +       */
 +      beta = pr_beta(cr,&inputrec->opts,mdatoms,top_global,s_min,s_b);
 +    }
 +    /* Limit beta to prevent oscillations */
 +    if (fabs(beta) > 5.0)
 +      beta = 0.0;
 +
 +    /* update positions */
 +    swap_em_state(s_min,s_b);
 +    gpa = gpb;
 +
 +    /* Print it if necessary */
 +    if (MASTER(cr)) {
 +      if(bVerbose)
 +      fprintf(stderr,"\rStep %d, Epot=%12.6e, Fnorm=%9.3e, Fmax=%9.3e (atom %d)\n",
 +              step,s_min->epot,s_min->fnorm/sqrt(state_global->natoms),
 +              s_min->fmax,s_min->a_fmax+1);
 +      /* Store the new (lower) energies */
 +      upd_mdebin(mdebin,FALSE,FALSE,(double)step,
 +                 mdatoms->tmass,enerd,&s_min->s,inputrec->fepvals,inputrec->expandedvals,s_min->s.box,
 +                 NULL,NULL,vir,pres,NULL,mu_tot,constr);
 +
 +      do_log = do_per_step(step,inputrec->nstlog);
 +      do_ene = do_per_step(step,inputrec->nstenergy);
 +      if(do_log)
 +          print_ebin_header(fplog,step,step,s_min->s.lambda[efptFEP]);
 +      print_ebin(outf->fp_ene,do_ene,FALSE,FALSE,
 +               do_log ? fplog : NULL,step,step,eprNORMAL,
 +               TRUE,mdebin,fcd,&(top_global->groups),&(inputrec->opts));
 +    }
 +
 +    /* Stop when the maximum force lies below tolerance.
 +     * If we have reached machine precision, converged is already set to true.
 +     */
 +    converged = converged || (s_min->fmax < inputrec->em_tol);
 +
 +  } /* End of the loop */
 +
 +  if (converged)
 +    step--; /* we never took that last step in this case */
 +
 +    if (s_min->fmax > inputrec->em_tol)
 +    {
 +        if (MASTER(cr))
 +        {
 +            warn_step(stderr,inputrec->em_tol,step-1==number_steps,FALSE);
 +            warn_step(fplog ,inputrec->em_tol,step-1==number_steps,FALSE);
 +        }
 +        converged = FALSE;
 +    }
 +
 +  if (MASTER(cr)) {
 +    /* If we printed energy and/or logfile last step (which was the last step)
 +     * we don't have to do it again, but otherwise print the final values.
 +     */
 +    if(!do_log) {
 +      /* Write final value to log since we didn't do anything the last step */
 +      print_ebin_header(fplog,step,step,s_min->s.lambda[efptFEP]);
 +    }
 +    if (!do_ene || !do_log) {
 +      /* Write final energy file entries */
 +      print_ebin(outf->fp_ene,!do_ene,FALSE,FALSE,
 +               !do_log ? fplog : NULL,step,step,eprNORMAL,
 +               TRUE,mdebin,fcd,&(top_global->groups),&(inputrec->opts));
 +    }
 +  }
 +
 +  /* Print some stuff... */
 +  if (MASTER(cr))
 +    fprintf(stderr,"\nwriting lowest energy coordinates.\n");
 +
 +  /* IMPORTANT!
 +   * For accurate normal mode calculation it is imperative that we
 +   * store the last conformation into the full precision binary trajectory.
 +   *
 +   * However, we should only do it if we did NOT already write this step
 +   * above (which we did if do_x or do_f was true).
 +   */
 +  do_x = !do_per_step(step,inputrec->nstxout);
 +  do_f = (inputrec->nstfout > 0 && !do_per_step(step,inputrec->nstfout));
 +
 +  write_em_traj(fplog,cr,outf,do_x,do_f,ftp2fn(efSTO,nfile,fnm),
 +                top_global,inputrec,step,
 +                s_min,state_global,f_global);
 +
 +  fnormn = s_min->fnorm/sqrt(state_global->natoms);
 +
 +  if (MASTER(cr)) {
 +    print_converged(stderr,CG,inputrec->em_tol,step,converged,number_steps,
 +                  s_min->epot,s_min->fmax,s_min->a_fmax,fnormn);
 +    print_converged(fplog,CG,inputrec->em_tol,step,converged,number_steps,
 +                  s_min->epot,s_min->fmax,s_min->a_fmax,fnormn);
 +
 +    fprintf(fplog,"\nPerformed %d energy evaluations in total.\n",neval);
 +  }
 +
 +  finish_em(fplog,cr,outf,runtime,wcycle);
 +
 +  /* To print the actual number of steps we needed somewhere */
 +  runtime->nsteps_done = step;
 +
 +  return 0;
 +} /* That's all folks */
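 +
 +/* Illustrative sketch (not GROMACS code): the trial-point selection used by
 + * the line searches in do_cg() above and do_lbfgs() below. When the line
 + * derivatives at the endpoints bracket a zero (gpa < 0 < gpc), a secant step
 + * estimates where the derivative vanishes; otherwise, and whenever round-off
 + * pushes the estimate outside [a,c], it falls back to bisection. The helper
 + * name is hypothetical.
 + */
 +static double trial_point_sketch(double a, double c, double gpa, double gpc)
 +{
 +    double b;
 +
 +    if (gpa < 0 && gpc > 0)
 +    {
 +        /* Linear interpolation of the derivative to its zero */
 +        b = a + gpa*(a - c)/(gpc - gpa);
 +    }
 +    else
 +    {
 +        b = 0.5*(a + c);
 +    }
 +    /* Safeguard: never leave the bracketing interval */
 +    if (b <= a || b >= c)
 +    {
 +        b = 0.5*(a + c);
 +    }
 +
 +    return b;
 +}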
 +
 +
 +double do_lbfgs(FILE *fplog,t_commrec *cr,
 +                int nfile,const t_filenm fnm[],
 +                const output_env_t oenv, gmx_bool bVerbose,gmx_bool bCompact,
 +                int nstglobalcomm,
 +                gmx_vsite_t *vsite,gmx_constr_t constr,
 +                int stepout,
 +                t_inputrec *inputrec,
 +                gmx_mtop_t *top_global,t_fcdata *fcd,
 +                t_state *state,
 +                t_mdatoms *mdatoms,
 +                t_nrnb *nrnb,gmx_wallcycle_t wcycle,
 +                gmx_edsam_t ed,
 +                t_forcerec *fr,
 +                int repl_ex_nst, int repl_ex_nex, int repl_ex_seed,
 +                gmx_membed_t membed,
 +                real cpt_period,real max_hours,
 +                const char *deviceOptions,
 +                unsigned long Flags,
 +                gmx_runtime_t *runtime)
 +{
 +  static const char *LBFGS="Low-Memory BFGS Minimizer";
 +  em_state_t ems;
 +  gmx_localtop_t *top;
 +  gmx_enerdata_t *enerd;
 +  rvec   *f;
 +  gmx_global_stat_t gstat;
 +  t_graph    *graph;
 +  rvec   *f_global;
 +  int    ncorr,nmaxcorr,point,cp,neval,nminstep;
 +  double stepsize,gpa,gpb,gpc,tmp,minstep;
 +  real   *rho,*alpha,*ff,*xx,*p,*s,*lastx,*lastf,**dx,**dg;
 +  real   *xa,*xb,*xc,*fa,*fb,*fc,*xtmp,*ftmp;
 +  real   a,b,c,maxdelta,delta;
 +  real   diag,Epot0,Epot,EpotA,EpotB,EpotC;
 +  real   dgdx,dgdg,sq,yr,beta;
 +  t_mdebin   *mdebin;
 +  gmx_bool   converged,first;
 +  rvec   mu_tot;
 +  real   fnorm,fmax;
 +  gmx_bool   do_log,do_ene,do_x,do_f,foundlower,*frozen;
 +  tensor vir,pres;
 +  int    start,end,number_steps;
 +  gmx_mdoutf_t *outf;
 +  int    i,k,m,n,nfmax,gf,step;
 +  int    mdof_flags;
 +  /* not used */
 +  real   terminate;
 +
 +  if (PAR(cr))
 +    gmx_fatal(FARGS,"Cannot do parallel L-BFGS Minimization - yet.\n");
++  
++  if (NULL != constr)
++  {
++      gmx_fatal(FARGS,"The combination of constraints and L-BFGS minimization is not implemented. Either do not use constraints, or use another minimizer (e.g. steepest descent).");
++  }
 +
 +  n = 3*state->natoms;
 +  nmaxcorr = inputrec->nbfgscorr;
 +
 +  /* Allocate memory */
 +  /* Use pointers to real so we don't have to loop over both atoms and
 +   * dimensions all the time...
 +   * x/f are allocated as rvec *, so make new xx/ff pointers-to-real
 +   * that point to the same memory.
 +   */
 +  snew(xa,n);
 +  snew(xb,n);
 +  snew(xc,n);
 +  snew(fa,n);
 +  snew(fb,n);
 +  snew(fc,n);
 +  snew(frozen,n);
 +
 +  snew(p,n);
 +  snew(lastx,n);
 +  snew(lastf,n);
 +  snew(rho,nmaxcorr);
 +  snew(alpha,nmaxcorr);
 +
 +  snew(dx,nmaxcorr);
 +  for(i=0;i<nmaxcorr;i++)
 +    snew(dx[i],n);
 +
 +  snew(dg,nmaxcorr);
 +  for(i=0;i<nmaxcorr;i++)
 +    snew(dg[i],n);
 +
 +  step = 0;
 +  neval = 0;
 +
 +  /* Init em */
 +  init_em(fplog,LBFGS,cr,inputrec,
 +          state,top_global,&ems,&top,&f,&f_global,
 +          nrnb,mu_tot,fr,&enerd,&graph,mdatoms,&gstat,vsite,constr,
 +          nfile,fnm,&outf,&mdebin);
 +  /* Do_lbfgs is not completely updated like do_steep and do_cg,
 +   * so we free some memory again.
 +   */
 +  sfree(ems.s.x);
 +  sfree(ems.f);
 +
 +  xx = (real *)state->x;
 +  ff = (real *)f;
 +
 +  start = mdatoms->start;
 +  end   = mdatoms->homenr + start;
 +
 +  /* Print to log file */
 +  print_em_start(fplog,cr,runtime,wcycle,LBFGS);
 +
 +  do_log = do_ene = do_x = do_f = TRUE;
 +
 +  /* Max number of steps */
 +  number_steps=inputrec->nsteps;
 +
 +  /* Create a 3*natoms index to tell whether each degree of freedom is frozen */
 +  gf = 0;
 +  for(i=start; i<end; i++) {
 +    if (mdatoms->cFREEZE)
 +      gf = mdatoms->cFREEZE[i];
 +     for(m=0; m<DIM; m++)
 +       frozen[3*i+m]=inputrec->opts.nFreeze[gf][m];
 +  }
 +  if (MASTER(cr))
 +    sp_header(stderr,LBFGS,inputrec->em_tol,number_steps);
 +  if (fplog)
 +    sp_header(fplog,LBFGS,inputrec->em_tol,number_steps);
 +
 +  if (vsite)
 +    construct_vsites(fplog,vsite,state->x,nrnb,1,NULL,
 +                   top->idef.iparams,top->idef.il,
 +                   fr->ePBC,fr->bMolPBC,graph,cr,state->box);
 +
 +  /* Call the force routine and some auxiliary (neighboursearching etc.) */
 +  /* do_force always puts the charge groups in the box and shifts again
 +   * We do not unshift, so molecules are always whole
 +   */
 +  neval++;
 +  ems.s.x = state->x;
 +  ems.f = f;
 +  evaluate_energy(fplog,bVerbose,cr,
 +                state,top_global,&ems,top,
 +                inputrec,nrnb,wcycle,gstat,
 +                vsite,constr,fcd,graph,mdatoms,fr,
 +                mu_tot,enerd,vir,pres,-1,TRUE);
 +  where();
 +
 +  if (MASTER(cr)) {
 +    /* Copy stuff to the energy bin for easy printing etc. */
 +    upd_mdebin(mdebin,FALSE,FALSE,(double)step,
 +               mdatoms->tmass,enerd,state,inputrec->fepvals,inputrec->expandedvals,state->box,
 +               NULL,NULL,vir,pres,NULL,mu_tot,constr);
 +
 +    print_ebin_header(fplog,step,step,state->lambda[efptFEP]);
 +    print_ebin(outf->fp_ene,TRUE,FALSE,FALSE,fplog,step,step,eprNORMAL,
 +               TRUE,mdebin,fcd,&(top_global->groups),&(inputrec->opts));
 +  }
 +  where();
 +
 +  /* This is the starting energy */
 +  Epot = enerd->term[F_EPOT];
 +
 +  fnorm = ems.fnorm;
 +  fmax  = ems.fmax;
 +  nfmax = ems.a_fmax;
 +
 +  /* Set the initial step.
 +   * Since it will be multiplied by the non-normalized search direction
 +   * vector (force vector the first time), we scale it by the
 +   * norm of the force.
 +   */
 +
 +  if (MASTER(cr)) {
 +    fprintf(stderr,"Using %d BFGS correction steps.\n\n",nmaxcorr);
 +    fprintf(stderr,"   F-max             = %12.5e on atom %d\n",fmax,nfmax+1);
 +    fprintf(stderr,"   F-Norm            = %12.5e\n",fnorm/sqrt(state->natoms));
 +    fprintf(stderr,"\n");
 +    /* and copy to the log file too... */
 +    fprintf(fplog,"Using %d BFGS correction steps.\n\n",nmaxcorr);
 +    fprintf(fplog,"   F-max             = %12.5e on atom %d\n",fmax,nfmax+1);
 +    fprintf(fplog,"   F-Norm            = %12.5e\n",fnorm/sqrt(state->natoms));
 +    fprintf(fplog,"\n");
 +  }
 +
 +  point=0;
 +  for(i=0;i<n;i++)
 +    if(!frozen[i])
 +      dx[point][i] = ff[i];  /* Initial search direction */
 +    else
 +      dx[point][i] = 0;
 +
 +  stepsize = 1.0/fnorm;
 +  converged = FALSE;
 +
 +  /* Start the loop over BFGS steps.
 +   * Each successful step is counted, and we continue until
 +   * we either converge or reach the max number of steps.
 +   */
 +
 +  ncorr=0;
 +
 +  /* Set the gradient from the force */
 +  converged = FALSE;
 +  for(step=0; (number_steps<0 || (number_steps>=0 && step<=number_steps)) && !converged; step++) {
 +
 +    /* Write coordinates if necessary */
 +    do_x = do_per_step(step,inputrec->nstxout);
 +    do_f = do_per_step(step,inputrec->nstfout);
 +
 +    mdof_flags = 0;
 +    if (do_x)
 +    {
 +        mdof_flags |= MDOF_X;
 +    }
 +
 +    if (do_f)
 +    {
 +        mdof_flags |= MDOF_F;
 +    }
 +
 +    write_traj(fplog,cr,outf,mdof_flags,
 +               top_global,step,(real)step,state,state,f,f,NULL,NULL);
 +
 +    /* Do the linesearching in the direction dx[point][0..(n-1)] */
 +
 +    /* pointer to current direction - point=0 first time here */
 +    s=dx[point];
 +
 +    /* calculate line gradient */
 +    for(gpa=0,i=0;i<n;i++)
 +      gpa-=s[i]*ff[i];
 +
 +    /* Calculate minimum allowed stepsize, before the average (norm)
 +     * relative change in coordinate is smaller than precision
 +     */
 +    for(minstep=0,i=0;i<n;i++) {
 +      tmp=fabs(xx[i]);
 +      if(tmp<1.0)
 +      tmp=1.0;
 +      tmp = s[i]/tmp;
 +      minstep += tmp*tmp;
 +    }
 +    minstep = GMX_REAL_EPS/sqrt(minstep/n);
 +
 +    if(stepsize<minstep) {
 +      converged=TRUE;
 +      break;
 +    }
 +
 +    /* Store old forces and coordinates */
 +    for(i=0;i<n;i++) {
 +      lastx[i]=xx[i];
 +      lastf[i]=ff[i];
 +    }
 +    Epot0=Epot;
 +
 +    first=TRUE;
 +
 +    for(i=0;i<n;i++)
 +      xa[i]=xx[i];
 +
 +    /* Take a step downhill.
 +     * In theory, we should minimize the function along this direction.
 +     * That is quite possible, but it turns out to take 5-10 function evaluations
 +     * for each line. However, we don't really need to find the exact minimum -
 +     * it is much better to start a new BFGS step in a modified direction as soon
 +     * as we are close to it. This will save a lot of energy evaluations.
 +     *
 +     * In practice, we just try to take a single step.
 +     * If it worked (i.e. lowered the energy), we increase the stepsize but
 +     * then continue straight to the next BFGS step without trying to find any minimum.
 +     * If it didn't work (higher energy), there must be a minimum somewhere between
 +     * the old position and the new one.
 +     *
 +     * Due to the finite numerical accuracy, it turns out that it is a good idea
 +     * to even accept a SMALL increase in energy, if the derivative is still downhill.
 +     * This leads to lower final energies in the tests I've done. / Erik
 +     */
 +    foundlower=FALSE;
 +    EpotA = Epot0;
 +    a = 0.0;
 +    c = a + stepsize; /* reference position along line is zero */
 +
 +    /* Check stepsize first. We do not allow displacements
 +     * larger than emstep.
 +     */
 +    do {
 +      c = a + stepsize;
 +      maxdelta=0;
 +      for(i=0;i<n;i++) {
 +      delta=c*s[i];
 +      if(delta>maxdelta)
 +        maxdelta=delta;
 +      }
 +      if(maxdelta>inputrec->em_stepsize)
 +      stepsize*=0.1;
 +    } while(maxdelta>inputrec->em_stepsize);
 +
 +    /* Take a trial step */
 +    for (i=0; i<n; i++)
 +      xc[i] = lastx[i] + c*s[i];
 +
 +    neval++;
 +    /* Calculate energy for the trial step */
 +    ems.s.x = (rvec *)xc;
 +    ems.f   = (rvec *)fc;
 +    evaluate_energy(fplog,bVerbose,cr,
 +                  state,top_global,&ems,top,
 +                  inputrec,nrnb,wcycle,gstat,
 +                  vsite,constr,fcd,graph,mdatoms,fr,
 +                  mu_tot,enerd,vir,pres,step,FALSE);
 +    EpotC = ems.epot;
 +
 +    /* Calc derivative along line */
 +    for(gpc=0,i=0; i<n; i++) {
 +      gpc -= s[i]*fc[i];   /* f is negative gradient, thus the sign */
 +    }
 +    /* Sum the gradient along the line across CPUs */
 +    if (PAR(cr))
 +      gmx_sumd(1,&gpc,cr);
 +
 +    /* This is the max amount of increase in energy we tolerate */
 +    tmp=sqrt(GMX_REAL_EPS)*fabs(EpotA);
 +
 +    /* Accept the step if the energy is lower, or if it is not significantly higher
 +     * and the line derivative is still negative.
 +     */
 +    if(EpotC<EpotA || (gpc<0 && EpotC<(EpotA+tmp))) {
 +      foundlower = TRUE;
 +      /* Great, we found a better energy. Increase step for next iteration
 +       * if we are still going down, decrease it otherwise
 +       */
 +      if(gpc<0)
 +      stepsize *= 1.618034;  /* The golden section */
 +      else
 +      stepsize *= 0.618034;  /* 1/golden section */
 +    } else {
 +      /* New energy is the same or higher. We will have to do some work
 +       * to find a smaller value in the interval. Take smaller step next time!
 +       */
 +      foundlower = FALSE;
 +      stepsize *= 0.618034;
 +    }
 +
 +    /* OK, if we didn't find a lower value we will have to locate one now - there must
 +     * be one in the interval [a=0,c].
 +     * The same thing is valid here, though: Don't spend dozens of iterations to find
 +     * the line minimum. We try to interpolate based on the derivative at the endpoints,
 +     * and only continue until we find a lower value. In most cases this means 1-2 iterations.
 +     *
 +     * I also have a safeguard for potentially really pathological functions so we never
 +     * take more than 20 steps before we give up ...
 +     *
 +     * If we already found a lower value we just skip this step and continue to the update.
 +     */
 +
 +    if(!foundlower) {
 +
 +      nminstep=0;
 +      do {
 +      /* Select a new trial point.
 +       * If the derivatives at points a & c have different sign we interpolate to zero,
 +       * otherwise just do a bisection.
 +       */
 +
 +      if(gpa<0 && gpc>0)
 +        b = a + gpa*(a-c)/(gpc-gpa);
 +      else
 +        b = 0.5*(a+c);
 +
 +      /* safeguard if interpolation close to machine accuracy causes errors:
 +       * never go outside the interval
 +       */
 +      if(b<=a || b>=c)
 +        b = 0.5*(a+c);
 +
 +      /* Take a trial step */
 +      for (i=0; i<n; i++)
 +        xb[i] = lastx[i] + b*s[i];
 +
 +      neval++;
 +      /* Calculate energy for the trial step */
 +      ems.s.x = (rvec *)xb;
 +      ems.f   = (rvec *)fb;
 +      evaluate_energy(fplog,bVerbose,cr,
 +                      state,top_global,&ems,top,
 +                      inputrec,nrnb,wcycle,gstat,
 +                      vsite,constr,fcd,graph,mdatoms,fr,
 +                      mu_tot,enerd,vir,pres,step,FALSE);
 +      EpotB = ems.epot;
 +
 +      fnorm = ems.fnorm;
 +
 +      for(gpb=0,i=0; i<n; i++)
 +        gpb -= s[i]*fb[i];   /* f is negative gradient, thus the sign */
 +
 +      /* Sum the gradient along the line across CPUs */
 +      if (PAR(cr))
 +        gmx_sumd(1,&gpb,cr);
 +
 +      /* Keep one of the intervals based on the value of the derivative at the new point */
 +      if(gpb>0) {
 +        /* Replace c endpoint with b */
 +        EpotC = EpotB;
 +        c = b;
 +        gpc = gpb;
 +        /* swap coord pointers b/c */
 +        xtmp = xb;
 +        ftmp = fb;
 +        xb = xc;
 +        fb = fc;
 +        xc = xtmp;
 +        fc = ftmp;
 +      } else {
 +        /* Replace a endpoint with b */
 +        EpotA = EpotB;
 +        a = b;
 +        gpa = gpb;
 +        /* swap coord pointers a/b */
 +        xtmp = xb;
 +        ftmp = fb;
 +        xb = xa;
 +        fb = fa;
 +        xa = xtmp;
 +        fa = ftmp;
 +      }
 +
 +      /*
 +       * Stop search as soon as we find a value smaller than the endpoints,
 +       * or if the tolerance is below machine precision.
 +       * Never run more than 20 steps, no matter what.
 +       */
 +      nminstep++;
 +      } while((EpotB>EpotA || EpotB>EpotC) && (nminstep<20));
 +
 +      if(fabs(EpotB-Epot0)<GMX_REAL_EPS || nminstep>=20) {
 +      /* OK. We couldn't find a significantly lower energy.
 +       * If ncorr==0 this was steepest descent, and then we give up.
 +       * If not, reset memory to restart as steepest descent before quitting.
 +         */
 +      if(ncorr==0) {
 +      /* Converged */
 +        converged=TRUE;
 +        break;
 +      } else {
 +        /* Reset memory */
 +        ncorr=0;
 +        /* Search in gradient direction */
 +        for(i=0;i<n;i++)
 +          dx[point][i]=ff[i];
 +        /* Reset stepsize */
 +        stepsize = 1.0/fnorm;
 +        continue;
 +      }
 +      }
 +
 +      /* Select min energy state of A & C, put the best in xx/ff/Epot
 +       */
 +      if(EpotC<EpotA) {
 +      Epot = EpotC;
 +      /* Use state C */
 +      for(i=0;i<n;i++) {
 +        xx[i]=xc[i];
 +        ff[i]=fc[i];
 +      }
 +      stepsize=c;
 +      } else {
 +      Epot = EpotA;
 +      /* Use state A */
 +      for(i=0;i<n;i++) {
 +        xx[i]=xa[i];
 +        ff[i]=fa[i];
 +      }
 +      stepsize=a;
 +      }
 +
 +    } else {
 +      /* found lower */
 +      Epot = EpotC;
 +      /* Use state C */
 +      for(i=0;i<n;i++) {
 +      xx[i]=xc[i];
 +      ff[i]=fc[i];
 +      }
 +      stepsize=c;
 +    }
 +
 +    /* Update the memory information, and calculate a new
 +     * approximation of the inverse hessian
 +     */
 +
 +    /* Have new data in Epot, xx, ff */
 +    if(ncorr<nmaxcorr)
 +      ncorr++;
 +
 +    for(i=0;i<n;i++) {
 +      dg[point][i]=lastf[i]-ff[i];
 +      dx[point][i]*=stepsize;
 +    }
 +
 +    dgdg=0;
 +    dgdx=0;
 +    for(i=0;i<n;i++) {
 +      dgdg+=dg[point][i]*dg[point][i];
 +      dgdx+=dg[point][i]*dx[point][i];
 +    }
 +
 +    diag=dgdx/dgdg;
 +
 +    rho[point]=1.0/dgdx;
 +    point++;
 +
 +    if(point>=nmaxcorr)
 +      point=0;
 +
 +    /* Update */
 +    for(i=0;i<n;i++)
 +      p[i]=ff[i];
 +
 +    cp=point;
 +
 +    /* Recursive update. First go back over the memory points */
 +    for(k=0;k<ncorr;k++) {
 +      cp--;
 +      if(cp<0)
 +      cp=ncorr-1;
 +
 +      sq=0;
 +      for(i=0;i<n;i++)
 +      sq+=dx[cp][i]*p[i];
 +
 +      alpha[cp]=rho[cp]*sq;
 +
 +      for(i=0;i<n;i++)
 +      p[i] -= alpha[cp]*dg[cp][i];
 +    }
 +
 +    for(i=0;i<n;i++)
 +      p[i] *= diag;
 +
 +    /* And then go forward again */
 +    for(k=0;k<ncorr;k++) {
 +      yr = 0;
 +      for(i=0;i<n;i++)
 +      yr += p[i]*dg[cp][i];
 +
 +      beta = rho[cp]*yr;
 +      beta = alpha[cp]-beta;
 +
 +      for(i=0;i<n;i++)
 +      p[i] += beta*dx[cp][i];
 +
 +      cp++;
 +      if(cp>=ncorr)
 +      cp=0;
 +    }
 +
 +    for(i=0;i<n;i++)
 +      if(!frozen[i])
 +      dx[point][i] = p[i];
 +      else
 +      dx[point][i] = 0;
 +
 +    stepsize=1.0;
 +
 +    /* Test whether the convergence criterion is met */
 +    get_f_norm_max(cr,&(inputrec->opts),mdatoms,f,&fnorm,&fmax,&nfmax);
 +
 +    /* Print it if necessary */
 +    if (MASTER(cr)) {
 +      if(bVerbose)
 +      fprintf(stderr,"\rStep %d, Epot=%12.6e, Fnorm=%9.3e, Fmax=%9.3e (atom %d)\n",
 +              step,Epot,fnorm/sqrt(state->natoms),fmax,nfmax+1);
 +      /* Store the new (lower) energies */
 +      upd_mdebin(mdebin,FALSE,FALSE,(double)step,
 +                 mdatoms->tmass,enerd,state,inputrec->fepvals,inputrec->expandedvals,state->box,
 +                 NULL,NULL,vir,pres,NULL,mu_tot,constr);
 +      do_log = do_per_step(step,inputrec->nstlog);
 +      do_ene = do_per_step(step,inputrec->nstenergy);
 +      if(do_log)
 +          print_ebin_header(fplog,step,step,state->lambda[efptFEP]);
 +      print_ebin(outf->fp_ene,do_ene,FALSE,FALSE,
 +               do_log ? fplog : NULL,step,step,eprNORMAL,
 +               TRUE,mdebin,fcd,&(top_global->groups),&(inputrec->opts));
 +    }
 +
 +    /* Stop when the maximum force lies below tolerance.
 +     * If we have reached machine precision, converged is already set to true.
 +     */
 +
 +    converged = converged || (fmax < inputrec->em_tol);
 +
 +  } /* End of the loop */
 +
 +  if(converged)
 +    step--; /* we never took that last step in this case */
 +
 +    if(fmax>inputrec->em_tol)
 +    {
 +        if (MASTER(cr))
 +        {
 +            warn_step(stderr,inputrec->em_tol,step-1==number_steps,FALSE);
 +            warn_step(fplog ,inputrec->em_tol,step-1==number_steps,FALSE);
 +        }
 +        converged = FALSE;
 +    }
 +
 +  /* If we printed energy and/or logfile last step (which was the last step)
 +   * we don't have to do it again, but otherwise print the final values.
 +   */
 +  if(!do_log) /* Write final value to log since we didn't do anything the last step */
 +    print_ebin_header(fplog,step,step,state->lambda[efptFEP]);
 +  if(!do_ene || !do_log) /* Write final energy file entries */
 +    print_ebin(outf->fp_ene,!do_ene,FALSE,FALSE,
 +             !do_log ? fplog : NULL,step,step,eprNORMAL,
 +             TRUE,mdebin,fcd,&(top_global->groups),&(inputrec->opts));
 +
 +  /* Print some stuff... */
 +  if (MASTER(cr))
 +    fprintf(stderr,"\nwriting lowest energy coordinates.\n");
 +
 +  /* IMPORTANT!
 +   * For accurate normal mode calculation it is imperative that we
 +   * store the last conformation into the full precision binary trajectory.
 +   *
 +   * However, we should only do it if we did NOT already write this step
 +   * above (which we did if do_x or do_f was true).
 +   */
 +  do_x = !do_per_step(step,inputrec->nstxout);
 +  do_f = !do_per_step(step,inputrec->nstfout);
 +  write_em_traj(fplog,cr,outf,do_x,do_f,ftp2fn(efSTO,nfile,fnm),
 +                top_global,inputrec,step,
 +                &ems,state,f);
 +
 +  if (MASTER(cr)) {
 +    print_converged(stderr,LBFGS,inputrec->em_tol,step,converged,
 +                  number_steps,Epot,fmax,nfmax,fnorm/sqrt(state->natoms));
 +    print_converged(fplog,LBFGS,inputrec->em_tol,step,converged,
 +                  number_steps,Epot,fmax,nfmax,fnorm/sqrt(state->natoms));
 +
 +    fprintf(fplog,"\nPerformed %d energy evaluations in total.\n",neval);
 +  }
 +
 +  finish_em(fplog,cr,outf,runtime,wcycle);
 +
 +  /* To print the actual number of steps we needed somewhere */
 +  runtime->nsteps_done = step;
 +
 +  return 0;
 +} /* That's all folks */
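 +
 +/* Illustrative sketch (not GROMACS code): the standard L-BFGS two-loop
 + * recursion that the dx/dg ring-buffer update above implements. It turns the
 + * current force q (the negative gradient) into an approximate inverse-Hessian
 + * product without ever forming the matrix. Here s[k]/y[k] hold the stored
 + * position/gradient differences with s[ncorr-1] the newest, alpha is
 + * caller-provided scratch of length ncorr, and all names are hypothetical.
 + */
 +static void lbfgs_two_loop_sketch(int n, int ncorr,
 +                                  double **s, double **y, const double *rho,
 +                                  double gamma, double *q, double *alpha)
 +{
 +    int    i, k;
 +    double sq, yq, beta;
 +
 +    /* Backward pass over the stored corrections, newest first */
 +    for (k = ncorr - 1; k >= 0; k--)
 +    {
 +        sq = 0;
 +        for (i = 0; i < n; i++)
 +        {
 +            sq += s[k][i]*q[i];
 +        }
 +        alpha[k] = rho[k]*sq;
 +        for (i = 0; i < n; i++)
 +        {
 +            q[i] -= alpha[k]*y[k][i];
 +        }
 +    }
 +
 +    /* Scale by the initial inverse-Hessian estimate, e.g. gamma = s.y/y.y
 +     * (the diag = dgdx/dgdg factor in the code above plays this role).
 +     */
 +    for (i = 0; i < n; i++)
 +    {
 +        q[i] *= gamma;
 +    }
 +
 +    /* Forward pass, oldest correction first */
 +    for (k = 0; k < ncorr; k++)
 +    {
 +        yq = 0;
 +        for (i = 0; i < n; i++)
 +        {
 +            yq += y[k][i]*q[i];
 +        }
 +        beta = rho[k]*yq;
 +        for (i = 0; i < n; i++)
 +        {
 +            q[i] += (alpha[k] - beta)*s[k][i];
 +        }
 +    }
 +}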
 +
 +
 +double do_steep(FILE *fplog,t_commrec *cr,
 +                int nfile, const t_filenm fnm[],
 +                const output_env_t oenv, gmx_bool bVerbose,gmx_bool bCompact,
 +                int nstglobalcomm,
 +                gmx_vsite_t *vsite,gmx_constr_t constr,
 +                int stepout,
 +                t_inputrec *inputrec,
 +                gmx_mtop_t *top_global,t_fcdata *fcd,
 +                t_state *state_global,
 +                t_mdatoms *mdatoms,
 +                t_nrnb *nrnb,gmx_wallcycle_t wcycle,
 +                gmx_edsam_t ed,
 +                t_forcerec *fr,
 +                int repl_ex_nst, int repl_ex_nex, int repl_ex_seed,
 +                gmx_membed_t membed,
 +                real cpt_period,real max_hours,
 +                const char *deviceOptions,
 +                unsigned long Flags,
 +                gmx_runtime_t *runtime)
 +{
 +  const char *SD="Steepest Descents";
 +  em_state_t *s_min,*s_try;
 +  rvec       *f_global;
 +  gmx_localtop_t *top;
 +  gmx_enerdata_t *enerd;
 +  rvec   *f;
 +  gmx_global_stat_t gstat;
 +  t_graph    *graph;
 +  real   stepsize,constepsize;
 +  real   ustep,dvdlambda,fnormn;
 +  gmx_mdoutf_t *outf;
 +  t_mdebin   *mdebin;
 +  gmx_bool   bDone,bAbort,do_x,do_f;
 +  tensor vir,pres;
 +  rvec   mu_tot;
 +  int    nsteps;
 +  int    count=0;
 +  int    steps_accepted=0;
 +  /* not used */
 +  real   terminate=0;
 +
 +  s_min = init_em_state();
 +  s_try = init_em_state();
 +
 +  /* Init em and store the local state in s_try */
 +  init_em(fplog,SD,cr,inputrec,
 +          state_global,top_global,s_try,&top,&f,&f_global,
 +          nrnb,mu_tot,fr,&enerd,&graph,mdatoms,&gstat,vsite,constr,
 +          nfile,fnm,&outf,&mdebin);
 +
 +  /* Print to log file  */
 +  print_em_start(fplog,cr,runtime,wcycle,SD);
 +
 +  /* Set variables for stepsize (in nm). This is the largest
 +   * step that we are going to make in any direction.
 +   */
 +  ustep = inputrec->em_stepsize;
 +  stepsize = 0;
 +
 +  /* Max number of steps  */
 +  nsteps = inputrec->nsteps;
 +
 +  if (MASTER(cr))
 +    /* Print to the screen  */
 +    sp_header(stderr,SD,inputrec->em_tol,nsteps);
 +  if (fplog)
 +    sp_header(fplog,SD,inputrec->em_tol,nsteps);
 +
 +  /**** HERE STARTS THE LOOP ****
 +   * count is the counter for the number of steps
 +   * bDone will be TRUE when the minimization has converged
 +   * bAbort will be TRUE when nsteps steps have been performed or when
 +   * the stepsize becomes smaller than is reasonable for machine precision
 +   */
 +  count  = 0;
 +  bDone  = FALSE;
 +  bAbort = FALSE;
 +  while( !bDone && !bAbort ) {
 +    bAbort = (nsteps >= 0) && (count == nsteps);
 +
 +    /* set new coordinates, except for first step */
 +    if (count > 0) {
 +        do_em_step(cr,inputrec,mdatoms,fr->bMolPBC,
 +                   s_min,stepsize,s_min->f,s_try,
 +                   constr,top,nrnb,wcycle,count);
 +    }
 +
 +    evaluate_energy(fplog,bVerbose,cr,
 +                  state_global,top_global,s_try,top,
 +                  inputrec,nrnb,wcycle,gstat,
 +                  vsite,constr,fcd,graph,mdatoms,fr,
 +                  mu_tot,enerd,vir,pres,count,count==0);
 +
 +    if (MASTER(cr))
 +      print_ebin_header(fplog,count,count,s_try->s.lambda[efptFEP]);
 +
 +    if (count == 0)
 +      s_min->epot = s_try->epot + 1;
 +
 +    /* Print it if necessary  */
 +    if (MASTER(cr)) {
 +      if (bVerbose) {
 +      fprintf(stderr,"Step=%5d, Dmax= %6.1e nm, Epot= %12.5e Fmax= %11.5e, atom= %d%c",
 +              count,ustep,s_try->epot,s_try->fmax,s_try->a_fmax+1,
 +              (s_try->epot < s_min->epot) ? '\n' : '\r');
 +      }
 +
 +      if (s_try->epot < s_min->epot) {
 +      /* Store the new (lower) energies  */
 +      upd_mdebin(mdebin,FALSE,FALSE,(double)count,
 +                 mdatoms->tmass,enerd,&s_try->s,inputrec->fepvals,inputrec->expandedvals,
 +                   s_try->s.box, NULL,NULL,vir,pres,NULL,mu_tot,constr);
 +      print_ebin(outf->fp_ene,TRUE,
 +                 do_per_step(steps_accepted,inputrec->nstdisreout),
 +                 do_per_step(steps_accepted,inputrec->nstorireout),
 +                 fplog,count,count,eprNORMAL,TRUE,
 +                 mdebin,fcd,&(top_global->groups),&(inputrec->opts));
 +      fflush(fplog);
 +      }
 +    }
 +
 +    /* Now if the new energy is smaller than the previous...
 +     * or if this is the first step!
 +     * or if we did random steps!
 +     */
 +
 +    if ( (count==0) || (s_try->epot < s_min->epot) ) {
 +      steps_accepted++;
 +
 +      /* Test whether the convergence criterion is met...  */
 +      bDone = (s_try->fmax < inputrec->em_tol);
 +
 +      /* Copy the arrays for force, positions and energy  */
 +      /* The 'Min' array always holds the coords and forces of the minimal
 +       sampled energy  */
 +      swap_em_state(s_min,s_try);
 +      if (count > 0)
 +      ustep *= 1.2;
 +
 +      /* Write to trn, if necessary */
 +      do_x = do_per_step(steps_accepted,inputrec->nstxout);
 +      do_f = do_per_step(steps_accepted,inputrec->nstfout);
 +      write_em_traj(fplog,cr,outf,do_x,do_f,NULL,
 +                    top_global,inputrec,count,
 +                    s_min,state_global,f_global);
 +    }
 +    else {
 +      /* If energy is not smaller make the step smaller...  */
 +      ustep *= 0.5;
 +
 +      if (DOMAINDECOMP(cr) && s_min->s.ddp_count != cr->dd->ddp_count) {
 +      /* Reload the old state */
 +      em_dd_partition_system(fplog,count,cr,top_global,inputrec,
 +                             s_min,top,mdatoms,fr,vsite,constr,
 +                             nrnb,wcycle);
 +      }
 +    }
 +
 +    /* Determine new step  */
 +    stepsize = ustep/s_min->fmax;
 +
 +    /* Check if stepsize is too small, with 1 nm as a characteristic length */
 +#ifdef GMX_DOUBLE
 +        if (count == nsteps || ustep < 1e-12)
 +#else
 +        if (count == nsteps || ustep < 1e-6)
 +#endif
 +        {
 +            if (MASTER(cr))
 +            {
 +                warn_step(stderr,inputrec->em_tol,count==nsteps,constr!=NULL);
 +                warn_step(fplog ,inputrec->em_tol,count==nsteps,constr!=NULL);
 +            }
 +            bAbort=TRUE;
 +        }
 +
 +    count++;
 +  } /* End of the loop  */
 +
 +  /* Print some stuff...  */
 +  if (MASTER(cr))
 +    fprintf(stderr,"\nwriting lowest energy coordinates.\n");
 +  write_em_traj(fplog,cr,outf,TRUE,inputrec->nstfout,ftp2fn(efSTO,nfile,fnm),
 +              top_global,inputrec,count,
 +              s_min,state_global,f_global);
 +
 +  fnormn = s_min->fnorm/sqrt(state_global->natoms);
 +
 +  if (MASTER(cr)) {
 +    print_converged(stderr,SD,inputrec->em_tol,count,bDone,nsteps,
 +                  s_min->epot,s_min->fmax,s_min->a_fmax,fnormn);
 +    print_converged(fplog,SD,inputrec->em_tol,count,bDone,nsteps,
 +                  s_min->epot,s_min->fmax,s_min->a_fmax,fnormn);
 +  }
 +
 +  finish_em(fplog,cr,outf,runtime,wcycle);
 +
 +  /* To print the actual number of steps we needed somewhere */
 +  inputrec->nsteps=count;
 +
 +  runtime->nsteps_done = count;
 +
 +  return 0;
 +} /* That's all folks */
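 +
 +/* Illustrative sketch (not GROMACS code): the step-size control of
 + * do_steep() above. An accepted (lower-energy) step grows the maximum
 + * displacement ustep by a factor 1.2, a rejected one halves it, and the
 + * actual step scale is ustep/fmax so that the atom feeling the largest
 + * force moves by at most ustep. The helper name is hypothetical.
 + */
 +static double sd_next_stepsize_sketch(int accepted, double *ustep,
 +                                      double fmax)
 +{
 +    *ustep *= accepted ? 1.2 : 0.5;
 +
 +    return *ustep/fmax;
 +}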
 +
 +
 +double do_nm(FILE *fplog,t_commrec *cr,
 +             int nfile,const t_filenm fnm[],
 +             const output_env_t oenv, gmx_bool bVerbose,gmx_bool bCompact,
 +             int nstglobalcomm,
 +             gmx_vsite_t *vsite,gmx_constr_t constr,
 +             int stepout,
 +             t_inputrec *inputrec,
 +             gmx_mtop_t *top_global,t_fcdata *fcd,
 +             t_state *state_global,
 +             t_mdatoms *mdatoms,
 +             t_nrnb *nrnb,gmx_wallcycle_t wcycle,
 +             gmx_edsam_t ed,
 +             t_forcerec *fr,
 +             int repl_ex_nst, int repl_ex_nex, int repl_ex_seed,
 +             gmx_membed_t membed,
 +             real cpt_period,real max_hours,
 +             const char *deviceOptions,
 +             unsigned long Flags,
 +             gmx_runtime_t *runtime)
 +{
 +    const char *NM = "Normal Mode Analysis";
 +    gmx_mdoutf_t *outf;
 +    int        natoms,atom,d;
 +    int        nnodes,node;
 +    rvec       *f_global;
 +    gmx_localtop_t *top;
 +    gmx_enerdata_t *enerd;
 +    rvec       *f;
 +    gmx_global_stat_t gstat;
 +    t_graph    *graph;
 +    real       t,t0,lambda,lam0;
 +    gmx_bool       bNS;
 +    tensor     vir,pres;
 +    rvec       mu_tot;
 +    rvec       *fneg,*dfdx;
 +    gmx_bool       bSparse; /* use sparse matrix storage format */
 +    size_t     sz;
 +    gmx_sparsematrix_t * sparse_matrix = NULL;
 +    real *     full_matrix             = NULL;
 +    em_state_t *   state_work;
 +
 +    /* added with respect to mdrun */
 +    int        i,j,k,row,col;
 +    real       der_range=10.0*sqrt(GMX_REAL_EPS);
 +    real       x_min;
 +    real       fnorm,fmax;
 +
 +    if (constr != NULL)
 +    {
 +        gmx_fatal(FARGS,"Constraints present with Normal Mode Analysis, this combination is not supported");
 +    }
 +
 +    state_work = init_em_state();
 +
 +    /* Init em and store the local state in state_minimum */
 +    init_em(fplog,NM,cr,inputrec,
 +            state_global,top_global,state_work,&top,
 +            &f,&f_global,
 +            nrnb,mu_tot,fr,&enerd,&graph,mdatoms,&gstat,vsite,constr,
 +            nfile,fnm,&outf,NULL);
 +
 +    natoms = top_global->natoms;
 +    snew(fneg,natoms);
 +    snew(dfdx,natoms);
 +
 +#ifndef GMX_DOUBLE
 +    if (MASTER(cr))
 +    {
 +        fprintf(stderr,
 +                "NOTE: This version of Gromacs has been compiled in single precision,\n"
 +                "      which MIGHT not be accurate enough for normal mode analysis.\n"
 +                "      Gromacs now uses sparse matrix storage, so the memory requirements\n"
 +                "      are fairly modest even if you recompile in double precision.\n\n");
 +    }
 +#endif
 +
 +    /* Check if we can/should use sparse storage format.
 +     *
 +     * Sparse format is only useful when the Hessian itself is sparse, which it
 +     * will be when we use a cutoff.
 +     * For small systems (n<1000) it is easier to always use full matrix format, though.
 +     */
 +    if(EEL_FULL(fr->eeltype) || fr->rlist==0.0)
 +    {
 +        fprintf(stderr,"Non-cutoff electrostatics used, forcing full Hessian format.\n");
 +        bSparse = FALSE;
 +    }
 +    else if(top_global->natoms < 1000)
 +    {
 +        fprintf(stderr,"Small system size (N=%d), using full Hessian format.\n",top_global->natoms);
 +        bSparse = FALSE;
 +    }
 +    else
 +    {
 +        fprintf(stderr,"Using compressed symmetric sparse Hessian format.\n");
 +        bSparse = TRUE;
 +    }
 +
 +    sz = DIM*top_global->natoms;
 +
 +    fprintf(stderr,"Allocating Hessian memory...\n\n");
 +
 +    if(bSparse)
 +    {
 +        sparse_matrix=gmx_sparsematrix_init(sz);
 +        sparse_matrix->compressed_symmetric = TRUE;
 +    }
 +    else
 +    {
 +        snew(full_matrix,sz*sz);
 +    }
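 +
 +    /* Size check (illustration, not in the original source): the full matrix
 +     * holds sz*sz = (3N)^2 reals, so N = 1000 atoms already means
 +     * 3000^2 = 9e6 entries (~36 MB in single precision), growing as N^2 --
 +     * which is why the sparse format above is preferred for larger
 +     * cutoff-based systems.
 +     */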
 +
 +    /* Initial values */
 +    t0           = inputrec->init_t;
 +    lam0         = inputrec->fepvals->init_lambda;
 +    t            = t0;
 +    lambda       = lam0;
 +
 +    init_nrnb(nrnb);
 +
 +    where();
 +
 +    /* Write start time and temperature */
 +    print_em_start(fplog,cr,runtime,wcycle,NM);
 +
 +    /* fudge nr of steps to nr of atoms */
 +    inputrec->nsteps = natoms*2;
 +
 +    if (MASTER(cr))
 +    {
 +        fprintf(stderr,"starting normal mode calculation '%s'\n%d steps.\n\n",
 +                *(top_global->name),(int)inputrec->nsteps);
 +    }
 +
 +    nnodes = cr->nnodes;
 +
 +    /* Make evaluate_energy do a single node force calculation */
 +    cr->nnodes = 1;
 +    evaluate_energy(fplog,bVerbose,cr,
 +                    state_global,top_global,state_work,top,
 +                    inputrec,nrnb,wcycle,gstat,
 +                    vsite,constr,fcd,graph,mdatoms,fr,
 +                    mu_tot,enerd,vir,pres,-1,TRUE);
 +    cr->nnodes = nnodes;
 +
 +    /* if forces are not small, warn user */
 +    get_state_f_norm_max(cr,&(inputrec->opts),mdatoms,state_work);
 +
 +    if (MASTER(cr))
 +    {
 +        fprintf(stderr,"Maximum force:%12.5e\n",state_work->fmax);
 +        if (state_work->fmax > 1.0e-3)
 +        {
 +            fprintf(stderr,"Maximum force probably not small enough to");
 +            fprintf(stderr," ensure that you are in an \nenergy well. ");
 +            fprintf(stderr,"Be aware that negative eigenvalues may occur");
 +            fprintf(stderr," when the\nresulting matrix is diagonalized.\n");
 +        }
 +    }
 +
 +    /***********************************************************
 +     *
 +     *      Loop over all pairs in matrix
 +     *
 +     *      do_force called twice. Once with positive and
 +     *      once with negative displacement
 +     *
 +     ************************************************************/
 +
 +    /* Steps are divided one by one over the nodes */
 +    for(atom=cr->nodeid; atom<natoms; atom+=nnodes)
 +    {
 +
 +        for (d=0; d<DIM; d++)
 +        {
 +            x_min = state_work->s.x[atom][d];
 +
 +            state_work->s.x[atom][d] = x_min - der_range;
 +
 +            /* Make evaluate_energy do a single node force calculation */
 +            cr->nnodes = 1;
 +            evaluate_energy(fplog,bVerbose,cr,
 +                            state_global,top_global,state_work,top,
 +                            inputrec,nrnb,wcycle,gstat,
 +                            vsite,constr,fcd,graph,mdatoms,fr,
 +                            mu_tot,enerd,vir,pres,atom*2,FALSE);
 +
 +            for(i=0; i<natoms; i++)
 +            {
 +                copy_rvec(state_work->f[i], fneg[i]);
 +            }
 +
 +            state_work->s.x[atom][d] = x_min + der_range;
 +
 +            evaluate_energy(fplog,bVerbose,cr,
 +                            state_global,top_global,state_work,top,
 +                            inputrec,nrnb,wcycle,gstat,
 +                            vsite,constr,fcd,graph,mdatoms,fr,
 +                            mu_tot,enerd,vir,pres,atom*2+1,FALSE);
 +            cr->nnodes = nnodes;
 +
 +            /* x is restored to original */
 +            state_work->s.x[atom][d] = x_min;
 +
 +            for(j=0; j<natoms; j++)
 +            {
 +                for (k=0; (k<DIM); k++)
 +                {
 +                    dfdx[j][k] =
 +                        -(state_work->f[j][k] - fneg[j][k])/(2*der_range);
 +                }
 +            }
 +
 +            if (!MASTER(cr))
 +            {
 +#ifdef GMX_MPI
 +#ifdef GMX_DOUBLE
 +#define mpi_type MPI_DOUBLE
 +#else
 +#define mpi_type MPI_FLOAT
 +#endif
 +                MPI_Send(dfdx[0],natoms*DIM,mpi_type,MASTERNODE(cr),cr->nodeid,
 +                         cr->mpi_comm_mygroup);
 +#endif
 +            }
 +            else
 +            {
 +                for(node=0; (node<nnodes && atom+node<natoms); node++)
 +                {
 +                    if (node > 0)
 +                    {
 +#ifdef GMX_MPI
 +                        MPI_Status stat;
 +                        MPI_Recv(dfdx[0],natoms*DIM,mpi_type,node,node,
 +                                 cr->mpi_comm_mygroup,&stat);
 +#undef mpi_type
 +#endif
 +                    }
 +
 +                    row = (atom + node)*DIM + d;
 +
 +                    for(j=0; j<natoms; j++)
 +                    {
 +                        for(k=0; k<DIM; k++)
 +                        {
 +                            col = j*DIM + k;
 +
 +                            if (bSparse)
 +                            {
 +                                if (col >= row && dfdx[j][k] != 0.0)
 +                                {
 +                                    gmx_sparsematrix_increment_value(sparse_matrix,
 +                                                                     row,col,dfdx[j][k]);
 +                                }
 +                            }
 +                            else
 +                            {
 +                                full_matrix[row*sz+col] = dfdx[j][k];
 +                            }
 +                        }
 +                    }
 +                }
 +            }
 +
 +            if (bVerbose && fplog)
 +            {
 +                fflush(fplog);
 +            }
 +        }
 +        /* write progress */
 +        if (MASTER(cr) && bVerbose)
 +        {
 +            fprintf(stderr,"\rFinished step %d out of %d",
 +                    min(atom+nnodes,natoms),natoms);
 +            fflush(stderr);
 +        }
 +    }
 +
 +    if (MASTER(cr))
 +    {
 +        fprintf(stderr,"\n\nWriting Hessian...\n");
 +        gmx_mtxio_write(ftp2fn(efMTX,nfile,fnm),sz,sz,full_matrix,sparse_matrix);
 +    }
 +
 +    finish_em(fplog,cr,outf,runtime,wcycle);
 +
 +    runtime->nsteps_done = natoms*2;
 +
 +    return 0;
 +}
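 +
 +/* Illustrative sketch (not GROMACS code): the central-difference Hessian
 + * element formed in the do_nm() loop above. With f = -dE/dx, the second
 + * derivative d2E/(dx_row dx_col) is approximated as
 + * -(f_col(x_row + h) - f_col(x_row - h))/(2h), which is exactly the dfdx
 + * expression in the loop, with h = der_range. The helper name is
 + * hypothetical.
 + */
 +static double nm_hessian_element_sketch(double f_plus, double f_minus,
 +                                        double h)
 +{
 +    return -(f_plus - f_minus)/(2.0*h);
 +}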
 +
index 09e53c24b3425da48727e3677f425b92100f16a1,0000000000000000000000000000000000000000..27ac6045c005580d195faf174e665b795d2abdf8
mode 100644,000000..100644
--- /dev/null
@@@ -1,671 -1,0 +1,673 @@@
 +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
 + *
 + *
 + *                This source code is part of
 + *
 + *                 G   R   O   M   A   C   S
 + *
 + *          GROningen MAchine for Chemical Simulations
 + *
 + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
 + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 + * Copyright (c) 2001-2012, The GROMACS development team,
 + * check out http://www.gromacs.org for more information.
 + *
 + * This program is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU General Public License
 + * as published by the Free Software Foundation; either version 2
 + * of the License, or (at your option) any later version.
 + *
 + * If you want to redistribute modifications, please consider that
 + * scientific software is very special. Version control is crucial -
 + * bugs must be traceable. We will be happy to consider code for
 + * inclusion in the official distribution, but derived work must not
 + * be called official GROMACS. Details are found in the README & COPYING
 + * files - if they are missing, get the official version at www.gromacs.org.
 + *
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the papers on the package - you can find them in the top README file.
 + *
 + * For more info, check our website at http://www.gromacs.org
 + *
 + * And Hey:
 + * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon
 + */
 +
 +#include <stdlib.h>
 +#include <assert.h>
 +
 +#if defined(_MSVC)
 +#include <limits>
 +#endif
 +
 +#include "types/simple.h" 
 +#include "types/nbnxn_pairlist.h"
 +#include "types/nb_verlet.h"
 +#include "types/ishift.h"
 +#include "types/force_flags.h"
 +#include "../nbnxn_consts.h"
 +
 +#ifdef TMPI_ATOMICS
 +#include "thread_mpi/atomic.h"
 +#endif
 +
 +#include "nbnxn_cuda_types.h"
 +#include "../../gmxlib/cuda_tools/cudautils.cuh"
 +#include "nbnxn_cuda.h"
 +#include "nbnxn_cuda_data_mgmt.h"
 +
 +
 +/*! Texture reference for nonbonded parameters; bound to cu_nbparam_t.nbfp*/
 +texture<float, 1, cudaReadModeElementType> tex_nbfp;
 +
 +/*! Texture reference for Ewald coulomb force table; bound to cu_nbparam_t.coulomb_tab */
 +texture<float, 1, cudaReadModeElementType> tex_coulomb_tab;
 +
 +/* Convenience defines */
 +#define NCL_PER_SUPERCL         (NBNXN_GPU_NCLUSTER_PER_SUPERCLUSTER)
 +#define CL_SIZE                 (NBNXN_GPU_CLUSTER_SIZE)
 +
 +/***** The kernels come here *****/
 +#include "nbnxn_cuda_kernel_utils.cuh"
 +
 +/* Generate all combinations of kernels through multiple inclusion:
 +   F, F + E, F + prune, F + E + prune. */
 +/** Force only **/
 +#include "nbnxn_cuda_kernels.cuh"
 +/** Force & energy **/
 +#define CALC_ENERGIES
 +#include "nbnxn_cuda_kernels.cuh"
 +#undef CALC_ENERGIES
 +
 +/*** Pair-list pruning kernels ***/
 +/** Force only **/
 +#define PRUNE_NBL
 +#include "nbnxn_cuda_kernels.cuh"
 +/** Force & energy **/
 +#define CALC_ENERGIES
 +#include "nbnxn_cuda_kernels.cuh"
 +#undef CALC_ENERGIES
 +#undef PRUNE_NBL
 +
 +/*! Nonbonded kernel function pointer type */
 +typedef void (*nbnxn_cu_kfunc_ptr_t)(const cu_atomdata_t,
 +                                     const cu_nbparam_t,
 +                                     const cu_plist_t,
 +                                     bool);
 +
 +/*********************************/
 +
 +/* XXX always/never run the energy/pruning kernels -- only for benchmarking purposes */
 +static bool always_ener  = (getenv("GMX_GPU_ALWAYS_ENER") != NULL);
 +static bool never_ener   = (getenv("GMX_GPU_NEVER_ENER") != NULL);
 +static bool always_prune = (getenv("GMX_GPU_ALWAYS_PRUNE") != NULL);
 +
 +
 +/* Bit-pattern used for polling-based GPU synchronization. It is used as a float
 + * and corresponds to having the exponent set to the maximum (127 -- single
 + * precision) and the mantissa to 0.
 + */
 +static unsigned int poll_wait_pattern = (0x7FU << 23);
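 +/* Worked example: in IEEE-754 single precision the 32-bit word is laid
 + * out as [1 sign | 8 exponent | 23 mantissa] bits, so (0x7FU << 23) puts
 + * 0x7F (127) into the exponent field with a zero mantissa, i.e. the word
 + * 0x3F800000.
 + */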
 +
 +/*! Returns the number of blocks to be used for the nonbonded GPU kernel. */
 +static inline int calc_nb_kernel_nblock(int nwork_units, cuda_dev_info_t *dinfo)
 +{
 +    int max_grid_x_size;
 +
 +    assert(dinfo);
 +
 +    max_grid_x_size = dinfo->prop.maxGridSize[0];
 +
 +    /* do we exceed the grid x dimension limit? */
 +    if (nwork_units > max_grid_x_size)
 +    {
 +        gmx_fatal(FARGS, "Watch out, the system is too large to simulate!\n"
 +                  "The number of nonbonded work units (=number of super-clusters) exceeds the "
 +                  "maximum grid size in x dimension (%d > %d)!", nwork_units, max_grid_x_size);
 +    }
 +
 +    return nwork_units;
 +}
 +
 +
 +/* Constant arrays listing all kernel function pointers and enabling selection
 +   of a kernel in an elegant manner. */
 +
 +static const int nEnergyKernelTypes = 2; /* 0 - no energy, 1 - energy */
 +static const int nPruneKernelTypes  = 2; /* 0 - no prune, 1 - prune */
 +
 +/* Default kernels */
 +static const nbnxn_cu_kfunc_ptr_t
 +nb_default_kfunc_ptr[eelCuNR][nEnergyKernelTypes][nPruneKernelTypes] =
 +{
 +    { { k_nbnxn_ewald,              k_nbnxn_ewald_prune },
 +      { k_nbnxn_ewald_ener,         k_nbnxn_ewald_ener_prune } },
 +    { { k_nbnxn_ewald_twin,         k_nbnxn_ewald_twin_prune },
 +      { k_nbnxn_ewald_twin_ener,    k_nbnxn_ewald_twin_ener_prune } },
 +    { { k_nbnxn_rf,                 k_nbnxn_rf_prune },
 +      { k_nbnxn_rf_ener,            k_nbnxn_rf_ener_prune } },
 +    { { k_nbnxn_cutoff,             k_nbnxn_cutoff_prune },
 +      { k_nbnxn_cutoff_ener,        k_nbnxn_cutoff_ener_prune } },
 +};
 +
 +/* Legacy kernels */
 +static const nbnxn_cu_kfunc_ptr_t
 +nb_legacy_kfunc_ptr[eelCuNR][nEnergyKernelTypes][nPruneKernelTypes] =
 +{
 +    { { k_nbnxn_ewald_legacy,           k_nbnxn_ewald_prune_legacy },
 +      { k_nbnxn_ewald_ener_legacy,      k_nbnxn_ewald_ener_prune_legacy } },
 +    { { k_nbnxn_ewald_twin_legacy,      k_nbnxn_ewald_twin_prune_legacy },
 +      { k_nbnxn_ewald_twin_ener_legacy, k_nbnxn_ewald_twin_ener_prune_legacy } },
 +    { { k_nbnxn_rf_legacy,              k_nbnxn_rf_prune_legacy },
 +      { k_nbnxn_rf_ener_legacy,         k_nbnxn_rf_ener_prune_legacy } },
 +    { { k_nbnxn_cutoff_legacy,          k_nbnxn_cutoff_prune_legacy },
 +      { k_nbnxn_cutoff_ener_legacy,     k_nbnxn_cutoff_ener_prune_legacy } },
 +};
 +
 +/*! Return a pointer to the kernel version to be executed at the current step. */
 +static inline nbnxn_cu_kfunc_ptr_t select_nbnxn_kernel(int kver, int eeltype,
 +                                                       bool bDoEne, bool bDoPrune)
 +{
 +    assert(kver < eNbnxnCuKNR);
 +    assert(eeltype < eelCuNR);
 +
 +    if (NBNXN_KVER_LEGACY(kver))
 +    {
 +        return nb_legacy_kfunc_ptr[eeltype][bDoEne][bDoPrune];
 +    }
 +    else
 +    {
 +        return nb_default_kfunc_ptr[eeltype][bDoEne][bDoPrune];
 +    }
 +}
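 +
 +/* For example, with a default (non-legacy) kernel version and
 + * reaction-field electrostatics, bDoEne == true and bDoPrune == false
 + * select the [1][0] entry of the reaction-field row in the table above,
 + * i.e. k_nbnxn_rf_ener: forces and energies, no pair-list pruning.
 + */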
 +
 +/*! Calculates the amount of shared memory required for kernel version in use. */
 +static inline int calc_shmem_required(int kver)
 +{
 +    int shmem;
 +
 +    /* size of shmem (force-buffers/xq/atom type preloading) */
 +    if (NBNXN_KVER_LEGACY(kver))
 +    {
 +        /* i-atom x+q in shared memory */
 +        shmem =  NCL_PER_SUPERCL * CL_SIZE * sizeof(float4);
 +        /* force reduction buffers in shared memory */
 +        shmem += CL_SIZE * CL_SIZE * 3 * sizeof(float);
 +    }
 +    else
 +    {
 +        /* NOTE: with the default kernel on sm3.0 we need shmem only for pre-loading */
 +        /* i-atom x+q in shared memory */
 +        shmem  = NCL_PER_SUPERCL * CL_SIZE * sizeof(float4);
++        /* cj in shared memory, for both warps separately */
++        shmem += 2 * NBNXN_GPU_JGROUP_SIZE * sizeof(int);
 +#ifdef IATYPE_SHMEM
 +        /* i-atom types in shared memory */
 +        shmem += NCL_PER_SUPERCL * CL_SIZE * sizeof(int);
 +#endif
 +#if __CUDA_ARCH__ < 300
 +        /* force reduction buffers in shared memory */
 +        shmem += CL_SIZE * CL_SIZE * 3 * sizeof(float);
 +#endif
 +    }
 +
 +    return shmem;
 +}
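 +
 +/* Worked example for calc_shmem_required, assuming the usual values
 + * NCL_PER_SUPERCL == 8, CL_SIZE == 8, NBNXN_GPU_JGROUP_SIZE == 4 and
 + * sizeof(float4) == 16 (all stated here as assumptions): the legacy
 + * kernel needs 8*8*16 + 8*8*3*4 = 1024 + 768 = 1792 bytes, while the
 + * default kernel on sm_2.x (no IATYPE_SHMEM) needs
 + * 8*8*16 + 2*4*4 + 8*8*3*4 = 1024 + 32 + 768 = 1824 bytes.
 + */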
 +
 +/*! As we execute the nonbonded workload in separate streams, before launching
 +   the kernel we need to make sure that the following operations have completed:
 +   - atomdata allocation and related H2D transfers (every nstlist step);
 +   - pair list H2D transfer (every nstlist step);
 +   - shift vector H2D transfer (every nstlist step);
 +   - force (+shift force and energy) output clearing (every step).
 +
 +   These operations are issued in the local stream at the beginning of the step
 +   and therefore always complete before the local kernel launch. The non-local
 +   kernel is launched after the local on the same device/context, so this is
 +   inherently scheduled after the operations in the local stream (including the
 +   above "misc_ops").
 +   However, for the sake of having a future-proof implementation, we use the
 +   misc_ops_done event to record the point in time when the above operations
 +   are finished and synchronize with this event in the non-local stream.
 +*/
 +void nbnxn_cuda_launch_kernel(nbnxn_cuda_ptr_t cu_nb,
 +                              const nbnxn_atomdata_t *nbatom,
 +                              int flags,
 +                              int iloc)
 +{
 +    cudaError_t stat;
 +    int adat_begin, adat_len;  /* local/nonlocal offset and length used for xq and f */
 +    /* CUDA kernel launch-related stuff */
 +    int  shmem, nblock;
 +    dim3 dim_block, dim_grid;
 +    nbnxn_cu_kfunc_ptr_t nb_kernel = NULL; /* fn pointer to the nonbonded kernel */
 +
 +    cu_atomdata_t   *adat   = cu_nb->atdat;
 +    cu_nbparam_t    *nbp    = cu_nb->nbparam;
 +    cu_plist_t      *plist  = cu_nb->plist[iloc];
 +    cu_timers_t     *t      = cu_nb->timers;
 +    cudaStream_t    stream  = cu_nb->stream[iloc];
 +
 +    bool bCalcEner   = flags & GMX_FORCE_VIRIAL;
 +    bool bCalcFshift = flags & GMX_FORCE_VIRIAL;
 +    bool bDoTime     = cu_nb->bDoTime;
 +
 +    /* turn energy calculation always on/off (for debugging/testing only) */
 +    bCalcEner = (bCalcEner || always_ener) && !never_ener;
 +
 +    /* don't launch the kernel if there is no work to do */
 +    if (plist->nsci == 0)
 +    {
 +        return;
 +    }
 +
 +    /* calculate the atom data index range based on locality */
 +    if (LOCAL_I(iloc))
 +    {
 +        adat_begin  = 0;
 +        adat_len    = adat->natoms_local;
 +    }
 +    else
 +    {
 +        adat_begin  = adat->natoms_local;
 +        adat_len    = adat->natoms - adat->natoms_local;
 +    }
 +
 +    /* When we get here all misc operations issued in the local stream are done,
 +       so we record that in the local stream and wait for it in the nonlocal one. */
 +    if (cu_nb->bUseTwoStreams)
 +    {
 +        if (iloc == eintLocal)
 +        {
 +            stat = cudaEventRecord(cu_nb->misc_ops_done, stream);
 +            CU_RET_ERR(stat, "cudaEventRecord on misc_ops_done failed");
 +        }
 +        else
 +        {
 +            stat = cudaStreamWaitEvent(stream, cu_nb->misc_ops_done, 0);
 +            CU_RET_ERR(stat, "cudaStreamWaitEvent on misc_ops_done failed");
 +        }
 +    }
 +
 +    /* beginning of timed HtoD section */
 +    if (bDoTime)
 +    {
 +        stat = cudaEventRecord(t->start_nb_h2d[iloc], stream);
 +        CU_RET_ERR(stat, "cudaEventRecord failed");
 +    }
 +
 +    /* HtoD x, q */
 +    cu_copy_H2D_async(adat->xq + adat_begin, nbatom->x + adat_begin * 4,
 +                      adat_len * sizeof(*adat->xq), stream); 
 +
 +    if (bDoTime)
 +    {
 +        stat = cudaEventRecord(t->stop_nb_h2d[iloc], stream);
 +        CU_RET_ERR(stat, "cudaEventRecord failed");
 +    }
 +
 +    /* beginning of timed nonbonded calculation section */
 +    if (bDoTime)
 +    {
 +        stat = cudaEventRecord(t->start_nb_k[iloc], stream);
 +        CU_RET_ERR(stat, "cudaEventRecord failed");
 +    }
 +
 +    /* get the pointer to the kernel flavor we need to use */
 +    nb_kernel = select_nbnxn_kernel(cu_nb->kernel_ver, nbp->eeltype, bCalcEner,
 +                                    plist->bDoPrune || always_prune);
 +
 +    /* kernel launch config */
 +    nblock    = calc_nb_kernel_nblock(plist->nsci, cu_nb->dev_info);
 +    dim_block = dim3(CL_SIZE, CL_SIZE, 1);
 +    dim_grid  = dim3(nblock, 1, 1);
 +    shmem     = calc_shmem_required(cu_nb->kernel_ver);
 +
 +    if (debug)
 +    {
 +        fprintf(debug, "GPU launch configuration:\n\tThread block: %dx%dx%d\n\t"
 +                "Grid: %dx%d\n\t#Super-clusters/clusters: %d/%d (%d)\n",
 +                dim_block.x, dim_block.y, dim_block.z,
 +                dim_grid.x, dim_grid.y, plist->nsci*NCL_PER_SUPERCL,
 +                NCL_PER_SUPERCL, plist->na_c);
 +    }
 +
 +    nb_kernel<<<dim_grid, dim_block, shmem, stream>>>(*adat, *nbp, *plist, bCalcFshift);
 +    CU_LAUNCH_ERR("k_calc_nb");
 +
 +    if (bDoTime)
 +    {
 +        stat = cudaEventRecord(t->stop_nb_k[iloc], stream);
 +        CU_RET_ERR(stat, "cudaEventRecord failed");
 +    }
 +}
 +
 +void nbnxn_cuda_launch_cpyback(nbnxn_cuda_ptr_t cu_nb,
 +                               const nbnxn_atomdata_t *nbatom,
 +                               int flags,
 +                               int aloc)
 +{
 +    cudaError_t stat;
 +    int adat_begin, adat_len, adat_end;  /* local/nonlocal offset and length used for xq and f */
 +    int iloc = -1;
 +
 +    /* determine interaction locality from atom locality */
 +    if (LOCAL_A(aloc))
 +    {
 +        iloc = eintLocal;
 +    }
 +    else if (NONLOCAL_A(aloc))
 +    {
 +        iloc = eintNonlocal;
 +    }
 +    else
 +    {
 +        char stmp[STRLEN];
 +        sprintf(stmp, "Invalid atom locality passed (%d); valid here is only "
 +                "local (%d) or nonlocal (%d)", aloc, eatLocal, eatNonlocal);
 +        gmx_incons(stmp);
 +    }
 +
 +    cu_atomdata_t   *adat   = cu_nb->atdat;
 +    cu_timers_t     *t      = cu_nb->timers;
 +    bool            bDoTime = cu_nb->bDoTime;
 +    cudaStream_t    stream  = cu_nb->stream[iloc];
 +
 +    bool bCalcEner   = flags & GMX_FORCE_VIRIAL;
 +    bool bCalcFshift = flags & GMX_FORCE_VIRIAL;
 +
 +    /* don't launch copy-back if there was no work to do */
 +    if (cu_nb->plist[iloc]->nsci == 0)
 +    {
 +        return;
 +    }
 +
 +    /* calculate the atom data index range based on locality */
 +    if (LOCAL_A(aloc))
 +    {
 +        adat_begin  = 0;
 +        adat_len    = adat->natoms_local;
 +        adat_end    = cu_nb->atdat->natoms_local;
 +    }
 +    else
 +    {
 +        adat_begin  = adat->natoms_local;
 +        adat_len    = adat->natoms - adat->natoms_local;
 +        adat_end    = cu_nb->atdat->natoms;
 +    }
 +
 +    /* beginning of timed D2H section */
 +    if (bDoTime)
 +    {
 +        stat = cudaEventRecord(t->start_nb_d2h[iloc], stream);
 +        CU_RET_ERR(stat, "cudaEventRecord failed");
 +    }
 +
 +    if (!cu_nb->bUseStreamSync)
 +    {
 +        /* For safety reasons set a few (5%) forces to NaN. This way even if the
 +           polling "hack" fails with some future NVIDIA driver we'll get a crash. */
 +        for (int i = adat_begin; i < 3*adat_end + 2; i += adat_len/20)
 +        {
 +#ifdef NAN
 +            nbatom->out[0].f[i] = NAN;
 +#else
 +#  ifdef _MSVC
 +            if (numeric_limits<float>::has_quiet_NaN)
 +            {
 +                nbatom->out[0].f[i] = numeric_limits<float>::quiet_NaN();
 +            }
 +            else
 +#  endif
 +            {
 +                nbatom->out[0].f[i] = GMX_REAL_MAX;
 +            }
 +#endif
 +        }
 +
 +        /* Set the last four bytes of the force array to a bit pattern
 +           which can't be the result of the force calculation:
 +           max exponent (127) and zero mantissa. */
 +        *(unsigned int*)&nbatom->out[0].f[adat_end*3 - 1] = poll_wait_pattern;
 +    }
 +
 +    /* With DD the local D2H transfer can only start after the non-local 
 +       has been launched. */
 +    if (iloc == eintLocal && cu_nb->bUseTwoStreams)
 +    {
 +        stat = cudaStreamWaitEvent(stream, cu_nb->nonlocal_done, 0);
 +        CU_RET_ERR(stat, "cudaStreamWaitEvent on nonlocal_done failed");
 +    }
 +
 +    /* DtoH f */
 +    cu_copy_D2H_async(nbatom->out[0].f + adat_begin * 3, adat->f + adat_begin, 
 +                      (adat_len)*sizeof(*adat->f), stream);
 +
 +    /* After the non-local D2H is launched the nonlocal_done event can be
 +       recorded which signals that the local D2H can proceed. This event is not
 +       placed after the non-local kernel because we need the non-local
 +       data back first. */
 +    if (iloc == eintNonlocal)
 +    {
 +        stat = cudaEventRecord(cu_nb->nonlocal_done, stream);
 +        CU_RET_ERR(stat, "cudaEventRecord on nonlocal_done failed");
 +    }
 +
 +    /* only transfer energies in the local stream */
 +    if (LOCAL_I(iloc))
 +    {
 +        /* DtoH fshift */
 +        if (bCalcFshift)
 +        {
 +            cu_copy_D2H_async(cu_nb->nbst.fshift, adat->fshift,
 +                              SHIFTS * sizeof(*cu_nb->nbst.fshift), stream);
 +        }
 +
 +        /* DtoH energies */
 +        if (bCalcEner)
 +        {
 +            cu_copy_D2H_async(cu_nb->nbst.e_lj, adat->e_lj,
 +                              sizeof(*cu_nb->nbst.e_lj), stream);
 +            cu_copy_D2H_async(cu_nb->nbst.e_el, adat->e_el,
 +                              sizeof(*cu_nb->nbst.e_el), stream);
 +        }
 +    }
 +
 +    if (bDoTime)
 +    {
 +        stat = cudaEventRecord(t->stop_nb_d2h[iloc], stream);
 +        CU_RET_ERR(stat, "cudaEventRecord failed");
 +    }
 +}
 +
 +/* Atomic compare-exchange operation on unsigned values. It is used in
 + * polling wait for the GPU.
 + */
 +static inline bool atomic_cas(volatile unsigned int *ptr,
 +                              unsigned int oldval,
 +                              unsigned int newval)
 +{
 +    assert(ptr);
 +
 +#ifdef TMPI_ATOMICS
 +    return tMPI_Atomic_cas((tMPI_Atomic_t *)ptr, oldval, newval);
 +#else
 +    gmx_incons("Atomic operations not available, atomic_cas() should not have been called!");
 +    return true;
 +#endif
 +}
 +
 +void nbnxn_cuda_wait_gpu(nbnxn_cuda_ptr_t cu_nb,
 +                         const nbnxn_atomdata_t *nbatom,
 +                         int flags, int aloc,
 +                         float *e_lj, float *e_el, rvec *fshift)
 +{
 +    cudaError_t stat;
 +    int i, adat_end, iloc = -1;
 +    volatile unsigned int *poll_word;
 +
 +    /* determine interaction locality from atom locality */
 +    if (LOCAL_A(aloc))
 +    {
 +        iloc = eintLocal;
 +    }
 +    else if (NONLOCAL_A(aloc))
 +    {
 +        iloc = eintNonlocal;
 +    }
 +    else
 +    {
 +        char stmp[STRLEN];
 +        sprintf(stmp, "Invalid atom locality passed (%d); valid here is only "
 +                "local (%d) or nonlocal (%d)", aloc, eatLocal, eatNonlocal);
 +        gmx_incons(stmp);
 +    }
 +
 +    cu_plist_t      *plist   = cu_nb->plist[iloc];
 +    cu_timers_t     *timers  = cu_nb->timers;
 +    wallclock_gpu_t *timings = cu_nb->timings;
 +    nb_staging      nbst     = cu_nb->nbst;
 +
 +    bool    bCalcEner   = flags & GMX_FORCE_VIRIAL;
 +    bool    bCalcFshift = flags & GMX_FORCE_VIRIAL;
 +
 +    /* turn energy calculation always on/off (for debugging/testing only) */
 +    bCalcEner = (bCalcEner || always_ener) && !never_ener; 
 +
 +    /* don't launch wait/update timers & counters if there was no work to do
 +
 +       NOTE: if timing with multiple GPUs (streams) becomes possible, the
 +       counters could end up being inconsistent due to not being incremented
 +       on some of the nodes! */
 +    if (cu_nb->plist[iloc]->nsci == 0)
 +    {
 +        return;
 +    }
 +
 +    /* calculate the atom data index range based on locality */
 +    if (LOCAL_A(aloc))
 +    {
 +        adat_end = cu_nb->atdat->natoms_local;
 +    }
 +    else
 +    {
 +        adat_end = cu_nb->atdat->natoms;
 +    }
 +
 +    if (cu_nb->bUseStreamSync)
 +    {
 +        stat = cudaStreamSynchronize(cu_nb->stream[iloc]);
 +        CU_RET_ERR(stat, "cudaStreamSynchronize failed in cu_blockwait_nb");
 +    }
 +    else 
 +    {
 +        /* Busy-wait until the signal pattern written into the last word of
 +         * the l/nl force vector is overwritten by the D2H copy. The pattern
 +         * corresponds to a floating point number which can't be the result
 +         * of the force calculation (exponent field of 127 and zero mantissa).
 +         * The polling uses atomic compare-exchange: the CAS keeps succeeding
 +         * (swapping the pattern for itself) while the word still holds the
 +         * pattern, and fails once real data lands, ending the loop below.
 +         */
 +        poll_word = (volatile unsigned int*)&nbatom->out[0].f[adat_end*3 - 1];
 +        while (atomic_cas(poll_word, poll_wait_pattern, poll_wait_pattern)) {}
 +    }
 +
 +    /* timing data accumulation */
 +    if (cu_nb->bDoTime)
 +    {
 +        /* only increase counter once (at local F wait) */
 +        if (LOCAL_I(iloc))
 +        {
 +            timings->nb_c++;
 +            timings->ktime[plist->bDoPrune ? 1 : 0][bCalcEner ? 1 : 0].c += 1;
 +        }
 +
 +        /* kernel timings */
 +        timings->ktime[plist->bDoPrune ? 1 : 0][bCalcEner ? 1 : 0].t +=
 +            cu_event_elapsed(timers->start_nb_k[iloc], timers->stop_nb_k[iloc]);
 +
 +        /* X/q H2D and F D2H timings */
 +        timings->nb_h2d_t += cu_event_elapsed(timers->start_nb_h2d[iloc],
 +                                                 timers->stop_nb_h2d[iloc]);
 +        timings->nb_d2h_t += cu_event_elapsed(timers->start_nb_d2h[iloc],
 +                                                 timers->stop_nb_d2h[iloc]);
 +
 +        /* only count atdat and pair-list H2D at pair-search step */
 +        if (plist->bDoPrune)
 +        {
 +            /* atdat transfer timing (add only once, at local F wait) */
 +            if (LOCAL_A(aloc))
 +            {
 +                timings->pl_h2d_c++;
 +                timings->pl_h2d_t += cu_event_elapsed(timers->start_atdat,
 +                                                         timers->stop_atdat);
 +            }
 +
 +            timings->pl_h2d_t += cu_event_elapsed(timers->start_pl_h2d[iloc],
 +                                                     timers->stop_pl_h2d[iloc]);
 +        }
 +    }
 +
 +    /* add up energies and shift forces (only once at local F wait) */
 +    if (LOCAL_I(iloc))
 +    {
 +        if (bCalcEner)
 +        {
 +            *e_lj += *nbst.e_lj;
 +            *e_el += *nbst.e_el;
 +        }
 +
 +        if (bCalcFshift)
 +        {
 +            for (i = 0; i < SHIFTS; i++)
 +            {
 +                fshift[i][0] += nbst.fshift[i].x;
 +                fshift[i][1] += nbst.fshift[i].y;
 +                fshift[i][2] += nbst.fshift[i].z;
 +            }
 +        }
 +    }
 +
 +    /* turn off pruning (doesn't matter if this is pair-search step or not) */
 +    plist->bDoPrune = false;
 +}
 +
 +/*! Return the reference to the nbfp texture. */
 +const struct texture<float, 1, cudaReadModeElementType>& nbnxn_cuda_get_nbfp_texref()
 +{
 +    return tex_nbfp;
 +}
 +
 +/*! Return the reference to the coulomb_tab. */
 +const struct texture<float, 1, cudaReadModeElementType>& nbnxn_cuda_get_coulomb_tab_texref()
 +{
 +    return tex_coulomb_tab;
 +}
 +
 +/*! Set up the cache configuration for the non-bonded kernels.
 + */
 +void nbnxn_cuda_set_cacheconfig(cuda_dev_info_t *devinfo)
 +{
 +    cudaError_t stat;
 +
 +    for (int i = 0; i < eelCuNR; i++)
 +        for (int j = 0; j < nEnergyKernelTypes; j++)
 +            for (int k = 0; k < nPruneKernelTypes; k++)
 +            {
 +                /* Legacy kernel 16/48 kB Shared/L1 */
 +                stat = cudaFuncSetCacheConfig(nb_legacy_kfunc_ptr[i][j][k], cudaFuncCachePreferL1);
 +                CU_RET_ERR(stat, "cudaFuncSetCacheConfig failed");
 +
 +                if (devinfo->prop.major >= 3)
 +                {
 +                    /* Default kernel on sm 3.x 48/16 kB Shared/L1 */
 +                    stat = cudaFuncSetCacheConfig(nb_default_kfunc_ptr[i][j][k], cudaFuncCachePreferShared);
 +                }
 +                else
 +                {
 +                    /* On Fermi prefer L1 gives 2% higher performance */
 +                    /* Default kernel on sm_2.x 16/48 kB Shared/L1 */
 +                    stat = cudaFuncSetCacheConfig(nb_default_kfunc_ptr[i][j][k], cudaFuncCachePreferL1);
 +                }
 +                CU_RET_ERR(stat, "cudaFuncSetCacheConfig failed");
 +            }
 +}
index a323b95c3e7ffab9ebe10e3d1fbc9346b5d2958f,0000000000000000000000000000000000000000..bea220b4c5d90a75ae170b40a6142ade326e6647
mode 100644,000000..100644
--- /dev/null
@@@ -1,884 -1,0 +1,895 @@@
-     bool bStreamSync, bNoStreamSync, bTMPIAtomics, bX86;
 +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
 + *
 + *
 + *                This source code is part of
 + *
 + *                 G   R   O   M   A   C   S
 + *
 + *          GROningen MAchine for Chemical Simulations
 + *
 + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
 + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 + * Copyright (c) 2001-2012, The GROMACS development team,
 + * check out http://www.gromacs.org for more information.
 + *
 + * This program is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU General Public License
 + * as published by the Free Software Foundation; either version 2
 + * of the License, or (at your option) any later version.
 + *
 + * If you want to redistribute modifications, please consider that
 + * scientific software is very special. Version control is crucial -
 + * bugs must be traceable. We will be happy to consider code for
 + * inclusion in the official distribution, but derived work must not
 + * be called official GROMACS. Details are found in the README & COPYING
 + * files - if they are missing, get the official version at www.gromacs.org.
 + *
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the papers on the package - you can find them in the top README file.
 + *
 + * For more info, check our website at http://www.gromacs.org
 + *
 + * And Hey:
 + * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon
 + */
 +#ifdef HAVE_CONFIG_H
 +#include <config.h>
 +#endif
 +
 +#include <stdlib.h>
 +#include <stdio.h>
 +#include <assert.h>
 +
 +#include "gmx_fatal.h"
 +#include "smalloc.h"
 +#include "tables.h"
 +#include "typedefs.h"
 +#include "types/nb_verlet.h"
 +#include "types/interaction_const.h"
 +#include "types/force_flags.h"
 +#include "../nbnxn_consts.h"
 +
 +#include "nbnxn_cuda_types.h"
 +#include "../../gmxlib/cuda_tools/cudautils.cuh"
 +#include "nbnxn_cuda_data_mgmt.h"
 +#include "pmalloc_cuda.h"
 +#include "gpu_utils.h"
 +
 +static bool bUseCudaEventBlockingSync = false; /* makes the CPU thread block */
 +
 +/* This is a heuristically determined parameter for the Fermi architecture:
 + * the minimum size of ci lists is obtained by multiplying this constant
 + * with the # of multiprocessors on the current device.
 + */
 +static unsigned int gpu_min_ci_balanced_factor = 40;
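 +/* For example, on a hypothetical 14-multiprocessor Fermi device this
 + * yields a minimum of 40 * 14 = 560 ci lists for a balanced load.
 + */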
 +
 +/* Functions from nbnxn_cuda.cu */
 +extern void nbnxn_cuda_set_cacheconfig(cuda_dev_info_t *devinfo);
 +extern const struct texture<float, 1, cudaReadModeElementType>& nbnxn_cuda_get_nbfp_texref();
 +extern const struct texture<float, 1, cudaReadModeElementType>& nbnxn_cuda_get_coulomb_tab_texref();
 +
++/* We should actually be using md_print_warn in md_logging.c,
++ * but we can't include mpi.h in CUDA code.
++ */
++static void md_print_warn(FILE *fplog, const char *buf)
++{
++    if (fplog != NULL)
++    {
++        /* We should only print to stderr on the master node;
++         * in most cases fplog is only set on the master node, so this works.
++         */
++        fprintf(stderr, "\n%s\n", buf);
++        fprintf(fplog,  "\n%s\n", buf);
++    }
++}
++
 +/* Fw. decl. */
 +static void nbnxn_cuda_clear_e_fshift(nbnxn_cuda_ptr_t cu_nb);
 +
 +
 +/*! Tabulates the Ewald Coulomb force and initializes the size/scale
 +    and the table GPU array. If called with an already allocated table,
 +    it just re-uploads the table.
 + */
 +static void init_ewald_coulomb_force_table(cu_nbparam_t *nbp)
 +{
 +    float       *ftmp, *coul_tab;
 +    int         tabsize;
 +    double      tabscale;
 +    cudaError_t stat;
 +
 +    tabsize     = GPU_EWALD_COULOMB_FORCE_TABLE_SIZE;
 +    /* Subtract 2 instead of 1 to avoid out-of-range access due to rounding */
 +    tabscale    = (tabsize - 2) / sqrt(nbp->rcoulomb_sq);
 +
 +    pmalloc((void**)&ftmp, tabsize*sizeof(*ftmp));
 +
 +    table_spline3_fill_ewald_lr(ftmp, NULL, NULL, tabsize,
 +                                1/tabscale, nbp->ewald_beta);
 +
 +    /* If the table pointer is NULL, the table is generated for the first time:
 +       the array pointer is saved to nbparam and the texture is bound.
 +     */
 +    coul_tab = nbp->coulomb_tab;
 +    if (coul_tab == NULL)
 +    {
 +        stat = cudaMalloc((void **)&coul_tab, tabsize*sizeof(*coul_tab));
 +        CU_RET_ERR(stat, "cudaMalloc failed on coul_tab");
 +
 +        nbp->coulomb_tab = coul_tab;
 +
 +        cudaChannelFormatDesc cd   = cudaCreateChannelDesc<float>();
 +        stat = cudaBindTexture(NULL, &nbnxn_cuda_get_coulomb_tab_texref(),
 +                               coul_tab, &cd, tabsize*sizeof(*coul_tab));
 +        CU_RET_ERR(stat, "cudaBindTexture on coul_tab failed");
 +    }
 +
 +    cu_copy_H2D(coul_tab, ftmp, tabsize*sizeof(*coul_tab));
 +
 +    nbp->coulomb_tab_size     = tabsize;
 +    nbp->coulomb_tab_scale    = tabscale;
 +
 +    pfree(ftmp);
 +}
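 +
 +/* Worked example for the scale above, with hypothetical numbers: if the
 + * table size were 1536 and rcoulomb 1.0 nm, tabscale = 1534 points/nm, so
 + * r == rcoulomb maps to index 1534 and, presumably, the interpolation's
 + * i+1 neighbour (index 1535) stays inside the table even after rounding.
 + */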
 +
 +
 +/*! Initializes the atomdata structure for the first time; it only gets
 +    filled at pair-search. */
 +static void init_atomdata_first(cu_atomdata_t *ad, int ntypes)
 +{
 +    cudaError_t stat;
 +
 +    ad->ntypes  = ntypes;
 +    stat = cudaMalloc((void**)&ad->shift_vec, SHIFTS*sizeof(*ad->shift_vec));
 +    CU_RET_ERR(stat, "cudaMalloc failed on ad->shift_vec");
 +    ad->bShiftVecUploaded = false;
 +
 +    stat = cudaMalloc((void**)&ad->fshift, SHIFTS*sizeof(*ad->fshift));
 +    CU_RET_ERR(stat, "cudaMalloc failed on ad->fshift");
 +
 +    stat = cudaMalloc((void**)&ad->e_lj, sizeof(*ad->e_lj));
 +    CU_RET_ERR(stat, "cudaMalloc failed on ad->e_lj");
 +    stat = cudaMalloc((void**)&ad->e_el, sizeof(*ad->e_el));
 +    CU_RET_ERR(stat, "cudaMalloc failed on ad->e_el");
 +
 +    /* initialize to NULL pointers to data that is not allocated here and will
 +       need reallocation in nbnxn_cuda_init_atomdata */
 +    ad->xq = NULL;
 +    ad->f  = NULL;
 +
 +    /* size -1 indicates that the respective array hasn't been initialized yet */
 +    ad->natoms = -1;
 +    ad->nalloc = -1;
 +}
 +
 +/*! Initializes the nonbonded parameter data structure. */
 +static void init_nbparam(cu_nbparam_t *nbp,
 +                         const interaction_const_t *ic,
 +                         const nonbonded_verlet_t *nbv)
 +{
 +    cudaError_t stat;
 +    int         ntypes, nnbfp;
 +
 +    ntypes  = nbv->grp[0].nbat->ntype;
 +
 +    nbp->ewald_beta = ic->ewaldcoeff;
 +    nbp->sh_ewald   = ic->sh_ewald;
 +    nbp->epsfac     = ic->epsfac;
 +    nbp->two_k_rf   = 2.0 * ic->k_rf;
 +    nbp->c_rf       = ic->c_rf;
 +    nbp->rvdw_sq    = ic->rvdw * ic->rvdw;
 +    nbp->rcoulomb_sq = ic->rcoulomb * ic->rcoulomb;
 +    nbp->rlist_sq   = ic->rlist * ic->rlist;
 +    nbp->sh_invrc6  = ic->sh_invrc6;
 +
 +    if (ic->eeltype == eelCUT)
 +    {
 +        nbp->eeltype = eelCuCUT;
 +    }
 +    else if (EEL_RF(ic->eeltype))
 +    {
 +        nbp->eeltype = eelCuRF;
 +    }
 +    else if (EEL_PME(ic->eeltype) || ic->eeltype == eelEWALD)
 +    {
 +        /* Initially rcoulomb == rvdw, so it's surely not twin cut-off, unless
 +           forced by the env. var. (used only for benchmarking). */
 +        if (getenv("GMX_CUDA_NB_EWALD_TWINCUT") == NULL)
 +        {
 +            nbp->eeltype = eelCuEWALD;
 +        }
 +        else
 +        {
 +            nbp->eeltype = eelCuEWALD_TWIN;
 +        }
 +    }
 +    else
 +    {
 +        /* Shouldn't happen, as this is checked when choosing Verlet-scheme */
 +        gmx_incons("The requested electrostatics type is not implemented in the CUDA GPU accelerated kernels!");
 +    }
 +
 +    /* generate table for PME */
 +    if (nbp->eeltype == eelCuEWALD)
 +    {
 +        nbp->coulomb_tab = NULL;
 +        init_ewald_coulomb_force_table(nbp);
 +    }
 +
 +    nnbfp = 2*ntypes*ntypes;
 +    stat = cudaMalloc((void **)&nbp->nbfp, nnbfp*sizeof(*nbp->nbfp));
 +    CU_RET_ERR(stat, "cudaMalloc failed on nbp->nbfp");
 +    cu_copy_H2D(nbp->nbfp, nbv->grp[0].nbat->nbfp, nnbfp*sizeof(*nbp->nbfp));
 +
 +    cudaChannelFormatDesc cd   = cudaCreateChannelDesc<float>();
 +    stat = cudaBindTexture(NULL, &nbnxn_cuda_get_nbfp_texref(),
 +                           nbp->nbfp, &cd, nnbfp*sizeof(*nbp->nbfp));
 +    CU_RET_ERR(stat, "cudaBindTexture on nbfp failed");
 +}
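 +
 +/* The kernels read the bound textures with tex1Dfetch; a minimal sketch of
 + * a lookup (the index arithmetic shown is illustrative -- the actual form
 + * lives in the kernel sources and may differ):
 + *
 + *   float c6  = tex1Dfetch(tex_nbfp, 2*(ntypes*typei + typej));
 + *   float c12 = tex1Dfetch(tex_nbfp, 2*(ntypes*typei + typej) + 1);
 + */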
 +
 +/*! Re-generates the GPU Ewald force table, resets rlist, and updates the
 + *  electrostatics type, switching to twin cut-off (or back) if needed. */
 +void nbnxn_cuda_pme_loadbal_update_param(nbnxn_cuda_ptr_t cu_nb,
 +                                         const interaction_const_t *ic)
 +{
 +    cu_nbparam_t *nbp = cu_nb->nbparam;
 +
 +    nbp->rlist_sq       = ic->rlist * ic->rlist;
 +    nbp->rcoulomb_sq    = ic->rcoulomb * ic->rcoulomb;
 +    nbp->ewald_beta     = ic->ewaldcoeff;
 +
 +    /* When switching to/from twin cut-off, the electrostatics type needs updating.
 +       (The env. var. that forces twin cut-off is for benchmarking only!) */
 +    if (ic->rcoulomb == ic->rvdw &&
 +        getenv("GMX_CUDA_NB_EWALD_TWINCUT") == NULL)
 +    {
 +        nbp->eeltype = eelCuEWALD;
 +    }
 +    else
 +    {
 +        nbp->eeltype = eelCuEWALD_TWIN;
 +    }
 +
 +    init_ewald_coulomb_force_table(cu_nb->nbparam);
 +}
 +
 +/*! Initializes the pair list data structure. */
 +static void init_plist(cu_plist_t *pl)
 +{
 +    /* initialize to NULL pointers to data that is not allocated here and will
 +       need reallocation in nbnxn_cuda_init_pairlist */
 +    pl->sci     = NULL;
 +    pl->cj4     = NULL;
 +    pl->excl    = NULL;
 +
 +    /* size -1 indicates that the respective array hasn't been initialized yet */
 +    pl->na_c        = -1;
 +    pl->nsci        = -1;
 +    pl->sci_nalloc  = -1;
 +    pl->ncj4        = -1;
 +    pl->cj4_nalloc  = -1;
 +    pl->nexcl       = -1;
 +    pl->excl_nalloc = -1;
 +    pl->bDoPrune    = false;
 +}
 +
 +/*! Initializes the timer data structure. */
 +static void init_timers(cu_timers_t *t, bool bUseTwoStreams)
 +{
 +    cudaError_t stat;
 +    int eventflags = ( bUseCudaEventBlockingSync ? cudaEventBlockingSync: cudaEventDefault );
 +
 +    stat = cudaEventCreateWithFlags(&(t->start_atdat), eventflags);
 +    CU_RET_ERR(stat, "cudaEventCreate on start_atdat failed");
 +    stat = cudaEventCreateWithFlags(&(t->stop_atdat), eventflags);
 +    CU_RET_ERR(stat, "cudaEventCreate on stop_atdat failed");
 +
 +    /* The non-local counters/stream (second in the array) are needed only with DD. */
 +    for (int i = 0; i <= (bUseTwoStreams ? 1 : 0); i++)
 +    {
 +        stat = cudaEventCreateWithFlags(&(t->start_nb_k[i]), eventflags);
 +        CU_RET_ERR(stat, "cudaEventCreate on start_nb_k failed");
 +        stat = cudaEventCreateWithFlags(&(t->stop_nb_k[i]), eventflags);
 +        CU_RET_ERR(stat, "cudaEventCreate on stop_nb_k failed");
 +
 +
 +        stat = cudaEventCreateWithFlags(&(t->start_pl_h2d[i]), eventflags);
 +        CU_RET_ERR(stat, "cudaEventCreate on start_pl_h2d failed");
 +        stat = cudaEventCreateWithFlags(&(t->stop_pl_h2d[i]), eventflags);
 +        CU_RET_ERR(stat, "cudaEventCreate on stop_pl_h2d failed");
 +
 +        stat = cudaEventCreateWithFlags(&(t->start_nb_h2d[i]), eventflags);
 +        CU_RET_ERR(stat, "cudaEventCreate on start_nb_h2d failed");
 +        stat = cudaEventCreateWithFlags(&(t->stop_nb_h2d[i]), eventflags);
 +        CU_RET_ERR(stat, "cudaEventCreate on stop_nb_h2d failed");
 +
 +        stat = cudaEventCreateWithFlags(&(t->start_nb_d2h[i]), eventflags);
 +        CU_RET_ERR(stat, "cudaEventCreate on start_nb_d2h failed");
 +        stat = cudaEventCreateWithFlags(&(t->stop_nb_d2h[i]), eventflags);
 +        CU_RET_ERR(stat, "cudaEventCreate on stop_nb_d2h failed");
 +    }
 +}
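 +
 +/* The elapsed time between a start/stop event pair is read back with
 + * cu_event_elapsed (in cudautils), presumably a thin wrapper around
 + * cudaEventElapsedTime along these lines:
 + *
 + *   float ms = 0;
 + *   cudaEventElapsedTime(&ms, start, stop);  // elapsed time in milliseconds
 + */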
 +
 +/*! Initializes the timings data structure. */
 +static void init_timings(wallclock_gpu_t *t)
 +{
 +    int i, j;
 +
 +    t->nb_h2d_t = 0.0;
 +    t->nb_d2h_t = 0.0;
 +    t->nb_c    = 0;
 +    t->pl_h2d_t = 0.0;
 +    t->pl_h2d_c = 0;
 +    for (i = 0; i < 2; i++)
 +    {
 +        for (j = 0; j < 2; j++)
 +        {
 +            t->ktime[i][j].t = 0.0;
 +            t->ktime[i][j].c = 0;
 +        }
 +    }
 +}
 +
 +/* Decide which kernel version to use (default or legacy) based on:
 + *  - CUDA version
 + *  - non-bonded kernel selector environment variables
 + *  - GPU SM version TODO ???
 + */
 +static int pick_nbnxn_kernel_version()
 +{
 +    bool bLegacyKernel, bDefaultKernel, bCUDA40, bCUDA32;
 +    char sbuf[STRLEN];
 +    int  kver;
 +
 +    /* legacy kernel (former k2), kept for now for backward compatibility,
 +       faster than the default with CUDA 3.2/4.0 (TODO: on Kepler?). */
 +    bLegacyKernel  = (getenv("GMX_CUDA_NB_LEGACY") != NULL);
 +    /* default kernel (former k3). */
 +    bDefaultKernel = (getenv("GMX_CUDA_NB_DEFAULT") != NULL);
 +
 +    if ((unsigned)(bLegacyKernel + bDefaultKernel) > 1)
 +    {
 +        gmx_fatal(FARGS, "Multiple CUDA non-bonded kernels requested; to manually pick a kernel set only one \n"
 +                  "of the following environment variables: \n"
 +                  "GMX_CUDA_NB_DEFAULT, GMX_CUDA_NB_LEGACY");
 +    }
 +
 +    bCUDA32 = bCUDA40 = false;
 +#if CUDA_VERSION == 3020
 +    bCUDA32 = true;
 +    sprintf(sbuf, "3.2");
 +#elif CUDA_VERSION == 4000
 +    bCUDA40 = true;
 +    sprintf(sbuf, "4.0");
 +#endif
 +
 +    /* default is default ;) */
 +    kver = eNbnxnCuKDefault;
 +
 +    if (bCUDA32 || bCUDA40)
 +    {
 +        /* use legacy kernel unless something else is forced by an env. var */
 +        if (bDefaultKernel)
 +        {
 +            fprintf(stderr,
 +                    "\nNOTE: CUDA %s compilation detected; with this compiler version the legacy\n"
 +                    "      non-bonded kernels perform best. However, the default kernels were\n"
 +                    "      selected by the GMX_CUDA_NB_DEFAULT environment variable.\n"
 +                    "      For best performance upgrade your CUDA toolkit.",
 +                    sbuf);
 +        }
 +        else
 +        {
 +            kver = eNbnxnCuKLegacy;
 +        }
 +    }
 +    else
 +    {
 +    /* issue a note if the non-default kernel is forced by an env. var */
 +        if (bLegacyKernel)
 +        {
 +            fprintf(stderr,
 +                    "\nNOTE: Legacy non-bonded CUDA kernels were selected by the GMX_CUDA_NB_LEGACY\n"
 +                    "      env. var. Consider using using the default kernels which should be faster!\n");
 +
 +            kver = eNbnxnCuKLegacy;
 +        }
 +    }
 +
 +    return kver;
 +}
 +
 +void nbnxn_cuda_init(FILE *fplog,
 +                     nbnxn_cuda_ptr_t *p_cu_nb,
 +                     gmx_gpu_info_t *gpu_info, int my_gpu_index,
 +                     gmx_bool bLocalAndNonlocal)
 +{
 +    cudaError_t stat;
 +    nbnxn_cuda_ptr_t  nb;
 +    char sbuf[STRLEN];
-             sprintf(sbuf,
-                     "NOTE: Using a GPU with ECC enabled, but cudaStreamSynchronize-based waiting is\n"
-                     "      forced by the GMX_CUDA_STREAMSYNC env. var. Due to a CUDA bug, this \n"
-                     "      combination causes performance loss.");
-             fprintf(stderr, "\n%s\n", sbuf);
-             if (fplog)
++    bool bStreamSync, bNoStreamSync, bTMPIAtomics, bX86, bOldDriver;
++    int cuda_drv_ver;
 +
 +    assert(gpu_info);
 +
 +    if (p_cu_nb == NULL) return;
 +
 +    snew(nb, 1);
 +    snew(nb->atdat, 1);
 +    snew(nb->nbparam, 1);
 +    snew(nb->plist[eintLocal], 1);
 +    if (bLocalAndNonlocal)
 +    {
 +        snew(nb->plist[eintNonlocal], 1);
 +    }
 +
 +    nb->bUseTwoStreams = bLocalAndNonlocal;
 +
 +    snew(nb->timers, 1);
 +    snew(nb->timings, 1);
 +
 +    /* init nbst */
 +    pmalloc((void**)&nb->nbst.e_lj, sizeof(*nb->nbst.e_lj));
 +    pmalloc((void**)&nb->nbst.e_el, sizeof(*nb->nbst.e_el));
 +    pmalloc((void**)&nb->nbst.fshift, SHIFTS * sizeof(*nb->nbst.fshift));
 +
 +    init_plist(nb->plist[eintLocal]);
 +
 +    /* local/non-local GPU streams */
 +    stat = cudaStreamCreate(&nb->stream[eintLocal]);
 +    CU_RET_ERR(stat, "cudaStreamCreate on stream[eintLocal] failed");
 +    if (nb->bUseTwoStreams)
 +    {
 +        init_plist(nb->plist[eintNonlocal]);
 +        stat = cudaStreamCreate(&nb->stream[eintNonlocal]);
 +        CU_RET_ERR(stat, "cudaStreamCreate on stream[eintNonlocal] failed");
 +    }
 +
 +    /* init events for synchronization (timing disabled for performance reasons!) */
 +    stat = cudaEventCreateWithFlags(&nb->nonlocal_done, cudaEventDisableTiming);
 +    CU_RET_ERR(stat, "cudaEventCreate on nonlocal_done failed");
 +    stat = cudaEventCreateWithFlags(&nb->misc_ops_done, cudaEventDisableTiming);
 +    CU_RET_ERR(stat, "cudaEventCreate on misc_ops_one failed");
 +
 +    /* set device info, just point it to the right GPU among the detected ones */
 +    nb->dev_info = &gpu_info->cuda_dev[get_gpu_device_id(gpu_info, my_gpu_index)];
 +
 +    /* On GPUs with ECC enabled, cudaStreamSynchronize shows a large overhead
 +     * (which increases with shorter time/step) caused by a known CUDA driver bug.
 +     * To work around the issue we'll use an (admittedly fragile) memory polling
 +     * waiting to preserve performance. This requires support for atomic
 +     * operations and only works on x86/x86_64.
 +     * With polling wait event-timing also needs to be disabled.
++     *
++     * The overhead is greatly reduced in API v5.0 drivers and the improvement
++     * is independent of the runtime version. Hence, with API v5.0 drivers and
++     * later we won't switch to polling.
++     *
++     * NOTE: Unfortunately, this is known to fail when GPUs are shared by (t)MPI
++     * ranks, so we will also disable it in that case.
 +     */
 +
 +    bStreamSync    = getenv("GMX_CUDA_STREAMSYNC") != NULL;
 +    bNoStreamSync  = getenv("GMX_NO_CUDA_STREAMSYNC") != NULL;
 +
 +#ifdef TMPI_ATOMICS
 +    bTMPIAtomics = true;
 +#else
 +    bTMPIAtomics = false;
 +#endif
 +
 +#if defined(i386) || defined(__x86_64__)
 +    bX86 = true;
 +#else
 +    bX86 = false;
 +#endif
 +
 +    if (bStreamSync && bNoStreamSync)
 +    {
 +        gmx_fatal(FARGS, "Conflicting environment variables: both GMX_CUDA_STREAMSYNC and GMX_NO_CUDA_STREAMSYNC defined");
 +    }
 +
++    stat = cudaDriverGetVersion(&cuda_drv_ver);
++    CU_RET_ERR(stat, "cudaDriverGetVersion failed");
++    bOldDriver = (cuda_drv_ver < 5000);
++
 +    if (nb->dev_info->prop.ECCEnabled == 1)
 +    {
 +        if (bStreamSync)
 +        {
 +            nb->bUseStreamSync = true;
 +
-                 fprintf(fplog, "\n%s\n", sbuf);
++            /* only warn if polling should be used */
++            if (bOldDriver && !gpu_info->bDevShare)
 +            {
-             /* can use polling wait only on x86/x86_64 *if* atomics are available */
-             nb->bUseStreamSync = ((bX86 && bTMPIAtomics) == false);
-             if (!bX86)
++                md_print_warn(fplog,
++                              "NOTE: Using a GPU with ECC enabled and CUDA driver API version <5.0, but\n"
++                              "      cudaStreamSynchronize waiting is forced by the GMX_CUDA_STREAMSYNC env. var.\n");
 +            }
 +        }
 +        else
 +        {
-                 sprintf(sbuf,
-                         "Using a GPU with ECC on; the standard cudaStreamSynchronize waiting, due to a\n"
-                         "      CUDA bug, causes performance loss when used in combination with ECC.\n"
-                         "      However, the polling waiting workaround can not be used as it is only\n"
-                         "      supported on x86/x86_64, but not on the current architecture.");
-                 gmx_warning("%s\n", sbuf);
-                 if (fplog)
-                 {
-                     fprintf(fplog, "\n%s\n", sbuf);
-                 }
++            /* Can/should turn off the cudaStreamSynchronize wait only if
++             *   - we're on x86/x86_64
++             *   - atomics are available
++             *   - GPUs are not being shared
++             *   - and the driver is old. */
++            nb->bUseStreamSync =
++                (bX86 && bTMPIAtomics && !gpu_info->bDevShare && bOldDriver) ?
++                false : true;
++
++            if (!nb->bUseStreamSync)
 +            {
-             else if (bTMPIAtomics)
-             {
-                 if (fplog)
-                 {
-                     fprintf(fplog,
-                             "NOTE: Using a GPU with ECC enabled; will use polling waiting.\n");
-                 }
-             }
-             else
++                md_print_warn(fplog,
++                              "NOTE: Using a GPU with ECC enabled and CUDA driver API version <5.0, known to\n"
++                              "      cause performance loss. Switching to the alternative polling GPU waiting.\n"
++                              "      If you encounter issues, switch back to standard GPU waiting by setting\n"
++                              "      the GMX_CUDA_STREAMSYNC environment variable.\n");
 +            }
-                         "Using a GPU with ECC on; the standard cudaStreamSynchronize waiting, due to a\n"
-                         "      CUDA bug, causes performance loss when used in combination with ECC.\n"
-                         "      However, the polling waiting workaround can not be used as atomic\n"
-                         "      operations are not supported by the current CPU+compiler combination.");
-                 gmx_warning("%s\n", sbuf);
-                 if (fplog)
-                 {
-                     fprintf(fplog, "\n%s\n", sbuf);
-                 }
++            else if (bOldDriver)
 +            {
++                /* Tell the user that the ECC+old driver combination can be bad */
 +                sprintf(sbuf,
-             sprintf(sbuf,
-                     "NOTE: Using a GPU with no/disabled ECC, but cudaStreamSynchronize-based waiting\n"
-                     "      is turned off and polling turned on by the GMX_NO_CUDA_STREAMSYNC env. var.");
-             fprintf(stderr, "\n%s\n", sbuf);
-             if (fplog)
-             {
-                 fprintf(fplog, "\n%s\n", sbuf);
-             }
++                        "NOTE: Using a GPU with ECC enabled and CUDA driver API version <5.0. A bug in this\n"
++                        "      driver can cause performance loss.\n"
++                        "      However, the polling waiting workaround can not be used because\n%s\n"
++                        "      Consider updating the driver or turning ECC off.",
++                        (!bX86 || !bTMPIAtomics) ?
++                           "         atomic operations are not supported by the platform/CPU+compiler." :
++                           "         GPU(s) are being oversubscribed.");
++                md_print_warn(fplog, sbuf);
 +            }
 +        }
 +    }
 +    else
 +    {
 +        if (bNoStreamSync)
 +        {
 +            nb->bUseStreamSync = false;
 +
++            md_print_warn(fplog,
++                          "NOTE: Polling wait for GPU synchronization requested by GMX_NO_CUDA_STREAMSYNC\n");
 +        }
 +        else
 +        {
 +            /* no/off ECC, cudaStreamSynchronize not turned off by env. var. */
 +            nb->bUseStreamSync = true;
 +        }
 +    }
 +
 +    /* CUDA timing disabled as event timers don't work:
 +       - with multiple streams = domain-decomposition;
 +       - with the polling waiting hack (without cudaStreamSynchronize);
 +       - when turned off by GMX_DISABLE_CUDA_TIMING.
 +     */
 +    nb->bDoTime = (!nb->bUseTwoStreams && nb->bUseStreamSync &&
 +                   (getenv("GMX_DISABLE_CUDA_TIMING") == NULL));
 +
 +    if (nb->bDoTime)
 +    {
 +        init_timers(nb->timers, nb->bUseTwoStreams);
 +        init_timings(nb->timings);
 +    }
 +
 +    /* set the kernel type for the current GPU */
 +    nb->kernel_ver = pick_nbnxn_kernel_version();
 +    /* pick L1 cache configuration */
 +    nbnxn_cuda_set_cacheconfig(nb->dev_info);
 +
 +    *p_cu_nb = nb;
 +
 +    if (debug)
 +    {
 +        fprintf(debug, "Initialized CUDA data structures.\n");
 +    }
 +}
 +
 +void nbnxn_cuda_init_const(nbnxn_cuda_ptr_t cu_nb,
 +                           const interaction_const_t *ic,
 +                           const nonbonded_verlet_t *nbv)
 +{
 +    init_atomdata_first(cu_nb->atdat, nbv->grp[0].nbat->ntype);
 +    init_nbparam(cu_nb->nbparam, ic, nbv);
 +
 +    /* clear energy and shift force outputs */
 +    nbnxn_cuda_clear_e_fshift(cu_nb);
 +}
 +
 +void nbnxn_cuda_init_pairlist(nbnxn_cuda_ptr_t cu_nb,
 +                              const nbnxn_pairlist_t *h_plist,
 +                              int iloc)
 +{
 +    char         sbuf[STRLEN];
 +    cudaError_t  stat;
 +    bool         bDoTime    = cu_nb->bDoTime;
 +    cudaStream_t stream     = cu_nb->stream[iloc];
 +    cu_plist_t   *d_plist   = cu_nb->plist[iloc];
 +
 +    if (d_plist->na_c < 0)
 +    {
 +        d_plist->na_c = h_plist->na_ci;
 +    }
 +    else
 +    {
 +        if (d_plist->na_c != h_plist->na_ci)
 +        {
 +            sprintf(sbuf, "In cu_init_plist: the #atoms per cell has changed (from %d to %d)",
 +                    d_plist->na_c, h_plist->na_ci);
 +            gmx_incons(sbuf);
 +        }
 +    }
 +
 +    if (bDoTime)
 +    {
 +        stat = cudaEventRecord(cu_nb->timers->start_pl_h2d[iloc], stream);
 +        CU_RET_ERR(stat, "cudaEventRecord failed");
 +    }
 +
 +    cu_realloc_buffered((void **)&d_plist->sci, h_plist->sci, sizeof(*d_plist->sci),
 +                         &d_plist->nsci, &d_plist->sci_nalloc,
 +                         h_plist->nsci,
 +                         stream, true);
 +
 +    cu_realloc_buffered((void **)&d_plist->cj4, h_plist->cj4, sizeof(*d_plist->cj4),
 +                         &d_plist->ncj4, &d_plist->cj4_nalloc,
 +                         h_plist->ncj4,
 +                         stream, true);
 +
 +    cu_realloc_buffered((void **)&d_plist->excl, h_plist->excl, sizeof(*d_plist->excl),
 +                         &d_plist->nexcl, &d_plist->excl_nalloc,
 +                         h_plist->nexcl,
 +                         stream, true);
 +
 +    if (bDoTime)
 +    {
 +        stat = cudaEventRecord(cu_nb->timers->stop_pl_h2d[iloc], stream);
 +        CU_RET_ERR(stat, "cudaEventRecord failed");
 +    }
 +
 +    /* need to prune the pair list during the next step */
 +    d_plist->bDoPrune = true;
 +}
 +
 +void nbnxn_cuda_upload_shiftvec(nbnxn_cuda_ptr_t cu_nb,
 +                                const nbnxn_atomdata_t *nbatom)
 +{
 +    cu_atomdata_t *adat = cu_nb->atdat;
 +    cudaStream_t  ls    = cu_nb->stream[eintLocal];
 +
 +    /* only if we have a dynamic box */
 +    if (nbatom->bDynamicBox || !adat->bShiftVecUploaded)
 +    {
 +        cu_copy_H2D_async(adat->shift_vec, nbatom->shift_vec, 
 +                          SHIFTS * sizeof(*adat->shift_vec), ls);
 +        adat->bShiftVecUploaded = true;
 +    }
 +}
 +
 +/*! Clears the first natoms_clear elements of the GPU nonbonded force output array. */
 +static void nbnxn_cuda_clear_f(nbnxn_cuda_ptr_t cu_nb, int natoms_clear)
 +{
 +    cudaError_t   stat;
 +    cu_atomdata_t *adat = cu_nb->atdat;
 +    cudaStream_t  ls    = cu_nb->stream[eintLocal];
 +
 +    stat = cudaMemsetAsync(adat->f, 0, natoms_clear * sizeof(*adat->f), ls);
 +    CU_RET_ERR(stat, "cudaMemsetAsync on f falied");
 +}
 +
 +/*! Clears nonbonded shift force output array and energy outputs on the GPU. */
 +static void nbnxn_cuda_clear_e_fshift(nbnxn_cuda_ptr_t cu_nb)
 +{
 +    cudaError_t   stat;
 +    cu_atomdata_t *adat = cu_nb->atdat;
 +    cudaStream_t  ls    = cu_nb->stream[eintLocal];
 +
 +    stat = cudaMemsetAsync(adat->fshift, 0, SHIFTS * sizeof(*adat->fshift), ls);
 +    CU_RET_ERR(stat, "cudaMemsetAsync on fshift falied");
 +    stat = cudaMemsetAsync(adat->e_lj, 0, sizeof(*adat->e_lj), ls);
 +    CU_RET_ERR(stat, "cudaMemsetAsync on e_lj falied");
 +    stat = cudaMemsetAsync(adat->e_el, 0, sizeof(*adat->e_el), ls);
 +    CU_RET_ERR(stat, "cudaMemsetAsync on e_el falied");
 +}
 +
 +void nbnxn_cuda_clear_outputs(nbnxn_cuda_ptr_t cu_nb, int flags)
 +{
 +    nbnxn_cuda_clear_f(cu_nb, cu_nb->atdat->natoms);
 +    /* clear shift force array and energies if the outputs were 
 +       used in the current step */
 +    if (flags & GMX_FORCE_VIRIAL)
 +    {
 +        nbnxn_cuda_clear_e_fshift(cu_nb);
 +    }
 +}
 +
 +void nbnxn_cuda_init_atomdata(nbnxn_cuda_ptr_t cu_nb,
 +                              const nbnxn_atomdata_t *nbat)
 +{
 +    cudaError_t   stat;
 +    int           nalloc, natoms;
 +    bool          realloced;
 +    bool          bDoTime   = cu_nb->bDoTime;
 +    cu_timers_t   *timers   = cu_nb->timers;
 +    cu_atomdata_t *d_atdat  = cu_nb->atdat;
 +    cudaStream_t  ls        = cu_nb->stream[eintLocal];
 +
 +    natoms = nbat->natoms;
 +    realloced = false;
 +
 +    if (bDoTime)
 +    {
 +        /* time async copy */
 +        stat = cudaEventRecord(timers->start_atdat, ls);
 +        CU_RET_ERR(stat, "cudaEventRecord failed");
 +    }
 +
 +    /* need to reallocate if we have to copy more atoms than there is space
 +       for; this also covers the first call, i.e. d_atdat->nalloc == -1 */
 +    if (natoms > d_atdat->nalloc)
 +    {
 +        nalloc = over_alloc_small(natoms);
 +
 +        /* free up first if the arrays have already been initialized */
 +        if (d_atdat->nalloc != -1)
 +        {
 +            cu_free_buffered(d_atdat->f, &d_atdat->natoms, &d_atdat->nalloc);
 +            cu_free_buffered(d_atdat->xq);
 +            cu_free_buffered(d_atdat->atom_types);
 +        }
 +
 +        stat = cudaMalloc((void **)&d_atdat->f, nalloc*sizeof(*d_atdat->f));
 +        CU_RET_ERR(stat, "cudaMalloc failed on d_atdat->f");
 +        stat = cudaMalloc((void **)&d_atdat->xq, nalloc*sizeof(*d_atdat->xq));
 +        CU_RET_ERR(stat, "cudaMalloc failed on d_atdat->xq");
 +
 +        stat = cudaMalloc((void **)&d_atdat->atom_types, nalloc*sizeof(*d_atdat->atom_types));
 +        CU_RET_ERR(stat, "cudaMalloc failed on d_atdat->atom_types");
 +
 +        d_atdat->nalloc = nalloc;
 +        realloced = true;
 +    }
 +
 +    d_atdat->natoms = natoms;
 +    d_atdat->natoms_local = nbat->natoms_local;
 +
 +    /* need to clear GPU f output if realloc happened */
 +    if (realloced)
 +    {
 +        nbnxn_cuda_clear_f(cu_nb, nalloc);
 +    }
 +
 +    cu_copy_H2D_async(d_atdat->atom_types, nbat->type,
 +                      natoms*sizeof(*d_atdat->atom_types), ls);
 +
 +    if (bDoTime)
 +    {
 +        stat = cudaEventRecord(timers->stop_atdat, ls);
 +        CU_RET_ERR(stat, "cudaEventRecord failed");
 +    }
 +}
 +
 +void nbnxn_cuda_free(FILE *fplog, nbnxn_cuda_ptr_t cu_nb)
 +{
 +    cudaError_t     stat;
 +    cu_atomdata_t   *atdat;
 +    cu_nbparam_t    *nbparam;
 +    cu_plist_t      *plist, *plist_nl;
 +    cu_timers_t     *timers;
 +
 +    if (cu_nb == NULL) return;
 +
 +    atdat       = cu_nb->atdat;
 +    nbparam     = cu_nb->nbparam;
 +    plist       = cu_nb->plist[eintLocal];
 +    plist_nl    = cu_nb->plist[eintNonlocal];
 +    timers      = cu_nb->timers;
 +
 +    if (nbparam->eeltype == eelCuEWALD || nbparam->eeltype == eelCuEWALD_TWIN)
 +    {
 +        stat = cudaUnbindTexture(nbnxn_cuda_get_coulomb_tab_texref());
 +        CU_RET_ERR(stat, "cudaUnbindTexture on coulomb_tab failed");
 +        cu_free_buffered(nbparam->coulomb_tab, &nbparam->coulomb_tab_size);
 +    }
 +
 +    stat = cudaEventDestroy(cu_nb->nonlocal_done);
 +    CU_RET_ERR(stat, "cudaEventDestroy failed on timers->nonlocal_done");
 +    stat = cudaEventDestroy(cu_nb->misc_ops_done);
 +    CU_RET_ERR(stat, "cudaEventDestroy failed on timers->misc_ops_done");
 +
 +    if (cu_nb->bDoTime)
 +    {
 +        stat = cudaEventDestroy(timers->start_atdat);
 +        CU_RET_ERR(stat, "cudaEventDestroy failed on timers->start_atdat");
 +        stat = cudaEventDestroy(timers->stop_atdat);
 +        CU_RET_ERR(stat, "cudaEventDestroy failed on timers->stop_atdat");
 +
 +        /* The non-local counters/stream (second in the array) are needed only with DD. */
 +        for (int i = 0; i <= (cu_nb->bUseTwoStreams ? 1 : 0); i++)
 +        {
 +            stat = cudaEventDestroy(timers->start_nb_k[i]);
 +            CU_RET_ERR(stat, "cudaEventDestroy failed on timers->start_nb_k");
 +            stat = cudaEventDestroy(timers->stop_nb_k[i]);
 +            CU_RET_ERR(stat, "cudaEventDestroy failed on timers->stop_nb_k");
 +
 +            stat = cudaEventDestroy(timers->start_pl_h2d[i]);
 +            CU_RET_ERR(stat, "cudaEventDestroy failed on timers->start_pl_h2d");
 +            stat = cudaEventDestroy(timers->stop_pl_h2d[i]);
 +            CU_RET_ERR(stat, "cudaEventDestroy failed on timers->stop_pl_h2d");
 +
 +            stat = cudaStreamDestroy(cu_nb->stream[i]);
 +            CU_RET_ERR(stat, "cudaStreamDestroy failed on stream");
 +
 +            stat = cudaEventDestroy(timers->start_nb_h2d[i]);
 +            CU_RET_ERR(stat, "cudaEventDestroy failed on timers->start_nb_h2d");
 +            stat = cudaEventDestroy(timers->stop_nb_h2d[i]);
 +            CU_RET_ERR(stat, "cudaEventDestroy failed on timers->stop_nb_h2d");
 +
 +            stat = cudaEventDestroy(timers->start_nb_d2h[i]);
 +            CU_RET_ERR(stat, "cudaEventDestroy failed on timers->start_nb_d2h");
 +            stat = cudaEventDestroy(timers->stop_nb_d2h[i]);
 +            CU_RET_ERR(stat, "cudaEventDestroy failed on timers->stop_nb_d2h");
 +        }
 +    }
 +
 +    stat = cudaUnbindTexture(nbnxn_cuda_get_nbfp_texref());
 +    CU_RET_ERR(stat, "cudaUnbindTexture on coulomb_tab failed");
 +    cu_free_buffered(nbparam->nbfp);
 +
 +    stat = cudaFree(atdat->shift_vec);
 +    CU_RET_ERR(stat, "cudaFree failed on atdat->shift_vec");
 +    stat = cudaFree(atdat->fshift);
 +    CU_RET_ERR(stat, "cudaFree failed on atdat->fshift");
 +
 +    stat = cudaFree(atdat->e_lj);
 +    CU_RET_ERR(stat, "cudaFree failed on atdat->e_lj");
 +    stat = cudaFree(atdat->e_el);
 +    CU_RET_ERR(stat, "cudaFree failed on atdat->e_el");
 +
 +    cu_free_buffered(atdat->f, &atdat->natoms, &atdat->nalloc);
 +    cu_free_buffered(atdat->xq);
 +    cu_free_buffered(atdat->atom_types, &atdat->ntypes);
 +
 +    cu_free_buffered(plist->sci, &plist->nsci, &plist->sci_nalloc);
 +    cu_free_buffered(plist->cj4, &plist->ncj4, &plist->cj4_nalloc);
 +    cu_free_buffered(plist->excl, &plist->nexcl, &plist->excl_nalloc);
 +    if (cu_nb->bUseTwoStreams)
 +    {
 +        cu_free_buffered(plist_nl->sci, &plist_nl->nsci, &plist_nl->sci_nalloc);
 +        cu_free_buffered(plist_nl->cj4, &plist_nl->ncj4, &plist_nl->cj4_nalloc);
 +        cu_free_buffered(plist_nl->excl, &plist_nl->nexcl, &plist_nl->excl_nalloc);
 +    }
 +
 +    if (debug)
 +    {
 +        fprintf(debug, "Cleaned up CUDA data structures.\n");
 +    }
 +}
 +
 +void cu_synchstream_atdat(nbnxn_cuda_ptr_t cu_nb, int iloc)
 +{
 +    cudaError_t stat;
 +    cudaStream_t stream = cu_nb->stream[iloc];
 +
 +    stat = cudaStreamWaitEvent(stream, cu_nb->timers->stop_atdat, 0);
 +    CU_RET_ERR(stat, "cudaStreamWaitEvent failed");
 +}
 +
 +wallclock_gpu_t * nbnxn_cuda_get_timings(nbnxn_cuda_ptr_t cu_nb)
 +{
 +    return (cu_nb != NULL && cu_nb->bDoTime) ? cu_nb->timings : NULL;
 +}
 +
 +void nbnxn_cuda_reset_timings(nbnxn_cuda_ptr_t cu_nb)
 +{
 +    if (cu_nb->bDoTime)
 +    {
 +        init_timings(cu_nb->timings);
 +    }
 +}
 +
 +int nbnxn_cuda_min_ci_balanced(nbnxn_cuda_ptr_t cu_nb)
 +{
 +    return cu_nb != NULL ?
 +        gpu_min_ci_balanced_factor*cu_nb->dev_info->prop.multiProcessorCount : 0;
 +
 +}
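 +
 +/* Illustrative sketch only (nsci and split_pair_list are hypothetical
 +   names, not GROMACS API): a pair-list producer could use the value
 +   returned above to keep all multiprocessors busy,
 +
 +       int min_ci = nbnxn_cuda_min_ci_balanced(cu_nb);
 +       if (min_ci > 0 && nsci < min_ci)
 +       {
 +           split_pair_list(plist, min_ci);
 +       }
 +
 +   e.g. with gpu_min_ci_balanced_factor = 40 (a value assumed here purely
 +   for illustration) a 14-SM device would ask for at least 560
 +   super-cluster list entries. */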
index 22c6bb931801c58072e04aafe569ac8c8bb14b26,0000000000000000000000000000000000000000..c7ab7cc6c62fc2db701e8b7df5d451ca8f9278fa
mode 100644,000000..100644
--- /dev/null
@@@ -1,431 -1,0 +1,439 @@@
-     int *atib = (int *)(xqib + NCL_PER_SUPERCL * CL_SIZE);
 +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
 + *
 + *
 + *                This source code is part of
 + *
 + *                 G   R   O   M   A   C   S
 + *
 + *          GROningen MAchine for Chemical Simulations
 + *
 + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
 + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 + * Copyright (c) 2001-2012, The GROMACS development team,
 + * check out http://www.gromacs.org for more information.
 + *
 + * This program is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU General Public License
 + * as published by the Free Software Foundation; either version 2
 + * of the License, or (at your option) any later version.
 + *
 + * If you want to redistribute modifications, please consider that
 + * scientific software is very special. Version control is crucial -
 + * bugs must be traceable. We will be happy to consider code for
 + * inclusion in the official distribution, but derived work must not
 + * be called official GROMACS. Details are found in the README & COPYING
 + * files - if they are missing, get the official version at www.gromacs.org.
 + *
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the papers on the package - you can find them in the top README file.
 + *
 + * For more info, check our website at http://www.gromacs.org
 + *
 + * And Hey:
 + * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon
 + */
 +
 +#include "maths.h"
 +/* Note that floating-point constants in CUDA code should be suffixed
 + * with f (e.g. 0.5f), to stop the compiler producing intermediate
 + * code that is in double precision.
 + */
 +
 +#if __CUDA_ARCH__ >= 300
 +#define REDUCE_SHUFFLE
 +/* On Kepler pre-loading i-atom types to shmem gives a few %,
 +   but on Fermi it does not */
 +#define IATYPE_SHMEM
 +#endif
 +
 +/*
 +   Kernel launch parameters:
 +    - #blocks   = #pair lists, blockId = pair list Id
 +    - #threads  = CL_SIZE^2
 +    - shmem     = CL_SIZE^2 * sizeof(float)
 +
 +    Each thread calculates an i force-component taking one pair of i-j atoms.
 + */
 +#ifdef PRUNE_NBL
 +#ifdef CALC_ENERGIES
 +__global__ void NB_KERNEL_FUNC_NAME(k_nbnxn, _ener_prune)
 +#else
 +__global__ void NB_KERNEL_FUNC_NAME(k_nbnxn, _prune)
 +#endif
 +#else
 +#ifdef CALC_ENERGIES
 +__global__ void NB_KERNEL_FUNC_NAME(k_nbnxn, _ener)
 +#else
 +__global__ void NB_KERNEL_FUNC_NAME(k_nbnxn)
 +#endif
 +#endif
 +            (const cu_atomdata_t atdat,
 +             const cu_nbparam_t nbparam,
 +             const cu_plist_t plist,
 +             bool bCalcFshift)
 +{
 +    /* convenience variables */
 +    const nbnxn_sci_t *pl_sci   = plist.sci;
 +#ifndef PRUNE_NBL
 +    const
 +#endif
 +    nbnxn_cj4_t *pl_cj4         = plist.cj4;
 +    const nbnxn_excl_t *excl    = plist.excl;
 +    const int *atom_types       = atdat.atom_types;
 +    int ntypes                  = atdat.ntypes;
 +    const float4 *xq            = atdat.xq;
 +    float3 *f                   = atdat.f;
 +    const float3 *shift_vec     = atdat.shift_vec;
 +    float rcoulomb_sq           = nbparam.rcoulomb_sq;
 +#ifdef VDW_CUTOFF_CHECK
 +    float rvdw_sq               = nbparam.rvdw_sq;
 +    float vdw_in_range;
 +#endif
 +#ifdef EL_RF
 +    float two_k_rf              = nbparam.two_k_rf;
 +#endif
 +#ifdef EL_EWALD
 +    float coulomb_tab_scale     = nbparam.coulomb_tab_scale;
 +#endif
 +#ifdef PRUNE_NBL
 +    float rlist_sq              = nbparam.rlist_sq;
 +#endif
 +
 +#ifdef CALC_ENERGIES
 +    float lj_shift    = nbparam.sh_invrc6;
 +#ifdef EL_EWALD
 +    float beta        = nbparam.ewald_beta;
 +    float ewald_shift = nbparam.sh_ewald;
 +#else
 +    float c_rf        = nbparam.c_rf;
 +#endif
 +    float *e_lj       = atdat.e_lj;
 +    float *e_el       = atdat.e_el;
 +#endif
 +
 +    /* thread/block/warp id-s */
 +    unsigned int tidxi  = threadIdx.x;
 +    unsigned int tidxj  = threadIdx.y;
 +    unsigned int tidx   = threadIdx.y * blockDim.x + threadIdx.x;
 +    unsigned int bidx   = blockIdx.x;
 +    unsigned int widx   = tidx / WARP_SIZE; /* warp index */
 +
 +    int sci, ci, cj, ci_offset,
 +        ai, aj,
 +        cij4_start, cij4_end,
 +        typei, typej,
 +        i, jm, j4, wexcl_idx;
 +    float qi, qj_f,
 +          r2, inv_r, inv_r2, inv_r6,
 +          c6, c12,
 +          int_bit,
 +#ifdef CALC_ENERGIES
 +          E_lj, E_el, E_lj_p,
 +#endif
 +          F_invr;
 +    unsigned int wexcl, imask, mask_ji;
 +    float4 xqbuf;
 +    float3 xi, xj, rv, f_ij, fcj_buf, fshift_buf;
 +    float3 fci_buf[NCL_PER_SUPERCL];    /* i force buffer */
 +    nbnxn_sci_t nb_sci;
 +
 +    /* shmem buffer for i x+q pre-loading */
 +    extern __shared__  float4 xqib[];
++    /* shmem buffer for cj, for both warps separately */
++    int *cjs     = (int *)(xqib + NCL_PER_SUPERCL * CL_SIZE);
 +#ifdef IATYPE_SHMEM
 +    /* shmem buffer for i atom-type pre-loading */
-     float *f_buf = (float *)(xqib + NCL_PER_SUPERCL * CL_SIZE);
++    int *atib = (int *)(cjs + 2 * NBNXN_GPU_JGROUP_SIZE);
 +#endif
 +
 +#ifndef REDUCE_SHUFFLE
 +    /* shmem j force buffer */
 +#ifdef IATYPE_SHMEM
 +    float *f_buf = (float *)(atib + NCL_PER_SUPERCL * CL_SIZE);
 +#else
-                     cj      = pl_cj4[j4].cj[jm];
++    float *f_buf = (float *)(cjs + 2 * NBNXN_GPU_JGROUP_SIZE);
 +#endif
 +#endif
 +
 +    nb_sci      = pl_sci[bidx];         /* my i super-cluster's index = current bidx */
 +    sci         = nb_sci.sci;           /* super-cluster */
 +    cij4_start  = nb_sci.cj4_ind_start; /* first ...*/
 +    cij4_end    = nb_sci.cj4_ind_end;   /* and last index of j clusters */
 +
 +    /* Store the i-atom x and q in shared memory */
 +    /* Note: the thread indexing here is inverted with respect to the
 +       inner-loop as this results in slightly higher performance */
 +    ci = sci * NCL_PER_SUPERCL + tidxi;
 +    ai = ci * CL_SIZE + tidxj;
 +    xqib[tidxi * CL_SIZE + tidxj] = xq[ai] + shift_vec[nb_sci.shift];
 +#ifdef IATYPE_SHMEM
 +    ci = sci * NCL_PER_SUPERCL + tidxj;
 +    ai = ci * CL_SIZE + tidxi;
 +    atib[tidxj * CL_SIZE + tidxi] = atom_types[ai];
 +#endif
 +    __syncthreads();
 +
 +    for(ci_offset = 0; ci_offset < NCL_PER_SUPERCL; ci_offset++)
 +    {
 +        fci_buf[ci_offset] = make_float3(0.0f);
 +    }
 +
 +#ifdef CALC_ENERGIES
 +    E_lj = 0.0f;
 +    E_el = 0.0f;
 +
 +#if defined EL_EWALD || defined EL_RF
 +    if (nb_sci.shift == CENTRAL && pl_cj4[cij4_start].cj[0] == sci*NCL_PER_SUPERCL)
 +    {
 +        /* we have the diagonal: add the charge self interaction energy term */
 +        for (i = 0; i < NCL_PER_SUPERCL; i++)
 +        {
 +            qi    = xqib[i * CL_SIZE + tidxi].w;
 +            E_el += qi*qi;
 +        }
 +        /* divide the self term equally over the j-threads */
 +        E_el /= CL_SIZE;
 +#ifdef EL_RF
 +        E_el *= -nbparam.epsfac*0.5f*c_rf;
 +#else
 +        E_el *= -nbparam.epsfac*beta*M_FLOAT_1_SQRTPI; /* last factor 1/sqrt(pi) */
 +#endif
 +    }
 +#endif
 +#endif
 +
 +    /* skip central shifts when summing shift forces */
 +    if (nb_sci.shift == CENTRAL)
 +    {
 +        bCalcFshift = false;
 +    }
 +
 +    fshift_buf = make_float3(0.0f);
 +
 +    /* loop over the j clusters = seen by any of the atoms in the current super-cluster */
 +    for (j4 = cij4_start; j4 < cij4_end; j4++)
 +    {
 +        wexcl_idx   = pl_cj4[j4].imei[widx].excl_ind;
 +        imask       = pl_cj4[j4].imei[widx].imask;
 +        wexcl       = excl[wexcl_idx].pair[(tidx) & (WARP_SIZE - 1)];
 +
 +#ifndef PRUNE_NBL
 +        if (imask)
 +#endif
 +        {
++            /* Pre-load cj into shared memory on both warps separately */
++            if ((tidxj == 0 || tidxj == 4) && tidxi < NBNXN_GPU_JGROUP_SIZE)
++            {
++                cjs[tidxi + tidxj * NBNXN_GPU_JGROUP_SIZE / 4] = pl_cj4[j4].cj[tidxi];
++            }
++
 +            /* Unrolling this loop
 +               - with pruning leads to register spilling;
 +               - on Kepler is much slower;
 +               - doesn't work on CUDA <v4.1
 +               Tested with nvcc 3.2 - 5.0.7 */
 +#if !defined PRUNE_NBL && __CUDA_ARCH__ < 300 && CUDA_VERSION >= 4010
 +#pragma unroll 4
 +#endif
 +            for (jm = 0; jm < NBNXN_GPU_JGROUP_SIZE; jm++)
 +            {
 +                if (imask & (supercl_interaction_mask << (jm * NCL_PER_SUPERCL)))
 +                {
 +                    mask_ji = (1U << (jm * NCL_PER_SUPERCL));
 +
++                    cj      = cjs[jm + (tidxj & 4) * NBNXN_GPU_JGROUP_SIZE / 4];
 +                    aj      = cj * CL_SIZE + tidxj;
 +
 +                    /* load j atom data */
 +                    xqbuf   = xq[aj];
 +                    xj      = make_float3(xqbuf.x, xqbuf.y, xqbuf.z);
 +                    qj_f    = nbparam.epsfac * xqbuf.w;
 +                    typej   = atom_types[aj];
 +
 +                    fcj_buf = make_float3(0.0f);
 +
 +                    /* The PME and RF kernels don't unroll with CUDA <v4.1. */
 +#if !defined PRUNE_NBL && !(CUDA_VERSION < 4010 && (defined EL_EWALD || defined EL_RF))
 +#pragma unroll 8
 +#endif
 +                    for(i = 0; i < NCL_PER_SUPERCL; i++)
 +                    {
 +                        if (imask & mask_ji)
 +                        {
 +                            ci_offset   = i;    /* i force buffer offset */
 +
 +                            ci      = sci * NCL_PER_SUPERCL + i; /* i cluster index */
 +                            ai      = ci * CL_SIZE + tidxi;      /* i atom index */
 +
 +                            /* all threads load an atom from i cluster ci into shmem! */
 +                            xqbuf   = xqib[i * CL_SIZE + tidxi];
 +                            xi      = make_float3(xqbuf.x, xqbuf.y, xqbuf.z);
 +
 +                            /* distance between i and j atoms */
 +                            rv      = xi - xj;
 +                            r2      = norm2(rv);
 +
 +#ifdef PRUNE_NBL
 +                            /* If _none_ of the atoms pairs are in cutoff range,
 +                               the bit corresponding to the current
 +                               cluster-pair in imask gets set to 0. */
 +                            if (!__any(r2 < rlist_sq))
 +                            {
 +                                imask &= ~mask_ji;
 +                            }
 +#endif
 +
 +                            int_bit = (wexcl & mask_ji) ? 1.0f : 0.0f;
 +
 +                            /* cutoff & exclusion check */
 +#if defined EL_EWALD || defined EL_RF
 +                            if (r2 < rcoulomb_sq *
 +                                (nb_sci.shift != CENTRAL || ci != cj || tidxj > tidxi))
 +#else
 +                            if (r2 < rcoulomb_sq * int_bit)
 +#endif
 +                            {
 +                                /* load the rest of the i-atom parameters */
 +                                qi      = xqbuf.w;
 +#ifdef IATYPE_SHMEM
 +                                typei   = atib[i * CL_SIZE + tidxi];
 +#else
 +                                typei   = atom_types[ai];
 +#endif
 +
 +                                /* LJ 6*C6 and 12*C12 */
 +                                c6      = tex1Dfetch(tex_nbfp, 2 * (ntypes * typei + typej));
 +                                c12     = tex1Dfetch(tex_nbfp, 2 * (ntypes * typei + typej) + 1);
 +
 +                                /* avoid NaN for excluded pairs at r=0 */
 +                                r2      += (1.0f - int_bit) * NBNXN_AVOID_SING_R2_INC;
 +
 +                                inv_r   = rsqrt(r2);
 +                                inv_r2  = inv_r * inv_r;
 +                                inv_r6  = inv_r2 * inv_r2 * inv_r2;
 +#if defined EL_EWALD || defined EL_RF
 +                                /* We could mask inv_r2, but with Ewald
 +                                 * masking both inv_r6 and F_invr is faster */
 +                                inv_r6  *= int_bit;
 +#endif
 +
 +                                F_invr  = inv_r6 * (c12 * inv_r6 - c6) * inv_r2;
 +
 +#ifdef CALC_ENERGIES
 +                                E_lj_p  = int_bit * (c12 * (inv_r6 * inv_r6 - lj_shift * lj_shift) * 0.08333333f - c6 * (inv_r6 - lj_shift) * 0.16666667f);
 +#endif
 +
 +#ifdef VDW_CUTOFF_CHECK
 +                                /* this enables twin-range cut-offs (rvdw < rcoulomb <= rlist) */
 +                                vdw_in_range = (r2 < rvdw_sq) ? 1.0f : 0.0f;
 +                                F_invr  *= vdw_in_range;
 +#ifdef CALC_ENERGIES
 +                                E_lj_p  *= vdw_in_range;
 +#endif
 +#endif
 +#ifdef CALC_ENERGIES
 +                                E_lj    += E_lj_p;
 +#endif
 +
 +
 +#ifdef EL_CUTOFF
 +                                F_invr  += qi * qj_f * inv_r2 * inv_r;
 +#endif
 +#ifdef EL_RF
 +                                F_invr  += qi * qj_f * (int_bit*inv_r2 * inv_r - two_k_rf);
 +#endif
 +#ifdef EL_EWALD
 +                                F_invr  += qi * qj_f * (int_bit*inv_r2 - interpolate_coulomb_force_r(r2 * inv_r, coulomb_tab_scale)) * inv_r;
 +#endif
 +
 +#ifdef CALC_ENERGIES
 +#ifdef EL_CUTOFF
 +                                E_el    += qi * qj_f * (inv_r - c_rf);
 +#endif
 +#ifdef EL_RF
 +                                E_el    += qi * qj_f * (int_bit*inv_r + 0.5f * two_k_rf * r2 - c_rf);
 +#endif
 +#ifdef EL_EWALD
 +                                /* 1.0f - erff is faster than erfcf */
 +                                E_el    += qi * qj_f * (inv_r * (int_bit - erff(r2 * inv_r * beta)) - int_bit * ewald_shift);
 +#endif
 +#endif
 +                                f_ij    = rv * F_invr;
 +
 +                                /* accumulate j forces in registers */
 +                                fcj_buf -= f_ij;
 +
 +                                /* accumulate i forces in registers */
 +                                fci_buf[ci_offset] += f_ij;
 +                            }
 +                        }
 +
 +                        /* shift the mask bit by 1 */
 +                        mask_ji += mask_ji;
 +                    }
 +
 +                    /* reduce j forces */
 +#ifdef REDUCE_SHUFFLE
 +                    reduce_force_j_warp_shfl(fcj_buf, f, tidxi, aj);
 +#else
 +                    /* store j forces in shmem */
 +                    f_buf[                  tidx] = fcj_buf.x;
 +                    f_buf[    FBUF_STRIDE + tidx] = fcj_buf.y;
 +                    f_buf[2 * FBUF_STRIDE + tidx] = fcj_buf.z;
 +
 +                    reduce_force_j_generic(f_buf, f, tidxi, tidxj, aj);
 +#endif
 +                }
 +            }
 +#ifdef PRUNE_NBL
 +            /* Update the imask with the new one which does not contain the
 +               out of range clusters anymore. */
 +            pl_cj4[j4].imei[widx].imask = imask;
 +#endif
 +        }
 +    }
 +
 +    /* reduce i forces */
 +    for(ci_offset = 0; ci_offset < NCL_PER_SUPERCL; ci_offset++)
 +    {
 +        ai  = (sci * NCL_PER_SUPERCL + ci_offset) * CL_SIZE + tidxi;
 +#ifdef REDUCE_SHUFFLE
 +        reduce_force_i_warp_shfl(fci_buf[ci_offset], f,
 +                                 &fshift_buf, bCalcFshift,
 +                                 tidxj, ai);
 +#else
 +        f_buf[                  tidx] = fci_buf[ci_offset].x;
 +        f_buf[    FBUF_STRIDE + tidx] = fci_buf[ci_offset].y;
 +        f_buf[2 * FBUF_STRIDE + tidx] = fci_buf[ci_offset].z;
 +        __syncthreads();
 +        reduce_force_i(f_buf, f,
 +                       &fshift_buf, bCalcFshift,
 +                       tidxi, tidxj, ai);
 +        __syncthreads();
 +#endif
 +    }
 +
 +    /* add up local shift forces into global mem */
 +#ifdef REDUCE_SHUFFLE
 +    if (bCalcFshift && (tidxj == 0 || tidxj == 4))
 +#else
 +    if (bCalcFshift && tidxj == 0)
 +#endif
 +    {
 +        atomicAdd(&atdat.fshift[nb_sci.shift].x, fshift_buf.x);
 +        atomicAdd(&atdat.fshift[nb_sci.shift].y, fshift_buf.y);
 +        atomicAdd(&atdat.fshift[nb_sci.shift].z, fshift_buf.z);
 +    }
 +
 +#ifdef CALC_ENERGIES
 +#ifdef REDUCE_SHUFFLE
 +    /* reduce the energies over warps and store into global memory */
 +    reduce_energy_warp_shfl(E_lj, E_el, e_lj, e_el, tidx);
 +#else
 +    /* flush the energies to shmem and reduce them */
 +    f_buf[              tidx] = E_lj;
 +    f_buf[FBUF_STRIDE + tidx] = E_el;
 +    reduce_energy_pow2(f_buf + (tidx & WARP_SIZE), e_lj, e_el, tidx & ~WARP_SIZE);
 +#endif
 +#endif
 +}
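 +
 +/* A hedged sketch (not necessarily the exact launch code GROMACS uses)
 +   of the dynamic shared-memory size implied by the buffers declared at
 +   the top of the kernel:
 +
 +       size_t shmem = NCL_PER_SUPERCL * CL_SIZE * sizeof(float4);  // xqib
 +       shmem       += 2 * NBNXN_GPU_JGROUP_SIZE * sizeof(int);     // cjs
 +       #ifdef IATYPE_SHMEM
 +       shmem       += NCL_PER_SUPERCL * CL_SIZE * sizeof(int);     // atib
 +       #endif
 +       #ifndef REDUCE_SHUFFLE
 +       shmem       += 3 * FBUF_STRIDE * sizeof(float);             // f_buf
 +       #endif
 +
 +   which would then be passed as the third launch-configuration argument,
 +   kernel<<<nblock, dim_block, shmem, stream>>>(...); nblock and
 +   dim_block are placeholder names for the grid and block dimensions. */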
index 5b16fc06a7326e90b870e86a9aee0ee89260229b,0000000000000000000000000000000000000000..d0f32ee8e1b8e3ce4c8bdfcc74c72912315474fe
mode 100644,000000..100644
--- /dev/null
@@@ -1,5031 -1,0 +1,5040 @@@
-     if (nbat->bUseBufferFlags && LOCAL_I(iloc))
 +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
 + *
 + *
 + *                This source code is part of
 + *
 + *                 G   R   O   M   A   C   S
 + *
 + *          GROningen MAchine for Chemical Simulations
 + *
 + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
 + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 + * Copyright (c) 2001-2012, The GROMACS development team,
 + * check out http://www.gromacs.org for more information.
 + *
 + * This program is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU General Public License
 + * as published by the Free Software Foundation; either version 2
 + * of the License, or (at your option) any later version.
 + *
 + * If you want to redistribute modifications, please consider that
 + * scientific software is very special. Version control is crucial -
 + * bugs must be traceable. We will be happy to consider code for
 + * inclusion in the official distribution, but derived work must not
 + * be called official GROMACS. Details are found in the README & COPYING
 + * files - if they are missing, get the official version at www.gromacs.org.
 + *
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the papers on the package - you can find them in the top README file.
 + *
 + * For more info, check our website at http://www.gromacs.org
 + */
 +
 +#ifdef HAVE_CONFIG_H
 +#include <config.h>
 +#endif
 +
 +#include <math.h>
 +#include <string.h>
 +#include "sysstuff.h"
 +#include "smalloc.h"
 +#include "macros.h"
 +#include "maths.h"
 +#include "vec.h"
 +#include "pbc.h"
 +#include "nbnxn_consts.h"
 +#include "nbnxn_internal.h"
 +#include "nbnxn_atomdata.h"
 +#include "nbnxn_search.h"
 +#include "gmx_cyclecounter.h"
 +#include "gmxfio.h"
 +#include "gmx_omp_nthreads.h"
 +#include "nrnb.h"
 +
 +
 +/* Pair search box lower and upper corner in x,y,z.
 + * Store this in 4 instead of 3 reals, which is useful with SSE.
 + * To avoid complicating the code we also use 4 without SSE.
 + */
 +#define NNBSBB_C         4
 +#define NNBSBB_B         (2*NNBSBB_C)
 +/* Pair search box lower and upper bound in z only. */
 +#define NNBSBB_D         2
 +/* Pair search box lower and upper corner x,y,z indices */
 +#define BBL_X  0
 +#define BBL_Y  1
 +#define BBL_Z  2
 +#define BBU_X  4
 +#define BBU_Y  5
 +#define BBU_Z  6
 +
 +
 +#ifdef NBNXN_SEARCH_BB_SSE
 +/* We use SSE or AVX-128bit for bounding box calculations */
 +
 +#ifndef GMX_DOUBLE
 +/* Single precision BBs + coordinates, we can also load coordinates using SSE */
 +#define NBNXN_SEARCH_SSE_SINGLE
 +#endif
 +
 +/* Include basic SSE2 stuff */
 +#include <emmintrin.h>
 +
 +#if defined NBNXN_SEARCH_SSE_SINGLE && (GPU_NSUBCELL == 4 || GPU_NSUBCELL == 8)
 +/* Store bounding boxes with x, y and z coordinates in packs of 4 */
 +#define NBNXN_PBB_SSE
 +#endif
 +
 +/* The width of SSE/AVX128 with single precision for bounding boxes with GPU.
 + * Here AVX-256 turns out to be slightly slower than AVX-128.
 + */
 +#define STRIDE_PBB        4
 +#define STRIDE_PBB_2LOG   2
 +
 +#endif /* NBNXN_SEARCH_BB_SSE */
 +
 +#ifdef GMX_NBNXN_SIMD
 +
 +/* The functions below are macros as they are performance sensitive */
 +
 +/* 4x4 list, pack=4: no complex conversion required */
 +/* i-cluster to j-cluster conversion */
 +#define CI_TO_CJ_J4(ci)   (ci)
 +/* cluster index to coordinate array index conversion */
 +#define X_IND_CI_J4(ci)  ((ci)*STRIDE_P4)
 +#define X_IND_CJ_J4(cj)  ((cj)*STRIDE_P4)
 +
 +/* 4x2 list, pack=4: j-cluster size is half the packing width */
 +/* i-cluster to j-cluster conversion */
 +#define CI_TO_CJ_J2(ci)  ((ci)<<1)
 +/* cluster index to coordinate array index conversion */
 +#define X_IND_CI_J2(ci)  ((ci)*STRIDE_P4)
 +#define X_IND_CJ_J2(cj)  (((cj)>>1)*STRIDE_P4 + ((cj) & 1)*(PACK_X4>>1))
 +
 +/* 4x8 list, pack=8: i-cluster size is half the packing width */
 +/* i-cluster to j-cluster conversion */
 +#define CI_TO_CJ_J8(ci)  ((ci)>>1)
 +/* cluster index to coordinate array index conversion */
 +#define X_IND_CI_J8(ci)  (((ci)>>1)*STRIDE_P8 + ((ci) & 1)*(PACK_X8>>1))
 +#define X_IND_CJ_J8(cj)  ((cj)*STRIDE_P8)
 +
 +/* The j-cluster size is matched to the SIMD width */
 +#if GMX_NBNXN_SIMD_BITWIDTH == 128
 +#ifdef GMX_DOUBLE
 +#define CI_TO_CJ_SIMD_4XN(ci)  CI_TO_CJ_J2(ci)
 +#define X_IND_CI_SIMD_4XN(ci)  X_IND_CI_J2(ci)
 +#define X_IND_CJ_SIMD_4XN(cj)  X_IND_CJ_J2(cj)
 +#else
 +#define CI_TO_CJ_SIMD_4XN(ci)  CI_TO_CJ_J4(ci)
 +#define X_IND_CI_SIMD_4XN(ci)  X_IND_CI_J4(ci)
 +#define X_IND_CJ_SIMD_4XN(cj)  X_IND_CJ_J4(cj)
 +#endif
 +#else
 +#if GMX_NBNXN_SIMD_BITWIDTH == 256
 +#ifdef GMX_DOUBLE
 +#define CI_TO_CJ_SIMD_4XN(ci)  CI_TO_CJ_J4(ci)
 +#define X_IND_CI_SIMD_4XN(ci)  X_IND_CI_J4(ci)
 +#define X_IND_CJ_SIMD_4XN(cj)  X_IND_CJ_J4(cj)
 +#else
 +#define CI_TO_CJ_SIMD_4XN(ci)  CI_TO_CJ_J8(ci)
 +#define X_IND_CI_SIMD_4XN(ci)  X_IND_CI_J8(ci)
 +#define X_IND_CJ_SIMD_4XN(cj)  X_IND_CJ_J8(cj)
 +/* Half SIMD with j-cluster size */
 +#define CI_TO_CJ_SIMD_2XNN(ci) CI_TO_CJ_J4(ci)
 +#define X_IND_CI_SIMD_2XNN(ci) X_IND_CI_J4(ci)
 +#define X_IND_CJ_SIMD_2XNN(cj) X_IND_CJ_J4(cj)
 +#endif
 +#else
 +#error "unsupported GMX_NBNXN_SIMD_BITWIDTH"
 +#endif
 +#endif
 +
 +#endif /* GMX_NBNXN_SIMD */
 +
 +
 +/* Interaction masks for 4xN atom interactions.
 + * Bit i*CJ_SIZE + j tells if atom i and j interact.
 + */
 +/* All interaction mask is the same for all kernels */
 +#define NBNXN_INT_MASK_ALL        0xffffffff
 +/* 4x4 kernel diagonal mask */
 +#define NBNXN_INT_MASK_DIAG       0x08ce
 +/* 4x2 kernel diagonal masks */
 +#define NBNXN_INT_MASK_DIAG_J2_0  0x0002
 +#define NBNXN_INT_MASK_DIAG_J2_1  0x002F
 +/* 4x8 kernel diagonal masks */
 +#define NBNXN_INT_MASK_DIAG_J8_0  0xf0f8fcfe
 +#define NBNXN_INT_MASK_DIAG_J8_1  0x0080c0e0
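 +
 +/* Worked example: in the 4x4 case bit i*4+j flags the i,j pair, and
 +   NBNXN_INT_MASK_DIAG = 0x08ce = binary 0000 1000 1100 1110 decodes as
 +   i=0: j=1,2,3; i=1: j=2,3; i=2: j=3; i=3: none. That is exactly the
 +   strict upper triangle, so pairs in diagonal cluster pairs are not
 +   double counted. */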
 +
 +
 +#ifdef NBNXN_SEARCH_BB_SSE
 +/* Store bounding boxes corners as quadruplets: xxxxyyyyzzzz */
 +#define NBNXN_BBXXXX
 +/* Size of bounding box corners quadruplet */
 +#define NNBSBB_XXXX      (NNBSBB_D*DIM*STRIDE_PBB)
 +#endif
 +
 +/* We shift the i-particles backward for PBC.
 + * This leads to more conditionals than shifting forward.
 + * We do this to get more balanced pair lists.
 + */
 +#define NBNXN_SHIFT_BACKWARD
 +
 +
 +/* This define is a lazy way to avoid interdependence of the grid
 + * and searching data structures.
 + */
 +#define NBNXN_NA_SC_MAX (GPU_NSUBCELL*NBNXN_GPU_CLUSTER_SIZE)
 +
 +
 +static void nbs_cycle_clear(nbnxn_cycle_t *cc)
 +{
 +    int i;
 +
 +    for(i=0; i<enbsCCnr; i++)
 +    {
 +        cc[i].count = 0;
 +        cc[i].c     = 0;
 +    }
 +}
 +
 +static double Mcyc_av(const nbnxn_cycle_t *cc)
 +{
 +    return (double)cc->c*1e-6/cc->count;
 +}
 +
 +static void nbs_cycle_print(FILE *fp,const nbnxn_search_t nbs)
 +{
 +    int n;
 +    int t;
 +
 +    fprintf(fp,"\n");
 +    fprintf(fp,"ns %4d grid %4.1f search %4.1f red.f %5.3f",
 +            nbs->cc[enbsCCgrid].count,
 +            Mcyc_av(&nbs->cc[enbsCCgrid]),
 +            Mcyc_av(&nbs->cc[enbsCCsearch]),
 +            Mcyc_av(&nbs->cc[enbsCCreducef]));
 +
 +    if (nbs->nthread_max > 1)
 +    {
 +        if (nbs->cc[enbsCCcombine].count > 0)
 +        {
 +            fprintf(fp," comb %5.2f",
 +                    Mcyc_av(&nbs->cc[enbsCCcombine]));
 +        }
 +        fprintf(fp," s. th");
 +        for(t=0; t<nbs->nthread_max; t++)
 +        {
 +            fprintf(fp," %4.1f",
 +                    Mcyc_av(&nbs->work[t].cc[enbsCCsearch]));
 +        }
 +    }
 +    fprintf(fp,"\n");
 +}
 +
 +static void nbnxn_grid_init(nbnxn_grid_t * grid)
 +{
 +    grid->cxy_na      = NULL;
 +    grid->cxy_ind     = NULL;
 +    grid->cxy_nalloc  = 0;
 +    grid->bb          = NULL;
 +    grid->bbj         = NULL;
 +    grid->nc_nalloc   = 0;
 +}
 +
 +static int get_2log(int n)
 +{
 +    int log2;
 +
 +    log2 = 0;
 +    while ((1<<log2) < n)
 +    {
 +        log2++;
 +    }
 +    if ((1<<log2) != n)
 +    {
 +        gmx_fatal(FARGS,"nbnxn na_c (%d) is not a power of 2",n);
 +    }
 +
 +    return log2;
 +}
 +
 +static int nbnxn_kernel_to_ci_size(int nb_kernel_type)
 +{
 +    switch (nb_kernel_type)
 +    {
 +    case nbnxnk4x4_PlainC:
 +    case nbnxnk4xN_SIMD_4xN:
 +    case nbnxnk4xN_SIMD_2xNN:
 +        return NBNXN_CPU_CLUSTER_I_SIZE;
 +    case nbnxnk8x8x8_CUDA:
 +    case nbnxnk8x8x8_PlainC:
 +        /* The cluster size for super/sub lists is only set here.
 +         * Any value should work for the pair-search and atomdata code.
 +         * The kernels, of course, might require a particular value.
 +         */
 +        return NBNXN_GPU_CLUSTER_SIZE;
 +    default:
 +        gmx_incons("unknown kernel type");
 +    }
 +
 +    return 0;
 +}
 +
 +int nbnxn_kernel_to_cj_size(int nb_kernel_type)
 +{
 +    int nbnxn_simd_width=0;
 +    int cj_size=0;
 +
 +#ifdef GMX_NBNXN_SIMD
 +    nbnxn_simd_width = GMX_NBNXN_SIMD_BITWIDTH/(sizeof(real)*8);
 +#endif
 +
 +    switch (nb_kernel_type)
 +    {
 +    case nbnxnk4x4_PlainC:
 +        cj_size = NBNXN_CPU_CLUSTER_I_SIZE;
 +        break;
 +    case nbnxnk4xN_SIMD_4xN:
 +        cj_size = nbnxn_simd_width;
 +        break;
 +    case nbnxnk4xN_SIMD_2xNN:
 +        cj_size = nbnxn_simd_width/2;
 +        break;
 +    case nbnxnk8x8x8_CUDA:
 +    case nbnxnk8x8x8_PlainC:
 +        cj_size = nbnxn_kernel_to_ci_size(nb_kernel_type);
 +        break;
 +    default:
 +        gmx_incons("unknown kernel type");
 +    }
 +
 +    return cj_size;
 +}
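 +
 +/* Worked example: with GMX_NBNXN_SIMD_BITWIDTH = 256 and single
 +   precision, nbnxn_simd_width = 256/(4*8) = 8, so the 4xN kernel pairs
 +   4-atom i-clusters with 8-atom j-clusters and the 2xNN kernel with
 +   4-atom ones; in double precision the width halves to 4 and the same
 +   kernels use cj sizes 4 and 2, respectively. */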
 +
 +static int ci_to_cj(int na_cj_2log,int ci)
 +{
 +    switch (na_cj_2log)
 +    {
 +    case 2: return  ci;     break;
 +    case 1: return (ci<<1); break;
 +    case 3: return (ci>>1); break;
 +    }
 +
 +    return 0;
 +}
 +
 +gmx_bool nbnxn_kernel_pairlist_simple(int nb_kernel_type)
 +{
 +    if (nb_kernel_type == nbnxnkNotSet)
 +    {
 +        gmx_fatal(FARGS, "Non-bonded kernel type not set for Verlet-style pair-list.");
 +    }
 +
 +    switch (nb_kernel_type)
 +    {
 +    case nbnxnk8x8x8_CUDA:
 +    case nbnxnk8x8x8_PlainC:
 +        return FALSE;
 +
 +    case nbnxnk4x4_PlainC:
 +    case nbnxnk4xN_SIMD_4xN:
 +    case nbnxnk4xN_SIMD_2xNN:
 +        return TRUE;
 +
 +    default:
 +        gmx_incons("Invalid nonbonded kernel type passed!");
 +        return FALSE;
 +    }
 +}
 +
 +void nbnxn_init_search(nbnxn_search_t * nbs_ptr,
 +                       ivec *n_dd_cells,
 +                       gmx_domdec_zones_t *zones,
 +                       int nthread_max)
 +{
 +    nbnxn_search_t nbs;
 +    int d,g,t;
 +
 +    snew(nbs,1);
 +    *nbs_ptr = nbs;
 +
 +    nbs->DomDec = (n_dd_cells != NULL);
 +
 +    clear_ivec(nbs->dd_dim);
 +    nbs->ngrid = 1;
 +    if (nbs->DomDec)
 +    {
 +        nbs->zones = zones;
 +
 +        for(d=0; d<DIM; d++)
 +        {
 +            if ((*n_dd_cells)[d] > 1)
 +            {
 +                nbs->dd_dim[d] = 1;
 +                /* Each grid matches a DD zone */
 +                nbs->ngrid *= 2;
 +            }
 +        }
 +    }
 +
 +    snew(nbs->grid,nbs->ngrid);
 +    for(g=0; g<nbs->ngrid; g++)
 +    {
 +        nbnxn_grid_init(&nbs->grid[g]);
 +    }
 +    nbs->cell        = NULL;
 +    nbs->cell_nalloc = 0;
 +    nbs->a           = NULL;
 +    nbs->a_nalloc    = 0;
 +
 +    nbs->nthread_max = nthread_max;
 +
 +    /* Initialize the work data structures for each thread */
 +    snew(nbs->work,nbs->nthread_max);
 +    for(t=0; t<nbs->nthread_max; t++)
 +    {
 +        nbs->work[t].cxy_na           = NULL;
 +        nbs->work[t].cxy_na_nalloc    = 0;
 +        nbs->work[t].sort_work        = NULL;
 +        nbs->work[t].sort_work_nalloc = 0;
 +    }
 +
 +    /* Initialize detailed nbsearch cycle counting */
 +    nbs->print_cycles = (getenv("GMX_NBNXN_CYCLE") != 0);
 +    nbs->search_count = 0;
 +    nbs_cycle_clear(nbs->cc);
 +    for(t=0; t<nbs->nthread_max; t++)
 +    {
 +        nbs_cycle_clear(nbs->work[t].cc);
 +    }
 +}
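 +
 +/* Worked example: with domain decomposition over x and y
 +   (n_dd_cells = {2,2,1}), dd_dim becomes {1,1,0} and ngrid = 1*2*2 = 4,
 +   one grid for each of the four DD zones, since each grid matches a
 +   DD zone. */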
 +
 +static real grid_atom_density(int n,rvec corner0,rvec corner1)
 +{
 +    rvec size;
 +
 +    rvec_sub(corner1,corner0,size);
 +
 +    return n/(size[XX]*size[YY]*size[ZZ]);
 +}
 +
 +static int set_grid_size_xy(const nbnxn_search_t nbs,
 +                            nbnxn_grid_t *grid,
 +                            int dd_zone,
 +                            int n,rvec corner0,rvec corner1,
 +                            real atom_density,
 +                            int XFormat)
 +{
 +    rvec size;
 +    int  na_c;
 +    real adens,tlen,tlen_x,tlen_y,nc_max;
 +    int  t;
 +
 +    rvec_sub(corner1,corner0,size);
 +
 +    if (n > grid->na_sc)
 +    {
 +        /* target cell length */
 +        if (grid->bSimple)
 +        {
 +            /* To minimize the zero interactions, we should make
 +             * the largest of the i/j cell cubic.
 +             */
 +            na_c = max(grid->na_c,grid->na_cj);
 +
 +            /* Approximately cubic cells */
 +            tlen   = pow(na_c/atom_density,1.0/3.0);
 +            tlen_x = tlen;
 +            tlen_y = tlen;
 +        }
 +        else
 +        {
 +            /* Approximately cubic sub cells */
 +            tlen   = pow(grid->na_c/atom_density,1.0/3.0);
 +            tlen_x = tlen*GPU_NSUBCELL_X;
 +            tlen_y = tlen*GPU_NSUBCELL_Y;
 +        }
 +        /* We round ncx and ncy down, because we get fewer cell pairs
 +         * in the pair list when the fixed cell dimensions (x,y) are
 +         * larger than the variable one (z) than when it is the other
 +         * way around.
 +         */
 +        grid->ncx = max(1,(int)(size[XX]/tlen_x));
 +        grid->ncy = max(1,(int)(size[YY]/tlen_y));
 +    }
 +    else
 +    {
 +        grid->ncx = 1;
 +        grid->ncy = 1;
 +    }
 +
 +    grid->sx = size[XX]/grid->ncx;
 +    grid->sy = size[YY]/grid->ncy;
 +    grid->inv_sx = 1/grid->sx;
 +    grid->inv_sy = 1/grid->sy;
 +
 +    if (dd_zone > 0)
 +    {
 +        /* This is a non-home zone, add an extra row of cells
 +         * for particles communicated for bonded interactions.
 +         * These can be beyond the cut-off. It doesn't matter where
 +         * they end up on the grid, but for performance it's better
 +         * if they don't end up in cells that can be within cut-off range.
 +         */
 +        grid->ncx++;
 +        grid->ncy++;
 +    }
 +
 +    /* We need one additional cell entry for particles moved by DD */
 +    if (grid->ncx*grid->ncy+1 > grid->cxy_nalloc)
 +    {
 +        grid->cxy_nalloc = over_alloc_large(grid->ncx*grid->ncy+1);
 +        srenew(grid->cxy_na,grid->cxy_nalloc);
 +        srenew(grid->cxy_ind,grid->cxy_nalloc+1);
 +    }
 +    for(t=0; t<nbs->nthread_max; t++)
 +    {
 +        if (grid->ncx*grid->ncy+1 > nbs->work[t].cxy_na_nalloc)
 +        {
 +            nbs->work[t].cxy_na_nalloc = over_alloc_large(grid->ncx*grid->ncy+1);
 +            srenew(nbs->work[t].cxy_na,nbs->work[t].cxy_na_nalloc);
 +        }
 +    }
 +
 +    /* Worst case scenario of 1 atom in each last cell */
 +    if (grid->na_cj <= grid->na_c)
 +    {
 +        nc_max = n/grid->na_sc + grid->ncx*grid->ncy;
 +    }
 +    else
 +    {
 +        nc_max = n/grid->na_sc + grid->ncx*grid->ncy*grid->na_cj/grid->na_c;
 +    }
 +
 +    if (nc_max > grid->nc_nalloc)
 +    {
 +        int bb_nalloc;
 +
 +        grid->nc_nalloc = over_alloc_large(nc_max);
 +        srenew(grid->nsubc,grid->nc_nalloc);
 +        srenew(grid->bbcz,grid->nc_nalloc*NNBSBB_D);
 +#ifdef NBNXN_PBB_SSE
 +        bb_nalloc = grid->nc_nalloc*GPU_NSUBCELL/STRIDE_PBB*NNBSBB_XXXX;
 +#else
 +        bb_nalloc = grid->nc_nalloc*GPU_NSUBCELL*NNBSBB_B;
 +#endif
 +        sfree_aligned(grid->bb);
 +        /* This snew also zeros the contents, which avoids possible
 +         * floating-point exceptions in SSE with the unused bb elements.
 +         */
 +        snew_aligned(grid->bb,bb_nalloc,16);
 +
 +        if (grid->bSimple)
 +        {
 +            if (grid->na_cj == grid->na_c)
 +            {
 +                grid->bbj = grid->bb;
 +            }
 +            else
 +            {
 +                sfree_aligned(grid->bbj);
 +                snew_aligned(grid->bbj,bb_nalloc*grid->na_c/grid->na_cj,16);
 +            }
 +        }
 +
 +        srenew(grid->flags,grid->nc_nalloc);
 +    }
 +
 +    copy_rvec(corner0,grid->c0);
 +    copy_rvec(corner1,grid->c1);
 +
 +    return nc_max;
 +}
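 +
 +/* Worked example (assumed numbers, for illustration only): for water at
 +   an atom density of ~100 nm^-3 with na_c = 4, the target cell length is
 +   tlen = (4/100)^(1/3) ~ 0.34 nm, so a 5 nm wide box is divided into
 +   ncx = ncy = (int)(5/0.34) = 14 approximately cubic columns. */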
 +
 +/* We need to sort particles in grid columns on z-coordinate.
 + * As particles are very often distributed homogeneously, we use a sorting
 + * algorithm similar to pigeonhole sort. We multiply the z-coordinate
 + * by a factor, cast to an int and try to store in that hole. If the hole
 + * is full, we move this or another particle. A second pass is needed to make
 + * the elements contiguous. SORT_GRID_OVERSIZE is the ratio of holes to particles.
 + * 4 is the optimal value for a homogeneous particle distribution and allows
 + * for an O(#particles) sort up to distributions where all particles are
 + * concentrated in 1/4 of the space. No NlogN fallback is implemented,
 + * as it can be expensive to detect inhomogeneous particle distributions.
 + * SGSF is the maximum ratio of holes used; in the worst case all particles
 + * end up in the last hole and we need #particles extra holes at the end.
 + * A minimal sketch of this scheme follows sort_atoms() below.
 + */
 +#define SORT_GRID_OVERSIZE 4
 +#define SGSF (SORT_GRID_OVERSIZE + 1)
 +
 +/* Sort particle index a on coordinates x along dim.
 + * Backwards tells if we want decreasing instead of increasing coordinates.
 + * h0 is the minimum of the coordinate range.
 + * invh is the inverse hole spacing.
 + * nsort, the theoretical hole limit, is only used for debugging.
 + * sort is the sorting work array.
 + */
 +static void sort_atoms(int dim,gmx_bool Backwards,
 +                       int *a,int n,rvec *x,
 +                       real h0,real invh,int nsort,int *sort)
 +{
 +    int i,c;
 +    int zi,zim,zi_min,zi_max;
 +    int cp,tmp;
 +
 +    if (n <= 1)
 +    {
 +        /* Nothing to do */
 +        return;
 +    }
 +
 +    /* Determine the index range used, so we can limit it for the second pass */
 +    zi_min = INT_MAX;
 +    zi_max = -1;
 +
 +    /* Sort the particles using a simple index sort */
 +    for(i=0; i<n; i++)
 +    {
 +        /* The cast takes care of floating-point rounding effects below zero.
 +         * This code assumes particles are less than 1/SORT_GRID_OVERSIZE
 +         * times the box height out of the box.
 +         */
 +        zi = (int)((x[a[i]][dim] - h0)*invh);
 +
 +#ifdef DEBUG_NBNXN_GRIDDING
 +        if (zi < 0 || zi >= nsort)
 +        {
 +            gmx_fatal(FARGS,"(int)((x[%d][%c]=%f - %f)*%f) = %d, not in 0 - %d\n",
 +                      a[i],'x'+dim,x[a[i]][dim],h0,invh,zi,nsort);
 +        }
 +#endif
 +
 +        /* Ideally this particle should go in sort cell zi,
 +         * but that might already be in use,
 +         * in that case find the first empty cell higher up
 +         */
 +        if (sort[zi] < 0)
 +        {
 +            sort[zi] = a[i];
 +            zi_min = min(zi_min,zi);
 +            zi_max = max(zi_max,zi);
 +        }
 +        else
 +        {
 +            /* We have multiple atoms in the same sorting slot.
 +             * Sort on real z for minimal bounding box size.
 +             * There is an extra check for identical z to ensure
 +             * a well-defined output order, independent of the input
 +             * order, which gives binary reproducibility after restarts.
 +             */
 +            while(sort[zi] >= 0 && ( x[a[i]][dim] >  x[sort[zi]][dim] ||
 +                                    (x[a[i]][dim] == x[sort[zi]][dim] &&
 +                                     a[i] > sort[zi])))
 +            {
 +                zi++;
 +            }
 +
 +            if (sort[zi] >= 0)
 +            {
 +                /* Shift all elements by one slot until we find an empty slot */
 +                cp = sort[zi];
 +                zim = zi + 1;
 +                while (sort[zim] >= 0)
 +                {
 +                    tmp = sort[zim];
 +                    sort[zim] = cp;
 +                    cp  = tmp;
 +                    zim++;
 +                }
 +                sort[zim] = cp;
 +                zi_max = max(zi_max,zim);
 +            }
 +            sort[zi] = a[i];
 +            zi_max = max(zi_max,zi);
 +        }
 +    }
 +
 +    c = 0;
 +    if (!Backwards)
 +    {
 +        for(zi=0; zi<nsort; zi++)
 +        {
 +            if (sort[zi] >= 0)
 +            {
 +                a[c++] = sort[zi];
 +                sort[zi] = -1;
 +            }
 +        }
 +    }
 +    else
 +    {
 +        for(zi=zi_max; zi>=zi_min; zi--)
 +        {
 +            if (sort[zi] >= 0)
 +            {
 +                a[c++] = sort[zi];
 +                sort[zi] = -1;
 +            }
 +        }
 +    }
 +    if (c < n)
 +    {
 +        gmx_incons("Lost particles while sorting");
 +    }
 +}
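 +
 +/* Minimal standalone sketch of the pigeonhole scheme (illustrative only;
 +   the real routine above additionally keeps colliding atoms ordered on
 +   z, supports Backwards output and guarantees binary reproducibility):
 +
 +       // nsort = SORT_GRID_OVERSIZE * n holes, all initialized to -1
 +       for (i = 0; i < n; i++)
 +       {
 +           zi = (int)((x[a[i]][dim] - h0)*invh); // hole from coordinate
 +           while (sort[zi] >= 0)
 +           {
 +               zi++;                             // probe upwards if full
 +           }
 +           sort[zi] = a[i];
 +       }
 +       // second pass: copy the non-empty holes back into a[] in order
 +
 +   With a homogeneous distribution nearly every atom lands directly in
 +   an empty hole, giving the O(#particles) behaviour described above. */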
 +
 +#ifdef GMX_DOUBLE
 +#define R2F_D(x) ((float)((x) >= 0 ? ((1-GMX_FLOAT_EPS)*(x)) : ((1+GMX_FLOAT_EPS)*(x))))
 +#define R2F_U(x) ((float)((x) >= 0 ? ((1+GMX_FLOAT_EPS)*(x)) : ((1-GMX_FLOAT_EPS)*(x))))
 +#else
 +#define R2F_D(x) (x)
 +#define R2F_U(x) (x)
 +#endif
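 +
 +/* Example of why the rounding direction matters: for a positive lower
 +   bound R2F_D scales by (1 - GMX_FLOAT_EPS) and for a positive upper
 +   bound R2F_U scales by (1 + GMX_FLOAT_EPS), so in double precision the
 +   float bounding box always encloses the real coordinates. Box-box
 +   distances can then only be underestimated, which keeps the pair
 +   search conservative: extra pairs, never missed ones. */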
 +
 +/* Coordinate order x,y,z, bb order xyz0 */
 +static void calc_bounding_box(int na,int stride,const real *x,float *bb)
 +{
 +    int  i,j;
 +    real xl,xh,yl,yh,zl,zh;
 +
 +    i = 0;
 +    xl = x[i+XX];
 +    xh = x[i+XX];
 +    yl = x[i+YY];
 +    yh = x[i+YY];
 +    zl = x[i+ZZ];
 +    zh = x[i+ZZ];
 +    i += stride;
 +    for(j=1; j<na; j++)
 +    {
 +        xl = min(xl,x[i+XX]);
 +        xh = max(xh,x[i+XX]);
 +        yl = min(yl,x[i+YY]);
 +        yh = max(yh,x[i+YY]);
 +        zl = min(zl,x[i+ZZ]);
 +        zh = max(zh,x[i+ZZ]);
 +        i += stride;
 +    }
 +    /* Note: possible double to float conversion here */
 +    bb[BBL_X] = R2F_D(xl);
 +    bb[BBL_Y] = R2F_D(yl);
 +    bb[BBL_Z] = R2F_D(zl);
 +    bb[BBU_X] = R2F_U(xh);
 +    bb[BBU_Y] = R2F_U(yh);
 +    bb[BBU_Z] = R2F_U(zh);
 +}
 +
 +/* Packed coordinates, bb order xyz0 */
 +static void calc_bounding_box_x_x4(int na,const real *x,float *bb)
 +{
 +    int  j;
 +    real xl,xh,yl,yh,zl,zh;
 +
 +    xl = x[XX*PACK_X4];
 +    xh = x[XX*PACK_X4];
 +    yl = x[YY*PACK_X4];
 +    yh = x[YY*PACK_X4];
 +    zl = x[ZZ*PACK_X4];
 +    zh = x[ZZ*PACK_X4];
 +    for(j=1; j<na; j++)
 +    {
 +        xl = min(xl,x[j+XX*PACK_X4]);
 +        xh = max(xh,x[j+XX*PACK_X4]);
 +        yl = min(yl,x[j+YY*PACK_X4]);
 +        yh = max(yh,x[j+YY*PACK_X4]);
 +        zl = min(zl,x[j+ZZ*PACK_X4]);
 +        zh = max(zh,x[j+ZZ*PACK_X4]);
 +    }
 +    /* Note: possible double to float conversion here */
 +    bb[BBL_X] = R2F_D(xl);
 +    bb[BBL_Y] = R2F_D(yl);
 +    bb[BBL_Z] = R2F_D(zl);
 +    bb[BBU_X] = R2F_U(xh);
 +    bb[BBU_Y] = R2F_U(yh);
 +    bb[BBU_Z] = R2F_U(zh);
 +}
 +
 +/* Packed coordinates, bb order xyz0 */
 +static void calc_bounding_box_x_x8(int na,const real *x,float *bb)
 +{
 +    int  j;
 +    real xl,xh,yl,yh,zl,zh;
 +
 +    xl = x[XX*PACK_X8];
 +    xh = x[XX*PACK_X8];
 +    yl = x[YY*PACK_X8];
 +    yh = x[YY*PACK_X8];
 +    zl = x[ZZ*PACK_X8];
 +    zh = x[ZZ*PACK_X8];
 +    for(j=1; j<na; j++)
 +    {
 +        xl = min(xl,x[j+XX*PACK_X8]);
 +        xh = max(xh,x[j+XX*PACK_X8]);
 +        yl = min(yl,x[j+YY*PACK_X8]);
 +        yh = max(yh,x[j+YY*PACK_X8]);
 +        zl = min(zl,x[j+ZZ*PACK_X8]);
 +        zh = max(zh,x[j+ZZ*PACK_X8]);
 +    }
 +    /* Note: possible double to float conversion here */
 +    bb[BBL_X] = R2F_D(xl);
 +    bb[BBL_Y] = R2F_D(yl);
 +    bb[BBL_Z] = R2F_D(zl);
 +    bb[BBU_X] = R2F_U(xh);
 +    bb[BBU_Y] = R2F_U(yh);
 +    bb[BBU_Z] = R2F_U(zh);
 +}
 +
 +#ifdef NBNXN_SEARCH_BB_SSE
 +
 +/* Packed coordinates, bb order xyz0 */
 +static void calc_bounding_box_x_x4_halves(int na,const real *x,
 +                                          float *bb,float *bbj)
 +{
 +    calc_bounding_box_x_x4(min(na,2),x,bbj);
 +
 +    if (na > 2)
 +    {
 +        calc_bounding_box_x_x4(min(na-2,2),x+(PACK_X4>>1),bbj+NNBSBB_B);
 +    }
 +    else
 +    {
 +        /* Set the "empty" bounding box to the same as the first one,
 +         * so we don't need to treat special cases in the rest of the code.
 +         */
 +        _mm_store_ps(bbj+NNBSBB_B         ,_mm_load_ps(bbj));
 +        _mm_store_ps(bbj+NNBSBB_B+NNBSBB_C,_mm_load_ps(bbj+NNBSBB_C));
 +    }
 +
 +    _mm_store_ps(bb         ,_mm_min_ps(_mm_load_ps(bbj),
 +                                        _mm_load_ps(bbj+NNBSBB_B)));
 +    _mm_store_ps(bb+NNBSBB_C,_mm_max_ps(_mm_load_ps(bbj+NNBSBB_C),
 +                                        _mm_load_ps(bbj+NNBSBB_B+NNBSBB_C)));
 +}
 +
 +/* Coordinate order xyz, bb order xxxxyyyyzzzz */
 +static void calc_bounding_box_xxxx(int na,int stride,const real *x,float *bb)
 +{
 +    int  i,j;
 +    real xl,xh,yl,yh,zl,zh;
 +
 +    i = 0;
 +    xl = x[i+XX];
 +    xh = x[i+XX];
 +    yl = x[i+YY];
 +    yh = x[i+YY];
 +    zl = x[i+ZZ];
 +    zh = x[i+ZZ];
 +    i += stride;
 +    for(j=1; j<na; j++)
 +    {
 +        xl = min(xl,x[i+XX]);
 +        xh = max(xh,x[i+XX]);
 +        yl = min(yl,x[i+YY]);
 +        yh = max(yh,x[i+YY]);
 +        zl = min(zl,x[i+ZZ]);
 +        zh = max(zh,x[i+ZZ]);
 +        i += stride;
 +    }
 +    /* Note: possible double to float conversion here */
 +    bb[0*STRIDE_PBB] = R2F_D(xl);
 +    bb[1*STRIDE_PBB] = R2F_D(yl);
 +    bb[2*STRIDE_PBB] = R2F_D(zl);
 +    bb[3*STRIDE_PBB] = R2F_U(xh);
 +    bb[4*STRIDE_PBB] = R2F_U(yh);
 +    bb[5*STRIDE_PBB] = R2F_U(zh);
 +}
 +
 +#endif /* NBNXN_SEARCH_BB_SSE */
 +
 +#ifdef NBNXN_SEARCH_SSE_SINGLE
 +
 +/* Coordinate order xyz?, bb order xyz0 */
 +static void calc_bounding_box_sse(int na,const float *x,float *bb)
 +{
 +    __m128 bb_0_SSE,bb_1_SSE;
 +    __m128 x_SSE;
 +
 +    int  i;
 +
 +    bb_0_SSE = _mm_load_ps(x);
 +    bb_1_SSE = bb_0_SSE;
 +
 +    for(i=1; i<na; i++)
 +    {
 +        x_SSE    = _mm_load_ps(x+i*NNBSBB_C);
 +        bb_0_SSE = _mm_min_ps(bb_0_SSE,x_SSE);
 +        bb_1_SSE = _mm_max_ps(bb_1_SSE,x_SSE);
 +    }
 +
 +    _mm_store_ps(bb  ,bb_0_SSE);
 +    _mm_store_ps(bb+4,bb_1_SSE);
 +}
 +
 +/* Coordinate order xyz?, bb order xxxxyyyyzzzz */
 +static void calc_bounding_box_xxxx_sse(int na,const float *x,
 +                                       float *bb_work,
 +                                       real *bb)
 +{
 +    calc_bounding_box_sse(na,x,bb_work);
 +
 +    bb[0*STRIDE_PBB] = bb_work[BBL_X];
 +    bb[1*STRIDE_PBB] = bb_work[BBL_Y];
 +    bb[2*STRIDE_PBB] = bb_work[BBL_Z];
 +    bb[3*STRIDE_PBB] = bb_work[BBU_X];
 +    bb[4*STRIDE_PBB] = bb_work[BBU_Y];
 +    bb[5*STRIDE_PBB] = bb_work[BBU_Z];
 +}
 +
 +#endif /* NBNXN_SEARCH_SSE_SINGLE */
 +
 +#ifdef NBNXN_SEARCH_BB_SSE
 +
 +/* Combines pairs of consecutive bounding boxes */
 +static void combine_bounding_box_pairs(nbnxn_grid_t *grid,const float *bb)
 +{
 +    int    i,j,sc2,nc2,c2;
 +    __m128 min_SSE,max_SSE;
 +
 +    for(i=0; i<grid->ncx*grid->ncy; i++)
 +    {
 +        /* Starting bb in a column is expected to be 2-aligned */
 +        sc2 = grid->cxy_ind[i]>>1;
 +        /* For odd numbers skip the last bb here */
 +        nc2 = (grid->cxy_na[i]+3)>>(2+1);
 +        for(c2=sc2; c2<sc2+nc2; c2++)
 +        {
 +            min_SSE = _mm_min_ps(_mm_load_ps(bb+(c2*4+0)*NNBSBB_C),
 +                                 _mm_load_ps(bb+(c2*4+2)*NNBSBB_C));
 +            max_SSE = _mm_max_ps(_mm_load_ps(bb+(c2*4+1)*NNBSBB_C),
 +                                 _mm_load_ps(bb+(c2*4+3)*NNBSBB_C));
 +            _mm_store_ps(grid->bbj+(c2*2+0)*NNBSBB_C,min_SSE);
 +            _mm_store_ps(grid->bbj+(c2*2+1)*NNBSBB_C,max_SSE);
 +        }
 +        if (((grid->cxy_na[i]+3)>>2) & 1)
 +        {
 +            /* Copy the last bb for odd bb count in this column */
 +            for(j=0; j<NNBSBB_C; j++)
 +            {
 +                grid->bbj[(c2*2+0)*NNBSBB_C+j] = bb[(c2*4+0)*NNBSBB_C+j];
 +                grid->bbj[(c2*2+1)*NNBSBB_C+j] = bb[(c2*4+1)*NNBSBB_C+j];
 +            }
 +        }
 +    }
 +}
 +
 +#endif
 +
 +
 +/* Prints the average bb size, used for debug output */
 +static void print_bbsizes_simple(FILE *fp,
 +                                 const nbnxn_search_t nbs,
 +                                 const nbnxn_grid_t *grid)
 +{
 +    int  c,d;
 +    dvec ba;
 +
 +    clear_dvec(ba);
 +    for(c=0; c<grid->nc; c++)
 +    {
 +        for(d=0; d<DIM; d++)
 +        {
 +            ba[d] += grid->bb[c*NNBSBB_B+NNBSBB_C+d] - grid->bb[c*NNBSBB_B+d];
 +        }
 +    }
 +    dsvmul(1.0/grid->nc,ba,ba);
 +
 +    fprintf(fp,"ns bb: %4.2f %4.2f %4.2f  %4.2f %4.2f %4.2f rel %4.2f %4.2f %4.2f\n",
 +            nbs->box[XX][XX]/grid->ncx,
 +            nbs->box[YY][YY]/grid->ncy,
 +            nbs->box[ZZ][ZZ]*grid->ncx*grid->ncy/grid->nc,
 +            ba[XX],ba[YY],ba[ZZ],
 +            ba[XX]*grid->ncx/nbs->box[XX][XX],
 +            ba[YY]*grid->ncy/nbs->box[YY][YY],
 +            ba[ZZ]*grid->nc/(grid->ncx*grid->ncy*nbs->box[ZZ][ZZ]));
 +}
 +
 +/* Prints the average bb size, used for debug output */
 +static void print_bbsizes_supersub(FILE *fp,
 +                                   const nbnxn_search_t nbs,
 +                                   const nbnxn_grid_t *grid)
 +{
 +    int  ns,c,s;
 +    dvec ba;
 +
 +    clear_dvec(ba);
 +    ns = 0;
 +    for(c=0; c<grid->nc; c++)
 +    {
 +#ifdef NBNXN_BBXXXX
 +        for(s=0; s<grid->nsubc[c]; s+=STRIDE_PBB)
 +        {
 +            int cs_w,i,d;
 +
 +            cs_w = (c*GPU_NSUBCELL + s)/STRIDE_PBB;
 +            for(i=0; i<STRIDE_PBB; i++)
 +            {
 +                for(d=0; d<DIM; d++)
 +                {
 +                    ba[d] +=
 +                        grid->bb[cs_w*NNBSBB_XXXX+(DIM+d)*STRIDE_PBB+i] -
 +                        grid->bb[cs_w*NNBSBB_XXXX+     d *STRIDE_PBB+i];
 +                }
 +            }
 +        }
 +#else
 +        for(s=0; s<grid->nsubc[c]; s++)
 +        {
 +            int cs,d;
 +
 +            cs = c*GPU_NSUBCELL + s;
 +            for(d=0; d<DIM; d++)
 +            {
 +                ba[d] +=
 +                    grid->bb[cs*NNBSBB_B+NNBSBB_C+d] -
 +                    grid->bb[cs*NNBSBB_B         +d];
 +            }
 +        }
 +#endif
 +        ns += grid->nsubc[c];
 +    }
 +    dsvmul(1.0/ns,ba,ba);
 +
 +    fprintf(fp,"ns bb: %4.2f %4.2f %4.2f  %4.2f %4.2f %4.2f rel %4.2f %4.2f %4.2f\n",
 +            nbs->box[XX][XX]/(grid->ncx*GPU_NSUBCELL_X),
 +            nbs->box[YY][YY]/(grid->ncy*GPU_NSUBCELL_Y),
 +            nbs->box[ZZ][ZZ]*grid->ncx*grid->ncy/(grid->nc*GPU_NSUBCELL_Z),
 +            ba[XX],ba[YY],ba[ZZ],
 +            ba[XX]*grid->ncx*GPU_NSUBCELL_X/nbs->box[XX][XX],
 +            ba[YY]*grid->ncy*GPU_NSUBCELL_Y/nbs->box[YY][YY],
 +            ba[ZZ]*grid->nc*GPU_NSUBCELL_Z/(grid->ncx*grid->ncy*nbs->box[ZZ][ZZ]));
 +}
 +
 +/* Potentially sorts atoms on LJ coefficients !=0 and ==0.
 + * Also sets interaction flags.
 + */
 +void sort_on_lj(nbnxn_atomdata_t *nbat,int na_c,
 +                int a0,int a1,const int *atinfo,
 +                int *order,
 +                int *flags)
 +{
 +    int subc,s,a,n1,n2,a_lj_max,i,j;
 +    int sort1[NBNXN_NA_SC_MAX/GPU_NSUBCELL];
 +    int sort2[NBNXN_NA_SC_MAX/GPU_NSUBCELL];
 +    gmx_bool haveQ;
 +
 +    *flags = 0;
 +
 +    subc = 0;
 +    for(s=a0; s<a1; s+=na_c)
 +    {
 +        /* Make lists for this (sub-)cell on atoms with and without LJ */
 +        n1 = 0;
 +        n2 = 0;
 +        haveQ = FALSE;
 +        a_lj_max = -1;
 +        for(a=s; a<min(s+na_c,a1); a++)
 +        {
 +            haveQ = haveQ || GET_CGINFO_HAS_Q(atinfo[order[a]]);
 +
 +            if (GET_CGINFO_HAS_VDW(atinfo[order[a]]))
 +            {
 +                sort1[n1++] = order[a];
 +                a_lj_max = a;
 +            }
 +            else
 +            {
 +                sort2[n2++] = order[a];
 +            }
 +        }
 +
 +        /* If we don't have atoms with LJ, there's nothing to sort */
 +        if (n1 > 0)
 +        {
 +            *flags |= NBNXN_CI_DO_LJ(subc);
 +
 +            if (2*n1 <= na_c)
 +            {
 +                /* Only sort when strictly necessary. Ordering particles
 +                 * can lead to less accurate summation due to rounding,
 +                 * both for LJ and Coulomb interactions.
 +                 */
 +                if (2*(a_lj_max - s) >= na_c)
 +                {
 +                    for(i=0; i<n1; i++)
 +                    {
 +                        order[a0+i] = sort1[i];
 +                    }
 +                    for(j=0; j<n2; j++)
 +                    {
 +                        order[a0+n1+j] = sort2[j];
 +                    }
 +                }
 +
 +                *flags |= NBNXN_CI_HALF_LJ(subc);
 +            }
 +        }
 +        if (haveQ)
 +        {
 +            *flags |= NBNXN_CI_DO_COUL(subc);
 +        }
 +        subc++;
 +    }
 +}
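 +
 +/* Worked example: for a cluster of na_c = 4 atoms where only atoms 0
 +   and 1 have LJ parameters, n1 = 2 and a_lj_max stays in the first
 +   half, so no reordering is needed and the cell gets both
 +   NBNXN_CI_DO_LJ and NBNXN_CI_HALF_LJ; a kernel can then skip the LJ
 +   part for the second half of the cluster. */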
 +
 +/* Fill a pair search cell with atoms.
 + * Potentially sorts atoms and sets the interaction flags.
 + */
 +void fill_cell(const nbnxn_search_t nbs,
 +               nbnxn_grid_t *grid,
 +               nbnxn_atomdata_t *nbat,
 +               int a0,int a1,
 +               const int *atinfo,
 +               rvec *x,
 +               int sx,int sy, int sz,
 +               float *bb_work)
 +{
 +    int    na,a;
 +    size_t offset;
 +    float  *bb_ptr;
 +
 +    na = a1 - a0;
 +
 +    if (grid->bSimple)
 +    {
 +        sort_on_lj(nbat,grid->na_c,a0,a1,atinfo,nbs->a,
 +                   grid->flags+(a0>>grid->na_c_2log)-grid->cell0);
 +    }
 +
 +    /* Now we have sorted the atoms, set the cell indices */
 +    for(a=a0; a<a1; a++)
 +    {
 +        nbs->cell[nbs->a[a]] = a;
 +    }
 +
 +    copy_rvec_to_nbat_real(nbs->a+a0,a1-a0,grid->na_c,x,
 +                           nbat->XFormat,nbat->x,a0,
 +                           sx,sy,sz);
 +
 +    if (nbat->XFormat == nbatX4)
 +    {
 +        /* Store the bounding boxes as xyz.xyz. */
 +        offset = ((a0 - grid->cell0*grid->na_sc)>>grid->na_c_2log)*NNBSBB_B;
 +        bb_ptr = grid->bb + offset;
 +
 +#if defined GMX_DOUBLE && defined NBNXN_SEARCH_BB_SSE
 +        if (2*grid->na_cj == grid->na_c)
 +        {
 +            calc_bounding_box_x_x4_halves(na,nbat->x+X4_IND_A(a0),bb_ptr,
 +                                          grid->bbj+offset*2);
 +        }
 +        else
 +#endif
 +        {
 +            calc_bounding_box_x_x4(na,nbat->x+X4_IND_A(a0),bb_ptr);
 +        }
 +    }
 +    else if (nbat->XFormat == nbatX8)
 +    {
 +        /* Store the bounding boxes as xyz.xyz. */
 +        offset = ((a0 - grid->cell0*grid->na_sc)>>grid->na_c_2log)*NNBSBB_B;
 +        bb_ptr = grid->bb + offset;
 +
 +        calc_bounding_box_x_x8(na,nbat->x+X8_IND_A(a0),bb_ptr);
 +    }
 +#ifdef NBNXN_BBXXXX
 +    else if (!grid->bSimple)
 +    {
 +        /* Store the bounding boxes in a format convenient
 +         * for SSE calculations: xxxxyyyyzzzz...
 +                             */
 +        bb_ptr =
 +            grid->bb +
 +            ((a0-grid->cell0*grid->na_sc)>>(grid->na_c_2log+STRIDE_PBB_2LOG))*NNBSBB_XXXX +
 +            (((a0-grid->cell0*grid->na_sc)>>grid->na_c_2log) & (STRIDE_PBB-1));
 +
 +#ifdef NBNXN_SEARCH_SSE_SINGLE
 +        if (nbat->XFormat == nbatXYZQ)
 +        {
 +            calc_bounding_box_xxxx_sse(na,nbat->x+a0*nbat->xstride,
 +                                       bb_work,bb_ptr);
 +        }
 +        else
 +#endif
 +        {
 +            calc_bounding_box_xxxx(na,nbat->xstride,nbat->x+a0*nbat->xstride,
 +                                   bb_ptr);
 +        }
 +        if (gmx_debug_at)
 +        {
 +            fprintf(debug,"%2d %2d %2d bb %5.2f %5.2f %5.2f %5.2f %5.2f %5.2f\n",
 +                    sx,sy,sz,
 +                    bb_ptr[0*STRIDE_PBB],bb_ptr[3*STRIDE_PBB],
 +                    bb_ptr[1*STRIDE_PBB],bb_ptr[4*STRIDE_PBB],
 +                    bb_ptr[2*STRIDE_PBB],bb_ptr[5*STRIDE_PBB]);
 +        }
 +    }
 +#endif
 +    else
 +    {
 +        /* Store the bounding boxes as xyz.xyz. */
 +        bb_ptr = grid->bb+((a0-grid->cell0*grid->na_sc)>>grid->na_c_2log)*NNBSBB_B;
 +
 +        calc_bounding_box(na,nbat->xstride,nbat->x+a0*nbat->xstride,
 +                          bb_ptr);
 +
 +        if (gmx_debug_at)
 +        {
 +            int bbo;
 +            bbo = (a0 - grid->cell0*grid->na_sc)/grid->na_c;
 +            fprintf(debug,"%2d %2d %2d bb %5.2f %5.2f %5.2f %5.2f %5.2f %5.2f\n",
 +                    sx,sy,sz,
 +                    (grid->bb+bbo*NNBSBB_B)[BBL_X],
 +                    (grid->bb+bbo*NNBSBB_B)[BBU_X],
 +                    (grid->bb+bbo*NNBSBB_B)[BBL_Y],
 +                    (grid->bb+bbo*NNBSBB_B)[BBU_Y],
 +                    (grid->bb+bbo*NNBSBB_B)[BBL_Z],
 +                    (grid->bb+bbo*NNBSBB_B)[BBU_Z]);
 +        }
 +    }
 +}
 +
 +/* Spatially sort the atoms within each grid column in the given range */
 +static void sort_columns_simple(const nbnxn_search_t nbs,
 +                                int dd_zone,
 +                                nbnxn_grid_t *grid,
 +                                int a0,int a1,
 +                                const int *atinfo,
 +                                rvec *x,
 +                                nbnxn_atomdata_t *nbat,
 +                                int cxy_start,int cxy_end,
 +                                int *sort_work)
 +{
 +    int  cxy;
 +    int  cx,cy,cz,ncz,cfilled,c;
 +    int  na,ash,ind,a;
 +    int  na_c,ash_c;
 +
 +    if (debug)
 +    {
 +        fprintf(debug,"cell0 %d sorting columns %d - %d, atoms %d - %d\n",
 +                grid->cell0,cxy_start,cxy_end,a0,a1);
 +    }
 +
 +    /* Sort the atoms within each x,y column in 3 dimensions */
 +    for(cxy=cxy_start; cxy<cxy_end; cxy++)
 +    {
 +        cx = cxy/grid->ncy;
 +        cy = cxy - cx*grid->ncy;
 +
 +        na  = grid->cxy_na[cxy];
 +        ncz = grid->cxy_ind[cxy+1] - grid->cxy_ind[cxy];
 +        ash = (grid->cell0 + grid->cxy_ind[cxy])*grid->na_sc;
 +
 +        /* Sort the atoms within each x,y column on z coordinate */
 +        sort_atoms(ZZ,FALSE,
 +                   nbs->a+ash,na,x,
 +                   grid->c0[ZZ],
 +                   ncz*grid->na_sc*SORT_GRID_OVERSIZE/nbs->box[ZZ][ZZ],
 +                   ncz*grid->na_sc*SGSF,sort_work);
 +
 +        /* Fill the ncz cells in this column */
 +        cfilled = grid->cxy_ind[cxy];
 +        for(cz=0; cz<ncz; cz++)
 +        {
 +            c  = grid->cxy_ind[cxy] + cz ;
 +
 +            ash_c = ash + cz*grid->na_sc;
 +            na_c  = min(grid->na_sc,na-(ash_c-ash));
 +
 +            fill_cell(nbs,grid,nbat,
 +                      ash_c,ash_c+na_c,atinfo,x,
 +                      grid->na_sc*cx + (dd_zone >> 2),
 +                      grid->na_sc*cy + (dd_zone & 3),
 +                      grid->na_sc*cz,
 +                      NULL);
 +
 +            /* This copy to bbcz is not really necessary.
 +             * But it allows us to use the same grid search code
 +             * for the simple and supersub cell setups.
 +             */
 +            if (na_c > 0)
 +            {
 +                cfilled = c;
 +            }
 +            grid->bbcz[c*NNBSBB_D  ] = grid->bb[cfilled*NNBSBB_B+2];
 +            grid->bbcz[c*NNBSBB_D+1] = grid->bb[cfilled*NNBSBB_B+6];
 +        }
 +
 +        /* Set the unused atom indices to -1 */
 +        for(ind=na; ind<ncz*grid->na_sc; ind++)
 +        {
 +            nbs->a[ash+ind] = -1;
 +        }
 +    }
 +}
 +
 +/* Spatially sort the atoms within each grid column in the given range */
 +static void sort_columns_supersub(const nbnxn_search_t nbs,
 +                                  int dd_zone,
 +                                  nbnxn_grid_t *grid,
 +                                  int a0,int a1,
 +                                  const int *atinfo,
 +                                  rvec *x,
 +                                  nbnxn_atomdata_t *nbat,
 +                                  int cxy_start,int cxy_end,
 +                                  int *sort_work)
 +{
 +    int  cxy;
 +    int  cx,cy,cz=-1,c=-1,ncz;
 +    int  na,ash,na_c,ind;
 +    int  subdiv_z,sub_z,na_z,ash_z;
 +    int  subdiv_y,sub_y,na_y,ash_y;
 +    int  subdiv_x,sub_x,na_x,ash_x;
 +
 +    /* cppcheck-suppress unassignedVariable */
 +    float bb_work_array[NNBSBB_B+3],*bb_work_align;
 +
 +    bb_work_align = (float *)(((size_t)(bb_work_array+3)) & (~((size_t)15)));
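 +    /* Illustration of the alignment arithmetic above (addresses assumed):
 +     * with 4-byte floats, bb_work_array+3 advances the address by 12 bytes
 +     * and masking with ~15 rounds it down to a multiple of 16. E.g. an
 +     * array at address 0x1008 gives 0x1008+12 = 0x1014, and
 +     * 0x1014 & ~0xf = 0x1010, a 16-byte aligned pointer that still lies
 +     * within the 3-float over-allocation.
 +     */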
 +
 +    if (debug)
 +    {
 +        fprintf(debug,"cell0 %d sorting columns %d - %d, atoms %d - %d\n",
 +                grid->cell0,cxy_start,cxy_end,a0,a1);
 +    }
 +
 +    subdiv_x = grid->na_c;
 +    subdiv_y = GPU_NSUBCELL_X*subdiv_x;
 +    subdiv_z = GPU_NSUBCELL_Y*subdiv_y;
 +
 +    /* Sort the atoms within each x,y column in 3 dimensions */
 +    for(cxy=cxy_start; cxy<cxy_end; cxy++)
 +    {
 +        cx = cxy/grid->ncy;
 +        cy = cxy - cx*grid->ncy;
 +
 +        na  = grid->cxy_na[cxy];
 +        ncz = grid->cxy_ind[cxy+1] - grid->cxy_ind[cxy];
 +        ash = (grid->cell0 + grid->cxy_ind[cxy])*grid->na_sc;
 +
 +        /* Sort the atoms within each x,y column on z coordinate */
 +        sort_atoms(ZZ,FALSE,
 +                   nbs->a+ash,na,x,
 +                   grid->c0[ZZ],
 +                   ncz*grid->na_sc*SORT_GRID_OVERSIZE/nbs->box[ZZ][ZZ],
 +                   ncz*grid->na_sc*SGSF,sort_work);
 +
 +        /* This loop goes over the supercells and subcells along z at once */
 +        for(sub_z=0; sub_z<ncz*GPU_NSUBCELL_Z; sub_z++)
 +        {
 +            ash_z = ash + sub_z*subdiv_z;
 +            na_z  = min(subdiv_z,na-(ash_z-ash));
 +
 +            /* We have already sorted on z */
 +
 +            if (sub_z % GPU_NSUBCELL_Z == 0)
 +            {
 +                cz = sub_z/GPU_NSUBCELL_Z;
 +                c  = grid->cxy_ind[cxy] + cz;
 +
 +                /* The number of atoms in this supercell */
 +                na_c = min(grid->na_sc,na-(ash_z-ash));
 +
 +                grid->nsubc[c] = min(GPU_NSUBCELL,(na_c+grid->na_c-1)/grid->na_c);
 +
 +                /* Store the z-boundaries of the super cell */
 +                grid->bbcz[c*NNBSBB_D  ] = x[nbs->a[ash_z]][ZZ];
 +                grid->bbcz[c*NNBSBB_D+1] = x[nbs->a[ash_z+na_c-1]][ZZ];
 +            }
 +
 +#if GPU_NSUBCELL_Y > 1
 +            /* Sort the atoms along y */
 +            sort_atoms(YY,(sub_z & 1),
 +                       nbs->a+ash_z,na_z,x,
 +                       grid->c0[YY]+cy*grid->sy,grid->inv_sy,
 +                       subdiv_y*SGSF,sort_work);
 +#endif
 +
 +            for(sub_y=0; sub_y<GPU_NSUBCELL_Y; sub_y++)
 +            {
 +                ash_y = ash_z + sub_y*subdiv_y;
 +                na_y  = min(subdiv_y,na-(ash_y-ash));
 +
 +#if GPU_NSUBCELL_X > 1
 +                /* Sort the atoms along x */
 +                sort_atoms(XX,((cz*GPU_NSUBCELL_Y + sub_y) & 1),
 +                           nbs->a+ash_y,na_y,x,
 +                           grid->c0[XX]+cx*grid->sx,grid->inv_sx,
 +                           subdiv_x*SGSF,sort_work);
 +#endif
 +
 +                for(sub_x=0; sub_x<GPU_NSUBCELL_X; sub_x++)
 +                {
 +                    ash_x = ash_y + sub_x*subdiv_x;
 +                    na_x  = min(subdiv_x,na-(ash_x-ash));
 +
 +                    fill_cell(nbs,grid,nbat,
 +                              ash_x,ash_x+na_x,atinfo,x,
 +                              grid->na_c*(cx*GPU_NSUBCELL_X+sub_x) + (dd_zone >> 2),
 +                              grid->na_c*(cy*GPU_NSUBCELL_Y+sub_y) + (dd_zone & 3),
 +                              grid->na_c*sub_z,
 +                              bb_work_align);
 +                }
 +            }
 +        }
 +
 +        /* Set the unused atom indices to -1 */
 +        for(ind=na; ind<ncz*grid->na_sc; ind++)
 +        {
 +            nbs->a[ash+ind] = -1;
 +        }
 +    }
 +}
 +
 +/* Determine in which grid column atoms should go */
 +static void calc_column_indices(nbnxn_grid_t *grid,
 +                                int a0,int a1,
 +                                rvec *x,
 +                                int dd_zone,const int *move,
 +                                int thread,int nthread,
 +                                int *cell,
 +                                int *cxy_na)
 +{
 +    int  n0,n1,i;
 +    int  cx,cy;
 +
 +    /* We add one extra cell for particles which moved during DD */
 +    for(i=0; i<grid->ncx*grid->ncy+1; i++)
 +    {
 +        cxy_na[i] = 0;
 +    }
 +
 +    n0 = a0 + (int)((thread+0)*(a1 - a0))/nthread;
 +    n1 = a0 + (int)((thread+1)*(a1 - a0))/nthread;
 +    if (dd_zone == 0)
 +    {
 +        /* Home zone */
 +        for(i=n0; i<n1; i++)
 +        {
 +            if (move == NULL || move[i] >= 0)
 +            {
 +                /* We need to be careful with rounding,
 +                 * particles might be a few bits outside the local zone.
 +                 * The int cast takes care of the lower bound,
 +                 * we will explicitly take care of the upper bound.
 +                 */
 +                cx = (int)((x[i][XX] - grid->c0[XX])*grid->inv_sx);
 +                cy = (int)((x[i][YY] - grid->c0[YY])*grid->inv_sy);
 +
 +#ifdef DEBUG_NBNXN_GRIDDING
 +                if (cx < 0 || cx >= grid->ncx ||
 +                    cy < 0 || cy >= grid->ncy)
 +                {
 +                    gmx_fatal(FARGS,
 +                              "grid cell cx %d cy %d out of range (max %d %d)\n"
 +                              "atom %f %f %f, grid->c0 %f %f",
 +                              cx,cy,grid->ncx,grid->ncy,
 +                              x[i][XX],x[i][YY],x[i][ZZ],grid->c0[XX],grid->c0[YY]);
 +                }
 +#endif
 +                /* Take care of potential rounding issues */
 +                cx = min(cx,grid->ncx - 1);
 +                cy = min(cy,grid->ncy - 1);
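 +                /* Rounding example (values illustrative): for a particle
 +                 * a few bits below the zone, (x - c0)*inv_s can be -1e-7,
 +                 * which the int cast truncates toward zero to cx = 0;
 +                 * for one just beyond the upper edge the cast can give
 +                 * cx = ncx, which the min() above clamps to ncx - 1.
 +                 */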
 +
 +                /* For the moment, cell contains only the grid-local
 +                 * x and y indices, not z.
 +                 */
 +                cell[i] = cx*grid->ncy + cy;
 +            }
 +            else
 +            {
 +                /* Put this moved particle after the end of the grid,
 +                 * so we can process it later without using conditionals.
 +                 */
 +                cell[i] = grid->ncx*grid->ncy;
 +            }
 +
 +            cxy_na[cell[i]]++;
 +        }
 +    }
 +    else
 +    {
 +        /* Non-home zone */
 +        for(i=n0; i<n1; i++)
 +        {
 +            cx = (int)((x[i][XX] - grid->c0[XX])*grid->inv_sx);
 +            cy = (int)((x[i][YY] - grid->c0[YY])*grid->inv_sy);
 +
 +            /* For non-home zones there could be particles outside
 +             * the non-bonded cut-off range, which have been communicated
 +             * for bonded interactions only. For the result it doesn't
 +             * matter where these end up on the grid. For performance
 +             * we put them in an extra row at the border.
 +             */
 +            cx = max(cx,0);
 +            cx = min(cx,grid->ncx - 1);
 +            cy = max(cy,0);
 +            cy = min(cy,grid->ncy - 1);
 +
 +            /* For the moment, cell contains only the grid-local
 +             * x and y indices, not z.
 +             */
 +            cell[i] = cx*grid->ncy + cy;
 +
 +            cxy_na[cell[i]]++;
 +        }
 +    }
 +}
 +
 +/* Determine in which grid cells the atoms should go */
 +static void calc_cell_indices(const nbnxn_search_t nbs,
 +                              int dd_zone,
 +                              nbnxn_grid_t *grid,
 +                              int a0,int a1,
 +                              const int *atinfo,
 +                              rvec *x,
 +                              const int *move,
 +                              nbnxn_atomdata_t *nbat)
 +{
 +    int  n0,n1,i;
 +    int  cx,cy,cxy,ncz_max,ncz;
 +    int  nthread,thread;
 +    int  *cxy_na,cxy_na_i;
 +
 +    nthread = gmx_omp_nthreads_get(emntPairsearch);
 +
 +#pragma omp parallel for num_threads(nthread) schedule(static)
 +    for(thread=0; thread<nthread; thread++)
 +    {
 +        calc_column_indices(grid,a0,a1,x,dd_zone,move,thread,nthread,
 +                            nbs->cell,nbs->work[thread].cxy_na);
 +    }
 +
 +    /* Make the cell index as a function of x and y */
 +    ncz_max = 0;
 +    ncz = 0;
 +    grid->cxy_ind[0] = 0;
 +    for(i=0; i<grid->ncx*grid->ncy+1; i++)
 +    {
 +        /* We set ncz_max at the beginning of the loop instead of at the end,
 +         * so that we skip i=grid->ncx*grid->ncy, which holds the moved
 +         * particles that do not need to be ordered on the grid.
 +         */
 +        if (ncz > ncz_max)
 +        {
 +            ncz_max = ncz;
 +        }
 +        cxy_na_i = nbs->work[0].cxy_na[i];
 +        for(thread=1; thread<nthread; thread++)
 +        {
 +            cxy_na_i += nbs->work[thread].cxy_na[i];
 +        }
 +        ncz = (cxy_na_i + grid->na_sc - 1)/grid->na_sc;
 +        if (nbat->XFormat == nbatX8)
 +        {
 +            /* Make the number of cells a multiple of 2 */
 +            ncz = (ncz + 1) & ~1;
 +        }
 +        grid->cxy_ind[i+1] = grid->cxy_ind[i] + ncz;
 +        /* Clear cxy_na, so we can reuse the array below */
 +        grid->cxy_na[i] = 0;
 +    }
 +    grid->nc = grid->cxy_ind[grid->ncx*grid->ncy] - grid->cxy_ind[0];
 +
 +    nbat->natoms = (grid->cell0 + grid->nc)*grid->na_sc;
 +
 +    if (debug)
 +    {
 +        fprintf(debug,"ns na_sc %d na_c %d super-cells: %d x %d y %d z %.1f maxz %d\n",
 +                grid->na_sc,grid->na_c,grid->nc,
 +                grid->ncx,grid->ncy,grid->nc/((double)(grid->ncx*grid->ncy)),
 +                ncz_max);
 +        if (gmx_debug_at)
 +        {
 +            i = 0;
 +            for(cy=0; cy<grid->ncy; cy++)
 +            {
 +                for(cx=0; cx<grid->ncx; cx++)
 +                {
 +                    fprintf(debug," %2d",grid->cxy_ind[i+1]-grid->cxy_ind[i]);
 +                    i++;
 +                }
 +                fprintf(debug,"\n");
 +            }
 +        }
 +    }
 +
 +    /* Make sure the work array for sorting is large enough */
 +    if (ncz_max*grid->na_sc*SGSF > nbs->work[0].sort_work_nalloc)
 +    {
 +        for(thread=0; thread<nbs->nthread_max; thread++)
 +        {
 +            nbs->work[thread].sort_work_nalloc =
 +                over_alloc_large(ncz_max*grid->na_sc*SGSF);
 +            srenew(nbs->work[thread].sort_work,
 +                   nbs->work[thread].sort_work_nalloc);
 +            /* When not in use, all elements should be -1 */
 +            for(i=0; i<nbs->work[thread].sort_work_nalloc; i++)
 +            {
 +                nbs->work[thread].sort_work[i] = -1;
 +            }
 +        }
 +    }
 +
 +    /* Now we know the dimensions we can fill the grid.
 +     * This is the first, unsorted fill. We sort the columns after this.
 +     */
 +    for(i=a0; i<a1; i++)
 +    {
 +        /* At this point nbs->cell contains the local grid x,y indices */
 +        cxy = nbs->cell[i];
 +        nbs->a[(grid->cell0 + grid->cxy_ind[cxy])*grid->na_sc + grid->cxy_na[cxy]++] = i;
 +    }
 +
 +    if (dd_zone == 0)
 +    {
 +        /* Set the cell indices for the moved particles */
 +        n0 = grid->nc*grid->na_sc;
 +        n1 = grid->nc*grid->na_sc+grid->cxy_na[grid->ncx*grid->ncy];
 +        for(i=n0; i<n1; i++)
 +        {
 +            nbs->cell[nbs->a[i]] = i;
 +        }
 +    }
 +
 +    /* Sort the super-cell columns along z into the sub-cells. */
 +#pragma omp parallel for num_threads(nthread) schedule(static)
 +    for(thread=0; thread<nthread; thread++)
 +    {
 +        if (grid->bSimple)
 +        {
 +            sort_columns_simple(nbs,dd_zone,grid,a0,a1,atinfo,x,nbat,
 +                                ((thread+0)*grid->ncx*grid->ncy)/nthread,
 +                                ((thread+1)*grid->ncx*grid->ncy)/nthread,
 +                                nbs->work[thread].sort_work);
 +        }
 +        else
 +        {
 +            sort_columns_supersub(nbs,dd_zone,grid,a0,a1,atinfo,x,nbat,
 +                                  ((thread+0)*grid->ncx*grid->ncy)/nthread,
 +                                  ((thread+1)*grid->ncx*grid->ncy)/nthread,
 +                                  nbs->work[thread].sort_work);
 +        }
 +    }
 +
 +#ifdef NBNXN_SEARCH_BB_SSE
 +    if (grid->bSimple && nbat->XFormat == nbatX8)
 +    {
 +        combine_bounding_box_pairs(grid,grid->bb);
 +    }
 +#endif
 +
 +    if (!grid->bSimple)
 +    {
 +        grid->nsubc_tot = 0;
 +        for(i=0; i<grid->nc; i++)
 +        {
 +            grid->nsubc_tot += grid->nsubc[i];
 +        }
 +    }
 +
 +    if (debug)
 +    {
 +        if (grid->bSimple)
 +        {
 +            print_bbsizes_simple(debug,nbs,grid);
 +        }
 +        else
 +        {
 +            fprintf(debug,"ns non-zero sub-cells: %d average atoms %.2f\n",
 +                    grid->nsubc_tot,(a1-a0)/(double)grid->nsubc_tot);
 +
 +            print_bbsizes_supersub(debug,nbs,grid);
 +        }
 +    }
 +}
 +
 +static void init_buffer_flags(nbnxn_buffer_flags_t *flags,
 +                              int natoms)
 +{
 +    int b;
 +
 +    flags->nflag = (natoms + NBNXN_BUFFERFLAG_SIZE - 1)/NBNXN_BUFFERFLAG_SIZE;
 +    if (flags->nflag > flags->flag_nalloc)
 +    {
 +        flags->flag_nalloc = over_alloc_large(flags->nflag);
 +        srenew(flags->flag,flags->flag_nalloc);
 +    }
 +    for(b=0; b<flags->nflag; b++)
 +    {
 +        flags->flag[b] = 0;
 +    }
 +}
 +
 +/* Sets up a grid and puts the atoms on the grid.
 + * This function only operates on one domain of the domain decomposition.
 + * Note that without domain decomposition there is only one domain.
 + */
 +void nbnxn_put_on_grid(nbnxn_search_t nbs,
 +                       int ePBC,matrix box,
 +                       int dd_zone,
 +                       rvec corner0,rvec corner1,
 +                       int a0,int a1,
 +                       real atom_density,
 +                       const int *atinfo,
 +                       rvec *x,
 +                       int nmoved,int *move,
 +                       int nb_kernel_type,
 +                       nbnxn_atomdata_t *nbat)
 +{
 +    nbnxn_grid_t *grid;
 +    int n;
 +    int nc_max_grid,nc_max;
 +
 +    grid = &nbs->grid[dd_zone];
 +
 +    nbs_cycle_start(&nbs->cc[enbsCCgrid]);
 +
 +    grid->bSimple = nbnxn_kernel_pairlist_simple(nb_kernel_type);
 +
 +    grid->na_c      = nbnxn_kernel_to_ci_size(nb_kernel_type);
 +    grid->na_cj     = nbnxn_kernel_to_cj_size(nb_kernel_type);
 +    grid->na_sc     = (grid->bSimple ? 1 : GPU_NSUBCELL)*grid->na_c;
 +    grid->na_c_2log = get_2log(grid->na_c);
 +
 +    nbat->na_c = grid->na_c;
 +
 +    if (dd_zone == 0)
 +    {
 +        grid->cell0 = 0;
 +    }
 +    else
 +    {
 +        grid->cell0 =
 +            (nbs->grid[dd_zone-1].cell0 + nbs->grid[dd_zone-1].nc)*
 +            nbs->grid[dd_zone-1].na_sc/grid->na_sc;
 +    }
 +
 +    n = a1 - a0;
 +
 +    if (dd_zone == 0)
 +    {
 +        nbs->ePBC = ePBC;
 +        copy_mat(box,nbs->box);
 +
 +        if (atom_density >= 0)
 +        {
 +            grid->atom_density = atom_density;
 +        }
 +        else
 +        {
 +            grid->atom_density = grid_atom_density(n-nmoved,corner0,corner1);
 +        }
 +
 +        grid->cell0 = 0;
 +
 +        nbs->natoms_local    = a1 - nmoved;
 +        /* We assume that nbnxn_put_on_grid is called first
 +         * for the local atoms (dd_zone=0).
 +         */
 +        nbs->natoms_nonlocal = a1 - nmoved;
 +    }
 +    else
 +    {
 +        nbs->natoms_nonlocal = max(nbs->natoms_nonlocal,a1);
 +    }
 +
 +    nc_max_grid = set_grid_size_xy(nbs,grid,
 +                                   dd_zone,n-nmoved,corner0,corner1,
 +                                   nbs->grid[0].atom_density,
 +                                   nbat->XFormat);
 +
 +    nc_max = grid->cell0 + nc_max_grid;
 +
 +    if (a1 > nbs->cell_nalloc)
 +    {
 +        nbs->cell_nalloc = over_alloc_large(a1);
 +        srenew(nbs->cell,nbs->cell_nalloc);
 +    }
 +
 +    /* To avoid conditionals we store the moved particles at the end of a,
 +     * so make sure we have enough space.
 +     */
 +    if (nc_max*grid->na_sc + nmoved > nbs->a_nalloc)
 +    {
 +        nbs->a_nalloc = over_alloc_large(nc_max*grid->na_sc + nmoved);
 +        srenew(nbs->a,nbs->a_nalloc);
 +    }
 +
 +    /* We need padding up to a multiple of the buffer flag size: simply add NBNXN_BUFFERFLAG_SIZE */
 +    if (nc_max*grid->na_sc + NBNXN_BUFFERFLAG_SIZE > nbat->nalloc)
 +    {
 +        nbnxn_atomdata_realloc(nbat,nc_max*grid->na_sc+NBNXN_BUFFERFLAG_SIZE);
 +    }
 +
 +    calc_cell_indices(nbs,dd_zone,grid,a0,a1,atinfo,x,move,nbat);
 +
 +    if (dd_zone == 0)
 +    {
 +        nbat->natoms_local = nbat->natoms;
 +    }
 +
 +    nbs_cycle_stop(&nbs->cc[enbsCCgrid]);
 +}
 +
 +/* Calls nbnxn_put_on_grid for all non-local domains */
 +void nbnxn_put_on_grid_nonlocal(nbnxn_search_t nbs,
 +                                const gmx_domdec_zones_t *zones,
 +                                const int *atinfo,
 +                                rvec *x,
 +                                int nb_kernel_type,
 +                                nbnxn_atomdata_t *nbat)
 +{
 +    int  zone,d;
 +    rvec c0,c1;
 +
 +    for(zone=1; zone<zones->n; zone++)
 +    {
 +        for(d=0; d<DIM; d++)
 +        {
 +            c0[d] = zones->size[zone].bb_x0[d];
 +            c1[d] = zones->size[zone].bb_x1[d];
 +        }
 +
 +        nbnxn_put_on_grid(nbs,nbs->ePBC,NULL,
 +                          zone,c0,c1,
 +                          zones->cg_range[zone],
 +                          zones->cg_range[zone+1],
 +                          -1,
 +                          atinfo,
 +                          x,
 +                          0,NULL,
 +                          nb_kernel_type,
 +                          nbat);
 +    }
 +}
 +
 +/* Add simple grid type information to the local super/sub grid */
 +void nbnxn_grid_add_simple(nbnxn_search_t nbs,
 +                           nbnxn_atomdata_t *nbat)
 +{
 +    nbnxn_grid_t *grid;
 +    float *bbcz,*bb;
 +    int ncd,sc;
 +
 +    grid = &nbs->grid[0];
 +
 +    if (grid->bSimple)
 +    {
 +        gmx_incons("nbnxn_grid_simple called with a simple grid");
 +    }
 +
 +    ncd = grid->na_sc/NBNXN_CPU_CLUSTER_I_SIZE;
 +
 +    if (grid->nc*ncd > grid->nc_nalloc_simple)
 +    {
 +        grid->nc_nalloc_simple = over_alloc_large(grid->nc*ncd);
 +        srenew(grid->bbcz_simple,grid->nc_nalloc_simple*NNBSBB_D);
 +        srenew(grid->bb_simple,grid->nc_nalloc_simple*NNBSBB_B);
 +        srenew(grid->flags_simple,grid->nc_nalloc_simple);
 +        if (nbat->XFormat)
 +        {
 +            sfree_aligned(grid->bbj);
 +            snew_aligned(grid->bbj,grid->nc_nalloc_simple/2,16);
 +        }
 +    }
 +
 +    bbcz = grid->bbcz_simple;
 +    bb   = grid->bb_simple;
 +
 +#pragma omp parallel for num_threads(gmx_omp_nthreads_get(emntPairsearch)) schedule(static)
 +    for(sc=0; sc<grid->nc; sc++)
 +    {
 +        int c,tx,na;
 +
 +        for(c=0; c<ncd; c++)
 +        {
 +            tx = sc*ncd + c;
 +
 +            na = NBNXN_CPU_CLUSTER_I_SIZE;
 +            while (na > 0 &&
 +                   nbat->type[tx*NBNXN_CPU_CLUSTER_I_SIZE+na-1] == nbat->ntype-1)
 +            {
 +                na--;
 +            }
 +
 +            if (na > 0)
 +            {
 +                switch (nbat->XFormat)
 +                {
 +                case nbatX4:
 +                    /* PACK_X4==NBNXN_CPU_CLUSTER_I_SIZE, so this is simple */
 +                    calc_bounding_box_x_x4(na,nbat->x+tx*STRIDE_P4,
 +                                           bb+tx*NNBSBB_B);
 +                    break;
 +                case nbatX8:
 +                    /* PACK_X8>NBNXN_CPU_CLUSTER_I_SIZE, more complicated */
 +                    calc_bounding_box_x_x8(na,nbat->x+X8_IND_A(tx*NBNXN_CPU_CLUSTER_I_SIZE),
 +                                           bb+tx*NNBSBB_B);
 +                    break;
 +                default:
 +                    calc_bounding_box(na,nbat->xstride,
 +                                      nbat->x+tx*NBNXN_CPU_CLUSTER_I_SIZE*nbat->xstride,
 +                                      bb+tx*NNBSBB_B);
 +                    break;
 +                }
 +                bbcz[tx*NNBSBB_D+0] = bb[tx*NNBSBB_B         +ZZ];
 +                bbcz[tx*NNBSBB_D+1] = bb[tx*NNBSBB_B+NNBSBB_C+ZZ];
 +
 +                /* No interaction optimization yet here */
 +                grid->flags_simple[tx] = NBNXN_CI_DO_LJ(0) | NBNXN_CI_DO_COUL(0);
 +            }
 +            else
 +            {
 +                grid->flags_simple[tx] = 0;
 +            }
 +        }
 +    }
 +
 +#ifdef NBNXN_SEARCH_BB_SSE
 +    if (grid->bSimple && nbat->XFormat == nbatX8)
 +    {
 +        combine_bounding_box_pairs(grid,grid->bb_simple);
 +    }
 +#endif
 +}
 +
 +void nbnxn_get_ncells(nbnxn_search_t nbs,int *ncx,int *ncy)
 +{
 +    *ncx = nbs->grid[0].ncx;
 +    *ncy = nbs->grid[0].ncy;
 +}
 +
 +void nbnxn_get_atomorder(nbnxn_search_t nbs,int **a,int *n)
 +{
 +    const nbnxn_grid_t *grid;
 +
 +    grid = &nbs->grid[0];
 +
 +    /* Return the atom order for the home grid (grid index 0) */
 +    *a  = nbs->a;
 +
 +    *n = grid->cxy_ind[grid->ncx*grid->ncy]*grid->na_sc;
 +}
 +
 +void nbnxn_set_atomorder(nbnxn_search_t nbs)
 +{
 +    nbnxn_grid_t *grid;
 +    int ao,cx,cy,cxy,cz,j;
 +
 +    /* Set the atom order for the home grid (grid index 0) */
 +    grid = &nbs->grid[0];
 +
 +    ao = 0;
 +    for(cx=0; cx<grid->ncx; cx++)
 +    {
 +        for(cy=0; cy<grid->ncy; cy++)
 +        {
 +            cxy = cx*grid->ncy + cy;
 +            j   = grid->cxy_ind[cxy]*grid->na_sc;
 +            for(cz=0; cz<grid->cxy_na[cxy]; cz++)
 +            {
 +                nbs->a[j]     = ao;
 +                nbs->cell[ao] = j;
 +                ao++;
 +                j++;
 +            }
 +        }
 +    }
 +}
 +
 +/* Determines the cell range along one dimension that
 + * the bounding box b0 - b1 sees.
 + */
 +static void get_cell_range(real b0,real b1,
 +                           int nc,real c0,real s,real invs,
 +                           real d2,real r2,int *cf,int *cl)
 +{
 +    *cf = max((int)((b0 - c0)*invs),0);
 +
 +    while (*cf > 0 && d2 + sqr((b0 - c0) - (*cf-1+1)*s) < r2)
 +    {
 +        (*cf)--;
 +    }
 +
 +    *cl = min((int)((b1 - c0)*invs),nc-1);
 +    while (*cl < nc-1 && d2 + sqr((*cl+1)*s - (b1 - c0)) < r2)
 +    {
 +        (*cl)++;
 +    }
 +}
 +
 +/* Reference code calculating the distance^2 between two bounding boxes */
 +static float box_dist2(float bx0,float bx1,float by0,
 +                       float by1,float bz0,float bz1,
 +                       const float *bb)
 +{
 +    float d2;
 +    float dl,dh,dm,dm0;
 +
 +    d2 = 0;
 +
 +    dl  = bx0 - bb[BBU_X];
 +    dh  = bb[BBL_X] - bx1;
 +    dm  = max(dl,dh);
 +    dm0 = max(dm,0);
 +    d2 += dm0*dm0;
 +
 +    dl  = by0 - bb[BBU_Y];
 +    dh  = bb[BBL_Y] - by1;
 +    dm  = max(dl,dh);
 +    dm0 = max(dm,0);
 +    d2 += dm0*dm0;
 +
 +    dl  = bz0 - bb[BBU_Z];
 +    dh  = bb[BBL_Z] - bz1;
 +    dm  = max(dl,dh);
 +    dm0 = max(dm,0);
 +    d2 += dm0*dm0;
 +
 +    return d2;
 +}
 +
 +/* Plain C code calculating the distance^2 between two bounding boxes */
 +static float subc_bb_dist2(int si,const float *bb_i_ci,
 +                           int csj,const float *bb_j_all)
 +{
 +    const float *bb_i,*bb_j;
 +    float d2;
 +    float dl,dh,dm,dm0;
 +
 +    bb_i = bb_i_ci  +  si*NNBSBB_B;
 +    bb_j = bb_j_all + csj*NNBSBB_B;
 +
 +    d2 = 0;
 +
 +    dl  = bb_i[BBL_X] - bb_j[BBU_X];
 +    dh  = bb_j[BBL_X] - bb_i[BBU_X];
 +    dm  = max(dl,dh);
 +    dm0 = max(dm,0);
 +    d2 += dm0*dm0;
 +
 +    dl  = bb_i[BBL_Y] - bb_j[BBU_Y];
 +    dh  = bb_j[BBL_Y] - bb_i[BBU_Y];
 +    dm  = max(dl,dh);
 +    dm0 = max(dm,0);
 +    d2 += dm0*dm0;
 +
 +    dl  = bb_i[BBL_Z] - bb_j[BBU_Z];
 +    dh  = bb_j[BBL_Z] - bb_i[BBU_Z];
 +    dm  = max(dl,dh);
 +    dm0 = max(dm,0);
 +    d2 += dm0*dm0;
 +
 +    return d2;
 +}
 +
 +#ifdef NBNXN_SEARCH_BB_SSE
 +
 +/* SSE code for bb distance for bb format xyz0 */
 +static float subc_bb_dist2_sse(int na_c,
 +                              int si,const float *bb_i_ci,
 +                              int csj,const float *bb_j_all)
 +{
 +    const float *bb_i,*bb_j;
 +
 +    __m128 bb_i_SSE0,bb_i_SSE1;
 +    __m128 bb_j_SSE0,bb_j_SSE1;
 +    __m128 dl_SSE;
 +    __m128 dh_SSE;
 +    __m128 dm_SSE;
 +    __m128 dm0_SSE;
 +    __m128 d2_SSE;
 +#ifndef GMX_X86_SSE4_1
 +    float d2_array[7],*d2_align;
 +
 +    d2_align = (float *)(((size_t)(d2_array+3)) & (~((size_t)15)));
 +#else
 +    float d2;
 +#endif
 +
 +    bb_i = bb_i_ci  +  si*NNBSBB_B;
 +    bb_j = bb_j_all + csj*NNBSBB_B;
 +
 +    bb_i_SSE0 = _mm_load_ps(bb_i);
 +    bb_i_SSE1 = _mm_load_ps(bb_i+NNBSBB_C);
 +    bb_j_SSE0 = _mm_load_ps(bb_j);
 +    bb_j_SSE1 = _mm_load_ps(bb_j+NNBSBB_C);
 +
 +    dl_SSE    = _mm_sub_ps(bb_i_SSE0,bb_j_SSE1);
 +    dh_SSE    = _mm_sub_ps(bb_j_SSE0,bb_i_SSE1);
 +
 +    dm_SSE    = _mm_max_ps(dl_SSE,dh_SSE);
 +    dm0_SSE   = _mm_max_ps(dm_SSE,_mm_setzero_ps());
 +#ifndef GMX_X86_SSE4_1
 +    d2_SSE    = _mm_mul_ps(dm0_SSE,dm0_SSE);
 +
 +    _mm_store_ps(d2_align,d2_SSE);
 +
 +    return d2_align[0] + d2_align[1] + d2_align[2];
 +#else
 +    /* SSE4.1 dot product of components 0,1,2 */
 +    d2_SSE    = _mm_dp_ps(dm0_SSE,dm0_SSE,0x71);
 +
 +    _mm_store_ss(&d2,d2_SSE);
 +
 +    return d2;
 +#endif
 +}
 +
 +/* Calculate the bounding-box distances of bb_i[si..si+3] and store them in d2 */
 +#define SUBC_BB_DIST2_SSE_XXXX_INNER(si,bb_i,d2) \
 +{                                                \
 +    int    shi;                                  \
 +                                                 \
 +    __m128 dx_0,dy_0,dz_0;                       \
 +    __m128 dx_1,dy_1,dz_1;                       \
 +                                                 \
 +    __m128 mx,my,mz;                             \
 +    __m128 m0x,m0y,m0z;                          \
 +                                                 \
 +    __m128 d2x,d2y,d2z;                          \
 +    __m128 d2s,d2t;                              \
 +                                                 \
 +    shi = si*NNBSBB_D*DIM;                       \
 +                                                 \
 +    xi_l = _mm_load_ps(bb_i+shi+0*STRIDE_PBB);   \
 +    yi_l = _mm_load_ps(bb_i+shi+1*STRIDE_PBB);   \
 +    zi_l = _mm_load_ps(bb_i+shi+2*STRIDE_PBB);   \
 +    xi_h = _mm_load_ps(bb_i+shi+3*STRIDE_PBB);   \
 +    yi_h = _mm_load_ps(bb_i+shi+4*STRIDE_PBB);   \
 +    zi_h = _mm_load_ps(bb_i+shi+5*STRIDE_PBB);   \
 +                                                 \
 +    dx_0 = _mm_sub_ps(xi_l,xj_h);                \
 +    dy_0 = _mm_sub_ps(yi_l,yj_h);                \
 +    dz_0 = _mm_sub_ps(zi_l,zj_h);                \
 +                                                 \
 +    dx_1 = _mm_sub_ps(xj_l,xi_h);                \
 +    dy_1 = _mm_sub_ps(yj_l,yi_h);                \
 +    dz_1 = _mm_sub_ps(zj_l,zi_h);                \
 +                                                 \
 +    mx   = _mm_max_ps(dx_0,dx_1);                \
 +    my   = _mm_max_ps(dy_0,dy_1);                \
 +    mz   = _mm_max_ps(dz_0,dz_1);                \
 +                                                 \
 +    m0x  = _mm_max_ps(mx,zero);                  \
 +    m0y  = _mm_max_ps(my,zero);                  \
 +    m0z  = _mm_max_ps(mz,zero);                  \
 +                                                 \
 +    d2x  = _mm_mul_ps(m0x,m0x);                  \
 +    d2y  = _mm_mul_ps(m0y,m0y);                  \
 +    d2z  = _mm_mul_ps(m0z,m0z);                  \
 +                                                 \
 +    d2s  = _mm_add_ps(d2x,d2y);                  \
 +    d2t  = _mm_add_ps(d2s,d2z);                  \
 +                                                 \
 +    _mm_store_ps(d2+si,d2t);                     \
 +}
 +
 +/* SSE code for nsi bb distances for bb format xxxxyyyyzzzz */
 +static void subc_bb_dist2_sse_xxxx(const float *bb_j,
 +                                   int nsi,const float *bb_i,
 +                                   float *d2)
 +{
 +    __m128 xj_l,yj_l,zj_l;
 +    __m128 xj_h,yj_h,zj_h;
 +    __m128 xi_l,yi_l,zi_l;
 +    __m128 xi_h,yi_h,zi_h;
 +
 +    __m128 zero;
 +
 +    zero = _mm_setzero_ps();
 +
 +    xj_l = _mm_set1_ps(bb_j[0*STRIDE_PBB]);
 +    yj_l = _mm_set1_ps(bb_j[1*STRIDE_PBB]);
 +    zj_l = _mm_set1_ps(bb_j[2*STRIDE_PBB]);
 +    xj_h = _mm_set1_ps(bb_j[3*STRIDE_PBB]);
 +    yj_h = _mm_set1_ps(bb_j[4*STRIDE_PBB]);
 +    zj_h = _mm_set1_ps(bb_j[5*STRIDE_PBB]);
 +
 +    /* Here we "loop" over si (0,STRIDE_PBB) from 0 to nsi with step STRIDE_PBB.
 +     * But as we know the number of iterations is 1 or 2, we unroll manually.
 +     */
 +    SUBC_BB_DIST2_SSE_XXXX_INNER(0,bb_i,d2);
 +    if (STRIDE_PBB < nsi)
 +    {
 +        SUBC_BB_DIST2_SSE_XXXX_INNER(STRIDE_PBB,bb_i,d2);
 +    }
 +}
 +
 +#endif /* NBNXN_SEARCH_BB_SSE */
 +
 +/* Plain C function which determines if any atom pair between two cells
 + * is within distance sqrt(rl2).
 + */
 +static gmx_bool subc_in_range_x(int na_c,
 +                                int si,const real *x_i,
 +                                int csj,int stride,const real *x_j,
 +                                real rl2)
 +{
 +    int  i,j,i0,j0;
 +    real d2;
 +
 +    for(i=0; i<na_c; i++)
 +    {
 +        i0 = (si*na_c + i)*DIM;
 +        for(j=0; j<na_c; j++)
 +        {
 +            j0 = (csj*na_c + j)*stride;
 +
 +            d2 = sqr(x_i[i0  ] - x_j[j0  ]) +
 +                 sqr(x_i[i0+1] - x_j[j0+1]) +
 +                 sqr(x_i[i0+2] - x_j[j0+2]);
 +
 +            if (d2 < rl2)
 +            {
 +                return TRUE;
 +            }
 +        }
 +    }
 +
 +    return FALSE;
 +}
 +
 +/* SSE function which determines if any atom pair between two cells,
 + * both with 8 atoms, is within distance sqrt(rl2).
 + */
 +static gmx_bool subc_in_range_sse8(int na_c,
 +                                   int si,const real *x_i,
 +                                   int csj,int stride,const real *x_j,
 +                                   real rl2)
 +{
 +#ifdef NBNXN_SEARCH_SSE_SINGLE
 +    __m128 ix_SSE0,iy_SSE0,iz_SSE0;
 +    __m128 ix_SSE1,iy_SSE1,iz_SSE1;
 +
 +    __m128 rc2_SSE;
 +
 +    int na_c_sse;
 +    int j0,j1;
 +
 +    rc2_SSE   = _mm_set1_ps(rl2);
 +
 +    na_c_sse = NBNXN_GPU_CLUSTER_SIZE/STRIDE_PBB;
 +    ix_SSE0 = _mm_load_ps(x_i+(si*na_c_sse*DIM+0)*STRIDE_PBB);
 +    iy_SSE0 = _mm_load_ps(x_i+(si*na_c_sse*DIM+1)*STRIDE_PBB);
 +    iz_SSE0 = _mm_load_ps(x_i+(si*na_c_sse*DIM+2)*STRIDE_PBB);
 +    ix_SSE1 = _mm_load_ps(x_i+(si*na_c_sse*DIM+3)*STRIDE_PBB);
 +    iy_SSE1 = _mm_load_ps(x_i+(si*na_c_sse*DIM+4)*STRIDE_PBB);
 +    iz_SSE1 = _mm_load_ps(x_i+(si*na_c_sse*DIM+5)*STRIDE_PBB);
 +
 +    /* We loop from the outer to the inner particles to maximize
 +     * the chance that we find a pair in range quickly and return.
 +     */
 +    j0 = csj*na_c;
 +    j1 = j0 + na_c - 1;
 +    while (j0 < j1)
 +    {
 +        __m128 jx0_SSE,jy0_SSE,jz0_SSE;
 +        __m128 jx1_SSE,jy1_SSE,jz1_SSE;
 +
 +        __m128 dx_SSE0,dy_SSE0,dz_SSE0;
 +        __m128 dx_SSE1,dy_SSE1,dz_SSE1;
 +        __m128 dx_SSE2,dy_SSE2,dz_SSE2;
 +        __m128 dx_SSE3,dy_SSE3,dz_SSE3;
 +
 +        __m128 rsq_SSE0;
 +        __m128 rsq_SSE1;
 +        __m128 rsq_SSE2;
 +        __m128 rsq_SSE3;
 +
 +        __m128 wco_SSE0;
 +        __m128 wco_SSE1;
 +        __m128 wco_SSE2;
 +        __m128 wco_SSE3;
 +        __m128 wco_any_SSE01,wco_any_SSE23,wco_any_SSE;
 +
 +        jx0_SSE = _mm_load1_ps(x_j+j0*stride+0);
 +        jy0_SSE = _mm_load1_ps(x_j+j0*stride+1);
 +        jz0_SSE = _mm_load1_ps(x_j+j0*stride+2);
 +
 +        jx1_SSE = _mm_load1_ps(x_j+j1*stride+0);
 +        jy1_SSE = _mm_load1_ps(x_j+j1*stride+1);
 +        jz1_SSE = _mm_load1_ps(x_j+j1*stride+2);
 +
 +        /* Calculate distance */
 +        dx_SSE0            = _mm_sub_ps(ix_SSE0,jx0_SSE);
 +        dy_SSE0            = _mm_sub_ps(iy_SSE0,jy0_SSE);
 +        dz_SSE0            = _mm_sub_ps(iz_SSE0,jz0_SSE);
 +        dx_SSE1            = _mm_sub_ps(ix_SSE1,jx0_SSE);
 +        dy_SSE1            = _mm_sub_ps(iy_SSE1,jy0_SSE);
 +        dz_SSE1            = _mm_sub_ps(iz_SSE1,jz0_SSE);
 +        dx_SSE2            = _mm_sub_ps(ix_SSE0,jx1_SSE);
 +        dy_SSE2            = _mm_sub_ps(iy_SSE0,jy1_SSE);
 +        dz_SSE2            = _mm_sub_ps(iz_SSE0,jz1_SSE);
 +        dx_SSE3            = _mm_sub_ps(ix_SSE1,jx1_SSE);
 +        dy_SSE3            = _mm_sub_ps(iy_SSE1,jy1_SSE);
 +        dz_SSE3            = _mm_sub_ps(iz_SSE1,jz1_SSE);
 +
 +        /* rsq = dx*dx+dy*dy+dz*dz */
 +        rsq_SSE0           = gmx_mm_calc_rsq_ps(dx_SSE0,dy_SSE0,dz_SSE0);
 +        rsq_SSE1           = gmx_mm_calc_rsq_ps(dx_SSE1,dy_SSE1,dz_SSE1);
 +        rsq_SSE2           = gmx_mm_calc_rsq_ps(dx_SSE2,dy_SSE2,dz_SSE2);
 +        rsq_SSE3           = gmx_mm_calc_rsq_ps(dx_SSE3,dy_SSE3,dz_SSE3);
 +
 +        wco_SSE0           = _mm_cmplt_ps(rsq_SSE0,rc2_SSE);
 +        wco_SSE1           = _mm_cmplt_ps(rsq_SSE1,rc2_SSE);
 +        wco_SSE2           = _mm_cmplt_ps(rsq_SSE2,rc2_SSE);
 +        wco_SSE3           = _mm_cmplt_ps(rsq_SSE3,rc2_SSE);
 +
 +        wco_any_SSE01      = _mm_or_ps(wco_SSE0,wco_SSE1);
 +        wco_any_SSE23      = _mm_or_ps(wco_SSE2,wco_SSE3);
 +        wco_any_SSE        = _mm_or_ps(wco_any_SSE01,wco_any_SSE23);
 +
 +        if (_mm_movemask_ps(wco_any_SSE))
 +        {
 +            return TRUE;
 +        }
 +
 +        j0++;
 +        j1--;
 +    }
 +    return FALSE;
 +
 +#else
 +    /* No SSE */
 +    gmx_incons("SSE function called without SSE support");
 +
 +    return TRUE;
 +#endif
 +}
 +
 +/* Returns the j sub-cell for index cj_ind */
 +static int nbl_cj(const nbnxn_pairlist_t *nbl,int cj_ind)
 +{
 +    return nbl->cj4[cj_ind >> NBNXN_GPU_JGROUP_SIZE_2LOG].cj[cj_ind & (NBNXN_GPU_JGROUP_SIZE - 1)];
 +}
 +
 +/* Returns the i-interaction mask of the j sub-cell for index cj_ind */
 +static unsigned nbl_imask0(const nbnxn_pairlist_t *nbl,int cj_ind)
 +{
 +    return nbl->cj4[cj_ind >> NBNXN_GPU_JGROUP_SIZE_2LOG].imei[0].imask;
 +}
 +
 +/* Ensures there is enough space for extra exclusion masks */
 +static void check_excl_space(nbnxn_pairlist_t *nbl,int extra)
 +{
 +    if (nbl->nexcl+extra > nbl->excl_nalloc)
 +    {
 +        nbl->excl_nalloc = over_alloc_small(nbl->nexcl+extra);
 +        nbnxn_realloc_void((void **)&nbl->excl,
 +                           nbl->nexcl*sizeof(*nbl->excl),
 +                           nbl->excl_nalloc*sizeof(*nbl->excl),
 +                           nbl->alloc,nbl->free);
 +    }
 +}
 +
 +/* Ensures there is enough space for ncell extra j-cells in the list */
 +static void check_subcell_list_space_simple(nbnxn_pairlist_t *nbl,
 +                                            int ncell)
 +{
 +    int cj_max;
 +
 +    cj_max = nbl->ncj + ncell;
 +
 +    if (cj_max > nbl->cj_nalloc)
 +    {
 +        nbl->cj_nalloc = over_alloc_small(cj_max);
 +        nbnxn_realloc_void((void **)&nbl->cj,
 +                           nbl->ncj*sizeof(*nbl->cj),
 +                           nbl->cj_nalloc*sizeof(*nbl->cj),
 +                           nbl->alloc,nbl->free);
 +    }
 +}
 +
 +/* Ensures there is enough space for nsupercell extra super-cells of j-subcells in the list */
 +static void check_subcell_list_space_supersub(nbnxn_pairlist_t *nbl,
 +                                              int nsupercell)
 +{
 +    int ncj4_max,j4,w;
 +
 +#define NWARP       2
 +#define WARP_SIZE  32
 +
 +    /* We can have at most nsupercell*GPU_NSUBCELL sj lists */
 +    /* We can store 4 j-subcell - i-supercell pairs in one struct.
 +     * Since the integer division rounds down, we need one extra entry,
 +     * so we round up by adding NBNXN_GPU_JGROUP_SIZE-1 before shifting.
 +     */
 +    ncj4_max = ((nbl->work->cj_ind + nsupercell*GPU_NSUBCELL + NBNXN_GPU_JGROUP_SIZE - 1) >> NBNXN_GPU_JGROUP_SIZE_2LOG);
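 +    /* Illustration, assuming NBNXN_GPU_JGROUP_SIZE == 4: for a total of
 +     * 9 j-subcells this gives (9 + 3) >> 2 = 3 cj4 structs, i.e. the
 +     * count rounded up to whole 4-entry groups.
 +     */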
 +
 +    if (ncj4_max > nbl->cj4_nalloc)
 +    {
 +        nbl->cj4_nalloc = over_alloc_small(ncj4_max);
 +        nbnxn_realloc_void((void **)&nbl->cj4,
 +                           nbl->work->cj4_init*sizeof(*nbl->cj4),
 +                           nbl->cj4_nalloc*sizeof(*nbl->cj4),
 +                           nbl->alloc,nbl->free);
 +    }
 +
 +    if (ncj4_max > nbl->work->cj4_init)
 +    {
 +        for(j4=nbl->work->cj4_init; j4<ncj4_max; j4++)
 +        {
 +            /* No i-subcells and no excl's in the list initially */
 +            for(w=0; w<NWARP; w++)
 +            {
 +                nbl->cj4[j4].imei[w].imask    = 0U;
 +                nbl->cj4[j4].imei[w].excl_ind = 0;
 +
 +            }
 +        }
 +        nbl->work->cj4_init = ncj4_max;
 +    }
 +}
 +
 +/* Sets all exclusion masks for one GPU warp to "no exclusions" */
 +static void set_no_excls(nbnxn_excl_t *excl)
 +{
 +    int t;
 +
 +    for(t=0; t<WARP_SIZE; t++)
 +    {
 +        /* Turn all interaction bits on */
 +        excl->pair[t] = NBNXN_INT_MASK_ALL;
 +    }
 +}
 +
 +/* Initializes a single nbnxn_pairlist_t data structure */
 +static void nbnxn_init_pairlist(nbnxn_pairlist_t *nbl,
 +                                gmx_bool bSimple,
 +                                nbnxn_alloc_t *alloc,
 +                                nbnxn_free_t  *free)
 +{
 +    if (alloc == NULL)
 +    {
 +        nbl->alloc = nbnxn_alloc_aligned;
 +    }
 +    else
 +    {
 +        nbl->alloc = alloc;
 +    }
 +    if (free == NULL)
 +    {
 +        nbl->free = nbnxn_free_aligned;
 +    }
 +    else
 +    {
 +        nbl->free = free;
 +    }
 +
 +    nbl->bSimple     = bSimple;
 +    nbl->na_sc       = 0;
 +    nbl->na_ci       = 0;
 +    nbl->na_cj       = 0;
 +    nbl->nci         = 0;
 +    nbl->ci          = NULL;
 +    nbl->ci_nalloc   = 0;
 +    nbl->ncj         = 0;
 +    nbl->cj          = NULL;
 +    nbl->cj_nalloc   = 0;
 +    nbl->ncj4        = 0;
 +    /* We need one element extra in sj, so alloc initially with 1 */
 +    nbl->cj4_nalloc  = 0;
 +    nbl->cj4         = NULL;
 +    nbl->nci_tot     = 0;
 +
 +    if (!nbl->bSimple)
 +    {
 +        nbl->excl        = NULL;
 +        nbl->excl_nalloc = 0;
 +        nbl->nexcl       = 0;
 +        check_excl_space(nbl,1);
 +        nbl->nexcl       = 1;
 +        set_no_excls(&nbl->excl[0]);
 +    }
 +
 +    snew(nbl->work,1);
 +#ifdef NBNXN_BBXXXX
 +    snew_aligned(nbl->work->bb_ci,GPU_NSUBCELL/STRIDE_PBB*NNBSBB_XXXX,NBNXN_MEM_ALIGN);
 +#else
 +    snew_aligned(nbl->work->bb_ci,GPU_NSUBCELL*NNBSBB_B,NBNXN_MEM_ALIGN);
 +#endif
 +    snew_aligned(nbl->work->x_ci,NBNXN_NA_SC_MAX*DIM,NBNXN_MEM_ALIGN);
 +#ifdef GMX_NBNXN_SIMD
 +    snew_aligned(nbl->work->x_ci_simd_4xn,1,NBNXN_MEM_ALIGN);
 +    snew_aligned(nbl->work->x_ci_simd_2xnn,1,NBNXN_MEM_ALIGN);
 +#endif
 +    snew_aligned(nbl->work->d2,GPU_NSUBCELL,NBNXN_MEM_ALIGN);
 +}
 +
 +void nbnxn_init_pairlist_set(nbnxn_pairlist_set_t *nbl_list,
 +                             gmx_bool bSimple, gmx_bool bCombined,
 +                             nbnxn_alloc_t *alloc,
 +                             nbnxn_free_t  *free)
 +{
 +    int i;
 +
 +    nbl_list->bSimple   = bSimple;
 +    nbl_list->bCombined = bCombined;
 +
 +    nbl_list->nnbl = gmx_omp_nthreads_get(emntNonbonded);
 +
 +    if (!nbl_list->bCombined &&
 +        nbl_list->nnbl > NBNXN_BUFFERFLAG_MAX_THREADS)
 +    {
 +        gmx_fatal(FARGS,"%d OpenMP threads were requested. Since the non-bonded force buffer reduction is prohibitively slow with more than %d threads, we do not allow this. Use %d or less OpenMP threads.",
 +                  nbl_list->nnbl,NBNXN_BUFFERFLAG_MAX_THREADS,NBNXN_BUFFERFLAG_MAX_THREADS);
 +    }
 +
 +    snew(nbl_list->nbl,nbl_list->nnbl);
 +    /* Execute the allocations on the threads that will use the lists,
 +     * to avoid memory interleaving between threads.
 +     */
 +#pragma omp parallel for num_threads(nbl_list->nnbl) schedule(static)
 +    for(i=0; i<nbl_list->nnbl; i++)
 +    {
 +        /* Allocate the nblist data structure locally on each thread
 +         * to optimize memory access for NUMA architectures.
 +         */
 +        snew(nbl_list->nbl[i],1);
 +
 +        /* Only list 0 is used on the GPU, use normal allocation for i>0 */
 +        if (i == 0)
 +        {
 +            nbnxn_init_pairlist(nbl_list->nbl[i],nbl_list->bSimple,alloc,free);
 +        }
 +        else
 +        {
 +            nbnxn_init_pairlist(nbl_list->nbl[i],nbl_list->bSimple,NULL,NULL);
 +        }
 +    }
 +}
 +
 +/* Print statistics of a pair list, used for debug output */
 +static void print_nblist_statistics_simple(FILE *fp,const nbnxn_pairlist_t *nbl,
 +                                           const nbnxn_search_t nbs,real rl)
 +{
 +    const nbnxn_grid_t *grid;
 +    int cs[SHIFTS];
 +    int s,i,j;
 +    int npexcl;
 +
 +    /* This code only produces correct statistics without domain decomposition */
 +    grid = &nbs->grid[0];
 +
 +    fprintf(fp,"nbl nci %d ncj %d\n",
 +            nbl->nci,nbl->ncj);
 +    fprintf(fp,"nbl na_sc %d rl %g ncp %d per cell %.1f atoms %.1f ratio %.2f\n",
 +            nbl->na_sc,rl,nbl->ncj,nbl->ncj/(double)grid->nc,
 +            nbl->ncj/(double)grid->nc*grid->na_sc,
 +            nbl->ncj/(double)grid->nc*grid->na_sc/(0.5*4.0/3.0*M_PI*rl*rl*rl*grid->nc*grid->na_sc/det(nbs->box)));
 +
 +    fprintf(fp,"nbl average j cell list length %.1f\n",
 +            0.25*nbl->ncj/(double)nbl->nci);
 +
 +    for(s=0; s<SHIFTS; s++)
 +    {
 +        cs[s] = 0;
 +    }
 +    npexcl = 0;
 +    for(i=0; i<nbl->nci; i++)
 +    {
 +        cs[nbl->ci[i].shift & NBNXN_CI_SHIFT] +=
 +            nbl->ci[i].cj_ind_end - nbl->ci[i].cj_ind_start;
 +
 +        j = nbl->ci[i].cj_ind_start;
 +        while (j < nbl->ci[i].cj_ind_end &&
 +               nbl->cj[j].excl != NBNXN_INT_MASK_ALL)
 +        {
 +            npexcl++;
 +            j++;
 +        }
 +    }
 +    fprintf(fp,"nbl cell pairs, total: %d excl: %d %.1f%%\n",
 +            nbl->ncj,npexcl,100*npexcl/(double)nbl->ncj);
 +    for(s=0; s<SHIFTS; s++)
 +    {
 +        if (cs[s] > 0)
 +        {
 +            fprintf(fp,"nbl shift %2d ncj %3d\n",s,cs[s]);
 +        }
 +    }
 +}
 +
 +/* Print statistics of a pair list, used for debug output */
 +static void print_nblist_statistics_supersub(FILE *fp,const nbnxn_pairlist_t *nbl,
 +                                             const nbnxn_search_t nbs,real rl)
 +{
 +    const nbnxn_grid_t *grid;
 +    int i,j4,j,si,b;
 +    int c[GPU_NSUBCELL+1];
 +
 +    /* This code only produces correct statistics without domain decomposition */
 +    grid = &nbs->grid[0];
 +
 +    fprintf(fp,"nbl nsci %d ncj4 %d nsi %d excl4 %d\n",
 +            nbl->nsci,nbl->ncj4,nbl->nci_tot,nbl->nexcl);
 +    fprintf(fp,"nbl na_c %d rl %g ncp %d per cell %.1f atoms %.1f ratio %.2f\n",
 +            nbl->na_ci,rl,nbl->nci_tot,nbl->nci_tot/(double)grid->nsubc_tot,
 +            nbl->nci_tot/(double)grid->nsubc_tot*grid->na_c,
 +            nbl->nci_tot/(double)grid->nsubc_tot*grid->na_c/(0.5*4.0/3.0*M_PI*rl*rl*rl*grid->nsubc_tot*grid->na_c/det(nbs->box)));
 +
 +    fprintf(fp,"nbl average j super cell list length %.1f\n",
 +            0.25*nbl->ncj4/(double)nbl->nsci);
 +    fprintf(fp,"nbl average i sub cell list length %.1f\n",
 +            nbl->nci_tot/((double)nbl->ncj4));
 +
 +    for(si=0; si<=GPU_NSUBCELL; si++)
 +    {
 +        c[si] = 0;
 +    }
 +    for(i=0; i<nbl->nsci; i++)
 +    {
 +        for(j4=nbl->sci[i].cj4_ind_start; j4<nbl->sci[i].cj4_ind_end; j4++)
 +        {
 +            for(j=0; j<NBNXN_GPU_JGROUP_SIZE; j++)
 +            {
 +                b = 0;
 +                for(si=0; si<GPU_NSUBCELL; si++)
 +                {
 +                    if (nbl->cj4[j4].imei[0].imask & (1U << (j*GPU_NSUBCELL + si)))
 +                    {
 +                        b++;
 +                    }
 +                }
 +                c[b]++;
 +            }
 +        }
 +    }
 +    for(b=0; b<=GPU_NSUBCELL; b++)
 +    {
 +        fprintf(fp,"nbl j-list #i-subcell %d %7d %4.1f\n",
 +                b,c[b],100.0*c[b]/(double)(nbl->ncj4*NBNXN_GPU_JGROUP_SIZE));
 +    }
 +}
 +
 +/* Print the full pair list, used for debug output */
 +static void print_supersub_nsp(const char *fn,
 +                               const nbnxn_pairlist_t *nbl,
 +                               int iloc)
 +{
 +    char buf[STRLEN];
 +    FILE *fp;
 +    int i,nsp,j4,p;
 +
 +    sprintf(buf,"%s_%s.xvg",fn,NONLOCAL_I(iloc) ? "nl" : "l");
 +    fp = ffopen(buf,"w");
 +
 +    for(i=0; i<nbl->nsci; i++)
 +    {
 +        nsp = 0;
 +        for(j4=nbl->sci[i].cj4_ind_start; j4<nbl->sci[i].cj4_ind_end; j4++)
 +        {
 +            for(p=0; p<NBNXN_GPU_JGROUP_SIZE*GPU_NSUBCELL; p++)
 +            {
 +                nsp += (nbl->cj4[j4].imei[0].imask >> p) & 1;
 +            }
 +        }
 +        fprintf(fp,"%4d %3d %3d\n",
 +                i,
 +                nsp,
 +                nbl->sci[i].cj4_ind_end-nbl->sci[i].cj4_ind_start);
 +    }
 +
 +    ffclose(fp);
 +}
 +
 +/* Returns a pointer to the exclusion mask for cj4-unit cj4, warp warp */
 +static void low_get_nbl_exclusions(nbnxn_pairlist_t *nbl,int cj4,
 +                                   int warp,nbnxn_excl_t **excl)
 +{
 +    if (nbl->cj4[cj4].imei[warp].excl_ind == 0)
 +    {
 +        /* No exclusions set, make a new list entry */
 +        nbl->cj4[cj4].imei[warp].excl_ind = nbl->nexcl;
 +        nbl->nexcl++;
 +        *excl = &nbl->excl[nbl->cj4[cj4].imei[warp].excl_ind];
 +        set_no_excls(*excl);
 +    }
 +    else
 +    {
 +        /* We already have some exclusions, new ones can be added to the list */
 +        *excl = &nbl->excl[nbl->cj4[cj4].imei[warp].excl_ind];
 +    }
 +}
 +
 +/* Returns a pointer to the exclusion mask for cj4-unit cj4, warp warp,
 + * allocates extra memory, if necessary.
 + */
 +static void get_nbl_exclusions_1(nbnxn_pairlist_t *nbl,int cj4,
 +                                 int warp,nbnxn_excl_t **excl)
 +{
 +    if (nbl->cj4[cj4].imei[warp].excl_ind == 0)
 +    {
 +        /* We need to make a new list entry, check if we have space */
 +        check_excl_space(nbl,1);
 +    }
 +    low_get_nbl_exclusions(nbl,cj4,warp,excl);
 +}
 +
 +/* Returns pointers to the exclusion mask for cj4-unit cj4 for both warps,
 + * allocates extra memory, if necessary.
 + */
 +static void get_nbl_exclusions_2(nbnxn_pairlist_t *nbl,int cj4,
 +                                 nbnxn_excl_t **excl_w0,
 +                                 nbnxn_excl_t **excl_w1)
 +{
 +    /* Check for space we might need */
 +    check_excl_space(nbl,2);
 +
 +    low_get_nbl_exclusions(nbl,cj4,0,excl_w0);
 +    low_get_nbl_exclusions(nbl,cj4,1,excl_w1);
 +}
 +
 +/* Sets the self exclusions i=j and pair exclusions i>j */
 +static void set_self_and_newton_excls_supersub(nbnxn_pairlist_t *nbl,
 +                                               int cj4_ind,int sj_offset,
 +                                               int si)
 +{
 +    nbnxn_excl_t *excl[2];
 +    int  ei,ej,w;
 +
 +    /* Here we only set the self and double pair exclusions */
 +
 +    get_nbl_exclusions_2(nbl,cj4_ind,&excl[0],&excl[1]);
 +
 +    /* Clear the interaction bits for all pairs with ei >= ej */
 +    for(ej=0; ej<nbl->na_ci; ej++)
 +    {
 +        w = (ej>>2);
 +        for(ei=ej; ei<nbl->na_ci; ei++)
 +        {
 +            excl[w]->pair[(ej & (NBNXN_GPU_JGROUP_SIZE-1))*nbl->na_ci + ei] &=
 +                ~(1U << (sj_offset*GPU_NSUBCELL + si));
 +        }
 +    }
 +}
 +
 +/* Returns a diagonal or off-diagonal interaction mask for plain C lists */
 +static unsigned int get_imask(gmx_bool rdiag,int ci,int cj)
 +{
 +    return (rdiag && ci == cj ? NBNXN_INT_MASK_DIAG : NBNXN_INT_MASK_ALL);
 +}
 +
 +/* Returns a diagonal or off-diagonal interaction mask for SIMD128 lists */
 +static unsigned int get_imask_x86_simd128(gmx_bool rdiag,int ci,int cj)
 +{
 +#ifndef GMX_DOUBLE /* cj-size = 4 */
 +    return (rdiag && ci == cj ? NBNXN_INT_MASK_DIAG : NBNXN_INT_MASK_ALL);
 +#else              /* cj-size = 2 */
 +    return (rdiag && ci*2 == cj ? NBNXN_INT_MASK_DIAG_J2_0 :
 +            (rdiag && ci*2+1 == cj ? NBNXN_INT_MASK_DIAG_J2_1 :
 +             NBNXN_INT_MASK_ALL));
 +#endif
 +}
 +
 +/* Returns a diagonal or off-diagonal interaction mask for SIMD256 lists */
 +static unsigned int get_imask_x86_simd256(gmx_bool rdiag,int ci,int cj)
 +{
 +#ifndef GMX_DOUBLE /* cj-size = 8 */
 +    return (rdiag && ci == cj*2 ? NBNXN_INT_MASK_DIAG_J8_0 :
 +            (rdiag && ci == cj*2+1 ? NBNXN_INT_MASK_DIAG_J8_1 :
 +             NBNXN_INT_MASK_ALL));
 +#else              /* cj-size = 4 */
 +    return (rdiag && ci == cj ? NBNXN_INT_MASK_DIAG : NBNXN_INT_MASK_ALL);
 +#endif
 +}
 +
 +#ifdef GMX_NBNXN_SIMD
 +#if GMX_NBNXN_SIMD_BITWIDTH == 128
 +#define get_imask_x86_simd_4xn  get_imask_x86_simd128
 +#else
 +#if GMX_NBNXN_SIMD_BITWIDTH == 256
 +#define get_imask_x86_simd_4xn  get_imask_x86_simd256
 +#define get_imask_x86_simd_2xnn get_imask_x86_simd128
 +#else
 +#error "unsupported GMX_NBNXN_SIMD_BITWIDTH"
 +#endif
 +#endif
 +#endif
 +
 +/* Plain C code for making a pair list of cell ci vs cells cjf to cjl.
 + * Checks bounding box distances and possibly atom pair distances.
 + */
 +static void make_cluster_list_simple(const nbnxn_grid_t *gridj,
 +                                     nbnxn_pairlist_t *nbl,
 +                                     int ci,int cjf,int cjl,
 +                                     gmx_bool remove_sub_diag,
 +                                     const real *x_j,
 +                                     real rl2,float rbb2,
 +                                     int *ndistc)
 +{
 +    const nbnxn_list_work_t *work;
 +
 +    const float *bb_ci;
 +    const real  *x_ci;
 +
 +    gmx_bool   InRange;
 +    real       d2;
 +    int        cjf_gl,cjl_gl,cj;
 +
 +    work = nbl->work;
 +
 +    bb_ci = nbl->work->bb_ci;
 +    x_ci  = nbl->work->x_ci;
 +
 +    InRange = FALSE;
 +    while (!InRange && cjf <= cjl)
 +    {
 +        d2 = subc_bb_dist2(0,bb_ci,cjf,gridj->bb);
 +        *ndistc += 2;
 +
 +        /* Check if the distance is within the distance where
 +         * we use only the bounding box distance rbb,
 +         * or within the cut-off and there is at least one atom pair
 +         * within the cut-off.
 +         */
 +        if (d2 < rbb2)
 +        {
 +            InRange = TRUE;
 +        }
 +        else if (d2 < rl2)
 +        {
 +            int i,j;
 +
 +            cjf_gl = gridj->cell0 + cjf;
 +            for(i=0; i<NBNXN_CPU_CLUSTER_I_SIZE && !InRange; i++)
 +            {
 +                for(j=0; j<NBNXN_CPU_CLUSTER_I_SIZE; j++)
 +                {
 +                    InRange = InRange ||
 +                        (sqr(x_ci[i*STRIDE_XYZ+XX] - x_j[(cjf_gl*NBNXN_CPU_CLUSTER_I_SIZE+j)*STRIDE_XYZ+XX]) +
 +                         sqr(x_ci[i*STRIDE_XYZ+YY] - x_j[(cjf_gl*NBNXN_CPU_CLUSTER_I_SIZE+j)*STRIDE_XYZ+YY]) +
 +                         sqr(x_ci[i*STRIDE_XYZ+ZZ] - x_j[(cjf_gl*NBNXN_CPU_CLUSTER_I_SIZE+j)*STRIDE_XYZ+ZZ]) < rl2);
 +                }
 +            }
 +            *ndistc += NBNXN_CPU_CLUSTER_I_SIZE*NBNXN_CPU_CLUSTER_I_SIZE;
 +        }
 +        if (!InRange)
 +        {
 +            cjf++;
 +        }
 +    }
 +    if (!InRange)
 +    {
 +        return;
 +    }
 +
 +    InRange = FALSE;
 +    while (!InRange && cjl > cjf)
 +    {
 +        d2 = subc_bb_dist2(0,bb_ci,cjl,gridj->bb);
 +        *ndistc += 2;
 +
 +        /* Check if the distance is within the distance where
 +         * we use only the bounding box distance rbb,
 +         * or within the cut-off and there is at least one atom pair
 +         * within the cut-off.
 +         */
 +        if (d2 < rbb2)
 +        {
 +            InRange = TRUE;
 +        }
 +        else if (d2 < rl2)
 +        {
 +            int i,j;
 +
 +            cjl_gl = gridj->cell0 + cjl;
 +            for(i=0; i<NBNXN_CPU_CLUSTER_I_SIZE && !InRange; i++)
 +            {
 +                for(j=0; j<NBNXN_CPU_CLUSTER_I_SIZE; j++)
 +                {
 +                    InRange = InRange ||
 +                        (sqr(x_ci[i*STRIDE_XYZ+XX] - x_j[(cjl_gl*NBNXN_CPU_CLUSTER_I_SIZE+j)*STRIDE_XYZ+XX]) +
 +                         sqr(x_ci[i*STRIDE_XYZ+YY] - x_j[(cjl_gl*NBNXN_CPU_CLUSTER_I_SIZE+j)*STRIDE_XYZ+YY]) +
 +                         sqr(x_ci[i*STRIDE_XYZ+ZZ] - x_j[(cjl_gl*NBNXN_CPU_CLUSTER_I_SIZE+j)*STRIDE_XYZ+ZZ]) < rl2);
 +                }
 +            }
 +            *ndistc += NBNXN_CPU_CLUSTER_I_SIZE*NBNXN_CPU_CLUSTER_I_SIZE;
 +        }
 +        if (!InRange)
 +        {
 +            cjl--;
 +        }
 +    }
 +
 +    if (cjf <= cjl)
 +    {
 +        for(cj=cjf; cj<=cjl; cj++)
 +        {
 +            /* Store cj and the interaction mask */
 +            nbl->cj[nbl->ncj].cj   = gridj->cell0 + cj;
 +            nbl->cj[nbl->ncj].excl = get_imask(remove_sub_diag,ci,cj);
 +            nbl->ncj++;
 +        }
 +        /* Increase the closing index in i super-cell list */
 +        nbl->ci[nbl->nci].cj_ind_end = nbl->ncj;
 +    }
 +}
 +
 +#ifdef GMX_NBNXN_SIMD_4XN
 +#include "nbnxn_search_simd_4xn.h"
 +#endif
 +#ifdef GMX_NBNXN_SIMD_2XNN
 +#include "nbnxn_search_simd_2xnn.h"
 +#endif
 +
 +/* Plain C or SSE code for making a pair list of super-cell sci vs scj.
 + * Checks bounding box distances and possibly atom pair distances.
 + */
 +static void make_cluster_list_supersub(const nbnxn_search_t nbs,
 +                                       const nbnxn_grid_t *gridi,
 +                                       const nbnxn_grid_t *gridj,
 +                                       nbnxn_pairlist_t *nbl,
 +                                       int sci,int scj,
 +                                       gmx_bool sci_equals_scj,
 +                                       int stride,const real *x,
 +                                       real rl2,float rbb2,
 +                                       int *ndistc)
 +{
 +    int  na_c;
 +    int  npair;
 +    int  cjo,ci1,ci,cj,cj_gl;
 +    int  cj4_ind,cj_offset;
 +    unsigned imask;
 +    nbnxn_cj4_t *cj4;
 +    const float *bb_ci;
 +    const real *x_ci;
 +    float *d2l,d2;
 +    int  w;
 +#define PRUNE_LIST_CPU_ONE
 +#ifdef PRUNE_LIST_CPU_ONE
 +    int  ci_last=-1;
 +#endif
 +
 +    d2l = nbl->work->d2;
 +
 +    bb_ci = nbl->work->bb_ci;
 +    x_ci  = nbl->work->x_ci;
 +
 +    na_c = gridj->na_c;
 +
 +    for(cjo=0; cjo<gridj->nsubc[scj]; cjo++)
 +    {
 +        cj4_ind   = (nbl->work->cj_ind >> NBNXN_GPU_JGROUP_SIZE_2LOG);
 +        cj_offset = nbl->work->cj_ind - cj4_ind*NBNXN_GPU_JGROUP_SIZE;
 +        cj4       = &nbl->cj4[cj4_ind];
 +
 +        cj = scj*GPU_NSUBCELL + cjo;
 +
 +        cj_gl = gridj->cell0*GPU_NSUBCELL + cj;
 +
 +        /* Initialize this j-subcell i-subcell list */
 +        cj4->cj[cj_offset] = cj_gl;
 +        imask              = 0;
 +
 +        if (sci_equals_scj)
 +        {
 +            ci1 = cjo + 1;
 +        }
 +        else
 +        {
 +            ci1 = gridi->nsubc[sci];
 +        }
 +
 +#ifdef NBNXN_BBXXXX
 +        /* Determine all ci1 bb distances in one call with SSE */
 +        subc_bb_dist2_sse_xxxx(gridj->bb+(cj>>STRIDE_PBB_2LOG)*NNBSBB_XXXX+(cj & (STRIDE_PBB-1)),
 +                               ci1,bb_ci,d2l);
 +        *ndistc += na_c*2;
 +#endif
 +
 +        npair = 0;
 +        /* We use a fixed upper-bound instead of ci1 to help optimization */
 +        for(ci=0; ci<GPU_NSUBCELL; ci++)
 +        {
 +            if (ci == ci1)
 +            {
 +                break;
 +            }
 +
 +#ifndef NBNXN_BBXXXX
 +            /* Determine the bb distance between ci and cj */
 +            d2l[ci] = subc_bb_dist2(ci,bb_ci,cj,gridj->bb);
 +            *ndistc += 2;
 +#endif
 +            d2 = d2l[ci];
 +
 +#ifdef PRUNE_LIST_CPU_ALL
 +            /* Check if the distance is within the distance where
 +             * we use only the bounding box distance rbb,
 +             * or within the cut-off and there is at least one atom pair
 +             * within the cut-off. This check is very costly.
 +             */
 +            *ndistc += na_c*na_c;
 +            if (d2 < rbb2 ||
 +                (d2 < rl2 && subc_in_range_x(na_c,ci,x_ci,cj_gl,stride,x,rl2)))
 +#else
 +            /* Check if the distance between the two bounding boxes
 +             * is within the pair-list cut-off.
 +             */
 +            if (d2 < rl2)
 +#endif
 +            {
 +                /* Flag this i-subcell to be taken into account */
 +                imask |= (1U << (cj_offset*GPU_NSUBCELL+ci));
 +
 +#ifdef PRUNE_LIST_CPU_ONE
 +                ci_last = ci;
 +#endif
 +
 +                npair++;
 +            }
 +        }
 +
 +#ifdef PRUNE_LIST_CPU_ONE
 +        /* If we found only 1 pair, check if any atoms are actually
 +         * within the cut-off, so we can possibly remove the entry.
 +         */
 +        if (npair == 1 && d2l[ci_last] >= rbb2)
 +        {
 +            /* Avoid using function pointers here, as it's slower */
 +            if (
 +#ifdef NBNXN_PBB_SSE
 +                !subc_in_range_sse8
 +#else
 +                !subc_in_range_x
 +#endif
 +                                (na_c,ci_last,x_ci,cj_gl,stride,x,rl2))
 +            {
 +                imask &= ~(1U << (cj_offset*GPU_NSUBCELL+ci_last));
 +                npair--;
 +            }
 +        }
 +#endif
 +
 +        if (npair > 0)
 +        {
 +            /* We have a useful sj entry, close it now */
 +
 +            /* Set the exclusions for the ci == sj entry.
 +             * Here we don't bother to check if this entry is actually flagged,
 +             * as it will nearly always be in the list.
 +             */
 +            if (sci_equals_scj)
 +            {
 +                set_self_and_newton_excls_supersub(nbl,cj4_ind,cj_offset,cjo);
 +            }
 +
 +            /* Copy the cluster interaction mask to the list */
 +            for(w=0; w<NWARP; w++)
 +            {
 +                cj4->imei[w].imask |= imask;
 +            }
 +
 +            nbl->work->cj_ind++;
 +
 +            /* Keep the count */
 +            nbl->nci_tot += npair;
 +
 +            /* Increase the closing index in i super-cell list */
 +            nbl->sci[nbl->nsci].cj4_ind_end =
 +                ((nbl->work->cj_ind+NBNXN_GPU_JGROUP_SIZE-1) >> NBNXN_GPU_JGROUP_SIZE_2LOG);
 +        }
 +    }
 +}
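 +
 +/* Illustration of the interaction-mask layout used above (a sketch,
 + * assuming GPU_NSUBCELL == 8 and NBNXN_GPU_JGROUP_SIZE == 4, the sizes
 + * implied elsewhere in this file): bit cj_offset*8 + ci of imask flags
 + * i-subcell ci against the j-cluster in slot cj_offset, so one 32-bit
 + * mask covers 4 j-clusters x 8 i-subcells. Flagging ci = 2 against
 + * cj_offset = 3, for example, sets bit 3*8 + 2 = 26:
 + *   imask |= (1U << 26);
 + */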
 +
 +/* Set all atom-pair exclusions from the topology stored in excl
 + * as masks in the pair-list for simple list i-entry nbl_ci
 + */
 +static void set_ci_top_excls(const nbnxn_search_t nbs,
 +                             nbnxn_pairlist_t *nbl,
 +                             gmx_bool diagRemoved,
 +                             int na_ci_2log,
 +                             int na_cj_2log,
 +                             const nbnxn_ci_t *nbl_ci,
 +                             const t_blocka *excl)
 +{
 +    const int *cell;
 +    int ci;
 +    int cj_ind_first,cj_ind_last;
 +    int cj_first,cj_last;
 +    int ndirect;
 +    int i,ai,aj,si,eind,ge,se;
 +    int found,cj_ind_0,cj_ind_1,cj_ind_m;
 +    int cj_m;
 +    int inner_i,inner_e;
 +
 +    cell = nbs->cell;
 +
 +    if (nbl_ci->cj_ind_end == nbl_ci->cj_ind_start)
 +    {
 +        /* Empty list */
 +        return;
 +    }
 +
 +    ci = nbl_ci->ci;
 +
 +    cj_ind_first = nbl_ci->cj_ind_start;
 +    cj_ind_last  = nbl->ncj - 1;
 +
 +    cj_first = nbl->cj[cj_ind_first].cj;
 +    cj_last  = nbl->cj[cj_ind_last].cj;
 +
 +    /* Determine how many contiguous j-cells we have starting
 +     * from the first i-cell. This number can be used to directly
 +     * calculate j-cell indices for excluded atoms.
 +     */
 +    ndirect = 0;
 +    if (na_ci_2log == na_cj_2log)
 +    {
 +        while (cj_ind_first + ndirect <= cj_ind_last &&
 +               nbl->cj[cj_ind_first+ndirect].cj == ci + ndirect)
 +        {
 +            ndirect++;
 +        }
 +    }
 +#ifdef NBNXN_SEARCH_BB_SSE
 +    else
 +    {
 +        while (cj_ind_first + ndirect <= cj_ind_last &&
 +               nbl->cj[cj_ind_first+ndirect].cj == ci_to_cj(na_cj_2log,ci) + ndirect)
 +        {
 +            ndirect++;
 +        }
 +    }
 +#endif
 +
 +    /* Loop over the atoms in the i super-cell */
 +    for(i=0; i<nbl->na_sc; i++)
 +    {
 +        ai = nbs->a[ci*nbl->na_sc+i];
 +        if (ai >= 0)
 +        {
 +            si  = (i>>na_ci_2log);
 +
 +            /* Loop over the topology-based exclusions for this i-atom */
 +            for(eind=excl->index[ai]; eind<excl->index[ai+1]; eind++)
 +            {
 +                aj = excl->a[eind];
 +
 +                if (aj == ai)
 +                {
 +                    /* The self exclusions are already set, save some time */
 +                    continue;
 +                }
 +
 +                ge = cell[aj];
 +
 +                /* Without shifts we only calculate interactions j>i
 +                 * for one-way pair-lists.
 +                 */
 +                if (diagRemoved && ge <= ci*nbl->na_sc + i)
 +                {
 +                    continue;
 +                }
 +
 +                se = (ge >> na_cj_2log);
 +
 +                /* Could the cluster se be in our list? */
 +                if (se >= cj_first && se <= cj_last)
 +                {
 +                    if (se < cj_first + ndirect)
 +                    {
 +                        /* We can calculate cj_ind directly from se */
 +                        found = cj_ind_first + se - cj_first;
 +                    }
 +                    else
 +                    {
 +                        /* Search for se using bisection */
 +                        found = -1;
 +                        cj_ind_0 = cj_ind_first + ndirect;
 +                        cj_ind_1 = cj_ind_last + 1;
 +                        while (found == -1 && cj_ind_0 < cj_ind_1)
 +                        {
 +                            cj_ind_m = (cj_ind_0 + cj_ind_1)>>1;
 +
 +                            cj_m = nbl->cj[cj_ind_m].cj;
 +
 +                            if (se == cj_m)
 +                            {
 +                                found = cj_ind_m;
 +                            }
 +                            else if (se < cj_m)
 +                            {
 +                                cj_ind_1 = cj_ind_m;
 +                            }
 +                            else
 +                            {
 +                                cj_ind_0 = cj_ind_m + 1;
 +                            }
 +                        }
 +                    }
 +
 +                    if (found >= 0)
 +                    {
 +                        inner_i = i  - (si << na_ci_2log);
 +                        inner_e = ge - (se << na_cj_2log);
 +
 +                        nbl->cj[found].excl &= ~(1U<<((inner_i<<na_cj_2log) + inner_e));
 +                    }
 +                }
 +            }
 +        }
 +    }
 +}
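 +
 +/* Illustration of the two lookup paths above (hypothetical i-entry with
 + * ci = 10 and j-list cj = [10,11,12,20,24], so ndirect = 3): an excluded
 + * atom in cluster se = 11 lies within the contiguous range and is found
 + * directly as cj_ind_first + 11 - 10, while se = 20 falls outside it and
 + * is located by bisection over the sorted tail [20,24].
 + */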
 +
 +/* Set all atom-pair exclusions from the topology stored in excl
 + * as masks in the pair-list for i-super-cell entry nbl_sci
 + */
 +static void set_sci_top_excls(const nbnxn_search_t nbs,
 +                              nbnxn_pairlist_t *nbl,
 +                              gmx_bool diagRemoved,
 +                              int na_c_2log,
 +                              const nbnxn_sci_t *nbl_sci,
 +                              const t_blocka *excl)
 +{
 +    const int *cell;
 +    int na_c;
 +    int sci;
 +    int cj_ind_first,cj_ind_last;
 +    int cj_first,cj_last;
 +    int ndirect;
 +    int i,ai,aj,si,eind,ge,se;
 +    int found,cj_ind_0,cj_ind_1,cj_ind_m;
 +    int cj_m;
 +    nbnxn_excl_t *nbl_excl;
 +    int inner_i,inner_e,w;
 +
 +    cell = nbs->cell;
 +
 +    na_c = nbl->na_ci;
 +
 +    if (nbl_sci->cj4_ind_end == nbl_sci->cj4_ind_start)
 +    {
 +        /* Empty list */
 +        return;
 +    }
 +
 +    sci = nbl_sci->sci;
 +
 +    cj_ind_first = nbl_sci->cj4_ind_start*NBNXN_GPU_JGROUP_SIZE;
 +    cj_ind_last  = nbl->work->cj_ind - 1;
 +
 +    cj_first = nbl->cj4[nbl_sci->cj4_ind_start].cj[0];
 +    cj_last  = nbl_cj(nbl,cj_ind_last);
 +
 +    /* Determine how many contiguous j-clusters we have starting
 +     * from the first i-cluster. This number can be used to directly
 +     * calculate j-cluster indices for excluded atoms.
 +     */
 +    ndirect = 0;
 +    while (cj_ind_first + ndirect <= cj_ind_last &&
 +           nbl_cj(nbl,cj_ind_first+ndirect) == sci*GPU_NSUBCELL + ndirect)
 +    {
 +        ndirect++;
 +    }
 +
 +    /* Loop over the atoms in the i super-cell */
 +    for(i=0; i<nbl->na_sc; i++)
 +    {
 +        ai = nbs->a[sci*nbl->na_sc+i];
 +        if (ai >= 0)
 +        {
 +            si  = (i>>na_c_2log);
 +
 +            /* Loop over the topology-based exclusions for this i-atom */
 +            for(eind=excl->index[ai]; eind<excl->index[ai+1]; eind++)
 +            {
 +                aj = excl->a[eind];
 +
 +                if (aj == ai)
 +                {
 +                    /* The self exclusions are already set, save some time */
 +                    continue;
 +                }
 +
 +                ge = cell[aj];
 +
 +                /* Without shifts we only calculate interactions j>i
 +                 * for one-way pair-lists.
 +                 */
 +                if (diagRemoved && ge <= sci*nbl->na_sc + i)
 +                {
 +                    continue;
 +                }
 +
 +                se = ge>>na_c_2log;
 +                /* Could the cluster se be in our list? */
 +                if (se >= cj_first && se <= cj_last)
 +                {
 +                    if (se < cj_first + ndirect)
 +                    {
 +                        /* We can calculate cj_ind directly from se */
 +                        found = cj_ind_first + se - cj_first;
 +                    }
 +                    else
 +                    {
 +                        /* Search for se using bisection */
 +                        found = -1;
 +                        cj_ind_0 = cj_ind_first + ndirect;
 +                        cj_ind_1 = cj_ind_last + 1;
 +                        while (found == -1 && cj_ind_0 < cj_ind_1)
 +                        {
 +                            cj_ind_m = (cj_ind_0 + cj_ind_1)>>1;
 +
 +                            cj_m = nbl_cj(nbl,cj_ind_m);
 +
 +                            if (se == cj_m)
 +                            {
 +                                found = cj_ind_m;
 +                            }
 +                            else if (se < cj_m)
 +                            {
 +                                cj_ind_1 = cj_ind_m;
 +                            }
 +                            else
 +                            {
 +                                cj_ind_0 = cj_ind_m + 1;
 +                            }
 +                        }
 +                    }
 +
 +                    if (found >= 0)
 +                    {
 +                        inner_i = i  - si*na_c;
 +                        inner_e = ge - se*na_c;
 +
 +/* Macro for getting the index of a cj entry within its cj4 group */
 +#define AMODCJ4(a)  ((a) & (NBNXN_GPU_JGROUP_SIZE - 1))
 +/* Macro for converting a cj index to a cj4 group index */
 +#define A2CJ4(a)    ((a) >> NBNXN_GPU_JGROUP_SIZE_2LOG)
 +/* Macro for getting the index of an atom within a warp's half of a cluster */
 +#define AMODWI(a)   ((a) & (NBNXN_GPU_CLUSTER_SIZE/2 - 1))
 +
 +                        if (nbl_imask0(nbl,found) & (1U << (AMODCJ4(found)*GPU_NSUBCELL + si)))
 +                        {
 +                            w       = (inner_e >> 2);
 +
 +                            get_nbl_exclusions_1(nbl,A2CJ4(found),w,&nbl_excl);
 +
 +                            nbl_excl->pair[AMODWI(inner_e)*nbl->na_ci+inner_i] &=
 +                                ~(1U << (AMODCJ4(found)*GPU_NSUBCELL + si));
 +                        }
 +
 +#undef AMODCJ4
 +#undef A2CJ4
 +#undef AMODWI
 +                    }
 +                }
 +            }
 +        }
 +    }
 +}
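 +
 +/* A worked example of the index macros above (a sketch, assuming
 + * NBNXN_GPU_JGROUP_SIZE == 4 and NBNXN_GPU_CLUSTER_SIZE == 8):
 + *   AMODCJ4(13) == (13 & 3)  == 1
 + *   A2CJ4(13)   == (13 >> 2) == 3
 + *   AMODWI(6)   == (6 & 3)   == 2
 + * so j-list entry 13 sits in cj4 group 3 at slot 1, and a j-atom with
 + * inner_e = 6 falls in warp w = 6 >> 2 = 1 at within-warp index 2.
 + */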
 +
 +/* Reallocate the simple ci list for at least n entries */
 +static void nb_realloc_ci(nbnxn_pairlist_t *nbl,int n)
 +{
 +    nbl->ci_nalloc = over_alloc_small(n);
 +    nbnxn_realloc_void((void **)&nbl->ci,
 +                       nbl->nci*sizeof(*nbl->ci),
 +                       nbl->ci_nalloc*sizeof(*nbl->ci),
 +                       nbl->alloc,nbl->free);
 +}
 +
 +/* Reallocate the super-cell sci list for at least n entries */
 +static void nb_realloc_sci(nbnxn_pairlist_t *nbl,int n)
 +{
 +    nbl->sci_nalloc = over_alloc_small(n);
 +    nbnxn_realloc_void((void **)&nbl->sci,
 +                       nbl->nsci*sizeof(*nbl->sci),
 +                       nbl->sci_nalloc*sizeof(*nbl->sci),
 +                       nbl->alloc,nbl->free);
 +}
 +
 +/* Make a new ci entry at index nbl->nci */
 +static void new_ci_entry(nbnxn_pairlist_t *nbl,int ci,int shift,int flags,
 +                         nbnxn_list_work_t *work)
 +{
 +    if (nbl->nci + 1 > nbl->ci_nalloc)
 +    {
 +        nb_realloc_ci(nbl,nbl->nci+1);
 +    }
 +    nbl->ci[nbl->nci].ci            = ci;
 +    nbl->ci[nbl->nci].shift         = shift;
 +    /* Store the interaction flags along with the shift */
 +    nbl->ci[nbl->nci].shift        |= flags;
 +    nbl->ci[nbl->nci].cj_ind_start  = nbl->ncj;
 +    nbl->ci[nbl->nci].cj_ind_end    = nbl->ncj;
 +}
 +
 +/* Make a new sci entry at index nbl->nsci */
 +static void new_sci_entry(nbnxn_pairlist_t *nbl,int sci,int shift,int flags,
 +                          nbnxn_list_work_t *work)
 +{
 +    if (nbl->nsci + 1 > nbl->sci_nalloc)
 +    {
 +        nb_realloc_sci(nbl,nbl->nsci+1);
 +    }
 +    nbl->sci[nbl->nsci].sci           = sci;
 +    nbl->sci[nbl->nsci].shift         = shift;
 +    nbl->sci[nbl->nsci].cj4_ind_start = nbl->ncj4;
 +    nbl->sci[nbl->nsci].cj4_ind_end   = nbl->ncj4;
 +}
 +
 +/* Sort the simple j-list cj on exclusions.
 + * Entries with exclusions will all be sorted to the beginning of the list.
 + */
 +static void sort_cj_excl(nbnxn_cj_t *cj,int ncj,
 +                         nbnxn_list_work_t *work)
 +{
 +    int jnew,j;
 +
 +    if (ncj > work->cj_nalloc)
 +    {
 +        work->cj_nalloc = over_alloc_large(ncj);
 +        srenew(work->cj,work->cj_nalloc);
 +    }
 +
 +    /* Make a list of the j-cells involving exclusions */
 +    jnew = 0;
 +    for(j=0; j<ncj; j++)
 +    {
 +        if (cj[j].excl != NBNXN_INT_MASK_ALL)
 +        {
 +            work->cj[jnew++] = cj[j];
 +        }
 +    }
 +    /* Only reorder if there are exclusions, and not only in the first entry */
 +    if (!((jnew == 0) ||
 +          (jnew == 1 && cj[0].excl != NBNXN_INT_MASK_ALL)))
 +    {
 +        for(j=0; j<ncj; j++)
 +        {
 +            if (cj[j].excl == NBNXN_INT_MASK_ALL)
 +            {
 +                work->cj[jnew++] = cj[j];
 +            }
 +        }
 +        for(j=0; j<ncj; j++)
 +        {
 +            cj[j] = work->cj[j];
 +        }
 +    }
 +}
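 +
 +/* Illustration of the reordering above (hypothetical 4-entry list,
 + * where E marks an entry with cj[].excl != NBNXN_INT_MASK_ALL and
 + * '-' one without):
 + *   before:  [ -, E, -, E ]
 + *   after:   [ E, E, -, - ]
 + * The relative order within each group is preserved, and the copy back
 + * is skipped when no entry, or only the first one, has exclusions.
 + */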
 +
 +/* Close this simple list i entry */
 +static void close_ci_entry_simple(nbnxn_pairlist_t *nbl)
 +{
 +    int jlen;
 +
 +    /* All content of the new ci entry has already been filled correctly,
 +     * we only need to increase the count here (for non-empty lists).
 +     */
 +    jlen = nbl->ci[nbl->nci].cj_ind_end - nbl->ci[nbl->nci].cj_ind_start;
 +    if (jlen > 0)
 +    {
 +        sort_cj_excl(nbl->cj+nbl->ci[nbl->nci].cj_ind_start,jlen,nbl->work);
 +
 +        /* The counts below are used for non-bonded pair/flop counts
 +         * and should therefore match the available kernel setups.
 +         */
 +        if (!(nbl->ci[nbl->nci].shift & NBNXN_CI_DO_COUL(0)))
 +        {
 +            nbl->work->ncj_noq += jlen;
 +        }
 +        else if ((nbl->ci[nbl->nci].shift & NBNXN_CI_HALF_LJ(0)) ||
 +                 !(nbl->ci[nbl->nci].shift & NBNXN_CI_DO_LJ(0)))
 +        {
 +            nbl->work->ncj_hlj += jlen;
 +        }
 +
 +        nbl->nci++;
 +    }
 +}
 +
 +/* Split sci entry for load balancing on the GPU.
 + * As we only know the current count on our own thread,
 + * we need to estimate the current total number of i-entries.
 + * As the lists get concatenated later, this estimate depends
 + * both on nthread and on our own thread index thread.
 + */
 +static void split_sci_entry(nbnxn_pairlist_t *nbl,
 +                            int nsp_max_av,gmx_bool progBal,int nc_bal,
 +                            int thread,int nthread)
 +{
 +    int nsci_est;
 +    int nsp_max;
 +    int cj4_start,cj4_end,j4len,cj4;
 +    int sci;
 +    int nsp,nsp_sci,nsp_cj4,nsp_cj4_e,nsp_cj4_p;
 +    int p;
 +
 +    /* Estimate the total number of ci's of the nblist combined
 +     * over all threads, using the target number of ci's.
 +     */
 +    nsci_est = nc_bal*thread/nthread + nbl->nsci;
 +    if (progBal)
 +    {
 +        /* The first ci blocks should be larger, to avoid overhead.
 +         * The last ci blocks should be smaller, to improve load balancing.
 +         */
 +        nsp_max = max(1,
 +                      nsp_max_av*nc_bal*3/(2*(nsci_est - 1 + nc_bal)));
 +    }
 +    else
 +    {
 +        nsp_max = nsp_max_av;
 +    }
 +
 +    cj4_start = nbl->sci[nbl->nsci-1].cj4_ind_start;
 +    cj4_end   = nbl->sci[nbl->nsci-1].cj4_ind_end;
 +    j4len = cj4_end - cj4_start;
 +
 +    if (j4len > 1 && j4len*GPU_NSUBCELL*NBNXN_GPU_JGROUP_SIZE > nsp_max)
 +    {
 +        /* Remove the last ci entry and process the cj4's again */
 +        nbl->nsci -= 1;
 +
 +        sci        = nbl->nsci;
 +        cj4        = cj4_start;
 +        nsp        = 0;
 +        nsp_sci    = 0;
 +        nsp_cj4_e  = 0;
 +        nsp_cj4    = 0;
 +        while (cj4 < cj4_end)
 +        {
 +            nsp_cj4_p = nsp_cj4;
 +            nsp_cj4   = 0;
 +            for(p=0; p<GPU_NSUBCELL*NBNXN_GPU_JGROUP_SIZE; p++)
 +            {
 +                nsp_cj4 += (nbl->cj4[cj4].imei[0].imask >> p) & 1;
 +            }
 +            nsp += nsp_cj4;
 +
 +            if (nsp > nsp_max && nsp > nsp_cj4)
 +            {
 +                nbl->sci[sci].cj4_ind_end = cj4;
 +                sci++;
 +                nbl->nsci++;
 +                if (nbl->nsci+1 > nbl->sci_nalloc)
 +                {
 +                    nb_realloc_sci(nbl,nbl->nsci+1);
 +                }
 +                nbl->sci[sci].sci           = nbl->sci[nbl->nsci-1].sci;
 +                nbl->sci[sci].shift         = nbl->sci[nbl->nsci-1].shift;
 +                nbl->sci[sci].cj4_ind_start = cj4;
 +                nsp_sci   = nsp - nsp_cj4;
 +                nsp_cj4_e = nsp_cj4_p;
 +                nsp       = nsp_cj4;
 +            }
 +
 +            cj4++;
 +        }
 +
 +        /* Put the remaining cj4's in a new ci entry */
 +        nbl->sci[sci].cj4_ind_end = cj4_end;
 +
 +        /* Possibly balance out the last two ci's
 +         * by moving the last cj4 of the second last ci.
 +         */
 +        if (nsp_sci - nsp_cj4_e >= nsp + nsp_cj4_e)
 +        {
 +            nbl->sci[sci-1].cj4_ind_end--;
 +            nbl->sci[sci].cj4_ind_start--;
 +        }
 +
 +        sci++;
 +        nbl->nsci++;
 +    }
 +}
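 +
 +/* A numeric check of the progressive balancing above (hypothetical
 + * values nsp_max_av = 40 and nc_bal = 100): early on, nsci_est ~ 1
 + * gives nsp_max = 40*100*3/(2*(1 - 1 + 100)) = 60, while late in the
 + * search nsci_est ~ 200 gives nsp_max = 40*100*3/(2*(200 - 1 + 100)) = 20,
 + * so the first sci blocks may hold ~1.5x the average pair count and the
 + * last ones ~0.5x, as intended.
 + */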
 +
 +/* Close this super/sub list i entry */
 +static void close_ci_entry_supersub(nbnxn_pairlist_t *nbl,
 +                                    int nsp_max_av,
 +                                    gmx_bool progBal,int nc_bal,
 +                                    int thread,int nthread)
 +{
 +    int j4len,tlen;
 +    int nb,b;
 +
 +    /* All content of the new ci entry has already been filled correctly,
 +     * we only need to increase the count here (for non-empty lists).
 +     */
 +    j4len = nbl->sci[nbl->nsci].cj4_ind_end - nbl->sci[nbl->nsci].cj4_ind_start;
 +    if (j4len > 0)
 +    {
 +        /* We can only have complete blocks of 4 j-entries in a list,
 +         * so round the count up before closing.
 +         */
 +        nbl->ncj4         = ((nbl->work->cj_ind + NBNXN_GPU_JGROUP_SIZE - 1) >> NBNXN_GPU_JGROUP_SIZE_2LOG);
 +        nbl->work->cj_ind = nbl->ncj4*NBNXN_GPU_JGROUP_SIZE;
 +
 +        nbl->nsci++;
 +
 +        if (nsp_max_av > 0)
 +        {
 +            split_sci_entry(nbl,nsp_max_av,progBal,nc_bal,thread,nthread);
 +        }
 +    }
 +}
 +
 +/* Syncs the working array before adding another grid pair to the list */
 +static void sync_work(nbnxn_pairlist_t *nbl)
 +{
 +    if (!nbl->bSimple)
 +    {
 +        nbl->work->cj_ind   = nbl->ncj4*NBNXN_GPU_JGROUP_SIZE;
 +        nbl->work->cj4_init = nbl->ncj4;
 +    }
 +}
 +
 +/* Clears an nbnxn_pairlist_t data structure */
 +static void clear_pairlist(nbnxn_pairlist_t *nbl)
 +{
 +    nbl->nci           = 0;
 +    nbl->nsci          = 0;
 +    nbl->ncj           = 0;
 +    nbl->ncj4          = 0;
 +    nbl->nci_tot       = 0;
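 +    /* Keep one excl entry: entry 0 appears to serve as the shared
 +     * default (no exclusions) entry, so it must not be cleared away.
 +     */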
 +    nbl->nexcl         = 1;
 +
 +    nbl->work->ncj_noq = 0;
 +    nbl->work->ncj_hlj = 0;
 +}
 +
 +/* Sets a simple list i-cell bounding box, including PBC shift */
 +static void set_icell_bb_simple(const float *bb,int ci,
 +                                real shx,real shy,real shz,
 +                                float *bb_ci)
 +{
 +    int ia;
 +
 +    ia = ci*NNBSBB_B;
 +    bb_ci[BBL_X] = bb[ia+BBL_X] + shx;
 +    bb_ci[BBL_Y] = bb[ia+BBL_Y] + shy;
 +    bb_ci[BBL_Z] = bb[ia+BBL_Z] + shz;
 +    bb_ci[BBU_X] = bb[ia+BBU_X] + shx;
 +    bb_ci[BBU_Y] = bb[ia+BBU_Y] + shy;
 +    bb_ci[BBU_Z] = bb[ia+BBU_Z] + shz;
 +}
 +
 +/* Sets the super-cell and sub-cell bounding boxes, including PBC shift */
 +static void set_icell_bb_supersub(const float *bb,int ci,
 +                                  real shx,real shy,real shz,
 +                                  float *bb_ci)
 +{
 +    int ia,m,i;
 +
 +#ifdef NBNXN_BBXXXX
 +    ia = ci*(GPU_NSUBCELL>>STRIDE_PBB_2LOG)*NNBSBB_XXXX;
 +    for(m=0; m<(GPU_NSUBCELL>>STRIDE_PBB_2LOG)*NNBSBB_XXXX; m+=NNBSBB_XXXX)
 +    {
 +        for(i=0; i<STRIDE_PBB; i++)
 +        {
 +            bb_ci[m+0*STRIDE_PBB+i] = bb[ia+m+0*STRIDE_PBB+i] + shx;
 +            bb_ci[m+1*STRIDE_PBB+i] = bb[ia+m+1*STRIDE_PBB+i] + shy;
 +            bb_ci[m+2*STRIDE_PBB+i] = bb[ia+m+2*STRIDE_PBB+i] + shz;
 +            bb_ci[m+3*STRIDE_PBB+i] = bb[ia+m+3*STRIDE_PBB+i] + shx;
 +            bb_ci[m+4*STRIDE_PBB+i] = bb[ia+m+4*STRIDE_PBB+i] + shy;
 +            bb_ci[m+5*STRIDE_PBB+i] = bb[ia+m+5*STRIDE_PBB+i] + shz;
 +        }
 +    }
 +#else
 +    ia = ci*GPU_NSUBCELL*NNBSBB_B;
 +    for(i=0; i<GPU_NSUBCELL*NNBSBB_B; i+=NNBSBB_B)
 +    {
 +        bb_ci[i+BBL_X] = bb[ia+i+BBL_X] + shx;
 +        bb_ci[i+BBL_Y] = bb[ia+i+BBL_Y] + shy;
 +        bb_ci[i+BBL_Z] = bb[ia+i+BBL_Z] + shz;
 +        bb_ci[i+BBU_X] = bb[ia+i+BBU_X] + shx;
 +        bb_ci[i+BBU_Y] = bb[ia+i+BBU_Y] + shy;
 +        bb_ci[i+BBU_Z] = bb[ia+i+BBU_Z] + shz;
 +    }
 +#endif
 +}
 +
 +/* Copies PBC shifted i-cell atom coordinates x,y,z to working array */
 +static void icell_set_x_simple(int ci,
 +                               real shx,real shy,real shz,
 +                               int na_c,
 +                               int stride,const real *x,
 +                               nbnxn_list_work_t *work)
 +{
 +    int  ia,i;
 +
 +    ia = ci*NBNXN_CPU_CLUSTER_I_SIZE;
 +
 +    for(i=0; i<NBNXN_CPU_CLUSTER_I_SIZE; i++)
 +    {
 +        work->x_ci[i*STRIDE_XYZ+XX] = x[(ia+i)*stride+XX] + shx;
 +        work->x_ci[i*STRIDE_XYZ+YY] = x[(ia+i)*stride+YY] + shy;
 +        work->x_ci[i*STRIDE_XYZ+ZZ] = x[(ia+i)*stride+ZZ] + shz;
 +    }
 +}
 +
 +/* Copies PBC shifted super-cell atom coordinates x,y,z to working array */
 +static void icell_set_x_supersub(int ci,
 +                                 real shx,real shy,real shz,
 +                                 int na_c,
 +                                 int stride,const real *x,
 +                                 nbnxn_list_work_t *work)
 +{
 +    int  ia,i;
 +    real *x_ci;
 +
 +    x_ci = work->x_ci;
 +
 +    ia = ci*GPU_NSUBCELL*na_c;
 +    for(i=0; i<GPU_NSUBCELL*na_c; i++)
 +    {
 +        x_ci[i*DIM + XX] = x[(ia+i)*stride + XX] + shx;
 +        x_ci[i*DIM + YY] = x[(ia+i)*stride + YY] + shy;
 +        x_ci[i*DIM + ZZ] = x[(ia+i)*stride + ZZ] + shz;
 +    }
 +}
 +
 +#ifdef NBNXN_SEARCH_BB_SSE
 +/* Copies PBC shifted super-cell packed atom coordinates to working array */
 +static void icell_set_x_supersub_sse8(int ci,
 +                                      real shx,real shy,real shz,
 +                                      int na_c,
 +                                      int stride,const real *x,
 +                                      nbnxn_list_work_t *work)
 +{
 +    int  si,io,ia,i,j;
 +    real *x_ci;
 +
 +    x_ci = work->x_ci;
 +
 +    for(si=0; si<GPU_NSUBCELL; si++)
 +    {
 +        for(i=0; i<na_c; i+=STRIDE_PBB)
 +        {
 +            io = si*na_c + i;
 +            ia = ci*GPU_NSUBCELL*na_c + io;
 +            for(j=0; j<STRIDE_PBB; j++)
 +            {
 +                x_ci[io*DIM + j + XX*STRIDE_PBB] = x[(ia+j)*stride+XX] + shx;
 +                x_ci[io*DIM + j + YY*STRIDE_PBB] = x[(ia+j)*stride+YY] + shy;
 +                x_ci[io*DIM + j + ZZ*STRIDE_PBB] = x[(ia+j)*stride+ZZ] + shz;
 +            }
 +        }
 +    }
 +}
 +#endif
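 +
 +/* Illustration of the packed layout produced above (a sketch, assuming
 + * STRIDE_PBB == 4 and na_c a multiple of 4): each group of 4 atoms is
 + * stored as
 + *   x0 x1 x2 x3  y0 y1 y2 y3  z0 z1 z2 z3
 + * i.e. 12 floats per group, so an aligned SIMD load of 4 consecutive
 + * values fetches one coordinate component for 4 atoms at once.
 + */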
 +
 +static real nbnxn_rlist_inc_nonloc_fac = 0.6;
 +
 +/* Due to the cluster size, the effective pair-list cut-off is longer than
 + * that of a simple atom pair-list. This function gives the extra distance.
 + */
 +real nbnxn_get_rlist_effective_inc(int cluster_size,real atom_density)
 +{
 +    return ((0.5 + nbnxn_rlist_inc_nonloc_fac)*sqr(((cluster_size) - 1.0)/(cluster_size))*pow((cluster_size)/(atom_density),1.0/3.0));
 +}
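 +
 +/* A worked example of the increment above (hypothetical inputs):
 + * cluster_size = 4 and atom_density = 100 atoms per unit volume give
 + *   (0.5 + 0.6)*sqr(3.0/4.0)*pow(4.0/100.0,1.0/3.0)
 + *   = 1.1*0.5625*0.342 ~ 0.21
 + * in the same length unit as pow(1/atom_density,1/3).
 + */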
 +
 +/* Estimates the interaction volume^2 for non-local interactions */
 +static real nonlocal_vol2(const gmx_domdec_zones_t *zones,rvec ls,real r)
 +{
 +    int  z,d;
 +    real cl,ca,za;
 +    real vold_est;
 +    real vol2_est_tot;
 +
 +    vol2_est_tot = 0;
 +
 +    /* Here we simply add up the non-home interaction volume^2
 +     * of the 1, 2 or 3 1D decomposition zones. As these volumes
 +     * are not additive, this is an overestimate, but it would only
 +     * be significant in the limit of small cells, where we anyhow
 +     * need to split the lists into as small parts as possible.
 +     */
 +
 +    for(z=0; z<zones->n; z++)
 +    {
 +        if (zones->shift[z][XX] + zones->shift[z][YY] + zones->shift[z][ZZ] == 1)
 +        {
 +            cl = 0;
 +            ca = 1;
 +            za = 1;
 +            for(d=0; d<DIM; d++)
 +            {
 +                if (zones->shift[z][d] == 0)
 +                {
 +                    cl += 0.5*ls[d];
 +                    ca *= ls[d];
 +                    za *= zones->size[z].x1[d] - zones->size[z].x0[d];
 +                }
 +            }
 +
 +            /* 4 octants of a sphere */
 +            vold_est  = 0.25*M_PI*r*r*r*r;
 +            /* 4 quarter pie slices on the edges */
 +            vold_est += 4*cl*M_PI/6.0*r*r*r;
 +            /* One rectangular volume on a face */
 +            vold_est += ca*0.5*r*r;
 +
 +            vol2_est_tot += vold_est*za;
 +        }
 +    }
 +
 +    return vol2_est_tot;
 +}
 +
 +/* Estimates the average size of a full j-list for super/sub setup */
 +static int get_nsubpair_max(const nbnxn_search_t nbs,
 +                            int iloc,
 +                            real rlist,
 +                            int min_ci_balanced)
 +{
 +    const nbnxn_grid_t *grid;
 +    rvec ls;
 +    real xy_diag2,r_eff_sup,vol_est,nsp_est,nsp_est_nl;
 +    int  nsubpair_max;
 +
 +    grid = &nbs->grid[0];
 +
 +    ls[XX] = (grid->c1[XX] - grid->c0[XX])/(grid->ncx*GPU_NSUBCELL_X);
 +    ls[YY] = (grid->c1[YY] - grid->c0[YY])/(grid->ncy*GPU_NSUBCELL_Y);
 +    ls[ZZ] = (grid->c1[ZZ] - grid->c0[ZZ])*grid->ncx*grid->ncy/(grid->nc*GPU_NSUBCELL_Z);
 +
 +    /* The average squared length of the diagonal of a sub cell */
 +    xy_diag2 = ls[XX]*ls[XX] + ls[YY]*ls[YY] + ls[ZZ]*ls[ZZ];
 +
 +    /* The formulas below give a heuristic estimate of the average nsj per si */
 +    r_eff_sup = rlist + nbnxn_rlist_inc_nonloc_fac*sqr((grid->na_c - 1.0)/grid->na_c)*sqrt(xy_diag2/3);
 +
 +    if (!nbs->DomDec || nbs->zones->n == 1)
 +    {
 +        nsp_est_nl = 0;
 +    }
 +    else
 +    {
 +        nsp_est_nl =
 +            sqr(grid->atom_density/grid->na_c)*
 +            nonlocal_vol2(nbs->zones,ls,r_eff_sup);
 +    }
 +
 +    if (LOCAL_I(iloc))
 +    {
 +        /* Sub-cell interacts with itself */
 +        vol_est  = ls[XX]*ls[YY]*ls[ZZ];
 +        /* 6/2 rectangular volume on the faces */
 +        vol_est += (ls[XX]*ls[YY] + ls[XX]*ls[ZZ] + ls[YY]*ls[ZZ])*r_eff_sup;
 +        /* 12/2 quarter pie slices on the edges */
 +        vol_est += 2*(ls[XX] + ls[YY] + ls[ZZ])*0.25*M_PI*sqr(r_eff_sup);
 +        /* 4 octants of a sphere */
 +        vol_est += 0.5*4.0/3.0*M_PI*pow(r_eff_sup,3);
 +
 +        nsp_est = grid->nsubc_tot*vol_est*grid->atom_density/grid->na_c;
 +
 +        /* Subtract the non-local pair count */
 +        nsp_est -= nsp_est_nl;
 +
 +        if (debug)
 +        {
 +            fprintf(debug,"nsp_est local %5.1f non-local %5.1f\n",
 +                    nsp_est,nsp_est_nl);
 +        }
 +    }
 +    else
 +    {
 +        nsp_est = nsp_est_nl;
 +    }
 +
 +    if (min_ci_balanced <= 0 || grid->nc >= min_ci_balanced || grid->nc == 0)
 +    {
 +        /* We don't need to worry */
 +        nsubpair_max = -1;
 +    }
 +    else
 +    {
 +        /* Thus the (average) maximum j-list size should be as follows */
 +        nsubpair_max = max(1,(int)(nsp_est/min_ci_balanced+0.5));
 +
 +        /* Since the target value is a maximum (this avoids high outliers,
 +         * which lead to load imbalance), not an average, we get more lists
 +         * than we ask for (to compensate we need to add GPU_NSUBCELL*4/4).
 +         * But more importantly, the optimal GPU performance moves
 +         * to a lower number of blocks for very small blocks.
 +         * To compensate we add the maximum pair count per cj4.
 +         */
 +        nsubpair_max += GPU_NSUBCELL*NBNXN_CPU_CLUSTER_I_SIZE;
 +    }
 +
 +    if (debug)
 +    {
 +        fprintf(debug,"nbl nsp estimate %.1f, nsubpair_max %d\n",
 +                nsp_est,nsubpair_max);
 +    }
 +
 +    return nsubpair_max;
 +}
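 +
 +/* A numeric sketch of the balancing branch above (hypothetical values,
 + * assuming GPU_NSUBCELL == 8 and NBNXN_CPU_CLUSTER_I_SIZE == 4):
 + * nsp_est = 60000 with min_ci_balanced = 600 gives
 + *   nsubpair_max = max(1,(int)(60000/600 + 0.5)) + 8*4 = 100 + 32 = 132.
 + */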
 +
 +/* Debug list print function */
 +static void print_nblist_ci_cj(FILE *fp,const nbnxn_pairlist_t *nbl)
 +{
 +    int i,j;
 +
 +    for(i=0; i<nbl->nci; i++)
 +    {
 +        fprintf(fp,"ci %4d  shift %2d  ncj %3d\n",
 +                nbl->ci[i].ci,nbl->ci[i].shift,
 +                nbl->ci[i].cj_ind_end - nbl->ci[i].cj_ind_start);
 +
 +        for(j=nbl->ci[i].cj_ind_start; j<nbl->ci[i].cj_ind_end; j++)
 +        {
 +            fprintf(fp,"  cj %5d  imask %x\n",
 +                    nbl->cj[j].cj,
 +                    nbl->cj[j].excl);
 +        }
 +    }
 +}
 +
 +/* Debug list print function */
 +static void print_nblist_sci_cj(FILE *fp,const nbnxn_pairlist_t *nbl)
 +{
 +    int i,j4,j;
 +
 +    for(i=0; i<nbl->nsci; i++)
 +    {
 +        fprintf(fp,"ci %4d  shift %2d  ncj4 %2d\n",
 +                nbl->sci[i].sci,nbl->sci[i].shift,
 +                nbl->sci[i].cj4_ind_end - nbl->sci[i].cj4_ind_start);
 +
 +        for(j4=nbl->sci[i].cj4_ind_start; j4<nbl->sci[i].cj4_ind_end; j4++)
 +        {
 +            for(j=0; j<NBNXN_GPU_JGROUP_SIZE; j++)
 +            {
 +                fprintf(fp,"  sj %5d  imask %x\n",
 +                        nbl->cj4[j4].cj[j],
 +                        nbl->cj4[j4].imei[0].imask);
 +            }
 +        }
 +    }
 +}
 +
 +/* Combine pair lists *nbl generated on multiple threads into nblc */
 +static void combine_nblists(int nnbl,nbnxn_pairlist_t **nbl,
 +                            nbnxn_pairlist_t *nblc)
 +{
 +    int nsci,ncj4,nexcl;
 +    int n,i;
 +
 +    if (nblc->bSimple)
 +    {
 +        gmx_incons("combine_nblists does not support simple lists");
 +    }
 +
 +    nsci  = nblc->nsci;
 +    ncj4  = nblc->ncj4;
 +    nexcl = nblc->nexcl;
 +    for(i=0; i<nnbl; i++)
 +    {
 +        nsci  += nbl[i]->nsci;
 +        ncj4  += nbl[i]->ncj4;
 +        nexcl += nbl[i]->nexcl;
 +    }
 +
 +    if (nsci > nblc->sci_nalloc)
 +    {
 +        nb_realloc_sci(nblc,nsci);
 +    }
 +    if (ncj4 > nblc->cj4_nalloc)
 +    {
 +        nblc->cj4_nalloc = over_alloc_small(ncj4);
 +        nbnxn_realloc_void((void **)&nblc->cj4,
 +                           nblc->ncj4*sizeof(*nblc->cj4),
 +                           nblc->cj4_nalloc*sizeof(*nblc->cj4),
 +                           nblc->alloc,nblc->free);
 +    }
 +    if (nexcl > nblc->excl_nalloc)
 +    {
 +        nblc->excl_nalloc = over_alloc_small(nexcl);
 +        nbnxn_realloc_void((void **)&nblc->excl,
 +                           nblc->nexcl*sizeof(*nblc->excl),
 +                           nblc->excl_nalloc*sizeof(*nblc->excl),
 +                           nblc->alloc,nblc->free);
 +    }
 +
 +    /* Each thread should copy its own data to the combined arrays,
 +     * as otherwise data will go back and forth between different caches.
 +     */
 +#pragma omp parallel for num_threads(gmx_omp_nthreads_get(emntPairsearch)) schedule(static)
 +    for(n=0; n<nnbl; n++)
 +    {
 +        int sci_offset;
 +        int cj4_offset;
 +        int ci_offset;
 +        int excl_offset;
 +        int i,j4;
 +        const nbnxn_pairlist_t *nbli;
 +
 +        /* Determine the offset in the combined data for our thread */
 +        sci_offset  = nblc->nsci;
 +        cj4_offset  = nblc->ncj4;
 +        ci_offset   = nblc->nci_tot;
 +        excl_offset = nblc->nexcl;
 +
 +        for(i=0; i<n; i++)
 +        {
 +            sci_offset  += nbl[i]->nsci;
 +            cj4_offset  += nbl[i]->ncj4;
 +            ci_offset   += nbl[i]->nci_tot;
 +            excl_offset += nbl[i]->nexcl;
 +        }
 +
 +        nbli = nbl[n];
 +
 +        for(i=0; i<nbli->nsci; i++)
 +        {
 +            nblc->sci[sci_offset+i]                = nbli->sci[i];
 +            nblc->sci[sci_offset+i].cj4_ind_start += cj4_offset;
 +            nblc->sci[sci_offset+i].cj4_ind_end   += cj4_offset;
 +        }
 +
 +        for(j4=0; j4<nbli->ncj4; j4++)
 +        {
 +            nblc->cj4[cj4_offset+j4] = nbli->cj4[j4];
 +            nblc->cj4[cj4_offset+j4].imei[0].excl_ind += excl_offset;
 +            nblc->cj4[cj4_offset+j4].imei[1].excl_ind += excl_offset;
 +        }
 +
 +        for(j4=0; j4<nbli->nexcl; j4++)
 +        {
 +            nblc->excl[excl_offset+j4] = nbli->excl[j4];
 +        }
 +    }
 +
 +    for(n=0; n<nnbl; n++)
 +    {
 +        nblc->nsci    += nbl[n]->nsci;
 +        nblc->ncj4    += nbl[n]->ncj4;
 +        nblc->nci_tot += nbl[n]->nci_tot;
 +        nblc->nexcl   += nbl[n]->nexcl;
 +    }
 +}
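 +
 +/* Illustration of the offset computation above (hypothetical two-thread
 + * case): with nblc->nsci = 5 and nbl[0]->nsci = 3, thread 0 copies its
 + * sci entries to indices 5..7 and thread 1 starts at index 8; each
 + * thread's offset is the combined count plus the counts of all
 + * lower-numbered threads, so the parallel copies never overlap.
 + */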
 +
 +/* Returns the next ci to be processed by our thread */
 +static gmx_bool next_ci(const nbnxn_grid_t *grid,
 +                        int conv,
 +                        int nth,int ci_block,
 +                        int *ci_x,int *ci_y,
 +                        int *ci_b,int *ci)
 +{
 +    (*ci_b)++;
 +    (*ci)++;
 +
 +    if (*ci_b == ci_block)
 +    {
 +        /* Jump to the next block assigned to this task */
 +        *ci   += (nth - 1)*ci_block;
 +        *ci_b  = 0;
 +    }
 +
 +    if (*ci >= grid->nc*conv)
 +    {
 +        return FALSE;
 +    }
 +
 +    while (*ci >= grid->cxy_ind[*ci_x*grid->ncy + *ci_y + 1]*conv)
 +    {
 +        *ci_y += 1;
 +        if (*ci_y == grid->ncy)
 +        {
 +            *ci_x += 1;
 +            *ci_y  = 0;
 +        }
 +    }
 +
 +    return TRUE;
 +}
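 +
 +/* Illustration of the block-cyclic iteration above (hypothetical
 + * nth = 4, ci_block = 2, conv = 1): thread th starts at ci = th*ci_block,
 + * so thread 0 visits ci 0,1, then jumps by (nth - 1)*ci_block = 6 to
 + * visit 8,9, then 16,17, ..., while thread 1 visits 2,3, 10,11, ...
 + */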
 +
 +/* Returns the distance^2 for which we put cell pairs in the list
 + * without checking atom pair distances. This is usually < rlist^2.
 + */
 +static float boundingbox_only_distance2(const nbnxn_grid_t *gridi,
 +                                        const nbnxn_grid_t *gridj,
 +                                        real rlist,
 +                                        gmx_bool simple)
 +{
 +    /* If the distance between two sub-cell bounding boxes is less
 +     * than this distance, do not check the distance between
 +     * all particle pairs in the sub-cell, since then it is likely
 +     * that the box pair has atom pairs within the cut-off.
 +     * We use the nblist cut-off minus 0.5 times the average x/y diagonal
 +     * spacing of the sub-cells. Around 40% of the checked pairs are pruned.
 +     * Using more than 0.5 gains at most 0.5%.
 +     * If forces are calculated more than twice, the performance gain
 +     * in the force calculation outweighs the cost of checking.
 +     * Note that with sub-cell lists the atom-pair distance check
 +     * is only performed when just 1 out of 8 sub-cells is within range,
 +     * as the GPU is much faster than the CPU.
 +     */
 +    real bbx,bby;
 +    real rbb2;
 +
 +    bbx = 0.5*(gridi->sx + gridj->sx);
 +    bby = 0.5*(gridi->sy + gridj->sy);
 +    if (!simple)
 +    {
 +        bbx /= GPU_NSUBCELL_X;
 +        bby /= GPU_NSUBCELL_Y;
 +    }
 +
 +    rbb2 = sqr(max(0,rlist - 0.5*sqrt(bbx*bbx + bby*bby)));
 +
 +#ifndef GMX_DOUBLE
 +    return rbb2;
 +#else
 +    return (float)((1+GMX_FLOAT_EPS)*rbb2);
 +#endif
 +}
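 +
 +/* A worked example of rbb2 above (hypothetical simple-grid values):
 + * rlist = 1.0 with sub-cell sizes sx = sy = 0.3 on both grids gives
 + * bbx = bby = 0.3, so
 + *   rbb2 = sqr(1.0 - 0.5*sqrt(0.3*0.3 + 0.3*0.3)) ~ 0.62,
 + * i.e. box pairs with bb distance^2 below 0.62 go into the list
 + * without any atom-pair distance check.
 + */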
 +
 +static int get_ci_block_size(const nbnxn_grid_t *gridi,
 +                             gmx_bool bDomDec, int nth)
 +{
 +    const int ci_block_enum = 5;
 +    const int ci_block_denom = 11;
 +    const int ci_block_min_atoms = 16;
 +    int ci_block;
 +
 +    /* Here we decide how to distribute the blocks over the threads.
 +     * We use prime numbers to try to avoid that the grid size becomes
 +     * a multiple of the number of threads, which would lead to some
 +     * threads getting "inner" pairs and others getting boundary pairs,
 +     * which in turn will lead to load imbalance between threads.
 +     * Set the block size as 5/11/ntask times the average number of cells
 +     * in a y,z slab. This should ensure a quite uniform distribution
 +     * of the grid parts over the different threads along all three grid
 +     * zone boundaries with 3D domain decomposition. At the same time
 +     * the blocks will not become too small.
 +     */
 +    ci_block = (gridi->nc*ci_block_enum)/(ci_block_denom*gridi->ncx*nth);
 +
 +    /* Ensure the blocks are not too small: avoids cache invalidation */
 +    if (ci_block*gridi->na_sc < ci_block_min_atoms)
 +    {
 +        ci_block = (ci_block_min_atoms + gridi->na_sc - 1)/gridi->na_sc;
 +    }
 +
 +    /* Without domain decomposition
 +     * or with less than 3 blocks per task, divide in nth blocks.
 +     */
 +    if (!bDomDec || ci_block*3*nth > gridi->nc)
 +    {
 +        ci_block = (gridi->nc + nth - 1)/nth;
 +    }
 +
 +    return ci_block;
 +}
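 +
 +/* A numeric sketch of the block-size choice above (hypothetical grid:
 + * gridi->nc = 3000, gridi->ncx = 15, gridi->na_sc = 4, nth = 8):
 + *   ci_block = 3000*5/(11*15*8) = 11
 + * 11*4 = 44 atoms >= ci_block_min_atoms = 16, and with domain
 + * decomposition 11*3*8 = 264 <= 3000, so ci_block stays 11.
 + */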
 +
 +/* Generates the part of pair-list nbl assigned to our thread */
 +static void nbnxn_make_pairlist_part(const nbnxn_search_t nbs,
 +                                     const nbnxn_grid_t *gridi,
 +                                     const nbnxn_grid_t *gridj,
 +                                     nbnxn_search_work_t *work,
 +                                     const nbnxn_atomdata_t *nbat,
 +                                     const t_blocka *excl,
 +                                     real rlist,
 +                                     int nb_kernel_type,
 +                                     int ci_block,
 +                                     gmx_bool bFBufferFlag,
 +                                     int nsubpair_max,
 +                                     gmx_bool progBal,
 +                                     int min_ci_balanced,
 +                                     int th,int nth,
 +                                     nbnxn_pairlist_t *nbl)
 +{
 +    int  na_cj_2log;
 +    matrix box;
 +    real rl2;
 +    float rbb2;
 +    int  d;
 +    int  ci_b,ci,ci_x,ci_y,ci_xy,cj;
 +    ivec shp;
 +    int  tx,ty,tz;
 +    int  shift;
 +    gmx_bool bMakeList;
 +    real shx,shy,shz;
 +    int  conv_i,cell0_i;
 +    const float *bb_i,*bbcz_i,*bbcz_j;
 +    const int *flags_i;
 +    real bx0,bx1,by0,by1,bz0,bz1;
 +    real bz1_frac;
 +    real d2cx,d2z,d2z_cx,d2z_cy,d2zx,d2zxy,d2xy;
 +    int  cxf,cxl,cyf,cyf_x,cyl;
 +    int  cx,cy;
 +    int  c0,c1,cs,cf,cl;
 +    int  ndistc;
 +    int  ncpcheck;
 +    int  gridi_flag_shift=0,gridj_flag_shift=0;
 +    unsigned *gridj_flag=NULL;
 +    int  ncj_old_i,ncj_old_j;
 +
 +    nbs_cycle_start(&work->cc[enbsCCsearch]);
 +
 +    if (gridj->bSimple != nbl->bSimple)
 +    {
 +        gmx_incons("Grid incompatible with pair-list");
 +    }
 +
 +    sync_work(nbl);
 +    nbl->na_sc = gridj->na_sc;
 +    nbl->na_ci = gridj->na_c;
 +    nbl->na_cj = nbnxn_kernel_to_cj_size(nb_kernel_type);
 +    na_cj_2log = get_2log(nbl->na_cj);
 +
 +    nbl->rlist  = rlist;
 +
 +    if (bFBufferFlag)
 +    {
 +        /* Determine conversion of clusters to flag blocks */
 +        gridi_flag_shift = 0;
 +        while ((nbl->na_ci<<gridi_flag_shift) < NBNXN_BUFFERFLAG_SIZE)
 +        {
 +            gridi_flag_shift++;
 +        }
 +        gridj_flag_shift = 0;
 +        while ((nbl->na_cj<<gridj_flag_shift) < NBNXN_BUFFERFLAG_SIZE)
 +        {
 +            gridj_flag_shift++;
 +        }
 +
 +        gridj_flag = work->buffer_flags.flag;
 +    }
 +
 +    copy_mat(nbs->box,box);
 +
 +    rl2 = nbl->rlist*nbl->rlist;
 +
 +    rbb2 = boundingbox_only_distance2(gridi,gridj,nbl->rlist,nbl->bSimple);
 +
 +    if (debug)
 +    {
 +        fprintf(debug,"nbl bounding box only distance %f\n",sqrt(rbb2));
 +    }
 +
 +    /* Set the shift range */
 +    for(d=0; d<DIM; d++)
 +    {
 +        /* Check if we need periodicity shifts.
 +         * Without PBC or with domain decomposition we don't need them.
 +         */
 +        if (d >= ePBC2npbcdim(nbs->ePBC) || nbs->dd_dim[d])
 +        {
 +            shp[d] = 0;
 +        }
 +        else
 +        {
 +            if (d == XX &&
 +                box[XX][XX] - fabs(box[YY][XX]) - fabs(box[ZZ][XX]) < sqrt(rl2))
 +            {
 +                shp[d] = 2;
 +            }
 +            else
 +            {
 +                shp[d] = 1;
 +            }
 +        }
 +    }
 +
 +    if (nbl->bSimple && !gridi->bSimple)
 +    {
 +        conv_i  = gridi->na_sc/gridj->na_sc;
 +        bb_i    = gridi->bb_simple;
 +        bbcz_i  = gridi->bbcz_simple;
 +        flags_i = gridi->flags_simple;
 +    }
 +    else
 +    {
 +        conv_i  = 1;
 +        bb_i    = gridi->bb;
 +        bbcz_i  = gridi->bbcz;
 +        flags_i = gridi->flags;
 +    }
 +    cell0_i = gridi->cell0*conv_i;
 +
 +    bbcz_j = gridj->bbcz;
 +
 +    if (conv_i != 1)
 +    {
 +        /* A block size of the conversion factor - 1 gives a large repeat
 +         * count combined with a small block size. This should result in good
 +         * load balancing for both small and large domains.
 +         */
 +        ci_block = conv_i - 1;
 +    }
 +    if (debug)
 +    {
 +        fprintf(debug,"nbl nc_i %d col.av. %.1f ci_block %d\n",
 +                gridi->nc,gridi->nc/(double)(gridi->ncx*gridi->ncy),ci_block);
 +    }
 +
 +    ndistc = 0;
 +    ncpcheck = 0;
 +
 +    /* Initialize ci_b and ci to one before where we want them to start,
 +     * as they will both be incremented in next_ci.
 +     */
 +    ci_b = -1;
 +    ci   = th*ci_block - 1;
 +    ci_x = 0;
 +    ci_y = 0;
 +    while (next_ci(gridi,conv_i,nth,ci_block,&ci_x,&ci_y,&ci_b,&ci))
 +    {
 +        if (nbl->bSimple && flags_i[ci] == 0)
 +        {
 +            continue;
 +        }
 +
 +        ncj_old_i = nbl->ncj;
 +
 +        d2cx = 0;
 +        if (gridj != gridi && shp[XX] == 0)
 +        {
 +            if (nbl->bSimple)
 +            {
 +                bx1 = bb_i[ci*NNBSBB_B+NNBSBB_C+XX];
 +            }
 +            else
 +            {
 +                bx1 = gridi->c0[XX] + (ci_x+1)*gridi->sx;
 +            }
 +            if (bx1 < gridj->c0[XX])
 +            {
 +                d2cx = sqr(gridj->c0[XX] - bx1);
 +
 +                if (d2cx >= rl2)
 +                {
 +                    continue;
 +                }
 +            }
 +        }
 +
 +        ci_xy = ci_x*gridi->ncy + ci_y;
 +
 +        /* Loop over shift vectors in three dimensions */
 +        for (tz=-shp[ZZ]; tz<=shp[ZZ]; tz++)
 +        {
 +            shz = tz*box[ZZ][ZZ];
 +
 +            bz0 = bbcz_i[ci*NNBSBB_D  ] + shz;
 +            bz1 = bbcz_i[ci*NNBSBB_D+1] + shz;
 +
 +            if (tz == 0)
 +            {
 +                d2z = 0;
 +            }
 +            else if (tz < 0)
 +            {
 +                d2z = sqr(bz1);
 +            }
 +            else
 +            {
 +                d2z = sqr(bz0 - box[ZZ][ZZ]);
 +            }
 +
 +            d2z_cx = d2z + d2cx;
 +
 +            if (d2z_cx >= rl2)
 +            {
 +                continue;
 +            }
 +
 +            bz1_frac =
 +                bz1/((real)(gridi->cxy_ind[ci_xy+1] - gridi->cxy_ind[ci_xy]));
 +            if (bz1_frac < 0)
 +            {
 +                bz1_frac = 0;
 +            }
 +            /* The check with bz1_frac close to or larger than 1 comes later */
 +
 +            for (ty=-shp[YY]; ty<=shp[YY]; ty++)
 +            {
 +                shy = ty*box[YY][YY] + tz*box[ZZ][YY];
 +
 +                if (nbl->bSimple)
 +                {
 +                    by0 = bb_i[ci*NNBSBB_B         +YY] + shy;
 +                    by1 = bb_i[ci*NNBSBB_B+NNBSBB_C+YY] + shy;
 +                }
 +                else
 +                {
 +                    by0 = gridi->c0[YY] + (ci_y  )*gridi->sy + shy;
 +                    by1 = gridi->c0[YY] + (ci_y+1)*gridi->sy + shy;
 +                }
 +
 +                get_cell_range(by0,by1,
 +                               gridj->ncy,gridj->c0[YY],gridj->sy,gridj->inv_sy,
 +                               d2z_cx,rl2,
 +                               &cyf,&cyl);
 +
 +                if (cyf > cyl)
 +                {
 +                    continue;
 +                }
 +
 +                d2z_cy = d2z;
 +                if (by1 < gridj->c0[YY])
 +                {
 +                    d2z_cy += sqr(gridj->c0[YY] - by1);
 +                }
 +                else if (by0 > gridj->c1[YY])
 +                {
 +                    d2z_cy += sqr(by0 - gridj->c1[YY]);
 +                }
 +
 +                for (tx=-shp[XX]; tx<=shp[XX]; tx++)
 +                {
 +                    shift = XYZ2IS(tx,ty,tz);
 +
 +#ifdef NBNXN_SHIFT_BACKWARD
 +                    if (gridi == gridj && shift > CENTRAL)
 +                    {
 +                        continue;
 +                    }
 +#endif
 +
 +                    shx = tx*box[XX][XX] + ty*box[YY][XX] + tz*box[ZZ][XX];
 +
 +                    if (nbl->bSimple)
 +                    {
 +                        bx0 = bb_i[ci*NNBSBB_B         +XX] + shx;
 +                        bx1 = bb_i[ci*NNBSBB_B+NNBSBB_C+XX] + shx;
 +                    }
 +                    else
 +                    {
 +                        bx0 = gridi->c0[XX] + (ci_x  )*gridi->sx + shx;
 +                        bx1 = gridi->c0[XX] + (ci_x+1)*gridi->sx + shx;
 +                    }
 +
 +                    get_cell_range(bx0,bx1,
 +                                   gridj->ncx,gridj->c0[XX],gridj->sx,gridj->inv_sx,
 +                                   d2z_cy,rl2,
 +                                   &cxf,&cxl);
 +
 +                    if (cxf > cxl)
 +                    {
 +                        continue;
 +                    }
 +
 +                    if (nbl->bSimple)
 +                    {
 +                        new_ci_entry(nbl,cell0_i+ci,shift,flags_i[ci],
 +                                     nbl->work);
 +                    }
 +                    else
 +                    {
 +                        new_sci_entry(nbl,cell0_i+ci,shift,flags_i[ci],
 +                                      nbl->work);
 +                    }
 +
 +#ifndef NBNXN_SHIFT_BACKWARD
 +                    if (cxf < ci_x)
 +#else
 +                    if (shift == CENTRAL && gridi == gridj &&
 +                        cxf < ci_x)
 +#endif
 +                    {
 +                        /* Skip the pairs with i > j.
 +                         * x is the major index, so skip half of it.
 +                         */
 +                        cxf = ci_x;
 +                    }
 +
 +                    if (nbl->bSimple)
 +                    {
 +                        set_icell_bb_simple(bb_i,ci,shx,shy,shz,
 +                                            nbl->work->bb_ci);
 +                    }
 +                    else
 +                    {
 +                        set_icell_bb_supersub(bb_i,ci,shx,shy,shz,
 +                                              nbl->work->bb_ci);
 +                    }
 +
 +                    nbs->icell_set_x(cell0_i+ci,shx,shy,shz,
 +                                     gridi->na_c,nbat->xstride,nbat->x,
 +                                     nbl->work);
 +
 +                    for(cx=cxf; cx<=cxl; cx++)
 +                    {
 +                        d2zx = d2z;
 +                        if (gridj->c0[XX] + cx*gridj->sx > bx1)
 +                        {
 +                            d2zx += sqr(gridj->c0[XX] + cx*gridj->sx - bx1);
 +                        }
 +                        else if (gridj->c0[XX] + (cx+1)*gridj->sx < bx0)
 +                        {
 +                            d2zx += sqr(gridj->c0[XX] + (cx+1)*gridj->sx - bx0);
 +                        }
 +
 +#ifndef NBNXN_SHIFT_BACKWARD
 +                        if (gridi == gridj &&
 +                            cx == 0 && cyf < ci_y)
 +#else
 +                        if (gridi == gridj &&
 +                            cx == 0 && shift == CENTRAL && cyf < ci_y)
 +#endif
 +                        {
 +                            /* Skip the pairs with i > j.
 +                             * Skip half of y when i and j have the same x.
 +                             */
 +                            cyf_x = ci_y;
 +                        }
 +                        else
 +                        {
 +                            cyf_x = cyf;
 +                        }
 +
 +                        for(cy=cyf_x; cy<=cyl; cy++)
 +                        {
 +                            c0 = gridj->cxy_ind[cx*gridj->ncy+cy];
 +                            c1 = gridj->cxy_ind[cx*gridj->ncy+cy+1];
 +#ifdef NBNXN_SHIFT_BACKWARD
 +                            if (gridi == gridj &&
 +                                shift == CENTRAL && c0 < ci)
 +                            {
 +                                c0 = ci;
 +                            }
 +#endif
 +
 +                            d2zxy = d2zx;
 +                            if (gridj->c0[YY] + cy*gridj->sy > by1)
 +                            {
 +                                d2zxy += sqr(gridj->c0[YY] + cy*gridj->sy - by1);
 +                            }
 +                            else if (gridj->c0[YY] + (cy+1)*gridj->sy < by0)
 +                            {
 +                                d2zxy += sqr(gridj->c0[YY] + (cy+1)*gridj->sy - by0);
 +                            }
 +                            if (c1 > c0 && d2zxy < rl2)
 +                            {
 +                                cs = c0 + (int)(bz1_frac*(c1 - c0));
 +                                if (cs >= c1)
 +                                {
 +                                    cs = c1 - 1;
 +                                }
 +
 +                                d2xy = d2zxy - d2z;
 +
 +                                /* Find the lowest cell that can possibly
 +                                 * be within range.
 +                                 */
 +                                cf = cs;
 +                                while(cf > c0 &&
 +                                      (bbcz_j[cf*NNBSBB_D+1] >= bz0 ||
 +                                       d2xy + sqr(bbcz_j[cf*NNBSBB_D+1] - bz0) < rl2))
 +                                {
 +                                    cf--;
 +                                }
 +
 +                                /* Find the highest cell that can possibly
 +                                 * be within range.
 +                                 */
 +                                cl = cs;
 +                                while(cl < c1-1 &&
 +                                      (bbcz_j[cl*NNBSBB_D] <= bz1 ||
 +                                       d2xy + sqr(bbcz_j[cl*NNBSBB_D] - bz1) < rl2))
 +                                {
 +                                    cl++;
 +                                }
 +
 +#ifdef NBNXN_REFCODE
 +                                {
 +                                    /* Simple reference code, for debugging,
 +                                     * overrides the more complex code above.
 +                                     */
 +                                    int k;
 +                                    cf = c1;
 +                                    cl = -1;
 +                                    for(k=c0; k<c1; k++)
 +                                    {
 +                                        if (box_dist2(bx0,bx1,by0,by1,bz0,bz1,
 +                                                      gridj->bb+k*NNBSBB_B) < rl2 &&
 +                                            k < cf)
 +                                        {
 +                                            cf = k;
 +                                        }
 +                                        if (box_dist2(bx0,bx1,by0,by1,bz0,bz1,
 +                                                      gridj->bb+k*NNBSBB_B) < rl2 &&
 +                                            k > cl)
 +                                        {
 +                                            cl = k;
 +                                        }
 +                                    }
 +                                }
 +#endif
 +
 +                                if (gridi == gridj)
 +                                {
 +                                    /* We want each atom/cell pair only once,
 +                                     * only use cj >= ci.
 +                                     */
 +#ifndef NBNXN_SHIFT_BACKWARD
 +                                    cf = max(cf,ci);
 +#else
 +                                    if (shift == CENTRAL)
 +                                    {
 +                                        cf = max(cf,ci);
 +                                    }
 +#endif
 +                                }
 +
 +                                if (cf <= cl)
 +                                {
 +                                    /* For f buffer flags with simple lists */
 +                                    ncj_old_j = nbl->ncj;
 +
 +                                    switch (nb_kernel_type)
 +                                    {
 +                                    case nbnxnk4x4_PlainC:
 +                                        check_subcell_list_space_simple(nbl,cl-cf+1);
 +
 +                                        make_cluster_list_simple(gridj,
 +                                                                 nbl,ci,cf,cl,
 +                                                                 (gridi == gridj && shift == CENTRAL),
 +                                                                 nbat->x,
 +                                                                 rl2,rbb2,
 +                                                                 &ndistc);
 +                                        break;
 +#ifdef GMX_NBNXN_SIMD_4XN
 +                                    case nbnxnk4xN_SIMD_4xN:
 +                                        check_subcell_list_space_simple(nbl,ci_to_cj(na_cj_2log,cl-cf)+2);
 +                                        make_cluster_list_simd_4xn(gridj,
 +                                                                   nbl,ci,cf,cl,
 +                                                                   (gridi == gridj && shift == CENTRAL),
 +                                                                   nbat->x,
 +                                                                   rl2,rbb2,
 +                                                                   &ndistc);
 +                                        break;
 +#endif
 +#ifdef GMX_NBNXN_SIMD_2XNN
 +                                    case nbnxnk4xN_SIMD_2xNN:
 +                                        check_subcell_list_space_simple(nbl,ci_to_cj(na_cj_2log,cl-cf)+2);
 +                                        make_cluster_list_simd_2xnn(gridj,
 +                                                                    nbl,ci,cf,cl,
 +                                                                    (gridi == gridj && shift == CENTRAL),
 +                                                                    nbat->x,
 +                                                                    rl2,rbb2,
 +                                                                    &ndistc);
 +                                        break;
 +#endif
 +                                    case nbnxnk8x8x8_PlainC:
 +                                    case nbnxnk8x8x8_CUDA:
 +                                        check_subcell_list_space_supersub(nbl,cl-cf+1);
 +                                        for(cj=cf; cj<=cl; cj++)
 +                                        {
 +                                            make_cluster_list_supersub(nbs,gridi,gridj,
 +                                                                       nbl,ci,cj,
 +                                                                       (gridi == gridj && shift == CENTRAL && ci == cj),
 +                                                                       nbat->xstride,nbat->x,
 +                                                                       rl2,rbb2,
 +                                                                       &ndistc);
 +                                        }
 +                                        break;
 +                                    }
 +                                    ncpcheck += cl - cf + 1;
 +
 +                                    if (bFBufferFlag && nbl->ncj > ncj_old_j)
 +                                    {
 +                                        int cbf,cbl,cb;
 +
 +                                        cbf = nbl->cj[ncj_old_j].cj >> gridj_flag_shift;
 +                                        cbl = nbl->cj[nbl->ncj-1].cj >> gridj_flag_shift;
 +                                        for(cb=cbf; cb<=cbl; cb++)
 +                                        {
 +                                            gridj_flag[cb] = 1U<<th;
 +                                        }
 +                                    }
 +                                }
 +                            }
 +                        }
 +                    }
 +
 +                    /* Set the exclusions for this ci list */
 +                    if (nbl->bSimple)
 +                    {
 +                        set_ci_top_excls(nbs,
 +                                         nbl,
 +                                         shift == CENTRAL && gridi == gridj,
 +                                         gridj->na_c_2log,
 +                                         na_cj_2log,
 +                                         &(nbl->ci[nbl->nci]),
 +                                         excl);
 +                    }
 +                    else
 +                    {
 +                        set_sci_top_excls(nbs,
 +                                          nbl,
 +                                          shift == CENTRAL && gridi == gridj,
 +                                          gridj->na_c_2log,
 +                                          &(nbl->sci[nbl->nsci]),
 +                                          excl);
 +                    }
 +
 +                    /* Close this ci list */
 +                    if (nbl->bSimple)
 +                    {
 +                        close_ci_entry_simple(nbl);
 +                    }
 +                    else
 +                    {
 +                        close_ci_entry_supersub(nbl,
 +                                                nsubpair_max,
 +                                                progBal,min_ci_balanced,
 +                                                th,nth);
 +                    }
 +                }
 +            }
 +        }
 +
 +        if (bFBufferFlag && nbl->ncj > ncj_old_i)
 +        {
 +            work->buffer_flags.flag[(gridi->cell0+ci)>>gridi_flag_shift] = 1U<<th;
 +        }
 +    }
 +
 +    work->ndistc = ndistc;
 +
 +    nbs_cycle_stop(&work->cc[enbsCCsearch]);
 +
 +    if (debug)
 +    {
 +        fprintf(debug,"number of distance checks %d\n",ndistc);
 +        fprintf(debug,"ncpcheck %s %d\n",gridi==gridj ? "local" : "non-local",
 +                ncpcheck);
 +
 +        if (nbl->bSimple)
 +        {
 +            print_nblist_statistics_simple(debug,nbl,nbs,rlist);
 +        }
 +        else
 +        {
 +            print_nblist_statistics_supersub(debug,nbl,nbs,rlist);
 +        }
 +
 +    }
 +}
 +
 +static void reduce_buffer_flags(const nbnxn_search_t nbs,
 +                                int nsrc,
 +                                const nbnxn_buffer_flags_t *dest)
 +{
 +    int s,b;
 +    const unsigned *flag;
 +
 +    for(s=0; s<nsrc; s++)
 +    {
 +        flag = nbs->work[s].buffer_flags.flag;
 +
 +        for(b=0; b<dest->nflag; b++)
 +        {
 +            dest->flag[b] |= flag[b];
 +        }
 +    }
 +}
 +
 +static void print_reduction_cost(const nbnxn_buffer_flags_t *flags,int nout)
 +{
 +    int nelem,nkeep,ncopy,nred,b,c,out;
 +
 +    nelem = 0;
 +    nkeep = 0;
 +    ncopy = 0;
 +    nred  = 0;
 +    for(b=0; b<flags->nflag; b++)
 +    {
 +        if (flags->flag[b] == 1)
 +        {
 +            /* Only flag 0 is set, no copy or reduction required */
 +            nelem++;
 +            nkeep++;
 +        }
 +        else if (flags->flag[b] > 0)
 +        {
 +            c = 0;
 +            for(out=0; out<nout; out++)
 +            {
 +                if (flags->flag[b] & (1U<<out))
 +                {
 +                    c++;
 +                }
 +            }
 +            nelem += c;
 +            if (c == 1)
 +            {
 +                ncopy++;
 +            }
 +            else
 +            {
 +                nred += c;
 +            }
 +        }
 +    }
 +
 +    fprintf(debug,"nbnxn reduction: #flag %d #list %d elem %4.2f, keep %4.2f copy %4.2f red %4.2f\n",
 +            flags->nflag,nout,
 +            nelem/(double)(flags->nflag),
 +            nkeep/(double)(flags->nflag),
 +            ncopy/(double)(flags->nflag),
 +            nred/(double)(flags->nflag));
 +}
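
To make the keep/copy/reduce classification above concrete: a flag word equal to 1 means only output 0 touched the block (kept in place), a single set bit elsewhere means one copy, and several set bits mean a reduction over that many outputs. The following standalone sketch (illustrative only, not part of the patch) reproduces the decision for a few example flag words with nout = 4:

#include <stdio.h>

/* Count how many of the nout per-output bits are set in a flag word */
static int count_flag_bits(unsigned flag, int nout)
{
    int c = 0, out;

    for (out = 0; out < nout; out++)
    {
        if (flag & (1U << out))
        {
            c++;
        }
    }
    return c;
}

int main(void)
{
    /* 0x1: only output 0 wrote -> keep; 0x4: one other output -> copy;
     * 0x6: two outputs wrote   -> reduction over their buffers.
     */
    unsigned examples[] = { 0x1, 0x4, 0x6 };
    int      e;

    for (e = 0; e < 3; e++)
    {
        int c = count_flag_bits(examples[e], 4);

        printf("flag 0x%x: %s\n", examples[e],
               examples[e] == 1 ? "keep" : (c == 1 ? "copy" : "reduce"));
    }
    return 0;
}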
 +
 +/* Make a local or non-local pair-list, depending on iloc */
 +void nbnxn_make_pairlist(const nbnxn_search_t nbs,
 +                         nbnxn_atomdata_t *nbat,
 +                         const t_blocka *excl,
 +                         real rlist,
 +                         int min_ci_balanced,
 +                         nbnxn_pairlist_set_t *nbl_list,
 +                         int iloc,
 +                         int nb_kernel_type,
 +                         t_nrnb *nrnb)
 +{
 +    nbnxn_grid_t *gridi,*gridj;
++    gmx_bool bGPUCPU;
 +    int nzi,zi,zj0,zj1,zj;
 +    int nsubpair_max;
 +    int th;
 +    int nnbl;
 +    nbnxn_pairlist_t **nbl;
 +    int ci_block;
 +    gmx_bool CombineNBLists;
 +    int np_tot,np_noq,np_hlj,nap;
 +
++    /* Check if we are running hybrid GPU + CPU nbnxn mode */
++    bGPUCPU = (!nbs->grid[0].bSimple && nbl_list->bSimple);
++
 +    nnbl            = nbl_list->nnbl;
 +    nbl             = nbl_list->nbl;
 +    CombineNBLists  = nbl_list->bCombined;
 +
 +    if (debug)
 +    {
 +        fprintf(debug,"ns making %d nblists\n", nnbl);
 +    }
 +
 +    nbat->bUseBufferFlags = (nbat->nout > 1);
-                 if (nbat->bUseBufferFlags && zi == 0 && zj == 0)
++    /* We should re-init the flags before making the first list */
++    if (nbat->bUseBufferFlags && (LOCAL_I(iloc) || bGPUCPU))
 +    {
 +        init_buffer_flags(&nbat->buffer_flags,nbat->natoms);
 +    }
 +
 +    if (nbl_list->bSimple)
 +    {
 +        switch (nb_kernel_type)
 +        {
 +#ifdef GMX_NBNXN_SIMD_4XN
 +        case nbnxnk4xN_SIMD_4xN:
 +            nbs->icell_set_x = icell_set_x_simd_4xn;
 +            break;
 +#endif
 +#ifdef GMX_NBNXN_SIMD_2XNN
 +        case nbnxnk4xN_SIMD_2xNN:
 +            nbs->icell_set_x = icell_set_x_simd_2xnn;
 +            break;
 +#endif
 +        default:
 +            nbs->icell_set_x = icell_set_x_simple;
 +            break;
 +        }
 +    }
 +    else
 +    {
 +#ifdef NBNXN_SEARCH_BB_SSE
 +        nbs->icell_set_x = icell_set_x_supersub_sse8;
 +#else
 +        nbs->icell_set_x = icell_set_x_supersub;
 +#endif
 +    }
 +
 +    if (LOCAL_I(iloc))
 +    {
 +        /* Only zone (grid) 0 vs 0 */
 +        nzi = 1;
 +        zj0 = 0;
 +        zj1 = 1;
 +    }
 +    else
 +    {
 +        nzi = nbs->zones->nizone;
 +    }
 +
 +    if (!nbl_list->bSimple && min_ci_balanced > 0)
 +    {
 +        nsubpair_max = get_nsubpair_max(nbs,iloc,rlist,min_ci_balanced);
 +    }
 +    else
 +    {
 +        nsubpair_max = 0;
 +    }
 +
 +    /* Clear all pair-lists */
 +    for(th=0; th<nnbl; th++)
 +    {
 +        clear_pairlist(nbl[th]);
 +    }
 +
 +    for(zi=0; zi<nzi; zi++)
 +    {
 +        gridi = &nbs->grid[zi];
 +
 +        if (NONLOCAL_I(iloc))
 +        {
 +            zj0 = nbs->zones->izone[zi].j0;
 +            zj1 = nbs->zones->izone[zi].j1;
 +            if (zi == 0)
 +            {
 +                zj0++;
 +            }
 +        }
 +        for(zj=zj0; zj<zj1; zj++)
 +        {
 +            gridj = &nbs->grid[zj];
 +
 +            if (debug)
 +            {
 +                fprintf(debug,"ns search grid %d vs %d\n",zi,zj);
 +            }
 +
 +            nbs_cycle_start(&nbs->cc[enbsCCsearch]);
 +
 +            if (nbl[0]->bSimple && !gridi->bSimple)
 +            {
 +                /* Hybrid list, determine blocking later */
 +                ci_block = 0;
 +            }
 +            else
 +            {
 +                ci_block = get_ci_block_size(gridi,nbs->DomDec,nnbl);
 +            }
 +
 +#pragma omp parallel for num_threads(nnbl) schedule(static)
 +            for(th=0; th<nnbl; th++)
 +            {
++                /* Re-init the thread-local work flag data before making
++                 * the first list (not an elegant conditional).
++                 */
++                if (nbat->bUseBufferFlags && ((zi == 0 && zj == 0) ||
++                                              (bGPUCPU && zi == 0 && zj == 1)))
 +                {
 +                    init_buffer_flags(&nbs->work[th].buffer_flags,nbat->natoms);
 +                }
 +
 +                if (CombineNBLists && th > 0)
 +                {
 +                    clear_pairlist(nbl[th]);
 +                }
 +
 +                /* Divide the i super cell equally over the nblists */
 +                nbnxn_make_pairlist_part(nbs,gridi,gridj,
 +                                         &nbs->work[th],nbat,excl,
 +                                         rlist,
 +                                         nb_kernel_type,
 +                                         ci_block,
 +                                         nbat->bUseBufferFlags,
 +                                         nsubpair_max,
 +                                         (LOCAL_I(iloc) || nbs->zones->n <= 2),
 +                                         min_ci_balanced,
 +                                         th,nnbl,
 +                                         nbl[th]);
 +            }
 +            nbs_cycle_stop(&nbs->cc[enbsCCsearch]);
 +
 +            np_tot = 0;
 +            np_noq = 0;
 +            np_hlj = 0;
 +            for(th=0; th<nnbl; th++)
 +            {
 +                inc_nrnb(nrnb,eNR_NBNXN_DIST2,nbs->work[th].ndistc);
 +
 +                if (nbl_list->bSimple)
 +                {
 +                    np_tot += nbl[th]->ncj;
 +                    np_noq += nbl[th]->work->ncj_noq;
 +                    np_hlj += nbl[th]->work->ncj_hlj;
 +                }
 +                else
 +                {
 +                    /* This count ignores potential subsequent pair pruning */
 +                    np_tot += nbl[th]->nci_tot;
 +                }
 +            }
 +            nap = nbl[0]->na_ci*nbl[0]->na_cj;
 +            nbl_list->natpair_ljq = (np_tot - np_noq)*nap - np_hlj*nap/2;
 +            nbl_list->natpair_lj  = np_noq*nap;
 +            nbl_list->natpair_q   = np_hlj*nap/2;
 +
 +            if (CombineNBLists && nnbl > 1)
 +            {
 +                nbs_cycle_start(&nbs->cc[enbsCCcombine]);
 +
 +                combine_nblists(nnbl-1,nbl+1,nbl[0]);
 +
 +                nbs_cycle_stop(&nbs->cc[enbsCCcombine]);
 +            }
 +        }
 +    }
 +
 +    if (nbat->bUseBufferFlags)
 +    {
 +        reduce_buffer_flags(nbs,nnbl,&nbat->buffer_flags);
 +    }
 +
 +    /*
 +    print_supersub_nsp("nsubpair",nbl[0],iloc);
 +    */
 +
 +    /* Special performance logging stuff (env.var. GMX_NBNXN_CYCLE) */
 +    if (LOCAL_I(iloc))
 +    {
 +        nbs->search_count++;
 +    }
 +    if (nbs->print_cycles &&
 +        (!nbs->DomDec || (nbs->DomDec && !LOCAL_I(iloc))) &&
 +        nbs->search_count % 100 == 0)
 +    {
 +        nbs_cycle_print(stderr,nbs);
 +    }
 +
 +    if (debug && (CombineNBLists && nnbl > 1))
 +    {
 +        if (nbl[0]->bSimple)
 +        {
 +            print_nblist_statistics_simple(debug,nbl[0],nbs,rlist);
 +        }
 +        else
 +        {
 +            print_nblist_statistics_supersub(debug,nbl[0],nbs,rlist);
 +        }
 +    }
 +
 +    if (debug)
 +    {
 +        if (gmx_debug_at)
 +        {
 +            if (nbl[0]->bSimple)
 +            {
 +                print_nblist_ci_cj(debug,nbl[0]);
 +            }
 +            else
 +            {
 +                print_nblist_sci_cj(debug,nbl[0]);
 +            }
 +        }
 +
 +        if (nbat->bUseBufferFlags)
 +        {
 +            print_reduction_cost(&nbat->buffer_flags,nnbl);
 +        }
 +    }
 +}
index 5ef37b5f5cdfa51e548b46831203edddb9e4ffd3,0000000000000000000000000000000000000000..52b376493e0ff35d8cd5e531d8637a2bef3ed913
mode 100644,000000..100644
--- /dev/null
@@@ -1,287 -1,0 +1,282 @@@
- #ifndef F77_FUNC
- #define F77_FUNC(name,NAME) name ## _
- #endif
 +/*
 + * 
 + *                This source code is part of
 + * 
 + *                 G   R   O   M   A   C   S
 + * 
 + *          GROningen MAchine for Chemical Simulations
 + * 
 + *                        VERSION 3.2.0
 + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
 + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 + * Copyright (c) 2001-2004, The GROMACS development team,
 + * check out http://www.gromacs.org for more information.
 + 
 + * This program is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU General Public License
 + * as published by the Free Software Foundation; either version 2
 + * of the License, or (at your option) any later version.
 + * 
 + * If you want to redistribute modifications, please consider that
 + * scientific software is very special. Version control is crucial -
 + * bugs must be traceable. We will be happy to consider code for
 + * inclusion in the official distribution, but derived work must not
 + * be called official GROMACS. Details are found in the README & COPYING
 + * files - if they are missing, get the official version at www.gromacs.org.
 + * 
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the papers on the package - you can find them in the top README file.
 + * 
 + * For more info, check our website at http://www.gromacs.org
 + * 
 + * And Hey:
 + * GROwing Monsters And Cloning Shrimps
 + */
 +#ifdef HAVE_CONFIG_H
 +#include <config.h>
 +#endif
 +
 +#ifdef GMX_QMMM_GAMESS
 +
 +#include <math.h>
 +#include "sysstuff.h"
 +#include "typedefs.h"
 +#include "macros.h"
 +#include "smalloc.h"
 +#include "physics.h"
 +#include "macros.h"
 +#include "vec.h"
 +#include "force.h"
 +#include "invblock.h"
 +#include "confio.h"
 +#include "names.h"
 +#include "network.h"
 +#include "pbc.h"
 +#include "ns.h"
 +#include "nrnb.h"
 +#include "bondf.h"
 +#include "mshift.h"
 +#include "txtdump.h"
 +#include "copyrite.h"
 +#include "qmmm.h"
 +#include <stdio.h>
 +#include <string.h>
 +#include "gmx_fatal.h"
 +#include "typedefs.h"
 +#include <stdlib.h>
 +
 +
 +/* QMMM sub routines */
 +/* mopac interface routines */
 +
 +
 +void 
 +F77_FUNC(inigms,IMIGMS)(void);
 +
 +void 
 +F77_FUNC(endgms,ENDGMS)(void);
 +
 +void 
 +F77_FUNC(grads,GRADS)(int *nrqmat,real *qmcrd,int *nrmmat, real *mmchrg, 
 +                      real *mmcrd, real *qmgrad,real *mmgrad, real *energy);
 +
 +
 +
 +void init_gamess(t_commrec *cr,t_QMrec *qm, t_MMrec *mm){
 +  /* it works in a hopelessly complicated way :-)
 +   * first a file is written. Then the standard gamess input/output
 +   * routine is called (no system()!) to set up all fortran arrays. 
 +   * this routine writes a punch file, like in a normal gamess run.
 +   * via this punch file the other gamess routines, needed for gradient
 +   * and energy evaluations are called. This setup works fine for 
 +   * dynamics simulations. 7-6-2002 (London)
 +   */
 +  int 
 +    i,j,rank;
 +  FILE
 +    *out;
 +  char
 +    periodic_system[37][3]={"XX","H ","He","Li","Be","B ","C ","N ",
 +                          "O ","F ","Ne","Na","Mg","Al","Si","P ",
 +                          "S ","Cl","Ar","K ","Ca","Sc","Ti","V ",
 +                          "Cr","Mn","Fe","Co","Ni","Cu","Zn","Ga",
 +                          "Ge","As","Se","Br","Kr"};
 +  
 +  if (PAR(cr)){
 +
 +    if MASTER(cr){
 +      out=fopen("FOR009","w");
 +      /* of these options I am not completely sure....  the overall
 +       * performance on more than 4 CPUs is rather poor at the moment.
 +       */
 +      fprintf(out,"memory 48000000\nPARALLEL IOMODE SCREENED\n");
 +      fprintf(out,"ELEC %d\nMULT %d\nSUPER ON\nNOSYM\nGEOMETRY ANGSTROM\n",
 +            qm->nelectrons,qm->multiplicity);
 +      for (i=0;i<qm->nrQMatoms;i++){
 +#ifdef DOUBLE
 +      fprintf(out,"%10.7lf  %10.7lf  %10.7lf  %5.3lf  %2s\n",
 +              i/2.,
 +              i/3.,
 +              i/4.,  
 +              qm->atomicnumberQM[i]*1.0,
 +              periodic_system[qm->atomicnumberQM[i]]);
 +#else
 +      fprintf(out,"%10.7f  %10.7f  %10.7f  %5.3f  %2s\n",
 +              i/2.,
 +              i/3.,
 +              i/4.,  
 +              qm->atomicnumberQM[i]*1.0,
 +              periodic_system[qm->atomicnumberQM[i]]);
 +#endif
 +      }
 +      if(mm->nrMMatoms){
 +      for (j=i;j<i+2;j++){
 +#ifdef DOUBLE
 +        fprintf(out,"%10.7lf  %10.7lf  %10.7lf  %5.3lf  BQ\n",
 +                j/5.,
 +                j/6.,
 +                j/7.,
 +                1.0);  
 +#else
 +        fprintf(out,"%10.7f  %10.7f  %10.7f  %5.3f  BQ\n",
 +                j/5.,
 +                j/6.,
 +                j/7.,
 +                2.0);  
 +#endif
 +      }
 +      }
 +      if(!qm->bTS)
 +      fprintf(out,"END\nBASIS %s\nRUNTYPE GRADIENT\nSCFTYPE %s\n",
 +              eQMbasis_names[qm->QMbasis],
 +              eQMmethod_names[qm->QMmethod]); /* see enum.h */
 +      else
 +      fprintf(out,"END\nBASIS %s\nRUNTYPE SADDLE\nSCFTYPE %s\n",
 +              eQMbasis_names[qm->QMbasis],
 +              eQMmethod_names[qm->QMmethod]); /* see enum.h */
 +      fclose(out);
 +    }
 +    gmx_barrier(cr);
 +    F77_FUNC(inigms,IMIGMS)();
 +  }
 +  else{ /* normal serial run */
 +    
 +    out=fopen("FOR009","w");
 +    /* of these options I am not completely sure....  the overall
 +     * performance on more than 4 CPUs is rather poor at the moment.
 +     */
 +    fprintf(out,"ELEC %d\nMULT %d\nSUPER ON\nNOSYM\nGEOMETRY ANGSTROM\n",
 +          qm->nelectrons,qm->multiplicity);
 +    for (i=0;i<qm->nrQMatoms;i++){
 +#ifdef DOUBLE
 +      fprintf(out,"%10.7lf  %10.7lf  %10.7lf  %5.3lf  %2s\n",
 +            i/2.,
 +            i/3.,
 +            i/4.,  
 +            qm->atomicnumberQM[i]*1.0,
 +            periodic_system[qm->atomicnumberQM[i]]);
 +#else
 +      fprintf(out,"%10.7f  %10.7f  %10.7f  %5.3f  %2s\n",
 +            i/2.,
 +            i/3.,
 +            i/4.,  
 +            qm->atomicnumberQM[i]*1.0,
 +            periodic_system[qm->atomicnumberQM[i]]);
 +#endif
 +    }
 +    if(mm->nrMMatoms){
 +      for (j=i;j<i+2;j++){
 +#ifdef DOUBLE
 +      fprintf(out,"%10.7lf  %10.7lf  %10.7lf  %5.3lf  BQ\n",
 +              j/5.,
 +              j/6.,
 +              j/7.,
 +              1.0);  
 +#else
 +      fprintf(out,"%10.7f  %10.7f  %10.7f  %5.3f  BQ\n",
 +              j/5.,
 +              j/6.,
 +              j/7.,
 +              2.0);  
 +#endif
 +      }
 +    }
 +    if(!qm->bTS)
 +      fprintf(out,"END\nBASIS %s\nRUNTYPE GRADIENT\nSCFTYPE %s\n",
 +            eQMbasis_names[qm->QMbasis],
 +            eQMmethod_names[qm->QMmethod]); /* see enum.h */
 +    else
 +      fprintf(out,"END\nBASIS %s\nRUNTYPE SADDLE\nSCFTYPE %s\n",
 +            eQMbasis_names[qm->QMbasis],
 +            eQMmethod_names[qm->QMmethod]); /* see enum.h */
 +    F77_FUNC(inigms,IMIGMS)();
 +  }  
 +}
 +
 +real call_gamess(t_commrec *cr, t_forcerec *fr, t_QMrec *qm, t_MMrec *mm, 
 +               rvec f[], rvec fshift[])
 +{
 +  /* do the actual QMMM calculation using GAMESS-UK. In this
 +   * implementation (3-2001) a system call is made to the GAMESS-UK
 +   * binary. Now we are working to get the electron integral, SCF, and
 +   * gradient routines linked in directly.
 +   */
 +  int 
 +    i,j,rank;
 +  real
 +    QMener=0.0,*qmgrad,*mmgrad,*mmcrd,*qmcrd,energy;
 +  t_QMMMrec
 +    *qr;
 +
 +  /* copy the QMMMrec pointer */
 +  qr = fr->qr;
 +  snew(qmcrd, 3*(qm->nrQMatoms));
 +  snew(mmcrd,3*(mm->nrMMatoms));
 +  snew(qmgrad,3*(qm->nrQMatoms));
 +  snew(mmgrad,3*(mm->nrMMatoms));
 +  
 +  /* copy the data from qr into the arrays that are going to be used
 +   * in the fortran routines of gamess
 +   */
 +  for(i=0;i<qm->nrQMatoms;i++){
 +    for (j=0;j<DIM;j++){
 +      qmcrd[DIM*i+j] = 1/BOHR2NM*qm->xQM[i][j];
 +    }
 +  }
 +  for(i=0;i<mm->nrMMatoms;i++){
 +    for (j=0;j<DIM;j++){
 +      mmcrd[DIM*i+j] = 1/BOHR2NM*mm->xMM[i][j];
 +    }
 +  }
 +  for (i=0;i<3*qm->nrQMatoms;i+=3){
 +    fprintf(stderr,"%8.5f, %8.5f, %8.5f\n",
 +          qmcrd[i],
 +          qmcrd[i+1],
 +          qmcrd[i+2]);
 +  }
 +
 +  F77_FUNC(grads,GRADS)(&qm->nrQMatoms,qmcrd,&mm->nrMMatoms,mm->MMcharges,
 +                        mmcrd,qmgrad,mmgrad,&energy);
 +
 +  for(i=0;i<qm->nrQMatoms;i++){
 +    for(j=0;j<DIM;j++){
 +      f[i][j]      = HARTREE_BOHR2MD*qmgrad[3*i+j];
 +      fshift[i][j] = HARTREE_BOHR2MD*qmgrad[3*i+j];
 +    }
 +  }
 +  for(i=0;i<mm->nrMMatoms;i++){
 +    for(j=0;j<DIM;j++){
 +      f[i][j]      = HARTREE_BOHR2MD*mmgrad[3*i+j];
 +      fshift[i][j] = HARTREE_BOHR2MD*mmgrad[3*i+j];
 +    }
 +  }
 +  /* convert a.u to kJ/mol */
 +  QMener=energy*HARTREE2KJ*AVOGADRO;
 +  return(QMener);
 +}
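
For reference, the conversions above chain as follows. This is a sketch using the standard values 1 hartree = 4.3597482e-21 kJ, N_A = 6.0221367e23 mol^-1 and 1 bohr = 0.0529177249 nm; the physics.h macros are assumed to match these:

#include <stdio.h>

int main(void)
{
    const double HARTREE2KJ = 4.3597482e-21;  /* kJ per hartree */
    const double AVOGADRO   = 6.0221367e23;   /* 1/mol */
    const double BOHR2NM    = 0.0529177249;   /* nm per bohr */

    /* energy: hartree -> kJ/mol, as in QMener = energy*HARTREE2KJ*AVOGADRO */
    printf("1 hartree      = %.1f kJ/mol\n", HARTREE2KJ*AVOGADRO);

    /* gradient: hartree/bohr -> kJ/mol/nm, the HARTREE_BOHR2MD combination */
    printf("1 hartree/bohr = %.0f kJ/mol/nm\n", HARTREE2KJ*AVOGADRO/BOHR2NM);
    return 0;
}

This prints roughly 2625.5 kJ/mol and 49614 kJ/mol/nm, the factors applied to the energy and to the gradients above.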
 +
 +#else
 +int
 +gmx_qmmm_gamess_empty;
 +#endif
 +
index d11e9a25f37aae40942e2753842287f1292142df,0000000000000000000000000000000000000000..546f94bb14a12aa3c924eed30db5c58a4d07e7d7
mode 100644,000000..100644
--- /dev/null
@@@ -1,230 -1,0 +1,224 @@@
- #ifndef F77_FUNC
- #define F77_FUNC(name,NAME) name ## _
- #endif
 +/*
 + * 
 + *                This source code is part of
 + * 
 + *                 G   R   O   M   A   C   S
 + * 
 + *          GROningen MAchine for Chemical Simulations
 + * 
 + *                        VERSION 3.2.0
 + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
 + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 + * Copyright (c) 2001-2004, The GROMACS development team,
 + * check out http://www.gromacs.org for more information.
 + 
 + * This program is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU General Public License
 + * as published by the Free Software Foundation; either version 2
 + * of the License, or (at your option) any later version.
 + * 
 + * If you want to redistribute modifications, please consider that
 + * scientific software is very special. Version control is crucial -
 + * bugs must be traceable. We will be happy to consider code for
 + * inclusion in the official distribution, but derived work must not
 + * be called official GROMACS. Details are found in the README & COPYING
 + * files - if they are missing, get the official version at www.gromacs.org.
 + * 
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the papers on the package - you can find them in the top README file.
 + * 
 + * For more info, check our website at http://www.gromacs.org
 + * 
 + * And Hey:
 + * GROwing Monsters And Cloning Shrimps
 + */
 +#ifdef HAVE_CONFIG_H
 +#include <config.h>
 +#endif
 +
 +#ifdef GMX_QMMM_MOPAC
 +
 +#include <math.h>
 +#include "sysstuff.h"
 +#include "typedefs.h"
 +#include "macros.h"
 +#include "smalloc.h"
 +#include "physics.h"
 +#include "macros.h"
 +#include "vec.h"
 +#include "force.h"
 +#include "invblock.h"
 +#include "confio.h"
 +#include "names.h"
 +#include "network.h"
 +#include "pbc.h"
 +#include "ns.h"
 +#include "nrnb.h"
 +#include "bondf.h"
 +#include "mshift.h"
 +#include "txtdump.h"
 +#include "copyrite.h"
 +#include "qmmm.h"
 +#include <stdio.h>
 +#include <string.h>
 +#include "gmx_fatal.h"
 +#include "typedefs.h"
 +#include <stdlib.h>
 +
 +
 +/* mopac interface routines */
 +void 
 +F77_FUNC(domldt,DOMLDT)(int *nrqmat, int labels[], char keywords[]);
 +
 +void 
 +F77_FUNC(domop,DOMOP)(int *nrqmat,double qmcrd[],int *nrmmat,
 +                      double mmchrg[],double mmcrd[],double qmgrad[],
 +                      double mmgrad[], double *energy,double qmcharges[]);
 +
 +
 +
 +void init_mopac(t_commrec *cr, t_QMrec *qm, t_MMrec *mm)
 +{
 +  /* initializes the mopac routines and sets up the semiempirical
 +   * computation by calling moldat(). The inline mopac routines can
 +   * only perform gradient operations. If one would like to optimize a
 +   * structure or find a transition state at PM3 level, gaussian is
 +   * used instead.
 +   */
 +  char 
 +    *keywords;
 +  
 +  snew(keywords,240);
 +  
 +  if(!qm->bSH){    /* if rerun then grad should not be done! */
 +    sprintf(keywords,"PRECISE GEO-OK CHARGE=%d GRAD MMOK ANALYT %s\n",
 +          qm->QMcharge,
 +          eQMmethod_names[qm->QMmethod]);
 +  }
 +  else
 +    sprintf(keywords,"PRECISE GEO-OK CHARGE=%d SINGLET GRAD %s C.I.=(%d,%d) root=2 MECI \n",
 +          qm->QMcharge,
 +          eQMmethod_names[qm->QMmethod],
 +          qm->CASorbitals,qm->CASelectrons/2);
 +  F77_FUNC(domldt,DOMLDT)(&qm->nrQMatoms,qm->atomicnumberQM,keywords);
 +  fprintf(stderr,"keywords are: %s\n",keywords);
 +  free(keywords);
 +  
 +} /* init_mopac */
 +
 +real call_mopac(t_commrec *cr, t_forcerec *fr, t_QMrec *qm, t_MMrec *mm, 
 +              rvec f[], rvec fshift[])
 +{
 +  /* do the actual QMMM calculation using directly linked mopac subroutines 
 +   */
 +  double /* always double as the MOPAC routines are always compiled in
 +          double precision! */
 +    *qmcrd=NULL,*qmchrg=NULL,*mmcrd=NULL,*mmchrg=NULL,
 +    *qmgrad,*mmgrad=NULL,energy;
 +  int
 +    i,j;
 +  real
 +    QMener=0.0;
 +  snew(qmcrd, 3*(qm->nrQMatoms));
 +  snew(qmgrad,3*(qm->nrQMatoms));
 +  /* copy the data from qr into the arrays that are going to be used
 +   * in the fortran routines of MOPAC
 +   */
 +  for(i=0;i<qm->nrQMatoms;i++){
 +    for (j=0;j<DIM;j++){
 +      qmcrd[3*i+j] = (double)qm->xQM[i][j]*10;
 +    }
 +  }
 +  if(mm->nrMMatoms){
 +    /* later we will add the point charges here. There are some
 +     * conceptual problems with semi-empirical QM in combination with
 +     * point charges that we need to solve first....  
 +     */
 +    gmx_fatal(FARGS,"At present only ONIOM is allowed in combination"
 +              " with MOPAC QM subroutines\n");
 +  }
 +  else {
 +    /* now compute the energy and the gradients.
 +     */
 +      
 +    snew(qmchrg,qm->nrQMatoms);    
 +    F77_FUNC(domop,DOMOP)(&qm->nrQMatoms,qmcrd,&mm->nrMMatoms,
 +         mmchrg,mmcrd,qmgrad,mmgrad,&energy,qmchrg);
 +    /* add the gradients to the f[] array, and also to the fshift[].
 +     * the mopac gradients are in kCal/angstrom.
 +     */
 +    for(i=0;i<qm->nrQMatoms;i++){
 +      for(j=0;j<DIM;j++){
 +      f[i][j]       = (real)10*CAL2JOULE*qmgrad[3*i+j];
 +      fshift[i][j]  = (real)10*CAL2JOULE*qmgrad[3*i+j];
 +      }
 +    }
 +    QMener = (real)CAL2JOULE*energy;
 +    /* do we do something with the mulliken charges?? */
 +
 +    free(qmchrg);
 +  }
 +  free(qmgrad);
 +  free(qmcrd);
 +  return (QMener);
 +}
 +
 +real call_mopac_SH(t_commrec *cr, t_forcerec *fr, t_QMrec *qm, t_MMrec *mm, 
 +                 rvec f[], rvec fshift[])
 +{
 +  /* do the actual SH QMMM calculation using directly linked mopac
 +   subroutines */
 +
 +  double /* always double as the MOPAC routines are always compiled in
 +          double precision! */
 +    *qmcrd=NULL,*qmchrg=NULL,*mmcrd=NULL,*mmchrg=NULL,
 +    *qmgrad,*mmgrad=NULL,energy;
 +  int
 +    i,j;
 +  real
 +    QMener=0.0;
 +
 +  snew(qmcrd, 3*(qm->nrQMatoms));
 +  snew(qmgrad,3*(qm->nrQMatoms));
 +  /* copy the data from qr into the arrays that are going to be used
 +   * in the fortran routines of MOPAC
 +   */
 +  for(i=0;i<qm->nrQMatoms;i++){
 +    for (j=0;j<DIM;j++){
 +      qmcrd[3*i+j] = (double)qm->xQM[i][j]*10;
 +    }
 +  }
 +  if(mm->nrMMatoms){
 +    /* later we will add the point charges here. There are some
 +     * conceptual problems with semi-empirical QM in combination with
 +     * point charges that we need to solve first....  
 +     */
 +    gmx_fatal(FARGS,"At present only ONIOM is allowed in combination with MOPAC\n");
 +  }
 +  else {
 +    /* now compute the energy and the gradients.
 +     */
 +    snew(qmchrg,qm->nrQMatoms);    
 +
 +    F77_FUNC(domop,DOMOP)(&qm->nrQMatoms,qmcrd,&mm->nrMMatoms,
 +         mmchrg,mmcrd,qmgrad,mmgrad,&energy,qmchrg);
 +    /* add the gradients to the f[] array, and also to the fshift[].
 +     * the mopac gradients are in kCal/angstrom.
 +     */
 +    for(i=0;i<qm->nrQMatoms;i++){
 +      for(j=0;j<DIM;j++){
 +      f[i][j]      = (real)10*CAL2JOULE*qmgrad[3*i+j];
 +      fshift[i][j] = (real)10*CAL2JOULE*qmgrad[3*i+j];
 +      }
 +    }
 +    QMener = (real)CAL2JOULE*energy;
 +  }
 +  free(qmgrad);
 +  free(qmcrd);
 +  return (QMener);
 +} /* call_mopac_SH */
 +
 +#else
 +int
 +gmx_qmmm_mopac_empty;
 +#endif
index 932fc34f225b47acba1c4b643a6cbb580bda7676,0000000000000000000000000000000000000000..87cf5630e1618bcb78e068bbd09383597a10e290
mode 100644,000000..100644
--- /dev/null
@@@ -1,1952 -1,0 +1,2047 @@@
-   /* The random state */
-   gmx_rng_t gaussrand;
 +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
 + *
 + *
 + *                This source code is part of
 + *
 + *                 G   R   O   M   A   C   S
 + *
 + *          GROningen MAchine for Chemical Simulations
 + *
 + *                        VERSION 3.2.0
 + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
 + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 + * Copyright (c) 2001-2004, The GROMACS development team,
 + * check out http://www.gromacs.org for more information.
 +
 + * This program is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU General Public License
 + * as published by the Free Software Foundation; either version 2
 + * of the License, or (at your option) any later version.
 + *
 + * If you want to redistribute modifications, please consider that
 + * scientific software is very special. Version control is crucial -
 + * bugs must be traceable. We will be happy to consider code for
 + * inclusion in the official distribution, but derived work must not
 + * be called official GROMACS. Details are found in the README & COPYING
 + * files - if they are missing, get the official version at www.gromacs.org.
 + *
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the papers on the package - you can find them in the top README file.
 + *
 + * For more info, check our website at http://www.gromacs.org
 + *
 + * And Hey:
 + * GROwing Monsters And Cloning Shrimps
 + */
 +#ifdef HAVE_CONFIG_H
 +#include <config.h>
 +#endif
 +
 +
 +#include <stdio.h>
 +#include <math.h>
 +
 +#include "types/commrec.h"
 +#include "sysstuff.h"
 +#include "smalloc.h"
 +#include "typedefs.h"
 +#include "nrnb.h"
 +#include "physics.h"
 +#include "macros.h"
 +#include "vec.h"
 +#include "main.h"
 +#include "confio.h"
 +#include "update.h"
 +#include "gmx_random.h"
 +#include "futil.h"
 +#include "mshift.h"
 +#include "tgroup.h"
 +#include "force.h"
 +#include "names.h"
 +#include "txtdump.h"
 +#include "mdrun.h"
 +#include "copyrite.h"
 +#include "constr.h"
 +#include "edsam.h"
 +#include "pull.h"
 +#include "disre.h"
 +#include "orires.h"
 +#include "gmx_wallcycle.h"
 +#include "gmx_omp_nthreads.h"
++#include "gmx_omp.h"
 +
 +/* For debugging, start at v(-dt/2) for velocity Verlet -- uncomment next line */
 +/*#define STARTFROMDT2*/
 +
 +typedef struct {
 +  double gdt;
 +  double eph;
 +  double emh;
 +  double em;
 +  double b;
 +  double c;
 +  double d;
 +} gmx_sd_const_t;
 +
 +typedef struct {
 +  real V;
 +  real X;
 +  real Yv;
 +  real Yx;
 +} gmx_sd_sigma_t;
 +
 +typedef struct {
- static gmx_stochd_t *init_stochd(FILE *fplog,t_inputrec *ir)
++  /* The random state for ngaussrand threads.
++   * Normal thermostats need just 1 random number generator,
++   * but SD and BD with OpenMP parallelization need 1 for each thread.
++   */
++  int ngaussrand;
++  gmx_rng_t *gaussrand;
 +  /* BD stuff */
 +  real *bd_rf;
 +  /* SD stuff */
 +  gmx_sd_const_t *sdc;
 +  gmx_sd_sigma_t *sdsig;
 +  rvec *sd_V;
 +  int  sd_V_nalloc;
 +  /* andersen temperature control stuff */
 +  gmx_bool *randomize_group;
 +  real *boltzfac;
 +} gmx_stochd_t;
 +
 +typedef struct gmx_update
 +{
 +    gmx_stochd_t *sd;
 +    /* xprime for constraint algorithms */
 +    rvec *xp;
 +    int  xp_nalloc;
 +
 +    /* variable size arrays for andersen */
 +    gmx_bool *randatom;
 +    int *randatom_list;
 +    gmx_bool randatom_list_init;
 +
 +    /* Variables for the deform algorithm */
 +    gmx_large_int_t deformref_step;
 +    matrix     deformref_box;
 +} t_gmx_update;
 +
 +
 +static void do_update_md(int start,int nrend,double dt,
 +                         t_grp_tcstat *tcstat,
 +                         double nh_vxi[],
 +                         gmx_bool bNEMD,t_grp_acc *gstat,rvec accel[],
 +                         ivec nFreeze[],
 +                         real invmass[],
 +                         unsigned short ptype[],unsigned short cFREEZE[],
 +                         unsigned short cACC[],unsigned short cTC[],
 +                         rvec x[],rvec xprime[],rvec v[],
 +                         rvec f[],matrix M,
 +                         gmx_bool bNH,gmx_bool bPR)
 +{
 +  double imass,w_dt;
 +  int    gf=0,ga=0,gt=0;
 +  rvec   vrel;
 +  real   vn,vv,va,vb,vnrel;
 +  real   lg,vxi=0,u;
 +  int    n,d;
 +
 +  if (bNH || bPR)
 +  {
 +      /* Update with coupling to extended ensembles, used for
 +       * Nose-Hoover and Parrinello-Rahman coupling
 +       * Nose-Hoover uses the reversible leap-frog integrator from
 +       * Holian et al. Phys Rev E 52(3) : 2338, 1995
 +       */
 +      for(n=start; n<nrend; n++)
 +      {
 +          imass = invmass[n];
 +          if (cFREEZE)
 +          {
 +              gf   = cFREEZE[n];
 +          }
 +          if (cACC)
 +          {
 +              ga   = cACC[n];
 +          }
 +          if (cTC)
 +          {
 +              gt   = cTC[n];
 +          }
 +          lg   = tcstat[gt].lambda;
 +          if (bNH) {
 +              vxi   = nh_vxi[gt];
 +          }
 +          rvec_sub(v[n],gstat[ga].u,vrel);
 +
 +          for(d=0; d<DIM; d++)
 +          {
 +              if((ptype[n] != eptVSite) && (ptype[n] != eptShell) && !nFreeze[gf][d])
 +              {
 +                  vnrel = (lg*vrel[d] + dt*(imass*f[n][d] - 0.5*vxi*vrel[d]
 +                                            - iprod(M[d],vrel)))/(1 + 0.5*vxi*dt);
 +                  /* do not scale the mean velocities u */
 +                  vn             = gstat[ga].u[d] + accel[ga][d]*dt + vnrel;
 +                  v[n][d]        = vn;
 +                  xprime[n][d]   = x[n][d]+vn*dt;
 +              }
 +              else
 +              {
 +                  v[n][d]        = 0.0;
 +                  xprime[n][d]   = x[n][d];
 +              }
 +          }
 +      }
 +  } 
 +  else if (cFREEZE != NULL ||
 +           nFreeze[0][XX] || nFreeze[0][YY] || nFreeze[0][ZZ] ||
 +           bNEMD)
 +  {
 +      /* Update with Berendsen/v-rescale coupling and freeze or NEMD */
 +      for(n=start; n<nrend; n++) 
 +      {
 +          w_dt = invmass[n]*dt;
 +          if (cFREEZE)
 +          {
 +              gf   = cFREEZE[n];
 +          }
 +          if (cACC)
 +          {
 +              ga   = cACC[n];
 +          }
 +          if (cTC)
 +          {
 +              gt   = cTC[n];
 +          }
 +          lg   = tcstat[gt].lambda;
 +
 +          for(d=0; d<DIM; d++)
 +          {
 +              vn             = v[n][d];
 +              if((ptype[n] != eptVSite) && (ptype[n] != eptShell) && !nFreeze[gf][d])
 +              {
 +                  vv             = lg*vn + f[n][d]*w_dt;
 +
 +                  /* do not scale the mean velocities u */
 +                  u              = gstat[ga].u[d];
 +                  va             = vv + accel[ga][d]*dt;
 +                  vb             = va + (1.0-lg)*u;
 +                  v[n][d]        = vb;
 +                  xprime[n][d]   = x[n][d]+vb*dt;
 +              }
 +              else
 +              {
 +                  v[n][d]        = 0.0;
 +                  xprime[n][d]   = x[n][d];
 +              }
 +          }
 +      }
 +  }
 +    else
 +    {
 +        /* Plain update with Berendsen/v-rescale coupling */
 +        for(n=start; n<nrend; n++) 
 +        {
 +            if ((ptype[n] != eptVSite) && (ptype[n] != eptShell))
 +            {
 +                w_dt = invmass[n]*dt;
 +                if (cTC)
 +                {
 +                    gt = cTC[n];
 +                }
 +                lg = tcstat[gt].lambda;
 +
 +                for(d=0; d<DIM; d++)
 +                {
 +                    vn           = lg*v[n][d] + f[n][d]*w_dt;
 +                    v[n][d]      = vn;
 +                    xprime[n][d] = x[n][d] + vn*dt;
 +                }
 +            }
 +            else
 +            {
 +                for(d=0; d<DIM; d++)
 +                {
 +                    v[n][d]        = 0.0;
 +                    xprime[n][d]   = x[n][d];
 +                }
 +            }
 +        }
 +    }
 +}
 +
 +static void do_update_vv_vel(int start,int nrend,double dt,
 +                             t_grp_tcstat *tcstat,t_grp_acc *gstat,
 +                             rvec accel[],ivec nFreeze[],real invmass[],
 +                             unsigned short ptype[],unsigned short cFREEZE[],
 +                             unsigned short cACC[],rvec v[],rvec f[],
 +                             gmx_bool bExtended, real veta, real alpha)
 +{
 +    double imass,w_dt;
 +    int    gf=0,ga=0;
 +    rvec   vrel;
 +    real   u,vn,vv,va,vb,vnrel;
 +    int    n,d;
 +    double g,mv1,mv2;
 +
 +    if (bExtended)
 +    {
 +        g        = 0.25*dt*veta*alpha;
 +        mv1      = exp(-g);
 +        mv2      = series_sinhx(g);
 +    }
 +    else
 +    {
 +        mv1      = 1.0;
 +        mv2      = 1.0;
 +    }
 +    for(n=start; n<nrend; n++)
 +    {
 +        w_dt = invmass[n]*dt;
 +        if (cFREEZE)
 +        {
 +            gf   = cFREEZE[n];
 +        }
 +        if (cACC)
 +        {
 +            ga   = cACC[n];
 +        }
 +
 +        for(d=0; d<DIM; d++)
 +        {
 +            if((ptype[n] != eptVSite) && (ptype[n] != eptShell) && !nFreeze[gf][d])
 +            {
 +                v[n][d]             = mv1*(mv1*v[n][d] + 0.5*(w_dt*mv2*f[n][d]))+0.5*accel[ga][d]*dt;
 +            }
 +            else
 +            {
 +                v[n][d]        = 0.0;
 +            }
 +        }
 +    }
 +} /* do_update_vv_vel */
 +
 +static void do_update_vv_pos(int start,int nrend,double dt,
 +                             t_grp_tcstat *tcstat,t_grp_acc *gstat,
 +                             rvec accel[],ivec nFreeze[],real invmass[],
 +                             unsigned short ptype[],unsigned short cFREEZE[],
 +                             rvec x[],rvec xprime[],rvec v[],
 +                             rvec f[],gmx_bool bExtended, real veta, real alpha)
 +{
 +  double imass,w_dt;
 +  int    gf=0;
 +  int    n,d;
 +  double g,mr1,mr2;
 +
 +  /* Would it make more sense if Parrinello-Rahman were put here? */
 +  if (bExtended)
 +  {
 +      g        = 0.5*dt*veta;
 +      mr1      = exp(g);
 +      mr2      = series_sinhx(g);
 +  } else {
 +      mr1      = 1.0;
 +      mr2      = 1.0;
 +  }
 +
 +  for(n=start; n<nrend; n++) {
 +
 +      if (cFREEZE)
 +      {
 +          gf   = cFREEZE[n];
 +      }
 +
 +      for(d=0; d<DIM; d++)
 +      {
 +          if ((ptype[n] != eptVSite) && (ptype[n] != eptShell) && !nFreeze[gf][d])
 +          {
 +              xprime[n][d]   = mr1*(mr1*x[n][d]+mr2*dt*v[n][d]);
 +          }
 +          else
 +          {
 +              xprime[n][d]   = x[n][d];
 +          }
 +      }
 +  }
 +}/* do_update_vv_pos */
 +
 +static void do_update_visc(int start,int nrend,double dt,
 +                           t_grp_tcstat *tcstat,
 +                           double nh_vxi[],
 +                           real invmass[],
 +                           unsigned short ptype[],unsigned short cTC[],
 +                           rvec x[],rvec xprime[],rvec v[],
 +                           rvec f[],matrix M,matrix box,real
 +                           cos_accel,real vcos,
 +                           gmx_bool bNH,gmx_bool bPR)
 +{
 +    double imass,w_dt;
 +    int    gt=0;
 +    real   vn,vc;
 +    real   lg,vxi=0,vv;
 +    real   fac,cosz;
 +    rvec   vrel;
 +    int    n,d;
 +
 +    fac = 2*M_PI/(box[ZZ][ZZ]);
 +
 +    if (bNH || bPR) {
 +        /* Update with coupling to extended ensembles, used for
 +         * Nose-Hoover and Parrinello-Rahman coupling
 +         */
 +        for(n=start; n<nrend; n++) {
 +            imass = invmass[n];
 +            if (cTC)
 +            {
 +                gt   = cTC[n];
 +            }
 +            lg   = tcstat[gt].lambda;
 +            cosz = cos(fac*x[n][ZZ]);
 +
 +            copy_rvec(v[n],vrel);
 +
 +            vc            = cosz*vcos;
 +            vrel[XX]     -= vc;
 +            if (bNH)
 +            {
 +                vxi        = nh_vxi[gt];
 +            }
 +            for(d=0; d<DIM; d++)
 +            {
 +                vn             = v[n][d];
 +
 +                if((ptype[n] != eptVSite) && (ptype[n] != eptShell))
 +                {
 +                    vn  = (lg*vrel[d] + dt*(imass*f[n][d] - 0.5*vxi*vrel[d]
 +                                            - iprod(M[d],vrel)))/(1 + 0.5*vxi*dt);
 +                    if(d == XX)
 +                    {
 +                        vn += vc + dt*cosz*cos_accel;
 +                    }
 +                    v[n][d]        = vn;
 +                    xprime[n][d]   = x[n][d]+vn*dt;
 +                }
 +                else
 +                {
 +                    xprime[n][d]   = x[n][d];
 +                }
 +            }
 +        }
 +    }
 +    else
 +    {
 +        /* Classic version of update, used with berendsen coupling */
 +        for(n=start; n<nrend; n++)
 +        {
 +            w_dt = invmass[n]*dt;
 +            if (cTC)
 +            {
 +                gt   = cTC[n];
 +            }
 +            lg   = tcstat[gt].lambda;
 +            cosz = cos(fac*x[n][ZZ]);
 +
 +            for(d=0; d<DIM; d++)
 +            {
 +                vn             = v[n][d];
 +
 +                if((ptype[n] != eptVSite) && (ptype[n] != eptShell))
 +                {
 +                    if(d == XX)
 +                    {
 +                        vc           = cosz*vcos;
 +                        /* Do not scale the cosine velocity profile */
 +                        vv           = vc + lg*(vn - vc + f[n][d]*w_dt);
 +                        /* Add the cosine acceleration profile */
 +                        vv          += dt*cosz*cos_accel;
 +                    }
 +                    else
 +                    {
 +                        vv           = lg*(vn + f[n][d]*w_dt);
 +                    }
 +                    v[n][d]        = vv;
 +                    xprime[n][d]   = x[n][d]+vv*dt;
 +                }
 +                else
 +                {
 +                    v[n][d]        = 0.0;
 +                    xprime[n][d]   = x[n][d];
 +                }
 +            }
 +        }
 +    }
 +}
 +
-     int  ngtc,n;
++/* Allocates and initializes sd->gaussrand[i] for 1 <= i < sd->ngaussrand,
++ * using seeds generated from sd->gaussrand[0].
++ */
++static void init_multiple_gaussrand(gmx_stochd_t *sd)
++{
++    int          ngr,i;
++    unsigned int *seed;
++
++    ngr = sd->ngaussrand;
++    snew(seed,ngr);
++
++    for(i=1; i<ngr; i++)
++    {
++        seed[i] = gmx_rng_uniform_uint32(sd->gaussrand[0]);
++    }
++
++#pragma omp parallel num_threads(ngr)
++    {
++        int th;
++
++        th = gmx_omp_get_thread_num();
++        if (th > 0)
++        {
++            /* Initialize on each thread to have thread-local memory allocated */
++            sd->gaussrand[th] = gmx_rng_init(seed[th]);
++        }
++    }
++
++    sfree(seed);
++}
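
The seeding scheme is easiest to see in isolation. The sketch below is illustrative only and substitutes C's srand()/rand() for the gmx_rng API: a single master generator, seeded like ir->ld_seed, serially produces one seed per extra thread, so the child streams are reproducible and decorrelated. Initializing each child generator inside the parallel region, as done above, additionally places its state in thread-local (first-touch) memory:

#include <stdio.h>
#include <stdlib.h>

#define NTHREADS 4

int main(void)
{
    unsigned int seed[NTHREADS];
    int          th;

    srand(1993); /* master seed, playing the role of ir->ld_seed */

    /* Child seeds are drawn serially from the master stream,
     * one per extra thread, before any parallel work starts.
     */
    for (th = 1; th < NTHREADS; th++)
    {
        seed[th] = (unsigned int)rand();
        printf("thread %d gets seed %u\n", th, seed[th]);
    }
    return 0;
}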
++
++static gmx_stochd_t *init_stochd(FILE *fplog,t_inputrec *ir,int nthreads)
 +{
 +    gmx_stochd_t *sd;
 +    gmx_sd_const_t *sdc;
-     sd->gaussrand = gmx_rng_init(ir->ld_seed);
++    int  ngtc,n,th;
 +    real y;
 +
 +    snew(sd,1);
 +
 +    /* Initiate random number generator for langevin type dynamics,
 +     * for BD, SD or velocity rescaling temperature coupling.
 +     */
-     gmx_rng_get_state(upd->sd->gaussrand,state->ld_rng,state->ld_rngi);
++    if (ir->eI == eiBD || EI_SD(ir->eI))
++    {
++        sd->ngaussrand = nthreads;
++    }
++    else
++    {
++        sd->ngaussrand = 1;
++    }
++    snew(sd->gaussrand,sd->ngaussrand);
++
++    /* Initialize the first random generator */
++    sd->gaussrand[0] = gmx_rng_init(ir->ld_seed);
++
++    if (sd->ngaussrand > 1)
++    {
++        /* Initialize the rest of the random number generators,
++         * using the first one to generate seeds.
++         */
++        init_multiple_gaussrand(sd);
++    }
 +
 +    ngtc = ir->opts.ngtc;
 +
 +    if (ir->eI == eiBD)
 +    {
 +        snew(sd->bd_rf,ngtc);
 +    }
 +    else if (EI_SD(ir->eI))
 +    {
 +        snew(sd->sdc,ngtc);
 +        snew(sd->sdsig,ngtc);
 +
 +        sdc = sd->sdc;
 +        for(n=0; n<ngtc; n++)
 +        {
 +            if (ir->opts.tau_t[n] > 0)
 +            {
 +                sdc[n].gdt = ir->delta_t/ir->opts.tau_t[n];
 +                sdc[n].eph = exp(sdc[n].gdt/2);
 +                sdc[n].emh = exp(-sdc[n].gdt/2);
 +                sdc[n].em  = exp(-sdc[n].gdt);
 +            }
 +            else
 +            {
 +                /* No friction and noise on this group */
 +                sdc[n].gdt = 0;
 +                sdc[n].eph = 1;
 +                sdc[n].emh = 1;
 +                sdc[n].em  = 1;
 +            }
 +            if (sdc[n].gdt >= 0.05)
 +            {
 +                sdc[n].b = sdc[n].gdt*(sdc[n].eph*sdc[n].eph - 1)
 +                    - 4*(sdc[n].eph - 1)*(sdc[n].eph - 1);
 +                sdc[n].c = sdc[n].gdt - 3 + 4*sdc[n].emh - sdc[n].em;
 +                sdc[n].d = 2 - sdc[n].eph - sdc[n].emh;
 +            }
 +            else
 +            {
 +                y = sdc[n].gdt/2;
 +                /* Seventh order expansions for small y */
 +                sdc[n].b = y*y*y*y*(1/3.0+y*(1/3.0+y*(17/90.0+y*7/9.0)));
 +                sdc[n].c = y*y*y*(2/3.0+y*(-1/2.0+y*(7/30.0+y*(-1/12.0+y*31/1260.0))));
 +                sdc[n].d = y*y*(-1+y*y*(-1/12.0-y*y/360.0));
 +            }
 +            if(debug)
 +                fprintf(debug,"SD const tc-grp %d: b %g  c %g  d %g\n",
 +                        n,sdc[n].b,sdc[n].c,sdc[n].d);
 +        }
 +    }
 +    else if (ETC_ANDERSEN(ir->etc))
 +    {
 +        int ngtc;
 +        t_grpopts *opts;
 +        real reft;
 +
 +        opts = &ir->opts;
 +        ngtc = opts->ngtc;
 +
 +        snew(sd->randomize_group,ngtc);
 +        snew(sd->boltzfac,ngtc);
 +
 +        /* For now, assume that all groups, if randomized, are randomized at the same
 +         * rate, i.e. tau_t is the same, since constraint groups don't necessarily
 +         * match up with temperature groups. This is checked in readir.c.
 +         */
 +
 +        for (n=0;n<ngtc;n++) {
 +            reft = max(0.0,opts->ref_t[n]);
 +            if ((opts->tau_t[n] > 0) && (reft > 0))  /* tau_t or ref_t = 0 means that no randomization is done */
 +            {
 +                sd->randomize_group[n] = TRUE;
 +                sd->boltzfac[n] = BOLTZ*opts->ref_t[n];
 +            } else {
 +                sd->randomize_group[n] = FALSE;
 +            }
 +        }
 +    }
 +    return sd;
 +}
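
The small-gdt branch above exists because the closed-form expressions for b, c and d suffer severe cancellation when gdt is small; the series expansions avoid this. A standalone check (a sketch reusing the two formulas for b verbatim) shows both the agreement and the size of the cancellation at gdt = 0.02, where b is of order 3e-9 while the terms being subtracted are of order 4e-4:

#include <stdio.h>
#include <math.h>

int main(void)
{
    double gdt = 0.02, y = gdt/2;
    double eph = exp(gdt/2);

    /* closed form, as used for gdt >= 0.05 */
    double b_exact  = gdt*(eph*eph - 1) - 4*(eph - 1)*(eph - 1);
    /* seventh-order expansion, as used for small gdt */
    double b_series = y*y*y*y*(1/3.0 + y*(1/3.0 + y*(17/90.0 + y*7/9.0)));

    printf("b exact  %.12e\n", b_exact);
    printf("b series %.12e\n", b_series);
    return 0;
}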
 +
 +void get_stochd_state(gmx_update_t upd,t_state *state)
 +{
-     gmx_rng_set_state(upd->sd->gaussrand,state->ld_rng,state->ld_rngi[0]);
++    /* Note that we only get the state of the first random generator,
++     * even if there are multiple. This avoids repetition.
++     */
++    gmx_rng_get_state(upd->sd->gaussrand[0],state->ld_rng,state->ld_rngi);
 +}
 +
 +void set_stochd_state(gmx_update_t upd,t_state *state)
 +{
-         upd->sd = init_stochd(fplog,ir);
++    gmx_stochd_t *sd;
++    int i;
++
++    sd = upd->sd;
++
++    gmx_rng_set_state(sd->gaussrand[0],state->ld_rng,state->ld_rngi[0]);
++
++    if (sd->ngaussrand > 1)
++    {
++        /* We only end up here with SD or BD with OpenMP.
++         * Destroy and reinitialize the rest of the random number generators,
++         * using seeds generated from the first one.
++         * Although this doesn't recover the previous state,
++         * it at least avoids repetition, which is most important.
++         * Exactly restoring the states for all MPI+OpenMP setups is difficult,
++         * and, as the integrator is stochastic to start with, would not gain us much.
++         */
++        for(i=1; i<sd->ngaussrand; i++)
++        {
++            gmx_rng_destroy(sd->gaussrand[i]);
++        }
++
++        init_multiple_gaussrand(sd);
++    }
 +}
 +
 +gmx_update_t init_update(FILE *fplog,t_inputrec *ir)
 +{
 +    t_gmx_update *upd;
 +
 +    snew(upd,1);
 +
 +    if (ir->eI == eiBD || EI_SD(ir->eI) || ir->etc == etcVRESCALE || ETC_ANDERSEN(ir->etc))
 +    {
-                           int start,int homenr,double dt,
++        upd->sd = init_stochd(fplog,ir,gmx_omp_nthreads_get(emntUpdate));
 +    }
 +
 +    upd->xp = NULL;
 +    upd->xp_nalloc = 0;
 +    upd->randatom = NULL;
 +    upd->randatom_list = NULL;
 +    upd->randatom_list_init = FALSE; /* we have not yet cleared the data structure at this point */
 +
 +    return upd;
 +}
 +
 +static void do_update_sd1(gmx_stochd_t *sd,
-   gmx_rng_t gaussrand;
++                          gmx_rng_t gaussrand,
++                          int start,int nrend,double dt,
 +                          rvec accel[],ivec nFreeze[],
 +                          real invmass[],unsigned short ptype[],
 +                          unsigned short cFREEZE[],unsigned short cACC[],
 +                          unsigned short cTC[],
 +                          rvec x[],rvec xprime[],rvec v[],rvec f[],
 +                          rvec sd_X[],
 +                          int ngtc,real tau_t[],real ref_t[])
 +{
 +  gmx_sd_const_t *sdc;
 +  gmx_sd_sigma_t *sig;
-   if (homenr > sd->sd_V_nalloc)
 +  real   kT;
 +  int    gf=0,ga=0,gt=0;
 +  real   ism,sd_V;
 +  int    n,d;
 +
 +  sdc = sd->sdc;
 +  sig = sd->sdsig;
-       sd->sd_V_nalloc = over_alloc_dd(homenr);
++  if (nrend-start > sd->sd_V_nalloc)
 +  {
-   gaussrand = sd->gaussrand;
++      sd->sd_V_nalloc = over_alloc_dd(nrend-start);
 +      srenew(sd->sd_V,sd->sd_V_nalloc);
 +  }
-   for(n=start; n<start+homenr; n++)
 +
 +  for(n=0; n<ngtc; n++)
 +  {
 +      kT = BOLTZ*ref_t[n];
 +      /* The mass is accounted for later, since it differs per atom */
 +      sig[n].V  = sqrt(kT*(1 - sdc[n].em*sdc[n].em));
 +  }
 +
- static void do_update_sd2(gmx_stochd_t *sd,gmx_bool bInitStep,
-                           int start,int homenr,
++  for(n=start; n<nrend; n++)
 +  {
 +      ism = sqrt(invmass[n]);
 +      if (cFREEZE)
 +      {
 +          gf  = cFREEZE[n];
 +      }
 +      if (cACC)
 +      {
 +          ga  = cACC[n];
 +      }
 +      if (cTC)
 +      {
 +          gt  = cTC[n];
 +      }
 +
 +      for(d=0; d<DIM; d++)
 +      {
 +          if((ptype[n] != eptVSite) && (ptype[n] != eptShell) && !nFreeze[gf][d])
 +          {
 +              sd_V = ism*sig[gt].V*gmx_rng_gaussian_table(gaussrand);
 +
 +              v[n][d] = v[n][d]*sdc[gt].em
 +                  + (invmass[n]*f[n][d] + accel[ga][d])*tau_t[gt]*(1 - sdc[gt].em)
 +                  + sd_V;
 +
 +              xprime[n][d] = x[n][d] + v[n][d]*dt;
 +          }
 +          else
 +          {
 +              v[n][d]      = 0.0;
 +              xprime[n][d] = x[n][d];
 +          }
 +      }
 +  }
 +}
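
In equation form, the velocity update implemented above is (a transcription of what the code computes, with \tau = tau_t[gt], k_B T = BOLTZ*ref_t[gt], em = exp(-\Delta t/\tau) and \xi a unit Gaussian from gmx_rng_gaussian_table):

    v' = v\,e^{-\Delta t/\tau}
       + \left(\frac{f}{m} + a\right)\tau\left(1 - e^{-\Delta t/\tau}\right)
       + \sqrt{\frac{k_B T}{m}\left(1 - e^{-2\Delta t/\tau}\right)}\,\xi,
    \qquad x' = x + v'\,\Delta t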
 +
-   gmx_rng_t gaussrand;
++static void do_update_sd2(gmx_stochd_t *sd,
++                          gmx_rng_t gaussrand,
++                          gmx_bool bInitStep,
++                          int start,int nrend,
 +                          rvec accel[],ivec nFreeze[],
 +                          real invmass[],unsigned short ptype[],
 +                          unsigned short cFREEZE[],unsigned short cACC[],
 +                          unsigned short cTC[],
 +                          rvec x[],rvec xprime[],rvec v[],rvec f[],
 +                          rvec sd_X[],
 +                          int ngtc,real tau_t[],real ref_t[],
 +                          gmx_bool bFirstHalf)
 +{
 +  gmx_sd_const_t *sdc;
 +  gmx_sd_sigma_t *sig;
 +  /* The random part of the velocity update, generated in the first
 +   * half of the update, needs to be remembered for the second half.
 +   */
 +  rvec *sd_V;
-   if (homenr > sd->sd_V_nalloc)
 +  real   kT;
 +  int    gf=0,ga=0,gt=0;
 +  real   vn=0,Vmh,Xmh;
 +  real   ism;
 +  int    n,d;
 +
 +  sdc = sd->sdc;
 +  sig = sd->sdsig;
-       sd->sd_V_nalloc = over_alloc_dd(homenr);
++  if (nrend-start > sd->sd_V_nalloc)
 +  {
-   gaussrand = sd->gaussrand;
++      sd->sd_V_nalloc = over_alloc_dd(nrend-start);
 +      srenew(sd->sd_V,sd->sd_V_nalloc);
 +  }
 +  sd_V = sd->sd_V;
-   for (n=start; n<start+homenr; n++)
 +
 +  if (bFirstHalf)
 +  {
 +      for (n=0; n<ngtc; n++)
 +      {
 +          kT = BOLTZ*ref_t[n];
 +          /* The mass is accounted for later, since it differs per atom */
 +          sig[n].V  = sqrt(kT*(1-sdc[n].em));
 +          sig[n].X  = sqrt(kT*sqr(tau_t[n])*sdc[n].c);
 +          sig[n].Yv = sqrt(kT*sdc[n].b/sdc[n].c);
 +          sig[n].Yx = sqrt(kT*sqr(tau_t[n])*sdc[n].b/(1-sdc[n].em));
 +      }
 +  }
 +
-                             state->therm_integral,upd->sd->gaussrand);
++  for (n=start; n<nrend; n++)
 +  {
 +      ism = sqrt(invmass[n]);
 +      if (cFREEZE)
 +      {
 +          gf  = cFREEZE[n];
 +      }
 +      if (cACC)
 +      {
 +          ga  = cACC[n];
 +      }
 +      if (cTC)
 +      {
 +          gt  = cTC[n];
 +      }
 +
 +      for(d=0; d<DIM; d++)
 +      {
 +          if (bFirstHalf)
 +          {
 +              vn             = v[n][d];
 +          }
 +          if((ptype[n] != eptVSite) && (ptype[n] != eptShell) && !nFreeze[gf][d])
 +          {
 +              if (bFirstHalf)
 +              {
 +                  if (bInitStep)
 +                  {
 +                      sd_X[n][d] = ism*sig[gt].X*gmx_rng_gaussian_table(gaussrand);
 +                  }
 +                  Vmh = sd_X[n][d]*sdc[gt].d/(tau_t[gt]*sdc[gt].c)
 +                      + ism*sig[gt].Yv*gmx_rng_gaussian_table(gaussrand);
 +                  sd_V[n-start][d] = ism*sig[gt].V*gmx_rng_gaussian_table(gaussrand);
 +
 +                  v[n][d] = vn*sdc[gt].em
 +                      + (invmass[n]*f[n][d] + accel[ga][d])*tau_t[gt]*(1 - sdc[gt].em)
 +                      + sd_V[n-start][d] - sdc[gt].em*Vmh;
 +
 +                  xprime[n][d] = x[n][d] + v[n][d]*tau_t[gt]*(sdc[gt].eph - sdc[gt].emh);
 +              }
 +              else
 +              {
 +
 +                  /* Correct the velocities for the constraints.
 +                   * This operation introduces some inaccuracy,
 +                   * since the velocity is determined from differences in coordinates.
 +                   */
 +                  v[n][d] =
 +                      (xprime[n][d] - x[n][d])/(tau_t[gt]*(sdc[gt].eph - sdc[gt].emh));
 +
 +                  Xmh = sd_V[n-start][d]*tau_t[gt]*sdc[gt].d/(sdc[gt].em-1)
 +                      + ism*sig[gt].Yx*gmx_rng_gaussian_table(gaussrand);
 +                  sd_X[n][d] = ism*sig[gt].X*gmx_rng_gaussian_table(gaussrand);
 +
 +                  xprime[n][d] += sd_X[n][d] - Xmh;
 +
 +              }
 +          }
 +          else
 +          {
 +              if (bFirstHalf)
 +              {
 +                  v[n][d]        = 0.0;
 +                  xprime[n][d]   = x[n][d];
 +              }
 +          }
 +      }
 +  }
 +}
 +
 +static void do_update_bd(int start,int nrend,double dt,
 +                         ivec nFreeze[],
 +                         real invmass[],unsigned short ptype[],
 +                         unsigned short cFREEZE[],unsigned short cTC[],
 +                         rvec x[],rvec xprime[],rvec v[],
 +                         rvec f[],real friction_coefficient,
 +                         int ngtc,real tau_t[],real ref_t[],
 +                         real *rf,gmx_rng_t gaussrand)
 +{
 +    /* Note -- these appear to be full-step velocities. */
 +    int    gf=0,gt=0;
 +    real   vn;
 +    real   invfr=0;
 +    int    n,d;
 +
 +    if (friction_coefficient != 0)
 +    {
 +        invfr = 1.0/friction_coefficient;
 +        for(n=0; n<ngtc; n++)
 +        {
 +            rf[n] = sqrt(2.0*BOLTZ*ref_t[n]/(friction_coefficient*dt));
 +        }
 +    }
 +    else
 +    {
 +        for(n=0; n<ngtc; n++)
 +        {
 +            rf[n] = sqrt(2.0*BOLTZ*ref_t[n]);
 +        }
 +    }
 +    for(n=start; (n<nrend); n++)
 +    {
 +        if (cFREEZE)
 +        {
 +            gf = cFREEZE[n];
 +        }
 +        if (cTC)
 +        {
 +            gt = cTC[n];
 +        }
 +        for(d=0; (d<DIM); d++)
 +        {
 +            if((ptype[n]!=eptVSite) && (ptype[n]!=eptShell) && !nFreeze[gf][d])
 +            {
 +                if (friction_coefficient != 0) {
 +                    vn = invfr*f[n][d] + rf[gt]*gmx_rng_gaussian_table(gaussrand);
 +                }
 +                else
 +                {
 +                    /* NOTE: invmass = 2/(mass*friction_constant*dt) */
 +                    vn = 0.5*invmass[n]*f[n][d]*dt 
 +                        + sqrt(0.5*invmass[n])*rf[gt]*gmx_rng_gaussian_table(gaussrand);
 +                }
 +
 +                v[n][d]      = vn;
 +                xprime[n][d] = x[n][d]+vn*dt;
 +            }
 +            else
 +            {
 +                v[n][d]      = 0.0;
 +                xprime[n][d] = x[n][d];
 +            }
 +        }
 +    }
 +}
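 +
 +/* A minimal sketch (hypothetical helper) of the explicit-friction branch of
 + * do_update_bd() above: since rf[gt] = sqrt(2*kT/(gamma*dt)), the position
 + * update xprime = x + vn*dt works out to the standard Brownian step
 + * x + (f/gamma)*dt + sqrt(2*kT*dt/gamma)*N(0,1).
 + */
 +static real bd_position_step(real x, real f, real gamma, real kT, real dt,
 +                             real gauss)
 +{
 +    return x + (f/gamma)*dt + sqrt(2.0*kT*dt/gamma)*gauss;
 +}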
 +
 +static void dump_it_all(FILE *fp,const char *title,
 +                        int natoms,rvec x[],rvec xp[],rvec v[],rvec f[])
 +{
 +#ifdef DEBUG
 +  if (fp)
 +  {
 +    fprintf(fp,"%s\n",title);
 +    pr_rvecs(fp,0,"x",x,natoms);
 +    pr_rvecs(fp,0,"xp",xp,natoms);
 +    pr_rvecs(fp,0,"v",v,natoms);
 +    pr_rvecs(fp,0,"f",f,natoms);
 +  }
 +#endif
 +}
 +
 +static void calc_ke_part_normal(rvec v[], t_grpopts *opts,t_mdatoms *md,
 +                                gmx_ekindata_t *ekind,t_nrnb *nrnb,gmx_bool bEkinAveVel,
 +                                gmx_bool bSaveEkinOld)
 +{
 +  int          g;
 +  t_grp_tcstat *tcstat=ekind->tcstat;
 +  t_grp_acc    *grpstat=ekind->grpstat;
 +  int          nthread,thread;
 +
 +  /* Three main cases: VV with AveVel, VV with AveEkin, leap-frog with AveEkin.
 +     Leap-frog with AveVel is also an option, but is not supported now.
 +     There is additional logic for the case where we are doing iterations.
 +     bEkinAveVel: if TRUE, we sum into ekin; if FALSE, into ekinh.
 +     bSaveEkinOld: if TRUE (the iteration case, i.e. bIterate is TRUE),
 +     we do not copy over ekinh_old; if FALSE, we overwrite it.
 +  */
 +
 +  /* Group velocities are calculated in update_ekindata and
 +   * accumulated in accumulate_groups.
 +   * Now compute the partial global and per-group ekin.
 +   */
 +  for(g=0; (g<opts->ngtc); g++)
 +  {
 +
 +      if (!bSaveEkinOld) {
 +          copy_mat(tcstat[g].ekinh,tcstat[g].ekinh_old);
 +      }
 +      if(bEkinAveVel) {
 +          clear_mat(tcstat[g].ekinf);
 +      } else {
 +          clear_mat(tcstat[g].ekinh);
 +      }
 +      if (bEkinAveVel) {
 +          tcstat[g].ekinscalef_nhc = 1.0;   /* need to clear this -- logic is complicated! */
 +      }
 +  }
 +  ekind->dekindl_old = ekind->dekindl;
 +  
 +  nthread = gmx_omp_nthreads_get(emntUpdate);
 +
 +#pragma omp parallel for num_threads(nthread) schedule(static)
 +    for(thread=0; thread<nthread; thread++)
 +    {
 +        int  start_t,end_t,n;
 +        int  ga,gt;
 +        rvec v_corrt;
 +        real hm;
 +        int  d,m;
 +        matrix *ekin_sum;
 +        real   *dekindl_sum;
 +
 +        start_t = md->start + ((thread+0)*md->homenr)/nthread;
 +        end_t   = md->start + ((thread+1)*md->homenr)/nthread;
 +
 +        ekin_sum    = ekind->ekin_work[thread];
 +        dekindl_sum = &ekind->ekin_work[thread][opts->ngtc][0][0];
 +
 +        for(gt=0; gt<opts->ngtc; gt++)
 +        {
 +            clear_mat(ekin_sum[gt]);
 +        }
 +
 +        ga = 0;
 +        gt = 0;
 +        for(n=start_t; n<end_t; n++) 
 +        {
 +            if (md->cACC)
 +            {
 +                ga = md->cACC[n];
 +            }
 +            if (md->cTC)
 +            {
 +                gt = md->cTC[n];
 +            }
 +            hm   = 0.5*md->massT[n];
 +            
 +            for(d=0; (d<DIM); d++) 
 +            {
 +                v_corrt[d]  = v[n][d]  - grpstat[ga].u[d];
 +            }
 +            for(d=0; (d<DIM); d++) 
 +            {
 +                for (m=0;(m<DIM); m++) 
 +                {
 +                    /* if we're computing a full step velocity, v_corrt[d] has v(t).  Otherwise, v(t+dt/2) */
 +                    ekin_sum[gt][m][d] += hm*v_corrt[m]*v_corrt[d];
 +                }
 +            }
 +            if (md->nMassPerturbed && md->bPerturbed[n]) 
 +            {
 +                *dekindl_sum -=
 +                    0.5*(md->massB[n] - md->massA[n])*iprod(v_corrt,v_corrt);
 +            }
 +        }
 +    }
 +
 +    ekind->dekindl = 0;
 +    for(thread=0; thread<nthread; thread++)
 +    {
 +        for(g=0; g<opts->ngtc; g++)
 +        {
 +            if (bEkinAveVel) 
 +            {
 +                m_add(tcstat[g].ekinf,ekind->ekin_work[thread][g],
 +                      tcstat[g].ekinf);
 +            }
 +            else
 +            {
 +                m_add(tcstat[g].ekinh,ekind->ekin_work[thread][g],
 +                      tcstat[g].ekinh);
 +            }
 +        }
 +
 +        ekind->dekindl += ekind->ekin_work[thread][opts->ngtc][0][0];
 +    }
 +
 +    inc_nrnb(nrnb,eNR_EKIN,md->homenr);
 +}
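 +
 +/* A minimal sketch (hypothetical helper) of the thread partitioning used in
 + * the OpenMP loop above: the home atoms [start,start+homenr) are split into
 + * nthread contiguous blocks that tile the range exactly, because the
 + * integer expression ((thread+1)*homenr)/nthread equals homenr for the
 + * last thread.
 + */
 +static void thread_atom_range(int start, int homenr, int nthread, int thread,
 +                              int *start_t, int *end_t)
 +{
 +    *start_t = start + ( thread   *homenr)/nthread;
 +    *end_t   = start + ((thread+1)*homenr)/nthread;
 +}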
 +
 +static void calc_ke_part_visc(matrix box,rvec x[],rvec v[],
 +                              t_grpopts *opts,t_mdatoms *md,
 +                              gmx_ekindata_t *ekind,
 +                              t_nrnb *nrnb, gmx_bool bEkinAveVel, gmx_bool bSaveEkinOld)
 +{
 +  int          start=md->start,homenr=md->homenr;
 +  int          g,d,n,m,gt=0;
 +  rvec         v_corrt;
 +  real         hm;
 +  t_grp_tcstat *tcstat=ekind->tcstat;
 +  t_cos_acc    *cosacc=&(ekind->cosacc);
 +  real         dekindl;
 +  real         fac,cosz;
 +  double       mvcos;
 +
 +  for(g=0; g<opts->ngtc; g++)
 +  {
 +      copy_mat(ekind->tcstat[g].ekinh,ekind->tcstat[g].ekinh_old);
 +      clear_mat(ekind->tcstat[g].ekinh);
 +  }
 +  ekind->dekindl_old = ekind->dekindl;
 +
 +  fac = 2*M_PI/box[ZZ][ZZ];
 +  mvcos = 0;
 +  dekindl = 0;
 +  for(n=start; n<start+homenr; n++)
 +  {
 +      if (md->cTC)
 +      {
 +          gt = md->cTC[n];
 +      }
 +      hm   = 0.5*md->massT[n];
 +
 +      /* Note that the times of x and v differ by half a step */
 +      /* MRS -- would have to be changed for VV */
 +      cosz         = cos(fac*x[n][ZZ]);
 +      /* Calculate the amplitude of the new velocity profile */
 +      mvcos       += 2*cosz*md->massT[n]*v[n][XX];
 +
 +      copy_rvec(v[n],v_corrt);
 +      /* Subtract the profile for the kinetic energy */
 +      v_corrt[XX] -= cosz*cosacc->vcos;
 +      for (d=0; (d<DIM); d++)
 +      {
 +          for (m=0; (m<DIM); m++)
 +          {
 +              /* if we're computing a full step velocity, v_corrt[d] has v(t).  Otherwise, v(t+dt/2) */
 +              if (bEkinAveVel)
 +              {
 +                  tcstat[gt].ekinf[m][d]+=hm*v_corrt[m]*v_corrt[d];
 +              }
 +              else
 +              {
 +                  tcstat[gt].ekinh[m][d]+=hm*v_corrt[m]*v_corrt[d];
 +              }
 +          }
 +      }
 +      if(md->nPerturbed && md->bPerturbed[n])
 +      {
 +          dekindl -= 0.5*(md->massB[n] - md->massA[n])*iprod(v_corrt,v_corrt);
 +      }
 +  }
 +  ekind->dekindl = dekindl;
 +  cosacc->mvcos = mvcos;
 +
 +  inc_nrnb(nrnb,eNR_EKIN,homenr);
 +}
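 +
 +/* A minimal sketch (hypothetical helper) of the profile correction above:
 + * with cosine-accelerated shear, the streaming velocity at height z is
 + * vcos*cos(2*pi*z/Lz), and only the peculiar part of vx should contribute
 + * to the kinetic energy.
 + */
 +static real visc_peculiar_vx(real vx, real vcos, real z, real box_zz)
 +{
 +    return vx - vcos*cos(2*M_PI*z/box_zz);
 +}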
 +
 +void calc_ke_part(t_state *state,t_grpopts *opts,t_mdatoms *md,
 +                  gmx_ekindata_t *ekind,t_nrnb *nrnb, gmx_bool bEkinAveVel, gmx_bool bSaveEkinOld)
 +{
 +    if (ekind->cosacc.cos_accel == 0)
 +    {
 +        calc_ke_part_normal(state->v,opts,md,ekind,nrnb,bEkinAveVel,bSaveEkinOld);
 +    }
 +    else
 +    {
 +        calc_ke_part_visc(state->box,state->x,state->v,opts,md,ekind,nrnb,bEkinAveVel,bSaveEkinOld);
 +    }
 +}
 +
 +extern void init_ekinstate(ekinstate_t *ekinstate,const t_inputrec *ir)
 +{
 +    ekinstate->ekin_n = ir->opts.ngtc;
 +    snew(ekinstate->ekinh,ekinstate->ekin_n);
 +    snew(ekinstate->ekinf,ekinstate->ekin_n);
 +    snew(ekinstate->ekinh_old,ekinstate->ekin_n);
 +    snew(ekinstate->ekinscalef_nhc,ekinstate->ekin_n);
 +    snew(ekinstate->ekinscaleh_nhc,ekinstate->ekin_n);
 +    snew(ekinstate->vscale_nhc,ekinstate->ekin_n);
 +    ekinstate->dekindl = 0;
 +    ekinstate->mvcos   = 0;
 +}
 +
 +void update_ekinstate(ekinstate_t *ekinstate,gmx_ekindata_t *ekind)
 +{
 +  int i;
 +
 +  for(i=0;i<ekinstate->ekin_n;i++)
 +  {
 +      copy_mat(ekind->tcstat[i].ekinh,ekinstate->ekinh[i]);
 +      copy_mat(ekind->tcstat[i].ekinf,ekinstate->ekinf[i]);
 +      copy_mat(ekind->tcstat[i].ekinh_old,ekinstate->ekinh_old[i]);
 +      ekinstate->ekinscalef_nhc[i] = ekind->tcstat[i].ekinscalef_nhc;
 +      ekinstate->ekinscaleh_nhc[i] = ekind->tcstat[i].ekinscaleh_nhc;
 +      ekinstate->vscale_nhc[i] = ekind->tcstat[i].vscale_nhc;
 +  }
 +
 +  copy_mat(ekind->ekin,ekinstate->ekin_total);
 +  ekinstate->dekindl = ekind->dekindl;
 +  ekinstate->mvcos = ekind->cosacc.mvcos;
 +
 +}
 +
 +void restore_ekinstate_from_state(t_commrec *cr,
 +                                  gmx_ekindata_t *ekind,ekinstate_t *ekinstate)
 +{
 +  int i,n;
 +
 +  if (MASTER(cr))
 +  {
 +      for(i=0;i<ekinstate->ekin_n;i++)
 +      {
 +          copy_mat(ekinstate->ekinh[i],ekind->tcstat[i].ekinh);
 +          copy_mat(ekinstate->ekinf[i],ekind->tcstat[i].ekinf);
 +          copy_mat(ekinstate->ekinh_old[i],ekind->tcstat[i].ekinh_old);
 +          ekind->tcstat[i].ekinscalef_nhc = ekinstate->ekinscalef_nhc[i];
 +          ekind->tcstat[i].ekinscaleh_nhc = ekinstate->ekinscaleh_nhc[i];
 +          ekind->tcstat[i].vscale_nhc = ekinstate->vscale_nhc[i];
 +      }
 +
 +      copy_mat(ekinstate->ekin_total,ekind->ekin);
 +
 +      ekind->dekindl = ekinstate->dekindl;
 +      ekind->cosacc.mvcos = ekinstate->mvcos;
 +      n = ekinstate->ekin_n;
 +  }
 +
 +  if (PAR(cr))
 +  {
 +      gmx_bcast(sizeof(n),&n,cr);
 +      for(i=0;i<n;i++)
 +      {
 +          gmx_bcast(DIM*DIM*sizeof(ekind->tcstat[i].ekinh[0][0]),
 +                    ekind->tcstat[i].ekinh[0],cr);
 +          gmx_bcast(DIM*DIM*sizeof(ekind->tcstat[i].ekinf[0][0]),
 +                    ekind->tcstat[i].ekinf[0],cr);
 +          gmx_bcast(DIM*DIM*sizeof(ekind->tcstat[i].ekinh_old[0][0]),
 +                    ekind->tcstat[i].ekinh_old[0],cr);
 +
 +          gmx_bcast(sizeof(ekind->tcstat[i].ekinscalef_nhc),
 +                    &(ekind->tcstat[i].ekinscalef_nhc),cr);
 +          gmx_bcast(sizeof(ekind->tcstat[i].ekinscaleh_nhc),
 +                    &(ekind->tcstat[i].ekinscaleh_nhc),cr);
 +          gmx_bcast(sizeof(ekind->tcstat[i].vscale_nhc),
 +                    &(ekind->tcstat[i].vscale_nhc),cr);
 +      }
 +      gmx_bcast(DIM*DIM*sizeof(ekind->ekin[0][0]),
 +                ekind->ekin[0],cr);
 +
 +      gmx_bcast(sizeof(ekind->dekindl),&ekind->dekindl,cr);
 +      gmx_bcast(sizeof(ekind->cosacc.mvcos),&ekind->cosacc.mvcos,cr);
 +  }
 +}
 +
 +void set_deform_reference_box(gmx_update_t upd,gmx_large_int_t step,matrix box)
 +{
 +    upd->deformref_step = step;
 +    copy_mat(box,upd->deformref_box);
 +}
 +
 +static void deform(gmx_update_t upd,
 +                   int start,int homenr,rvec x[],matrix box,matrix *scale_tot,
 +                   const t_inputrec *ir,gmx_large_int_t step)
 +{
 +    matrix bnew,invbox,mu;
 +    real   elapsed_time;
 +    int    i,j;
 +
 +    elapsed_time = (step + 1 - upd->deformref_step)*ir->delta_t;
 +    copy_mat(box,bnew);
 +    for(i=0; i<DIM; i++)
 +    {
 +        for(j=0; j<DIM; j++)
 +        {
 +            if (ir->deform[i][j] != 0)
 +            {
 +                bnew[i][j] =
 +                    upd->deformref_box[i][j] + elapsed_time*ir->deform[i][j];
 +            }
 +        }
 +    }
 +    /* We correct the off-diagonal elements,
 +     * which can grow indefinitely during shearing,
 +     * so the shifts do not get messed up.
 +     */
 +    for(i=1; i<DIM; i++)
 +    {
 +        for(j=i-1; j>=0; j--)
 +        {
 +            while (bnew[i][j] - box[i][j] > 0.5*bnew[j][j])
 +            {
 +                rvec_dec(bnew[i],bnew[j]);
 +            }
 +            while (bnew[i][j] - box[i][j] < -0.5*bnew[j][j])
 +            {
 +                rvec_inc(bnew[i],bnew[j]);
 +            }
 +        }
 +    }
 +    m_inv_ur0(box,invbox);
 +    copy_mat(bnew,box);
 +    mmul_ur0(box,invbox,mu);
 +
 +    for(i=start; i<start+homenr; i++)
 +    {
 +        x[i][XX] = mu[XX][XX]*x[i][XX]+mu[YY][XX]*x[i][YY]+mu[ZZ][XX]*x[i][ZZ];
 +        x[i][YY] = mu[YY][YY]*x[i][YY]+mu[ZZ][YY]*x[i][ZZ];
 +        x[i][ZZ] = mu[ZZ][ZZ]*x[i][ZZ];
 +    }
 +    if (*scale_tot)
 +    {
 +        /* The transposes of the scaling matrices are stored,
 +         * so we need to do matrix multiplication in the inverse order.
 +         */
 +        mmul_ur0(*scale_tot,mu,*scale_tot);
 +    }
 +}
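 +
 +/* A minimal sketch (hypothetical helper) of the coordinate scaling in
 + * deform() above: mu = bnew*box^-1 in the upper-triangular convention, and
 + * its transpose is applied to each position, exactly as in the loop over
 + * the home atoms.
 + */
 +static void deform_scale_coord(matrix mu, rvec x)
 +{
 +    x[XX] = mu[XX][XX]*x[XX] + mu[YY][XX]*x[YY] + mu[ZZ][XX]*x[ZZ];
 +    x[YY] = mu[YY][YY]*x[YY] + mu[ZZ][YY]*x[ZZ];
 +    x[ZZ] = mu[ZZ][ZZ]*x[ZZ];
 +}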
 +
 +static void combine_forces(int nstcalclr,
 +                           gmx_constr_t constr,
 +                           t_inputrec *ir,t_mdatoms *md,t_idef *idef,
 +                           t_commrec *cr,
 +                           gmx_large_int_t step,
 +                           t_state *state,gmx_bool bMolPBC,
 +                           int start,int nrend,
 +                           rvec f[],rvec f_lr[],
 +                           t_nrnb *nrnb)
 +{
 +    int  i,d,nm1;
 +
 +    /* f contains the short-range forces + the long range forces
 +     * which are stored separately in f_lr.
 +     */
 +
 +    if (constr != NULL && !(ir->eConstrAlg == econtSHAKE && ir->epc == epcNO))
 +    {
 +        /* We need to constrain the LR forces separately,
 +         * because, due to the different pre-factors for the SR and LR
 +         * forces in the update algorithm, we cannot determine
 +         * the constraint force from the coordinate constraining alone.
 +         * Constrain only the additional LR part of the force.
 +         */
 +        /* MRS -- need to make sure this works with trotter integration -- the constraint calls may not be right.*/
 +        constrain(NULL,FALSE,FALSE,constr,idef,ir,NULL,cr,step,0,md,
 +                  state->x,f_lr,f_lr,bMolPBC,state->box,state->lambda[efptBONDED],NULL,
 +                  NULL,NULL,nrnb,econqForce,ir->epc==epcMTTK,state->veta,state->veta);
 +    }
 +
 +    /* Add nstcalclr-1 times the LR force to the sum of both forces
 +     * and store the result in forces_lr.
 +     */
 +    nm1 = nstcalclr - 1;
 +    for(i=start; i<nrend; i++)
 +    {
 +        for(d=0; d<DIM; d++)
 +        {
 +            f_lr[i][d] = f[i][d] + nm1*f_lr[i][d];
 +        }
 +    }
 +}
 +
 +void update_tcouple(FILE         *fplog,
 +                    gmx_large_int_t   step,
 +                    t_inputrec   *inputrec,
 +                    t_state      *state,
 +                    gmx_ekindata_t *ekind,
 +                    gmx_wallcycle_t wcycle,
 +                    gmx_update_t upd,
 +                    t_extmass    *MassQ,
 +                    t_mdatoms  *md)
 +
 +{
 +    gmx_bool   bTCouple=FALSE;
 +    real   dttc;
 +    int    i,start,end,homenr,offset;
 +
 +    /* if using vv with trotter decomposition methods, we do this elsewhere in the code */
 +    if (inputrec->etc != etcNO &&
 +        !(IR_NVT_TROTTER(inputrec) || IR_NPT_TROTTER(inputrec) || IR_NPH_TROTTER(inputrec)))
 +    {
 +        /* We should only couple after a step where the energies were
 +         * determined (for leap-frog versions), or on the step where the
 +         * energies are determined (for velocity Verlet versions).
 +         */
 +
 +        if (EI_VV(inputrec->eI)) {
 +            offset = 0;
 +        } else {
 +            offset = 1;
 +        }
 +        bTCouple = (inputrec->nsttcouple == 1 ||
 +                    do_per_step(step+inputrec->nsttcouple-offset,
 +                                inputrec->nsttcouple));
 +    }
 +
 +    if (bTCouple)
 +    {
 +        dttc = inputrec->nsttcouple*inputrec->delta_t;
 +
 +        switch (inputrec->etc)
 +        {
 +        case etcNO:
 +            break;
 +        case etcBERENDSEN:
 +            berendsen_tcoupl(inputrec,ekind,dttc);
 +            break;
 +        case etcNOSEHOOVER:
 +            nosehoover_tcoupl(&(inputrec->opts),ekind,dttc,
 +                              state->nosehoover_xi,state->nosehoover_vxi,MassQ);
 +            break;
 +        case etcVRESCALE:
 +            vrescale_tcoupl(inputrec,ekind,dttc,
-         /* The second part of the SD integration */
-         do_update_sd2(upd->sd,FALSE,start,homenr,
-                       inputrec->opts.acc,inputrec->opts.nFreeze,
-                       md->invmass,md->ptype,
-                       md->cFREEZE,md->cACC,md->cTC,
-                       state->x,xprime,state->v,force,state->sd_X,
-                       inputrec->opts.ngtc,inputrec->opts.tau_t,
-                       inputrec->opts.ref_t,FALSE);
++                            state->therm_integral,upd->sd->gaussrand[0]);
 +            break;
 +        }
 +        /* rescale in place here */
 +        if (EI_VV(inputrec->eI))
 +        {
 +            rescale_velocities(ekind,md,md->start,md->start+md->homenr,state->v);
 +        }
 +    }
 +    else
 +    {
 +        /* Set the T scaling lambda to 1 to have no scaling */
 +        for(i=0; (i<inputrec->opts.ngtc); i++)
 +        {
 +            ekind->tcstat[i].lambda = 1.0;
 +        }
 +    }
 +}
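 +
 +/* A minimal sketch (hypothetical helper) of the coupling-step test above:
 + * do_per_step(s,n) is true when s is a multiple of n, so with nsttcouple=10
 + * and leap-frog (offset=1) coupling fires at steps 1, 11, 21, ..., one step
 + * after each step where the energies were determined; with VV (offset=0) it
 + * fires on those steps themselves.
 + */
 +static gmx_bool do_tcouple_this_step(gmx_large_int_t step, int nsttcouple,
 +                                     int offset)
 +{
 +    return (nsttcouple == 1 ||
 +            do_per_step(step + nsttcouple - offset, nsttcouple));
 +}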
 +
 +void update_pcouple(FILE         *fplog,
 +                    gmx_large_int_t   step,
 +                    t_inputrec   *inputrec,
 +                    t_state      *state,
 +                    matrix       pcoupl_mu,
 +                    matrix       M,
 +                    gmx_wallcycle_t wcycle,
 +                    gmx_update_t upd,
 +                    gmx_bool         bInitStep)
 +{
 +    gmx_bool   bPCouple=FALSE;
 +    real   dtpc=0;
 +    int    i;
 +
 +    /* if using Trotter pressure, we do this in coupling.c, so we leave it false. */
 +    if (inputrec->epc != epcNO && (!(IR_NPT_TROTTER(inputrec) || IR_NPH_TROTTER(inputrec))))
 +    {
 +        /* We should only couple after a step where energies were determined */
 +        bPCouple = (inputrec->nstpcouple == 1 ||
 +                    do_per_step(step+inputrec->nstpcouple-1,
 +                                inputrec->nstpcouple));
 +    }
 +
 +    clear_mat(pcoupl_mu);
 +    for(i=0; i<DIM; i++)
 +    {
 +        pcoupl_mu[i][i] = 1.0;
 +    }
 +
 +    clear_mat(M);
 +
 +    if (bPCouple)
 +    {
 +        dtpc = inputrec->nstpcouple*inputrec->delta_t;
 +
 +        switch (inputrec->epc)
 +        {
 +            /* We can always pcoupl, even if we did not sum the energies
 +             * the previous step, since state->pres_prev is only updated
 +             * when the energies have been summed.
 +             */
 +        case (epcNO):
 +            break;
 +        case (epcBERENDSEN):
 +            if (!bInitStep)
 +            {
 +                berendsen_pcoupl(fplog,step,inputrec,dtpc,state->pres_prev,state->box,
 +                                 pcoupl_mu);
 +            }
 +            break;
 +        case (epcPARRINELLORAHMAN):
 +            parrinellorahman_pcoupl(fplog,step,inputrec,dtpc,state->pres_prev,
 +                                    state->box,state->box_rel,state->boxv,
 +                                    M,pcoupl_mu,bInitStep);
 +            break;
 +        default:
 +            break;
 +        }
 +    }
 +}
 +
 +static rvec *get_xprime(const t_state *state,gmx_update_t upd)
 +{
 +    if (state->nalloc > upd->xp_nalloc)
 +    {
 +        upd->xp_nalloc = state->nalloc;
 +        srenew(upd->xp,upd->xp_nalloc);
 +    }
 +
 +    return upd->xp;
 +}
 +
 +void update_constraints(FILE         *fplog,
 +                        gmx_large_int_t   step,
 +                        real         *dvdlambda,    /* the contribution to be added to the bonded interactions */
 +                        t_inputrec   *inputrec,      /* input record and box stuff    */
 +                        gmx_ekindata_t *ekind,
 +                        t_mdatoms    *md,
 +                        t_state      *state,
 +                        gmx_bool     bMolPBC,
 +                        t_graph      *graph,
 +                        rvec         force[],        /* forces on home particles */
 +                        t_idef       *idef,
 +                        tensor       vir_part,
 +                        tensor       vir,            /* tensors for virial and ekin, needed for computing */
 +                        t_commrec    *cr,
 +                        t_nrnb       *nrnb,
 +                        gmx_wallcycle_t wcycle,
 +                        gmx_update_t upd,
 +                        gmx_constr_t constr,
 +                        gmx_bool         bInitStep,
 +                        gmx_bool         bFirstHalf,
 +                        gmx_bool         bCalcVir,
 +                        real         vetanew)
 +{
 +    gmx_bool             bExtended,bLastStep,bLog=FALSE,bEner=FALSE,bDoConstr=FALSE;
 +    double           dt;
 +    real             dt_1;
 +    int              start,homenr,nrend,i,n,m,g,d;
 +    tensor           vir_con;
 +    rvec             *vbuf,*xprime=NULL;
++    int              nth,th;
 +
 +    if (constr) {bDoConstr=TRUE;}
 +    if (bFirstHalf && !EI_VV(inputrec->eI)) {bDoConstr=FALSE;}
 +
 +    /* for now, SD update is here -- though it really seems like it
 +       should be reformulated as a velocity Verlet method, since it has two parts */
 +
 +    start  = md->start;
 +    homenr = md->homenr;
 +    nrend = start+homenr;
 +
 +    dt   = inputrec->delta_t;
 +    dt_1 = 1.0/dt;
 +
 +    /*
 +     *  Steps (7C, 8C)
 +     *  APPLY CONSTRAINTS:
 +     *  BLOCK SHAKE
 +     *
 +     * When doing PR pressure coupling we have to constrain the bonds in
 +     * each iteration. If we are only using Nose-Hoover tcoupling, it is
 +     * enough to do this once, since the relative velocities after this
 +     * will be normal to the bond vector.
 +     */
 +
 +    if (bDoConstr)
 +    {
 +        /* clear out constraints before applying */
 +        clear_mat(vir_part);
 +
 +        xprime = get_xprime(state,upd);
 +
 +        bLastStep = (step == inputrec->init_step+inputrec->nsteps);
 +        bLog  = (do_per_step(step,inputrec->nstlog) || bLastStep || (step < 0));
 +        bEner = (do_per_step(step,inputrec->nstenergy) || bLastStep);
 +        /* Constrain the coordinates xprime */
 +        wallcycle_start(wcycle,ewcCONSTR);
 +        if (EI_VV(inputrec->eI) && bFirstHalf)
 +        {
 +            constrain(NULL,bLog,bEner,constr,idef,
 +                      inputrec,ekind,cr,step,1,md,
 +                      state->x,state->v,state->v,
 +                      bMolPBC,state->box,
 +                      state->lambda[efptBONDED],dvdlambda,
 +                      NULL,bCalcVir ? &vir_con : NULL,nrnb,econqVeloc,
 +                      inputrec->epc==epcMTTK,state->veta,vetanew);
 +        }
 +        else
 +        {
 +            constrain(NULL,bLog,bEner,constr,idef,
 +                      inputrec,ekind,cr,step,1,md,
 +                      state->x,xprime,NULL,
 +                      bMolPBC,state->box,
 +                      state->lambda[efptBONDED],dvdlambda,
 +                      state->v,bCalcVir ? &vir_con : NULL ,nrnb,econqCoord,
 +                      inputrec->epc==epcMTTK,state->veta,state->veta);
 +        }
 +        wallcycle_stop(wcycle,ewcCONSTR);
 +
 +        where();
 +
 +        dump_it_all(fplog,"After Shake",
 +                    state->natoms,state->x,xprime,state->v,force);
 +
 +        if (bCalcVir)
 +        {
 +            if (inputrec->eI == eiSD2)
 +            {
 +                /* A correction factor eph is needed for the SD constraint force */
 +                /* Here we unfortunately cannot have proper corrections
 +                 * for different friction constants, so we use the first one.
 +                 */
 +                for(i=0; i<DIM; i++)
 +                {
 +                    for(m=0; m<DIM; m++)
 +                    {
 +                        vir_part[i][m] += upd->sd->sdc[0].eph*vir_con[i][m];
 +                    }
 +                }
 +            }
 +            else
 +            {
 +                m_add(vir_part,vir_con,vir_part);
 +            }
 +            if (debug)
 +            {
 +                pr_rvecs(debug,0,"constraint virial",vir_part,DIM);
 +            }
 +        }
 +    }
 +
 +    where();
 +    if ((inputrec->eI == eiSD2) && !(bFirstHalf))
 +    {
 +        xprime = get_xprime(state,upd);
 +
- # pragma omp parallel for num_threads(nth) schedule(static) private(alpha)
++        nth = gmx_omp_nthreads_get(emntUpdate);
++
++#pragma omp parallel for num_threads(nth) schedule(static)
++        for(th=0; th<nth; th++)
++        {
++            int start_th,end_th;
++
++            start_th = start + ((nrend-start)* th   )/nth;
++            end_th   = start + ((nrend-start)*(th+1))/nth;
++
++            /* The second part of the SD integration */
++            do_update_sd2(upd->sd,upd->sd->gaussrand[th],
++                          FALSE,start_th,end_th,
++                          inputrec->opts.acc,inputrec->opts.nFreeze,
++                          md->invmass,md->ptype,
++                          md->cFREEZE,md->cACC,md->cTC,
++                          state->x,xprime,state->v,force,state->sd_X,
++                          inputrec->opts.ngtc,inputrec->opts.tau_t,
++                          inputrec->opts.ref_t,FALSE);
++        }
 +        inc_nrnb(nrnb, eNR_UPDATE, homenr);
 +
 +        if (bDoConstr)
 +        {
 +            /* Constrain the coordinates xprime */
 +            wallcycle_start(wcycle,ewcCONSTR);
 +            constrain(NULL,bLog,bEner,constr,idef,
 +                      inputrec,NULL,cr,step,1,md,
 +                      state->x,xprime,NULL,
 +                      bMolPBC,state->box,
 +                      state->lambda[efptBONDED],dvdlambda,
 +                      NULL,NULL,nrnb,econqCoord,FALSE,0,0);
 +            wallcycle_stop(wcycle,ewcCONSTR);
 +        }
 +    }
 +
 +    /* We must always unshift after updating coordinates; if we did not
 +       shake, x was shifted in do_force */
 +
 +    if (!(bFirstHalf)) /* in the first half of vv, no shift. */
 +    {
 +        if (graph && (graph->nnodes > 0))
 +        {
 +            unshift_x(graph,state->box,state->x,upd->xp);
 +            if (TRICLINIC(state->box))
 +            {
 +                inc_nrnb(nrnb,eNR_SHIFTX,2*graph->nnodes);
 +            }
 +            else
 +            {
 +                inc_nrnb(nrnb,eNR_SHIFTX,graph->nnodes);
 +            }
 +        }
 +        else
 +        {
 +#pragma omp parallel for num_threads(gmx_omp_nthreads_get(emntUpdate)) schedule(static)
 +            for(i=start; i<nrend; i++)
 +            {
 +                copy_rvec(upd->xp[i],state->x[i]);
 +            }
 +        }
 +
 +        dump_it_all(fplog,"After unshift",
 +                    state->natoms,state->x,upd->xp,state->v,force);
 +    }
 +/* ############# END the update of velocities and positions ######### */
 +}
 +
 +void update_box(FILE         *fplog,
 +                gmx_large_int_t   step,
 +                t_inputrec   *inputrec,      /* input record and box stuff    */
 +                t_mdatoms    *md,
 +                t_state      *state,
 +                t_graph      *graph,
 +                rvec         force[],        /* forces on home particles */
 +                matrix       *scale_tot,
 +                matrix       pcoupl_mu,
 +                t_nrnb       *nrnb,
 +                gmx_wallcycle_t wcycle,
 +                gmx_update_t upd,
 +                gmx_bool         bInitStep,
 +                gmx_bool         bFirstHalf)
 +{
 +    gmx_bool             bExtended,bLastStep,bLog=FALSE,bEner=FALSE;
 +    double           dt;
 +    real             dt_1;
 +    int              start,homenr,nrend,i,n,m,g;
 +    tensor           vir_con;
 +
 +    start  = md->start;
 +    homenr = md->homenr;
 +    nrend = start+homenr;
 +
 +    bExtended =
 +        (inputrec->etc == etcNOSEHOOVER) ||
 +        (inputrec->epc == epcPARRINELLORAHMAN) ||
 +        (inputrec->epc == epcMTTK);
 +
 +    dt = inputrec->delta_t;
 +
 +    where();
 +
 +    /* now update boxes */
 +    switch (inputrec->epc) {
 +    case (epcNO):
 +        break;
 +    case (epcBERENDSEN):
 +        berendsen_pscale(inputrec,pcoupl_mu,state->box,state->box_rel,
 +                         start,homenr,state->x,md->cFREEZE,nrnb);
 +        break;
 +    case (epcPARRINELLORAHMAN):
 +        /* The box velocities were updated in do_pr_pcoupl in the update
 +         * iteration, but we don't change the box vectors until we get here,
 +         * since we need to be able to shift/unshift above.
 +         */
 +        for(i=0;i<DIM;i++)
 +        {
 +            for(m=0;m<=i;m++)
 +            {
 +                state->box[i][m] += dt*state->boxv[i][m];
 +            }
 +        }
 +        preserve_box_shape(inputrec,state->box_rel,state->box);
 +
 +        /* Scale the coordinates */
 +        for(n=start; (n<start+homenr); n++)
 +        {
 +            tmvmul_ur0(pcoupl_mu,state->x[n],state->x[n]);
 +        }
 +        break;
 +    case (epcMTTK):
 +        switch (inputrec->epct)
 +        {
 +        case (epctISOTROPIC):
 +            /* DIM * eta = ln V.  so DIM*eta_new = DIM*eta_old + DIM*dt*veta =>
 +               ln V_new = ln V_old + 3*dt*veta => V_new = V_old*exp(3*dt*veta) =>
 +               Side length scales as exp(veta*dt) */
 +
 +            msmul(state->box,exp(state->veta*dt),state->box);
 +
 +            /* Relate veta to boxv.  veta = d(eta)/dt = (1/DIM)*(1/V) dV/dt.
 +               If we assume isotropic scaling, and box length scaling
 +               factor L, then V = L^DIM (det(M)).  So dV/dt = DIM
 +               L^(DIM-1) dL/dt det(M), and veta = (1/L) dL/dt.  The
 +               determinant of B is L^DIM det(M), and the determinant
 +               of dB/dt is (dL/dt)^DIM det(M).  veta will be
 +               (det(dB/dt)/det(B))^(1/DIM).  Then since M =
 +               B_new*(vol_new)^(1/3), dB/dt_new = (veta_new)*B(new). */
 +
 +            msmul(state->box,state->veta,state->boxv);
 +            break;
 +        default:
 +            break;
 +        }
 +        break;
 +    default:
 +        break;
 +    }
 +
 +    if ((!(IR_NPT_TROTTER(inputrec) || IR_NPH_TROTTER(inputrec))) && scale_tot)
 +    {
 +        /* The transposes of the scaling matrices are stored,
 +         * therefore we need to reverse the order in the multiplication.
 +         */
 +        mmul_ur0(*scale_tot,pcoupl_mu,*scale_tot);
 +    }
 +
 +    if (DEFORM(*inputrec))
 +    {
 +        deform(upd,start,homenr,state->x,state->box,scale_tot,inputrec,step);
 +    }
 +    where();
 +    dump_it_all(fplog,"After update",
 +                state->natoms,state->x,upd->xp,state->v,force);
 +}
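 +
 +/* A minimal sketch (hypothetical helper) of the MTTK isotropic branch above:
 + * since DIM*eta = ln V and d(eta)/dt = veta, the volume scales by
 + * exp(DIM*veta*dt) per step, i.e. each box vector scales by exp(veta*dt).
 + */
 +static void mttk_isotropic_box_scale(matrix box, real veta, double dt)
 +{
 +    msmul(box, exp(veta*dt), box);
 +}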
 +
 +void update_coords(FILE         *fplog,
 +                   gmx_large_int_t   step,
 +                   t_inputrec   *inputrec,      /* input record and box stuff */
 +                   t_mdatoms    *md,
 +                   t_state      *state,
 +                   gmx_bool     bMolPBC,
 +                   rvec         *f,        /* forces on home particles */
 +                   gmx_bool         bDoLR,
 +                   rvec         *f_lr,
 +                   t_fcdata     *fcd,
 +                   gmx_ekindata_t *ekind,
 +                   matrix       M,
 +                   gmx_wallcycle_t wcycle,
 +                   gmx_update_t upd,
 +                   gmx_bool         bInitStep,
 +                   int          UpdatePart,
 +                   t_commrec    *cr,  /* these shouldn't be here -- need to think about it */
 +                   t_nrnb       *nrnb,
 +                   gmx_constr_t constr,
 +                   t_idef       *idef)
 +{
 +    gmx_bool         bNH,bPR,bLastStep,bLog=FALSE,bEner=FALSE;
 +    double           dt,alpha;
 +    real             *imass,*imassin;
 +    rvec             *force;
 +    real             dt_1;
 +    int              start,homenr,nrend,i,j,d,n,m,g;
 +    int              blen0,blen1,iatom,jatom,nshake,nsettle,nconstr,nexpand;
 +    int              *icom = NULL;
 +    tensor           vir_con;
 +    rvec             *vcom,*xcom,*vall,*xall,*xin,*vin,*forcein,*fall,*xpall,*xprimein,*xprime;
 +    int              nth,th;
 +    
 +    /* Running the velocity half does nothing except for velocity Verlet */
 +    if ((UpdatePart == etrtVELOCITY1 || UpdatePart == etrtVELOCITY2) &&
 +        !EI_VV(inputrec->eI))
 +    {
 +        gmx_incons("update_coords called for velocity without VV integrator");
 +    }
 +
 +    start  = md->start;
 +    homenr = md->homenr;
 +    nrend = start+homenr;
 +
 +    xprime = get_xprime(state,upd);
 +
 +    dt   = inputrec->delta_t;
 +    dt_1 = 1.0/dt;
 +
 +    /* We need to update the NMR restraint history when time averaging is used */
 +    if (state->flags & (1<<estDISRE_RM3TAV))
 +    {
 +        update_disres_history(fcd,&state->hist);
 +    }
 +    if (state->flags & (1<<estORIRE_DTAV))
 +    {
 +        update_orires_history(fcd,&state->hist);
 +    }
 +
 +
 +    bNH = inputrec->etc == etcNOSEHOOVER;
 +    bPR = ((inputrec->epc == epcPARRINELLORAHMAN) || (inputrec->epc == epcMTTK));
 +
 +    if (bDoLR && inputrec->nstcalclr > 1 && !EI_VV(inputrec->eI))  /* get this working with VV? */
 +    {
 +        /* Store the total force + nstcalclr-1 times the LR force
 +         * in forces_lr, so it can be used in a normal update algorithm
 +         * to produce twin time stepping.
 +         */
 +        /* is this correct in the new construction? MRS */
 +        combine_forces(inputrec->nstcalclr,constr,inputrec,md,idef,cr,
 +                       step,state,bMolPBC,
 +                       start,nrend,f,f_lr,nrnb);
 +        force = f_lr;
 +    }
 +    else
 +    {
 +        force = f;
 +    }
 +
 +    /* ############# START The update of velocities and positions ######### */
 +    where();
 +    dump_it_all(fplog,"Before update",
 +                state->natoms,state->x,xprime,state->v,force);
 +
 +    if (EI_RANDOM(inputrec->eI))
 +    {
 +        /* We still need to take care of generating random seeds properly
 +         * when multi-threading.
 +         */
 +        nth = 1;
 +    }
 +    else
 +    {
 +        nth = gmx_omp_nthreads_get(emntUpdate);
 +    }
 +
-             do_update_sd1(upd->sd,start,homenr,dt,
++#pragma omp parallel for num_threads(nth) schedule(static) private(alpha)
 +    for(th=0; th<nth; th++)
 +    {
 +        int start_th,end_th;
 +
 +        start_th = start + ((nrend-start)* th   )/nth;
 +        end_th   = start + ((nrend-start)*(th+1))/nth;
 +
 +        switch (inputrec->eI) {
 +        case (eiMD):
 +            if (ekind->cosacc.cos_accel == 0)
 +            {
 +                do_update_md(start_th,end_th,dt,
 +                             ekind->tcstat,state->nosehoover_vxi,
 +                             ekind->bNEMD,ekind->grpstat,inputrec->opts.acc,
 +                             inputrec->opts.nFreeze,
 +                             md->invmass,md->ptype,
 +                             md->cFREEZE,md->cACC,md->cTC,
 +                             state->x,xprime,state->v,force,M,
 +                             bNH,bPR);
 +            } 
 +            else 
 +            {
 +                do_update_visc(start_th,end_th,dt,
 +                               ekind->tcstat,state->nosehoover_vxi,
 +                               md->invmass,md->ptype,
 +                               md->cTC,state->x,xprime,state->v,force,M,
 +                               state->box,
 +                               ekind->cosacc.cos_accel,
 +                               ekind->cosacc.vcos,
 +                               bNH,bPR);
 +            }
 +            break;
 +        case (eiSD1):
-             do_update_sd2(upd->sd,bInitStep,start,homenr,
++            do_update_sd1(upd->sd,upd->sd->gaussrand[th],
++                          start_th,end_th,dt,
 +                          inputrec->opts.acc,inputrec->opts.nFreeze,
 +                          md->invmass,md->ptype,
 +                          md->cFREEZE,md->cACC,md->cTC,
 +                          state->x,xprime,state->v,force,state->sd_X,
 +                          inputrec->opts.ngtc,inputrec->opts.tau_t,inputrec->opts.ref_t);
 +            break;
 +        case (eiSD2):
 +            /* The SD update is done in 2 parts, because an extra constraint step
 +             * is needed 
 +             */
-             do_update_bd(start,nrend,dt,
++            do_update_sd2(upd->sd,upd->sd->gaussrand[th],
++                          bInitStep,start_th,end_th,
 +                          inputrec->opts.acc,inputrec->opts.nFreeze,
 +                          md->invmass,md->ptype,
 +                          md->cFREEZE,md->cACC,md->cTC,
 +                          state->x,xprime,state->v,force,state->sd_X,
 +                          inputrec->opts.ngtc,inputrec->opts.tau_t,inputrec->opts.ref_t,
 +                          TRUE);
 +            break;
 +        case (eiBD):
-                          upd->sd->bd_rf,upd->sd->gaussrand);
++            do_update_bd(start_th,end_th,dt,
 +                         inputrec->opts.nFreeze,md->invmass,md->ptype,
 +                         md->cFREEZE,md->cTC,
 +                         state->x,xprime,state->v,force,
 +                         inputrec->bd_fric,
 +                         inputrec->opts.ngtc,inputrec->opts.tau_t,inputrec->opts.ref_t,
-         andersen_tcoupl(ir,md,state,upd->sd->gaussrand,rate,
++                         upd->sd->bd_rf,upd->sd->gaussrand[th]);
 +            break;
 +        case (eiVV):
 +        case (eiVVAK):
 +            alpha = 1.0 + DIM/((double)inputrec->opts.nrdf[0]); /* assuming barostat coupled to group 0. */
 +            switch (UpdatePart) {
 +            case etrtVELOCITY1:
 +            case etrtVELOCITY2:
 +                do_update_vv_vel(start_th,end_th,dt,
 +                                 ekind->tcstat,ekind->grpstat,
 +                                 inputrec->opts.acc,inputrec->opts.nFreeze,
 +                                 md->invmass,md->ptype,
 +                                 md->cFREEZE,md->cACC,
 +                                 state->v,force,
 +                                 (bNH || bPR),state->veta,alpha);  
 +                break;
 +            case etrtPOSITION:
 +                do_update_vv_pos(start_th,end_th,dt,
 +                                 ekind->tcstat,ekind->grpstat,
 +                                 inputrec->opts.acc,inputrec->opts.nFreeze,
 +                                 md->invmass,md->ptype,md->cFREEZE,
 +                                 state->x,xprime,state->v,force,
 +                                 (bNH || bPR),state->veta,alpha);
 +                break;
 +            }
 +            break;
 +        default:
 +            gmx_fatal(FARGS,"Don't know how to update coordinates");
 +            break;
 +        }
 +    }
 +
 +}
 +
 +
 +void correct_ekin(FILE *log,int start,int end,rvec v[],rvec vcm,real mass[],
 +                  real tmass,tensor ekin)
 +{
 +  /*
 +   * This is a debugging routine. It should not be called in production code.
 +   *
 +   * The kinetic energy should be calculated according to:
 +   *   Ekin = 1/2 m (v-vcm)^2
 +   * However, the correction is not always applied, since vcm may not be
 +   * known in time, and we compute
 +   *   Ekin' = 1/2 m v^2 instead.
 +   * This can be corrected afterwards by computing
 +   *   Ekin = Ekin' + 1/2 m ( -2 v vcm + vcm^2)
 +   * or, in shorthand:
 +   *   Ekin = Ekin' - m v vcm + 1/2 m vcm^2
 +   */
 +  int    i,j,k;
 +  real   m,tm;
 +  rvec   hvcm,mv;
 +  tensor dekin;
 +
 +  /* Local particles */
 +  clear_rvec(mv);
 +
 +  /* Processor dependent part. */
 +  tm = 0;
 +  for(i=start; (i<end); i++)
 +  {
 +    m      = mass[i];
 +    tm    += m;
 +    for(j=0; (j<DIM); j++)
 +    {
 +        mv[j] += m*v[i][j];
 +    }
 +  }
 +  /* Shortcut */
 +  svmul(1/tmass,vcm,vcm);
 +  svmul(0.5,vcm,hvcm);
 +  clear_mat(dekin);
 +  for(j=0; (j<DIM); j++)
 +  {
 +      for(k=0; (k<DIM); k++)
 +      {
 +          dekin[j][k] += vcm[k]*(tm*hvcm[j]-mv[j]);
 +      }
 +  }
 +  pr_rvecs(log,0,"dekin",dekin,DIM);
 +  pr_rvecs(log,0," ekin", ekin,DIM);
 +  fprintf(log,"dekin = %g, ekin = %g  vcm = (%8.4f %8.4f %8.4f)\n",
 +          trace(dekin),trace(ekin),vcm[XX],vcm[YY],vcm[ZZ]);
 +  fprintf(log,"mv = (%8.4f %8.4f %8.4f)\n",
 +          mv[XX],mv[YY],mv[ZZ]);
 +}
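 +
 +/* A minimal sketch (hypothetical helper) of the identity used above, in one
 + * dimension: expanding 1/2 m (v-vcm)^2 gives exactly
 + * Ekin' - m v vcm + 1/2 m vcm^2.
 + */
 +static real corrected_ekin_1d(real m, real v, real vcm)
 +{
 +    return 0.5*m*v*v - m*v*vcm + 0.5*m*vcm*vcm;
 +}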
 +
 +extern gmx_bool update_randomize_velocities(t_inputrec *ir, gmx_large_int_t step, t_mdatoms *md, t_state *state, gmx_update_t upd, t_idef *idef, gmx_constr_t constr) {
 +
 +    int i;
 +    real rate = (ir->delta_t)/ir->opts.tau_t[0];
 +    /* Proceed with Andersen if 1) it is fixed-probability-per-particle
 +       Andersen, or 2) it is massive Andersen and the step is a multiple
 +       of tau_t/delta_t */
 +    if ((ir->etc==etcANDERSEN) || do_per_step(step,(int)(1.0/rate)))
 +    {
 +        srenew(upd->randatom,state->nalloc);
 +        srenew(upd->randatom_list,state->nalloc);
 +        if (upd->randatom_list_init == FALSE) {
 +            for (i=0;i<state->nalloc;i++) {
 +                upd->randatom[i] = FALSE;
 +                upd->randatom_list[i] = 0;
 +            }
 +            upd->randatom_list_init = TRUE;
 +        }
++        andersen_tcoupl(ir,md,state,upd->sd->gaussrand[0],rate,
 +                        (ir->etc==etcANDERSEN)?idef:NULL,
 +                        constr?get_nblocks(constr):0,
 +                        constr?get_sblock(constr):NULL,
 +                        upd->randatom,upd->randatom_list,
 +                        upd->sd->randomize_group,upd->sd->boltzfac);
 +        return TRUE;
 +    }
 +    return FALSE;
 +}
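 +
 +/* A minimal sketch (hypothetical helper) of the massive-Andersen interval
 + * above: rate = delta_t/tau_t, so 1.0/rate = tau_t/delta_t steps pass
 + * between randomizations; e.g. tau_t = 1 ps with delta_t = 0.002 ps
 + * randomizes every 500 steps.
 + */
 +static int andersen_interval_steps(real delta_t, real tau_t)
 +{
 +    return (int)(tau_t/delta_t);
 +}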
index 3bcf86ad403184094d5701d153151ae37e4bfffc,0000000000000000000000000000000000000000..4e90cf14b17d8bf354c57a38dcd831e2edbe9dca
mode 100644,000000..100644
--- /dev/null
@@@ -1,1271 -1,0 +1,1273 @@@
 +/*
 + * 
 + *                This source code is part of
 + * 
 + *                 G   R   O   M   A   C   S
 + * 
 + *          GROningen MAchine for Chemical Simulations
 + * 
 + *                        VERSION 3.2.0
 + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
 + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 + * Copyright (c) 2001-2004, The GROMACS development team,
 + * check out http://www.gromacs.org for more information.
 + *
 + * This program is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU General Public License
 + * as published by the Free Software Foundation; either version 2
 + * of the License, or (at your option) any later version.
 + * 
 + * If you want to redistribute modifications, please consider that
 + * scientific software is very special. Version control is crucial -
 + * bugs must be traceable. We will be happy to consider code for
 + * inclusion in the official distribution, but derived work must not
 + * be called official GROMACS. Details are found in the README & COPYING
 + * files - if they are missing, get the official version at www.gromacs.org.
 + * 
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the papers on the package - you can find them in the top README file.
 + * 
 + * For more info, check our website at http://www.gromacs.org
 + * 
 + * And Hey:
 + * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon
 + */
 +/* This file is completely threadsafe - keep it that way! */
 +#ifdef HAVE_CONFIG_H
 +#include <config.h>
 +#endif
 +
 +#include <math.h>
 +#include <stdio.h>
 +#include <string.h>
 +#include "main.h"
 +#include "macros.h"
 +#include "smalloc.h"
 +#include "futil.h"
 +#include "statutil.h"
 +#include "sysstuff.h"
 +#include "txtdump.h"
 +#include "gmx_fatal.h"
 +#include "names.h"
 +#include "tpxio.h"
 +#include "enxio.h"
 +#include "mtop_util.h"
 +#include "string2.h"
 +
 +static void cmp_int(FILE *fp,const char *s,int index,int i1,int i2)
 +{
 +  if (i1 != i2) {
 +    if (index != -1)
 +      fprintf(fp,"%s[%d] (%d - %d)\n",s,index,i1,i2);
 +    else
 +      fprintf(fp,"%s (%d - %d)\n",s,i1,i2);
 +  }
 +}
 +
 +static void cmp_gmx_large_int(FILE *fp,const char *s,gmx_large_int_t i1,gmx_large_int_t i2)
 +{
 +  if (i1 != i2) {
 +    fprintf(fp,"%s (",s);
 +    fprintf(fp,gmx_large_int_pfmt,i1);
 +    fprintf(fp," - ");
 +    fprintf(fp,gmx_large_int_pfmt,i2);
 +    fprintf(fp,")\n");
 +  }
 +}
 +
 +static void cmp_us(FILE *fp,const char *s,int index,unsigned short i1,unsigned short i2)
 +{
 +  if (i1 != i2) {
 +    if (index != -1)
 +      fprintf(fp,"%s[%d] (%d - %d)\n",s,index,i1,i2);
 +    else
 +      fprintf(fp,"%s (%d - %d)\n",s,i1,i2);
 +  }
 +}
 +
 +static void cmp_uc(FILE *fp,const char *s,int index,unsigned char i1,unsigned char i2)
 +{
 +  if (i1 != i2) {
 +    if (index != -1)
 +      fprintf(fp,"%s[%d] (%d - %d)\n",s,index,i1,i2);
 +    else
 +      fprintf(fp,"%s (%d - %d)\n",s,i1,i2);
 +  }
 +}
 +
 +static gmx_bool cmp_bool(FILE *fp, const char *s, int index, gmx_bool b1, gmx_bool b2)
 +{
 +  if (b1) {
 +    b1 = 1;
 +  } else {
 +    b1 = 0;
 +  }
 +  if (b2) {
 +    b2 = 1;
 +  } else {
 +    b2 = 0;
 +  }
 +  if (b1 != b2) {
 +    if (index != -1)
 +      fprintf(fp,"%s[%d] (%s - %s)\n",s,index,
 +            bool_names[b1],bool_names[b2]);
 +    else
 +      fprintf(fp,"%s (%s - %s)\n",s,
 +            bool_names[b1],bool_names[b2]);
 +  }
 +  return b1 && b2;
 +}
 +
 +static void cmp_str(FILE *fp, const char *s, int index,
 +                  const char *s1, const char *s2)
 +{
 +  if (strcmp(s1,s2) != 0) {
 +    if (index != -1)
 +      fprintf(fp,"%s[%d] (%s - %s)\n",s,index,s1,s2);
 +    else
 +      fprintf(fp,"%s (%s - %s)\n",s,s1,s2);
 +  }
 +}
 +
 +static gmx_bool equal_real(real i1,real i2,real ftol,real abstol)
 +{
 +    return ( ( 2*fabs(i1 - i2) <= (fabs(i1) + fabs(i2))*ftol ) || fabs(i1-i2)<=abstol );
 +}
 +
 +static gmx_bool equal_float(float i1,float i2,float ftol,float abstol)
 +{
 +    return ( ( 2*fabs(i1 - i2) <= (fabs(i1) + fabs(i2))*ftol ) || fabs(i1-i2)<=abstol );
 +}
 +
 +static gmx_bool equal_double(double i1,double i2,real ftol,real abstol)
 +{
 +    return ( ( 2*fabs(i1 - i2) <= (fabs(i1) + fabs(i2))*ftol ) || fabs(i1-i2)<=abstol );
 +}
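 +
 +/* A minimal illustration (hypothetical, not called here) of the combined
 + * relative/absolute test above: 1.0e-9 vs 2.0e-9 fails the relative test
 + * with ftol=1e-3, since 2*|d| = 2e-9 exceeds (|i1|+|i2|)*ftol = 3e-12, but
 + * passes with abstol=1e-6 because |d| = 1e-9 <= 1e-6.
 + */
 +static gmx_bool equal_real_example(void)
 +{
 +    return equal_real(1.0e-9, 2.0e-9, 1.0e-3, 1.0e-6);
 +}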
 +
 +static void 
 +cmp_real(FILE *fp,const char *s,int index,real i1,real i2,real ftol,real abstol)
 +{
 +  if (!equal_real(i1,i2,ftol,abstol)) {
 +    if (index != -1)
 +      fprintf(fp,"%s[%2d] (%e - %e)\n",s,index,i1,i2);
 +    else
 +      fprintf(fp,"%s (%e - %e)\n",s,i1,i2);
 +  }
 +}
 +
 +static void 
 +cmp_float(FILE *fp,const char *s,int index,float i1,float i2,float ftol,float abstol)
 +{
 +  if (!equal_float(i1,i2,ftol,abstol)) {
 +    if (index != -1)
 +      fprintf(fp,"%s[%2d] (%e - %e)\n",s,index,i1,i2);
 +    else
 +      fprintf(fp,"%s (%e - %e)\n",s,i1,i2);
 +  }
 +}
 +
 +
 +
 +static void 
 +cmp_double(FILE *fp,const char *s,int index,double i1,double i2,double ftol,double abstol)
 +{
 +  if (!equal_double(i1,i2,ftol,abstol)) {
 +    if (index != -1)
 +      fprintf(fp,"%s[%2d] (%16.9e - %16.9e)\n",s,index,i1,i2);
 +    else
 +      fprintf(fp,"%s (%16.9e - %16.9e)\n",s,i1,i2);
 +  }
 +}
 +
 +static void cmp_rvec(FILE *fp,const char *s,int index,rvec i1,rvec i2,real ftol,real abstol)
 +{
 +    if(!equal_real(i1[XX],i2[XX],ftol,abstol) ||
 +       !equal_real(i1[YY],i2[YY],ftol,abstol) ||
 +       !equal_real(i1[ZZ],i2[ZZ],ftol,abstol))
 +    {
 +        if (index != -1)
 +            fprintf(fp,"%s[%5d] (%12.5e %12.5e %12.5e) - (%12.5e %12.5e %12.5e)\n",
 +                    s,index,i1[XX],i1[YY],i1[ZZ],i2[XX],i2[YY],i2[ZZ]);
 +        else
 +            fprintf(fp,"%s (%12.5e %12.5e %12.5e) - (%12.5e %12.5e %12.5e)\n",
 +                    s,i1[XX],i1[YY],i1[ZZ],i2[XX],i2[YY],i2[ZZ]);
 +    }
 +}
 +
 +static void cmp_ivec(FILE *fp,const char *s,int index,ivec i1,ivec i2)
 +{
 +  if ((i1[XX] != i2[XX]) || (i1[YY] != i2[YY]) || (i1[ZZ] != i2[ZZ])) {
 +    if (index != -1)
 +      fprintf(fp,"%s[%5d] (%8d,%8d,%8d - %8d,%8d,%8d)\n",s,index,
 +            i1[XX],i1[YY],i1[ZZ],i2[XX],i2[YY],i2[ZZ]);
 +    else
 +      fprintf(fp,"%s (%8d,%8d,%8d - %8d,%8d,%8d)\n",s,
 +            i1[XX],i1[YY],i1[ZZ],i2[XX],i2[YY],i2[ZZ]);
 +  }
 +}
 +
 +static void cmp_ilist(FILE *fp,int ftype,t_ilist *il1,t_ilist *il2)
 +{
 +  int i;
 +  char buf[256];
 + 
 +  fprintf(fp,"comparing ilist %s\n",interaction_function[ftype].name);
 +  sprintf(buf,"%s->nr",interaction_function[ftype].name);
 +  cmp_int(fp,buf,-1,il1->nr,il2->nr);
 +  sprintf(buf,"%s->iatoms",interaction_function[ftype].name);
 +  if (((il1->nr > 0) && (!il1->iatoms)) || 
 +      ((il2->nr > 0) && (!il2->iatoms)) ||
 +      ((il1->nr != il2->nr)))
 +    fprintf(fp,"Comparing radically different topologies - %s is different\n",
 +          buf);
 +  else
 +    for(i=0; (i<il1->nr); i++) 
 +      cmp_int(fp,buf,i,il1->iatoms[i],il2->iatoms[i]);
 +}
 +
 +void cmp_iparm(FILE *fp,const char *s,t_functype ft,
 +             t_iparams ip1,t_iparams ip2,real ftol,real abstol) 
 +{
 +  int i;
 +  gmx_bool bDiff;
 +  
 +  bDiff=FALSE;
 +  for(i=0; i<MAXFORCEPARAM && !bDiff; i++)
 +    bDiff = !equal_real(ip1.generic.buf[i],ip2.generic.buf[i],ftol,abstol);
 +  if (bDiff) {
 +    fprintf(fp,"%s1: ",s);
 +    pr_iparams(fp,ft,&ip1);
 +    fprintf(fp,"%s2: ",s);
 +    pr_iparams(fp,ft,&ip2);
 +  }
 +}
 +
 +void cmp_iparm_AB(FILE *fp,const char *s,t_functype ft,t_iparams ip1,real ftol,real abstol) 
 +{
 +  int nrfpA,nrfpB,p0,i;
 +  gmx_bool bDiff;
 +  
 +  /* Normally the first parameter is perturbable */
 +  p0 = 0;
 +  nrfpA = interaction_function[ft].nrfpA;
 +  nrfpB = interaction_function[ft].nrfpB;
 +  if (ft == F_PDIHS) {
 +    nrfpB = 2;
 +  } else if (interaction_function[ft].flags & IF_TABULATED) {
 +    /* For tabulated interactions only the second parameter is perturbable */
 +    p0 = 1;
 +    nrfpB = 1;
 +  }
 +  bDiff=FALSE;
 +  for(i=0; i<nrfpB && !bDiff; i++) {
 +    bDiff = !equal_real(ip1.generic.buf[p0+i],ip1.generic.buf[nrfpA+i],ftol,abstol);
 +  }
 +  if (bDiff) {
 +    fprintf(fp,"%s: ",s);
 +    pr_iparams(fp,ft,&ip1);
 +  }
 +}
 +
 +static void cmp_idef(FILE *fp,t_idef *id1,t_idef *id2,real ftol,real abstol)
 +{
 +  int i;
 +  char buf1[64],buf2[64];
 +  
 +  fprintf(fp,"comparing idef\n");
 +  if (id2) {
 +    cmp_int(fp,"idef->ntypes",-1,id1->ntypes,id2->ntypes);
 +    cmp_int(fp,"idef->atnr",  -1,id1->atnr,id2->atnr);
 +    for(i=0; (i<id1->ntypes); i++) {
 +      sprintf(buf1,"idef->functype[%d]",i);
 +      sprintf(buf2,"idef->iparam[%d]",i);
 +      cmp_int(fp,buf1,i,(int)id1->functype[i],(int)id2->functype[i]);
 +      cmp_iparm(fp,buf2,id1->functype[i],
 +              id1->iparams[i],id2->iparams[i],ftol,abstol);
 +    }
 +    cmp_real(fp,"fudgeQQ",-1,id1->fudgeQQ,id2->fudgeQQ,ftol,abstol);
 +    for(i=0; (i<F_NRE); i++)
 +      cmp_ilist(fp,i,&(id1->il[i]),&(id2->il[i]));
 +  } else {
 +    for(i=0; (i<id1->ntypes); i++)
 +      cmp_iparm_AB(fp,"idef->iparam",id1->functype[i],id1->iparams[i],ftol,abstol);
 +  }
 +}
 +
 +static void cmp_block(FILE *fp,t_block *b1,t_block *b2,const char *s)
 +{
 +  int i,j,k;
 +  char buf[32];
 +  
 +  fprintf(fp,"comparing block %s\n",s);
 +  sprintf(buf,"%s.nr",s);
 +  cmp_int(fp,buf,-1,b1->nr,b2->nr);
 +} 
 +
 +static void cmp_blocka(FILE *fp,t_blocka *b1,t_blocka *b2,const char *s)
 +{
 +  int i,j,k;
 +  char buf[32];
 +  
 +  fprintf(fp,"comparing blocka %s\n",s);
 +  sprintf(buf,"%s.nr",s);
 +  cmp_int(fp,buf,-1,b1->nr,b2->nr);
 +  sprintf(buf,"%s.nra",s);
 +  cmp_int(fp,buf,-1,b1->nra,b2->nra);
 +} 
 +
 +static void cmp_atom(FILE *fp,int index,t_atom *a1,t_atom *a2,real ftol,real abstol)
 +{
 +  int  i;
 +  char buf[256];
 +
 +  if (a2) {
 +    cmp_us(fp,"atom.type",index,a1->type,a2->type);
 +    cmp_us(fp,"atom.ptype",index,a1->ptype,a2->ptype);
 +    cmp_int(fp,"atom.resind",index,a1->resind,a2->resind);
 +    cmp_int(fp,"atom.atomnumber",index,a1->atomnumber,a2->atomnumber);
 +    cmp_real(fp,"atom.m",index,a1->m,a2->m,ftol,abstol);
 +    cmp_real(fp,"atom.q",index,a1->q,a2->q,ftol,abstol);
 +    cmp_us(fp,"atom.typeB",index,a1->typeB,a2->typeB);
 +    cmp_real(fp,"atom.mB",index,a1->mB,a2->mB,ftol,abstol);
 +    cmp_real(fp,"atom.qB",index,a1->qB,a2->qB,ftol,abstol);
 +  } else {
 +    cmp_us(fp,"atom.type",index,a1->type,a1->typeB);
 +    cmp_real(fp,"atom.m",index,a1->m,a1->mB,ftol,abstol);
 +    cmp_real(fp,"atom.q",index,a1->q,a1->qB,ftol,abstol);
 +  }
 +}
 +
 +static void cmp_atoms(FILE *fp,t_atoms *a1,t_atoms *a2,real ftol, real abstol)
 +{
 +  int i;
 +  
 +  fprintf(fp,"comparing atoms\n");
 +
 +  if (a2) {
 +    cmp_int(fp,"atoms->nr",-1,a1->nr,a2->nr);
 +    for(i=0; (i<a1->nr); i++)
 +      cmp_atom(fp,i,&(a1->atom[i]),&(a2->atom[i]),ftol,abstol);
 +  } else {
 +    for(i=0; (i<a1->nr); i++)
 +      cmp_atom(fp,i,&(a1->atom[i]),NULL,ftol,abstol);
 +  }
 +}
 +
 +static void cmp_top(FILE *fp,t_topology *t1,t_topology *t2,real ftol, real abstol)
 +{
 +  int i;
 +  
 +  fprintf(fp,"comparing top\n");
 +  if (t2) {
 +    cmp_idef(fp,&(t1->idef),&(t2->idef),ftol,abstol);
 +    cmp_atoms(fp,&(t1->atoms),&(t2->atoms),ftol,abstol);
 +    cmp_block(fp,&t1->cgs,&t2->cgs,"cgs");
 +    cmp_block(fp,&t1->mols,&t2->mols,"mols");
 +    cmp_blocka(fp,&t1->excls,&t2->excls,"excls");
 +  } else {
 +    cmp_idef(fp,&(t1->idef),NULL,ftol,abstol);
 +    cmp_atoms(fp,&(t1->atoms),NULL,ftol,abstol);
 +  }
 +}
 +
 +static void cmp_groups(FILE *fp,gmx_groups_t *g0,gmx_groups_t *g1,
 +                     int natoms0,int natoms1)
 +{
 +  int  i,j,ndiff;
 +  char buf[32];
 +
 +  fprintf(fp,"comparing groups\n");
 +
 +  for(i=0; i<egcNR; i++) {
 +    sprintf(buf,"grps[%d].nr",i);
 +    cmp_int(fp,buf,-1,g0->grps[i].nr,g1->grps[i].nr);
 +    if (g0->grps[i].nr == g1->grps[i].nr) {
 +      for(j=0; j<g0->grps[i].nr; j++) {
 +          sprintf(buf,"grps[%d].name[%d]",i,j);
 +          cmp_str(fp,buf,-1,
 +                  *g0->grpname[g0->grps[i].nm_ind[j]],
 +                  *g1->grpname[g1->grps[i].nm_ind[j]]);
 +      }
 +    }
 +    cmp_int(fp,"ngrpnr",i,g0->ngrpnr[i],g1->ngrpnr[i]);
 +    if (g0->ngrpnr[i] == g1->ngrpnr[i] && natoms0 == natoms1 && 
 +      (g0->grpnr[i] != NULL || g1->grpnr[i] != NULL)) {
 +      for(j=0; j<natoms0; j++) {
 +      cmp_int(fp,gtypes[i],j,ggrpnr(g0,i,j),ggrpnr(g1,i,j));
 +      }
 +    }
 +  }
 +  /* We have compared the names in the groups lists,
 +   * so we can skip the grpname list comparison.
 +   */
 +}
 +
 +static void cmp_rvecs(FILE *fp,const char *title,int n,rvec x1[],rvec x2[],
 +                    gmx_bool bRMSD,real ftol,real abstol)
 +{
 +  int i,m;
 +  double d,ssd;
 +
 +  if (bRMSD) {
 +    ssd = 0;
 +    for(i=0; (i<n); i++) {
 +      for(m=0; m<DIM; m++) {
 +      d = x1[i][m] - x2[i][m];
 +      ssd += d*d;
 +      }
 +    }
 +    fprintf(fp,"%s RMSD %g\n",title,sqrt(ssd/n));
 +  } else {
 +    for(i=0; (i<n); i++) {
 +      cmp_rvec(fp,title,i,x1[i],x2[i],ftol,abstol);
 +    }
 +  }
 +}
 +
 +
 +/* Similar to cmp_rvecs, but this routine scales the allowed absolute tolerance
 + * by the RMS of the force components of x1.
 + */
 +static void cmp_rvecs_rmstol(FILE *fp,const char *title,int n,rvec x1[],rvec x2[],
 +                             real ftol,real abstol)
 +{
 +    int i,m;
 +    double d;
 +    double ave_x1,rms_x1;
 +    
 +    /* It is tricky to compare real values, in particular forces that
 +     * are sums of many terms where the final value might be close to 0.0.
 +     * To get a reference magnitude we calculate the RMS value of each
 +     * component in x1, and then set the allowed absolute tolerance to the
 +     * relative tolerance times this RMS magnitude.
 +     */
 +    ave_x1 = 0.0;
 +    for(i=0;i<n;i++)
 +    {
 +        for(m=0;m<DIM;m++)
 +        {
 +            ave_x1 += x1[i][m];
 +        }
 +    }
 +    ave_x1 /= n*DIM;
 +
 +    rms_x1 = 0.0;
 +    for(i=0; (i<n); i++)
 +    {
 +        for(m=0;m<DIM;m++)
 +        {
 +            d       = x1[i][m] - ave_x1;
 +            rms_x1 += d*d;
 +        }
 +    }
 +    rms_x1 = sqrt(rms_x1/(DIM*n));
 +    /* And now do the actual comparison with a hopefully realistic abstol. */
 +    for(i=0; (i<n); i++)
 +    {
 +        cmp_rvec(fp,title,i,x1[i],x2[i],ftol,abstol*rms_x1);
 +    }
 +}
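The RMS-based tolerance described in the comment above can be exercised in isolation. Below is a minimal self-contained sketch (illustrative only; the helper name close_rmstol and the double-precision types are assumptions, not part of this patch):

#include <math.h>

#define DIM 3
typedef double dvec[DIM];

/* Return 1 if every component pair agrees within abstol_factor times
 * the RMS of the mean-centered reference components, mirroring the
 * scaling used by cmp_rvecs_rmstol above. */
static int close_rmstol(int n, dvec x1[], dvec x2[], double abstol_factor)
{
    double ave = 0.0, rms = 0.0, d;
    int    i, m;

    for (i = 0; i < n; i++)
        for (m = 0; m < DIM; m++)
            ave += x1[i][m];
    ave /= n*DIM;
    for (i = 0; i < n; i++)
        for (m = 0; m < DIM; m++)
        {
            d    = x1[i][m] - ave;
            rms += d*d;
        }
    rms = sqrt(rms/(n*DIM));
    for (i = 0; i < n; i++)
        for (m = 0; m < DIM; m++)
            if (fabs(x1[i][m] - x2[i][m]) > abstol_factor*rms)
                return 0;
    return 1;
}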
 +
 +static void cmp_grpopts(FILE *fp,t_grpopts *opt1,t_grpopts *opt2,real ftol, real abstol)
 +{
 +  int i,j;
 +  char buf1[256],buf2[256];
 +  
 +  cmp_int(fp,"inputrec->grpopts.ngtc",-1,  opt1->ngtc,opt2->ngtc);
 +  cmp_int(fp,"inputrec->grpopts.ngacc",-1, opt1->ngacc,opt2->ngacc);
 +  cmp_int(fp,"inputrec->grpopts.ngfrz",-1, opt1->ngfrz,opt2->ngfrz);
 +  cmp_int(fp,"inputrec->grpopts.ngener",-1,opt1->ngener,opt2->ngener);
 +  for(i=0; (i<min(opt1->ngtc,opt2->ngtc)); i++) {
 +    cmp_real(fp,"inputrec->grpopts.nrdf",i,opt1->nrdf[i],opt2->nrdf[i],ftol,abstol);
 +    cmp_real(fp,"inputrec->grpopts.ref_t",i,opt1->ref_t[i],opt2->ref_t[i],ftol,abstol);
 +    cmp_real(fp,"inputrec->grpopts.tau_t",i,opt1->tau_t[i],opt2->tau_t[i],ftol,abstol);
 +    cmp_int(fp,"inputrec->grpopts.annealing",i,opt1->annealing[i],opt2->annealing[i]);
 +    cmp_int(fp,"inputrec->grpopts.anneal_npoints",i,
 +          opt1->anneal_npoints[i],opt2->anneal_npoints[i]);
 +    if(opt1->anneal_npoints[i]==opt2->anneal_npoints[i]) {
 +      sprintf(buf1,"inputrec->grpopts.anneal_time[%d]",i);
 +      sprintf(buf2,"inputrec->grpopts.anneal_temp[%d]",i);
 +      for(j=0;j<opt1->anneal_npoints[i];j++) {
 +      cmp_real(fp,buf1,j,opt1->anneal_time[i][j],opt2->anneal_time[i][j],ftol,abstol);
 +      cmp_real(fp,buf2,j,opt1->anneal_temp[i][j],opt2->anneal_temp[i][j],ftol,abstol);
 +      }
 +    }
 +  }
 +  if (opt1->ngener == opt2->ngener) {
 +    for(i=0; i<opt1->ngener; i++)
 +      for(j=i; j<opt1->ngener; j++) {
 +      sprintf(buf1,"inputrec->grpopts.egp_flags[%d]",i);
 +      cmp_int(fp,buf1,j,
 +              opt1->egp_flags[opt1->ngener*i+j],
 +              opt2->egp_flags[opt1->ngener*i+j]);
 +      }
 +  }
 +  for(i=0; (i<min(opt1->ngacc,opt2->ngacc)); i++)
 +    cmp_rvec(fp,"inputrec->grpopts.acc",i,opt1->acc[i],opt2->acc[i],ftol,abstol);
 +  for(i=0; (i<min(opt1->ngfrz,opt2->ngfrz)); i++)
 +    cmp_ivec(fp,"inputrec->grpopts.nFreeze",i,opt1->nFreeze[i],opt2->nFreeze[i]);
 +}
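A note on the egp_flags loop above: the energy-group pair flags live in a flattened ngener x ngener matrix, and because the pair (i,j) is symmetric only the upper triangle (j >= i) is compared, with element (i,j) at index ngener*i + j. For example, with ngener = 3 the indices visited are 0, 1, 2, 4, 5 and 8.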
 +
 +static void cmp_cosines(FILE *fp,const char *s,t_cosines c1[DIM],t_cosines c2[DIM],real ftol, real abstol)
 +{
 +  int i,m;
 +  char buf[256];
 +  
 +  for(m=0; (m<DIM); m++) {
 +    sprintf(buf,"inputrec->%s[%d]",s,m);
 +    cmp_int(fp,buf,0,c1->n,c2->n);
 +    for(i=0; (i<min(c1->n,c2->n)); i++) {
 +      cmp_real(fp,buf,i,c1->a[i],c2->a[i],ftol,abstol);
 +      cmp_real(fp,buf,i,c1->phi[i],c2->phi[i],ftol,abstol);
 +    }
 +  }
 +}
 +
 +static void cmp_adress(FILE *fp,t_adress *ad1,t_adress *ad2,
 +                     real ftol,real abstol)
 +{
 +  cmp_int(fp,"ir->adress->type" ,-1,ad1->type,ad2->type);
 +  cmp_real(fp,"ir->adress->const_wf" ,-1,ad1->const_wf,ad2->const_wf,ftol,abstol);
 +  cmp_real(fp,"ir->adress->ex_width" ,-1,ad1->ex_width,ad2->ex_width,ftol,abstol);
 +  cmp_real(fp,"ir->adress->hy_width" ,-1,ad1->hy_width,ad2->hy_width,ftol,abstol);
 +  cmp_int(fp,"ir->adress->icor" ,-1,ad1->icor,ad2->icor);
 +  cmp_int(fp,"ir->adress->site" ,-1,ad1->site,ad2->site);
 +  cmp_rvec(fp,"ir->adress->refs" ,-1,ad1->refs,ad2->refs,ftol,abstol);
 +  cmp_real(fp,"ir->adress->ex_forcecap", -1,ad1->ex_forcecap,ad2->ex_forcecap,ftol,abstol);
 +}
 +
 +static void cmp_pull(FILE *fp,t_pull *pull1,t_pull *pull2,real ftol, real abstol)
 +{
 +  fprintf(fp,"WARNING: Both files use COM pulling, but comparing of the pull struct is not implemented (yet). The pull parameters could be the same or different.\n");
 +}
 +
 +static void cmp_simtempvals(FILE *fp,t_simtemp *simtemp1,t_simtemp *simtemp2, int n_lambda, real ftol, real abstol)
 +{
 +  int i;
 +  cmp_int(fp,"inputrec->simtempvals->eSimTempScale",-1,simtemp1->eSimTempScale,simtemp2->eSimTempScale);
 +  cmp_real(fp,"inputrec->simtempvals->simtemp_high",-1,simtemp1->simtemp_high,simtemp2->simtemp_high,ftol,abstol);
 +  cmp_real(fp,"inputrec->simtempvals->simtemp_low",-1,simtemp1->simtemp_low,simtemp2->simtemp_low,ftol,abstol);
 +  for(i=0; i<n_lambda; i++)
 +  {
 +      cmp_real(fp,"inputrec->simtempvals->temperatures",-1,simtemp1->temperatures[i],simtemp2->temperatures[i],ftol,abstol);
 +  }
 +}
 +
 +static void cmp_expandedvals(FILE *fp,t_expanded *expand1,t_expanded *expand2,int n_lambda, real ftol, real abstol)
 +{
 +  int i;
 +
 +  cmp_bool(fp,"inputrec->fepvals->bInit_weights",-1,expand1->bInit_weights,expand2->bInit_weights);
 +  cmp_bool(fp,"inputrec->fepvals->bWLoneovert",-1,expand1->bWLoneovert,expand2->bWLoneovert);
 +
 +  for(i=0; i<n_lambda; i++)
 +  {
 +      cmp_real(fp,"inputrec->expandedvals->init_lambda_weights",-1,
 +               expand1->init_lambda_weights[i],expand2->init_lambda_weights[i],ftol,abstol);
 +  }
 +
 +  cmp_int(fp,"inputrec->expandedvals->lambda-stats", -1,expand1->elamstats,expand2->elamstats);
 +  cmp_int(fp,"inputrec->expandedvals->lambda-mc-move", -1,expand1->elmcmove,expand2->elmcmove);
 +  cmp_int(fp,"inputrec->expandedvals->lmc-repeats",-1,expand1->lmc_repeats,expand2->lmc_repeats);
 +  cmp_int(fp,"inputrec->expandedvals->lmc-gibbsdelta",-1,expand1->gibbsdeltalam,expand2->gibbsdeltalam);
 +  cmp_int(fp,"inputrec->expandedvals->lmc-forced-nstart",-1,expand1->lmc_forced_nstart,expand2->lmc_forced_nstart);
 +  cmp_int(fp,"inputrec->expandedvals->lambda-weights-equil",-1,expand1->elmceq,expand2->elmceq);
 +  cmp_int(fp,"inputrec->expandedvals->,weight-equil-number-all-lambda",-1,expand1->equil_n_at_lam,expand2->equil_n_at_lam);
 +  cmp_int(fp,"inputrec->expandedvals->weight-equil-number-samples",-1,expand1->equil_samples,expand2->equil_samples);
 +  cmp_int(fp,"inputrec->expandedvals->weight-equil-number-steps",-1,expand1->equil_steps,expand2->equil_steps);
 +  cmp_real(fp,"inputrec->expandedvals->weight-equil-wl-delta",-1,expand1->equil_wl_delta,expand2->equil_wl_delta,ftol,abstol);
 +  cmp_real(fp,"inputrec->expandedvals->weight-equil-count-ratio",-1,expand1->equil_ratio,expand2->equil_ratio,ftol,abstol);
 +  cmp_bool(fp,"inputrec->expandedvals->symmetrized-transition-matrix",-1,expand1->bSymmetrizedTMatrix,expand2->bSymmetrizedTMatrix);
 +  cmp_int(fp,"inputrec->expandedvals->nstTij",-1,expand1->nstTij,expand2->nstTij);
 +  cmp_int(fp,"inputrec->expandedvals->mininum-var-min",-1,expand1->minvarmin,expand2->minvarmin); /*default is reasonable */
 +  cmp_int(fp,"inputrec->expandedvals->weight-c-range",-1,expand1->c_range,expand2->c_range); /* default is just C=0 */
 +  cmp_real(fp,"inputrec->expandedvals->wl-scale",-1,expand1->wl_scale,expand2->wl_scale,ftol,abstol);
 +  cmp_real(fp,"inputrec->expandedvals->init-wl-delta",-1,expand1->init_wl_delta,expand2->init_wl_delta,ftol,abstol);
 +  cmp_real(fp,"inputrec->expandedvals->wl-ratio",-1,expand1->wl_ratio,expand2->wl_ratio,ftol,abstol);
 +  cmp_int(fp,"inputrec->expandedvals->nstexpanded",-1,expand1->nstexpanded,expand2->nstexpanded);
 +  cmp_int(fp,"inputrec->expandedvals->lmc-seed",-1,expand1->lmc_seed,expand2->lmc_seed);
 +  cmp_real(fp,"inputrec->expandedvals->mc-temperature",-1,expand1->mc_temp,expand2->mc_temp,ftol,abstol);
 +}
 +
 +static void cmp_fepvals(FILE *fp,t_lambda *fep1,t_lambda *fep2,real ftol, real abstol)
 +{
 +  int i,j;
 +  cmp_int(fp,"inputrec->nstdhdl",-1,fep1->nstdhdl,fep2->nstdhdl);
 +  cmp_double(fp,"inputrec->fepvals->init_fep_state",-1,fep1->init_fep_state,fep2->init_fep_state,ftol,abstol);
 +  cmp_double(fp,"inputrec->fepvals->delta_lambda",-1,fep1->delta_lambda,fep2->delta_lambda,ftol,abstol);
 +  cmp_int(fp,"inputrec->fepvals->n_lambda",-1,fep1->n_lambda,fep2->n_lambda);
 +  for(i=0; i<efptNR;i++)
 +  {
 +      for(j=0; j<min(fep1->n_lambda,fep2->n_lambda); j++)
 +      {
 +          cmp_double(fp,"inputrec->fepvals->all_lambda",-1,fep1->all_lambda[i][j],fep2->all_lambda[i][j],ftol,abstol);
 +      }
 +  }
++  cmp_int(fp,"inputrec->fepvals->lambda_neighbors",1,fep1->lambda_neighbors,
++          fep2->lambda_neighbors);
 +  cmp_real(fp,"inputrec->fepvals->sc_alpha",-1,fep1->sc_alpha,fep2->sc_alpha,ftol,abstol);
 +  cmp_int(fp,"inputrec->fepvals->sc_power",-1,fep1->sc_power,fep2->sc_power);
 +  cmp_real(fp,"inputrec->fepvals->sc_r_power",-1,fep1->sc_r_power,fep2->sc_r_power,ftol,abstol);
 +  cmp_real(fp,"inputrec->fepvals->sc_sigma",-1,fep1->sc_sigma,fep2->sc_sigma,ftol,abstol);
 +  cmp_bool(fp,"inputrec->fepvals->bPrintEnergy",-1,fep1->bPrintEnergy,fep1->bPrintEnergy);
 +  cmp_bool(fp,"inputrec->fepvals->bScCoul",-1,fep1->bScCoul,fep1->bScCoul);
 +  cmp_int(fp,"inputrec->separate_dhdl_file",-1,fep1->separate_dhdl_file,fep2->separate_dhdl_file);
 +  cmp_int(fp,"inputrec->dhdl_derivatives",-1,fep1->dhdl_derivatives,fep2->dhdl_derivatives);
 +  cmp_int(fp,"inputrec->dh_hist_size",-1,fep1->dh_hist_size,fep2->dh_hist_size);
 +  cmp_double(fp,"inputrec->dh_hist_spacing",-1,fep1->dh_hist_spacing,fep2->dh_hist_spacing,ftol,abstol);
 +}
 +
 +static void cmp_inputrec(FILE *fp,t_inputrec *ir1,t_inputrec *ir2,real ftol, real abstol)
 +{
 +  fprintf(fp,"comparing inputrec\n");
 +
 +  /* gcc 2.96 doesn't like these defines at all, and issues a huge list
 +   * of warnings. Maybe it will change in future versions, but for the
 +   * moment I've spelled them out instead. /EL 000820 
 +   * #define CIB(s) cmp_int(fp,"inputrec->"#s,0,ir1->##s,ir2->##s)
 +   * #define CII(s) cmp_int(fp,"inputrec->"#s,0,ir1->##s,ir2->##s)
 +   * #define CIR(s) cmp_real(fp,"inputrec->"#s,0,ir1->##s,ir2->##s,ftol)
 +   */
 +  cmp_int(fp,"inputrec->eI",-1,ir1->eI,ir2->eI);
 +  cmp_gmx_large_int(fp,"inputrec->nsteps",ir1->nsteps,ir2->nsteps);
 +  cmp_gmx_large_int(fp,"inputrec->init_step",ir1->init_step,ir2->init_step);
 +  cmp_int(fp,"inputrec->simulation_part",-1,ir1->simulation_part,ir2->simulation_part);
 +  cmp_int(fp,"inputrec->ePBC",-1,ir1->ePBC,ir2->ePBC);
 +  cmp_int(fp,"inputrec->bPeriodicMols",-1,ir1->bPeriodicMols,ir2->bPeriodicMols);
 +  cmp_int(fp,"inputrec->cutoff_scheme",-1,ir1->cutoff_scheme,ir2->cutoff_scheme);
 +  cmp_int(fp,"inputrec->ns_type",-1,ir1->ns_type,ir2->ns_type);
 +  cmp_int(fp,"inputrec->nstlist",-1,ir1->nstlist,ir2->nstlist);
 +  cmp_int(fp,"inputrec->ndelta",-1,ir1->ndelta,ir2->ndelta);
 +  cmp_int(fp,"inputrec->nstcomm",-1,ir1->nstcomm,ir2->nstcomm);
 +  cmp_int(fp,"inputrec->comm_mode",-1,ir1->comm_mode,ir2->comm_mode);
 +  cmp_int(fp,"inputrec->nstcheckpoint",-1,ir1->nstcheckpoint,ir2->nstcheckpoint);
 +  cmp_int(fp,"inputrec->nstlog",-1,ir1->nstlog,ir2->nstlog);
 +  cmp_int(fp,"inputrec->nstxout",-1,ir1->nstxout,ir2->nstxout);
 +  cmp_int(fp,"inputrec->nstvout",-1,ir1->nstvout,ir2->nstvout);
 +  cmp_int(fp,"inputrec->nstfout",-1,ir1->nstfout,ir2->nstfout);
 +  cmp_int(fp,"inputrec->nstcalcenergy",-1,ir1->nstcalcenergy,ir2->nstcalcenergy);
 +  cmp_int(fp,"inputrec->nstenergy",-1,ir1->nstenergy,ir2->nstenergy);
 +  cmp_int(fp,"inputrec->nstxtcout",-1,ir1->nstxtcout,ir2->nstxtcout);
 +  cmp_double(fp,"inputrec->init_t",-1,ir1->init_t,ir2->init_t,ftol,abstol);
 +  cmp_double(fp,"inputrec->delta_t",-1,ir1->delta_t,ir2->delta_t,ftol,abstol);
 +  cmp_real(fp,"inputrec->xtcprec",-1,ir1->xtcprec,ir2->xtcprec,ftol,abstol);
 +  cmp_real(fp,"inputrec->fourierspacing",-1,ir1->fourier_spacing,ir2->fourier_spacing,ftol,abstol);
 +  cmp_int(fp,"inputrec->nkx",-1,ir1->nkx,ir2->nkx);
 +  cmp_int(fp,"inputrec->nky",-1,ir1->nky,ir2->nky);
 +  cmp_int(fp,"inputrec->nkz",-1,ir1->nkz,ir2->nkz);
 +  cmp_int(fp,"inputrec->pme_order",-1,ir1->pme_order,ir2->pme_order);
 +  cmp_real(fp,"inputrec->ewald_rtol",-1,ir1->ewald_rtol,ir2->ewald_rtol,ftol,abstol);
 +  cmp_int(fp,"inputrec->ewald_geometry",-1,ir1->ewald_geometry,ir2->ewald_geometry);
 +  cmp_real(fp,"inputrec->epsilon_surface",-1,ir1->epsilon_surface,ir2->epsilon_surface,ftol,abstol);
 +  cmp_int(fp,"inputrec->bOptFFT",-1,ir1->bOptFFT,ir2->bOptFFT);
 +  cmp_int(fp,"inputrec->bContinuation",-1,ir1->bContinuation,ir2->bContinuation);
 +  cmp_int(fp,"inputrec->bShakeSOR",-1,ir1->bShakeSOR,ir2->bShakeSOR);
 +  cmp_int(fp,"inputrec->etc",-1,ir1->etc,ir2->etc);
 +  cmp_int(fp,"inputrec->bPrintNHChains",-1,ir1->bPrintNHChains,ir2->bPrintNHChains);
 +  cmp_int(fp,"inputrec->epc",-1,ir1->epc,ir2->epc);
 +  cmp_int(fp,"inputrec->epct",-1,ir1->epct,ir2->epct);
 +  cmp_real(fp,"inputrec->tau_p",-1,ir1->tau_p,ir2->tau_p,ftol,abstol);
 +  cmp_rvec(fp,"inputrec->ref_p(x)",-1,ir1->ref_p[XX],ir2->ref_p[XX],ftol,abstol);
 +  cmp_rvec(fp,"inputrec->ref_p(y)",-1,ir1->ref_p[YY],ir2->ref_p[YY],ftol,abstol);
 +  cmp_rvec(fp,"inputrec->ref_p(z)",-1,ir1->ref_p[ZZ],ir2->ref_p[ZZ],ftol,abstol);
 +  cmp_rvec(fp,"inputrec->compress(x)",-1,ir1->compress[XX],ir2->compress[XX],ftol,abstol);
 +  cmp_rvec(fp,"inputrec->compress(y)",-1,ir1->compress[YY],ir2->compress[YY],ftol,abstol);
 +  cmp_rvec(fp,"inputrec->compress(z)",-1,ir1->compress[ZZ],ir2->compress[ZZ],ftol,abstol);
 +  cmp_int(fp,"refcoord_scaling",-1,ir1->refcoord_scaling,ir2->refcoord_scaling);
 +  cmp_rvec(fp,"inputrec->posres_com",-1,ir1->posres_com,ir2->posres_com,ftol,abstol);
 +  cmp_rvec(fp,"inputrec->posres_comB",-1,ir1->posres_comB,ir2->posres_comB,ftol,abstol);
 +  cmp_real(fp,"inputrec->verletbuf_drift",-1,ir1->verletbuf_drift,ir2->verletbuf_drift,ftol,abstol);
 +  cmp_real(fp,"inputrec->rlist",-1,ir1->rlist,ir2->rlist,ftol,abstol);
 +  cmp_real(fp,"inputrec->rlistlong",-1,ir1->rlistlong,ir2->rlistlong,ftol,abstol);
 +  cmp_int(fp,"inputrec->nstcalclr",-1,ir1->nstcalclr,ir2->nstcalclr);
 +  cmp_real(fp,"inputrec->rtpi",-1,ir1->rtpi,ir2->rtpi,ftol,abstol);
 +  cmp_int(fp,"inputrec->coulombtype",-1,ir1->coulombtype,ir2->coulombtype);
 +  cmp_int(fp,"inputrec->coulomb_modifier",-1,ir1->coulomb_modifier,ir2->coulomb_modifier);
 +  cmp_real(fp,"inputrec->rcoulomb_switch",-1,ir1->rcoulomb_switch,ir2->rcoulomb_switch,ftol,abstol);
 +  cmp_real(fp,"inputrec->rcoulomb",-1,ir1->rcoulomb,ir2->rcoulomb,ftol,abstol);
 +  cmp_int(fp,"inputrec->vdwtype",-1,ir1->vdwtype,ir2->vdwtype);
 +  cmp_int(fp,"inputrec->vdw_modifier",-1,ir1->vdw_modifier,ir2->vdw_modifier);  cmp_real(fp,"inputrec->rvdw_switch",-1,ir1->rvdw_switch,ir2->rvdw_switch,ftol,abstol);
 +  cmp_real(fp,"inputrec->rvdw",-1,ir1->rvdw,ir2->rvdw,ftol,abstol);
 +  cmp_real(fp,"inputrec->epsilon_r",-1,ir1->epsilon_r,ir2->epsilon_r,ftol,abstol);
 +  cmp_real(fp,"inputrec->epsilon_rf",-1,ir1->epsilon_rf,ir2->epsilon_rf,ftol,abstol);
 +  cmp_real(fp,"inputrec->tabext",-1,ir1->tabext,ir2->tabext,ftol,abstol);
 +  cmp_int(fp,"inputrec->implicit_solvent",-1,ir1->implicit_solvent,ir2->implicit_solvent);
 +  cmp_int(fp,"inputrec->gb_algorithm",-1,ir1->gb_algorithm,ir2->gb_algorithm);
 +  cmp_int(fp,"inputrec->nstgbradii",-1,ir1->nstgbradii,ir2->nstgbradii);
 +  cmp_real(fp,"inputrec->rgbradii",-1,ir1->rgbradii,ir2->rgbradii,ftol,abstol);
 +  cmp_real(fp,"inputrec->gb_saltconc",-1,ir1->gb_saltconc,ir2->gb_saltconc,ftol,abstol);
 +  cmp_real(fp,"inputrec->gb_epsilon_solvent",-1,ir1->gb_epsilon_solvent,ir2->gb_epsilon_solvent,ftol,abstol);
 +  cmp_real(fp,"inputrec->gb_obc_alpha",-1,ir1->gb_obc_alpha,ir2->gb_obc_alpha,ftol,abstol);
 +  cmp_real(fp,"inputrec->gb_obc_beta",-1,ir1->gb_obc_beta,ir2->gb_obc_beta,ftol,abstol);
 +  cmp_real(fp,"inputrec->gb_obc_gamma",-1,ir1->gb_obc_gamma,ir2->gb_obc_gamma,ftol,abstol);
 +  cmp_real(fp,"inputrec->gb_dielectric_offset",-1,ir1->gb_dielectric_offset,ir2->gb_dielectric_offset,ftol,abstol);
 +  cmp_int(fp,"inputrec->sa_algorithm",-1,ir1->sa_algorithm,ir2->sa_algorithm);
 +  cmp_real(fp,"inputrec->sa_surface_tension",-1,ir1->sa_surface_tension,ir2->sa_surface_tension,ftol,abstol); 
 +
 +  cmp_int(fp,"inputrec->eDispCorr",-1,ir1->eDispCorr,ir2->eDispCorr);
 +  cmp_real(fp,"inputrec->shake_tol",-1,ir1->shake_tol,ir2->shake_tol,ftol,abstol);
 +  cmp_int(fp,"inputrec->efep",-1,ir1->efep,ir2->efep);
 +  cmp_fepvals(fp,ir1->fepvals,ir2->fepvals,ftol,abstol);
 +  cmp_int(fp,"inputrec->bSimTemp",-1,ir1->bSimTemp,ir2->bSimTemp);
 +  if ((ir1->bSimTemp == ir2->bSimTemp) && (ir1->bSimTemp))
 +  {
 +      cmp_simtempvals(fp,ir1->simtempvals,ir2->simtempvals,min(ir1->fepvals->n_lambda,ir2->fepvals->n_lambda),ftol,abstol);
 +  }
 +  cmp_int(fp,"inputrec->bExpanded",-1,ir1->bExpanded,ir2->bExpanded);
 +  if ((ir1->bExpanded == ir2->bExpanded) && (ir1->bExpanded))
 +  {
 +      cmp_expandedvals(fp,ir1->expandedvals,ir2->expandedvals,min(ir1->fepvals->n_lambda,ir2->fepvals->n_lambda),ftol,abstol);
 +  }
 +  cmp_int(fp,"inputrec->nwall",-1,ir1->nwall,ir2->nwall);
 +  cmp_int(fp,"inputrec->wall_type",-1,ir1->wall_type,ir2->wall_type);
 +  cmp_int(fp,"inputrec->wall_atomtype[0]",-1,ir1->wall_atomtype[0],ir2->wall_atomtype[0]);
 +  cmp_int(fp,"inputrec->wall_atomtype[1]",-1,ir1->wall_atomtype[1],ir2->wall_atomtype[1]);
 +  cmp_real(fp,"inputrec->wall_density[0]",-1,ir1->wall_density[0],ir2->wall_density[0],ftol,abstol);
 +  cmp_real(fp,"inputrec->wall_density[1]",-1,ir1->wall_density[1],ir2->wall_density[1],ftol,abstol);
 +  cmp_real(fp,"inputrec->wall_ewald_zfac",-1,ir1->wall_ewald_zfac,ir2->wall_ewald_zfac,ftol,abstol);
 +
 +  cmp_int(fp,"inputrec->ePull",-1,ir1->ePull,ir2->ePull);
 +  if (ir1->ePull == ir2->ePull && ir1->ePull != epullNO)
 +    cmp_pull(fp,ir1->pull,ir2->pull,ftol,abstol);
 +  
 +  cmp_int(fp,"inputrec->eDisre",-1,ir1->eDisre,ir2->eDisre);
 +  cmp_real(fp,"inputrec->dr_fc",-1,ir1->dr_fc,ir2->dr_fc,ftol,abstol);
 +  cmp_int(fp,"inputrec->eDisreWeighting",-1,ir1->eDisreWeighting,ir2->eDisreWeighting);
 +  cmp_int(fp,"inputrec->bDisreMixed",-1,ir1->bDisreMixed,ir2->bDisreMixed);
 +  cmp_int(fp,"inputrec->nstdisreout",-1,ir1->nstdisreout,ir2->nstdisreout);
 +  cmp_real(fp,"inputrec->dr_tau",-1,ir1->dr_tau,ir2->dr_tau,ftol,abstol);
 +  cmp_real(fp,"inputrec->orires_fc",-1,ir1->orires_fc,ir2->orires_fc,ftol,abstol);
 +  cmp_real(fp,"inputrec->orires_tau",-1,ir1->orires_tau,ir2->orires_tau,ftol,abstol);
 +  cmp_int(fp,"inputrec->nstorireout",-1,ir1->nstorireout,ir2->nstorireout);
 +  cmp_real(fp,"inputrec->dihre_fc",-1,ir1->dihre_fc,ir2->dihre_fc,ftol,abstol);
 +  cmp_real(fp,"inputrec->em_stepsize",-1,ir1->em_stepsize,ir2->em_stepsize,ftol,abstol);
 +  cmp_real(fp,"inputrec->em_tol",-1,ir1->em_tol,ir2->em_tol,ftol,abstol);
 +  cmp_int(fp,"inputrec->niter",-1,ir1->niter,ir2->niter);
 +  cmp_real(fp,"inputrec->fc_stepsize",-1,ir1->fc_stepsize,ir2->fc_stepsize,ftol,abstol);
 +  cmp_int(fp,"inputrec->nstcgsteep",-1,ir1->nstcgsteep,ir2->nstcgsteep);
 +  cmp_int(fp,"inputrec->nbfgscorr",0,ir1->nbfgscorr,ir2->nbfgscorr);
 +  cmp_int(fp,"inputrec->eConstrAlg",-1,ir1->eConstrAlg,ir2->eConstrAlg);
 +  cmp_int(fp,"inputrec->nProjOrder",-1,ir1->nProjOrder,ir2->nProjOrder);
 +  cmp_real(fp,"inputrec->LincsWarnAngle",-1,ir1->LincsWarnAngle,ir2->LincsWarnAngle,ftol,abstol);
 +  cmp_int(fp,"inputrec->nLincsIter",-1,ir1->nLincsIter,ir2->nLincsIter);
 +  cmp_real(fp,"inputrec->bd_fric",-1,ir1->bd_fric,ir2->bd_fric,ftol,abstol);
 +  cmp_int(fp,"inputrec->ld_seed",-1,ir1->ld_seed,ir2->ld_seed);
 +  cmp_real(fp,"inputrec->cos_accel",-1,ir1->cos_accel,ir2->cos_accel,ftol,abstol);
 +  cmp_rvec(fp,"inputrec->deform(a)",-1,ir1->deform[XX],ir2->deform[XX],ftol,abstol);
 +  cmp_rvec(fp,"inputrec->deform(b)",-1,ir1->deform[YY],ir2->deform[YY],ftol,abstol);
 +  cmp_rvec(fp,"inputrec->deform(c)",-1,ir1->deform[ZZ],ir2->deform[ZZ],ftol,abstol);
 +
 +  
 +  cmp_bool(fp,"ir->bAdress->type" ,-1,ir1->bAdress,ir2->bAdress);
 +  if (ir1->bAdress && ir2->bAdress) {
 +      cmp_adress(fp,ir1->adress,ir2->adress,ftol,abstol);
 +  }
 +
 +  cmp_int(fp,"inputrec->userint1",-1,ir1->userint1,ir2->userint1);
 +  cmp_int(fp,"inputrec->userint2",-1,ir1->userint2,ir2->userint2);
 +  cmp_int(fp,"inputrec->userint3",-1,ir1->userint3,ir2->userint3);
 +  cmp_int(fp,"inputrec->userint4",-1,ir1->userint4,ir2->userint4);
 +  cmp_real(fp,"inputrec->userreal1",-1,ir1->userreal1,ir2->userreal1,ftol,abstol);
 +  cmp_real(fp,"inputrec->userreal2",-1,ir1->userreal2,ir2->userreal2,ftol,abstol);
 +  cmp_real(fp,"inputrec->userreal3",-1,ir1->userreal3,ir2->userreal3,ftol,abstol);
 +  cmp_real(fp,"inputrec->userreal4",-1,ir1->userreal4,ir2->userreal4,ftol,abstol);
 +  cmp_grpopts(fp,&(ir1->opts),&(ir2->opts),ftol,abstol);
 +  cmp_cosines(fp,"ex",ir1->ex,ir2->ex,ftol,abstol);
 +  cmp_cosines(fp,"et",ir1->et,ir2->et,ftol,abstol);
 +}
 +
 +static void comp_pull_AB(FILE *fp,t_pull *pull,real ftol,real abstol)
 +{
 +  int i;
 +
 +  for(i=0; i<pull->ngrp+1; i++) {
 +    fprintf(fp,"comparing pull group %d\n",i);
 +    cmp_real(fp,"pullgrp->k",-1,pull->grp[i].k,pull->grp[i].kB,ftol,abstol);
 +  }
 +}
 +
 +static void comp_state(t_state *st1, t_state *st2,
 +                     gmx_bool bRMSD,real ftol,real abstol)
 +{
 +  int i,j,nc;
 +
 +  fprintf(stdout,"comparing flags\n");
 +  cmp_int(stdout,"flags",-1,st1->flags,st2->flags);
 +  fprintf(stdout,"comparing box\n");
 +  cmp_rvecs(stdout,"box",DIM,st1->box,st2->box,FALSE,ftol,abstol);
 +  fprintf(stdout,"comparing box_rel\n");
 +  cmp_rvecs(stdout,"box_rel",DIM,st1->box_rel,st2->box_rel,FALSE,ftol,abstol);
 +  fprintf(stdout,"comparing boxv\n");
 +  cmp_rvecs(stdout,"boxv",DIM,st1->boxv,st2->boxv,FALSE,ftol,abstol);
 +  if (st1->flags & (1<<estSVIR_PREV)) {
 +    fprintf(stdout,"comparing shake vir_prev\n");
 +    cmp_rvecs_rmstol(stdout,"svir_prev",DIM,st1->svir_prev,st2->svir_prev,ftol,abstol);
 +  }
 +  if (st1->flags & (1<<estFVIR_PREV)) {
 +    fprintf(stdout,"comparing force vir_prev\n");
 +    cmp_rvecs_rmstol(stdout,"fvir_prev",DIM,st1->fvir_prev,st2->fvir_prev,ftol,abstol);
 +  }
 +  if (st1->flags & (1<<estPRES_PREV)) {
 +    fprintf(stdout,"comparing prev_pres\n");
 +    cmp_rvecs_rmstol(stdout,"pres_prev",DIM,st1->pres_prev,st2->pres_prev,ftol,abstol);
 +  }
 +  cmp_int(stdout,"ngtc",-1,st1->ngtc,st2->ngtc);
 +  cmp_int(stdout,"nhchainlength",-1,st1->nhchainlength,st2->nhchainlength);
 +  if (st1->ngtc == st2->ngtc && st1->nhchainlength == st2->nhchainlength){
 +    for(i=0; i<st1->ngtc; i++) {
 +      nc = i*st1->nhchainlength;
 +      for(j=0; j<nc; j++) {
 +      cmp_real(stdout,"nosehoover_xi",
 +               i,st1->nosehoover_xi[nc+j],st2->nosehoover_xi[nc+j],ftol,abstol);
 +      }
 +    }
 +  }
 +  cmp_int(stdout,"nnhpres",-1,st1->nnhpres,st2->nnhpres);
 +  if (st1->nnhpres == st2->nnhpres && st1->nhchainlength == st2->nhchainlength) {
 +    for(i=0; i<st1->nnhpres; i++) {
 +      nc = i*st1->nhchainlength;
 +      for(j=0; j<nc; j++) {
 +      cmp_real(stdout,"nosehoover_xi",
 +               i,st1->nhpres_xi[nc+j],st2->nhpres_xi[nc+j],ftol,abstol);
 +      }
 +    }
 +  }
 +
 +  cmp_int(stdout,"natoms",-1,st1->natoms,st2->natoms);
 +  if (st1->natoms == st2->natoms) {
 +    if ((st1->flags & (1<<estX)) && (st2->flags & (1<<estX))) {
 +      fprintf(stdout,"comparing x\n");
 +      cmp_rvecs(stdout,"x",st1->natoms,st1->x,st2->x,bRMSD,ftol,abstol);
 +    }
 +    if ((st1->flags & (1<<estV)) && (st2->flags & (1<<estV))) {
 +      fprintf(stdout,"comparing v\n");
 +      cmp_rvecs(stdout,"v",st1->natoms,st1->v,st2->v,bRMSD,ftol,abstol);
 +    }
 +  }
 +}
 +
 +void comp_tpx(const char *fn1,const char *fn2,
 +            gmx_bool bRMSD,real ftol,real abstol)
 +{
 +  const char  *ff[2];
 +  t_tpxheader sh[2];
 +  t_inputrec  ir[2];
 +  t_state     state[2];
 +  gmx_mtop_t  mtop[2];
 +  t_topology  top[2];
 +  int         i;
 +
 +  ff[0]=fn1;
 +  ff[1]=fn2;
 +  for(i=0; i<(fn2 ? 2 : 1); i++) {
 +    read_tpx_state(ff[i],&(ir[i]),&state[i],NULL,&(mtop[i]));
 +  }
 +  if (fn2) {
 +    cmp_inputrec(stdout,&ir[0],&ir[1],ftol,abstol);
 +    /* Convert gmx_mtop_t to t_topology.
 +     * We should implement direct mtop comparison,
 +     * but it might be useful to keep t_topology comparison as an option.
 +     */
 +    top[0] = gmx_mtop_t_to_t_topology(&mtop[0]);
 +    top[1] = gmx_mtop_t_to_t_topology(&mtop[1]);
 +    cmp_top(stdout,&top[0],&top[1],ftol,abstol);
 +    cmp_groups(stdout,&mtop[0].groups,&mtop[1].groups,
 +             mtop[0].natoms,mtop[1].natoms);
 +    comp_state(&state[0],&state[1],bRMSD,ftol,abstol);
 +  } else {
 +    if (ir[0].efep == efepNO) {
 +      fprintf(stdout,"inputrec->efep = %s\n",efep_names[ir[0].efep]);
 +    } else {
 +      if (ir[0].ePull != epullNO) {
 +      comp_pull_AB(stdout,ir->pull,ftol,abstol);
 +      }
 +      /* Convert gmx_mtop_t to t_topology.
 +       * We should implement direct mtop comparison,
 +       * but it might be useful to keep t_topology comparison as an option.
 +       */
 +      top[0] = gmx_mtop_t_to_t_topology(&mtop[0]);
 +      cmp_top(stdout,&top[0],NULL,ftol,abstol);
 +    }
 +  }
 +}
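When comp_tpx is given only one file (fn2 == NULL) and free energy is in use, the comparison runs within that file between the A and B topology states: cmp_top/cmp_atoms with a NULL second argument fall back to comparing per-atom A parameters against their B counterparts (e.g. atom.q against atom.qB, as in cmp_atom above), and comp_pull_AB checks each pull group's force constant k against kB.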
 +
 +void comp_frame(FILE *fp, t_trxframe *fr1, t_trxframe *fr2,
 +              gmx_bool bRMSD, real ftol,real abstol)
 +{
 +  fprintf(fp,"\n");
 +  cmp_int(fp,"flags",-1,fr1->flags,fr2->flags);
 +  cmp_int(fp,"not_ok",-1,fr1->not_ok,fr2->not_ok);
 +  cmp_int(fp,"natoms",-1,fr1->natoms,fr2->natoms);
 +  cmp_real(fp,"t0",-1,fr1->t0,fr2->t0,ftol,abstol);
 +  if (cmp_bool(fp,"bTitle",-1,fr1->bTitle,fr2->bTitle))
 +    cmp_str(fp,"title", -1, fr1->title, fr2->title);
 +  if (cmp_bool(fp,"bStep",-1,fr1->bStep,fr2->bStep))
 +    cmp_int(fp,"step",-1,fr1->step,fr2->step);
 +  cmp_int(fp,"step",-1,fr1->step,fr2->step);
 +  if (cmp_bool(fp,"bTime",-1,fr1->bTime,fr2->bTime))   
 +    cmp_real(fp,"time",-1,fr1->time,fr2->time,ftol,abstol);
 +  if (cmp_bool(fp,"bLambda",-1,fr1->bLambda,fr2->bLambda)) 
 +    cmp_real(fp,"lambda",-1,fr1->lambda,fr2->lambda,ftol,abstol);
 +  if (cmp_bool(fp,"bAtoms",-1,fr1->bAtoms,fr2->bAtoms))
 +    cmp_atoms(fp,fr1->atoms,fr2->atoms,ftol,abstol);
 +  if (cmp_bool(fp,"bPrec",-1,fr1->bPrec,fr2->bPrec))
 +    cmp_real(fp,"prec",-1,fr1->prec,fr2->prec,ftol,abstol);
 +  if (cmp_bool(fp,"bX",-1,fr1->bX,fr2->bX))
 +    cmp_rvecs(fp,"x",min(fr1->natoms,fr2->natoms),fr1->x,fr2->x,bRMSD,ftol,abstol);
 +  if (cmp_bool(fp,"bV",-1,fr1->bV,fr2->bV))
 +    cmp_rvecs(fp,"v",min(fr1->natoms,fr2->natoms),fr1->v,fr2->v,bRMSD,ftol,abstol);
 +  if (cmp_bool(fp,"bF",-1,fr1->bF,fr2->bF))
 +    cmp_rvecs_rmstol(fp,"f",min(fr1->natoms,fr2->natoms),fr1->f,fr2->f,ftol,abstol);
 +  if (cmp_bool(fp,"bBox",-1,fr1->bBox,fr2->bBox))
 +    cmp_rvecs(fp,"box",3,fr1->box,fr2->box,FALSE,ftol,abstol);
 +}
 +
 +void comp_trx(const output_env_t oenv,const char *fn1, const char *fn2, 
 +              gmx_bool bRMSD,real ftol,real abstol)
 +{
 +  int i;
 +  const char *fn[2];
 +  t_trxframe fr[2];
 +  t_trxstatus *status[2];
 +  gmx_bool b[2];
 +  
 +  fn[0]=fn1;
 +  fn[1]=fn2;
 +  fprintf(stderr,"Comparing trajectory files %s and %s\n",fn1,fn2);
 +  for (i=0; i<2; i++)
 +    b[i] = read_first_frame(oenv,&status[i],fn[i],&fr[i],TRX_READ_X|TRX_READ_V|TRX_READ_F);
 +  
 +  if (b[0] && b[1]) { 
 +    do {
 +      comp_frame(stdout, &(fr[0]), &(fr[1]), bRMSD, ftol, abstol);
 +      
 +      for (i=0; i<2; i++)
 +      b[i] = read_next_frame(oenv,status[i],&fr[i]);
 +    } while (b[0] && b[1]);
 +    
 +    for (i=0; i<2; i++) {
 +      if (b[i] && !b[1-i])
 +      fprintf(stdout,"\nEnd of file on %s but not on %s\n",fn[1-i],fn[i]);
 +      close_trj(status[i]);
 +    }
 +  }
 +  if (!b[0] && !b[1])
 +    fprintf(stdout,"\nBoth files read correctly\n");
 +}
 +
 +static real ener_tensor_diag(int n,int *ind1,int *ind2,
 +                           gmx_enxnm_t *enm1,
 +                           int *tensi,int i,
 +                           t_energy e1[],t_energy e2[])
 +{
 +  int  d1,d2;
 +  int  len;
 +  int  j;
 +  real prod1,prod2;
 +  int  nfound;
 +
 +  d1 = tensi[i]/DIM;
 +  d2 = tensi[i] - d1*DIM;
 +  
 +  /* Find the diagonal elements d1 and d2 */
 +  len = strlen(enm1[ind1[i]].name);
 +  prod1 = 1;
 +  prod2 = 1;
 +  nfound = 0;
 +  for(j=0; j<n; j++) {
 +    if (tensi[j] >= 0 &&
 +      strlen(enm1[ind1[j]].name) == len &&
 +      strncmp(enm1[ind1[i]].name,enm1[ind1[j]].name,len-2) == 0 &&
 +      (tensi[j] == d1*DIM+d1 || tensi[j] == d2*DIM+d2)) {
 +      prod1 *= fabs(e1[ind1[j]].e);
 +      prod2 *= fabs(e2[ind2[j]].e);
 +      nfound++;
 +    }
 +  }
 +
 +  if (nfound == 2) {
 +    return 0.5*(sqrt(prod1) + sqrt(prod2));
 +  } else {
 +    return 0;
 +  }
 +}
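To make the reference magnitude concrete: for an off-diagonal element such as Pres-XY, ener_tensor_diag locates the matching diagonal elements Pres-XX and Pres-YY and returns the average over the two files of the geometric mean of their magnitudes. With illustrative values Pres-XX = 400 and Pres-YY = 100 in both files, the result is 0.5*(sqrt(400*100) + sqrt(400*100)) = 200, and cmp_energies below then uses ftol*200 as the absolute tolerance for Pres-XY.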
 +
 +static gmx_bool enernm_equal(const char *nm1,const char *nm2)
 +{
 +  int len1,len2;
 +
 +  len1 = strlen(nm1);
 +  len2 = strlen(nm2);
 +
 +  /* Remove " (bar)" at the end of a name */
 +  if (len1 > 6 && strcmp(nm1+len1-6," (bar)") == 0) {
 +    len1 -= 6;
 +  }
 +  if (len2 > 6 && strcmp(nm2+len2-6," (bar)") == 0) {
 +    len2 -= 6;
 +  }
 +
 +  return (len1 == len2 && gmx_strncasecmp(nm1,nm2,len1) == 0);
 +}
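For example, enernm_equal("Pressure (bar)", "pressure") returns TRUE, since the " (bar)" suffix is stripped and the remaining comparison is case-insensitive, while enernm_equal("Pres-XX", "Pres-XY") returns FALSE.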
 +
 +static void cmp_energies(FILE *fp,int step1,int step2,
 +                       t_energy e1[],t_energy e2[],
 +                       gmx_enxnm_t *enm1,gmx_enxnm_t *enm2,
 +                       real ftol,real abstol,
 +                       int nre,int *ind1,int *ind2,int maxener)
 +{
 +  int  i,ii;
 +  int  *tensi,len,d1,d2;
 +  real ftol_i,abstol_i;
 +
 +  snew(tensi,maxener);
 +  /* Check for tensor elements ending on "-XX", "-XY", ... , "-ZZ" */
 +  for(i=0; (i<maxener); i++) {
 +    ii = ind1[i];
 +    tensi[i] = -1;
 +    len = strlen(enm1[ii].name);
 +    if (len > 3 && enm1[ii].name[len-3] == '-') {
 +      d1 = enm1[ii].name[len-2] - 'X';
 +      d2 = enm1[ii].name[len-1] - 'X';
 +      if (d1 >= 0 && d1 < DIM &&
 +        d2 >= 0 && d2 < DIM) {
 +      tensi[i] = d1*DIM + d2;
 +      }
 +    }
 +  }
 +  
 +  for(i=0; (i<maxener); i++) {
 +    /* Check if this is an off-diagonal tensor element */
 +    if (tensi[i] >= 0 && tensi[i] != 0 && tensi[i] != 4 && tensi[i] != 8) {
 +      /* Turn on the relative tolerance check (4 is maximum relative diff.) */
 +      ftol_i = 5;
 +      /* Do the relative tolerance through an absolute tolerance times
 +       * the size of diagonal components of the tensor.
 +       */
 +      abstol_i = ftol*ener_tensor_diag(nre,ind1,ind2,enm1,tensi,i,e1,e2);
 +      if (debug) {
 +      fprintf(debug,"tensor '%s' val %f diag %f\n",
 +              enm1[ind1[i]].name,e1[ind1[i]].e,abstol_i/ftol);
 +      }
 +      if (abstol_i > 0) {
 +      /* We found a diagonal, we need to check with the minimum tolerance */
 +      abstol_i = min(abstol_i,abstol);
 +      } else {
 +      /* We did not find a diagonal, ignore the relative tolerance check */
 +      abstol_i = abstol;
 +      }
 +    } else {
 +      ftol_i   = ftol;
 +      abstol_i = abstol;
 +    }
 +    if (!equal_real(e1[ind1[i]].e,e2[ind2[i]].e,ftol_i,abstol_i)) {
 +      fprintf(fp,"%-15s  step %3d:  %12g,  step %3d: %12g\n",
 +            enm1[ind1[i]].name,
 +            step1,e1[ind1[i]].e,
 +            step2,e2[ind2[i]].e);
 +    }
 +  }
 +
 +  sfree(tensi);
 +}
 +
 +#if 0
 +static void cmp_disres(t_enxframe *fr1,t_enxframe *fr2,real ftol, real abstol)
 +{
 +  int i;
 +  char bav[64],bt[64],bs[22];
 +    
 +  cmp_int(stdout,"ndisre",-1,fr1->ndisre,fr2->ndisre);
 +  if ((fr1->ndisre == fr2->ndisre) && (fr1->ndisre > 0)) {
 +    sprintf(bav,"step %s: disre rav",gmx_step_str(fr1->step,bs));
 +    sprintf(bt, "step %s: disre  rt",gmx_step_str(fr1->step,bs));
 +    for(i=0; (i<fr1->ndisre); i++) {
 +      cmp_real(stdout,bav,i,fr1->disre_rm3tav[i],fr2->disre_rm3tav[i],ftol,abstol);
 +      cmp_real(stdout,bt ,i,fr1->disre_rt[i]    ,fr2->disre_rt[i]    ,ftol,abstol);
 +    }
 +  }
 +}
 +#endif
 +
 +static void cmp_eblocks(t_enxframe *fr1,t_enxframe *fr2,real ftol, real abstol)
 +{
 +    int i,j,k;
 +    char buf[64],bs[22];
 +
 +    cmp_int(stdout,"nblock",-1,fr1->nblock,fr2->nblock);  
 +    if ((fr1->nblock == fr2->nblock) && (fr1->nblock > 0)) 
 +    {
 +        for(j=0; (j<fr1->nblock); j++) 
 +        {
 +            t_enxblock *b1, *b2; /* convenience vars */
 +
 +            b1=&(fr1->block[j]);
 +            b2=&(fr2->block[j]);
 +
 +            sprintf(buf,"step %s: block[%d]",gmx_step_str(fr1->step,bs),j);
 +            cmp_int(stdout,buf,-1,b1->nsub,b2->nsub);
 +            cmp_int(stdout,buf,-1,b1->id,b2->id);
 +
 +            if ( (b1->nsub==b2->nsub) && (b1->id==b2->id) )
 +            {
 +                for(i=0;i<b1->nsub;i++)
 +                {
 +                    t_enxsubblock *s1, *s2;
 +
 +                    s1=&(b1->sub[i]);
 +                    s2=&(b2->sub[i]);
 +
 +                    cmp_int(stdout, buf, -1, (int)s1->type, (int)s2->type);
 +                    cmp_gmx_large_int(stdout, buf, s1->nr, s2->nr);
 +
 +                    if ((s1->type == s2->type) && (s1->nr == s2->nr))
 +                    {
 +                        switch(s1->type)
 +                        {
 +                            case xdr_datatype_float:
 +                                for(k=0;k<s1->nr;k++)
 +                                {
 +                                    cmp_float(stdout, buf, i,
 +                                             s1->fval[k], s2->fval[k], 
 +                                             ftol, abstol);
 +                                }
 +                                break;
 +                            case xdr_datatype_double:
 +                                for(k=0;k<s1->nr;k++)
 +                                {
 +                                    cmp_double(stdout, buf, i,
 +                                             s1->dval[k], s2->dval[k], 
 +                                             ftol, abstol);
 +                                }
 +                                break;
 +                            case xdr_datatype_int:
 +                                for(k=0;k<s1->nr;k++)
 +                                {
 +                                    cmp_int(stdout, buf, i,
 +                                            s1->ival[k], s2->ival[k]);
 +                                }
 +                                break;
 +                            case xdr_datatype_large_int:
 +                                for(k=0;k<s1->nr;k++)
 +                                {
 +                                    cmp_gmx_large_int(stdout, buf, 
 +                                                      s1->lval[k], s2->lval[k]);
 +                                }
 +                                break;
 +                            case xdr_datatype_char:
 +                                for(k=0;k<s1->nr;k++)
 +                                {
 +                                    cmp_uc(stdout, buf, i,
 +                                           s1->cval[k], s2->cval[k]);
 +                                }
 +                                break;
 +                            case xdr_datatype_string:
 +                                for(k=0;k<s1->nr;k++)
 +                                {
 +                                    cmp_str(stdout, buf, i,
 +                                            s1->sval[k], s2->sval[k]);
 +                                }
 +                                break;
 +                            default:
 +                                gmx_incons("Unknown data type!!");
 +                        }
 +                    }
 +                }
 +            }
 +        }
 +    }
 +}
 +
 +void comp_enx(const char *fn1,const char *fn2,real ftol,real abstol,const char *lastener)
 +{
 +  int        nre,nre1,nre2,block;
 +  ener_file_t in1, in2;
 +  int        i,j,maxener,*ind1,*ind2,*have;
 +  char       buf[256];
 +  gmx_enxnm_t *enm1=NULL,*enm2=NULL;
 +  t_enxframe *fr1,*fr2;
 +  gmx_bool       b1,b2;
 +  
 +  fprintf(stdout,"comparing energy file %s and %s\n\n",fn1,fn2);
 +
 +  in1 = open_enx(fn1,"r");
 +  in2 = open_enx(fn2,"r");
 +  do_enxnms(in1,&nre1,&enm1);
 +  do_enxnms(in2,&nre2,&enm2);
 +  if (nre1 != nre2) {
 +    fprintf(stdout,"There are %d and %d terms in the energy files\n\n",
 +          nre1,nre2);
 +  } else {
 +    fprintf(stdout,"There are %d terms in the energy files\n\n",nre1);
 +  }
 +
 +  snew(ind1,nre1);
 +  snew(ind2,nre2);
 +  snew(have,nre2);
 +  nre = 0;
 +  for(i=0; i<nre1; i++) {
 +    for(j=0; j<nre2; j++) {
 +      if (enernm_equal(enm1[i].name,enm2[j].name)) {
 +      ind1[nre] = i;
 +      ind2[nre] = j;
 +      have[j] = 1;
 +      nre++;
 +      break;
 +      }
 +    }
 +    if (nre == 0 || ind1[nre-1] != i) {
 +      cmp_str(stdout,"enm",i,enm1[i].name,"-");
 +    }
 +  }
 +  for(i=0; i<nre2; i++) {
 +    if (have[i] == 0) {
 +      cmp_str(stdout,"enm",i,"-",enm2[i].name);
 +    }
 +  }
 +
 +  maxener = nre;
 +  for(i=0; i<nre; i++) {
 +    if ((lastener != NULL) && (strstr(enm1[i].name,lastener) != NULL)) {
 +      maxener=i+1;
 +      break;
 +    }
 +  }
 +
 +  fprintf(stdout,"There are %d terms to compare in the energy files\n\n",
 +        maxener);
 +
 +  for(i=0; i<maxener; i++) {
 +    cmp_str(stdout,"unit",i,enm1[ind1[i]].unit,enm2[ind2[i]].unit);
 +  }
 +  
 +  snew(fr1,1);
 +  snew(fr2,1);
 +  do { 
 +    b1 = do_enx(in1,fr1);
 +    b2 = do_enx(in2,fr2);
 +    if (b1 && !b2)
 +      fprintf(stdout,"\nEnd of file on %s but not on %s\n",fn2,fn1);
 +    else if (!b1 && b2) 
 +      fprintf(stdout,"\nEnd of file on %s but not on %s\n",fn1,fn2);
 +    else if (!b1 && !b2)
 +      fprintf(stdout,"\nFiles read successfully\n");
 +    else {
 +      cmp_real(stdout,"t",-1,fr1->t,fr2->t,ftol,abstol);
 +      cmp_int(stdout,"step",-1,fr1->step,fr2->step);
 +      /* We don't want to print the nre mismatch for every frame */
 +      /* cmp_int(stdout,"nre",-1,fr1->nre,fr2->nre); */
 +      if ((fr1->nre >= nre) && (fr2->nre >= nre))
 +      cmp_energies(stdout,fr1->step,fr1->step,fr1->ener,fr2->ener,
 +                   enm1,enm2,ftol,abstol,nre,ind1,ind2,maxener);
 +      /*cmp_disres(fr1,fr2,ftol,abstol);*/
 +      cmp_eblocks(fr1,fr2,ftol,abstol);
 +    }
 +  } while (b1 && b2);
 +    
 +  close_enx(in1);
 +  close_enx(in2);
 +  
 +  free_enxframe(fr2);
 +  sfree(fr2);
 +  free_enxframe(fr1);
 +  sfree(fr1);
 +}
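A note on the matching above: ind1/ind2 form a cross-index so that entry k denotes the same physical term in both files even when the files order their terms differently. If, say, file 1 lists {Potential, Pressure} and file 2 lists {Pressure, Potential}, the name-matching loop yields nre = 2 with ind1 = {0, 1} and ind2 = {1, 0}, and cmp_energies then compares e1[ind1[k]] against e2[ind2[k]].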
index f580688ed8105d0e0a0af163b7ea02e9e98ca3fa,0000000000000000000000000000000000000000..4b42be324e290a1b72859e64132af73b8ee063fb
mode 100644,000000..100644
--- /dev/null
@@@ -1,489 -1,0 +1,492 @@@
-                   asize(desc),desc,0,NULL,&oenv);
 +/*
 + * 
 + *                This source code is part of
 + * 
 + *                 G   R   O   M   A   C   S
 + * 
 + *          GROningen MAchine for Chemical Simulations
 + * 
 + *                        VERSION 3.2.0
 + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
 + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 + * Copyright (c) 2001-2004, The GROMACS development team,
 + * check out http://www.gromacs.org for more information.
 + *
 + * This program is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU General Public License
 + * as published by the Free Software Foundation; either version 2
 + * of the License, or (at your option) any later version.
 + * 
 + * If you want to redistribute modifications, please consider that
 + * scientific software is very special. Version control is crucial -
 + * bugs must be traceable. We will be happy to consider code for
 + * inclusion in the official distribution, but derived work must not
 + * be called official GROMACS. Details are found in the README & COPYING
 + * files - if they are missing, get the official version at www.gromacs.org.
 + * 
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the papers on the package - you can find them in the top README file.
 + * 
 + * For more info, check our website at http://www.gromacs.org
 + * 
 + * And Hey:
 + * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon
 + */
 +#ifdef HAVE_CONFIG_H
 +#include <config.h>
 +#endif
 +
 +#include <stdio.h>
 +#include <string.h>
 +#include <math.h>
 +#include <assert.h>
 +#include "main.h"
 +#include "macros.h"
 +#include "futil.h"
 +#include "statutil.h"
 +#include "copyrite.h"
 +#include "sysstuff.h"
 +#include "txtdump.h"
 +#include "gmx_fatal.h"
 +#include "xtcio.h"
 +#include "enxio.h"
 +#include "smalloc.h"
 +#include "names.h"
 +#include "gmxfio.h"
 +#include "tpxio.h"
 +#include "trnio.h"
 +#include "txtdump.h"
 +#include "gmxcpp.h"
 +#include "checkpoint.h"
 +#include "mtop_util.h"
 +
 +#include "gromacs/linearalgebra/mtxio.h"
 +#include "gromacs/linearalgebra/sparsematrix.h"
 +
 +
 +static void list_tpx(const char *fn, gmx_bool bShowNumbers,const char *mdpfn,
 +                     gmx_bool bSysTop)
 +{
 +  FILE *gp;
 +  int         fp,indent,i,j,**gcount,atot;
 +  t_state     state;
 +  rvec        *f=NULL;
 +  t_inputrec  ir;
 +  t_tpxheader tpx;
 +  gmx_mtop_t  mtop;
 +  gmx_groups_t *groups;
 +  t_topology  top;
 +
 +  read_tpxheader(fn,&tpx,TRUE,NULL,NULL);
 +
 +  if (tpx.bF) {
 +    /* allocate storage so the forces are actually read below */
 +    snew(f,tpx.natoms);
 +  }
 +
 +  read_tpx_state(fn,
 +               tpx.bIr  ? &ir : NULL,
 +               &state,tpx.bF ? f : NULL,
 +               tpx.bTop ? &mtop: NULL);
 +  
 +  if (mdpfn && tpx.bIr) {
 +    gp = gmx_fio_fopen(mdpfn,"w");
 +    pr_inputrec(gp,0,NULL,&(ir),TRUE);
 +    gmx_fio_fclose(gp);
 +  }
 +
 +  if (!mdpfn) {  
 +    if (bSysTop)
 +      top = gmx_mtop_t_to_t_topology(&mtop);
 +
 +    if (available(stdout,&tpx,0,fn)) {
 +      indent=0;
 +      indent=pr_title(stdout,indent,fn);
 +      pr_inputrec(stdout,0,"inputrec",tpx.bIr ? &(ir) : NULL,FALSE);
 +      
 +      indent = 0;
 +      pr_header(stdout,indent,"header",&(tpx));
 +      
 +      if (!bSysTop)
 +      pr_mtop(stdout,indent,"topology",&(mtop),bShowNumbers);
 +      else
 +      pr_top(stdout,indent,"topology",&(top),bShowNumbers);
 +
 +      pr_rvecs(stdout,indent,"box",tpx.bBox ? state.box : NULL,DIM);
 +      pr_rvecs(stdout,indent,"box_rel",tpx.bBox ? state.box_rel : NULL,DIM);
 +      pr_rvecs(stdout,indent,"boxv",tpx.bBox ? state.boxv : NULL,DIM);
 +      pr_rvecs(stdout,indent,"pres_prev",tpx.bBox ? state.pres_prev : NULL,DIM);
 +      pr_rvecs(stdout,indent,"svir_prev",tpx.bBox ? state.svir_prev : NULL,DIM);
 +      pr_rvecs(stdout,indent,"fvir_prev",tpx.bBox ? state.fvir_prev : NULL,DIM);
 +      /* leave nosehoover_xi in for now to match the tpr version */
 +      pr_doubles(stdout,indent,"nosehoover_xi",state.nosehoover_xi,state.ngtc);
 +      /*pr_doubles(stdout,indent,"nosehoover_vxi",state.nosehoover_vxi,state.ngtc);*/
 +      /*pr_doubles(stdout,indent,"therm_integral",state.therm_integral,state.ngtc);*/
 +      pr_rvecs(stdout,indent,"x",tpx.bX ? state.x : NULL,state.natoms);
 +      pr_rvecs(stdout,indent,"v",tpx.bV ? state.v : NULL,state.natoms);
 +      if (tpx.bF) {
 +      pr_rvecs(stdout,indent,"f",f,state.natoms);
 +      }
 +    }
 +    
 +    groups = &mtop.groups;
 +
 +    snew(gcount,egcNR);
 +    for(i=0; (i<egcNR); i++) 
 +      snew(gcount[i],groups->grps[i].nr);
 +    
 +    for(i=0; (i<mtop.natoms); i++) {
 +      for(j=0; (j<egcNR); j++) 
 +      gcount[j][ggrpnr(groups,j,i)]++;
 +    }
 +    printf("Group statistics\n");
 +    for(i=0; (i<egcNR); i++) {
 +      atot=0;
 +      printf("%-12s: ",gtypes[i]);
 +      for(j=0; (j<groups->grps[i].nr); j++) {
 +      printf("  %5d",gcount[i][j]);
 +      atot+=gcount[i][j];
 +      }
 +      printf("  (total %d atoms)\n",atot);
 +      sfree(gcount[i]);
 +    }
 +    sfree(gcount);
 +  }
 +  done_state(&state);
 +  sfree(f);
 +}
 +
 +static void list_top(const char *fn)
 +{
 +  int status,done;
 +#define BUFLEN 256
 +  char buf[BUFLEN];
 +  gmx_cpp_t handle;
 +  char *cppopts[] = { NULL };
 +
 +  status = cpp_open_file(fn,&handle,cppopts);
 +  if (status != 0) 
 +    gmx_fatal(FARGS,cpp_error(&handle,status));
 +  do {
 +    status = cpp_read_line(&handle,BUFLEN,buf);
 +    done = (status == eCPP_EOF);
 +    if (!done) {
 +      if (status != eCPP_OK)
 +      gmx_fatal(FARGS,cpp_error(&handle,status));
 +      else 
 +      printf("%s\n",buf);
 +    }
 +  } while (!done);
 +  status = cpp_close_file(&handle);
 +  if (status != eCPP_OK) 
 +    gmx_fatal(FARGS,cpp_error(&handle,status));
 +}
 +
 +static void list_trn(const char *fn)
 +{
 +  t_fileio    *fpread, *fpwrite;
 +  int         nframe,indent;
 +  char        buf[256];
 +  rvec        *x,*v,*f;
 +  matrix      box;
 +  t_trnheader trn;
 +  gmx_bool        bOK;
 +
 +  fpread  = open_trn(fn,"r"); 
 +  fpwrite = open_tpx(NULL,"w");
 +  gmx_fio_setdebug(fpwrite,TRUE);
 +  
 +  nframe = 0;
 +  while (fread_trnheader(fpread,&trn,&bOK)) {
 +    snew(x,trn.natoms);
 +    snew(v,trn.natoms);
 +    snew(f,trn.natoms);
 +    if (fread_htrn(fpread,&trn,
 +                 trn.box_size ? box : NULL,
 +                 trn.x_size   ? x : NULL,
 +                 trn.v_size   ? v : NULL,
 +                 trn.f_size   ? f : NULL)) {
 +      sprintf(buf,"%s frame %d",fn,nframe);
 +      indent=0;
 +      indent=pr_title(stdout,indent,buf);
 +      pr_indent(stdout,indent);
 +      fprintf(stdout,"natoms=%10d  step=%10d  time=%12.7e  lambda=%10g\n",
 +            trn.natoms,trn.step,trn.t,trn.lambda);
 +      if (trn.box_size)
 +      pr_rvecs(stdout,indent,"box",box,DIM);
 +      if (trn.x_size)
 +      pr_rvecs(stdout,indent,"x",x,trn.natoms);
 +      if (trn.v_size)
 +      pr_rvecs(stdout,indent,"v",v,trn.natoms);
 +      if (trn.f_size)
 +      pr_rvecs(stdout,indent,"f",f,trn.natoms);
 +    } 
 +    else
 +      fprintf(stderr,"\nWARNING: Incomplete frame: nr %d, t=%g\n",
 +            nframe,trn.t);
 +    
 +    sfree(x);
 +    sfree(v);
 +    sfree(f);
 +    nframe++;
 +  }
 +  if (!bOK)
 +    fprintf(stderr,"\nWARNING: Incomplete frame header: nr %d, t=%g\n",
 +          nframe,trn.t);
 +  close_tpx(fpwrite);
 +  close_trn(fpread);
 +}
 +
 +void list_xtc(const char *fn, gmx_bool bXVG)
 +{
 +  t_fileio *xd;
 +  int    indent;
 +  char   buf[256];
 +  rvec   *x;
 +  matrix box;
 +  int    nframe,natoms,step;
 +  real   prec,time;
 +  gmx_bool   bOK;
 +  
 +  xd = open_xtc(fn,"r");
 +  read_first_xtc(xd,&natoms,&step,&time,box,&x,&prec,&bOK);
 +              
 +  nframe=0;
 +  do {
 +    if (bXVG) {
 +      int i,d;
 +      
 +      fprintf(stdout,"%g",time);
 +      for(i=0; i<natoms; i++)
 +      for(d=0; d<DIM; d++)
 +        fprintf(stdout," %g",x[i][d]);
 +      fprintf(stdout,"\n");
 +    } else {
 +      sprintf(buf,"%s frame %d",fn,nframe);
 +      indent=0;
 +      indent=pr_title(stdout,indent,buf);
 +      pr_indent(stdout,indent);
 +      fprintf(stdout,"natoms=%10d  step=%10d  time=%12.7e  prec=%10g\n",
 +          natoms,step,time,prec);
 +      pr_rvecs(stdout,indent,"box",box,DIM);
 +      pr_rvecs(stdout,indent,"x",x,natoms);
 +    }
 +    nframe++;
 +  } while (read_next_xtc(xd,natoms,&step,&time,box,x,&prec,&bOK));
 +  if (!bOK)
 +    fprintf(stderr,"\nWARNING: Incomplete frame at time %g\n",time);
 +  close_xtc(xd);
 +}
 +
 +void list_trx(const char *fn,gmx_bool bXVG)
 +{
 +  int ftp;
 +  
 +  ftp = fn2ftp(fn);
 +  if (ftp == efXTC)
 +    list_xtc(fn,bXVG);
 +  else if ((ftp == efTRR) || (ftp == efTRJ))
 +    list_trn(fn);
 +  else
 +    fprintf(stderr,"File %s is of an unsupported type. Try using the command\n 'less %s'\n",
 +          fn,fn);
 +}
 +
 +void list_ene(const char *fn)
 +{
 +    int        ndr;
 +    ener_file_t in;
 +    gmx_bool       bCont;
 +    gmx_enxnm_t *enm=NULL;
 +    t_enxframe *fr;
 +    int        i,j,nre,b;
 +    real       rav,minthird;
 +    char       buf[22];
 +
 +    printf("gmxdump: %s\n",fn);
 +    in = open_enx(fn,"r");
 +    do_enxnms(in,&nre,&enm);
 +    assert(enm);
 +
 +    printf("energy components:\n");
 +    for(i=0; (i<nre); i++) 
 +        printf("%5d  %-24s (%s)\n",i,enm[i].name,enm[i].unit);
 +
 +    minthird=-1.0/3.0;
 +    snew(fr,1);
 +    do {
 +        bCont=do_enx(in,fr);
 +
 +        if (bCont) 
 +        {
 +            printf("\n%24s  %12.5e  %12s  %12s\n","time:",
 +                   fr->t,"step:",gmx_step_str(fr->step,buf));
 +            printf("%24s  %12s  %12s  %12s\n",
 +                   "","","nsteps:",gmx_step_str(fr->nsteps,buf));
 +            printf("%24s  %12.5e  %12s  %12s\n",
 +                   "delta_t:",fr->dt,"sum steps:",gmx_step_str(fr->nsum,buf));
 +            if (fr->nre == nre) {
 +                printf("%24s  %12s  %12s  %12s\n",
 +                       "Component","Energy","Av. Energy","Sum Energy");
 +                if (fr->nsum > 0) {
 +                    for(i=0; (i<nre); i++) 
 +                        printf("%24s  %12.5e  %12.5e  %12.5e\n",
 +                               enm[i].name,fr->ener[i].e,fr->ener[i].eav,
 +                               fr->ener[i].esum);
 +                } else {
 +                    for(i=0; (i<nre); i++) 
 +                        printf("%24s  %12.5e\n",
 +                               enm[i].name,fr->ener[i].e);
 +                }
 +            }
 +            for(b=0; b<fr->nblock; b++)
 +            {
 +                const char *typestr="";
 +
 +                t_enxblock *eb=&(fr->block[b]);
 +                printf("Block data %2d (%3d subblocks, id=%d)\n",
 +                     b, eb->nsub, eb->id);
 +
 +                if (eb->id < enxNR)
 +                    typestr=enx_block_id_name[eb->id];
 +                printf("  id='%s'\n", typestr);
 +                for(i=0;i<eb->nsub;i++)
 +                {
 +                    t_enxsubblock *sb=&(eb->sub[i]);
 +                    printf("  Sub block %3d (%5d elems, type=%s) values:\n", 
 +                           i, sb->nr, xdr_datatype_names[sb->type]);
 +
 +                    switch(sb->type)
 +                    {
 +                        case xdr_datatype_float:
 +                            for(j=0;j<sb->nr;j++)
 +                                printf("%14d   %8.4f\n",j, sb->fval[j]);
 +                            break;
 +                        case xdr_datatype_double:
 +                            for(j=0;j<sb->nr;j++)
 +                                printf("%14d   %10.6f\n",j, sb->dval[j]);
 +                            break;
 +                        case xdr_datatype_int:
 +                            for(j=0;j<sb->nr;j++)
 +                                printf("%14d %10d\n",j, sb->ival[j]);
 +                            break;
 +                        case xdr_datatype_large_int:
 +                            for(j=0;j<sb->nr;j++)
 +                                printf("%14d %s\n",
 +                                     j, gmx_step_str(sb->lval[j],buf));
 +                            break;
 +                        case xdr_datatype_char:
 +                            for(j=0;j<sb->nr;j++)
 +                                printf("%14d %1c\n",j, sb->cval[j]);
 +                            break;
 +                        case xdr_datatype_string:
 +                            for(j=0;j<sb->nr;j++)
 +                                printf("%14d %80s\n",j, sb->sval[j]);
 +                            break;
 +                        default:
 +                            gmx_incons("Unknown subblock type");
 +                    }
 +                }
 +            }
 +        }
 +    } while (bCont);
 +
 +    close_enx(in);
 +
 +    free_enxframe(fr);
 +    sfree(fr);
 +    sfree(enm);
 +}
 +
 +static void list_mtx(const char *fn)
 +{
 +  int  nrow,ncol,i,j,k;
 +  real *full=NULL,value;
 +  gmx_sparsematrix_t * sparse=NULL;
 +
 +  gmx_mtxio_read(fn,&nrow,&ncol,&full,&sparse);
 +
 +  if (full == NULL) {
 +    snew(full,nrow*ncol);
 +    for(i=0;i<nrow*ncol;i++) {
 +      full[i] = 0;
 +    }
 +    
 +    for(i=0;i<sparse->nrow;i++) {
 +      for(j=0;j<sparse->ndata[i];j++) {
 +        k     = sparse->data[i][j].col;
 +        value = sparse->data[i][j].value;
 +        full[i*ncol+k] = value;
 +        full[k*ncol+i] = value;
 +      }
 +    }
 +    gmx_sparsematrix_destroy(sparse);
 +  }
 +
 +  printf("%d %d\n",nrow,ncol);
 +  for(i=0; i<nrow; i++) {
 +    for(j=0; j<ncol; j++) {
 +      printf(" %g",full[i*ncol+j]);
 +    }
 +    printf("\n");
 +  }
 +
 +  sfree(full);
 +}
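When the matrix arrives in sparse format, the expansion above assumes a square, symmetric matrix (nrow == ncol), which holds for the Hessians this tool reads: each stored entry (row i, column k, value v) is mirrored into both full[i*ncol+k] and full[k*ncol+i].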
 +
 +int cmain(int argc,char *argv[])
 +{
 +  const char *desc[] = {
 +    "[TT]gmxdump[tt] reads a run input file ([TT].tpa[tt]/[TT].tpr[tt]/[TT].tpb[tt]),",
 +    "a trajectory ([TT].trj[tt]/[TT].trr[tt]/[TT].xtc[tt]), an energy",
 +    "file ([TT].ene[tt]/[TT].edr[tt]), or a checkpoint file ([TT].cpt[tt])",
 +    "and prints that to standard output in a readable format.",
 +    "This program is essential for checking your run input file in case of",
 +    "problems.[PAR]",
 +    "The program can also preprocess a topology to help finding problems.",
 +    "Note that currently setting [TT]GMXLIB[tt] is the only way to customize",
 +    "directories used for searching include files.",
 +  };
++  const char *bugs[] = {
++    "Position restraint output from -sys -s is broken"
++  };
 +  t_filenm fnm[] = {
 +    { efTPX, "-s", NULL, ffOPTRD },
 +    { efTRX, "-f", NULL, ffOPTRD },
 +    { efEDR, "-e", NULL, ffOPTRD },
 +    { efCPT, NULL, NULL, ffOPTRD },
 +    { efTOP, "-p", NULL, ffOPTRD },
 +    { efMTX, "-mtx", "hessian", ffOPTRD }, 
 +    { efMDP, "-om", NULL, ffOPTWR }
 +  };
 +#define NFILE asize(fnm)
 +
 +  output_env_t oenv;
 +  /* Command line options */
 +  static gmx_bool bXVG=FALSE;
 +  static gmx_bool bShowNumbers=TRUE;
 +  static gmx_bool bSysTop=FALSE;
 +  t_pargs pa[] = {
 +    { "-xvg", FALSE, etBOOL, {&bXVG}, "HIDDENXVG layout for xtc" },
 +    { "-nr",FALSE, etBOOL, {&bShowNumbers},"Show index numbers in output (leaving them out makes comparison easier, but creates a useless topology)" },
 +    { "-sys", FALSE, etBOOL, {&bSysTop}, "List the atoms and bonded interactions for the whole system instead of for each molecule type" }
 +  };
 +  
 +  CopyRight(stderr,argv[0]);
 +  parse_common_args(&argc,argv,0,NFILE,fnm,asize(pa),pa,
++                    asize(desc),desc,asize(bugs),bugs,&oenv);
 +
 +
 +  if (ftp2bSet(efTPX,NFILE,fnm))
 +    list_tpx(ftp2fn(efTPX,NFILE,fnm),bShowNumbers,
 +           ftp2fn_null(efMDP,NFILE,fnm),bSysTop);
 +  else if (ftp2bSet(efTRX,NFILE,fnm)) 
 +    list_trx(ftp2fn(efTRX,NFILE,fnm),bXVG);
 +  else if (ftp2bSet(efEDR,NFILE,fnm))
 +    list_ene(ftp2fn(efEDR,NFILE,fnm));
 +  else if (ftp2bSet(efCPT,NFILE,fnm))
 +    list_checkpoint(ftp2fn(efCPT,NFILE,fnm),stdout);
 +  else if (ftp2bSet(efTOP,NFILE,fnm))
 +    list_top(ftp2fn(efTOP,NFILE,fnm));
 +  else if (ftp2bSet(efMTX,NFILE,fnm))
 +    list_mtx(ftp2fn(efMTX,NFILE,fnm));
 +    
 +  thanx(stderr);
 +
 +  return 0;
 +}
index d7d0eec6e1c4bd2d06ee91f79d6e67dc22165541,0000000000000000000000000000000000000000..93d385cf715e346a64ddd7f4b147a10b3c556a7b
mode 100644,000000..100644
--- /dev/null
@@@ -1,53 -1,0 +1,44 @@@
-     md.c          md_openmm.c   mdrun.c     membed.c
 +include_directories(${CMAKE_SOURCE_DIR}/src/gromacs/gmxpreprocess)
 +
 +set(MDRUN_SOURCES
 +    do_gct.c      gctio.c       genalg.c    ionize.c
- if(GMX_OPENMM) 
-     include_directories(./gmx_gpu_utils ${OpenMM_INCLUDE_DIR})
-     link_directories(${OpenMM_LIBRARY_DIR}) 
-     # with this define no evn.var. is needed with OPENMM_PLUGIN_DIR
-     # if the same OpenMM installation is used for running and building 
-     add_definitions( -DOPENMM_PLUGIN_DIR="${OpenMM_PLUGIN_DIR}" ) 
-     file(TO_CMAKE_PATH ${OpenMM_PLUGIN_DIR} _path)
-     add_library(openmm_api_wrapper STATIC openmm_wrapper.cpp)
-     target_link_libraries(openmm_api_wrapper gmx_gpu_utils ${OpenMM_LIBRARIES})
-     set(GMX_OPENMM_LIBRARIES openmm_api_wrapper gmx_gpu_utils ${OpenMM_LIBRARIES})   
++    md.c          mdrun.c     membed.c
 +    pme_loadbal.c repl_ex.c     runner.c    xutils.c
 +    ../main.cpp)
 +
-     target_link_libraries(mdrun ${GMX_EXTRA_LIBRARIES} libgromacs ${GMX_OPENMM_LIBRARIES}
++if(GMX_OPENMM)
++    # Even though the OpenMM build has "moved to contrib", many things
++    # have be be done from within the scope of the CMakeLists.txt that
++    # builds its mdrun, and that is here
++    include(../contrib/BuildMdrunOpenMM)
 +endif(GMX_OPENMM)
 +
 +if(GMX_FAHCORE)
 +    add_library(fahcore ${MDRUN_SOURCES})
 +else(GMX_FAHCORE)
 +    add_executable(mdrun ${MDRUN_SOURCES})
 +    gmx_add_man_page(mdrun)
-     if(GMX_OPENMM AND MSVC)
-         set_target_properties(mdrun PROPERTIES LINK_FLAGS "/NODEFAULTLIB:LIBCMT")
-     endif()
++    target_link_libraries(mdrun ${GMX_EXTRA_LIBRARIES} libgromacs
 +        ${GMX_EXE_LINKER_FLAGS})
 +    set_target_properties(mdrun PROPERTIES OUTPUT_NAME "mdrun${GMX_BINARY_SUFFIX}"
 +        COMPILE_FLAGS "${OpenMP_C_FLAGS}")
 +    install(TARGETS mdrun DESTINATION ${BIN_INSTALL_DIR} COMPONENT mdrun)
 +
 +    # Create the custom install-mdrun target
 +    if (BUILD_SHARED_LIBS)
 +        # If shared libraries are used, we need to install the libraries in
 +        # addition to the mdrun binary.
 +        add_custom_target(install-mdrun
 +            COMMAND ${CMAKE_COMMAND} -DCOMPONENT=libraries
 +                    -P ${CMAKE_BINARY_DIR}/cmake_install.cmake
 +            COMMAND ${CMAKE_COMMAND} -DCOMPONENT=mdrun
 +                    -P ${CMAKE_BINARY_DIR}/cmake_install.cmake
 +            COMMENT "Installing mdrun")
 +    else (BUILD_SHARED_LIBS)
 +        add_custom_target(install-mdrun
 +            COMMAND ${CMAKE_COMMAND} -DCOMPONENT=mdrun
 +                    -P ${CMAKE_BINARY_DIR}/cmake_install.cmake
 +            COMMENT "Installing mdrun")
 +    endif (BUILD_SHARED_LIBS)
 +    add_dependencies(install-mdrun mdrun)
 +endif(GMX_FAHCORE)
index da949ade755c10c105406c0d5d0a9a81a2fb5a99,0000000000000000000000000000000000000000..cf1dd35ec32dd75e33cda2ab0cc0aa50664929af
mode 100644,000000..100644
--- /dev/null
@@@ -1,2153 -1,0 +1,2160 @@@
-     gmx_bool        bVV,bIterations,bFirstIterate,bTemp,bPres,bTrotter;
 +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
 + *
 + * 
 + *                This source code is part of
 + * 
 + *                 G   R   O   M   A   C   S
 + * 
 + *          GROningen MAchine for Chemical Simulations
 + * 
 + *                        VERSION 3.2.0
 + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
 + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 + * Copyright (c) 2001-2004, The GROMACS development team,
 + * check out http://www.gromacs.org for more information.
 +
 + * This program is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU General Public License
 + * as published by the Free Software Foundation; either version 2
 + * of the License, or (at your option) any later version.
 + * 
 + * If you want to redistribute modifications, please consider that
 + * scientific software is very special. Version control is crucial -
 + * bugs must be traceable. We will be happy to consider code for
 + * inclusion in the official distribution, but derived work must not
 + * be called official GROMACS. Details are found in the README & COPYING
 + * files - if they are missing, get the official version at www.gromacs.org.
 + * 
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the papers on the package - you can find them in the top README file.
 + * 
 + * For more info, check our website at http://www.gromacs.org
 + * 
 + * And Hey:
 + * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon
 + */
 +#ifdef HAVE_CONFIG_H
 +#include <config.h>
 +#endif
 +
 +#include "typedefs.h"
 +#include "smalloc.h"
 +#include "sysstuff.h"
 +#include "vec.h"
 +#include "statutil.h"
 +#include "vcm.h"
 +#include "mdebin.h"
 +#include "nrnb.h"
 +#include "calcmu.h"
 +#include "index.h"
 +#include "vsite.h"
 +#include "update.h"
 +#include "ns.h"
 +#include "trnio.h"
 +#include "xtcio.h"
 +#include "mdrun.h"
 +#include "md_support.h"
 +#include "md_logging.h"
 +#include "confio.h"
 +#include "network.h"
 +#include "pull.h"
 +#include "xvgr.h"
 +#include "physics.h"
 +#include "names.h"
 +#include "xmdrun.h"
 +#include "ionize.h"
 +#include "disre.h"
 +#include "orires.h"
 +#include "pme.h"
 +#include "mdatoms.h"
 +#include "repl_ex.h"
 +#include "qmmm.h"
 +#include "domdec.h"
 +#include "domdec_network.h"
 +#include "partdec.h"
 +#include "topsort.h"
 +#include "coulomb.h"
 +#include "constr.h"
 +#include "shellfc.h"
 +#include "compute_io.h"
 +#include "mvdata.h"
 +#include "checkpoint.h"
 +#include "mtop_util.h"
 +#include "sighandler.h"
 +#include "txtdump.h"
 +#include "string2.h"
 +#include "pme_loadbal.h"
 +#include "bondf.h"
 +#include "membed.h"
 +#include "types/nlistheuristics.h"
 +#include "types/iteratedconstraints.h"
 +#include "nbnxn_cuda_data_mgmt.h"
 +
 +#ifdef GMX_LIB_MPI
 +#include <mpi.h>
 +#endif
 +#ifdef GMX_THREAD_MPI
 +#include "tmpi.h"
 +#endif
 +
 +#ifdef GMX_FAHCORE
 +#include "corewrap.h"
 +#endif
 +
 +static void reset_all_counters(FILE *fplog,t_commrec *cr,
 +                               gmx_large_int_t step,
 +                               gmx_large_int_t *step_rel,t_inputrec *ir,
 +                               gmx_wallcycle_t wcycle,t_nrnb *nrnb,
 +                               gmx_runtime_t *runtime,
 +                               nbnxn_cuda_ptr_t cu_nbv)
 +{
 +    char sbuf[STEPSTRSIZE];
 +
 +    /* Reset all the counters related to performance over the run */
 +    md_print_warn(cr,fplog,"step %s: resetting all time and cycle counters\n",
 +                  gmx_step_str(step,sbuf));
 +
 +    if (cu_nbv)
 +    {
 +        nbnxn_cuda_reset_timings(cu_nbv);
 +    }
 +
 +    wallcycle_stop(wcycle,ewcRUN);
 +    wallcycle_reset_all(wcycle);
 +    if (DOMAINDECOMP(cr))
 +    {
 +        reset_dd_statistics_counters(cr->dd);
 +    }
 +    init_nrnb(nrnb);
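 +    /* Shift the step accounting so that the performance statistics
 +       cover only the steps remaining after this reset */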
 +    ir->init_step += *step_rel;
 +    ir->nsteps    -= *step_rel;
 +    *step_rel = 0;
 +    wallcycle_start(wcycle,ewcRUN);
 +    runtime_start(runtime);
 +    print_date_and_time(fplog,cr->nodeid,"Restarted time",runtime);
 +}
 +
 +double do_md(FILE *fplog,t_commrec *cr,int nfile,const t_filenm fnm[],
 +             const output_env_t oenv, gmx_bool bVerbose,gmx_bool bCompact,
 +             int nstglobalcomm,
 +             gmx_vsite_t *vsite,gmx_constr_t constr,
 +             int stepout,t_inputrec *ir,
 +             gmx_mtop_t *top_global,
 +             t_fcdata *fcd,
 +             t_state *state_global,
 +             t_mdatoms *mdatoms,
 +             t_nrnb *nrnb,gmx_wallcycle_t wcycle,
 +             gmx_edsam_t ed,t_forcerec *fr,
 +             int repl_ex_nst,int repl_ex_nex,int repl_ex_seed,gmx_membed_t membed,
 +             real cpt_period,real max_hours,
 +             const char *deviceOptions,
 +             unsigned long Flags,
 +             gmx_runtime_t *runtime)
 +{
 +    gmx_mdoutf_t *outf;
 +    gmx_large_int_t step,step_rel;
 +    double     run_time;
 +    double     t,t0,lam0[efptNR];
 +    gmx_bool       bGStatEveryStep,bGStat,bCalcVir,bCalcEner;
 +    gmx_bool       bNS,bNStList,bSimAnn,bStopCM,bRerunMD,bNotLastFrame=FALSE,
 +               bFirstStep,bStateFromCP,bStateFromTPX,bInitStep,bLastStep,
 +               bBornRadii,bStartingFromCpt;
 +    gmx_bool   bDoDHDL=FALSE,bDoFEP=FALSE,bDoExpanded=FALSE;
 +    gmx_bool       do_ene,do_log,do_verbose,bRerunWarnNoV=TRUE,
 +               bForceUpdate=FALSE,bCPT;
 +    int        mdof_flags;
 +    gmx_bool       bMasterState;
 +    int        force_flags,cglo_flags;
 +    tensor     force_vir,shake_vir,total_vir,tmp_vir,pres;
 +    int        i,m;
 +    t_trxstatus *status;
 +    rvec       mu_tot;
 +    t_vcm      *vcm;
 +    t_state    *bufstate=NULL;   
 +    matrix     *scale_tot,pcoupl_mu,M,ebox;
 +    gmx_nlheur_t nlh;
 +    t_trxframe rerun_fr;
 +    gmx_repl_ex_t repl_ex=NULL;
 +    int        nchkpt=1;
 +    gmx_localtop_t *top;      
 +    t_mdebin *mdebin=NULL;
 +    df_history_t df_history;
 +    t_state    *state=NULL;
 +    rvec       *f_global=NULL;
 +    int        n_xtc=-1;
 +    rvec       *x_xtc=NULL;
 +    gmx_enerdata_t *enerd;
 +    rvec       *f=NULL;
 +    gmx_global_stat_t gstat;
 +    gmx_update_t upd=NULL;
 +    t_graph    *graph=NULL;
 +    globsig_t   gs;
 +    gmx_rng_t mcrng=NULL;
 +    gmx_bool        bFFscan;
 +    gmx_groups_t *groups;
 +    gmx_ekindata_t *ekind, *ekind_save;
 +    gmx_shellfc_t shellfc;
 +    int         count,nconverged=0;
 +    real        timestep=0;
 +    double      tcount=0;
 +    gmx_bool        bIonize=FALSE;
 +    gmx_bool        bTCR=FALSE,bConverged=TRUE,bOK,bSumEkinhOld,bExchanged;
 +    gmx_bool        bAppend;
 +    gmx_bool        bResetCountersHalfMaxH=FALSE;
-     int         fep_state=0;
++    gmx_bool        bVV,bIterativeCase,bFirstIterate,bTemp,bPres,bTrotter;
 +    gmx_bool        bUpdateDoLR;
 +    real        mu_aver=0,dvdl;
 +    int         a0,a1,gnx=0,ii;
 +    atom_id     *grpindex=NULL;
 +    char        *grpname;
 +    t_coupl_rec *tcr=NULL;
 +    rvec        *xcopy=NULL,*vcopy=NULL,*cbuf=NULL;
 +    matrix      boxcopy={{0}},lastbox;
 +    tensor      tmpvir;
 +    real        fom,oldfom,veta_save,pcurr,scalevir,tracevir;
 +    real        vetanew = 0;
 +    int         lamnew=0;
 +    /* for FEP */
-     bIterations = ((IR_NPH_TROTTER(ir) || IR_NPT_TROTTER(ir)) && (constr) && (!bRerunMD));
 +    int         nstfep;
 +    real        rate;
 +    double      cycles;
 +    real        saved_conserved_quantity = 0;
 +    real        last_ekin = 0;
 +    int         iter_i;
 +    t_extmass   MassQ;
 +    int         **trotter_seq; 
 +    char        sbuf[STEPSTRSIZE],sbuf2[STEPSTRSIZE];
 +    int         handled_stop_condition=gmx_stop_cond_none; /* compare to get_stop_condition*/
 +    gmx_iterate_t iterate;
 +    gmx_large_int_t multisim_nsteps=-1; /* number of steps to do before the first multisim
 +                                          simulation stops. If equal to zero, don't
 +                                          communicate any more between multisims. */
 +    /* PME load balancing data for GPU kernels */
 +    pme_load_balancing_t pme_loadbal=NULL;
 +    double          cycles_pmes;
 +    gmx_bool        bPMETuneTry=FALSE,bPMETuneRunning=FALSE;
 +
 +#ifdef GMX_FAHCORE
 +    /* Temporary addition for FAHCORE checkpointing */
 +    int chkpt_ret;
 +#endif
 +    
 +    /* Check for special mdrun options */
 +    bRerunMD = (Flags & MD_RERUN);
 +    bIonize  = (Flags & MD_IONIZE);
 +    bFFscan  = (Flags & MD_FFSCAN);
 +    bAppend  = (Flags & MD_APPENDFILES);
 +    if (Flags & MD_RESETCOUNTERSHALFWAY)
 +    {
 +        if (ir->nsteps > 0)
 +        {
 +            /* Signal to reset the counters half the simulation steps. */
 +            wcycle_set_reset_counters(wcycle,ir->nsteps/2);
 +        }
 +        /* Signal to reset the counters halfway the simulation time. */
 +        bResetCountersHalfMaxH = (max_hours > 0);
 +    }
 +
 +    /* md-vv uses averaged full step velocities for T-control 
 +       md-vv-avek uses averaged half step velocities for T-control (but full step ekin for P control)
 +       md uses averaged half step kinetic energies to determine temperature unless defined otherwise by GMX_EKIN_AVE_VEL; */
 +    bVV = EI_VV(ir->eI);
 +    if (bVV) /* to store the initial velocities while computing virial */
 +    {
 +        snew(cbuf,top_global->natoms);
 +    }
 +    /* all the iterative cases - only if there are constraints */
-     if (bIterations) 
++    bIterativeCase = ((IR_NPH_TROTTER(ir) || IR_NPT_TROTTER(ir)) && (constr) && (!bRerunMD));
++    gmx_iterate_init(&iterate,FALSE); /* The default value of iterate->bIterationActive is set to
++                                         false in this step.  The correct value, true or false,
++                                         is set at each step, as it depends on the frequency of temperature
++                                         and pressure control.*/
 +    bTrotter = (bVV && (IR_NPT_TROTTER(ir) || IR_NPH_TROTTER(ir) || IR_NVT_TROTTER(ir)));
 +    
 +    if (bRerunMD)
 +    {
 +        /* Since we don't know if the frames read are related in any way,
 +         * rebuild the neighborlist at every step.
 +         */
 +        ir->nstlist       = 1;
 +        ir->nstcalcenergy = 1;
 +        nstglobalcomm     = 1;
 +    }
 +
 +    check_ir_old_tpx_versions(cr,fplog,ir,top_global);
 +
 +    nstglobalcomm = check_nstglobalcomm(fplog,cr,nstglobalcomm,ir);
 +    bGStatEveryStep = (nstglobalcomm == 1);
 +
 +    if (!bGStatEveryStep && ir->nstlist == -1 && fplog != NULL)
 +    {
 +        fprintf(fplog,
 +                "To reduce the energy communication with nstlist = -1\n"
 +                "the neighbor list validity should not be checked at every step,\n"
 +                "this means that exact integration is not guaranteed.\n"
 +                "The neighbor list validity is checked after:\n"
 +                "  <n.list life time> - 2*std.dev.(n.list life time)  steps.\n"
 +                "In most cases this will result in exact integration.\n"
 +                "This reduces the energy communication by a factor of 2 to 3.\n"
 +                "If you want less energy communication, set nstlist > 3.\n\n");
 +    }
 +
 +    if (bRerunMD || bFFscan)
 +    {
 +        ir->nstxtcout = 0;
 +    }
 +    groups = &top_global->groups;
 +
 +    /* Initial values */
 +    init_md(fplog,cr,ir,oenv,&t,&t0,state_global->lambda,
 +            &(state_global->fep_state),lam0,
 +            nrnb,top_global,&upd,
 +            nfile,fnm,&outf,&mdebin,
 +            force_vir,shake_vir,mu_tot,&bSimAnn,&vcm,state_global,Flags);
 +
 +    clear_mat(total_vir);
 +    clear_mat(pres);
 +    /* Energy terms and groups */
 +    snew(enerd,1);
 +    init_enerdata(top_global->groups.grps[egcENER].nr,ir->fepvals->n_lambda,
 +                  enerd);
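 +    /* With domain decomposition the force array is allocated (and resized)
 +       by dd_partition_system(), so it is only allocated here otherwise */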
 +    if (DOMAINDECOMP(cr))
 +    {
 +        f = NULL;
 +    }
 +    else
 +    {
 +        snew(f,top_global->natoms);
 +    }
 +
 +    /* lambda Monte carlo random number generator  */
 +    if (ir->bExpanded)
 +    {
 +        mcrng = gmx_rng_init(ir->expandedvals->lmc_seed);
 +    }
 +    /* copy the state into df_history */
 +    copy_df_history(&df_history,&state_global->dfhist);
 +
 +    /* Kinetic energy data */
 +    snew(ekind,1);
 +    init_ekindata(fplog,top_global,&(ir->opts),ekind);
 +    /* needed for iteration of constraints */
 +    snew(ekind_save,1);
 +    init_ekindata(fplog,top_global,&(ir->opts),ekind_save);
 +    /* Copy the cos acceleration to the groups struct */    
 +    ekind->cosacc.cos_accel = ir->cos_accel;
 +
 +    gstat = global_stat_init(ir);
 +    debug_gmx();
 +
 +    /* Check for polarizable models and flexible constraints */
 +    shellfc = init_shell_flexcon(fplog,
 +                                 top_global,n_flexible_constraints(constr),
 +                                 (ir->bContinuation || 
 +                                  (DOMAINDECOMP(cr) && !MASTER(cr))) ?
 +                                 NULL : state_global->x);
 +
 +    if (DEFORM(*ir))
 +    {
 +#ifdef GMX_THREAD_MPI
 +        tMPI_Thread_mutex_lock(&deform_init_box_mutex);
 +#endif
 +        set_deform_reference_box(upd,
 +                                 deform_init_init_step_tpx,
 +                                 deform_init_box_tpx);
 +#ifdef GMX_THREAD_MPI
 +        tMPI_Thread_mutex_unlock(&deform_init_box_mutex);
 +#endif
 +    }
 +
 +    {
 +        double io = compute_io(ir,top_global->natoms,groups,mdebin->ebin->nener,1);
 +        if ((io > 2000) && MASTER(cr))
 +            fprintf(stderr,
 +                    "\nWARNING: This run will generate roughly %.0f Mb of data\n\n",
 +                    io);
 +    }
 +
 +    if (DOMAINDECOMP(cr)) {
 +        top = dd_init_local_top(top_global);
 +
 +        snew(state,1);
 +        dd_init_local_state(cr->dd,state_global,state);
 +
 +        if (DDMASTER(cr->dd) && ir->nstfout) {
 +            snew(f_global,state_global->natoms);
 +        }
 +    } else {
 +        if (PAR(cr)) {
 +            /* Initialize the particle decomposition and split the topology */
 +            top = split_system(fplog,top_global,ir,cr);
 +
 +            pd_cg_range(cr,&fr->cg0,&fr->hcg);
 +            pd_at_range(cr,&a0,&a1);
 +        } else {
 +            top = gmx_mtop_generate_local_top(top_global,ir);
 +
 +            a0 = 0;
 +            a1 = top_global->natoms;
 +        }
 +
 +        forcerec_set_excl_load(fr,top,cr);
 +
 +        state = partdec_init_local_state(cr,state_global);
 +        f_global = f;
 +
 +        atoms2md(top_global,ir,0,NULL,a0,a1-a0,mdatoms);
 +
 +        if (vsite) {
 +            set_vsite_top(vsite,top,mdatoms,cr);
 +        }
 +
 +        if (ir->ePBC != epbcNONE && !fr->bMolPBC) {
 +            graph = mk_graph(fplog,&(top->idef),0,top_global->natoms,FALSE,FALSE);
 +        }
 +
 +        if (shellfc) {
 +            make_local_shells(cr,mdatoms,shellfc);
 +        }
 +
 +        init_bonded_thread_force_reduction(fr,&top->idef);
 +
 +        if (ir->pull && PAR(cr)) {
 +            dd_make_local_pull_groups(NULL,ir->pull,mdatoms);
 +        }
 +    }
 +
 +    if (DOMAINDECOMP(cr))
 +    {
 +        /* Distribute the charge groups over the nodes from the master node */
 +        dd_partition_system(fplog,ir->init_step,cr,TRUE,1,
 +                            state_global,top_global,ir,
 +                            state,&f,mdatoms,top,fr,
 +                            vsite,shellfc,constr,
 +                            nrnb,wcycle,FALSE);
 +
 +    }
 +
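 +    /* Make the atom masses and derived data consistent with the current
 +       value of the mass-perturbation lambda */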
 +    update_mdatoms(mdatoms,state->lambda[efptMASS]);
 +
 +    if (opt2bSet("-cpi",nfile,fnm))
 +    {
 +        bStateFromCP = gmx_fexist_master(opt2fn_master("-cpi",nfile,fnm,cr),cr);
 +    }
 +    else
 +    {
 +        bStateFromCP = FALSE;
 +    }
 +
 +    if (MASTER(cr))
 +    {
 +        if (bStateFromCP)
 +        {
 +            /* Update mdebin with energy history if appending to output files */
 +            if ( Flags & MD_APPENDFILES )
 +            {
 +                restore_energyhistory_from_state(mdebin,&state_global->enerhist);
 +            }
 +            else
 +            {
 +                /* We might have read an energy history from checkpoint,
 +                 * free the allocated memory and reset the counts.
 +                 */
 +                done_energyhistory(&state_global->enerhist);
 +                init_energyhistory(&state_global->enerhist);
 +            }
 +        }
 +        /* Set the initial energy history in state by updating once */
 +        update_energyhistory(&state_global->enerhist,mdebin);
 +    } 
 +
 +    if ((state->flags & (1<<estLD_RNG)) && (Flags & MD_READ_RNG)) 
 +    {
 +        /* Set the random state if we read a checkpoint file */
 +        set_stochd_state(upd,state);
 +    }
 +
 +    if (state->flags & (1<<estMC_RNG))
 +    {
 +        set_mc_state(mcrng,state);
 +    }
 +
 +    /* Initialize constraints */
 +    if (constr) {
 +        if (!DOMAINDECOMP(cr))
 +            set_constraints(constr,top,ir,mdatoms,cr);
 +    }
 +
 +    /* Check whether we have to GCT stuff */
 +    bTCR = ftp2bSet(efGCT,nfile,fnm);
 +    if (bTCR) {
 +        if (MASTER(cr)) {
 +            fprintf(stderr,"Will do General Coupling Theory!\n");
 +        }
 +        gnx = top_global->mols.nr;
 +        snew(grpindex,gnx);
 +        for(i=0; (i<gnx); i++) {
 +            grpindex[i] = i;
 +        }
 +    }
 +
 +    if (repl_ex_nst > 0)
 +    {
 +        /* We need to be sure replica exchange can only occur
 +         * when the energies are current */
 +        check_nst_param(fplog,cr,"nstcalcenergy",ir->nstcalcenergy,
 +                        "repl_ex_nst",&repl_ex_nst);
 +        /* This check needs to happen before inter-simulation
 +         * signals are initialized, too */
 +    }
 +    if (repl_ex_nst > 0 && MASTER(cr))
 +    {
 +        repl_ex = init_replica_exchange(fplog,cr->ms,state_global,ir,
 +                                        repl_ex_nst,repl_ex_nex,repl_ex_seed);
 +    }
 +
 +    /* PME tuning is only supported with GPUs or PME nodes and not with rerun */
 +    if ((Flags & MD_TUNEPME) &&
 +        EEL_PME(fr->eeltype) &&
 +        ( (fr->cutoff_scheme == ecutsVERLET && fr->nbv->bUseGPU) || !(cr->duty & DUTY_PME)) &&
 +        !bRerunMD)
 +    {
 +        pme_loadbal_init(&pme_loadbal,ir,state->box,fr->ic,fr->pmedata);
 +        cycles_pmes = 0;
 +        if (cr->duty & DUTY_PME)
 +        {
 +            /* Start tuning right away, as we can't measure the load */
 +            bPMETuneRunning = TRUE;
 +        }
 +        else
 +        {
 +            /* Separate PME nodes, we can measure the PP/PME load balance */
 +            bPMETuneTry = TRUE;
 +        }
 +    }
 +
 +    if (!ir->bContinuation && !bRerunMD)
 +    {
 +        if (mdatoms->cFREEZE && (state->flags & (1<<estV)))
 +        {
 +            /* Set the velocities of frozen particles to zero */
 +            for(i=mdatoms->start; i<mdatoms->start+mdatoms->homenr; i++)
 +            {
 +                for(m=0; m<DIM; m++)
 +                {
 +                    if (ir->opts.nFreeze[mdatoms->cFREEZE[i]][m])
 +                    {
 +                        state->v[i][m] = 0;
 +                    }
 +                }
 +            }
 +        }
 +
 +        if (constr)
 +        {
 +            /* Constrain the initial coordinates and velocities */
 +            do_constrain_first(fplog,constr,ir,mdatoms,state,f,
 +                               graph,cr,nrnb,fr,top,shake_vir);
 +        }
 +        if (vsite)
 +        {
 +            /* Construct the virtual sites for the initial configuration */
 +            construct_vsites(fplog,vsite,state->x,nrnb,ir->delta_t,NULL,
 +                             top->idef.iparams,top->idef.il,
 +                             fr->ePBC,fr->bMolPBC,graph,cr,state->box);
 +        }
 +    }
 +
 +    debug_gmx();
 +  
 +    /* set free energy calculation frequency as the minimum of nstdhdl, nstexpanded, and repl_ex_nst */
 +    nstfep = ir->fepvals->nstdhdl;
 +    if (ir->bExpanded && (nstfep > ir->expandedvals->nstexpanded))
 +    {
 +        nstfep = ir->expandedvals->nstexpanded;
 +    }
 +    if (repl_ex_nst > 0 && nstfep > repl_ex_nst)
 +    {
 +        nstfep = repl_ex_nst;
 +    }
 +
 +    /* I'm assuming we need global communication the first time! MRS */
 +    cglo_flags = (CGLO_TEMPERATURE | CGLO_GSTAT
 +                  | ((ir->comm_mode != ecmNO) ? CGLO_STOPCM:0)
 +                  | (bVV ? CGLO_PRESSURE:0)
 +                  | (bVV ? CGLO_CONSTRAINT:0)
 +                  | (bRerunMD ? CGLO_RERUNMD:0)
 +                  | ((Flags & MD_READ_EKIN) ? CGLO_READEKIN:0));
 +    
 +    bSumEkinhOld = FALSE;
 +    compute_globals(fplog,gstat,cr,ir,fr,ekind,state,state_global,mdatoms,nrnb,vcm,
 +                    NULL,enerd,force_vir,shake_vir,total_vir,pres,mu_tot,
 +                    constr,NULL,FALSE,state->box,
 +                    top_global,&pcurr,top_global->natoms,&bSumEkinhOld,cglo_flags);
 +    if (ir->eI == eiVVAK) {
 +        /* a second call to get the half step temperature initialized as well */ 
 +        /* we do the same call as above, but turn the pressure off -- internally to 
 +           compute_globals, this is recognized as a velocity verlet half-step 
 +           kinetic energy calculation.  This minimizes excess variables, but 
 +           perhaps loses some logic?*/
 +        
 +        compute_globals(fplog,gstat,cr,ir,fr,ekind,state,state_global,mdatoms,nrnb,vcm,
 +                        NULL,enerd,force_vir,shake_vir,total_vir,pres,mu_tot,
 +                        constr,NULL,FALSE,state->box,
 +                        top_global,&pcurr,top_global->natoms,&bSumEkinhOld,
 +                        cglo_flags &~ (CGLO_STOPCM | CGLO_PRESSURE));
 +    }
 +    
 +    /* Calculate the initial half step temperature, and save the ekinh_old */
 +    if (!(Flags & MD_STARTFROMCPT)) 
 +    {
 +        for(i=0; (i<ir->opts.ngtc); i++) 
 +        {
 +            copy_mat(ekind->tcstat[i].ekinh,ekind->tcstat[i].ekinh_old);
 +        } 
 +    }
 +    if (ir->eI != eiVV) 
 +    {
 +        enerd->term[F_TEMP] *= 2; /* result of averages being done over previous and current step,
 +                                     and there is no previous step */
 +    }
 +    
 +    /* if using an iterative algorithm, we need to create a working copy of the state. */
-             /* for vv, the first half actually corresponds to the last step */
++    if (bIterativeCase)
 +    {
 +        bufstate = init_bufstate(state);
 +    }
 +    if (bFFscan) 
 +    {
 +        snew(xcopy,state->natoms);
 +        snew(vcopy,state->natoms);
 +        copy_rvecn(state->x,xcopy,0,state->natoms);
 +        copy_rvecn(state->v,vcopy,0,state->natoms);
 +        copy_mat(state->box,boxcopy);
 +    } 
 +    
 +    /* need to make an initialization call to get the Trotter variables set, as well as other constants for non-trotter
 +       temperature control */
 +    trotter_seq = init_npt_vars(ir,state,&MassQ,bTrotter);
 +    
 +    if (MASTER(cr))
 +    {
 +        if (constr && !ir->bContinuation && ir->eConstrAlg == econtLINCS)
 +        {
 +            fprintf(fplog,
 +                    "RMS relative constraint deviation after constraining: %.2e\n",
 +                    constr_rmsd(constr,FALSE));
 +        }
 +        if (EI_STATE_VELOCITY(ir->eI))
 +        {
 +            fprintf(fplog,"Initial temperature: %g K\n",enerd->term[F_TEMP]);
 +        }
 +        if (bRerunMD)
 +        {
 +            fprintf(stderr,"starting md rerun '%s', reading coordinates from"
 +                    " input trajectory '%s'\n\n",
 +                    *(top_global->name),opt2fn("-rerun",nfile,fnm));
 +            if (bVerbose)
 +            {
 +                fprintf(stderr,"Calculated time to finish depends on nsteps from "
 +                        "run input file,\nwhich may not correspond to the time "
 +                        "needed to process input trajectory.\n\n");
 +            }
 +        }
 +        else
 +        {
 +            char tbuf[20];
 +            fprintf(stderr,"starting mdrun '%s'\n",
 +                    *(top_global->name));
 +            if (ir->nsteps >= 0)
 +            {
 +                sprintf(tbuf,"%8.1f",(ir->init_step+ir->nsteps)*ir->delta_t);
 +            }
 +            else
 +            {
 +                sprintf(tbuf,"%s","infinite");
 +            }
 +            if (ir->init_step > 0)
 +            {
 +                fprintf(stderr,"%s steps, %s ps (continuing from step %s, %8.1f ps).\n",
 +                        gmx_step_str(ir->init_step+ir->nsteps,sbuf),tbuf,
 +                        gmx_step_str(ir->init_step,sbuf2),
 +                        ir->init_step*ir->delta_t);
 +            }
 +            else
 +            {
 +                fprintf(stderr,"%s steps, %s ps.\n",
 +                        gmx_step_str(ir->nsteps,sbuf),tbuf);
 +            }
 +        }
 +        fprintf(fplog,"\n");
 +    }
 +
 +    /* Set and write start time */
 +    runtime_start(runtime);
 +    print_date_and_time(fplog,cr->nodeid,"Started mdrun",runtime);
 +    wallcycle_start(wcycle,ewcRUN);
 +    if (fplog)
 +    {
 +        fprintf(fplog,"\n");
 +    }
 +
 +    /* safest point to do file checkpointing is here.  More general point would be immediately before integrator call */
 +#ifdef GMX_FAHCORE
 +    chkpt_ret=fcCheckPointParallel( cr->nodeid,
 +                                    NULL,0);
 +    if ( chkpt_ret == 0 ) 
 +        gmx_fatal( 3,__FILE__,__LINE__, "Checkpoint error on step %d\n", 0 );
 +#endif
 +
 +    debug_gmx();
 +    /***********************************************************
 +     *
 +     *             Loop over MD steps 
 +     *
 +     ************************************************************/
 +
 +    /* if rerunMD then read coordinates and velocities from input trajectory */
 +    if (bRerunMD)
 +    {
 +        if (getenv("GMX_FORCE_UPDATE"))
 +        {
 +            bForceUpdate = TRUE;
 +        }
 +
 +        rerun_fr.natoms = 0;
 +        if (MASTER(cr))
 +        {
 +            bNotLastFrame = read_first_frame(oenv,&status,
 +                                             opt2fn("-rerun",nfile,fnm),
 +                                             &rerun_fr,TRX_NEED_X | TRX_READ_V);
 +            if (rerun_fr.natoms != top_global->natoms)
 +            {
 +                gmx_fatal(FARGS,
 +                          "Number of atoms in trajectory (%d) does not match the "
 +                          "run input file (%d)\n",
 +                          rerun_fr.natoms,top_global->natoms);
 +            }
 +            if (ir->ePBC != epbcNONE)
 +            {
 +                if (!rerun_fr.bBox)
 +                {
 +                    gmx_fatal(FARGS,"Rerun trajectory frame step %d time %f does not contain a box, while pbc is used",rerun_fr.step,rerun_fr.time);
 +                }
 +                if (max_cutoff2(ir->ePBC,rerun_fr.box) < sqr(fr->rlistlong))
 +                {
 +                    gmx_fatal(FARGS,"Rerun trajectory frame step %d time %f has too small box dimensions",rerun_fr.step,rerun_fr.time);
 +                }
 +            }
 +        }
 +
 +        if (PAR(cr))
 +        {
 +            rerun_parallel_comm(cr,&rerun_fr,&bNotLastFrame);
 +        }
 +
 +        if (ir->ePBC != epbcNONE)
 +        {
 +            /* Set the shift vectors.
 +             * Necessary here when have a static box different from the tpr box.
 +             */
 +            calc_shifts(rerun_fr.box,fr->shift_vec);
 +        }
 +    }
 +
 +    /* loop over MD steps or if rerunMD to end of input trajectory */
 +    bFirstStep = TRUE;
 +    /* Skip the first Nose-Hoover integration when we get the state from tpx */
 +    bStateFromTPX = !bStateFromCP;
 +    bInitStep = bFirstStep && (bStateFromTPX || bVV);
 +    bStartingFromCpt = (Flags & MD_STARTFROMCPT) && bInitStep;
 +    bLastStep    = FALSE;
 +    bSumEkinhOld = FALSE;
 +    bExchanged   = FALSE;
 +
 +    init_global_signals(&gs,cr,ir,repl_ex_nst);
 +
 +    step = ir->init_step;
 +    step_rel = 0;
 +
 +    if (ir->nstlist == -1)
 +    {
 +        init_nlistheuristics(&nlh,bGStatEveryStep,step);
 +    }
 +
 +    if (MULTISIM(cr) && (repl_ex_nst <=0 ))
 +    {
 +        /* check how many steps are left in other sims */
 +        multisim_nsteps=get_multisim_nsteps(cr, ir->nsteps);
 +    }
 +
 +
 +    /* and stop now if we should */
 +    bLastStep = (bRerunMD || (ir->nsteps >= 0 && step_rel > ir->nsteps) ||
 +                 ((multisim_nsteps >= 0) && (step_rel >= multisim_nsteps )));
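 +    /* The loop below runs until the last step or, when rerunning,
 +       until the input trajectory has no more frames */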
 +    while (!bLastStep || (bRerunMD && bNotLastFrame)) {
 +
 +        wallcycle_start(wcycle,ewcSTEP);
 +
 +        if (bRerunMD) {
 +            if (rerun_fr.bStep) {
 +                step = rerun_fr.step;
 +                step_rel = step - ir->init_step;
 +            }
 +            if (rerun_fr.bTime) {
 +                t = rerun_fr.time;
 +            }
 +            else
 +            {
 +                t = step;
 +            }
 +        } 
 +        else 
 +        {
 +            bLastStep = (step_rel == ir->nsteps);
 +            t = t0 + step*ir->delta_t;
 +        }
 +
 +        if (ir->efep != efepNO || ir->bSimTemp)
 +        {
 +            /* find and set the current lambdas.  If rerunning, we either read in a state, or a lambda value,
 +               requiring different logic. */
 +            
 +            set_current_lambdas(step,ir->fepvals,bRerunMD,&rerun_fr,state_global,state,lam0);
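 +            /* do_per_step(step,n) is TRUE when n > 0 and step is a multiple of n */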
 +            bDoDHDL = do_per_step(step,ir->fepvals->nstdhdl);
 +            bDoFEP  = (do_per_step(step,nstfep) && (ir->efep != efepNO));
 +            bDoExpanded  = (do_per_step(step,ir->expandedvals->nstexpanded) && (ir->bExpanded) && (step > 0));
 +        }
 +
 +        if (bSimAnn) 
 +        {
 +            update_annealing_target_temp(&(ir->opts),t);
 +        }
 +
 +        if (bRerunMD)
 +        {
 +            if (!(DOMAINDECOMP(cr) && !MASTER(cr)))
 +            {
 +                for(i=0; i<state_global->natoms; i++)
 +                {
 +                    copy_rvec(rerun_fr.x[i],state_global->x[i]);
 +                }
 +                if (rerun_fr.bV)
 +                {
 +                    for(i=0; i<state_global->natoms; i++)
 +                    {
 +                        copy_rvec(rerun_fr.v[i],state_global->v[i]);
 +                    }
 +                }
 +                else
 +                {
 +                    for(i=0; i<state_global->natoms; i++)
 +                    {
 +                        clear_rvec(state_global->v[i]);
 +                    }
 +                    if (bRerunWarnNoV)
 +                    {
 +                        fprintf(stderr,"\nWARNING: Some frames do not contain velocities.\n"
 +                                "         Ekin, temperature and pressure are incorrect,\n"
 +                                "         the virial will be incorrect when constraints are present.\n"
 +                                "\n");
 +                        bRerunWarnNoV = FALSE;
 +                    }
 +                }
 +            }
 +            copy_mat(rerun_fr.box,state_global->box);
 +            copy_mat(state_global->box,state->box);
 +
 +            if (vsite && (Flags & MD_RERUN_VSITE))
 +            {
 +                if (DOMAINDECOMP(cr))
 +                {
 +                    gmx_fatal(FARGS,"Vsite recalculation with -rerun is not implemented for domain decomposition, use particle decomposition");
 +                }
 +                if (graph)
 +                {
 +                    /* Following is necessary because the graph may get out of sync
 +                     * with the coordinates if we only have every N'th coordinate set
 +                     */
 +                    mk_mshift(fplog,graph,fr->ePBC,state->box,state->x);
 +                    shift_self(graph,state->box,state->x);
 +                }
 +                construct_vsites(fplog,vsite,state->x,nrnb,ir->delta_t,state->v,
 +                                 top->idef.iparams,top->idef.il,
 +                                 fr->ePBC,fr->bMolPBC,graph,cr,state->box);
 +                if (graph)
 +                {
 +                    unshift_self(graph,state->box,state->x);
 +                }
 +            }
 +        }
 +
 +        /* Stop Center of Mass motion */
 +        bStopCM = (ir->comm_mode != ecmNO && do_per_step(step,ir->nstcomm));
 +
 +        /* Copy back starting coordinates in case we're doing a forcefield scan */
 +        if (bFFscan)
 +        {
 +            for(ii=0; (ii<state->natoms); ii++)
 +            {
 +                copy_rvec(xcopy[ii],state->x[ii]);
 +                copy_rvec(vcopy[ii],state->v[ii]);
 +            }
 +            copy_mat(boxcopy,state->box);
 +        }
 +
 +        if (bRerunMD)
 +        {
 +            /* for rerun MD always do Neighbour Searching */
 +            bNS = (bFirstStep || ir->nstlist != 0);
 +            bNStList = bNS;
 +        }
 +        else
 +        {
 +            /* Determine whether or not to do Neighbour Searching and LR */
 +            bNStList = (ir->nstlist > 0  && step % ir->nstlist == 0);
 +            
 +            bNS = (bFirstStep || bExchanged || bNStList || bDoFEP ||
 +                   (ir->nstlist == -1 && nlh.nabnsb > 0));
 +
 +            if (bNS && ir->nstlist == -1)
 +            {
 +                set_nlistheuristics(&nlh,bFirstStep || bExchanged || bDoFEP, step);
 +            }
 +        } 
 +
 +        /* check whether we should stop because another simulation has 
 +           stopped. */
 +        if (MULTISIM(cr))
 +        {
 +            if ( (multisim_nsteps >= 0) &&  (step_rel >= multisim_nsteps)  &&  
 +                 (multisim_nsteps != ir->nsteps) )  
 +            {
 +                if (bNS)
 +                {
 +                    if (MASTER(cr))
 +                    {
 +                        fprintf(stderr, 
 +                                "Stopping simulation %d because another one has finished\n",
 +                                cr->ms->sim);
 +                    }
 +                    bLastStep=TRUE;
 +                    gs.sig[eglsCHKPT] = 1;
 +                }
 +            }
 +        }
 +
 +        /* < 0 means stop at next step, > 0 means stop at next NS step */
 +        if ( (gs.set[eglsSTOPCOND] < 0 ) ||
 +             ( (gs.set[eglsSTOPCOND] > 0 ) && ( bNS || ir->nstlist==0)) )
 +        {
 +            bLastStep = TRUE;
 +        }
 +
 +        /* Determine whether or not to update the Born radii if doing GB */
 +        bBornRadii=bFirstStep;
 +        if (ir->implicit_solvent && (step % ir->nstgbradii==0))
 +        {
 +            bBornRadii=TRUE;
 +        }
 +        
 +        do_log = do_per_step(step,ir->nstlog) || bFirstStep || bLastStep;
 +        do_verbose = bVerbose &&
 +                  (step % stepout == 0 || bFirstStep || bLastStep);
 +
 +        if (bNS && !(bFirstStep && ir->bContinuation && !bRerunMD))
 +        {
 +            if (bRerunMD)
 +            {
 +                bMasterState = TRUE;
 +            }
 +            else
 +            {
 +                bMasterState = FALSE;
 +                /* Correct the new box if it is too skewed */
 +                if (DYNAMIC_BOX(*ir))
 +                {
 +                    if (correct_box(fplog,step,state->box,graph))
 +                    {
 +                        bMasterState = TRUE;
 +                    }
 +                }
 +                if (DOMAINDECOMP(cr) && bMasterState)
 +                {
 +                    dd_collect_state(cr->dd,state,state_global);
 +                }
 +            }
 +
 +            if (DOMAINDECOMP(cr))
 +            {
 +                /* Repartition the domain decomposition */
 +                wallcycle_start(wcycle,ewcDOMDEC);
 +                dd_partition_system(fplog,step,cr,
 +                                    bMasterState,nstglobalcomm,
 +                                    state_global,top_global,ir,
 +                                    state,&f,mdatoms,top,fr,
 +                                    vsite,shellfc,constr,
 +                                    nrnb,wcycle,
 +                                    do_verbose && !bPMETuneRunning);
 +                wallcycle_stop(wcycle,ewcDOMDEC);
 +                /* If using an iterative integrator, reallocate space to match the decomposition */
 +            }
 +        }
 +
 +        if (MASTER(cr) && do_log && !bFFscan)
 +        {
 +            print_ebin_header(fplog,step,t,state->lambda[efptFEP]); /* can we improve the information printed here? */
 +        }
 +
 +        if (ir->efep != efepNO)
 +        {
 +            update_mdatoms(mdatoms,state->lambda[efptMASS]);
 +        }
 +
 +        if ((bRerunMD && rerun_fr.bV) || bExchanged)
 +        {
 +            
 +            /* We need the kinetic energy at minus the half step for determining
 +             * the full step kinetic energy and possibly for T-coupling.*/
 +            /* This may not be quite working correctly yet . . . . */
 +            compute_globals(fplog,gstat,cr,ir,fr,ekind,state,state_global,mdatoms,nrnb,vcm,
 +                            wcycle,enerd,NULL,NULL,NULL,NULL,mu_tot,
 +                            constr,NULL,FALSE,state->box,
 +                            top_global,&pcurr,top_global->natoms,&bSumEkinhOld,
 +                            CGLO_RERUNMD | CGLO_GSTAT | CGLO_TEMPERATURE);
 +        }
 +        clear_mat(force_vir);
 +        
 +        /* Ionize the atoms if necessary */
 +        if (bIonize)
 +        {
 +            ionize(fplog,oenv,mdatoms,top_global,t,ir,state->x,state->v,
 +                   mdatoms->start,mdatoms->start+mdatoms->homenr,state->box,cr);
 +        }
 +        
 +        /* Update force field in ffscan program */
 +        if (bFFscan)
 +        {
 +            if (update_forcefield(fplog,
 +                                  nfile,fnm,fr,
 +                                  mdatoms->nr,state->x,state->box))
 +            {
 +                gmx_finalize_par();
 +
 +                exit(0);
 +            }
 +        }
 +
 +        /* We write a checkpoint at this MD step when:
 +         * either at an NS step when we signalled through gs,
 +         * or at the last step (but not when we do not want confout),
 +         * but never at the first step or with rerun.
 +         */
 +        bCPT = (((gs.set[eglsCHKPT] && (bNS || ir->nstlist == 0)) ||
 +                 (bLastStep && (Flags & MD_CONFOUT))) &&
 +                step > ir->init_step && !bRerunMD);
 +        if (bCPT)
 +        {
 +            gs.set[eglsCHKPT] = 0;
 +        }
 +
 +        /* Determine the energy and pressure:
 +         * at nstcalcenergy steps and at energy output steps (set below).
 +         */
 +        if (EI_VV(ir->eI) && (!bInitStep))
 +        {
-         bCalcVir = bCalcEner ||
-             (ir->epc != epcNO && do_per_step(step,ir->nstpcouple));
++            /* for vv, the first half of the integration actually corresponds
++               to the previous step.  bCalcEner is only required to be evaluated on the 'next' step,
++               but the virial needs to be calculated on both the current step and the 'next' step. Future
++               reorganization may be able to get rid of one of the bCalcVir=TRUE steps. */
++
 +            bCalcEner = do_per_step(step-1,ir->nstcalcenergy);
++            bCalcVir = bCalcEner ||
++                (ir->epc != epcNO && (do_per_step(step,ir->nstpcouple) || do_per_step(step-1,ir->nstpcouple)));
 +        }
 +        else
 +        {
 +            bCalcEner = do_per_step(step,ir->nstcalcenergy);
++            bCalcVir = bCalcEner ||
++                (ir->epc != epcNO && do_per_step(step,ir->nstpcouple));
 +        }
-             if (bIterations)
 +
 +        /* Do we need global communication ? */
 +        bGStat = (bCalcVir || bCalcEner || bStopCM ||
 +                  do_per_step(step,nstglobalcomm) ||
 +                  (ir->nstlist == -1 && !bRerunMD && step >= nlh.step_nscheck));
 +
 +        do_ene = (do_per_step(step,ir->nstenergy) || bLastStep);
 +
 +        if (do_ene || do_log)
 +        {
 +            bCalcVir  = TRUE;
 +            bCalcEner = TRUE;
 +            bGStat    = TRUE;
 +        }
 +        
 +        /* these CGLO_ options remain the same throughout the iteration */
 +        cglo_flags = ((bRerunMD ? CGLO_RERUNMD : 0) |
 +                      (bGStat ? CGLO_GSTAT : 0)
 +            );
 +        
 +        force_flags = (GMX_FORCE_STATECHANGED |
 +                       ((DYNAMIC_BOX(*ir) || bRerunMD) ? GMX_FORCE_DYNAMICBOX : 0) |
 +                       GMX_FORCE_ALLFORCES |
 +                       GMX_FORCE_SEPLRF |
 +                       (bCalcVir ? GMX_FORCE_VIRIAL : 0) |
 +                       (bCalcEner ? GMX_FORCE_ENERGY : 0) |
 +                       (bDoFEP ? GMX_FORCE_DHDL : 0)
 +            );
 +
 +        if(fr->bTwinRange)
 +        {
 +            if(do_per_step(step,ir->nstcalclr))
 +            {
 +                force_flags |= GMX_FORCE_DO_LR;
 +            }
 +        }
 +        
 +        if (shellfc)
 +        {
 +            /* Now is the time to relax the shells */
 +            count=relax_shell_flexcon(fplog,cr,bVerbose,bFFscan ? step+1 : step,
 +                                      ir,bNS,force_flags,
 +                                      bStopCM,top,top_global,
 +                                      constr,enerd,fcd,
 +                                      state,f,force_vir,mdatoms,
 +                                      nrnb,wcycle,graph,groups,
 +                                      shellfc,fr,bBornRadii,t,mu_tot,
 +                                      state->natoms,&bConverged,vsite,
 +                                      outf->fp_field);
 +            tcount+=count;
 +
 +            if (bConverged)
 +            {
 +                nconverged++;
 +            }
 +        }
 +        else
 +        {
 +            /* The coordinates (x) are shifted (to get whole molecules)
 +             * in do_force.
 +             * This is parallelized as well, and does communication too.
 +             * Check comments in sim_util.c
 +             */
 +             do_force(fplog,cr,ir,step,nrnb,wcycle,top,top_global,groups,
 +                     state->box,state->x,&state->hist,
 +                     f,force_vir,mdatoms,enerd,fcd,
 +                     state->lambda,graph,
 +                     fr,vsite,mu_tot,t,outf->fp_field,ed,bBornRadii,
 +                     (bNS ? GMX_FORCE_NS : 0) | force_flags);
 +        }
 +        
 +        if (bTCR)
 +        {
 +            mu_aver = calc_mu_aver(cr,state->x,mdatoms->chargeA,
 +                                   mu_tot,&top_global->mols,mdatoms,gnx,grpindex);
 +        }
 +        
 +        if (bTCR && bFirstStep)
 +        {
 +            tcr=init_coupling(fplog,nfile,fnm,cr,fr,mdatoms,&(top->idef));
 +            fprintf(fplog,"Done init_coupling\n"); 
 +            fflush(fplog);
 +        }
 +        
 +        if (bVV && !bStartingFromCpt && !bRerunMD)
 +        /*  ############### START FIRST UPDATE HALF-STEP FOR VV METHODS############### */
 +        {
 +            if (ir->eI==eiVV && bInitStep) 
 +            {
 +                /* if using velocity verlet with full time step Ekin,
 +                 * take the first half step only to compute the 
 +                 * virial for the first step. From there,
 +                 * revert back to the initial coordinates
 +                 * so that the input is actually the initial step.
 +                 */
 +                copy_rvecn(state->v,cbuf,0,state->natoms); /* should make this better for parallelizing? */
 +            } else {
 +                /* this is for NHC in the Ekin(t+dt/2) version of vv */
 +                trotter_update(ir,step,ekind,enerd,state,total_vir,mdatoms,&MassQ,trotter_seq,ettTSEQ1);            
 +            }
 +
 +            /* If we are using twin-range interactions where the long-range component
 +             * is only evaluated every nstcalclr>1 steps, we should do a special update
 +             * step to combine the long-range forces on these steps.
 +             * For nstcalclr=1 this is not done, since the forces would have been added
 +             * directly to the short-range forces already.
 +             */
 +            bUpdateDoLR = (fr->bTwinRange && do_per_step(step,ir->nstcalclr));
 +            
 +            update_coords(fplog,step,ir,mdatoms,state,fr->bMolPBC,
 +                          f,bUpdateDoLR,fr->f_twin,fcd,
 +                          ekind,M,wcycle,upd,bInitStep,etrtVELOCITY1,
 +                          cr,nrnb,constr,&top->idef);
 +            
-                 gmx_iterate_init(&iterate,bIterations && !bInitStep);
++            if (bIterativeCase && do_per_step(step-1,ir->nstpcouple) && !bInitStep)
 +            {
-             if (bIterations && iterate.bIterate) { 
++                gmx_iterate_init(&iterate,TRUE);
 +            }
 +            /* for iterations, we save these vectors, as we will be self-consistently iterating
 +               the calculations */
 +
 +            /*#### UPDATE EXTENDED VARIABLES IN TROTTER FORMULATION */
 +            
 +            /* save the state */
-             while (bFirstIterate || (bIterations && iterate.bIterate))
++            if (iterate.bIterationActive) {
 +                copy_coupling_state(state,bufstate,ekind,ekind_save,&(ir->opts));
 +            }
 +            
 +            bFirstIterate = TRUE;
-                 if (bIterations && iterate.bIterate) 
++            while (bFirstIterate || iterate.bIterationActive)
 +            {
-                 
++                if (iterate.bIterationActive)
 +                {
 +                    copy_coupling_state(bufstate,state,ekind_save,ekind,&(ir->opts));
 +                    if (bFirstIterate && bTrotter) 
 +                    {
 +                        /* The first time through, we need a decent first estimate
 +                           of veta(t+dt) to compute the constraints.  Do
 +                           this by computing the box volume part of the
 +                           trotter integration at this time. Nothing else
 +                           should be changed by this routine here.  If
 +                           !(first time), we start with the previous value
 +                           of veta.  */
 +                        
 +                        veta_save = state->veta;
 +                        trotter_update(ir,step,ekind,enerd,state,total_vir,mdatoms,&MassQ,trotter_seq,ettTSEQ0);
 +                        vetanew = state->veta;
 +                        state->veta = veta_save;
 +                    } 
 +                } 
 +                
 +                bOK = TRUE;
 +                if ( !bRerunMD || rerun_fr.bV || bForceUpdate) {  /* Why is rerun_fr.bV here?  Unclear. */
 +                    dvdl = 0;
 +                    
 +                    update_constraints(fplog,step,&dvdl,ir,ekind,mdatoms,
 +                                       state,fr->bMolPBC,graph,f,
 +                                       &top->idef,shake_vir,NULL,
 +                                       cr,nrnb,wcycle,upd,constr,
 +                                       bInitStep,TRUE,bCalcVir,vetanew);
 +                    
 +                    if (!bOK && !bFFscan)
 +                    {
 +                        gmx_fatal(FARGS,"Constraint error: Shake, Lincs or Settle could not solve the constraints");
 +                    }
 +                    
 +                } 
 +                else if (graph)
 +                { /* Need to unshift here if a do_force has been
 +                     called in the previous step */
 +                    unshift_self(graph,state->box,state->x);
 +                }
 +                
-                     /*bTemp = ((ir->eI==eiVV &&(!bInitStep)) || (ir->eI==eiVVAK && IR_NPT_TROTTER(ir)));*/
 +                /* if VV, compute the pressure and constraints */
 +                /* For VV2, we strictly only need this if using pressure
 +                 * control, but we really would like to have accurate pressures
 +                 * printed out.
 +                 * Think about ways around this in the future?
 +                 * For now, keep this choice in comments.
 +                 */
 +                /*bPres = (ir->eI==eiVV || IR_NPT_TROTTER(ir)); */
-                 compute_globals(fplog,gstat,cr,ir,fr,ekind,state,state_global,mdatoms,nrnb,vcm,
-                                 wcycle,enerd,force_vir,shake_vir,total_vir,pres,mu_tot,
-                                 constr,NULL,FALSE,state->box,
-                                 top_global,&pcurr,top_global->natoms,&bSumEkinhOld,
-                                 cglo_flags 
-                                 | CGLO_ENERGY 
-                                 | (bTemp ? CGLO_TEMPERATURE:0) 
-                                 | (bPres ? CGLO_PRESSURE : 0) 
-                                 | (bPres ? CGLO_CONSTRAINT : 0)
-                                 | ((bIterations && iterate.bIterate) ? CGLO_ITERATE : 0)  
-                                 | (bFirstIterate ? CGLO_FIRSTITERATE : 0)
-                                 | CGLO_SCALEEKIN 
-                     );
-                 /* explanation of above: 
-                    a) We compute Ekin at the full time step
-                    if 1) we are using the AveVel Ekin, and it's not the
-                    initial step, or 2) if we are using AveEkin, but need the full
-                    time step kinetic energy for the pressure (always true now, since we want accurate statistics).
-                    b) If we are using EkinAveEkin for the kinetic energy for the temperture control, we still feed in 
-                    EkinAveVel because it's needed for the pressure */
-                 
++                /*bTemp = ((ir->eI==eiVV &&(!bInitStep)) || (ir->eI==eiVVAK && IR_NPT_TROTTER(ir)));*/
 +                bPres = TRUE;
 +                bTemp = ((ir->eI==eiVV &&(!bInitStep)) || (ir->eI==eiVVAK));
 +                if (bCalcEner && ir->eI==eiVVAK)  /*MRS:  7/9/2010 -- this still doesn't fix it?*/
 +                {
 +                    bSumEkinhOld = TRUE;
 +                }
-                 if (bIterations &&
++                /* for vv, the first half of the integration actually corresponds to the previous step.
++                   So we need information from the last step in the first half of the integration */
++                if (bGStat || do_per_step(step-1,nstglobalcomm)) {
++                    compute_globals(fplog,gstat,cr,ir,fr,ekind,state,state_global,mdatoms,nrnb,vcm,
++                                    wcycle,enerd,force_vir,shake_vir,total_vir,pres,mu_tot,
++                                    constr,NULL,FALSE,state->box,
++                                    top_global,&pcurr,top_global->natoms,&bSumEkinhOld,
++                                    cglo_flags
++                                    | CGLO_ENERGY
++                                    | (bTemp ? CGLO_TEMPERATURE:0)
++                                    | (bPres ? CGLO_PRESSURE : 0)
++                                    | (bPres ? CGLO_CONSTRAINT : 0)
++                                    | ((iterate.bIterationActive) ? CGLO_ITERATE : 0)
++                                    | (bFirstIterate ? CGLO_FIRSTITERATE : 0)
++                                    | CGLO_SCALEEKIN
++                        );
++                    /* explanation of above:
++                       a) We compute Ekin at the full time step
++                       if 1) we are using the AveVel Ekin, and it's not the
++                       initial step, or 2) if we are using AveEkin, but need the full
++                       time step kinetic energy for the pressure (always true now, since we want accurate statistics).
++                       b) If we are using EkinAveEkin for the kinetic energy for the temperature control, we still feed in
++                       EkinAveVel because it's needed for the pressure */
++                }
 +                /* temperature scaling and pressure scaling to produce the extended variables at t+dt */
 +                if (!bInitStep) 
 +                {
 +                    if (bTrotter)
 +                    {
 +                        m_add(force_vir,shake_vir,total_vir); /* we need the un-dispersion corrected total vir here */
 +                        trotter_update(ir,step,ekind,enerd,state,total_vir,mdatoms,&MassQ,trotter_seq,ettTSEQ2);
 +                    } 
 +                    else 
 +                    {
 +                        if (bExchanged)
 +                        {
 +            
 +                            /* We need the kinetic energy at minus the half step for determining
 +                             * the full step kinetic energy and possibly for T-coupling.*/
 +                            /* This may not be quite working correctly yet . . . . */
 +                            compute_globals(fplog,gstat,cr,ir,fr,ekind,state,state_global,mdatoms,nrnb,vcm,
 +                                            wcycle,enerd,NULL,NULL,NULL,NULL,mu_tot,
 +                                            constr,NULL,FALSE,state->box,
 +                                            top_global,&pcurr,top_global->natoms,&bSumEkinhOld,
 +                                            CGLO_RERUNMD | CGLO_GSTAT | CGLO_TEMPERATURE);
 +                        }
 +                    }
 +                }
 +                
-         if (bIterations)
-         {
-             gmx_iterate_init(&iterate,bIterations);
-         }
-     
-         /* for iterations, we save these vectors, as we will be redoing the calculations */
-         if (bIterations && iterate.bIterate) 
++                if (iterate.bIterationActive &&
 +                    done_iterating(cr,fplog,step,&iterate,bFirstIterate,
 +                                   state->veta,&vetanew)) 
 +                {
 +                    break;
 +                }
 +                bFirstIterate = FALSE;
 +            }
 +
 +            if (bTrotter && !bInitStep) {
 +                enerd->term[F_DVDL_BONDED] += dvdl;        /* only add after iterations */
 +                copy_mat(shake_vir,state->svir_prev);
 +                copy_mat(force_vir,state->fvir_prev);
 +                if (IR_NVT_TROTTER(ir) && ir->eI==eiVV) {
 +                    /* update temperature and kinetic energy now that step is over - this is the v(t+dt) point */
 +                    enerd->term[F_TEMP] = sum_ekin(&(ir->opts),ekind,NULL,(ir->eI==eiVV),FALSE,FALSE);
 +                    enerd->term[F_EKIN] = trace(ekind->ekin);
 +                }
 +            }
 +            /* if it's the initial step, we performed this first step just to get the constraint virial */
 +            if (bInitStep && ir->eI==eiVV) {
 +                copy_rvecn(cbuf,state->v,0,state->natoms);
 +            }
 +            
 +            if (fr->bSepDVDL && fplog && do_log) 
 +            {
 +                fprintf(fplog,sepdvdlformat,"Constraint",0.0,dvdl);
 +            }
 +            enerd->term[F_DVDL_BONDED] += dvdl;
 +        }
 +    
 +        /* MRS -- now done iterating -- compute the conserved quantity */
 +        if (bVV) {
 +            saved_conserved_quantity = compute_conserved_from_auxiliary(ir,state,&MassQ);
 +            if (ir->eI==eiVV) 
 +            {
 +                last_ekin = enerd->term[F_EKIN];
 +            }
 +            if ((ir->eDispCorr != edispcEnerPres) && (ir->eDispCorr != edispcAllEnerPres)) 
 +            {
 +                saved_conserved_quantity -= enerd->term[F_DISPCORR];
 +            }
 +            /* sum up the foreign energy and dhdl terms for vv.  currently done every step so that dhdl is correct in the .edr */
 +            if (!bRerunMD)
 +            {
 +                sum_dhdl(enerd,state->lambda,ir->fepvals);
 +            }
 +        }
 +        
 +        /* ########  END FIRST UPDATE STEP  ############## */
 +        /* ########  If doing VV, we now have v(dt) ###### */
 +        if (bDoExpanded) {
 +            /* perform extended ensemble sampling in lambda - we don't
 +               actually move to the new state before outputting
 +               statistics, but if performing simulated tempering, we
 +               do update the velocities and the tau_t. */
 +        
 +            lamnew = ExpandedEnsembleDynamics(fplog,ir,enerd,state,&MassQ,&df_history,step,mcrng,state->v,mdatoms);
 +        }
 +        /* ################## START TRAJECTORY OUTPUT ################# */
 +        
 +        /* Now we have the energies and forces corresponding to the 
 +         * coordinates at time t. We must output all of this before
 +         * the update.
 +         * for RerunMD t is read from input trajectory
 +         */
 +        mdof_flags = 0;
 +        if (do_per_step(step,ir->nstxout)) { mdof_flags |= MDOF_X; }
 +        if (do_per_step(step,ir->nstvout)) { mdof_flags |= MDOF_V; }
 +        if (do_per_step(step,ir->nstfout)) { mdof_flags |= MDOF_F; }
 +        if (do_per_step(step,ir->nstxtcout)) { mdof_flags |= MDOF_XTC; }
 +        if (bCPT) { mdof_flags |= MDOF_CPT; }
 +
 +#if defined(GMX_FAHCORE) || defined(GMX_WRITELASTSTEP)
 +        if (bLastStep)
 +        {
 +            /* Enforce writing positions and velocities at end of run */
 +            mdof_flags |= (MDOF_X | MDOF_V);
 +        }
 +#endif
 +#ifdef GMX_FAHCORE
 +        if (MASTER(cr))
 +            fcReportProgress( ir->nsteps, step );
 +
 +        /* sync bCPT and fc record-keeping */
 +        if (bCPT && MASTER(cr))
 +            fcRequestCheckPoint();
 +#endif
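
The trajectory-output bookkeeping above collects one yes/no decision per output interval into a bit mask, so a single test on mdof_flags decides whether any writing is due this step. A minimal standalone sketch of that pattern, assuming a do_per_step-style helper and illustrative flag values (not the real MDOF_* constants):

    #include <stdio.h>

    #define MDOF_X (1<<0)   /* illustrative flag values, not GROMACS's */
    #define MDOF_V (1<<1)

    /* hypothetical stand-in for do_per_step(): TRUE when step is a
     * multiple of a positive interval */
    static int do_every(long step, long nst)
    {
        return (nst > 0) && (step % nst == 0);
    }

    int main(void)
    {
        long step;

        for (step = 0; step < 10; step++)
        {
            int flags = 0;

            if (do_every(step, 2)) { flags |= MDOF_X; }
            if (do_every(step, 5)) { flags |= MDOF_V; }
            if (flags != 0)  /* one cheap test gates all output work */
            {
                printf("step %ld: output mask 0x%x\n", step, flags);
            }
        }
        return 0;
    }
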
 +        
 +        if (mdof_flags != 0)
 +        {
 +            wallcycle_start(wcycle,ewcTRAJ);
 +            if (bCPT)
 +            {
 +                if (state->flags & (1<<estLD_RNG))
 +                {
 +                    get_stochd_state(upd,state);
 +                }
 +                if (state->flags  & (1<<estMC_RNG))
 +                {
 +                    get_mc_state(mcrng,state);
 +                }
 +                if (MASTER(cr))
 +                {
 +                    if (bSumEkinhOld)
 +                    {
 +                        state_global->ekinstate.bUpToDate = FALSE;
 +                    }
 +                    else
 +                    {
 +                        update_ekinstate(&state_global->ekinstate,ekind);
 +                        state_global->ekinstate.bUpToDate = TRUE;
 +                    }
 +                    update_energyhistory(&state_global->enerhist,mdebin);
 +                    if (ir->efep!=efepNO || ir->bSimTemp) 
 +                    {
 +                        state_global->fep_state = state->fep_state; /* MRS: seems kludgy. The code should be
 +                                                                       structured so this isn't necessary.
 +                                                                       Note this reassignment is only necessary
 +                                                                       for single threads.*/
 +                        copy_df_history(&state_global->dfhist,&df_history);
 +                    }
 +                }
 +            }
 +            write_traj(fplog,cr,outf,mdof_flags,top_global,
 +                       step,t,state,state_global,f,f_global,&n_xtc,&x_xtc);
 +            if (bCPT)
 +            {
 +                nchkpt++;
 +                bCPT = FALSE;
 +            }
 +            debug_gmx();
 +            if (bLastStep && step_rel == ir->nsteps &&
 +                (Flags & MD_CONFOUT) && MASTER(cr) &&
 +                !bRerunMD && !bFFscan)
 +            {
 +                /* x and v have been collected in write_traj,
 +                 * because a checkpoint file will always be written
 +                 * at the last step.
 +                 */
 +                fprintf(stderr,"\nWriting final coordinates.\n");
 +                if (fr->bMolPBC)
 +                {
 +                    /* Make molecules whole only for confout writing */
 +                    do_pbc_mtop(fplog,ir->ePBC,state->box,top_global,state_global->x);
 +                }
 +                write_sto_conf_mtop(ftp2fn(efSTO,nfile,fnm),
 +                                    *top_global->name,top_global,
 +                                    state_global->x,state_global->v,
 +                                    ir->ePBC,state->box);
 +                debug_gmx();
 +            }
 +            wallcycle_stop(wcycle,ewcTRAJ);
 +        }
 +        
 +        /* kludge -- virial is lost with restart for NPT control. Must restart */
 +        if (bStartingFromCpt && bVV) 
 +        {
 +            copy_mat(state->svir_prev,shake_vir);
 +            copy_mat(state->fvir_prev,force_vir);
 +        }
 +        /*  ################## END TRAJECTORY OUTPUT ################ */
 +        
 +        /* Determine the wallclock run time up till now */
 +        run_time = gmx_gettime() - (double)runtime->real;
 +
 +        /* Check whether everything is still all right */
 +        if (((int)gmx_get_stop_condition() > handled_stop_condition)
 +#ifdef GMX_THREAD_MPI
 +            && MASTER(cr)
 +#endif
 +            )
 +        {
 +            /* this is just to make gs.sig compatible with the hack
 +               of sending signals around by MPI_Reduce together with
 +               other floats */
 +            if ( gmx_get_stop_condition() == gmx_stop_cond_next_ns )
 +                gs.sig[eglsSTOPCOND]=1;
 +            if ( gmx_get_stop_condition() == gmx_stop_cond_next )
 +                gs.sig[eglsSTOPCOND]=-1;
 +            /* < 0 means stop at next step, > 0 means stop at next NS step */
 +            if (fplog)
 +            {
 +                fprintf(fplog,
 +                        "\n\nReceived the %s signal, stopping at the next %sstep\n\n",
 +                        gmx_get_signal_name(),
 +                        gs.sig[eglsSTOPCOND]==1 ? "NS " : "");
 +                fflush(fplog);
 +            }
 +            fprintf(stderr,
 +                    "\n\nReceived the %s signal, stopping at the next %sstep\n\n",
 +                    gmx_get_signal_name(),
 +                    gs.sig[eglsSTOPCOND]==1 ? "NS " : "");
 +            fflush(stderr);
 +            handled_stop_condition=(int)gmx_get_stop_condition();
 +        }
 +        else if (MASTER(cr) && (bNS || ir->nstlist <= 0) &&
 +                 (max_hours > 0 && run_time > max_hours*60.0*60.0*0.99) &&
 +                 gs.sig[eglsSTOPCOND] == 0 && gs.set[eglsSTOPCOND] == 0)
 +        {
 +            /* Signal to terminate the run */
 +            gs.sig[eglsSTOPCOND] = 1;
 +            if (fplog)
 +            {
 +                fprintf(fplog,"\nStep %s: Run time exceeded %.3f hours, will terminate the run\n",gmx_step_str(step,sbuf),max_hours*0.99);
 +            }
 +            fprintf(stderr, "\nStep %s: Run time exceeded %.3f hours, will terminate the run\n",gmx_step_str(step,sbuf),max_hours*0.99);
 +        }
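
The signal handling above encodes control requests as small signed numbers so they can ride along in the same reduction used for other real-valued data: a positive value means stop at the next NS step, a negative one means stop at the next step. A self-contained sketch of the underlying idea with plain MPI (illustrative only, not the actual gs.sig machinery):

    #include <mpi.h>
    #include <stdio.h>

    int main(int argc, char *argv[])
    {
        float sig = 0.0f, sum = 0.0f;   /* signal encoded as a float */
        int   rank;

        MPI_Init(&argc, &argv);
        MPI_Comm_rank(MPI_COMM_WORLD, &rank);
        if (rank == 0)
        {
            sig = 1.0f;  /* pretend rank 0 caught a "stop at next NS step" request */
        }
        /* summing preserves the sign, so every rank can decode the request */
        MPI_Allreduce(&sig, &sum, 1, MPI_FLOAT, MPI_SUM, MPI_COMM_WORLD);
        if (sum > 0)
        {
            printf("rank %d: stopping at the next NS step\n", rank);
        }
        else if (sum < 0)
        {
            printf("rank %d: stopping at the next step\n", rank);
        }
        MPI_Finalize();
        return 0;
    }
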
 +
 +        if (bResetCountersHalfMaxH && MASTER(cr) &&
 +            run_time > max_hours*60.0*60.0*0.495)
 +        {
 +            gs.sig[eglsRESETCOUNTERS] = 1;
 +        }
 +
 +        if (ir->nstlist == -1 && !bRerunMD)
 +        {
 +            /* When bGStatEveryStep=FALSE, global_stat is only called
 +             * when we check the atom displacements, not at NS steps.
 +             * This means that the bonded interaction count check is also
 +             * not performed immediately after NS. Therefore a few MD steps could
 +             * be performed with missing interactions.
 +             * But wrong energies are never written to file,
 +             * since energies are only written after global_stat
 +             * has been called.
 +             */
 +            if (step >= nlh.step_nscheck)
 +            {
 +                nlh.nabnsb = natoms_beyond_ns_buffer(ir,fr,&top->cgs,
 +                                                     nlh.scale_tot,state->x);
 +            }
 +            else
 +            {
 +                /* This is not necessarily true,
 +                 * but step_nscheck is determined quite conservatively.
 +                 */
 +                nlh.nabnsb = 0;
 +            }
 +        }
 +
 +        /* In parallel we only have to check for checkpointing in steps
 +         * where we do global communication,
 +         * otherwise the other nodes don't know.
 +         */
 +        if (MASTER(cr) && ((bGStat || !PAR(cr)) &&
 +                           cpt_period >= 0 &&
 +                           (cpt_period == 0 || 
 +                            run_time >= nchkpt*cpt_period*60.0)) &&
 +            gs.set[eglsCHKPT] == 0)
 +        {
 +            gs.sig[eglsCHKPT] = 1;
 +        }
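
The checkpoint test above fires once the elapsed wall time passes nchkpt times the requested period, which keeps checkpoints on a fixed schedule even when individual step costs vary. A toy sketch of that scheduling with invented numbers:

    #include <stdio.h>

    int main(void)
    {
        double cpt_period = 15.0;  /* minutes, as set with mdrun -cpt */
        int    nchkpt     = 1;     /* checkpoints written so far      */
        double run_time;           /* elapsed wall-clock seconds      */

        for (run_time = 0.0; run_time < 3600.0; run_time += 60.0)
        {
            if (cpt_period >= 0 &&
                (cpt_period == 0 || run_time >= nchkpt*cpt_period*60.0))
            {
                printf("checkpoint %d due at %.0f s\n", nchkpt, run_time);
                nchkpt++;  /* next checkpoint is due one period later */
            }
        }
        return 0;
    }
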
 +  
 +        /* at the start of step, randomize or scale the velocities (trotter done elsewhere) */
 +        if (EI_VV(ir->eI))
 +        {
 +            if (!bInitStep)
 +            {
 +                update_tcouple(fplog,step,ir,state,ekind,wcycle,upd,&MassQ,mdatoms);
 +            }
 +            if (ETC_ANDERSEN(ir->etc)) /* keep this outside of update_tcouple because of the extra info required to pass */
 +            {
 +                gmx_bool bIfRandomize;
 +                bIfRandomize = update_randomize_velocities(ir,step,mdatoms,state,upd,&top->idef,constr);
 +                /* if we have constraints, we have to remove the kinetic energy parallel to the bonds */
 +                if (constr && bIfRandomize)
 +                {
 +                    update_constraints(fplog,step,&dvdl,ir,ekind,mdatoms,
 +                                       state,fr->bMolPBC,graph,f,
 +                                       &top->idef,tmp_vir,NULL,
 +                                       cr,nrnb,wcycle,upd,constr,
 +                                       bInitStep,TRUE,bCalcVir,vetanew);
 +                }
 +            }
 +        }
 +
-         while (bFirstIterate || (bIterations && iterate.bIterate))
++        if (bIterativeCase && do_per_step(step,ir->nstpcouple))
 +        {
++            gmx_iterate_init(&iterate,TRUE);
++            /* for iterations, we save these vectors, as we will be redoing the calculations */
 +            copy_coupling_state(state,bufstate,ekind,ekind_save,&(ir->opts));
 +        }
++
 +        bFirstIterate = TRUE;
-             if (bIterations) 
++        while (bFirstIterate || iterate.bIterationActive)
 +        {
 +            /* We now restore these vectors to redo the calculation with improved extended variables */    
-                     if (bIterations && iterate.bIterate) 
++            if (iterate.bIterationActive)
 +            { 
 +                copy_coupling_state(bufstate,state,ekind_save,ekind,&(ir->opts));
 +            }
 +
 +            /* We make the decision to break or not -after- the calculation of Ekin and Pressure,
 +               so scroll down for that logic */
 +            
 +            /* #########   START SECOND UPDATE STEP ################# */
 +            /* Box is changed in update() when we do pressure coupling,
 +             * but we should still use the old box for energy corrections and when
 +             * writing it to the energy file, so it matches the trajectory files for
 +             * the same timestep above. Make a copy in a separate array.
 +             */
 +            copy_mat(state->box,lastbox);
 +
 +            bOK = TRUE;
 +            if (!(bRerunMD && !rerun_fr.bV && !bForceUpdate))
 +            {
 +                wallcycle_start(wcycle,ewcUPDATE);
 +                dvdl = 0;
 +                /* UPDATE PRESSURE VARIABLES IN TROTTER FORMULATION WITH CONSTRAINTS */
 +                if (bTrotter) 
 +                {
-             /* ############## IF NOT VV, Calculate globals HERE, also iterate constraints ############ */
++                    if (iterate.bIterationActive)
 +                    {
 +                        if (bFirstIterate) 
 +                        {
 +                            scalevir = 1;
 +                        }
 +                        else 
 +                        {
 +                            /* we use a new value of scalevir to converge the iterations faster */
 +                            scalevir = tracevir/trace(shake_vir);
 +                        }
 +                        msmul(shake_vir,scalevir,shake_vir); 
 +                        m_add(force_vir,shake_vir,total_vir);
 +                        clear_mat(shake_vir);
 +                    }
 +                    trotter_update(ir,step,ekind,enerd,state,total_vir,mdatoms,&MassQ,trotter_seq,ettTSEQ3);
 +                /* We can only do Berendsen coupling after we have summed
 +                 * the kinetic energy or virial. Since that happens
 +                 * in global_stat after update, we should only do it at
 +                 * step % nstlist = 1 with bGStatEveryStep=FALSE.
 +                 */
 +                }
 +                else 
 +                {
 +                    update_tcouple(fplog,step,ir,state,ekind,wcycle,upd,&MassQ,mdatoms);
 +                    update_pcouple(fplog,step,ir,state,pcoupl_mu,M,wcycle,
 +                                   upd,bInitStep);
 +                }
 +
 +                if (bVV)
 +                {
 +                    bUpdateDoLR = (fr->bTwinRange && do_per_step(step,ir->nstcalclr));
 +
 +                    /* velocity half-step update */
 +                    update_coords(fplog,step,ir,mdatoms,state,fr->bMolPBC,f,
 +                                  bUpdateDoLR,fr->f_twin,fcd,
 +                                  ekind,M,wcycle,upd,FALSE,etrtVELOCITY2,
 +                                  cr,nrnb,constr,&top->idef);
 +                }
 +
 +                /* Above, initialize just copies ekinh into ekin,
 +                 * it doesn't copy position (for VV),
 +                 * and the entire integrator for MD.
 +                 */
 +                
 +                if (ir->eI==eiVVAK) 
 +                {
 +                    copy_rvecn(state->x,cbuf,0,state->natoms);
 +                }
 +                bUpdateDoLR = (fr->bTwinRange && do_per_step(step,ir->nstcalclr));
 +
 +                update_coords(fplog,step,ir,mdatoms,state,fr->bMolPBC,f,
 +                              bUpdateDoLR,fr->f_twin,fcd,
 +                              ekind,M,wcycle,upd,bInitStep,etrtPOSITION,cr,nrnb,constr,&top->idef);
 +                wallcycle_stop(wcycle,ewcUPDATE);
 +
 +                update_constraints(fplog,step,&dvdl,ir,ekind,mdatoms,state,
 +                                   fr->bMolPBC,graph,f,
 +                                   &top->idef,shake_vir,force_vir,
 +                                   cr,nrnb,wcycle,upd,constr,
 +                                   bInitStep,FALSE,bCalcVir,state->veta);  
 +                
 +                if (ir->eI==eiVVAK) 
 +                {
 +                    /* erase F_EKIN and F_TEMP here? */
 +                    /* just compute the kinetic energy at the half step to perform a trotter step */
 +                    compute_globals(fplog,gstat,cr,ir,fr,ekind,state,state_global,mdatoms,nrnb,vcm,
 +                                    wcycle,enerd,force_vir,shake_vir,total_vir,pres,mu_tot,
 +                                    constr,NULL,FALSE,lastbox,
 +                                    top_global,&pcurr,top_global->natoms,&bSumEkinhOld,
 +                                    cglo_flags | CGLO_TEMPERATURE    
 +                        );
 +                    wallcycle_start(wcycle,ewcUPDATE);
 +                    trotter_update(ir,step,ekind,enerd,state,total_vir,mdatoms,&MassQ,trotter_seq,ettTSEQ4);            
 +                    /* now we know the scaling, we can compute the positions again */
 +                    copy_rvecn(cbuf,state->x,0,state->natoms);
 +
 +                    bUpdateDoLR = (fr->bTwinRange && do_per_step(step,ir->nstcalclr));
 +
 +                    update_coords(fplog,step,ir,mdatoms,state,fr->bMolPBC,f,
 +                                  bUpdateDoLR,fr->f_twin,fcd,
 +                                  ekind,M,wcycle,upd,bInitStep,etrtPOSITION,cr,nrnb,constr,&top->idef);
 +                    wallcycle_stop(wcycle,ewcUPDATE);
 +
 +                    /* do we need an extra constraint here? just need to copy out of state->v to upd->xp? */
 +                    /* are the small terms in the shake_vir here due
 +                     * to numerical errors, or are they important
 +                     * physically? I'm thinking they are just errors, but not completely sure. 
 +                     * For now, will call without actually constraining, constr=NULL*/
 +                    update_constraints(fplog,step,&dvdl,ir,ekind,mdatoms,
 +                                       state,fr->bMolPBC,graph,f,
 +                                       &top->idef,tmp_vir,force_vir,
 +                                       cr,nrnb,wcycle,upd,NULL,
 +                                       bInitStep,FALSE,bCalcVir,
 +                                       state->veta);  
 +                }
 +                if (!bOK && !bFFscan) 
 +                {
 +                    gmx_fatal(FARGS,"Constraint error: Shake, Lincs or Settle could not solve the constraints");
 +                }
 +                
 +                if (fr->bSepDVDL && fplog && do_log) 
 +                {
 +                    fprintf(fplog,sepdvdlformat,"Constraint dV/dl",0.0,dvdl);
 +                }
 +                enerd->term[F_DVDL_BONDED] += dvdl;
 +            } 
 +            else if (graph) 
 +            {
 +                /* Need to unshift here */
 +                unshift_self(graph,state->box,state->x);
 +            }
 +
 +            if (vsite != NULL) 
 +            {
 +                wallcycle_start(wcycle,ewcVSITECONSTR);
 +                if (graph != NULL) 
 +                {
 +                    shift_self(graph,state->box,state->x);
 +                }
 +                construct_vsites(fplog,vsite,state->x,nrnb,ir->delta_t,state->v,
 +                                 top->idef.iparams,top->idef.il,
 +                                 fr->ePBC,fr->bMolPBC,graph,cr,state->box);
 +                
 +                if (graph != NULL) 
 +                {
 +                    unshift_self(graph,state->box,state->x);
 +                }
 +                wallcycle_stop(wcycle,ewcVSITECONSTR);
 +            }
 +            
-             if (bGStat || do_per_step(step+1,nstglobalcomm) ||
-                 EI_VV(ir->eI))
++            /* ############## IF NOT VV, Calculate globals HERE, also iterate constraints  ############ */
 +            /* With Leap-Frog we can skip compute_globals at
 +             * non-communication steps, but we need to calculate
 +             * the kinetic energy one step before communication.
 +             */
-                                 | (bIterations && iterate.bIterate ? CGLO_ITERATE : 0) 
++            if (bGStat || (!EI_VV(ir->eI) && do_per_step(step+1,nstglobalcomm)))
 +            {
 +                if (ir->nstlist == -1 && bFirstIterate)
 +                {
 +                    gs.sig[eglsNABNSB] = nlh.nabnsb;
 +                }
 +                compute_globals(fplog,gstat,cr,ir,fr,ekind,state,state_global,mdatoms,nrnb,vcm,
 +                                wcycle,enerd,force_vir,shake_vir,total_vir,pres,mu_tot,
 +                                constr,
 +                                bFirstIterate ? &gs : NULL, 
 +                                (step_rel % gs.nstms == 0) && 
 +                                (multisim_nsteps<0 || (step_rel<multisim_nsteps)),
 +                                lastbox,
 +                                top_global,&pcurr,top_global->natoms,&bSumEkinhOld,
 +                                cglo_flags 
 +                                | (!EI_VV(ir->eI) || bRerunMD ? CGLO_ENERGY : 0)
 +                                | (!EI_VV(ir->eI) && bStopCM ? CGLO_STOPCM : 0)
 +                                | (!EI_VV(ir->eI) ? CGLO_TEMPERATURE : 0) 
 +                                | (!EI_VV(ir->eI) || bRerunMD ? CGLO_PRESSURE : 0) 
-             if (bIterations && 
++                                | (iterate.bIterationActive ? CGLO_ITERATE : 0)
 +                                | (bFirstIterate ? CGLO_FIRSTITERATE : 0)
 +                                | CGLO_CONSTRAINT 
 +                    );
 +                if (ir->nstlist == -1 && bFirstIterate)
 +                {
 +                    nlh.nabnsb = gs.set[eglsNABNSB];
 +                    gs.set[eglsNABNSB] = 0;
 +                }
 +            }
 +            /* bIterate is set to keep it from eliminating the old ekin kinetic energy terms */
 +            /* #############  END CALC EKIN AND PRESSURE ################# */
 +        
 +            /* Note: this is OK, but there are some numerical precision issues with using the convergence of
 +               the virial that should probably be addressed eventually. state->veta has better properties,
 +               but what we actually need entering the new cycle is the new shake_vir value. Ideally, we could
 +               generate the new shake_vir, but test the veta value for convergence.  This will take some thought. */
 +
++            if (iterate.bIterationActive &&
 +                done_iterating(cr,fplog,step,&iterate,bFirstIterate,
 +                               trace(shake_vir),&tracevir)) 
 +            {
 +                break;
 +            }
 +            bFirstIterate = FALSE;
 +        }
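
The loop that ends here follows a save/redo/converge shape: the coupling state is restored from the buffered copy, the step is recomputed with improved extended variables, and done_iterating breaks out once the monitored scalar (the trace of the constraint virial) stops changing. A generic sketch of that pattern, using Newton's iteration for sqrt(2) as a stand-in for the recomputation (the real tolerances in done_iterating are not reproduced here):

    #include <math.h>
    #include <stdio.h>

    int main(void)
    {
        double x = 10.0, xprev;        /* monitored scalar and its previous value */
        double reltol = 1e-6;
        int    it = 0, maxiter = 100, converged = 0;

        while (it < maxiter && !converged)
        {
            xprev = x;
            x = 0.5*(x + 2.0/x);       /* stand-in recomputation: Newton for sqrt(2) */
            converged = (fabs(x - xprev) <= reltol*fabs(x));
            it++;
        }
        printf("x = %.9f after %d iterations (converged: %d)\n", x, it, converged);
        return 0;
    }
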
 +
 +        /* only add constraint dvdl after constraints */
 +        enerd->term[F_DVDL_BONDED] += dvdl;
 +        if (!bVV || bRerunMD)
 +        {
 +            /* sum up the foreign energy and dhdl terms for md and sd. currently done every step so that dhdl is correct in the .edr */
 +            sum_dhdl(enerd,state->lambda,ir->fepvals);
 +        }
 +        update_box(fplog,step,ir,mdatoms,state,graph,f,
 +                   ir->nstlist==-1 ? &nlh.scale_tot : NULL,pcoupl_mu,nrnb,wcycle,upd,bInitStep,FALSE);
 +        
 +        /* ################# END UPDATE STEP 2 ################# */
 +        /* #### We now have r(t+dt) and v(t+dt/2)  ############# */
 +    
 +        /* The coordinates (x) were unshifted in update */
 +        if (bFFscan && (shellfc==NULL || bConverged))
 +        {
 +            if (print_forcefield(fplog,enerd->term,mdatoms->homenr,
 +                                 f,NULL,xcopy,
 +                                 &(top_global->mols),mdatoms->massT,pres))
 +            {
 +                gmx_finalize_par();
 +
 +                fprintf(stderr,"\n");
 +                exit(0);
 +            }
 +        }
 +        if (!bGStat)
 +        {
 +            /* We will not sum ekinh_old,
 +             * so signal that we still have to do it.
 +             */
 +            bSumEkinhOld = TRUE;
 +        }
 +        
 +        if (bTCR)
 +        {
 +            /* Only do GCT when the relaxation of shells (minimization) has converged,
 +             * otherwise we might be coupling to bogus energies. 
 +             * In parallel we must always do this, because the other sims might
 +             * update the FF.
 +             */
 +
 +            /* Since this is called with the new coordinates state->x, I assume
 +             * we want the new box state->box too. / EL 20040121
 +             */
 +            do_coupling(fplog,oenv,nfile,fnm,tcr,t,step,enerd->term,fr,
 +                        ir,MASTER(cr),
 +                        mdatoms,&(top->idef),mu_aver,
 +                        top_global->mols.nr,cr,
 +                        state->box,total_vir,pres,
 +                        mu_tot,state->x,f,bConverged);
 +            debug_gmx();
 +        }
 +
 +        /* #########  BEGIN PREPARING EDR OUTPUT  ###########  */
 +        
 +        /* use the directly determined last velocity, not actually the averaged half steps */
 +        if (bTrotter && ir->eI==eiVV) 
 +        {
 +            enerd->term[F_EKIN] = last_ekin;
 +        }
 +        enerd->term[F_ETOT] = enerd->term[F_EPOT] + enerd->term[F_EKIN];
 +        
 +        if (bVV)
 +        {
 +            enerd->term[F_ECONSERVED] = enerd->term[F_ETOT] + saved_conserved_quantity;
 +        }
 +        else 
 +        {
 +            enerd->term[F_ECONSERVED] = enerd->term[F_ETOT] + compute_conserved_from_auxiliary(ir,state,&MassQ);
 +        }
 +        /* Check for excessively large energies */
 +        if (bIonize) 
 +        {
 +#ifdef GMX_DOUBLE
 +            real etot_max = 1e200;
 +#else
 +            real etot_max = 1e30;
 +#endif
 +            if (fabs(enerd->term[F_ETOT]) > etot_max) 
 +            {
 +                fprintf(stderr,"Energy too large (%g), giving up\n",
 +                        enerd->term[F_ETOT]);
 +            }
 +        }
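
The guard above chooses its blow-up threshold by build precision: both caps sit far below the largest magnitude representable in the corresponding floating-point type (DBL_MAX is about 1.8e308, FLT_MAX about 3.4e38). A compilable sketch, reusing GMX_DOUBLE purely as an illustrative macro:

    #include <float.h>
    #include <math.h>
    #include <stdio.h>

    int main(void)
    {
    #ifdef GMX_DOUBLE
        double etot_max = 1e200;  /* far below DBL_MAX (~1.8e308) */
    #else
        double etot_max = 1e30;   /* far below FLT_MAX (~3.4e38)  */
    #endif
        double etot = 1e31;       /* invented runaway energy      */

        if (fabs(etot) > etot_max)
        {
            printf("Energy too large (%g), giving up\n", etot);
        }
        return 0;
    }
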
 +        /* #########  END PREPARING EDR OUTPUT  ###########  */
 +        
 +        /* Time for performance */
 +        if (((step % stepout) == 0) || bLastStep) 
 +        {
 +            runtime_upd_proc(runtime);
 +        }
 +        
 +        /* Output stuff */
 +        if (MASTER(cr))
 +        {
 +            gmx_bool do_dr,do_or;
 +            
 +            if (fplog && do_log && bDoExpanded)
 +            {
 +                /* only needed if doing expanded ensemble */
 +                PrintFreeEnergyInfoToFile(fplog,ir->fepvals,ir->expandedvals,ir->bSimTemp?ir->simtempvals:NULL,
 +                                          &df_history,state->fep_state,ir->nstlog,step);
 +            }
 +            if (!(bStartingFromCpt && (EI_VV(ir->eI)))) 
 +            {
 +                if (bCalcEner)
 +                {
 +                    upd_mdebin(mdebin,bDoDHDL, TRUE,
 +                               t,mdatoms->tmass,enerd,state,
 +                               ir->fepvals,ir->expandedvals,lastbox,
 +                               shake_vir,force_vir,total_vir,pres,
 +                               ekind,mu_tot,constr);
 +                }
 +                else
 +                {
 +                    upd_mdebin_step(mdebin);
 +                }
 +                
 +                do_dr  = do_per_step(step,ir->nstdisreout);
 +                do_or  = do_per_step(step,ir->nstorireout);
 +                
 +                print_ebin(outf->fp_ene,do_ene,do_dr,do_or,do_log?fplog:NULL,
 +                           step,t,
 +                           eprNORMAL,bCompact,mdebin,fcd,groups,&(ir->opts));
 +            }
 +            if (ir->ePull != epullNO)
 +            {
 +                pull_print_output(ir->pull,step,t);
 +            }
 +            
 +            if (do_per_step(step,ir->nstlog))
 +            {
 +                if(fflush(fplog) != 0)
 +                {
 +                    gmx_fatal(FARGS,"Cannot flush logfile - maybe you are out of disk space?");
 +                }
 +            }
 +        }
 +        if (bDoExpanded)
 +        {
 +            /* Have to do this part after outputting the logfile and the edr file */
 +            state->fep_state = lamnew;
 +            for (i=0;i<efptNR;i++)
 +            {
 +                state_global->lambda[i] = ir->fepvals->all_lambda[i][lamnew];
 +            }
 +        }
 +        /* Remaining runtime */
 +        if (MULTIMASTER(cr) && (do_verbose || gmx_got_usr_signal()) && !bPMETuneRunning)
 +        {
 +            if (shellfc) 
 +            {
 +                fprintf(stderr,"\n");
 +            }
 +            print_time(stderr,runtime,step,ir,cr);
 +        }
 +
 +        /* Replica exchange */
 +        bExchanged = FALSE;
 +        if ((repl_ex_nst > 0) && (step > 0) && !bLastStep &&
 +            do_per_step(step,repl_ex_nst)) 
 +        {
 +            bExchanged = replica_exchange(fplog,cr,repl_ex,
 +                                          state_global,enerd,
 +                                          state,step,t);
 +
 +            if (bExchanged && DOMAINDECOMP(cr)) 
 +            {
 +                dd_partition_system(fplog,step,cr,TRUE,1,
 +                                    state_global,top_global,ir,
 +                                    state,&f,mdatoms,top,fr,
 +                                    vsite,shellfc,constr,
 +                                    nrnb,wcycle,FALSE);
 +            }
 +        }
 +        
 +        bFirstStep = FALSE;
 +        bInitStep = FALSE;
 +        bStartingFromCpt = FALSE;
 +
 +        /* #######  SET VARIABLES FOR NEXT ITERATION IF THEY STILL NEED IT ###### */
 +        /* With all integrators, except VV, we need to retain the pressure
 +         * at the current step for coupling at the next step.
 +         */
 +        if ((state->flags & (1<<estPRES_PREV)) &&
 +            (bGStatEveryStep ||
 +             (ir->nstpcouple > 0 && step % ir->nstpcouple == 0)))
 +        {
 +            /* Store the pressure in t_state for pressure coupling
 +             * at the next MD step.
 +             */
 +            copy_mat(pres,state->pres_prev);
 +        }
 +        
 +        /* #######  END SET VARIABLES FOR NEXT ITERATION ###### */
 +
 +        if ( (membed!=NULL) && (!bLastStep) )
 +        {
 +            rescale_membed(step_rel,membed,state_global->x);
 +        }
 +
 +        if (bRerunMD) 
 +        {
 +            if (MASTER(cr))
 +            {
 +                /* read next frame from input trajectory */
 +                bNotLastFrame = read_next_frame(oenv,status,&rerun_fr);
 +            }
 +
 +            if (PAR(cr))
 +            {
 +                rerun_parallel_comm(cr,&rerun_fr,&bNotLastFrame);
 +            }
 +        }
 +        
 +        if (!bRerunMD || !rerun_fr.bStep)
 +        {
 +            /* increase the MD step number */
 +            step++;
 +            step_rel++;
 +        }
 +        
 +        cycles = wallcycle_stop(wcycle,ewcSTEP);
 +        if (DOMAINDECOMP(cr) && wcycle)
 +        {
 +            dd_cycles_add(cr->dd,cycles,ddCyclStep);
 +        }
 +
 +        if (bPMETuneRunning || bPMETuneTry)
 +        {
 +            /* PME grid + cut-off optimization with GPUs or PME nodes */
 +
 +            /* Count the total cycles over the last steps */
 +            cycles_pmes += cycles;
 +
 +            /* We can only switch cut-off at NS steps */
 +            if (step % ir->nstlist == 0)
 +            {
 +                /* PME grid + cut-off optimization with GPUs or PME nodes */
 +                if (bPMETuneTry)
 +                {
 +                    if (DDMASTER(cr->dd))
 +                    {
 +                        /* PME node load is too high, start tuning */
 +                        bPMETuneRunning = (dd_pme_f_ratio(cr->dd) >= 1.05);
 +                    }
 +                    dd_bcast(cr->dd,sizeof(gmx_bool),&bPMETuneRunning);
 +
 +                    if (bPMETuneRunning || step_rel > ir->nstlist*50)
 +                    {
 +                        bPMETuneTry     = FALSE;
 +                    }
 +                }
 +                if (bPMETuneRunning)
 +                {
 +                    /* init_step might not be a multiple of nstlist,
 +                     * but the first cycle is always skipped anyhow.
 +                     */
 +                    bPMETuneRunning =
 +                        pme_load_balance(pme_loadbal,cr,
 +                                         (bVerbose && MASTER(cr)) ? stderr : NULL,
 +                                         fplog,
 +                                         ir,state,cycles_pmes,
 +                                         fr->ic,fr->nbv,&fr->pmedata,
 +                                         step);
 +
 +                    /* Update constants in forcerec/inputrec to keep them in sync with fr->ic */
 +                    fr->ewaldcoeff = fr->ic->ewaldcoeff;
 +                    fr->rlist      = fr->ic->rlist;
 +                    fr->rlistlong  = fr->ic->rlistlong;
 +                    fr->rcoulomb   = fr->ic->rcoulomb;
 +                    fr->rvdw       = fr->ic->rvdw;
 +                }
 +                cycles_pmes = 0;
 +            }
 +        }
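
The tuning logic above accumulates per-step cycle counts and only acts on them at neighbor-search boundaries, because the cut-off can only change when the pair list is rebuilt. A toy sketch of that accumulate-then-decide window (names and numbers invented; this is not the pme_load_balance API):

    #include <stdio.h>

    int main(void)
    {
        long   step, nstlist = 10;   /* invented neighbor-search interval */
        double cycles_acc = 0.0;

        for (step = 1; step <= 50; step++)
        {
            cycles_acc += 1000.0 + 5.0*step;  /* stand-in per-step cycle count */
            if (step % nstlist == 0)          /* decide only at NS boundaries  */
            {
                printf("step %ld: %.0f cycles in last window\n", step, cycles_acc);
                cycles_acc = 0.0;             /* start the next window */
            }
        }
        return 0;
    }
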
 +
 +        if (step_rel == wcycle_get_reset_counters(wcycle) ||
 +            gs.set[eglsRESETCOUNTERS] != 0)
 +        {
 +            /* Reset all the counters related to performance over the run */
 +            reset_all_counters(fplog,cr,step,&step_rel,ir,wcycle,nrnb,runtime,
 +                               fr->nbv != NULL && fr->nbv->bUseGPU ? fr->nbv->cu_nbv : NULL);
 +            wcycle_set_reset_counters(wcycle,-1);
 +            /* Correct max_hours for the elapsed time */
 +            max_hours -= run_time/(60.0*60.0);
 +            bResetCountersHalfMaxH = FALSE;
 +            gs.set[eglsRESETCOUNTERS] = 0;
 +        }
 +
 +    }
 +    /* End of main MD loop */
 +    debug_gmx();
 +    
 +    /* Stop the time */
 +    runtime_end(runtime);
 +    
 +    if (bRerunMD && MASTER(cr))
 +    {
 +        close_trj(status);
 +    }
 +    
 +    if (!(cr->duty & DUTY_PME))
 +    {
 +        /* Tell the PME only node to finish */
 +        gmx_pme_send_finish(cr);
 +    }
 +    
 +    if (MASTER(cr))
 +    {
 +        if (ir->nstcalcenergy > 0 && !bRerunMD) 
 +        {
 +            print_ebin(outf->fp_ene,FALSE,FALSE,FALSE,fplog,step,t,
 +                       eprAVER,FALSE,mdebin,fcd,groups,&(ir->opts));
 +        }
 +    }
 +
 +    done_mdoutf(outf);
 +
 +    debug_gmx();
 +
 +    if (ir->nstlist == -1 && nlh.nns > 0 && fplog)
 +    {
 +        fprintf(fplog,"Average neighborlist lifetime: %.1f steps, std.dev.: %.1f steps\n",nlh.s1/nlh.nns,sqrt(nlh.s2/nlh.nns - sqr(nlh.s1/nlh.nns)));
 +        fprintf(fplog,"Average number of atoms that crossed the half buffer length: %.1f\n\n",nlh.ab/nlh.nns);
 +    }
 +
 +    if (pme_loadbal != NULL)
 +    {
 +        pme_loadbal_done(pme_loadbal,fplog);
 +    }
 +
 +    if (shellfc && fplog)
 +    {
 +        fprintf(fplog,"Fraction of iterations that converged:           %.2f %%\n",
 +                (nconverged*100.0)/step_rel);
 +        fprintf(fplog,"Average number of force evaluations per MD step: %.2f\n\n",
 +                tcount/step_rel);
 +    }
 +    
 +    if (repl_ex_nst > 0 && MASTER(cr))
 +    {
 +        print_replica_exchange_statistics(fplog,repl_ex);
 +    }
 +    
 +    runtime->nsteps_done = step_rel;
 +
 +    return 0;
 +}
index 43f9d0112c5ccda13a191df842213a561f784b03,0000000000000000000000000000000000000000..c801a2f684353fc2b231d2da4df1420941dd408b
mode 100644,000000..100644
--- /dev/null
@@@ -1,808 -1,0 +1,740 @@@
-  #ifdef GMX_OPENMM
-     "This is an experimental release of GROMACS for accelerated",
-       "Molecular Dynamics simulations on GPU processors. Support is provided",
-       "by the OpenMM library (https://simtk.org/home/openmm).[PAR]",
-       "*Warning*[BR]",
-       "This release is targeted at developers and advanced users and",
-       "care should be taken before production use. The following should be",
-       "noted before using the program:[PAR]",
-       " * The current release runs only on modern nVidia GPU hardware with CUDA support.",
-       "Make sure that the necessary CUDA drivers and libraries for your operating system",
-       "are already installed. The CUDA SDK also should be installed in order to compile",
-       "the program from source (http://www.nvidia.com/object/cuda_home.html).[PAR]",
-       " * Multiple GPU cards are not supported.[PAR]",
-       " * Only a small subset of the GROMACS features and options are supported on the GPUs.",
-       "See below for a detailed list.[PAR]",
-       " * Consumer level GPU cards are known to often have problems with faulty memory.",
-       "It is recommended that a full memory check of the cards is done at least once",
-       "(for example, using the memtest=full option).",
-       "A partial memory check (for example, memtest=15) before and",
-       "after the simulation run would help spot",
-       "problems resulting from processor overheating.[PAR]",
-       " * The maximum size of the simulated systems depends on the available",
-       "GPU memory,for example, a GTX280 with 1GB memory has been tested with systems",
-       "of up to about 100,000 atoms.[PAR]",
-       " * In order to take a full advantage of the GPU platform features, many algorithms",
-       "have been implemented in a very different way than they are on the CPUs.",
-       "Therefore numercal correspondence between properties of the state of",
-       "simulated systems should not be expected. Moreover, the values will likely vary",
-       "when simulations are done on different GPU hardware.[PAR]",
-       " * Frequent retrieval of system state information such as",
-       "trajectory coordinates and energies can greatly influence the performance",
-       "of the program due to slow CPU<->GPU memory transfer speed.[PAR]",
-       " * MD algorithms are complex, and although the Gromacs code is highly tuned for them,",
-       "they often do not translate very well onto the streaming architetures.",
-       "Realistic expectations about the achievable speed-up from test with GTX280:",
-       "For small protein systems in implicit solvent using all-vs-all kernels the acceleration",
-       "can be as high as 20 times, but in most other setups involving cutoffs and PME the",
-       "acceleration is usually only ~4 times relative to a 3GHz CPU.[PAR]",
-       "Supported features:[PAR]",
-       " * Integrators: md/md-vv/md-vv-avek, sd/sd1 and bd.\n",
-       " * Long-range interactions (option coulombtype): Reaction-Field, Ewald, PME, and cut-off (for Implicit Solvent only)\n",
-       " * Temperature control: Supported only with the md/md-vv/md-vv-avek, sd/sd1 and bd integrators.\n",
-       " * Pressure control: Supported.\n",
-       " * Implicit solvent: Supported.\n",
-       "A detailed description can be found on the GROMACS website:\n",
-       "http://www.gromacs.org/gpu[PAR]",
- /* From the original mdrun documentaion */
-     "The [TT]mdrun[tt] program reads the run input file ([TT]-s[tt])",
-     "and distributes the topology over nodes if needed.",
-     "[TT]mdrun[tt] produces at least four output files.",
-     "A single log file ([TT]-g[tt]) is written, unless the option",
-     "[TT]-seppot[tt] is used, in which case each node writes a log file.",
-     "The trajectory file ([TT]-o[tt]), contains coordinates, velocities and",
-     "optionally forces.",
-     "The structure file ([TT]-c[tt]) contains the coordinates and",
-     "velocities of the last step.",
-     "The energy file ([TT]-e[tt]) contains energies, the temperature,",
-     "pressure, etc, a lot of these things are also printed in the log file.",
-     "Optionally coordinates can be written to a compressed trajectory file",
-     "([TT]-x[tt]).[PAR]",
- /* openmm specific information */
-       "Usage with OpenMM:[BR]",
-       "[TT]mdrun -device \"OpenMM:platform=Cuda,memtest=15,deviceid=0,force-device=no\"[tt][PAR]",
-       "Options:[PAR]",
-       "      [TT]platform[tt] = Cuda\t\t:\tThe only available value. OpenCL support will be available in future.\n",
-       "      [TT]memtest[tt] = 15\t\t:\tRun a partial, random GPU memory test for the given amount of seconds. A full test",
-       "(recommended!) can be run with \"memtest=full\". Memory testing can be disabled with \"memtest=off\".\n",
-       "      [TT]deviceid[tt] = 0\t\t:\tSpecify the target device when multiple cards are present.",
-       "Only one card can be used at any given time though.\n",
-       "      [TT]force-device[tt] = no\t\t:\tIf set to \"yes\" [TT]mdrun[tt]  will be forced to execute on",
-       "hardware that is not officially supported. GPU acceleration can also be achieved on older",
-       "but Cuda capable cards, although the simulation might be too slow, and the memory limits too strict.",
- #else
 +/*  -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
 + *
 + * 
 + *                This source code is part of
 + * 
 + *                 G   R   O   M   A   C   S
 + * 
 + *          GROningen MAchine for Chemical Simulations
 + * 
 + *                        VERSION 3.2.0
 + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
 + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 + * Copyright (c) 2001-2004, The GROMACS development team,
 + * check out http://www.gromacs.org for more information.
 +
 + * This program is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU General Public License
 + * as published by the Free Software Foundation; either version 2
 + * of the License, or (at your option) any later version.
 + * 
 + * If you want to redistribute modifications, please consider that
 + * scientific software is very special. Version control is crucial -
 + * bugs must be traceable. We will be happy to consider code for
 + * inclusion in the official distribution, but derived work must not
 + * be called official GROMACS. Details are found in the README & COPYING
 + * files - if they are missing, get the official version at www.gromacs.org.
 + * 
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the papers on the package - you can find them in the top README file.
 + * 
 + * For more info, check our website at http://www.gromacs.org
 + * 
 + * And Hey:
 + * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon
 + */
 +#ifdef HAVE_CONFIG_H
 +#include <config.h>
 +#endif
 +
 +#include "typedefs.h"
 +#include "macros.h"
 +#include "copyrite.h"
 +#include "main.h"
 +#include "statutil.h"
 +#include "smalloc.h"
 +#include "futil.h"
 +#include "smalloc.h"
 +#include "edsam.h"
 +#include "mdrun.h"
 +#include "xmdrun.h"
 +#include "checkpoint.h"
 +#ifdef GMX_THREAD_MPI
 +#include "thread_mpi.h"
 +#endif
 +
 +/* afm stuff */
 +#include "pull.h"
 +
 +int cmain(int argc,char *argv[])
 +{
 +  const char *desc[] = {
-     "ED (essential dynamics) sampling is switched on by using the [TT]-ei[tt]",
-     "flag followed by an [TT].edi[tt] file.",
-     "The [TT].edi[tt] file can be produced using options in the essdyn",
-     "menu of the WHAT IF program. [TT]mdrun[tt] produces a [TT].xvg[tt] output file that",
 +    "The [TT]mdrun[tt] program is the main computational chemistry engine",
 +    "within GROMACS. Obviously, it performs Molecular Dynamics simulations,",
 +    "but it can also perform Stochastic Dynamics, Energy Minimization,",
 +    "test particle insertion or (re)calculation of energies.",
 +    "Normal mode analysis is another option. In this case [TT]mdrun[tt]",
 +    "builds a Hessian matrix from single conformation.",
 +    "For usual Normal Modes-like calculations, make sure that",
 +    "the structure provided is properly energy-minimized.",
 +    "The generated matrix can be diagonalized by [TT]g_nmeig[tt].[PAR]",
 +    "The [TT]mdrun[tt] program reads the run input file ([TT]-s[tt])",
 +    "and distributes the topology over nodes if needed.",
 +    "[TT]mdrun[tt] produces at least four output files.",
 +    "A single log file ([TT]-g[tt]) is written, unless the option",
 +    "[TT]-seppot[tt] is used, in which case each node writes a log file.",
 +    "The trajectory file ([TT]-o[tt]), contains coordinates, velocities and",
 +    "optionally forces.",
 +    "The structure file ([TT]-c[tt]) contains the coordinates and",
 +    "velocities of the last step.",
 +    "The energy file ([TT]-e[tt]) contains energies, the temperature,",
 +    "pressure, etc, a lot of these things are also printed in the log file.",
 +    "Optionally coordinates can be written to a compressed trajectory file",
 +    "([TT]-x[tt]).[PAR]",
 +    "The option [TT]-dhdl[tt] is only used when free energy calculation is",
 +    "turned on.[PAR]",
 +    "A simulation can be run in parallel using two different parallelization",
 +    "schemes: MPI parallelization and/or OpenMP thread parallelization.",
 +    "The MPI parallelization uses multiple processes when [TT]mdrun[tt] is",
 +    "compiled with a normal MPI library or threads when [TT]mdrun[tt] is",
 +    "compiled with the GROMACS built-in thread-MPI library. OpenMP threads",
 +    "are supported when mdrun is compiled with OpenMP. Full OpenMP support",
 +    "is only available with the Verlet cut-off scheme, with the (older)",
 +    "group scheme only PME-only processes can use OpenMP parallelization.",
 +    "In all cases [TT]mdrun[tt] will by default try to use all the available",
 +    "hardware resources. With a normal MPI library only the options",
 +    "[TT]-ntomp[tt] (with the Verlet cut-off scheme) and [TT]-ntomp_pme[tt],",
 +    "for PME-only processes, can be used to control the number of threads.",
 +    "With thread-MPI there are additional options [TT]-nt[tt], which sets",
 +    "the total number of threads, and [TT]-ntmpi[tt], which sets the number",
 +    "of thread-MPI threads.",
 +    "Note that using combined MPI+OpenMP parallelization is almost always",
 +    "slower than single parallelization, except at the scaling limit, where",
 +    "especially OpenMP parallelization of PME reduces the communication cost.",
 +    "OpenMP-only parallelization is much faster than MPI-only parallelization",
 +    "on a single CPU(-die). Since we currently don't have proper hardware",
 +    "topology detection, [TT]mdrun[tt] compiled with thread-MPI will only",
 +    "automatically use OpenMP-only parallelization when you use up to 4",
 +    "threads, up to 12 threads with Intel Nehalem/Westmere, or up to 16",
 +    "threads with Intel Sandy Bridge or newer CPUs. Otherwise MPI-only",
 +    "parallelization is used (except with GPUs, see below).",
 +    "[PAR]",
 +    "To quickly test the performance of the new Verlet cut-off scheme",
 +    "with old [TT].tpr[tt] files, either on CPUs or CPUs+GPUs, you can use",
 +    "the [TT]-testverlet[tt] option. This should not be used for production,",
 +    "since it can slightly modify potentials and it will remove charge groups",
 +    "making analysis difficult, as the [TT].tpr[tt] file will still contain",
 +    "charge groups. For production simulations it is highly recommended",
 +    "to specify [TT]cutoff-scheme = Verlet[tt] in the [TT].mdp[tt] file.",
 +    "[PAR]",
 +    "With GPUs (only supported with the Verlet cut-off scheme), the number",
 +    "of GPUs should match the number of MPI processes or MPI threads,",
 +    "excluding PME-only processes/threads. With thread-MPI the number",
 +    "of MPI threads will automatically be set to the number of GPUs detected.",
 +    "When you want to use a subset of the available GPUs, you can use",
 +    "the [TT]-gpu_id[tt] option, where GPU id's are passed as a string,",
 +    "e.g. 02 for using GPUs 0 and 2. When you want different GPU id's",
 +    "on different nodes of a compute cluster, use the GMX_GPU_ID environment",
 +    "variable instead. The format for GMX_GPU_ID is identical to ",
 +    "[TT]-gpu_id[tt], but an environment variable can have different values",
 +    "on different nodes of a cluster.",
 +    "[PAR]",
 +    "When using PME with separate PME nodes or with a GPU, the two major",
 +    "compute tasks, the non-bonded force calculation and the PME calculation",
 +    "run on different compute resources. If this load is not balanced,",
 +    "some of the resources will be idle part of time. With the Verlet",
 +    "cut-off scheme this load is automatically balanced when the PME load",
 +    "is too high (but not when it is too low). This is done by scaling",
 +    "the Coulomb cut-off and PME grid spacing by the same amount. In the first",
 +    "few hundred steps different settings are tried and the fastest is chosen",
 +    "for the rest of the simulation. This does not affect the accuracy of",
 +    "the results, but it does affect the decomposition of the Coulomb energy",
 +    "into particle and mesh contributions. The auto-tuning can be turned off",
 +    "with the option [TT]-notunepme[tt].",
 +    "[PAR]",
 +    "When compiled with OpenMP on Linux, [TT]mdrun[tt] pins threads to cores,",
 +    "as this usually results in significantly better performance.",
 +    "If you don't want this, use [TT]-nopin[tt].",
 +    "With Intel CPUs with hyper-threading enabled, you should pin",
 +    "consecutive threads to the same physical core for optimal",
 +    "performance when you use virtual cores. This is done automatically",
 +    "when you use more than half of the virtual cores. It can also be set",
 +    "manually with [TT]-pinht[tt], e.g. for running multiple simulations",
 +    "on one compute node.",
 +    "When running multiple mdrun (or other) simulations on the same physical",
 +    "node, some simulations need to start pinning from a non-zero core",
 +    "to avoid overloading cores; with [TT]-pinoffset[tt] you can specify",
 +    "the offset in (physical) cores for pinning.",
 +    "[PAR]",
 +    "When [TT]mdrun[tt] is started using MPI with more than 1 process",
 +    "or with thread-MPI with more than 1 thread, MPI parallelization is used.",
 +    "By default domain decomposition is used, unless the [TT]-pd[tt]",
 +    "option is set, which selects particle decomposition.",
 +    "[PAR]",
 +    "With domain decomposition, the spatial decomposition can be set",
 +    "with option [TT]-dd[tt]. By default [TT]mdrun[tt] selects a good decomposition.",
 +    "The user only needs to change this when the system is very inhomogeneous.",
 +    "Dynamic load balancing is set with the option [TT]-dlb[tt],",
 +    "which can give a significant performance improvement,",
 +    "especially for inhomogeneous systems. The only disadvantage of",
 +    "dynamic load balancing is that runs are no longer binary reproducible,",
 +    "but in most cases this is not important.",
 +    "By default the dynamic load balancing is automatically turned on",
 +    "when the measured performance loss due to load imbalance is 5% or more.",
 +    "At low parallelization these are the only important options",
 +    "for domain decomposition.",
 +    "At high parallelization the options in the next two sections",
 +    "could be important for increasing the performace.",
 +    "[PAR]",
 +    "When PME is used with domain decomposition, separate nodes can",
 +    "be assigned to do only the PME mesh calculation;",
 +    "this is computationally more efficient starting at about 12 nodes.",
 +    "The number of PME nodes is set with option [TT]-npme[tt],",
 +    "this can not be more than half of the nodes.",
 +    "By default [TT]mdrun[tt] makes a guess for the number of PME",
 +    "nodes when the number of nodes is larger than 11 or performance wise",
 +    "not compatible with the PME grid x dimension.",
 +    "But the user should optimize npme. Performance statistics on this issue",
 +    "are written at the end of the log file.",
 +    "For good load balancing at high parallelization, the PME grid x and y",
 +    "dimensions should be divisible by the number of PME nodes",
 +    "(the simulation will run correctly also when this is not the case).",
 +    "[PAR]",
 +    "This section lists all options that affect the domain decomposition.",
 +    "[PAR]",
 +    "Option [TT]-rdd[tt] can be used to set the required maximum distance",
 +    "for inter charge-group bonded interactions.",
 +    "Communication for two-body bonded interactions below the non-bonded",
 +    "cut-off distance always comes for free with the non-bonded communication.",
 +    "Atoms beyond the non-bonded cut-off are only communicated when they have",
 +    "missing bonded interactions; this means that the extra cost is minor",
 +    "and nearly indepedent of the value of [TT]-rdd[tt].",
 +    "With dynamic load balancing option [TT]-rdd[tt] also sets",
 +    "the lower limit for the domain decomposition cell sizes.",
 +    "By default [TT]-rdd[tt] is determined by [TT]mdrun[tt] based on",
 +    "the initial coordinates. The chosen value will be a balance",
 +    "between interaction range and communication cost.",
 +    "[PAR]",
 +    "When inter charge-group bonded interactions are beyond",
 +    "the bonded cut-off distance, [TT]mdrun[tt] terminates with an error message.",
 +    "For pair interactions and tabulated bonds",
 +    "that do not generate exclusions, this check can be turned off",
 +    "with the option [TT]-noddcheck[tt].",
 +    "[PAR]",
 +    "When constraints are present, option [TT]-rcon[tt] influences",
 +    "the cell size limit as well.",
 +    "Atoms connected by NC constraints, where NC is the LINCS order plus 1,",
 +    "should not be beyond the smallest cell size. A error message is",
 +    "generated when this happens and the user should change the decomposition",
 +    "or decrease the LINCS order and increase the number of LINCS iterations.",
 +    "By default [TT]mdrun[tt] estimates the minimum cell size required for P-LINCS",
 +    "in a conservative fashion. For high parallelization it can be useful",
 +    "to set the distance required for P-LINCS with the option [TT]-rcon[tt].",
 +    "[PAR]",
 +    "The [TT]-dds[tt] option sets the minimum allowed x, y and/or z scaling",
 +    "of the cells with dynamic load balancing. [TT]mdrun[tt] will ensure that",
 +    "the cells can scale down by at least this factor. This option is used",
 +    "for the automated spatial decomposition (when not using [TT]-dd[tt])",
 +    "as well as for determining the number of grid pulses, which in turn",
 +    "sets the minimum allowed cell size. Under certain circumstances",
 +    "the value of [TT]-dds[tt] might need to be adjusted to account for",
 +    "high or low spatial inhomogeneity of the system.",
 +    "[PAR]",
 +    "The option [TT]-gcom[tt] can be used to only do global communication",
 +    "every n steps.",
 +    "This can improve performance for highly parallel simulations",
 +    "where this global communication step becomes the bottleneck.",
 +    "For a global thermostat and/or barostat the temperature",
 +    "and/or pressure will also only be updated every [TT]-gcom[tt] steps.",
 +    "By default it is set to the minimum of nstcalcenergy and nstlist.[PAR]",
 +    "With [TT]-rerun[tt] an input trajectory can be given for which ",
 +    "forces and energies will be (re)calculated. Neighbor searching will be",
 +    "performed for every frame, unless [TT]nstlist[tt] is zero",
 +    "(see the [TT].mdp[tt] file).[PAR]",
- #endif
++    "ED (essential dynamics) sampling and/or additional flooding potentials",
++    "are switched on by using the [TT]-ei[tt] flag followed by an [TT].edi[tt]",
++    "file. The [TT].edi[tt] file can be produced with the [TT]make_edi[tt] tool",
++    "or by using options in the essdyn menu of the WHAT IF program.",
++    "[TT]mdrun[tt] produces a [TT].xvg[tt] output file that",
 +    "contains projections of positions, velocities and forces onto selected",
 +    "eigenvectors.[PAR]",
 +    "When user-defined potential functions have been selected in the",
 +    "[TT].mdp[tt] file the [TT]-table[tt] option is used to pass [TT]mdrun[tt]",
 +    "a formatted table with potential functions. The file is read from",
 +    "either the current directory or from the [TT]GMXLIB[tt] directory.",
 +    "A number of pre-formatted tables are presented in the [TT]GMXLIB[tt] dir,",
 +    "for 6-8, 6-9, 6-10, 6-11, 6-12 Lennard-Jones potentials with",
 +    "normal Coulomb.",
 +    "When pair interactions are present, a separate table for pair interaction",
 +    "functions is read using the [TT]-tablep[tt] option.[PAR]",
 +    "When tabulated bonded functions are present in the topology,",
 +    "interaction functions are read using the [TT]-tableb[tt] option.",
 +    "For each different tabulated interaction type the table file name is",
 +    "modified in a different way: before the file extension an underscore is",
 +    "appended, then a 'b' for bonds, an 'a' for angles or a 'd' for dihedrals",
 +    "and finally the table number of the interaction type.[PAR]",
 +    "The options [TT]-px[tt] and [TT]-pf[tt] are used for writing pull COM",
 +    "coordinates and forces when pulling is selected",
 +    "in the [TT].mdp[tt] file.[PAR]",
 +    "With [TT]-multi[tt] or [TT]-multidir[tt], multiple systems can be ",
 +    "simulated in parallel.",
 +    "As many input files/directories are required as the number of systems. ",
 +    "The [TT]-multidir[tt] option takes a list of directories (one for each ",
 +    "system) and runs in each of them, using the input/output file names, ",
 +    "such as specified by e.g. the [TT]-s[tt] option, relative to these ",
 +    "directories.",
 +    "With [TT]-multi[tt], the system number is appended to the run input ",
 +    "and each output filename, for instance [TT]topol.tpr[tt] becomes",
 +    "[TT]topol0.tpr[tt], [TT]topol1.tpr[tt] etc.",
 +    "The number of nodes per system is the total number of nodes",
 +    "divided by the number of systems.",
 +    "One use of this option is for NMR refinement: when distance",
 +    "or orientation restraints are present these can be ensemble averaged",
 +    "over all the systems.[PAR]",
 +    "With [TT]-replex[tt] replica exchange is attempted every given number",
 +    "of steps. The number of replicas is set with the [TT]-multi[tt] or ",
 +    "[TT]-multidir[tt] option, described above.",
 +    "All run input files should use a different coupling temperature,",
 +    "the order of the files is not important. The random seed is set with",
 +    "[TT]-reseed[tt]. The velocities are scaled and neighbor searching",
 +    "is performed after every exchange.[PAR]",
 +    "Finally some experimental algorithms can be tested when the",
 +    "appropriate options have been given. Currently under",
 +    "investigation are: polarizability and X-ray bombardments.",
 +    "[PAR]",
 +    "The option [TT]-membed[tt] does what used to be g_membed, i.e. embed",
 +    "a protein into a membrane. The data file should contain the options",
 +    "that where passed to g_membed before. The [TT]-mn[tt] and [TT]-mp[tt]",
 +    "both apply to this as well.",
 +    "[PAR]",
 +    "The option [TT]-pforce[tt] is useful when you suspect a simulation",
 +    "crashes due to too large forces. With this option coordinates and",
 +    "forces of atoms with a force larger than a certain value will",
 +    "be printed to stderr.",
 +    "[PAR]",
 +    "Checkpoints containing the complete state of the system are written",
 +    "at regular intervals (option [TT]-cpt[tt]) to the file [TT]-cpo[tt],",
 +    "unless option [TT]-cpt[tt] is set to -1.",
 +    "The previous checkpoint is backed up to [TT]state_prev.cpt[tt] to",
 +    "make sure that a recent state of the system is always available,",
 +    "even when the simulation is terminated while writing a checkpoint.",
 +    "With [TT]-cpnum[tt] all checkpoint files are kept and appended",
 +    "with the step number.",
 +    "A simulation can be continued by reading the full state from file",
 +    "with option [TT]-cpi[tt]. This option is intelligent in the way that",
 +    "if no checkpoint file is found, Gromacs just assumes a normal run and",
 +    "starts from the first step of the [TT].tpr[tt] file. By default the output",
 +    "will be appending to the existing output files. The checkpoint file",
 +    "contains checksums of all output files, such that you will never",
 +    "loose data when some output files are modified, corrupt or removed.",
 +    "There are three scenarios with [TT]-cpi[tt]:[PAR]",
 +    "[TT]*[tt] no files with matching names are present: new output files are written[PAR]",
 +    "[TT]*[tt] all files are present with names and checksums matching those stored",
 +    "in the checkpoint file: files are appended[PAR]",
 +    "[TT]*[tt] otherwise no files are modified and a fatal error is generated[PAR]",
 +    "With [TT]-noappend[tt] new output files are opened and the simulation",
 +    "part number is added to all output file names.",
 +    "Note that in all cases the checkpoint file itself is not renamed",
 +    "and will be overwritten, unless its name does not match",
 +    "the [TT]-cpo[tt] option.",
 +    "[PAR]",
 +    "With checkpointing the output is appended to previously written",
 +    "output files, unless [TT]-noappend[tt] is used or none of the previous",
 +    "output files are present (except for the checkpoint file).",
 +    "The integrity of the files to be appended is verified using checksums",
 +    "which are stored in the checkpoint file. This ensures that output can",
 +    "not be mixed up or corrupted due to file appending. When only some",
 +    "of the previous output files are present, a fatal error is generated",
 +    "and no old output files are modified and no new output files are opened.",
 +    "The result with appending will be the same as from a single run.",
 +    "The contents will be binary identical, unless you use a different number",
 +    "of nodes or dynamic load balancing or the FFT library uses optimizations",
 +    "through timing.",
 +    "[PAR]",
 +    "With option [TT]-maxh[tt] a simulation is terminated and a checkpoint",
 +    "file is written at the first neighbor search step where the run time",
 +    "exceeds [TT]-maxh[tt]*0.99 hours.",
 +    "[PAR]",
 +    "When [TT]mdrun[tt] receives a TERM signal, it will set nsteps to the current",
 +    "step plus one. When [TT]mdrun[tt] receives an INT signal (e.g. when ctrl+C is",
 +    "pressed), it will stop after the next neighbor search step ",
 +    "(with nstlist=0 at the next step).",
 +    "In both cases all the usual output will be written to file.",
 +    "When running with MPI, a signal to one of the [TT]mdrun[tt] processes",
 +    "is sufficient, this signal should not be sent to mpirun or",
 +    "the [TT]mdrun[tt] process that is the parent of the others.",
 +    "[PAR]",
 +    "When [TT]mdrun[tt] is started with MPI, it does not run niced by default."
- #ifdef GMX_OPENMM
-     ,
-     { "-device",  FALSE, etSTR, {&deviceOptions},
-       "Device option string" }
- #endif
 +  };
 +  t_commrec    *cr;
 +  t_filenm fnm[] = {
 +    { efTPX, NULL,      NULL,       ffREAD },
 +    { efTRN, "-o",      NULL,       ffWRITE },
 +    { efXTC, "-x",      NULL,       ffOPTWR },
 +    { efCPT, "-cpi",    NULL,       ffOPTRD },
 +    { efCPT, "-cpo",    NULL,       ffOPTWR },
 +    { efSTO, "-c",      "confout",  ffWRITE },
 +    { efEDR, "-e",      "ener",     ffWRITE },
 +    { efLOG, "-g",      "md",       ffWRITE },
 +    { efXVG, "-dhdl",   "dhdl",     ffOPTWR },
 +    { efXVG, "-field",  "field",    ffOPTWR },
 +    { efXVG, "-table",  "table",    ffOPTRD },
 +    { efXVG, "-tabletf", "tabletf",    ffOPTRD },
 +    { efXVG, "-tablep", "tablep",   ffOPTRD },
 +    { efXVG, "-tableb", "table",    ffOPTRD },
 +    { efTRX, "-rerun",  "rerun",    ffOPTRD },
 +    { efXVG, "-tpi",    "tpi",      ffOPTWR },
 +    { efXVG, "-tpid",   "tpidist",  ffOPTWR },
 +    { efEDI, "-ei",     "sam",      ffOPTRD },
 +    { efXVG, "-eo",     "edsam",    ffOPTWR },
 +    { efGCT, "-j",      "wham",     ffOPTRD },
 +    { efGCT, "-jo",     "bam",      ffOPTWR },
 +    { efXVG, "-ffout",  "gct",      ffOPTWR },
 +    { efXVG, "-devout", "deviatie", ffOPTWR },
 +    { efXVG, "-runav",  "runaver",  ffOPTWR },
 +    { efXVG, "-px",     "pullx",    ffOPTWR },
 +    { efXVG, "-pf",     "pullf",    ffOPTWR },
 +    { efXVG, "-ro",     "rotation", ffOPTWR },
 +    { efLOG, "-ra",     "rotangles",ffOPTWR },
 +    { efLOG, "-rs",     "rotslabs", ffOPTWR },
 +    { efLOG, "-rt",     "rottorque",ffOPTWR },
 +    { efMTX, "-mtx",    "nm",       ffOPTWR },
 +    { efNDX, "-dn",     "dipole",   ffOPTWR },
 +    { efRND, "-multidir",NULL,      ffOPTRDMULT},
 +    { efDAT, "-membed", "membed",   ffOPTRD },
 +    { efTOP, "-mp",     "membed",   ffOPTRD },
 +    { efNDX, "-mn",     "membed",   ffOPTRD }
 +  };
 +#define NFILE asize(fnm)
 +
 +  /* Command line options */
 +  gmx_bool bCart        = FALSE;
 +  gmx_bool bPPPME       = FALSE;
 +  gmx_bool bPartDec     = FALSE;
 +  gmx_bool bDDBondCheck = TRUE;
 +  gmx_bool bDDBondComm  = TRUE;
 +  gmx_bool bTunePME     = TRUE;
 +  gmx_bool bTestVerlet  = FALSE;
 +  gmx_bool bVerbose     = FALSE;
 +  gmx_bool bCompact     = TRUE;
 +  gmx_bool bSepPot      = FALSE;
 +  gmx_bool bRerunVSite  = FALSE;
 +  gmx_bool bIonize      = FALSE;
 +  gmx_bool bConfout     = TRUE;
 +  gmx_bool bReproducible = FALSE;
 +    
 +  int  npme=-1;
 +  int  nmultisim=0;
 +  int  nstglobalcomm=-1;
 +  int  repl_ex_nst=0;
 +  int  repl_ex_seed=-1;
 +  int  repl_ex_nex=0;
 +  int  nstepout=100;
 +  int  resetstep=-1;
 +  int  nsteps=-2; /* the value -2 means that the mdp option will be used */
 +  
 +  rvec realddxyz={0,0,0};
 +  const char *ddno_opt[ddnoNR+1] =
 +    { NULL, "interleave", "pp_pme", "cartesian", NULL };
 +  const char *dddlb_opt[] =
 +    { NULL, "auto", "no", "yes", NULL };
 +  const char *nbpu_opt[] =
 +    { NULL, "auto", "cpu", "gpu", "gpu_cpu", NULL };
 +  real rdd=0.0,rconstr=0.0,dlb_scale=0.8,pforce=-1;
 +  char *ddcsx=NULL,*ddcsy=NULL,*ddcsz=NULL;
 +  real cpt_period=15.0,max_hours=-1;
 +  gmx_bool bAppendFiles=TRUE;
 +  gmx_bool bKeepAndNumCPT=FALSE;
 +  gmx_bool bResetCountersHalfWay=FALSE;
 +  output_env_t oenv=NULL;
 +  const char *deviceOptions = "";
 +
 +  gmx_hw_opt_t hw_opt={0,0,0,0,TRUE,FALSE,0,NULL};
 +
 +  t_pargs pa[] = {
 +
 +    { "-pd",      FALSE, etBOOL,{&bPartDec},
 +      "Use particle decompostion" },
 +    { "-dd",      FALSE, etRVEC,{&realddxyz},
 +      "Domain decomposition grid, 0 is optimize" },
 +    { "-ddorder", FALSE, etENUM, {ddno_opt},
 +      "DD node order" },
 +    { "-npme",    FALSE, etINT, {&npme},
 +      "Number of separate nodes to be used for PME, -1 is guess" },
 +    { "-nt",      FALSE, etINT, {&hw_opt.nthreads_tot},
 +      "Total number of threads to start (0 is guess)" },
 +    { "-ntmpi",   FALSE, etINT, {&hw_opt.nthreads_tmpi},
 +      "Number of thread-MPI threads to start (0 is guess)" },
 +    { "-ntomp",   FALSE, etINT, {&hw_opt.nthreads_omp},
 +      "Number of OpenMP threads per MPI process/thread to start (0 is guess)" },
 +    { "-ntomp_pme", FALSE, etINT, {&hw_opt.nthreads_omp_pme},
 +      "Number of OpenMP threads per MPI process/thread to start (0 is -ntomp)" },
 +    { "-pin",     FALSE, etBOOL, {&hw_opt.bThreadPinning},
 +      "Pin OpenMP threads to cores" },
 +    { "-pinht",   FALSE, etBOOL, {&hw_opt.bPinHyperthreading},
 +      "Always pin threads to Hyper-Threading cores" },
 +    { "-pinoffset", FALSE, etINT, {&hw_opt.core_pinning_offset},
 +      "Core offset for pinning (for running multiple mdrun processes on a single physical node)" },
 +    { "-gpu_id",  FALSE, etSTR, {&hw_opt.gpu_id},
 +      "List of GPU id's to use" },
 +    { "-ddcheck", FALSE, etBOOL, {&bDDBondCheck},
 +      "Check for all bonded interactions with DD" },
 +    { "-ddbondcomm", FALSE, etBOOL, {&bDDBondComm},
 +      "HIDDENUse special bonded atom communication when [TT]-rdd[tt] > cut-off" },
 +    { "-rdd",     FALSE, etREAL, {&rdd},
 +      "The maximum distance for bonded interactions with DD (nm), 0 is determine from initial coordinates" },
 +    { "-rcon",    FALSE, etREAL, {&rconstr},
 +      "Maximum distance for P-LINCS (nm), 0 is estimate" },
 +    { "-dlb",     FALSE, etENUM, {dddlb_opt},
 +      "Dynamic load balancing (with DD)" },
 +    { "-dds",     FALSE, etREAL, {&dlb_scale},
 +      "Minimum allowed dlb scaling of the DD cell size" },
 +    { "-ddcsx",   FALSE, etSTR, {&ddcsx},
 +      "HIDDENThe DD cell sizes in x" },
 +    { "-ddcsy",   FALSE, etSTR, {&ddcsy},
 +      "HIDDENThe DD cell sizes in y" },
 +    { "-ddcsz",   FALSE, etSTR, {&ddcsz},
 +      "HIDDENThe DD cell sizes in z" },
 +    { "-gcom",    FALSE, etINT,{&nstglobalcomm},
 +      "Global communication frequency" },
 +    { "-nb",      FALSE, etENUM, {&nbpu_opt},
 +      "Calculate non-bonded interactions on" },
 +    { "-tunepme", FALSE, etBOOL, {&bTunePME},  
 +      "Optimize PME load between PP/PME nodes or GPU/CPU" },
 +    { "-testverlet", FALSE, etBOOL, {&bTestVerlet},
 +      "Test the Verlet non-bonded scheme" },
 +    { "-v",       FALSE, etBOOL,{&bVerbose},  
 +      "Be loud and noisy" },
 +    { "-compact", FALSE, etBOOL,{&bCompact},  
 +      "Write a compact log file" },
 +    { "-seppot",  FALSE, etBOOL, {&bSepPot},
 +      "Write separate V and dVdl terms for each interaction type and node to the log file(s)" },
 +    { "-pforce",  FALSE, etREAL, {&pforce},
 +      "Print all forces larger than this (kJ/mol nm)" },
 +    { "-reprod",  FALSE, etBOOL,{&bReproducible},  
 +      "Try to avoid optimizations that affect binary reproducibility" },
 +    { "-cpt",     FALSE, etREAL, {&cpt_period},
 +      "Checkpoint interval (minutes)" },
 +    { "-cpnum",   FALSE, etBOOL, {&bKeepAndNumCPT},
 +      "Keep and number checkpoint files" },
 +    { "-append",  FALSE, etBOOL, {&bAppendFiles},
 +      "Append to previous output files when continuing from checkpoint instead of adding the simulation part number to all file names" },
 +    { "-nsteps",  FALSE, etINT, {&nsteps},
 +      "Run this number of steps, overrides .mdp file option" },
 +    { "-maxh",   FALSE, etREAL, {&max_hours},
 +      "Terminate after 0.99 times this time (hours)" },
 +    { "-multi",   FALSE, etINT,{&nmultisim}, 
 +      "Do multiple simulations in parallel" },
 +    { "-replex",  FALSE, etINT, {&repl_ex_nst}, 
 +      "Attempt replica exchange periodically with this period (steps)" },
 +    { "-nex",  FALSE, etINT, {&repl_ex_nex},
 +      "Number of random exchanges to carry out each exchange interval (N^3 is one suggestion).  -nex zero or not specified gives neighbor replica exchange." },
 +    { "-reseed",  FALSE, etINT, {&repl_ex_seed}, 
 +      "Seed for replica exchange, -1 is generate a seed" },
 +    { "-rerunvsite", FALSE, etBOOL, {&bRerunVSite},
 +      "HIDDENRecalculate virtual site coordinates with [TT]-rerun[tt]" },
 +    { "-ionize",  FALSE, etBOOL,{&bIonize},
 +      "Do a simulation including the effect of an X-Ray bombardment on your system" },
 +    { "-confout", FALSE, etBOOL, {&bConfout},
 +      "HIDDENWrite the last configuration with [TT]-c[tt] and force checkpointing at the last step" },
 +    { "-stepout", FALSE, etINT, {&nstepout},
 +      "HIDDENFrequency of writing the remaining runtime" },
 +    { "-resetstep", FALSE, etINT, {&resetstep},
 +      "HIDDENReset cycle counters after these many time steps" },
 +    { "-resethway", FALSE, etBOOL, {&bResetCountersHalfWay},
 +      "HIDDENReset the cycle counters after half the number of steps or halfway [TT]-maxh[tt]" }
-   FILE     *fplog,*fptest;
 +  };
 +  gmx_edsam_t  ed;
 +  unsigned long Flags, PCA_Flags;
 +  ivec     ddxyz;
 +  int      dd_node_order;
 +  gmx_bool     bAddPart;
-       if (sim_part_fn==0 && MASTER(cr))
++  FILE     *fplog,*fpmulti;
 +  int      sim_part,sim_part_fn;
 +  const char *part_suffix=".part";
 +  char     suffix[STRLEN];
 +  int      rc;
 +  char **multidir=NULL;
 +
 +
 +  cr = init_par(&argc,&argv);
 +
 +  if (MASTER(cr))
 +    CopyRight(stderr, argv[0]);
 +
 +  PCA_Flags = (PCA_CAN_SET_DEFFNM | (MASTER(cr) ? 0 : PCA_QUIET));
 +  
 +  /* Comment this in to do fexist calls only on master
 +   * does not work with rerun or tables at the moment
 +   * also comment out the version of init_forcerec in md.c 
 +   * with NULL instead of opt2fn
 +   */
 +  /*
 +     if (!MASTER(cr))
 +     {
 +     PCA_Flags |= PCA_NOT_READ_NODE;
 +     }
 +     */
 +
 +  parse_common_args(&argc,argv,PCA_Flags, NFILE,fnm,asize(pa),pa,
 +                    asize(desc),desc,0,NULL, &oenv);
 +
 +
 +
 +  /* we set these early because they might be used in init_multisystem() 
 +     Note that there is the potential for npme>nnodes until the number of
 +     threads is set later on, if there's thread parallelization. That shouldn't
 +     lead to problems. */ 
 +  dd_node_order = nenum(ddno_opt);
 +  cr->npmenodes = npme;
 +
 +  /* now check the -multi and -multidir option */
 +  if (opt2bSet("-multidir", NFILE, fnm))
 +  {
 +      int i;
 +      if (nmultisim > 0)
 +      {
 +          gmx_fatal(FARGS, "mdrun -multi and -multidir options are mutually exclusive.");
 +      }
 +      nmultisim = opt2fns(&multidir, "-multidir", NFILE, fnm);
 +  }
 +
 +
 +  if (repl_ex_nst != 0 && nmultisim < 2)
 +      gmx_fatal(FARGS,"Need at least two replicas for replica exchange (option -multi)");
 +
 +  if (repl_ex_nex < 0)
 +      gmx_fatal(FARGS,"Replica exchange number of exchanges needs to be positive");
 +
 +  if (nmultisim > 1) {
 +#ifndef GMX_THREAD_MPI
 +    gmx_bool bParFn = (multidir == NULL);
 +    init_multisystem(cr, nmultisim, multidir, NFILE, fnm, bParFn);
 +#else
 +    gmx_fatal(FARGS,"mdrun -multi is not supported with the thread library.Please compile GROMACS with MPI support");
 +#endif
 +  }
 +
 +  bAddPart = !bAppendFiles;
 +
 +  /* Check if there is ANY checkpoint file available */       
 +  sim_part    = 1;
 +  sim_part_fn = sim_part;
 +  if (opt2bSet("-cpi",NFILE,fnm))
 +  {
 +      if (bSepPot && bAppendFiles)
 +      {
 +          gmx_fatal(FARGS,"Output file appending is not supported with -seppot");
 +      }
 +
 +      bAppendFiles =
 +                read_checkpoint_simulation_part(opt2fn_master("-cpi", NFILE,
 +                                                              fnm,cr),
 +                                                &sim_part_fn,NULL,cr,
 +                                                bAppendFiles,NFILE,fnm,
 +                                                part_suffix,&bAddPart);
-           check_multi_int(stdout,cr->ms,sim_part,"simulation part");
++      if (sim_part_fn==0 && MULTIMASTER(cr))
 +      {
 +          fprintf(stdout,"No previous checkpoint file present, assuming this is a new run.\n");
 +      }
 +      else
 +      {
 +          sim_part = sim_part_fn + 1;
 +      }
 +
 +      if (MULTISIM(cr) && MASTER(cr))
 +      {
-       if (MASTER(cr))
++          if (MULTIMASTER(cr))
++          {
++              /* Log file is not yet available, so if there's a
++               * problem we can only write to stderr. */
++              fpmulti = stderr;
++          }
++          else
++          {
++              fpmulti = NULL;
++          }
++          check_multi_int(fpmulti,cr->ms,sim_part,"simulation part",TRUE);
 +      }
 +  } 
 +  else
 +  {
 +      bAppendFiles = FALSE;
 +  }
 +
 +  if (!bAppendFiles)
 +  {
 +      sim_part_fn = sim_part;
 +  }
 +
 +  if (bAddPart)
 +  {
 +      /* Rename all output files (except checkpoint files) */
 +      /* create new part name first (zero-filled) */
 +      sprintf(suffix,"%s%04d",part_suffix,sim_part_fn);
 +
 +      add_suffix_to_output_names(fnm,NFILE,suffix);
++      if (MULTIMASTER(cr))
 +      {
 +          fprintf(stdout,"Checkpoint file is from part %d, new output files will be suffixed '%s'.\n",sim_part-1,suffix);
 +      }
 +  }
 +
 +  Flags = opt2bSet("-rerun",NFILE,fnm) ? MD_RERUN : 0;
 +  Flags = Flags | (bSepPot       ? MD_SEPPOT       : 0);
 +  Flags = Flags | (bIonize       ? MD_IONIZE       : 0);
 +  Flags = Flags | (bPartDec      ? MD_PARTDEC      : 0);
 +  Flags = Flags | (bDDBondCheck  ? MD_DDBONDCHECK  : 0);
 +  Flags = Flags | (bDDBondComm   ? MD_DDBONDCOMM   : 0);
 +  Flags = Flags | (bTunePME      ? MD_TUNEPME      : 0);
 +  Flags = Flags | (bTestVerlet   ? MD_TESTVERLET   : 0);
 +  Flags = Flags | (bConfout      ? MD_CONFOUT      : 0);
 +  Flags = Flags | (bRerunVSite   ? MD_RERUN_VSITE  : 0);
 +  Flags = Flags | (bReproducible ? MD_REPRODUCIBLE : 0);
 +  Flags = Flags | (bAppendFiles  ? MD_APPENDFILES  : 0); 
 +  Flags = Flags | (opt2parg_bSet("-append", asize(pa),pa) ? MD_APPENDFILESSET : 0); 
 +  Flags = Flags | (bKeepAndNumCPT ? MD_KEEPANDNUMCPT : 0); 
 +  Flags = Flags | (sim_part>1    ? MD_STARTFROMCPT : 0); 
 +  Flags = Flags | (bResetCountersHalfWay ? MD_RESETCOUNTERSHALFWAY : 0);
 +
 +
 +  /* We postpone opening the log file if we are appending, so we can 
 +     first truncate the old log file and append to the correct position 
 +     there instead.  */
 +  if ((MASTER(cr) || bSepPot) && !bAppendFiles) 
 +  {
 +      gmx_log_open(ftp2fn(efLOG,NFILE,fnm),cr,
 +                   !bSepPot,Flags & MD_APPENDFILES,&fplog);
 +      CopyRight(fplog,argv[0]);
 +      please_cite(fplog,"Hess2008b");
 +      please_cite(fplog,"Spoel2005a");
 +      please_cite(fplog,"Lindahl2001a");
 +      please_cite(fplog,"Berendsen95a");
 +  }
 +  else if (!MASTER(cr) && bSepPot)
 +  {
 +      gmx_log_open(ftp2fn(efLOG,NFILE,fnm),cr,!bSepPot,Flags,&fplog);
 +  }
 +  else
 +  {
 +      fplog = NULL;
 +  }
 +
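 +  /* round the real-valued -dd grid dimensions to the nearest integers */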
 +  ddxyz[XX] = (int)(realddxyz[XX] + 0.5);
 +  ddxyz[YY] = (int)(realddxyz[YY] + 0.5);
 +  ddxyz[ZZ] = (int)(realddxyz[ZZ] + 0.5);
 +
 +  rc = mdrunner(&hw_opt, fplog,cr,NFILE,fnm,oenv,bVerbose,bCompact,
 +                nstglobalcomm, ddxyz,dd_node_order,rdd,rconstr,
 +                dddlb_opt[0],dlb_scale,ddcsx,ddcsy,ddcsz,
 +                nbpu_opt[0],
 +                nsteps,nstepout,resetstep,
 +                nmultisim,repl_ex_nst,repl_ex_nex,repl_ex_seed,
 +                pforce, cpt_period,max_hours,deviceOptions,Flags);
 +
 +  gmx_finalize_par();
 +
 +  if (MULTIMASTER(cr)) {
 +      thanx(stderr);
 +  }
 +
 +  /* Log file has to be closed in mdrunner if we are appending to it 
 +     (fplog not set here) */
 +  if (MASTER(cr) && !bAppendFiles) 
 +  {
 +      gmx_log_close(fplog);
 +  }
 +
 +  return rc;
 +}
 +
index 7037c7aa8e0a7f634bef34917b70d225783b0b5f,0000000000000000000000000000000000000000..a079e16a6acf11933c38caf88604e43cd8d14b86
mode 100644,000000..100644
--- /dev/null
@@@ -1,1298 -1,0 +1,1294 @@@
- #ifdef GMX_OPENMM
-         gmx_input("Sorry, g_membed does not work with openmm.");
- #endif
 +/*
 + * $Id: mdrun.c,v 1.139.2.9 2009/05/04 16:13:29 hess Exp $
 + *
 + *                This source code is part of
 + *
 + *                 G   R   O   M   A   C   S
 + *
 + *          GROningen MAchine for Chemical Simulations
 + *
 + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
 + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 + * Copyright (c) 2001-2012, The GROMACS development team,
 + * check out http://www.gromacs.org for more information.
 + *
 + * This program is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU General Public License
 + * as published by the Free Software Foundation; either version 2
 + * of the License, or (at your option) any later version.
 + *
 + * If you want to redistribute modifications, please consider that
 + * scientific software is very special. Version control is crucial -
 + * bugs must be traceable. We will be happy to consider code for
 + * inclusion in the official distribution, but derived work must not
 + * be called official GROMACS. Details are found in the README & COPYING
 + * files - if they are missing, get the official version at www.gromacs.org.
 + *
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the papers on the package - you can find them in the top README file.
 + *
 + * For more info, check our website at http://www.gromacs.org
 + *
 + * And Hey:
 + * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon
 + */
 +#ifdef HAVE_CONFIG_H
 +#include <config.h>
 +#endif
 +
 +#include <signal.h>
 +#include <stdlib.h>
 +#include "typedefs.h"
 +#include "smalloc.h"
 +#include "sysstuff.h"
 +#include "vec.h"
 +#include "statutil.h"
 +#include "macros.h"
 +#include "copyrite.h"
 +#include "main.h"
 +#include "futil.h"
 +#include "edsam.h"
 +#include "index.h"
 +#include "physics.h"
 +#include "names.h"
 +#include "mtop_util.h"
 +#include "tpxio.h"
 +#include "string2.h"
 +#include "membed.h"
 +#include "pbc.h"
 +#include "readinp.h"
 +#include "readir.h"
 +
 +/* information about scaling center */
 +typedef struct {
 +    rvec    xmin;         /* smallest coordinates of all embedded molecules */
 +    rvec    xmax;         /* largest coordinates of all embedded molecules */
 +    rvec    *geom_cent;   /* scaling center of each independent molecule to embed */
 +    int     pieces;       /* number of molecules to embed independently */
 +    int     *nidx;        /* n atoms for every independent embedded molecule (index in subindex) */
 +    atom_id **subindex;   /* atomids for independent molecule *
 +                           * atoms of piece i run from subindex[i][0] to subindex[i][nidx[i]] */
 +} pos_ins_t;
 +
 +/* variables needed in do_md */
 +struct membed {
 +    int   it_xy;          /* number of iterations (steps) used to grow something in the xy-plane */
 +    int   it_z;           /* same, but for z */
 +    real  xy_step;        /* stepsize used during resize in xy-plane */
 +    real  z_step;         /* same, but in z */
 +    rvec  fac;            /* initial scaling of the molecule to grow into the membrane */
 +    rvec  *r_ins;         /* final coordinates of the molecule to grow  */
 +    pos_ins_t *pos_ins;   /* scaling center for each piece to embed */
 +};
 +
 +/* membrane related variables */
 +typedef struct {
 +    char      *name;     /* name of index group to embed molecule into (usually membrane) */
 +    t_block   mem_at;    /* list all atoms in membrane */
 +    int       nmol;      /* number of membrane molecules overlapping with the molecule to embed */
 +    int       *mol_id;   /* list of molecules in membrane that overlap with the molecule to embed */
 +    real      lip_area;  /* average area per lipid in membrane (only correct for homogeneous bilayers)*/
 +    real      zmin;      /* minimum z coordinate of membrane */
 +    real      zmax;      /* maximum z coordinate of membrane */
 +    real      zmed;      /* median z coordinate of membrane */
 +} mem_t;
 +
 +/* Lists all molecules in the membrane that overlap with the molecule to be embedded. *
 + * These will then be removed from the system */
 +typedef struct {
 +    int  nr;      /* number of molecules to remove */
 +    int  *mol;    /* list of molecule ids to remove */
 +    int  *block;  /* id of the molblock that the molecule to remove is part of */
 +} rm_t;
 +
 +/* Get the global molecule id, and the corresponding molecule type and id of the *
 + * molblock from the global atom nr. */
 +static int get_mol_id(int at, gmx_mtop_t  *mtop, int *type, int *block)
 +{
 +    int mol_id=0;
 +    int i;
 +    int atnr_mol;
 +    gmx_mtop_atomlookup_t alook;
 +
 +    alook = gmx_mtop_atomlookup_settle_init(mtop);
 +    gmx_mtop_atomnr_to_molblock_ind(alook,at,block,&mol_id,&atnr_mol);
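 +    /* the molecule index from the lookup is local to its molblock; adding the */
 +    /* molecule counts of all preceding molblocks below makes it global        */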
 +    for(i=0;i<*block;i++)
 +    {
 +        mol_id += mtop->molblock[i].nmol;
 +    }
 +    *type = mtop->molblock[*block].type;
 +
 +    gmx_mtop_atomlookup_destroy(alook);
 +
 +    return mol_id;
 +}
 +
 +/* Get the id of the molblock from a global molecule id */
 +static int get_molblock(int mol_id,int nmblock,gmx_molblock_t *mblock)
 +{
 +    int i;
 +    int nmol=0;
 +
 +    for(i=0;i<nmblock;i++)
 +    {
 +        nmol+=mblock[i].nmol;
 +        if(mol_id<nmol)
 +        {
 +            return i;
 +        }
 +    }
 +
 +    gmx_fatal(FARGS,"mol_id %d larger than total number of molecules %d.\n",mol_id,nmol);
 +
 +    return -1;
 +}
 +
 +static int get_tpr_version(const char *infile)
 +{
 +    t_tpxheader  header;
 +    int          version,generation;
 +
 +    read_tpxheader(infile,&header,TRUE,&version,&generation);
 +
 +    return version;
 +}
 +
 +/* Get a list of all the molecule types that are present in a group of atoms. *
 + * Because all interaction within the group to embed are removed on the topology *
 + * level, if the same molecule type is found in another part of the system, these *
 + * would also be affected. Therefore we have to check if the embedded and rest group *
 + * share common molecule types. If so, membed will stop with an error. */
 +static int get_mtype_list(t_block *at, gmx_mtop_t *mtop, t_block *tlist)
 +{
 +    int i,j,nr,mol_id;
 +    int type=0,block=0;
 +    gmx_bool bNEW;
 +
 +    nr=0;
 +    snew(tlist->index,at->nr);
 +    for (i=0;i<at->nr;i++)
 +    {
 +        bNEW=TRUE;
 +        mol_id = get_mol_id(at->index[i],mtop,&type,&block);
 +        for(j=0;j<nr;j++)
 +        {
 +            if(tlist->index[j]==type)
 +            {
 +                bNEW=FALSE;
 +            }
 +        }
 +
 +        if(bNEW==TRUE)
 +        {
 +            tlist->index[nr]=type;
 +            nr++;
 +        }
 +    }
 +    srenew(tlist->index,nr);
 +    return nr;
 +}
 +
 +/* Do the actual check of the molecule types between embedded and rest group */
 +static void check_types(t_block *ins_at,t_block *rest_at,gmx_mtop_t *mtop)
 +{
 +    t_block        *ins_mtype,*rest_mtype;
 +    int            i,j;
 +
 +    snew(ins_mtype,1);
 +    snew(rest_mtype,1);
 +    ins_mtype->nr  = get_mtype_list(ins_at , mtop, ins_mtype );
 +    rest_mtype->nr = get_mtype_list(rest_at, mtop, rest_mtype);
 +
 +    for(i=0;i<ins_mtype->nr;i++)
 +    {
 +        for(j=0;j<rest_mtype->nr;j++)
 +        {
 +            if(ins_mtype->index[i]==rest_mtype->index[j])
 +            {
 +                gmx_fatal(FARGS,"Moleculetype %s is found both in the group to insert and the rest of the system.\n"
 +                          "1. Your *.ndx and *.top do not match\n"
 +                          "2. You are inserting some molecules of type %s (for example xray-solvent), while\n"
 +                          "the same moleculetype is also used in the rest of the system (solvent box). Because\n"
 +                          "we need to exclude all interactions between the atoms in the group to\n"
 +                          "insert, the same moleculetype can not be used in both groups. Change the\n"
 +                          "moleculetype of the molecules %s in the inserted group. Do not forget to provide\n"
 +                          "an appropriate *.itp file",*(mtop->moltype[rest_mtype->index[j]].name),
 +                          *(mtop->moltype[rest_mtype->index[j]].name),*(mtop->moltype[rest_mtype->index[j]].name));
 +            }
 +        }
 +    }
 +
 +    sfree(ins_mtype->index);
 +    sfree(rest_mtype->index);
 +    sfree(ins_mtype);
 +    sfree(rest_mtype);
 +}
 +
 +static void get_input(const char *membed_input, real *xy_fac, real *xy_max, real *z_fac, real *z_max,
 +                      int *it_xy, int *it_z, real *probe_rad, int *low_up_rm, int *maxwarn,
 +                      int *pieces, gmx_bool *bALLOW_ASYMMETRY)
 +{
 +    warninp_t wi;
 +    t_inpfile *inp;
 +    int       ninp;
 +
 +    wi = init_warning(TRUE,0);
 +
 +    inp = read_inpfile(membed_input, &ninp, NULL, wi);
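 +    /* ITYPE/RTYPE/EETYPE are the readinp.h macros: each looks up the named */
 +    /* entry in the parsed input and falls back to the given default value  */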
 +    ITYPE ("nxy", *it_xy, 1000);
 +    ITYPE ("nz", *it_z, 0);
 +    RTYPE ("xyinit", *xy_fac, 0.5);
 +    RTYPE ("xyend", *xy_max, 1.0);
 +    RTYPE ("zinit", *z_fac, 1.0);
 +    RTYPE ("zend", *z_max, 1.0);
 +    RTYPE ("rad", *probe_rad, 0.22);
 +    ITYPE ("ndiff", *low_up_rm, 0);
 +    ITYPE ("maxwarn", *maxwarn, 0);
 +    ITYPE ("pieces", *pieces, 1);
 +    EETYPE("asymmetry", *bALLOW_ASYMMETRY, yesno_names);
 +
 +    write_inpfile(membed_input,ninp,inp,FALSE,wi);
 +}
 +
 +/* Obtain the maximum and minimum coordinates of the group to be embedded */
 +static int init_ins_at(t_block *ins_at,t_block *rest_at,t_state *state, pos_ins_t *pos_ins,
 +                       gmx_groups_t *groups,int ins_grp_id, real xy_max)
 +{
 +    int i,gid,c=0;
 +    real x,xmin,xmax,y,ymin,ymax,z,zmin,zmax;
 +    const real min_memthick=6.0;      /* minimum thickness of the bilayer that will be used to *
 +                                       * determine the overlap between molecule to embed and membrane */
 +    const real fac_inp_size=1.000001; /* scaling factor to obtain input_size + 0.000001 (comparing reals) */
 +    snew(rest_at->index,state->natoms);
 +
 +    xmin=xmax=state->x[ins_at->index[0]][XX];
 +    ymin=ymax=state->x[ins_at->index[0]][YY];
 +    zmin=zmax=state->x[ins_at->index[0]][ZZ];
 +
 +    for(i=0;i<state->natoms;i++)
 +    {
 +        gid = groups->grpnr[egcFREEZE][i];
 +        if(groups->grps[egcFREEZE].nm_ind[gid]==ins_grp_id)
 +        {
 +            x=state->x[i][XX];
 +            if (x<xmin)                xmin=x;
 +            if (x>xmax)                xmax=x;
 +            y=state->x[i][YY];
 +            if (y<ymin)                ymin=y;
 +            if (y>ymax)                ymax=y;
 +            z=state->x[i][ZZ];
 +            if (z<zmin)                zmin=z;
 +            if (z>zmax)                zmax=z;
 +        }
 +        else
 +        {
 +            rest_at->index[c]=i;
 +            c++;
 +        }
 +    }
 +
 +    rest_at->nr=c;
 +    srenew(rest_at->index,c);
 +
 +    if(xy_max>fac_inp_size)
 +    {
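 +        /* widen the x/y bounding box symmetrically so that its extent becomes */
 +        /* xy_max times the original extent of the group to embed              */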
 +        pos_ins->xmin[XX]=xmin-((xmax-xmin)*xy_max-(xmax-xmin))/2;
 +        pos_ins->xmin[YY]=ymin-((ymax-ymin)*xy_max-(ymax-ymin))/2;
 +
 +        pos_ins->xmax[XX]=xmax+((xmax-xmin)*xy_max-(xmax-xmin))/2;
 +        pos_ins->xmax[YY]=ymax+((ymax-ymin)*xy_max-(ymax-ymin))/2;
 +    }
 +    else
 +    {
 +        pos_ins->xmin[XX]=xmin;
 +        pos_ins->xmin[YY]=ymin;
 +
 +        pos_ins->xmax[XX]=xmax;
 +        pos_ins->xmax[YY]=ymax;
 +    }
 +
 +    if( (zmax-zmin) < min_memthick )
 +    {
 +        pos_ins->xmin[ZZ]=zmin+(zmax-zmin)/2.0-0.5*min_memthick;
 +        pos_ins->xmax[ZZ]=zmin+(zmax-zmin)/2.0+0.5*min_memthick;
 +    }
 +    else
 +    {
 +        pos_ins->xmin[ZZ]=zmin;
 +        pos_ins->xmax[ZZ]=zmax;
 +    }
 +
 +    return c;
 +}
 +
 +/* Estimate the area of the embedded molecule by projecting all coordinates on a grid in the *
 + * xy-plane and counting the number of occupied grid points */
 +static real est_prot_area(pos_ins_t *pos_ins,rvec *r,t_block *ins_at, mem_t *mem_p)
 +{
 +    real x,y,dx=0.15,dy=0.15,area=0.0;
 +    real add,memmin,memmax;
 +    int c,at;
 +
 +    /* min and max membrane coordinate are altered to reduce the influence of the *
 +     * boundary region */
 +    memmin=mem_p->zmin+0.1*(mem_p->zmax-mem_p->zmin);
 +    memmax=mem_p->zmax-0.1*(mem_p->zmax-mem_p->zmin);
 +
 +    for(x=pos_ins->xmin[XX];x<pos_ins->xmax[XX];x+=dx)
 +    {
 +        for(y=pos_ins->xmin[YY];y<pos_ins->xmax[YY];y+=dy)
 +        {
 +            c=0;
 +            add=0.0;
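 +            /* mark this dx*dy grid cell as occupied (add=1) if any atom of the */
 +            /* embedded group lies in it within the trimmed membrane z-range    */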
 +            do
 +            {
 +                at=ins_at->index[c];
 +                if ( (r[at][XX]>=x) && (r[at][XX]<x+dx) &&
 +                     (r[at][YY]>=y) && (r[at][YY]<y+dy) &&
 +                     (r[at][ZZ]>memmin) && (r[at][ZZ]<memmax) )
 +                {
 +                    add=1.0;
 +                }
 +                c++;
 +            } while ( (c<ins_at->nr) && (add<0.5) );
 +            area+=add;
 +        }
 +    }
 +    area=area*dx*dy;
 +
 +    return area;
 +}
 +
 +static int init_mem_at(mem_t *mem_p, gmx_mtop_t *mtop, rvec *r, matrix box, pos_ins_t *pos_ins)
 +{
 +    int i,j,at,mol,nmol,nmolbox,count;
 +    t_block *mem_a;
 +    real z,zmin,zmax,mem_area;
 +    gmx_bool bNew;
 +    atom_id *mol_id;
 +    int type=0,block=0;
 +
 +    nmol=count=0;
 +    mem_a=&(mem_p->mem_at);
 +    snew(mol_id,mem_a->nr);
 +    zmin=pos_ins->xmax[ZZ];
 +    zmax=pos_ins->xmin[ZZ];
 +    for(i=0;i<mem_a->nr;i++)
 +    {
 +        at=mem_a->index[i];
 +        if( (r[at][XX]>pos_ins->xmin[XX]) && (r[at][XX]<pos_ins->xmax[XX]) &&
 +            (r[at][YY]>pos_ins->xmin[YY]) && (r[at][YY]<pos_ins->xmax[YY]) &&
 +            (r[at][ZZ]>pos_ins->xmin[ZZ]) && (r[at][ZZ]<pos_ins->xmax[ZZ]) )
 +        {
 +            mol = get_mol_id(at,mtop,&type,&block);
 +            bNew=TRUE;
 +            for(j=0;j<nmol;j++)
 +            {
 +                if(mol == mol_id[j])
 +                {
 +                    bNew=FALSE;
 +                }
 +            }
 +
 +            if(bNew)
 +            {
 +                mol_id[nmol]=mol;
 +                nmol++;
 +            }
 +
 +            z=r[at][ZZ];
 +            if(z<zmin)
 +            {
 +                zmin=z;
 +            }
 +
 +            if(z>zmax)
 +            {
 +                zmax=z;
 +            }
 +
 +            count++;
 +        }
 +    }
 +
 +    mem_p->nmol=nmol;
 +    srenew(mol_id,nmol);
 +    mem_p->mol_id=mol_id;
 +
 +    if((zmax-zmin)>(box[ZZ][ZZ]-0.5))
 +    {
 +        gmx_fatal(FARGS,"Something is wrong with your membrane. Max and min z values are %f and %f.\n"
 +                  "Maybe your membrane is not centered in the box, but located at the box edge in the z-direction,\n"
 +                  "so that one membrane is distributed over two periodic box images. Another possibility is that\n"
 +                  "your water layer is not thick enough.\n",zmax,zmin);
 +    }
 +    mem_p->zmin=zmin;
 +    mem_p->zmax=zmax;
 +    mem_p->zmed=(zmax-zmin)/2+zmin;
 +
 +    /*number of membrane molecules in protein box*/
 +    nmolbox = count/mtop->molblock[block].natoms_mol;
 +    /*membrane area within the box defined by the min and max coordinates of the embedded molecule*/
 +    mem_area = (pos_ins->xmax[XX]-pos_ins->xmin[XX])*(pos_ins->xmax[YY]-pos_ins->xmin[YY]);
 +    /*rough estimate of area per lipid, assuming there is only one type of lipid in the membrane*/
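 +    /* the factor 2.0 accounts for the bilayer: the nmolbox lipids are spread over two leaflets */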
 +    mem_p->lip_area = 2.0*mem_area/(double)nmolbox;
 +
 +    return mem_p->mem_at.nr;
 +}
 +
 +static void init_resize(t_block *ins_at,rvec *r_ins,pos_ins_t *pos_ins,mem_t *mem_p,rvec *r,
 +                        gmx_bool bALLOW_ASYMMETRY)
 +{
 +    int i,j,at,c,outsidesum,gctr=0;
 +    int idxsum=0;
 +
 +    /*sanity check*/
 +    for (i=0;i<pos_ins->pieces;i++)
 +    {
 +        idxsum+=pos_ins->nidx[i];
 +    }
 +
 +    if (idxsum!=ins_at->nr)
 +    {
 +        gmx_fatal(FARGS,"Piecewise sum of inserted atoms not same as size of group selected to insert.");
 +    }
 +
 +    snew(pos_ins->geom_cent,pos_ins->pieces);
 +    for (i=0;i<pos_ins->pieces;i++)
 +    {
 +        c=0;
 +        outsidesum=0;
 +        for(j=0;j<DIM;j++)
 +        {
 +            pos_ins->geom_cent[i][j]=0;
 +        }
 +
 +        for (j=0;j<pos_ins->nidx[i];j++)
 +        {
 +            at=pos_ins->subindex[i][j];
 +            copy_rvec(r[at],r_ins[gctr]);
 +            if( (r_ins[gctr][ZZ]<mem_p->zmax) && (r_ins[gctr][ZZ]>mem_p->zmin) )
 +            {
 +                rvec_inc(pos_ins->geom_cent[i],r_ins[gctr]);
 +                c++;
 +            }
 +            else
 +            {
 +                outsidesum++;
 +            }
 +            gctr++;
 +        }
 +
 +        if (c>0)
 +        {
 +            svmul(1/(double)c,pos_ins->geom_cent[i],pos_ins->geom_cent[i]);
 +        }
 +
 +        if (!bALLOW_ASYMMETRY)
 +        {
 +            pos_ins->geom_cent[i][ZZ]=mem_p->zmed;
 +        }
 +
 +        fprintf(stderr,"Embedding piece %d with center of geometry: %f %f %f\n",
 +                i,pos_ins->geom_cent[i][XX],pos_ins->geom_cent[i][YY],pos_ins->geom_cent[i][ZZ]);
 +    }
 +    fprintf(stderr,"\n");
 +}
 +
 +/* resize performed in the md loop */
 +static void resize(rvec *r_ins, rvec *r, pos_ins_t *pos_ins,rvec fac)
 +{
 +    int i,j,k,at,c=0;
 +    for (k=0;k<pos_ins->pieces;k++)
 +    {
 +        for(i=0;i<pos_ins->nidx[k];i++)
 +        {
 +            at=pos_ins->subindex[k][i];
 +            for(j=0;j<DIM;j++)
 +            {
 +                r[at][j]=pos_ins->geom_cent[k][j]+fac[j]*(r_ins[c][j]-pos_ins->geom_cent[k][j]);
 +            }
 +            c++;
 +        }
 +    }
 +}
 +
 +/* generate the list of membrane molecules that overlap with the molecule to be embedded. *
 + * The molecule to be embedded is already reduced in size. */
 +static int gen_rm_list(rm_t *rm_p,t_block *ins_at,t_block *rest_at,t_pbc *pbc, gmx_mtop_t *mtop,
 +                       rvec *r, rvec *r_ins, mem_t *mem_p, pos_ins_t *pos_ins, real probe_rad,
 +                       int low_up_rm, gmx_bool bALLOW_ASYMMETRY)
 +{
 +    int i,j,k,l,at,at2,mol_id;
 +    int type=0,block=0;
 +    int nrm,nupper,nlower;
 +    real r_min_rad,z_lip,min_norm;
 +    gmx_bool bRM;
 +    rvec dr,dr_tmp;
 +    real *dist;
 +    int *order;
 +
 +    r_min_rad=probe_rad*probe_rad;
 +    snew(rm_p->mol,mtop->mols.nr);
 +    snew(rm_p->block,mtop->mols.nr);
 +    nrm=nupper=0;
 +    nlower=low_up_rm;
 +    for(i=0;i<ins_at->nr;i++)
 +    {
 +        at=ins_at->index[i];
 +        for(j=0;j<rest_at->nr;j++)
 +        {
 +            at2=rest_at->index[j];
 +            pbc_dx(pbc,r[at],r[at2],dr);
 +
 +            if(norm2(dr)<r_min_rad)
 +            {
 +                mol_id = get_mol_id(at2,mtop,&type,&block);
 +                bRM=TRUE;
 +                for(l=0;l<nrm;l++)
 +                {
 +                    if(rm_p->mol[l]==mol_id)
 +                    {
 +                        bRM=FALSE;
 +                    }
 +                }
 +
 +                if(bRM)
 +                {
 +                    rm_p->mol[nrm]=mol_id;
 +                    rm_p->block[nrm]=block;
 +                    nrm++;
 +                    z_lip=0.0;
 +                    for(l=0;l<mem_p->nmol;l++)
 +                    {
 +                        if(mol_id==mem_p->mol_id[l])
 +                        {
 +                            for(k=mtop->mols.index[mol_id];k<mtop->mols.index[mol_id+1];k++)
 +                            {
 +                                z_lip+=r[k][ZZ];
 +                            }
 +                            z_lip/=mtop->molblock[block].natoms_mol;
 +                            if(z_lip<mem_p->zmed)
 +                            {
 +                                nlower++;
 +                            }
 +                            else
 +                            {
 +                                nupper++;
 +                            }
 +                        }
 +                    }
 +                }
 +            }
 +        }
 +    }
 +
 +    /*make sure an equal number of lipids is removed from the upper and lower leaflets */
 +    if( (nupper!=nlower) && (!bALLOW_ASYMMETRY) )
 +    {
 +        snew(dist,mem_p->nmol);
 +        snew(order,mem_p->nmol);
 +        for(i=0;i<mem_p->nmol;i++)
 +        {
 +            at = mtop->mols.index[mem_p->mol_id[i]];
 +            pbc_dx(pbc,r[at],pos_ins->geom_cent[0],dr);
 +            if (pos_ins->pieces>1)
 +            {
 +                /*minimum dr value*/
 +                min_norm=norm2(dr);
 +                for (k=1;k<pos_ins->pieces;k++)
 +                {
 +                    pbc_dx(pbc,r[at],pos_ins->geom_cent[k],dr_tmp);
 +                    if (norm2(dr_tmp) < min_norm)
 +                    {
 +                        min_norm=norm2(dr_tmp);
 +                        copy_rvec(dr_tmp,dr);
 +                    }
 +                }
 +            }
 +            dist[i]=dr[XX]*dr[XX]+dr[YY]*dr[YY];
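 +            /* insertion sort: keep order[0..i] sorted by increasing in-plane  */
 +            /* distance, so the lipids nearest the insertion center come first */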
 +            j=i-1;
 +            while (j>=0 && dist[i]<dist[order[j]])
 +            {
 +                order[j+1]=order[j];
 +                j--;
 +            }
 +            order[j+1]=i;
 +        }
 +
 +        i=0;
 +        while(nupper!=nlower)
 +        {
 +            mol_id=mem_p->mol_id[order[i]];
 +            block=get_molblock(mol_id,mtop->nmolblock,mtop->molblock);
 +            bRM=TRUE;
 +            for(l=0;l<nrm;l++)
 +            {
 +                if(rm_p->mol[l]==mol_id)
 +                {
 +                    bRM=FALSE;
 +                }
 +            }
 +
 +            if(bRM)
 +            {
 +                z_lip=0;
 +                for(k=mtop->mols.index[mol_id];k<mtop->mols.index[mol_id+1];k++)
 +                {
 +                    z_lip+=r[k][ZZ];
 +                }
 +                z_lip/=mtop->molblock[block].natoms_mol;
 +                if(nupper>nlower && z_lip<mem_p->zmed)
 +                {
 +                    rm_p->mol[nrm]=mol_id;
 +                    rm_p->block[nrm]=block;
 +                    nrm++;
 +                    nlower++;
 +                }
 +                else if (nupper<nlower && z_lip>mem_p->zmed)
 +                {
 +                    rm_p->mol[nrm]=mol_id;
 +                    rm_p->block[nrm]=block;
 +                    nrm++;
 +                    nupper++;
 +                }
 +            }
 +            i++;
 +
 +            if(i>mem_p->nmol)
 +            {
 +                gmx_fatal(FARGS,"Trying to remove more lipid molecules than there are in the membrane");
 +            }
 +        }
 +        sfree(dist);
 +        sfree(order);
 +    }
 +
 +    rm_p->nr=nrm;
 +    srenew(rm_p->mol,nrm);
 +    srenew(rm_p->block,nrm);
 +
 +    return nupper+nlower;
 +}
 +
 +/*remove all lipids and waters overlapping and update all important structures (e.g. state and mtop)*/
 +static void rm_group(t_inputrec *ir, gmx_groups_t *groups, gmx_mtop_t *mtop, rm_t *rm_p, t_state *state,
 +                     t_block *ins_at, pos_ins_t *pos_ins)
 +{
 +    int i,j,k,n,rm,mol_id,at,block;
 +    rvec *x_tmp,*v_tmp;
 +    atom_id *list,*new_mols;
 +    unsigned char  *new_egrp[egcNR];
 +    gmx_bool bRM;
 +    int RMmolblock;
 +
 +    snew(list,state->natoms);
 +    n=0;
 +    for(i=0;i<rm_p->nr;i++)
 +    {
 +        mol_id=rm_p->mol[i];
 +        at=mtop->mols.index[mol_id];
 +        block =rm_p->block[i];
 +        mtop->molblock[block].nmol--;
 +        for(j=0;j<mtop->molblock[block].natoms_mol;j++)
 +        {
 +            list[n]=at+j;
 +            n++;
 +        }
 +        mtop->mols.index[mol_id]=-1;
 +    }
 +
 +    mtop->mols.nr-=rm_p->nr;
 +    mtop->mols.nalloc_index-=rm_p->nr;
 +    snew(new_mols,mtop->mols.nr);
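 +    /* compact the molecule index array by skipping entries marked -1 above; */
 +    /* note that the write position j has to persist across iterations       */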
 +    for(i=0;i<mtop->mols.nr+rm_p->nr;i++)
 +    {
 +        j=0;
 +        if(mtop->mols.index[i]!=-1)
 +        {
 +            new_mols[j]=mtop->mols.index[i];
 +            j++;
 +        }
 +    }
 +    sfree(mtop->mols.index);
 +    mtop->mols.index=new_mols;
 +    mtop->natoms-=n;
 +    state->natoms-=n;
 +    state->nalloc=state->natoms;
 +    snew(x_tmp,state->nalloc);
 +    snew(v_tmp,state->nalloc);
 +
 +    for(i=0;i<egcNR;i++)
 +    {
 +        if(groups->grpnr[i]!=NULL)
 +        {
 +            groups->ngrpnr[i]=state->natoms;
 +            snew(new_egrp[i],state->natoms);
 +        }
 +    }
 +
 +    rm=0;
 +    for (i=0;i<state->natoms+n;i++)
 +    {
 +        bRM=FALSE;
 +        for(j=0;j<n;j++)
 +        {
 +            if(i==list[j])
 +            {
 +                bRM=TRUE;
 +                rm++;
 +            }
 +        }
 +
 +        if(!bRM)
 +        {
 +            for(j=0;j<egcNR;j++)
 +            {
 +                if(groups->grpnr[j]!=NULL)
 +                {
 +                    new_egrp[j][i-rm]=groups->grpnr[j][i];
 +                }
 +            }
 +            copy_rvec(state->x[i],x_tmp[i-rm]);
 +            copy_rvec(state->v[i],v_tmp[i-rm]);
 +            for(j=0;j<ins_at->nr;j++)
 +            {
 +                if (i==ins_at->index[j])
 +                {
 +                    ins_at->index[j]=i-rm;
 +                }
 +            }
 +
 +            for(j=0;j<pos_ins->pieces;j++)
 +            {
 +                for(k=0;k<pos_ins->nidx[j];k++)
 +                {
 +                    if (i==pos_ins->subindex[j][k])
 +                    {
 +                        pos_ins->subindex[j][k]=i-rm;
 +                    }
 +                }
 +            }
 +        }
 +    }
 +    sfree(state->x);
 +    state->x=x_tmp;
 +    sfree(state->v);
 +    state->v=v_tmp;
 +
 +    for(i=0;i<egcNR;i++)
 +    {
 +        if(groups->grpnr[i]!=NULL)
 +        {
 +            sfree(groups->grpnr[i]);
 +            groups->grpnr[i]=new_egrp[i];
 +        }
 +    }
 +
 +    /* remove empty molblocks */
 +    RMmolblock=0;
 +    for (i=0;i<mtop->nmolblock;i++)
 +    {
 +        if(mtop->molblock[i].nmol==0)
 +        {
 +            RMmolblock++;
 +        }
 +        else
 +        {
 +            mtop->molblock[i-RMmolblock]=mtop->molblock[i];
 +        }
 +    }
 +    mtop->nmolblock-=RMmolblock;
 +}
 +
 +/* remove all bonded interactions from mtop for the molecule to be embedded */
 +int rm_bonded(t_block *ins_at, gmx_mtop_t *mtop)
 +{
 +    int i,j,m;
 +    int type,natom,nmol,at,atom1=0,rm_at=0;
 +    gmx_bool *bRM,bINS;
 +    /*this routine lives dangerously by assuming that all molecules of a given type are in order in the structure*/
 +    /*this routine does not live as dangerously as it seems, however: there is a check in *
 +     *init_membed to make sure that g_membed exits with a warning when there are molecules *
 +     *of the same type not in the ins_at index group. MGWolf 050710 */
 +
 +
 +    snew(bRM,mtop->nmoltype);
 +    for (i=0;i<mtop->nmoltype;i++)
 +    {
 +        bRM[i]=TRUE;
 +    }
 +
 +    for (i=0;i<mtop->nmolblock;i++)
 +    {
 +        /*loop over molecule blocks*/
 +        type        =mtop->molblock[i].type;
 +        natom        =mtop->molblock[i].natoms_mol;
 +        nmol        =mtop->molblock[i].nmol;
 +
 +        for(j=0;j<natom*nmol && bRM[type]==TRUE;j++)
 +        {
 +            /*loop over atoms in the block*/
 +            at=j+atom1; /*atom index = block index + offset*/
 +            bINS=FALSE;
 +
 +            for (m=0;(m<ins_at->nr) && (bINS==FALSE);m++)
 +            {
 +                /*loop over atoms in insertion index group to determine if we're inserting one*/
 +                if(at==ins_at->index[m])
 +                {
 +                    bINS=TRUE;
 +                }
 +            }
 +            bRM[type]=bINS;
 +        }
 +        atom1+=natom*nmol; /*update offset*/
 +        if(bRM[type])
 +        {
 +            rm_at+=natom*nmol; /*increment bonded removal counter by # atoms in block*/
 +        }
 +    }
 +
 +    for(i=0;i<mtop->nmoltype;i++)
 +    {
 +        if(bRM[i])
 +        {
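 +            /* zero the interaction lists of all bonded types: indices below   */
 +            /* F_LJ (bonds, angles, dihedrals, pairs, ...) and F_POSRES through */
 +            /* F_VSITEN (restraints, virtual sites, ...); the types in between  */
 +            /* are left untouched                                               */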
 +            for(j=0;j<F_LJ;j++)
 +            {
 +                mtop->moltype[i].ilist[j].nr=0;
 +            }
 +
 +            for(j=F_POSRES;j<=F_VSITEN;j++)
 +            {
 +                mtop->moltype[i].ilist[j].nr=0;
 +            }
 +        }
 +    }
 +    sfree(bRM);
 +
 +    return rm_at;
 +}
 +
 +/* Write a topology where the number of molecules is correct for the system after embedding */
 +static void top_update(const char *topfile, char *ins, rm_t *rm_p, gmx_mtop_t *mtop)
 +{
 +#define TEMP_FILENM "temp.top"
 +    int    bMolecules=0;
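 +    /* bMolecules acts as a small state machine: 0 = before the [ molecules ]  */
 +    /* section, 1 = section header seen (emit the updated counts once when the */
 +    /* first data line arrives), 2 = past the counts (old lines are dropped)   */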
 +    FILE    *fpin,*fpout;
 +    char    buf[STRLEN],buf2[STRLEN],*temp;
 +    int        i,*nmol_rm,nmol,line;
 +
 +    fpin  = ffopen(topfile,"r");
 +    fpout = ffopen(TEMP_FILENM,"w");
 +
 +    snew(nmol_rm,mtop->nmoltype);
 +    for(i=0;i<rm_p->nr;i++)
 +    {
 +        nmol_rm[rm_p->block[i]]++;
 +    }
 +
 +    line=0;
 +    while(fgets(buf,STRLEN,fpin))
 +    {
 +        line++;
 +        if(buf[0]!=';')
 +        {
 +            strcpy(buf2,buf);
 +            if ((temp=strchr(buf2,'\n')) != NULL)
 +            {
 +                temp[0]='\0';
 +            }
 +            ltrim(buf2);
 +            if (buf2[0]=='[')
 +            {
 +                buf2[0]=' ';
 +                if ((temp=strchr(buf2,'\n')) != NULL)
 +                {
 +                    temp[0]='\0';
 +                }
 +                rtrim(buf2);
 +                if (buf2[strlen(buf2)-1]==']')
 +                {
 +                    buf2[strlen(buf2)-1]='\0';
 +                    ltrim(buf2);
 +                    rtrim(buf2);
 +                    if (gmx_strcasecmp(buf2,"molecules")==0)
 +                    {
 +                        bMolecules=1;
 +                    }
 +                }
 +                fprintf(fpout,"%s",buf);
 +            }
 +            else if (bMolecules==1)
 +            {
 +                for(i=0;i<mtop->nmolblock;i++)
 +                {
 +                    nmol=mtop->molblock[i].nmol;
 +                    sprintf(buf,"%-15s %5d\n",*(mtop->moltype[mtop->molblock[i].type].name),nmol);
 +                    fprintf(fpout,"%s",buf);
 +                }
 +                bMolecules=2;
 +            }
 +            else if (bMolecules==2)
 +            {
 +                /* print nothing */
 +            }
 +            else
 +            {
 +                fprintf(fpout,"%s",buf);
 +            }
 +        }
 +        else
 +        {
 +            fprintf(fpout,"%s",buf);
 +        }
 +    }
 +
 +    ffclose(fpout);
 +    /* use ffopen to generate a backup of topfile */
 +    fpout=ffopen(topfile,"w");
 +    ffclose(fpout);
 +    rename(TEMP_FILENM,topfile);
 +#undef TEMP_FILENM
 +}
 +
 +void rescale_membed(int step_rel, gmx_membed_t membed, rvec *x)
 +{
 +    /* Set new positions for the group to embed */
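 +    /* growth schedule: the first it_xy steps inflate the molecule in x/y, */
 +    /* the following it_z steps inflate it in z                            */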
 +    if(step_rel<=membed->it_xy)
 +    {
 +        membed->fac[0]+=membed->xy_step;
 +        membed->fac[1]+=membed->xy_step;
 +    }
 +    else if (step_rel<=(membed->it_xy+membed->it_z))
 +    {
 +        membed->fac[2]+=membed->z_step;
 +    }
 +    resize(membed->r_ins,x,membed->pos_ins,membed->fac);
 +}
 +
 +gmx_membed_t init_membed(FILE *fplog, int nfile, const t_filenm fnm[], gmx_mtop_t *mtop,
 +                 t_inputrec *inputrec, t_state *state, t_commrec *cr,real *cpt)
 +{
 +    char                    *ins,**gnames;
 +    int                     i,rm_bonded_at,fr_id,fr_i=0,tmp_id,warn=0;
 +    int                     ng,j,max_lip_rm,ins_grp_id,ins_nat,mem_nat,ntype,lip_rm,tpr_version;
 +    real                    prot_area;
 +    rvec                    *r_ins=NULL;
 +    t_block                 *ins_at,*rest_at;
 +    pos_ins_t               *pos_ins;
 +    mem_t                   *mem_p;
 +    rm_t                    *rm_p;
 +    gmx_groups_t            *groups;
 +    gmx_bool                bExcl=FALSE;
 +    t_atoms                 atoms;
 +    t_pbc                   *pbc;
 +    char                    **piecename=NULL;
 +    gmx_membed_t            membed=NULL;
 +
 +    /* input variables */
 +    const char *membed_input;
 +    real xy_fac = 0.5;
 +    real xy_max = 1.0;
 +    real z_fac = 1.0;
 +    real z_max = 1.0;
 +    int it_xy = 1000;
 +    int it_z = 0;
 +    real probe_rad = 0.22;
 +    int low_up_rm = 0;
 +    int maxwarn=0;
 +    int pieces=1;
 +    gmx_bool bALLOW_ASYMMETRY=FALSE;
 +
 +    /* sanity check constants */         /* Issue a warning when: */
 +    const int membed_version=58;         /* tpr version is smaller */
 +    const real min_probe_rad=0.2199999;  /* A probe radius for overlap between embedded molecule *
 +                                          * and rest smaller than this value is probably too small */
 +    const real min_xy_init=0.0999999;    /* the initial shrinking of the molecule to embed is smaller */
 +    const int min_it_xy=1000;            /* the number of steps to embed in xy-plane is smaller */
 +    const int min_it_z=100;              /* the number of steps to embed in z is smaller */
 +    const real prot_vs_box=7.5;          /* molecule to embed is large (more than prot_vs_box) with respect */
 +    const real box_vs_prot=50;           /* to the box size (less than box_vs_prot) */
 +
 +    snew(membed,1);
 +    snew(ins_at,1);
 +    snew(pos_ins,1);
 +
 +    if(MASTER(cr))
 +    {
 +        /* get input data out of the membed file */
 +        membed_input = opt2fn("-membed",nfile,fnm);
 +        get_input(membed_input,&xy_fac,&xy_max,&z_fac,&z_max,&it_xy,&it_z,&probe_rad,&low_up_rm,
 +                  &maxwarn,&pieces,&bALLOW_ASYMMETRY);
 +
 +        tpr_version = get_tpr_version(ftp2fn(efTPX,nfile,fnm));
 +        if (tpr_version<membed_version)
 +        {
 +            gmx_fatal(FARGS,"Version of *.tpr file to old (%d). "
 +                            "Rerun grompp with GROMACS version 4.0.3 or newer.\n",tpr_version);
 +        }
 +
 +        if( !EI_DYNAMICS(inputrec->eI) )
 +        {
 +            gmx_input("Change integrator to a dynamics integrator in mdp file (e.g. md or sd).");
 +        }
 +
 +        if(PAR(cr))
 +        {
 +            gmx_input("Sorry, parallel g_membed is not yet fully functional.");
 +        }
 +
 +        if(*cpt>=0)
 +        {
 +            fprintf(stderr,"\nSetting -cpt to -1, because embedding cannot be restarted from cpt-files.\n");
 +            *cpt=-1;
 +        }
 +        groups=&(mtop->groups);
 +        snew(gnames,groups->ngrpname);
 +        for (i=0; i<groups->ngrpname; i++)
 +        {
 +            gnames[i] = *(groups->grpname[i]);
 +        }
 +
 +        atoms=gmx_mtop_global_atoms(mtop);
 +        snew(mem_p,1);
 +        fprintf(stderr,"\nSelect a group to embed in the membrane:\n");
 +        get_index(&atoms,opt2fn_null("-mn",nfile,fnm),1,&(ins_at->nr),&(ins_at->index),&ins);
 +        ins_grp_id = search_string(ins,groups->ngrpname,gnames);
 +        fprintf(stderr,"\nSelect a group to embed %s into (e.g. the membrane):\n",ins);
 +        get_index(&atoms,opt2fn_null("-mn",nfile,fnm),1,&(mem_p->mem_at.nr),&(mem_p->mem_at.index),&(mem_p->name));
 +
 +        pos_ins->pieces=pieces;
 +        snew(pos_ins->nidx,pieces);
 +        snew(pos_ins->subindex,pieces);
 +        snew(piecename,pieces);
 +        if (pieces>1)
 +        {
 +            fprintf(stderr,"\nSelect pieces to embed:\n");
 +            get_index(&atoms,opt2fn_null("-mn",nfile,fnm),pieces,pos_ins->nidx,pos_ins->subindex,piecename);
 +        }
 +        else
 +        {
 +            /*use whole embedded group*/
 +            snew(pos_ins->nidx,1);
 +            snew(pos_ins->subindex,1);
 +            pos_ins->nidx[0]=ins_at->nr;
 +            pos_ins->subindex[0]=ins_at->index;
 +        }
 +
 +        if(probe_rad<min_probe_rad)
 +        {
 +            warn++;
 +            fprintf(stderr,"\nWarning %d:\nA probe radius (-rad) smaller than 0.2 nm can result "
 +                           "in overlap between waters and the group to embed, which will result "
 +                           "in Lincs errors etc.\n\n",warn);
 +        }
 +
 +        if(xy_fac<min_xy_init)
 +        {
 +            warn++;
 +            fprintf(stderr,"\nWarning %d:\nThe initial size of %s is probably too small.\n\n",warn,ins);
 +        }
 +
 +        if(it_xy<min_it_xy)
 +        {
 +            warn++;
 +            fprintf(stderr,"\nWarning %d:\nThe number of steps used to grow the xy-coordinates of %s (%d)"
 +                           " is probably too small.\nIncrease -nxy or maxwarn.\n\n",warn,ins,it_xy);
 +        }
 +
 +        if( (it_z<min_it_z) && ( z_fac<0.99999999 || z_fac>1.0000001) )
 +        {
 +            warn++;
 +            fprintf(stderr,"\nWarning %d:\nThe number of steps used to grow the z-coordinate of %s (%d)"
 +                           " is probably too small.\nIncrease -nz or maxwarn.\n\n",warn,ins,it_z);
 +        }
 +
 +        if(it_xy+it_z>inputrec->nsteps)
 +        {
 +            warn++;
 +            fprintf(stderr,"\nWarning %d:\nThe number of growth steps (-nxy + -nz) is larger than the "
 +                           "number of steps in the tpr.\n\n",warn);
 +        }
 +
 +        fr_id=-1;
 +        if( inputrec->opts.ngfrz==1)
 +        {
 +            gmx_fatal(FARGS,"You did not specify \"%s\" as a freezegroup.",ins);
 +        }
 +
 +        for(i=0;i<inputrec->opts.ngfrz;i++)
 +        {
 +            tmp_id = mtop->groups.grps[egcFREEZE].nm_ind[i];
 +            if(ins_grp_id==tmp_id)
 +            {
 +                fr_id=tmp_id;
 +                fr_i=i;
 +            }
 +        }
 +
 +        if (fr_id == -1 )
 +        {
 +            gmx_fatal(FARGS,"\"%s\" is not defined as a freezegroup in the mdp-file.",ins);
 +        }
 +
 +        for(i=0;i<DIM;i++)
 +        {
 +            if( inputrec->opts.nFreeze[fr_i][i] != 1)
 +            {
 +                gmx_fatal(FARGS,"freeze dimensions for %s are not Y Y Y\n",ins);
 +            }
 +        }
 +
 +        ng = groups->grps[egcENER].nr;
 +        if (ng == 1)
 +        {
 +            gmx_input("No energy groups defined. This is necessary for energy exclusion in the freeze group");
 +        }
 +
 +        for(i=0;i<ng;i++)
 +        {
 +            for(j=0;j<ng;j++)
 +            {
 +                if (inputrec->opts.egp_flags[ng*i+j] == EGP_EXCL)
 +                {
 +                    bExcl = TRUE;
 +                    if ( (groups->grps[egcENER].nm_ind[i] != ins_grp_id) ||
 +                         (groups->grps[egcENER].nm_ind[j] != ins_grp_id) )
 +                    {
 +                        gmx_fatal(FARGS,"Energy exclusions \"%s\" and \"%s\" do not match the group "
 +                                  "to embed \"%s\"",*groups->grpname[groups->grps[egcENER].nm_ind[i]],
 +                                  *groups->grpname[groups->grps[egcENER].nm_ind[j]],ins);
 +                    }
 +                }
 +            }
 +        }
 +
 +        if (!bExcl) {
 +            gmx_input("No energy exclusion groups defined. This is necessary for energy exclusion in "
 +                      "the freeze group");
 +        }
 +
 +        /* Obtain the maximum and minimum coordinates of the group to be embedded */
 +        snew(rest_at,1);
 +        ins_nat = init_ins_at(ins_at,rest_at,state,pos_ins,groups,ins_grp_id,xy_max);
 +        /* Check that moleculetypes in insertion group are not part of the rest of the system */
 +        check_types(ins_at,rest_at,mtop);
 +
 +        mem_nat = init_mem_at(mem_p,mtop,state->x,state->box,pos_ins);
 +
 +        prot_area = est_prot_area(pos_ins,state->x,ins_at,mem_p);
 +        if ( (prot_area>prot_vs_box) && ( (state->box[XX][XX]*state->box[YY][YY]-state->box[XX][YY]*state->box[YY][XX])<box_vs_prot) )
 +        {
 +            warn++;
 +            fprintf(stderr,"\nWarning %d:\nThe xy-area is very small compared to the area of the protein.\n"
 +                    "This might cause pressure problems during the growth phase. Just try with\n"
 +                    "the current setup (-maxwarn + 1), but if pressure problems occur, lower the\n"
 +                    "compressibility in the mdp-file or use no pressure coupling at all.\n\n",warn);
 +        }
 +
 +        if(warn>maxwarn)
 +        {
 +            gmx_fatal(FARGS,"Too many warnings.\n");
 +        }
 +
 +        printf("The estimated area of the protein in the membrane is %.3f nm^2\n",prot_area);
 +        printf("\nThere are %d lipids in the membrane part that overlaps the protein.\n"
 +               "The area per lipid is %.4f nm^2.\n",mem_p->nmol,mem_p->lip_area);
 +
 +        /* Maximum number of lipids to be removed */
 +        max_lip_rm=(int)(2*prot_area/mem_p->lip_area);
 +        printf("Maximum number of lipids that will be removed is %d.\n",max_lip_rm);
 +
 +        printf("\nWill resize the protein by a factor of %.3f in the xy plane and %.3f in the z direction.\n"
 +               "This resizing will be done with respect to the geometrical center of all protein atoms\n"
 +               "that span the membrane region, i.e. z between %.3f and %.3f\n\n",
 +               xy_fac,z_fac,mem_p->zmin,mem_p->zmax);
 +
 +        /* resize the protein by xy and by z if necessary*/
 +        snew(r_ins,ins_at->nr);
 +        init_resize(ins_at,r_ins,pos_ins,mem_p,state->x,bALLOW_ASYMMETRY);
 +        membed->fac[0]=membed->fac[1]=xy_fac;
 +        membed->fac[2]=z_fac;
 +
 +        membed->xy_step =(xy_max-xy_fac)/(double)(it_xy);
 +        membed->z_step  =(z_max-z_fac)/(double)(it_z-1);
 +
 +        resize(r_ins,state->x,pos_ins,membed->fac);
 +
 +        /* remove overlapping lipids and water from the membrane box*/
 +        /*mark molecules to be removed*/
 +        snew(pbc,1);
 +        set_pbc(pbc,inputrec->ePBC,state->box);
 +
 +        snew(rm_p,1);
 +        lip_rm = gen_rm_list(rm_p,ins_at,rest_at,pbc,mtop,state->x, r_ins, mem_p,pos_ins,
 +                             probe_rad,low_up_rm,bALLOW_ASYMMETRY);
 +        lip_rm -= low_up_rm;
 +
 +        if(fplog)
 +        {
 +            for(i=0;i<rm_p->nr;i++)
 +            {
 +                fprintf(fplog,"rm mol %d\n",rm_p->mol[i]);
 +            }
 +        }
 +
 +        for(i=0;i<mtop->nmolblock;i++)
 +        {
 +            ntype=0;
 +            for(j=0;j<rm_p->nr;j++)
 +            {
 +                if(rm_p->block[j]==i)
 +                {
 +                    ntype++;
 +                }
 +            }
 +            printf("Will remove %d %s molecules\n",ntype,*(mtop->moltype[mtop->molblock[i].type].name));
 +        }
 +
 +        if(lip_rm>max_lip_rm)
 +        {
 +            warn++;
 +            fprintf(stderr,"\nWarning %d:\nTrying to remove a larger lipid area than the estimated "
 +                           "protein area.\nTry making the -xyinit resize factor smaller or increase "
 +                           "maxwarn.\n\n",warn);
 +        }
 +
 +        /*remove all lipids and waters overlapping and update all important structures*/
 +        rm_group(inputrec,groups,mtop,rm_p,state,ins_at,pos_ins);
 +
 +        rm_bonded_at = rm_bonded(ins_at,mtop);
 +        if (rm_bonded_at != ins_at->nr)
 +        {
 +            fprintf(stderr,"Warning: The number of atoms for which the bonded interactions are removed is %d, "
 +                    "while %d atoms are embedded. Make sure that the atoms to be embedded are not in the same "
 +                    "molecule type as atoms that are not to be embedded.\n",rm_bonded_at,ins_at->nr);
 +        }
 +
 +        if(warn>maxwarn)
 +        {
 +            gmx_fatal(FARGS,"Too many warnings.\nIf you are sure these warnings are harmless, "
 +                            "you can increase -maxwarn");
 +        }
 +
 +        if (ftp2bSet(efTOP,nfile,fnm))
 +        {
 +            top_update(opt2fn("-mp",nfile,fnm),ins,rm_p,mtop);
 +        }
 +
 +        sfree(pbc);
 +        sfree(rest_at);
 +        if (pieces>1)
 +        {
 +            sfree(piecename);
 +        }
 +
 +        membed->it_xy=it_xy;
 +        membed->it_z=it_z;
 +        membed->pos_ins=pos_ins;
 +        membed->r_ins=r_ins;
 +    }
 +
 +    return membed;
 +}
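 +
 +/* Illustrative sketch (not part of this change; the helper name is made up):
 + * the growth schedule set up above steps the xy resize factor linearly from
 + * xy_fac to xy_max over it_xy steps, and the z factor from z_fac to z_max
 + * over it_z-1 steps. This only restates that arithmetic for the xy factor.
 + */
 +static real membed_xy_fac_at_step(real xy_fac, real xy_max, int it_xy, int k)
 +{
 +    /* resize factor after k growth steps; k == it_xy reproduces xy_max */
 +    return xy_fac + k*(xy_max - xy_fac)/(real)it_xy;
 +}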
index 44559ff4a9321ba3550f356d32ca89efc5fb648a,0000000000000000000000000000000000000000..3fff053d1dae1f9e1f6f339887e9c8e3c9beeae5
mode 100644,000000..100644
--- /dev/null
@@@ -1,744 -1,0 +1,761 @@@
 +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
 + *
 + * 
 + *                This source code is part of
 + * 
 + *                 G   R   O   M   A   C   S
 + * 
 + *          GROningen MAchine for Chemical Simulations
 + * 
 + *                        VERSION 4.6.0
 + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
 + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 + * Copyright (c) 2001-2011, The GROMACS development team,
 + * check out http://www.gromacs.org for more information.
 +
 + * This program is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU General Public License
 + * as published by the Free Software Foundation; either version 2
 + * of the License, or (at your option) any later version.
 + * 
 + * If you want to redistribute modifications, please consider that
 + * scientific software is very special. Version control is crucial -
 + * bugs must be traceable. We will be happy to consider code for
 + * inclusion in the official distribution, but derived work must not
 + * be called official GROMACS. Details are found in the README & COPYING
 + * files - if they are missing, get the official version at www.gromacs.org.
 + * 
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the papers on the package - you can find them in the top README file.
 + * 
 + * For more info, check our website at http://www.gromacs.org
 + * 
 + * And Hey:
 + * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon
 + */
 +#ifdef HAVE_CONFIG_H
 +#include <config.h>
 +#endif
 +
 +#include "smalloc.h"
 +#include "network.h"
 +#include "calcgrid.h"
 +#include "pme.h"
 +#include "vec.h"
 +#include "domdec.h"
 +#include "nbnxn_cuda_data_mgmt.h"
 +#include "force.h"
 +#include "macros.h"
 +#include "pme_loadbal.h"
 +
 +/* Parameters and setting for one PP-PME setup */
 +typedef struct {
 +    real rcut_coulomb;    /* Coulomb cut-off                              */
 +    real rlist;           /* pair-list cut-off                            */
 +    real rlistlong;       /* LR pair-list cut-off                         */
 +    int  nstcalclr;       /* frequency of evaluating long-range forces for group scheme */
 +    real spacing;         /* (largest) PME grid spacing                   */
 +    ivec grid;            /* the PME grid dimensions                      */
 +    real grid_efficiency; /* inefficiency factor for non-uniform grids <= 1 */
 +    real ewaldcoeff;      /* the Ewald coefficient                        */
 +    gmx_pme_t pmedata;    /* the data structure used in the PME code      */
 +
 +    int  count;           /* number of times this setup has been timed    */
 +    double cycles;        /* the fastest time for this setup in cycles    */
 +} pme_setup_t;
 +
 +/* In the initial scan, step by grids that are at least a factor 0.8 coarser */
 +#define PME_LB_GRID_SCALE_FAC  0.8
 +/* In the initial scan, try to skip grids with uneven x/y/z spacing,
 + * checking if the "efficiency" is more than 5% worse than the previous grid.
 + */
 +#define PME_LB_GRID_EFFICIENCY_REL_FAC  1.05
 +/* Rerun setups that are up to 12% slower than the fastest up till now */
 +#define PME_LB_SLOW_FAC  1.12
 +/* If setups get more than 2% faster, do another round to avoid
 + * choosing a slower setup due to acceleration or fluctuations.
 + */
 +#define PME_LB_ACCEL_TOL 1.02
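 +
 +/* A minimal sketch (illustrative only, not part of this change) of how the
 + * timing thresholds above gate the scan: a setup is dropped from rescanning
 + * when it is more than PME_LB_SLOW_FAC slower than the fastest one timed so
 + * far, and a measurement beating the stored minimum by more than
 + * PME_LB_ACCEL_TOL triggers an extra scanning stage.
 + */
 +static gmx_bool pme_lb_setup_is_slow(double cycles, double cycles_fastest)
 +{
 +    /* TRUE when this setup should be excluded from further rescanning */
 +    return cycles > cycles_fastest*PME_LB_SLOW_FAC;
 +}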
 +
 +enum { epmelblimNO, epmelblimBOX, epmelblimDD, epmelblimNR };
 +
 +const char *pmelblim_str[epmelblimNR] =
 +{ "no", "box size", "domain decomposition" };
 +
 +struct pme_load_balancing {
 +    int  nstage;        /* the current maximum number of stages */
 +
 +    real cut_spacing;   /* the minimum cutoff / PME grid spacing ratio */
 +    real rcut_vdw;      /* Vdw cutoff (does not change) */
 +    real rcut_coulomb_start; /* Initial electrostatics cutoff */
 +    int  nstcalclr_start; /* Initial long-range force evaluation interval */
 +    real rbuf_coulomb;  /* the pairlist buffer size */
 +    real rbuf_vdw;      /* the pairlist buffer size */
 +    matrix box_start;   /* the initial simulation box */
 +    int n;              /* the number of setups as well as the allocation size */
 +    pme_setup_t *setup; /* the PME+cutoff setups */
 +    int cur;            /* the current setup */
 +    int fastest;        /* fastest setup up till now */
 +    int start;          /* start of setup range to consider in stage>0 */
 +    int end;            /* end   of setup range to consider in stage>0 */
 +    int elimited;       /* was the balancing limited, uses enum above */
 +    int cutoff_scheme;  /* Verlet or group cut-offs */
 +
 +    int stage;          /* the current stage */
 +};
 +
 +void pme_loadbal_init(pme_load_balancing_t *pme_lb_p,
 +                      const t_inputrec *ir,matrix box,
 +                      const interaction_const_t *ic,
 +                      gmx_pme_t pmedata)
 +{
 +    pme_load_balancing_t pme_lb;
 +    real spm,sp;
 +    int  d;
 +
 +    snew(pme_lb,1);
 +
 +    /* Any number of stages >= 2 is supported */
 +    pme_lb->nstage   = 2;
 +
 +    pme_lb->cutoff_scheme = ir->cutoff_scheme;
 +
 +    if(pme_lb->cutoff_scheme == ecutsVERLET)
 +    {
 +        pme_lb->rbuf_coulomb = ic->rlist - ic->rcoulomb;
 +        pme_lb->rbuf_vdw     = pme_lb->rbuf_coulomb;
 +    }
 +    else
 +    {
 +        if(ic->rcoulomb > ic->rlist)
 +        {
 +            pme_lb->rbuf_coulomb = ic->rlistlong - ic->rcoulomb;
 +        }
 +        else
 +        {
 +            pme_lb->rbuf_coulomb = ic->rlist - ic->rcoulomb;
 +        }
 +        if(ic->rvdw > ic->rlist)
 +        {
 +            pme_lb->rbuf_vdw = ic->rlistlong - ic->rvdw;
 +        }
 +        else
 +        {
 +            pme_lb->rbuf_vdw = ic->rlist - ic->rvdw;
 +        }
 +    }
 +
 +    copy_mat(box,pme_lb->box_start);
 +    if (ir->ePBC==epbcXY && ir->nwall==2)
 +    {
 +        svmul(ir->wall_ewald_zfac,pme_lb->box_start[ZZ],pme_lb->box_start[ZZ]);
 +    }
 +
 +    pme_lb->n = 1;
 +    snew(pme_lb->setup,pme_lb->n);
 +
 +    pme_lb->rcut_vdw              = ic->rvdw;
 +    pme_lb->rcut_coulomb_start    = ir->rcoulomb;
 +    pme_lb->nstcalclr_start       = ir->nstcalclr;
 +    
 +    pme_lb->cur = 0;
 +    pme_lb->setup[0].rcut_coulomb = ic->rcoulomb;
 +    pme_lb->setup[0].rlist        = ic->rlist;
 +    pme_lb->setup[0].rlistlong    = ic->rlistlong;
 +    pme_lb->setup[0].nstcalclr    = ir->nstcalclr;
 +    pme_lb->setup[0].grid[XX]     = ir->nkx;
 +    pme_lb->setup[0].grid[YY]     = ir->nky;
 +    pme_lb->setup[0].grid[ZZ]     = ir->nkz;
 +    pme_lb->setup[0].ewaldcoeff   = ic->ewaldcoeff;
 +
 +    pme_lb->setup[0].pmedata  = pmedata;
 +    
 +    spm = 0;
 +    for(d=0; d<DIM; d++)
 +    {
 +        sp = norm(pme_lb->box_start[d])/pme_lb->setup[0].grid[d];
 +        if (sp > spm)
 +        {
 +            spm = sp;
 +        }
 +    }
 +    pme_lb->setup[0].spacing = spm;
 +
 +    if (ir->fourier_spacing > 0)
 +    {
 +        pme_lb->cut_spacing = ir->rcoulomb/ir->fourier_spacing;
 +    }
 +    else
 +    {
 +        pme_lb->cut_spacing = ir->rcoulomb/pme_lb->setup[0].spacing;
 +    }
 +
 +    pme_lb->stage = 0;
 +
 +    pme_lb->fastest  = 0;
 +    pme_lb->start    = 0;
 +    pme_lb->end      = 0;
 +    pme_lb->elimited = epmelblimNO;
 +
 +    *pme_lb_p = pme_lb;
 +}
 +
 +static gmx_bool pme_loadbal_increase_cutoff(pme_load_balancing_t pme_lb,
 +                                            int pme_order)
 +{
 +    pme_setup_t *set;
 +    real fac,sp;
 +    real tmpr_coulomb,tmpr_vdw;
 +    int d;
 +
 +    /* Try to add a new setup with next larger cut-off to the list */
 +    pme_lb->n++;
 +    srenew(pme_lb->setup,pme_lb->n);
 +    set = &pme_lb->setup[pme_lb->n-1];
 +    set->pmedata = NULL;
 +
 +    fac = 1;
 +    do
 +    {
 +        fac *= 1.01;
 +        clear_ivec(set->grid);
 +        sp = calc_grid(NULL,pme_lb->box_start,
 +                       fac*pme_lb->setup[pme_lb->cur].spacing,
 +                       &set->grid[XX],
 +                       &set->grid[YY],
 +                       &set->grid[ZZ]);
 +
 +        /* In parallel we can't have grids smaller than 2*pme_order,
 +         * and we would anyhow not gain much speed at these grid sizes.
 +         */
 +        for(d=0; d<DIM; d++)
 +        {
 +            if (set->grid[d] <= 2*pme_order)
 +            {
 +                pme_lb->n--;
 +
 +                return FALSE;
 +            }
 +        }
 +    }
 +    while (sp <= 1.001*pme_lb->setup[pme_lb->cur].spacing);
 +
 +    set->rcut_coulomb = pme_lb->cut_spacing*sp;
 +
 +    if(pme_lb->cutoff_scheme == ecutsVERLET)
 +    {
 +        set->rlist        = set->rcut_coulomb + pme_lb->rbuf_coulomb;
 +        /* We don't use LR lists with Verlet, but this avoids if-statements in further checks */
 +        set->rlistlong    = set->rlist;
 +    }
 +    else
 +    {
 +        tmpr_coulomb          = set->rcut_coulomb + pme_lb->rbuf_coulomb;
 +        tmpr_vdw              = pme_lb->rcut_vdw + pme_lb->rbuf_vdw;
 +        set->rlist            = min(tmpr_coulomb,tmpr_vdw);
 +        set->rlistlong        = max(tmpr_coulomb,tmpr_vdw);
 +        
 +        /* Set the long-range update frequency */
 +        if(set->rlist == set->rlistlong)
 +        {
 +            /* No long-range interactions if the short-/long-range cutoffs are identical */
 +            set->nstcalclr = 0;
 +        }
 +        else if(pme_lb->nstcalclr_start==0 || pme_lb->nstcalclr_start==1)
 +        {
 +            /* We were not doing long-range before, but now we are since rlist!=rlistlong */
 +            set->nstcalclr = 1;
 +        }
 +        else
 +        {
 +            /* We were already doing long-range interactions from the start */
 +            if(pme_lb->rcut_vdw > pme_lb->rcut_coulomb_start)
 +            {
 +                /* We were originally doing long-range VdW-only interactions.
 +                 * If rvdw is still longer than rcoulomb we keep the original nstcalclr,
 +                 * but if the coulomb cutoff has become longer we should update the long-range
 +                 * part every step.
 +                 */
 +                set->nstcalclr = (tmpr_vdw > tmpr_coulomb) ? pme_lb->nstcalclr_start : 1;
 +            }
 +            else
 +            {
 +                /* We were not doing any long-range interaction from the start,
 +                 * since it is not possible to do twin-range coulomb for the PME interaction.
 +                 */
 +                set->nstcalclr = 1;
 +            }
 +        }
 +    }
 +    
 +    set->spacing      = sp;
 +    /* The grid efficiency is the size wrt a grid with uniform x/y/z spacing */
 +    set->grid_efficiency = 1;
 +    for(d=0; d<DIM; d++)
 +    {
 +        set->grid_efficiency *= (set->grid[d]*sp)/norm(pme_lb->box_start[d]);
 +    }
 +    /* The Ewald coefficient is inversely proportional to the cut-off */
 +    set->ewaldcoeff =
 +        pme_lb->setup[0].ewaldcoeff*pme_lb->setup[0].rcut_coulomb/set->rcut_coulomb;
 +
 +    set->count   = 0;
 +    set->cycles  = 0;
 +
 +    if (debug)
 +    {
 +        fprintf(debug,"PME loadbal: grid %d %d %d, coulomb cutoff %f\n",
 +                set->grid[XX],set->grid[YY],set->grid[ZZ],set->rcut_coulomb);
 +    }
 +    return TRUE;
 +}
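 +
 +/* Sketch (illustrative only; the helper name is made up): the rescaling at
 + * the end of the function above keeps the product ewaldcoeff*rcut_coulomb
 + * constant across setups, so erfc(beta*rc), which controls the real-space
 + * truncation error, stays the same. E.g. beta = 3.12 nm^-1 at rc = 1.0 nm
 + * becomes beta = 2.6 nm^-1 at rc = 1.2 nm.
 + */
 +static real pme_lb_scaled_ewaldcoeff(real beta0, real rc0, real rc)
 +{
 +    return beta0*rc0/rc;
 +}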
 +
 +static void print_grid(FILE *fp_err,FILE *fp_log,
 +                       const char *pre,
 +                       const char *desc,
 +                       const pme_setup_t *set,
 +                       double cycles)
 +{
 +    char buf[STRLEN],buft[STRLEN];
 +
 +    if (cycles >= 0)
 +    {
 +        sprintf(buft,": %.1f M-cycles",cycles*1e-6);
 +    }
 +    else
 +    {
 +        buft[0] = '\0';
 +    }
 +    sprintf(buf,"%-11s%10s pme grid %d %d %d, coulomb cutoff %.3f%s",
 +            pre,
 +            desc,set->grid[XX],set->grid[YY],set->grid[ZZ],set->rcut_coulomb,
 +            buft);
 +    if (fp_err != NULL)
 +    {
 +        fprintf(fp_err,"\r%s\n",buf);
 +    }
 +    if (fp_log != NULL)
 +    {
 +        fprintf(fp_log,"%s\n",buf);
 +    }
 +}
 +
 +static int pme_loadbal_end(pme_load_balancing_t pme_lb)
 +{
 +    /* In the initial stage only n is set; end is not set yet */
 +    if (pme_lb->end > 0)
 +    {
 +        return pme_lb->end;
 +    }
 +    else
 +    {
 +        return pme_lb->n;
 +    }
 +}
 +
 +static void print_loadbal_limited(FILE *fp_err,FILE *fp_log,
 +                                  gmx_large_int_t step,
 +                                  pme_load_balancing_t pme_lb)
 +{
 +    char buf[STRLEN],sbuf[22];
 +
 +    sprintf(buf,"step %4s: the %s limited the PME load balancing to a coulomb cut-off of %.3f",
 +            gmx_step_str(step,sbuf),
 +            pmelblim_str[pme_lb->elimited],
 +            pme_lb->setup[pme_loadbal_end(pme_lb)-1].rcut_coulomb);
 +    if (fp_err != NULL)
 +    {
 +        fprintf(fp_err,"\r%s\n",buf);
 +    }
 +    if (fp_log != NULL)
 +    {
 +        fprintf(fp_log,"%s\n",buf);
 +    }
 +}
 +
 +static void switch_to_stage1(pme_load_balancing_t pme_lb)
 +{
 +    pme_lb->start = 0;
 +    while (pme_lb->start+1 < pme_lb->n &&
 +           (pme_lb->setup[pme_lb->start].count == 0 ||
 +            pme_lb->setup[pme_lb->start].cycles >
 +            pme_lb->setup[pme_lb->fastest].cycles*PME_LB_SLOW_FAC))
 +    {
 +        pme_lb->start++;
 +    }
 +    while (pme_lb->start > 0 && pme_lb->setup[pme_lb->start-1].cycles == 0)
 +    {
 +        pme_lb->start--;
 +    }
 +
 +    pme_lb->end = pme_lb->n;
 +    if (pme_lb->setup[pme_lb->end-1].count > 0 &&
 +        pme_lb->setup[pme_lb->end-1].cycles >
 +        pme_lb->setup[pme_lb->fastest].cycles*PME_LB_SLOW_FAC)
 +    {
 +        pme_lb->end--;
 +    }
 +
 +    pme_lb->stage = 1;
 +
 +    /* Next we want to choose setup pme_lb->start, but as we will increase
 +     * pme_lb->cur by one right after returning, we subtract 1 here.
 +     */
 +    pme_lb->cur = pme_lb->start - 1;
 +}
 +
 +gmx_bool pme_load_balance(pme_load_balancing_t pme_lb,
 +                          t_commrec *cr,
 +                          FILE *fp_err,
 +                          FILE *fp_log,
 +                          t_inputrec *ir,
 +                          t_state *state,
 +                          double cycles,
 +                          interaction_const_t *ic,
 +                          nonbonded_verlet_t *nbv,
 +                          gmx_pme_t *pmedata,
 +                          gmx_large_int_t step)
 +{
 +    gmx_bool OK;
 +    pme_setup_t *set;
 +    double cycles_fast;
 +    char buf[STRLEN],sbuf[22];
 +    real rtab;
 +    gmx_bool bUsesSimpleTables = TRUE;
 +
 +    if (pme_lb->stage == pme_lb->nstage)
 +    {
 +        return FALSE;
 +    }
 +
 +    if (PAR(cr))
 +    {
 +        gmx_sumd(1,&cycles,cr);
 +        cycles /= cr->nnodes;
 +    }
 +
 +    set = &pme_lb->setup[pme_lb->cur];
 +    set->count++;
 +
 +    rtab = ir->rlistlong + ir->tabext;
 +
 +    if (set->count % 2 == 1)
 +    {
 +        /* Skip the first cycle, because the first step after a switch
 +         * is much slower due to allocation and/or caching effects.
 +         */
 +        return TRUE;
 +    }
 +
 +    sprintf(buf, "step %4s: ", gmx_step_str(step,sbuf));
 +    print_grid(fp_err,fp_log,buf,"timed with",set,cycles);
 +
 +    if (set->count <= 2)
 +    {
 +        set->cycles = cycles;
 +    }
 +    else
 +    {
 +        if (cycles*PME_LB_ACCEL_TOL < set->cycles &&
 +            pme_lb->stage == pme_lb->nstage - 1)
 +        {
 +            /* The performance went up a lot (due to e.g. DD load balancing).
 +             * Add a stage, keep the minima, but rescan all setups.
 +             */
 +            pme_lb->nstage++;
 +
 +            if (debug)
 +            {
 +                fprintf(debug,"The performance for grid %d %d %d went from %.3f to %.1f M-cycles, this is more than %f\n"
 +                        "Increased the number of stages to %d"
 +                        " and ignoring the previous performance\n",
 +                        set->grid[XX],set->grid[YY],set->grid[ZZ],
 +                        cycles*1e-6,set->cycles*1e-6,PME_LB_ACCEL_TOL,
 +                        pme_lb->nstage);
 +            }
 +        }
 +        set->cycles = min(set->cycles,cycles);
 +    }
 +
 +    if (set->cycles < pme_lb->setup[pme_lb->fastest].cycles)
 +    {
 +        pme_lb->fastest = pme_lb->cur;
++
++        if (DOMAINDECOMP(cr))
++        {
++            /* We found a new fastest setting, ensure that with subsequent
++             * shorter cut-offs the dynamic load balancing does not make
++             * the use of the current cut-off impossible. This solution is
++             * a trade-off, as the PME load balancing and DD domain size
++             * load balancing can interact in complex ways.
++             * With the Verlet kernels, DD load imbalance will usually be
++             * mainly due to bonded interaction imbalance, which will often
++             * quickly push the domain boundaries beyond the limit for the
++             * optimal, PME load balanced, cut-off. But it could be that
++             * better overall performance can be obtained with a slightly
++             * shorter cut-off and better DD load balancing.
++             */
++            change_dd_dlb_cutoff_limit(cr);
++        }
 +    }
 +    cycles_fast = pme_lb->setup[pme_lb->fastest].cycles;
 +
 +    /* Check in stage 0 if we should stop scanning grids.
 +     * Stop when the time is more than PME_LB_SLOW_FAC times that of the fastest.
 +     */
 +    if (pme_lb->stage == 0 && pme_lb->cur > 0 &&
 +        cycles > pme_lb->setup[pme_lb->fastest].cycles*PME_LB_SLOW_FAC)
 +    {
 +        pme_lb->n = pme_lb->cur + 1;
 +        /* Done with scanning, go to stage 1 */
 +        switch_to_stage1(pme_lb);
 +    }
 +
 +    if (pme_lb->stage == 0)
 +    {
 +        int gridsize_start;
 +
 +        gridsize_start = set->grid[XX]*set->grid[YY]*set->grid[ZZ];
 +
 +        do
 +        {
 +            if (pme_lb->cur+1 < pme_lb->n)
 +            {
 +                /* We had already generated the next setup */
 +                OK = TRUE;
 +            }
 +            else
 +            {
 +                /* Find the next setup */
 +                OK = pme_loadbal_increase_cutoff(pme_lb,ir->pme_order);
 +            }
 +
 +            if (OK && ir->ePBC != epbcNONE)
 +            {
 +                OK = (sqr(pme_lb->setup[pme_lb->cur+1].rlistlong)
 +                      <= max_cutoff2(ir->ePBC,state->box));
 +                if (!OK)
 +                {
 +                    pme_lb->elimited = epmelblimBOX;
 +                }
 +            }
 +
 +            if (OK)
 +            {
 +                pme_lb->cur++;
 +
 +                if (DOMAINDECOMP(cr))
 +                {
 +                    OK = change_dd_cutoff(cr,state,ir,
 +                                          pme_lb->setup[pme_lb->cur].rlistlong);
 +                    if (!OK)
 +                    {
 +                        /* Failed: do not use this setup */
 +                        pme_lb->cur--;
 +                        pme_lb->elimited = epmelblimDD;
 +                    }
 +                }
 +            }
 +            if (!OK)
 +            {
 +                /* We hit the upper limit for the cut-off,
 +                 * the setup should not go further than cur.
 +                 */
 +                pme_lb->n = pme_lb->cur + 1;
 +                print_loadbal_limited(fp_err,fp_log,step,pme_lb);
 +                /* Switch to the next stage */
 +                switch_to_stage1(pme_lb);
 +            }
 +        }
 +        while (OK &&
 +               !(pme_lb->setup[pme_lb->cur].grid[XX]*
 +                 pme_lb->setup[pme_lb->cur].grid[YY]*
 +                 pme_lb->setup[pme_lb->cur].grid[ZZ] <
 +                 gridsize_start*PME_LB_GRID_SCALE_FAC
 +                 &&
 +                 pme_lb->setup[pme_lb->cur].grid_efficiency <
 +                 pme_lb->setup[pme_lb->cur-1].grid_efficiency*PME_LB_GRID_EFFICIENCY_REL_FAC));
 +    }
 +
 +    if (pme_lb->stage > 0 && pme_lb->end == 1)
 +    {
 +        pme_lb->cur = 0;
 +        pme_lb->stage = pme_lb->nstage;
 +    }
 +    else if (pme_lb->stage > 0 && pme_lb->end > 1)
 +    {
 +        /* If stage = nstage-1:
 +         *   scan over all setups, rerunning only those setups
 +         *   which are not much slower than the fastest
 +         * else:
 +         *   use the next setup
 +         */
 +        do
 +        {
 +            pme_lb->cur++;
 +            if (pme_lb->cur == pme_lb->end)
 +            {
 +                pme_lb->stage++;
 +                pme_lb->cur = pme_lb->start;
 +            }
 +        }
 +        while (pme_lb->stage == pme_lb->nstage - 1 &&
 +               pme_lb->setup[pme_lb->cur].count > 0 &&
 +               pme_lb->setup[pme_lb->cur].cycles > cycles_fast*PME_LB_SLOW_FAC);
 +
 +        if (pme_lb->stage == pme_lb->nstage)
 +        {
 +            /* We are done optimizing, use the fastest setup we found */
 +            pme_lb->cur = pme_lb->fastest;
 +        }
 +    }
 +
 +    if (DOMAINDECOMP(cr) && pme_lb->stage > 0)
 +    {
 +        OK = change_dd_cutoff(cr,state,ir,pme_lb->setup[pme_lb->cur].rlistlong);
 +        if (!OK)
 +        {
 +            /* Failsafe solution */
 +            if (pme_lb->cur > 1 && pme_lb->stage == pme_lb->nstage)
 +            {
 +                pme_lb->stage--;
 +            }
 +            pme_lb->fastest  = 0;
 +            pme_lb->start    = 0;
 +            pme_lb->end      = pme_lb->cur;
 +            pme_lb->cur      = pme_lb->start;
 +            pme_lb->elimited = epmelblimDD;
 +            print_loadbal_limited(fp_err,fp_log,step,pme_lb);
 +        }
 +    }
 +
 +    /* Change the Coulomb cut-off and the PME grid */
 +
 +    set = &pme_lb->setup[pme_lb->cur];
 +
 +    ic->rcoulomb   = set->rcut_coulomb;
 +    ic->rlist      = set->rlist;
 +    ic->rlistlong  = set->rlistlong;
 +    ir->nstcalclr  = set->nstcalclr;
 +    ic->ewaldcoeff = set->ewaldcoeff;
 +
 +    bUsesSimpleTables = uses_simple_tables(ir->cutoff_scheme, nbv, 0);
 +    if (pme_lb->cutoff_scheme == ecutsVERLET &&
 +        nbv->grp[0].kernel_type == nbnxnk8x8x8_CUDA)
 +    {
 +        nbnxn_cuda_pme_loadbal_update_param(nbv->cu_nbv,ic);
 +    }
 +    else
 +    {
 +        init_interaction_const_tables(NULL,ic,bUsesSimpleTables,
 +                                      rtab);
 +    }
 +
 +    if (pme_lb->cutoff_scheme == ecutsVERLET && nbv->ngrp > 1)
 +    {
 +        init_interaction_const_tables(NULL,ic,bUsesSimpleTables,
 +                                      rtab);
 +    }
 +
 +    if (cr->duty & DUTY_PME)
 +    {
 +        if (pme_lb->setup[pme_lb->cur].pmedata == NULL)
 +        {
 +            /* Generate a new PME data structure,
 +             * copying part of the old pointers.
 +             */
 +            gmx_pme_reinit(&set->pmedata,
 +                           cr,pme_lb->setup[0].pmedata,ir,
 +                           set->grid);
 +        }
 +        *pmedata = set->pmedata;
 +    }
 +    else
 +    {
 +        /* Tell our PME-only node to switch grid */
 +        gmx_pme_send_switch(cr, set->grid, set->ewaldcoeff);
 +    }
 +
 +    if (debug)
 +    {
 +        print_grid(NULL,debug,"","switched to",set,-1);
 +    }
 +
 +    if (pme_lb->stage == pme_lb->nstage)
 +    {
 +        print_grid(fp_err,fp_log,"","optimal",set,-1);
 +    }
 +
 +    return TRUE;
 +}
 +
 +void restart_pme_loadbal(pme_load_balancing_t pme_lb, int n)
 +{
 +    pme_lb->nstage += n;
 +}
 +
 +static int pme_grid_points(const pme_setup_t *setup)
 +{
 +    return setup->grid[XX]*setup->grid[YY]*setup->grid[ZZ];
 +}
 +
 +static void print_pme_loadbal_setting(FILE *fplog,
 +                                     char *name,
 +                                     const pme_setup_t *setup)
 +{
 +    fprintf(fplog,
 +            "   %-7s %6.3f nm %6.3f nm     %3d %3d %3d   %5.3f nm  %5.3f nm\n",
 +            name,
 +            setup->rcut_coulomb,setup->rlist,
 +            setup->grid[XX],setup->grid[YY],setup->grid[ZZ],
 +            setup->spacing,1/setup->ewaldcoeff);
 +}
 +
 +static void print_pme_loadbal_settings(pme_load_balancing_t pme_lb,
 +                                       FILE *fplog)
 +{
 +    double pp_ratio,grid_ratio;
 +
 +    pp_ratio   = pow(pme_lb->setup[pme_lb->cur].rlist/pme_lb->setup[0].rlistlong,3.0);
 +    grid_ratio = pme_grid_points(&pme_lb->setup[pme_lb->cur])/
 +        (double)pme_grid_points(&pme_lb->setup[0]);
 +
 +    fprintf(fplog,"\n");
 +    fprintf(fplog,"       P P   -   P M E   L O A D   B A L A N C I N G\n");
 +    fprintf(fplog,"\n");
 +    /* Here we only warn when the optimal setting is the last one */
 +    if (pme_lb->elimited != epmelblimNO &&
 +        pme_lb->cur == pme_loadbal_end(pme_lb)-1)
 +    {
 +        fprintf(fplog," NOTE: The PP/PME load balancing was limited by the %s,\n",
 +                pmelblim_str[pme_lb->elimited]);
 +        fprintf(fplog,"       you might not have reached a good load balance.\n");
 +        if (pme_lb->elimited == epmelblimDD)
 +        {
 +            fprintf(fplog,"       Try different mdrun -dd settings or lower the -dds value.\n");
 +        }
 +        fprintf(fplog,"\n");
 +    }
 +    fprintf(fplog," PP/PME load balancing changed the cut-off and PME settings:\n");
 +    fprintf(fplog,"           particle-particle                    PME\n");
 +    fprintf(fplog,"            rcoulomb  rlist            grid      spacing   1/beta\n");
 +    print_pme_loadbal_setting(fplog,"initial",&pme_lb->setup[0]);
 +    print_pme_loadbal_setting(fplog,"final"  ,&pme_lb->setup[pme_lb->cur]);
 +    fprintf(fplog," cost-ratio           %4.2f             %4.2f\n",
 +            pp_ratio,grid_ratio);
 +    fprintf(fplog," (note that these numbers concern only part of the total PP and PME load)\n");
 +    fprintf(fplog,"\n");
 +}
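 +
 +/* Sketch (illustrative only, not part of this change): the particle-particle
 + * cost ratio printed above scales with the pair-list volume, i.e. the cube
 + * of the cut-off ratio, which is what the pow(...,3.0) call computes.
 + */
 +static double pme_lb_pp_cost_ratio(double rlist_final, double rlist_initial)
 +{
 +    return pow(rlist_final/rlist_initial, 3.0);
 +}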
 +
 +void pme_loadbal_done(pme_load_balancing_t pme_lb, FILE *fplog)
 +{
 +    if (fplog != NULL && (pme_lb->cur > 0 || pme_lb->elimited != epmelblimNO))
 +    {
 +        print_pme_loadbal_settings(pme_lb,fplog);
 +    }
 +
 +    /* TODO: Here we should free all pointers in pme_lb,
 +     * but as it contains pme data structures,
 +     * we need to first make pme.c free all data.
 +     */
 +}
index fbde22b6e9ac4819f6cd88d392362d13539692ad,0000000000000000000000000000000000000000..a188b8f8ee65fb206936b68e86d2d42e7244b171
mode 100644,000000..100644
--- /dev/null
@@@ -1,1427 -1,0 +1,1427 @@@
-     check_multi_int(fplog,ms,state->natoms,"the number of atoms");
-     check_multi_int(fplog,ms,ir->eI,"the integrator");
-     check_multi_large_int(fplog,ms,ir->init_step+ir->nsteps,"init_step+nsteps");
 +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
 + *
 + * 
 + *                This source code is part of
 + * 
 + *                 G   R   O   M   A   C   S
 + * 
 + *          GROningen MAchine for Chemical Simulations
 + * 
 + *                        VERSION 3.2.0
 + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
 + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 + * Copyright (c) 2001-2004, The GROMACS development team,
 + * check out http://www.gromacs.org for more information.
 +
 + * This program is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU General Public License
 + * as published by the Free Software Foundation; either version 2
 + * of the License, or (at your option) any later version.
 + * 
 + * If you want to redistribute modifications, please consider that
 + * scientific software is very special. Version control is crucial -
 + * bugs must be traceable. We will be happy to consider code for
 + * inclusion in the official distribution, but derived work must not
 + * be called official GROMACS. Details are found in the README & COPYING
 + * files - if they are missing, get the official version at www.gromacs.org.
 + * 
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the papers on the package - you can find them in the top README file.
 + * 
 + * For more info, check our website at http://www.gromacs.org
 + * 
 + * And Hey:
 + * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon
 + */
 +#ifdef HAVE_CONFIG_H
 +#include <config.h>
 +#endif
 +
 +#include <math.h>
 +#include "repl_ex.h"
 +#include "network.h"
 +#include "random.h"
 +#include "smalloc.h"
 +#include "physics.h"
 +#include "copyrite.h"
 +#include "macros.h"
 +#include "vec.h"
 +#include "names.h"
 +#include "mvdata.h"
 +#include "domdec.h"
 +#include "partdec.h"
 +
 +#define PROBABILITYCUTOFF 100
 +/* we don't bother evaluating if events are rarer than exp(-100) = 3.7x10^-44 */
 +
 +enum { ereTEMP, ereLAMBDA, ereENDSINGLE ,ereTL, ereNR };
 +const char *erename[ereNR] = { "temperature", "lambda", "end_single_marker", "temperature and lambda"};
 +/* end_single_marker merely notes the end of single-variable replica exchange. All types higher than
 +   it are multiple-variable replica exchange methods */
 +/* Eventually, should add 'pressure', 'temperature and pressure', 'lambda_and_pressure', 'temperature_lambda_pressure'?;
 +   Let's wait until we feel better about the pressure control methods giving exact ensembles.  Right now, we assume constant pressure  */
 +
 +typedef struct gmx_repl_ex
 +{
 +    int  repl;
 +    int  nrepl;
 +    real temp;
 +    int  type;
 +    real **q;
 +    gmx_bool bNPT;
 +    real *pres;
 +    int  *ind;
 +    int *allswaps;
 +    int  nst;
 +    int nex;
 +    int  seed;
 +    int  nattempt[2];
 +    real *prob_sum;
 +    int  **nmoves;
 +    int  *nexchange;
 +
 +    /* these are helper arrays for replica exchange; allocated here so they
 +       don't have to be allocated each time */
 +    int *destinations;
 +    int **cyclic;
 +    int **order;
 +    int *tmpswap;
 +    gmx_bool *incycle;
 +    gmx_bool *bEx;
 +
 +    /* helper arrays to hold the quantities that are exchanged */
 +    real *prob;
 +    real *Epot;
 +    real *beta;
 +    real *Vol;
 +    real **de;
 +
 +} t_gmx_repl_ex;
 +
 +static gmx_bool repl_quantity(FILE *fplog,const gmx_multisim_t *ms,
 +                              struct gmx_repl_ex *re,int ere,real q)
 +{
 +    real *qall;
 +    gmx_bool bDiff;
 +    int  i,s;
 +
 +    snew(qall,ms->nsim);
 +    qall[re->repl] = q;
 +    gmx_sum_sim(ms->nsim,qall,ms);
 +
 +    bDiff = FALSE;
 +    for (s=1; s<ms->nsim; s++)
 +    {
 +        if (qall[s] != qall[0])
 +        {
 +            bDiff = TRUE;   
 +        }
 +    }
 +
 +    if (bDiff)
 +    {
 +        /* Set the replica exchange type and quantities */
 +        re->type = ere;
 +
 +        snew(re->q[ere],re->nrepl);
 +        for(s=0; s<ms->nsim; s++)
 +        {
 +            re->q[ere][s] = qall[s];
 +        }
 +    }
 +    sfree(qall);
 +    return bDiff;
 +}
 +
 +gmx_repl_ex_t init_replica_exchange(FILE *fplog,
 +                                    const gmx_multisim_t *ms,
 +                                    const t_state *state,
 +                                    const t_inputrec *ir,
 +                                    int nst, int nex, int init_seed)
 +{
 +    real temp,pres;
 +    int  i,j,k;
 +    struct gmx_repl_ex *re;
 +    gmx_bool bTemp;
 +    gmx_bool bLambda=FALSE;
 +
 +    fprintf(fplog,"\nInitializing Replica Exchange\n");
 +
 +    if (ms == NULL || ms->nsim == 1)
 +    {
 +        gmx_fatal(FARGS,"Nothing to exchange with only one replica, maybe you forgot to set the -multi option of mdrun?");
 +    }
 +
 +    snew(re,1);
 +
 +    re->repl     = ms->sim;
 +    re->nrepl    = ms->nsim;
 +    snew(re->q,ereENDSINGLE);
 +
 +    fprintf(fplog,"Repl  There are %d replicas:\n",re->nrepl);
 +
-                           "first exchange step: init_step/-replex");
-     check_multi_int(fplog,ms,ir->etc,"the temperature coupling");
++    check_multi_int(fplog,ms,state->natoms,"the number of atoms",FALSE);
++    check_multi_int(fplog,ms,ir->eI,"the integrator",FALSE);
++    check_multi_large_int(fplog,ms,ir->init_step+ir->nsteps,"init_step+nsteps",FALSE);
 +    check_multi_large_int(fplog,ms,(ir->init_step+nst-1)/nst,
-                     "the number of temperature coupling groups");
-     check_multi_int(fplog,ms,ir->epc,"the pressure coupling");
-     check_multi_int(fplog,ms,ir->efep,"free energy");
-     check_multi_int(fplog,ms,ir->fepvals->n_lambda,"number of lambda states");
++                          "first exchange step: init_step/-replex",FALSE);
++    check_multi_int(fplog,ms,ir->etc,"the temperature coupling",FALSE);
 +    check_multi_int(fplog,ms,ir->opts.ngtc,
++                    "the number of temperature coupling groups",FALSE);
++    check_multi_int(fplog,ms,ir->epc,"the pressure coupling",FALSE);
++    check_multi_int(fplog,ms,ir->efep,"free energy",FALSE);
++    check_multi_int(fplog,ms,ir->fepvals->n_lambda,"number of lambda states",FALSE);
 +
 +    re->temp = ir->opts.ref_t[0];
 +    for(i=1; (i<ir->opts.ngtc); i++)
 +    {
 +        if (ir->opts.ref_t[i] != re->temp)
 +        {
 +            fprintf(fplog,"\nWARNING: The temperatures of the different temperature coupling groups are not identical\n\n");
 +            fprintf(stderr,"\nWARNING: The temperatures of the different temperature coupling groups are not identical\n\n");
 +        }
 +    }
 +
 +    re->type = -1;
 +    bTemp = repl_quantity(fplog,ms,re,ereTEMP,re->temp);
 +    if (ir->efep != efepNO)
 +    {
 +        bLambda = repl_quantity(fplog,ms,re,ereLAMBDA,(real)ir->fepvals->init_fep_state);
 +    }
 +    if (re->type == -1)  /* nothing was assigned */
 +    {
 +        gmx_fatal(FARGS,"The properties of the %d systems are all the same, there is nothing to exchange",re->nrepl);
 +    }
 +    if (bLambda && bTemp) {
 +        re->type = ereTL;
 +    }
 +
 +    if (bTemp)
 +    {
 +        please_cite(fplog,"Sugita1999a");
 +        if (ir->epc != epcNO)
 +        {
 +            re->bNPT = TRUE;
 +            fprintf(fplog,"Repl  Using Constant Pressure REMD.\n");
 +            please_cite(fplog,"Okabe2001a");
 +        }
 +        if (ir->etc == etcBERENDSEN)
 +        {
 +            gmx_fatal(FARGS,"REMD with the %s thermostat does not produce correct potential energy distributions, consider using the %s thermostat instead",
 +                      ETCOUPLTYPE(ir->etc),ETCOUPLTYPE(etcVRESCALE));
 +        }
 +    }
 +    if (bLambda) {
 +        if (ir->fepvals->delta_lambda != 0)   /* check this? */
 +        {
 +            gmx_fatal(FARGS,"delta_lambda is not zero");
 +        }
 +    }
 +    if (re->bNPT)
 +    {
 +        snew(re->pres,re->nrepl);
 +        if (ir->epct == epctSURFACETENSION)
 +        {
 +            pres = ir->ref_p[ZZ][ZZ];
 +        }
 +        else
 +        {
 +            pres = 0;
 +            j = 0;
 +            for(i=0; i<DIM; i++)
 +            {
 +                if (ir->compress[i][i] != 0)
 +                {
 +                    pres += ir->ref_p[i][i];
 +                    j++;
 +                }
 +            }
 +            pres /= j;
 +        }
 +        re->pres[re->repl] = pres;
 +        gmx_sum_sim(re->nrepl,re->pres,ms);
 +    }
 +
 +    /* Make an index for increasing replica order */
 +    /* only makes sense if one or the other is varying, not both!
 +       if both are varying, we trust the order the person gave. */
 +    snew(re->ind,re->nrepl);
 +    for(i=0; i<re->nrepl; i++)
 +    {
 +        re->ind[i] = i;
 +    }
 +
 +    if (re->type<ereENDSINGLE) {
 +
 +        for(i=0; i<re->nrepl; i++)
 +        {
 +            for(j=i+1; j<re->nrepl; j++)
 +            {
 +                if (re->q[re->type][re->ind[j]] < re->q[re->type][re->ind[i]])
 +                {
 +                    k = re->ind[i];
 +                    re->ind[i] = re->ind[j];
 +                    re->ind[j] = k;
 +                }
 +                else if (re->q[re->type][re->ind[j]] == re->q[re->type][re->ind[i]])
 +                {
 +                    gmx_fatal(FARGS,"Two replicas have identical %ss",erename[re->type]);
 +                }
 +            }
 +        }
 +    }
 +
 +    /* keep track of all the swaps, starting with the initial placement. */
 +    snew(re->allswaps,re->nrepl);
 +    for(i=0; i<re->nrepl; i++)
 +    {
 +        re->allswaps[i] = re->ind[i];
 +    }
 +
 +    switch (re->type)
 +    {
 +    case ereTEMP:
 +        fprintf(fplog,"\nReplica exchange in temperature\n");
 +        for(i=0; i<re->nrepl; i++)
 +        {
 +            fprintf(fplog," %5.1f",re->q[re->type][re->ind[i]]);
 +        }
 +        fprintf(fplog,"\n");
 +        break;
 +    case ereLAMBDA:
 +        fprintf(fplog,"\nReplica exchange in lambda\n");
 +        for(i=0; i<re->nrepl; i++)
 +        {
 +            fprintf(fplog," %3d",(int)re->q[re->type][re->ind[i]]);
 +        }
 +        fprintf(fplog,"\n");
 +        break;
 +    case ereTL:
 +        fprintf(fplog,"\nReplica exchange in temperature and lambda state\n");
 +        for(i=0; i<re->nrepl; i++)
 +        {
 +            fprintf(fplog," %5.1f",re->q[ereTEMP][re->ind[i]]);
 +        }
 +        fprintf(fplog,"\n");
 +        for(i=0; i<re->nrepl; i++)
 +        {
 +            fprintf(fplog," %5d",(int)re->q[ereLAMBDA][re->ind[i]]);
 +        }
 +        fprintf(fplog,"\n");
 +        break;
 +    default:
 +        gmx_incons("Unknown replica exchange quantity");
 +    }
 +    if (re->bNPT)
 +    {
 +        fprintf(fplog,"\nRepl  p");
 +        for(i=0; i<re->nrepl; i++)
 +        {
 +            fprintf(fplog," %5.2f",re->pres[re->ind[i]]);
 +        }
 +
 +        for(i=0; i<re->nrepl; i++)
 +        {
 +            if ((i > 0) && (re->pres[re->ind[i]] < re->pres[re->ind[i-1]]))
 +            {
 +                fprintf(fplog,"\nWARNING: The reference pressures decrease with increasing temperatures\n\n");
 +                fprintf(stderr,"\nWARNING: The reference pressures decrease with increasing temperatures\n\n");
 +            }
 +        }
 +    }
 +    re->nst = nst;
 +    if (init_seed == -1)
 +    {
 +        if (MASTERSIM(ms))
 +        {
 +            re->seed = make_seed();
 +        }
 +        else
 +        {
 +            re->seed = 0;
 +        }
 +        gmx_sumi_sim(1,&(re->seed),ms);
 +    }
 +    else
 +    {
 +        re->seed = init_seed;
 +    }
 +    fprintf(fplog,"\nReplica exchange interval: %d\n",re->nst);
 +    fprintf(fplog,"\nReplica random seed: %d\n",re->seed);
 +
 +    re->nattempt[0] = 0;
 +    re->nattempt[1] = 0;
 +
 +    snew(re->prob_sum,re->nrepl);
 +    snew(re->nexchange,re->nrepl);
 +    snew(re->nmoves,re->nrepl);
 +    for (i=0;i<re->nrepl;i++) 
 +    {
 +        snew(re->nmoves[i],re->nrepl);
 +    }
 +    fprintf(fplog,"Replica exchange information below: x=exchange, pr=probability\n");
 +
 +    /* generate space for the helper arrays so we don't have to snew each time */
 +
 +    snew(re->destinations,re->nrepl);
 +    snew(re->incycle,re->nrepl);
 +    snew(re->tmpswap,re->nrepl);
 +    snew(re->cyclic,re->nrepl);
 +    snew(re->order,re->nrepl);
 +    for (i=0;i<re->nrepl;i++)
 +    {
 +        snew(re->cyclic[i],re->nrepl);
 +        snew(re->order[i],re->nrepl);
 +    }
 +    /* allocate space for the arrays storing the data for the replicas */
 +    /* not all of these arrays needed in all cases, but they don't take
 +       up much space, since the max size is nrepl**2 */
 +    snew(re->prob,re->nrepl);
 +    snew(re->bEx,re->nrepl);
 +    snew(re->beta,re->nrepl);
 +    snew(re->Vol,re->nrepl);
 +    snew(re->Epot,re->nrepl);
 +    snew(re->de,re->nrepl);
 +    for (i=0;i<re->nrepl;i++)
 +    {
 +        snew(re->de[i],re->nrepl);
 +    }
 +    re->nex = nex;
 +    return re;
 +}
 +
 +static void exchange_reals(const gmx_multisim_t *ms,int b,real *v,int n)
 +{
 +    real *buf;
 +    int  i;
 +
 +    if (v)
 +    {
 +        snew(buf,n);
 +#ifdef GMX_MPI
 +        /*
 +          MPI_Sendrecv(v,  n*sizeof(real),MPI_BYTE,MSRANK(ms,b),0,
 +          buf,n*sizeof(real),MPI_BYTE,MSRANK(ms,b),0,
 +          ms->mpi_comm_masters,MPI_STATUS_IGNORE);
 +        */
 +        {
 +            MPI_Request mpi_req;
 +
 +            MPI_Isend(v,n*sizeof(real),MPI_BYTE,MSRANK(ms,b),0,
 +                      ms->mpi_comm_masters,&mpi_req);
 +            MPI_Recv(buf,n*sizeof(real),MPI_BYTE,MSRANK(ms,b),0,
 +                     ms->mpi_comm_masters,MPI_STATUS_IGNORE);
 +            MPI_Wait(&mpi_req,MPI_STATUS_IGNORE);
 +        }
 +#endif
 +        for(i=0; i<n; i++)
 +        {
 +            v[i] = buf[i];
 +        }
 +        sfree(buf);
 +    }
 +}
 +
 +
 +static void exchange_ints(const gmx_multisim_t *ms,int b,int *v,int n)
 +{
 +  int *buf;
 +  int  i;
 +
 +  if (v) {
 +    snew(buf,n);
 +#ifdef GMX_MPI
 +    /*
 +    MPI_Sendrecv(v,  n*sizeof(int),MPI_BYTE,MSRANK(ms,b),0,
 +               buf,n*sizeof(int),MPI_BYTE,MSRANK(ms,b),0,
 +               ms->mpi_comm_masters,MPI_STATUS_IGNORE);
 +    */
 +    {
 +      MPI_Request mpi_req;
 +
 +      MPI_Isend(v,n*sizeof(int),MPI_BYTE,MSRANK(ms,b),0,
 +              ms->mpi_comm_masters,&mpi_req);
 +      MPI_Recv(buf,n*sizeof(int),MPI_BYTE,MSRANK(ms,b),0,
 +             ms->mpi_comm_masters,MPI_STATUS_IGNORE);
 +      MPI_Wait(&mpi_req,MPI_STATUS_IGNORE);
 +    }
 +#endif
 +    for(i=0; i<n; i++) 
 +    {
 +        v[i] = buf[i];
 +    }
 +    sfree(buf);
 +  }
 +}
 +
 +static void exchange_doubles(const gmx_multisim_t *ms,int b,double *v,int n)
 +{
 +    double *buf;
 +    int  i;
 +
 +    if (v)
 +    {
 +        snew(buf,n);
 +#ifdef GMX_MPI
 +        /*
 +          MPI_Sendrecv(v,  n*sizeof(double),MPI_BYTE,MSRANK(ms,b),0,
 +          buf,n*sizeof(double),MPI_BYTE,MSRANK(ms,b),0,
 +          ms->mpi_comm_masters,MPI_STATUS_IGNORE);
 +        */
 +        {
 +            MPI_Request mpi_req;
 +
 +            MPI_Isend(v,n*sizeof(double),MPI_BYTE,MSRANK(ms,b),0,
 +                      ms->mpi_comm_masters,&mpi_req);
 +            MPI_Recv(buf,n*sizeof(double),MPI_BYTE,MSRANK(ms,b),0,
 +                     ms->mpi_comm_masters,MPI_STATUS_IGNORE);
 +            MPI_Wait(&mpi_req,MPI_STATUS_IGNORE);
 +        }
 +#endif
 +        for(i=0; i<n; i++)
 +        {
 +            v[i] = buf[i];
 +        }
 +        sfree(buf);
 +    }
 +}
 +
 +static void exchange_rvecs(const gmx_multisim_t *ms,int b,rvec *v,int n)
 +{
 +    rvec *buf;
 +    int  i;
 +  
 +    if (v)
 +    {
 +        snew(buf,n);
 +#ifdef GMX_MPI
 +        /*
 +          MPI_Sendrecv(v[0],  n*sizeof(rvec),MPI_BYTE,MSRANK(ms,b),0,
 +          buf[0],n*sizeof(rvec),MPI_BYTE,MSRANK(ms,b),0,
 +          ms->mpi_comm_masters,MPI_STATUS_IGNORE);
 +        */
 +        {
 +            MPI_Request mpi_req;
 +
 +            MPI_Isend(v[0],n*sizeof(rvec),MPI_BYTE,MSRANK(ms,b),0,
 +                      ms->mpi_comm_masters,&mpi_req);
 +            MPI_Recv(buf[0],n*sizeof(rvec),MPI_BYTE,MSRANK(ms,b),0,
 +                     ms->mpi_comm_masters,MPI_STATUS_IGNORE);
 +            MPI_Wait(&mpi_req,MPI_STATUS_IGNORE);
 +        }
 +#endif
 +        for(i=0; i<n; i++)
 +        {
 +            copy_rvec(buf[i],v[i]);
 +        }
 +        sfree(buf);
 +    }
 +}
 +
 +static void exchange_state(const gmx_multisim_t *ms,int b,t_state *state)
 +{
 +    /* When t_state changes, this code should be updated. */
 +    int ngtc,nnhpres;
 +    ngtc = state->ngtc * state->nhchainlength;
 +    nnhpres = state->nnhpres* state->nhchainlength;
 +    exchange_rvecs(ms,b,state->box,DIM);
 +    exchange_rvecs(ms,b,state->box_rel,DIM);
 +    exchange_rvecs(ms,b,state->boxv,DIM);
 +    exchange_reals(ms,b,&(state->veta),1);
 +    exchange_reals(ms,b,&(state->vol0),1);
 +    exchange_rvecs(ms,b,state->svir_prev,DIM);
 +    exchange_rvecs(ms,b,state->fvir_prev,DIM);
 +    exchange_rvecs(ms,b,state->pres_prev,DIM);
 +    exchange_doubles(ms,b,state->nosehoover_xi,ngtc);
 +    exchange_doubles(ms,b,state->nosehoover_vxi,ngtc);
 +    exchange_doubles(ms,b,state->nhpres_xi,nnhpres);
 +    exchange_doubles(ms,b,state->nhpres_vxi,nnhpres);
 +    exchange_doubles(ms,b,state->therm_integral,state->ngtc);
 +    exchange_rvecs(ms,b,state->x,state->natoms);
 +    exchange_rvecs(ms,b,state->v,state->natoms);
 +    exchange_rvecs(ms,b,state->sd_X,state->natoms);
 +}
 +
 +static void copy_rvecs(rvec *s,rvec *d,int n)
 +{
 +    int i;
 +
 +    if (d != NULL)
 +    {
 +        for(i=0; i<n; i++)
 +        {
 +            copy_rvec(s[i],d[i]);
 +        }
 +    }
 +}
 +
 +static void copy_doubles(const double *s,double *d,int n)
 +{
 +    int i;
 +
 +    if (d != NULL)
 +    {
 +        for(i=0; i<n; i++)
 +        {
 +            d[i] = s[i];
 +        }
 +    }
 +}
 +
 +static void copy_reals(const real *s,real *d,int n)
 +{
 +    int i;
 +
 +    if (d != NULL)
 +    {
 +        for(i=0; i<n; i++)
 +        {
 +            d[i] = s[i];
 +        }
 +    }
 +}
 +
 +static void copy_ints(const int *s,int *d,int n)
 +{
 +    int i;
 +
 +    if (d != NULL)
 +    {
 +        for(i=0; i<n; i++)
 +        {
 +            d[i] = s[i];
 +        }
 +    }
 +}
 +
 +#define scopy_rvecs(v,n)   copy_rvecs(state->v,state_local->v,n);
 +#define scopy_doubles(v,n) copy_doubles(state->v,state_local->v,n);
 +#define scopy_reals(v,n) copy_reals(state->v,state_local->v,n);
 +#define scopy_ints(v,n)   copy_ints(state->v,state_local->v,n);
 +
 +static void copy_state_nonatomdata(t_state *state,t_state *state_local)
 +{
 +    /* When t_state changes, this code should be updated. */
 +    int ngtc,nnhpres;
 +    ngtc = state->ngtc * state->nhchainlength;
 +    nnhpres = state->nnhpres* state->nhchainlength;
 +    scopy_rvecs(box,DIM);
 +    scopy_rvecs(box_rel,DIM);
 +    scopy_rvecs(boxv,DIM);
 +    state_local->veta = state->veta;
 +    state_local->vol0 = state->vol0;
 +    scopy_rvecs(svir_prev,DIM);
 +    scopy_rvecs(fvir_prev,DIM);
 +    scopy_rvecs(pres_prev,DIM);
 +    scopy_doubles(nosehoover_xi,ngtc);
 +    scopy_doubles(nosehoover_vxi,ngtc);
 +    scopy_doubles(nhpres_xi,nnhpres);
 +    scopy_doubles(nhpres_vxi,nnhpres);
 +    scopy_doubles(therm_integral,state->ngtc);
 +    scopy_rvecs(x,state->natoms);
 +    scopy_rvecs(v,state->natoms);
 +    scopy_rvecs(sd_X,state->natoms);
 +    copy_ints(&(state->fep_state),&(state_local->fep_state),1);
 +    scopy_reals(lambda,efptNR);
 +}
 +
 +static void scale_velocities(t_state *state,real fac)
 +{
 +    int i;
 +
 +    if (state->v)
 +    {
 +        for(i=0; i<state->natoms; i++)
 +        {
 +            svmul(fac,state->v[i],state->v[i]);
 +        }
 +    }
 +}
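 +
 +/* Illustrative note (not part of this change; the helper name is made up):
 + * for temperature replica exchange, scale_velocities() above is applied
 + * with fac = sqrt(T_new/T_old) so that the kinetic energy of the adopted
 + * configuration matches its new temperature.
 + */
 +static real remd_velocity_scaling_factor(real temp_new, real temp_old)
 +{
 +    return (real)sqrt(temp_new/temp_old);
 +}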
 +
 +static void pd_collect_state(const t_commrec *cr,t_state *state)
 +{
 +    int shift;
 +  
 +    if (debug)
 +    {
 +        fprintf(debug,"Collecting state before exchange\n");
 +    }
 +    shift = cr->nnodes - cr->npmenodes - 1;
 +    move_rvecs(cr,FALSE,FALSE,GMX_LEFT,GMX_RIGHT,state->x,NULL,shift,NULL);
 +    if (state->v)
 +    {
 +        move_rvecs(cr,FALSE,FALSE,GMX_LEFT,GMX_RIGHT,state->v,NULL,shift,NULL);
 +    }
 +    if (state->sd_X)
 +    {
 +        move_rvecs(cr,FALSE,FALSE,GMX_LEFT,GMX_RIGHT,state->sd_X,NULL,shift,NULL);
 +    }
 +}
 +
 +static void print_transition_matrix(FILE *fplog,const char *leg,int n,int **nmoves, int *nattempt)
 +{
 +    int i,j,ntot;
 +    float Tprint;
 +
 +    ntot = nattempt[0] + nattempt[1];
 +    fprintf(fplog,"\n");
 +    fprintf(fplog,"Repl");
 +    for (i=0;i<n;i++)
 +    {
 +        fprintf(fplog,"    ");  /* put the title closer to the center */
 +    }
 +    fprintf(fplog,"Empirical Transition Matrix\n");
 +
 +    fprintf(fplog,"Repl");
 +    for (i=0;i<n;i++)
 +    {
 +        fprintf(fplog,"%8d",(i+1));
 +    }
 +    fprintf(fplog,"\n");
 +
 +    for (i=0;i<n;i++)
 +    {
 +        fprintf(fplog,"Repl");
 +        for (j=0;j<n;j++)
 +        {
 +            Tprint = 0.0;
 +            if (nmoves[i][j] > 0)
 +            {
 +                Tprint = nmoves[i][j]/(2.0*ntot);
 +            }
 +            fprintf(fplog,"%8.4f",Tprint);
 +        }
 +        fprintf(fplog,"%3d\n",i);
 +    }
 +}
 +
 +static void print_ind(FILE *fplog,const char *leg,int n,int *ind,gmx_bool *bEx)
 +{
 +    int i;
 +
 +    fprintf(fplog,"Repl %2s %2d",leg,ind[0]);
 +    for(i=1; i<n; i++)
 +    {
 +        fprintf(fplog," %c %2d",(bEx!=0 && bEx[i]) ? 'x' : ' ',ind[i]);
 +    }
 +    fprintf(fplog,"\n");
 +}
 +
 +static void print_allswitchind(FILE *fplog,int n,int *ind,int *pind, int *allswaps, int *tmpswap)
 +{
 +    int i;
 +
 +    for (i=0;i<n;i++)
 +    {
 +        tmpswap[i] = allswaps[i];
 +    }
 +    for (i=0;i<n;i++)
 +    {
 +        allswaps[i] = tmpswap[pind[i]];
 +    }
 +
 +    fprintf(fplog,"\nAccepted Exchanges:   ");
 +    for (i=0;i<n;i++)
 +    {
 +        fprintf(fplog,"%d ",pind[i]);
 +    }
 +    fprintf(fplog,"\n");
 +
 +    /* the "Order After Exchange" is the state label corresponding to the configuration that
 +       started in state listed in order, i.e.
 +
 +       3 0 1 2
 +
 +       means that the:
 +       configuration starting in simulation 3 is now in simulation 0,
 +       configuration starting in simulation 0 is now in simulation 1,
 +       configuration starting in simulation 1 is now in simulation 2,
 +       configuration starting in simulation 2 is now in simulation 3
 +     */
 +    fprintf(fplog,"Order After Exchange: ");
 +    for (i=0;i<n;i++)
 +    {
 +        fprintf(fplog,"%d ",allswaps[i]);
 +    }
 +    fprintf(fplog,"\n\n");
 +}
 +
 +static void print_prob(FILE *fplog,const char *leg,int n,real *prob)
 +{
 +    int  i;
 +    char buf[8];
 +  
 +    fprintf(fplog,"Repl %2s ",leg);
 +    for(i=1; i<n; i++)
 +    {
 +        if (prob[i] >= 0)
 +        {
 +            sprintf(buf,"%4.2f",prob[i]);
 +            fprintf(fplog,"  %3s",buf[0]=='1' ? "1.0" : buf+1);
 +        }
 +        else
 +        {
 +            fprintf(fplog,"     ");
 +        }
 +    }
 +    fprintf(fplog,"\n");
 +}
 +
 +static void print_count(FILE *fplog,const char *leg,int n,int *count)
 +{
 +    int i;
 +
 +    fprintf(fplog,"Repl %2s ",leg);
 +    for(i=1; i<n; i++)
 +    {
 +        fprintf(fplog," %4d",count[i]);
 +    }
 +    fprintf(fplog,"\n");
 +}
 +
 +static real calc_delta(FILE *fplog, gmx_bool bPrint, struct gmx_repl_ex *re, int a, int b, int ap, int bp)
 +{
 +
 +    real ediff,dpV,delta=0;
 +    real *Epot = re->Epot;
 +    real *Vol = re->Vol;
 +    real **de = re->de;
 +    real *beta = re->beta;
 +
 +    /* Two cases: permuted and non-permuted.  In all cases, setting ap = a and bp = b
 +       reduces to the non-permuted case */
 +
 +    switch (re->type)
 +    {
 +    case ereTEMP:
 +        /*
 +         * Okabe et al., Chem. Phys. Lett. 335 (2001) 435-439
 +         */
 +        ediff = Epot[b] - Epot[a];
 +        delta = -(beta[bp] - beta[ap])*ediff;
 +        break;
 +    case ereLAMBDA:
 +        /* two cases:  when we are permuted, and not.  */
 +        /* non-permuted:
 +           ediff =  E_new - E_old
 +                 =  [H_b(x_a) + H_a(x_b)] - [H_b(x_b) + H_a(x_a)]
 +                 =  [H_b(x_a) - H_a(x_a)] + [H_a(x_b) - H_b(x_b)]
 +                 =  de[b][a] + de[a][b] */
 +
 +        /* permuted:
 +           ediff =  E_new - E_old
 +                 =  [H_bp(x_a) + H_ap(x_b)] - [H_bp(x_b) + H_ap(x_a)]
 +                 =  [H_bp(x_a) - H_ap(x_a)] + [H_ap(x_b) - H_bp(x_b)]
 +                 =  [H_bp(x_a) - H_a(x_a) + H_a(x_a) - H_ap(x_a)] + [H_ap(x_b) - H_b(x_b) + H_b(x_b) - H_bp(x_b)]
 +                 =  [H_bp(x_a) - H_a(x_a)] - [H_ap(x_a) - H_a(x_a)] + [H_ap(x_b) - H_b(x_b)] - [H_bp(x_b) - H_b(x_b)]
 +                 =  (de[bp][a] - de[ap][a]) + (de[ap][b] - de[bp][b])    */
 +        /* but, in the current code implementation, we flip configurations, not indices . . .
 +           So let's examine that.
 +                 =  [H_b(x_ap) - H_a(x_a)] - [H_a(x_ap) - H_a(x_a)] + [H_a(x_bp) - H_b(x_b)] - [H_b(x_bp) - H_b(x_b)]
 +                 =  [H_b(x_ap) - H_a(x_ap)] + [H_a(x_bp) - H_b(x_bp)]
 +                 =  (de[b][ap] - de[a][ap]) + (de[a][bp] - de[b][bp])
 +                 So, if we exchange b<=> bp and a<=> ap, we return to the same result.
 +                 So the simple solution is to flip the
 +                 position of perturbed and original indices in the tests.
 +        */
 +
 +        ediff = (de[bp][a] - de[ap][a]) + (de[ap][b] - de[bp][b]);
 +        delta = ediff*beta[a]; /* assume all same temperature in this case */
 +        break;
 +    case ereTL:
 +        /* not permuted:  */
 +        /* delta =  reduced E_new - reduced E_old
 +                 =  [beta_b H_b(x_a) + beta_a H_a(x_b)] - [beta_b H_b(x_b) + beta_a H_a(x_a)]
 +                 =  [beta_b H_b(x_a) - beta_a H_a(x_a)] + [beta_a H_a(x_b) - beta_b H_b(x_b)]
 +                 =  [beta_b dH_b(x_a) + beta_b H_a(x_a) - beta_a H_a(x_a)] +
 +                    [beta_a dH_a(x_b) + beta_a H_b(x_b) - beta_b H_b(x_b)]
 +                 =  beta_b dH_b(x_a) + beta_a dH_a(x_b) +
 +                    beta_b (H_a(x_a) - H_b(x_b)) - beta_a (H_a(x_a) - H_b(x_b))
 +                 =  beta_b dH_b(x_a) + beta_a dH_a(x_b) - (beta_b - beta_a)(H_b(x_b) - H_a(x_a)) */
 +        /* delta = beta[b]*de[b][a] + beta[a]*de[a][b] - (beta[b] - beta[a])*(Epot[b] - Epot[a]); */
 +        /* permuted (big breath!) */
 +        /*   delta =  reduced E_new - reduced E_old
 +                 =  [beta_bp H_bp(x_a) + beta_ap H_ap(x_b)] - [beta_bp H_bp(x_b) + beta_ap H_ap(x_a)]
 +                 =  [beta_bp H_bp(x_a) - beta_ap H_ap(x_a)] + [beta_ap H_ap(x_b) - beta_bp H_bp(x_b)]
 +                 =  [beta_bp H_bp(x_a) - beta_ap H_ap(x_a)] + [beta_ap H_ap(x_b) - beta_bp H_bp(x_b)]
 +                    - beta_bp H_a(x_a) + beta_ap H_a(x_a) + beta_bp H_a(x_a) - beta_ap H_a(x_a)
 +                    - beta_ap H_b(x_b) + beta_bp H_b(x_b) + beta_ap H_b(x_b) - beta_bp H_b(x_b)
 +                 =  [(beta_bp H_bp(x_a) - beta_bp H_a(x_a)) - (beta_ap H_ap(x_a) - beta_ap H_a(x_a))] +
 +                    [(beta_ap H_ap(x_b) - beta_ap H_b(x_b)) - (beta_bp H_bp(x_b) - beta_bp H_b(x_b))]
 +                    + beta_bp H_a(x_a) - beta_ap H_a(x_a) + beta_ap H_b(x_b) - beta_bp H_b(x_b)
 +                 =  [beta_bp (H_bp(x_a) - H_a(x_a)) - beta_ap (H_ap(x_a) - H_a(x_a))] +
 +                    [beta_ap (H_ap(x_b) - H_b(x_b)) - beta_bp (H_bp(x_b) - H_b(x_b))]
 +                    + (beta_bp - beta_ap)(H_a(x_a) - H_b(x_b))
 +                 =  (beta_bp de[bp][a] - beta_ap de[ap][a]) + (beta_ap de[ap][b] - beta_bp de[bp][b])
 +                    + (beta_bp - beta_ap)(H_a(x_a) - H_b(x_b))  */
 +        delta = beta[bp]*(de[bp][a] - de[bp][b]) + beta[ap]*(de[ap][b] - de[ap][a]) - (beta[bp]-beta[ap])*(Epot[b]-Epot[a]);
 +        break;
 +    default:
 +        gmx_incons("Unknown replica exchange quantity");
 +    }
 +    if (bPrint)
 +    {
 +        fprintf(fplog,"Repl %d <-> %d  dE_term = %10.3e (kT)\n",a,b,delta);
 +    }
 +    if (re->bNPT)
 +    {
 +        /* revisit this calculation for 5.0; there might be some improvements. */
 +        dpV = (beta[ap]*re->pres[ap]-beta[bp]*re->pres[bp])*(Vol[b]-Vol[a])/PRESFAC;
 +        if (bPrint) 
 +        {
 +            fprintf(fplog,"  dpV = %10.3e  d = %10.3e\nb",dpV,delta + dpV);
 +        }
 +        delta += dpV;
 +    }
 +    return delta;
 +}
 +
 +static void
 +test_for_replica_exchange(FILE *fplog,
 +                          const gmx_multisim_t *ms,
 +                          struct gmx_repl_ex *re,
 +                          gmx_enerdata_t *enerd,
 +                          real vol,
 +                          gmx_large_int_t step,
 +                          real time)
 +{
 +    int  m,i,j,a,b,ap,bp,i0,i1,tmp;
 +    real ediff=0,delta=0,dpV=0;
 +    gmx_bool bPrint,bMultiEx;
 +    gmx_bool *bEx = re->bEx;
 +    real *prob = re->prob;
 +    int *pind = re->destinations; /* permuted index */
 +    gmx_bool bEpot=FALSE;
 +    gmx_bool bDLambda=FALSE;
 +    gmx_bool bVol=FALSE;
 +
 +    bMultiEx = (re->nex > 1);  /* multiple exchange attempts per replica exchange step */
 +    fprintf(fplog,"Replica exchange at step " gmx_large_int_pfmt " time %g\n",step,time);
 +
 +    if (re->bNPT)
 +    {
 +        for (i=0;i<re->nrepl;i++)
 +        {
 +            re->Vol[i] = 0;
 +        }
 +        bVol = TRUE;
 +        re->Vol[re->repl]  = vol;
 +    }
 +    if ((re->type == ereTEMP || re->type == ereTL))
 +    {
 +        for (i=0;i<re->nrepl;i++)
 +        {
 +            re->Epot[i] = 0;
 +        }
 +        bEpot = TRUE;
 +        re->Epot[re->repl] = enerd->term[F_EPOT];
 +        /* temperatures of different states*/
 +        for (i=0;i<re->nrepl;i++)
 +        {
 +            re->beta[i] = 1.0/(re->q[ereTEMP][i]*BOLTZ);
 +        }
 +    }
 +    else
 +    {
 +        for (i=0;i<re->nrepl;i++)
 +        {
 +            re->beta[i] = 1.0/(re->temp*BOLTZ);  /* we have a single temperature */
 +        }
 +    }
 +    if (re->type == ereLAMBDA || re->type == ereTL)
 +    {
 +        bDLambda = TRUE;
 +        /* lambda differences. */
 +        /* de[i][j] is the energy of the jth simulation in the ith Hamiltonian
 +           minus the energy of the jth simulation in the jth Hamiltonian */
 +        for (i=0;i<re->nrepl;i++)
 +        {
 +            for (j=0;j<re->nrepl;j++)
 +            {
 +                re->de[i][j] = 0;
 +            }
 +        }
 +        for (i=0;i<re->nrepl;i++)
 +        {
 +            re->de[i][re->repl] = (enerd->enerpart_lambda[(int)re->q[ereLAMBDA][i]+1]-enerd->enerpart_lambda[0]);
 +        }
 +    }
 +
 +    /* now actually do the communication */
 +    if (bVol)
 +    {
 +        gmx_sum_sim(re->nrepl,re->Vol,ms);
 +    }
 +    if (bEpot)
 +    {
 +        gmx_sum_sim(re->nrepl,re->Epot,ms);
 +    }
 +    if (bDLambda)
 +    {
 +        for (i=0;i<re->nrepl;i++)
 +        {
 +            gmx_sum_sim(re->nrepl,re->de[i],ms);
 +        }
 +    }
 +
 +    /* make a duplicate set of indices for shuffling */
 +    for(i=0;i<re->nrepl;i++)
 +    {
 +        pind[i] = re->ind[i];
 +    }
 +
 +    if (bMultiEx)
 +    {
 +        /* multiple random switch exchange */
 +        for (i=0;i<re->nex;i++)
 +        {
 +            /* randomly select a pair  */
 +            /* in theory, could reduce this by identifying only which switches had a non-negligible
 +               probability of occurring (log p > -100) and only operating on those switches */
 +            /* find out which state it is from, and what label that state currently has. Likely
 +               more work than useful. */
 +            i0 = (int)(re->nrepl*rando(&(re->seed)));
 +            i1 = (int)(re->nrepl*rando(&(re->seed)));
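 +            /* rando() is assumed to return a uniform deviate in [0,1), so i0 and i1 fall in 0..nrepl-1 */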
 +            if (i0==i1)
 +            {
 +                i--;
 +                continue;  /* self-exchange, back up and do it again */
 +            }
 +
 +            a = re->ind[i0]; /* what are the indices of these states? */
 +            b = re->ind[i1];
 +            ap = pind[i0];
 +            bp = pind[i1];
 +
 +            bPrint = FALSE; /* too noisy */
 +            /* calculate the energy difference */
 +            /* if the code changes to flip the STATES, rather than the configurations,
 +               use the commented version of the code */
 +            /* delta = calc_delta(fplog,bPrint,re,a,b,ap,bp); */
 +            delta = calc_delta(fplog,bPrint,re,ap,bp,a,b);
 +
 +            /* we actually only use the first space in the prob and bEx array,
 +               since there are actually many switches between pairs. */
 +
 +            if (delta <= 0)
 +            {
 +                /* accepted */
 +                prob[0] = 1;
 +                bEx[0] = TRUE;
 +            }
 +            else
 +            {
 +                if (delta > PROBABILITYCUTOFF)
 +                {
 +                    prob[0] = 0;
 +                }
 +                else
 +                {
 +                    prob[0] = exp(-delta);
 +                }
 +                /* roll a number to determine if accepted */
 +                bEx[0] = (rando(&(re->seed)) < prob[0]);
 +            }
 +            re->prob_sum[0] += prob[0];
 +
 +            if (bEx[0])
 +            {
 +                /* swap the states */
 +                tmp = pind[i0];
 +                pind[i0] = pind[i1];
 +                pind[i1] = tmp;
 +            }
 +        }
 +        re->nattempt[0]++;  /* keep track of total permutation trials here */
 +        print_allswitchind(fplog,re->nrepl,re->ind,pind,re->allswaps,re->tmpswap);
 +    }
 +    else
 +    {
 +        /* standard nearest neighbor replica exchange */
 +        m = (step / re->nst) % 2;
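 +        /* alternate between attempting the even and the odd neighbor pairs on successive exchange steps */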
 +        for(i=1; i<re->nrepl; i++)
 +        {
 +            a = re->ind[i-1];
 +            b = re->ind[i];
 +            
 +            bPrint = (re->repl==a || re->repl==b);
 +            if (i % 2 == m)
 +            {
 +                delta = calc_delta(fplog,bPrint,re,a,b,a,b);
 +                if (delta <= 0)
 +                {
 +                    /* accepted */
 +                    prob[i] = 1;
 +                    bEx[i] = TRUE;
 +                }
 +                else
 +                {
 +                    if (delta > PROBABILITYCUTOFF)
 +                    {
 +                        prob[i] = 0;
 +                    }
 +                    else
 +                    {
 +                        prob[i] = exp(-delta);
 +                    }
 +                    /* roll a number to determine if accepted */
 +                    bEx[i] = (rando(&(re->seed)) < prob[i]);
 +                }
 +                re->prob_sum[i] += prob[i];
 +
 +                if (bEx[i])
 +                {
 +                    /* swap these two */
 +                    tmp = pind[i-1];
 +                    pind[i-1] = pind[i];
 +                    pind[i] = tmp;
 +                    re->nexchange[i]++;  /* statistics for back compatibility */
 +                }
 +            }
 +            else
 +            {
 +                prob[i] = -1;
 +                bEx[i] = FALSE;
 +            }
 +        }
 +        /* print some statistics */
 +        print_ind(fplog,"ex",re->nrepl,re->ind,bEx);
 +        print_prob(fplog,"pr",re->nrepl,prob);
 +        fprintf(fplog,"\n");
 +        re->nattempt[m]++;
 +    }
 +
 +    /* record which moves were made and accepted */
 +    for (i=0;i<re->nrepl;i++)
 +    {
 +        re->nmoves[re->ind[i]][pind[i]] +=1;
 +        re->nmoves[pind[i]][re->ind[i]] +=1;
 +    }
 +    fflush(fplog); /* make sure we can see what the last exchange was */
 +}
 +
 +static void write_debug_x(t_state *state)
 +{
 +    int i;
 +
 +    if (debug)
 +    {
 +        for(i=0; i<state->natoms; i+=10)
 +        {
 +            fprintf(debug,"dx %5d %10.5f %10.5f %10.5f\n",i,state->x[i][XX],state->x[i][YY],state->x[i][ZZ]);
 +        }
 +    }
 +}
 +
 +static void
 +cyclic_decomposition(FILE *fplog,
 +                     const int *destinations,
 +                     int **cyclic,
 +                     gmx_bool *incycle,
 +                     const int nrepl,
 +                     int *nswap)
 +{
 +
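 +    /* Illustrative example (not part of the original source): destinations
 +       {1,2,0,3} decomposes into the cycles (0 1 2) and (3), stored as
 +       cyclic[0] = {0,1,2,-1} and cyclic[3] = {3,-1}; nswap then becomes 2. */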
 +    int i,j,c,p;
 +    int maxlen = 1;
 +    for (i=0;i<nrepl;i++)
 +    {
 +        incycle[i] = FALSE;
 +    }
 +    for (i=0;i<nrepl;i++)  /* one cycle for each replica */
 +    {
 +        if (incycle[i])
 +        {
 +            cyclic[i][0] = -1;
 +            continue;
 +        }
 +        cyclic[i][0] = i;
 +        incycle[i] = TRUE;
 +        c = 1;
 +        p = i;
 +        for (j=0;j<nrepl;j++)  /* in the worst case the cycle contains all replicas, but we will break out sooner */
 +        {
 +            p = destinations[p]; /* start permuting */
 +            if (p==i)
 +            {
 +                cyclic[i][c] = -1;
 +                if (c > maxlen)
 +                {
 +                    maxlen = c;
 +                }
 +                break; /* we've reached the original element, the cycle is complete, and we marked the end. */
 +            }
 +            else
 +            {
 +                cyclic[i][c] = p;  /* each permutation gives a new member of the cycle */
 +                incycle[p] = TRUE;
 +                c++;
 +            }
 +        }
 +    }
 +    *nswap = maxlen - 1;
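 +    /* a cycle of length c can be realized with c-1 pairwise swaps, so the longest cycle sets the number of swap rounds */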
 +
 +    if (debug)
 +    {
 +        for (i=0;i<nrepl;i++)
 +        {
 +            fprintf(debug,"Cycle %d:",i);
 +            for (j=0;j<nrepl;j++)
 +            {
 +                if (cyclic[i][j] < 0)
 +                {
 +                    break;
 +                }
 +                fprintf(debug,"%2d",cyclic[i][j]);
 +            }
 +            fprintf(debug,"\n");
 +        }
 +        fflush(debug);
 +    }
 +}
 +
 +static void
 +compute_exchange_order(FILE *fplog,
 +                       int **cyclic,
 +                       int **order,
 +                       const int nrepl,
 +                       const int maxswap)
 +{
 +    int i,j;
 +
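 +    /* After this routine, order[r][j] holds the partner that replica r
 +       exchanges with in swap round j, or r itself if it sits that round out. */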
 +    for (j=0;j<maxswap;j++)
 +    {
 +        for (i=0;i<nrepl;i++)
 +        {
 +            if (cyclic[i][j+1] >= 0)
 +            {
 +                order[cyclic[i][j+1]][j] = cyclic[i][j];
 +                order[cyclic[i][j]][j] = cyclic[i][j+1];
 +            }
 +        }
 +        for (i=0;i<nrepl;i++)
 +        {
 +            if (order[i][j] < 0)
 +            {
 +                order[i][j] = i; /* if it's not exchanging, it should stay put this round */
 +            }
 +        }
 +    }
 +
 +    if (debug)
 +    {
 +        fprintf(debug,"Replica Exchange Order\n");
 +        for (i=0;i<nrepl;i++)
 +        {
 +            fprintf(debug,"Replica %d:",i);
 +            for (j=0;j<maxswap;j++)
 +            {
 +                if (order[i][j] < 0)
 +                {
 +                    break;
 +                }
 +                fprintf(debug,"%2d",order[i][j]);
 +            }
 +            fprintf(debug,"\n");
 +        }
 +        fflush(debug);
 +    }
 +}
 +
 +static void
 +prepare_to_do_exchange(FILE *fplog,
 +                       const int *destinations,
 +                       const int replica_id,
 +                       const int nrepl,
 +                       int *maxswap,
 +                       int **order,
 +                       int **cyclic,
 +                       int *incycle,
 +                       gmx_bool *bThisReplicaExchanged)
 +{
 +    int i,j;
 +    /* Hold the cyclic decomposition of the (multiple) replica
 +     * exchange. */
 +    gmx_bool bAnyReplicaExchanged = FALSE;
 +    *bThisReplicaExchanged = FALSE;
 +
 +    for (i = 0; i < nrepl; i++)
 +    {
 +        if (destinations[i] != i) {
 +            /* only mark as exchanged if the index has been shuffled */
 +            bAnyReplicaExchanged = TRUE;
 +            break;
 +        }
 +    }
 +    if (bAnyReplicaExchanged)
 +    {
 +        /* reinitialize the placeholder arrays */
 +        for (i = 0; i < nrepl; i++)
 +        {
 +            for (j = 0; j < nrepl; j++)
 +            {
 +                cyclic[i][j] = -1;
 +                order[i][j] = -1;
 +            }
 +        }
 +
 +        /* Identify the cyclic decomposition of the permutation (very
 +         * fast if neighbor replica exchange). */
 +        cyclic_decomposition(fplog,destinations,cyclic,incycle,nrepl,maxswap);
 +
 +        /* Now translate the decomposition into a replica exchange
 +         * order at each step. */
 +        compute_exchange_order(fplog,cyclic,order,nrepl,*maxswap);
 +
 +        /* Did this replica do any exchange at any point? */
 +        for (j = 0; j < *maxswap; j++)
 +        {
 +            if (replica_id != order[replica_id][j])
 +            {
 +                *bThisReplicaExchanged = TRUE;
 +                break;
 +            }
 +        }
 +    }
 +}
 +
 +gmx_bool replica_exchange(FILE *fplog,const t_commrec *cr,struct gmx_repl_ex *re,
 +                          t_state *state,gmx_enerdata_t *enerd,
 +                          t_state *state_local,gmx_large_int_t step,real time)
 +{
 +    int i,j;
 +    int replica_id = 0;
 +    int exchange_partner;
 +    int maxswap = 0;
 +    /* maxswap: the number of swap rounds needed to realize any multiple exchanges;
 +       re->destinations: where each replica ends up after the exchange attempt(s);
 +       re->order: the order in which the multiple exchanges will occur. */
 +    gmx_bool bThisReplicaExchanged = FALSE;
 +
 +    if (MASTER(cr))
 +    {
 +        replica_id  = re->repl;
 +        test_for_replica_exchange(fplog,cr->ms,re,enerd,det(state_local->box),step,time);
 +        prepare_to_do_exchange(fplog,re->destinations,replica_id,re->nrepl,&maxswap,
 +                               re->order,re->cyclic,re->incycle,&bThisReplicaExchanged);
 +    }
 +    /* Do intra-simulation broadcast so all processors belonging to
 +     * each simulation know whether they need to participate in
 +     * collecting the state. Otherwise, they might as well get on with
 +     * the next thing to do. */
 +    if (PAR(cr))
 +    {
 +#ifdef GMX_MPI
 +        MPI_Bcast(&bThisReplicaExchanged,sizeof(gmx_bool),MPI_BYTE,MASTERRANK(cr),
 +                  cr->mpi_comm_mygroup);
 +#endif
 +    }
 +
 +    if (bThisReplicaExchanged)
 +    {
 +        /* Exchange the states */
 +
 +        if (PAR(cr))
 +        {
 +            /* Collect the global state on the master node */
 +            if (DOMAINDECOMP(cr))
 +            {
 +                dd_collect_state(cr->dd,state_local,state);
 +            }
 +            else
 +            {
 +                pd_collect_state(cr,state);
 +            }
 +        }
 +        
 +        if (MASTER(cr))
 +        {
 +            /* There will be only one swap cycle with standard replica
 +             * exchange, but there may be multiple swap cycles if we
 +             * allow multiple swaps. */
 +
 +            for (j = 0; j < maxswap; j++)
 +            {
 +                exchange_partner = re->order[replica_id][j];
 +
 +                if (exchange_partner != replica_id)
 +                {
 +                    /* Exchange the global states between the master nodes */
 +                    if (debug)
 +                    {
 +                        fprintf(debug,"Exchanging %d with %d\n",replica_id,exchange_partner);
 +                    }
 +                    exchange_state(cr->ms,exchange_partner,state);
 +                }
 +            }
 +            /* For temperature-type replica exchange, we need to scale
 +             * the velocities. */
 +            if (re->type == ereTEMP || re->type == ereTL)
 +            {
 +                scale_velocities(state,sqrt(re->q[ereTEMP][replica_id]/re->q[ereTEMP][re->destinations[replica_id]]));
 +            }
 +
 +        }
 +
 +        /* With domain decomposition the global state is distributed later */
 +        if (!DOMAINDECOMP(cr))
 +        {
 +            /* Copy the global state to the local state data structure */
 +            copy_state_nonatomdata(state,state_local);
 +            
 +            if (PAR(cr))
 +            {
 +                bcast_state(cr,state,FALSE);
 +            }
 +        }
 +    }
 +
 +    return bThisReplicaExchanged;
 +}
 +
 +void print_replica_exchange_statistics(FILE *fplog,struct gmx_repl_ex *re)
 +{
 +    int  i;
 +
 +    fprintf(fplog,"\nReplica exchange statistics\n");
 +
 +    if (re->nex == 0)
 +    {
 +        fprintf(fplog,"Repl  %d attempts, %d odd, %d even\n",
 +                re->nattempt[0]+re->nattempt[1],re->nattempt[1],re->nattempt[0]);
 +
 +        fprintf(fplog,"Repl  average probabilities:\n");
 +        for(i=1; i<re->nrepl; i++)
 +        {
 +            if (re->nattempt[i%2] == 0)
 +            {
 +                re->prob[i] = 0;
 +            }
 +            else
 +            {
 +                re->prob[i] =  re->prob_sum[i]/re->nattempt[i%2];
 +            }
 +        }
 +        print_ind(fplog,"",re->nrepl,re->ind,NULL);
 +        print_prob(fplog,"",re->nrepl,re->prob);
 +
 +        fprintf(fplog,"Repl  number of exchanges:\n");
 +        print_ind(fplog,"",re->nrepl,re->ind,NULL);
 +        print_count(fplog,"",re->nrepl,re->nexchange);
 +
 +        fprintf(fplog,"Repl  average number of exchanges:\n");
 +        for(i=1; i<re->nrepl; i++) 
 +        {
 +            if (re->nattempt[i%2] == 0)
 +            {
 +                re->prob[i] = 0;
 +            }
 +            else
 +            {
 +                re->prob[i] =  ((real)re->nexchange[i])/re->nattempt[i%2];
 +            }
 +        }
 +        print_ind(fplog,"",re->nrepl,re->ind,NULL);
 +        print_prob(fplog,"",re->nrepl,re->prob);
 +
 +        fprintf(fplog,"\n");
 +    }
 +    /* print the transition matrix */
 +    print_transition_matrix(fplog,"",re->nrepl,re->nmoves,re->nattempt);
 +}
index 86db8f722534ce8ec808b8ff59db8c6dde2a3f28,0000000000000000000000000000000000000000..40b48cd6914f1679bfd31959a783fa7cce749660
mode 100644,000000..100644
--- /dev/null
@@@ -1,2002 -1,0 +1,1990 @@@
- #ifdef GMX_OPENMM
- #include "md_openmm.h"
- #endif
 +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
 + *
 + * 
 + *                This source code is part of
 + * 
 + *                 G   R   O   M   A   C   S
 + * 
 + *          GROningen MAchine for Chemical Simulations
 + * 
 + *                        VERSION 3.2.0
 + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
 + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 + * Copyright (c) 2001-2004, The GROMACS development team,
 + * check out http://www.gromacs.org for more information.
 +
 + * This program is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU General Public License
 + * as published by the Free Software Foundation; either version 2
 + * of the License, or (at your option) any later version.
 + * 
 + * If you want to redistribute modifications, please consider that
 + * scientific software is very special. Version control is crucial -
 + * bugs must be traceable. We will be happy to consider code for
 + * inclusion in the official distribution, but derived work must not
 + * be called official GROMACS. Details are found in the README & COPYING
 + * files - if they are missing, get the official version at www.gromacs.org.
 + * 
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the papers on the package - you can find them in the top README file.
 + * 
 + * For more info, check our website at http://www.gromacs.org
 + * 
 + * And Hey:
 + * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon
 + */
 +#ifdef HAVE_CONFIG_H
 +#include <config.h>
 +#endif
 +#if defined(HAVE_SCHED_H) && defined(HAVE_SCHED_GETAFFINITY)
 +#define _GNU_SOURCE
 +#include <sched.h>
 +#include <sys/syscall.h>
 +#endif
 +#include <signal.h>
 +#include <stdlib.h>
 +#ifdef HAVE_UNISTD_H
 +#include <unistd.h>
 +#endif
 +#include <string.h>
 +#include <assert.h>
 +
 +#include "typedefs.h"
 +#include "smalloc.h"
 +#include "sysstuff.h"
 +#include "statutil.h"
 +#include "mdrun.h"
 +#include "md_logging.h"
 +#include "md_support.h"
 +#include "network.h"
 +#include "pull.h"
 +#include "pull_rotation.h"
 +#include "names.h"
 +#include "disre.h"
 +#include "orires.h"
 +#include "pme.h"
 +#include "mdatoms.h"
 +#include "repl_ex.h"
 +#include "qmmm.h"
 +#include "domdec.h"
 +#include "partdec.h"
 +#include "coulomb.h"
 +#include "constr.h"
 +#include "mvdata.h"
 +#include "checkpoint.h"
 +#include "mtop_util.h"
 +#include "sighandler.h"
 +#include "tpxio.h"
 +#include "txtdump.h"
 +#include "gmx_detect_hardware.h"
 +#include "gmx_omp_nthreads.h"
 +#include "pull_rotation.h"
 +#include "calc_verletbuf.h"
 +#include "../mdlib/nbnxn_search.h"
 +#include "../mdlib/nbnxn_consts.h"
 +#include "gmx_fatal_collective.h"
 +#include "membed.h"
 +#include "macros.h"
 +#include "gmx_omp.h"
 +
 +#include "thread_mpi/threads.h"
 +
 +#ifdef GMX_LIB_MPI
 +#include <mpi.h>
 +#endif
 +#ifdef GMX_THREAD_MPI
 +#include "tmpi.h"
 +#endif
 +
 +#ifdef GMX_FAHCORE
 +#include "corewrap.h"
 +#endif
 +
- #ifdef GMX_OPENMM  /* FIXME do_md_openmm needs fixing */
- const gmx_intp_t integrator[eiNR] = { {do_md_openmm}, {do_md_openmm}, {do_md_openmm}, {do_md_openmm}, {do_md_openmm}, {do_md_openmm}, {do_md_openmm}, {do_md_openmm}, {do_md_openmm}, {do_md_openmm}, {do_md_openmm},{do_md_openmm}};
- #else
 +#include "gpu_utils.h"
 +#include "nbnxn_cuda_data_mgmt.h"
 +
 +typedef struct { 
 +    gmx_integrator_t *func;
 +} gmx_intp_t;
 +
 +/* The array should match the eI array in include/types/enums.h */
- #endif
 +const gmx_intp_t integrator[eiNR] = { {do_md}, {do_steep}, {do_cg}, {do_md}, {do_md}, {do_nm}, {do_lbfgs}, {do_tpi}, {do_tpi}, {do_md}, {do_md},{do_md}};
-                                     int cutoff_scheme)
 +
 +gmx_large_int_t     deform_init_init_step_tpx;
 +matrix              deform_init_box_tpx;
 +#ifdef GMX_THREAD_MPI
 +tMPI_Thread_mutex_t deform_init_box_mutex=TMPI_THREAD_MUTEX_INITIALIZER;
 +#endif
 +
 +
 +#ifdef GMX_THREAD_MPI
 +struct mdrunner_arglist
 +{
 +    gmx_hw_opt_t *hw_opt;
 +    FILE *fplog;
 +    t_commrec *cr;
 +    int nfile;
 +    const t_filenm *fnm;
 +    output_env_t oenv;
 +    gmx_bool bVerbose;
 +    gmx_bool bCompact;
 +    int nstglobalcomm;
 +    ivec ddxyz;
 +    int dd_node_order;
 +    real rdd;
 +    real rconstr;
 +    const char *dddlb_opt;
 +    real dlb_scale;
 +    const char *ddcsx;
 +    const char *ddcsy;
 +    const char *ddcsz;
 +    const char *nbpu_opt;
 +    int nsteps_cmdline;
 +    int nstepout;
 +    int resetstep;
 +    int nmultisim;
 +    int repl_ex_nst;
 +    int repl_ex_nex;
 +    int repl_ex_seed;
 +    real pforce;
 +    real cpt_period;
 +    real max_hours;
 +    const char *deviceOptions;
 +    unsigned long Flags;
 +    int ret; /* return value */
 +};
 +
 +
 +/* The function used for spawning threads. Extracts the mdrunner() 
 +   arguments from its one argument and calls mdrunner(), after making
 +   a commrec. */
 +static void mdrunner_start_fn(void *arg)
 +{
 +    struct mdrunner_arglist *mda=(struct mdrunner_arglist*)arg;
 +    struct mdrunner_arglist mc=*mda; /* copy the arg list to make sure 
 +                                        that it's thread-local. This doesn't
 +                                        copy pointed-to items, of course,
 +                                        but those are all const. */
 +    t_commrec *cr;  /* we need a local version of this */
 +    FILE *fplog=NULL;
 +    t_filenm *fnm;
 +
 +    fnm = dup_tfn(mc.nfile, mc.fnm);
 +
 +    cr = init_par_threads(mc.cr);
 +
 +    if (MASTER(cr))
 +    {
 +        fplog=mc.fplog;
 +    }
 +
 +    mda->ret=mdrunner(mc.hw_opt, fplog, cr, mc.nfile, fnm, mc.oenv, 
 +                      mc.bVerbose, mc.bCompact, mc.nstglobalcomm, 
 +                      mc.ddxyz, mc.dd_node_order, mc.rdd,
 +                      mc.rconstr, mc.dddlb_opt, mc.dlb_scale, 
 +                      mc.ddcsx, mc.ddcsy, mc.ddcsz,
 +                      mc.nbpu_opt,
 +                      mc.nsteps_cmdline, mc.nstepout, mc.resetstep,
 +                      mc.nmultisim, mc.repl_ex_nst, mc.repl_ex_nex, mc.repl_ex_seed, mc.pforce, 
 +                      mc.cpt_period, mc.max_hours, mc.deviceOptions, mc.Flags);
 +}
 +
 +/* called by mdrunner() to start a specific number of threads (including 
 +   the main thread) for thread-parallel runs. This in turn calls mdrunner()
 +   for each thread. 
 +   All options besides nthreads are the same as for mdrunner(). */
 +static t_commrec *mdrunner_start_threads(gmx_hw_opt_t *hw_opt, 
 +              FILE *fplog,t_commrec *cr,int nfile, 
 +              const t_filenm fnm[], const output_env_t oenv, gmx_bool bVerbose,
 +              gmx_bool bCompact, int nstglobalcomm,
 +              ivec ddxyz,int dd_node_order,real rdd,real rconstr,
 +              const char *dddlb_opt,real dlb_scale,
 +              const char *ddcsx,const char *ddcsy,const char *ddcsz,
 +              const char *nbpu_opt,
 +              int nsteps_cmdline, int nstepout,int resetstep,
 +              int nmultisim,int repl_ex_nst,int repl_ex_nex, int repl_ex_seed,
 +              real pforce,real cpt_period, real max_hours, 
 +              const char *deviceOptions, unsigned long Flags)
 +{
 +    int ret;
 +    struct mdrunner_arglist *mda;
 +    t_commrec *crn; /* the new commrec */
 +    t_filenm *fnmn;
 +
 +    /* first check whether we even need to start tMPI */
 +    if (hw_opt->nthreads_tmpi < 2)
 +    {
 +        return cr;
 +    }
 +
 +    /* a few small, one-time, almost unavoidable memory leaks: */
 +    snew(mda,1);
 +    fnmn=dup_tfn(nfile, fnm);
 +
 +    /* fill the data structure to pass as void pointer to thread start fn */
 +    mda->hw_opt=hw_opt;
 +    mda->fplog=fplog;
 +    mda->cr=cr;
 +    mda->nfile=nfile;
 +    mda->fnm=fnmn;
 +    mda->oenv=oenv;
 +    mda->bVerbose=bVerbose;
 +    mda->bCompact=bCompact;
 +    mda->nstglobalcomm=nstglobalcomm;
 +    mda->ddxyz[XX]=ddxyz[XX];
 +    mda->ddxyz[YY]=ddxyz[YY];
 +    mda->ddxyz[ZZ]=ddxyz[ZZ];
 +    mda->dd_node_order=dd_node_order;
 +    mda->rdd=rdd;
 +    mda->rconstr=rconstr;
 +    mda->dddlb_opt=dddlb_opt;
 +    mda->dlb_scale=dlb_scale;
 +    mda->ddcsx=ddcsx;
 +    mda->ddcsy=ddcsy;
 +    mda->ddcsz=ddcsz;
 +    mda->nbpu_opt=nbpu_opt;
 +    mda->nsteps_cmdline=nsteps_cmdline;
 +    mda->nstepout=nstepout;
 +    mda->resetstep=resetstep;
 +    mda->nmultisim=nmultisim;
 +    mda->repl_ex_nst=repl_ex_nst;
 +    mda->repl_ex_nex=repl_ex_nex;
 +    mda->repl_ex_seed=repl_ex_seed;
 +    mda->pforce=pforce;
 +    mda->cpt_period=cpt_period;
 +    mda->max_hours=max_hours;
 +    mda->deviceOptions=deviceOptions;
 +    mda->Flags=Flags;
 +
 +    /* now spawn new threads that start mdrunner_start_fn(), while 
 +       the main thread returns */
 +    ret=tMPI_Init_fn(TRUE, hw_opt->nthreads_tmpi,
 +                     (hw_opt->bThreadPinning ? TMPI_AFFINITY_ALL_CORES : TMPI_AFFINITY_NONE),
 +                     mdrunner_start_fn, (void*)(mda) );
 +    if (ret!=TMPI_SUCCESS)
 +    {
 +        return NULL;
 +    }
 +
 +    /* make a new comm_rec to reflect the new situation */
 +    crn=init_par_threads(cr);
 +    return crn;
 +}
 +
 +
 +static int get_tmpi_omp_thread_division(const gmx_hw_info_t *hwinfo,
 +                                        const gmx_hw_opt_t *hw_opt,
 +                                        int nthreads_tot,
 +                                        int ngpu)
 +{
 +    int nthreads_tmpi;
 +
 +    /* There are no separate PME nodes here, as we ensured in
 +     * check_and_update_hw_opt that nthreads_tmpi>0 with PME nodes
 +     * and a conditional ensures we would not have ended up here.
 +     * Note that separate PME nodes might be switched on later.
 +     */
 +    if (ngpu > 0)
 +    {
 +        nthreads_tmpi = ngpu;
 +        if (nthreads_tot > 0 && nthreads_tot < nthreads_tmpi)
 +        {
 +            nthreads_tmpi = nthreads_tot;
 +        }
 +    }
 +    else if (hw_opt->nthreads_omp > 0)
 +    {
 +        /* Here we could oversubscribe; if we do, we issue a warning later */
 +        nthreads_tmpi = max(1,nthreads_tot/hw_opt->nthreads_omp);
 +    }
 +    else
 +    {
 +        /* TODO choose nthreads_omp based on hardware topology
 +           when we have a hardware topology detection library */
 +        /* In general, when running up to 4 threads, OpenMP should be faster.
 +         * Note: on AMD Bulldozer we should avoid running OpenMP over two dies.
 +         * On Intel>=Nehalem running OpenMP on a single CPU is always faster,
 +         * even on two CPUs it's usually faster (but with many OpenMP threads
 +         * it could be faster not to use HT, currently we always use HT).
 +         * On Nehalem/Westmere we want to avoid running 16 threads over
 +         * two CPUs with HT, so we need a limit<16; thus we use 12.
 +         * A reasonable limit for Intel Sandy and Ivy bridge,
 +         * not knowing the topology, is 16 threads.
 +         */
 +        const int nthreads_omp_always_faster             =  4;
 +        const int nthreads_omp_always_faster_Nehalem     = 12;
 +        const int nthreads_omp_always_faster_SandyBridge = 16;
 +        const int first_model_Nehalem     = 0x1A;
 +        const int first_model_SandyBridge = 0x2A;
 +        gmx_bool bIntel_Family6;
 +
 +        bIntel_Family6 =
 +            (gmx_cpuid_vendor(hwinfo->cpuid_info) == GMX_CPUID_VENDOR_INTEL &&
 +             gmx_cpuid_family(hwinfo->cpuid_info) == 6);
 +
 +        if (nthreads_tot <= nthreads_omp_always_faster ||
 +            (bIntel_Family6 &&
 +             ((gmx_cpuid_model(hwinfo->cpuid_info) >= nthreads_omp_always_faster_Nehalem && nthreads_tot <= nthreads_omp_always_faster_Nehalem) ||
 +              (gmx_cpuid_model(hwinfo->cpuid_info) >= nthreads_omp_always_faster_SandyBridge && nthreads_tot <= nthreads_omp_always_faster_SandyBridge))))
 +        {
 +            /* Use pure OpenMP parallelization */
 +            nthreads_tmpi = 1;
 +        }
 +        else
 +        {
 +            /* Don't use OpenMP parallelization */
 +            nthreads_tmpi = nthreads_tot;
 +        }
 +    }
 +
 +    return nthreads_tmpi;
 +}
 +
 +
 +/* Get the number of threads to use for thread-MPI based on how many
 + * were requested, which algorithms we're using,
 + * and how many particles there are.
 + * At the point we have already called check_and_update_hw_opt.
 + * Thus all options should be internally consistent and consistent
 + * with the hardware, except that ntmpi could be larger than #GPU.
 + */
 +static int get_nthreads_mpi(gmx_hw_info_t *hwinfo,
 +                            gmx_hw_opt_t *hw_opt,
 +                            t_inputrec *inputrec, gmx_mtop_t *mtop,
 +                            const t_commrec *cr,
 +                            FILE *fplog)
 +{
 +    int nthreads_hw,nthreads_tot_max,nthreads_tmpi,nthreads_new,ngpu;
 +    int min_atoms_per_mpi_thread;
 +    char *env;
 +    char sbuf[STRLEN];
 +    gmx_bool bCanUseGPU;
 +
 +    if (hw_opt->nthreads_tmpi > 0)
 +    {
 +        /* Trivial, return right away */
 +        return hw_opt->nthreads_tmpi;
 +    }
 +
 +    nthreads_hw = hwinfo->nthreads_hw_avail;
 +
 +    /* How many total (#tMPI*#OpenMP) threads can we start? */ 
 +    if (hw_opt->nthreads_tot > 0)
 +    {
 +        nthreads_tot_max = hw_opt->nthreads_tot;
 +    }
 +    else
 +    {
 +        nthreads_tot_max = nthreads_hw;
 +    }
 +
 +    bCanUseGPU = (inputrec->cutoff_scheme == ecutsVERLET && hwinfo->bCanUseGPU);
 +    if (bCanUseGPU)
 +    {
 +        ngpu = hwinfo->gpu_info.ncuda_dev_use;
 +    }
 +    else
 +    {
 +        ngpu = 0;
 +    }
 +
 +    nthreads_tmpi =
 +        get_tmpi_omp_thread_division(hwinfo,hw_opt,nthreads_tot_max,ngpu);
 +
 +    if (inputrec->eI == eiNM || EI_TPI(inputrec->eI))
 +    {
 +        /* Steps are divided over the nodes instead of splitting the atoms */
 +        min_atoms_per_mpi_thread = 0;
 +    }
 +    else
 +    {
 +        if (bCanUseGPU)
 +        {
 +            min_atoms_per_mpi_thread = MIN_ATOMS_PER_GPU;
 +        }
 +        else
 +        {
 +            min_atoms_per_mpi_thread = MIN_ATOMS_PER_MPI_THREAD;
 +        }
 +    }
 +
 +    /* Check if an algorithm does not support parallel simulation.  */
 +    if (nthreads_tmpi != 1 &&
 +        ( inputrec->eI == eiLBFGS ||
 +          inputrec->coulombtype == eelEWALD ) )
 +    {
 +        nthreads_tmpi = 1;
 +
 +        md_print_warn(cr,fplog,"The integration or electrostatics algorithm doesn't support parallel runs. Using a single thread-MPI thread.\n");
 +        if (hw_opt->nthreads_tmpi > nthreads_tmpi)
 +        {
 +            gmx_fatal(FARGS,"You asked for more than 1 thread-MPI thread, but an algorithm doesn't support that");
 +        }
 +    }
 +    else if (mtop->natoms/nthreads_tmpi < min_atoms_per_mpi_thread)
 +    {
 +        /* the thread number was chosen automatically, but there are too many
 +           threads (too few atoms per thread) */
 +        nthreads_new = max(1,mtop->natoms/min_atoms_per_mpi_thread);
 +
 +        /* Avoid partial use of Hyper-Threading */
 +        if (gmx_cpuid_x86_smt(hwinfo->cpuid_info) == GMX_CPUID_X86_SMT_ENABLED &&
 +            nthreads_new > nthreads_hw/2 && nthreads_new < nthreads_hw)
 +        {
 +            nthreads_new = nthreads_hw/2;
 +        }
 +
 +        /* Avoid large prime numbers in the thread count */
 +        if (nthreads_new >= 6)
 +        {
 +            /* Use only 6,8,10 with additional factors of 2 */
 +            int fac;
 +
 +            fac = 2;
 +            while (3*fac*2 <= nthreads_new)
 +            {
 +                fac *= 2;
 +            }
 +
 +            nthreads_new = (nthreads_new/fac)*fac;
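 +            /* e.g. 11 threads -> fac=2 -> 10 threads; 23 threads -> fac=4 -> 20 threads */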
 +        }
 +        else
 +        {
 +            /* Avoid 5 */
 +            if (nthreads_new == 5)
 +            {
 +                nthreads_new = 4;
 +            }
 +        }
 +
 +        nthreads_tmpi = nthreads_new;
 +
 +        fprintf(stderr,"\n");
 +        fprintf(stderr,"NOTE: Parallelization is limited by the small number of atoms,\n");
 +        fprintf(stderr,"      only starting %d thread-MPI threads.\n",nthreads_tmpi);
 +        fprintf(stderr,"      You can use the -nt and/or -ntmpi option to optimize the number of threads.\n\n");
 +    }
 +
 +    return nthreads_tmpi;
 +}
 +#endif /* GMX_THREAD_MPI */
 +
 +
 +/* Environment variable for setting nstlist */
 +static const char*  NSTLIST_ENVVAR          =  "GMX_NSTLIST";
 +/* Try to increase nstlist when using a GPU with nstlist less than this */
 +static const int    NSTLIST_GPU_ENOUGH      = 20;
 +/* Increase nstlist until the non-bonded cost increases more than this factor */
 +static const float  NBNXN_GPU_LIST_OK_FAC   = 1.25;
 +/* Don't increase nstlist beyond a non-bonded cost increases of this factor */
 +static const float  NBNXN_GPU_LIST_MAX_FAC  = 1.40;
 +
 +/* Try to increase nstlist when running on a GPU */
 +static void increase_nstlist(FILE *fp,t_commrec *cr,
 +                             t_inputrec *ir,const gmx_mtop_t *mtop,matrix box)
 +{
 +    char *env;
 +    int  nstlist_orig,nstlist_prev;
 +    verletbuf_list_setup_t ls;
 +    real rlist_inc,rlist_ok,rlist_max,rlist_new,rlist_prev;
 +    int  i;
 +    t_state state_tmp;
 +    gmx_bool bBox,bDD,bCont;
 +    const char *nstl_fmt="\nFor optimal performance with a GPU nstlist (now %d) should be larger.\nThe optimum depends on your CPU and GPU resources.\nYou might want to try several nstlist values.\n";
 +    const char *vbd_err="Can not increase nstlist for GPU run because verlet-buffer-drift is not set or used";
 +    const char *box_err="Can not increase nstlist for GPU run because the box is too small";
 +    const char *dd_err ="Can not increase nstlist for GPU run because of domain decomposition limitations";
 +    char buf[STRLEN];
 +
 +    /* Alternative nstlist values to try when increasing nstlist */
 +    const int nstl[]={ 20, 25, 40, 50 };
 +#define NNSTL  sizeof(nstl)/sizeof(nstl[0])
 +
 +    env = getenv(NSTLIST_ENVVAR);
 +    if (env == NULL)
 +    {
 +        if (fp != NULL)
 +        {
 +            fprintf(fp,nstl_fmt,ir->nstlist);
 +        }
 +    }
 +
 +    if (ir->verletbuf_drift == 0)
 +    {
 +        gmx_fatal(FARGS,"You are using an old tpr file with a GPU, please generate a new tpr file with an up to date version of grompp");
 +    }
 +
 +    if (ir->verletbuf_drift < 0)
 +    {
 +        if (MASTER(cr))
 +        {
 +            fprintf(stderr,"%s\n",vbd_err);
 +        }
 +        if (fp != NULL)
 +        {
 +            fprintf(fp,"%s\n",vbd_err);
 +        }
 +
 +        return;
 +    }
 +
 +    nstlist_orig = ir->nstlist;
 +    if (env != NULL)
 +    {
 +        sprintf(buf,"Getting nstlist from environment variable GMX_NSTLIST=%s",env);
 +        if (MASTER(cr))
 +        {
 +            fprintf(stderr,"%s\n",buf);
 +        }
 +        if (fp != NULL)
 +        {
 +            fprintf(fp,"%s\n",buf);
 +        }
 +        sscanf(env,"%d",&ir->nstlist);
 +    }
 +
 +    verletbuf_get_list_setup(TRUE,&ls);
 +
 +    /* Allow rlist to make the list double the size of the cut-off sphere */
 +    rlist_inc = nbnxn_get_rlist_effective_inc(NBNXN_GPU_CLUSTER_SIZE,mtop->natoms/det(box));
 +    rlist_ok  = (max(ir->rvdw,ir->rcoulomb) + rlist_inc)*pow(NBNXN_GPU_LIST_OK_FAC,1.0/3.0) - rlist_inc;
 +    rlist_max = (max(ir->rvdw,ir->rcoulomb) + rlist_inc)*pow(NBNXN_GPU_LIST_MAX_FAC,1.0/3.0) - rlist_inc;
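 +    /* the 1/3 powers map the allowed pair-list cost (volume) growth factors onto list radii */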
 +    if (debug)
 +    {
 +        fprintf(debug,"GPU nstlist tuning: rlist_inc %.3f rlist_max %.3f\n",
 +                rlist_inc,rlist_max);
 +    }
 +
 +    i = 0;
 +    nstlist_prev = nstlist_orig;
 +    rlist_prev   = ir->rlist;
 +    do
 +    {
 +        if (env == NULL)
 +        {
 +            ir->nstlist = nstl[i];
 +        }
 +
 +        /* Set the pair-list buffer size in ir */
 +        calc_verlet_buffer_size(mtop,det(box),ir,ir->verletbuf_drift,&ls,
 +                                NULL,&rlist_new);
 +
 +        /* Does rlist fit in the box? */
 +        bBox = (sqr(rlist_new) < max_cutoff2(ir->ePBC,box));
 +        bDD  = TRUE;
 +        if (bBox && DOMAINDECOMP(cr))
 +        {
 +            /* Check if rlist fits in the domain decomposition */
 +            if (inputrec2nboundeddim(ir) < DIM)
 +            {
 +                gmx_incons("Changing nstlist with domain decomposition and unbounded dimensions is not implemented yet");
 +            }
 +            copy_mat(box,state_tmp.box);
 +            bDD = change_dd_cutoff(cr,&state_tmp,ir,rlist_new);
 +        }
 +
 +        bCont = FALSE;
 +
 +        if (env == NULL)
 +        {
 +            if (bBox && bDD && rlist_new <= rlist_max)
 +            {
 +                /* Increase nstlist */
 +                nstlist_prev = ir->nstlist;
 +                rlist_prev   = rlist_new;
 +                bCont = (i+1 < NNSTL && rlist_new < rlist_ok);
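 +                /* keep trying the next, larger nstlist while values remain and the list radius stays below rlist_ok */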
 +            }
 +            else
 +            {
 +                /* Stick with the previous nstlist */
 +                ir->nstlist = nstlist_prev;
 +                rlist_new   = rlist_prev;
 +                bBox = TRUE;
 +                bDD  = TRUE;
 +            }
 +        }
 +
 +        i++;
 +    }
 +    while (bCont);
 +
 +    if (!bBox || !bDD)
 +    {
 +        gmx_warning(!bBox ? box_err : dd_err);
 +        if (fp != NULL)
 +        {
 +            fprintf(fp,"\n%s\n",bBox ? box_err : dd_err);
 +        }
 +        ir->nstlist = nstlist_orig;
 +    }
 +    else if (ir->nstlist != nstlist_orig || rlist_new != ir->rlist)
 +    {
 +        sprintf(buf,"Changing nstlist from %d to %d, rlist from %g to %g",
 +                nstlist_orig,ir->nstlist,
 +                ir->rlist,rlist_new);
 +        if (MASTER(cr))
 +        {
 +            fprintf(stderr,"%s\n\n",buf);
 +        }
 +        if (fp != NULL)
 +        {
 +            fprintf(fp,"%s\n\n",buf);
 +        }
 +        ir->rlist     = rlist_new;
 +        ir->rlistlong = rlist_new;
 +    }
 +}
 +
 +static void prepare_verlet_scheme(FILE *fplog,
 +                                  gmx_hw_info_t *hwinfo,
 +                                  t_commrec *cr,
 +                                  gmx_hw_opt_t *hw_opt,
 +                                  const char *nbpu_opt,
 +                                  t_inputrec *ir,
 +                                  const gmx_mtop_t *mtop,
 +                                  matrix box,
 +                                  gmx_bool *bUseGPU)
 +{
 +    /* Here we only check for GPU usage on the MPI master process,
 +     * as here we don't know how many GPUs we will use yet.
 +     * We check for a GPU on all processes later.
 +     */
 +    *bUseGPU = hwinfo->bCanUseGPU || (getenv("GMX_EMULATE_GPU") != NULL);
 +
 +    if (ir->verletbuf_drift > 0)
 +    {
 +        /* Update the Verlet buffer size for the current run setup */
 +        verletbuf_list_setup_t ls;
 +        real rlist_new;
 +
 +        /* Here we assume CPU acceleration is on. But as calc_verlet_buffer_size
 +         * currently gives the same results for 4x8 and 4x4, and 4x2 gives a
 +         * larger buffer than 4x4, this is OK.
 +         */
 +        verletbuf_get_list_setup(*bUseGPU,&ls);
 +
 +        calc_verlet_buffer_size(mtop,det(box),ir,
 +                                ir->verletbuf_drift,&ls,
 +                                NULL,&rlist_new);
 +        if (rlist_new != ir->rlist)
 +        {
 +            if (fplog != NULL)
 +            {
 +                fprintf(fplog,"\nChanging rlist from %g to %g for non-bonded %dx%d atom kernels\n\n",
 +                        ir->rlist,rlist_new,
 +                        ls.cluster_size_i,ls.cluster_size_j);
 +            }
 +            ir->rlist     = rlist_new;
 +            ir->rlistlong = rlist_new;
 +        }
 +    }
 +
 +    /* With GPU or emulation we should check nstlist for performance */
 +    if ((EI_DYNAMICS(ir->eI) &&
 +         *bUseGPU &&
 +         ir->nstlist < NSTLIST_GPU_ENOUGH) ||
 +        getenv(NSTLIST_ENVVAR) != NULL)
 +    {
 +        /* Choose a better nstlist */
 +        increase_nstlist(fplog,cr,ir,mtop,box);
 +    }
 +}
 +
 +static void convert_to_verlet_scheme(FILE *fplog,
 +                                     t_inputrec *ir,
 +                                     gmx_mtop_t *mtop,real box_vol)
 +{
 +    char *conv_mesg="Converting input file with group cut-off scheme to the Verlet cut-off scheme";
 +
 +    md_print_warn(NULL,fplog,"%s\n",conv_mesg);
 +
 +    ir->cutoff_scheme   = ecutsVERLET;
 +    ir->verletbuf_drift = 0.005;
 +
 +    if (ir->rcoulomb != ir->rvdw)
 +    {
 +        gmx_fatal(FARGS,"The VdW and Coulomb cut-offs are different, whereas the Verlet scheme only supports equal cut-offs");
 +    }
 +
 +    if (ir->vdwtype == evdwUSER || EEL_USER(ir->coulombtype))
 +    {
 +        gmx_fatal(FARGS,"User non-bonded potentials are not (yet) supported with the Verlet scheme");
 +    }
 +    else if (EVDW_SWITCHED(ir->vdwtype) || EEL_SWITCHED(ir->coulombtype))
 +    {
 +        md_print_warn(NULL,fplog,"Converting switched or shifted interactions to a shifted potential (without force shift), this will lead to slightly different interaction potentials");
 +
 +        if (EVDW_SWITCHED(ir->vdwtype))
 +        {
 +            ir->vdwtype = evdwCUT;
 +        }
 +        if (EEL_SWITCHED(ir->coulombtype))
 +        {
 +            if (EEL_FULL(ir->coulombtype))
 +            {
 +                /* With full electrostatic only PME can be switched */
 +                ir->coulombtype = eelPME;
 +            }
 +            else
 +            {
 +                md_print_warn(NULL,fplog,"NOTE: Replacing %s electrostatics with reaction-field with epsilon-rf=inf\n",eel_names[ir->coulombtype]);
 +                ir->coulombtype = eelRF;
 +                ir->epsilon_rf  = 0.0;
 +            }
 +        }
 +
 +        /* We set the target energy drift to a small number.
 +         * Note that this is only for testing. For production the user
 +         * should think about this and set the mdp options.
 +         */
 +        ir->verletbuf_drift = 1e-4;
 +    }
 +
 +    if (inputrec2nboundeddim(ir) != 3)
 +    {
 +        gmx_fatal(FARGS,"Can only convert old tpr files to the Verlet cut-off scheme with 3D pbc");
 +    }
 +
 +    if (ir->efep != efepNO || ir->implicit_solvent != eisNO)
 +    {
 +        gmx_fatal(FARGS,"Will not convert old tpr files to the Verlet cut-off scheme with free-energy calculations or implicit solvent");
 +    }
 +
 +    if (EI_DYNAMICS(ir->eI) && !(EI_MD(ir->eI) && ir->etc == etcNO))
 +    {
 +        verletbuf_list_setup_t ls;
 +
 +        verletbuf_get_list_setup(FALSE,&ls);
 +        calc_verlet_buffer_size(mtop,box_vol,ir,ir->verletbuf_drift,&ls,
 +                                NULL,&ir->rlist);
 +    }
 +    else
 +    {
 +        ir->verletbuf_drift = -1;
 +        ir->rlist           = 1.05*max(ir->rvdw,ir->rcoulomb);
 +    }
 +
 +    gmx_mtop_remove_chargegroups(mtop);
 +}
 +
 +/* Check the process affinity mask. If it is non-zero, something
 + * else has set the affinity, and mdrun should honor that and
 + * not attempt to do its own thread pinning.
 + *
 + * This function should be called twice. Once before the OpenMP
 + * library gets initialized with bAfterOpenMPInit=FALSE (which will
 + * detect affinity set by external tools like taskset), and again
 + * later, after the OpenMP initialization, with bAfterOpenMPInit=TRUE
 + * (which will detect affinity changes made by the OpenMP library).
 + *
 + * Note that this will only work on Linux, because we use a GNU
 + * feature. */
 +static void check_cpu_affinity_set(FILE *fplog, const t_commrec *cr,
 +                                   gmx_hw_opt_t *hw_opt, int ncpus,
 +                                   gmx_bool bAfterOpenmpInit)
 +{
 +#ifdef HAVE_SCHED_GETAFFINITY
 +    cpu_set_t mask_current;
 +    int       i, ret, cpu_count, cpu_set;
 +    gmx_bool  bAllSet;
 +
 +    assert(hw_opt);
 +    if (!hw_opt->bThreadPinning)
 +    {
 +        /* internal affinity setting is off, don't bother checking process affinity */
 +        return;
 +    }
 +
 +    CPU_ZERO(&mask_current);
 +    if ((ret = sched_getaffinity(0, sizeof(cpu_set_t), &mask_current)) != 0)
 +    {
 +        /* failed to query affinity mask, will just return */
 +        if (debug)
 +        {
 +            fprintf(debug, "Failed to query affinity mask (error %d)", ret);
 +        }
 +        return;
 +    }
 +
 +    /* Before proceeding with the actual check, make sure that the number of
 +     * detected CPUs is >= the CPUs in the current set.
 +     * We need to check for CPU_COUNT as it was added only in glibc 2.6. */
 +#ifdef CPU_COUNT
 +    if (ncpus < CPU_COUNT(&mask_current))
 +    {
 +        if (debug)
 +        {
 +            fprintf(debug, "%d CPUs detected, but %d was returned by CPU_COUNT",
 +                    ncpus, CPU_COUNT(&mask_current));
 +        }
 +        return;
 +    }
 +#endif /* CPU_COUNT */
 +
 +    bAllSet = TRUE;
 +    for (i = 0; (i < ncpus && i < CPU_SETSIZE); i++)
 +    {
 +        bAllSet = bAllSet && (CPU_ISSET(i, &mask_current) != 0);
 +    }
 +
 +    if (!bAllSet)
 +    {
 +        if (!bAfterOpenmpInit)
 +        {
 +            md_print_warn(cr, fplog,
 +                          "%s detected a non-default process affinity, "
 +                          "so it will not attempt to pin its threads", ShortProgram());
 +        }
 +        else
 +        {
 +            md_print_warn(cr, fplog,
 +                          "%s detected a non-default process affinity, "
 +                          "probably set by the OpenMP library, "
 +                          "so it will not attempt to pin its threads", ShortProgram());
 +        }
 +        hw_opt->bThreadPinning = FALSE;
 +
 +        if (debug)
 +        {
 +            fprintf(debug, "Non-default affinity mask found, mdrun will not pin threads\n");
 +        }
 +    }
 +    else
 +    {
 +        if (debug)
 +        {
 +            fprintf(debug, "Default affinity mask found\n");
 +        }
 +    }
 +#endif /* HAVE_SCHED_GETAFFINITY */
 +}
 +
 +/* Set CPU affinity. Can be important for performance.
 +   On some systems (e.g. Cray) CPU affinity is set by default.
 +   But the default assignment does not work (well) when only some
 +   ranks have threads, which causes very low performance.
 +   External tools have cumbersome syntax for setting affinity
 +   in the case that only some ranks have threads.
 +   Thus it is important that GROMACS sets the affinity internally
 +   if only PME is using threads.
 +*/
 +static void set_cpu_affinity(FILE *fplog,
 +                             const t_commrec *cr,
 +                             gmx_hw_opt_t *hw_opt,
 +                             int nthreads_pme,
 +                             const gmx_hw_info_t *hwinfo,
 +                             const t_inputrec *inputrec)
 +{
 +#if defined GMX_THREAD_MPI
 +    /* When the number of tMPI threads equals the number of cores,
 +     * we already pinned in thread-MPI, so don't pin again here.
 +     */
 +    if (hw_opt->nthreads_tmpi == tMPI_Thread_get_hw_number())
 +    {
 +        return;
 +    }
 +#endif
 +
 +#ifndef __APPLE__
 +    /* If the tMPI thread affinity setting is not supported encourage the user
 +     * to report it as it's either a bug or an exotic platform which we might
 +     * want to support. */
 +    if (tMPI_Thread_setaffinity_support() != TMPI_SETAFFINITY_SUPPORT_YES)
 +    {
 +        md_print_warn(NULL, fplog,
 +                      "Can not set thread affinities on the current plarform. On NUMA systems this\n"
 +                      "can cause performance degradation. If you think your platform should support\n"
 +                      "setting affinities, contact the GROMACS developers.");
 +        return;
 +    }
 +#endif /* __APPLE__ */
 +
 +    if (hw_opt->bThreadPinning)
 +    {
 +        int nth_affinity_set, thread_id_node, thread_id,
 +            nthread_local, nthread_node, nthread_hw_max, nphyscore;
 +        int offset;
 +        char *env;
 +
 +        /* threads on this MPI process or TMPI thread */
 +        if (cr->duty & DUTY_PP)
 +        {
 +            nthread_local = gmx_omp_nthreads_get(emntNonbonded);
 +        }
 +        else
 +        {
 +            nthread_local = gmx_omp_nthreads_get(emntPME);
 +        }
 +
 +        /* map the current process to cores */
 +        thread_id_node = 0;
 +        nthread_node = nthread_local;
 +#ifdef GMX_MPI
 +        if (PAR(cr) || MULTISIM(cr))
 +        {
 +            /* We need a scan (prefix sum) of the thread counts over the
 +             * ranks on this compute node.
 +             */
 +            MPI_Comm comm_intra;
 +
 +            MPI_Comm_split(MPI_COMM_WORLD,gmx_hostname_num(),cr->rank_intranode,
 +                           &comm_intra);
 +            MPI_Scan(&nthread_local,&thread_id_node,1,MPI_INT,MPI_SUM,comm_intra);
 +            /* MPI_Scan is inclusive, but here we need exclusive */
 +            thread_id_node -= nthread_local;
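 +            /* Illustrative example (assumed values): with per-rank thread
 +             * counts {2,4,3} on one node, the inclusive scan gives {2,6,9};
 +             * subtracting nthread_local yields the exclusive prefix {0,2,6},
 +             * i.e. the index of each rank's first thread on the node. */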
 +            /* Get the total number of threads on this physical node */
 +            MPI_Allreduce(&nthread_local,&nthread_node,1,MPI_INT,MPI_SUM,comm_intra);
 +            MPI_Comm_free(&comm_intra);
 +        }
 +#endif
 +
 +        offset = 0;
 +        if (hw_opt->core_pinning_offset > 0)
 +        {
 +            offset = hw_opt->core_pinning_offset;
 +            if (SIMMASTER(cr))
 +            {
 +                fprintf(stderr, "Applying core pinning offset %d\n", offset);
 +            }
 +            if (fplog)
 +            {
 +                fprintf(fplog, "Applying core pinning offset %d\n", offset);
 +            }
 +        }
 +
 +        /* With Intel Hyper-Threading enabled, we want to pin consecutive
 +         * threads to physical cores when using more threads than physical
 +         * cores or when the user requests so.
 +         */
 +        nthread_hw_max = hwinfo->nthreads_hw_avail;
 +        nphyscore = -1;
 +        if (hw_opt->bPinHyperthreading ||
 +            (gmx_cpuid_x86_smt(hwinfo->cpuid_info) == GMX_CPUID_X86_SMT_ENABLED &&
 +             nthread_node > nthread_hw_max/2 && getenv("GMX_DISABLE_PINHT") == NULL))
 +        {
 +            if (gmx_cpuid_x86_smt(hwinfo->cpuid_info) != GMX_CPUID_X86_SMT_ENABLED)
 +            {
 +                /* We print to stderr on all processes, as we might have
 +                 * different settings on different physical nodes.
 +                 */
 +                if (gmx_cpuid_vendor(hwinfo->cpuid_info) != GMX_CPUID_VENDOR_INTEL)
 +                {
 +                    md_print_warn(NULL, fplog, "Pinning for Hyper-Threading layout requested, "
 +                                  "but non-Intel CPU detected (vendor: %s)\n",
 +                                  gmx_cpuid_vendor_string[gmx_cpuid_vendor(hwinfo->cpuid_info)]);
 +                }
 +                else
 +                {
 +                    md_print_warn(NULL, fplog, "Pinning for Hyper-Threading layout requested, "
 +                                  "but the CPU detected does not have Intel Hyper-Threading support "
 +                                  "(or it is turned off)\n");
 +                }
 +            }
 +            nphyscore = nthread_hw_max/2;
 +
 +            if (SIMMASTER(cr))
 +            {
 +                fprintf(stderr, "Pinning to Hyper-Threading cores with %d physical cores in a compute node\n",
 +                        nphyscore);
 +            }
 +            if (fplog)
 +            {
 +                fprintf(fplog, "Pinning to Hyper-Threading cores with %d physical cores in a compute node\n",
 +                        nphyscore);
 +            }
 +        }
 +
 +        /* Set the per-thread affinity. In order to be able to check the success
 +         * of affinity settings, we will set nth_affinity_set to 1 on threads
 +         * where the affinity setting succeeded and to 0 where it failed.
 +         * Reducing these 0/1 values over the threads will give the total number
 +         * of threads on which we succeeded.
 +         */
 +        nth_affinity_set = 0;
 +#pragma omp parallel firstprivate(thread_id_node) num_threads(nthread_local) \
 +                     reduction(+:nth_affinity_set)
 +        {
 +            int      core;
 +            gmx_bool setaffinity_ret;
 +
 +            thread_id       = gmx_omp_get_thread_num();
 +            thread_id_node += thread_id;
 +            if (nphyscore <= 0)
 +            {
 +                core = offset + thread_id_node;
 +            }
 +            else
 +            {
 +                /* Lock pairs of threads to the same hyperthreaded core */
 +                core = offset + thread_id_node/2 + (thread_id_node % 2)*nphyscore;
 +            }
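 +            /* Example (assuming the common enumeration in which logical
 +             * CPUs c and c+nphyscore are SMT siblings): with offset=0 and
 +             * nphyscore=4, threads 0..7 are mapped to cores 0,4,1,5,2,6,3,7,
 +             * i.e. consecutive threads share a physical core pairwise. */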
 +
 +            setaffinity_ret = tMPI_Thread_setaffinity_single(tMPI_Thread_self(), core);
 +
 +            /* store the per-thread success-values of the setaffinity */
 +            nth_affinity_set = (setaffinity_ret == 0);
 +
 +            if (debug)
 +            {
 +                fprintf(debug, "On rank %2d, thread %2d, core %2d the affinity setting returned %d\n",
 +                        cr->nodeid, gmx_omp_get_thread_num(), core, setaffinity_ret);
 +            }
 +        }
 +
 +        if (nth_affinity_set > nthread_local)
 +        {
 +            char msg[STRLEN];
 +
 +            sprintf(msg, "Looks like we have set affinity for more threads than "
 +                    "we have (%d > %d)!\n", nth_affinity_set, nthread_local);
 +            gmx_incons(msg);
 +        }
 +        else
 +        {
 +            /* check & warn if some threads failed to set their affinities */
 +            if (nth_affinity_set != nthread_local)
 +            {
 +                char sbuf1[STRLEN], sbuf2[STRLEN];
 +
 +                /* sbuf1 contains rank info, while sbuf2 contains OpenMP thread info */
 +                sbuf1[0] = sbuf2[0] = '\0';
 +#ifdef GMX_MPI
 +#ifdef GMX_THREAD_MPI
 +                sprintf(sbuf1, "In thread-MPI thread #%d: ", cr->nodeid);
 +#else /* GMX_LIB_MPI */
 +                sprintf(sbuf1, "In MPI process #%d: ", cr->nodeid);
 +#endif
 +#endif /* GMX_MPI */
 +
 +                if (nthread_local > 1)
 +                {
 +                    sprintf(sbuf2, "of %d/%d thread%s ",
 +                            nthread_local - nth_affinity_set, nthread_local,
 +                            (nthread_local - nth_affinity_set) > 1 ? "s" : "");
 +                }
 +
 +                md_print_warn(NULL, fplog,
 +                              "NOTE: %sAffinity setting %sfailed.\n"
 +                              "      This can cause performance degradation!",
 +                              sbuf1, sbuf2);
 +            }
 +        }
 +    }
 +}
 +
 +
 +static void check_and_update_hw_opt(gmx_hw_opt_t *hw_opt,
-     gmx_omp_nthreads_read_env(&hw_opt->nthreads_omp);
++                                    int cutoff_scheme,
++                                    gmx_bool bIsSimMaster)
 +{
-         check_and_update_hw_opt(hw_opt,minf.cutoff_scheme);
++    gmx_omp_nthreads_read_env(&hw_opt->nthreads_omp, bIsSimMaster);
 +
 +#ifndef GMX_THREAD_MPI
 +    if (hw_opt->nthreads_tot > 0)
 +    {
 +        gmx_fatal(FARGS,"Setting the total number of threads is only supported with thread-MPI and Gromacs was compiled without thread-MPI");
 +    }
 +    if (hw_opt->nthreads_tmpi > 0)
 +    {
 +        gmx_fatal(FARGS,"Setting the number of thread-MPI threads is only supported with thread-MPI and Gromacs was compiled without thread-MPI");
 +    }
 +#endif
 +
 +    if (hw_opt->nthreads_tot > 0 && hw_opt->nthreads_omp_pme <= 0)
 +    {
 +        /* We have the same number of OpenMP threads for PP and PME processes,
 +         * thus we can perform several consistency checks.
 +         */
 +        if (hw_opt->nthreads_tmpi > 0 &&
 +            hw_opt->nthreads_omp > 0 &&
 +            hw_opt->nthreads_tot != hw_opt->nthreads_tmpi*hw_opt->nthreads_omp)
 +        {
 +            gmx_fatal(FARGS,"The total number of threads requested (%d) does not match the thread-MPI threads (%d) times the OpenMP threads (%d) requested",
 +                      hw_opt->nthreads_tot,hw_opt->nthreads_tmpi,hw_opt->nthreads_omp);
 +        }
 +
 +        if (hw_opt->nthreads_tmpi > 0 &&
 +            hw_opt->nthreads_tot % hw_opt->nthreads_tmpi != 0)
 +        {
 +            gmx_fatal(FARGS,"The total number of threads requested (%d) is not divisible by the number of thread-MPI threads requested (%d)",
 +                      hw_opt->nthreads_tot,hw_opt->nthreads_tmpi);
 +        }
 +
 +        if (hw_opt->nthreads_omp > 0 &&
 +            hw_opt->nthreads_tot % hw_opt->nthreads_omp != 0)
 +        {
 +            gmx_fatal(FARGS,"The total number of threads requested (%d) is not divisible by the number of OpenMP threads requested (%d)",
 +                      hw_opt->nthreads_tot,hw_opt->nthreads_omp);
 +        }
 +
 +        if (hw_opt->nthreads_tmpi > 0 &&
 +            hw_opt->nthreads_omp <= 0)
 +        {
 +            hw_opt->nthreads_omp = hw_opt->nthreads_tot/hw_opt->nthreads_tmpi;
 +        }
 +    }
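 +
 +    /* Example of the checks above (hypothetical option values):
 +     * "-nt 8 -ntmpi 2" is consistent and implies nthreads_omp = 4,
 +     * whereas "-nt 8 -ntmpi 3" fails the divisibility check. */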
 +
 +#ifndef GMX_OPENMP
 +    if (hw_opt->nthreads_omp > 1)
 +    {
 +        gmx_fatal(FARGS,"OpenMP threads are requested, but Gromacs was compiled without OpenMP support");
 +    }
 +#endif
 +
 +    if (cutoff_scheme == ecutsGROUP)
 +    {
 +        /* We only have OpenMP support for PME-only nodes */
 +        if (hw_opt->nthreads_omp > 1)
 +        {
 +            gmx_fatal(FARGS,"OpenMP threads have been requested with cut-off scheme %s, but these are only supported with cut-off scheme %s",
 +                      ecutscheme_names[cutoff_scheme],
 +                      ecutscheme_names[ecutsVERLET]);
 +        }
 +        hw_opt->nthreads_omp = 1;
 +    }
 +
 +    if (hw_opt->nthreads_omp_pme > 0 && hw_opt->nthreads_omp <= 0)
 +    {
 +        gmx_fatal(FARGS,"You need to specify -ntomp in addition to -ntomp_pme");
 +    }
 +
 +    if (hw_opt->nthreads_tot == 1)
 +    {
 +        hw_opt->nthreads_tmpi = 1;
 +
 +        if (hw_opt->nthreads_omp > 1)
 +        {
 +            gmx_fatal(FARGS,"You requested %d OpenMP threads with %d total threads",
 +                      hw_opt->nthreads_omp,hw_opt->nthreads_tot);
 +        }
 +        hw_opt->nthreads_omp = 1;
 +    }
 +
 +    if (hw_opt->nthreads_omp_pme <= 0 && hw_opt->nthreads_omp > 0)
 +    {
 +        hw_opt->nthreads_omp_pme = hw_opt->nthreads_omp;
 +    }
 +
 +    if (debug)
 +    {
 +        fprintf(debug,"hw_opt: nt %d ntmpi %d ntomp %d ntomp_pme %d gpu_id '%s'\n",
 +                hw_opt->nthreads_tot,
 +                hw_opt->nthreads_tmpi,
 +                hw_opt->nthreads_omp,
 +                hw_opt->nthreads_omp_pme,
 +                hw_opt->gpu_id!=NULL ? hw_opt->gpu_id : "");
 +    }
 +}
 +
 +
 +/* Override the value in inputrec with value passed on the command line (if any) */
 +static void override_nsteps_cmdline(FILE *fplog,
 +                                    int nsteps_cmdline,
 +                                    t_inputrec *ir,
 +                                    const t_commrec *cr)
 +{
 +    assert(ir);
 +    assert(cr);
 +
 +    /* override with anything else than the default -2 */
 +    if (nsteps_cmdline > -2)
 +    {
 +        char stmp[STRLEN];
 +
 +        ir->nsteps = nsteps_cmdline;
 +        if (EI_DYNAMICS(ir->eI))
 +        {
 +            sprintf(stmp, "Overriding nsteps with value passed on the command line: %d steps, %.3f ps",
 +                    nsteps_cmdline, nsteps_cmdline*ir->delta_t);
 +        }
 +        else
 +        {
 +            sprintf(stmp, "Overriding nsteps with value passed on the command line: %d steps",
 +                    nsteps_cmdline);
 +        }
 +
 +        md_print_warn(cr, fplog, "%s\n", stmp);
 +    }
 +}
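 +
 +/* Usage note (illustrative): "mdrun -nsteps 1000" overrides the value of
 + * ir->nsteps read from the run input file; the command-line default of -2
 + * means "no override", so any value of -1 or larger replaces it. */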
 +
 +/* Data structure set by SIMMASTER which needs to be passed to all nodes
 + * before the other nodes have read the tpx file and called gmx_detect_hardware.
 + */
 +typedef struct {
 +    int cutoff_scheme; /* The cutoff scheme from t_inputrec */
 +    gmx_bool bUseGPU;       /* Use GPU or GPU emulation          */
 +} master_inf_t;
 +
 +int mdrunner(gmx_hw_opt_t *hw_opt,
 +             FILE *fplog,t_commrec *cr,int nfile,
 +             const t_filenm fnm[], const output_env_t oenv, gmx_bool bVerbose,
 +             gmx_bool bCompact, int nstglobalcomm,
 +             ivec ddxyz,int dd_node_order,real rdd,real rconstr,
 +             const char *dddlb_opt,real dlb_scale,
 +             const char *ddcsx,const char *ddcsy,const char *ddcsz,
 +             const char *nbpu_opt,
 +             int nsteps_cmdline, int nstepout,int resetstep,
 +             int nmultisim,int repl_ex_nst,int repl_ex_nex,
 +             int repl_ex_seed, real pforce,real cpt_period,real max_hours,
 +             const char *deviceOptions, unsigned long Flags)
 +{
 +    gmx_bool   bForceUseGPU,bTryUseGPU;
 +    double     nodetime=0,realtime;
 +    t_inputrec *inputrec;
 +    t_state    *state=NULL;
 +    matrix     box;
 +    gmx_ddbox_t ddbox={0};
 +    int        npme_major,npme_minor;
 +    real       tmpr1,tmpr2;
 +    t_nrnb     *nrnb;
 +    gmx_mtop_t *mtop=NULL;
 +    t_mdatoms  *mdatoms=NULL;
 +    t_forcerec *fr=NULL;
 +    t_fcdata   *fcd=NULL;
 +    real       ewaldcoeff=0;
 +    gmx_pme_t  *pmedata=NULL;
 +    gmx_vsite_t *vsite=NULL;
 +    gmx_constr_t constr;
 +    int        i,m,nChargePerturbed=-1,status,nalloc;
 +    char       *gro;
 +    gmx_wallcycle_t wcycle;
 +    gmx_bool       bReadRNG,bReadEkin;
 +    int        list;
 +    gmx_runtime_t runtime;
 +    int        rc;
 +    gmx_large_int_t reset_counters;
 +    gmx_edsam_t ed=NULL;
 +    t_commrec   *cr_old=cr; 
 +    int         nthreads_pme=1;
 +    int         nthreads_pp=1;
 +    gmx_membed_t membed=NULL;
 +    gmx_hw_info_t *hwinfo=NULL;
 +    master_inf_t minf={-1,FALSE};
 +
 +    /* CAUTION: threads may be started later on in this function, so
 +       cr doesn't reflect the final parallel state right now */
 +    snew(inputrec,1);
 +    snew(mtop,1);
 +    
 +    if (Flags & MD_APPENDFILES) 
 +    {
 +        fplog = NULL;
 +    }
 +
 +    bForceUseGPU = (strncmp(nbpu_opt, "gpu", 3) == 0);
 +    bTryUseGPU   = (strncmp(nbpu_opt, "auto", 4) == 0) || bForceUseGPU;
 +
 +    snew(state,1);
 +    if (SIMMASTER(cr)) 
 +    {
 +        /* Read (nearly) all data required for the simulation */
 +        read_tpx_state(ftp2fn(efTPX,nfile,fnm),inputrec,state,NULL,mtop);
 +
 +        if (inputrec->cutoff_scheme != ecutsVERLET &&
 +            ((Flags & MD_TESTVERLET) || getenv("GMX_VERLET_SCHEME") != NULL))
 +        {
 +            convert_to_verlet_scheme(fplog,inputrec,mtop,det(state->box));
 +        }
 +
 +        /* Detect hardware, gather information. With thread-MPI only thread 0
 +         * does this, and hwinfo is broadcast around after the threads have
 +         * been started. */
 +        snew(hwinfo, 1);
 +        gmx_detect_hardware(fplog, hwinfo, cr,
 +                            bForceUseGPU, bTryUseGPU, hw_opt->gpu_id);
 +
 +        minf.cutoff_scheme = inputrec->cutoff_scheme;
 +        minf.bUseGPU       = FALSE;
 +
 +        if (inputrec->cutoff_scheme == ecutsVERLET)
 +        {
 +            prepare_verlet_scheme(fplog,hwinfo,cr,hw_opt,nbpu_opt,
 +                                  inputrec,mtop,state->box,
 +                                  &minf.bUseGPU);
 +        }
 +        else if (hwinfo->bCanUseGPU)
 +        {
 +            md_print_warn(cr,fplog,
 +                          "NOTE: GPU(s) found, but the current simulation cannot use GPUs\n"
 +                          "      To use a GPU, set the mdp option: cutoff-scheme = Verlet\n"
 +                          "      (for quick performance testing you can use the -testverlet option)\n");
 +
 +            if (bForceUseGPU)
 +            {
 +                gmx_fatal(FARGS,"GPU requested, but can't be used without cutoff-scheme=Verlet");
 +            }
 +        }
 +    }
 +#ifndef GMX_THREAD_MPI
 +    if (PAR(cr))
 +    {
 +        gmx_bcast_sim(sizeof(minf),&minf,cr);
 +    }
 +#endif
 +    if (minf.bUseGPU && cr->npmenodes == -1)
 +    {
 +        /* Don't automatically use PME-only nodes with GPUs */
 +        cr->npmenodes = 0;
 +    }
 +
 +    /* Check for externally set OpenMP affinity and turn off internal
 +     * pinning if any is found. We need to do this check early to tell
 +     * thread-MPI whether it should do pinning when spawning threads.
 +     */
 +    gmx_omp_check_thread_affinity(fplog, cr, hw_opt);
 +
 +#ifdef GMX_THREAD_MPI
 +    /* With thread-MPI inputrec is only set here on the master thread */
 +    if (SIMMASTER(cr))
 +#endif
 +    {
-     init_disres(fplog,mtop,inputrec,cr,Flags & MD_PARTDEC,fcd,state);
++        check_and_update_hw_opt(hw_opt,minf.cutoff_scheme,SIMMASTER(cr));
 +
 +#ifdef GMX_THREAD_MPI
 +        /* Early check for externally set process affinity. We can't do this
 +         * over all MPI processes because hwinfo is not available everywhere,
 +         * but with thread-MPI it is needed here because pinning might get
 +         * turned off, which must be known before thread-MPI is started. */
 +        check_cpu_affinity_set(fplog,
 +                               NULL,
 +                               hw_opt, hwinfo->nthreads_hw_avail, FALSE);
 +#endif
 +
 +#ifdef GMX_THREAD_MPI
 +        if (cr->npmenodes > 0 && hw_opt->nthreads_tmpi <= 0)
 +        {
 +            gmx_fatal(FARGS,"You need to explicitly specify the number of MPI threads (-ntmpi) when using separate PME nodes");
 +        }
 +#endif
 +
 +        if (hw_opt->nthreads_omp_pme != hw_opt->nthreads_omp &&
 +            cr->npmenodes <= 0)
 +        {
 +            gmx_fatal(FARGS,"You need to explicitly specify the number of PME nodes (-npme) when using different number of OpenMP threads for PP and PME nodes");
 +        }
 +    }
 +
 +#ifdef GMX_THREAD_MPI
 +    if (SIMMASTER(cr))
 +    {
 +        /* NOW the threads will be started: */
 +        hw_opt->nthreads_tmpi = get_nthreads_mpi(hwinfo,
 +                                                 hw_opt,
 +                                                 inputrec, mtop,
 +                                                 cr, fplog);
 +        if (hw_opt->nthreads_tot > 0 && hw_opt->nthreads_omp <= 0)
 +        {
 +            hw_opt->nthreads_omp = hw_opt->nthreads_tot/hw_opt->nthreads_tmpi;
 +        }
 +
 +        if (hw_opt->nthreads_tmpi > 1)
 +        {
 +            /* now start the threads. */
 +            cr=mdrunner_start_threads(hw_opt, fplog, cr_old, nfile, fnm, 
 +                                      oenv, bVerbose, bCompact, nstglobalcomm, 
 +                                      ddxyz, dd_node_order, rdd, rconstr, 
 +                                      dddlb_opt, dlb_scale, ddcsx, ddcsy, ddcsz,
 +                                      nbpu_opt,
 +                                      nsteps_cmdline, nstepout, resetstep, nmultisim, 
 +                                      repl_ex_nst, repl_ex_nex, repl_ex_seed, pforce,
 +                                      cpt_period, max_hours, deviceOptions, 
 +                                      Flags);
 +            /* the main thread continues here with a new cr. We don't deallocate
 +               the old cr because other threads may still be reading it. */
 +            if (cr == NULL)
 +            {
 +                gmx_comm("Failed to spawn threads");
 +            }
 +        }
 +    }
 +#endif
 +    /* END OF CAUTION: cr is now reliable */
 +
 +    /* g_membed initialisation.
 +     * Because we change the mtop, init_membed is called before init_parallel
 +     * (in case we ever want to make it run in parallel). */
 +    if (opt2bSet("-membed",nfile,fnm))
 +    {
 +        if (MASTER(cr))
 +        {
 +            fprintf(stderr,"Initializing membed");
 +        }
 +        membed = init_membed(fplog,nfile,fnm,mtop,inputrec,state,cr,&cpt_period);
 +    }
 +
 +    if (PAR(cr))
 +    {
 +        /* now broadcast everything to the non-master nodes/threads: */
 +        init_parallel(fplog, cr, inputrec, mtop);
 +
 +        /* This check needs to happen after get_nthreads_mpi() */
 +        if (inputrec->cutoff_scheme == ecutsVERLET && (Flags & MD_PARTDEC))
 +        {
 +            gmx_fatal_collective(FARGS,cr,NULL,
 +                                 "The Verlet cut-off scheme is not supported with particle decomposition.\n"
 +                                 "You can achieve the same effect as particle decomposition by running in parallel using only OpenMP threads.");
 +        }
 +    }
 +    if (fplog != NULL)
 +    {
 +        pr_inputrec(fplog,0,"Input Parameters",inputrec,FALSE);
 +    }
 +
 +#if defined GMX_THREAD_MPI
 +    /* With tMPI we detected the hardware on thread 0 and we just pass the
 +     * hwinfo pointer to the other threads -- slightly uncool, but it works
 +     * fine; we just need to make sure the data doesn't get freed twice. */
 +    if (cr->nnodes > 1)
 +    {
 +        if (!SIMMASTER(cr))
 +        {
 +            snew(hwinfo, 1);
 +        }
 +        gmx_bcast(sizeof(&hwinfo), &hwinfo, cr);
 +    }
 +#else
 +    if (PAR(cr) && !SIMMASTER(cr))
 +    {
 +        /* now we have inputrec on all nodes, can run the detection */
 +        /* TODO: perhaps it's better to propagate within a node instead? */
 +        snew(hwinfo, 1);
 +        gmx_detect_hardware(fplog, hwinfo, cr,
 +                                 bForceUseGPU, bTryUseGPU, hw_opt->gpu_id);
 +    }
 +
 +    /* Now do the affinity check with MPI/no-MPI (done earlier with thread-MPI). */
 +    check_cpu_affinity_set(fplog, cr,
 +                           hw_opt, hwinfo->nthreads_hw_avail, FALSE);
 +#endif
 +
 +    /* now make sure the state is initialized and propagated */
 +    set_state_entries(state,inputrec,cr->nnodes);
 +
 +    /* A parallel command line option consistency check that we can
 +       only do after any threads have started. */
 +    if (!PAR(cr) &&
 +        (ddxyz[XX] > 1 || ddxyz[YY] > 1 || ddxyz[ZZ] > 1 || cr->npmenodes > 0))
 +    {
 +        gmx_fatal(FARGS,
 +                  "The -dd or -npme options request a parallel simulation, "
 +#ifndef GMX_MPI
 +                  "but %s was compiled without threads or MPI enabled"
 +#else
 +#ifdef GMX_THREAD_MPI
 +                  "but the number of threads (option -nt) is 1"
 +#else
 +                  "but %s was not started through mpirun/mpiexec or only one process was requested through mpirun/mpiexec"
 +#endif
 +#endif
 +                  , ShortProgram()
 +            );
 +    }
 +
 +    if ((Flags & MD_RERUN) &&
 +        (EI_ENERGY_MINIMIZATION(inputrec->eI) || eiNM == inputrec->eI))
 +    {
 +        gmx_fatal(FARGS, "The .mdp file specified an energy mininization or normal mode algorithm, and these are not compatible with mdrun -rerun");
 +    }
 +
 +    if (can_use_allvsall(inputrec,mtop,TRUE,cr,fplog) && PAR(cr))
 +    {
 +        /* All-vs-all loops do not work with domain decomposition */
 +        Flags |= MD_PARTDEC;
 +    }
 +
 +    if (!EEL_PME(inputrec->coulombtype) || (Flags & MD_PARTDEC))
 +    {
 +        if (cr->npmenodes > 0)
 +        {
 +            if (!EEL_PME(inputrec->coulombtype))
 +            {
 +                gmx_fatal_collective(FARGS,cr,NULL,
 +                                     "PME nodes are requested, but the system does not use PME electrostatics");
 +            }
 +            if (Flags & MD_PARTDEC)
 +            {
 +                gmx_fatal_collective(FARGS,cr,NULL,
 +                                     "PME nodes are requested, but particle decomposition does not support separate PME nodes");
 +            }
 +        }
 +
 +        cr->npmenodes = 0;
 +    }
 +
 +#ifdef GMX_FAHCORE
 +    fcRegisterSteps(inputrec->nsteps,inputrec->init_step);
 +#endif
 +
 +    /* NMR restraints must be initialized before load_checkpoint,
 +     * since with time averaging the history is added to t_state.
 +     * For proper consistency check we therefore need to extend
 +     * t_state here.
 +     * So the PME-only nodes (if present) will also initialize
 +     * the distance restraints.
 +     */
 +    snew(fcd,1);
 +
 +    /* This needs to be called before read_checkpoint to extend the state */
-     if (integrator[inputrec->eI].func == do_md
- #ifdef GMX_OPENMM
-         ||
-         integrator[inputrec->eI].func == do_md_openmm
- #endif
-         )
++    init_disres(fplog,mtop,inputrec,cr,Flags & MD_PARTDEC,fcd,state, repl_ex_nst > 0);
 +
 +    if (gmx_mtop_ftype_count(mtop,F_ORIRES) > 0)
 +    {
 +        if (PAR(cr) && !(Flags & MD_PARTDEC))
 +        {
 +            gmx_fatal(FARGS,"Orientation restraints do not work (yet) with domain decomposition, use particle decomposition (mdrun option -pd)");
 +        }
 +        /* Orientation restraints */
 +        if (MASTER(cr))
 +        {
 +            init_orires(fplog,mtop,state->x,inputrec,cr->ms,&(fcd->orires),
 +                        state);
 +        }
 +    }
 +
 +    if (DEFORM(*inputrec))
 +    {
 +        /* Store the deform reference box before reading the checkpoint */
 +        if (SIMMASTER(cr))
 +        {
 +            copy_mat(state->box,box);
 +        }
 +        if (PAR(cr))
 +        {
 +            gmx_bcast(sizeof(box),box,cr);
 +        }
 +        /* Because we do not have the update struct available yet
 +         * in which the reference values should be stored,
 +         * we store them temporarily in static variables.
 +         * This should be thread safe, since they are only written once
 +         * and with identical values.
 +         */
 +#ifdef GMX_THREAD_MPI
 +        tMPI_Thread_mutex_lock(&deform_init_box_mutex);
 +#endif
 +        deform_init_init_step_tpx = inputrec->init_step;
 +        copy_mat(box,deform_init_box_tpx);
 +#ifdef GMX_THREAD_MPI
 +        tMPI_Thread_mutex_unlock(&deform_init_box_mutex);
 +#endif
 +    }
 +
 +    if (opt2bSet("-cpi",nfile,fnm)) 
 +    {
 +        /* Check if checkpoint file exists before doing continuation.
 +         * This way we can use identical input options for the first and subsequent runs...
 +         */
 +        if( gmx_fexist_master(opt2fn_master("-cpi",nfile,fnm,cr),cr) )
 +        {
 +            load_checkpoint(opt2fn_master("-cpi",nfile,fnm,cr),&fplog,
 +                            cr,Flags & MD_PARTDEC,ddxyz,
 +                            inputrec,state,&bReadRNG,&bReadEkin,
 +                            (Flags & MD_APPENDFILES),
 +                            (Flags & MD_APPENDFILESSET));
 +            
 +            if (bReadRNG)
 +            {
 +                Flags |= MD_READ_RNG;
 +            }
 +            if (bReadEkin)
 +            {
 +                Flags |= MD_READ_EKIN;
 +            }
 +        }
 +    }
 +
 +    if (((MASTER(cr) || (Flags & MD_SEPPOT)) && (Flags & MD_APPENDFILES))
 +#ifdef GMX_THREAD_MPI
 +        /* With thread MPI only the master node/thread exists in mdrun.c,
 +         * therefore non-master nodes need to open the "seppot" log file here.
 +         */
 +        || (!MASTER(cr) && (Flags & MD_SEPPOT))
 +#endif
 +        )
 +    {
 +        gmx_log_open(ftp2fn(efLOG,nfile,fnm),cr,!(Flags & MD_SEPPOT),
 +                             Flags,&fplog);
 +    }
 +
 +    /* override nsteps with value from cmdline */
 +    override_nsteps_cmdline(fplog, nsteps_cmdline, inputrec, cr);
 +
 +    if (SIMMASTER(cr)) 
 +    {
 +        copy_mat(state->box,box);
 +    }
 +
 +    if (PAR(cr)) 
 +    {
 +        gmx_bcast(sizeof(box),box,cr);
 +    }
 +
 +    /* Essential dynamics */
 +    if (opt2bSet("-ei",nfile,fnm))
 +    {
 +        /* Open input and output files, allocate space for ED data structure */
 +        ed = ed_open(mtop->natoms,&state->edsamstate,nfile,fnm,Flags,oenv,cr);
 +    }
 +
 +    if (PAR(cr) && !((Flags & MD_PARTDEC) ||
 +                     EI_TPI(inputrec->eI) ||
 +                     inputrec->eI == eiNM))
 +    {
 +        cr->dd = init_domain_decomposition(fplog,cr,Flags,ddxyz,rdd,rconstr,
 +                                           dddlb_opt,dlb_scale,
 +                                           ddcsx,ddcsy,ddcsz,
 +                                           mtop,inputrec,
 +                                           box,state->x,
 +                                           &ddbox,&npme_major,&npme_minor);
 +
 +        make_dd_communicators(fplog,cr,dd_node_order);
 +
 +        /* Set overallocation to avoid frequent reallocation of arrays */
 +        set_over_alloc_dd(TRUE);
 +    }
 +    else
 +    {
 +        /* PME, if used, is done on all nodes with 1D decomposition */
 +        cr->npmenodes = 0;
 +        cr->duty = (DUTY_PP | DUTY_PME);
 +        npme_major = 1;
 +        npme_minor = 1;
 +        if (!EI_TPI(inputrec->eI))
 +        {
 +            npme_major = cr->nnodes;
 +        }
 +        
 +        if (inputrec->ePBC == epbcSCREW)
 +        {
 +            gmx_fatal(FARGS,
 +                      "pbc=%s is only implemented with domain decomposition",
 +                      epbc_names[inputrec->ePBC]);
 +        }
 +    }
 +
 +    if (PAR(cr))
 +    {
 +        /* After possible communicator splitting in make_dd_communicators.
 +         * we can set up the intra/inter node communication.
 +         */
 +        gmx_setup_nodecomm(fplog,cr);
 +    }
 +
 +    /* Initialize per-physical-node MPI process/thread ID and counters. */
 +    gmx_init_intranode_counters(cr);
 +
 +#ifdef GMX_MPI
 +    md_print_info(cr,fplog,"Using %d MPI %s\n",
 +                  cr->nnodes,
 +#ifdef GMX_THREAD_MPI
 +                  cr->nnodes==1 ? "thread" : "threads"
 +#else
 +                  cr->nnodes==1 ? "process" : "processes"
 +#endif
 +                  );
 +    fflush(stderr);
 +#endif
 +
 +    gmx_omp_nthreads_init(fplog, cr,
 +                          hwinfo->nthreads_hw_avail,
 +                          hw_opt->nthreads_omp,
 +                          hw_opt->nthreads_omp_pme,
 +                          (cr->duty & DUTY_PP) == 0,
 +                          inputrec->cutoff_scheme == ecutsVERLET);
 +
 +    gmx_check_hw_runconf_consistency(fplog, hwinfo, cr, hw_opt->nthreads_tmpi, minf.bUseGPU);
 +
 +    /* Get the number of PP/PME threads.
 +       PME: the env variable should be read on only one node to make sure
 +       it is identical everywhere.
 +     */
 +    /* TODO nthreads_pp is only used for pinning threads.
 +     * This is a temporary solution until we have a hw topology library.
 +     */
 +    nthreads_pp  = gmx_omp_nthreads_get(emntNonbonded);
 +    nthreads_pme = gmx_omp_nthreads_get(emntPME);
 +
 +    wcycle = wallcycle_init(fplog,resetstep,cr,nthreads_pp,nthreads_pme);
 +
 +    if (PAR(cr))
 +    {
 +        /* Master synchronizes its value of reset_counters with all nodes 
 +         * including PME only nodes */
 +        reset_counters = wcycle_get_reset_counters(wcycle);
 +        gmx_bcast_sim(sizeof(reset_counters),&reset_counters,cr);
 +        wcycle_set_reset_counters(wcycle, reset_counters);
 +    }
 +
 +    snew(nrnb,1);
 +    if (cr->duty & DUTY_PP)
 +    {
 +        /* For domain decomposition we allocate dynamically
 +         * in dd_partition_system.
 +         */
 +        if (DOMAINDECOMP(cr))
 +        {
 +            bcast_state_setup(cr,state);
 +        }
 +        else
 +        {
 +            if (PAR(cr))
 +            {
 +                bcast_state(cr,state,TRUE);
 +            }
 +        }
 +
 +        /* Initiate forcerecord */
 +        fr = mk_forcerec();
 +        fr->hwinfo = hwinfo;
 +        init_forcerec(fplog,oenv,fr,fcd,inputrec,mtop,cr,box,FALSE,
 +                      opt2fn("-table",nfile,fnm),
 +                      opt2fn("-tabletf",nfile,fnm),
 +                      opt2fn("-tablep",nfile,fnm),
 +                      opt2fn("-tableb",nfile,fnm),
 +                      nbpu_opt,
 +                      FALSE,pforce);
 +
 +        /* version for PCA_NOT_READ_NODE (see md.c) */
 +        /*init_forcerec(fplog,fr,fcd,inputrec,mtop,cr,box,FALSE,
 +          "nofile","nofile","nofile","nofile",FALSE,pforce);
 +          */        
 +        fr->bSepDVDL = ((Flags & MD_SEPPOT) == MD_SEPPOT);
 +
 +        /* Initialize QM-MM */
 +        if(fr->bQMMM)
 +        {
 +            init_QMMMrec(cr,box,mtop,inputrec,fr);
 +        }
 +
 +        /* Initialize the mdatoms structure.
 +         * mdatoms is not filled with atom data,
 +         * as this can not be done now with domain decomposition.
 +         */
 +        mdatoms = init_mdatoms(fplog,mtop,inputrec->efep!=efepNO);
 +
 +        /* Initialize the virtual site communication */
 +        vsite = init_vsite(mtop,cr,FALSE);
 +
 +        calc_shifts(box,fr->shift_vec);
 +
 +        /* With periodic molecules the charge groups should be whole at start up
 +         * and the virtual sites should not be far from their proper positions.
 +         */
 +        if (!inputrec->bContinuation && MASTER(cr) &&
 +            !(inputrec->ePBC != epbcNONE && inputrec->bPeriodicMols))
 +        {
 +            /* Make molecules whole at start of run */
 +            if (fr->ePBC != epbcNONE)
 +            {
 +                do_pbc_first_mtop(fplog,inputrec->ePBC,box,mtop,state->x);
 +            }
 +            if (vsite)
 +            {
 +                /* Correct initial vsite positions are required
 +                 * for the initial distribution in the domain decomposition
 +                 * and for the initial shell prediction.
 +                 */
 +                construct_vsites_mtop(fplog,vsite,mtop,state->x);
 +            }
 +        }
 +
 +        if (EEL_PME(fr->eeltype))
 +        {
 +            ewaldcoeff = fr->ewaldcoeff;
 +            pmedata = &fr->pmedata;
 +        }
 +        else
 +        {
 +            pmedata = NULL;
 +        }
 +    }
 +    else
 +    {
 +        /* This is a PME only node */
 +
 +        /* We don't need the state */
 +        done_state(state);
 +
 +        ewaldcoeff = calc_ewaldcoeff(inputrec->rcoulomb, inputrec->ewald_rtol);
 +        snew(pmedata,1);
 +    }
 +
 +    /* Before setting affinity, check whether the affinity has changed,
 +     * which would indicate that the OpenMP library has probably changed it
 +     * since we first checked. */
 +    check_cpu_affinity_set(fplog, cr, hw_opt, hwinfo->nthreads_hw_avail, TRUE);
 +
 +    /* Set the CPU affinity */
 +    set_cpu_affinity(fplog,cr,hw_opt,nthreads_pme,hwinfo,inputrec);
 +
 +    /* Initiate PME if necessary,
 +     * either on all nodes or on dedicated PME nodes only. */
 +    if (EEL_PME(inputrec->coulombtype))
 +    {
 +        if (mdatoms)
 +        {
 +            nChargePerturbed = mdatoms->nChargePerturbed;
 +        }
 +        if (cr->npmenodes > 0)
 +        {
 +            /* The PME only nodes need to know nChargePerturbed */
 +            gmx_bcast_sim(sizeof(nChargePerturbed),&nChargePerturbed,cr);
 +        }
 +
 +        if (cr->duty & DUTY_PME)
 +        {
 +            status = gmx_pme_init(pmedata,cr,npme_major,npme_minor,inputrec,
 +                                  mtop ? mtop->natoms : 0,nChargePerturbed,
 +                                  (Flags & MD_REPRODUCIBLE),nthreads_pme);
 +            if (status != 0) 
 +            {
 +                gmx_fatal(FARGS,"Error %d initializing PME",status);
 +            }
 +        }
 +    }
 +
 +
++    if (integrator[inputrec->eI].func == do_md)
 +    {
 +        /* Turn on signal handling on all nodes */
 +        /*
 +         * A user signal from the PME nodes (if any)
 +         * is communicated to the PP nodes.
 +         */
 +        signal_handler_install();
 +    }
 +
 +    if (cr->duty & DUTY_PP)
 +    {
 +        if (inputrec->ePull != epullNO)
 +        {
 +            /* Initialize pull code */
 +            init_pull(fplog,inputrec,nfile,fnm,mtop,cr,oenv, inputrec->fepvals->init_lambda,
 +                      EI_DYNAMICS(inputrec->eI) && MASTER(cr),Flags);
 +        }
 +        
 +        if (inputrec->bRot)
 +        {
 +           /* Initialize enforced rotation code */
 +           init_rot(fplog,inputrec,nfile,fnm,cr,state->x,box,mtop,oenv,
 +                    bVerbose,Flags);
 +        }
 +
 +        constr = init_constraints(fplog,mtop,inputrec,ed,state,cr);
 +
 +        if (DOMAINDECOMP(cr))
 +        {
 +            dd_init_bondeds(fplog,cr->dd,mtop,vsite,constr,inputrec,
 +                            Flags & MD_DDBONDCHECK,fr->cginfo_mb);
 +
 +            set_dd_parameters(fplog,cr->dd,dlb_scale,inputrec,fr,&ddbox);
 +
 +            setup_dd_grid(fplog,cr->dd);
 +        }
 +
 +        /* Now do whatever the user wants us to do (how flexible...) */
 +        integrator[inputrec->eI].func(fplog,cr,nfile,fnm,
 +                                      oenv,bVerbose,bCompact,
 +                                      nstglobalcomm,
 +                                      vsite,constr,
 +                                      nstepout,inputrec,mtop,
 +                                      fcd,state,
 +                                      mdatoms,nrnb,wcycle,ed,fr,
 +                                      repl_ex_nst,repl_ex_nex,repl_ex_seed,
 +                                      membed,
 +                                      cpt_period,max_hours,
 +                                      deviceOptions,
 +                                      Flags,
 +                                      &runtime);
 +
 +        if (inputrec->ePull != epullNO)
 +        {
 +            finish_pull(fplog,inputrec->pull);
 +        }
 +        
 +        if (inputrec->bRot)
 +        {
 +            finish_rot(fplog,inputrec->rot);
 +        }
 +
 +    } 
 +    else 
 +    {
 +        /* do PME only */
 +        gmx_pmeonly(*pmedata,cr,nrnb,wcycle,ewaldcoeff,FALSE,inputrec);
 +    }
 +
 +    if (EI_DYNAMICS(inputrec->eI) || EI_TPI(inputrec->eI))
 +    {
 +        /* Some timing stats */  
 +        if (SIMMASTER(cr))
 +        {
 +            if (runtime.proc == 0)
 +            {
 +                runtime.proc = runtime.real;
 +            }
 +        }
 +        else
 +        {
 +            runtime.real = 0;
 +        }
 +    }
 +
 +    wallcycle_stop(wcycle,ewcRUN);
 +
 +    /* Finish up, write some stuff
 +     * if rerunMD, don't write last frame again 
 +     */
 +    finish_run(fplog,cr,ftp2fn(efSTO,nfile,fnm),
 +               inputrec,nrnb,wcycle,&runtime,
 +               fr != NULL && fr->nbv != NULL && fr->nbv->bUseGPU ?
 +                 nbnxn_cuda_get_timings(fr->nbv->cu_nbv) : NULL,
 +               nthreads_pp, 
 +               EI_DYNAMICS(inputrec->eI) && !MULTISIM(cr));
 +
 +    if ((cr->duty & DUTY_PP) && fr->nbv != NULL && fr->nbv->bUseGPU)
 +    {
 +        char gpu_err_str[STRLEN];
 +
 +        /* free GPU memory and uninitialize GPU (by destroying the context) */
 +        nbnxn_cuda_free(fplog, fr->nbv->cu_nbv);
 +
 +        if (!free_gpu(gpu_err_str))
 +        {
 +            gmx_warning("On node %d failed to free GPU #%d: %s",
 +                        cr->nodeid, get_current_gpu_device_id(), gpu_err_str);
 +        }
 +    }
 +
 +    if (opt2bSet("-membed",nfile,fnm))
 +    {
 +        sfree(membed);
 +    }
 +
 +#ifdef GMX_THREAD_MPI
 +    if (PAR(cr) && SIMMASTER(cr))
 +#endif
 +    {
 +        gmx_hardware_info_free(hwinfo);
 +    }
 +
 +    /* Does what it says */  
 +    print_date_and_time(fplog,cr->nodeid,"Finished mdrun",&runtime);
 +
 +    /* Close logfile already here if we were appending to it */
 +    if (MASTER(cr) && (Flags & MD_APPENDFILES))
 +    {
 +        gmx_log_close(fplog);
 +    } 
 +
 +    rc=(int)gmx_get_stop_condition();
 +
 +#ifdef GMX_THREAD_MPI
 +    /* we need to join all threads. The sub-threads join when they
 +       exit this function, but the master thread needs to be told to 
 +       wait for that. */
 +    if (PAR(cr) && MASTER(cr))
 +    {
 +        tMPI_Finalize();
 +    }
 +#endif
 +
 +    return rc;
 +}
index 727174b371c2328f8068820254f46f39d21307e2,c35dd896b8668970fef2e874cfd352b05c25638f..41a1eb0b83ece74e32703ff0ef1037965f3677e7
  #include "xvgr.h"
  #include "gmx_ana.h"
  #include "maths.h"
 +#include "string2.h"
+ #include "names.h"
+ #include "mdebin.h"
  
- /* the dhdl.xvg data from a simulation (actually obsolete, but still
-     here for reading the dhdl.xvg file*/
 -/* Suppress Cygwin compiler warnings from using newlib version of
 - * ctype.h */
 -#ifdef GMX_CYGWIN
 -#undef isdigit
 -#endif
 -
+ /* Structure for the names of lambda vector components */
+ typedef struct lambda_components_t
+ {
+     char **names; /* Array of strings with names for the lambda vector
+                      components */
+     int N;              /* The number of components */
+     int Nalloc;         /* The number of allocated components */
+ } lambda_components_t;
+ /* Structure for a lambda vector or a dhdl derivative direction */
+ typedef struct lambda_vec_t
+ {
+     double *val;    /* The lambda vector component values. Only valid if
+                        dhdl == -1 */
+     int dhdl;       /* The coordinate index for the derivative described by this
+                        structure, or -1 */
+     const lambda_components_t *lc; /* the associated lambda_components
+                                       structure */
+     int index;      /* The state number (init-lambda-state) of this lambda
+                        vector, if known. If not, it is set to -1 */
+ } lambda_vec_t;
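+ /* Example (hypothetical values): a state with lambda vector components
+  * (coul, vdw) = (0.5, 0.2) would have val = {0.5, 0.2} and dhdl = -1,
+  * while a dH/dlambda derivative along component 1 would have dhdl = 1
+  * and val left unused. */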
+ /* the dhdl.xvg data from a simulation */
  typedef struct xvg_t
  {
-     char   *filename;
+     const char   *filename;
      int    ftp;     /* file type */
      int    nset;    /* number of lambdas, including dhdl */
      int *np;        /* number of data points (du or hists) per lambda */
Simple merge
Simple merge
Simple merge
Simple merge
Simple merge
index dd65ebeccb21d00d17e5d396f22a94a10bda3542,edb5cf471b00f9473e36accca12c3b17aeac57cc..47f123b3d45199da7bb910f787e0dcd91a4c033e
- add_test(NAME TestExec_mdrun-h
-          COMMAND mdrun -h)
- add_dependencies(tests mdrun)
+ #
+ # This file is part of the GROMACS molecular simulation package.
+ #
+ # Copyright (c) 2012, by the GROMACS development team, led by
+ # David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ # others, as listed in the AUTHORS file in the top-level source
+ # directory and at http://www.gromacs.org.
+ #
+ # GROMACS is free software; you can redistribute it and/or
+ # modify it under the terms of the GNU Lesser General Public License
+ # as published by the Free Software Foundation; either version 2.1
+ # of the License, or (at your option) any later version.
+ #
+ # GROMACS is distributed in the hope that it will be useful,
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ # Lesser General Public License for more details.
+ #
+ # You should have received a copy of the GNU Lesser General Public
+ # License along with GROMACS; if not, see
+ # http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ # Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ #
+ # If you want to redistribute modifications to GROMACS, please
+ # consider that scientific software is very special. Version
+ # control is crucial - bugs must be traceable. We will be happy to
+ # consider code for inclusion in the official distribution, but
+ # derived work must not be called official GROMACS. Details are found
+ # in the README & COPYING files - if they are missing, get the
+ # official version at http://www.gromacs.org.
+ #
+ # To help us fund GROMACS development, we humbly ask that you cite
+ # the research papers on the package. Check out http://www.gromacs.org.
+ #
+ set(REGRESSIONTEST_PATH "" CACHE PATH "Directory containing regressiontests")
+ mark_as_advanced(REGRESSIONTEST_PATH)
+ option(REGRESSIONTEST_DOWNLOAD
+     "Automatically download regressiontests. Tests can be run with ctest." no)
+ if(REGRESSIONTEST_DOWNLOAD AND CMAKE_VERSION VERSION_LESS "2.8.2")
+     message(WARNING "REGRESSIONTEST_DOWNLOAD requires cmake >=2.8.2. Please update cmake or manually download the regressiontests.")
+     set(REGRESSIONTEST_DOWNLOAD FALSE CACHE BOOL 
+         "REGRESSIONTEST_DOWNLOAD not supported with cmake ${CMAKE_VERSION}" FORCE)
+ endif()
+ if(REGRESSIONTEST_DOWNLOAD)
+     if("${PROJECT_VERSION}" MATCHES "-dev")
+         set(REGRESSIONTEST_VERSION master)
+     else()
+         set(REGRESSIONTEST_VERSION ${PROJECT_VERSION})
+     endif()
+     set(REGRESSIONTEST_URL
+         http://gerrit.gromacs.org/download/regressiontests-${REGRESSIONTEST_VERSION}.tar.gz)
+     set(REGRESSIONTEST_FILE "${CMAKE_CURRENT_BINARY_DIR}/regressiontests.tgz")
+     message("Downloading: ${REGRESSIONTEST_URL}")
+     file(DOWNLOAD ${REGRESSIONTEST_URL} "${REGRESSIONTEST_FILE}" SHOW_PROGRESS STATUS status LOG log)
+     list(GET status 0 status_code)
+     list(GET status 1 status_string)
+     
+     if(NOT status_code EQUAL 0)
+         message(FATAL_ERROR "error: downloading '${REGRESSIONTEST_URL}' failed
+ status_code: ${status_code}
+ status_string: ${status_string}
+ log: ${log}")
+     endif()
+     set(REGRESSIONTEST_PATH
+         "${CMAKE_CURRENT_BINARY_DIR}/regressiontests-${REGRESSIONTEST_VERSION}"
+         CACHE PATH "Path to auto-downloaded regressiontests" FORCE)
+     file(REMOVE_RECURSE "${REGRESSIONTEST_PATH}") #delete potential prior folder
+     execute_process(COMMAND ${CMAKE_COMMAND} -E tar xf "${REGRESSIONTEST_FILE}"
+         WORKING_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}")
+     if(NOT EXISTS ${REGRESSIONTEST_PATH}/gmxtest.pl)
+         message(FATAL_ERROR "Download incorrect. Doesn't contain required gmxtest.pl")
+     endif()
+     set(REGRESSIONTEST_DOWNLOAD OFF CACHE BOOL "Tests already downloaded. Set to yes to download again" FORCE)
+ endif()
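+ # Typical usage (illustrative, assuming a standard CMake workflow):
+ #   cmake -DREGRESSIONTEST_DOWNLOAD=ON <source-dir> && make && ctest -R regressiontests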
+ if(REGRESSIONTEST_PATH AND (GMX_BLUEGENE OR CMAKE_CROSSCOMPILING OR CMAKE_CONFIGURATION_TYPES))
+     #Bluegene requires us to compile both front-end and back-end binaries (single build is insufficient)
+     message(WARNING 
+         "With cross-compiling or multi-configuration generators (e.g. Visual Studio), running regressiontests from build system is not supported. Please run gmxtest.pl directly.")
+     set(REGRESSIONTEST_PATH OFF CACHE BOOL 
+         "With cross-compiling or multi-configuration generators, running regressiontests from build system is not supported." FORCE)
+ endif()
+ if(REGRESSIONTEST_PATH)
+     if(NOT EXISTS ${REGRESSIONTEST_PATH}/gmxtest.pl)
+         message(FATAL_ERROR
+             "REGRESSIONTEST_PATH invalid. The path needs to contain gmxtest.pl.")
+     endif()
+     if(GMX_DOUBLE)
+         list(APPEND ARGS -double)
+     endif()
+     if(GMX_LIB_MPI AND NOT MPIEXEC) #autodetection failed or CC=mpicc was used
+         message(WARNING
+             "Please set MPIEXEC. Otherwise mpirun is assumed for runnings tests.")
+     endif()
+     if(GMX_LIB_MPI)
+         set(GMX_TEST_NUMBER_PROCS 8 CACHE STRING "Number of processors used for testing")
+         mark_as_advanced(GMX_TEST_NUMBER_PROCS)
+         list(APPEND ARGS -np ${GMX_TEST_NUMBER_PROCS})
+         if(MPIEXEC)
+             list(APPEND ARGS -mpirun ${MPIEXEC})
+         endif()
+         #We should use MPIEXEC_NUMPROC_FLAG but gmxtest.pl doesn't let us pass it
+     endif()
+     if(GMX_BINARY_SUFFIX)
+         list(APPEND ARGS -suffix ${GMX_BINARY_SUFFIX})
+     endif()
+     #crosscompile is only used to disable checking whether binaries work.
+     #Given that we know they are there, and that mdrun might not be
+     #executable (e.g. on Cray), we enable it.
+     list(APPEND ARGS -crosscompile)
+     set(REGRESSIONTEST_EXTRA_ARGS "" CACHE STRING 
+         "Extra arguments passed to gmxtest")
+     mark_as_advanced(REGRESSIONTEST_EXTRA_ARGS)
+     list(APPEND ARGS ${REGRESSIONTEST_EXTRA_ARGS})
+     list(APPEND ARGS -noverbose -nosuffix)
+     if(GMX_NATIVE_WINDOWS)
+         set(PATH_SEPARATOR "\\;")
+         #Replacing \ with / shouldn't be necessary, but otherwise "..\bin\;c:\.."
+         #gets turned into "...\bin\\c:\.." for reasons unknown; no better
+         #workaround is known and this one doesn't hurt.
+         string(REPLACE "\\" "/" PATH "$ENV{PATH}")
+         #protect ; (don't treat as list)
+         string(REPLACE ";" "\\;" PATH "${PATH}")
+     else()
+         set(PATH_SEPARATOR ":")
+         set(PATH "$ENV{PATH}")
+     endif()
 -    foreach(FOLDER kernel tools gmxlib mdlib) #lib folders might be needed for
++    foreach(FOLDER bin lib) #lib folders might be needed for
+         #e.g. DLLs. For GMX paths native ("\") is needed for GMXLIB detection
 -        file(TO_NATIVE_PATH "${CMAKE_BINARY_DIR}/src/${FOLDER}" DIR)
++        file(TO_NATIVE_PATH "${CMAKE_BINARY_DIR}/${FOLDER}" DIR)
+         set(PATH "${DIR}${PATH_SEPARATOR}${PATH}")
+     endforeach()
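+     # Resulting test PATH (illustrative): each iteration prepends, so the
+     # final order is "<build>/lib:<build>/bin:$ENV{PATH}" on Unix; on
+     # Windows "\;" is used so CMake does not split PATH as a list.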
+     find_program(PERL_EXECUTABLE NAMES "perl")
+     mark_as_advanced(PERL_EXECUTABLE)
+     if (NOT PERL_EXECUTABLE)
+         message(FATAL_ERROR "Perl not found. Install perl, set PERL_EXECUTABLE to the perl location, or unset REGRESSIONTEST_PATH to disable testing.")
+     endif()
+     #currently not testing tools because they don't contain any useful tests
+     foreach(subtest simple complex kernel freeenergy pdb2gmx)
+         add_test(NAME regressiontests/${subtest}
+             #windows requires the command to be perl and not the script
+             COMMAND perl "${REGRESSIONTEST_PATH}/gmxtest.pl" ${subtest} ${ARGS})
+         set_tests_properties(regressiontests/${subtest} PROPERTIES
+             ENVIRONMENT "PATH=${PATH}")
+     endforeach()
+ endif()