Merge release-4-6 into master
author Roland Schulz <roland@utk.edu>
Thu, 20 Dec 2012 19:08:57 +0000 (14:08 -0500)
committer Roland Schulz <roland@utk.edu>
Thu, 20 Dec 2012 19:09:23 +0000 (14:09 -0500)
Conflicts:
CMakeLists.txt
include/copyrite.h: applied to src/gromacs/gmxlib/copyrite.c
share/template/CMakeLists.txt: reverted
share/template/template.c: reverted
src/config.h.cmakein
src/gmxlib/gpu_utils/CMakeLists.txt: applied to src/gromacs/...
src/gromacs/gmxpreprocess/readir.h
src/gromacs/legacyheaders/types/nb_verlet.h

Removed:
cmake/gmxCheckGCCVersion.cmake

include/gmx_x86_simd_macros.h
src/gromacs/legacyheaders/gmx_x86_simd_macros.h
src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_x86_simd128.c
src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_x86_simd256.c
src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_x86_simd_inner.h
src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_x86_simd_outer.h
src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_x86_simd_utils.h
src/gromacs/mdlib/nbnxn_search_x86_simd.h
src/mdlib/nbnxn_search_x86_simd.h

Moved:
include/gmx_simd_macros.h
src/mdlib/nbnxn_kernels/nbnxn_kernel_simd_2xnn.c
src/mdlib/nbnxn_kernels/nbnxn_kernel_simd_2xnn.h
src/mdlib/nbnxn_kernels/nbnxn_kernel_simd_4xn.c
src/mdlib/nbnxn_kernels/nbnxn_kernel_simd_4xn.h
src/mdlib/nbnxn_kernels/nbnxn_kernel_simd_4xn_includes.h
src/mdlib/nbnxn_kernels/nbnxn_kernel_simd_4xn_inner.h
src/mdlib/nbnxn_kernels/nbnxn_kernel_simd_4xn_outer.h
src/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils.h
src/mdlib/nbnxn_kernels/nbnxn_kernel_x86_simd128.c
src/mdlib/nbnxn_kernels/nbnxn_kernel_x86_simd256.c
src/mdlib/nbnxn_kernels/nbnxn_kernel_x86_simd_inner.h
src/mdlib/nbnxn_kernels/nbnxn_kernel_x86_simd_outer.h
src/mdlib/nbnxn_kernels/nbnxn_kernel_x86_simd_utils.h
src/mdlib/nbnxn_search_simd_4xn.h

Change-Id: I753f701d74b8d4533e74cede19712b030bf97ca6

36 files changed:
CMakeLists.txt
cmake/gmxBuildTypeReference.cmake
cmake/gmxManageMPI.cmake
src/config.h.cmakein
src/gromacs/gmxlib/copyrite.c
src/gromacs/gmxlib/gmx_detect_hardware.c
src/gromacs/gmxlib/gpu_utils/CMakeLists.txt
src/gromacs/gmxlib/gpu_utils/dummy.cpp
src/gromacs/gmxpreprocess/calc_verletbuf.c
src/gromacs/gmxpreprocess/readir.h
src/gromacs/gmxpreprocess/readpull.c
src/gromacs/legacyheaders/gmx_simd_macros.h
src/gromacs/legacyheaders/gmx_x86_avx_256.h
src/gromacs/legacyheaders/types/nb_verlet.h
src/gromacs/mdlib/clincs.c
src/gromacs/mdlib/fft5d.cpp
src/gromacs/mdlib/forcerec.c
src/gromacs/mdlib/nbnxn_atomdata.c
src/gromacs/mdlib/nbnxn_internal.h
src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_2xnn.c
src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_2xnn.h
src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_2xnn_includes.h
src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_2xnn_inner.h
src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_2xnn_outer.h
src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_4xn.c
src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_4xn.h
src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_4xn_includes.h
src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_4xn_inner.h
src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_4xn_outer.h
src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils.h
src/gromacs/mdlib/nbnxn_search.c
src/gromacs/mdlib/nbnxn_search_simd_2xnn.h
src/gromacs/mdlib/nbnxn_search_simd_4xn.h
src/gromacs/mdlib/sim_util.c
src/programs/grompp/grompp.c
src/programs/mdrun/pme_loadbal.c

diff --cc CMakeLists.txt
index 6c53546c67c0b91cdd17c98d69c434f156c83005,61e5fefaf37be55d572d3677d3c83a80becac415..ae3d23362f1309761ac092ded1cc76a34cf556bd
@@@ -671,14 -700,16 +675,14 @@@ if(${GMX_CPU_ACCELERATION} STREQUAL "NO
  elseif(${GMX_CPU_ACCELERATION} STREQUAL "SSE2")
  
      GMX_TEST_CFLAG(GNU_SSE2_CFLAG "-msse2" GROMACS_C_FLAGS)
-     if(NOT GNU_SSE2_CFLAG)
+     if(NOT GNU_SSE2_CFLAG AND GMX_NATIVE_WINDOWS)
          GMX_TEST_CFLAG(MSVC_SSE2_CFLAG "/arch:SSE2" GROMACS_C_FLAGS)
-     endif(NOT GNU_SSE2_CFLAG)
+     endif(NOT GNU_SSE2_CFLAG AND GMX_NATIVE_WINDOWS)
  
 -    if (CMAKE_CXX_COMPILER_LOADED)
 -        GMX_TEST_CXXFLAG(GNU_SSE2_CXXFLAG "-msse2" GROMACS_CXX_FLAGS)
 -        if(NOT GNU_SSE2_CXXFLAG AND GMX_NATIVE_WINDOWS)
 -            GMX_TEST_CXXFLAG(MSVC_SSE2_CXXFLAG "/arch:SSE2" GROMACS_CXX_FLAGS)
 -        endif(NOT GNU_SSE2_CXXFLAG AND GMX_NATIVE_WINDOWS)
 -    endif()
 +    GMX_TEST_CXXFLAG(GNU_SSE2_CXXFLAG "-msse2" GROMACS_CXX_FLAGS)
-     if(NOT GNU_SSE2_CXXFLAG)
++    if(NOT GNU_SSE2_CXXFLAG AND GMX_NATIVE_WINDOWS)
 +        GMX_TEST_CXXFLAG(MSVC_SSE2_CXXFLAG "/arch:SSE2" GROMACS_CXX_FLAGS)
-     endif(NOT GNU_SSE2_CXXFLAG)
++    endif(NOT GNU_SSE2_CXXFLAG AND GMX_NATIVE_WINDOWS)
  
      # We don't warn for lacking SSE2 flag support, since that is probably standard today.
  
@@@ -706,19 -737,25 +710,23 @@@ elseif(${GMX_CPU_ACCELERATION} STREQUA
          message(WARNING "No C SSE4.1 flag found. Consider a newer compiler, or use SSE2 for slightly lower performance.")
        # Not surprising if we end up here! MSVC currently does not support the SSE4.1 flag. However, it appears to accept SSE4.1
          # intrinsics when SSE2 support is enabled, so we try that instead.
-         GMX_TEST_CFLAG(MSVC_SSE2_CFLAG "/arch:SSE2" GROMACS_C_FLAGS)
+        if (GMX_NATIVE_WINDOWS)
+            GMX_TEST_CFLAG(MSVC_SSE2_CFLAG "/arch:SSE2" GROMACS_C_FLAGS)
+        endif()
      endif(NOT GNU_SSE4_CFLAG AND NOT MSVC_SSE4_CFLAG)
  
 -    if (CMAKE_CXX_COMPILER_LOADED)
 -        GMX_TEST_CXXFLAG(GNU_SSE4_CXXFLAG "-msse4.1" GROMACS_CXX_FLAG)
 -        if (NOT GNU_SSE4_CXXFLAG AND GMX_NATIVE_WINDOWS)
 -            GMX_TEST_CXXFLAG(MSVC_SSE4_CXXFLAG "/arch:SSE4.1" GROMACS_CXX_FLAGS)
 -        endif(NOT GNU_SSE4_CXXFLAG AND GMX_NATIVE_WINDOWS)
 -        if (NOT GNU_SSE4_CXXFLAG AND NOT MSVC_SSE4_CXXFLAG) 
 -            message(WARNING "No C++ SSE4.1 flag found. Consider a newer compiler, or use SSE2 for slightly lower performance.")
 -            # Not surprising if we end up here! MSVC current does not support the SSE4.1 flag. However, it appears to accept SSE4.1
 -            # intrinsics when SSE2 support is enabled, so we try that instead.
 -            if (GMX_NATIVE_WINDOWS)
 -                GMX_TEST_CXXFLAG(MSVC_SSE2_CXXFLAG "/arch:SSE2" GROMACS_CXX_FLAGS)
 -            endif()
 -        endif(NOT GNU_SSE4_CXXFLAG AND NOT MSVC_SSE4_CXXFLAG)
 -    endif()
 +    GMX_TEST_CXXFLAG(GNU_SSE4_CXXFLAG "-msse4.1" GROMACS_CXX_FLAGS)
-     if (NOT GNU_SSE4_CXXFLAG)
++    if (NOT GNU_SSE4_CXXFLAG AND GMX_NATIVE_WINDOWS)
 +       GMX_TEST_CXXFLAG(MSVC_SSE4_CXXFLAG "/arch:SSE4.1" GROMACS_CXX_FLAGS)
-     endif(NOT GNU_SSE4_CXXFLAG)
++    endif(NOT GNU_SSE4_CXXFLAG AND GMX_NATIVE_WINDOWS)
 +    if (NOT GNU_SSE4_CXXFLAG AND NOT MSVC_SSE4_CXXFLAG)
 +        message(WARNING "No C++ SSE4.1 flag found. Consider a newer compiler, or use SSE2 for slightly lower performance.")
++        # Not surprising if we end up here! MSVC currently does not support the SSE4.1 flag. However, it appears to accept SSE4.1
 +        # intrinsics when SSE2 support is enabled, so we try that instead.
-         GMX_TEST_CXXFLAG(MSVC_SSE2_CXXFLAG "/arch:SSE2" GROMACS_CXX_FLAGS)
++        if (GMX_NATIVE_WINDOWS)
++            GMX_TEST_CXXFLAG(MSVC_SSE2_CXXFLAG "/arch:SSE2" GROMACS_CXX_FLAGS)
++        endif()
 +    endif(NOT GNU_SSE4_CXXFLAG AND NOT MSVC_SSE4_CXXFLAG)
  
      # This must come after we have added the -msse4.1 flag on some platforms.
      check_include_file(smmintrin.h  HAVE_SMMINTRIN_H ${GROMACS_C_FLAGS})
@@@ -747,13 -784,15 +755,13 @@@ elseif(${GMX_CPU_ACCELERATION} STREQUA
          message(WARNING "No C AVX flag found. Consider a newer compiler, or try SSE4.1 (lower performance).")
      endif (NOT GNU_AVX_CFLAG AND NOT MSVC_AVX_CFLAG)
  
 -    if (CMAKE_CXX_COMPILER_LOADED)
 -        GMX_TEST_CXXFLAG(GNU_AVX_CXXFLAG "-mavx" GROMACS_CXX_FLAGS)
 -        if (NOT GNU_AVX_CXXFLAG AND GMX_NATIVE_WINDOWS)
 -            GMX_TEST_CXXFLAG(MSVC_AVX_CXXFLAG "/arch:AVX" GROMACS_CXX_FLAGS)
 -        endif (NOT GNU_AVX_CXXFLAG AND GMX_NATIVE_WINDOWS)
 -        if (NOT GNU_AVX_CXXFLAG AND NOT MSVC_AVX_CXXFLAG)
 -            message(WARNING "No C++ AVX flag found. Consider a newer compiler, or try SSE4.1 (lower performance).")
 -        endif (NOT GNU_AVX_CXXFLAG AND NOT MSVC_AVX_CXXFLAG)
 -    endif()
 +    GMX_TEST_CXXFLAG(GNU_AVX_CXXFLAG "-mavx" GROMACS_CXX_FLAGS)
-     if (NOT GNU_AVX_CXXFLAG)
++    if (NOT GNU_AVX_CXXFLAG AND GMX_NATIVE_WINDOWS)
 +       GMX_TEST_CXXFLAG(MSVC_AVX_CXXFLAG "/arch:AVX" GROMACS_CXX_FLAGS)
-     endif (NOT GNU_AVX_CXXFLAG)
++    endif (NOT GNU_AVX_CXXFLAG AND GMX_NATIVE_WINDOWS)
 +    if (NOT GNU_AVX_CXXFLAG AND NOT MSVC_AVX_CXXFLAG)
 +       message(WARNING "No C++ AVX flag found. Consider a newer compiler, or try SSE4.1 (lower performance).")
 +    endif (NOT GNU_AVX_CXXFLAG AND NOT MSVC_AVX_CXXFLAG)
  
      # Set the FMA4 flags (MSVC doesn't require any)
      if(${GMX_CPU_ACCELERATION} STREQUAL "AVX_128_FMA" AND NOT MSVC)
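
The MSVC workaround encoded in the hunks above (SSE4.1 intrinsics compile under /arch:SSE2 even though MSVC has no dedicated SSE4.1 flag) can be checked with a small stand-alone probe. This is only an illustrative sketch, not part of the change:

/* probe_sse41.c - build with -msse4.1 (GCC/Clang) or /arch:SSE2 (MSVC);
 * a successful compile and a zero exit code mean SSE4.1 intrinsics work,
 * cf. the check_include_file(smmintrin.h ...) test above. */
#include <smmintrin.h>   /* SSE4.1 intrinsics */

int main(void)
{
    __m128 a = _mm_set1_ps(1.0f);
    __m128 b = _mm_set1_ps(2.0f);
    /* _mm_dp_ps is SSE4.1-only: with mask 0xFF it sums all four lane
     * products (4 * 1.0 * 2.0 = 8) and broadcasts the result. */
    __m128 d = _mm_dp_ps(a, b, 0xFF);
    return (_mm_cvtss_si32(d) == 8) ? 0 : 1;
}
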
Simple merge
Simple merge
Simple merge
index fa371bb3e320d387682c29980804052fc31746e9,0000000000000000000000000000000000000000..20b96fe8ac18b17671c89470691fde189562d0e7
mode 100644,000000..100644
--- /dev/null
@@@ -1,758 -1,0 +1,759 @@@
-   static const char * GPLText[] = {
 +/*
 + * 
 + *                This source code is part of
 + * 
 + *                 G   R   O   M   A   C   S
 + * 
 + *          GROningen MAchine for Chemical Simulations
 + * 
 + *                        VERSION 3.2.0
 + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
 + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 + * Copyright (c) 2001-2004, The GROMACS development team,
 + * check out http://www.gromacs.org for more information.
 +
 + * This program is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU General Public License
 + * as published by the Free Software Foundation; either version 2
 + * of the License, or (at your option) any later version.
 + * 
 + * If you want to redistribute modifications, please consider that
 + * scientific software is very special. Version control is crucial -
 + * bugs must be traceable. We will be happy to consider code for
 + * inclusion in the official distribution, but derived work must not
 + * be called official GROMACS. Details are found in the README & COPYING
 + * files - if they are missing, get the official version at www.gromacs.org.
 + * 
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the papers on the package - you can find them in the top README file.
 + * 
 + * For more info, check our website at http://www.gromacs.org
 + * 
 + * And Hey:
 + * GROningen Mixture of Alchemy and Childrens' Stories
 + */
 +#ifdef HAVE_CONFIG_H
 +#include <config.h>
 +#endif
 +
 +#ifdef GMX_THREAD_MPI
 +#include <thread_mpi.h>
 +#endif
 +
 +#ifdef HAVE_LIBMKL
 +#include <mkl.h>
 +#endif
 +#ifdef GMX_GPU
 +#include <cuda.h>
 +#include <cuda_runtime_api.h>
 +#endif
 +#ifdef GMX_FFT_FFTW3
 +#include <fftw3.h>
 +#endif
 +
 +/* This file is completely threadsafe - keep it that way! */
 +
 +#include <string.h>
 +#include <ctype.h>
 +#include "sysstuff.h"
 +#include "smalloc.h"
 +#include "string2.h"
 +#include "macros.h"
 +#include <time.h>
 +#include "random.h"
 +#include "statutil.h"
 +#include "copyrite.h"
 +#include "strdb.h"
 +#include "futil.h"
 +#include "vec.h"
 +#include "buildinfo.h"
 +#include "gmx_cpuid.h"
 +
 +static void pr_two(FILE *out,int c,int i)
 +{
 +  if (i < 10)
 +    fprintf(out,"%c0%1d",c,i);
 +  else
 +    fprintf(out,"%c%2d",c,i);
 +}
 +
 +void pr_difftime(FILE *out,double dt)
 +{
 +  int    ndays,nhours,nmins,nsecs;
 +  gmx_bool   bPrint,bPrinted;
 +
 +  ndays = dt/(24*3600);
 +  dt    = dt-24*3600*ndays;
 +  nhours= dt/3600;
 +  dt    = dt-3600*nhours;
 +  nmins = dt/60;
 +  dt    = dt-nmins*60;
 +  nsecs = dt;
 +  bPrint= (ndays > 0);
 +  bPrinted=bPrint;
 +  if (bPrint) 
 +    fprintf(out,"%d",ndays);
 +  bPrint=bPrint || (nhours > 0);
 +  if (bPrint) {
 +    if (bPrinted)
 +      pr_two(out,'d',nhours);
 +    else 
 +      fprintf(out,"%d",nhours);
 +  }
 +  bPrinted=bPrinted || bPrint;
 +  bPrint=bPrint || (nmins > 0);
 +  if (bPrint) {
 +    if (bPrinted)
 +      pr_two(out,'h',nmins);
 +    else 
 +      fprintf(out,"%d",nmins);
 +  }
 +  bPrinted=bPrinted || bPrint;
 +  if (bPrinted)
 +    pr_two(out,':',nsecs);
 +  else
 +    fprintf(out,"%ds",nsecs);
 +  fprintf(out,"\n");
 +}
 +
 +
 +gmx_bool be_cool(void)
 +{
 +  /* Yes, it is bad to check the environment variable every call,
 +   * but we don't call this routine often, and it avoids using
 +   * a mutex for locking the variable...
 +   */
 +#ifdef GMX_FAHCORE
 +  /*be uncool*/
 +  return FALSE;
 +#else
 +  return (getenv("GMX_NO_QUOTES") == NULL);
 +#endif
 +}
 +
 +void space(FILE *out, int n)
 +{
 +  fprintf(out,"%*s",n,"");
 +}
 +
 +void f(char *a)
 +{
 +    int i;
 +    int len=strlen(a);
 +    
 +    for(i=0;i<len;i++)
 +        a[i]=~a[i]; 
 +}
 +
 +static void sp_print(FILE *out,const char *s)
 +{
 +  int slen;
 +  
 +  slen=strlen(s);
 +  space(out,(80-slen)/2);
 +  fprintf(out,"%s\n",s);
 +}
 +
 +static void ster_print(FILE *out,const char *s)
 +{
 +  int  slen;
 +  char buf[128];
 +  
 +  snprintf(buf,128,":-)  %s  (-:",s);
 +  slen=strlen(buf);
 +  space(out,(80-slen)/2);
 +  fprintf(out,"%s\n",buf);
 +}
 +
 +
 +static void pukeit(const char *db,const char *defstring, char *retstring, 
 +                 int retsize, int *cqnum)
 +{
 +  FILE *fp;
 +  char **help;
 +  int  i,nhlp;
 +  int  seed;
 + 
 +  if (be_cool() && ((fp = low_libopen(db,FALSE)) != NULL)) {
 +    nhlp=fget_lines(fp,&help);
 +    /* for libraries we can use the low-level close routines */
 +    ffclose(fp);
 +    seed=time(NULL);
 +    *cqnum=nhlp*rando(&seed);
 +    if (strlen(help[*cqnum]) >= STRLEN)
 +      help[*cqnum][STRLEN-1] = '\0';
 +    strncpy(retstring,help[*cqnum],retsize);
 +    f(retstring);
 +    for(i=0; (i<nhlp); i++)
 +      sfree(help[i]);
 +    sfree(help);
 +  }
 +  else 
 +    strncpy(retstring,defstring,retsize);
 +}
 +
 +void bromacs(char *retstring, int retsize)
 +{
 +  int dum;
 +
 +  pukeit("bromacs.dat",
 +       "Groningen Machine for Chemical Simulation",
 +       retstring,retsize,&dum);
 +}
 +
 +void cool_quote(char *retstring, int retsize, int *cqnum)
 +{
 +  char *tmpstr;
 +  char *s,*ptr;
 +  int tmpcq,*p;
 +  
 +  if (cqnum!=NULL)
 +    p = cqnum;
 +  else
 +    p = &tmpcq;
 +  
 +  /* protect audience from explicit lyrics */
 +  snew(tmpstr,retsize+1);
 +  pukeit("gurgle.dat","Thanx for Using GROMACS - Have a Nice Day",
 +       tmpstr,retsize-2,p);
 +
 +  if ((ptr = strchr(tmpstr,'_')) != NULL) {
 +    *ptr='\0';
 +    ptr++;
 +    sprintf(retstring,"\"%s\" %s",tmpstr,ptr);
 +  }
 +  else {
 +    strcpy(retstring,tmpstr);
 +  }
 +  sfree(tmpstr);
 +}
 +
 +void CopyRight(FILE *out,const char *szProgram)
 +{
 +  static const char * CopyrightText[] = {
 +             "Written by Emile Apol, Rossen Apostolov, Herman J.C. Berendsen,",
 +             "Aldert van Buuren, Pär Bjelkmar, Rudi van Drunen, Anton Feenstra, ",
 +             "Gerrit Groenhof, Peter Kasson, Per Larsson, Pieter Meulenhoff, ",
 +             "Teemu Murtola, Szilard Pall, Sander Pronk, Roland Schulz, ",
 +             "Michael Shirts, Alfons Sijbers, Peter Tieleman,\n",
 +             "Berk Hess, David van der Spoel, and Erik Lindahl.\n",
 +             "Copyright (c) 1991-2000, University of Groningen, The Netherlands.",
 +             "Copyright (c) 2001-2010, The GROMACS development team at",
 +             "Uppsala University & The Royal Institute of Technology, Sweden.",
 +             "check out http://www.gromacs.org for more information.\n"
 +  };
 +
-               "modify it under the terms of the GNU General Public License",
-               "as published by the Free Software Foundation; either version 2",
++  static const char * LicenseText[] = {
 +              "This program is free software; you can redistribute it and/or",
- #define NGPL 0 /*FAH has an exception permission from GPL to allow digital signatures in Gromacs*/
++              "modify it under the terms of the GNU Lesser General Public License",
++              "as published by the Free Software Foundation; either version 2.1",
 +              "of the License, or (at your option) any later version."
 +  };
 +
 +  /* Don't change szProgram arbitrarily - it must be argv[0], i.e. the
 +   * name of a file. Otherwise, we won't be able to find the library dir.
 +   */
 +#define NCR (int)asize(CopyrightText)
++/* TODO: Is this exception still needed? */
 +#ifdef GMX_FAHCORE
- #define NGPL (int)asize(GPLText)
++#define NLICENSE 0 /*FAH has an exception permission from GPL to allow digital signatures in Gromacs*/
 +#else
-   for(i=0; (i<NGPL); i++)
-     sp_print(out,GPLText[i]);
++#define NLICENSE (int)asize(LicenseText)
 +#endif
 +
 +  char buf[256],tmpstr[1024];
 +  int i;
 +
 +#ifdef GMX_FAHCORE
 +  set_program_name("Gromacs");
 +#else
 +  set_program_name(szProgram);
 +#endif
 +
 +  ster_print(out,"G  R  O  M  A  C  S");
 +  fprintf(out,"\n");
 +  
 +  bromacs(tmpstr,1023);
 +  sp_print(out,tmpstr); 
 +  fprintf(out,"\n");
 +
 +  ster_print(out,GromacsVersion());
 +  fprintf(out,"\n");
 +
 +  /* fprintf(out,"\n");*/
 +
 +  /* sp_print(out,"PLEASE NOTE: THIS IS A BETA VERSION\n");
 +  
 +  fprintf(out,"\n"); */
 +
 +  for(i=0; (i<NCR); i++) 
 +    sp_print(out,CopyrightText[i]);
++  for(i=0; (i<NLICENSE); i++)
++    sp_print(out,LicenseText[i]);
 +
 +  fprintf(out,"\n");
 +
 +  snprintf(buf,256,"%s",Program());
 +#ifdef GMX_DOUBLE
 +  strcat(buf," (double precision)");
 +#endif
 +  ster_print(out,buf);
 +  fprintf(out,"\n");
 +}
 +
 +
 +void thanx(FILE *fp)
 +{
 +  char cq[1024];
 +  int  cqnum;
 +
 +  /* protect the audience from suggestive discussions */
 +  cool_quote(cq,1023,&cqnum);
 +  
 +  if (be_cool()) 
 +    fprintf(fp,"\ngcq#%d: %s\n\n",cqnum,cq);
 +  else
 +    fprintf(fp,"\n%s\n\n",cq);
 +}
 +
 +typedef struct {
 +  const char *key;
 +  const char *author;
 +  const char *title;
 +  const char *journal;
 +  int volume,year;
 +  const char *pages;
 +} t_citerec;
 +
 +void please_cite(FILE *fp,const char *key)
 +{
 +  static const t_citerec citedb[] = {
 +    { "Allen1987a",
 +      "M. P. Allen and D. J. Tildesley",
 +      "Computer simulation of liquids",
 +      "Oxford Science Publications",
 +      1, 1987, "1" },
 +    { "Berendsen95a",
 +      "H. J. C. Berendsen, D. van der Spoel and R. van Drunen",
 +      "GROMACS: A message-passing parallel molecular dynamics implementation",
 +      "Comp. Phys. Comm.",
 +      91, 1995, "43-56" },
 +    { "Berendsen84a",
 +      "H. J. C. Berendsen, J. P. M. Postma, A. DiNola and J. R. Haak",
 +      "Molecular dynamics with coupling to an external bath",
 +      "J. Chem. Phys.",
 +      81, 1984, "3684-3690" },
 +    { "Ryckaert77a",
 +      "J. P. Ryckaert and G. Ciccotti and H. J. C. Berendsen",
 +      "Numerical Integration of the Cartesian Equations of Motion of a System with Constraints; Molecular Dynamics of n-Alkanes",
 +      "J. Comp. Phys.",
 +      23, 1977, "327-341" },
 +    { "Miyamoto92a",
 +      "S. Miyamoto and P. A. Kollman",
 +      "SETTLE: An Analytical Version of the SHAKE and RATTLE Algorithms for Rigid Water Models",
 +      "J. Comp. Chem.",
 +      13, 1992, "952-962" },
 +    { "Cromer1968a",
 +      "D. T. Cromer & J. B. Mann",
 +      "X-ray scattering factors computed from numerical Hartree-Fock wave functions",
 +      "Acta Cryst. A",
 +      24, 1968, "321" },
 +    { "Barth95a",
 +      "E. Barth and K. Kuczera and B. Leimkuhler and R. D. Skeel",
 +      "Algorithms for Constrained Molecular Dynamics",
 +      "J. Comp. Chem.",
 +      16, 1995, "1192-1209" },
 +    { "Essmann95a",
 +      "U. Essmann, L. Perera, M. L. Berkowitz, T. Darden, H. Lee and L. G. Pedersen ",
 +      "A smooth particle mesh Ewald method",
 +      "J. Chem. Phys.",
 +      103, 1995, "8577-8592" },
 +    { "Torda89a",
 +      "A. E. Torda and R. M. Scheek and W. F. van Gunsteren",
 +      "Time-dependent distance restraints in molecular dynamics simulations",
 +      "Chem. Phys. Lett.",
 +      157, 1989, "289-294" },
 +    { "Tironi95a",
 +      "I. G. Tironi and R. Sperb and P. E. Smith and W. F. van Gunsteren",
 +      "Generalized reaction field method for molecular dynamics simulations",
 +      "J. Chem. Phys",
 +      102, 1995, "5451-5459" },
 +    { "Hess97a",
 +      "B. Hess and H. Bekker and H. J. C. Berendsen and J. G. E. M. Fraaije",
 +      "LINCS: A Linear Constraint Solver for molecular simulations",
 +      "J. Comp. Chem.",
 +      18, 1997, "1463-1472" },
 +    { "Hess2008a",
 +      "B. Hess",
 +      "P-LINCS: A Parallel Linear Constraint Solver for molecular simulation",
 +      "J. Chem. Theory Comput.",
 +      4, 2008, "116-122" },
 +    { "Hess2008b",
 +      "B. Hess and C. Kutzner and D. van der Spoel and E. Lindahl",
 +      "GROMACS 4: Algorithms for highly efficient, load-balanced, and scalable molecular simulation",
 +      "J. Chem. Theory Comput.",
 +      4, 2008, "435-447" },
 +    { "Hub2010",
 +      "J. S. Hub, B. L. de Groot and D. van der Spoel",
 +      "g_wham - A free weighted histogram analysis implementation including robust error and autocorrelation estimates",
 +      "J. Chem. Theory Comput.",
 +      6, 2010, "3713-3720"}, 
 +    { "In-Chul99a",
 +      "Y. In-Chul and M. L. Berkowitz",
 +      "Ewald summation for systems with slab geometry",
 +      "J. Chem. Phys.",
 +      111, 1999, "3155-3162" },
 +    { "DeGroot97a",
 +      "B. L. de Groot and D. M. F. van Aalten and R. M. Scheek and A. Amadei and G. Vriend and H. J. C. Berendsen",
 +      "Prediction of Protein Conformational Freedom From Distance Constrains",
 +      "Proteins",
 +      29, 1997, "240-251" },
 +    { "Spoel98a",
 +      "D. van der Spoel and P. J. van Maaren and H. J. C. Berendsen",
 +      "A systematic study of water models for molecular simulation. Derivation of models optimized for use with a reaction-field.",
 +      "J. Chem. Phys.",
 +      108, 1998, "10220-10230" },
 +    { "Wishart98a",
 +      "D. S. Wishart and A. M. Nip",
 +      "Protein Chemical Shift Analysis: A Practical Guide",
 +      "Biochem. Cell Biol.",
 +      76, 1998, "153-163" },
 +    { "Maiorov95",
 +      "V. N. Maiorov and G. M. Crippen",
 +      "Size-Independent Comparison of Protein Three-Dimensional Structures",
 +      "PROTEINS: Struct. Funct. Gen.",
 +      22, 1995, "273-283" },
 +    { "Feenstra99",
 +      "K. A. Feenstra and B. Hess and H. J. C. Berendsen",
 +      "Improving Efficiency of Large Time-scale Molecular Dynamics Simulations of Hydrogen-rich Systems",
 +      "J. Comput. Chem.",
 +      20, 1999, "786-798" },
 +    { "Timneanu2004a",
 +      "N. Timneanu and C. Caleman and J. Hajdu and D. van der Spoel",
 +      "Auger Electron Cascades in Water and Ice",
 +      "Chem. Phys.",
 +      299, 2004, "277-283" },
 +    { "Pascal2011a",
 +      "T. A. Pascal and S. T. Lin and W. A. Goddard III",
 +      "Thermodynamics of liquids: standard molar entropies and heat capacities of common solvents from 2PT molecular dynamics",
 +      "Phys. Chem. Chem. Phys.",
 +      13, 2011, "169-181" },
 +    { "Caleman2011b",
 +      "C. Caleman and P. J. van Maaren and M. Hong and J. S. Hub and L. T. da Costa and D. van der Spoel",
 +      "Force Field Benchmark of Organic Liquids: Density, Enthalpy of Vaporization, Heat Capacities, Surface Tension, Isothermal Compressibility, Volumetric Expansion Coefficient, and Dielectric Constant",
 +      "J. Chem. Theo. Comp.",
 +      8, 2012, "61" },
 +    { "Lindahl2001a",
 +      "E. Lindahl and B. Hess and D. van der Spoel",
 +      "GROMACS 3.0: A package for molecular simulation and trajectory analysis",
 +      "J. Mol. Mod.",
 +      7, 2001, "306-317" },
 +    { "Wang2001a",
 +      "J. Wang and W. Wang and S. Huo and M. Lee and P. A. Kollman",
 +      "Solvation model based on weighted solvent accessible surface area",
 +      "J. Phys. Chem. B",
 +      105, 2001, "5055-5067" },
 +    { "Eisenberg86a",
 +      "D. Eisenberg and A. D. McLachlan",
 +      "Solvation energy in protein folding and binding",
 +      "Nature",
 +      319, 1986, "199-203" },
 +    { "Eisenhaber95",
 +      "Frank Eisenhaber and Philip Lijnzaad and Patrick Argos and Chris Sander and Michael Scharf",
 +      "The Double Cube Lattice Method: Efficient Approaches to Numerical Integration of Surface Area and Volume and to Dot Surface Contouring of Molecular Assemblies",
 +      "J. Comp. Chem.",
 +      16, 1995, "273-284" },
 +    { "Hess2002",
 +      "B. Hess, H. Saint-Martin and H.J.C. Berendsen",
 +      "Flexible constraints: an adiabatic treatment of quantum degrees of freedom, with application to the flexible and polarizable MCDHO model for water",
 +      "J. Chem. Phys.",
 +      116, 2002, "9602-9610" },
 +    { "Hetenyi2002b",
 +      "Csaba Hetenyi and David van der Spoel",
 +      "Efficient docking of peptides to proteins without prior knowledge of the binding site.",
 +      "Prot. Sci.",
 +      11, 2002, "1729-1737" },
 +    { "Hess2003",
 +      "B. Hess and R.M. Scheek",
 +      "Orientation restraints in molecular dynamics simulations using time and ensemble averaging",
 +      "J. Magn. Res.",
 +      164, 2003, "19-27" },
 +    { "Rappe1991a",
 +      "A. K. Rappe and W. A. Goddard III",
 +      "Charge Equillibration for Molecular Dynamics Simulations",
 +      "J. Phys. Chem.",
 +      95, 1991, "3358-3363" },
 +    { "Mu2005a",
 +      "Y. Mu, P. H. Nguyen and G. Stock",
 +      "Energy landscape of a small peptide revelaed by dihedral angle principal component analysis",
 +      "Prot. Struct. Funct. Bioinf.",
 +      58, 2005, "45-52" },
 +    { "Okabe2001a",
 +      "T. Okabe and M. Kawata and Y. Okamoto and M. Mikami",
 +      "Replica-exchange {M}onte {C}arlo method for the isobaric-isothermal ensemble",
 +      "Chem. Phys. Lett.",
 +      335, 2001, "435-439" },
 +    { "Hukushima96a",
 +      "K. Hukushima and K. Nemoto",
 +      "Exchange Monte Carlo Method and Application to Spin Glass Simulations",
 +      "J. Phys. Soc. Jpn.",
 +      65, 1996, "1604-1608" },
 +    { "Tropp80a",
 +      "J. Tropp",
 +      "Dipolar Relaxation and Nuclear Overhauser effects in nonrigid molecules: The effect of fluctuating internuclear distances",
 +      "J. Chem. Phys.",
 +      72, 1980, "6035-6043" },
 +    { "Bultinck2002a",
 +       "P. Bultinck and W. Langenaeker and P. Lahorte and F. De Proft and P. Geerlings and M. Waroquier and J. P. Tollenaere",
 +      "The electronegativity equalization method I: Parametrization and validation for atomic charge calculations",
 +      "J. Phys. Chem. A",
 +      106, 2002, "7887-7894" },
 +    { "Yang2006b",
 +      "Q. Y. Yang and K. A. Sharp",
 +      "Atomic charge parameters for the finite difference Poisson-Boltzmann method using electronegativity neutralization",
 +      "J. Chem. Theory Comput.",
 +      2, 2006, "1152-1167" },
 +    { "Spoel2005a",
 +      "D. van der Spoel, E. Lindahl, B. Hess, G. Groenhof, A. E. Mark and H. J. C. Berendsen",
 +      "GROMACS: Fast, Flexible and Free",
 +      "J. Comp. Chem.",
 +      26, 2005, "1701-1719" },
 +    { "Spoel2006b",
 +      "D. van der Spoel, P. J. van Maaren, P. Larsson and N. Timneanu",
 +      "Thermodynamics of hydrogen bonding in hydrophilic and hydrophobic media",
 +      "J. Phys. Chem. B",
 +      110, 2006, "4393-4398" },
 +    { "Spoel2006d",
 +      "D. van der Spoel and M. M. Seibert",
 +      "Protein folding kinetics and thermodynamics from atomistic simulations",
 +      "Phys. Rev. Letters",
 +      96, 2006, "238102" },
 +    { "Palmer94a",
 +      "B. J. Palmer",
 +      "Transverse-current autocorrelation-function calculations of the shear viscosity for molecular liquids",
 +      "Phys. Rev. E",
 +      49, 1994, "359-366" },
 +    { "Bussi2007a",
 +      "G. Bussi, D. Donadio and M. Parrinello",
 +      "Canonical sampling through velocity rescaling",
 +      "J. Chem. Phys.",
 +      126, 2007, "014101" },
 +    { "Hub2006",
 +      "J. S. Hub and B. L. de Groot",
 +      "Does CO2 permeate through Aquaporin-1?",
 +      "Biophys. J.",
 +      91, 2006, "842-848" },
 +    { "Hub2008",
 +      "J. S. Hub and B. L. de Groot",
 +      "Mechanism of selectivity in aquaporins and aquaglyceroporins",
 +      "PNAS",
 +      105, 2008, "1198-1203" },
 +    { "Friedrich2009",
 +      "M. S. Friedrichs, P. Eastman, V. Vaidyanathan, M. Houston, S. LeGrand, A. L. Beberg, D. L. Ensign, C. M. Bruns, and V. S. Pande",
 +      "Accelerating Molecular Dynamic Simulation on Graphics Processing Units",
 +      "J. Comp. Chem.",
 +      30, 2009, "864-872" },
 +    { "Engin2010",
 +      "O. Engin, A. Villa, M. Sayar and B. Hess",
 +      "Driving Forces for Adsorption of Amphiphilic Peptides to Air-Water Interface",
 +      "J. Phys. Chem. B",
 +      114, 2010, "11093" },
 +    { "Fritsch12",
 +      "S. Fritsch, C. Junghans and K. Kremer",
 +      "Adaptive molecular simulation study on structure formation of toluene around C60 using Gromacs",
 +      "J. Chem. Theo. Comp.",
 +      8, 2012, "398" },
 +    { "Junghans10",
 +      "C. Junghans and S. Poblete",
 +      "A reference implementation of the adaptive resolution scheme in ESPResSo",
 +      "Comp. Phys. Comm.",
 +      181, 2010, "1449" },
 +    { "Wang2010",
 +      "H. Wang, F. Dommert, C.Holm",
 +      "Optimizing working parameters of the smooth particle mesh Ewald algorithm in terms of accuracy and efficiency",
 +      "J. Chem. Phys. B",
 +      133, 2010, "034117" },
 +    { "Sugita1999a",
 +      "Y. Sugita, Y. Okamoto",
 +      "Replica-exchange molecular dynamics method for protein folding",
 +      "Chem. Phys. Lett.",
 +      314, 1999, "141-151" },
 +    { "Kutzner2011",
 +      "C. Kutzner and J. Czub and H. Grubmuller",
 +      "Keep it Flexible: Driving Macromolecular Rotary Motions in Atomistic Simulations with GROMACS",
 +      "J. Chem. Theory Comput.",
 +      7, 2011, "1381-1393" },
 +    { "Hoefling2011",
 +      "M. Hoefling, N. Lima, D. Haenni, C.A.M. Seidel, B. Schuler, H. Grubmuller",
 +      "Structural Heterogeneity and Quantitative FRET Efficiency Distributions of Polyprolines through a Hybrid Atomistic Simulation and Monte Carlo Approach",
 +      "PLoS ONE",
 +      6, 2011, "e19791" },
 +    { "Hockney1988",
 +      "R. W. Hockney and J. W. Eastwood",
 +      "Computer simulation using particles",
 +      "IOP, Bristol",
 +      1, 1988, "1" },
 +    { "Ballenegger2012",
 +      "V. Ballenegger, J.J. Cerda, and C. Holm",
 +      "How to Convert SPME to P3M: Influence Functions and Error Estimates",
 +      "J. Chem. Theory Comput.",
 +      8, 2012, "936-947" },
 +    { "Garmay2012",
 +      "Garmay Yu, Shvetsov A, Karelov D, Lebedev D, Radulescu A, Petukhov M, Isaev-Ivanov V",
 +      "Correlated motion of protein subdomains and large-scale conformational flexibility of RecA protein filament",
 +      "Journal of Physics: Conference Series",
 +      340, 2012, "012094" }
 +  };
 +#define NSTR (int)asize(citedb)
 +  
 +  int  j,index;
 +  char *author;
 +  char *title;
 +#define LINE_WIDTH 79
 +  
 +  if (fp == NULL)
 +    return;
 +
 +  for(index=0; (index<NSTR) && (strcmp(citedb[index].key,key) != 0); index++)
 +    ;
 +  
 +  fprintf(fp,"\n++++ PLEASE READ AND CITE THE FOLLOWING REFERENCE ++++\n");
 +  if (index < NSTR) {
 +    /* Insert newlines */
 +    author = wrap_lines(citedb[index].author,LINE_WIDTH,0,FALSE);
 +    title  = wrap_lines(citedb[index].title,LINE_WIDTH,0,FALSE);
 +    fprintf(fp,"%s\n%s\n%s %d (%d) pp. %s\n",
 +          author,title,citedb[index].journal,
 +          citedb[index].volume,citedb[index].year,
 +          citedb[index].pages);
 +    sfree(author);
 +    sfree(title);
 +  }
 +  else {
 +    fprintf(fp,"Entry %s not found in citation database\n",key);
 +  }
 +  fprintf(fp,"-------- -------- --- Thank You --- -------- --------\n\n");
 +  fflush(fp);
 +}
 +
 +#ifdef GMX_GIT_VERSION_INFO
 +/* Version information generated at compile time. */
 +#include "gromacs/utility/gitversion.h"
 +#else
 +/* Fall back to statically defined version. */
 +static const char _gmx_ver_string[]="VERSION " VERSION;
 +#endif
 +
 +const char *GromacsVersion()
 +{
 +  return _gmx_ver_string;
 +}
 +
 +void gmx_print_version_info(FILE *fp)
 +{
 +#ifdef GMX_GPU
 +    int cuda_driver,cuda_runtime;
 +#endif
 +
 +    fprintf(fp, "Gromacs version:    %s\n", _gmx_ver_string);
 +#ifdef GMX_GIT_VERSION_INFO
 +    fprintf(fp, "GIT SHA1 hash:      %s\n", _gmx_full_git_hash);
 +    /* Only print out the branch information if present.
 +     * The generating script checks whether the branch point actually
 +     * coincides with the hash reported above, and produces an empty string
 +     * in such cases. */
 +    if (_gmx_central_base_hash[0] != 0)
 +    {
 +        fprintf(fp, "Branched from:      %s\n", _gmx_central_base_hash);
 +    }
 +#endif
 +
 +#ifdef GMX_DOUBLE
 +    fprintf(fp, "Precision:          double\n");
 +#else
 +    fprintf(fp, "Precision:          single\n");
 +#endif
 +
 +#ifdef GMX_THREAD_MPI
 +    fprintf(fp, "MPI library:        thread_mpi\n");
 +#elif defined(GMX_MPI)
 +    fprintf(fp, "MPI library:        MPI\n");
 +#else
 +    fprintf(fp, "MPI library:        none\n");
 +#endif
 +#ifdef GMX_OPENMP
 +    fprintf(fp, "OpenMP support:     enabled\n");
 +#else
 +    fprintf(fp, "OpenMP support:     disabled\n");
 +#endif
 +#ifdef GMX_GPU
 +    fprintf(fp, "GPU support:        enabled\n");
 +#else
 +    fprintf(fp, "GPU support:        disabled\n");
 +#endif
 +    /* A preprocessor trick to avoid duplicating logic from vec.h */
 +#define gmx_stringify2(x) #x
 +#define gmx_stringify(x) gmx_stringify2(x)
 +    fprintf(fp, "invsqrt routine:    %s\n", gmx_stringify(gmx_invsqrt(x)));
 +    fprintf(fp, "CPU acceleration:   %s\n", GMX_CPU_ACCELERATION_STRING);
 +
 +    /* TODO: Would be nicer to wrap this in a gmx_fft_version() call, but
 +     * since that is currently in mdlib, can wait for master. */
 +#ifdef GMX_FFT_FFTPACK
 +    fprintf(fp, "FFT library:        fftpack (built-in)\n");
 +#elif defined(GMX_FFT_FFTW3) && defined(GMX_NATIVE_WINDOWS)
 +    fprintf(fp, "FFT library:        %s\n", "fftw3");
 +#elif defined(GMX_FFT_FFTW3) && defined(GMX_DOUBLE)
 +    fprintf(fp, "FFT library:        %s\n", fftw_version);
 +#elif defined(GMX_FFT_FFTW3)
 +    fprintf(fp, "FFT library:        %s\n", fftwf_version);
 +#elif defined(GMX_FFT_MKL)
 +    fprintf(fp, "FFT library:        MKL\n");
 +#else
 +    fprintf(fp, "FFT library:        unknown\n");
 +#endif
 +#ifdef GMX_LARGEFILES
 +    fprintf(fp, "Large file support: enabled\n");
 +#else
 +    fprintf(fp, "Large file support: disabled\n");
 +#endif
 +#ifdef HAVE_RDTSCP
 +    fprintf(fp, "RDTSCP usage:       enabled\n");
 +#else
 +    fprintf(fp, "RDTSCP usage:       disabled\n");
 +#endif
 +
 +    fprintf(fp, "Built on:           %s\n", BUILD_TIME);
 +    fprintf(fp, "Built by:           %s\n", BUILD_USER);
 +    fprintf(fp, "Build OS/arch:      %s\n", BUILD_HOST);
 +    fprintf(fp, "Build CPU vendor:   %s\n", BUILD_CPU_VENDOR);
 +    fprintf(fp, "Build CPU brand:    %s\n", BUILD_CPU_BRAND);
 +    fprintf(fp, "Build CPU family:   %d   Model: %d   Stepping: %d\n",
 +            BUILD_CPU_FAMILY, BUILD_CPU_MODEL, BUILD_CPU_STEPPING);
 +    /* TODO: The below strings can be quite long, so it would be nice to wrap
 +     * them. Can wait for later, as the master branch has ready code to do all
 +     * that. */
 +    fprintf(fp, "Build CPU features: %s\n", BUILD_CPU_FEATURES);
 +    fprintf(fp, "C compiler:         %s\n", BUILD_C_COMPILER);
 +    fprintf(fp, "C compiler flags:   %s\n", BUILD_CFLAGS);
 +    if (BUILD_CXX_COMPILER[0] != '\0')
 +    {
 +        fprintf(fp, "C++ compiler:       %s\n", BUILD_CXX_COMPILER);
 +        fprintf(fp, "C++ compiler flags: %s\n", BUILD_CXXFLAGS);
 +    }
 +#ifdef HAVE_LIBMKL
 +    /* MKL might be used for LAPACK/BLAS even if FFTs use FFTW, so keep it separate */
 +    fprintf(fp, "Linked with Intel MKL version %s.%s.%s.\n",
 +            __INTEL_MKL__,__INTEL_MKL_MINOR__,__INTEL_MKL_UPDATE__);
 +#endif
 +#ifdef GMX_GPU
 +    fprintf(fp, "CUDA compiler:      %s\n",CUDA_NVCC_COMPILER_INFO);
 +    cuda_driver = 0;
 +    cudaDriverGetVersion(&cuda_driver);
 +    cuda_runtime = 0;
 +    cudaRuntimeGetVersion(&cuda_runtime);
 +    fprintf(fp, "CUDA driver:        %d.%d\n",cuda_driver/1000, cuda_driver%100);
 +    fprintf(fp, "CUDA runtime:       %d.%d\n",cuda_runtime/1000, cuda_runtime%100);
 +#endif
 +
 +}
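
For context on gmx_print_version_info above: gmx_stringify/gmx_stringify2 is the standard two-level stringification idiom, needed so the macro argument is expanded before # turns it into a string. A minimal stand-alone illustration (my_fast_invsqrt here is a hypothetical stand-in for the real gmx_invsqrt definition in vec.h):

#include <stdio.h>

#define gmx_invsqrt(x) my_fast_invsqrt(x)   /* hypothetical stand-in */
#define gmx_stringify2(x) #x
#define gmx_stringify(x) gmx_stringify2(x)

int main(void)
{
    /* single level: the argument is stringified before macro expansion */
    printf("%s\n", gmx_stringify2(gmx_invsqrt(x))); /* prints: gmx_invsqrt(x) */
    /* two levels: the argument is expanded first */
    printf("%s\n", gmx_stringify(gmx_invsqrt(x)));  /* prints: my_fast_invsqrt(x) */
    return 0;
}
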
index 80491d43352a0f94af6e54452e1b44e86c19d5be,0000000000000000000000000000000000000000..9985f3dfb121658c23e79792007dc239c3ac8677
mode 100644,000000..100644
--- /dev/null
@@@ -1,610 -1,0 +1,619 @@@
-             gmx_fatal(FARGS, "Invalid character in GPU ID string: '%c'\n", idstr[i]);
 +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
 + *
 + * 
 + * This file is part of GROMACS.
 + * Copyright (c) 2012-  
 + *
 + * Written by the Gromacs development team under coordination of
 + * David van der Spoel, Berk Hess, and Erik Lindahl.
 + *
 + * This library is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU Lesser General Public License
 + * as published by the Free Software Foundation; either version 2
 + * of the License, or (at your option) any later version.
 + *
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the research papers on the package. Check out http://www.gromacs.org
 + * 
 + * And Hey:
 + * GROup of MAchos and Cynical Suckers
 + */
 +#ifdef HAVE_CONFIG_H
 +#include <config.h>
 +#endif
 +
 +#include <stdlib.h>
 +#include <assert.h>
 +#include <string.h>
 +
 +#include "types/enums.h"
 +#include "types/hw_info.h"
 +#include "types/commrec.h"
 +#include "gmx_fatal.h"
 +#include "gmx_fatal_collective.h"
 +#include "smalloc.h"
 +#include "gpu_utils.h"
 +#include "statutil.h"
 +#include "gmx_detect_hardware.h"
 +#include "main.h"
 +#include "md_logging.h"
 +
 +#if ((defined(WIN32) || defined( _WIN32 ) || defined(WIN64) || defined( _WIN64 )) && !(defined (__CYGWIN__) || defined (__CYGWIN32__)))
 +#include "windows.h"
 +#endif
 +
 +/* Although the user cannot pass more than 10 distinct GPU IDs, as the IDs
 + * are assumed to be represented by single digits, multiple processes can
 + * share a GPU, so we can end up with more than 10 IDs in total.
 + * To account for potential extreme cases we'll set the limit to a pretty
 + * ridiculous number. */
 +static unsigned int max_gpu_ids_user = 64;
 +
++static const char* invalid_gpuid_hint =
++    "A delimiter-free sequence of valid numeric IDs of available GPUs is expected.";
++
 +/* FW decl. */
 +void limit_num_gpus_used(gmx_hw_info_t *hwinfo, int count);
 +
 +static void sprint_gpus(char *sbuf, const gmx_gpu_info_t *gpu_info, gmx_bool bPrintAll)
 +{
 +    int      i, ndev;
 +    char     stmp[STRLEN];
 +
 +    ndev = gpu_info->ncuda_dev;
 +
 +    sbuf[0] = '\0';
 +    for (i = 0; i < ndev; i++)
 +    {
 +        get_gpu_device_info_string(stmp, gpu_info, i);
 +        strcat(sbuf, "  ");
 +        strcat(sbuf, stmp);
 +        if (i < ndev - 1)
 +        {
 +            strcat(sbuf, "\n");
 +        }
 +    }
 +}
 +
 +static void print_gpu_detection_stats(FILE *fplog,
 +                                      const gmx_gpu_info_t *gpu_info,
 +                                      const t_commrec *cr)
 +{
 +    char onhost[266],stmp[STRLEN];
 +    int  ngpu;
 +
 +    ngpu = gpu_info->ncuda_dev;
 +
 +#if defined GMX_MPI && !defined GMX_THREAD_MPI
 +    /* We only print the detection on one of possibly multiple nodes */
 +    strncpy(onhost," on host ",10);
 +    gmx_gethostname(onhost+9,256);
 +#else
 +    /* We detect all relevant GPUs */
 +    strncpy(onhost,"",1);
 +#endif
 +
 +    if (ngpu > 0)
 +    {
 +        sprint_gpus(stmp, gpu_info, TRUE);
 +        md_print_warn(cr, fplog, "%d GPU%s detected%s:\n%s\n",
 +                      ngpu, (ngpu > 1) ? "s" : "", onhost, stmp);
 +    }
 +    else
 +    {
 +        md_print_warn(cr, fplog, "No GPUs detected%s\n", onhost);
 +    }
 +}
 +
 +static void print_gpu_use_stats(FILE *fplog,
 +                                const gmx_gpu_info_t *gpu_info,
 +                                const t_commrec *cr)
 +{
 +    char sbuf[STRLEN], stmp[STRLEN];
 +    int  i, ngpu, ngpu_all;
 +
 +    ngpu     = gpu_info->ncuda_dev_use;
 +    ngpu_all = gpu_info->ncuda_dev;
 +
 +    /* Issue note if GPUs are available but not used */
 +    if (ngpu_all > 0 && ngpu < 1)
 +    {
 +        sprintf(sbuf,
 +                "%d compatible GPU%s detected in the system, but none will be used.\n"
 +                "Consider trying GPU acceleration with the Verlet scheme!",
 +                ngpu_all, (ngpu_all > 1) ? "s" : "");
 +    }
 +    else
 +    {
 +        sprintf(sbuf, "%d GPU%s %sselected to be used for this run: ",
 +                ngpu, (ngpu > 1) ? "s" : "",
 +                gpu_info->bUserSet ? "user-" : "auto-");
 +        for (i = 0; i < ngpu; i++)
 +        {
 +            sprintf(stmp, "#%d", get_gpu_device_id(gpu_info, i));
 +            if (i < ngpu - 1)
 +            {
 +                strcat(stmp, ", ");
 +            }
 +            strcat(sbuf, stmp);
 +        }
 +    }
 +    md_print_info(cr, fplog, "%s\n\n", sbuf);
 +}
 +
 +/* Parse a "plain" GPU ID string which contains a sequence of digits corresponding
 + * to GPU IDs; the order indicates the process/tMPI-thread-to-GPU assignment. */
 +static void parse_gpu_id_plain_string(const char *idstr, int *nid, int *idlist)
 +{
 +    int  i;
 +    size_t len_idstr;
 +
 +    len_idstr = strlen(idstr);
 +
 +    if (len_idstr > max_gpu_ids_user)
 +    {
 +        gmx_fatal(FARGS,"%d GPU IDs provided, but only at most %d are supported",
 +                  len_idstr, max_gpu_ids_user);
 +    }
 +
 +    *nid = len_idstr;
 +
 +    for (i = 0; i < *nid; i++)
 +    {
 +        if (idstr[i] < '0' || idstr[i] > '9')
 +        {
-     /* Bail if binary is not compiled with GPU on */
++            gmx_fatal(FARGS, "Invalid character in GPU ID string: '%c'\n%s\n",
++                      invalid_gpuid_hint, idstr[i]);
 +        }
 +        idlist[i] = idstr[i] - '0';
 +    }
 +}
 +
 +static void parse_gpu_id_csv_string(const char *idstr, int *nid, int *idlist)
 +{
 +    /* XXX implement csv format to support more than 10 different GPUs in a box. */
 +    gmx_incons("Not implemented yet");
 +}
 +
 +void gmx_check_hw_runconf_consistency(FILE *fplog, gmx_hw_info_t *hwinfo,
 +                                      const t_commrec *cr, int ntmpi_requested,
 +                                      gmx_bool bUseGPU)
 +{
 +    int      npppn, ntmpi_pp, ngpu;
 +    char     sbuf[STRLEN], th_or_proc[STRLEN], th_or_proc_plural[STRLEN], pernode[STRLEN];
 +    char     gpu_plural[2];
 +    gmx_bool bGPUBin, btMPI, bMPI, bMaxMpiThreadsSet, bNthreadsAuto, bEmulateGPU;
 +
 +    assert(hwinfo);
 +    assert(cr);
 +
 +    btMPI = bMPI = FALSE;
 +    bNthreadsAuto = FALSE;
 +#if defined(GMX_THREAD_MPI)
 +    btMPI = TRUE;
 +    bNthreadsAuto = (ntmpi_requested < 1);
 +#elif defined(GMX_LIB_MPI)
 +    bMPI  = TRUE;
 +#endif
 +
 +#ifdef GMX_GPU
 +    bGPUBin      = TRUE;
 +#else
 +    bGPUBin      = FALSE;
 +#endif
 +
 +    /* GPU emulation detection is done later, but we need it here as well
 +     * -- uncool, but there's no elegant workaround */
 +    bEmulateGPU       = (getenv("GMX_EMULATE_GPU") != NULL);
 +    bMaxMpiThreadsSet = (getenv("GMX_MAX_MPI_THREADS") != NULL);
 +
 +    if (SIMMASTER(cr))
 +    {
 +        /* check the acceleration mdrun is compiled with against hardware capabilities */
 +        /* TODO: Here we assume homogeneous hardware which is not necessarily the case!
 +         *       Might not hurt to add an extra check over MPI. */
 +        gmx_cpuid_acceleration_check(hwinfo->cpuid_info, fplog);
 +    }
 +
 +    /* Below we only do consistency checks for PP and GPUs,
 +     * this is irrelevant for PME only nodes, so in that case we return here.
 +     */
 +    if (!(cr->duty & DUTY_PP))
 +    {
 +        return;
 +    }
 +
 +    /* Need to ensure that we have enough GPUs:
 +     * - need one GPU per PP node
 +     * - no GPU oversubscription with tMPI
 +     * => keep on the GPU support, otherwise turn off (or bail if forced)
 +     * */
 +    /* number of PP processes per node */
 +    npppn = cr->nrank_pp_intranode;
 +
 +    pernode[0] = '\0';
 +    th_or_proc_plural[0] = '\0';
 +    if (btMPI)
 +    {
 +        sprintf(th_or_proc, "thread-MPI thread");
 +        if (npppn > 1)
 +        {
 +            sprintf(th_or_proc_plural, "s");
 +        }
 +    }
 +    else if (bMPI)
 +    {
 +        sprintf(th_or_proc, "MPI process");
 +        if (npppn > 1)
 +        {
 +            sprintf(th_or_proc_plural, "es");
 +        }
 +        sprintf(pernode, " per node");
 +    }
 +    else
 +    {
 +        /* neither MPI nor tMPI */
 +        sprintf(th_or_proc, "process");
 +    }
 +
 +    if (bGPUBin)
 +    {
 +        print_gpu_detection_stats(fplog, &hwinfo->gpu_info, cr);
 +    }
 +
 +    if (bUseGPU && hwinfo->bCanUseGPU && !bEmulateGPU)
 +    {
 +        ngpu = hwinfo->gpu_info.ncuda_dev_use;
 +        sprintf(gpu_plural, "%s", (ngpu > 1) ? "s" : "");
 +
 +        /* number of tMPI threads auto-adjusted */
 +        if (btMPI && bNthreadsAuto && SIMMASTER(cr))
 +        {
 +            if (npppn < ngpu)
 +            {
 +                if (hwinfo->gpu_info.bUserSet)
 +                {
 +                    /* The user manually provided more GPUs than threads we could
 +                     * automatically start. */
 +                    gmx_fatal(FARGS,
 +                              "%d GPU%s provided, but only %d PP thread-MPI thread%s coud be started.\n"
 +                              "%s requires one PP tread-MPI thread per GPU; use fewer GPUs%s.",
 +                              ngpu, gpu_plural, npppn, th_or_proc_plural,
 +                              ShortProgram(), bMaxMpiThreadsSet ? "\nor allow more threads to be used" : "");
 +                }
 +                else
 +                {
 +                    /* There are more GPUs than tMPI threads; we have to limit the number of GPUs used. */
 +                    md_print_warn(cr,fplog,
 +                                  "NOTE: %d GPU%s were detected, but only %d PP thread-MPI thread%s can be started.\n"
 +                                  "      %s can use one GPU per PP tread-MPI thread, so only %d GPU%s will be used.%s\n",
 +                                  ngpu, gpu_plural, npppn, th_or_proc_plural,
 +                                  ShortProgram(), npppn, npppn > 1 ? "s" : "",
 +                                  bMaxMpiThreadsSet ? "\n      Also, you can allow more threads to be used by increasing GMX_MAX_MPI_THREADS" : "");
 +
 +                    if (cr->rank_pp_intranode == 0)
 +                    {
 +                        limit_num_gpus_used(hwinfo, npppn);
 +                        ngpu = hwinfo->gpu_info.ncuda_dev_use;
 +                        sprintf(gpu_plural, "%s", (ngpu > 1) ? "s" : "");
 +                    }
 +                }
 +            }
 +        }
 +
 +        if (ngpu != npppn)
 +        {
 +            if (hwinfo->gpu_info.bUserSet)
 +            {
 +                gmx_fatal(FARGS,
 +                          "Incorrect launch configuration: mismatching number of PP %s%s and GPUs%s.\n"
 +                          "%s was started with %d PP %s%s%s, but you provided %d GPU%s.",
 +                          th_or_proc, btMPI ? "s" : "es" , pernode,
 +                          ShortProgram(), npppn, th_or_proc, th_or_proc_plural, pernode, ngpu, gpu_plural);
 +            }
 +            else
 +            {
 +                if (ngpu > npppn)
 +                {
 +                    md_print_warn(cr,fplog,
 +                                  "NOTE: potentially sub-optimal launch configuration, %s started with less\n"
 +                                  "      PP %s%s%s than GPU%s available.\n"
 +                                  "      Each PP %s can only use one GPU, so only %d GPU%s%s will be used.",
 +                                  ShortProgram(),
 +                                  th_or_proc, th_or_proc_plural, pernode, gpu_plural,
 +                                  th_or_proc, npppn, gpu_plural, pernode);
 +
 +                    if (bMPI || (btMPI && cr->rank_pp_intranode == 0))
 +                    {
 +                        limit_num_gpus_used(hwinfo, npppn);
 +                        ngpu = hwinfo->gpu_info.ncuda_dev_use;
 +                        sprintf(gpu_plural, "%s", (ngpu > 1) ? "s" : "");
 +                    }
 +                }
 +                else
 +                {
 +                    /* Avoid duplicate error messages.
 +                     * Unfortunately we can only do this at the physical node
 +                     * level, since the hardware setup and MPI process count
 +                     * might differ over physical nodes.
 +                     */
 +                    if (cr->rank_pp_intranode == 0)
 +                    {
 +                        gmx_fatal(FARGS,
 +                                  "Incorrect launch configuration: mismatching number of PP %s%s and GPUs%s.\n"
 +                                  "%s was started with %d PP %s%s%s, but only %d GPU%s were detected.",
 +                                  th_or_proc, btMPI ? "s" : "es" , pernode,
 +                                  ShortProgram(), npppn, th_or_proc, th_or_proc_plural, pernode, ngpu, gpu_plural);
 +                    }
 +#ifdef GMX_MPI
 +                    else
 +                    {
 +                        /* Prevent other ranks from continuing after an inconsistency */
 +                        MPI_Barrier(cr->mpi_comm_mygroup);
 +                    }
 +#endif
 +                }
 +            }
 +        }
 +
 +        if (hwinfo->gpu_info.bUserSet && (cr->rank_pp_intranode == 0))
 +        {
 +            int i, j, same_count;
 +            gmx_bool bSomeSame, bAllDifferent;
 +
 +            same_count = 0;
 +            bSomeSame = FALSE;
 +            bAllDifferent = TRUE;
 +
 +            for (i = 0; i < ngpu - 1; i++)
 +            {
 +                for (j = i + 1; j < ngpu; j++)
 +                {
 +                    bSomeSame       |= hwinfo->gpu_info.cuda_dev_use[i] == hwinfo->gpu_info.cuda_dev_use[j];
 +                    bAllDifferent   &= hwinfo->gpu_info.cuda_dev_use[i] != hwinfo->gpu_info.cuda_dev_use[j];
 +                    same_count      += hwinfo->gpu_info.cuda_dev_use[i] == hwinfo->gpu_info.cuda_dev_use[j];
 +                }
 +            }
 +
 +            if (btMPI && !bAllDifferent)
 +            {
 +                gmx_fatal(FARGS,
 +                          "Invalid GPU assignment: can't share a GPU among multiple thread-MPI threads.\n"
 +                          "Use MPI if you are sure that you want to assign GPU to multiple threads.");
 +            }
 +
 +            if (bSomeSame)
 +            {
 +                md_print_warn(cr,fplog,
 +                              "NOTE: Potentially sub-optimal launch configuration: you assigned %s to\n"
 +                              "      multiple %s%s; this should be avoided as it generally\n"
 +                              "      causes performance loss.",
 +                              same_count > 1 ? "GPUs" : "a GPU", th_or_proc, btMPI ? "s" : "es");
 +            }
 +        }
 +        print_gpu_use_stats(fplog, &hwinfo->gpu_info, cr);
 +    }
 +}
 +
 +/* Return the number of hardware threads supported by the current CPU.
 + * We assume that this is equal to the number of CPUs reported to be
 + * online by the OS at the time of the call.
 + */
 +static int get_nthreads_hw_avail(FILE *fplog, const t_commrec *cr)
 +{
 +    int ret = 0;
 +
 +#if ((defined(WIN32) || defined( _WIN32 ) || defined(WIN64) || defined( _WIN64 )) && !(defined (__CYGWIN__) || defined (__CYGWIN32__)))
 +    /* Windows */
 +    SYSTEM_INFO sysinfo;
 +    GetSystemInfo( &sysinfo );
 +    ret = sysinfo.dwNumberOfProcessors;
 +#elif defined HAVE_SYSCONF
 +    /* We are probably on Unix.
 +     * Now check which sysconf argument is available before executing the call
 +     */
 +#if defined(_SC_NPROCESSORS_ONLN)
 +    ret = sysconf(_SC_NPROCESSORS_ONLN);
 +#elif defined(_SC_NPROC_ONLN)
 +    ret = sysconf(_SC_NPROC_ONLN);
 +#elif defined(_SC_NPROCESSORS_CONF)
 +    ret = sysconf(_SC_NPROCESSORS_CONF);
 +#elif defined(_SC_NPROC_CONF)
 +    ret = sysconf(_SC_NPROC_CONF);
 +#endif /* End of check for sysconf argument values */
 +
 +#else
 +    /* Neither Windows nor Unix. No fscking idea how many CPUs we have! */
 +    ret = -1;
 +#endif
 +
 +    if (debug)
 +    {
 +        fprintf(debug, "Detected %d processors, will use this as the number "
 +                "of supported hardware threads.\n", ret);
 +    }
 +
 +#ifdef GMX_OPENMP
 +    if (ret != gmx_omp_get_num_procs())
 +    {
 +        md_print_warn(cr, fplog,
 +                      "Number of CPUs detected (%d) does not match the number reported by OpenMP (%d).\n"
 +                      "Consider setting the launch configuration manually!",
 +                      ret, gmx_omp_get_num_procs());
 +    }
 +#endif
 +
 +    return ret;
 +}
 +
 +void gmx_detect_hardware(FILE *fplog, gmx_hw_info_t *hwinfo,
 +                         const t_commrec *cr,
 +                         gmx_bool bForceUseGPU, gmx_bool bTryUseGPU,
 +                         const char *gpu_id)
 +{
 +    int             i;
 +    const char      *env;
 +    char            sbuf[STRLEN], stmp[STRLEN];
 +    gmx_hw_info_t   *hw;
 +    gmx_gpu_info_t  gpuinfo_auto, gpuinfo_user;
 +    gmx_bool        bGPUBin;
 +
 +    assert(hwinfo);
 +
 +    /* detect CPUID info; no fuss, we don't detect system-wide
 +     * -- sloppy, but that's it for now */
 +    if (gmx_cpuid_init(&hwinfo->cpuid_info) != 0)
 +    {
 +        gmx_fatal_collective(FARGS, cr, NULL, "CPUID detection failed!");
 +    }
 +
 +    /* detect number of hardware threads */
 +    hwinfo->nthreads_hw_avail = get_nthreads_hw_avail(fplog, cr);
 +
 +    /* detect GPUs */
 +    hwinfo->gpu_info.ncuda_dev_use  = 0;
 +    hwinfo->gpu_info.cuda_dev_use   = NULL;
 +    hwinfo->gpu_info.ncuda_dev      = 0;
 +    hwinfo->gpu_info.cuda_dev       = NULL;
 +
 +#ifdef GMX_GPU
 +    bGPUBin      = TRUE;
 +#else
 +    bGPUBin      = FALSE;
 +#endif
 +
-         gmx_fatal_collective(FARGS, cr, NULL, "GPU acceleration requested, but %s was compiled without GPU support!", ShortProgram());
++    /* Bail if the binary is not compiled with GPU acceleration, but GPU use
++     * was requested either explicitly (-nb gpu) or implicitly (a GPU ID was passed). */
 +    if (bForceUseGPU && !bGPUBin)
 +    {
-                 gmx_fatal(FARGS, "Empty GPU ID string passed\n");
++        gmx_fatal(FARGS, "GPU acceleration requested, but %s was compiled without GPU support!", ShortProgram());
++    }
++    if (gpu_id != NULL && !bGPUBin)
++    {
++        gmx_fatal(FARGS, "GPU ID string set, but %s was compiled without GPU support!", ShortProgram());
 +    }
 +
 +    /* run the detection if the binary was compiled with GPU support */
 +    if (bGPUBin && getenv("GMX_DISABLE_GPU_DETECTION")==NULL)
 +    {
 +        char detection_error[STRLEN];
 +
 +        if (detect_cuda_gpus(&hwinfo->gpu_info, detection_error) != 0)
 +        {
 +            if (detection_error != NULL && detection_error[0] != '\0')
 +            {
 +                sprintf(sbuf, ":\n      %s\n", detection_error);
 +            }
 +            else
 +            {
 +                sprintf(sbuf, ".");
 +            }
 +            md_print_warn(cr, fplog,
 +                          "NOTE: Error occurred during GPU detection%s"
 +                          "      Can not use GPU acceleration, will fall back to CPU kernels.\n",
 +                          sbuf);
 +        }
 +    }
 +
 +    if (bForceUseGPU || bTryUseGPU)
 +    {
 +        env = getenv("GMX_GPU_ID");
 +        if (env != NULL && gpu_id != NULL)
 +        {
 +            gmx_fatal(FARGS,"GMX_GPU_ID and -gpu_id can not be used at the same time");
 +        }
 +        if (env == NULL)
 +        {
 +            env = gpu_id;
 +        }
 +
 +        /* parse GPU IDs if the user passed any */
 +        if (env != NULL)
 +        {
 +            int *gpuid, *checkres;
 +            int nid, res;
 +
 +            snew(gpuid, max_gpu_ids_user);
 +            snew(checkres, max_gpu_ids_user);
 +
 +            parse_gpu_id_plain_string(env, &nid, gpuid);
 +
 +            if (nid == 0)
 +            {
++                gmx_fatal(FARGS, "Empty GPU ID string encountered.\n%s\n", invalid_gpuid_hint);
 +            }
 +
 +            res = check_select_cuda_gpus(checkres, &hwinfo->gpu_info, gpuid, nid);
 +
 +            if (!res)
 +            {
 +                print_gpu_detection_stats(fplog, &hwinfo->gpu_info, cr);
 +
 +                sprintf(sbuf, "Some of the requested GPUs do not exist, behave strangely, or are not compatible:\n");
 +                for (i = 0; i < nid; i++)
 +                {
 +                    if (checkres[i] != egpuCompatible)
 +                    {
 +                        sprintf(stmp, "    GPU #%d: %s\n",
 +                                gpuid[i], gpu_detect_res_str[checkres[i]]);
 +                        strcat(sbuf, stmp);
 +                    }
 +                }
 +                gmx_fatal(FARGS, "%s", sbuf);
 +            }
 +
 +            hwinfo->gpu_info.bUserSet = TRUE;
 +
 +            sfree(gpuid);
 +            sfree(checkres);
 +        }
 +        else
 +        {
 +            pick_compatible_gpus(&hwinfo->gpu_info);
 +            hwinfo->gpu_info.bUserSet = FALSE;
 +        }
 +
 +        /* decide whether we can use GPU */
 +        hwinfo->bCanUseGPU = (hwinfo->gpu_info.ncuda_dev_use > 0);
 +        if (!hwinfo->bCanUseGPU && bForceUseGPU)
 +        {
 +            gmx_fatal(FARGS, "GPU acceleration requested, but no compatible GPUs were detected.");
 +        }
 +    }
 +}
 +
 +void limit_num_gpus_used(gmx_hw_info_t *hwinfo, int count)
 +{
 +    int ndev_use;
 +
 +    assert(hwinfo);
 +
 +    ndev_use = hwinfo->gpu_info.ncuda_dev_use;
 +
 +    if (count > ndev_use)
 +    {
 +        /* won't increase the # of GPUs */
 +        return;
 +    }
 +
 +    if (count < 1)
 +    {
 +        char sbuf[STRLEN];
 +        sprintf(sbuf, "Limiting the number of GPUs to <1 doesn't make sense (detected %d, %d requested)!",
 +                ndev_use, count);
 +        gmx_incons(sbuf);
 +    }
 +
 +    /* TODO: improve this implementation: either sort GPUs or remove the weakest here */
 +    hwinfo->gpu_info.ncuda_dev_use = count;
 +}
 +
 +void gmx_hardware_info_free(gmx_hw_info_t *hwinfo)
 +{
 +    if (hwinfo)
 +    {
 +        gmx_cpuid_done(hwinfo->cpuid_info);
 +        free_gpu_info(&hwinfo->gpu_info);
 +        sfree(hwinfo);
 +    }
 +}
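For orientation, a minimal caller sketch for the API above (hypothetical, not part of this change; it assumes an existing t_commrec *cr and the smalloc macros used elsewhere in this file):

    gmx_hw_info_t *hwinfo;

    snew(hwinfo, 1);
    /* no log file, GPUs tried but not forced, no explicit GPU ID string */
    gmx_detect_hardware(NULL, hwinfo, cr, FALSE, TRUE, NULL);
    /* ... consult hwinfo->bCanUseGPU and hwinfo->nthreads_hw_avail ... */
    gmx_hardware_info_free(hwinfo);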
index bf45130afe6e88ac7d032ace1942a6d4bd3f0874,0000000000000000000000000000000000000000..41096cde0f22f676241823588bb5821cdb842e32
mode 100644,000000..100644
--- /dev/null
@@@ -1,25 -1,0 +1,25 @@@
- file(GLOB GPU_UTILS_SOURCES *.cu)
 +# (slightly sloppy) OS definitions required by memtestG80
 +set(_os_def)
 +if(UNIX)
 +    if(${CMAKE_SYSTEM_NAME} MATCHES "Darwin")
 +        set(_os_def "-DOSX")
 +    else() # everything that's UNIX & UNIX-like except OS X
 +        set(_os_def "-DLINUX")
 +    endif()
 +else()
 +    if(WIN32)
 +        set(_os_def "-DWINDOWS")
 +    else()
 +        message(FATAL_ERROR " Could not detect OS required for memtestG80.")
 +    endif()
 +endif()
 +
 +CUDA_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR})
 +set(CUDA_ATTACH_VS_BUILD_RULE_TO_CUDA_FILE OFF)        
++file(GLOB GPU_UTILS_SOURCES *.cu dummy.cpp)
 +CUDA_ADD_LIBRARY(gpu_utils STATIC ${GPU_UTILS_SOURCES}
 +                 OPTIONS ${_os_def}
 +                 RELWITHDEBINFO -g
 +                 DEBUG -g -D_DEBUG_=1 )
 +
 +CUDA_BUILD_CLEAN_TARGET()
index 0000000000000000000000000000000000000000,aa1d4efdf954f6cdb7152af6dde776ee0141f40d..aa1d4efdf954f6cdb7152af6dde776ee0141f40d
mode 000000,100644..100644
--- /dev/null
index fb84ca0e81185f2fc37c61f3cfdf2696205e45a9,0000000000000000000000000000000000000000..09c64613796c77da8e740e378b83fe945deb680e
mode 100644,000000..100644
--- /dev/null
@@@ -1,716 -1,0 +1,713 @@@
- #ifndef GMX_X86_SSE2
 +/*  -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
 + *
 + *
 + *                This source code is part of
 + *
 + *                 G   R   O   M   A   C   S
 + *
 + *          GROningen MAchine for Chemical Simulations
 + *
 + *                        VERSION 3.2.03
 + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
 + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 + * Copyright (c) 2001-2004, The GROMACS development team,
 + * check out http://www.gromacs.org for more information.
 +
 + * This program is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU General Public License
 + * as published by the Free Software Foundation; either version 2
 + * of the License, or (at your option) any later version.
 + *
 + * If you want to redistribute modifications, please consider that
 + * scientific software is very special. Version control is crucial -
 + * bugs must be traceable. We will be happy to consider code for
 + * inclusion in the official distribution, but derived work must not
 + * be called official GROMACS. Details are found in the README & COPYING
 + * files - if they are missing, get the official version at www.gromacs.org.
 + *
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the papers on the package - you can find them in the top README file.
 + *
 + * For more info, check our website at http://www.gromacs.org
 + *
 + * And Hey:
 + * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon
 + */
 +#ifdef HAVE_CONFIG_H
 +#include <config.h>
 +#endif
 +
 +#include <assert.h>
 +
 +#include <sys/types.h>
 +#include <math.h>
 +#include "typedefs.h"
 +#include "physics.h"
 +#include "smalloc.h"
 +#include "gmx_fatal.h"
 +#include "macros.h"
 +#include "vec.h"
 +#include "coulomb.h"
 +#include "calc_verletbuf.h"
 +#include "../mdlib/nbnxn_consts.h"
 +
 +/* Struct for unique atom type for calculating the energy drift.
 + * The atom displacement depends on mass and constraints.
 + * The energy jump for a given distance depends on the LJ type and q.
 + */
 +typedef struct
 +{
 +    real     mass; /* mass */
 +    int      type; /* type (used for LJ parameters) */
 +    real     q;    /* charge */
 +    int      con;  /* constrained? 1 if yes, else 0; if 1, use #DOF=2 instead of 3 */
 +    int      n;    /* total #atoms of this type in the system */
 +} verletbuf_atomtype_t;
 +
 +
 +void verletbuf_get_list_setup(gmx_bool bGPU,
 +                              verletbuf_list_setup_t *list_setup)
 +{
 +    list_setup->cluster_size_i     = NBNXN_CPU_CLUSTER_I_SIZE;
 +
 +    if (bGPU)
 +    {
 +        list_setup->cluster_size_j = NBNXN_GPU_CLUSTER_SIZE;
 +    }
 +    else
 +    {
-         int simd_width;
- #ifdef GMX_X86_AVX_256
-         simd_width = 256;
- #else
-         simd_width = 128;
++#ifndef GMX_NBNXN_SIMD
 +        list_setup->cluster_size_j = NBNXN_CPU_CLUSTER_I_SIZE;
 +#else
-         list_setup->cluster_size_j = simd_width/(sizeof(real)*8);
++        list_setup->cluster_size_j = GMX_NBNXN_SIMD_BITWIDTH/(sizeof(real)*8);
++#ifdef GMX_NBNXN_SIMD_2XNN
++        /* We assume the smallest cluster size to be on the safe side */
++        list_setup->cluster_size_j /= 2;
 +#endif
 +#endif
 +    }
 +}
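Worked example of the arithmetic above: with GMX_NBNXN_SIMD_BITWIDTH == 256 in single precision (sizeof(real) == 4), cluster_size_j = 256/(4*8) = 8, which the GMX_NBNXN_SIMD_2XNN case halves to 4; in double precision (sizeof(real) == 8) it is 256/(8*8) = 4.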
 +
 +static void add_at(verletbuf_atomtype_t **att_p,int *natt_p,
 +                   real mass,int type,real q,int con,int nmol)
 +{
 +    verletbuf_atomtype_t *att;
 +    int natt,i;
 +
 +    if (mass == 0)
 +    {
 +        /* Ignore massless particles */
 +        return;
 +    }
 +
 +    att  = *att_p;
 +    natt = *natt_p;
 +
 +    i = 0;
 +    while (i < natt &&
 +           !(mass == att[i].mass &&
 +             type == att[i].type &&
 +             q    == att[i].q &&
 +             con  == att[i].con))
 +    {
 +        i++;
 +    }
 +
 +    if (i < natt)
 +    {
 +        att[i].n += nmol;
 +    }
 +    else
 +    {
 +        (*natt_p)++;
 +        srenew(*att_p,*natt_p);
 +        (*att_p)[i].mass = mass;
 +        (*att_p)[i].type = type;
 +        (*att_p)[i].q    = q;
 +        (*att_p)[i].con  = con;
 +        (*att_p)[i].n    = nmol;
 +    }
 +}
 +
 +static void get_verlet_buffer_atomtypes(const gmx_mtop_t *mtop,
 +                                        verletbuf_atomtype_t **att_p,
 +                                        int *natt_p,
 +                                        int *n_nonlin_vsite)
 +{
 +    verletbuf_atomtype_t *att;
 +    int natt;
 +    int mb,nmol,ft,i,j,a1,a2,a3,a;
 +    const t_atoms *atoms;
 +    const t_ilist *il;
 +    const t_atom *at;
 +    const t_iparams *ip;
 +    real *con_m,*vsite_m,cam[5];
 +
 +    att  = NULL;
 +    natt = 0;
 +
 +    if (n_nonlin_vsite != NULL)
 +    {
 +        *n_nonlin_vsite = 0;
 +    }
 +
 +    for(mb=0; mb<mtop->nmolblock; mb++)
 +    {
 +        nmol = mtop->molblock[mb].nmol;
 +
 +        atoms = &mtop->moltype[mtop->molblock[mb].type].atoms;
 +
 +        /* Check for constraints, as they affect the kinetic energy */
 +        snew(con_m,atoms->nr);
 +        snew(vsite_m,atoms->nr);
 +
 +        for(ft=F_CONSTR; ft<=F_CONSTRNC; ft++)
 +        {
 +            il = &mtop->moltype[mtop->molblock[mb].type].ilist[ft];
 +
 +            for(i=0; i<il->nr; i+=1+NRAL(ft))
 +            {
 +                a1 = il->iatoms[i+1];
 +                a2 = il->iatoms[i+2];
 +                con_m[a1] += atoms->atom[a2].m;
 +                con_m[a2] += atoms->atom[a1].m;
 +            }
 +        }
 +
 +        il = &mtop->moltype[mtop->molblock[mb].type].ilist[F_SETTLE];
 +
 +        for(i=0; i<il->nr; i+=1+NRAL(F_SETTLE))
 +        {
 +            a1 = il->iatoms[i+1];
 +            a2 = il->iatoms[i+2];
 +            a3 = il->iatoms[i+3];
 +            con_m[a1] += atoms->atom[a2].m + atoms->atom[a3].m;
 +            con_m[a2] += atoms->atom[a1].m + atoms->atom[a3].m;
 +            con_m[a3] += atoms->atom[a1].m + atoms->atom[a2].m;
 +        }
 +
 +        /* Check for virtual sites, determine mass from constructing atoms */
 +        for(ft=0; ft<F_NRE; ft++)
 +        {
 +            if (IS_VSITE(ft))
 +            {
 +                il = &mtop->moltype[mtop->molblock[mb].type].ilist[ft];
 +
 +                for(i=0; i<il->nr; i+=1+NRAL(ft))
 +                {
 +                    ip = &mtop->ffparams.iparams[il->iatoms[i]];
 +
 +                    a1 = il->iatoms[i+1];
 +
 +                    for(j=1; j<NRAL(ft); j++)
 +                    {
 +                        cam[j] = atoms->atom[il->iatoms[i+1+j]].m;
 +                        if (cam[j] == 0)
 +                        {
 +                            cam[j] = vsite_m[il->iatoms[i+1+j]];
 +                        }
 +                        if (cam[j] == 0)
 +                        {
 +                            gmx_fatal(FARGS,"In molecule type '%s' %s construction involves atom %d, which is a virtual site of equal or higher complexity. This is not supported.",
 +                                      *mtop->moltype[mtop->molblock[mb].type].name,
 +                                      interaction_function[ft].longname,
 +                                      il->iatoms[i+1+j]+1);
 +                        }
 +                    }
 +
 +                    switch(ft)
 +                    {
 +                    case F_VSITE2:
 +                        /* Exact except for ignoring constraints */
 +                        vsite_m[a1] = (cam[2]*sqr(1-ip->vsite.a) + cam[1]*sqr(ip->vsite.a))/(cam[1]*cam[2]);
 +                        break;
 +                    case F_VSITE3:
 +                        /* Exact except for ignoring constraints */
 +                        vsite_m[a1] = (cam[2]*cam[3]*sqr(1-ip->vsite.a-ip->vsite.b) + cam[1]*cam[3]*sqr(ip->vsite.a) + cam[1]*cam[2]*sqr(ip->vsite.b))/(cam[1]*cam[2]*cam[3]);
 +                        break;
 +                    default:
 +                        /* Use the mass of the lightest constructing atom.
 +                         * This is an approximation.
 +                         * If the distance of the virtual site to the
 +                         * constructing atom is less than all distances
 +                         * between constructing atoms, this is a safe
 +                         * over-estimate of the displacement of the vsite.
 +                         * This condition holds for all H mass replacement
 +                         * vsite constructions, except for SP2/3
 +                         * groups. In SP3 groups one H will have an F_VSITE3
 +                         * construction, so even there the total drift
 +                         * estimation shouldn't be far off.
 +                         */
 +                        assert(j>=1);
 +                        vsite_m[a1] = cam[1];
 +                        for(j=2; j<NRAL(ft); j++)
 +                        {
 +                            vsite_m[a1] = min(vsite_m[a1],cam[j]);
 +                        }
 +                        if (n_nonlin_vsite != NULL)
 +                        {
 +                            *n_nonlin_vsite += nmol;
 +                        }
 +                        break;
 +                    }
 +                }
 +            }
 +        }
 +
 +        for(a=0; a<atoms->nr; a++)
 +        {
 +            at = &atoms->atom[a];
 +            /* We consider an atom constrained, #DOF=2, when it is
 +             * connected with constraints to one or more atoms with
 +             * total mass larger than 1.5 times that of the atom itself.
 +             */
 +            add_at(&att,&natt,
 +                   at->m,at->type,at->q,con_m[a] > 1.5*at->m,nmol);
 +        }
 +
 +        sfree(vsite_m);
 +        sfree(con_m);
 +    }
 +
 +    if (gmx_debug_at)
 +    {
 +        for(a=0; a<natt; a++)
 +        {
 +            fprintf(debug,"type %d: m %5.2f t %d q %6.3f con %d n %d\n",
 +                    a,att[a].mass,att[a].type,att[a].q,att[a].con,att[a].n);
 +        }
 +    }
 +
 +    *att_p  = att;
 +    *natt_p = natt;
 +}
 +
 +static void approx_2dof(real s2,real x,
 +                        real *shift,real *scale)
 +{
 +    /* A particle with 1 DOF constrained has 2 DOFs instead of 3.
 +     * This code is also used for particles with multiple constraints,
 +     * in which case we overestimate the displacement.
 +     * The 2DOF distribution is sqrt(pi/2)*erfc(r/(sqrt(2)*s))/(2*s).
 +     * We approximate this with scale*Gaussian(s,r+shift),
 +     * by matching the distribution value and derivative at x.
 +     * This is a tight overestimate for all r>=0 at any s and x.
 +     */
 +    real ex,er;
 +
 +    ex = exp(-x*x/(2*s2));
 +    er = gmx_erfc(x/sqrt(2*s2));
 +
 +    *shift = -x + sqrt(2*s2/M_PI)*ex/er;
 +    *scale = 0.5*M_PI*exp(ex*ex/(M_PI*er*er))*er;
 +}
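For reference, the matching conditions being solved above: requiring the 2-DOF density sqrt(pi/2)*erfc(r/(sqrt(2)*s))/(2*s) and the shifted Gaussian to have equal value and slope at r = x gives shift = -x + sqrt(2*s2/pi)*ex/er, exactly as computed; the scale factor then follows from the value condition.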
 +
 +static real ener_drift(const verletbuf_atomtype_t *att,int natt,
 +                       const gmx_ffparams_t *ffp,
 +                       real kT_fac,
 +                       real md_ljd,real md_ljr,real md_el,real dd_el,
 +                       real r_buffer,
 +                       real rlist,real boxvol)
 +{
 +    double drift_tot,pot1,pot2,pot;
 +    int    i,j;
 +    real   s2i,s2j,s2,s;
 +    int    ti,tj;
 +    real   md,dd;
 +    real   sc_fac,rsh;
 +    double c_exp,c_erfc;
 +
 +    drift_tot = 0;
 +
 +    /* Loop over the different atom type pairs */
 +    for(i=0; i<natt; i++)
 +    {
 +        s2i = kT_fac/att[i].mass;
 +        ti  = att[i].type;
 +
 +        for(j=i; j<natt; j++)
 +        {
 +            s2j = kT_fac/att[j].mass;
 +            tj = att[j].type;
 +
 +            /* Note that attractive and repulsive potentials for individual
 +             * pairs will partially cancel.
 +             */
 +            /* -dV/dr at the cut-off for LJ + Coulomb */
 +            md =
 +                md_ljd*ffp->iparams[ti*ffp->atnr+tj].lj.c6 +
 +                md_ljr*ffp->iparams[ti*ffp->atnr+tj].lj.c12 +
 +                md_el*att[i].q*att[j].q;
 +
 +            /* d2V/dr2 at the cut-off for Coulomb, we neglect LJ */
 +            dd = dd_el*att[i].q*att[j].q;
 +
 +            s2  = s2i + s2j;
 +
 +            rsh    = r_buffer;
 +            sc_fac = 1.0;
 +            /* For constraints: adapt r and scaling for the Gaussian */
 +            if (att[i].con)
 +            {
 +                real sh,sc;
 +                approx_2dof(s2i,r_buffer*s2i/s2,&sh,&sc);
 +                rsh    += sh;
 +                sc_fac *= sc;
 +            }
 +            if (att[j].con)
 +            {
 +                real sh,sc;
 +                approx_2dof(s2j,r_buffer*s2j/s2,&sh,&sc);
 +                rsh    += sh;
 +                sc_fac *= sc;
 +            }
 +
 +            /* Exact contribution of an atom pair with Gaussian displacement
 +             * with sigma s to the energy drift for a potential with
 +             * derivative -md and second derivative dd at the cut-off.
 +             * The only catch is that for potentials that change sign
 +             * near the cut-off there could be an unlucky compensation
 +             * of positive and negative energy drift.
 +             * Such potentials are extremely rare though.
 +             *
 +             * Note that pot has unit energy*length, as the linear
 +             * atom density still needs to be put in.
 +             */
 +            c_exp  = exp(-rsh*rsh/(2*s2))/sqrt(2*M_PI);
 +            c_erfc = 0.5*gmx_erfc(rsh/(sqrt(2*s2)));
 +            s      = sqrt(s2);
 +
 +            pot1 = sc_fac*
 +                md/2*((rsh*rsh + s2)*c_erfc - rsh*s*c_exp);
 +            pot2 = sc_fac*
 +                dd/6*(s*(rsh*rsh + 2*s2)*c_exp - rsh*(rsh*rsh + 3*s2)*c_erfc);
 +            pot = pot1 + pot2;
 +
 +            if (gmx_debug_at)
 +            {
 +                fprintf(debug,"n %d %d d s %.3f %.3f con %d md %8.1e dd %8.1e pot1 %8.1e pot2 %8.1e pot %8.1e\n",
 +                        att[i].n,att[j].n,sqrt(s2i),sqrt(s2j),
 +                        att[i].con+att[j].con,
 +                        md,dd,pot1,pot2,pot);
 +            }
 +
 +            /* Multiply by the number of atom pairs */
 +            if (j == i)
 +            {
 +                pot *= (double)att[i].n*(att[i].n - 1)/2;
 +            }
 +            else
 +            {
 +                pot *= (double)att[i].n*att[j].n;
 +            }
 +            /* We need the line density to get the energy drift of the system.
 +             * The effective average r^2 is close to (rlist+sigma)^2.
 +             */
 +            pot *= 4*M_PI*sqr(rlist + s)/boxvol;
 +
 +            /* Add the unsigned drift to avoid cancellation of errors */
 +            drift_tot += fabs(pot);
 +        }
 +    }
 +
 +    return drift_tot;
 +}
 +
 +static real surface_frac(int cluster_size,real particle_distance,real rlist)
 +{
 +    real d,area_rel;
 +
 +    if (rlist < 0.5*particle_distance)
 +    {
 +        /* We have non-overlapping spheres */
 +        return 1.0;
 +    }
 +
 +    /* Half the inter-particle distance relative to rlist */
 +    d = 0.5*particle_distance/rlist;
 +
 +    /* Determine the area of the surface at distance rlist to the closest
 +     * particle, relative to surface of a sphere of radius rlist.
 +     * The formulas below assume close to cubic cells for the pair search grid,
 +     * which the pair search code tries to achieve.
 +     * Note that in practice particle distances will not be delta distributed,
 +     * but have some spread, often involving shorter distances,
 +     * as e.g. O-H bonds in a water molecule. Thus the estimates below will
 +     * usually be slightly too high and thus conservative.
 +     */
 +    switch (cluster_size)
 +    {
 +    case 1:
 +        /* One particle: trivial */
 +        area_rel = 1.0;
 +        break;
 +    case 2:
 +        /* Two particles: two spheres at fractional distance 2*d */
 +        area_rel = 1.0 + d;
 +        break;
 +    case 4:
 +        /* We assume a perfect, symmetric tetrahedron geometry.
 +         * The surface around a tetrahedron is too complex for a full
 +         * analytical solution, so we use a Taylor expansion.
 +         */
 +        area_rel = (1.0 + 1/M_PI*(6*acos(1/sqrt(3))*d +
 +                                  sqrt(3)*d*d*(1.0 +
 +                                               5.0/18.0*d*d +
 +                                               7.0/45.0*d*d*d*d +
 +                                               83.0/756.0*d*d*d*d*d*d)));
 +        break;
 +    default:
 +        gmx_incons("surface_frac called with unsupported cluster_size");
 +        area_rel = 1.0;
 +    }
 +        
 +    return area_rel/cluster_size;
 +}
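Worked example: for cluster_size = 2 with particle_distance = 0.3 and rlist = 0.5 (nm), d = 0.5*0.3/0.5 = 0.3, so area_rel = 1.3 and the returned fraction is 1.3/2 = 0.65.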
 +
 +void calc_verlet_buffer_size(const gmx_mtop_t *mtop,real boxvol,
 +                             const t_inputrec *ir,real drift_target,
 +                             const verletbuf_list_setup_t *list_setup,
 +                             int *n_nonlin_vsite,
 +                             real *rlist)
 +{
 +    double resolution;
 +    char *env;
 +
 +    real particle_distance;
 +    real nb_clust_frac_pairs_not_in_list_at_cutoff;
 +
 +    verletbuf_atomtype_t *att=NULL;
 +    int  natt=-1,i;
 +    double reppow;
 +    real md_ljd,md_ljr,md_el,dd_el;
 +    real elfac;
 +    real kT_fac,mass_min;
 +    int  ib0,ib1,ib;
 +    real rb,rl;
 +    real drift;
 +
 +    /* Resolution of the buffer size */
 +    resolution = 0.001;
 +
 +    env = getenv("GMX_VERLET_BUFFER_RES");
 +    if (env != NULL)
 +    {
 +        sscanf(env,"%lf",&resolution);
 +    }
 +
 +    /* In an atom-wise pair-list there would be no pairs in the list
 +     * beyond the pair-list cut-off.
 +     * However, we use a pair-list of groups vs groups of atoms.
 +     * For groups of 4 atoms, matching the parallelism of SSE instructions,
 +     * only 10% of the atom pairs just beyond the cut-off are not in the list.
 +     * As this percentage increases slowly compared to the decrease of the
 +     * Gaussian displacement distribution over this range, we can simply
 +     * reduce the drift by this fraction.
 +     * For larger groups, e.g. of 8 atoms, this fraction will be lower,
 +     * so the buffer size will then be on the conservative (large) side.
 +     *
 +     * Note that the formulas used here do not take into account
 +     * cancellation of errors which could occur by missing both
 +     * attractive and repulsive interactions.
 +     *
 +     * The only major assumption is homogeneous particle distribution.
 +     * For an inhomogeneous system, such as a liquid-vapor system,
 +     * the buffer will be underestimated. The actual energy drift
 +     * will be higher by the factor: local/homogeneous particle density.
 +     *
 +     * The results of this estimate have been checked against simulations.
 +     * In most cases the real drift differs by less than a factor of 2.
 +     */
 +
 +    /* Worst case assumption: HCP packing of particles gives largest distance */
 +    particle_distance = pow(boxvol*sqrt(2)/mtop->natoms,1.0/3.0);
 +
 +    get_verlet_buffer_atomtypes(mtop,&att,&natt,n_nonlin_vsite);
 +    assert(att != NULL && natt >= 0);
 +
 +    if (debug)
 +    {
 +        fprintf(debug,"particle distance assuming HCP packing: %f nm\n",
 +                particle_distance);
 +        fprintf(debug,"energy drift atom types: %d\n",natt);
 +    }
 +
 +    reppow = mtop->ffparams.reppow;
 +    md_ljd = 0;
 +    md_ljr = 0;
 +    if (ir->vdwtype == evdwCUT)
 +    {
 +        /* -dV/dr of -r^-6 and r^-reppow */
 +        md_ljd = -6*pow(ir->rvdw,-7.0);
 +        md_ljr = reppow*pow(ir->rvdw,-(reppow+1));
 +        /* The contribution of the second derivative is negligible */
 +    }
 +    else
 +    {
 +        gmx_fatal(FARGS,"Energy drift calculation is only implemented for plain cut-off Lennard-Jones interactions");
 +    }
 +
 +    elfac = ONE_4PI_EPS0/ir->epsilon_r;
 +
 +    /* Determine md=-dV/dr and dd=d^2V/dr^2 */
 +    md_el = 0;
 +    dd_el = 0;
 +    if (ir->coulombtype == eelCUT || EEL_RF(ir->coulombtype))
 +    {
 +        real eps_rf,k_rf;
 +
 +        if (ir->coulombtype == eelCUT)
 +        {
 +            eps_rf = 1;
 +            k_rf = 0;
 +        }
 +        else
 +        {
 +            eps_rf = ir->epsilon_rf/ir->epsilon_r;
 +            if (eps_rf != 0)
 +            {
 +                k_rf = pow(ir->rcoulomb,-3.0)*(eps_rf - ir->epsilon_r)/(2*eps_rf + ir->epsilon_r);
 +            }
 +            else
 +            {
 +                /* epsilon_rf = infinity */
 +                k_rf = 0.5*pow(ir->rcoulomb,-3.0);
 +            }
 +        }
 +
 +        if (eps_rf > 0)
 +        {
 +            md_el = elfac*(pow(ir->rcoulomb,-2.0) - 2*k_rf*ir->rcoulomb);
 +        }
 +        dd_el = elfac*(2*pow(ir->rcoulomb,-3.0) + 2*k_rf);
 +    }
 +    else if (EEL_PME(ir->coulombtype) || ir->coulombtype == eelEWALD)
 +    {
 +        real b,rc,br;
 +
 +        b  = calc_ewaldcoeff(ir->rcoulomb,ir->ewald_rtol);
 +        rc = ir->rcoulomb;
 +        br = b*rc;
 +        md_el = elfac*(b*exp(-br*br)*M_2_SQRTPI/rc + gmx_erfc(br)/(rc*rc));
 +        dd_el = elfac/(rc*rc)*(2*b*(1 + br*br)*exp(-br*br)*M_2_SQRTPI + 2*gmx_erfc(br)/rc);
 +    }
 +    else
 +    {
 +        gmx_fatal(FARGS,"Energy drift calculation is only implemented for Reaction-Field and Ewald electrostatics");
 +    }
 +
 +    /* Determine the variance of the atomic displacement
 +     * over nstlist-1 steps: kT_fac
 +     * For inertial dynamics (not Brownian dynamics) the mass factor
 +     * is not included in kT_fac, it is added later.
 +     */
 +    if (ir->eI == eiBD)
 +    {
 +        /* Get the displacement distribution from the random component only.
 +         * With accurate integration the systematic (force) displacement
 +         * should be negligible (unless nstlist is extremely large, which
 +         * you wouldn't do anyhow).
 +         */
 +        kT_fac = 2*BOLTZ*ir->opts.ref_t[0]*(ir->nstlist-1)*ir->delta_t;
 +        if (ir->bd_fric > 0)
 +        {
 +            /* This is directly sigma^2 of the displacement */
 +            kT_fac /= ir->bd_fric;
 +
 +            /* Set the masses to 1 as kT_fac is the full sigma^2,
 +             * but we divide by m in ener_drift().
 +             */
 +            for(i=0; i<natt; i++)
 +            {
 +                att[i].mass = 1;
 +            }
 +        }
 +        else
 +        {
 +            real tau_t;
 +
 +            /* Per group tau_t is not implemented yet, use the maximum */
 +            tau_t = ir->opts.tau_t[0];
 +            for(i=1; i<ir->opts.ngtc; i++)
 +            {
 +                tau_t = max(tau_t,ir->opts.tau_t[i]);
 +            }
 +
 +            kT_fac *= tau_t;
 +            /* This kT_fac needs to be divided by the mass to get sigma^2 */
 +        }
 +    }
 +    else
 +    {
 +        kT_fac = BOLTZ*ir->opts.ref_t[0]*sqr((ir->nstlist-1)*ir->delta_t);
 +    }
 +
 +    mass_min = att[0].mass;
 +    for(i=1; i<natt; i++)
 +    {
 +        mass_min = min(mass_min,att[i].mass);
 +    }
 +
 +    if (debug)
 +    {
 +        fprintf(debug,"md_ljd %e md_ljr %e\n",md_ljd,md_ljr);
 +        fprintf(debug,"md_el %e dd_el %e\n",md_el,dd_el);
 +        fprintf(debug,"sqrt(kT_fac) %f\n",sqrt(kT_fac));
 +        fprintf(debug,"mass_min %f\n",mass_min);
 +    }
 +
 +    /* Search using bisection */
 +    ib0 = -1;
 +    /* The drift will be negligible at 5 times the max sigma */
 +    ib1 = (int)(5*2*sqrt(kT_fac/mass_min)/resolution) + 1;
 +    while (ib1 - ib0 > 1)
 +    {
 +        ib = (ib0 + ib1)/2;
 +        rb = ib*resolution;
 +        rl = max(ir->rvdw,ir->rcoulomb) + rb;
 +
 +        /* Calculate the average energy drift at the last step
 +         * of the nstlist steps at which the pair-list is used.
 +         */
 +        drift = ener_drift(att,natt,&mtop->ffparams,
 +                           kT_fac,
 +                           md_ljd,md_ljr,md_el,dd_el,rb,
 +                           rl,boxvol);
 +
 +        /* Correct for the fact that we are using a Ni x Nj particle pair list
 +         * and not a 1 x 1 particle pair list. This reduces the drift.
 +         */
 +        /* We don't have a formula for 8 (yet), use 4 which is conservative */
 +        nb_clust_frac_pairs_not_in_list_at_cutoff =
 +            surface_frac(min(list_setup->cluster_size_i,4),
 +                         particle_distance,rl)*
 +            surface_frac(min(list_setup->cluster_size_j,4),
 +                         particle_distance,rl);
 +        drift *= nb_clust_frac_pairs_not_in_list_at_cutoff;
 +
 +        /* Convert the drift to drift per unit time per atom */
 +        drift /= ir->nstlist*ir->delta_t*mtop->natoms;
 +
 +        if (debug)
 +        {
 +            fprintf(debug,"ib %3d %3d %3d rb %.3f %dx%d fac %.3f drift %f\n",
 +                    ib0,ib,ib1,rb,
 +                    list_setup->cluster_size_i,list_setup->cluster_size_j,
 +                    nb_clust_frac_pairs_not_in_list_at_cutoff,
 +                    drift);
 +        }
 +
 +        if (fabs(drift) > drift_target)
 +        {
 +            ib0 = ib;
 +        }
 +        else
 +        {
 +            ib1 = ib;
 +        }
 +    }
 +
 +    sfree(att);
 +
 +    *rlist = max(ir->rvdw,ir->rcoulomb) + ib1*resolution;
 +}
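A hypothetical call sequence for the two entry points in this file (mtop, ir and boxvol are assumed to be set up by the caller; the drift target is just an illustrative number):

    verletbuf_list_setup_t list_setup;
    int                    n_nonlin_vsite;
    real                   rlist;

    verletbuf_get_list_setup(FALSE, &list_setup);   /* CPU pair-list layout */
    calc_verlet_buffer_size(mtop, boxvol, ir, 0.005, &list_setup,
                            &n_nonlin_vsite, &rlist);
    /* rlist now holds max(rvdw,rcoulomb) plus the estimated buffer */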
index d84c2699fcce301a6a3020d5e5ebf4cab0ad2a6c,0000000000000000000000000000000000000000..496e3fd7e62a9bbcd6de93445b0be1b51b79a5b2
mode 100644,000000..100644
--- /dev/null
@@@ -1,156 -1,0 +1,156 @@@
- extern void set_pull_init(t_inputrec *ir,gmx_mtop_t *mtop,rvec *x,matrix box,
 +/*
 + * 
 + *                This source code is part of
 + * 
 + *                 G   R   O   M   A   C   S
 + * 
 + *          GROningen MAchine for Chemical Simulations
 + * 
 + *                        VERSION 3.2.0
 + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
 + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 + * Copyright (c) 2001-2004, The GROMACS development team,
 + * check out http://www.gromacs.org for more information.
 +
 + * This program is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU General Public License
 + * as published by the Free Software Foundation; either version 2
 + * of the License, or (at your option) any later version.
 + * 
 + * If you want to redistribute modifications, please consider that
 + * scientific software is very special. Version control is crucial -
 + * bugs must be traceable. We will be happy to consider code for
 + * inclusion in the official distribution, but derived work must not
 + * be called official GROMACS. Details are found in the README & COPYING
 + * files - if they are missing, get the official version at www.gromacs.org.
 + * 
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the papers on the package - you can find them in the top README file.
 + * 
 + * For more info, check our website at http://www.gromacs.org
 + * 
 + * And Hey:
 + * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon
 + */
 +
 +#ifndef _readir_h
 +#define _readir_h
 +
 +#include "typedefs.h"
 +#include "string2.h"
 +#include "readinp.h"
 +#include "grompp.h"
 +
 +enum { eshNONE, eshHBONDS, eshALLBONDS, eshHANGLES, eshALLANGLES, eshNR };
 +
 +static const char *constraints[eshNR+1]    = { 
 +  "none", "h-bonds", "all-bonds", "h-angles", "all-angles", NULL 
 +};
 +
 +enum { ecouplamVDWQ, ecouplamVDW, ecouplamQ, ecouplamNONE, ecouplamNR };
 +
 +static const char *couple_lam[ecouplamNR+1]    = { 
 +  "vdw-q", "vdw", "q", "none", NULL 
 +};
 +
 +typedef struct {
 +  int warnings;
 +  int nshake;
 +  char *include;
 +  char *define;
 +  gmx_bool bGenVel;
 +  gmx_bool bGenPairs;
 +  real tempi;
 +  int  seed;
 +  gmx_bool bOrire;
 +  gmx_bool bMorse;
 +  char *wall_atomtype[2];
 +  gmx_bool pull_start;
 +  char *couple_moltype;
 +  int  couple_lam0;
 +  int  couple_lam1;
 +  gmx_bool bCoupleIntra;
 +} t_gromppopts;
 +
 +
 +extern void init_ir(t_inputrec *ir, t_gromppopts *opts);
 +/* Initiate stuff */
 +
 +extern void check_ir(const char *mdparin,t_inputrec *ir, t_gromppopts *opts,
 +                   warninp_t wi);
 +/* Validate inputrec data.
 + * Fatal errors will be added to nerror.
 + */
 +extern int search_string(char *s,int ng,char *gn[]);
 +/* Returns the index of string s in the index groups */
 +
 +extern void double_check(t_inputrec *ir,matrix box,gmx_bool bConstr,
 +                       warninp_t wi);
 +/* Do more checks */
 +
 +extern void triple_check(const char *mdparin,t_inputrec *ir,gmx_mtop_t *sys,
 +                       warninp_t wi);
 +/* Do even more checks */
 +
 +extern void check_chargegroup_radii(const gmx_mtop_t *mtop,const t_inputrec *ir,
 +                                  rvec *x,
 +                                  warninp_t wi);
 +/* Even more checks, charge group radii vs. cut-off's only. */
 +
 +extern void get_ir(const char *mdparin,const char *mdparout,
 +                 t_inputrec *ir,t_gromppopts *opts,
 +                 warninp_t wi);
 +/* Read the input file, and retrieve data for inputrec.
 + * More data are read, but the are only evaluated when the next
 + * function is called. Also prints the input file back to mdparout.
 + */
 + 
 +extern void do_index(const char* mdparin, 
 +                   const char *ndx,
 +                   gmx_mtop_t *mtop,
 +                   gmx_bool bVerbose,
 +                   t_inputrec *ir,
 +                   rvec *v,
 +                   warninp_t wi);
 +/* Read the index file and assign grp numbers to atoms.
 + * If v is not NULL, the velocities will be scaled to the correct number
 + * of degrees of freedom.
 + */
 +
 +/* Routines In readpull.c */
 +
 +extern char **read_pullparams(int *ninp_p,t_inpfile **inp,
 +                            t_pull *pull,gmx_bool *bStart,
 +                            warninp_t wi);
 +/* Reads the pull parameters, returns a list of the pull group names */
 +
 +extern void make_pull_groups(t_pull *pull,char **pgnames,
 +                           t_blocka *grps,char **gnames);
 +/* Process the pull parameters after reading the index groups */
 +
++extern void set_pull_init(t_inputrec *ir,gmx_mtop_t *mtop,rvec *x,matrix box, real lambda,
 +                        const output_env_t oenv, gmx_bool bStart);
 +/* Prints the initial pull group distances in x.
 + * If bStart is set, the distance is added to the initial reference location.
 + */
 +
 +extern int str_nelem(const char *str,int maxptr,char *ptr[]);
 +/* helper function from readir.c to convert strings */
 +
 +extern void read_adressparams(int *ninp_p,t_inpfile **inp_p,t_adress *adress, warninp_t wi);
 +/* Reads in AdResS related parameters */
 +
 +extern void do_adress_index(t_adress *adress, gmx_groups_t *groups,char **gnames,t_grpopts *opts,warninp_t wi);
 +/* Generate adress groups */
 +
 +extern char **read_rotparams(int *ninp_p,t_inpfile **inp,t_rot *rot,warninp_t wi);
 +/* Reads enforced rotation parameters, returns a list of the rot group names */
 +
 +extern void make_rotation_groups(t_rot *rot,char **rotgnames,
 +                 t_blocka *grps,char **gnames);
 +/* Process the rotation parameters after reading the index groups */
 +
 +extern void set_reference_positions(t_rot *rot, gmx_mtop_t *mtop, rvec *x, matrix box,
 +        const char *fn, gmx_bool bSet, warninp_t wi);
 +
 +#endif        /* _readir_h */
index 4523eed2d95cdfecc1b7e33457256fb34b6f9308,0000000000000000000000000000000000000000..57a809ddd3f09f107f4d97bde1401425d7c1167a
mode 100644,000000..100644
--- /dev/null
@@@ -1,352 -1,0 +1,347 @@@
- void set_pull_init(t_inputrec *ir,gmx_mtop_t *mtop,rvec *x,matrix box,
 +/*
 + * 
 + *                This source code is part of
 + * 
 + *                 G   R   O   M   A   C   S
 + * 
 + *          GROningen MAchine for Chemical Simulations
 + * 
 + *                        VERSION 3.2.0
 + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
 + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 + * Copyright (c) 2001-2004, The GROMACS development team,
 + * check out http://www.gromacs.org for more information.
 +
 + * This program is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU General Public License
 + * as published by the Free Software Foundation; either version 2
 + * of the License, or (at your option) any later version.
 + * 
 + * If you want to redistribute modifications, please consider that
 + * scientific software is very special. Version control is crucial -
 + * bugs must be traceable. We will be happy to consider code for
 + * inclusion in the official distribution, but derived work must not
 + * be called official GROMACS. Details are found in the README & COPYING
 + * files - if they are missing, get the official version at www.gromacs.org.
 + * 
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the papers on the package - you can find them in the top README file.
 + * 
 + * For more info, check our website at http://www.gromacs.org
 + * 
 + * And Hey:
 + * GROwing Monsters And Cloning Shrimps
 + */
 +#ifdef HAVE_CONFIG_H
 +#include <config.h>
 +#endif
 +
 +#include <string.h>
 +#include <stdlib.h>
 +#include "sysstuff.h"
 +#include "princ.h"
 +#include "futil.h"
 +#include "statutil.h"
 +#include "vec.h"
 +#include "smalloc.h"
 +#include "typedefs.h"
 +#include "names.h"
 +#include "gmx_fatal.h"
 +#include "macros.h"
 +#include "index.h"
 +#include "symtab.h"
 +#include "readinp.h"
 +#include "readir.h"
 +#include <string.h>
 +#include "mdatoms.h"
 +#include "pbc.h"
 +#include "pull.h"
 +
 +
 +static char pulldim[STRLEN];
 +
 +static void string2dvec(char buf[], dvec nums)
 +{
 +  if (sscanf(buf,"%lf%lf%lf",&nums[0],&nums[1],&nums[2]) != 3)
 +    gmx_fatal(FARGS,"Expected three numbers at input line %s",buf);
 +}
 +
 +static void init_pullgrp(t_pullgrp *pg,char *wbuf,
 +                       gmx_bool bRef,int eGeom,char *s_vec)
 +{
 +  double d;
 +  int    n,m;
 +  dvec   vec;
 +
 +  pg->nweight = 0;
 +  while (sscanf(wbuf,"%lf %n",&d,&n) == 1) {
 +    if (pg->nweight % 100 == 0) {
 +      srenew(pg->weight,pg->nweight+100);
 +    }
 +    pg->weight[pg->nweight++] = d;
 +    wbuf += n;
 +  }
 +  if (!bRef) {
 +    if (eGeom == epullgDIST) {
 +      clear_dvec(vec);
 +    } else {
 +      string2dvec(s_vec,vec);
 +      if (eGeom == epullgDIR || eGeom == epullgCYL || 
 +        (eGeom == epullgPOS && dnorm(vec) != 0))
 +      /* Normalize the direction vector */
 +      dsvmul(1/dnorm(vec),vec,vec);
 +    }
 +    for(m=0; m<DIM; m++)
 +      pg->vec[m] = vec[m];
 +  }
 +}
 +
 +char **read_pullparams(int *ninp_p,t_inpfile **inp_p,
 +                     t_pull *pull,gmx_bool *bStart,
 +                     warninp_t wi) 
 +{
 +  int  ninp,nerror=0,i,nchar,ndim,nscan,m;
 +  t_inpfile *inp;
 +  const char *tmp;
 +  char **grpbuf;
 +  char dummy[STRLEN],buf[STRLEN],init[STRLEN];
 +  const char *init_def1="0.0",*init_def3="0.0 0.0 0.0";
 +  char wbuf[STRLEN],VecTemp[STRLEN];
 +  dvec vec;
 +
 +  t_pullgrp *pgrp;
 +
 +  ninp   = *ninp_p;
 +  inp    = *inp_p;
 +
 +  /* read pull parameters */
 +  CTYPE("Pull geometry: distance, direction, cylinder or position");
 +  EETYPE("pull_geometry",   pull->eGeom, epullg_names);
 +  CTYPE("Select components for the pull vector. default: Y Y Y");
 +  STYPE("pull_dim",         pulldim, "Y Y Y");
 +  CTYPE("Cylinder radius for dynamic reaction force groups (nm)");
 +  RTYPE("pull_r1",          pull->cyl_r1, 1.0);
 +  CTYPE("Switch from r1 to r0 in case of dynamic reaction force");
 +  RTYPE("pull_r0",          pull->cyl_r0, 1.5);
 +  RTYPE("pull_constr_tol",  pull->constr_tol, 1E-6);
 +  EETYPE("pull_start",      *bStart, yesno_names);
 +  ITYPE("pull_nstxout",     pull->nstxout, 10);
 +  ITYPE("pull_nstfout",     pull->nstfout,  1);
 +  CTYPE("Number of pull groups");
 +  ITYPE("pull_ngroups",     pull->ngrp,1);
 +
 +  if (pull->cyl_r1 > pull->cyl_r0) {
 +    warning_error(wi,"pull_r1 > pull_r0");
 +  }
 +
 +  if (pull->ngrp < 1) {
 +    gmx_fatal(FARGS,"pull_ngroups should be >= 1");
 +  }
 +  
 +  snew(pull->grp,pull->ngrp+1);
 +
 +  if (pull->eGeom == epullgPOS) {
 +    ndim = 3;
 +  } else {
 +    ndim = 1;
 +  }
 +
 +  /* pull group options */
 +  CTYPE("Group name, weight (default all 1), vector, init, rate (nm/ps), kJ/(mol*nm^2)");
 +  /* Read the pull groups */
 +  snew(grpbuf,pull->ngrp+1);
 +  for(i=0; i<pull->ngrp+1; i++) {
 +    pgrp = &pull->grp[i];
 +    snew(grpbuf[i],STRLEN);
 +    sprintf(buf,"pull_group%d",i);
 +    STYPE(buf,              grpbuf[i], "");
 +    sprintf(buf,"pull_weights%d",i);
 +    STYPE(buf,              wbuf, "");
 +    sprintf(buf,"pull_pbcatom%d",i);
 +    ITYPE(buf,              pgrp->pbcatom, 0);
 +    if (i > 0) {
 +      sprintf(buf,"pull_vec%d",i);
 +      STYPE(buf,              VecTemp, "0.0 0.0 0.0");
 +      sprintf(buf,"pull_init%d",i);
 +      STYPE(buf,              init, ndim==1 ? init_def1 : init_def3);
 +      nscan = sscanf(init,"%lf %lf %lf",&vec[0],&vec[1],&vec[2]);
 +      if (nscan != ndim) {
 +      fprintf(stderr,"ERROR: %s should have %d components\n",buf,ndim);
 +      nerror++;
 +      }
 +      for(m=0; m<DIM; m++) {
 +      pgrp->init[m] = (m<ndim ? vec[m] : 0.0);
 +      }
 +      sprintf(buf,"pull_rate%d",i);
 +      RTYPE(buf,              pgrp->rate, 0.0);
 +      sprintf(buf,"pull_k%d",i);
 +      RTYPE(buf,              pgrp->k, 0.0);
 +      sprintf(buf,"pull_kB%d",i);
 +      RTYPE(buf,              pgrp->kB, pgrp->k);
 +    }
 +
 +    /* Initialize the pull group */
 +    init_pullgrp(pgrp,wbuf,i==0,pull->eGeom,VecTemp);
 +  }
 +  
 +  *ninp_p   = ninp;
 +  *inp_p    = inp;
 +
 +  return grpbuf;
 +}
 +
 +void make_pull_groups(t_pull *pull,char **pgnames,t_blocka *grps,char **gnames)
 +{
 +  int  d,nchar,g,ig=-1,i;
 +  char *ptr,pulldim1[STRLEN];
 +  t_pullgrp *pgrp;
 +
 +  ptr = pulldim;
 +  i = 0;
 +  for(d=0; d<DIM; d++) {
 +    if (sscanf(ptr,"%s%n",pulldim1,&nchar) != 1)
 +      gmx_fatal(FARGS,"Fewer than 3 pull dimensions given in pull_dim: '%s'",
 +              pulldim);
 +    
 +    if (gmx_strncasecmp(pulldim1,"N",1) == 0) {
 +      pull->dim[d] = 0;
 +    } else if (gmx_strncasecmp(pulldim1,"Y",1) == 0) {
 +      pull->dim[d] = 1;
 +      i++;
 +    } else {
 +      gmx_fatal(FARGS,"Please use Y(ES) or N(O) for pull_dim only (not %s)",
 +              pulldim1);
 +    }
 +    ptr += nchar;
 +  }
 +  if (i == 0)
 +    gmx_fatal(FARGS,"All entries in pull_dim are N");
 +
 +  for(g=0; g<pull->ngrp+1; g++) {
 +    pgrp = &pull->grp[g];
 +    if (g == 0 && strcmp(pgnames[g],"") == 0) {
 +      pgrp->nat = 0;
 +    } else {
 +      ig = search_string(pgnames[g],grps->nr,gnames);
 +      pgrp->nat = grps->index[ig+1] - grps->index[ig];
 +    }
 +    if (pgrp->nat > 0) {
 +      fprintf(stderr,"Pull group %d '%s' has %d atoms\n",
 +            g,pgnames[g],pgrp->nat);
 +      snew(pgrp->ind,pgrp->nat);
 +      for(i=0; i<pgrp->nat; i++)
 +      pgrp->ind[i] = grps->a[grps->index[ig]+i];
 +
 +      if (pull->eGeom == epullgCYL && g == 0 && pgrp->nweight > 0)
 +      gmx_fatal(FARGS,"Weights are not supported for the reference group with cylinder pulling");
 +      if (pgrp->nweight > 0 && pgrp->nweight != pgrp->nat)
 +      gmx_fatal(FARGS,"Number of weights (%d) for pull group %d '%s' does not match the number of atoms (%d)",
 +                pgrp->nweight,g,pgnames[g],pgrp->nat);
 +
 +      if (pgrp->nat == 1) {
 +      /* No pbc is required for this group */
 +      pgrp->pbcatom = -1;
 +      } else {
 +      if (pgrp->pbcatom > 0) {
 +        pgrp->pbcatom -= 1;
 +      } else if (pgrp->pbcatom == 0) {
 +        pgrp->pbcatom = pgrp->ind[(pgrp->nat-1)/2];
 +      } else {
 +        /* Use cosine weighting */
 +        pgrp->pbcatom = -1;
 +      }
 +      }
 +
 +      if (g > 0 && pull->eGeom != epullgDIST) {
 +      for(d=0; d<DIM; d++) {
 +        if (pgrp->vec[d] != 0 && pull->dim[d] == 0) {
 +          gmx_fatal(FARGS,"ERROR: pull_vec%d has non-zero %c-component while pull_dim is N\n",g,'x'+d);
 +        }
 +      }
 +      }
 +
 +      if ((pull->eGeom == epullgDIR || pull->eGeom == epullgCYL) &&
 +        g > 0 && norm2(pgrp->vec) == 0)
 +      gmx_fatal(FARGS,"pull_vec%d cannot be zero with geometry %s",
 +                g,EPULLGEOM(pull->eGeom));
 +      if ((pull->eGeom == epullgPOS) && pgrp->rate != 0 &&
 +        g > 0 && norm2(pgrp->vec) == 0)
 +      gmx_fatal(FARGS,"pull_vec%d cannot be zero with geometry %s and non-zero rate",
 +                g,EPULLGEOM(pull->eGeom));
 +    } else {
 +      if (g == 0) {
 +      if (pull->eGeom == epullgCYL)
 +        gmx_fatal(FARGS,"Absolute reference groups are not supported with geometry %s",EPULLGEOM(pull->eGeom));
 +      } else {
 +      gmx_fatal(FARGS,"Pull group %d '%s' is empty",g,pgnames[g]);
 +      }
 +      pgrp->pbcatom = -1;
 +    }
 +  }
 +}
 +
-   real      lambda=0;
++void set_pull_init(t_inputrec *ir,gmx_mtop_t *mtop,rvec *x,matrix box,real lambda,
 +                 const output_env_t oenv,gmx_bool bStart)
 +{
 +  t_mdatoms *md;
 +  t_pull    *pull;
 +  t_pullgrp *pgrp;
 +  t_pbc     pbc;
 +  int       ndim,g,m;
 +  double    t_start,tinvrate;
-   /* need to pass in the correct masses if free energy is on*/
-   if (ir->efep)
-   {
-       lambda = ir->fepvals->all_lambda[efptMASS][ir->fepvals->init_fep_state];
-   }
 +  rvec      init;
 +  dvec      dr,dev;
 +
-     update_mdatoms(md,ir->fepvals->init_lambda);
-   
 +  init_pull(NULL,ir,0,NULL,mtop,NULL,oenv,lambda,FALSE,0); 
 +  md = init_mdatoms(NULL,mtop,ir->efep);
 +  atoms2md(mtop,ir,0,NULL,0,mtop->natoms,md);
 +  if (ir->efep)
++  {
++    update_mdatoms(md,lambda);
++  }
 +  pull = ir->pull;
 +  if (pull->eGeom == epullgPOS)
 +    ndim = 3;
 +  else
 +    ndim = 1;
 +
 +  set_pbc(&pbc,ir->ePBC,box);
 +
 +  t_start = ir->init_t + ir->init_step*ir->delta_t;
 +
 +  pull_calc_coms(NULL,pull,md,&pbc,t_start,x,NULL);
 +
 +  fprintf(stderr,"Pull group  natoms  pbc atom  distance at start     reference at t=0\n");
 +  for(g=0; g<pull->ngrp+1; g++) {
 +    pgrp = &pull->grp[g];
 +    fprintf(stderr,"%8d  %8d  %8d ",g,pgrp->nat,pgrp->pbcatom+1);
 +    copy_rvec(pgrp->init,init);
 +    clear_rvec(pgrp->init);
 +    if (g > 0) {
 +      if (pgrp->rate == 0)
 +      tinvrate = 0;
 +      else
 +      tinvrate = t_start/pgrp->rate;
 +      get_pullgrp_distance(pull,&pbc,g,0,dr,dev);
 +      for(m=0; m<DIM; m++) {
 +      if (m < ndim)
 +        fprintf(stderr," %6.3f",dev[m]);
 +      else
 +        fprintf(stderr,"       ");
 +      }
 +      fprintf(stderr," ");
 +      for(m=0; m<DIM; m++) {
 +      if (bStart)
 +        pgrp->init[m] = init[m] + dev[m]
 +          - tinvrate*(pull->eGeom==epullgPOS ? pgrp->vec[m] : 1);
 +      else
 +        pgrp->init[m] = init[m];
 +      if (m < ndim)
 +        fprintf(stderr," %6.3f",pgrp->init[m]);
 +      else
 +        fprintf(stderr,"       ");
 +      }
 +    }
 +    fprintf(stderr,"\n");
 +  }
 +}
index 6a561fa04b5711dbd0d2a6e996327030c620bb3d,0000000000000000000000000000000000000000..ed8ec05d1005ed4d4beb771cd86ab1aa9a630e04
mode 100644,000000..100644
--- /dev/null
@@@ -1,260 -1,0 +1,277 @@@
 +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
 + *
 + * 
 + * This file is part of GROMACS.
 + * Copyright (c) 2012-  
 + *
 + * Written by the Gromacs development team under coordination of
 + * David van der Spoel, Berk Hess, and Erik Lindahl.
 + *
 + * This library is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU Lesser General Public License
 + * as published by the Free Software Foundation; either version 2
 + * of the License, or (at your option) any later version.
 + *
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the research papers on the package. Check out http://www.gromacs.org
 + * 
 + * And Hey:
 + * Gnomes, ROck Monsters And Chili Sauce
 + */
 +#ifndef _gmx_x86_avx_256_h_
 +#define _gmx_x86_avx_256_h_
 +
 +
 +#include <immintrin.h>
 +#ifdef HAVE_X86INTRIN_H
 +#include <x86intrin.h> /* FMA */
 +#endif
 +
 +
 +#include <stdio.h>
 +
 +#include "types/simple.h"
 +
 +
 +#define gmx_mm_extract_epi32(x, imm) _mm_cvtsi128_si32(_mm_srli_si128((x), 4 * (imm)))
 +
 +#define _GMX_MM_BLEND256D(b3,b2,b1,b0) (((b3) << 3) | ((b2) << 2) | ((b1) << 1) | ((b0)))
 +#define _GMX_MM_PERMUTE(fp3,fp2,fp1,fp0) (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | ((fp0)))
 +#define _GMX_MM_PERMUTE256D(fp3,fp2,fp1,fp0) (((fp3) << 3) | ((fp2) << 2) | ((fp1) << 1) | ((fp0)))
 +#define _GMX_MM_PERMUTE128D(fp1,fp0)         (((fp1) << 1) | ((fp0)))
 +
 +
 +#define GMX_MM_TRANSPOSE2_PD(row0, row1) {           \
 +    __m128d __gmx_t1 = row0;                         \
 +    row0           = _mm_unpacklo_pd(row0,row1);     \
 +    row1           = _mm_unpackhi_pd(__gmx_t1,row1); \
 +}
 +
 +#define GMX_MM256_FULLTRANSPOSE4_PD(row0,row1,row2,row3) \
 +{                                                        \
 +    __m256d _t0, _t1, _t2, _t3;                          \
 +    _t0  = _mm256_unpacklo_pd((row0), (row1));           \
 +    _t1  = _mm256_unpackhi_pd((row0), (row1));           \
 +    _t2  = _mm256_unpacklo_pd((row2), (row3));           \
 +    _t3  = _mm256_unpackhi_pd((row2), (row3));           \
 +    row0 = _mm256_permute2f128_pd(_t0, _t2, 0x20);       \
 +    row1 = _mm256_permute2f128_pd(_t1, _t3, 0x20);       \
 +    row2 = _mm256_permute2f128_pd(_t0, _t2, 0x31);       \
 +    row3 = _mm256_permute2f128_pd(_t1, _t3, 0x31);       \
 +}
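A small sketch of what the 2x2 transpose macro above does (values arbitrary, shown low element first):

    __m128d r0 = _mm_set_pd(2.0, 1.0);   /* r0 = [1, 2] */
    __m128d r1 = _mm_set_pd(4.0, 3.0);   /* r1 = [3, 4] */
    GMX_MM_TRANSPOSE2_PD(r0, r1);        /* r0 = [1, 3], r1 = [2, 4] */

The 4x4 macro applies the same unpack step per 128-bit lane and then exchanges lanes with _mm256_permute2f128_pd.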
 +
 +#if (defined (_MSC_VER) || defined(__INTEL_COMPILER))
 +#  define gmx_mm_castsi128_ps(a) _mm_castsi128_ps(a)
 +#  define gmx_mm_castps_si128(a) _mm_castps_si128(a)
 +#  define gmx_mm_castps_ps128(a) (a)
 +#  define gmx_mm_castsi128_pd(a) _mm_castsi128_pd(a)
 +#  define gmx_mm_castpd_si128(a) _mm_castpd_si128(a)
 +#elif defined(__GNUC__)
 +#  define gmx_mm_castsi128_ps(a) ((__m128)(a))
 +#  define gmx_mm_castps_si128(a) ((__m128i)(a))
 +#  define gmx_mm_castps_ps128(a) ((__m128)(a))
 +#  define gmx_mm_castsi128_pd(a) ((__m128d)(a))
 +#  define gmx_mm_castpd_si128(a) ((__m128i)(a))
 +#else
 +static __m128  gmx_mm_castsi128_ps(__m128i a)
 +{
 +    return *(__m128 *) &a;
 +}
 +static __m128i gmx_mm_castps_si128(__m128 a)
 +{
 +    return *(__m128i *) &a;
 +}
 +static __m128  gmx_mm_castps_ps128(__m128 a)
 +{
 +    return *(__m128 *) &a;
 +}
 +static __m128d gmx_mm_castsi128_pd(__m128i a)
 +{
 +    return *(__m128d *) &a;
 +}
 +static __m128i gmx_mm_castpd_si128(__m128d a)
 +{
 +    return *(__m128i *) &a;
 +}
 +#endif
 +
 +static gmx_inline __m256
 +gmx_mm256_unpack128lo_ps(__m256 xmm1, __m256 xmm2)
 +{
 +    return _mm256_permute2f128_ps(xmm1,xmm2,0x20);
 +}
 +
 +static gmx_inline __m256
 +gmx_mm256_unpack128hi_ps(__m256 xmm1, __m256 xmm2)
 +{
 +    return _mm256_permute2f128_ps(xmm1,xmm2,0x31);
 +}
 +
 +static gmx_inline __m256
 +gmx_mm256_set_m128(__m128 hi, __m128 lo)
 +{
 +    return _mm256_insertf128_ps(_mm256_castps128_ps256(lo), hi, 0x1);
 +}
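For clarity, the helper above places lo in the low 128 bits of the result and hi in the high 128 bits:

    __m128 lo = _mm_set1_ps(1.0f);
    __m128 hi = _mm_set1_ps(2.0f);
    __m256 v  = gmx_mm256_set_m128(hi, lo);  /* v = [1,1,1,1, 2,2,2,2] */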
 +
 +
++static gmx_inline __m256
++gmx_mm256_load4_ps(float const * p)
++{
++    __m128 a;
++
++    a = _mm_load_ps(p);
++    return _mm256_insertf128_ps(_mm256_castps128_ps256(a), a, 0x1);
++}
++
++
 +static __m256d
 +gmx_mm256_unpack128lo_pd(__m256d xmm1, __m256d xmm2)
 +{
 +    return _mm256_permute2f128_pd(xmm1,xmm2,0x20);
 +}
 +
 +static __m256d
 +gmx_mm256_unpack128hi_pd(__m256d xmm1, __m256d xmm2)
 +{
 +    return _mm256_permute2f128_pd(xmm1,xmm2,0x31);
 +}
 +
 +static __m256d
 +gmx_mm256_set_m128d(__m128d hi, __m128d lo)
 +{
 +    return _mm256_insertf128_pd(_mm256_castpd128_pd256(lo), hi, 0x1);
 +}
 +
 +
++static __m128 gmx_mm256_sum4h_m128(__m256 x, __m256 y)
++{
++    __m256 sum;
++
++    sum = _mm256_add_ps(x,y);
++    return _mm_add_ps(_mm256_castps256_ps128(sum),_mm256_extractf128_ps(sum,0x1));
++}
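Element-wise, the helper above returns r[i] = x[i] + y[i] + x[i+4] + y[i+4] for i = 0..3, i.e. the two 128-bit halves of the 256-bit sum are folded into a single __m128.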
 +
 +
 +static void
 +gmx_mm_printxmm_ps(const char *s,__m128 xmm)
 +{
 +    float f[4];
 +
 +    _mm_storeu_ps(f,xmm);
 +    printf("%s: %15.10e %15.10e %15.10e %15.10e\n",s,f[0],f[1],f[2],f[3]);
 +}
 +
 +
 +static void
 +gmx_mm_printxmmsum_ps(const char *s,__m128 xmm)
 +{
 +    float f[4];
 +
 +    _mm_storeu_ps(f,xmm);
 +    printf("%s (sum): %15.10g\n",s,f[0]+f[1]+f[2]+f[3]);
 +}
 +
 +
 +static void
 +gmx_mm_printxmm_pd(const char *s,__m128d xmm)
 +{
 +    double f[2];
 +
 +    _mm_storeu_pd(f,xmm);
 +    printf("%s: %30.20e %30.20e\n",s,f[0],f[1]);
 +}
 +
 +static void
 +gmx_mm_printxmmsum_pd(const char *s,__m128d xmm)
 +{
 +    double f[2];
 +
 +    _mm_storeu_pd(f,xmm);
 +    printf("%s (sum): %15.10g\n",s,f[0]+f[1]);
 +}
 +
 +
 +static void
 +gmx_mm_printxmm_epi32(const char *s,__m128i xmmi)
 +{
 +    int i[4];
 +
 +    _mm_storeu_si128((__m128i *)i,xmmi);
 +    printf("%10s: %2d %2d %2d %2d\n",s,i[0],i[1],i[2],i[3]);
 +}
 +
 +static void
 +gmx_mm256_printymm_ps(const char *s,__m256 ymm)
 +{
 +    float f[8];
 +
 +    _mm256_storeu_ps(f,ymm);
 +    printf("%s: %12.7f %12.7f %12.7f %12.7f %12.7f %12.7f %12.7f %12.7f\n",s,f[0],f[1],f[2],f[3],f[4],f[5],f[6],f[7]);
 +}
 +
 +static void
 +gmx_mm256_printymmsum_ps(const char *s,__m256 ymm)
 +{
 +    float f[8];
 +
 +    _mm256_storeu_ps(f,ymm);
 +    printf("%s (sum): %15.10g\n",s,f[0]+f[1]+f[2]+f[3]+f[4]+f[5]+f[6]+f[7]);
 +}
 +
 +
 +static void
 +gmx_mm256_printymm_pd(const char *s,__m256d ymm)
 +{
 +    double f[4];
 +
 +    _mm256_storeu_pd(f,ymm);
 +    printf("%s: %16.12f %16.12f %16.12f %16.12f\n",s,f[0],f[1],f[2],f[3]);
 +}
 +
 +static void
 +gmx_mm256_printymmsum_pd(const char *s,__m256d ymm)
 +{
 +    double f[4];
 +
 +    _mm256_storeu_pd(f,ymm);
 +    printf("%s (sum): %15.10g\n",s,f[0]+f[1]+f[2]+f[3]);
 +}
 +
 +
 +
 +static void
 +gmx_mm256_printymm_epi32(const char *s,__m256i ymmi)
 +{
 +    int i[8];
 +
 +    _mm256_storeu_si256((__m256i *)i,ymmi);
 +    printf("%10s: %2d %2d %2d %2d %2d %2d %2d %2d\n",s,i[0],i[1],i[2],i[3],i[4],i[5],i[6],i[7]);
 +}
 +
 +
 +
 +static int gmx_mm_check_and_reset_overflow(void)
 +{
 +    int MXCSR;
 +    int sse_overflow;
 +
 +    MXCSR = _mm_getcsr();
 +    /* The overflow flag is bit 3 in the register */
 +    if (MXCSR & 0x0008)
 +    {
 +        sse_overflow = 1;
 +        /* Set the overflow flag to zero */
 +        MXCSR = MXCSR & 0xFFF7;
 +        _mm_setcsr(MXCSR);
 +    }
 +    else
 +    {
 +        sse_overflow = 0;
 +    }
 +
 +    return sse_overflow;
 +}
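A hypothetical use of the check above, e.g. after a single-precision kernel invocation:

    if (gmx_mm_check_and_reset_overflow())
    {
        fprintf(stderr, "SSE overflow occurred since the last check\n");
    }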
 +
 +
 +
 +#endif /* _gmx_x86_avx_256_h_ */
index 4bf927b9088659637e83d94cb9f44c6478db641d,0000000000000000000000000000000000000000..f1029da4150692bd371eab620528bc3df8406d18
mode 100644,000000..100644
--- /dev/null
@@@ -1,98 -1,0 +1,151 @@@
- /*! Nonbonded NxN kernel types: plain C, SSE/AVX, GPU CUDA, GPU emulation, etc */
- enum { nbkNotSet = 0, 
-        nbk4x4_PlainC, 
-        nbk4xN_X86_SIMD128,
-        nbk4xN_X86_SIMD256,
-        nbk8x8x8_CUDA,
-        nbk8x8x8_PlainC };
 +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
 + *
 + *
 + *                This source code is part of
 + *
 + *                 G   R   O   M   A   C   S
 + *
 + *          GROningen MAchine for Chemical Simulations
 + *
 + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
 + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 + * Copyright (c) 2001-2012, The GROMACS development team,
 + * check out http://www.gromacs.org for more information.
 + *
 + * This program is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU General Public License
 + * as published by the Free Software Foundation; either version 2
 + * of the License, or (at your option) any later version.
 + *
 + * If you want to redistribute modifications, please consider that
 + * scientific software is very special. Version control is crucial -
 + * bugs must be traceable. We will be happy to consider code for
 + * inclusion in the official distribution, but derived work must not
 + * be called official GROMACS. Details are found in the README & COPYING
 + * files - if they are missing, get the official version at www.gromacs.org.
 + *
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the papers on the package - you can find them in the top README file.
 + *
 + * For more info, check our website at http://www.gromacs.org
 + *
 + * And Hey:
 + * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon
 + */
 +
 +#ifndef NB_VERLET_H
 +#define NB_VERLET_H
 +
 +#include "nbnxn_pairlist.h"
 +#include "nbnxn_cuda_types_ext.h"
 +
 +#ifdef __cplusplus
 +extern "C" {
 +#endif
 +
-     nbnxn_search_t           nbs;   /* n vs n atom pair searching data          */
-     int                      ngrp;  /* number of interaction groups             */
-     nonbonded_verlet_group_t grp[2];/* local and non-local interaction group    */
++#ifdef GMX_X86_SSE2
++/* Use SIMD accelerated nbnxn search and kernels */
++#define GMX_NBNXN_SIMD
++
++#ifdef GMX_X86_AVX_256
++/* Change this define to 128 to use AVX-128 kernels with AVX-256 acceleration */
++#define GMX_NBNXN_SIMD_BITWIDTH  256
++#else
++#define GMX_NBNXN_SIMD_BITWIDTH  128
++#endif
++
++/* The nbnxn SIMD 4xN and 2x(N+N) kernels can be added independently.
++ * Currently the 2xNN SIMD kernels only make sense, and are only implemented,
++ * with AVX-256 in single precision, using a 4x4 cluster setup instead of 4x8.
++ */
++#define GMX_NBNXN_SIMD_4XN
++#if GMX_NBNXN_SIMD_BITWIDTH == 256 && !defined GMX_DOUBLE
++#define GMX_NBNXN_SIMD_2XNN
++#endif
++
++#endif
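++
++/* Editor's illustration, derived from the defines above (not part of the
++ * original change): per build configuration this yields
++ *   SSE2 / SSE4.1 / AVX-128      -> BITWIDTH 128, 4xN kernels only
++ *   AVX-256 in double precision  -> BITWIDTH 256, 4xN kernels only
++ *   AVX-256 in single precision  -> BITWIDTH 256, 4xN and 2xNN kernels
++ */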
++
++
++/*! Nonbonded NxN kernel types: plain C, CPU SIMD, GPU CUDA, GPU emulation */
++typedef enum
++{
++    nbnxnkNotSet = 0, 
++    nbnxnk4x4_PlainC, 
++    nbnxnk4xN_SIMD_4xN,
++    nbnxnk4xN_SIMD_2xNN,
++    nbnxnk8x8x8_CUDA,
++    nbnxnk8x8x8_PlainC,
++    nbnxnkNR
++} nbnxn_kernel_type;
++
++/* Note that _mm_... intrinsics can be converted to either SSE or AVX
++ * depending on compiler flags.
++ * For gcc we check for __AVX__.
++ * At least a check for icc should also be added (if there is a macro).
++ */
++static const char *nbnxn_kernel_name[nbnxnkNR] =
++  { "not set", "plain C",
++#if !(defined GMX_X86_SSE2)
++    "not available", "not available",
++#else
++#if GMX_NBNXN_SIMD_BITWIDTH == 128
++#if !(defined GMX_X86_AVX_128_FMA || defined __AVX__)
++#ifndef GMX_X86_SSE4_1
++    "SSE2", "SSE2",
++#else
++    "SSE4.1", "SSE4.1",
++#endif
++#else
++    "AVX-128", "AVX-128",
++#endif
++#else
++    "AVX-256",  "AVX-256",
++#endif
++#endif
++    "CUDA", "plain C" };
 +
 +enum { ewaldexclTable, ewaldexclAnalytical };
 +
 +/* Atom locality indicator: local, non-local, all, used for calls to:
 +   gridding, pair-search, force calculation, x/f buffer operations */
 +enum { eatLocal = 0, eatNonlocal = 1, eatAll  };
 +
 +#define LOCAL_A(x)               ((x) == eatLocal)
 +#define NONLOCAL_A(x)            ((x) == eatNonlocal)
 +#define LOCAL_OR_NONLOCAL_A(x)   (LOCAL_A(x) || NONLOCAL_A(x))
 +
 +/* Interaction locality indicator (used in pair-list search/calculations):
 +    - local interactions require local atom data and affect local output only;
 +    - non-local interactions require both local and non-local atom data and
 +      affect both local- and non-local output. */
 +enum { eintLocal = 0, eintNonlocal = 1 };
 +
 +#define LOCAL_I(x)               ((x) == eintLocal)
 +#define NONLOCAL_I(x)            ((x) == eintNonlocal)
 +
 +enum { enbvClearFNo, enbvClearFYes };
 +
 +typedef struct {
 +    nbnxn_pairlist_set_t nbl_lists;   /* pair list(s)                       */
 +    nbnxn_atomdata_t     *nbat;       /* atom data                          */
 +    int                  kernel_type; /* non-bonded kernel - see enum above */
 +    int                  ewald_excl;  /* Ewald exclusion - see enum above   */
 +} nonbonded_verlet_group_t;
 +
 +/* non-bonded data structure with Verlet-type cut-off */
 +typedef struct {
++    nbnxn_search_t           nbs;   /* n vs n atom pair searching data       */
++    int                      ngrp;  /* number of interaction groups          */
++    nonbonded_verlet_group_t grp[2];/* local and non-local interaction group */
 +
 +    gmx_bool         bUseGPU;          /* TRUE when GPU acceleration is used */
 +    nbnxn_cuda_ptr_t cu_nbv;           /* pointer to CUDA nb verlet data     */
 +    int              min_ci_balanced;  /* pair list balancing parameter
 +                                          used for the 8x8x8 CUDA kernels    */
 +} nonbonded_verlet_t;
 +
 +#ifdef __cplusplus
 +}
 +#endif
 +
 +#endif /* NB_VERLET_H */
index 46b55b15f7a46a6b001e5aa88d5b3145fee2da2d,0000000000000000000000000000000000000000..a7df6376fd7e1c587eb9306cf82f682531e6e9b8
mode 100644,000000..100644
--- /dev/null
@@@ -1,1665 -1,0 +1,1746 @@@
-     for(b=0; b<ncons; b++)
 +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
 + *
 + * 
 + *                This source code is part of
 + * 
 + *                 G   R   O   M   A   C   S
 + * 
 + *          GROningen MAchine for Chemical Simulations
 + * 
 + *                        VERSION 3.2.0
 + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
 + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 + * Copyright (c) 2001-2004, The GROMACS development team,
 + * check out http://www.gromacs.org for more information.
 + *
 + * This program is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU General Public License
 + * as published by the Free Software Foundation; either version 2
 + * of the License, or (at your option) any later version.
 + * 
 + * If you want to redistribute modifications, please consider that
 + * scientific software is very special. Version control is crucial -
 + * bugs must be traceable. We will be happy to consider code for
 + * inclusion in the official distribution, but derived work must not
 + * be called official GROMACS. Details are found in the README & COPYING
 + * files - if they are missing, get the official version at www.gromacs.org.
 + * 
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the papers on the package - you can find them in the top README file.
 + * 
 + * For more info, check our website at http://www.gromacs.org
 + * 
 + * And Hey:
 + * GROwing Monsters And Cloning Shrimps
 + */
 +/* This file is completely threadsafe - keep it that way! */
 +#ifdef HAVE_CONFIG_H
 +#include <config.h>
 +#endif
 +
 +#include <math.h>
 +#include "main.h"
 +#include "constr.h"
 +#include "copyrite.h"
 +#include "physics.h"
 +#include "vec.h"
 +#include "pbc.h"
 +#include "smalloc.h"
 +#include "mdrun.h"
 +#include "nrnb.h"
 +#include "domdec.h"
 +#include "partdec.h"
 +#include "mtop_util.h"
 +#include "gmxfio.h"
 +#include "gmx_omp_nthreads.h"
 +#include "gmx_omp.h"
 +
 +typedef struct {
 +    int b0;           /* first constraint for this thread */
 +    int b1;           /* b1-1 is the last constraint for this thread */
 +    int nind;         /* number of indices */
 +    int *ind;         /* constraint index for updating atom data */
 +    int nind_r;       /* number of indices */
 +    int *ind_r;       /* constraint index for updating atom data */
 +    int ind_nalloc;   /* allocation size of ind and ind_r */
 +    tensor vir_r_m_dr;/* temporary variable for virial calculation */
 +} lincs_thread_t;
 +
 +typedef struct gmx_lincsdata {
 +    int  ncg;         /* the global number of constraints */
 +    int  ncg_flex;    /* the global number of flexible constraints */
 +    int  ncg_triangle;/* the global number of constraints in triangles */
 +    int  nIter;       /* the number of iterations */
 +    int  nOrder;      /* the order of the matrix expansion */
 +    int  nc;          /* the number of constraints */
 +    int  nc_alloc;    /* the number we allocated memory for */
 +    int  ncc;         /* the number of constraint connections */
 +    int  ncc_alloc;   /* the number we allocated memory for */
 +    real matlam;      /* the FE lambda value used for filling blc and blmf */
 +    real *bllen0;     /* the reference distance in topology A */
 +    real *ddist;      /* the reference distance in top B - the r.d. in top A */
 +    int  *bla;        /* the atom pairs involved in the constraints */
 +    real *blc;        /* 1/sqrt(invmass1 + invmass2) */
 +    real *blc1;       /* as blc, but with all masses 1 */
 +    int  *blnr;       /* index into blbnb and blmf */
 +    int  *blbnb;      /* list of constraint connections */
 +    int  ntriangle;   /* the local number of constraints in triangles */
 +    int  *triangle;   /* the list of triangle constraints */
 +    int  *tri_bits;   /* the bits tell if the matrix element should be used */
 +    int  ncc_triangle;/* the number of constraint connections in triangles */
++    gmx_bool bCommIter; /* communicate before each LINCS iteration */
 +    real *blmf;       /* matrix of mass factors for constraint connections */
 +    real *blmf1;      /* as blmf, but with all masses 1 */
 +    real *bllen;      /* the reference bond length */
 +    int  nth;         /* The number of threads doing LINCS */
 +    lincs_thread_t *th; /* LINCS thread division */
 +    unsigned *atf;    /* atom flags for thread parallelization */
 +    int  atf_nalloc;  /* allocation size of atf */
 +    /* arrays for temporary storage in the LINCS algorithm */
 +    rvec *tmpv;
 +    real *tmpncc;
 +    real *tmp1;
 +    real *tmp2;
 +    real *tmp3;
 +    real *tmp4;
 +    real *mlambda;  /* the Lagrange multipliers * -1 */
 +    /* storage for the constraint RMS relative deviation output */
 +    real rmsd_data[3];
 +} t_gmx_lincsdata;
 +
 +real *lincs_rmsd_data(struct gmx_lincsdata *lincsd)
 +{
 +    return lincsd->rmsd_data;
 +}
 +
 +real lincs_rmsd(struct gmx_lincsdata *lincsd,gmx_bool bSD2)
 +{
 +    if (lincsd->rmsd_data[0] > 0)
 +    {
 +        return sqrt(lincsd->rmsd_data[bSD2 ? 2 : 1]/lincsd->rmsd_data[0]);
 +    }
 +    else
 +    {
 +        return 0;
 +    }
 +}
 +
 +/* Do a set of nrec LINCS matrix multiplications.
 + * This function will return with up to date thread-local
 + * constraint data, without an OpenMP barrier.
 + */
 +static void lincs_matrix_expand(const struct gmx_lincsdata *lincsd,
 +                                int b0,int b1,
 +                                const real *blcc,
 +                                real *rhs1,real *rhs2,real *sol)
 +{
 +    int  nrec,rec,b,j,n,nr0,nr1;
 +    real mvb,*swap;
 +    int  ntriangle,tb,bits;
 +    const int *blnr=lincsd->blnr,*blbnb=lincsd->blbnb;
 +    const int *triangle=lincsd->triangle,*tri_bits=lincsd->tri_bits;
 +    
 +    ntriangle = lincsd->ntriangle;
 +    nrec      = lincsd->nOrder;
 +    
 +    for(rec=0; rec<nrec; rec++)
 +    {
 +#pragma omp barrier
 +        for(b=b0; b<b1; b++)
 +        {
 +            mvb = 0;
 +            for(n=blnr[b]; n<blnr[b+1]; n++)
 +            {
 +                j = blbnb[n];
 +                mvb = mvb + blcc[n]*rhs1[j];
 +            }
 +            rhs2[b] = mvb;
 +            sol[b]  = sol[b] + mvb;
 +        }
 +        swap = rhs1;
 +        rhs1 = rhs2;
 +        rhs2 = swap;
 +    } /* nrec*(ncons+2*nrtot) flops */
 +    
 +    if (ntriangle > 0)
 +    {
 +        /* Perform nrec extra recursions for only the constraints
 +         * involved in rigid triangles.
 +         * In this way their accuracy should come close to that of the
 +         * other constraints, since triangles of constraints can produce
 +         * eigenvalues around 0.7, while the effective eigenvalue for bond
 +         * constraints is around 0.4 (and 0.7*0.7 is about 0.5).
 +         */
 +        /* We need to copy the temporary array, since only the elements
 +         * for constraints involved in triangles are updated and then
 +         * the pointers are swapped. This saves copying the whole array.
 +         * We need a barrier as other threads might still be reading
 +         * from rhs2.
 +         */
 +#pragma omp barrier
 +        for(b=b0; b<b1; b++)
 +        {
 +            rhs2[b] = rhs1[b];
 +        }
 +#pragma omp barrier
 +#pragma omp master
 +        {
 +            for(rec=0; rec<nrec; rec++)
 +            {
 +                for(tb=0; tb<ntriangle; tb++)
 +                {
 +                    b    = triangle[tb];
 +                    bits = tri_bits[tb];
 +                    mvb = 0;
 +                    nr0 = blnr[b];
 +                    nr1 = blnr[b+1];
 +                    for(n=nr0; n<nr1; n++)
 +                    {
 +                        if (bits & (1<<(n-nr0)))
 +                        {
 +                            j = blbnb[n];
 +                            mvb = mvb + blcc[n]*rhs1[j];
 +                        }
 +                    }
 +                    rhs2[b] = mvb;
 +                    sol[b]  = sol[b] + mvb;
 +                }
 +                swap = rhs1;
 +                rhs1 = rhs2;
 +                rhs2 = swap;
 +            }
 +        } /* flops count is missing here */
 +
 +        /* We need a barrier here as the calling routine will continue
 +         * to operate on the thread-local constraints without barrier.
 +         */
 +#pragma omp barrier
 +    }
 +}
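 +
 +/* Editor's note: the recursions above implement the LINCS power-series
 + * approximation of the inverse coupling matrix,
 + *     sol ~= (I + A + A^2 + ... + A^nOrder) rhs,
 + * where A is the sparse matrix stored in blcc: each pass computes
 + * rhs2 = A*rhs1, accumulates it into sol and swaps the rhs buffers.
 + */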
 +
 +static void lincs_update_atoms_noind(int ncons,const int *bla,
 +                                     real prefac,
 +                                     const real *fac,rvec *r,
 +                                     const real *invmass,
 +                                     rvec *x)
 +{
 +    int  b,i,j;
 +    real mvb,im1,im2,tmp0,tmp1,tmp2;
 +
-         i = bla[2*b];
-         j = bla[2*b+1];
-         mvb = prefac*fac[b];
-         im1 = invmass[i];
-         im2 = invmass[j];
-         tmp0 = r[b][0]*mvb;
-         tmp1 = r[b][1]*mvb;
-         tmp2 = r[b][2]*mvb;
-         x[i][0] -= tmp0*im1;
-         x[i][1] -= tmp1*im1;
-         x[i][2] -= tmp2*im1;
-         x[j][0] += tmp0*im2;
-         x[j][1] += tmp1*im2;
-         x[j][2] += tmp2*im2;
-     } /* 16 ncons flops */
++    if (invmass != NULL)
 +    {
-     for(bi=0; bi<ncons; bi++)
++        for(b=0; b<ncons; b++)
++        {
++            i = bla[2*b];
++            j = bla[2*b+1];
++            mvb = prefac*fac[b];
++            im1 = invmass[i];
++            im2 = invmass[j];
++            tmp0 = r[b][0]*mvb;
++            tmp1 = r[b][1]*mvb;
++            tmp2 = r[b][2]*mvb;
++            x[i][0] -= tmp0*im1;
++            x[i][1] -= tmp1*im1;
++            x[i][2] -= tmp2*im1;
++            x[j][0] += tmp0*im2;
++            x[j][1] += tmp1*im2;
++            x[j][2] += tmp2*im2;
++        } /* 16 ncons flops */
++    }
++    else
++    {
++        for(b=0; b<ncons; b++)
++        {
++            i = bla[2*b];
++            j = bla[2*b+1];
++            mvb = prefac*fac[b];
++            tmp0 = r[b][0]*mvb;
++            tmp1 = r[b][1]*mvb;
++            tmp2 = r[b][2]*mvb;
++            x[i][0] -= tmp0;
++            x[i][1] -= tmp1;
++            x[i][2] -= tmp2;
++            x[j][0] += tmp0;
++            x[j][1] += tmp1;
++            x[j][2] += tmp2;
++        }
++    }
 +}
 +
 +static void lincs_update_atoms_ind(int ncons,const int *ind,const int *bla,
 +                                   real prefac,
 +                                   const real *fac,rvec *r,
 +                                   const real *invmass,
 +                                   rvec *x)
 +{
 +    int  bi,b,i,j;
 +    real mvb,im1,im2,tmp0,tmp1,tmp2;
 +
-         b = ind[bi];
-         i = bla[2*b];
-         j = bla[2*b+1];
-         mvb = prefac*fac[b];
-         im1 = invmass[i];
-         im2 = invmass[j];
-         tmp0 = r[b][0]*mvb;
-         tmp1 = r[b][1]*mvb;
-         tmp2 = r[b][2]*mvb;
-         x[i][0] -= tmp0*im1;
-         x[i][1] -= tmp1*im1;
-         x[i][2] -= tmp2*im1;
-         x[j][0] += tmp0*im2;
-         x[j][1] += tmp1*im2;
-         x[j][2] += tmp2*im2;
-     } /* 16 ncons flops */
++    if (invmass != NULL)
 +    {
-     if (econq != econqForce)
++        for(bi=0; bi<ncons; bi++)
++        {
++            b = ind[bi];
++            i = bla[2*b];
++            j = bla[2*b+1];
++            mvb = prefac*fac[b];
++            im1 = invmass[i];
++            im2 = invmass[j];
++            tmp0 = r[b][0]*mvb;
++            tmp1 = r[b][1]*mvb;
++            tmp2 = r[b][2]*mvb;
++            x[i][0] -= tmp0*im1;
++            x[i][1] -= tmp1*im1;
++            x[i][2] -= tmp2*im1;
++            x[j][0] += tmp0*im2;
++            x[j][1] += tmp1*im2;
++            x[j][2] += tmp2*im2;
++        } /* 16 ncons flops */
++    }
++    else
++    {
++        for(bi=0; bi<ncons; bi++)
++        {
++            b = ind[bi];
++            i = bla[2*b];
++            j = bla[2*b+1];
++            mvb = prefac*fac[b];
++            tmp0 = r[b][0]*mvb;
++            tmp1 = r[b][1]*mvb;
++            tmp2 = r[b][2]*mvb;
++            x[i][0] -= tmp0;
++            x[i][1] -= tmp1;
++            x[i][2] -= tmp2;
++            x[j][0] += tmp0;
++            x[j][1] += tmp1;
++            x[j][2] += tmp2;
++        } /* 16 ncons flops */
++    }
 +}
 +
 +static void lincs_update_atoms(struct gmx_lincsdata *li,int th,
 +                               real prefac,
 +                               const real *fac,rvec *r,
 +                               const real *invmass,
 +                               rvec *x)
 +{
 +    if (li->nth == 1)
 +    {
 +        /* Single thread, we simply update for all constraints */
 +        lincs_update_atoms_noind(li->nc,li->bla,prefac,fac,r,invmass,x);
 +    }
 +    else
 +    {
 +        /* Update the atom vector components for our thread local
 +         * constraints that only access our local atom range.
 +         * This can be done without a barrier.
 +         */
 +        lincs_update_atoms_ind(li->th[th].nind,li->th[th].ind,
 +                               li->bla,prefac,fac,r,invmass,x);
 +
 +        if (li->th[li->nth].nind > 0)
 +        {
 +            /* Update the constraints that operate on atoms
 +             * in multiple thread atom blocks on the master thread.
 +             */
 +#pragma omp barrier
 +#pragma omp master
 +            {    
 +                lincs_update_atoms_ind(li->th[li->nth].nind,
 +                                       li->th[li->nth].ind,
 +                                       li->bla,prefac,fac,r,invmass,x);
 +            }
 +        }
 +    }
 +}
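 +
 +/* Editor's note: constraints whose atoms are touched by a single thread
 + * are updated lock-free through th[th].ind; constraints spanning several
 + * threads' atom ranges are collected in th[nth].ind and applied by the
 + * master thread after a barrier (see lincs_thread_setup below).
 + */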
 +
 +/* LINCS projection, works on derivatives of the coordinates */
 +static void do_lincsp(rvec *x,rvec *f,rvec *fp,t_pbc *pbc,
 +                      struct gmx_lincsdata *lincsd,int th,
 +                      real *invmass,
 +                      int econq,real *dvdlambda,
 +                      gmx_bool bCalcVir,tensor rmdf)
 +{
 +    int     b0,b1,b,i,j,k,n;
 +    real    tmp0,tmp1,tmp2,im1,im2,mvb,rlen,len,wfac,lam;  
 +    rvec    dx;
 +    int     *bla,*blnr,*blbnb;
 +    rvec    *r;
 +    real    *blc,*blmf,*blcc,*rhs1,*rhs2,*sol;
 +
 +    b0 = lincsd->th[th].b0;
 +    b1 = lincsd->th[th].b1;
 +    
 +    bla    = lincsd->bla;
 +    r      = lincsd->tmpv;
 +    blnr   = lincsd->blnr;
 +    blbnb  = lincsd->blbnb;
 +    if (econq != econqForce)
 +    {
 +        /* Use mass-weighted parameters */
 +        blc  = lincsd->blc;
 +        blmf = lincsd->blmf; 
 +    }
 +    else
 +    {
 +        /* Use non mass-weighted parameters */
 +        blc  = lincsd->blc1;
 +        blmf = lincsd->blmf1;
 +    }
 +    blcc   = lincsd->tmpncc;
 +    rhs1   = lincsd->tmp1;
 +    rhs2   = lincsd->tmp2;
 +    sol    = lincsd->tmp3;
 +    
 +    /* Compute normalized i-j vectors */
 +    if (pbc)
 +    {
 +        for(b=b0; b<b1; b++)
 +        {
 +            pbc_dx_aiuc(pbc,x[bla[2*b]],x[bla[2*b+1]],dx);
 +            unitv(dx,r[b]);
 +        }
 +    }
 +    else
 +    {
 +        for(b=b0; b<b1; b++)
 +        {
 +            rvec_sub(x[bla[2*b]],x[bla[2*b+1]],dx);
 +            unitv(dx,r[b]);
 +        } /* 16 ncons flops */
 +    }
 +    
 +#pragma omp barrier
 +    for(b=b0; b<b1; b++)
 +    {
 +        tmp0 = r[b][0];
 +        tmp1 = r[b][1];
 +        tmp2 = r[b][2];
 +        i = bla[2*b];
 +        j = bla[2*b+1];
 +        for(n=blnr[b]; n<blnr[b+1]; n++)
 +        {
 +            k = blbnb[n];
 +            blcc[n] = blmf[n]*(tmp0*r[k][0] + tmp1*r[k][1] + tmp2*r[k][2]); 
 +        } /* 6 nr flops */
 +        mvb = blc[b]*(tmp0*(f[i][0] - f[j][0]) +
 +                      tmp1*(f[i][1] - f[j][1]) +    
 +                      tmp2*(f[i][2] - f[j][2]));
 +        rhs1[b] = mvb;
 +        sol[b]  = mvb;
 +        /* 7 flops */
 +    }
 +    /* Together: 23*ncons + 6*nrtot flops */
 +    
 +    lincs_matrix_expand(lincsd,b0,b1,blcc,rhs1,rhs2,sol);
 +    /* nrec*(ncons+2*nrtot) flops */
 +    
 +    if (econq == econqDeriv_FlexCon)
 +    {
 +        /* We only want to constrain the flexible constraints,
 +         * so we mask out the normal ones by setting sol to 0.
 +         */
 +        for(b=b0; b<b1; b++)
 +        {
 +            if (!(lincsd->bllen0[b] == 0 && lincsd->ddist[b] == 0))
 +            {
 +                sol[b] = 0;
 +            }
 +        }
 +    }
 +
-         lincs_update_atoms(lincsd,th,1.0,sol,r,invmass,fp);
++    /* We multiply sol by blc, so we can use lincs_update_atoms for OpenMP */
++    for(b=b0; b<b1; b++)
 +    {
-     else
++        sol[b] *= blc[b];
 +    }
-             i = bla[2*b];
-             j = bla[2*b+1];
-             mvb = blc[b]*sol[b];
-             tmp0 = r[b][0]*mvb;
-             tmp1 = r[b][1]*mvb;
-             tmp2 = r[b][2]*mvb;
-             fp[i][0] -= tmp0;
-             fp[i][1] -= tmp1;
-             fp[i][2] -= tmp2;
-             fp[j][0] += tmp0;
-             fp[j][1] += tmp1;
-             fp[j][2] += tmp2;
-         }
-         if (dvdlambda != NULL)
-         {
- #pragma omp barrier
-             for(b=b0; b<b1; b++)
-             {
-                 *dvdlambda -= blc[b]*sol[b]*lincsd->ddist[b];
-             }
++
++    /* When constraining forces, we should not use mass weighting,
++     * so we pass invmass=NULL, which results in the use of 1 for all atoms.
++     */
++    lincs_update_atoms(lincsd,th,1.0,sol,r,
++                       (econq != econqForce) ? invmass : NULL,fp);
++
++    if (dvdlambda != NULL)
 +    {
++#pragma omp barrier
 +        for(b=b0; b<b1; b++)
 +        {
-             mvb = lincsd->bllen[b]*blc[b]*sol[b];
++            *dvdlambda -= sol[b]*lincsd->ddist[b];
 +        }
 +        /* 10 ncons flops */
 +    }
 +
 +    if (bCalcVir)
 +    {
 +        /* Constraint virial,
 +         * determines sum r_bond x delta f,
 +         * where delta f is the constraint correction
 +         * of the quantity that is being constrained.
 +         */
 +        for(b=b0; b<b1; b++)
 +        {
-         if ((DOMAINDECOMP(cr) && cr->dd->constraints) ||
++            mvb = lincsd->bllen[b]*sol[b];
 +            for(i=0; i<DIM; i++)
 +            {
 +                tmp1 = mvb*r[b][i];
 +                for(j=0; j<DIM; j++)
 +                {
 +                    rmdf[i][j] += tmp1*r[b][j];
 +                }
 +            }
 +        } /* 23 ncons flops */
 +    }
 +}
 +
 +static void do_lincs(rvec *x,rvec *xp,matrix box,t_pbc *pbc,
 +                     struct gmx_lincsdata *lincsd,int th,
 +                     real *invmass,
 +                     t_commrec *cr,
 +                     gmx_bool bCalcLambda,
 +                     real wangle,int *warn,
 +                     real invdt,rvec *v,
 +                     gmx_bool bCalcVir,tensor vir_r_m_dr)
 +{
 +    int     b0,b1,b,i,j,k,n,iter;
 +    real    tmp0,tmp1,tmp2,im1,im2,mvb,rlen,len,len2,dlen2,wfac;
 +    rvec    dx;
 +    int     *bla,*blnr,*blbnb;
 +    rvec    *r;
 +    real    *blc,*blmf,*bllen,*blcc,*rhs1,*rhs2,*sol,*blc_sol,*mlambda;
 +    int     *nlocat;
 +
 +    b0 = lincsd->th[th].b0;
 +    b1 = lincsd->th[th].b1;
 +
 +    bla    = lincsd->bla;
 +    r      = lincsd->tmpv;
 +    blnr   = lincsd->blnr;
 +    blbnb  = lincsd->blbnb;
 +    blc    = lincsd->blc;
 +    blmf   = lincsd->blmf;
 +    bllen  = lincsd->bllen;
 +    blcc   = lincsd->tmpncc;
 +    rhs1   = lincsd->tmp1;
 +    rhs2   = lincsd->tmp2;
 +    sol    = lincsd->tmp3;
 +    blc_sol= lincsd->tmp4;
 +    mlambda= lincsd->mlambda;
 +    
 +    if (DOMAINDECOMP(cr) && cr->dd->constraints)
 +    {
 +        nlocat = dd_constraints_nlocalatoms(cr->dd);
 +    }
 +    else if (PARTDECOMP(cr))
 +    {
 +        nlocat = pd_constraints_nlocalatoms(cr->pd);
 +    }
 +    else
 +    {
 +        nlocat = NULL;
 +    }
 +
 +    if (pbc)
 +    {
 +        /* Compute normalized i-j vectors */
 +        for(b=b0; b<b1; b++)
 +        {
 +            pbc_dx_aiuc(pbc,x[bla[2*b]],x[bla[2*b+1]],dx);
 +            unitv(dx,r[b]);
 +        }
 +#pragma omp barrier
 +        for(b=b0; b<b1; b++)
 +        {
 +            for(n=blnr[b]; n<blnr[b+1]; n++)
 +            {
 +                blcc[n] = blmf[n]*iprod(r[b],r[blbnb[n]]);
 +            }
 +            pbc_dx_aiuc(pbc,xp[bla[2*b]],xp[bla[2*b+1]],dx);
 +            mvb = blc[b]*(iprod(r[b],dx) - bllen[b]);
 +            rhs1[b] = mvb;
 +            sol[b]  = mvb;
 +        }
 +    }
 +    else
 +    {
 +        /* Compute normalized i-j vectors */
 +        for(b=b0; b<b1; b++)
 +        {
 +            i = bla[2*b];
 +            j = bla[2*b+1];
 +            tmp0 = x[i][0] - x[j][0];
 +            tmp1 = x[i][1] - x[j][1];
 +            tmp2 = x[i][2] - x[j][2];
 +            rlen = gmx_invsqrt(tmp0*tmp0+tmp1*tmp1+tmp2*tmp2);
 +            r[b][0] = rlen*tmp0;
 +            r[b][1] = rlen*tmp1;
 +            r[b][2] = rlen*tmp2;
 +        } /* 16 ncons flops */
 +
 +#pragma omp barrier
 +        for(b=b0; b<b1; b++)
 +        {
 +            tmp0 = r[b][0];
 +            tmp1 = r[b][1];
 +            tmp2 = r[b][2];
 +            len = bllen[b];
 +            i = bla[2*b];
 +            j = bla[2*b+1];
 +            for(n=blnr[b]; n<blnr[b+1]; n++)
 +            {
 +                k = blbnb[n];
 +                blcc[n] = blmf[n]*(tmp0*r[k][0] + tmp1*r[k][1] + tmp2*r[k][2]); 
 +            } /* 6 nr flops */
 +            mvb = blc[b]*(tmp0*(xp[i][0] - xp[j][0]) +
 +                          tmp1*(xp[i][1] - xp[j][1]) +    
 +                          tmp2*(xp[i][2] - xp[j][2]) - len);
 +            rhs1[b] = mvb;
 +            sol[b]  = mvb;
 +            /* 10 flops */
 +        }
 +        /* Together: 26*ncons + 6*nrtot flops */
 +    }
 +    
 +    lincs_matrix_expand(lincsd,b0,b1,blcc,rhs1,rhs2,sol);
 +    /* nrec*(ncons+2*nrtot) flops */
 +
 +    for(b=b0; b<b1; b++)
 +    {
 +        mlambda[b] = blc[b]*sol[b]; 
 +    }
 +
 +    /* Update the coordinates */
 +    lincs_update_atoms(lincsd,th,1.0,mlambda,r,invmass,xp);
 +
 +    /*
 +     ********  Correction for centripetal effects  ********
 +     */
 +
 +    wfac = cos(DEG2RAD*wangle);
 +    wfac = wfac*wfac;
 +      
 +    for(iter=0; iter<lincsd->nIter; iter++)
 +    {
-     
-     li->nIter  = nIter;
-     li->nOrder = nProjOrder;
++        if ((lincsd->bCommIter && DOMAINDECOMP(cr) && cr->dd->constraints) ||
 +            PARTDECOMP(cr))
 +        {
 +#pragma omp barrier
 +#pragma omp master
 +            {
 +                /* Communicate the corrected non-local coordinates */
 +                if (DOMAINDECOMP(cr))
 +                {
 +                    dd_move_x_constraints(cr->dd,box,xp,NULL);
 +                }
 +                else
 +                {
 +                    pd_move_x_constraints(cr,xp,NULL);
 +                }
 +            }
 +        }
 +        
 +#pragma omp barrier
 +        for(b=b0; b<b1; b++)
 +        {
 +            len = bllen[b];
 +            if (pbc)
 +            {
 +                pbc_dx_aiuc(pbc,xp[bla[2*b]],xp[bla[2*b+1]],dx);
 +            }
 +            else
 +            {
 +                rvec_sub(xp[bla[2*b]],xp[bla[2*b+1]],dx);
 +            }
 +            len2 = len*len;
 +            dlen2 = 2*len2 - norm2(dx);
 +            if (dlen2 < wfac*len2 && (nlocat==NULL || nlocat[b]))
 +            {
 +                *warn = b;
 +            }
 +            if (dlen2 > 0)
 +            {
 +                mvb = blc[b]*(len - dlen2*gmx_invsqrt(dlen2));
 +            }
 +            else
 +            {
 +                mvb = blc[b]*len;
 +            }
 +            rhs1[b] = mvb;
 +            sol[b]  = mvb;
 +        } /* 20*ncons flops */
 +        
 +        lincs_matrix_expand(lincsd,b0,b1,blcc,rhs1,rhs2,sol);
 +        /* nrec*(ncons+2*nrtot) flops */
 +
 +        for(b=b0; b<b1; b++)
 +        {
 +            mvb = blc[b]*sol[b];
 +            blc_sol[b]  = mvb;
 +            mlambda[b] += mvb;
 +        }
 +
 +        /* Update the coordinates */
 +        lincs_update_atoms(lincsd,th,1.0,blc_sol,r,invmass,xp);
 +    }
 +    /* nit*ncons*(37+9*nrec) flops */
 +
 +    if (v != NULL)
 +    {
 +        /* Update the velocities */
 +        lincs_update_atoms(lincsd,th,invdt,mlambda,r,invmass,v);
 +        /* 16 ncons flops */
 +    }
 +    
 +    if (nlocat != NULL && bCalcLambda)
 +    {
 +        /* In lincs_update_atoms threads might cross-read mlambda */
 +#pragma omp barrier
 +
 +        /* Only account for local atoms */
 +        for(b=b0; b<b1; b++)
 +        {
 +            mlambda[b] *= 0.5*nlocat[b];
 +        }
 +    }
 +
 +    if (bCalcVir)
 +    {
 +        /* Constraint virial */
 +        for(b=b0; b<b1; b++)
 +        {
 +            tmp0 = -bllen[b]*mlambda[b];
 +            for(i=0; i<DIM; i++)
 +            {
 +                tmp1 = tmp0*r[b][i];
 +                for(j=0; j<DIM; j++)
 +                {
 +                    vir_r_m_dr[i][j] -= tmp1*r[b][j];
 +                }
 +            }
 +        } /* 22 ncons flops */
 +    }
 +    
 +    /* Total:
 +     * 26*ncons + 6*nrtot + nrec*(ncons+2*nrtot)
 +     * + nit * (20*ncons + nrec*(ncons+2*nrtot) + 17*ncons)
 +     *
 +     * = (26+nrec)*ncons + (6+2*nrec)*nrtot
 +     *   + nit * ((37+nrec)*ncons + 2*nrec*nrtot)
 +     * for nit=1:
 +     * = (63+2*nrec)*ncons + (6+4*nrec)*nrtot
 +     */
 +}
 +
 +void set_lincs_matrix(struct gmx_lincsdata *li,real *invmass,real lambda)
 +{
 +    int i,a1,a2,n,k,sign,center;
 +    int end,nk,kk;
 +    const real invsqrt2=0.7071067811865475244;
 +    
 +    for(i=0; (i<li->nc); i++)
 +    {
 +        a1 = li->bla[2*i];
 +        a2 = li->bla[2*i+1];
 +        li->blc[i]  = gmx_invsqrt(invmass[a1] + invmass[a2]);
 +        li->blc1[i] = invsqrt2;
 +    }
 +    
 +    /* Construct the coupling coefficient matrix blmf */
 +    li->ntriangle = 0;
 +    li->ncc_triangle = 0;
 +    for(i=0; (i<li->nc); i++)
 +    {
 +        a1 = li->bla[2*i];
 +        a2 = li->bla[2*i+1];
 +        for(n=li->blnr[i]; (n<li->blnr[i+1]); n++)
 +        {
 +            k = li->blbnb[n];
 +            if (a1 == li->bla[2*k] || a2 == li->bla[2*k+1])
 +            {
 +                sign = -1;
 +            }
 +            else
 +            {
 +                sign = 1;
 +            }
 +            if (a1 == li->bla[2*k] || a1 == li->bla[2*k+1])
 +            {
 +                center = a1;
 +                end    = a2;
 +            }
 +            else
 +            {
 +                center = a2;
 +                end    = a1;
 +            }
 +            li->blmf[n]  = sign*invmass[center]*li->blc[i]*li->blc[k];
 +            li->blmf1[n] = sign*0.5;
 +            if (li->ncg_triangle > 0)
 +            {
 +                /* Look for constraint triangles */
 +                for(nk=li->blnr[k]; (nk<li->blnr[k+1]); nk++)
 +                {
 +                    kk = li->blbnb[nk];
 +                    if (kk != i && kk != k &&
 +                        (li->bla[2*kk] == end || li->bla[2*kk+1] == end))
 +                    {
 +                        if (li->ntriangle == 0 || 
 +                            li->triangle[li->ntriangle-1] < i)
 +                        {
 +                            /* Add this constraint to the triangle list */
 +                            li->triangle[li->ntriangle] = i;
 +                            li->tri_bits[li->ntriangle] = 0;
 +                            li->ntriangle++;
 +                            if (li->blnr[i+1] - li->blnr[i] > sizeof(li->tri_bits[0])*8 - 1)
 +                            {
 +                                gmx_fatal(FARGS,"A constraint is connected to %d constraints, which is more than the %d allowed for constraints participating in triangles",
 +                                          li->blnr[i+1] - li->blnr[i],
 +                                          (int)(sizeof(li->tri_bits[0])*8-1));
 +                            }
 +                        }
 +                        li->tri_bits[li->ntriangle-1] |= (1<<(n-li->blnr[i]));
 +                        li->ncc_triangle++;
 +                    }
 +                }
 +            }
 +        }
 +    }
 +    
 +    if (debug)
 +    {
 +        fprintf(debug,"Of the %d constraints %d participate in triangles\n",
 +                li->nc,li->ntriangle);
 +        fprintf(debug,"There are %d couplings of which %d in triangles\n",
 +                li->ncc,li->ncc_triangle);
 +    }
 +    
 +    /* Set matlam,
 +     * so we know with which lambda value the masses have been set.
 +     */
 +    li->matlam = lambda;
 +}
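 +
 +/* Editor's note: blmf[n] = sign*invmass[center]*blc[i]*blc[k] is the
 + * mass-weighted coupling of two constraints sharing the atom "center";
 + * multiplied by the dot product of the two constraint directions it gives
 + * the off-diagonal element blcc used in lincs_matrix_expand. The sign
 + * depends on the relative orientation of the two constraints at the
 + * shared atom.
 + */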
 +
 +static int count_triangle_constraints(t_ilist *ilist,t_blocka *at2con)
 +{
 +    int  ncon1,ncon_tot;
 +    int  c0,a00,a01,n1,c1,a10,a11,ac1,n2,c2,a20,a21;
 +    int  ncon_triangle;
 +    gmx_bool bTriangle;
 +    t_iatom *ia1,*ia2,*iap;
 +    
 +    ncon1    = ilist[F_CONSTR].nr/3;
 +    ncon_tot = ncon1 + ilist[F_CONSTRNC].nr/3;
 +
 +    ia1 = ilist[F_CONSTR].iatoms;
 +    ia2 = ilist[F_CONSTRNC].iatoms;
 +    
 +    ncon_triangle = 0;
 +    for(c0=0; c0<ncon_tot; c0++)
 +    {
 +        bTriangle = FALSE;
 +        iap = constr_iatomptr(ncon1,ia1,ia2,c0);
 +        a00 = iap[1];
 +        a01 = iap[2];
 +        for(n1=at2con->index[a01]; n1<at2con->index[a01+1]; n1++)
 +        {
 +            c1 = at2con->a[n1];
 +            if (c1 != c0)
 +            {
 +                iap = constr_iatomptr(ncon1,ia1,ia2,c1);
 +                a10 = iap[1];
 +                a11 = iap[2];
 +                if (a10 == a01)
 +                {
 +                    ac1 = a11;
 +                }
 +                else
 +                {
 +                    ac1 = a10;
 +                }
 +                for(n2=at2con->index[ac1]; n2<at2con->index[ac1+1]; n2++)
 +                {
 +                    c2 = at2con->a[n2];
 +                    if (c2 != c0 && c2 != c1)
 +                    {
 +                        iap = constr_iatomptr(ncon1,ia1,ia2,c2);
 +                        a20 = iap[1];
 +                        a21 = iap[2];
 +                        if (a20 == a00 || a21 == a00)
 +                        {
 +                            bTriangle = TRUE;
 +                        }
 +                    }
 +                }
 +            }
 +        }
 +        if (bTriangle)
 +        {
 +            ncon_triangle++;
 +        }
 +    }
 +    
 +    return ncon_triangle;
 +}
 +
++static gmx_bool more_than_two_sequential_constraints(const t_ilist *ilist,
++                                                     const t_blocka *at2con)
++{
++    t_iatom  *ia1,*ia2,*iap;
++    int      ncon1,ncon_tot,c;
++    int      a1,a2;
++    gmx_bool bMoreThanTwoSequentialConstraints;
++
++    ncon1    = ilist[F_CONSTR].nr/3;
++    ncon_tot = ncon1 + ilist[F_CONSTRNC].nr/3;
++
++    ia1 = ilist[F_CONSTR].iatoms;
++    ia2 = ilist[F_CONSTRNC].iatoms;
++
++    bMoreThanTwoSequentialConstraints = FALSE;
++    for(c=0; c<ncon_tot && !bMoreThanTwoSequentialConstraints; c++)
++    {
++        iap = constr_iatomptr(ncon1,ia1,ia2,c);
++        a1 = iap[1];
++        a2 = iap[2];
++        /* Check if this constraint has constraints connected at both atoms */
++        if (at2con->index[a1+1] - at2con->index[a1] > 1 &&
++            at2con->index[a2+1] - at2con->index[a2] > 1)
++        {
++            bMoreThanTwoSequentialConstraints = TRUE;
++        }
++    }
++
++    return bMoreThanTwoSequentialConstraints;
++}
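++
++/* Editor's example: with H-bond-only constraints every hydrogen carries
++ * exactly one constraint, so no constraint is connected at both of its
++ * atoms and this function returns FALSE; init_lincs then leaves bCommIter
++ * FALSE and skips the communication before each LINCS iteration.
++ */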
++
 +static int int_comp(const void *a,const void *b)
 +{
 +    return (*(int *)a) - (*(int *)b);
 +}
 +
 +gmx_lincsdata_t init_lincs(FILE *fplog,gmx_mtop_t *mtop,
 +                           int nflexcon_global,t_blocka *at2con,
 +                           gmx_bool bPLINCS,int nIter,int nProjOrder)
 +{
 +    struct gmx_lincsdata *li;
 +    int mb;
 +    gmx_moltype_t *molt;
 +    
 +    if (fplog)
 +    {
 +        fprintf(fplog,"\nInitializing%s LINear Constraint Solver\n",
 +                bPLINCS ? " Parallel" : "");
 +    }
 +    
 +    snew(li,1);
 +    
 +    li->ncg      =
 +        gmx_mtop_ftype_count(mtop,F_CONSTR) +
 +        gmx_mtop_ftype_count(mtop,F_CONSTRNC);
 +    li->ncg_flex = nflexcon_global;
 +    
++    li->nIter  = nIter;
++    li->nOrder = nProjOrder;
++
 +    li->ncg_triangle = 0;
++    li->bCommIter = FALSE;
 +    for(mb=0; mb<mtop->nmolblock; mb++)
 +    {
 +        molt = &mtop->moltype[mtop->molblock[mb].type];
 +        li->ncg_triangle +=
 +            mtop->molblock[mb].nmol*
 +            count_triangle_constraints(molt->ilist,
 +                                       &at2con[mtop->molblock[mb].type]);
++        if (bPLINCS && li->bCommIter == FALSE)
++        {
++            /* Check if we need to communicate not only before LINCS,
++             * but also before each iteration.
++             * The check for only two sequential constraints is only
++             * useful for the common case of H-bond only constraints.
++             * With more effort we could also make it useful for small
++             * molecules with nr. sequential constraints <= nOrder-1.
++             */
++            li->bCommIter = (li->nOrder < 1 || more_than_two_sequential_constraints(molt->ilist,&at2con[mtop->molblock[mb].type]));
++        }
++    }
++    if (debug && bPLINCS)
++    {
++        fprintf(debug,"PLINCS communication before each iteration: %d\n",
++                li->bCommIter);
 +    }
 +
 +    /* LINCS can run on any number of threads.
 +     * Currently the number is fixed for the whole simulation,
 +     * but it could be set in set_lincs().
 +     */
 +    li->nth = gmx_omp_nthreads_get(emntLINCS);
 +    if (li->nth == 1)
 +    {
 +        snew(li->th,1);
 +    }
 +    else
 +    {
 +        /* Allocate an extra element for "thread-overlap" constraints */
 +        snew(li->th,li->nth+1);
 +    }
 +    if (debug)
 +    {
 +        fprintf(debug,"LINCS: using %d threads\n",li->nth);
 +    }
 +
 +    if (bPLINCS || li->ncg_triangle > 0)
 +    {
 +        please_cite(fplog,"Hess2008a");
 +    }
 +    else
 +    {
 +        please_cite(fplog,"Hess97a");
 +    }
 +    
 +    if (fplog)
 +    {
 +        fprintf(fplog,"The number of constraints is %d\n",li->ncg);
 +        if (bPLINCS)
 +        {
 +            fprintf(fplog,"There are inter charge-group constraints,\n"
 +                    "will communicate selected coordinates each lincs iteration\n");
 +        }
 +        if (li->ncg_triangle > 0)
 +        {
 +            fprintf(fplog,
 +                    "%d constraints are involved in constraint triangles,\n"
 +                    "will apply an additional matrix expansion of order %d for couplings\n"
 +                    "between constraints inside triangles\n",
 +                    li->ncg_triangle,li->nOrder);
 +        }
 +    }
 +    
 +    return li;
 +}
 +
 +/* Sets up the work division over the threads */
 +static void lincs_thread_setup(struct gmx_lincsdata *li,int natoms)
 +{
 +    lincs_thread_t *li_m;
 +    int th;
 +    unsigned *atf;
 +    int a;
 +
 +    if (natoms > li->atf_nalloc)
 +    {
 +        li->atf_nalloc = over_alloc_large(natoms);
 +        srenew(li->atf,li->atf_nalloc);
 +    }
 +
 +    atf = li->atf;
 +    /* Clear the atom flags */
 +    for(a=0; a<natoms; a++)
 +    {
 +        atf[a] = 0;
 +    }
 +
 +    for(th=0; th<li->nth; th++)
 +    {
 +        lincs_thread_t *li_th;
 +        int b;
 +
 +        li_th = &li->th[th];
 +        
 +        /* The constraints are divided equally over the threads */
 +        li_th->b0 = (li->nc* th   )/li->nth;
 +        li_th->b1 = (li->nc*(th+1))/li->nth;
 +
 +        if (th < sizeof(*atf)*8)
 +        {
 +            /* For each atom set a flag for constraints from each thread */
 +            for(b=li_th->b0; b<li_th->b1; b++)
 +            {
 +                atf[li->bla[b*2]  ] |= (1U<<th);
 +                atf[li->bla[b*2+1]] |= (1U<<th);
 +            }
 +        }
 +    }
 +
 +#pragma omp parallel for num_threads(li->nth) schedule(static)
 +    for(th=0; th<li->nth; th++)
 +    {
 +        lincs_thread_t *li_th;
 +        unsigned mask;
 +        int b;
 +
 +        li_th = &li->th[th];
 +        
 +        if (li_th->b1 - li_th->b0 > li_th->ind_nalloc)
 +        {
 +            li_th->ind_nalloc = over_alloc_large(li_th->b1-li_th->b0);
 +            srenew(li_th->ind,li_th->ind_nalloc);
 +            srenew(li_th->ind_r,li_th->ind_nalloc);
 +        }
 +
 +        if (th < sizeof(*atf)*8)
 +        {
 +            mask = (1U<<th) - 1U;
 +
 +            li_th->nind   = 0;
 +            li_th->nind_r = 0;
 +            for(b=li_th->b0; b<li_th->b1; b++)
 +            {
 +                /* We let the constraint with the lowest thread index
 +                 * operate on atoms with constraints from multiple threads.
 +                 */
 +                if (((atf[li->bla[b*2]]   & mask) == 0) &&
 +                    ((atf[li->bla[b*2+1]] & mask) == 0))
 +                {
 +                    /* Add the constraint to the local atom update index */
 +                    li_th->ind[li_th->nind++] = b;
 +                }
 +                else
 +                {
 +                    /* Add the constraint to the rest block */
 +                    li_th->ind_r[li_th->nind_r++] = b;
 +                }
 +            }
 +        }
 +        else
 +        {
 +            /* We are out of bits, assign all constraints to rest */
 +            for(b=li_th->b0; b<li_th->b1; b++)
 +            {
 +                li_th->ind_r[li_th->nind_r++] = b;
 +            }
 +        }
 +    }
 +
 +    /* We need to copy all constraints which have not been assigned
 +     * to a thread to a separate list which will be handled by one thread.
 +     */
 +    li_m = &li->th[li->nth];
 +
 +    li_m->nind = 0;
 +    for(th=0; th<li->nth; th++)
 +    {
 +        lincs_thread_t *li_th;
 +        int b;
 +
 +        li_th   = &li->th[th];
 +
 +        if (li_m->nind + li_th->nind_r > li_m->ind_nalloc)
 +        {
 +            li_m->ind_nalloc = over_alloc_large(li_m->nind+li_th->nind_r);
 +            srenew(li_m->ind,li_m->ind_nalloc);
 +        }
 +
 +        for(b=0; b<li_th->nind_r; b++)
 +        {
 +            li_m->ind[li_m->nind++] = li_th->ind_r[b];
 +        }
 +
 +        if (debug)
 +        {
 +            fprintf(debug,"LINCS thread %d: %d constraints\n",
 +                    th,li_th->nind);
 +        }
 +    }
 +
 +    if (debug)
 +    {
 +        fprintf(debug,"LINCS thread r: %d constraints\n",
 +                li_m->nind);
 +    }
 +}
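 +
 +/* Editor's example of the flag scheme above: with 4 threads, an atom a
 + * constrained by threads 0 and 2 gets atf[a] = 0b0101; thread 2 uses
 + * mask = 0b011, so (atf[a] & mask) != 0 and its constraints on atom a
 + * land in ind_r, the "rest" list that one thread handles after a barrier.
 + */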
 +
 +
 +void set_lincs(t_idef *idef,t_mdatoms *md,
 +               gmx_bool bDynamics,t_commrec *cr,
 +               struct gmx_lincsdata *li)
 +{
 +    int      start,natoms,nflexcon;
 +    t_blocka at2con;
 +    t_iatom  *iatom;
 +    int      i,k,ncc_alloc,ni,con,nconnect,concon;
 +    int      type,a1,a2;
 +    real     lenA=0,lenB;
 +    gmx_bool     bLocal;
 +
 +    li->nc = 0;
 +    li->ncc = 0;
 +    /* Zero the thread index ranges.
 +     * Otherwise without local constraints we could return with old ranges.
 +     */
 +    for(i=0; i<li->nth; i++)
 +    {
 +        li->th[i].b0   = 0;
 +        li->th[i].b1   = 0;
 +        li->th[i].nind = 0;
 +    }
 +    if (li->nth > 1)
 +    {
 +        li->th[li->nth].nind = 0;
 +    }
 +
 +    /* This is the local topology, so there are only F_CONSTR constraints */
 +    if (idef->il[F_CONSTR].nr == 0)
 +    {
 +        /* There are no constraints,
 +         * we do not need to fill any data structures.
 +         */
 +        return;
 +    }
 +    
 +    if (debug)
 +    {
 +        fprintf(debug,"Building the LINCS connectivity\n");
 +    }
 +    
 +    if (DOMAINDECOMP(cr))
 +    {
 +        if (cr->dd->constraints)
 +        {
 +            dd_get_constraint_range(cr->dd,&start,&natoms);
 +        }
 +        else
 +        {
 +            natoms = cr->dd->nat_home;
 +        }
 +        start = 0;
 +    }
 +    else if (PARTDECOMP(cr))
 +    {
 +        pd_get_constraint_range(cr->pd,&start,&natoms);
 +    }
 +    else
 +    {
 +        start  = md->start;
 +        natoms = md->homenr;
 +    }
 +    at2con = make_at2con(start,natoms,idef->il,idef->iparams,bDynamics,
 +                         &nflexcon);
 +
 +    if (idef->il[F_CONSTR].nr/3 > li->nc_alloc || li->nc_alloc == 0)
 +    {
 +        li->nc_alloc = over_alloc_dd(idef->il[F_CONSTR].nr/3);
 +        srenew(li->bllen0,li->nc_alloc);
 +        srenew(li->ddist,li->nc_alloc);
 +        srenew(li->bla,2*li->nc_alloc);
 +        srenew(li->blc,li->nc_alloc);
 +        srenew(li->blc1,li->nc_alloc);
 +        srenew(li->blnr,li->nc_alloc+1);
 +        srenew(li->bllen,li->nc_alloc);
 +        srenew(li->tmpv,li->nc_alloc);
 +        srenew(li->tmp1,li->nc_alloc);
 +        srenew(li->tmp2,li->nc_alloc);
 +        srenew(li->tmp3,li->nc_alloc);
 +        srenew(li->tmp4,li->nc_alloc);
 +        srenew(li->mlambda,li->nc_alloc);
 +        if (li->ncg_triangle > 0)
 +        {
 +            /* This is allocating too much, but it is difficult to improve */
 +            srenew(li->triangle,li->nc_alloc);
 +            srenew(li->tri_bits,li->nc_alloc);
 +        }
 +    }
 +    
 +    iatom = idef->il[F_CONSTR].iatoms;
 +    
 +    ncc_alloc = li->ncc_alloc;
 +    li->blnr[0] = 0;
 +    
 +    ni = idef->il[F_CONSTR].nr/3;
 +
 +    con = 0;
 +    nconnect = 0;
 +    li->blnr[con] = nconnect;
 +    for(i=0; i<ni; i++)
 +    {
 +        bLocal = TRUE;
 +        type = iatom[3*i];
 +        a1   = iatom[3*i+1];
 +        a2   = iatom[3*i+2];
 +        lenA = idef->iparams[type].constr.dA;
 +        lenB = idef->iparams[type].constr.dB;
 +        /* Skip the flexible constraints when not doing dynamics */
 +        if (bDynamics || lenA!=0 || lenB!=0)
 +        {
 +            li->bllen0[con]  = lenA;
 +            li->ddist[con]   = lenB - lenA;
 +            /* Set the length to the topology A length */
 +            li->bllen[con]   = li->bllen0[con];
 +            li->bla[2*con]   = a1;
 +            li->bla[2*con+1] = a2;
 +            /* Construct the constraint connection matrix blbnb */
 +            for(k=at2con.index[a1-start]; k<at2con.index[a1-start+1]; k++)
 +            {
 +                concon = at2con.a[k];
 +                if (concon != i)
 +                {
 +                    if (nconnect >= ncc_alloc)
 +                    {
 +                        ncc_alloc = over_alloc_small(nconnect+1);
 +                        srenew(li->blbnb,ncc_alloc);
 +                    }
 +                    li->blbnb[nconnect++] = concon;
 +                }
 +            }
 +            for(k=at2con.index[a2-start]; k<at2con.index[a2-start+1]; k++)
 +            {
 +                concon = at2con.a[k];
 +                if (concon != i)
 +                {
 +                    if (nconnect+1 > ncc_alloc)
 +                    {
 +                        ncc_alloc = over_alloc_small(nconnect+1);
 +                        srenew(li->blbnb,ncc_alloc);
 +                    }
 +                    li->blbnb[nconnect++] = concon;
 +                }
 +            }
 +            li->blnr[con+1] = nconnect;
 +            
 +            if (cr->dd == NULL)
 +            {
 +                /* Order the blbnb matrix to optimize memory access */
 +                qsort(&(li->blbnb[li->blnr[con]]),li->blnr[con+1]-li->blnr[con],
 +                      sizeof(li->blbnb[0]),int_comp);
 +            }
 +            /* Increase the constraint count */
 +            con++;
 +        }
 +    }
 +    
 +    done_blocka(&at2con);
 +
 +    /* This is the real number of constraints;
 +     * without dynamics the flexible constraints are not present.
 +     */
 +    li->nc = con;
 +    
 +    li->ncc = li->blnr[con];
 +    if (cr->dd == NULL)
 +    {
 +        /* Since the matrix is static, we can free some memory */
 +        ncc_alloc = li->ncc;
 +        srenew(li->blbnb,ncc_alloc);
 +    }
 +    
 +    if (ncc_alloc > li->ncc_alloc)
 +    {
 +        li->ncc_alloc = ncc_alloc;
 +        srenew(li->blmf,li->ncc_alloc);
 +        srenew(li->blmf1,li->ncc_alloc);
 +        srenew(li->tmpncc,li->ncc_alloc);
 +    }
 +    
 +    if (debug)
 +    {
 +        fprintf(debug,"Number of constraints is %d, couplings %d\n",
 +                li->nc,li->ncc);
 +    }
 +
 +    if (li->nth == 1)
 +    {
 +        li->th[0].b0 = 0;
 +        li->th[0].b1 = li->nc;
 +    }
 +    else
 +    {
 +        lincs_thread_setup(li,md->nr);
 +    }
 +
 +    set_lincs_matrix(li,md->invmass,md->lambda);
 +}
 +
 +static void lincs_warning(FILE *fplog,
 +                          gmx_domdec_t *dd,rvec *x,rvec *xprime,t_pbc *pbc,
 +                          int ncons,int *bla,real *bllen,real wangle,
 +                          int maxwarn,int *warncount)
 +{
 +    int b,i,j;
 +    rvec v0,v1;
 +    real wfac,d0,d1,cosine;
 +    char buf[STRLEN];
 +    
 +    wfac=cos(DEG2RAD*wangle);
 +    
 +    sprintf(buf,"bonds that rotated more than %g degrees:\n"
 +            " atom 1 atom 2  angle  previous, current, constraint length\n",
 +            wangle);
 +    fprintf(stderr,"%s",buf);
 +    if (fplog)
 +    {
 +        fprintf(fplog,"%s",buf);
 +    }
 +    
 +    for(b=0;b<ncons;b++)
 +    {
 +        i = bla[2*b];
 +        j = bla[2*b+1];
 +        if (pbc)
 +        {
 +            pbc_dx_aiuc(pbc,x[i],x[j],v0);
 +            pbc_dx_aiuc(pbc,xprime[i],xprime[j],v1);
 +        }
 +        else
 +        {
 +            rvec_sub(x[i],x[j],v0);
 +            rvec_sub(xprime[i],xprime[j],v1);
 +        }
 +        d0 = norm(v0);
 +        d1 = norm(v1);
 +        cosine = iprod(v0,v1)/(d0*d1);
 +        if (cosine < wfac)
 +        {
 +            sprintf(buf," %6d %6d  %5.1f  %8.4f %8.4f    %8.4f\n",
 +                    ddglatnr(dd,i),ddglatnr(dd,j),
 +                    RAD2DEG*acos(cosine),d0,d1,bllen[b]);
 +            fprintf(stderr,"%s",buf);
 +            if (fplog)
 +            {
 +                fprintf(fplog,"%s",buf);
 +            }
 +            if (!gmx_isfinite(d1))
 +            {
 +                gmx_fatal(FARGS,"Bond length not finite.");
 +            }
 +
 +            (*warncount)++;
 +        }
 +    }
 +    if (*warncount > maxwarn)
 +    {
 +        too_many_constraint_warnings(econtLINCS,*warncount);
 +    }
 +}
 +
 +static void cconerr(gmx_domdec_t *dd,
 +                    int ncons,int *bla,real *bllen,rvec *x,t_pbc *pbc,
 +                    real *ncons_loc,real *ssd,real *max,int *imax)
 +{
 +    real      len,d,ma,ssd2,r2;
 +    int       *nlocat,count,b,im;
 +    rvec      dx;
 +    
 +    if (dd && dd->constraints)
 +    {
 +        nlocat = dd_constraints_nlocalatoms(dd);
 +    }
 +    else
 +    {
 +        nlocat = 0;
 +    }
 +    
 +    ma = 0;
 +    ssd2 = 0;
 +    im = 0;
 +    count = 0;
 +    for(b=0;b<ncons;b++)
 +    {
 +        if (pbc)
 +        {
 +            pbc_dx_aiuc(pbc,x[bla[2*b]],x[bla[2*b+1]],dx);
 +        }
 +        else
 +        {
 +            rvec_sub(x[bla[2*b]],x[bla[2*b+1]],dx);
 +        }
 +        r2 = norm2(dx);
 +        len = r2*gmx_invsqrt(r2);
 +        d = fabs(len/bllen[b]-1);
 +        if (d > ma && (nlocat==NULL || nlocat[b]))
 +        {
 +            ma = d;
 +            im = b;
 +        }
 +        if (nlocat == NULL)
 +        {
 +            ssd2 += d*d;
 +            count++;
 +        }
 +        else
 +        {
 +            ssd2 += nlocat[b]*d*d;
 +            count += nlocat[b];
 +        }
 +    }
 +    
 +    *ncons_loc = (nlocat ? 0.5 : 1)*count;
 +    *ssd       = (nlocat ? 0.5 : 1)*ssd2;
 +    *max = ma;
 +    *imax = im;
 +}
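 +
 +/* Editor's note: constrain_lincs stores the ncons_loc and ssd values
 + * computed here in rmsd_data[0] and rmsd_data[1] or [2], from which
 + * lincs_rmsd() at the top of this file reports sqrt(ssd/ncons_loc).
 + */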
 +
 +static void dump_conf(gmx_domdec_t *dd,struct gmx_lincsdata *li,
 +                      t_blocka *at2con,
 +                      char *name,gmx_bool bAll,rvec *x,matrix box)
 +{
 +    char str[STRLEN];
 +    FILE *fp;
 +    int  ac0,ac1,i;
 +    
 +    dd_get_constraint_range(dd,&ac0,&ac1);
 +    
 +    sprintf(str,"%s_%d_%d_%d.pdb",name,dd->ci[XX],dd->ci[YY],dd->ci[ZZ]);
 +    fp = gmx_fio_fopen(str,"w");
 +    fprintf(fp,"CRYST1%9.3f%9.3f%9.3f%7.2f%7.2f%7.2f P 1           1\n",
 +            10*norm(box[XX]),10*norm(box[YY]),10*norm(box[ZZ]),
 +            90.0,90.0,90.0);
 +    for(i=0; i<ac1; i++)
 +    {
 +        if (i < dd->nat_home || (bAll && i >= ac0 && i < ac1))
 +        {
 +            fprintf(fp,"%-6s%5u  %-4.4s%3.3s %c%4d    %8.3f%8.3f%8.3f%6.2f%6.2f\n",
 +                    "ATOM",ddglatnr(dd,i),"C","ALA",' ',i+1,
 +                    10*x[i][XX],10*x[i][YY],10*x[i][ZZ],
 +                    1.0,i<dd->nat_tot ? 0.0 : 1.0);
 +        }
 +    }
 +    if (bAll)
 +    {
 +        for(i=0; i<li->nc; i++)
 +        {
 +            fprintf(fp,"CONECT%5d%5d\n",
 +                    ddglatnr(dd,li->bla[2*i]),
 +                    ddglatnr(dd,li->bla[2*i+1]));
 +        }
 +    }
 +    gmx_fio_fclose(fp);
 +}
 +
 +gmx_bool constrain_lincs(FILE *fplog,gmx_bool bLog,gmx_bool bEner,
 +                         t_inputrec *ir,
 +                         gmx_large_int_t step,
 +                         struct gmx_lincsdata *lincsd,t_mdatoms *md,
 +                         t_commrec *cr, 
 +                         rvec *x,rvec *xprime,rvec *min_proj,
 +                         matrix box,t_pbc *pbc,
 +                         real lambda,real *dvdlambda,
 +                         real invdt,rvec *v,
 +                         gmx_bool bCalcVir,tensor vir_r_m_dr,
 +                         int econq,
 +                         t_nrnb *nrnb,
 +                         int maxwarn,int *warncount)
 +{
 +    char  buf[STRLEN],buf2[22],buf3[STRLEN];
 +    int   i,warn,p_imax,error;
 +    real  ncons_loc,p_ssd,p_max=0;
 +    rvec  dx;
 +    gmx_bool  bOK;
 +    
 +    bOK = TRUE;
 +    
 +    if (lincsd->nc == 0 && cr->dd == NULL)
 +    {
 +        if (bLog || bEner)
 +        {
 +            lincsd->rmsd_data[0] = 0;
 +            if (ir->eI == eiSD2 && v == NULL)
 +            {
 +                i = 2;
 +            }
 +            else
 +            {
 +                i = 1;
 +            }
 +            lincsd->rmsd_data[i] = 0;
 +        }
 +        
 +        return bOK;
 +    }
 +    
 +    if (econq == econqCoord)
 +    {
 +        if (ir->efep != efepNO)
 +        {
 +            if (md->nMassPerturbed && lincsd->matlam != md->lambda)
 +            {
 +                set_lincs_matrix(lincsd,md->invmass,md->lambda);
 +            }
 +            
 +            for(i=0; i<lincsd->nc; i++)
 +            {
 +                lincsd->bllen[i] = lincsd->bllen0[i] + lambda*lincsd->ddist[i];
 +            }
 +        }
 +        
 +        if (lincsd->ncg_flex)
 +        {
 +            /* Set the flexible constraint lengths to the old lengths */
 +            if (pbc != NULL)
 +            {
 +                for(i=0; i<lincsd->nc; i++)
 +                {
 +                    if (lincsd->bllen[i] == 0)
 +                    {
 +                        pbc_dx_aiuc(pbc,x[lincsd->bla[2*i]],x[lincsd->bla[2*i+1]],dx);
 +                        lincsd->bllen[i] = norm(dx);
 +                    }
 +                }
 +            }
 +            else
 +            {
 +                for(i=0; i<lincsd->nc; i++)
 +                {
 +                    if (lincsd->bllen[i] == 0)
 +                    {
 +                        lincsd->bllen[i] =
 +                            sqrt(distance2(x[lincsd->bla[2*i]],
 +                                           x[lincsd->bla[2*i+1]]));
 +                    }
 +                }
 +            }
 +        }
 +        
 +        if (bLog && fplog)
 +        {
 +            cconerr(cr->dd,lincsd->nc,lincsd->bla,lincsd->bllen,xprime,pbc,
 +                    &ncons_loc,&p_ssd,&p_max,&p_imax);
 +        }
 +
 +        /* This warn var can be updated by multiple threads
 +         * at the same time. But as we only need to detect
 +         * if a warning occurred or not, this is not an issue.
 +         */
 +        warn = -1;
 +
 +        /* The OpenMP parallel region of constrain_lincs for coords */
 +#pragma omp parallel num_threads(lincsd->nth)
 +        {
 +            int th=gmx_omp_get_thread_num();
 +
 +            clear_mat(lincsd->th[th].vir_r_m_dr);
 +
 +            do_lincs(x,xprime,box,pbc,lincsd,th,
 +                     md->invmass,cr,
 +                     bCalcVir || (ir->efep != efepNO),
 +                     ir->LincsWarnAngle,&warn,
 +                     invdt,v,bCalcVir,
 +                     th==0 ? vir_r_m_dr : lincsd->th[th].vir_r_m_dr);
 +        }
 +
 +        if (ir->efep != efepNO)
 +        {
 +            real dt_2,dvdl=0;
 +            
 +            dt_2 = 1.0/(ir->delta_t*ir->delta_t);
 +            for(i=0; (i<lincsd->nc); i++)
 +            {
 +                dvdl -= lincsd->mlambda[i]*dt_2*lincsd->ddist[i];
 +            }
 +            *dvdlambda += dvdl;
 +        }
 +        
 +        if (bLog && fplog && lincsd->nc > 0)
 +        {
 +            fprintf(fplog,"   Rel. Constraint Deviation:  RMS         MAX     between atoms\n");
 +            fprintf(fplog,"       Before LINCS          %.6f    %.6f %6d %6d\n",
 +                    sqrt(p_ssd/ncons_loc),p_max,
 +                    ddglatnr(cr->dd,lincsd->bla[2*p_imax]),
 +                    ddglatnr(cr->dd,lincsd->bla[2*p_imax+1]));
 +        }
 +        if (bLog || bEner)
 +        {
 +            cconerr(cr->dd,lincsd->nc,lincsd->bla,lincsd->bllen,xprime,pbc,
 +                    &ncons_loc,&p_ssd,&p_max,&p_imax);
 +            /* Check if we are doing the second part of SD */
 +            if (ir->eI == eiSD2 && v == NULL)
 +            {
 +                i = 2;
 +            }
 +            else
 +            {
 +                i = 1;
 +            }
 +            lincsd->rmsd_data[0] = ncons_loc;
 +            lincsd->rmsd_data[i] = p_ssd;
 +        }
 +        else
 +        {
 +            lincsd->rmsd_data[0] = 0;
 +            lincsd->rmsd_data[1] = 0;
 +            lincsd->rmsd_data[2] = 0;
 +        }
 +        if (bLog && fplog && lincsd->nc > 0)
 +        {
 +            fprintf(fplog,
 +                    "        After LINCS          %.6f    %.6f %6d %6d\n\n",
 +                    sqrt(p_ssd/ncons_loc),p_max,
 +                    ddglatnr(cr->dd,lincsd->bla[2*p_imax]),
 +                    ddglatnr(cr->dd,lincsd->bla[2*p_imax+1]));
 +        }
 +        
 +        if (warn >= 0)
 +        {
 +            if (maxwarn >= 0)
 +            {
 +                cconerr(cr->dd,lincsd->nc,lincsd->bla,lincsd->bllen,xprime,pbc,
 +                        &ncons_loc,&p_ssd,&p_max,&p_imax);
 +                if (MULTISIM(cr))
 +                {
 +                    sprintf(buf3," in simulation %d", cr->ms->sim);
 +                }
 +                else
 +                {
 +                    buf3[0] = 0;
 +                }
 +                sprintf(buf,"\nStep %s, time %g (ps)  LINCS WARNING%s\n"
 +                        "relative constraint deviation after LINCS:\n"
 +                        "rms %.6f, max %.6f (between atoms %d and %d)\n",
 +                        gmx_step_str(step,buf2),ir->init_t+step*ir->delta_t,
 +                        buf3,
 +                        sqrt(p_ssd/ncons_loc),p_max,
 +                        ddglatnr(cr->dd,lincsd->bla[2*p_imax]),
 +                        ddglatnr(cr->dd,lincsd->bla[2*p_imax+1]));
 +                if (fplog)
 +                {
 +                    fprintf(fplog,"%s",buf);
 +                }
 +                fprintf(stderr,"%s",buf);
 +                lincs_warning(fplog,cr->dd,x,xprime,pbc,
 +                              lincsd->nc,lincsd->bla,lincsd->bllen,
 +                              ir->LincsWarnAngle,maxwarn,warncount);
 +            }
 +            bOK = (p_max < 0.5);
 +        }
 +        
 +        if (lincsd->ncg_flex)
 +        {
 +            for(i=0; (i<lincsd->nc); i++)
 +            {
 +                if (lincsd->bllen0[i] == 0 && lincsd->ddist[i] == 0)
 +                {
 +                    lincsd->bllen[i] = 0;
 +                }
 +            }
 +        }
 +    } 
 +    else
 +    {
 +        /* The OpenMP parallel region of constrain_lincs for derivatives */
 +#pragma omp parallel num_threads(lincsd->nth)
 +        {
 +            int th=gmx_omp_get_thread_num();
 +
 +            do_lincsp(x,xprime,min_proj,pbc,lincsd,th,
 +                      md->invmass,econq,ir->efep != efepNO ? dvdlambda : NULL,
 +                      bCalcVir,th==0 ? vir_r_m_dr : lincsd->th[th].vir_r_m_dr);
 +        }
 +    }
 +
 +    if (bCalcVir && lincsd->nth > 1)
 +    {
 +        for(i=1; i<lincsd->nth; i++)
 +        {
 +            m_add(vir_r_m_dr,lincsd->th[i].vir_r_m_dr,vir_r_m_dr);
 +        }
 +    }
 + 
 +    /* count assuming nit=1 */
 +    inc_nrnb(nrnb,eNR_LINCS,lincsd->nc);
 +    inc_nrnb(nrnb,eNR_LINCSMAT,(2+lincsd->nOrder)*lincsd->ncc);
 +    if (lincsd->ntriangle > 0)
 +    {
 +        inc_nrnb(nrnb,eNR_LINCSMAT,lincsd->nOrder*lincsd->ncc_triangle);
 +    }
 +    if (v)
 +    {
 +        inc_nrnb(nrnb,eNR_CONSTR_V,lincsd->nc*2);
 +    }
 +    if (bCalcVir)
 +    {
 +        inc_nrnb(nrnb,eNR_CONSTR_VIR,lincsd->nc);
 +    }
 +
 +    return bOK;
 +}
index c82da14c68d25abcbfa7caa499f13565d6479062,0000000000000000000000000000000000000000..e544cd996590ee5c12a1a83fc39e7d57a55032ad
mode 100644,000000..100644
--- /dev/null
@@@ -1,1261 -1,0 +1,1263 @@@
 +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
 + * 
 + *                This source code is part of
 + * 
 + *                 G   R   O   M   A   C   S
 + * 
 + *          GROningen MAchine for Chemical Simulations
 + * 
 + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
 + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 + * Copyright (c) 2001-2012, The GROMACS development team,
 + * check out http://www.gromacs.org for more information.
 + 
 + * This program is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU General Public License
 + * as published by the Free Software Foundation; either version 2
 + * of the License, or (at your option) any later version.
 + * 
 + * If you want to redistribute modifications, please consider that
 + * scientific software is very special. Version control is crucial -
 + * bugs must be traceable. We will be happy to consider code for
 + * inclusion in the official distribution, but derived work must not
 + * be called official GROMACS. Details are found in the README & COPYING
 + * files - if they are missing, get the official version at www.gromacs.org.
 + * 
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the papers on the package - you can find them in the top README file.
 + * 
 + * For more info, check our website at http://www.gromacs.org
 + * 
 + * And Hey:
 + * Groningen Machine for Chemical Simulation
 + */
 +#ifdef HAVE_CONFIG_H
 +#include <config.h>
 +#endif
 +
 +#include <algorithm>
 +
 +#include <stdio.h>
 +#include <stdlib.h>
 +#include <string.h>
 +
 +#ifdef NOGMX
 +#define GMX_PARALLEL_ENV_INITIALIZED 1
 +#else 
 +#ifdef GMX_MPI
 +#define GMX_PARALLEL_ENV_INITIALIZED 1
 +#else
 +#define GMX_PARALLEL_ENV_INITIALIZED 0
 +#endif
 +#endif
 +
 +#ifdef GMX_LIB_MPI
 +#include <mpi.h>
 +#endif
 +#ifdef GMX_THREAD_MPI
 +#include "tmpi.h"
 +#endif
 +
 +#ifdef GMX_OPENMP
 +/* TODO: Do we still need this? Are we still planning to use fftw + OpenMP? */
 +#define FFT5D_THREADS
 +/* requires fftw compiled with openmp */
 +/* #define FFT5D_FFTW_THREADS (now set by cmake) */
 +#endif
 +
 +#include "fft5d.h"
 +#include <float.h>
 +#include <math.h>
 +#include <assert.h>
 +#include "smalloc.h"
 +
 +#ifndef __FLT_EPSILON__
 +#define __FLT_EPSILON__ FLT_EPSILON
 +#define __DBL_EPSILON__ DBL_EPSILON
 +#endif
 +
 +#ifdef NOGMX
 +FILE* debug=0;
 +#endif
 +
 +#include "gmx_fatal.h"
 +
 +
 +#ifdef GMX_FFT_FFTW3 
 +#include "thread_mpi/mutex.h"
 +#include "gromacs/utility/exceptions.h"
 +/* none of the fftw3 calls, except execute(), are thread-safe, so 
 +   we need to serialize them with this mutex. */
 +static tMPI::mutex big_fftw_mutex;
 +#define FFTW_LOCK try { big_fftw_mutex.lock(); } GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
 +#define FFTW_UNLOCK try { big_fftw_mutex.unlock(); } GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
 +#endif /* GMX_FFT_FFTW3 */
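 +
 +/* Illustrative sketch (not part of this file): every FFTW call except
 +   execute() is expected to be bracketed by the macros above, e.g.
 +
 +     FFTW_LOCK;
 +     FFTW(plan) p = FFTW(plan_dft_1d)(n, in, out, FFTW_FORWARD, FFTW_ESTIMATE);
 +     FFTW_UNLOCK;
 +
 +   where FFTW() expands to the precision-prefixed fftw symbol. */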
 +
 +/* largest factor of z that is not larger than sqrt(z) */
 +static int lfactor(int z) {  
 +      int i;
 +      for (i=static_cast<int>(sqrt(static_cast<double>(z)));;i--)
 +              if (z%i==0) return i;
 +      return 1;
 +}
 +
 +/* largest proper factor (largest divisor of z smaller than z) */
 +static int l2factor(int z) {  
 +      int i;
 +      if (z==1) return 1;
 +      for (i=z/2;;i--)
 +              if (z%i==0) return i;
 +      return 1;
 +}
 +
 +/* largest prime factor: WARNING: slow recursion, only use for small numbers */
 +static int lpfactor(int z) {
 +      int f = l2factor(z);
 +      if (f==1) return z;
 +      return std::max(lpfactor(f),lpfactor(z/f));
 +}
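 +
 +/* Worked example for the helpers above: for z=12, lfactor(12)==3 (largest
 +   factor not larger than sqrt(12)), l2factor(12)==6 (largest proper factor)
 +   and lpfactor(12)==3 (largest prime factor). */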
 +
 +#ifndef GMX_MPI
 +#ifdef HAVE_GETTIMEOFDAY
 +#include <sys/time.h>
 +double MPI_Wtime() {
 +    struct timeval tv;
 +    gettimeofday(&tv,0);
 +    return tv.tv_sec+tv.tv_usec*1e-6;
 +}
 +#else
 +double MPI_Wtime() {
 +    return 0.0;
 +}
 +#endif
 +#endif
 +
 +static int vmax(int* a, int s) {
 +    int i,max=0;
 +    for (i=0;i<s;i++) 
 +    {
 +        if (a[i]>max) max=a[i];
 +    }
 +    return max;
 +} 
 +
 +
 +/* NxMxK is the size of the data
 + * comm: communicator to use for fft5d
 + * P0: number of processors along the 1st axis (can be 0 for automatic)
 + * lin is allocated by fft5d because the size of the array is only known after the planning phase
 + * rlout2 is only used as an intermediate buffer - it is returned after allocation only so it can be reused for the back transform - it should not be used by the caller
 +*/
 +fft5d_plan fft5d_plan_3d(int NG, int MG, int KG, MPI_Comm comm[2], int flags, t_complex** rlin, t_complex** rlout, t_complex** rlout2, t_complex** rlout3, int nthreads)
 +{
 +
 +    int P[2],bMaster,prank[2],i,t;
 +    int rNG,rMG,rKG;
 +    int *N0=0, *N1=0, *M0=0, *M1=0, *K0=0, *K1=0, *oN0=0, *oN1=0, *oM0=0, *oM1=0, *oK0=0, *oK1=0;
 +    int N[3],M[3],K[3],pN[3],pM[3],pK[3],oM[3],oK[3],*iNin[3]={0},*oNin[3]={0},*iNout[3]={0},*oNout[3]={0};
 +    int C[3],rC[3],nP[2];
 +    int lsize;
 +    t_complex *lin=0,*lout=0,*lout2=0,*lout3=0;
 +    fft5d_plan plan;
 +    int s;
 +
 +    /* comm, prank and P are in the order of the decomposition (plan->cart is in the order of transposes) */
 +#ifdef GMX_MPI
 +    if (GMX_PARALLEL_ENV_INITIALIZED && comm[0] != MPI_COMM_NULL)
 +    {
 +        MPI_Comm_size(comm[0],&P[0]);
 +        MPI_Comm_rank(comm[0],&prank[0]);
 +    }
 +    else
 +#endif
 +    {
 +        P[0] = 1;
 +        prank[0] = 0;
 +    }
 +#ifdef GMX_MPI
 +    if (GMX_PARALLEL_ENV_INITIALIZED && comm[1] != MPI_COMM_NULL)
 +    {
 +        MPI_Comm_size(comm[1],&P[1]);
 +        MPI_Comm_rank(comm[1],&prank[1]);
 +    }
 +    else
 +#endif
 +    {
 +        P[1] = 1;
 +        prank[1] = 0;
 +    }
 +   
 +    bMaster=(prank[0]==0&&prank[1]==0);
 +   
 +    
 +    if (debug)
 +    {
 +        fprintf(debug,"FFT5D: Using %dx%d processor grid, rank %d,%d\n",
 +                P[0],P[1],prank[0],prank[1]);
 +    }
 +    
 +    if (bMaster) {
 +        if (debug) 
 +            fprintf(debug,"FFT5D: N: %d, M: %d, K: %d, P: %dx%d, real2complex: %d, backward: %d, order yz: %d, debug %d\n",
 +                NG,MG,KG,P[0],P[1],(flags&FFT5D_REALCOMPLEX)>0,(flags&FFT5D_BACKWARD)>0,(flags&FFT5D_ORDER_YZ)>0,(flags&FFT5D_DEBUG)>0);
 +        /* The check below is not correct: a single prime factor of 11 or 13 is OK.
 +        if (fft5d_fmax(fft5d_fmax(lpfactor(NG),lpfactor(MG)),lpfactor(KG))>7) {
 +            printf("WARNING: FFT very slow with prime factors larger 7\n");
 +            printf("Change FFT size or in case you cannot change it look at\n");
 +            printf("http://www.fftw.org/fftw3_doc/Generating-your-own-code.html\n");
 +        }
 +        */
 +    }
 +    
 +    if (NG==0 || MG==0 || KG==0) {
 +        if (bMaster) printf("FFT5D: FATAL: Datasize cannot be zero in any dimension\n");
 +        return 0;
 +    }
 +
 +    rNG=NG;rMG=MG;rKG=KG;
 +    
 +    if (flags&FFT5D_REALCOMPLEX) {
 +        if (!(flags&FFT5D_BACKWARD)) NG = NG/2+1;
 +        else {
 +            if (!(flags&FFT5D_ORDER_YZ)) MG=MG/2+1;
 +            else KG=KG/2+1;
 +        }
 +    }
 +    
 +    
 +    /*for the transpose we need to know the sizes for all processors, not only our own*/
 +
 +    N0 = (int*)malloc(P[0]*sizeof(int)); N1 = (int*)malloc(P[1]*sizeof(int)); 
 +    M0 = (int*)malloc(P[0]*sizeof(int)); M1 = (int*)malloc(P[1]*sizeof(int));
 +    K0 = (int*)malloc(P[0]*sizeof(int)); K1 = (int*)malloc(P[1]*sizeof(int));
 +    oN0 = (int*)malloc(P[0]*sizeof(int));oN1 = (int*)malloc(P[1]*sizeof(int));
 +    oM0 = (int*)malloc(P[0]*sizeof(int));oM1 = (int*)malloc(P[1]*sizeof(int));
 +    oK0 = (int*)malloc(P[0]*sizeof(int));oK1 = (int*)malloc(P[1]*sizeof(int));
 +    
 +    for (i=0;i<P[0];i++) 
 +    {
 +        #define EVENDIST
 +        #ifndef EVENDIST
 +        oN0[i]=i*ceil((double)NG/P[0]);
 +        oM0[i]=i*ceil((double)MG/P[0]);
 +        oK0[i]=i*ceil((double)KG/P[0]);
 +        #else
 +        oN0[i]=(NG*i)/P[0];
 +        oM0[i]=(MG*i)/P[0];
 +        oK0[i]=(KG*i)/P[0];
 +        #endif
 +    }
 +    for (i=0;i<P[1];i++) 
 +    {
 +        #ifndef EVENDIST
 +        oN1[i]=i*ceil((double)NG/P[1]); 
 +        oM1[i]=i*ceil((double)MG/P[1]); 
 +        oK1[i]=i*ceil((double)KG/P[1]); 
 +        #else
 +        oN1[i]=(NG*i)/P[1]; 
 +        oM1[i]=(MG*i)/P[1]; 
 +        oK1[i]=(KG*i)/P[1]; 
 +        #endif
 +    }
 +    for (i=0;i<P[0]-1;i++) 
 +    {
 +        N0[i]=oN0[i+1]-oN0[i];
 +        M0[i]=oM0[i+1]-oM0[i];
 +        K0[i]=oK0[i+1]-oK0[i];
 +    }
 +    N0[P[0]-1]=NG-oN0[P[0]-1];
 +    M0[P[0]-1]=MG-oM0[P[0]-1];
 +    K0[P[0]-1]=KG-oK0[P[0]-1];
 +    for (i=0;i<P[1]-1;i++) 
 +    {
 +        N1[i]=oN1[i+1]-oN1[i];
 +        M1[i]=oM1[i+1]-oM1[i];
 +        K1[i]=oK1[i+1]-oK1[i];
 +    }
 +    N1[P[1]-1]=NG-oN1[P[1]-1];
 +    M1[P[1]-1]=MG-oM1[P[1]-1];
 +    K1[P[1]-1]=KG-oK1[P[1]-1];
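 +
 +    /* Worked example of the even distribution above: NG=10 over P=4 ranks
 +       gives offsets oN={0,2,5,7} and sizes N={2,3,2,3}, i.e. the remainder
 +       is spread over the ranks instead of being dumped on the last one. */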
 +
 +    /* for steps 1-3: the local N,M,K sizes of the transposed system,
 +       C: the contiguous dimension, and nP: the number of processors in the
 +       subcommunicator for that step */
 +    
 +    
 +    pM[0] = M0[prank[0]];
 +    oM[0] = oM0[prank[0]];
 +    pK[0] = K1[prank[1]];
 +    oK[0] = oK1[prank[1]];
 +    C[0] = NG;
 +    rC[0] = rNG;
 +    if (!(flags&FFT5D_ORDER_YZ)) {
 +        N[0] = vmax(N1,P[1]);
 +        M[0] = M0[prank[0]];
 +        K[0] = vmax(K1,P[1]);
 +        pN[0] = N1[prank[1]];
 +        iNout[0] = N1;
 +        oNout[0] = oN1;
 +        nP[0] = P[1];
 +        C[1] = KG;
 +        rC[1] =rKG;
 +        N[1] = vmax(K0,P[0]);
 +        pN[1] = K0[prank[0]];
 +        iNin[1] = K1;
 +        oNin[1] = oK1; 
 +        iNout[1] = K0;
 +        oNout[1] = oK0;
 +        M[1] = vmax(M0,P[0]);
 +        pM[1] = M0[prank[0]];
 +        oM[1] = oM0[prank[0]];
 +        K[1] = N1[prank[1]];
 +        pK[1] = N1[prank[1]];
 +        oK[1] = oN1[prank[1]];
 +        nP[1] = P[0];
 +        C[2] = MG;
 +        rC[2] = rMG;
 +        iNin[2] = M0;
 +        oNin[2] = oM0;
 +        M[2] = vmax(K0,P[0]);
 +        pM[2] = K0[prank[0]];
 +        oM[2] = oK0[prank[0]];
 +        K[2] = vmax(N1,P[1]);
 +        pK[2] = N1[prank[1]];
 +        oK[2] = oN1[prank[1]];
 +        free(N0); free(oN0); /*these are not used for this order*/
 +        free(M1); free(oM1); /*the rest is freed in destroy*/
 +    } else {
 +        N[0] = vmax(N0,P[0]);
 +        M[0] = vmax(M0,P[0]);
 +        K[0] = K1[prank[1]];
 +        pN[0] = N0[prank[0]];
 +        iNout[0] = N0;
 +        oNout[0] = oN0;
 +        nP[0] = P[0];
 +        C[1] = MG;
 +        rC[1] =rMG;
 +        N[1] = vmax(M1,P[1]);
 +        pN[1] = M1[prank[1]];
 +        iNin[1] = M0;
 +        oNin[1] = oM0;
 +        iNout[1] = M1;
 +        oNout[1] = oM1;
 +        M[1] = N0[prank[0]];
 +        pM[1] = N0[prank[0]];
 +        oM[1] = oN0[prank[0]];
 +        K[1] = vmax(K1,P[1]);
 +        pK[1] = K1[prank[1]];
 +        oK[1] = oK1[prank[1]];
 +        nP[1] = P[1];
 +        C[2] = KG;
 +        rC[2] = rKG;
 +        iNin[2] = K1;
 +        oNin[2] = oK1;
 +        M[2] = vmax(N0,P[0]);
 +        pM[2] = N0[prank[0]];
 +        oM[2] = oN0[prank[0]];
 +        K[2] = vmax(M1,P[1]);
 +        pK[2] = M1[prank[1]];
 +        oK[2] = oM1[prank[1]];
 +        free(N1); free(oN1); /*these are not used for this order*/
 +        free(K0); free(oK0); /*the rest is freed in destroy*/
 +    }
 +    N[2]=pN[2]=-1; /*not used*/
 +    
 +    /*
 +      Difference between x-y-z regarding 2d decomposition is whether they are 
 +      distributed along axis 1, 2 or both 
 +    */
 +    
 +    /* int lsize = fmax(N[0]*M[0]*K[0]*nP[0],N[1]*M[1]*K[1]*nP[1]); */
 +    lsize = std::max(N[0]*M[0]*K[0]*nP[0],std::max(N[1]*M[1]*K[1]*nP[1],C[2]*M[2]*K[2]));
 +    /* int lsize = fmax(C[0]*M[0]*K[0],fmax(C[1]*M[1]*K[1],C[2]*M[2]*K[2])); */
 +    if (!(flags&FFT5D_NOMALLOC)) { 
 +        snew_aligned(lin, lsize, 32);
 +        snew_aligned(lout, lsize, 32);
 +        if (nthreads > 1)
 +        {
 +            /* We need extra transpose buffers to avoid OpenMP barriers */
 +            snew_aligned(lout2, lsize, 32);
 +            snew_aligned(lout3, lsize, 32);
 +        }
 +        else
 +        {
 +            /* We can reuse the buffers to avoid cache misses */
 +            lout2 = lin;
 +            lout3 = lout;
 +        }
 +    } else {
 +        lin = *rlin;
 +        lout = *rlout;
 +        if (nthreads > 1)
 +        {
 +            lout2 = *rlout2;
 +            lout3 = *rlout3;
 +        }
 +        else
 +        {
 +            lout2 = lin;
 +            lout3 = lout;
 +        }
 +    }
 +
 +    plan = (fft5d_plan)calloc(1,sizeof(struct fft5d_plan_t));
 +
 +    
 +    if (debug)
 +    {
 +        fprintf(debug, "Running on %d threads\n",nthreads);        
 +    }
 +
 +#ifdef GMX_FFT_FFTW3  /*if not FFTW - then we don't do a 3d plan but instead use only 1D plans */
 +    /* It is possible to use the 3d plan with OMP threads - but in that case it is not allowed to be called from
 +     * within a parallel region. For now this is deactivated. If it should be supported, it has to be made sure
 +     * that the execute of the 3d plan is in a master/serial block (since it contains its own parallel region)
 +     * and that the 3d plan is faster than the 1d plans.
 +     */
 +    if ((!(flags&FFT5D_INPLACE)) && (!(P[0]>1 || P[1]>1)) && nthreads==1) {  /*don't do 3d plan in parallel or if in_place requested */
 +            int fftwflags=FFTW_DESTROY_INPUT;
 +            FFTW(iodim) dims[3];
 +            int inNG=NG,outMG=MG,outKG=KG;
 +
 +            FFTW_LOCK;
 +            if (!(flags&FFT5D_NOMEASURE)) fftwflags|=FFTW_MEASURE;
 +            if (flags&FFT5D_REALCOMPLEX) {
 +                if (!(flags&FFT5D_BACKWARD)) {  /*input pointer is not complex*/
 +                    inNG*=2; 
 +                } else {                        /*output pointer is not complex*/
 +                    if (!(flags&FFT5D_ORDER_YZ)) outMG*=2;
 +                    else outKG*=2;
 +                }
 +            }
 +
 +            if (!(flags&FFT5D_BACKWARD)) {
 +                dims[0].n  = KG;
 +                dims[1].n  = MG;
 +                dims[2].n  = rNG;
 +                
 +                dims[0].is = inNG*MG;     /*N M K*/
 +                dims[1].is = inNG;
 +                dims[2].is = 1;
 +                if (!(flags&FFT5D_ORDER_YZ)) {
 +                    dims[0].os = MG;       /*M K N*/
 +                    dims[1].os = 1;
 +                    dims[2].os = MG*KG;
 +                } else  {
 +                    dims[0].os = 1;       /*K N M*/
 +                    dims[1].os = KG*NG;
 +                    dims[2].os = KG;
 +                }
 +            } else {
 +                if (!(flags&FFT5D_ORDER_YZ)) {
 +                    dims[0].n  = NG;   
 +                    dims[1].n  = KG;   
 +                    dims[2].n  = rMG;  
 +                    
 +                    dims[0].is = 1;     
 +                    dims[1].is = NG*MG;
 +                    dims[2].is = NG;
 +
 +                    dims[0].os = outMG*KG;       
 +                    dims[1].os = outMG;
 +                    dims[2].os = 1;                  
 +                } else {
 +                    dims[0].n  = MG;
 +                    dims[1].n  = NG;
 +                    dims[2].n  = rKG;
 +                    
 +                    dims[0].is = NG;     
 +                    dims[1].is = 1;
 +                    dims[2].is = NG*MG;
 +
 +                    dims[0].os = outKG*NG;       
 +                    dims[1].os = outKG;
 +                    dims[2].os = 1;                  
 +                }           
 +            }
 +#ifdef FFT5D_THREADS
 +#ifdef FFT5D_FFTW_THREADS
 +            FFTW(plan_with_nthreads)(nthreads);
 +#endif
 +#endif
 +            if ((flags&FFT5D_REALCOMPLEX) && !(flags&FFT5D_BACKWARD)) {
 +                plan->p3d = FFTW(plan_guru_dft_r2c)(/*rank*/ 3, dims,
 +                                     /*howmany*/ 0, /*howmany_dims*/0 ,
 +                                     (real*)lin, (FFTW(complex) *)lout,
 +                                     /*flags*/ fftwflags);              
 +            } else if ((flags&FFT5D_REALCOMPLEX) && (flags&FFT5D_BACKWARD)) {
 +                plan->p3d = FFTW(plan_guru_dft_c2r)(/*rank*/ 3, dims,
 +                                     /*howmany*/ 0, /*howmany_dims*/0 ,
 +                                     (FFTW(complex) *)lin, (real*)lout,
 +                                     /*flags*/ fftwflags);              
 +            } else {
 +                plan->p3d = FFTW(plan_guru_dft)(/*rank*/ 3, dims,
 +                                     /*howmany*/ 0, /*howmany_dims*/0 ,
 +                                     (FFTW(complex) *)lin, (FFTW(complex) *)lout,
 +                                     /*sign*/ (flags&FFT5D_BACKWARD)?1:-1, /*flags*/ fftwflags);
 +            }
 +#ifdef FFT5D_THREADS
 +#ifdef FFT5D_FFTW_THREADS
 +            FFTW(plan_with_nthreads)(1);
 +#endif
 +#endif
 +            FFTW_UNLOCK;
 +    }
 +    if (!plan->p3d) {  /* for decomposition and if 3d plan did not work */
 +#endif /* GMX_FFT_FFTW3 */
 +        for (s=0;s<3;s++) {
 +            if (debug)
 +            {
 +                fprintf(debug,"FFT5D: Plan s %d rC %d M %d pK %d C %d lsize %d\n",
 +                        s,rC[s],M[s],pK[s],C[s],lsize);
 +            }
 +            plan->p1d[s] = (gmx_fft_t*)malloc(sizeof(gmx_fft_t)*nthreads);
 +
 +            /* Make sure that the init routines are only called by one thread at a time and in order
 +               (the latter is only important to avoid confusing valgrind)
 +             */
 +#pragma omp parallel for num_threads(nthreads) schedule(static) ordered
 +            for(t=0; t<nthreads; t++)
++            {
 +#pragma omp ordered
 +            {
 +                int tsize = ((t+1)*pM[s]*pK[s]/nthreads)-(t*pM[s]*pK[s]/nthreads);
 +
 +                if ((flags&FFT5D_REALCOMPLEX) && ((!(flags&FFT5D_BACKWARD) && s==0) || ((flags&FFT5D_BACKWARD) && s==2))) {
 +                    gmx_fft_init_many_1d_real( &plan->p1d[s][t], rC[s], tsize, (flags&FFT5D_NOMEASURE)?GMX_FFT_FLAG_CONSERVATIVE:0 );
 +                } else {
 +                    gmx_fft_init_many_1d     ( &plan->p1d[s][t],  C[s], tsize, (flags&FFT5D_NOMEASURE)?GMX_FFT_FLAG_CONSERVATIVE:0 );
 +                }
 +            }
++            }
 +        }
 +
 +#ifdef GMX_FFT_FFTW3 
 +    }
 +#endif
 +    if ((flags&FFT5D_ORDER_YZ)) { /*plan->cart is in the order of transposes */
 +        plan->cart[0]=comm[0]; plan->cart[1]=comm[1];
 +    } else {
 +        plan->cart[1]=comm[0]; plan->cart[0]=comm[1];
 +    }
 +#ifdef FFT5D_MPI_TRANSPOSE
 +    FFTW_LOCK;
 +    for (s=0;s<2;s++) {
 +        if ((s==0 && !(flags&FFT5D_ORDER_YZ)) || (s==1 && (flags&FFT5D_ORDER_YZ))) 
 +            plan->mpip[s] = FFTW(mpi_plan_many_transpose)(nP[s], nP[s], N[s]*K[s]*pM[s]*2, 1, 1, (real*)lout2, (real*)lout3, plan->cart[s], FFTW_PATIENT);
 +        else
 +            plan->mpip[s] = FFTW(mpi_plan_many_transpose)(nP[s], nP[s], N[s]*pK[s]*M[s]*2, 1, 1, (real*)lout2, (real*)lout3, plan->cart[s], FFTW_PATIENT);
 +    }
 +    FFTW_UNLOCK;
 +#endif 
 +
 +    
 +    plan->lin=lin;
 +    plan->lout=lout;
 +    plan->lout2=lout2;
 +    plan->lout3=lout3;
 +    
 +    plan->NG=NG;plan->MG=MG;plan->KG=KG;
 +    
 +    for (s=0;s<3;s++) {
 +        plan->N[s]=N[s];plan->M[s]=M[s];plan->K[s]=K[s];plan->pN[s]=pN[s];plan->pM[s]=pM[s];plan->pK[s]=pK[s];
 +        plan->oM[s]=oM[s];plan->oK[s]=oK[s];
 +        plan->C[s]=C[s];plan->rC[s]=rC[s];
 +        plan->iNin[s]=iNin[s];plan->oNin[s]=oNin[s];plan->iNout[s]=iNout[s];plan->oNout[s]=oNout[s];
 +    }
 +    for (s=0;s<2;s++) {
 +        plan->P[s]=nP[s];plan->coor[s]=prank[s];
 +    }
 +    
 +/*    plan->fftorder=fftorder;
 +    plan->direction=direction;    
 +    plan->realcomplex=realcomplex;
 +*/
 +    plan->flags=flags;
 +    plan->nthreads=nthreads;
 +    *rlin=lin;
 +    *rlout=lout;
 +    *rlout2=lout2;
 +    *rlout3=lout3;
 +    return plan;
 +}
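 +
 +/* Hedged usage sketch (illustrative only, not part of this file): a
 +   single-process forward real-to-complex plan; the planner allocates the
 +   buffers because their sizes are only known after planning.
 +
 +     MPI_Comm comms[2] = {MPI_COMM_NULL, MPI_COMM_NULL};
 +     t_complex *lin, *lout, *lout2, *lout3;
 +     fft5d_plan p = fft5d_plan_3d(32, 32, 32, comms, FFT5D_REALCOMPLEX,
 +                                  &lin, &lout, &lout2, &lout3, 1);
 +     // fill lin, then: fft5d_execute(p, 0, NULL); the result is in lout
 +     fft5d_destroy(p);
 +*/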
 +
 +
 +enum order {
 +    XYZ,
 +    XZY,
 +    YXZ,
 +    YZX,
 +    ZXY,
 +    ZYX
 +};
 +
 +
 +
 +/*here x,y,z and N,M,K are in the rotated coordinate system!!
 +  x (and N) is the minor (consecutive) dimension, y (M) the middle and z (K) the major one
 +  maxN,maxM,maxK is the max size of the local data
 +  pN, pM, pK is the local size specific to the current processor (only different from max if not divisible)
 +  NG, MG, KG is the size of the global data*/
 +static void splitaxes(t_complex* lout,const t_complex* lin,
 +                      int maxN,int maxM,int maxK, int pN, int pM, int pK,
 +                      int P,int NG,int *N, int* oN,int starty,int startz,int endy, int endz)
 +{
 +    int x,y,z,i;
 +    int in_i,out_i,in_z,out_z,in_y,out_y;
 +    int s_y,e_y;
 +
 +    for (z=startz; z<endz+1; z++) /*3. z l*/
 +    {
 +        if (z==startz) {
 +            s_y=starty;
 +        } else {
 +            s_y=0;
 +        }
 +        if (z==endz) {
 +            e_y=endy;
 +        } else {
 +            e_y=pM;
 +        }
 +        out_z  = z*maxN*maxM;
 +        in_z = z*NG*pM;
 +
 +        for (i=0; i<P; i++) /*index cube along long axis*/
 +        {
 +            out_i  = out_z  + i*maxN*maxM*maxK;
 +            in_i = in_z + oN[i];
 +            for (y=s_y;y<e_y;y++) { /*2. y k*/
 +                out_y  = out_i  + y*maxN;
 +                in_y = in_i + y*NG;
 +                for (x=0;x<N[i];x++) { /*1. x j*/
 +                    lout[out_y+x] = lin[in_y+x];    /*in=z*NG*pM+oN[i]+y*NG+x*/
 +                    /*after the split it is important that each processor chunk i has size maxN*maxM*maxK and thus the same size*/
 +                    /*before the split the data is contiguous - thus if different processors get different amounts, oN differs*/
 +                }
 +            }
 +        }
 +    }
 +}
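 +
 +/* Worked index example (illustrative): with P=2, NG=8, N={4,4}, oN={0,4},
 +   the input element x=5 of a given (y,z) row sits at z*NG*pM + y*NG + 5 and
 +   is copied into chunk i=1 at 1*maxN*maxM*maxK + z*maxN*maxM + y*maxN + 1,
 +   so every destination chunk has the same fixed size maxN*maxM*maxK. */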
 +
 +/*make the axis contiguous again (after the AllToAll) and also do the local transpose*/
 +/*transposes the minor and major dimensions
 +  for the variables see above
 +  the major, middle, minor order is only correct for x,y,z (N,M,K) of the input
 +  N,M,K local dimensions
 +  KG global size*/
 +static void joinAxesTrans13(t_complex* lout,const t_complex* lin,
 +                            int maxN,int maxM,int maxK,int pN, int pM, int pK, 
 +                            int P,int KG, int* K, int* oK,int starty, int startx, int endy, int endx)
 +{
 +    int i,x,y,z;
 +    int out_i,in_i,out_x,in_x,out_z,in_z;
 +    int s_y,e_y;
 +
 +    for (x=startx;x<endx+1;x++) /*1.j*/
 +    {
 +        if (x==startx)
 +        {
 +            s_y=starty;
 +        }
 +        else
 +        {
 +            s_y=0;
 +        }
 +        if (x==endx)
 +        {
 +            e_y=endy;
 +        }
 +        else
 +        {
 +            e_y=pM;
 +        }
 +
 +        out_x  = x*KG*pM;
 +        in_x = x;
 +
 +        for (i=0;i<P;i++) /*index cube along long axis*/
 +        {
 +            out_i  = out_x  + oK[i];
 +            in_i = in_x + i*maxM*maxN*maxK;
 +            for (z=0;z<K[i];z++) /*3.l*/
 +            {
 +                out_z  = out_i  + z;
 +                in_z = in_i + z*maxM*maxN;
 +                for (y=s_y;y<e_y;y++) /*2.k*/
 +                {
 +                    lout[out_z+y*KG] = lin[in_z+y*maxN]; /*out=x*KG*pM+oK[i]+z+y*KG*/
 +                }
 +            }
 +        }
 +    }
 +}
 +
 +/*make the axis contiguous again (after the AllToAll) and also do the local transpose;
 +  transposes the minor and middle dimensions
 +  for the variables see above
 +  the minor, middle, major order is only correct for x,y,z (N,M,K) of the input
 +  N,M,K local size
 +  MG global size*/
 +static void joinAxesTrans12(t_complex* lout,const t_complex* lin,int maxN,int maxM,int maxK,int pN, int pM, int pK,
 +                            int P,int MG, int* M, int* oM, int startx, int startz, int endx, int endz) {
 +    int i,z,y,x;
 +    int out_i,in_i,out_z,in_z,out_x,in_x;
 +    int s_x,e_x;
 +
 +    for (z=startz; z<endz+1; z++)
 +    {
 +        if (z==startz)
 +        {
 +            s_x=startx;
 +        }
 +        else
 +        {
 +            s_x=0;
 +        }
 +        if (z==endz)
 +        {
 +            e_x=endx;
 +        }
 +        else
 +        {
 +            e_x=pN;
 +        }
 +        out_z  = z*MG*pN;
 +        in_z = z*maxM*maxN;
 +
 +        for (i=0; i<P; i++) /*index cube along long axis*/
 +        {
 +            out_i  = out_z  + oM[i];
 +            in_i = in_z + i*maxM*maxN*maxK;
 +            for (x=s_x;x<e_x;x++)
 +            {
 +                out_x  = out_i  + x*MG;
 +                in_x = in_i + x;
 +                for (y=0;y<M[i];y++)
 +                {
 +                    lout[out_x+y] = lin[in_x+y*maxN]; /*out=z*MG*pN+oM[i]+x*MG+y*/
 +                }
 +            }
 +        }
 +    }
 +}
 +
 +
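 +/* Cyclically rotates a length-3 vector: {a,b,c} -> {b,c,a}. */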
 +static void rotate_offsets(int x[]) {
 +    int t=x[0];
 +/*    x[0]=x[2];
 +    x[2]=x[1];
 +    x[1]=t;*/
 +    x[0]=x[1];
 +    x[1]=x[2];
 +    x[2]=t;
 +}
 +
 +/*compute the offsets to compare or print transposed local data in the original input coordinates
 +  xs: stride of each dimension, xl: local length of each dimension, xc: decomposition offset
 +  s: step in the computation = number of transposes*/
 +static void compute_offsets(fft5d_plan plan, int xs[], int xl[], int xc[], int NG[], int s) {
 +/*    int direction = plan->direction;
 +    int fftorder = plan->fftorder;*/
 +    
 +    int o=0;
 +    int pos[3],i;
 +    int *pM=plan->pM, *pK=plan->pK, *oM=plan->oM, *oK=plan->oK,
 +        *C=plan->C, *rC=plan->rC;
 +
 +    NG[0]=plan->NG;NG[1]=plan->MG;NG[2]=plan->KG;
 +
 +    if (!(plan->flags&FFT5D_ORDER_YZ)) {
 +        switch (s) {
 +        case 0: o=XYZ; break;
 +        case 1: o=ZYX; break;
 +        case 2: o=YZX; break;
 +        default: assert(0);
 +        }
 +    } else {
 +        switch (s) {
 +        case 0: o=XYZ; break;
 +        case 1: o=YXZ; break;
 +        case 2: o=ZXY; break;
 +        default: assert(0);
 +        }
 +    }
 + 
 +    switch (o) {
 +        case XYZ:pos[0]=1;pos[1]=2;pos[2]=3;break;
 +        case XZY:pos[0]=1;pos[1]=3;pos[2]=2;break;
 +        case YXZ:pos[0]=2;pos[1]=1;pos[2]=3;break;
 +        case YZX:pos[0]=3;pos[1]=1;pos[2]=2;break;
 +        case ZXY:pos[0]=2;pos[1]=3;pos[2]=1;break;
 +        case ZYX:pos[0]=3;pos[1]=2;pos[2]=1;break;
 +    }
 +    /*if (debug) printf("pos: %d %d %d\n",pos[0],pos[1],pos[2]);*/
 +        
 +    /*xs, xl give dimension size and data length in local transposed coordinate system
 +      for 0(/1/2): x(/y/z) in original coordinate system*/
 +    for (i=0;i<3;i++) {
 +        switch (pos[i]) {
 +        case 1: xs[i]=1;         xc[i]=0;     xl[i]=C[s];break;
 +        case 2: xs[i]=C[s];      xc[i]=oM[s]; xl[i]=pM[s];break;
 +        case 3: xs[i]=C[s]*pM[s];xc[i]=oK[s]; xl[i]=pK[s];break;
 +        }
 +    }
 +    /*input order is different for test program to match FFTW order 
 +      (important for complex to real)*/
 +    if (plan->flags&FFT5D_BACKWARD) {
 +        rotate_offsets(xs);
 +        rotate_offsets(xl);
 +        rotate_offsets(xc);
 +        rotate_offsets(NG);
 +        if (plan->flags&FFT5D_ORDER_YZ) {
 +            rotate_offsets(xs);
 +            rotate_offsets(xl);
 +            rotate_offsets(xc);
 +            rotate_offsets(NG);
 +        }
 +    }
 +    if ((plan->flags&FFT5D_REALCOMPLEX) && ((!(plan->flags&FFT5D_BACKWARD) && s==0) || ((plan->flags&FFT5D_BACKWARD) && s==2))) {
 +        xl[0] = rC[s];
 +    }
 +}
 +
 +static void print_localdata(const t_complex* lin, const char* txt, int s, fft5d_plan plan) {
 +    int x,y,z,l;
 +    int *coor = plan->coor;
 +    int xs[3],xl[3],xc[3],NG[3];        
 +    int ll=(plan->flags&FFT5D_REALCOMPLEX)?1:2;
 +    compute_offsets(plan,xs,xl,xc,NG,s);
 +    fprintf(debug,txt,coor[0],coor[1],s);
 +    /*printf("xs: %d %d %d, xl: %d %d %d\n",xs[0],xs[1],xs[2],xl[0],xl[1],xl[2]);*/
 +    for(z=0;z<xl[2];z++) {
 +        for(y=0;y<xl[1];y++) {
 +            fprintf(debug,"%d %d: ",coor[0],coor[1]);
 +            for (x=0;x<xl[0];x++) {
 +                for (l=0;l<ll;l++) {
 +                    fprintf(debug,"%f ",((real*)lin)[(z*xs[2]+y*xs[1])*2+(x*xs[0])*ll+l]);
 +                }
 +                fprintf(debug,",");
 +            }
 +            fprintf(debug,"\n");
 +        }
 +    }
 +}
 +
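 +/* Execution pipeline (schematic): in each of the first two steps a 1D FFT
 +   along the currently contiguous axis is followed by split + AllToAll +
 +   local transpose (a "corner rotation"); the third step is a plain 1D FFT.
 +   With the default order the axes are processed as x, then z, then y. */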
 +void fft5d_execute(fft5d_plan plan,int thread,fft5d_time times) {
 +    t_complex *lin = plan->lin;
 +    t_complex *lout = plan->lout;
 +    t_complex *lout2 = plan->lout2;
 +    t_complex *lout3 = plan->lout3;
 +    t_complex *fftout,*joinin;
 +
 +    gmx_fft_t **p1d=plan->p1d;
 +#ifdef FFT5D_MPI_TRANSPOSE
 +    FFTW(plan) *mpip=plan->mpip;
 +#endif
 +#ifdef GMX_MPI
 +    MPI_Comm *cart=plan->cart;
 +#endif
 +#ifdef NOGMX
 +    double time_fft=0,time_local=0,time_mpi[2]={0},time=0;    
 +#endif
 +    int *N=plan->N,*M=plan->M,*K=plan->K,*pN=plan->pN,*pM=plan->pM,*pK=plan->pK,
 +        *C=plan->C,*P=plan->P,**iNin=plan->iNin,**oNin=plan->oNin,**iNout=plan->iNout,**oNout=plan->oNout;
 +    int s=0,tstart,tend,bParallelDim;
 +    
 +    
 +#ifdef GMX_FFT_FFTW3 
 +    if (plan->p3d)
 +    {
 +        if (thread == 0)
 +        {
 +#ifdef NOGMX
 +            if (times!=0)
 +            {
 +                time=MPI_Wtime();
 +            }
 +#endif
 +            FFTW(execute)(plan->p3d);
 +#ifdef NOGMX
 +            if (times!=0)
 +            {
 +                times->fft+=MPI_Wtime()-time;
 +            }
 +#endif
 +        }
 +        return;
 +    }
 +#endif
 +
 +    s=0;
 +
 +    /*lin: x,y,z*/
 +    if (plan->flags&FFT5D_DEBUG && thread == 0)
 +    {
 +        print_localdata(lin, "%d %d: copy in lin\n", s, plan);
 +    }
 +
 +    for (s=0;s<2;s++) {  /*loop over first two FFT steps (corner rotations)*/
 +
 +#ifdef GMX_MPI
 +        if (GMX_PARALLEL_ENV_INITIALIZED && cart[s]!=MPI_COMM_NULL && P[s]>1)
 +        {
 +            bParallelDim = 1;
 +        }
 +        else
 +#endif
 +        {
 +            bParallelDim = 0;
 +        }
 +
 +        /* ---------- START FFT ------------ */
 +#ifdef NOGMX
 +        if (times!=0 && thread == 0)
 +        {
 +            time=MPI_Wtime();
 +        }
 +#endif
 +
 +        if (bParallelDim || plan->nthreads == 1) {
 +            fftout = lout;
 +        }
 +        else
 +        {
 +            if (s==0)
 +            {
 +                fftout = lout3;
 +            } else
 +            {
 +                fftout = lout2;
 +            }
 +        }
 +
 +        tstart = (thread*pM[s]*pK[s]/plan->nthreads)*C[s];
 +        if ((plan->flags&FFT5D_REALCOMPLEX) && !(plan->flags&FFT5D_BACKWARD) && s==0)
 +        {
 +            gmx_fft_many_1d_real(p1d[s][thread],(plan->flags&FFT5D_BACKWARD)?GMX_FFT_COMPLEX_TO_REAL:GMX_FFT_REAL_TO_COMPLEX,lin+tstart,fftout+tstart);
 +        } else
 +        {
 +            gmx_fft_many_1d(     p1d[s][thread],(plan->flags&FFT5D_BACKWARD)?GMX_FFT_BACKWARD:GMX_FFT_FORWARD,               lin+tstart,fftout+tstart);
 +
 +        }
 +
 +#ifdef NOGMX
 +        if (times != NULL && thread == 0)
 +        {
 +            time_fft+=MPI_Wtime()-time;
 +        }
 +#endif
 +        if (plan->flags&FFT5D_DEBUG && thread == 0)
 +        {
 +            print_localdata(lout, "%d %d: FFT %d\n", s, plan);
 +        }
 +        /* ---------- END FFT ------------ */
 +
 +        /* ---------- START SPLIT + TRANSPOSE ------------ (if parallel in this dimension)*/
 +        if (bParallelDim) {
 +#ifdef NOGMX
 +            if (times != NULL && thread == 0)
 +            {
 +                time=MPI_Wtime();
 +            }
 +#endif
 +            /*prepare for the AllToAll:
 +              1. the (outermost) axis (x) is split into P[s] parts of size N[s]
 +              for sending*/
 +            if (pM[s]>0)
 +            {
 +                tend = ((thread+1)*pM[s]*pK[s]/plan->nthreads);
 +                tstart/=C[s];
 +                splitaxes(lout2,lout,N[s],M[s],K[s], pN[s],pM[s],pK[s],P[s],C[s],iNout[s],oNout[s],tstart%pM[s],tstart/pM[s],tend%pM[s],tend/pM[s]);
 +            }
 +#pragma omp barrier /*barrier required before the AllToAll (all input has to be there) - placed before the timing to make the timing more accurate*/
 +#ifdef NOGMX
 +            if (times != NULL && thread == 0)
 +            {
 +                time_local+=MPI_Wtime()-time;
 +            }
 +#endif
 +
 +        /* ---------- END SPLIT , START TRANSPOSE------------ */
 +
 +            if (thread == 0)
 +            {
 +#ifdef NOGMX
 +                if (times!=0)
 +                {
 +                    time=MPI_Wtime();
 +                }
 +#else
 +                wallcycle_start(times,ewcPME_FFTCOMM);
 +#endif
 +#ifdef FFT5D_MPI_TRANSPOSE
 +                FFTW(execute)(mpip[s]);
 +#else
 +#ifdef GMX_MPI
 +                if ((s==0 && !(plan->flags&FFT5D_ORDER_YZ)) || (s==1 && (plan->flags&FFT5D_ORDER_YZ)))
 +                    MPI_Alltoall(lout2,N[s]*pM[s]*K[s]*sizeof(t_complex)/sizeof(real),GMX_MPI_REAL,lout3,N[s]*pM[s]*K[s]*sizeof(t_complex)/sizeof(real),GMX_MPI_REAL,cart[s]);
 +                else
 +                    MPI_Alltoall(lout2,N[s]*M[s]*pK[s]*sizeof(t_complex)/sizeof(real),GMX_MPI_REAL,lout3,N[s]*M[s]*pK[s]*sizeof(t_complex)/sizeof(real),GMX_MPI_REAL,cart[s]);
 +#else
 +                gmx_incons("fft5d MPI call without MPI configuration");
 +#endif /*GMX_MPI*/
 +#endif /*FFT5D_MPI_TRANSPOSE*/
 +#ifdef NOGMX
 +                if (times!=0)
 +                {
 +                    time_mpi[s]=MPI_Wtime()-time;
 +                }
 +#else
 +                wallcycle_stop(times,ewcPME_FFTCOMM);
 +#endif
 +            }  /*master*/
 +        }  /* bParallelDim */
 +#pragma omp barrier  /*needed for both the parallel and the non-parallel dimension (we either have to wait for data from the AlltoAll or from the last FFT)*/
 +
 +        /* ---------- END SPLIT + TRANSPOSE------------ */
 +
 +        /* ---------- START JOIN ------------ */
 +#ifdef NOGMX
 +        if (times != NULL && thread == 0)
 +        {
 +            time=MPI_Wtime();
 +        }
 +#endif
 +
 +        if (bParallelDim) {
 +            joinin = lout3;
 +        } else {
 +            joinin = fftout;
 +        }
 +        /*bring the data back into matrix form,
 +          thus making the new 1st axis contiguous,
 +          and also do the local transpose of dimensions 1 and 2/3;
 +          runs on the thread used for the following FFT (thus needing a barrier before but not afterwards)
 +        */
 +        if ((s==0 && !(plan->flags&FFT5D_ORDER_YZ)) || (s==1 && (plan->flags&FFT5D_ORDER_YZ))) {
 +            if (pM[s]>0)
 +            {
 +                tstart = ( thread   *pM[s]*pN[s]/plan->nthreads);
 +                tend   = ((thread+1)*pM[s]*pN[s]/plan->nthreads);
 +                joinAxesTrans13(lin,joinin,N[s],pM[s],K[s],pN[s],pM[s],pK[s],P[s],C[s+1],iNin[s+1],oNin[s+1],tstart%pM[s],tstart/pM[s],tend%pM[s],tend/pM[s]);
 +            }
 +        }
 +        else {
 +            if (pN[s]>0)
 +            {
 +                tstart = ( thread   *pK[s]*pN[s]/plan->nthreads);
 +                tend   = ((thread+1)*pK[s]*pN[s]/plan->nthreads);
 +                joinAxesTrans12(lin,joinin,N[s],M[s],pK[s],pN[s],pM[s],pK[s],P[s],C[s+1],iNin[s+1],oNin[s+1],tstart%pN[s],tstart/pN[s],tend%pN[s],tend/pN[s]);
 +            }
 +        }
 +
 +#ifdef NOGMX
 +        if (times != NULL && thread == 0)
 +        {
 +            time_local+=MPI_Wtime()-time;
 +        }
 +#endif
 +        if (plan->flags&FFT5D_DEBUG && thread == 0)
 +        {
 +            print_localdata(lin, "%d %d: transposed %d\n", s+1, plan);
 +        }
 +        /* ---------- END JOIN ------------ */
 +
 +        /*if (debug) print_localdata(lin, "%d %d: transposed x-z\n", N1, M0, K, ZYX, coor);*/
 +    }  /* for(s=0;s<2;s++) */
 +#ifdef NOGMX
 +        if (times != NULL && thread == 0)
 +        {
 +            time=MPI_Wtime();
 +        }
 +#endif
 +
 +    if (plan->flags&FFT5D_INPLACE) lout=lin; /*in place currently not supported*/
 +
 +    /*  ----------- FFT ----------- */
 +    tstart = (thread*pM[s]*pK[s]/plan->nthreads)*C[s];
 +    if ((plan->flags&FFT5D_REALCOMPLEX) && (plan->flags&FFT5D_BACKWARD)) {
 +        gmx_fft_many_1d_real(p1d[s][thread],(plan->flags&FFT5D_BACKWARD)?GMX_FFT_COMPLEX_TO_REAL:GMX_FFT_REAL_TO_COMPLEX,lin+tstart,lout+tstart);
 +    } else {
 +        gmx_fft_many_1d(     p1d[s][thread],(plan->flags&FFT5D_BACKWARD)?GMX_FFT_BACKWARD:GMX_FFT_FORWARD,               lin+tstart,lout+tstart);
 +    }
 +    /* ------------ END FFT ---------*/
 +
 +#ifdef NOGMX
 +    if (times != NULL && thread == 0)
 +    {
 +        time_fft+=MPI_Wtime()-time;
 +
 +        times->fft+=time_fft;
 +        times->local+=time_local;
 +        times->mpi2+=time_mpi[1];
 +        times->mpi1+=time_mpi[0];
 +    }
 +#endif
 +
 +    if (plan->flags&FFT5D_DEBUG && thread == 0)
 +    {
 +        print_localdata(lout, "%d %d: FFT %d\n", s, plan);
 +    }
 +}
 +
 +void fft5d_destroy(fft5d_plan plan) {
 +    int s,t;
 +
 +    for (s=0;s<3;s++)
 +    {
 +        if (plan->p1d[s])
 +        {
 +            for (t=0;t<plan->nthreads;t++)
 +            {
 +                gmx_many_fft_destroy(plan->p1d[s][t]);
 +            }
 +            free(plan->p1d[s]);
 +        }
 +        if (plan->iNin[s])
 +        {
 +            free(plan->iNin[s]);
 +            plan->iNin[s]=0;
 +        }
 +        if (plan->oNin[s])
 +        {
 +            free(plan->oNin[s]);
 +            plan->oNin[s]=0;
 +        }
 +        if (plan->iNout[s])
 +        {
 +            free(plan->iNout[s]);
 +            plan->iNout[s]=0;
 +        }
 +        if (plan->oNout[s])
 +        {
 +            free(plan->oNout[s]);
 +            plan->oNout[s]=0;
 +        }
 +    }
 +#ifdef GMX_FFT_FFTW3 
 +    FFTW_LOCK;
 +#ifdef FFT5D_MPI_TRANSPOS
 +    for (s=0;s<2;s++)    
 +    {
 +        FFTW(destroy_plan)(plan->mpip[s]);
 +    }
 +#endif /* FFT5D_MPI_TRANSPOS */
 +    if (plan->p3d)
 +    {
 +        FFTW(destroy_plan)(plan->p3d);
 +    }
 +    FFTW_UNLOCK;
 +#endif /* GMX_FFT_FFTW3 */
 +
 +    if (!(plan->flags&FFT5D_NOMALLOC))
 +    {
 +        sfree_aligned(plan->lin);
 +        sfree_aligned(plan->lout);
 +        if (plan->nthreads > 1)
 +        {
 +            sfree_aligned(plan->lout2);
 +            sfree_aligned(plan->lout3);
 +        }
 +    }
 +    
 +#ifdef FFT5D_THREADS
 +#ifdef FFT5D_FFTW_THREADS
 +    /*FFTW(cleanup_threads)();*/
 +#endif
 +#endif
 +
 +    free(plan);
 +}
 +
 +/*Is this better than direct access to the plan? is it enough data?
 +  here 0,1 refer to the processor grid dimension the data is divided by (not the FFT step!)*/
 +void fft5d_local_size(fft5d_plan plan,int* N1,int* M0,int* K0,int* K1,int** coor) {
 +    *N1=plan->N[0];
 +    *M0=plan->M[0];
 +    *K1=plan->K[0];
 +    *K0=plan->N[1];
 +    
 +    *coor=plan->coor;
 +}
 +
 +
 +/*same as fft5d_plan_3d but with a cartesian communicator and automatic splitting
 +  of the processor dimensions*/
 +fft5d_plan fft5d_plan_3d_cart(int NG, int MG, int KG, MPI_Comm comm, int P0, int flags, t_complex** rlin, t_complex** rlout, t_complex** rlout2, t_complex** rlout3, int nthreads) {
 +    MPI_Comm cart[2]={0};
 +#ifdef GMX_MPI
 +    int size=1,prank=0;
 +    int P[2];
 +    int coor[2];
 +    int wrap[]={0,0};
 +    MPI_Comm gcart;
 +    int rdim1[] = {0,1}, rdim2[] = {1,0};
 +
 +    MPI_Comm_size(comm,&size);
 +    MPI_Comm_rank(comm,&prank);
 +
 +    if (P0==0) P0 = lfactor(size);
 +    if (size%P0!=0)
 +    {
 +        if (prank==0) printf("FFT5D: WARNING: Number of processors %d not evenly divisible by %d\n",size,P0);
 +        P0 = lfactor(size);
 +    }
 +        
 +    P[0] = P0; P[1]=size/P0; /*number of processors in the two dimensions*/
 +    
 +    /*Difference between x-y-z regarding 2d decomposition is whether they are 
 +      distributed along axis 1, 2 or both*/
 +    
 +    MPI_Cart_create(comm,2,P,wrap,1,&gcart); /*parameter 4: value 1: reorder*/
 +    MPI_Cart_get(gcart,2,P,wrap,coor); 
 +    MPI_Cart_sub(gcart, rdim1 , &cart[0]);
 +    MPI_Cart_sub(gcart, rdim2 , &cart[1]);
 +#endif
 +    return fft5d_plan_3d(NG, MG, KG, cart, flags, rlin, rlout,rlout2,rlout3,nthreads);
 +}
 +
 +
 +
 +/*prints in original coordinate system of data (as the input to FFT)*/
 +void fft5d_compare_data(const t_complex* lin, const t_complex* in, fft5d_plan plan, int bothLocal, int normalize) {
 +    int xs[3],xl[3],xc[3],NG[3];
 +    int x,y,z,l;
 +    int *coor = plan->coor;
 +    int ll=2; /*compare ll values per element (has to be 2 for complex)*/
 +    if ((plan->flags&FFT5D_REALCOMPLEX) && (plan->flags&FFT5D_BACKWARD))
 +    {
 +        ll=1;
 +    }
 +
 +    compute_offsets(plan,xs,xl,xc,NG,2);
 +    if (plan->flags&FFT5D_DEBUG) printf("Compare2\n");
 +    for (z=0;z<xl[2];z++) {
 +        for(y=0;y<xl[1];y++) {
 +            if (plan->flags&FFT5D_DEBUG) printf("%d %d: ",coor[0],coor[1]);
 +            for (x=0;x<xl[0];x++) {
 +                for (l=0;l<ll;l++) { /*loop over real/complex parts*/
 +                    real a,b;
 +                    a=((real*)lin)[(z*xs[2]+y*xs[1])*2+x*xs[0]*ll+l];
 +                    if (normalize) a/=plan->rC[0]*plan->rC[1]*plan->rC[2];
 +                    if (!bothLocal) 
 +                        b=((real*)in)[((z+xc[2])*NG[0]*NG[1]+(y+xc[1])*NG[0])*2+(x+xc[0])*ll+l];
 +                    else 
 +                        b=((real*)in)[(z*xs[2]+y*xs[1])*2+x*xs[0]*ll+l];
 +                    if (plan->flags&FFT5D_DEBUG) {
 +                        printf("%f %f, ",a,b);
 +                    } else {
 +                        if (fabs(a-b)>2*NG[0]*NG[1]*NG[2]*GMX_REAL_EPS) {
 +                            printf("result incorrect on %d,%d at %d,%d,%d: FFT5D:%f reference:%f\n",coor[0],coor[1],x,y,z,a,b);
 +                        }
 +/*                        assert(fabs(a-b)<2*NG[0]*NG[1]*NG[2]*GMX_REAL_EPS);*/
 +                    }
 +                }
 +                if (plan->flags&FFT5D_DEBUG) printf(",");
 +            }
 +            if (plan->flags&FFT5D_DEBUG) printf("\n");
 +        }
 +    }
 +    
 +}
 +
index 304ef6081ce987d45733b204f62692b7f08474f8,0000000000000000000000000000000000000000..934a8d939c922b27f819439794b603ee93599d87
mode 100644,000000..100644
--- /dev/null
@@@ -1,2748 -1,0 +1,2755 @@@
-     *kernel_type = nbk4x4_PlainC;
 +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
 + *
 + * 
 + *                This source code is part of
 + * 
 + *                 G   R   O   M   A   C   S
 + * 
 + *          GROningen MAchine for Chemical Simulations
 + * 
 + *                        VERSION 3.2.0
 + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
 + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 + * Copyright (c) 2001-2004, The GROMACS development team,
 + * check out http://www.gromacs.org for more information.
 +
 + * This program is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU General Public License
 + * as published by the Free Software Foundation; either version 2
 + * of the License, or (at your option) any later version.
 + * 
 + * If you want to redistribute modifications, please consider that
 + * scientific software is very special. Version control is crucial -
 + * bugs must be traceable. We will be happy to consider code for
 + * inclusion in the official distribution, but derived work must not
 + * be called official GROMACS. Details are found in the README & COPYING
 + * files - if they are missing, get the official version at www.gromacs.org.
 + * 
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the papers on the package - you can find them in the top README file.
 + *
 + * For more info, check our website at http://www.gromacs.org
 + * 
 + * And Hey:
 + * GROwing Monsters And Cloning Shrimps
 + */
 +#ifdef HAVE_CONFIG_H
 +#include <config.h>
 +#endif
 +
 +#include <math.h>
 +#include <string.h>
 +#include <assert.h>
 +#include "sysstuff.h"
 +#include "typedefs.h"
 +#include "vec.h"
 +#include "maths.h"
 +#include "macros.h"
 +#include "smalloc.h"
 +#include "macros.h"
 +#include "gmx_fatal.h"
 +#include "gmx_fatal_collective.h"
 +#include "physics.h"
 +#include "force.h"
 +#include "tables.h"
 +#include "nonbonded.h"
 +#include "invblock.h"
 +#include "names.h"
 +#include "network.h"
 +#include "pbc.h"
 +#include "ns.h"
 +#include "mshift.h"
 +#include "txtdump.h"
 +#include "coulomb.h"
 +#include "md_support.h"
 +#include "md_logging.h"
 +#include "domdec.h"
 +#include "partdec.h"
 +#include "qmmm.h"
 +#include "copyrite.h"
 +#include "mtop_util.h"
 +#include "nbnxn_search.h"
 +#include "nbnxn_atomdata.h"
 +#include "nbnxn_consts.h"
 +#include "statutil.h"
 +#include "gmx_omp_nthreads.h"
 +
 +#ifdef _MSC_VER
 +/* MSVC definition for __cpuid() */
 +#include <intrin.h>
 +#endif
 +
 +#include "types/nbnxn_cuda_types_ext.h"
 +#include "gpu_utils.h"
 +#include "nbnxn_cuda_data_mgmt.h"
 +#include "pmalloc_cuda.h"
 +
 +t_forcerec *mk_forcerec(void)
 +{
 +  t_forcerec *fr;
 +  
 +  snew(fr,1);
 +  
 +  return fr;
 +}
 +
 +#ifdef DEBUG
 +static void pr_nbfp(FILE *fp,real *nbfp,gmx_bool bBHAM,int atnr)
 +{
 +  int i,j;
 +  
 +  for(i=0; (i<atnr); i++) {
 +    for(j=0; (j<atnr); j++) {
 +      fprintf(fp,"%2d - %2d",i,j);
 +      if (bBHAM)
 +      fprintf(fp,"  a=%10g, b=%10g, c=%10g\n",BHAMA(nbfp,atnr,i,j),
 +              BHAMB(nbfp,atnr,i,j),BHAMC(nbfp,atnr,i,j)/6.0);
 +      else
 +      fprintf(fp,"  c6=%10g, c12=%10g\n",C6(nbfp,atnr,i,j)/6.0,
 +            C12(nbfp,atnr,i,j)/12.0);
 +    }
 +  }
 +}
 +#endif
 +
 +static real *mk_nbfp(const gmx_ffparams_t *idef,gmx_bool bBHAM)
 +{
 +  real *nbfp;
 +  int  i,j,k,atnr;
 +  
 +  atnr=idef->atnr;
 +  if (bBHAM) {
 +    snew(nbfp,3*atnr*atnr);
 +    for(i=k=0; (i<atnr); i++) {
 +      for(j=0; (j<atnr); j++,k++) {
 +          BHAMA(nbfp,atnr,i,j) = idef->iparams[k].bham.a;
 +          BHAMB(nbfp,atnr,i,j) = idef->iparams[k].bham.b;
 +          /* nbfp now includes the 6.0 derivative prefactor */
 +          BHAMC(nbfp,atnr,i,j) = idef->iparams[k].bham.c*6.0;
 +      }
 +    }
 +  }
 +  else {
 +    snew(nbfp,2*atnr*atnr);
 +    for(i=k=0; (i<atnr); i++) {
 +      for(j=0; (j<atnr); j++,k++) {
 +          /* nbfp now includes the 6.0/12.0 derivative prefactors */
 +          C6(nbfp,atnr,i,j)   = idef->iparams[k].lj.c6*6.0;
 +          C12(nbfp,atnr,i,j)  = idef->iparams[k].lj.c12*12.0;
 +      }
 +    }
 +  }
 +
 +  return nbfp;
 +}
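 +
 +/* Design note: storing 6*C6 and 12*C12 matches the derivative prefactors of
 + * V(r) = C12/r^12 - C6/r^6, so the kernels can form F(r)*r =
 + * 12*C12/r^12 - 6*C6/r^6 without extra multiplications. */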
 +
 +/* This routine sets fr->solvent_opt to the most common solvent in the 
 + * system, e.g. esolSPC or esolTIP4P. It will also mark each charge group in 
 + * the fr->solvent_type array with the correct type (or esolNO).
 + *
 + * Charge groups that fulfill the conditions but are not identical to the
 + * most common one will be marked as esolNO in the solvent_type array. 
 + *
 + * TIP3p is identical to SPC for these purposes, so we call it
 + * SPC in the arrays (Apologies to Bill Jorgensen ;-)
 + * 
 + * NOTE: A QM particle should never
 + * become an optimized solvent, not even if there is only one charge
 + * group in the QM group.
 + */
 +
 +typedef struct 
 +{
 +    int    model;          
 +    int    count;
 +    int    vdwtype[4];
 +    real   charge[4];
 +} solvent_parameters_t;
 +
 +static void
 +check_solvent_cg(const gmx_moltype_t   *molt,
 +                 int                   cg0,
 +                 int                   nmol,
 +                 const unsigned char   *qm_grpnr,
 +                 const t_grps          *qm_grps,
 +                 t_forcerec *          fr,
 +                 int                   *n_solvent_parameters,
 +                 solvent_parameters_t  **solvent_parameters_p,
 +                 int                   cginfo,
 +                 int                   *cg_sp)
 +{
 +    const t_blocka *  excl;
 +    t_atom            *atom;
 +    int               j,k;
 +    int               j0,j1,nj;
 +    gmx_bool              perturbed;
 +    gmx_bool              has_vdw[4];
 +    gmx_bool              match;
 +    real              tmp_charge[4];
 +    int               tmp_vdwtype[4];
 +    int               tjA;
 +    gmx_bool              qm;
 +    solvent_parameters_t *solvent_parameters;
 +
 +    /* We use a list with parameters for each solvent type. 
 +     * Every time we discover a new molecule that fulfills the basic 
 +     * conditions for a solvent we compare with the previous entries
 +     * in these lists. If the parameters are the same we just increment
 +     * the counter for that type, and otherwise we create a new type
 +     * based on the current molecule.
 +     *
 +     * Once we've finished going through all molecules we check which
 +     * solvent is most common, and mark all those molecules while we
 +     * clear the flag on all others.
 +     */   
 +
 +    solvent_parameters = *solvent_parameters_p;
 +
 +    /* First mark the cg as non-optimized */
 +    *cg_sp = -1;
 +    
 +    /* Check that this cg has no exclusions with atoms in other charge groups
 +     * and that all atoms inside the charge group are excluded from each other.
 +     * We only have 3- or 4-atom solvent loops.
 +     */
 +    if (GET_CGINFO_EXCL_INTER(cginfo) ||
 +        !GET_CGINFO_EXCL_INTRA(cginfo))
 +    {
 +        return;
 +    }
 +
 +    /* Get the first and last+1 atom indices of this charge group */
 +    j0     = molt->cgs.index[cg0];
 +    j1     = molt->cgs.index[cg0+1];
 +    
 +    /* Number of atoms in this charge group */
 +    nj     = j1 - j0;
 +
 +    if (debug) {
 +        fprintf(debug,
 +                "Moltype '%s': there are %d atoms in this charge group\n",
 +                *molt->name,nj);
 +    }
 +    
 +    /* Check if it could be an SPC (3 atoms) or TIP4p (4) water,
 +     * otherwise skip it.
 +     */
 +    if (nj<3 || nj>4)
 +    {
 +        return;
 +    }
 +    
 +    /* Check if we are doing QM on this group */
 +    qm = FALSE; 
 +    if (qm_grpnr != NULL)
 +    {
 +        for(j=j0 ; j<j1 && !qm; j++)
 +        {
 +            qm = (qm_grpnr[j] < qm_grps->nr - 1);
 +        }
 +    }
 +    /* Cannot use solvent optimization with QM */
 +    if (qm)
 +    {
 +        return;
 +    }
 +    
 +    atom = molt->atoms.atom;
 +
 +    /* Still looks like a solvent, time to check parameters */
 +    
 +    /* If it is perturbed (free energy) we can't use the solvent loops,
 +     * so then we just skip to the next molecule.
 +     */   
 +    perturbed = FALSE; 
 +    
 +    for(j=j0; j<j1 && !perturbed; j++)
 +    {
 +        perturbed = PERTURBED(atom[j]);
 +    }
 +    
 +    if (perturbed)
 +    {
 +        return;
 +    }
 +    
 +    /* Now it's only a question of whether the VdW and charge parameters
 +     * are OK. Before doing the check we compare them with the parameters of
 +     * a possible previous solvent type.
 +     * First we assign the current types and charges.
 +     */
 +    for(j=0; j<nj; j++)
 +    {
 +        tmp_vdwtype[j] = atom[j0+j].type;
 +        tmp_charge[j]  = atom[j0+j].q;
 +    } 
 +    
 +    /* Does it match any previous solvent type? */
 +    for(k=0 ; k<*n_solvent_parameters; k++)
 +    {
 +        match = TRUE;
 +        
 +        
 +        /* We can only match SPC with 3 atoms and TIP4p with 4 atoms */
 +        if( (solvent_parameters[k].model==esolSPC   && nj!=3)  ||
 +            (solvent_parameters[k].model==esolTIP4P && nj!=4) )
 +            match = FALSE;
 +        
 +        /* Check that types & charges match for all atoms in molecule */
 +        for(j=0 ; j<nj && match==TRUE; j++)
 +        {                     
 +            if (tmp_vdwtype[j] != solvent_parameters[k].vdwtype[j])
 +            {
 +                match = FALSE;
 +            }
 +            if(tmp_charge[j] != solvent_parameters[k].charge[j])
 +            {
 +                match = FALSE;
 +            }
 +        }
 +        if (match == TRUE)
 +        {
 +            /* Congratulations! We have a matched solvent.
 +             * Flag it with this type for later processing.
 +             */
 +            *cg_sp = k;
 +            solvent_parameters[k].count += nmol;
 +
 +            /* We are done with this charge group */
 +            return;
 +        }
 +    }
 +    
 +    /* If we get here, we have a tentative new solvent type.
 +     * Before we add it we must check that it fulfills the requirements
 +     * of the solvent optimized loops. First determine which atoms have
 +     * VdW interactions.   
 +     */
 +    for(j=0; j<nj; j++) 
 +    {
 +        has_vdw[j] = FALSE;
 +        tjA        = tmp_vdwtype[j];
 +        
 +        /* Go through all other types and see if any have non-zero
 +         * VdW parameters when combined with this one.
 +         */   
 +        for(k=0; k<fr->ntype && (has_vdw[j]==FALSE); k++)
 +        {
 +            /* We already checked that the atoms weren't perturbed,
 +             * so we only need to check state A now.
 +             */ 
 +            if (fr->bBHAM) 
 +            {
 +                has_vdw[j] = (has_vdw[j] || 
 +                              (BHAMA(fr->nbfp,fr->ntype,tjA,k) != 0.0) ||
 +                              (BHAMB(fr->nbfp,fr->ntype,tjA,k) != 0.0) ||
 +                              (BHAMC(fr->nbfp,fr->ntype,tjA,k) != 0.0));
 +            }
 +            else
 +            {
 +                /* Standard LJ */
 +                has_vdw[j] = (has_vdw[j] || 
 +                              (C6(fr->nbfp,fr->ntype,tjA,k)  != 0.0) ||
 +                              (C12(fr->nbfp,fr->ntype,tjA,k) != 0.0));
 +            }
 +        }
 +    }
 +    
 +    /* Now we know all we need to make the final check and assignment. */
 +    if (nj == 3)
 +    {
 +        /* So, is it an SPC?
 +         * For this we require that all atoms have charge,
 +         * the charges on atoms 2 & 3 should be the same, and only
 +         * atom 1 might have VdW.
 +         */
 +        if (has_vdw[1] == FALSE &&
 +            has_vdw[2] == FALSE &&
 +            tmp_charge[0]  != 0 &&
 +            tmp_charge[1]  != 0 &&
 +            tmp_charge[2]  == tmp_charge[1])
 +        {
 +            srenew(solvent_parameters,*n_solvent_parameters+1);
 +            solvent_parameters[*n_solvent_parameters].model = esolSPC;
 +            solvent_parameters[*n_solvent_parameters].count = nmol;
 +            for(k=0;k<3;k++)
 +            {
 +                solvent_parameters[*n_solvent_parameters].vdwtype[k] = tmp_vdwtype[k];
 +                solvent_parameters[*n_solvent_parameters].charge[k]  = tmp_charge[k];
 +            }
 +
 +            *cg_sp = *n_solvent_parameters;
 +            (*n_solvent_parameters)++;
 +        }
 +    }
 +    else if (nj==4)
 +    {
 +        /* Or could it be a TIP4P?
 +         * For this we require that atoms 2, 3 and 4 have charge, but not atom 1.
 +         * Only atom 1 might have VdW.
 +         */
 +        if(has_vdw[1] == FALSE &&
 +           has_vdw[2] == FALSE &&
 +           has_vdw[3] == FALSE &&
 +           tmp_charge[0]  == 0 &&
 +           tmp_charge[1]  != 0 &&
 +           tmp_charge[2]  == tmp_charge[1] &&
 +           tmp_charge[3]  != 0)
 +        {
 +            srenew(solvent_parameters,*n_solvent_parameters+1);
 +            solvent_parameters[*n_solvent_parameters].model = esolTIP4P;
 +            solvent_parameters[*n_solvent_parameters].count = nmol;
 +            for(k=0;k<4;k++)
 +            {
 +                solvent_parameters[*n_solvent_parameters].vdwtype[k] = tmp_vdwtype[k];
 +                solvent_parameters[*n_solvent_parameters].charge[k]  = tmp_charge[k];
 +            }
 +            
 +            *cg_sp = *n_solvent_parameters;
 +            (*n_solvent_parameters)++;
 +        }
 +    }
 +
 +    *solvent_parameters_p = solvent_parameters;
 +}
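
To make the two acceptance patterns above concrete, here is an illustrative sketch using the customary SPC and TIP4P charges (example values from the common water models, not constants taken from this code):

/* Illustrative patterns accepted by check_solvent_cg:
 *
 *   SPC-like, nj == 3:    O      H      H
 *     charge:           -0.82  +0.41  +0.41   all non-zero, q[2] == q[1]
 *     VdW:               yes    no     no     only atom 0 may have VdW
 *
 *   TIP4P-like, nj == 4:  O      H      H      M
 *     charge:            0.00  +0.52  +0.52  -1.04   q[0] == 0, q[2] == q[1]
 *     VdW:               yes    no     no     no     only atom 0 may have VdW
 */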
 +
 +static void
 +check_solvent(FILE *                fp,
 +              const gmx_mtop_t *    mtop,
 +              t_forcerec *          fr,
 +              cginfo_mb_t           *cginfo_mb)
 +{
 +    const t_block *   cgs;
 +    const t_block *   mols;
 +    const gmx_moltype_t *molt;
 +    int               mb,mol,cg_mol,at_offset,cg_offset,am,cgm,i,nmol_ch,nmol;
 +    int               n_solvent_parameters;
 +    solvent_parameters_t *solvent_parameters;
 +    int               **cg_sp;
 +    int               bestsp,bestsol;
 +
 +    if (debug)
 +    {
 +        fprintf(debug,"Going to determine what solvent types we have.\n");
 +    }
 +
 +    mols = &mtop->mols;
 +
 +    n_solvent_parameters = 0;
 +    solvent_parameters = NULL;
 +    /* Allocate temporary array for solvent type */
 +    snew(cg_sp,mtop->nmolblock);
 +
 +    cg_offset = 0;
 +    at_offset = 0;
 +    for(mb=0; mb<mtop->nmolblock; mb++)
 +    {
 +        molt = &mtop->moltype[mtop->molblock[mb].type];
 +        cgs  = &molt->cgs;
 +        /* Here we have to loop over all individual molecules
 +         * because we need to check for QMMM particles.
 +         */
 +        snew(cg_sp[mb],cginfo_mb[mb].cg_mod);
 +        nmol_ch = cginfo_mb[mb].cg_mod/cgs->nr;
 +        nmol    = mtop->molblock[mb].nmol/nmol_ch;
 +        for(mol=0; mol<nmol_ch; mol++)
 +        {
 +            cgm = mol*cgs->nr;
 +            am  = mol*cgs->index[cgs->nr];
 +            for(cg_mol=0; cg_mol<cgs->nr; cg_mol++)
 +            {
 +                check_solvent_cg(molt,cg_mol,nmol,
 +                                 mtop->groups.grpnr[egcQMMM] ?
 +                                 mtop->groups.grpnr[egcQMMM]+at_offset+am : 0,
 +                                 &mtop->groups.grps[egcQMMM],
 +                                 fr,
 +                                 &n_solvent_parameters,&solvent_parameters,
 +                                 cginfo_mb[mb].cginfo[cgm+cg_mol],
 +                                 &cg_sp[mb][cgm+cg_mol]);
 +            }
 +        }
 +        cg_offset += cgs->nr;
 +        at_offset += cgs->index[cgs->nr];
 +    }
 +
 +    /* Puh! We finished going through all charge groups.
 +     * Now find the most common solvent model.
 +     */   
 +    
 +    /* Most common solvent this far */
 +    bestsp = -2;
 +    for(i=0;i<n_solvent_parameters;i++)
 +    {
 +        if (bestsp == -2 ||
 +            solvent_parameters[i].count > solvent_parameters[bestsp].count)
 +        {
 +            bestsp = i;
 +        }
 +    }
 +    
 +    if (bestsp >= 0)
 +    {
 +        bestsol = solvent_parameters[bestsp].model;
 +    }
 +    else
 +    {
 +        bestsol = esolNO;
 +    }
 +    
 +#ifdef DISABLE_WATER_NLIST
 +    bestsol = esolNO;
 +#endif
 +
 +    fr->nWatMol = 0;
 +    for(mb=0; mb<mtop->nmolblock; mb++)
 +    {
 +        cgs = &mtop->moltype[mtop->molblock[mb].type].cgs;
 +        nmol = (mtop->molblock[mb].nmol*cgs->nr)/cginfo_mb[mb].cg_mod;
 +        for(i=0; i<cginfo_mb[mb].cg_mod; i++)
 +        {
 +            if (cg_sp[mb][i] == bestsp)
 +            {
 +                SET_CGINFO_SOLOPT(cginfo_mb[mb].cginfo[i],bestsol);
 +                fr->nWatMol += nmol;
 +            }
 +            else
 +            {
 +                SET_CGINFO_SOLOPT(cginfo_mb[mb].cginfo[i],esolNO);
 +            }
 +        }
 +        sfree(cg_sp[mb]);
 +    }
 +    sfree(cg_sp);
 +    
 +    if (bestsol != esolNO && fp!=NULL)
 +    {
 +        fprintf(fp,"\nEnabling %s-like water optimization for %d molecules.\n\n",
 +                esol_names[bestsol],
 +                solvent_parameters[bestsp].count);
 +    }
 +
 +    sfree(solvent_parameters);
 +    fr->solvent_opt = bestsol;
 +}
 +
 +enum { acNONE=0, acCONSTRAINT, acSETTLE };
 +
 +static cginfo_mb_t *init_cginfo_mb(FILE *fplog,const gmx_mtop_t *mtop,
 +                                   t_forcerec *fr,gmx_bool bNoSolvOpt,
 +                                   gmx_bool *bExcl_IntraCGAll_InterCGNone)
 +{
 +    const t_block *cgs;
 +    const t_blocka *excl;
 +    const gmx_moltype_t *molt;
 +    const gmx_molblock_t *molb;
 +    cginfo_mb_t *cginfo_mb;
 +    gmx_bool *type_VDW;
 +    int  *cginfo;
 +    int  cg_offset,a_offset,cgm,am;
 +    int  mb,m,ncg_tot,cg,a0,a1,gid,ai,j,aj,excl_nalloc;
 +    int  *a_con;
 +    int  ftype;
 +    int  ia;
 +    gmx_bool bId,*bExcl,bExclIntraAll,bExclInter,bHaveVDW,bHaveQ;
 +
 +    ncg_tot = ncg_mtop(mtop);
 +    snew(cginfo_mb,mtop->nmolblock);
 +
 +    snew(type_VDW,fr->ntype);
 +    for(ai=0; ai<fr->ntype; ai++)
 +    {
 +        type_VDW[ai] = FALSE;
 +        for(j=0; j<fr->ntype; j++)
 +        {
 +            type_VDW[ai] = type_VDW[ai] ||
 +                fr->bBHAM ||
 +                C6(fr->nbfp,fr->ntype,ai,j) != 0 ||
 +                C12(fr->nbfp,fr->ntype,ai,j) != 0;
 +        }
 +    }
 +
 +    *bExcl_IntraCGAll_InterCGNone = TRUE;
 +
 +    excl_nalloc = 10;
 +    snew(bExcl,excl_nalloc);
 +    cg_offset = 0;
 +    a_offset  = 0;
 +    for(mb=0; mb<mtop->nmolblock; mb++)
 +    {
 +        molb = &mtop->molblock[mb];
 +        molt = &mtop->moltype[molb->type];
 +        cgs  = &molt->cgs;
 +        excl = &molt->excls;
 +
 +        /* Check if the cginfo is identical for all molecules in this block.
 +         * If so, we only need an array of the size of one molecule.
 +         * Otherwise we make an array of #mol times #cgs per molecule.
 +         */
 +        bId = TRUE;
 +        am = 0;
 +        for(m=0; m<molb->nmol; m++)
 +        {
 +            am = m*cgs->index[cgs->nr];
 +            for(cg=0; cg<cgs->nr; cg++)
 +            {
 +                a0 = cgs->index[cg];
 +                a1 = cgs->index[cg+1];
 +                if (ggrpnr(&mtop->groups,egcENER,a_offset+am+a0) !=
 +                    ggrpnr(&mtop->groups,egcENER,a_offset   +a0))
 +                {
 +                    bId = FALSE;
 +                }
 +                if (mtop->groups.grpnr[egcQMMM] != NULL)
 +                {
 +                    for(ai=a0; ai<a1; ai++)
 +                    {
 +                        if (mtop->groups.grpnr[egcQMMM][a_offset+am+ai] !=
 +                            mtop->groups.grpnr[egcQMMM][a_offset   +ai])
 +                        {
 +                            bId = FALSE;
 +                        }
 +                    }
 +                }
 +            }
 +        }
 +
 +        cginfo_mb[mb].cg_start = cg_offset;
 +        cginfo_mb[mb].cg_end   = cg_offset + molb->nmol*cgs->nr;
 +        cginfo_mb[mb].cg_mod   = (bId ? 1 : molb->nmol)*cgs->nr;
 +        snew(cginfo_mb[mb].cginfo,cginfo_mb[mb].cg_mod);
 +        cginfo = cginfo_mb[mb].cginfo;
 +
 +        /* Set constraints flags for constrained atoms */
 +        snew(a_con,molt->atoms.nr);
 +        for(ftype=0; ftype<F_NRE; ftype++)
 +        {
 +            if (interaction_function[ftype].flags & IF_CONSTRAINT)
 +            {
 +                int nral;
 +
 +                nral = NRAL(ftype);
 +                for(ia=0; ia<molt->ilist[ftype].nr; ia+=1+nral)
 +                {
 +                    int a;
 +
 +                    for(a=0; a<nral; a++)
 +                    {
 +                        a_con[molt->ilist[ftype].iatoms[ia+1+a]] =
 +                            (ftype == F_SETTLE ? acSETTLE : acCONSTRAINT);
 +                    }
 +                }
 +            }
 +        }
 +
 +        for(m=0; m<(bId ? 1 : molb->nmol); m++)
 +        {
 +            cgm = m*cgs->nr;
 +            am  = m*cgs->index[cgs->nr];
 +            for(cg=0; cg<cgs->nr; cg++)
 +            {
 +                a0 = cgs->index[cg];
 +                a1 = cgs->index[cg+1];
 +
 +                /* Store the energy group in cginfo */
 +                gid = ggrpnr(&mtop->groups,egcENER,a_offset+am+a0);
 +                SET_CGINFO_GID(cginfo[cgm+cg],gid);
 +                
 +                /* Check the intra/inter charge group exclusions */
 +                if (a1-a0 > excl_nalloc) {
 +                    excl_nalloc = a1 - a0;
 +                    srenew(bExcl,excl_nalloc);
 +                }
 +                /* bExclIntraAll: all intra cg interactions excluded
 +                 * bExclInter:    any inter cg interactions excluded
 +                 */
 +                bExclIntraAll = TRUE;
 +                bExclInter    = FALSE;
 +                bHaveVDW      = FALSE;
 +                bHaveQ        = FALSE;
 +                for(ai=a0; ai<a1; ai++)
 +                {
 +                    /* Check VDW and electrostatic interactions */
 +                    bHaveVDW = bHaveVDW || (type_VDW[molt->atoms.atom[ai].type] ||
 +                                            type_VDW[molt->atoms.atom[ai].typeB]);
 +                    bHaveQ  = bHaveQ    || (molt->atoms.atom[ai].q != 0 ||
 +                                            molt->atoms.atom[ai].qB != 0);
 +
 +                    /* Clear the exclusion list for atom ai */
 +                    for(aj=a0; aj<a1; aj++)
 +                    {
 +                        bExcl[aj-a0] = FALSE;
 +                    }
 +                    /* Loop over all the exclusions of atom ai */
 +                    for(j=excl->index[ai]; j<excl->index[ai+1]; j++)
 +                    {
 +                        aj = excl->a[j];
 +                        if (aj < a0 || aj >= a1)
 +                        {
 +                            bExclInter = TRUE;
 +                        }
 +                        else
 +                        {
 +                            bExcl[aj-a0] = TRUE;
 +                        }
 +                    }
 +                    /* Check if ai excludes a0 to a1 */
 +                    for(aj=a0; aj<a1; aj++)
 +                    {
 +                        if (!bExcl[aj-a0])
 +                        {
 +                            bExclIntraAll = FALSE;
 +                        }
 +                    }
 +
 +                    switch (a_con[ai])
 +                    {
 +                    case acCONSTRAINT:
 +                        SET_CGINFO_CONSTR(cginfo[cgm+cg]);
 +                        break;
 +                    case acSETTLE:
 +                        SET_CGINFO_SETTLE(cginfo[cgm+cg]);
 +                        break;
 +                    default:
 +                        break;
 +                    }
 +                }
 +                if (bExclIntraAll)
 +                {
 +                    SET_CGINFO_EXCL_INTRA(cginfo[cgm+cg]);
 +                }
 +                if (bExclInter)
 +                {
 +                    SET_CGINFO_EXCL_INTER(cginfo[cgm+cg]);
 +                }
 +                if (a1 - a0 > MAX_CHARGEGROUP_SIZE)
 +                {
 +                    /* The size in cginfo is currently only read with DD */
 +                    gmx_fatal(FARGS,"A charge group has size %d which is larger than the limit of %d atoms",a1-a0,MAX_CHARGEGROUP_SIZE);
 +                }
 +                if (bHaveVDW)
 +                {
 +                    SET_CGINFO_HAS_VDW(cginfo[cgm+cg]);
 +                }
 +                if (bHaveQ)
 +                {
 +                    SET_CGINFO_HAS_Q(cginfo[cgm+cg]);
 +                }
 +                /* Store the charge group size */
 +                SET_CGINFO_NATOMS(cginfo[cgm+cg],a1-a0);
 +
 +                if (!bExclIntraAll || bExclInter)
 +                {
 +                    *bExcl_IntraCGAll_InterCGNone = FALSE;
 +                }
 +            }
 +        }
 +
 +        sfree(a_con);
 +
 +        cg_offset += molb->nmol*cgs->nr;
 +        a_offset  += molb->nmol*cgs->index[cgs->nr];
 +    }
 +    sfree(bExcl);
 +    
 +    /* The solvent optimizer is called after the QM is initialized,
 +     * because we don't want the QM subsystem to become an
 +     * optimized solvent.
 +     */
 +
 +    check_solvent(fplog,mtop,fr,cginfo_mb);
 +    
 +    if (getenv("GMX_NO_SOLV_OPT"))
 +    {
 +        if (fplog)
 +        {
 +            fprintf(fplog,"Found environment variable GMX_NO_SOLV_OPT.\n"
 +                    "Disabling all solvent optimization\n");
 +        }
 +        fr->solvent_opt = esolNO;
 +    }
 +    if (bNoSolvOpt)
 +    {
 +        fr->solvent_opt = esolNO;
 +    }
 +    if (!fr->solvent_opt)
 +    {
 +        for(mb=0; mb<mtop->nmolblock; mb++)
 +        {
 +            for(cg=0; cg<cginfo_mb[mb].cg_mod; cg++)
 +            {
 +                SET_CGINFO_SOLOPT(cginfo_mb[mb].cginfo[cg],esolNO);
 +            }
 +        }
 +    }
 +    
 +    return cginfo_mb;
 +}
 +
 +static int *cginfo_expand(int nmb,cginfo_mb_t *cgi_mb)
 +{
 +    int ncg,mb,cg;
 +    int *cginfo;
 +
 +    ncg = cgi_mb[nmb-1].cg_end;
 +    snew(cginfo,ncg);
 +    mb = 0;
 +    for(cg=0; cg<ncg; cg++)
 +    {
 +        while (cg >= cgi_mb[mb].cg_end)
 +        {
 +            mb++;
 +        }
 +        cginfo[cg] =
 +            cgi_mb[mb].cginfo[(cg - cgi_mb[mb].cg_start) % cgi_mb[mb].cg_mod];
 +    }
 +
 +    return cginfo;
 +}
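
The modulo in cginfo_expand undoes the compression done in init_cginfo_mb: when all molecules of a block are identical, only one molecule's worth of cginfo is stored, and every global charge-group index wraps back onto it. A small sketch with hypothetical numbers:

/* Hypothetical block: cg_start = 100, cg_end = 250, cg_mod = 3, i.e. 50
 * identical molecules of 3 charge groups each, stored only once.
 * Global charge group 137 then reads stored entry (137 - 100) % 3 = 1.
 */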
 +
 +static void set_chargesum(FILE *log,t_forcerec *fr,const gmx_mtop_t *mtop)
 +{
 +    double qsum,q2sum,q;
 +    int    mb,nmol,i;
 +    const t_atoms *atoms;
 +    
 +    qsum  = 0;
 +    q2sum = 0;
 +    for(mb=0; mb<mtop->nmolblock; mb++)
 +    {
 +        nmol  = mtop->molblock[mb].nmol;
 +        atoms = &mtop->moltype[mtop->molblock[mb].type].atoms;
 +        for(i=0; i<atoms->nr; i++)
 +        {
 +            q = atoms->atom[i].q;
 +            qsum  += nmol*q;
 +            q2sum += nmol*q*q;
 +        }
 +    }
 +    fr->qsum[0]  = qsum;
 +    fr->q2sum[0] = q2sum;
 +    if (fr->efep != efepNO)
 +    {
 +        qsum  = 0;
 +        q2sum = 0;
 +        for(mb=0; mb<mtop->nmolblock; mb++)
 +        {
 +            nmol  = mtop->molblock[mb].nmol;
 +            atoms = &mtop->moltype[mtop->molblock[mb].type].atoms;
 +            for(i=0; i<atoms->nr; i++)
 +            {
 +                q = atoms->atom[i].qB;
 +                qsum  += nmol*q;
 +                q2sum += nmol*q*q;
 +            }
 +        }
 +        fr->qsum[1]  = qsum;
 +        fr->q2sum[1] = q2sum;
 +    }
 +    else
 +    {
 +        fr->qsum[1]  = fr->qsum[0];
 +        fr->q2sum[1] = fr->q2sum[0];
 +    }
 +    if (log) {
 +        if (fr->efep == efepNO)
 +            fprintf(log,"System total charge: %.3f\n",fr->qsum[0]);
 +        else
 +            fprintf(log,"System total charge, top. A: %.3f top. B: %.3f\n",
 +                    fr->qsum[0],fr->qsum[1]);
 +    }
 +}
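
In formula form, with $n_b$ the number of molecules in block $b$, the loops accumulate

$$ q_{\mathrm{sum}} = \sum_b n_b \sum_{i \in b} q_i, \qquad q^2_{\mathrm{sum}} = \sum_b n_b \sum_{i \in b} q_i^2, $$

once for topology state A and, with free energy enabled, once more for state B. These sums feed the net-charge and self-energy corrections used elsewhere (a statement of intended use inferred from the surrounding code, not shown in this diff).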
 +
 +void update_forcerec(FILE *log,t_forcerec *fr,matrix box)
 +{
 +    if (fr->eeltype == eelGRF)
 +    {
 +        calc_rffac(NULL,fr->eeltype,fr->epsilon_r,fr->epsilon_rf,
 +                   fr->rcoulomb,fr->temp,fr->zsquare,box,
 +                   &fr->kappa,&fr->k_rf,&fr->c_rf);
 +    }
 +}
 +
 +void set_avcsixtwelve(FILE *fplog,t_forcerec *fr,const gmx_mtop_t *mtop)
 +{
 +    const t_atoms *atoms,*atoms_tpi;
 +    const t_blocka *excl;
 +    int    mb,nmol,nmolc,i,j,tpi,tpj,j1,j2,k,n,nexcl,q;
 +#if (defined SIZEOF_LONG_LONG_INT) && (SIZEOF_LONG_LONG_INT >= 8)    
 +    long long int  npair,npair_ij,tmpi,tmpj;
 +#else
 +    double npair, npair_ij,tmpi,tmpj;
 +#endif
 +    double csix,ctwelve;
 +    int    ntp,*typecount;
 +    gmx_bool   bBHAM;
 +    real   *nbfp;
 +
 +    ntp = fr->ntype;
 +    bBHAM = fr->bBHAM;
 +    nbfp = fr->nbfp;
 +    
 +    for(q=0; q<(fr->efep==efepNO ? 1 : 2); q++) {
 +        csix = 0;
 +        ctwelve = 0;
 +        npair = 0;
 +        nexcl = 0;
 +        if (!fr->n_tpi) {
 +            /* Count the types so we avoid natoms^2 operations */
 +            snew(typecount,ntp);
 +            for(mb=0; mb<mtop->nmolblock; mb++) {
 +                nmol  = mtop->molblock[mb].nmol;
 +                atoms = &mtop->moltype[mtop->molblock[mb].type].atoms;
 +                for(i=0; i<atoms->nr; i++) {
 +                    if (q == 0)
 +                    {
 +                        tpi = atoms->atom[i].type;
 +                    }
 +                    else
 +                    {
 +                        tpi = atoms->atom[i].typeB;
 +                    }
 +                    typecount[tpi] += nmol;
 +                }
 +            }
 +            for(tpi=0; tpi<ntp; tpi++) {
 +                for(tpj=tpi; tpj<ntp; tpj++) {
 +                    tmpi = typecount[tpi];
 +                    tmpj = typecount[tpj];
 +                    if (tpi != tpj)
 +                    {
 +                        npair_ij = tmpi*tmpj;
 +                    }
 +                    else
 +                    {
 +                        npair_ij = tmpi*(tmpi - 1)/2;
 +                    }
 +                    if (bBHAM) {
 +                        /* nbfp now includes the 6.0 derivative prefactor */
 +                        csix    += npair_ij*BHAMC(nbfp,ntp,tpi,tpj)/6.0;
 +                    } else {
 +                        /* nbfp now includes the 6.0/12.0 derivative prefactors */
 +                        csix    += npair_ij*   C6(nbfp,ntp,tpi,tpj)/6.0;
 +                        ctwelve += npair_ij*  C12(nbfp,ntp,tpi,tpj)/12.0;
 +                    }
 +                    npair += npair_ij;
 +                }
 +            }
 +            sfree(typecount);
 +            /* Subtract the excluded pairs.
 +             * The main reason for subtracting exclusions is that in some cases
 +             * some combinations might never occur and the parameters could have
 +             * any value. These unused values should not influence the dispersion
 +             * correction.
 +             */
 +            for(mb=0; mb<mtop->nmolblock; mb++) {
 +                nmol  = mtop->molblock[mb].nmol;
 +                atoms = &mtop->moltype[mtop->molblock[mb].type].atoms;
 +                excl  = &mtop->moltype[mtop->molblock[mb].type].excls;
 +                for(i=0; (i<atoms->nr); i++) {
 +                    if (q == 0)
 +                    {
 +                        tpi = atoms->atom[i].type;
 +                    }
 +                    else
 +                    {
 +                        tpi = atoms->atom[i].typeB;
 +                    }
 +                    j1  = excl->index[i];
 +                    j2  = excl->index[i+1];
 +                    for(j=j1; j<j2; j++) {
 +                        k = excl->a[j];
 +                        if (k > i)
 +                        {
 +                            if (q == 0)
 +                            {
 +                                tpj = atoms->atom[k].type;
 +                            }
 +                            else
 +                            {
 +                                tpj = atoms->atom[k].typeB;
 +                            }
 +                            if (bBHAM) {
 +                                /* nbfp now includes the 6.0 derivative prefactor */
 +                                csix -= nmol*BHAMC(nbfp,ntp,tpi,tpj)/6.0;
 +                            } else {
 +                                /* nbfp now includes the 6.0/12.0 derivative prefactors */
 +                                csix    -= nmol*C6 (nbfp,ntp,tpi,tpj)/6.0;
 +                                ctwelve -= nmol*C12(nbfp,ntp,tpi,tpj)/12.0;
 +                            }
 +                            nexcl += nmol;
 +                        }
 +                    }
 +                }
 +            }
 +        } else {
 +            /* Only correct for the interaction of the test particle
 +             * with the rest of the system.
 +             */
 +            atoms_tpi =
 +                &mtop->moltype[mtop->molblock[mtop->nmolblock-1].type].atoms;
 +
 +            npair = 0;
 +            for(mb=0; mb<mtop->nmolblock; mb++) {
 +                nmol  = mtop->molblock[mb].nmol;
 +                atoms = &mtop->moltype[mtop->molblock[mb].type].atoms;
 +                for(j=0; j<atoms->nr; j++) {
 +                    nmolc = nmol;
 +                    /* Remove the interaction of the test charge group
 +                     * with itself.
 +                     */
 +                    if (mb == mtop->nmolblock-1)
 +                    {
 +                        nmolc--;
 +                        
 +                        if (mb == 0 && nmol == 1)
 +                        {
 +                            gmx_fatal(FARGS,"Old format tpr with TPI, please generate a new tpr file");
 +                        }
 +                    }
 +                    if (q == 0)
 +                    {
 +                        tpj = atoms->atom[j].type;
 +                    }
 +                    else
 +                    {
 +                        tpj = atoms->atom[j].typeB;
 +                    }
 +                    for(i=0; i<fr->n_tpi; i++)
 +                    {
 +                        if (q == 0)
 +                        {
 +                            tpi = atoms_tpi->atom[i].type;
 +                        }
 +                        else
 +                        {
 +                            tpi = atoms_tpi->atom[i].typeB;
 +                        }
 +                        if (bBHAM)
 +                        {
 +                            /* nbfp now includes the 6.0 derivative prefactor */
 +                            csix    += nmolc*BHAMC(nbfp,ntp,tpi,tpj)/6.0;
 +                        }
 +                        else
 +                        {
 +                            /* nbfp now includes the 6.0/12.0 derivative prefactors */
 +                            csix    += nmolc*C6 (nbfp,ntp,tpi,tpj)/6.0;
 +                            ctwelve += nmolc*C12(nbfp,ntp,tpi,tpj)/12.0;
 +                        }
 +                        npair += nmolc;
 +                    }
 +                }
 +            }
 +        }
 +        if (npair - nexcl <= 0) {
 +            /* Zero the averages unconditionally; dividing by npair - nexcl
 +             * here would be invalid even when fplog is NULL.
 +             */
 +            if (fplog) {
 +                fprintf(fplog,"\nWARNING: There are no atom pairs for dispersion correction\n\n");
 +            }
 +            csix     = 0;
 +            ctwelve  = 0;
 +        } else {
 +            csix    /= npair - nexcl;
 +            ctwelve /= npair - nexcl;
 +        }
 +        if (debug) {
 +            fprintf(debug,"Counted %d exclusions\n",nexcl);
 +            fprintf(debug,"Average C6 parameter is: %10g\n",(double)csix);
 +            fprintf(debug,"Average C12 parameter is: %10g\n",(double)ctwelve);
 +        }
 +        fr->avcsix[q]    = csix;
 +        fr->avctwelve[q] = ctwelve;
 +    }
 +    if (fplog != NULL)
 +    {
 +        if (fr->eDispCorr == edispcAllEner ||
 +            fr->eDispCorr == edispcAllEnerPres)
 +        {
 +            fprintf(fplog,"Long Range LJ corr.: <C6> %10.4e, <C12> %10.4e\n",
 +                    fr->avcsix[0],fr->avctwelve[0]);
 +        }
 +        else
 +        {
 +            fprintf(fplog,"Long Range LJ corr.: <C6> %10.4e\n",fr->avcsix[0]);
 +        }
 +    }
 +}
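
Written out, the averages assigned above are (with $N_t$ the number of atoms of type $t$, and the divisions by 6 and 12 undoing the derivative prefactors stored in nbfp):

$$ \langle C_6 \rangle = \frac{\sum_{t_i \le t_j} N_{t_i t_j}\, C_6(t_i,t_j) \;-\; \sum_{\text{excluded pairs}} C_6}{N_{\text{pair}} - N_{\text{excl}}}, \qquad N_{t_i t_j} = \begin{cases} N_{t_i} N_{t_j}, & t_i \neq t_j,\\ N_{t_i}(N_{t_i}-1)/2, & t_i = t_j, \end{cases} $$

and analogously for $\langle C_{12} \rangle$.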
 +
 +
 +static void set_bham_b_max(FILE *fplog,t_forcerec *fr,
 +                           const gmx_mtop_t *mtop)
 +{
 +    const t_atoms *at1,*at2;
 +    int  mt1,mt2,i,j,tpi,tpj,ntypes;
 +    real b,bmin;
 +    real *nbfp;
 +
 +    if (fplog)
 +    {
 +        fprintf(fplog,"Determining largest Buckingham b parameter for table\n");
 +    }
 +    nbfp   = fr->nbfp;
 +    ntypes = fr->ntype;
 +    
 +    bmin           = -1;
 +    fr->bham_b_max = 0;
 +    for(mt1=0; mt1<mtop->nmoltype; mt1++)
 +    {
 +        at1 = &mtop->moltype[mt1].atoms;
 +        for(i=0; (i<at1->nr); i++)
 +        {
 +            tpi = at1->atom[i].type;
 +            if (tpi >= ntypes)
 +            {
 +                gmx_fatal(FARGS,"Atomtype[%d] = %d, maximum = %d",i,tpi,ntypes);
 +            }
 +            
 +            for(mt2=mt1; mt2<mtop->nmoltype; mt2++)
 +            {
 +                at2 = &mtop->moltype[mt2].atoms;
 +                for(j=0; (j<at2->nr); j++) {
 +                    tpj = at2->atom[j].type;
 +                    if (tpj >= ntypes)
 +                    {
 +                        gmx_fatal(FARGS,"Atomtype[%d] = %d, maximum = %d",j,tpj,ntypes);
 +                    }
 +                    b = BHAMB(nbfp,ntypes,tpi,tpj);
 +                    if (b > fr->bham_b_max)
 +                    {
 +                        fr->bham_b_max = b;
 +                    }
 +                    if ((b < bmin) || (bmin==-1))
 +                    {
 +                        bmin = b;
 +                    }
 +                }
 +            }
 +        }
 +    }
 +    if (fplog)
 +    {
 +        fprintf(fplog,"Buckingham b parameters, min: %g, max: %g\n",
 +                bmin,fr->bham_b_max);
 +    }
 +}
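
For reference, b is the exponent of the standard Buckingham form

$$ V_{\mathrm{BH}}(r) = A\, e^{-b r} - \frac{C}{r^6}, $$

so the largest b over all type pairs bounds how steeply the repulsive exponential varies, which the table-generation code presumably uses to pick an adequate resolution (an inference; the table code itself is not part of this diff).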
 +
 +static void make_nbf_tables(FILE *fp,const output_env_t oenv,
 +                            t_forcerec *fr,real rtab,
 +                            const t_commrec *cr,
 +                            const char *tabfn,char *eg1,char *eg2,
 +                            t_nblists *nbl)
 +{
 +    char buf[STRLEN];
 +    int i,j;
 +
 +    if (tabfn == NULL) {
 +        if (debug)
 +            fprintf(debug,"No table file name passed, can not read table, can not do non-bonded interactions\n");
 +        return;
 +    }
 +
 +    sprintf(buf,"%s",tabfn);
 +    if (eg1 && eg2)
 +    {
 +        /* Append the two energy group names */
 +        sprintf(buf + strlen(tabfn) - strlen(ftp2ext(efXVG)) - 1,"_%s_%s.%s",
 +                eg1,eg2,ftp2ext(efXVG));
 +    }
 +    nbl->table_elec_vdw = make_tables(fp,oenv,fr,MASTER(cr),buf,rtab,0);
 +    /* Copy the contents of the table to separate coulomb and LJ tables too,
 +     * to improve cache performance.
 +     */
 +    /* For performance reasons we want
 +     * the table data to be aligned to 16-byte. The pointers could be freed
 +     * but currently aren't.
 +     */
 +    nbl->table_elec.interaction = GMX_TABLE_INTERACTION_ELEC;
 +    nbl->table_elec.format = nbl->table_elec_vdw.format;
 +    nbl->table_elec.r = nbl->table_elec_vdw.r;
 +    nbl->table_elec.n = nbl->table_elec_vdw.n;
 +    nbl->table_elec.scale = nbl->table_elec_vdw.scale;
 +    nbl->table_elec.scale_exp = nbl->table_elec_vdw.scale_exp;
 +    nbl->table_elec.formatsize = nbl->table_elec_vdw.formatsize;
 +    nbl->table_elec.ninteractions = 1;
 +    nbl->table_elec.stride = nbl->table_elec.formatsize * nbl->table_elec.ninteractions;
 +    snew_aligned(nbl->table_elec.data,nbl->table_elec.stride*(nbl->table_elec.n+1),16);
 +
 +    nbl->table_vdw.interaction = GMX_TABLE_INTERACTION_VDWREP_VDWDISP;
 +    nbl->table_vdw.format = nbl->table_elec_vdw.format;
 +    nbl->table_vdw.r = nbl->table_elec_vdw.r;
 +    nbl->table_vdw.n = nbl->table_elec_vdw.n;
 +    nbl->table_vdw.scale = nbl->table_elec_vdw.scale;
 +    nbl->table_vdw.scale_exp = nbl->table_elec_vdw.scale_exp;
 +    nbl->table_vdw.formatsize = nbl->table_elec_vdw.formatsize;
 +    nbl->table_vdw.ninteractions = 2;
 +    nbl->table_vdw.stride = nbl->table_vdw.formatsize * nbl->table_vdw.ninteractions;
 +    snew_aligned(nbl->table_vdw.data,nbl->table_vdw.stride*(nbl->table_vdw.n+1),16);
 +
 +    for(i=0; i<=nbl->table_elec_vdw.n; i++)
 +    {
 +        for(j=0; j<4; j++)
 +            nbl->table_elec.data[4*i+j] = nbl->table_elec_vdw.data[12*i+j];
 +        for(j=0; j<8; j++)
 +            nbl->table_vdw.data[8*i+j] = nbl->table_elec_vdw.data[12*i+4+j];
 +    }
 +}
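
The copy loop above assumes the combined table stores three interactions of four spline values each per point; a sketch of that layout:

/* Assumed per-point layout of table_elec_vdw.data (stride 12 = 3 x 4):
 *   [12*i + 0 .. 12*i + 3]   Coulomb quadruplet  -> table_elec, stride 4
 *   [12*i + 4 .. 12*i + 11]  two VdW quadruplets -> table_vdw,  stride 8
 * Each quadruplet holds the cubic-spline data for one interaction type.
 */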
 +
 +static void count_tables(int ftype1,int ftype2,const gmx_mtop_t *mtop,
 +                         int *ncount,int **count)
 +{
 +    const gmx_moltype_t *molt;
 +    const t_ilist *il;
 +    int mt,ftype,stride,i,j,tabnr;
 +    
 +    for(mt=0; mt<mtop->nmoltype; mt++)
 +    {
 +        molt = &mtop->moltype[mt];
 +        for(ftype=0; ftype<F_NRE; ftype++)
 +        {
 +            if (ftype == ftype1 || ftype == ftype2) {
 +                il = &molt->ilist[ftype];
 +                stride = 1 + NRAL(ftype);
 +                for(i=0; i<il->nr; i+=stride) {
 +                    tabnr = mtop->ffparams.iparams[il->iatoms[i]].tab.table;
 +                    if (tabnr < 0)
 +                        gmx_fatal(FARGS,"A bonded table number is smaller than 0: %d\n",tabnr);
 +                    if (tabnr >= *ncount) {
 +                        srenew(*count,tabnr+1);
 +                        for(j=*ncount; j<tabnr+1; j++)
 +                            (*count)[j] = 0;
 +                        *ncount = tabnr+1;
 +                    }
 +                    (*count)[tabnr]++;
 +                }
 +            }
 +        }
 +    }
 +}
 +
 +static bondedtable_t *make_bonded_tables(FILE *fplog,
 +                                         int ftype1,int ftype2,
 +                                         const gmx_mtop_t *mtop,
 +                                         const char *basefn,const char *tabext)
 +{
 +    int  i,ncount,*count;
 +    char tabfn[STRLEN];
 +    bondedtable_t *tab;
 +    
 +    tab = NULL;
 +    
 +    ncount = 0;
 +    count = NULL;
 +    count_tables(ftype1,ftype2,mtop,&ncount,&count);
 +    
 +    if (ncount > 0) {
 +        snew(tab,ncount);
 +        for(i=0; i<ncount; i++) {
 +            if (count[i] > 0) {
 +                sprintf(tabfn,"%s",basefn);
 +                sprintf(tabfn + strlen(basefn) - strlen(ftp2ext(efXVG)) - 1,"_%s%d.%s",
 +                        tabext,i,ftp2ext(efXVG));
 +                tab[i] = make_bonded_table(fplog,tabfn,NRAL(ftype1)-2);
 +            }
 +        }
 +        sfree(count);
 +    }
 +  
 +    return tab;
 +}
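
The sprintf pair above splices the table index and extension tag into basefn just before its extension. A hedged usage sketch, assuming fplog and mtop are in scope as at the call sites (hypothetical file names; F_TABDIHS is the real tabulated-dihedral function type):

/* With basefn "table.xvg" and tabext "d", table i is read from
 * "table_d<i>.xvg", e.g. "table_d0.xvg" (hypothetical names).
 */
bondedtable_t *dihtabs = make_bonded_tables(fplog, F_TABDIHS, -1,
                                            mtop, "table.xvg", "d");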
 +
 +void forcerec_set_ranges(t_forcerec *fr,
 +                         int ncg_home,int ncg_force,
 +                         int natoms_force,
 +                         int natoms_force_constr,int natoms_f_novirsum)
 +{
 +    fr->cg0 = 0;
 +    fr->hcg = ncg_home;
 +
 +    /* fr->ncg_force is unused in the standard code,
 +     * but it can be useful for modified code dealing with charge groups.
 +     */
 +    fr->ncg_force           = ncg_force;
 +    fr->natoms_force        = natoms_force;
 +    fr->natoms_force_constr = natoms_force_constr;
 +
 +    if (fr->natoms_force_constr > fr->nalloc_force)
 +    {
 +        fr->nalloc_force = over_alloc_dd(fr->natoms_force_constr);
 +
 +        if (fr->bTwinRange)
 +        {
 +            srenew(fr->f_twin,fr->nalloc_force);
 +        }
 +    }
 +
 +    if (fr->bF_NoVirSum)
 +    {
 +        fr->f_novirsum_n = natoms_f_novirsum;
 +        if (fr->f_novirsum_n > fr->f_novirsum_nalloc)
 +        {
 +            fr->f_novirsum_nalloc = over_alloc_dd(fr->f_novirsum_n);
 +            srenew(fr->f_novirsum_alloc,fr->f_novirsum_nalloc);
 +        }
 +    }
 +    else
 +    {
 +        fr->f_novirsum_n = 0;
 +    }
 +}
 +
 +static real cutoff_inf(real cutoff)
 +{
 +    if (cutoff == 0)
 +    {
 +        cutoff = GMX_CUTOFF_INF;
 +    }
 +
 +    return cutoff;
 +}
 +
 +static void make_adress_tf_tables(FILE *fp,const output_env_t oenv,
 +                                  t_forcerec *fr,const t_inputrec *ir,
 +                                  const char *tabfn,const gmx_mtop_t *mtop,
 +                                  matrix box)
 +{
 +    char buf[STRLEN];
 +    int i,j;
 +
 +    if (tabfn == NULL) {
 +        gmx_fatal(FARGS,"No thermoforce table file given. Use -tabletf to specify a file\n");
 +        return;
 +    }
 +
 +    snew(fr->atf_tabs, ir->adress->n_tf_grps);
 +
 +    for (i=0; i<ir->adress->n_tf_grps; i++) {
 +        j = ir->adress->tf_table_index[i]; /* get energy group index */
 +        /* Build the table file name from the base name and the group name */
 +        sprintf(buf,"%s",tabfn);
 +        sprintf(buf + strlen(tabfn) - strlen(ftp2ext(efXVG)) - 1,"tf_%s.%s",
 +                *(mtop->groups.grpname[mtop->groups.grps[egcENER].nm_ind[j]]),ftp2ext(efXVG));
 +        printf("loading tf table for energygrp index %d from %s\n", j, buf);
 +        fr->atf_tabs[i] = make_atf_table(fp,oenv,fr,buf,box);
 +    }
 +}
 +
 +gmx_bool can_use_allvsall(const t_inputrec *ir, const gmx_mtop_t *mtop,
 +                      gmx_bool bPrintNote,t_commrec *cr,FILE *fp)
 +{
 +    gmx_bool bAllvsAll;
 +
 +    bAllvsAll =
 +        (
 +         ir->rlist==0            &&
 +         ir->rcoulomb==0         &&
 +         ir->rvdw==0             &&
 +         ir->ePBC==epbcNONE      &&
 +         ir->vdwtype==evdwCUT    &&
 +         ir->coulombtype==eelCUT &&
 +         ir->efep==efepNO        &&
 +         (ir->implicit_solvent == eisNO || 
 +          (ir->implicit_solvent==eisGBSA && (ir->gb_algorithm==egbSTILL || 
 +                                             ir->gb_algorithm==egbHCT   || 
 +                                             ir->gb_algorithm==egbOBC))) &&
 +         getenv("GMX_NO_ALLVSALL") == NULL
 +            );
 +    
 +    if (bAllvsAll && ir->opts.ngener > 1)
 +    {
 +        const char *note="NOTE: Can not use all-vs-all force loops, because there are multiple energy monitor groups; you might get significantly higher performance when using only a single energy monitor group.\n";
 +
 +        if (bPrintNote)
 +        {
 +            if (MASTER(cr))
 +            {
 +                fprintf(stderr,"\n%s\n",note);
 +            }
 +            if (fp != NULL)
 +            {
 +                fprintf(fp,"\n%s\n",note);
 +            }
 +        }
 +        bAllvsAll = FALSE;
 +    }
 +
 +    if(bAllvsAll && fp && MASTER(cr))
 +    {
 +        fprintf(fp,"\nUsing accelerated all-vs-all kernels.\n\n");
 +    }
 +    
 +    return bAllvsAll;
 +}
 +
 +
 +static void init_forcerec_f_threads(t_forcerec *fr,int nenergrp)
 +{
 +    int t,i;
 +
 +    /* These thread local data structures are used for bondeds only */
 +    fr->nthreads = gmx_omp_nthreads_get(emntBonded);
 +
 +    if (fr->nthreads > 1)
 +    {
 +        snew(fr->f_t,fr->nthreads);
 +        /* Thread 0 uses the global force and energy arrays */
 +        for(t=1; t<fr->nthreads; t++)
 +        {
 +            fr->f_t[t].f = NULL;
 +            fr->f_t[t].f_nalloc = 0;
 +            snew(fr->f_t[t].fshift,SHIFTS);
 +            fr->f_t[t].grpp.nener = nenergrp*nenergrp;
 +            for(i=0; i<egNR; i++)
 +            {
 +                snew(fr->f_t[t].grpp.ener[i],fr->f_t[t].grpp.nener);
 +            }
 +        }
 +    }
 +}
 +
 +
 +static void pick_nbnxn_kernel_cpu(FILE *fp,
 +                                  const t_commrec *cr,
 +                                  const gmx_cpuid_t cpuid_info,
++                                  const t_inputrec *ir,
 +                                  int *kernel_type,
 +                                  int *ewald_excl)
 +{
- #ifdef GMX_X86_SSE2
++    *kernel_type = nbnxnk4x4_PlainC;
 +    *ewald_excl  = ewaldexclTable;
 +
-         /* On Intel Sandy-Bridge AVX-256 kernels are always faster.
-          * On AMD Bulldozer AVX-256 is much slower than AVX-128.
-          */
-         if(gmx_cpuid_feature(cpuid_info, GMX_CPUID_FEATURE_X86_AVX) == 1 &&
-            gmx_cpuid_vendor(cpuid_info) != GMX_CPUID_VENDOR_AMD)
-         {
- #ifdef GMX_X86_AVX_256
-             *kernel_type = nbk4xN_X86_SIMD256;
- #else
-             *kernel_type = nbk4xN_X86_SIMD128;
++#ifdef GMX_NBNXN_SIMD
 +    {
-         }
-         else
++#ifdef GMX_NBNXN_SIMD_4XN
++        *kernel_type = nbnxnk4xN_SIMD_4xN;
 +#endif
-             *kernel_type = nbk4xN_X86_SIMD128;
++#ifdef GMX_NBNXN_SIMD_2XNN
++        /* We expect the 2xNN kernels to be faster in most cases */
++        *kernel_type = nbnxnk4xN_SIMD_2xNN;
++#endif
++
++#if defined GMX_NBNXN_SIMD_4XN && defined GMX_X86_AVX_256
++        if (EEL_RF(ir->coulombtype) || ir->coulombtype == eelCUT)
 +        {
-         if (getenv("GMX_NBNXN_AVX128") != NULL)
++            /* The raw pair rate of the 4x8 kernel is higher than 2x(4+4),
++             * 10% with HT, 50% without HT, but the extra zero interactions
++             * can compensate. As we currently don't detect the actual use
++             * of HT, switch to 4x8 to avoid a potential performance hit.
++             */
++            *kernel_type = nbnxnk4xN_SIMD_4xN;
 +        }
-             *kernel_type = nbk4xN_X86_SIMD128;
++#endif
++        if (getenv("GMX_NBNXN_SIMD_4XN") != NULL)
 +        {
-         if (getenv("GMX_NBNXN_AVX256") != NULL)
++#ifdef GMX_NBNXN_SIMD_4XN
++            *kernel_type = nbnxnk4xN_SIMD_4xN;
++#else
++            gmx_fatal(FARGS,"SIMD 4xN kernels requested, but Gromacs has been compiled without support for these kernels");
++#endif
 +        }
- #ifdef GMX_X86_AVX_256
-             *kernel_type = nbk4xN_X86_SIMD256;
++        if (getenv("GMX_NBNXN_SIMD_2XNN") != NULL)
 +        {
-             gmx_fatal(FARGS,"You requested AVX-256 nbnxn kernels, but GROMACS was built without AVX support");
++#ifdef GMX_NBNXN_SIMD_2XNN
++            *kernel_type = nbnxnk4xN_SIMD_2xNN;
 +#else
-     *kernel_type = nbkNotSet;
++            gmx_fatal(FARGS,"SIMD 2x(N+N) kernels requested, but Gromacs has been compiled without support for these kernels");
 +#endif
 +        }
 +
 +        /* Analytical Ewald exclusion correction is only an option in the
 +         * x86 SIMD kernel. This is faster in single precision
 +         * on Bulldozer and slightly faster on Sandy Bridge.
 +         */
 +#if (defined GMX_X86_AVX_128_FMA || defined GMX_X86_AVX_256) && !defined GMX_DOUBLE
 +        *ewald_excl = ewaldexclAnalytical;
 +#endif
 +        if (getenv("GMX_NBNXN_EWALD_TABLE") != NULL)
 +        {
 +            *ewald_excl = ewaldexclTable;
 +        }
 +        if (getenv("GMX_NBNXN_EWALD_ANALYTICAL") != NULL)
 +        {
 +            *ewald_excl = ewaldexclAnalytical;
 +        }
 +
 +    }
 +#endif /* GMX_NBNXN_SIMD */
 +}
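
The environment-variable overrides above follow a simple last-wins pattern; a minimal, self-contained sketch (stand-in enum values, not the real nbnxn kernel types):

#include <stdio.h>
#include <stdlib.h>

enum { KERNEL_4XN, KERNEL_2XNN };   /* stand-ins, not the nbnxn enums */

int main(void)
{
    int kernel_type = KERNEL_2XNN;  /* compile-time preference */

    /* User-set environment variables force a specific kernel */
    if (getenv("GMX_NBNXN_SIMD_4XN") != NULL)
    {
        kernel_type = KERNEL_4XN;
    }
    if (getenv("GMX_NBNXN_SIMD_2XNN") != NULL)
    {
        kernel_type = KERNEL_2XNN;
    }
    printf("kernel: %s\n", kernel_type == KERNEL_4XN ? "4xN" : "2x(N+N)");
    return 0;
}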
 +
 +
 +/* Note that _mm_... intrinsics can be converted to either SSE or AVX
 + * depending on compiler flags.
 + * For gcc we check for __AVX__
 + * At least a check for icc should be added (if there is a macro)
 + */
 +static const char *nbk_name[] =
 +  { "not set", "plain C 4x4",
 +#if !(defined GMX_X86_AVX_256 || defined GMX_X86_AVX_128_FMA || defined __AVX__)
 +#ifndef GMX_X86_SSE4_1
 +#ifndef GMX_DOUBLE
 +    "SSE2 4x4",
 +#else
 +    "SSE2 4x2",
 +#endif
 +#else
 +#ifndef GMX_DOUBLE
 +    "SSE4.1 4x4",
 +#else
 +    "SSE4.1 4x2",
 +#endif
 +#endif
 +#else
 +#ifndef GMX_DOUBLE
 +    "AVX-128 4x4",
 +#else
 +    "AVX-128 4x2",
 +#endif
 +#endif
 +#ifndef GMX_DOUBLE
 +    "AVX-256 4x8",
 +#else
 +    "AVX-256 4x4",
 +#endif
 +    "CUDA 8x8x8", "plain C 8x8x8" };
 +
 +static void pick_nbnxn_kernel(FILE *fp,
 +                              const t_commrec *cr,
 +                              const gmx_hw_info_t *hwinfo,
 +                              gmx_bool use_cpu_acceleration,
 +                              gmx_bool *bUseGPU,
++                              const t_inputrec *ir,
 +                              int *kernel_type,
 +                              int *ewald_excl,
 +                              gmx_bool bDoNonbonded)
 +{
 +    gmx_bool bEmulateGPU, bGPU, bEmulateGPUEnvVarSet;
 +    char gpu_err_str[STRLEN];
 +
 +    assert(kernel_type);
 +
-         *kernel_type = nbk8x8x8_PlainC;
++    *kernel_type = nbnxnkNotSet;
 +    *ewald_excl  = ewaldexclTable;
 +
 +    bEmulateGPUEnvVarSet = (getenv("GMX_EMULATE_GPU") != NULL);
 +
 +    /* if bUseGPU == NULL we don't want a GPU (e.g. hybrid mode kernel selection) */
 +    bGPU = ((bUseGPU != NULL) && hwinfo->bCanUseGPU);
 +
 +    /* Run GPU emulation mode if GMX_EMULATE_GPU is defined. We will
 +     * automatically switch to emulation if non-bonded calculations are
 +     * turned off via GMX_NO_NONBONDED - this is the simple and elegant
 +     * way to turn off GPU initialization, data movement, and cleanup. */
 +    bEmulateGPU = (bEmulateGPUEnvVarSet || (!bDoNonbonded && bGPU));
 +
 +    /* Enable GPU mode when GPUs are available or GPU emulation is requested.
 +     * The latter is useful to assess the performance one can expect by adding
 +     * GPU(s) to the machine. The conditional below allows this even if mdrun
 +     * is compiled without GPU acceleration support.
 +     * Note that such a GPU acceleration performance assessment should be
 +     * carried out by setting the GMX_EMULATE_GPU and GMX_NO_NONBONDED env. vars
 +     * (and freezing the system as otherwise it would explode). */
 +    if (bGPU || bEmulateGPUEnvVarSet)
 +    {
 +        if (bEmulateGPU)
 +        {
 +            bGPU = FALSE;
 +        }
 +        else
 +        {
 +            /* Each PP node will use the intra-node id-th device from the
 +             * list of detected/selected GPUs. */
 +            if (!init_gpu(cr->rank_pp_intranode, gpu_err_str, &hwinfo->gpu_info))
 +            {
 +                /* At this point the init should never fail as we made sure that
 +                 * we have all the GPUs we need. If it still does, we'll bail. */
 +                gmx_fatal(FARGS, "On node %d failed to initialize GPU #%d: %s",
 +                          cr->nodeid,
 +                          get_gpu_device_id(&hwinfo->gpu_info, cr->rank_pp_intranode),
 +                          gpu_err_str);
 +            }
 +        }
 +        *bUseGPU = bGPU;
 +    }
 +
 +    if (bEmulateGPU)
 +    {
-         *kernel_type = nbk8x8x8_CUDA;
++        *kernel_type = nbnxnk8x8x8_PlainC;
 +
 +        if (bDoNonbonded)
 +        {
 +            md_print_warn(cr, fp, "Emulating a GPU run on the CPU (slow)");
 +        }
 +    }
 +    else if (bGPU)
 +    {
-     if (*kernel_type == nbkNotSet)
++        *kernel_type = nbnxnk8x8x8_CUDA;
 +    }
 +
-             pick_nbnxn_kernel_cpu(fp,cr,hwinfo->cpuid_info,
++    if (*kernel_type == nbnxnkNotSet)
 +    {
 +        if (use_cpu_acceleration)
 +        {
-             *kernel_type = nbk4x4_PlainC;
++            pick_nbnxn_kernel_cpu(fp,cr,hwinfo->cpuid_info,ir,
 +                                  kernel_type,ewald_excl);
 +        }
 +        else
 +        {
-         if (MASTER(cr))
-         {
-             fprintf(stderr,"Using %s non-bonded kernels\n",
-                     nbk_name[*kernel_type]);
-         }
-         fprintf(fp,"\nUsing %s non-bonded kernels\n\n",
-                 nbk_name[*kernel_type]);
++            *kernel_type = nbnxnk4x4_PlainC;
 +        }
 +    }
 +
 +    if (bDoNonbonded && fp != NULL)
 +    {
-         nbv->grp[i].kernel_type    = nbkNotSet;
++        fprintf(fp,"\nUsing %s %dx%d non-bonded kernels\n\n",
++                nbnxn_kernel_name[*kernel_type],
++                nbnxn_kernel_pairlist_simple(*kernel_type) ? NBNXN_CPU_CLUSTER_I_SIZE : NBNXN_GPU_CLUSTER_SIZE,
++                nbnxn_kernel_to_cj_size(*kernel_type));
 +    }
 +}
 +
 +gmx_bool uses_simple_tables(int cutoff_scheme,
 +                            nonbonded_verlet_t *nbv,
 +                            int group)
 +{
 +    gmx_bool bUsesSimpleTables = TRUE;
 +    int grp_index;
 +
 +    switch(cutoff_scheme)
 +    {
 +    case ecutsGROUP:
 +        bUsesSimpleTables = TRUE;
 +        break;
 +    case ecutsVERLET:
 +        assert(NULL != nbv && NULL != nbv->grp);
 +        grp_index = (group < 0) ? 0 : (nbv->ngrp - 1);
 +        bUsesSimpleTables = nbnxn_kernel_pairlist_simple(nbv->grp[grp_index].kernel_type);
 +        break;
 +    default:
 +        gmx_incons("unimplemented");
 +    }
 +    return bUsesSimpleTables;
 +}
 +
 +static void init_ewald_f_table(interaction_const_t *ic,
 +                               gmx_bool bUsesSimpleTables,
 +                               real rtab)
 +{
 +    real maxr;
 +
 +    if (bUsesSimpleTables)
 +    {
 +        /* With a spacing of 0.0005 nm we are at the force summation accuracy
 +         * for the SSE kernels for "normal" atomistic simulations.
 +         */
 +        ic->tabq_scale = ewald_spline3_table_scale(ic->ewaldcoeff,
 +                                                   ic->rcoulomb);
 +        
 +        maxr = (rtab>ic->rcoulomb) ? rtab : ic->rcoulomb;
 +        ic->tabq_size  = (int)(maxr*ic->tabq_scale) + 2;
 +    }
 +    else
 +    {
 +        ic->tabq_size = GPU_EWALD_COULOMB_FORCE_TABLE_SIZE;
 +        /* Subtract 2 instead of 1 to avoid out-of-range access due to rounding */
 +        ic->tabq_scale = (ic->tabq_size - 2)/ic->rcoulomb;
 +    }
 +
 +    sfree_aligned(ic->tabq_coul_FDV0);
 +    sfree_aligned(ic->tabq_coul_F);
 +    sfree_aligned(ic->tabq_coul_V);
 +
 +    /* Create the original table data in FDV0 */
 +    snew_aligned(ic->tabq_coul_FDV0,ic->tabq_size*4,16);
 +    snew_aligned(ic->tabq_coul_F,ic->tabq_size,16);
 +    snew_aligned(ic->tabq_coul_V,ic->tabq_size,16);
 +    table_spline3_fill_ewald_lr(ic->tabq_coul_F,ic->tabq_coul_V,ic->tabq_coul_FDV0,
 +                                ic->tabq_size,1/ic->tabq_scale,ic->ewaldcoeff);
 +}
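
A worked example of the simple-table sizing, with assumed but typical values: a spacing of 0.0005 nm gives $s = \mathrm{tabq\_scale} = 2000\ \mathrm{nm}^{-1}$; with $r_{tab} = 1.0$ nm and $r_c = 0.9$ nm,

$$ \mathrm{tabq\_size} = \lfloor \max(r_{tab}, r_c)\, s \rfloor + 2 = \lfloor 1.0 \times 2000 \rfloor + 2 = 2002 \text{ entries}. $$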
 +
 +void init_interaction_const_tables(FILE *fp, 
 +                                   interaction_const_t *ic,
 +                                   gmx_bool bUsesSimpleTables,
 +                                   real rtab)
 +{
 +    real spacing;
 +
 +    if (ic->eeltype == eelEWALD || EEL_PME(ic->eeltype))
 +    {
 +        init_ewald_f_table(ic,bUsesSimpleTables,rtab);
 +
 +        if (fp != NULL)
 +        {
 +            fprintf(fp,"Initialized non-bonded Ewald correction tables, spacing: %.2e size: %d\n\n",
 +                    1/ic->tabq_scale,ic->tabq_size);
 +        }
 +    }
 +}
 +
 +void init_interaction_const(FILE *fp, 
 +                            interaction_const_t **interaction_const,
 +                            const t_forcerec *fr,
 +                            real  rtab)
 +{
 +    interaction_const_t *ic;
 +    gmx_bool bUsesSimpleTables = TRUE;
 +
 +    snew(ic, 1);
 +
 +    /* Just allocate something so we can free it */
 +    snew_aligned(ic->tabq_coul_FDV0,16,16);
 +    snew_aligned(ic->tabq_coul_F,16,16);
 +    snew_aligned(ic->tabq_coul_V,16,16);
 +
 +    ic->rlist       = fr->rlist;
 +    ic->rlistlong   = fr->rlistlong;
 +    
 +    /* Lennard-Jones */
 +    ic->rvdw        = fr->rvdw;
 +    if (fr->vdw_modifier==eintmodPOTSHIFT)
 +    {
 +        ic->sh_invrc6 = pow(ic->rvdw,-6.0);
 +    }
 +    else
 +    {
 +        ic->sh_invrc6 = 0;
 +    }
 +
 +    /* Electrostatics */
 +    ic->eeltype     = fr->eeltype;
 +    ic->rcoulomb    = fr->rcoulomb;
 +    ic->epsilon_r   = fr->epsilon_r;
 +    ic->epsfac      = fr->epsfac;
 +
 +    /* Ewald */
 +    ic->ewaldcoeff  = fr->ewaldcoeff;
 +    if (fr->coulomb_modifier==eintmodPOTSHIFT)
 +    {
 +        ic->sh_ewald = gmx_erfc(ic->ewaldcoeff*ic->rcoulomb);
 +    }
 +    else
 +    {
 +        ic->sh_ewald = 0;
 +    }
 +
 +    /* Reaction-field */
 +    if (EEL_RF(ic->eeltype))
 +    {
 +        ic->epsilon_rf = fr->epsilon_rf;
 +        ic->k_rf       = fr->k_rf;
 +        ic->c_rf       = fr->c_rf;
 +    }
 +    else
 +    {
 +        /* For plain cut-off we might use the reaction-field kernels */
 +        ic->epsilon_rf = ic->epsilon_r;
 +        ic->k_rf       = 0;
 +        if (fr->coulomb_modifier==eintmodPOTSHIFT)
 +        {
 +            ic->c_rf   = 1/ic->rcoulomb;
 +        }
 +        else
 +        {
 +            ic->c_rf   = 0;
 +        }
 +    }
 +
 +    if (fp != NULL)
 +    {
 +        fprintf(fp,"Potential shift: LJ r^-12: %.3f r^-6 %.3f",
 +                sqr(ic->sh_invrc6),ic->sh_invrc6);
 +        if (ic->eeltype == eelCUT)
 +        {
 +            fprintf(fp,", Coulomb %.3f",ic->c_rf);
 +        }
 +        else if (EEL_PME(ic->eeltype))
 +        {
 +            fprintf(fp,", Ewald %.3e",ic->sh_ewald);
 +        }
 +        fprintf(fp,"\n");
 +    }
 +
 +    *interaction_const = ic;
 +
 +    if (fr->nbv != NULL && fr->nbv->bUseGPU)
 +    {
 +        nbnxn_cuda_init_const(fr->nbv->cu_nbv, ic, fr->nbv);
 +    }
 +
 +    bUsesSimpleTables = uses_simple_tables(fr->cutoff_scheme, fr->nbv, -1);
 +    init_interaction_const_tables(fp,ic,bUsesSimpleTables,rtab);
 +}
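
For reference, in the plain reaction-field case (zero ionic strength) the constants copied in the EEL_RF branch follow the standard GROMACS expressions computed in calc_rffac,

$$ k_{rf} = \frac{1}{r_c^3}\,\frac{\epsilon_{rf} - \epsilon_r}{2\epsilon_{rf} + \epsilon_r}, \qquad c_{rf} = \frac{1}{r_c} + k_{rf}\, r_c^2, $$

which degenerates to the plain cut-off branch above ($k_{rf} = 0$, $c_{rf} = 1/r_c$) when $\epsilon_{rf} = \epsilon_r$.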
 +
 +static void init_nb_verlet(FILE *fp,
 +                           nonbonded_verlet_t **nb_verlet,
 +                           const t_inputrec *ir,
 +                           const t_forcerec *fr,
 +                           const t_commrec *cr,
 +                           const char *nbpu_opt)
 +{
 +    nonbonded_verlet_t *nbv;
 +    int  i;
 +    char *env;
 +    gmx_bool bHybridGPURun = FALSE;
 +
 +    nbnxn_alloc_t *nb_alloc;
 +    nbnxn_free_t  *nb_free;
 +
 +    snew(nbv, 1);
 +
 +    nbv->nbs = NULL;
 +
 +    nbv->ngrp = (DOMAINDECOMP(cr) ? 2 : 1);
 +    for(i=0; i<nbv->ngrp; i++)
 +    {
 +        nbv->grp[i].nbl_lists.nnbl = 0;
 +        nbv->grp[i].nbat           = NULL;
-         if (nbv->grp[0].kernel_type == nbk8x8x8_CUDA)
++        nbv->grp[i].kernel_type    = nbnxnkNotSet;
 +
 +        if (i == 0) /* local */
 +        {
 +            pick_nbnxn_kernel(fp, cr, fr->hwinfo, fr->use_cpu_acceleration,
 +                              &nbv->bUseGPU,
++                              ir,
 +                              &nbv->grp[i].kernel_type,
 +                              &nbv->grp[i].ewald_excl,
 +                              fr->bNonbonded);
 +        }
 +        else /* non-local */
 +        {
 +            if (nbpu_opt != NULL && strcmp(nbpu_opt,"gpu_cpu") == 0)
 +            {
 +                /* Use GPU for local, select a CPU kernel for non-local */
 +                pick_nbnxn_kernel(fp, cr, fr->hwinfo, fr->use_cpu_acceleration,
 +                                  NULL,
++                                  ir,
 +                                  &nbv->grp[i].kernel_type,
 +                                  &nbv->grp[i].ewald_excl,
 +                                  fr->bNonbonded);
 +
 +                bHybridGPURun = TRUE;
 +            }
 +            else
 +            {
 +                /* Use the same kernel for local and non-local interactions */
 +                nbv->grp[i].kernel_type = nbv->grp[0].kernel_type;
 +                nbv->grp[i].ewald_excl  = nbv->grp[0].ewald_excl;
 +            }
 +        }
 +    }
 +
 +    if (nbv->bUseGPU)
 +    {
 +        /* init the NxN GPU data; the last argument tells whether we'll have
 +         * both local and non-local NB calculation on GPU */
 +        nbnxn_cuda_init(fp, &nbv->cu_nbv,
 +                        &fr->hwinfo->gpu_info, cr->rank_pp_intranode,
 +                        (nbv->ngrp > 1) && !bHybridGPURun);
 +
 +        if ((env = getenv("GMX_NB_MIN_CI")) != NULL)
 +        {
 +            char *end;
 +
 +            nbv->min_ci_balanced = strtol(env, &end, 10);
 +            if (!end || (*end != 0) || nbv->min_ci_balanced <= 0)
 +            {
 +                gmx_fatal(FARGS, "Invalid value passed in GMX_NB_MIN_CI=%s, positive integer required", env);
 +            }
 +
 +            if (debug)
 +            {
 +                fprintf(debug, "Neighbor-list balancing parameter: %d (passed as env. var.)\n", 
 +                        nbv->min_ci_balanced);
 +            }
 +        }
 +        else
 +        {
 +            nbv->min_ci_balanced = nbnxn_cuda_min_ci_balanced(nbv->cu_nbv);
 +            if (debug)
 +            {
 +                fprintf(debug, "Neighbor-list balancing parameter: %d (auto-adjusted to the number of GPU multi-processors)\n",
 +                        nbv->min_ci_balanced);
 +            }
 +        }
 +    }
 +    else
 +    {
 +        nbv->min_ci_balanced = 0;
 +    }
 +
 +    *nb_verlet = nbv;
 +
 +    nbnxn_init_search(&nbv->nbs,
 +                      DOMAINDECOMP(cr) ? & cr->dd->nc : NULL,
 +                      DOMAINDECOMP(cr) ? domdec_zones(cr->dd) : NULL,
 +                      gmx_omp_nthreads_get(emntNonbonded));
 +
 +    for(i=0; i<nbv->ngrp; i++)
 +    {
++        if (nbv->grp[0].kernel_type == nbnxnk8x8x8_CUDA)
 +        {
 +            nb_alloc = &pmalloc;
 +            nb_free  = &pfree;
 +        }
 +        else
 +        {
 +            nb_alloc = NULL;
 +            nb_free  = NULL;
 +        }
 +
 +        nbnxn_init_pairlist_set(&nbv->grp[i].nbl_lists,
 +                                nbnxn_kernel_pairlist_simple(nbv->grp[i].kernel_type),
 +                                /* 8x8x8 "non-simple" lists are ATM always combined */
 +                                !nbnxn_kernel_pairlist_simple(nbv->grp[i].kernel_type),
 +                                nb_alloc, nb_free);
 +
 +        if (i == 0 ||
 +            nbv->grp[0].kernel_type != nbv->grp[i].kernel_type)
 +        {
 +            snew(nbv->grp[i].nbat,1);
 +            nbnxn_atomdata_init(fp,
 +                                nbv->grp[i].nbat,
 +                                nbv->grp[i].kernel_type,
 +                                fr->ntype,fr->nbfp,
 +                                ir->opts.ngener,
 +                                nbnxn_kernel_pairlist_simple(nbv->grp[i].kernel_type) ? gmx_omp_nthreads_get(emntNonbonded) : 1,
 +                                nb_alloc, nb_free);
 +        }
 +        else
 +        {
 +            nbv->grp[i].nbat = nbv->grp[0].nbat;
 +        }
 +    }
 +}
 +
 +void init_forcerec(FILE *fp,
 +                   const output_env_t oenv,
 +                   t_forcerec *fr,
 +                   t_fcdata   *fcd,
 +                   const t_inputrec *ir,
 +                   const gmx_mtop_t *mtop,
 +                   const t_commrec  *cr,
 +                   matrix     box,
 +                   gmx_bool       bMolEpot,
 +                   const char *tabfn,
 +                   const char *tabafn,
 +                   const char *tabpfn,
 +                   const char *tabbfn,
 +                   const char *nbpu_opt,
 +                   gmx_bool   bNoSolvOpt,
 +                   real       print_force)
 +{
 +    int     i,j,m,natoms,ngrp,negp_pp,negptable,egi,egj;
 +    real    rtab;
 +    char    *env;
 +    double  dbl;
 +    rvec    box_size;
 +    const t_block *cgs;
 +    gmx_bool    bGenericKernelOnly;
 +    gmx_bool    bTab,bSep14tab,bNormalnblists;
 +    t_nblists *nbl;
 +    int     *nm_ind,egp_flags;
 +    
 +    /* By default we turn acceleration on, but it might be turned off further down... */
 +    fr->use_cpu_acceleration = TRUE;
 +
 +    fr->bDomDec = DOMAINDECOMP(cr);
 +
 +    natoms = mtop->natoms;
 +
 +    if (check_box(ir->ePBC,box))
 +    {
 +        gmx_fatal(FARGS,check_box(ir->ePBC,box));
 +    }
 +    
 +    /* Test particle insertion ? */
 +    if (EI_TPI(ir->eI)) {
 +        /* Set to the size of the molecule to be inserted (the last one) */
 +        /* Because of old style topologies, we have to use the last cg
 +         * instead of the last molecule type.
 +         */
 +        cgs = &mtop->moltype[mtop->molblock[mtop->nmolblock-1].type].cgs;
 +        fr->n_tpi = cgs->index[cgs->nr] - cgs->index[cgs->nr-1];
 +        if (fr->n_tpi != mtop->mols.index[mtop->mols.nr] - mtop->mols.index[mtop->mols.nr-1]) {
 +            gmx_fatal(FARGS,"The molecule to insert can not consist of multiple charge groups.\nMake it a single charge group.");
 +        }
 +    } else {
 +        fr->n_tpi = 0;
 +    }
 +    
 +    /* Copy AdResS parameters */
 +    if (ir->bAdress) {
 +      fr->adress_type     = ir->adress->type;
 +      fr->adress_const_wf = ir->adress->const_wf;
 +      fr->adress_ex_width = ir->adress->ex_width;
 +      fr->adress_hy_width = ir->adress->hy_width;
 +      fr->adress_icor     = ir->adress->icor;
 +      fr->adress_site     = ir->adress->site;
 +      fr->adress_ex_forcecap = ir->adress->ex_forcecap;
 +      fr->adress_do_hybridpairs = ir->adress->do_hybridpairs;
 +
 +
 +      snew(fr->adress_group_explicit , ir->adress->n_energy_grps);
 +      for (i=0; i< ir->adress->n_energy_grps; i++){
 +          fr->adress_group_explicit[i]= ir->adress->group_explicit[i];
 +      }
 +
 +      fr->n_adress_tf_grps = ir->adress->n_tf_grps;
 +      snew(fr->adress_tf_table_index, fr->n_adress_tf_grps);
 +      for (i=0; i< fr->n_adress_tf_grps; i++){
 +          fr->adress_tf_table_index[i]= ir->adress->tf_table_index[i];
 +      }
 +      copy_rvec(ir->adress->refs,fr->adress_refs);
 +    } else {
 +      fr->adress_type = eAdressOff;
 +      fr->adress_do_hybridpairs = FALSE;
 +    }
 +    
 +    /* Copy the user determined parameters */
 +    fr->userint1 = ir->userint1;
 +    fr->userint2 = ir->userint2;
 +    fr->userint3 = ir->userint3;
 +    fr->userint4 = ir->userint4;
 +    fr->userreal1 = ir->userreal1;
 +    fr->userreal2 = ir->userreal2;
 +    fr->userreal3 = ir->userreal3;
 +    fr->userreal4 = ir->userreal4;
 +    
 +    /* Shell stuff */
 +    fr->fc_stepsize = ir->fc_stepsize;
 +    
 +    /* Free energy */
 +    fr->efep       = ir->efep;
 +    fr->sc_alphavdw = ir->fepvals->sc_alpha;
 +    if (ir->fepvals->bScCoul)
 +    {
 +        fr->sc_alphacoul = ir->fepvals->sc_alpha;
 +        fr->sc_sigma6_min = pow(ir->fepvals->sc_sigma_min,6);
 +    }
 +    else
 +    {
 +        fr->sc_alphacoul = 0;
 +        fr->sc_sigma6_min = 0; /* only needed when bScCoul is on */
 +    }
 +    fr->sc_power   = ir->fepvals->sc_power;
 +    fr->sc_r_power   = ir->fepvals->sc_r_power;
 +    fr->sc_sigma6_def = pow(ir->fepvals->sc_sigma,6);
 +
 +    env = getenv("GMX_SCSIGMA_MIN");
 +    if (env != NULL)
 +    {
 +        dbl = 0;
 +        sscanf(env,"%lf",&dbl);
 +        fr->sc_sigma6_min = pow(dbl,6);
 +        if (fp)
 +        {
 +            fprintf(fp,"Setting the minimum soft core sigma to %g nm\n",dbl);
 +        }
 +    }
 +
 +    fr->bNonbonded = TRUE;
 +    if (getenv("GMX_NO_NONBONDED") != NULL)
 +    {
 +        /* turn off non-bonded calculations */
 +        fr->bNonbonded = FALSE;
 +        md_print_warn(cr,fp,
 +                      "Found environment variable GMX_NO_NONBONDED.\n"
 +                      "Disabling nonbonded calculations.\n");
 +    }
 +
 +    bGenericKernelOnly = FALSE;
 +
 +    /* We now check in the NS code whether a particular combination of interactions
 +     * can be used with water optimization, and disable it if that is not the case.
 +     */
 +
 +    if (getenv("GMX_NB_GENERIC") != NULL)
 +    {
 +        if (fp != NULL)
 +        {
 +            fprintf(fp,
 +                    "Found environment variable GMX_NB_GENERIC.\n"
 +                    "Disabling all interaction-specific nonbonded kernels, will only\n"
 +                    "use the slow generic ones in src/gmxlib/nonbonded/nb_generic.c\n\n");
 +        }
 +        bGenericKernelOnly = TRUE;
 +    }
 +
 +    if (bGenericKernelOnly==TRUE)
 +    {
 +        bNoSolvOpt         = TRUE;
 +    }
 +
 +    if( (getenv("GMX_DISABLE_CPU_ACCELERATION") != NULL) || (getenv("GMX_NOOPTIMIZEDKERNELS") != NULL) )
 +    {
 +        fr->use_cpu_acceleration = FALSE;
 +        if (fp != NULL)
 +        {
 +            fprintf(fp,
 +                    "\nFound environment variable GMX_DISABLE_CPU_ACCELERATION or GMX_NOOPTIMIZEDKERNELS.\n"
 +                    "Disabling all CPU architecture-specific (e.g. SSE2/SSE4/AVX) routines.\n\n");
 +        }
 +    }
 +
 +    fr->bBHAM = (mtop->ffparams.functype[0] == F_BHAM);
 +
 +    /* Check if we can/should do all-vs-all kernels */
 +    fr->bAllvsAll       = can_use_allvsall(ir,mtop,FALSE,NULL,NULL);
 +    fr->AllvsAll_work   = NULL;
 +    fr->AllvsAll_workgb = NULL;
 +
 +
 +    /* Neighbour searching stuff */
 +    fr->cutoff_scheme = ir->cutoff_scheme;
 +    fr->bGrid         = (ir->ns_type == ensGRID);
 +    fr->ePBC          = ir->ePBC;
 +
 +    /* Determine if we will do PBC for distances in bonded interactions */
 +    if (fr->ePBC == epbcNONE)
 +    {
 +        fr->bMolPBC = FALSE;
 +    }
 +    else
 +    {
 +        if (!DOMAINDECOMP(cr))
 +        {
 +            /* The group cut-off scheme and SHAKE assume charge groups
 +             * are whole, but not using molpbc is faster in most cases.
 +             */
 +            if (fr->cutoff_scheme == ecutsGROUP ||
 +                (ir->eConstrAlg == econtSHAKE &&
 +                 (gmx_mtop_ftype_count(mtop,F_CONSTR) > 0 ||
 +                  gmx_mtop_ftype_count(mtop,F_CONSTRNC) > 0)))
 +            {
 +                fr->bMolPBC = ir->bPeriodicMols;
 +            }
 +            else
 +            {
 +                fr->bMolPBC = TRUE;
 +                if (getenv("GMX_USE_GRAPH") != NULL)
 +                {
 +                    fr->bMolPBC = FALSE;
 +                    if (fp)
 +                    {
 +                        fprintf(fp,"\nGMX_USE_GRAPH is set, using the graph for bonded interactions\n\n");
 +                    }
 +                }
 +            }
 +        }
 +        else
 +        {
 +            fr->bMolPBC = dd_bonded_molpbc(cr->dd,fr->ePBC);
 +        }
 +    }
 +
 +    fr->rc_scaling = ir->refcoord_scaling;
 +    copy_rvec(ir->posres_com,fr->posres_com);
 +    copy_rvec(ir->posres_comB,fr->posres_comB);
 +    fr->rlist      = cutoff_inf(ir->rlist);
 +    fr->rlistlong  = cutoff_inf(ir->rlistlong);
 +    fr->eeltype    = ir->coulombtype;
 +    fr->vdwtype    = ir->vdwtype;
 +
 +    fr->coulomb_modifier = ir->coulomb_modifier;
 +    fr->vdw_modifier     = ir->vdw_modifier;
 +
 +    /* Electrostatics: Translate from interaction-setting-in-mdp-file to kernel interaction format */
 +    switch(fr->eeltype)
 +    {
 +        case eelCUT:
 +            fr->nbkernel_elec_interaction = GMX_NBKERNEL_ELEC_COULOMB;
 +            break;
 +
 +        case eelRF:
 +        case eelGRF:
 +        case eelRF_NEC:
 +            fr->nbkernel_elec_interaction = GMX_NBKERNEL_ELEC_REACTIONFIELD;
 +            break;
 +
 +        case eelRF_ZERO:
 +            fr->nbkernel_elec_interaction = GMX_NBKERNEL_ELEC_REACTIONFIELD;
 +            fr->coulomb_modifier          = eintmodEXACTCUTOFF;
 +            break;
 +
 +        case eelSWITCH:
 +        case eelSHIFT:
 +        case eelUSER:
 +        case eelENCADSHIFT:
 +        case eelPMESWITCH:
 +        case eelPMEUSER:
 +        case eelPMEUSERSWITCH:
 +            fr->nbkernel_elec_interaction = GMX_NBKERNEL_ELEC_CUBICSPLINETABLE;
 +            break;
 +
 +        case eelPME:
 +        case eelEWALD:
 +            fr->nbkernel_elec_interaction = GMX_NBKERNEL_ELEC_EWALD;
 +            break;
 +
 +        default:
 +            gmx_fatal(FARGS,"Unsupported electrostatic interaction: %s",eel_names[fr->eeltype]);
 +            break;
 +    }
 +
 +    /* Vdw: Translate from mdp settings to kernel format */
 +    switch(fr->vdwtype)
 +    {
 +        case evdwCUT:
 +            if(fr->bBHAM)
 +            {
 +                fr->nbkernel_vdw_interaction = GMX_NBKERNEL_VDW_BUCKINGHAM;
 +            }
 +            else
 +            {
 +                fr->nbkernel_vdw_interaction = GMX_NBKERNEL_VDW_LENNARDJONES;
 +            }
 +            break;
 +
 +        case evdwSWITCH:
 +        case evdwSHIFT:
 +        case evdwUSER:
 +        case evdwENCADSHIFT:
 +            fr->nbkernel_vdw_interaction = GMX_NBKERNEL_VDW_CUBICSPLINETABLE;
 +            break;
 +
 +        default:
 +            gmx_fatal(FARGS,"Unsupported vdw interaction: %s",evdw_names[fr->vdwtype]);
 +            break;
 +    }
 +
 +    /* These start out identical to ir, but might be altered if we e.g. tabulate the interaction in the kernel */
 +    fr->nbkernel_elec_modifier    = fr->coulomb_modifier;
 +    fr->nbkernel_vdw_modifier     = fr->vdw_modifier;
 +
 +    fr->bTwinRange = fr->rlistlong > fr->rlist;
 +    fr->bEwald     = (EEL_PME(fr->eeltype) || fr->eeltype==eelEWALD);
 +    
 +    fr->reppow     = mtop->ffparams.reppow;
 +
 +    if (ir->cutoff_scheme == ecutsGROUP)
 +    {
 +        fr->bvdwtab    = (fr->vdwtype != evdwCUT ||
 +                          !gmx_within_tol(fr->reppow,12.0,10*GMX_DOUBLE_EPS));
 +        /* We have special kernels for standard Ewald and PME, but the pme-switch ones are tabulated above */
 +        fr->bcoultab   = !(fr->eeltype == eelCUT ||
 +                           fr->eeltype == eelEWALD ||
 +                           fr->eeltype == eelPME ||
 +                           fr->eeltype == eelRF ||
 +                           fr->eeltype == eelRF_ZERO);
 +
 +        /* If the user absolutely wants different switch/shift settings for coul/vdw, it is likely
 +         * going to be faster to tabulate the interaction than calling the generic kernel.
 +         */
 +        if(fr->nbkernel_elec_modifier==eintmodPOTSWITCH && fr->nbkernel_vdw_modifier==eintmodPOTSWITCH)
 +        {
 +            if((fr->rcoulomb_switch != fr->rvdw_switch) || (fr->rcoulomb != fr->rvdw))
 +            {
 +                fr->bcoultab = TRUE;
 +            }
 +        }
 +        else if((fr->nbkernel_elec_modifier==eintmodPOTSHIFT && fr->nbkernel_vdw_modifier==eintmodPOTSHIFT) ||
 +                ((fr->nbkernel_elec_interaction == GMX_NBKERNEL_ELEC_REACTIONFIELD &&
 +                  fr->nbkernel_elec_modifier==eintmodEXACTCUTOFF &&
 +                  (fr->nbkernel_vdw_modifier==eintmodPOTSWITCH || fr->nbkernel_vdw_modifier==eintmodPOTSHIFT))))
 +        {
 +            if(fr->rcoulomb != fr->rvdw)
 +            {
 +                fr->bcoultab = TRUE;
 +            }
 +        }
 +
 +        if (getenv("GMX_REQUIRE_TABLES"))
 +        {
 +            fr->bvdwtab  = TRUE;
 +            fr->bcoultab = TRUE;
 +        }
 +
 +        if (fp)
 +        {
 +            fprintf(fp,"Table routines are used for coulomb: %s\n",bool_names[fr->bcoultab]);
 +            fprintf(fp,"Table routines are used for vdw:     %s\n",bool_names[fr->bvdwtab ]);
 +        }
 +
 +        if(fr->bvdwtab==TRUE)
 +        {
 +            fr->nbkernel_vdw_interaction = GMX_NBKERNEL_VDW_CUBICSPLINETABLE;
 +            fr->nbkernel_vdw_modifier    = eintmodNONE;
 +        }
 +        if(fr->bcoultab==TRUE)
 +        {
 +            fr->nbkernel_elec_interaction = GMX_NBKERNEL_ELEC_CUBICSPLINETABLE;
 +            fr->nbkernel_elec_modifier    = eintmodNONE;
 +        }
 +    }
 +
 +    if (ir->cutoff_scheme == ecutsVERLET)
 +    {
 +        if (!gmx_within_tol(fr->reppow,12.0,10*GMX_DOUBLE_EPS))
 +        {
 +            gmx_fatal(FARGS,"Cut-off scheme %s only supports LJ repulsion power 12",ecutscheme_names[ir->cutoff_scheme]);
 +        }
 +        fr->bvdwtab  = FALSE;
 +        fr->bcoultab = FALSE;
 +    }
 +    
 +    /* Tables are used for direct ewald sum */
 +    if(fr->bEwald)
 +    {
 +        if (EEL_PME(ir->coulombtype))
 +        {
 +            if (fp)
 +                fprintf(fp,"Will do PME sum in reciprocal space.\n");
 +            if (ir->coulombtype == eelP3M_AD)
 +            {
 +                please_cite(fp,"Hockney1988");
 +                please_cite(fp,"Ballenegger2012");
 +            }
 +            else
 +            {
 +                please_cite(fp,"Essmann95a");
 +            }
 +            
 +            if (ir->ewald_geometry == eewg3DC)
 +            {
 +                if (fp)
 +                {
 +                    fprintf(fp,"Using the Ewald3DC correction for systems with a slab geometry.\n");
 +                }
 +                please_cite(fp,"In-Chul99a");
 +            }
 +        }
 +        fr->ewaldcoeff=calc_ewaldcoeff(ir->rcoulomb, ir->ewald_rtol);
 +        init_ewald_tab(&(fr->ewald_table), cr, ir, fp);
 +        if (fp)
 +        {
 +            fprintf(fp,"Using a Gaussian width (1/beta) of %g nm for Ewald\n",
 +                    1/fr->ewaldcoeff);
 +        }
 +    }
 +    
 +    /* Electrostatics */
 +    fr->epsilon_r  = ir->epsilon_r;
 +    fr->epsilon_rf = ir->epsilon_rf;
 +    fr->fudgeQQ    = mtop->ffparams.fudgeQQ;
 +    fr->rcoulomb_switch = ir->rcoulomb_switch;
 +    fr->rcoulomb        = cutoff_inf(ir->rcoulomb);
 +    
 +    /* Parameters for generalized RF */
 +    fr->zsquare = 0.0;
 +    fr->temp    = 0.0;
 +    
 +    if (fr->eeltype == eelGRF)
 +    {
 +        init_generalized_rf(fp,mtop,ir,fr);
 +    }
 +    else if (fr->eeltype == eelSHIFT)
 +    {
 +        for(m=0; (m<DIM); m++)
 +            box_size[m]=box[m][m];
 +        
 +        if (fr->rcoulomb > fr->rcoulomb_switch)
 +            set_shift_consts(fp,fr->rcoulomb_switch,fr->rcoulomb,box_size,fr);
 +    }
 +    
 +    fr->bF_NoVirSum = (EEL_FULL(fr->eeltype) ||
 +                       gmx_mtop_ftype_count(mtop,F_POSRES) > 0 ||
 +                       gmx_mtop_ftype_count(mtop,F_FBPOSRES) > 0 ||
 +                       IR_ELEC_FIELD(*ir) ||
 +                       (fr->adress_icor != eAdressICOff)
 +                      );
 +    
 +    if (fr->cutoff_scheme == ecutsGROUP &&
 +        ncg_mtop(mtop) > fr->cg_nalloc && !DOMAINDECOMP(cr)) {
 +        /* Count the total number of charge groups */
 +        fr->cg_nalloc = ncg_mtop(mtop);
 +        srenew(fr->cg_cm,fr->cg_nalloc);
 +    }
 +    if (fr->shift_vec == NULL)
 +        snew(fr->shift_vec,SHIFTS);
 +    
 +    if (fr->fshift == NULL)
 +        snew(fr->fshift,SHIFTS);
 +    
 +    if (fr->nbfp == NULL) {
 +        fr->ntype = mtop->ffparams.atnr;
 +        fr->nbfp  = mk_nbfp(&mtop->ffparams,fr->bBHAM);
 +    }
 +    
 +    /* Copy the energy group exclusions */
 +    fr->egp_flags = ir->opts.egp_flags;
 +    
 +    /* Van der Waals stuff */
 +    fr->rvdw        = cutoff_inf(ir->rvdw);
 +    fr->rvdw_switch = ir->rvdw_switch;
 +    if ((fr->vdwtype != evdwCUT) && (fr->vdwtype != evdwUSER) && !fr->bBHAM) {
 +        if (fr->rvdw_switch >= fr->rvdw)
 +            gmx_fatal(FARGS,"rvdw_switch (%f) must be < rvdw (%f)",
 +                      fr->rvdw_switch,fr->rvdw);
 +        if (fp)
 +            fprintf(fp,"Using %s Lennard-Jones, switch between %g and %g nm\n",
 +                    (fr->vdwtype==evdwSWITCH) ? "switched":"shifted",
 +                    fr->rvdw_switch,fr->rvdw);
 +    } 
 +    
 +    if (fr->bBHAM && (fr->vdwtype == evdwSHIFT || fr->vdwtype == evdwSWITCH))
 +        gmx_fatal(FARGS,"Switch/shift interaction not supported with Buckingham");
 +    
 +    if (fp)
 +        fprintf(fp,"Cut-off's:   NS: %g   Coulomb: %g   %s: %g\n",
 +                fr->rlist,fr->rcoulomb,fr->bBHAM ? "BHAM":"LJ",fr->rvdw);
 +    
 +    fr->eDispCorr = ir->eDispCorr;
 +    if (ir->eDispCorr != edispcNO)
 +    {
 +        set_avcsixtwelve(fp,fr,mtop);
 +    }
 +    
 +    if (fr->bBHAM)
 +    {
 +        set_bham_b_max(fp,fr,mtop);
 +    }
 +
 +    fr->bGB = (ir->implicit_solvent == eisGBSA);
 +    fr->gb_epsilon_solvent = ir->gb_epsilon_solvent;
 +
 +    /* Copy the GBSA data (radius, volume and surftens for each
 +     * atomtype) from the topology atomtype section to forcerec.
 +     */
 +    snew(fr->atype_radius,fr->ntype);
 +    snew(fr->atype_vol,fr->ntype);
 +    snew(fr->atype_surftens,fr->ntype);
 +    snew(fr->atype_gb_radius,fr->ntype);
 +    snew(fr->atype_S_hct,fr->ntype);
 +
 +    if (mtop->atomtypes.nr > 0)
 +    {
 +        for(i=0;i<fr->ntype;i++)
 +            fr->atype_radius[i] =mtop->atomtypes.radius[i];
 +        for(i=0;i<fr->ntype;i++)
 +            fr->atype_vol[i] = mtop->atomtypes.vol[i];
 +        for(i=0;i<fr->ntype;i++)
 +            fr->atype_surftens[i] = mtop->atomtypes.surftens[i];
 +        for(i=0;i<fr->ntype;i++)
 +            fr->atype_gb_radius[i] = mtop->atomtypes.gb_radius[i];
 +        for(i=0;i<fr->ntype;i++)
 +            fr->atype_S_hct[i] = mtop->atomtypes.S_hct[i];
 +    }  
 +
 +    /* Generate the GB table if needed */
 +    if (fr->bGB)
 +    {
 +#ifdef GMX_DOUBLE
 +        fr->gbtabscale = 2000;
 +#else
 +        fr->gbtabscale = 500;
 +#endif
 +
 +        fr->gbtabr = 100;
 +        fr->gbtab  = make_gb_table(fp,oenv,fr,tabpfn,fr->gbtabscale);
 +
 +        init_gb(&fr->born,cr,fr,ir,mtop,ir->rgbradii,ir->gb_algorithm);
 +
 +        /* Copy local gb data (for dd, this is done in dd_partition_system) */
 +        if (!DOMAINDECOMP(cr))
 +        {
 +            make_local_gb(cr,fr->born,ir->gb_algorithm);
 +        }
 +    }
 +
 +    /* Set the charge scaling */
 +    if (fr->epsilon_r != 0)
 +        fr->epsfac = ONE_4PI_EPS0/fr->epsilon_r;
 +    else
 +        /* eps = 0 is infinite dielectric: no coulomb interactions */
 +        fr->epsfac = 0;
 +    
 +    /* Reaction field constants */
 +    if (EEL_RF(fr->eeltype))
 +        calc_rffac(fp,fr->eeltype,fr->epsilon_r,fr->epsilon_rf,
 +                   fr->rcoulomb,fr->temp,fr->zsquare,box,
 +                   &fr->kappa,&fr->k_rf,&fr->c_rf);
 +    
 +    set_chargesum(fp,fr,mtop);
 +    
 +    /* if we are using LR electrostatics, and they are tabulated,
 +     * the tables will contain modified coulomb interactions.
 +     * Since we want to use the non-shifted ones for 1-4
 +     * coulombic interactions, we must have an extra set of tables.
 +     */
 +    
 +    /* Construct tables.
 +     * Sometimes it is unnecessary to make both vdw and coul tables,
 +     * but the overhead is small and it keeps the code simple. */
 +    
 +    bTab = fr->bcoultab || fr->bvdwtab || fr->bEwald;
 +
 +    bSep14tab = ((!bTab || fr->eeltype!=eelCUT || fr->vdwtype!=evdwCUT ||
 +                  fr->bBHAM || fr->bEwald) &&
 +                 (gmx_mtop_ftype_count(mtop,F_LJ14) > 0 ||
 +                  gmx_mtop_ftype_count(mtop,F_LJC14_Q) > 0 ||
 +                  gmx_mtop_ftype_count(mtop,F_LJC_PAIRS_NB) > 0));
 +
 +    negp_pp = ir->opts.ngener - ir->nwall;
 +    negptable = 0;
 +    if (!bTab) {
 +        bNormalnblists = TRUE;
 +        fr->nnblists = 1;
 +    } else {
 +        bNormalnblists = (ir->eDispCorr != edispcNO);
 +        for(egi=0; egi<negp_pp; egi++) {
 +            for(egj=egi;  egj<negp_pp; egj++) {
 +                egp_flags = ir->opts.egp_flags[GID(egi,egj,ir->opts.ngener)];
 +                if (!(egp_flags & EGP_EXCL)) {
 +                    if (egp_flags & EGP_TABLE) {
 +                        negptable++;
 +                    } else {
 +                        bNormalnblists = TRUE;
 +                    }
 +                }
 +            }
 +        }
 +        if (bNormalnblists) {
 +            fr->nnblists = negptable + 1;
 +        } else {
 +            fr->nnblists = negptable;
 +        }
 +        if (fr->nnblists > 1)
 +            snew(fr->gid2nblists,ir->opts.ngener*ir->opts.ngener);
 +    }
 +
 +    if (ir->adress){
 +        fr->nnblists*=2;
 +    }
 +
 +    snew(fr->nblists,fr->nnblists);
 +    
 +    /* This code automatically gives table length tabext without cut-offs;
 +     * in that case grompp should already have checked that we do not need
 +     * normal tables and we only generate tables for 1-4 interactions.
 +     */
 +    rtab = ir->rlistlong + ir->tabext;
 +
 +    if (bTab) {
 +        /* make tables for ordinary interactions */
 +        if (bNormalnblists) {
 +            make_nbf_tables(fp,oenv,fr,rtab,cr,tabfn,NULL,NULL,&fr->nblists[0]);
 +            if (ir->adress){
 +                make_nbf_tables(fp,oenv,fr,rtab,cr,tabfn,NULL,NULL,&fr->nblists[fr->nnblists/2]);
 +            }
 +            if (!bSep14tab)
 +                fr->tab14 = fr->nblists[0].table_elec_vdw;
 +            m = 1;
 +        } else {
 +            m = 0;
 +        }
 +        if (negptable > 0) {
 +            /* Read the special tables for certain energy group pairs */
 +            nm_ind = mtop->groups.grps[egcENER].nm_ind;
 +            for(egi=0; egi<negp_pp; egi++) {
 +                for(egj=egi;  egj<negp_pp; egj++) {
 +                    egp_flags = ir->opts.egp_flags[GID(egi,egj,ir->opts.ngener)];
 +                    if ((egp_flags & EGP_TABLE) && !(egp_flags & EGP_EXCL)) {
 +                        nbl = &(fr->nblists[m]);
 +                        if (fr->nnblists > 1) {
 +                            fr->gid2nblists[GID(egi,egj,ir->opts.ngener)] = m;
 +                        }
 +                        /* Read the table file with the two energy groups names appended */
 +                        make_nbf_tables(fp,oenv,fr,rtab,cr,tabfn,
 +                                        *mtop->groups.grpname[nm_ind[egi]],
 +                                        *mtop->groups.grpname[nm_ind[egj]],
 +                                        &fr->nblists[m]);
 +                        if (ir->adress){
 +                             make_nbf_tables(fp,oenv,fr,rtab,cr,tabfn,
 +                                        *mtop->groups.grpname[nm_ind[egi]],
 +                                        *mtop->groups.grpname[nm_ind[egj]],
 +                                        &fr->nblists[fr->nnblists/2+m]);
 +                        }
 +                        m++;
 +                    } else if (fr->nnblists > 1) {
 +                        fr->gid2nblists[GID(egi,egj,ir->opts.ngener)] = 0;
 +                    }
 +                }
 +            }
 +        }
 +    }
 +    if (bSep14tab)
 +    {
 +        /* generate extra tables with plain Coulomb for 1-4 interactions only */
 +        fr->tab14 = make_tables(fp,oenv,fr,MASTER(cr),tabpfn,rtab,
 +                                GMX_MAKETABLES_14ONLY);
 +    }
 +
 +    /* Read AdResS Thermo Force table if needed */
 +    if(fr->adress_icor == eAdressICThermoForce)
 +    {
 +        /* old todo replace */ 
 +        
 +        if (ir->adress->n_tf_grps > 0){
 +            make_adress_tf_tables(fp,oenv,fr,ir,tabfn, mtop, box);
 +
 +        }else{
 +            /* load the default table */
 +            snew(fr->atf_tabs, 1);
 +            fr->atf_tabs[DEFAULT_TF_TABLE] = make_atf_table(fp,oenv,fr,tabafn, box);
 +        }
 +    }
 +    
 +    /* Wall stuff */
 +    fr->nwall = ir->nwall;
 +    if (ir->nwall && ir->wall_type==ewtTABLE)
 +    {
 +        make_wall_tables(fp,oenv,ir,tabfn,&mtop->groups,fr);
 +    }
 +    
 +    if (fcd && tabbfn) {
 +        fcd->bondtab  = make_bonded_tables(fp,
 +                                           F_TABBONDS,F_TABBONDSNC,
 +                                           mtop,tabbfn,"b");
 +        fcd->angletab = make_bonded_tables(fp,
 +                                           F_TABANGLES,-1,
 +                                           mtop,tabbfn,"a");
 +        fcd->dihtab   = make_bonded_tables(fp,
 +                                           F_TABDIHS,-1,
 +                                           mtop,tabbfn,"d");
 +    } else {
 +        if (debug)
 +            fprintf(debug,"No fcdata or table file name passed, cannot read tables, cannot do tabulated bonded interactions\n");
 +    }
 +    
 +    /* QM/MM initialization if requested
 +     */
 +    if (ir->bQMMM)
 +    {
 +        fprintf(stderr,"QM/MM calculation requested.\n");
 +    }
 +    
 +    fr->bQMMM      = ir->bQMMM;   
 +    fr->qr         = mk_QMMMrec();
 +    
 +    /* Set all the static charge group info */
 +    fr->cginfo_mb = init_cginfo_mb(fp,mtop,fr,bNoSolvOpt,
 +                                   &fr->bExcl_IntraCGAll_InterCGNone);
 +    if (DOMAINDECOMP(cr)) {
 +        fr->cginfo = NULL;
 +    } else {
 +        fr->cginfo = cginfo_expand(mtop->nmolblock,fr->cginfo_mb);
 +    }
 +    
 +    if (!DOMAINDECOMP(cr))
 +    {
 +        /* When using particle decomposition, the effect of the second argument,
 +         * which sets fr->hcg, is corrected later in do_md and init_em.
 +         */
 +        forcerec_set_ranges(fr,ncg_mtop(mtop),ncg_mtop(mtop),
 +                            mtop->natoms,mtop->natoms,mtop->natoms);
 +    }
 +    
 +    fr->print_force = print_force;
 +
 +
 +    /* coarse load balancing vars */
 +    fr->t_fnbf=0.;
 +    fr->t_wait=0.;
 +    fr->timesteps=0;
 +    
 +    /* Initialize neighbor search */
 +    init_ns(fp,cr,&fr->ns,fr,mtop,box);
 +
 +    if (cr->duty & DUTY_PP)
 +    {
 +        gmx_nonbonded_setup(fp,fr,bGenericKernelOnly);
 +    /*
 +     if (ir->bAdress)
 +        {
 +            gmx_setup_adress_kernels(fp,bGenericKernelOnly);
 +        }
 +     */
 +    }
 +
 +    /* Initialize the thread working data for bonded interactions */
 +    init_forcerec_f_threads(fr,mtop->groups.grps[egcENER].nr);
 +    
 +    snew(fr->excl_load,fr->nthreads+1);
 +
 +    if (fr->cutoff_scheme == ecutsVERLET)
 +    {
 +        if (ir->rcoulomb != ir->rvdw)
 +        {
 +            gmx_fatal(FARGS,"With Verlet lists rcoulomb and rvdw should be identical");
 +        }
 +
 +        init_nb_verlet(fp, &fr->nbv, ir, fr, cr, nbpu_opt);
 +    }
 +
 +    /* fr->ic is used both by verlet and group kernels (to some extent) now */
 +    init_interaction_const(fp, &fr->ic, fr, rtab);
 +    if (ir->eDispCorr != edispcNO)
 +    {
 +        calc_enervirdiff(fp,ir->eDispCorr,fr);
 +    }
 +}
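
The gid2nblists table above is indexed with GID(egi,egj,ngener). The GID macro is defined elsewhere in the tree; the sketch below shows the symmetric pair index it is assumed to compute, so that (egi,egj) and (egj,egi) share one slot of the ngener*ngener table.

    #include <stdio.h>

    /* A sketch of the symmetric energy-group pair index assumed to be
     * computed by the GID macro used above; the real definition lives
     * in a GROMACS header. */
    static int gid(int igid, int jgid, int gnr)
    {
        return (igid < jgid) ? igid*gnr + jgid : jgid*gnr + igid;
    }

    int main(void)
    {
        int ngener = 3;

        /* Both orders of the pair (1,2) hit the same table slot */
        printf("gid(1,2) = %d, gid(2,1) = %d\n",
               gid(1, 2, ngener), gid(2, 1, ngener));
        return 0;
    }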
 +
 +#define pr_real(fp,r) fprintf(fp,"%s: %e\n",#r,r)
 +#define pr_int(fp,i)  fprintf((fp),"%s: %d\n",#i,i)
 +#define pr_bool(fp,b) fprintf((fp),"%s: %s\n",#b,bool_names[b])
 +
 +void pr_forcerec(FILE *fp,t_forcerec *fr,t_commrec *cr)
 +{
 +  int i;
 +
 +  pr_real(fp,fr->rlist);
 +  pr_real(fp,fr->rcoulomb);
 +  pr_real(fp,fr->fudgeQQ);
 +  pr_bool(fp,fr->bGrid);
 +  pr_bool(fp,fr->bTwinRange);
 +  /*pr_int(fp,fr->cg0);
 +    pr_int(fp,fr->hcg);*/
 +  for(i=0; i<fr->nnblists; i++)
 +    pr_int(fp,fr->nblists[i].table_elec_vdw.n);
 +  pr_real(fp,fr->rcoulomb_switch);
 +  pr_real(fp,fr->rcoulomb);
 +  
 +  fflush(fp);
 +}
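
The pr_* macros above rely on the preprocessor's stringizing operator: #r turns the argument expression into a string literal of its own spelling, so one macro prints both the name and the value. A minimal standalone illustration (the rlist variable here is hypothetical):

    #include <stdio.h>

    #define pr_real(fp,r) fprintf(fp,"%s: %e\n",#r,r)

    int main(void)
    {
        double rlist = 1.0;

        /* Expands to: fprintf(stdout, "%s: %e\n", "rlist", rlist); */
        pr_real(stdout, rlist);
        return 0;
    }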
 +
 +void forcerec_set_excl_load(t_forcerec *fr,
 +                            const gmx_localtop_t *top,const t_commrec *cr)
 +{
 +    const int *ind,*a;
 +    int t,i,j,ntot,n,ntarget;
 +
 +    if (cr != NULL && PARTDECOMP(cr))
 +    {
 +        /* No OpenMP with particle decomposition */
 +        pd_at_range(cr,
 +                    &fr->excl_load[0],
 +                    &fr->excl_load[1]);
 +
 +        return;
 +    }
 +
 +    ind = top->excls.index;
 +    a   = top->excls.a;
 +
 +    ntot = 0;
 +    for(i=0; i<top->excls.nr; i++)
 +    {
 +        for(j=ind[i]; j<ind[i+1]; j++)
 +        {
 +            if (a[j] > i)
 +            {
 +                ntot++;
 +            }
 +        }
 +    }
 +
 +    fr->excl_load[0] = 0;
 +    n = 0;
 +    i = 0;
 +    for(t=1; t<=fr->nthreads; t++)
 +    {
 +        ntarget = (ntot*t)/fr->nthreads;
 +        while(i < top->excls.nr && n < ntarget)
 +        {
 +            for(j=ind[i]; j<ind[i+1]; j++)
 +            {
 +                if (a[j] > i)
 +                {
 +                    n++;
 +                }
 +            }
 +            i++;
 +        }
 +        fr->excl_load[t] = i;
 +    }
 +}
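
forcerec_set_excl_load above does a greedy prefix scan: it counts the j > i exclusion entries, then advances the atom index until each thread's cumulative count reaches t/nthreads of the total, recording the boundary in excl_load[t]. A standalone run on a hypothetical four-atom exclusion list with two threads (a sketch, not the GROMACS data structures themselves):

    #include <stdio.h>

    int main(void)
    {
        /* Hypothetical exclusion topology in the same index/a layout as
         * top->excls: entries ind[i]..ind[i+1]-1 list atom i's exclusions. */
        int ind[5] = { 0, 2, 3, 4, 4 };
        int a[4]   = { 1, 2, 2, 3 };
        int nr = 4, nthreads = 2;
        int excl_load[3];
        int ntot, n, ntarget, i, j, t;

        /* Count only j > i pairs, exactly as above */
        ntot = 0;
        for (i = 0; i < nr; i++)
        {
            for (j = ind[i]; j < ind[i+1]; j++)
            {
                if (a[j] > i)
                {
                    ntot++;
                }
            }
        }

        excl_load[0] = 0;
        n = 0;
        i = 0;
        for (t = 1; t <= nthreads; t++)
        {
            ntarget = (ntot*t)/nthreads;
            while (i < nr && n < ntarget)
            {
                for (j = ind[i]; j < ind[i+1]; j++)
                {
                    if (a[j] > i)
                    {
                        n++;
                    }
                }
                i++;
            }
            /* Thread t-1 owns atoms excl_load[t-1] .. excl_load[t]-1;
             * trailing atoms with no j > i entries may stay outside the
             * ranges, which is harmless, as in the code above. */
            excl_load[t] = i;
        }

        for (t = 0; t <= nthreads; t++)
        {
            printf("excl_load[%d] = %d\n", t, excl_load[t]);
        }
        return 0;
    }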
 +
index f7375ce63cbce73e565b4c5e025b2fc5b2e9f27c,0000000000000000000000000000000000000000..f6268d41a9c93c5d53cd4364e141095a2f745548
mode 100644,000000..100644
--- /dev/null
@@@ -1,1302 -1,0 +1,1310 @@@
-     if (nb_kernel_type == nbk4xN_X86_SIMD128 ||
-         nb_kernel_type == nbk4xN_X86_SIMD256)
 +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
 + *
 + *
 + *                This source code is part of
 + *
 + *                 G   R   O   M   A   C   S
 + *
 + *          GROningen MAchine for Chemical Simulations
 + *
 + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
 + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 + * Copyright (c) 2001-2012, The GROMACS development team,
 + * check out http://www.gromacs.org for more information.
 +
 + * This program is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU General Public License
 + * as published by the Free Software Foundation; either version 2
 + * of the License, or (at your option) any later version.
 + *
 + * If you want to redistribute modifications, please consider that
 + * scientific software is very special. Version control is crucial -
 + * bugs must be traceable. We will be happy to consider code for
 + * inclusion in the official distribution, but derived work must not
 + * be called official GROMACS. Details are found in the README & COPYING
 + * files - if they are missing, get the official version at www.gromacs.org.
 + *
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the papers on the package - you can find them in the top README file.
 + *
 + * For more info, check our website at http://www.gromacs.org
 + */
 +
 +#ifdef HAVE_CONFIG_H
 +#include <config.h>
 +#endif
 +
 +#include <math.h>
 +#include <string.h>
 +#include "smalloc.h"
 +#include "macros.h"
 +#include "vec.h"
 +#include "nbnxn_consts.h"
 +#include "nbnxn_internal.h"
 +#include "nbnxn_search.h"
 +#include "nbnxn_atomdata.h"
 +#include "gmx_omp_nthreads.h"
 +
 +/* Default nbnxn allocation routine, allocates 32-byte-aligned memory,
 + * which works for plain C and aligned SSE and AVX loads/stores.
 + */
 +void nbnxn_alloc_aligned(void **ptr,size_t nbytes)
 +{
 +    *ptr = save_malloc_aligned("ptr",__FILE__,__LINE__,nbytes,1,32);
 +}
 +
 +/* Free function for memory allocated with nbnxn_alloc_aligned */
 +void nbnxn_free_aligned(void *ptr)
 +{
 +    sfree_aligned(ptr);
 +}
 +
 +/* Reallocation wrapper function for nbnxn data structures */
 +void nbnxn_realloc_void(void **ptr,
 +                        int nbytes_copy,int nbytes_new,
 +                        nbnxn_alloc_t *ma,
 +                        nbnxn_free_t  *mf)
 +{
 +    void *ptr_new;
 +
 +    ma(&ptr_new,nbytes_new);
 +
 +    if (nbytes_new > 0 && ptr_new == NULL)
 +    {
 +        gmx_fatal(FARGS, "Allocation of %d bytes failed", nbytes_new);
 +    }
 +
 +    if (nbytes_copy > 0)
 +    {
 +        if (nbytes_new < nbytes_copy)
 +        {
 +            gmx_incons("In nbnxn_realloc_void: new size less than copy size");
 +        }
 +        memcpy(ptr_new,*ptr,nbytes_copy);
 +    }
 +    if (*ptr != NULL)
 +    {
 +        mf(*ptr);
 +    }
 +    *ptr = ptr_new;
 +}
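
Because there is no portable aligned realloc, nbnxn_realloc_void grows a buffer by allocate-copy-free. A minimal usage sketch with plain malloc/free standing in for the aligned allocator pair; my_alloc, my_free, and the sizes are hypothetical, and the fatal-error checks are omitted for brevity.

    #include <stdlib.h>
    #include <string.h>
    #include <stdio.h>

    typedef void my_alloc_t(void **ptr, size_t nbytes);
    typedef void my_free_t(void *ptr);

    static void my_alloc(void **ptr, size_t nbytes) { *ptr = malloc(nbytes); }
    static void my_free(void *ptr) { free(ptr); }

    /* Same allocate-copy-free pattern as nbnxn_realloc_void above */
    static void realloc_void(void **ptr, size_t nbytes_copy, size_t nbytes_new,
                             my_alloc_t *ma, my_free_t *mf)
    {
        void *ptr_new;

        ma(&ptr_new, nbytes_new);
        if (nbytes_copy > 0)
        {
            memcpy(ptr_new, *ptr, nbytes_copy);
        }
        if (*ptr != NULL)
        {
            mf(*ptr);
        }
        *ptr = ptr_new;
    }

    int main(void)
    {
        float *x = NULL;
        int    n = 0;

        /* Grow from 0 to 8 and then to 64 elements, keeping old contents */
        realloc_void((void **)&x, n*sizeof(*x),  8*sizeof(*x), my_alloc, my_free);
        n = 8;
        x[0] = 1.5f;
        realloc_void((void **)&x, n*sizeof(*x), 64*sizeof(*x), my_alloc, my_free);
        printf("x[0] = %g\n", x[0]);
        free(x);
        return 0;
    }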
 +
 +/* Reallocate the nbnxn_atomdata_t for a size of n atoms */
 +void nbnxn_atomdata_realloc(nbnxn_atomdata_t *nbat,int n)
 +{
 +    int t;
 +
 +    nbnxn_realloc_void((void **)&nbat->type,
 +                       nbat->natoms*sizeof(*nbat->type),
 +                       n*sizeof(*nbat->type),
 +                       nbat->alloc,nbat->free);
 +    nbnxn_realloc_void((void **)&nbat->lj_comb,
 +                       nbat->natoms*2*sizeof(*nbat->lj_comb),
 +                       n*2*sizeof(*nbat->lj_comb),
 +                       nbat->alloc,nbat->free);
 +    if (nbat->XFormat != nbatXYZQ)
 +    {
 +        nbnxn_realloc_void((void **)&nbat->q,
 +                           nbat->natoms*sizeof(*nbat->q),
 +                           n*sizeof(*nbat->q),
 +                           nbat->alloc,nbat->free);
 +    }
 +    if (nbat->nenergrp > 1)
 +    {
 +        nbnxn_realloc_void((void **)&nbat->energrp,
 +                           nbat->natoms/nbat->na_c*sizeof(*nbat->energrp),
 +                           n/nbat->na_c*sizeof(*nbat->energrp),
 +                           nbat->alloc,nbat->free);
 +    }
 +    nbnxn_realloc_void((void **)&nbat->x,
 +                       nbat->natoms*nbat->xstride*sizeof(*nbat->x),
 +                       n*nbat->xstride*sizeof(*nbat->x),
 +                       nbat->alloc,nbat->free);
 +    for(t=0; t<nbat->nout; t++)
 +    {
 +        /* Allocate one element extra for possible signaling with CUDA */
 +        nbnxn_realloc_void((void **)&nbat->out[t].f,
 +                           nbat->natoms*nbat->fstride*sizeof(*nbat->out[t].f),
 +                           n*nbat->fstride*sizeof(*nbat->out[t].f),
 +                           nbat->alloc,nbat->free);
 +    }
 +    nbat->nalloc = n;
 +}
 +
 +/* Initializes an nbnxn_atomdata_output_t data structure */
 +static void nbnxn_atomdata_output_init(nbnxn_atomdata_output_t *out,
 +                                       int nb_kernel_type,
 +                                       int nenergrp,int stride,
 +                                       nbnxn_alloc_t *ma)
 +{
 +    int cj_size;
 +
 +    out->f = NULL;
 +    ma((void **)&out->fshift,SHIFTS*DIM*sizeof(*out->fshift));
 +    out->nV = nenergrp*nenergrp;
 +    ma((void **)&out->Vvdw,out->nV*sizeof(*out->Vvdw));
 +    ma((void **)&out->Vc  ,out->nV*sizeof(*out->Vc  ));
 +
-         case nbk4xN_X86_SIMD128:
-             nbat->XFormat = nbatX4;
-             break;
-         case nbk4xN_X86_SIMD256:
- #ifndef GMX_DOUBLE
-             nbat->XFormat = nbatX8;
- #else
-             nbat->XFormat = nbatX4;
- #endif
++    if (nb_kernel_type == nbnxnk4xN_SIMD_4xN ||
++        nb_kernel_type == nbnxnk4xN_SIMD_2xNN)
 +    {
 +        cj_size = nbnxn_kernel_to_cj_size(nb_kernel_type);
 +        out->nVS = nenergrp*nenergrp*stride*(cj_size>>1)*cj_size;
 +        ma((void **)&out->VSvdw,out->nVS*sizeof(*out->VSvdw));
 +        ma((void **)&out->VSc  ,out->nVS*sizeof(*out->VSc  ));
 +    }
 +    else
 +    {
 +        out->nVS = 0;
 +    }
 +}
 +
 +static void copy_int_to_nbat_int(const int *a,int na,int na_round,
 +                                 const int *in,int fill,int *innb)
 +{
 +    int i,j;
 +
 +    j = 0;
 +    for(i=0; i<na; i++)
 +    {
 +        innb[j++] = in[a[i]];
 +    }
 +    /* Complete the partially filled last cell with fill */
 +    for(; i<na_round; i++)
 +    {
 +        innb[j++] = fill;
 +    }
 +}
 +
 +static void clear_nbat_real(int na,int nbatFormat,real *xnb,int a0)
 +{
 +    int a,d,j,c;
 +
 +    switch (nbatFormat)
 +    {
 +    case nbatXYZ:
 +        for(a=0; a<na; a++)
 +        {
 +            for(d=0; d<DIM; d++)
 +            {
 +                xnb[(a0+a)*STRIDE_XYZ+d] = 0;
 +            }
 +        }
 +        break;
 +    case nbatXYZQ:
 +        for(a=0; a<na; a++)
 +        {
 +            for(d=0; d<DIM; d++)
 +            {
 +                xnb[(a0+a)*STRIDE_XYZQ+d] = 0;
 +            }
 +        }
 +        break;
 +    case nbatX4:
 +        j = X4_IND_A(a0);
 +        c = a0 & (PACK_X4-1);
 +        for(a=0; a<na; a++)
 +        {
 +            xnb[j+XX*PACK_X4] = 0;
 +            xnb[j+YY*PACK_X4] = 0;
 +            xnb[j+ZZ*PACK_X4] = 0;
 +            j++;
 +            c++;
 +            if (c == PACK_X4)
 +            {
 +                j += (DIM-1)*PACK_X4;
 +                c  = 0;
 +            }
 +        }
 +        break;
 +    case nbatX8:
 +        j = X8_IND_A(a0);
 +        c = a0 & (PACK_X8-1);
 +        for(a=0; a<na; a++)
 +        {
 +            xnb[j+XX*PACK_X8] = 0;
 +            xnb[j+YY*PACK_X8] = 0;
 +            xnb[j+ZZ*PACK_X8] = 0;
 +            j++;
 +            c++;
 +            if (c == PACK_X8)
 +            {
 +                j += (DIM-1)*PACK_X8;
 +                c  = 0;
 +            }
 +        }
 +        break;
 +    }
 +}
 +
 +void copy_rvec_to_nbat_real(const int *a,int na,int na_round,
 +                            rvec *x,int nbatFormat,real *xnb,int a0,
 +                            int cx,int cy,int cz)
 +{
 +    int i,j,c;
 +
 +/* We might need to place filler particles to fill up the cell to na_round.
 + * The coefficients (LJ and q) for such particles are zero.
 + * But we might still get NaN from 0*NaN when distances are too small.
 + * We hope that -107 nm is far enough away from zero
 + * to avoid accidental short distances to particles shifted down for pbc.
 + */
 +#define NBAT_FAR_AWAY 107
 +
 +    switch (nbatFormat)
 +    {
 +    case nbatXYZ:
 +        j = a0*STRIDE_XYZ;
 +        for(i=0; i<na; i++)
 +        {
 +            xnb[j++] = x[a[i]][XX];
 +            xnb[j++] = x[a[i]][YY];
 +            xnb[j++] = x[a[i]][ZZ];
 +        }
 +        /* Complete the partially filled last cell with particles far apart.
 +         * This simplifies the bounding box calculation and avoids
 +         * numerical issues with atoms that are coincidentally close.
 +         */
 +        for(; i<na_round; i++)
 +        {
 +            xnb[j++] = -NBAT_FAR_AWAY*(1 + cx);
 +            xnb[j++] = -NBAT_FAR_AWAY*(1 + cy);
 +            xnb[j++] = -NBAT_FAR_AWAY*(1 + cz + i);
 +        }
 +        break;
 +    case nbatXYZQ:
 +        j = a0*STRIDE_XYZQ;
 +        for(i=0; i<na; i++)
 +        {
 +            xnb[j++] = x[a[i]][XX];
 +            xnb[j++] = x[a[i]][YY];
 +            xnb[j++] = x[a[i]][ZZ];
 +            j++;
 +        }
 +        /* Complete the partially filled last cell with particles far apart */
 +        for(; i<na_round; i++)
 +        {
 +            xnb[j++] = -NBAT_FAR_AWAY*(1 + cx);
 +            xnb[j++] = -NBAT_FAR_AWAY*(1 + cy);
 +            xnb[j++] = -NBAT_FAR_AWAY*(1 + cz + i);
 +            j++;
 +        }
 +        break;
 +    case nbatX4:
 +        j = X4_IND_A(a0);
 +        c = a0 & (PACK_X4-1);
 +        for(i=0; i<na; i++)
 +        {
 +            xnb[j+XX*PACK_X4] = x[a[i]][XX];
 +            xnb[j+YY*PACK_X4] = x[a[i]][YY];
 +            xnb[j+ZZ*PACK_X4] = x[a[i]][ZZ];
 +            j++;
 +            c++;
 +            if (c == PACK_X4)
 +            {
 +                j += (DIM-1)*PACK_X4;
 +                c  = 0;
 +            }
 +        }
 +        /* Complete the partially filled last cell with particles far apart */
 +        for(; i<na_round; i++)
 +        {
 +            xnb[j+XX*PACK_X4] = -NBAT_FAR_AWAY*(1 + cx);
 +            xnb[j+YY*PACK_X4] = -NBAT_FAR_AWAY*(1 + cy);
 +            xnb[j+ZZ*PACK_X4] = -NBAT_FAR_AWAY*(1 + cz + i);
 +            j++;
 +            c++;
 +            if (c == PACK_X4)
 +            {
 +                j += (DIM-1)*PACK_X4;
 +                c  = 0;
 +            }
 +        }
 +        break;
 +    case nbatX8:
 +        j = X8_IND_A(a0);
 +        c = a0 & (PACK_X8 - 1);
 +        for(i=0; i<na; i++)
 +        {
 +            xnb[j+XX*PACK_X8] = x[a[i]][XX];
 +            xnb[j+YY*PACK_X8] = x[a[i]][YY];
 +            xnb[j+ZZ*PACK_X8] = x[a[i]][ZZ];
 +            j++;
 +            c++;
 +            if (c == PACK_X8)
 +            {
 +                j += (DIM-1)*PACK_X8;
 +                c  = 0;
 +            }
 +        }
 +        /* Complete the partially filled last cell with particles far apart */
 +        for(; i<na_round; i++)
 +        {
 +            xnb[j+XX*PACK_X8] = -NBAT_FAR_AWAY*(1 + cx);
 +            xnb[j+YY*PACK_X8] = -NBAT_FAR_AWAY*(1 + cy);
 +            xnb[j+ZZ*PACK_X8] = -NBAT_FAR_AWAY*(1 + cz + i);
 +            j++;
 +            c++;
 +            if (c == PACK_X8)
 +            {
 +                j += (DIM-1)*PACK_X8;
 +                c  = 0;
 +            }
 +        }
 +        break;
 +    default:
 +        gmx_incons("Unsupported stride");
 +    }
 +}
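
The filler coordinates written above depend on the cell indices (cx, cy, cz) and on the filler's position i within the cell, so fillers stay far from the origin and also from fillers in other cells. A tiny sketch printing the resulting positions for two hypothetical x-columns, following the -NBAT_FAR_AWAY*(1+c) pattern above:

    #include <stdio.h>

    #define NBAT_FAR_AWAY 107

    int main(void)
    {
        /* Filler positions (in nm) for two hypothetical cells, two fillers each */
        int cx, i;

        for (cx = 0; cx < 2; cx++)
        {
            for (i = 0; i < 2; i++)
            {
                printf("cell cx=%d filler %d: x=%d y=%d z=%d\n",
                       cx, i,
                       -NBAT_FAR_AWAY*(1 + cx),     /* distinct per x-column */
                       -NBAT_FAR_AWAY*(1 + 0),
                       -NBAT_FAR_AWAY*(1 + 0 + i)); /* distinct per filler  */
            }
        }
        return 0;
    }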
 +
 +/* Determines the combination rule (or none) to be used, stores it,
 + * and sets the LJ parameters required with the rule.
 + */
 +static void set_combination_rule_data(nbnxn_atomdata_t *nbat)
 +{
 +    int  nt,i,j;
 +    real c6,c12;
 +
 +    nt = nbat->ntype;
 +
 +    switch (nbat->comb_rule)
 +    {
 +    case  ljcrGEOM:
 +        nbat->comb_rule = ljcrGEOM;
 +
 +        for(i=0; i<nt; i++)
 +        {
 +            /* Copy the diagonal from the nbfp matrix */
 +            nbat->nbfp_comb[i*2  ] = sqrt(nbat->nbfp[(i*nt+i)*2  ]);
 +            nbat->nbfp_comb[i*2+1] = sqrt(nbat->nbfp[(i*nt+i)*2+1]);
 +        }
 +        break;
 +    case ljcrLB:
 +        for(i=0; i<nt; i++)
 +        {
 +            /* Get 6*C6 and 12*C12 from the diagonal of the nbfp matrix */
 +            c6  = nbat->nbfp[(i*nt+i)*2  ];
 +            c12 = nbat->nbfp[(i*nt+i)*2+1];
 +            if (c6 > 0 && c12 > 0)
 +            {
 +                /* We store 0.5*2^(1/6)*sigma and sqrt(4*3*eps),
 +                 * so we get 6*C6 and 12*C12 after combining.
 +                 */
 +                nbat->nbfp_comb[i*2  ] = 0.5*pow(c12/c6,1.0/6.0);
 +                nbat->nbfp_comb[i*2+1] = sqrt(c6*c6/c12);
 +            }
 +            else
 +            {
 +                nbat->nbfp_comb[i*2  ] = 0;
 +                nbat->nbfp_comb[i*2+1] = 0;
 +            }
 +        }
 +        break;
 +    case ljcrNONE:
 +        /* In nbfp_s4 we use a stride of 4 for storing two parameters */
 +        nbat->alloc((void **)&nbat->nbfp_s4,nt*nt*4*sizeof(*nbat->nbfp_s4));
 +        for(i=0; i<nt; i++)
 +        {
 +            for(j=0; j<nt; j++)
 +            {
 +                nbat->nbfp_s4[(i*nt+j)*4+0] = nbat->nbfp[(i*nt+j)*2+0];
 +                nbat->nbfp_s4[(i*nt+j)*4+1] = nbat->nbfp[(i*nt+j)*2+1];
 +                nbat->nbfp_s4[(i*nt+j)*4+2] = 0;
 +                nbat->nbfp_s4[(i*nt+j)*4+3] = 0;
 +            }
 +        }
 +        break;
 +    default:
 +        gmx_incons("Unknown combination rule");
 +        break;
 +    }
 +}
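
The transformation stored above can be verified numerically: with s = 0.5*2^(1/6)*sigma and e = sqrt(12*eps) per atom type, the kernel-side combinations (s_i+s_j)^6*e_i*e_j and (s_i+s_j)^12*e_i*e_j reproduce 6*C6 and 12*C12 of the Lorentz-Berthelot pair. A small check with hypothetical sigma/epsilon values:

    #include <stdio.h>
    #include <math.h>

    int main(void)
    {
        /* Hypothetical LJ parameters for two atom types */
        double sig[2] = { 0.30, 0.35 }, eps[2] = { 0.5, 0.8 };
        double s[2], e[2], sig_ij, eps_ij, c6_ij, c12_ij;
        int    i;

        for (i = 0; i < 2; i++)
        {
            s[i] = 0.5*pow(2.0, 1.0/6.0)*sig[i]; /* 0.5*2^(1/6)*sigma */
            e[i] = sqrt(12.0*eps[i]);            /* sqrt(4*3*eps)     */
        }
        /* Lorentz-Berthelot reference values */
        sig_ij = 0.5*(sig[0] + sig[1]);
        eps_ij = sqrt(eps[0]*eps[1]);
        c6_ij  = 6.0 *4.0*eps_ij*pow(sig_ij,  6);
        c12_ij = 12.0*4.0*eps_ij*pow(sig_ij, 12);

        /* The cheap kernel-side combination using the stored values */
        printf("6*C6:   %g vs %g\n", c6_ij,  pow(s[0]+s[1],  6)*e[0]*e[1]);
        printf("12*C12: %g vs %g\n", c12_ij, pow(s[0]+s[1], 12)*e[0]*e[1]);
        return 0;
    }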
 +
 +/* Initializes an nbnxn_atomdata_t data structure */
 +void nbnxn_atomdata_init(FILE *fp,
 +                         nbnxn_atomdata_t *nbat,
 +                         int nb_kernel_type,
 +                         int ntype,const real *nbfp,
 +                         int n_energygroups,
 +                         int nout,
 +                         nbnxn_alloc_t *alloc,
 +                         nbnxn_free_t  *free)
 +{
 +    int  i,j;
 +    real c6,c12,tol;
 +    char *ptr;
 +    gmx_bool simple,bCombGeom,bCombLB;
 +
 +    if (alloc == NULL)
 +    {
 +        nbat->alloc = nbnxn_alloc_aligned;
 +    }
 +    else
 +    {
 +        nbat->alloc = alloc;
 +    }
 +    if (free == NULL)
 +    {
 +        nbat->free = nbnxn_free_aligned;
 +    }
 +    else
 +    {
 +        nbat->free = free;
 +    }
 +
 +    if (debug)
 +    {
 +        fprintf(debug,"There are %d atom types in the system, adding one for nbnxn_atomdata_t\n",ntype);
 +    }
 +    nbat->ntype = ntype + 1;
 +    nbat->alloc((void **)&nbat->nbfp,
 +                nbat->ntype*nbat->ntype*2*sizeof(*nbat->nbfp));
 +    nbat->alloc((void **)&nbat->nbfp_comb,nbat->ntype*2*sizeof(*nbat->nbfp_comb));
 +
 +    /* A tolerance of 1e-5 seems reasonable for (possibly hand-typed)
 +     * force-field floating point parameters.
 +     */
 +    tol = 1e-5;
 +    ptr = getenv("GMX_LJCOMB_TOL");
 +    if (ptr != NULL)
 +    {
 +        double dbl;
 +
 +        sscanf(ptr,"%lf",&dbl);
 +        tol = dbl;
 +    }
 +    bCombGeom = TRUE;
 +    bCombLB   = TRUE;
 +
 +    /* Temporarily fill nbat->nbfp_comb with sigma and epsilon
 +     * to check for the LB rule.
 +     */
 +    for(i=0; i<ntype; i++)
 +    {
 +        c6  = nbfp[(i*ntype+i)*2  ]/6.0;
 +        c12 = nbfp[(i*ntype+i)*2+1]/12.0;
 +        if (c6 > 0 && c12 > 0)
 +        {
 +            nbat->nbfp_comb[i*2  ] = pow(c12/c6,1.0/6.0);
 +            nbat->nbfp_comb[i*2+1] = 0.25*c6*c6/c12;
 +        }
 +        else if (c6 == 0 && c12 == 0)
 +        {
 +            nbat->nbfp_comb[i*2  ] = 0;
 +            nbat->nbfp_comb[i*2+1] = 0;
 +        }
 +        else
 +        {
 +            /* Can not use LB rule with only dispersion or repulsion */
 +            bCombLB = FALSE;
 +        }
 +    }
 +
 +    for(i=0; i<nbat->ntype; i++)
 +    {
 +        for(j=0; j<nbat->ntype; j++)
 +        {
 +            if (i < ntype && j < ntype)
 +            {
 +                /* fr->nbfp has been updated, so that array too now stores c6/c12 including
 +                 * the 6.0/12.0 prefactors to save 2 flops in the most common case (force-only).
 +                 */
 +                c6  = nbfp[(i*ntype+j)*2  ];
 +                c12 = nbfp[(i*ntype+j)*2+1];
 +                nbat->nbfp[(i*nbat->ntype+j)*2  ] = c6;
 +                nbat->nbfp[(i*nbat->ntype+j)*2+1] = c12;
 +
 +                /* Compare 6*C6 and 12*C12 for geometric combination rule */
 +                bCombGeom = bCombGeom &&
 +                    gmx_within_tol(c6*c6  ,nbfp[(i*ntype+i)*2  ]*nbfp[(j*ntype+j)*2  ],tol) &&
 +                    gmx_within_tol(c12*c12,nbfp[(i*ntype+i)*2+1]*nbfp[(j*ntype+j)*2+1],tol);
 +
 +                /* Compare C6 and C12 for Lorentz-Berthelot combination rule */
 +                c6  /= 6.0;
 +                c12 /= 12.0;
 +                bCombLB = bCombLB &&
 +                    ((c6 == 0 && c12 == 0 &&
 +                      (nbat->nbfp_comb[i*2+1] == 0 || nbat->nbfp_comb[j*2+1] == 0)) ||
 +                     (c6 > 0 && c12 > 0 &&
 +                      gmx_within_tol(pow(c12/c6,1.0/6.0),0.5*(nbat->nbfp_comb[i*2]+nbat->nbfp_comb[j*2]),tol) &&
 +                      gmx_within_tol(0.25*c6*c6/c12,sqrt(nbat->nbfp_comb[i*2+1]*nbat->nbfp_comb[j*2+1]),tol)));
 +            }
 +            else
 +            {
 +                /* Add zero parameters for the additional dummy atom type */
 +                nbat->nbfp[(i*nbat->ntype+j)*2  ] = 0;
 +                nbat->nbfp[(i*nbat->ntype+j)*2+1] = 0;
 +            }
 +        }
 +    }
 +    if (debug)
 +    {
 +        fprintf(debug,"Combination rules: geometric %d Lorentz-Berthelot %d\n",
 +                bCombGeom,bCombLB);
 +    }
 +
 +    simple = nbnxn_kernel_pairlist_simple(nb_kernel_type);
 +
 +    if (simple)
 +    {
 +        /* We prefer the geometric combination rule,
 +         * as that gives a slightly faster kernel than the LB rule.
 +         */
 +        if (bCombGeom)
 +        {
 +            nbat->comb_rule = ljcrGEOM;
 +        }
 +        else if (bCombLB)
 +        {
 +            nbat->comb_rule = ljcrLB;
 +        }
 +        else
 +        {
 +            nbat->comb_rule = ljcrNONE;
 +
 +            nbat->free(nbat->nbfp_comb);
 +        }
 +
 +        if (fp)
 +        {
 +            if (nbat->comb_rule == ljcrNONE)
 +            {
 +                fprintf(fp,"Using full Lennard-Jones parameter combination matrix\n\n");
 +            }
 +            else
 +            {
 +                fprintf(fp,"Using %s Lennard-Jones combination rule\n\n",
 +                        nbat->comb_rule==ljcrGEOM ? "geometric" : "Lorentz-Berthelot");
 +            }
 +        }
 +
 +        set_combination_rule_data(nbat);
 +    }
 +    else
 +    {
 +        nbat->comb_rule = ljcrNONE;
 +
 +        nbat->free(nbat->nbfp_comb);
 +    }
 +
 +    nbat->natoms  = 0;
 +    nbat->type    = NULL;
 +    nbat->lj_comb = NULL;
 +    if (simple)
 +    {
++        int pack_x;
++
 +        switch (nb_kernel_type)
 +        {
- #include "gmx_x86_simd_macros.h"
++        case nbnxnk4xN_SIMD_4xN:
++        case nbnxnk4xN_SIMD_2xNN:
++            pack_x = max(NBNXN_CPU_CLUSTER_I_SIZE,
++                         nbnxn_kernel_to_cj_size(nb_kernel_type));
++            switch (pack_x)
++            {
++                case 4:
++                    nbat->XFormat = nbatX4;
++                    break;
++                case 8:
++                    nbat->XFormat = nbatX8;
++                    break;
++                default:
++                    gmx_incons("Unsupported packing width");
++            }
 +            break;
 +        default:
 +            nbat->XFormat = nbatXYZ;
 +            break;
 +        }
 +
 +        nbat->FFormat = nbat->XFormat;
 +    }
 +    else
 +    {
 +        nbat->XFormat = nbatXYZQ;
 +        nbat->FFormat = nbatXYZ;
 +    }
 +    nbat->q       = NULL;
 +    nbat->nenergrp = n_energygroups;
 +    if (!simple)
 +    {
 +        /* Energy groups not supported yet for super-sub lists */
 +        if (n_energygroups > 1 && fp != NULL)
 +        {
 +            fprintf(fp,"\nNOTE: With GPUs, reporting energy group contributions is not supported\n\n");
 +        }
 +        nbat->nenergrp = 1;
 +    }
 +    /* Temporary storage goes as #grp^3*simd_width^2/2, so limit to 64 */
 +    if (nbat->nenergrp > 64)
 +    {
 +        gmx_fatal(FARGS,"With NxN kernels, no more than 64 energy groups are supported\n");
 +    }
 +    nbat->neg_2log = 1;
 +    while (nbat->nenergrp > (1<<nbat->neg_2log))
 +    {
 +        nbat->neg_2log++;
 +    }
 +    nbat->energrp = NULL;
 +    nbat->alloc((void **)&nbat->shift_vec,SHIFTS*sizeof(*nbat->shift_vec));
 +    nbat->xstride = (nbat->XFormat == nbatXYZQ ? STRIDE_XYZQ : DIM);
 +    nbat->fstride = (nbat->FFormat == nbatXYZQ ? STRIDE_XYZQ : DIM);
 +    nbat->x       = NULL;
 +    nbat->nout    = nout;
 +    snew(nbat->out,nbat->nout);
 +    nbat->nalloc  = 0;
 +    for(i=0; i<nbat->nout; i++)
 +    {
 +        nbnxn_atomdata_output_init(&nbat->out[i],
 +                                   nb_kernel_type,
 +                                   nbat->nenergrp,1<<nbat->neg_2log,
 +                                   nbat->alloc);
 +    }
 +    nbat->buffer_flags.flag        = NULL;
 +    nbat->buffer_flags.flag_nalloc = 0;
 +}
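
The neg_2log loop above computes the number of bits needed per energy-group index: the smallest neg_2log (at least 1) with 2^neg_2log >= nenergrp. A quick standalone check of that loop over hypothetical group counts:

    #include <stdio.h>

    int main(void)
    {
        int nenergrp, neg_2log;

        for (nenergrp = 1; nenergrp <= 9; nenergrp++)
        {
            /* Same loop as in nbnxn_atomdata_init above */
            neg_2log = 1;
            while (nenergrp > (1 << neg_2log))
            {
                neg_2log++;
            }
            printf("%d groups -> %d bits per index\n", nenergrp, neg_2log);
        }
        return 0;
    }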
 +
 +static void copy_lj_to_nbat_lj_comb_x4(const real *ljparam_type,
 +                                       const int *type,int na,
 +                                       real *ljparam_at)
 +{
 +    int is,k,i;
 +
 +    /* The LJ params follow the combination rule:
 +     * copy the params from the type array to the atom array.
 +     */
 +    for(is=0; is<na; is+=PACK_X4)
 +    {
 +        for(k=0; k<PACK_X4; k++)
 +        {
 +            i = is + k;
 +            ljparam_at[is*2        +k] = ljparam_type[type[i]*2  ];
 +            ljparam_at[is*2+PACK_X4+k] = ljparam_type[type[i]*2+1];
 +        }
 +    }
 +}
 +
 +static void copy_lj_to_nbat_lj_comb_x8(const real *ljparam_type,
 +                                       const int *type,int na,
 +                                       real *ljparam_at)
 +{
 +    int is,k,i;
 +
 +    /* The LJ params follow the combination rule:
 +     * copy the params from the type array to the atom array.
 +     */
 +    for(is=0; is<na; is+=PACK_X8)
 +    {
 +        for(k=0; k<PACK_X8; k++)
 +        {
 +            i = is + k;
 +            ljparam_at[is*2        +k] = ljparam_type[type[i]*2  ];
 +            ljparam_at[is*2+PACK_X8+k] = ljparam_type[type[i]*2+1];
 +        }
 +    }
 +}
 +
 +/* Sets the atom type and LJ data in nbnxn_atomdata_t */
 +static void nbnxn_atomdata_set_atomtypes(nbnxn_atomdata_t *nbat,
 +                                         int ngrid,
 +                                         const nbnxn_search_t nbs,
 +                                         const int *type)
 +{
 +    int g,i,ncz,ash;
 +    const nbnxn_grid_t *grid;
 +
 +    for(g=0; g<ngrid; g++)
 +    {
 +        grid = &nbs->grid[g];
 +
 +        /* Loop over all columns and copy and fill */
 +        for(i=0; i<grid->ncx*grid->ncy; i++)
 +        {
 +            ncz = grid->cxy_ind[i+1] - grid->cxy_ind[i];
 +            ash = (grid->cell0 + grid->cxy_ind[i])*grid->na_sc;
 +
 +            copy_int_to_nbat_int(nbs->a+ash,grid->cxy_na[i],ncz*grid->na_sc,
 +                                 type,nbat->ntype-1,nbat->type+ash);
 +
 +            if (nbat->comb_rule != ljcrNONE)
 +            {
 +                if (nbat->XFormat == nbatX4)
 +                {
 +                    copy_lj_to_nbat_lj_comb_x4(nbat->nbfp_comb,
 +                                               nbat->type+ash,ncz*grid->na_sc,
 +                                               nbat->lj_comb+ash*2);
 +                }
 +                else if (nbat->XFormat == nbatX8)
 +                {
 +                    copy_lj_to_nbat_lj_comb_x8(nbat->nbfp_comb,
 +                                               nbat->type+ash,ncz*grid->na_sc,
 +                                               nbat->lj_comb+ash*2);
 +                }
 +            }
 +        }
 +    }
 +}
 +
 +/* Sets the charges in nbnxn_atomdata_t *nbat */
 +static void nbnxn_atomdata_set_charges(nbnxn_atomdata_t *nbat,
 +                                       int ngrid,
 +                                       const nbnxn_search_t nbs,
 +                                       const real *charge)
 +{
 +    int  g,cxy,ncz,ash,na,na_round,i,j;
 +    real *q;
 +    const nbnxn_grid_t *grid;
 +
 +    for(g=0; g<ngrid; g++)
 +    {
 +        grid = &nbs->grid[g];
 +
 +        /* Loop over all columns and copy and fill */
 +        for(cxy=0; cxy<grid->ncx*grid->ncy; cxy++)
 +        {
 +            ash = (grid->cell0 + grid->cxy_ind[cxy])*grid->na_sc;
 +            na  = grid->cxy_na[cxy];
 +            na_round = (grid->cxy_ind[cxy+1] - grid->cxy_ind[cxy])*grid->na_sc;
 +
 +            if (nbat->XFormat == nbatXYZQ)
 +            {
 +                q = nbat->x + ash*STRIDE_XYZQ + ZZ + 1;
 +                for(i=0; i<na; i++)
 +                {
 +                    *q = charge[nbs->a[ash+i]];
 +                    q += STRIDE_XYZQ;
 +                }
 +                /* Complete the partially filled last cell with zeros */
 +                for(; i<na_round; i++)
 +                {
 +                    *q = 0;
 +                    q += STRIDE_XYZQ;
 +                }
 +            }
 +            else
 +            {
 +                q = nbat->q + ash;
 +                for(i=0; i<na; i++)
 +                {
 +                    *q = charge[nbs->a[ash+i]];
 +                    q++;
 +                }
 +                /* Complete the partially filled last cell with zeros */
 +                for(; i<na_round; i++)
 +                {
 +                    *q = 0;
 +                    q++;
 +                }
 +            }
 +        }
 +    }
 +}
 +
 +/* Copies the energy group indices to a reordered and packed array */
 +static void copy_egp_to_nbat_egps(const int *a,int na,int na_round,
 +                                  int na_c,int bit_shift,
 +                                  const int *in,int *innb)
 +{
 +    int i,j,sa,at;
 +    int comb;
 +
 +    j = 0;
 +    for(i=0; i<na; i+=na_c)
 +    {
 +        /* Store na_c energy group numbers into one int */
 +        comb = 0;
 +        for(sa=0; sa<na_c; sa++)
 +        {
 +            at = a[i+sa];
 +            if (at >= 0)
 +            {
 +                comb |= (GET_CGINFO_GID(in[at]) << (sa*bit_shift));
 +            }
 +        }
 +        innb[j++] = comb;
 +    }
 +    /* Complete the partially filled last cell with fill */
 +    for(; i<na_round; i+=na_c)
 +    {
 +        innb[j++] = 0;
 +    }
 +}
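
copy_egp_to_nbat_egps above packs na_c group indices into a single int, bit_shift (i.e. neg_2log) bits per index; a consumer recovers each index with a shift and mask. A minimal pack/unpack round trip, assuming a hypothetical cluster of four atoms and 2-bit indices:

    #include <stdio.h>

    int main(void)
    {
        /* Pack four 2-bit energy-group indices into one int, as above */
        int egp[4] = { 1, 0, 3, 2 };
        int na_c = 4, bit_shift = 2;
        int comb = 0, sa;

        for (sa = 0; sa < na_c; sa++)
        {
            comb |= egp[sa] << (sa*bit_shift);
        }
        /* Unpack with a ((1<<bit_shift)-1) mask */
        for (sa = 0; sa < na_c; sa++)
        {
            printf("atom %d: group %d\n",
                   sa, (comb >> (sa*bit_shift)) & ((1 << bit_shift) - 1));
        }
        return 0;
    }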
 +
 +/* Set the energy group indices for atoms in nbnxn_atomdata_t */
 +static void nbnxn_atomdata_set_energygroups(nbnxn_atomdata_t *nbat,
 +                                            int ngrid,
 +                                            const nbnxn_search_t nbs,
 +                                            const int *atinfo)
 +{
 +    int g,i,ncz,ash;
 +    const nbnxn_grid_t *grid;
 +
 +    for(g=0; g<ngrid; g++)
 +    {
 +        grid = &nbs->grid[g];
 +
 +        /* Loop over all columns and copy and fill */
 +        for(i=0; i<grid->ncx*grid->ncy; i++)
 +        {
 +            ncz = grid->cxy_ind[i+1] - grid->cxy_ind[i];
 +            ash = (grid->cell0 + grid->cxy_ind[i])*grid->na_sc;
 +
 +            copy_egp_to_nbat_egps(nbs->a+ash,grid->cxy_na[i],ncz*grid->na_sc,
 +                                  nbat->na_c,nbat->neg_2log,
 +                                  atinfo,nbat->energrp+(ash>>grid->na_c_2log));
 +        }
 +    }
 +}
 +
 +/* Sets all required atom parameter data in nbnxn_atomdata_t */
 +void nbnxn_atomdata_set(nbnxn_atomdata_t *nbat,
 +                        int locality,
 +                        const nbnxn_search_t nbs,
 +                        const t_mdatoms *mdatoms,
 +                        const int *atinfo)
 +{
 +    int ngrid;
 +
 +    if (locality == eatLocal)
 +    {
 +        ngrid = 1;
 +    }
 +    else
 +    {
 +        ngrid = nbs->ngrid;
 +    }
 +
 +    nbnxn_atomdata_set_atomtypes(nbat,ngrid,nbs,mdatoms->typeA);
 +
 +    nbnxn_atomdata_set_charges(nbat,ngrid,nbs,mdatoms->chargeA);
 +
 +    if (nbat->nenergrp > 1)
 +    {
 +        nbnxn_atomdata_set_energygroups(nbat,ngrid,nbs,atinfo);
 +    }
 +}
 +
 +/* Copies the shift vector array to nbnxn_atomdata_t */
 +void nbnxn_atomdata_copy_shiftvec(gmx_bool bDynamicBox,
 +                                   rvec *shift_vec,
 +                                   nbnxn_atomdata_t *nbat)
 +{
 +    int i;
 +
 +    nbat->bDynamicBox = bDynamicBox;
 +    for(i=0; i<SHIFTS; i++)
 +    {
 +        copy_rvec(shift_vec[i],nbat->shift_vec[i]);
 +    }
 +}
 +
 +/* Copies (and reorders) the coordinates to nbnxn_atomdata_t */
 +void nbnxn_atomdata_copy_x_to_nbat_x(const nbnxn_search_t nbs,
 +                                      int locality,
 +                                      gmx_bool FillLocal,
 +                                      rvec *x,
 +                                      nbnxn_atomdata_t *nbat)
 +{
 +    int g0=0,g1=0;
 +    int nth,th;
 +
 +    switch (locality)
 +    {
 +    case eatAll:
 +        g0 = 0;
 +        g1 = nbs->ngrid;
 +        break;
 +    case eatLocal:
 +        g0 = 0;
 +        g1 = 1;
 +        break;
 +    case eatNonlocal:
 +        g0 = 1;
 +        g1 = nbs->ngrid;
 +        break;
 +    }
 +
 +    if (FillLocal)
 +    {
 +        nbat->natoms_local = nbs->grid[0].nc*nbs->grid[0].na_sc;
 +    }
 +
 +    nth = gmx_omp_nthreads_get(emntPairsearch);
 +
 +#pragma omp parallel for num_threads(nth) schedule(static)
 +    for(th=0; th<nth; th++)
 +    {
 +        int g;
 +
 +        for(g=g0; g<g1; g++)
 +        {
 +            const nbnxn_grid_t *grid;
 +            int cxy0,cxy1,cxy;
 +
 +            grid = &nbs->grid[g];
 +
 +            cxy0 = (grid->ncx*grid->ncy* th   +nth-1)/nth;
 +            cxy1 = (grid->ncx*grid->ncy*(th+1)+nth-1)/nth;
 +
 +            for(cxy=cxy0; cxy<cxy1; cxy++)
 +            {
 +                int na,ash,na_fill;
 +
 +                na  = grid->cxy_na[cxy];
 +                ash = (grid->cell0 + grid->cxy_ind[cxy])*grid->na_sc;
 +
 +                if (g == 0 && FillLocal)
 +                {
 +                    na_fill =
 +                        (grid->cxy_ind[cxy+1] - grid->cxy_ind[cxy])*grid->na_sc;
 +                }
 +                else
 +                {
 +                    /* We fill only the real particle locations.
 +                     * We assume the filling entries at the end have been
 +                     * properly set earlier, during neighbor search.
 +                     */
 +                    na_fill = na;
 +                }
 +                copy_rvec_to_nbat_real(nbs->a+ash,na,na_fill,x,
 +                                       nbat->XFormat,nbat->x,ash,
 +                                       0,0,0);
 +            }
 +        }
 +    }
 +}
 +
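The per-thread column ranges in the loop above use a ceiling-style split, (n*th + nth-1)/nth. A small sketch (a hypothetical helper, not part of the source) showing that consecutive ranges tile [0, ncols) exactly:

    #include <assert.h>

    /* Sketch: thread th owns columns [cxy0, cxy1); each range starts where
     * the previous one ended and the last range ends at ncols, so the split
     * is contiguous and complete for any nth >= 1.
     */
    static void column_partition_demo(int ncols, int nth)
    {
        int th, prev_end = 0;

        for (th = 0; th < nth; th++)
        {
            int cxy0 = (ncols* th    + nth - 1)/nth;
            int cxy1 = (ncols*(th+1) + nth - 1)/nth;

            assert(cxy0 == prev_end);
            prev_end = cxy1;
        }
        assert(prev_end == ncols);
    }
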
 +static void
 +nbnxn_atomdata_clear_reals(real * gmx_restrict dest,
 +                           int i0, int i1)
 +{
 +    int i;
 +
 +    for(i=i0; i<i1; i++)
 +    {
 +        dest[i] = 0;
 +    }
 +}
 +
 +static void
 +nbnxn_atomdata_reduce_reals(real * gmx_restrict dest,
 +                            gmx_bool bDestSet,
 +                            real ** gmx_restrict src,
 +                            int nsrc,
 +                            int i0, int i1)
 +{
 +    int i,s;
 +
 +    if (bDestSet)
 +    {
 +        /* The destination buffer contains data, add to it */
 +        for(i=i0; i<i1; i++)
 +        {
 +            for(s=0; s<nsrc; s++)
 +            {
 +                dest[i] += src[s][i];
 +            }
 +        }
 +    }
 +    else
 +    {
 +        /* The destination buffer is uninitialized, set it first */
 +        for(i=i0; i<i1; i++)
 +        {
 +            dest[i] = src[0][i];
 +            for(s=1; s<nsrc; s++)
 +            {
 +                dest[i] += src[s][i];
 +            }
 +        }
 +    }
 +}
 +
 +static void
 +nbnxn_atomdata_reduce_reals_x86_simd(real * gmx_restrict dest,
 +                                     gmx_bool bDestSet,
 +                                     real ** gmx_restrict src,
 +                                     int nsrc,
 +                                     int i0, int i1)
 +{
 +#ifdef NBNXN_SEARCH_SSE
 +/* We can use AVX256 here, but not when AVX128 kernels are selected.
 + * As this reduction is not faster with AVX256 anyway, we use 128-bit SIMD.
 + */
 +#ifdef GMX_X86_AVX_256
 +#define GMX_MM256_HERE
 +#else
 +#define GMX_MM128_HERE
 +#endif
-         for(i=i0; i<i1; i+=GMX_X86_SIMD_WIDTH_HERE)
++#include "gmx_simd_macros.h"
 +
 +    int       i,s;
 +    gmx_mm_pr dest_SSE,src_SSE;
 +
 +    if (bDestSet)
 +    {
-         for(i=i0; i<i1; i+=GMX_X86_SIMD_WIDTH_HERE)
++        for(i=i0; i<i1; i+=GMX_SIMD_WIDTH_HERE)
 +        {
 +            dest_SSE = gmx_load_pr(dest+i);
 +            for(s=0; s<nsrc; s++)
 +            {
 +                src_SSE  = gmx_load_pr(src[s]+i);
 +                dest_SSE = gmx_add_pr(dest_SSE,src_SSE);
 +            }
 +            gmx_store_pr(dest+i,dest_SSE);
 +        }
 +    }
 +    else
 +    {
++        for(i=i0; i<i1; i+=GMX_SIMD_WIDTH_HERE)
 +        {
 +            dest_SSE = gmx_load_pr(src[0]+i);
 +            for(s=1; s<nsrc; s++)
 +            {
 +                src_SSE  = gmx_load_pr(src[s]+i);
 +                dest_SSE = gmx_add_pr(dest_SSE,src_SSE);
 +            }
 +            gmx_store_pr(dest+i,dest_SSE);
 +        }
 +    }
 +
 +#undef GMX_MM128_HERE
 +#undef GMX_MM256_HERE
 +#endif
 +}
 +
 +/* Add part of the force array(s) from nbnxn_atomdata_t to f */
 +static void
 +nbnxn_atomdata_add_nbat_f_to_f_part(const nbnxn_search_t nbs,
 +                                    const nbnxn_atomdata_t *nbat,
 +                                    nbnxn_atomdata_output_t *out,
 +                                    int nfa,
 +                                    int a0,int a1,
 +                                    rvec *f)
 +{
 +    int  a,i,fa;
 +    const int  *cell;
 +    const real *fnb;
 +
 +    cell = nbs->cell;
 +
 +    /* Loop over the atom range and add the buffered forces to f */
 +    switch (nbat->FFormat)
 +    {
 +    case nbatXYZ:
 +    case nbatXYZQ:
 +        if (nfa == 1)
 +        {
 +            fnb = out[0].f;
 +
 +            for(a=a0; a<a1; a++)
 +            {
 +                i = cell[a]*nbat->fstride;
 +
 +                f[a][XX] += fnb[i];
 +                f[a][YY] += fnb[i+1];
 +                f[a][ZZ] += fnb[i+2];
 +            }
 +        }
 +        else
 +        {
 +            for(a=a0; a<a1; a++)
 +            {
 +                i = cell[a]*nbat->fstride;
 +
 +                for(fa=0; fa<nfa; fa++)
 +                {
 +                    f[a][XX] += out[fa].f[i];
 +                    f[a][YY] += out[fa].f[i+1];
 +                    f[a][ZZ] += out[fa].f[i+2];
 +                }
 +            }
 +        }
 +        break;
 +    case nbatX4:
 +        if (nfa == 1)
 +        {
 +            fnb = out[0].f;
 +
 +            for(a=a0; a<a1; a++)
 +            {
 +                i = X4_IND_A(cell[a]);
 +
 +                f[a][XX] += fnb[i+XX*PACK_X4];
 +                f[a][YY] += fnb[i+YY*PACK_X4];
 +                f[a][ZZ] += fnb[i+ZZ*PACK_X4];
 +            }
 +        }
 +        else
 +        {
 +            for(a=a0; a<a1; a++)
 +            {
 +                i = X4_IND_A(cell[a]);
 +                
 +                for(fa=0; fa<nfa; fa++)
 +                {
 +                    f[a][XX] += out[fa].f[i+XX*PACK_X4];
 +                    f[a][YY] += out[fa].f[i+YY*PACK_X4];
 +                    f[a][ZZ] += out[fa].f[i+ZZ*PACK_X4];
 +                }
 +            }
 +        }
 +        break;
 +    case nbatX8:
 +        if (nfa == 1)
 +        {
 +            fnb = out[0].f;
 +
 +            for(a=a0; a<a1; a++)
 +            {
 +                i = X8_IND_A(cell[a]);
 +
 +                f[a][XX] += fnb[i+XX*PACK_X8];
 +                f[a][YY] += fnb[i+YY*PACK_X8];
 +                f[a][ZZ] += fnb[i+ZZ*PACK_X8];
 +            }
 +        }
 +        else
 +        {
 +            for(a=a0; a<a1; a++)
 +            {
 +                i = X8_IND_A(cell[a]);
 +                
 +                for(fa=0; fa<nfa; fa++)
 +                {
 +                    f[a][XX] += out[fa].f[i+XX*PACK_X8];
 +                    f[a][YY] += out[fa].f[i+YY*PACK_X8];
 +                    f[a][ZZ] += out[fa].f[i+ZZ*PACK_X8];
 +                }
 +            }
 +        }
 +        break;
 +    }
 +}
 +
 +/* Add the force array(s) from nbnxn_atomdata_t to f */
 +void nbnxn_atomdata_add_nbat_f_to_f(const nbnxn_search_t nbs,
 +                                    int locality,
 +                                    const nbnxn_atomdata_t *nbat,
 +                                    rvec *f)
 +{
 +    int a0=0,na=0;
 +    int nth,th;
 +
 +    nbs_cycle_start(&nbs->cc[enbsCCreducef]);
 +
 +    switch (locality)
 +    {
 +    case eatAll:
 +        a0 = 0;
 +        na = nbs->natoms_nonlocal;
 +        break;
 +    case eatLocal:
 +        a0 = 0;
 +        na = nbs->natoms_local;
 +        break;
 +    case eatNonlocal:
 +        a0 = nbs->natoms_local;
 +        na = nbs->natoms_nonlocal - nbs->natoms_local;
 +        break;
 +    }
 +
 +    nth = gmx_omp_nthreads_get(emntNonbonded);
 +
 +    if (nbat->nout > 1)
 +    {
 +        if (locality != eatAll)
 +        {
 +            gmx_incons("add_f_to_f called with nout>1 and locality!=eatAll");
 +        }
 +
 +        /* Reduce the force thread output buffers into buffer 0, before adding
 +         * them to the differently ordered "real" force buffer.
 +         */
 +#pragma omp parallel for num_threads(nth) schedule(static)
 +        for(th=0; th<nth; th++)
 +        {
 +            const nbnxn_buffer_flags_t *flags;
 +            int b0,b1,b;
 +            int i0,i1;
 +            int nfptr;
 +            real *fptr[NBNXN_BUFFERFLAG_MAX_THREADS];
 +            int out;
 +
 +            flags = &nbat->buffer_flags;
 +
 +            /* Calculate the cell-block range for our thread */
 +            b0 = (flags->nflag* th   )/nth;
 +            b1 = (flags->nflag*(th+1))/nth;
 +
 +            for(b=b0; b<b1; b++)
 +            {
 +                i0 =  b   *NBNXN_BUFFERFLAG_SIZE*nbat->fstride;
 +                i1 = (b+1)*NBNXN_BUFFERFLAG_SIZE*nbat->fstride;
 +
 +                nfptr = 0;
 +                for(out=1; out<nbat->nout; out++)
 +                {
 +                    if (flags->flag[b] & (1U<<out))
 +                    {
 +                        fptr[nfptr++] = nbat->out[out].f;
 +                    }
 +                }
 +                if (nfptr > 0)
 +                {
 +#ifdef NBNXN_SEARCH_SSE
 +                    nbnxn_atomdata_reduce_reals_x86_simd
 +#else
 +                    nbnxn_atomdata_reduce_reals
 +#endif
 +                                               (nbat->out[0].f,
 +                                                flags->flag[b] & (1U<<0),
 +                                                fptr,nfptr,
 +                                                i0,i1);
 +                }
 +                else if (!(flags->flag[b] & (1U<<0)))
 +                {
 +                    nbnxn_atomdata_clear_reals(nbat->out[0].f,
 +                                               i0,i1);
 +                }
 +            }
 +        }
 +    }
 +
 +#pragma omp parallel for num_threads(nth) schedule(static)
 +    for(th=0; th<nth; th++)
 +    {
 +        nbnxn_atomdata_add_nbat_f_to_f_part(nbs,nbat,
 +                                            nbat->out,
 +                                            1,
 +                                            a0+((th+0)*na)/nth,
 +                                            a0+((th+1)*na)/nth,
 +                                            f);
 +    }
 +
 +    nbs_cycle_stop(&nbs->cc[enbsCCreducef]);
 +}
 +
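The flag logic above is the heart of the multi-buffer reduction: bit `out` of flags->flag[b] records that thread output buffer `out` wrote to cell block b. A reduced sketch of that per-block decision, with flag and nout as stand-in inputs:

    /* Sketch: for one cell block, count the contributing buffers 1..nout-1;
     * reduce them into buffer 0 if any contributed, otherwise clear
     * buffer 0 unless it was written directly (bit 0 set).
     */
    static void reduce_block_demo(unsigned int flag, int nout)
    {
        int out, nfptr = 0;

        for (out = 1; out < nout; out++)
        {
            if (flag & (1U << out))
            {
                nfptr++;
            }
        }
        if (nfptr > 0)
        {
            /* reduce the nfptr source buffers into buffer 0; whether
             * buffer 0 already holds data is given by (flag & (1U << 0)) */
        }
        else if (!(flag & (1U << 0)))
        {
            /* no buffer touched this block: zero buffer 0 here */
        }
    }
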
 +/* Adds the shift forces from nbnxn_atomdata_t to fshift */
 +void nbnxn_atomdata_add_nbat_fshift_to_fshift(const nbnxn_atomdata_t *nbat,
 +                                              rvec *fshift)
 +{
 +    const nbnxn_atomdata_output_t *out;
 +    int  th;
 +    int  s;
 +    rvec sum;
 +
 +    out = nbat->out;
 +    
 +    for(s=0; s<SHIFTS; s++)
 +    {
 +        clear_rvec(sum);
 +        for(th=0; th<nbat->nout; th++)
 +        {
 +            sum[XX] += out[th].fshift[s*DIM+XX];
 +            sum[YY] += out[th].fshift[s*DIM+YY];
 +            sum[ZZ] += out[th].fshift[s*DIM+ZZ];
 +        }
 +        rvec_inc(fshift[s],sum);
 +    }
 +}
index 48722c680dd8be55558185ad908efb0b5ac91982,0000000000000000000000000000000000000000..a9b8e2d4419283b62dc1de822ab2e6428e0f1e75
mode 100644,000000..100644
--- /dev/null
@@@ -1,242 -1,0 +1,244 @@@
- #ifdef NBNXN_SEARCH_SSE
 +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
 + *
 + *
 + *                This source code is part of
 + *
 + *                 G   R   O   M   A   C   S
 + *
 + *          GROningen MAchine for Chemical Simulations
 + *
 + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
 + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 + * Copyright (c) 2001-2012, The GROMACS development team,
 + * check out http://www.gromacs.org for more information.
 + *
 + * This program is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU General Public License
 + * as published by the Free Software Foundation; either version 2
 + * of the License, or (at your option) any later version.
 + *
 + * If you want to redistribute modifications, please consider that
 + * scientific software is very special. Version control is crucial -
 + * bugs must be traceable. We will be happy to consider code for
 + * inclusion in the official distribution, but derived work must not
 + * be called official GROMACS. Details are found in the README & COPYING
 + * files - if they are missing, get the official version at www.gromacs.org.
 + *
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the papers on the package - you can find them in the top README file.
 + *
 + * For more info, check our website at http://www.gromacs.org
 + *
 + * And Hey:
 + * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon
 + */
 +
 +#ifndef _nbnxn_internal_h
 +#define _nbnxn_internal_h
 +
 +#include "typedefs.h"
 +#include "domdec.h"
 +#include "gmx_cyclecounter.h"
 +
 +#ifdef __cplusplus
 +extern "C" {
 +#endif
 +
 +
 +#ifdef GMX_X86_SSE2
 +#define NBNXN_SEARCH_SSE
 +#endif
 +
 +
 +/* A pair-search grid struct for one domain decomposition zone */
 +typedef struct {
 +    rvec c0;             /* The lower corner of the (local) grid        */
 +    rvec c1;             /* The upper corner of the (local) grid        */
 +    real atom_density;   /* The atom number density for the local grid  */
 +
 +    gmx_bool bSimple;    /* Is this grid simple or super/sub            */
 +    int  na_c;           /* Number of atoms per cluster                 */
 +    int  na_cj;          /* Number of atoms for list j-clusters         */
 +    int  na_sc;          /* Number of atoms per super-cluster           */
 +    int  na_c_2log;      /* 2log of na_c                                */
 +
 +    int  ncx;            /* Number of (super-)cells along x             */
 +    int  ncy;            /* Number of (super-)cells along y             */
 +    int  nc;             /* Total number of (super-)cells               */
 +
 +    real sx;             /* x-size of a (super-)cell                    */
 +    real sy;             /* y-size of a (super-)cell                    */
 +    real inv_sx;         /* 1/sx                                        */
 +    real inv_sy;         /* 1/sy                                        */
 +
 +    int  cell0;          /* Index in nbs->cell corresponding to cell 0  */
 +
 +    int  *cxy_na;        /* The number of atoms for each column in x,y  */
 +    int  *cxy_ind;       /* Grid (super)cell index, offset from cell0   */
 +    int  cxy_nalloc;     /* Allocation size for cxy_na and cxy_ind      */
 +
 +    int   *nsubc;        /* The number of sub cells for each super cell */
 +    float *bbcz;         /* Bounding boxes in z for the super cells     */
 +    float *bb;           /* 3D bounding boxes for the sub cells         */
 +    float *bbj;          /* 3D j-b.boxes for SSE-double or AVX-single   */
 +    int   *flags;        /* Flag for the super cells                    */
 +    int   nc_nalloc;     /* Allocation size for the pointers above      */
 +
 +    float *bbcz_simple;  /* bbcz for simple grid converted from super   */
 +    float *bb_simple;    /* bb for simple grid converted from super     */
 +    int   *flags_simple; /* flags for simple grid converted from super  */
 +    int   nc_nalloc_simple; /* Allocation size for the pointers above   */
 +
 +    int  nsubc_tot;      /* Total number of subcells, used for printing */
 +} nbnxn_grid_t;
 +
- #include "gmx_x86_simd_macros.h"
- typedef struct nbnxn_x_ci_x86_simd128 {
++#ifdef GMX_NBNXN_SIMD
++#if GMX_NBNXN_SIMD_BITWIDTH == 128
 +#define GMX_MM128_HERE
- } nbnxn_x_ci_x86_simd128_t;
- #undef GMX_MM128_HERE
- #ifdef GMX_X86_AVX_256
- #define GMX_MM256_HERE
- #include "gmx_x86_simd_macros.h"
- typedef struct nbnxn_x_ci_x86_simd256 {
++#else
++#if GMX_NBNXN_SIMD_BITWIDTH == 256
++#define GMX_MM256_HERE
++#else
++#error "unsupported GMX_NBNXN_SIMD_BITWIDTH"
++#endif
++#endif
++#include "gmx_simd_macros.h"
++
++typedef struct nbnxn_x_ci_simd_4xn {
 +    /* The i-cluster coordinates for simple search */
 +    gmx_mm_pr ix_SSE0,iy_SSE0,iz_SSE0;
 +    gmx_mm_pr ix_SSE1,iy_SSE1,iz_SSE1;
 +    gmx_mm_pr ix_SSE2,iy_SSE2,iz_SSE2;
 +    gmx_mm_pr ix_SSE3,iy_SSE3,iz_SSE3;
-     gmx_mm_pr ix_SSE1,iy_SSE1,iz_SSE1;
++} nbnxn_x_ci_simd_4xn_t;
++
++typedef struct nbnxn_x_ci_simd_2xnn {
 +    /* The i-cluster coordinates for simple search */
 +    gmx_mm_pr ix_SSE0,iy_SSE0,iz_SSE0;
-     gmx_mm_pr ix_SSE3,iy_SSE3,iz_SSE3;
- } nbnxn_x_ci_x86_simd256_t;
- #undef GMX_MM256_HERE
- #endif
 +    gmx_mm_pr ix_SSE2,iy_SSE2,iz_SSE2;
- #ifdef NBNXN_SEARCH_SSE
-     nbnxn_x_ci_x86_simd128_t *x_ci_x86_simd128;
- #ifdef GMX_X86_AVX_256
-     nbnxn_x_ci_x86_simd256_t *x_ci_x86_simd256;
- #endif
++} nbnxn_x_ci_simd_2xnn_t;
++
 +#endif
 +
 +/* Working data for the actual i-supercell during pair search */
 +typedef struct nbnxn_list_work {
 +    gmx_cache_protect_t cp0; /* Protect cache between threads               */
 +
 +    float *bb_ci;      /* The bounding boxes, pbc shifted, for each cluster */
 +    real  *x_ci;       /* The coordinates, pbc shifted, for each atom       */
- #ifdef NBNXN_SEARCH_SSE
- static gmx_icell_set_x_t icell_set_x_simple_x86_simd128;
- #ifdef GMX_X86_AVX_256
- static gmx_icell_set_x_t icell_set_x_simple_x86_simd256;
- #endif
++#ifdef GMX_NBNXN_SIMD
++    nbnxn_x_ci_simd_4xn_t *x_ci_simd_4xn;
++    nbnxn_x_ci_simd_2xnn_t *x_ci_simd_2xnn;
 +#endif
 +    int  cj_ind;       /* The current cj_ind index for the current list     */
 +    int  cj4_init;     /* The first uninitialized cj4 block                 */
 +
 +    float *d2;         /* Bounding box distance work array                  */
 +
 +    nbnxn_cj_t *cj;    /* The j-cell list                                   */
 +    int  cj_nalloc;    /* Allocation size of cj                             */
 +
 +    int ncj_noq;       /* Nr. of cluster pairs without Coul for flop count  */
 +    int ncj_hlj;       /* Nr. of cluster pairs with 1/2 LJ for flop count   */
 +
 +    gmx_cache_protect_t cp1; /* Protect cache between threads               */
 +} nbnxn_list_work_t;
 +
 +/* Function type for setting the i-atom coordinate working data */
 +typedef void
 +gmx_icell_set_x_t(int ci,
 +                  real shx,real shy,real shz,
 +                  int na_c,
 +                  int stride,const real *x,
 +                  nbnxn_list_work_t *work);
 +
 +static gmx_icell_set_x_t icell_set_x_simple;
++#ifdef GMX_NBNXN_SIMD
++static gmx_icell_set_x_t icell_set_x_simple_simd_4xn;
++static gmx_icell_set_x_t icell_set_x_simple_simd_2xnn;
 +#endif
 +static gmx_icell_set_x_t icell_set_x_supersub;
 +#ifdef NBNXN_SEARCH_SSE
 +static gmx_icell_set_x_t icell_set_x_supersub_sse8;
 +#endif
 +
++#undef GMX_MM128_HERE
++#undef GMX_MM256_HERE
++
 +/* Local cycle count struct for profiling */
 +typedef struct {
 +    int          count;
 +    gmx_cycles_t c;
 +    gmx_cycles_t start;
 +} nbnxn_cycle_t;
 +
 +/* Local cycle count enum for profiling */
 +enum { enbsCCgrid, enbsCCsearch, enbsCCcombine, enbsCCreducef, enbsCCnr };
 +
 +/* Thread-local work struct, contains part of nbnxn_grid_t */
 +typedef struct {
 +    gmx_cache_protect_t cp0;
 +
 +    int *cxy_na;
 +    int cxy_na_nalloc;
 +
 +    int  *sort_work;
 +    int  sort_work_nalloc;
 +
 +    nbnxn_buffer_flags_t buffer_flags; /* Flags for force buffer access */
 +
 +    int  ndistc;         /* Number of distance checks for flop counting */
 +
 +    nbnxn_cycle_t cc[enbsCCnr];
 +
 +    gmx_cache_protect_t cp1;
 +} nbnxn_search_work_t;
 +
 +/* Main pair-search struct, contains the grid(s), not the pair-list(s) */
 +typedef struct nbnxn_search {
 +    int  ePBC;            /* PBC type enum                              */
 +    matrix box;           /* The periodic unit-cell                     */
 +
 +    gmx_bool DomDec;      /* Are we doing domain decomposition?         */
 +    ivec dd_dim;          /* Are we doing DD in x,y,z?                  */
 +    gmx_domdec_zones_t *zones; /* The domain decomposition zones        */
 +
 +    int  ngrid;           /* The number of grids, equal to #DD-zones    */
 +    nbnxn_grid_t *grid;   /* Array of grids, size ngrid                 */
 +    int  *cell;           /* Actual allocated cell array for all grids  */
 +    int  cell_nalloc;     /* Allocation size of cell                    */
 +    int  *a;              /* Atom index for grid, the inverse of cell   */
 +    int  a_nalloc;        /* Allocation size of a                       */
 +
 +    int  natoms_local;    /* The local atoms run from 0 to natoms_local */
 +    int  natoms_nonlocal; /* The non-local atoms run from natoms_local
 +                           * to natoms_nonlocal */
 +
 +    gmx_bool print_cycles;
 +    int      search_count;
 +    nbnxn_cycle_t cc[enbsCCnr];
 +
 +    gmx_icell_set_x_t *icell_set_x; /* Function for setting i-coords    */
 +
 +    int  nthread_max;     /* Maximum number of threads for pair-search  */
 +    nbnxn_search_work_t *work; /* Work array, size nthread_max          */
 +} nbnxn_search_t_t;
 +
 +
 +static void nbs_cycle_start(nbnxn_cycle_t *cc)
 +{
 +    cc->start = gmx_cycles_read();
 +}
 +
 +static void nbs_cycle_stop(nbnxn_cycle_t *cc)
 +{
 +    cc->c += gmx_cycles_read() - cc->start;
 +    cc->count++;
 +}
 +
 +
 +#ifdef __cplusplus
 +}
 +#endif
 +
 +#endif
index 0000000000000000000000000000000000000000,c3a6b3b6ee90a4ef70c3f824e7c483692df68272..c3a6b3b6ee90a4ef70c3f824e7c483692df68272
mode 000000,100644..100644
--- /dev/null
index 0000000000000000000000000000000000000000,67e63b1b49380b11d1de1cee0d9b236709de57a1..67e63b1b49380b11d1de1cee0d9b236709de57a1
mode 000000,100644..100644
--- /dev/null
index 0000000000000000000000000000000000000000,cab66c3e346310461148ab9489e42a5cde9eab5b..cab66c3e346310461148ab9489e42a5cde9eab5b
mode 000000,100644..100644
--- /dev/null
index 0000000000000000000000000000000000000000,faa445efbfb1fb2ef9d67becdbc88ed1a9b46de2..faa445efbfb1fb2ef9d67becdbc88ed1a9b46de2
mode 000000,100644..100644
--- /dev/null
index 0000000000000000000000000000000000000000,a9d4d19802adc75155db56133959ecdaf9a74056..a9d4d19802adc75155db56133959ecdaf9a74056
mode 000000,100644..100644
--- /dev/null
index 0000000000000000000000000000000000000000,07da218f247e9987683ef2d43557ee7dd28210da..07da218f247e9987683ef2d43557ee7dd28210da
mode 000000,100644..100644
--- /dev/null
index 56ac987c6ee04bb8809973264a8d7dac10bfdfbb,0000000000000000000000000000000000000000..c0f08bd5614733664b035f7cce225a61265f2cac
mode 100644,000000..100644
--- /dev/null
@@@ -1,4925 -1,0 +1,4942 @@@
- #include "gmx_x86_simd_single.h"
- #else
- #include "gmx_x86_simd_double.h"
 +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
 + *
 + *
 + *                This source code is part of
 + *
 + *                 G   R   O   M   A   C   S
 + *
 + *          GROningen MAchine for Chemical Simulations
 + *
 + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
 + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 + * Copyright (c) 2001-2012, The GROMACS development team,
 + * check out http://www.gromacs.org for more information.
 + *
 + * This program is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU General Public License
 + * as published by the Free Software Foundation; either version 2
 + * of the License, or (at your option) any later version.
 + *
 + * If you want to redistribute modifications, please consider that
 + * scientific software is very special. Version control is crucial -
 + * bugs must be traceable. We will be happy to consider code for
 + * inclusion in the official distribution, but derived work must not
 + * be called official GROMACS. Details are found in the README & COPYING
 + * files - if they are missing, get the official version at www.gromacs.org.
 + *
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the papers on the package - you can find them in the top README file.
 + *
 + * For more info, check our website at http://www.gromacs.org
 + */
 +
 +#ifdef HAVE_CONFIG_H
 +#include <config.h>
 +#endif
 +
 +#include <math.h>
 +#include <string.h>
 +#include "sysstuff.h"
 +#include "smalloc.h"
 +#include "macros.h"
 +#include "maths.h"
 +#include "vec.h"
 +#include "pbc.h"
 +#include "nbnxn_consts.h"
 +#include "nbnxn_internal.h"
 +#include "nbnxn_atomdata.h"
 +#include "nbnxn_search.h"
 +#include "gmx_cyclecounter.h"
 +#include "gmxfio.h"
 +#include "gmx_omp_nthreads.h"
 +#include "nrnb.h"
 +
 +
 +/* Pair search box lower and upper corner in x,y,z.
 + * Store this in 4 instead of 3 reals, which is useful with SSE.
 + * To avoid complicating the code we also use 4 without SSE.
 + */
 +#define NNBSBB_C         4
 +#define NNBSBB_B         (2*NNBSBB_C)
 +/* Pair search box lower and upper bound in z only. */
 +#define NNBSBB_D         2
 +/* Pair search box lower and upper corner x,y,z indices */
 +#define BBL_X  0
 +#define BBL_Y  1
 +#define BBL_Z  2
 +#define BBU_X  4
 +#define BBU_Y  5
 +#define BBU_Z  6
 +
 +
 +#ifdef NBNXN_SEARCH_SSE
 +
 +#ifndef GMX_DOUBLE
 +#define NBNXN_SEARCH_SSE_SINGLE
- #ifndef GMX_DOUBLE
- /* 128 bits can hold 4 floats */
- #define CI_TO_CJ_S128(ci)  CI_TO_CJ_J4(ci)
- #define X_IND_CI_S128(ci)  X_IND_CI_J4(ci)
- #define X_IND_CJ_S128(cj)  X_IND_CJ_J4(cj)
- /* 256 bits can hold 8 floats */
- #define CI_TO_CJ_S256(ci)  CI_TO_CJ_J8(ci)
- #define X_IND_CI_S256(ci)  X_IND_CI_J8(ci)
- #define X_IND_CJ_S256(cj)  X_IND_CJ_J8(cj)
 +#endif
 +
++/* Include basic SSE2 stuff */
++#include <emmintrin.h>
++
 +#if defined NBNXN_SEARCH_SSE_SINGLE && GPU_NSUBCELL == 8
 +#define NBNXN_8BB_SSE
 +#endif
 +
 +/* The width of SSE/AVX128 with single precision for bounding boxes with GPU.
 + * Here AVX-256 turns out to be slightly slower than AVX-128.
 + */
 +#define STRIDE_8BB        4
 +#define STRIDE_8BB_2LOG   2
 +
++#endif /* NBNXN_SEARCH_SSE */
++
++#ifdef GMX_NBNXN_SIMD
 +
 +/* The functions below are macros as they are performance-sensitive */
 +
 +/* 4x4 list, pack=4: no complex conversion required */
 +/* i-cluster to j-cluster conversion */
 +#define CI_TO_CJ_J4(ci)   (ci)
 +/* cluster index to coordinate array index conversion */
 +#define X_IND_CI_J4(ci)  ((ci)*STRIDE_P4)
 +#define X_IND_CJ_J4(cj)  ((cj)*STRIDE_P4)
 +
 +/* 4x2 list, pack=4: j-cluster size is half the packing width */
 +/* i-cluster to j-cluster conversion */
 +#define CI_TO_CJ_J2(ci)  ((ci)<<1)
 +/* cluster index to coordinate array index conversion */
 +#define X_IND_CI_J2(ci)  ((ci)*STRIDE_P4)
 +#define X_IND_CJ_J2(cj)  (((cj)>>1)*STRIDE_P4 + ((cj) & 1)*(PACK_X4>>1))
 +
 +/* 4x8 list, pack=8: i-cluster size is half the packing width */
 +/* i-cluster to j-cluster conversion */
 +#define CI_TO_CJ_J8(ci)  ((ci)>>1)
 +/* cluster index to coordinate array index conversion */
 +#define X_IND_CI_J8(ci)  (((ci)>>1)*STRIDE_P8 + ((ci) & 1)*(PACK_X8>>1))
 +#define X_IND_CJ_J8(cj)  ((cj)*STRIDE_P8)
 +
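A quick sanity check of the conversions above; the macro bodies are repeated verbatim (under demo names) so the sketch is self-contained. For 4x2 every i-cluster spans two j-clusters, and for 4x8 two i-clusters share one j-cluster:

    #include <assert.h>

    #define DEMO_CI_TO_CJ_J4(ci)  (ci)        /* as CI_TO_CJ_J4 above */
    #define DEMO_CI_TO_CJ_J2(ci)  ((ci)<<1)   /* as CI_TO_CJ_J2 above */
    #define DEMO_CI_TO_CJ_J8(ci)  ((ci)>>1)   /* as CI_TO_CJ_J8 above */

    static void ci_to_cj_demo(void)
    {
        assert(DEMO_CI_TO_CJ_J4(5) == 5);   /* same cluster size          */
        assert(DEMO_CI_TO_CJ_J2(5) == 10);  /* half-size j-clusters       */
        assert(DEMO_CI_TO_CJ_J8(5) == 2 &&
               DEMO_CI_TO_CJ_J8(4) == 2);   /* double-size j-clusters     */
    }
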
 +/* The j-cluster size is matched to the SIMD width */
- /* 128 bits can hold 2 doubles */
- #define CI_TO_CJ_S128(ci)  CI_TO_CJ_J2(ci)
- #define X_IND_CI_S128(ci)  X_IND_CI_J2(ci)
- #define X_IND_CJ_S128(cj)  X_IND_CJ_J2(cj)
- /* 256 bits can hold 4 doubles */
- #define CI_TO_CJ_S256(ci)  CI_TO_CJ_J4(ci)
- #define X_IND_CI_S256(ci)  X_IND_CI_J4(ci)
- #define X_IND_CJ_S256(cj)  X_IND_CJ_J4(cj)
++#if GMX_NBNXN_SIMD_BITWIDTH == 128
++#ifdef GMX_DOUBLE
++#define CI_TO_CJ_SIMD_4XN(ci)  CI_TO_CJ_J2(ci)
++#define X_IND_CI_SIMD_4XN(ci)  X_IND_CI_J2(ci)
++#define X_IND_CJ_SIMD_4XN(cj)  X_IND_CJ_J2(cj)
++#else
++#define CI_TO_CJ_SIMD_4XN(ci)  CI_TO_CJ_J4(ci)
++#define X_IND_CI_SIMD_4XN(ci)  X_IND_CI_J4(ci)
++#define X_IND_CJ_SIMD_4XN(cj)  X_IND_CJ_J4(cj)
++#endif
++#else
++#if GMX_NBNXN_SIMD_BITWIDTH == 256
++#ifdef GMX_DOUBLE
++#define CI_TO_CJ_SIMD_4XN(ci)  CI_TO_CJ_J4(ci)
++#define X_IND_CI_SIMD_4XN(ci)  X_IND_CI_J4(ci)
++#define X_IND_CJ_SIMD_4XN(cj)  X_IND_CJ_J4(cj)
++#else
++#define CI_TO_CJ_SIMD_4XN(ci)  CI_TO_CJ_J8(ci)
++#define X_IND_CI_SIMD_4XN(ci)  X_IND_CI_J8(ci)
++#define X_IND_CJ_SIMD_4XN(cj)  X_IND_CJ_J8(cj)
++/* Half SIMD with j-cluster size */
++#define CI_TO_CJ_SIMD_2XNN(ci) CI_TO_CJ_J4(ci)
++#define X_IND_CI_SIMD_2XNN(ci) X_IND_CI_J4(ci)
++#define X_IND_CJ_SIMD_2XNN(cj) X_IND_CJ_J4(cj)
++#endif
 +#else
- #endif /* NBNXN_SEARCH_SSE */
++#error "unsupported GMX_NBNXN_SIMD_BITWIDTH"
++#endif
 +#endif
 +
-     case nbk4x4_PlainC:
-     case nbk4xN_X86_SIMD128:
-     case nbk4xN_X86_SIMD256:
++#endif /* GMX_NBNXN_SIMD */
 +
 +
 +/* Interaction masks for 4xN atom interactions.
 + * Bit i*CJ_SIZE + j tells whether atoms i and j interact.
 + */
 +/* All interaction mask is the same for all kernels */
 +#define NBNXN_INT_MASK_ALL        0xffffffff
 +/* 4x4 kernel diagonal mask */
 +#define NBNXN_INT_MASK_DIAG       0x08ce
 +/* 4x2 kernel diagonal masks */
 +#define NBNXN_INT_MASK_DIAG_J2_0  0x0002
 +#define NBNXN_INT_MASK_DIAG_J2_1  0x002F
 +/* 4x8 kernel diagonal masks */
 +#define NBNXN_INT_MASK_DIAG_J8_0  0xf0f8fcfe
 +#define NBNXN_INT_MASK_DIAG_J8_1  0x0080c0e0
 +
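These constants all follow one rule: bit i*CJ_SIZE + j is set exactly when the global j-atom index exceeds the global i-atom index, so only the strict upper triangle of the diagonal cluster pair survives. A small sketch that rederives all five masks; the offsets select which j-cluster, or which half of an 8-wide j-cluster, is meant:

    #include <assert.h>

    /* Rederive the diagonal masks: bit i*cj_size + j is set when the global
     * j index (j_offset + j) exceeds the global i index (i_offset + i).
     */
    static unsigned int diag_mask(int cj_size, int i_offset, int j_offset)
    {
        unsigned int mask = 0;
        int          i, j;

        for (i = 0; i < 4; i++)
        {
            for (j = 0; j < cj_size; j++)
            {
                if (j_offset + j > i_offset + i)
                {
                    mask |= 1U << (i*cj_size + j);
                }
            }
        }
        return mask;
    }

    static void diag_mask_demo(void)
    {
        assert(diag_mask(4, 0, 0) == 0x08ce);     /* 4x4                   */
        assert(diag_mask(2, 0, 0) == 0x0002);     /* 4x2, first j-cluster  */
        assert(diag_mask(2, 0, 2) == 0x002f);     /* 4x2, second j-cluster */
        assert(diag_mask(8, 0, 0) == 0xf0f8fcfe); /* 4x8, even i-cluster   */
        assert(diag_mask(8, 4, 0) == 0x0080c0e0); /* 4x8, odd i-cluster    */
    }
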
 +
 +#ifdef NBNXN_SEARCH_SSE
 +/* Store bounding boxes corners as quadruplets: xxxxyyyyzzzz */
 +#define NBNXN_BBXXXX
 +/* Size of bounding box corners quadruplet */
 +#define NNBSBB_XXXX      (NNBSBB_D*DIM*STRIDE_8BB)
 +#endif
 +
 +/* We shift the i-particles backward for PBC.
 + * This leads to more conditionals than shifting forward.
 + * We do this to get more balanced pair lists.
 + */
 +#define NBNXN_SHIFT_BACKWARD
 +
 +
 +/* This define is a lazy way to avoid interdependence of the grid
 + * and searching data structures.
 + */
 +#define NBNXN_NA_SC_MAX (GPU_NSUBCELL*NBNXN_GPU_CLUSTER_SIZE)
 +
 +
 +static void nbs_cycle_clear(nbnxn_cycle_t *cc)
 +{
 +    int i;
 +
 +    for(i=0; i<enbsCCnr; i++)
 +    {
 +        cc[i].count = 0;
 +        cc[i].c     = 0;
 +    }
 +}
 +
 +static double Mcyc_av(const nbnxn_cycle_t *cc)
 +{
 +    return (double)cc->c*1e-6/cc->count;
 +}
 +
 +static void nbs_cycle_print(FILE *fp,const nbnxn_search_t nbs)
 +{
 +    int n;
 +    int t;
 +
 +    fprintf(fp,"\n");
 +    fprintf(fp,"ns %4d grid %4.1f search %4.1f red.f %5.3f",
 +            nbs->cc[enbsCCgrid].count,
 +            Mcyc_av(&nbs->cc[enbsCCgrid]),
 +            Mcyc_av(&nbs->cc[enbsCCsearch]),
 +            Mcyc_av(&nbs->cc[enbsCCreducef]));
 +
 +    if (nbs->nthread_max > 1)
 +    {
 +        if (nbs->cc[enbsCCcombine].count > 0)
 +        {
 +            fprintf(fp," comb %5.2f",
 +                    Mcyc_av(&nbs->cc[enbsCCcombine]));
 +        }
 +        fprintf(fp," s. th");
 +        for(t=0; t<nbs->nthread_max; t++)
 +        {
 +            fprintf(fp," %4.1f",
 +                    Mcyc_av(&nbs->work[t].cc[enbsCCsearch]));
 +        }
 +    }
 +    fprintf(fp,"\n");
 +}
 +
 +static void nbnxn_grid_init(nbnxn_grid_t * grid)
 +{
 +    grid->cxy_na      = NULL;
 +    grid->cxy_ind     = NULL;
 +    grid->cxy_nalloc  = 0;
 +    grid->bb          = NULL;
 +    grid->bbj         = NULL;
 +    grid->nc_nalloc   = 0;
 +}
 +
 +static int get_2log(int n)
 +{
 +    int log2;
 +
 +    log2 = 0;
 +    while ((1<<log2) < n)
 +    {
 +        log2++;
 +    }
 +    if ((1<<log2) != n)
 +    {
 +        gmx_fatal(FARGS,"nbnxn na_c (%d) is not a power of 2",n);
 +    }
 +
 +    return log2;
 +}
 +
 +static int nbnxn_kernel_to_ci_size(int nb_kernel_type)
 +{
 +    switch (nb_kernel_type)
 +    {
-     case nbk8x8x8_CUDA:
-     case nbk8x8x8_PlainC:
++    case nbnxnk4x4_PlainC:
++    case nbnxnk4xN_SIMD_4xN:
++    case nbnxnk4xN_SIMD_2xNN:
 +        return NBNXN_CPU_CLUSTER_I_SIZE;
-     case nbk4x4_PlainC:
-         return NBNXN_CPU_CLUSTER_I_SIZE;
-     case nbk4xN_X86_SIMD128:
-         /* Number of reals that fit in SIMD (128 bits = 16 bytes) */
-         return 16/sizeof(real);
-     case nbk4xN_X86_SIMD256:
-         /* Number of reals that fit in SIMD (256 bits = 32 bytes) */
-         return 32/sizeof(real);
-     case nbk8x8x8_CUDA:
-     case nbk8x8x8_PlainC:
-         return nbnxn_kernel_to_ci_size(nb_kernel_type);
++    case nbnxnk8x8x8_CUDA:
++    case nbnxnk8x8x8_PlainC:
 +        /* The cluster size for super/sub lists is only set here.
 +         * Any value should work for the pair-search and atomdata code.
 +         * The kernels, of course, might require a particular value.
 +         */
 +        return NBNXN_GPU_CLUSTER_SIZE;
 +    default:
 +        gmx_incons("unknown kernel type");
 +    }
 +
 +    return 0;
 +}
 +
 +int nbnxn_kernel_to_cj_size(int nb_kernel_type)
 +{
++    int nbnxn_simd_width=0;
++    int cj_size=0;
++
++#ifdef GMX_NBNXN_SIMD
++    nbnxn_simd_width = GMX_NBNXN_SIMD_BITWIDTH/(sizeof(real)*8);
++#endif
++
 +    switch (nb_kernel_type)
 +    {
-     return 0;
++    case nbnxnk4x4_PlainC:
++        cj_size = NBNXN_CPU_CLUSTER_I_SIZE;
++        break;
++    case nbnxnk4xN_SIMD_4xN:
++        cj_size = nbnxn_simd_width;
++        break;
++    case nbnxnk4xN_SIMD_2xNN:
++        cj_size = nbnxn_simd_width/2;
++        break;
++    case nbnxnk8x8x8_CUDA:
++    case nbnxnk8x8x8_PlainC:
++        cj_size = nbnxn_kernel_to_ci_size(nb_kernel_type);
++        break;
 +    default:
 +        gmx_incons("unknown kernel type");
 +    }
 +
-     if (nb_kernel_type == nbkNotSet)
++    return cj_size;
 +}
 +
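A few worked values for the width arithmetic above, assuming 4-byte floats in single and 8-byte doubles in double precision (the usual case):

    /* GMX_NBNXN_SIMD_BITWIDTH = 256, single: 256/(4*8) = 8 reals per register
     *     nbnxnk4xN_SIMD_4xN  -> cj_size = 8
     *     nbnxnk4xN_SIMD_2xNN -> cj_size = 4
     * GMX_NBNXN_SIMD_BITWIDTH = 256, double: 256/(8*8) = 4 reals per register
     *     nbnxnk4xN_SIMD_4xN  -> cj_size = 4
     * GMX_NBNXN_SIMD_BITWIDTH = 128, single: 128/(4*8) = 4 reals per register
     *     nbnxnk4xN_SIMD_4xN  -> cj_size = 4
     */
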
 +static int ci_to_cj(int na_cj_2log,int ci)
 +{
 +    switch (na_cj_2log)
 +    {
 +    case 2: return  ci;     break;
 +    case 1: return (ci<<1); break;
 +    case 3: return (ci>>1); break;
 +    }
 +
 +    return 0;
 +}
 +
 +gmx_bool nbnxn_kernel_pairlist_simple(int nb_kernel_type)
 +{
-     case nbk8x8x8_CUDA:
-     case nbk8x8x8_PlainC:
++    if (nb_kernel_type == nbnxnkNotSet)
 +    {
 +        gmx_fatal(FARGS, "Non-bonded kernel type not set for Verlet-style pair-list.");
 +    }
 +
 +    switch (nb_kernel_type)
 +    {
-     case nbk4x4_PlainC:
-     case nbk4xN_X86_SIMD128:
-     case nbk4xN_X86_SIMD256:
++    case nbnxnk8x8x8_CUDA:
++    case nbnxnk8x8x8_PlainC:
 +        return FALSE;
 +
-     snew_aligned(nbl->work->bb_ci,GPU_NSUBCELL/STRIDE_8BB*NNBSBB_XXXX,16);
++    case nbnxnk4x4_PlainC:
++    case nbnxnk4xN_SIMD_4xN:
++    case nbnxnk4xN_SIMD_2xNN:
 +        return TRUE;
 +
 +    default:
 +        gmx_incons("Invalid nonbonded kernel type passed!");
 +        return FALSE;
 +    }
 +}
 +
 +void nbnxn_init_search(nbnxn_search_t * nbs_ptr,
 +                       ivec *n_dd_cells,
 +                       gmx_domdec_zones_t *zones,
 +                       int nthread_max)
 +{
 +    nbnxn_search_t nbs;
 +    int d,g,t;
 +
 +    snew(nbs,1);
 +    *nbs_ptr = nbs;
 +
 +    nbs->DomDec = (n_dd_cells != NULL);
 +
 +    clear_ivec(nbs->dd_dim);
 +    nbs->ngrid = 1;
 +    if (nbs->DomDec)
 +    {
 +        nbs->zones = zones;
 +
 +        for(d=0; d<DIM; d++)
 +        {
 +            if ((*n_dd_cells)[d] > 1)
 +            {
 +                nbs->dd_dim[d] = 1;
 +                /* Each grid matches a DD zone */
 +                nbs->ngrid *= 2;
 +            }
 +        }
 +    }
 +
 +    snew(nbs->grid,nbs->ngrid);
 +    for(g=0; g<nbs->ngrid; g++)
 +    {
 +        nbnxn_grid_init(&nbs->grid[g]);
 +    }
 +    nbs->cell        = NULL;
 +    nbs->cell_nalloc = 0;
 +    nbs->a           = NULL;
 +    nbs->a_nalloc    = 0;
 +
 +    nbs->nthread_max = nthread_max;
 +
 +    /* Initialize the work data structures for each thread */
 +    snew(nbs->work,nbs->nthread_max);
 +    for(t=0; t<nbs->nthread_max; t++)
 +    {
 +        nbs->work[t].cxy_na           = NULL;
 +        nbs->work[t].cxy_na_nalloc    = 0;
 +        nbs->work[t].sort_work        = NULL;
 +        nbs->work[t].sort_work_nalloc = 0;
 +    }
 +
 +    /* Initialize detailed nbsearch cycle counting */
 +    nbs->print_cycles = (getenv("GMX_NBNXN_CYCLE") != 0);
 +    nbs->search_count = 0;
 +    nbs_cycle_clear(nbs->cc);
 +    for(t=0; t<nbs->nthread_max; t++)
 +    {
 +        nbs_cycle_clear(nbs->work[t].cc);
 +    }
 +}
 +
 +static real grid_atom_density(int n,rvec corner0,rvec corner1)
 +{
 +    rvec size;
 +
 +    rvec_sub(corner1,corner0,size);
 +
 +    return n/(size[XX]*size[YY]*size[ZZ]);
 +}
 +
 +static int set_grid_size_xy(const nbnxn_search_t nbs,
 +                            nbnxn_grid_t *grid,
 +                            int n,rvec corner0,rvec corner1,
 +                            real atom_density,
 +                            int XFormat)
 +{
 +    rvec size;
 +    int  na_c;
 +    real adens,tlen,tlen_x,tlen_y,nc_max;
 +    int  t;
 +
 +    rvec_sub(corner1,corner0,size);
 +
 +    if (n > grid->na_sc)
 +    {
 +        /* target cell length */
 +        if (grid->bSimple)
 +        {
 +            /* To minimize the zero interactions, we should make
 +             * the largest of the i/j cell cubic.
 +             */
 +            na_c = max(grid->na_c,grid->na_cj);
 +
 +            /* Approximately cubic cells */
 +            tlen   = pow(na_c/atom_density,1.0/3.0);
 +            tlen_x = tlen;
 +            tlen_y = tlen;
 +        }
 +        else
 +        {
 +            /* Approximately cubic sub cells */
 +            tlen   = pow(grid->na_c/atom_density,1.0/3.0);
 +            tlen_x = tlen*GPU_NSUBCELL_X;
 +            tlen_y = tlen*GPU_NSUBCELL_Y;
 +        }
 +        /* We round ncx and ncy down, because we get fewer cell pairs
 +         * in the pair list when the fixed cell dimensions (x,y) are
 +         * larger than the variable one (z) than the other way around.
 +         */
 +        grid->ncx = max(1,(int)(size[XX]/tlen_x));
 +        grid->ncy = max(1,(int)(size[YY]/tlen_y));
 +    }
 +    else
 +    {
 +        grid->ncx = 1;
 +        grid->ncy = 1;
 +    }
 +
 +    /* We need one additional cell entry for particles moved by DD */
 +    if (grid->ncx*grid->ncy+1 > grid->cxy_nalloc)
 +    {
 +        grid->cxy_nalloc = over_alloc_large(grid->ncx*grid->ncy+1);
 +        srenew(grid->cxy_na,grid->cxy_nalloc);
 +        srenew(grid->cxy_ind,grid->cxy_nalloc+1);
 +    }
 +    for(t=0; t<nbs->nthread_max; t++)
 +    {
 +        if (grid->ncx*grid->ncy+1 > nbs->work[t].cxy_na_nalloc)
 +        {
 +            nbs->work[t].cxy_na_nalloc = over_alloc_large(grid->ncx*grid->ncy+1);
 +            srenew(nbs->work[t].cxy_na,nbs->work[t].cxy_na_nalloc);
 +        }
 +    }
 +
 +    /* Worst case scenario of 1 atom in each last cell */
 +    if (grid->na_cj <= grid->na_c)
 +    {
 +        nc_max = n/grid->na_sc + grid->ncx*grid->ncy;
 +    }
 +    else
 +    {
 +        nc_max = n/grid->na_sc + grid->ncx*grid->ncy*grid->na_cj/grid->na_c;
 +    }
 +
 +    if (nc_max > grid->nc_nalloc)
 +    {
 +        int bb_nalloc;
 +
 +        grid->nc_nalloc = over_alloc_large(nc_max);
 +        srenew(grid->nsubc,grid->nc_nalloc);
 +        srenew(grid->bbcz,grid->nc_nalloc*NNBSBB_D);
 +#ifdef NBNXN_8BB_SSE
 +        bb_nalloc = grid->nc_nalloc*GPU_NSUBCELL/STRIDE_8BB*NNBSBB_XXXX;
 +#else
 +        bb_nalloc = grid->nc_nalloc*GPU_NSUBCELL*NNBSBB_B;
 +#endif
 +        sfree_aligned(grid->bb);
 +        /* This snew also zeros the contents, which avoids possible
 +         * floating-point exceptions in SSE from the unused bb elements.
 +         */
 +        snew_aligned(grid->bb,bb_nalloc,16);
 +
 +        if (grid->bSimple)
 +        {
 +            if (grid->na_cj == grid->na_c)
 +            {
 +                grid->bbj = grid->bb;
 +            }
 +            else
 +            {
 +                sfree_aligned(grid->bbj);
 +                snew_aligned(grid->bbj,bb_nalloc*grid->na_c/grid->na_cj,16);
 +            }
 +        }
 +
 +        srenew(grid->flags,grid->nc_nalloc);
 +    }
 +
 +    copy_rvec(corner0,grid->c0);
 +    copy_rvec(corner1,grid->c1);
 +    grid->sx = size[XX]/grid->ncx;
 +    grid->sy = size[YY]/grid->ncy;
 +    grid->inv_sx = 1/grid->sx;
 +    grid->inv_sy = 1/grid->sy;
 +
 +    return nc_max;
 +}
 +
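For intuition on the cell-size target above, a worked number (the density is an illustrative figure, not taken from the source):

    /* Worked example: for a simple grid with na_c = 4 atoms per cell and an
     * atom density of ~100 nm^-3 (roughly liquid water), the target length
     * is tlen = (4/100)^(1/3) ~= 0.34 nm per cell edge.
     */
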
 +#define SORT_GRID_OVERSIZE 2
 +#define SGSF (SORT_GRID_OVERSIZE + 1)
 +
 +static void sort_atoms(int dim,gmx_bool Backwards,
 +                       int *a,int n,rvec *x,
 +                       real h0,real invh,int nsort,int *sort)
 +{
 +    int i,c;
 +    int zi,zim;
 +    int cp,tmp;
 +
 +    if (n <= 1)
 +    {
 +        /* Nothing to do */
 +        return;
 +    }
 +
 +    /* For small oversize factors clearing the whole area is fastest.
 +     * For large oversize we should clear the used elements after use.
 +     */
 +    for(i=0; i<nsort; i++)
 +    {
 +        sort[i] = -1;
 +    }
 +    /* Sort the particles using a simple index sort */
 +    for(i=0; i<n; i++)
 +    {
 +        /* The cast takes care of floating-point rounding effects below zero.
 +         * This code assumes particles are less than 1/SORT_GRID_OVERSIZE
 +         * times the box height outside the box.
 +         */
 +        zi = (int)((x[a[i]][dim] - h0)*invh);
 +
 +#ifdef DEBUG_NBNXN_GRIDDING
 +        if (zi < 0 || zi >= nsort)
 +        {
 +            gmx_fatal(FARGS,"(int)((x[%d][%c]=%f - %f)*%f) = %d, not in 0 - %d\n",
 +                      a[i],'x'+dim,x[a[i]][dim],h0,invh,zi,nsort);
 +        }
 +#endif
 +
 +        /* Ideally this particle should go in sort cell zi,
 +         * but that might already be in use,
 +         * in that case find the first empty cell higher up
 +         */
 +        if (sort[zi] < 0)
 +        {
 +            sort[zi] = a[i];
 +        }
 +        else
 +        {
 +            /* We have multiple atoms in the same sorting slot.
 +             * Sort on real z for minimal bounding box size.
 +             * There is an extra check for identical z to give
 +             * a well-defined output order, independent of input order,
 +             * which ensures binary reproducibility after restarts.
 +             */
 +            while(sort[zi] >= 0 && ( x[a[i]][dim] >  x[sort[zi]][dim] ||
 +                                    (x[a[i]][dim] == x[sort[zi]][dim] &&
 +                                     a[i] > sort[zi])))
 +            {
 +                zi++;
 +            }
 +
 +            if (sort[zi] >= 0)
 +            {
 +                /* Shift all elements by one slot until we find an empty slot */
 +                cp = sort[zi];
 +                zim = zi + 1;
 +                while (sort[zim] >= 0)
 +                {
 +                    tmp = sort[zim];
 +                    sort[zim] = cp;
 +                    cp  = tmp;
 +                    zim++;
 +                }
 +                sort[zim] = cp;
 +            }
 +            sort[zi] = a[i];
 +        }
 +    }
 +
 +    c = 0;
 +    if (!Backwards)
 +    {
 +        for(zi=0; zi<nsort; zi++)
 +        {
 +            if (sort[zi] >= 0)
 +            {
 +                a[c++] = sort[zi];
 +            }
 +        }
 +    }
 +    else
 +    {
 +        for(zi=nsort-1; zi>=0; zi--)
 +        {
 +            if (sort[zi] >= 0)
 +            {
 +                a[c++] = sort[zi];
 +            }
 +        }
 +    }
 +    if (c < n)
 +    {
 +        gmx_incons("Lost particles while sorting");
 +    }
 +}
 +
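sort_atoms() above is an oversized index sort: each atom hashes to a slot by height, collisions probe upward, and an ordered insertion keeps each occupied run sorted, so reading the slots in order yields the result. A self-contained miniature of the same idea on plain floats; the values and the 3x slack factor are illustrative, and the original's tie-break on indices is omitted for brevity:

    #include <assert.h>

    static void index_sort_demo(void)
    {
        const float val[4] = { 0.9f, 0.1f, 0.5f, 0.11f };
        int         sort[12];                  /* ~3x oversized slot array */
        int         a[4], i, zi, c;

        for (i = 0; i < 12; i++)
        {
            sort[i] = -1;
        }
        for (i = 0; i < 4; i++)
        {
            zi = (int)(val[i]*12);             /* slot from magnitude, [0,1) */
            /* Probe upward while keeping occupied runs sorted */
            while (sort[zi] >= 0 && val[i] > val[sort[zi]])
            {
                zi++;
            }
            if (sort[zi] >= 0)
            {
                /* Shift the occupied run up one slot to make room */
                int cp = sort[zi], zim = zi + 1;
                while (sort[zim] >= 0)
                {
                    int tmp = sort[zim];
                    sort[zim] = cp;
                    cp = tmp;
                    zim++;
                }
                sort[zim] = cp;
            }
            sort[zi] = i;
        }
        /* Reading the slots in order gives the sorted sequence */
        c = 0;
        for (zi = 0; zi < 12; zi++)
        {
            if (sort[zi] >= 0)
            {
                a[c++] = sort[zi];
            }
        }
        assert(c == 4);
        for (i = 1; i < 4; i++)
        {
            assert(val[a[i-1]] <= val[a[i]]);
        }
    }
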
 +#ifdef GMX_DOUBLE
 +#define R2F_D(x) ((float)((x) >= 0 ? ((1-GMX_FLOAT_EPS)*(x)) : ((1+GMX_FLOAT_EPS)*(x))))
 +#define R2F_U(x) ((float)((x) >= 0 ? ((1+GMX_FLOAT_EPS)*(x)) : ((1-GMX_FLOAT_EPS)*(x))))
 +#else
 +#define R2F_D(x) (x)
 +#define R2F_U(x) (x)
 +#endif
 +
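In double precision the bounding boxes are stored as floats, and these macros make the conversion conservative: the lower bound is nudged down and the upper bound up by one epsilon before truncation, so the float box still encloses the double coordinates. A tiny sketch, with FLT_EPSILON standing in for GMX_FLOAT_EPS (an assumption; the real constant comes from maths.h):

    #include <assert.h>
    #include <float.h>

    static void r2f_demo(void)
    {
        double x  = 1.0/3.0;                      /* non-negative example    */
        float  lo = (float)((1 - FLT_EPSILON)*x); /* R2F_D-style lower bound */
        float  hi = (float)((1 + FLT_EPSILON)*x); /* R2F_U-style upper bound */

        assert((double)lo <= x && x <= (double)hi);
    }
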
 +/* Coordinate order x,y,z, bb order xyz0 */
 +static void calc_bounding_box(int na,int stride,const real *x,float *bb)
 +{
 +    int  i,j;
 +    real xl,xh,yl,yh,zl,zh;
 +
 +    i = 0;
 +    xl = x[i+XX];
 +    xh = x[i+XX];
 +    yl = x[i+YY];
 +    yh = x[i+YY];
 +    zl = x[i+ZZ];
 +    zh = x[i+ZZ];
 +    i += stride;
 +    for(j=1; j<na; j++)
 +    {
 +        xl = min(xl,x[i+XX]);
 +        xh = max(xh,x[i+XX]);
 +        yl = min(yl,x[i+YY]);
 +        yh = max(yh,x[i+YY]);
 +        zl = min(zl,x[i+ZZ]);
 +        zh = max(zh,x[i+ZZ]);
 +        i += stride;
 +    }
 +    /* Note: possible double to float conversion here */
 +    bb[BBL_X] = R2F_D(xl);
 +    bb[BBL_Y] = R2F_D(yl);
 +    bb[BBL_Z] = R2F_D(zl);
 +    bb[BBU_X] = R2F_U(xh);
 +    bb[BBU_Y] = R2F_U(yh);
 +    bb[BBU_Z] = R2F_U(zh);
 +}
 +
 +/* Packed coordinates, bb order xyz0 */
 +static void calc_bounding_box_x_x4(int na,const real *x,float *bb)
 +{
 +    int  j;
 +    real xl,xh,yl,yh,zl,zh;
 +
 +    xl = x[XX*PACK_X4];
 +    xh = x[XX*PACK_X4];
 +    yl = x[YY*PACK_X4];
 +    yh = x[YY*PACK_X4];
 +    zl = x[ZZ*PACK_X4];
 +    zh = x[ZZ*PACK_X4];
 +    for(j=1; j<na; j++)
 +    {
 +        xl = min(xl,x[j+XX*PACK_X4]);
 +        xh = max(xh,x[j+XX*PACK_X4]);
 +        yl = min(yl,x[j+YY*PACK_X4]);
 +        yh = max(yh,x[j+YY*PACK_X4]);
 +        zl = min(zl,x[j+ZZ*PACK_X4]);
 +        zh = max(zh,x[j+ZZ*PACK_X4]);
 +    }
 +    /* Note: possible double to float conversion here */
 +    bb[BBL_X] = R2F_D(xl);
 +    bb[BBL_Y] = R2F_D(yl);
 +    bb[BBL_Z] = R2F_D(zl);
 +    bb[BBU_X] = R2F_U(xh);
 +    bb[BBU_Y] = R2F_U(yh);
 +    bb[BBU_Z] = R2F_U(zh);
 +}
 +
 +/* Packed coordinates, bb order xyz0 */
 +static void calc_bounding_box_x_x8(int na,const real *x,float *bb)
 +{
 +    int  j;
 +    real xl,xh,yl,yh,zl,zh;
 +
 +    xl = x[XX*PACK_X8];
 +    xh = x[XX*PACK_X8];
 +    yl = x[YY*PACK_X8];
 +    yh = x[YY*PACK_X8];
 +    zl = x[ZZ*PACK_X8];
 +    zh = x[ZZ*PACK_X8];
 +    for(j=1; j<na; j++)
 +    {
 +        xl = min(xl,x[j+XX*PACK_X8]);
 +        xh = max(xh,x[j+XX*PACK_X8]);
 +        yl = min(yl,x[j+YY*PACK_X8]);
 +        yh = max(yh,x[j+YY*PACK_X8]);
 +        zl = min(zl,x[j+ZZ*PACK_X8]);
 +        zh = max(zh,x[j+ZZ*PACK_X8]);
 +    }
 +    /* Note: possible double to float conversion here */
 +    bb[BBL_X] = R2F_D(xl);
 +    bb[BBL_Y] = R2F_D(yl);
 +    bb[BBL_Z] = R2F_D(zl);
 +    bb[BBU_X] = R2F_U(xh);
 +    bb[BBU_Y] = R2F_U(yh);
 +    bb[BBU_Z] = R2F_U(zh);
 +}
 +
 +#ifdef NBNXN_SEARCH_SSE
 +
 +/* Packed coordinates, bb order xyz0 */
 +static void calc_bounding_box_x_x4_halves(int na,const real *x,
 +                                          float *bb,float *bbj)
 +{
 +    calc_bounding_box_x_x4(min(na,2),x,bbj);
 +
 +    if (na > 2)
 +    {
 +        calc_bounding_box_x_x4(min(na-2,2),x+(PACK_X4>>1),bbj+NNBSBB_B);
 +    }
 +    else
 +    {
 +        /* Set the "empty" bounding box to the same as the first one,
 +         * so we don't need to treat special cases in the rest of the code.
 +         */
 +        _mm_store_ps(bbj+NNBSBB_B         ,_mm_load_ps(bbj));
 +        _mm_store_ps(bbj+NNBSBB_B+NNBSBB_C,_mm_load_ps(bbj+NNBSBB_C));
 +    }
 +
 +    _mm_store_ps(bb         ,_mm_min_ps(_mm_load_ps(bbj),
 +                                        _mm_load_ps(bbj+NNBSBB_B)));
 +    _mm_store_ps(bb+NNBSBB_C,_mm_max_ps(_mm_load_ps(bbj+NNBSBB_C),
 +                                        _mm_load_ps(bbj+NNBSBB_B+NNBSBB_C)));
 +}
 +
 +/* Coordinate order xyz, bb order xxxxyyyyzzzz */
 +static void calc_bounding_box_xxxx(int na,int stride,const real *x,float *bb)
 +{
 +    int  i,j;
 +    real xl,xh,yl,yh,zl,zh;
 +
 +    i = 0;
 +    xl = x[i+XX];
 +    xh = x[i+XX];
 +    yl = x[i+YY];
 +    yh = x[i+YY];
 +    zl = x[i+ZZ];
 +    zh = x[i+ZZ];
 +    i += stride;
 +    for(j=1; j<na; j++)
 +    {
 +        xl = min(xl,x[i+XX]);
 +        xh = max(xh,x[i+XX]);
 +        yl = min(yl,x[i+YY]);
 +        yh = max(yh,x[i+YY]);
 +        zl = min(zl,x[i+ZZ]);
 +        zh = max(zh,x[i+ZZ]);
 +        i += stride;
 +    }
 +    /* Note: possible double to float conversion here */
 +    bb[0*STRIDE_8BB] = R2F_D(xl);
 +    bb[1*STRIDE_8BB] = R2F_D(yl);
 +    bb[2*STRIDE_8BB] = R2F_D(zl);
 +    bb[3*STRIDE_8BB] = R2F_U(xh);
 +    bb[4*STRIDE_8BB] = R2F_U(yh);
 +    bb[5*STRIDE_8BB] = R2F_U(zh);
 +}
 +
 +#endif /* NBNXN_SEARCH_SSE */
 +
 +#ifdef NBNXN_SEARCH_SSE_SINGLE
 +
 +/* Coordinate order xyz?, bb order xyz0 */
 +static void calc_bounding_box_sse(int na,const float *x,float *bb)
 +{
 +    __m128 bb_0_SSE,bb_1_SSE;
 +    __m128 x_SSE;
 +
 +    int  i;
 +
 +    bb_0_SSE = _mm_load_ps(x);
 +    bb_1_SSE = bb_0_SSE;
 +
 +    for(i=1; i<na; i++)
 +    {
 +        x_SSE    = _mm_load_ps(x+i*NNBSBB_C);
 +        bb_0_SSE = _mm_min_ps(bb_0_SSE,x_SSE);
 +        bb_1_SSE = _mm_max_ps(bb_1_SSE,x_SSE);
 +    }
 +
 +    _mm_store_ps(bb  ,bb_0_SSE);
 +    _mm_store_ps(bb+4,bb_1_SSE);
 +}
 +
 +/* Coordinate order xyz?, bb order xxxxyyyyzzzz */
 +static void calc_bounding_box_xxxx_sse(int na,const float *x,
 +                                       float *bb_work,
 +                                       real *bb)
 +{
 +    calc_bounding_box_sse(na,x,bb_work);
 +
 +    bb[0*STRIDE_8BB] = bb_work[BBL_X];
 +    bb[1*STRIDE_8BB] = bb_work[BBL_Y];
 +    bb[2*STRIDE_8BB] = bb_work[BBL_Z];
 +    bb[3*STRIDE_8BB] = bb_work[BBU_X];
 +    bb[4*STRIDE_8BB] = bb_work[BBU_Y];
 +    bb[5*STRIDE_8BB] = bb_work[BBU_Z];
 +}
 +
 +#endif /* NBNXN_SEARCH_SSE_SINGLE */
 +
 +#ifdef NBNXN_SEARCH_SSE
 +
 +/* Combines pairs of consecutive bounding boxes */
 +static void combine_bounding_box_pairs(nbnxn_grid_t *grid,const float *bb)
 +{
 +    int    i,j,sc2,nc2,c2;
 +    __m128 min_SSE,max_SSE;
 +
 +    for(i=0; i<grid->ncx*grid->ncy; i++)
 +    {
 +        /* Starting bb in a column is expected to be 2-aligned */
 +        sc2 = grid->cxy_ind[i]>>1;
 +        /* For odd numbers skip the last bb here */
 +        nc2 = (grid->cxy_na[i]+3)>>(2+1);
 +        for(c2=sc2; c2<sc2+nc2; c2++)
 +        {
 +            min_SSE = _mm_min_ps(_mm_load_ps(bb+(c2*4+0)*NNBSBB_C),
 +                                 _mm_load_ps(bb+(c2*4+2)*NNBSBB_C));
 +            max_SSE = _mm_max_ps(_mm_load_ps(bb+(c2*4+1)*NNBSBB_C),
 +                                 _mm_load_ps(bb+(c2*4+3)*NNBSBB_C));
 +            _mm_store_ps(grid->bbj+(c2*2+0)*NNBSBB_C,min_SSE);
 +            _mm_store_ps(grid->bbj+(c2*2+1)*NNBSBB_C,max_SSE);
 +        }
 +        if (((grid->cxy_na[i]+3)>>2) & 1)
 +        {
 +            /* Copy the last bb for odd bb count in this column */
 +            for(j=0; j<NNBSBB_C; j++)
 +            {
 +                grid->bbj[(c2*2+0)*NNBSBB_C+j] = bb[(c2*4+0)*NNBSBB_C+j];
 +                grid->bbj[(c2*2+1)*NNBSBB_C+j] = bb[(c2*4+1)*NNBSBB_C+j];
 +            }
 +        }
 +    }
 +}
 +
 +#endif
 +
 +
 +/* Prints the average bb size, used for debug output */
 +static void print_bbsizes_simple(FILE *fp,
 +                                 const nbnxn_search_t nbs,
 +                                 const nbnxn_grid_t *grid)
 +{
 +    int  c,d;
 +    dvec ba;
 +
 +    clear_dvec(ba);
 +    for(c=0; c<grid->nc; c++)
 +    {
 +        for(d=0; d<DIM; d++)
 +        {
 +            ba[d] += grid->bb[c*NNBSBB_B+NNBSBB_C+d] - grid->bb[c*NNBSBB_B+d];
 +        }
 +    }
 +    dsvmul(1.0/grid->nc,ba,ba);
 +
 +    fprintf(fp,"ns bb: %4.2f %4.2f %4.2f  %4.2f %4.2f %4.2f rel %4.2f %4.2f %4.2f\n",
 +            nbs->box[XX][XX]/grid->ncx,
 +            nbs->box[YY][YY]/grid->ncy,
 +            nbs->box[ZZ][ZZ]*grid->ncx*grid->ncy/grid->nc,
 +            ba[XX],ba[YY],ba[ZZ],
 +            ba[XX]*grid->ncx/nbs->box[XX][XX],
 +            ba[YY]*grid->ncy/nbs->box[YY][YY],
 +            ba[ZZ]*grid->nc/(grid->ncx*grid->ncy*nbs->box[ZZ][ZZ]));
 +}
 +
 +/* Prints the average bb size, used for debug output */
 +static void print_bbsizes_supersub(FILE *fp,
 +                                   const nbnxn_search_t nbs,
 +                                   const nbnxn_grid_t *grid)
 +{
 +    int  ns,c,s;
 +    dvec ba;
 +
 +    clear_dvec(ba);
 +    ns = 0;
 +    for(c=0; c<grid->nc; c++)
 +    {
 +#ifdef NBNXN_BBXXXX
 +        for(s=0; s<grid->nsubc[c]; s+=STRIDE_8BB)
 +        {
 +            int cs_w,i,d;
 +
 +            cs_w = (c*GPU_NSUBCELL + s)/STRIDE_8BB;
 +            for(i=0; i<STRIDE_8BB; i++)
 +            {
 +                for(d=0; d<DIM; d++)
 +                {
 +                    ba[d] +=
 +                        grid->bb[cs_w*NNBSBB_XXXX+(DIM+d)*STRIDE_8BB+i] -
 +                        grid->bb[cs_w*NNBSBB_XXXX+     d *STRIDE_8BB+i];
 +                }
 +            }
 +        }
 +#else
 +        for(s=0; s<grid->nsubc[c]; s++)
 +        {
 +            int cs,d;
 +
 +            cs = c*GPU_NSUBCELL + s;
 +            for(d=0; d<DIM; d++)
 +            {
 +                ba[d] +=
 +                    grid->bb[cs*NNBSBB_B+NNBSBB_C+d] -
 +                    grid->bb[cs*NNBSBB_B         +d];
 +            }
 +        }
 +#endif
 +        ns += grid->nsubc[c];
 +    }
 +    dsvmul(1.0/ns,ba,ba);
 +
 +    fprintf(fp,"ns bb: %4.2f %4.2f %4.2f  %4.2f %4.2f %4.2f rel %4.2f %4.2f %4.2f\n",
 +            nbs->box[XX][XX]/(grid->ncx*GPU_NSUBCELL_X),
 +            nbs->box[YY][YY]/(grid->ncy*GPU_NSUBCELL_Y),
 +            nbs->box[ZZ][ZZ]*grid->ncx*grid->ncy/(grid->nc*GPU_NSUBCELL_Z),
 +            ba[XX],ba[YY],ba[ZZ],
 +            ba[XX]*grid->ncx*GPU_NSUBCELL_X/nbs->box[XX][XX],
 +            ba[YY]*grid->ncy*GPU_NSUBCELL_Y/nbs->box[YY][YY],
 +            ba[ZZ]*grid->nc*GPU_NSUBCELL_Z/(grid->ncx*grid->ncy*nbs->box[ZZ][ZZ]));
 +}
 +
 +/* Potentially sorts atoms on LJ coefficients !=0 and ==0.
 + * Also sets interaction flags.
 + */
 +void sort_on_lj(nbnxn_atomdata_t *nbat,int na_c,
 +                int a0,int a1,const int *atinfo,
 +                int *order,
 +                int *flags)
 +{
 +    int subc,s,a,n1,n2,a_lj_max,i,j;
 +    int sort1[NBNXN_NA_SC_MAX/GPU_NSUBCELL];
 +    int sort2[NBNXN_NA_SC_MAX/GPU_NSUBCELL];
 +    gmx_bool haveQ;
 +
 +    *flags = 0;
 +
 +    subc = 0;
 +    for(s=a0; s<a1; s+=na_c)
 +    {
 +        /* Make lists for this (sub-)cell of atoms with and without LJ */
 +        n1 = 0;
 +        n2 = 0;
 +        haveQ = FALSE;
 +        a_lj_max = -1;
 +        for(a=s; a<min(s+na_c,a1); a++)
 +        {
 +            haveQ = haveQ || GET_CGINFO_HAS_Q(atinfo[order[a]]);
 +
 +            if (GET_CGINFO_HAS_VDW(atinfo[order[a]]))
 +            {
 +                sort1[n1++] = order[a];
 +                a_lj_max = a;
 +            }
 +            else
 +            {
 +                sort2[n2++] = order[a];
 +            }
 +        }
 +
 +        /* If we don't have atoms with LJ, there's nothing to sort */
 +        if (n1 > 0)
 +        {
 +            *flags |= NBNXN_CI_DO_LJ(subc);
 +
 +            if (2*n1 <= na_c)
 +            {
 +                /* Only sort when strictly necessary.
 +                 * Ordering particles can lead to less accurate summation
 +                 * due to rounding, both for LJ and Coulomb interactions.
 +                 */
 +                if (2*(a_lj_max - s) >= na_c)
 +                {
 +                    for(i=0; i<n1; i++)
 +                    {
 +                        order[a0+i] = sort1[i];
 +                    }
 +                    for(j=0; j<n2; j++)
 +                    {
 +                        order[a0+n1+j] = sort2[j];
 +                    }
 +                }
 +
 +                *flags |= NBNXN_CI_HALF_LJ(subc);
 +            }
 +        }
 +        if (haveQ)
 +        {
 +            *flags |= NBNXN_CI_DO_COUL(subc);
 +        }
 +        subc++;
 +    }
 +}
 +
 +/* Fill a pair search cell with atoms.
 + * Potentially sorts atoms and sets the interaction flags.
 + */
 +void fill_cell(const nbnxn_search_t nbs,
 +               nbnxn_grid_t *grid,
 +               nbnxn_atomdata_t *nbat,
 +               int a0,int a1,
 +               const int *atinfo,
 +               rvec *x,
 +               int sx,int sy, int sz,
 +               float *bb_work)
 +{
 +    int    na,a;
 +    size_t offset;
 +    float  *bb_ptr;
 +
 +    na = a1 - a0;
 +
 +    if (grid->bSimple)
 +    {
 +        sort_on_lj(nbat,grid->na_c,a0,a1,atinfo,nbs->a,
 +                   grid->flags+(a0>>grid->na_c_2log)-grid->cell0);
 +    }
 +
 +    /* Now we have sorted the atoms, set the cell indices */
 +    for(a=a0; a<a1; a++)
 +    {
 +        nbs->cell[nbs->a[a]] = a;
 +    }
 +
 +    copy_rvec_to_nbat_real(nbs->a+a0,a1-a0,grid->na_c,x,
 +                           nbat->XFormat,nbat->x,a0,
 +                           sx,sy,sz);
 +
 +    if (nbat->XFormat == nbatX4)
 +    {
 +        /* Store the bounding boxes as xyz.xyz. */
 +        offset = ((a0 - grid->cell0*grid->na_sc)>>grid->na_c_2log)*NNBSBB_B;
 +        bb_ptr = grid->bb + offset;
 +
 +#if defined GMX_DOUBLE && defined NBNXN_SEARCH_SSE
 +        if (2*grid->na_cj == grid->na_c)
 +        {
 +            calc_bounding_box_x_x4_halves(na,nbat->x+X4_IND_A(a0),bb_ptr,
 +                                          grid->bbj+offset*2);
 +        }
 +        else
 +#endif
 +        {
 +            calc_bounding_box_x_x4(na,nbat->x+X4_IND_A(a0),bb_ptr);
 +        }
 +    }
 +    else if (nbat->XFormat == nbatX8)
 +    {
 +        /* Store the bounding boxes as xyz.xyz. */
 +        offset = ((a0 - grid->cell0*grid->na_sc)>>grid->na_c_2log)*NNBSBB_B;
 +        bb_ptr = grid->bb + offset;
 +
 +        calc_bounding_box_x_x8(na,nbat->x+X8_IND_A(a0),bb_ptr);
 +    }
 +#ifdef NBNXN_BBXXXX
 +    else if (!grid->bSimple)
 +    {
 +        /* Store the bounding boxes in a format convenient
 +         * for SSE calculations: xxxxyyyyzzzz...
 +         */
 +        bb_ptr =
 +            grid->bb +
 +            ((a0-grid->cell0*grid->na_sc)>>(grid->na_c_2log+STRIDE_8BB_2LOG))*NNBSBB_XXXX +
 +            (((a0-grid->cell0*grid->na_sc)>>grid->na_c_2log) & (STRIDE_8BB-1));
 +
 +#ifdef NBNXN_SEARCH_SSE_SINGLE
 +        if (nbat->XFormat == nbatXYZQ)
 +        {
 +            calc_bounding_box_xxxx_sse(na,nbat->x+a0*nbat->xstride,
 +                                       bb_work,bb_ptr);
 +        }
 +        else
 +#endif
 +        {
 +            calc_bounding_box_xxxx(na,nbat->xstride,nbat->x+a0*nbat->xstride,
 +                                   bb_ptr);
 +        }
 +        if (gmx_debug_at)
 +        {
 +            fprintf(debug,"%2d %2d %2d bb %5.2f %5.2f %5.2f %5.2f %5.2f %5.2f\n",
 +                    sx,sy,sz,
 +                    bb_ptr[0*STRIDE_8BB],bb_ptr[3*STRIDE_8BB],
 +                    bb_ptr[1*STRIDE_8BB],bb_ptr[4*STRIDE_8BB],
 +                    bb_ptr[2*STRIDE_8BB],bb_ptr[5*STRIDE_8BB]);
 +        }
 +    }
 +#endif
 +    else
 +    {
 +        /* Store the bounding boxes as xyz.xyz. */
 +        bb_ptr = grid->bb+((a0-grid->cell0*grid->na_sc)>>grid->na_c_2log)*NNBSBB_B;
 +
 +        calc_bounding_box(na,nbat->xstride,nbat->x+a0*nbat->xstride,
 +                          bb_ptr);
 +
 +        if (gmx_debug_at)
 +        {
 +            int bbo;
 +            bbo = (a0 - grid->cell0*grid->na_sc)/grid->na_c;
 +            fprintf(debug,"%2d %2d %2d bb %5.2f %5.2f %5.2f %5.2f %5.2f %5.2f\n",
 +                    sx,sy,sz,
 +                    (grid->bb+bbo*NNBSBB_B)[BBL_X],
 +                    (grid->bb+bbo*NNBSBB_B)[BBU_X],
 +                    (grid->bb+bbo*NNBSBB_B)[BBL_Y],
 +                    (grid->bb+bbo*NNBSBB_B)[BBU_Y],
 +                    (grid->bb+bbo*NNBSBB_B)[BBL_Z],
 +                    (grid->bb+bbo*NNBSBB_B)[BBU_Z]);
 +        }
 +    }
 +}
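
All branches above store the same geometric content in different layouts. For reference, the plain xyz.xyz layout (lower corner, then upper corner) can be computed with the following sketch; calc_bb() is illustrative, not the GROMACS calc_bounding_box(), and it ignores the padding of the real NNBSBB_B layout:

    /* Compute a bounding box over na points with a given stride,
     * stored as [lx,ly,lz, hx,hy,hz].
     */
    static void calc_bb(int na, int stride, const float *x, float *bb)
    {
        int i, d;

        for (d = 0; d < 3; d++)
        {
            bb[d]     = x[d];   /* lower corner */
            bb[3 + d] = x[d];   /* upper corner */
        }
        for (i = 1; i < na; i++)
        {
            for (d = 0; d < 3; d++)
            {
                float c = x[i*stride + d];
                if (c < bb[d])     { bb[d]     = c; }
                if (c > bb[3 + d]) { bb[3 + d] = c; }
            }
        }
    }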
 +
 +/* Spatially sort the atoms within the given range of grid columns */
 +static void sort_columns_simple(const nbnxn_search_t nbs,
 +                                int dd_zone,
 +                                nbnxn_grid_t *grid,
 +                                int a0,int a1,
 +                                const int *atinfo,
 +                                rvec *x,
 +                                nbnxn_atomdata_t *nbat,
 +                                int cxy_start,int cxy_end,
 +                                int *sort_work)
 +{
 +    int  cxy;
 +    int  cx,cy,cz,ncz,cfilled,c;
 +    int  na,ash,ind,a;
 +    int  na_c,ash_c;
 +
 +    if (debug)
 +    {
 +        fprintf(debug,"cell0 %d sorting columns %d - %d, atoms %d - %d\n",
 +                grid->cell0,cxy_start,cxy_end,a0,a1);
 +    }
 +
 +    /* Sort the atoms within each x,y column in 3 dimensions */
 +    for(cxy=cxy_start; cxy<cxy_end; cxy++)
 +    {
 +        cx = cxy/grid->ncy;
 +        cy = cxy - cx*grid->ncy;
 +
 +        na  = grid->cxy_na[cxy];
 +        ncz = grid->cxy_ind[cxy+1] - grid->cxy_ind[cxy];
 +        ash = (grid->cell0 + grid->cxy_ind[cxy])*grid->na_sc;
 +
 +        /* Sort the atoms within each x,y column on z coordinate */
 +        sort_atoms(ZZ,FALSE,
 +                   nbs->a+ash,na,x,
 +                   grid->c0[ZZ],
 +                   ncz*grid->na_sc*SORT_GRID_OVERSIZE/nbs->box[ZZ][ZZ],
 +                   ncz*grid->na_sc*SGSF,sort_work);
 +
 +        /* Fill the ncz cells in this column */
 +        cfilled = grid->cxy_ind[cxy];
 +        for(cz=0; cz<ncz; cz++)
 +        {
 +            c  = grid->cxy_ind[cxy] + cz;
 +
 +            ash_c = ash + cz*grid->na_sc;
 +            na_c  = min(grid->na_sc,na-(ash_c-ash));
 +
 +            fill_cell(nbs,grid,nbat,
 +                      ash_c,ash_c+na_c,atinfo,x,
 +                      grid->na_sc*cx + (dd_zone >> 2),
 +                      grid->na_sc*cy + (dd_zone & 3),
 +                      grid->na_sc*cz,
 +                      NULL);
 +
 +            /* This copy to bbcz is not really necessary.
 +             * But it allows us to use the same grid search code
 +             * for the simple and supersub cell setups.
 +             */
 +            if (na_c > 0)
 +            {
 +                cfilled = c;
 +            }
 +            grid->bbcz[c*NNBSBB_D  ] = grid->bb[cfilled*NNBSBB_B+2];
 +            grid->bbcz[c*NNBSBB_D+1] = grid->bb[cfilled*NNBSBB_B+6];
 +        }
 +
 +        /* Set the unused atom indices to -1 */
 +        for(ind=na; ind<ncz*grid->na_sc; ind++)
 +        {
 +            nbs->a[ash+ind] = -1;
 +        }
 +    }
 +}
 +
 +/* Spatially sort the atoms within the given range of grid columns */
 +static void sort_columns_supersub(const nbnxn_search_t nbs,
 +                                  int dd_zone,
 +                                  nbnxn_grid_t *grid,
 +                                  int a0,int a1,
 +                                  const int *atinfo,
 +                                  rvec *x,
 +                                  nbnxn_atomdata_t *nbat,
 +                                  int cxy_start,int cxy_end,
 +                                  int *sort_work)
 +{
 +    int  cxy;
 +    int  cx,cy,cz=-1,c=-1,ncz;
 +    int  na,ash,na_c,ind,a;
 +    int  subdiv_z,sub_z,na_z,ash_z;
 +    int  subdiv_y,sub_y,na_y,ash_y;
 +    int  subdiv_x,sub_x,na_x,ash_x;
 +
 +    /* cppcheck-suppress unassignedVariable */
 +    float bb_work_array[NNBSBB_B+3],*bb_work_align;
 +
 +    bb_work_align = (float *)(((size_t)(bb_work_array+3)) & (~((size_t)15)));
 +
 +    if (debug)
 +    {
 +        fprintf(debug,"cell0 %d sorting columns %d - %d, atoms %d - %d\n",
 +                grid->cell0,cxy_start,cxy_end,a0,a1);
 +    }
 +
 +    subdiv_x = grid->na_c;
 +    subdiv_y = GPU_NSUBCELL_X*subdiv_x;
 +    subdiv_z = GPU_NSUBCELL_Y*subdiv_y;
 +
 +    /* Sort the atoms within each x,y column in 3 dimensions */
 +    for(cxy=cxy_start; cxy<cxy_end; cxy++)
 +    {
 +        cx = cxy/grid->ncy;
 +        cy = cxy - cx*grid->ncy;
 +
 +        na  = grid->cxy_na[cxy];
 +        ncz = grid->cxy_ind[cxy+1] - grid->cxy_ind[cxy];
 +        ash = (grid->cell0 + grid->cxy_ind[cxy])*grid->na_sc;
 +
 +        /* Sort the atoms within each x,y column on z coordinate */
 +        sort_atoms(ZZ,FALSE,
 +                   nbs->a+ash,na,x,
 +                   grid->c0[ZZ],
 +                   ncz*grid->na_sc*SORT_GRID_OVERSIZE/nbs->box[ZZ][ZZ],
 +                   ncz*grid->na_sc*SGSF,sort_work);
 +
 +        /* This loop goes over the supercells and subcells along z at once */
 +        for(sub_z=0; sub_z<ncz*GPU_NSUBCELL_Z; sub_z++)
 +        {
 +            ash_z = ash + sub_z*subdiv_z;
 +            na_z  = min(subdiv_z,na-(ash_z-ash));
 +
 +            /* We have already sorted on z */
 +
 +            if (sub_z % GPU_NSUBCELL_Z == 0)
 +            {
 +                cz = sub_z/GPU_NSUBCELL_Z;
 +                c  = grid->cxy_ind[cxy] + cz;
 +
 +                /* The number of atoms in this supercell */
 +                na_c = min(grid->na_sc,na-(ash_z-ash));
 +
 +                grid->nsubc[c] = min(GPU_NSUBCELL,(na_c+grid->na_c-1)/grid->na_c);
 +
 +                /* Store the z-boundaries of the super cell */
 +                grid->bbcz[c*NNBSBB_D  ] = x[nbs->a[ash_z]][ZZ];
 +                grid->bbcz[c*NNBSBB_D+1] = x[nbs->a[ash_z+na_c-1]][ZZ];
 +            }
 +
 +#if GPU_NSUBCELL_Y > 1
 +            /* Sort the atoms along y */
 +            sort_atoms(YY,(sub_z & 1),
 +                       nbs->a+ash_z,na_z,x,
 +                       grid->c0[YY]+cy*grid->sy,grid->inv_sy,
 +                       subdiv_y*SGSF,sort_work);
 +#endif
 +
 +            for(sub_y=0; sub_y<GPU_NSUBCELL_Y; sub_y++)
 +            {
 +                ash_y = ash_z + sub_y*subdiv_y;
 +                na_y  = min(subdiv_y,na-(ash_y-ash));
 +
 +#if GPU_NSUBCELL_X > 1
 +                /* Sort the atoms along x */
 +                sort_atoms(XX,((cz*GPU_NSUBCELL_Y + sub_y) & 1),
 +                           nbs->a+ash_y,na_y,x,
 +                           grid->c0[XX]+cx*grid->sx,grid->inv_sx,
 +                           subdiv_x*SGSF,sort_work);
 +#endif
 +
 +                for(sub_x=0; sub_x<GPU_NSUBCELL_X; sub_x++)
 +                {
 +                    ash_x = ash_y + sub_x*subdiv_x;
 +                    na_x  = min(subdiv_x,na-(ash_x-ash));
 +
 +                    fill_cell(nbs,grid,nbat,
 +                              ash_x,ash_x+na_x,atinfo,x,
 +                              grid->na_c*(cx*GPU_NSUBCELL_X+sub_x) + (dd_zone >> 2),
 +                              grid->na_c*(cy*GPU_NSUBCELL_Y+sub_y) + (dd_zone & 3),
 +                              grid->na_c*sub_z,
 +                              bb_work_align);
 +                }
 +            }
 +        }
 +
 +        /* Set the unused atom indices to -1 */
 +        for(ind=na; ind<ncz*grid->na_sc; ind++)
 +        {
 +            nbs->a[ash+ind] = -1;
 +        }
 +    }
 +}
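
The bb_work_array/bb_work_align pair above is the standard trick for aligning a stack buffer: over-allocate by alignment/sizeof(float) - 1 elements and mask the pointer down to the boundary. The same trick in isolation, assuming 4-byte floats; align16() is a hypothetical helper, not part of the source:

    #include <stdint.h>

    /* Returns the first 16-byte-aligned address at or after p.
     * Needs 3 floats of slack in the buffer, exactly what the
     * bb_work_array[NNBSBB_B+3] declaration above provides.
     */
    static float *align16(float *p)
    {
        return (float *)(((uintptr_t)(p + 3)) & ~(uintptr_t)15);
    }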
 +
 +/* Determine in which grid column atoms should go */
 +static void calc_column_indices(nbnxn_grid_t *grid,
 +                                int a0,int a1,
 +                                rvec *x,const int *move,
 +                                int thread,int nthread,
 +                                int *cell,
 +                                int *cxy_na)
 +{
 +    int  n0,n1,i;
 +    int  cx,cy;
 +
 +    /* We add one extra cell for particles which moved during DD */
 +    for(i=0; i<grid->ncx*grid->ncy+1; i++)
 +    {
 +        cxy_na[i] = 0;
 +    }
 +
 +    n0 = a0 + (int)((thread+0)*(a1 - a0))/nthread;
 +    n1 = a0 + (int)((thread+1)*(a1 - a0))/nthread;
 +    for(i=n0; i<n1; i++)
 +    {
 +        if (move == NULL || move[i] >= 0)
 +        {
 +            /* We need to be careful with rounding,
 +             * particles might be a few bits outside the local box.
 +             * The int cast takes care of the lower bound,
 +             * we need to explicitly take care of the upper bound.
 +             */
 +            cx = (int)((x[i][XX] - grid->c0[XX])*grid->inv_sx);
 +            if (cx == grid->ncx)
 +            {
 +                cx = grid->ncx - 1;
 +            }
 +            cy = (int)((x[i][YY] - grid->c0[YY])*grid->inv_sy);
 +            if (cy == grid->ncy)
 +            {
 +                cy = grid->ncy - 1;
 +            }
 +            /* For the moment, cell contains only the grid-local
 +             * x and y indices, not z.
 +             */
 +            cell[i] = cx*grid->ncy + cy;
 +
 +#ifdef DEBUG_NBNXN_GRIDDING
 +            if (cell[i] < 0 || cell[i] >= grid->ncx*grid->ncy)
 +            {
 +                gmx_fatal(FARGS,
 +                          "grid cell cx %d cy %d out of range (max %d %d)\n"
 +                          "atom %f %f %f, grid->c0 %f %f",
 +                          cx,cy,grid->ncx,grid->ncy,
 +                          x[i][XX],x[i][YY],x[i][ZZ],grid->c0[XX],grid->c0[YY]);
 +            }
 +#endif
 +        }
 +        else
 +        {
 +            /* Put this moved particle after the end of the grid,
 +             * so we can process it later without using conditionals.
 +             */
 +            cell[i] = grid->ncx*grid->ncy;
 +        }
 +
 +        cxy_na[cell[i]]++;
 +    }
 +}
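
The upper-bound clamp above matters because a coordinate exactly on the upper box edge makes the int cast yield ncx (or ncy), one past the last column. The mapping in isolation, as a sketch with illustrative names:

    /* Map an (x,y) coordinate to its grid column index, clamping
     * coordinates that land exactly on the upper box edge.
     */
    static int column_index(float x, float y,
                            float c0x, float c0y,
                            float inv_sx, float inv_sy,
                            int ncx, int ncy)
    {
        int cx = (int)((x - c0x)*inv_sx);
        int cy = (int)((y - c0y)*inv_sy);

        if (cx == ncx) { cx = ncx - 1; }
        if (cy == ncy) { cy = ncy - 1; }

        return cx*ncy + cy;
    }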
 +
 +/* Determine in which grid cells the atoms should go */
 +static void calc_cell_indices(const nbnxn_search_t nbs,
 +                              int dd_zone,
 +                              nbnxn_grid_t *grid,
 +                              int a0,int a1,
 +                              const int *atinfo,
 +                              rvec *x,
 +                              const int *move,
 +                              nbnxn_atomdata_t *nbat)
 +{
 +    int  n0,n1,i;
 +    int  cx,cy,cxy,ncz_max,ncz;
 +    int  nthread,thread;
 +    int  *cxy_na,cxy_na_i;
 +
 +    nthread = gmx_omp_nthreads_get(emntPairsearch);
 +
 +#pragma omp parallel for num_threads(nthread) schedule(static)
 +    for(thread=0; thread<nthread; thread++)
 +    {
 +        calc_column_indices(grid,a0,a1,x,move,thread,nthread,
 +                            nbs->cell,nbs->work[thread].cxy_na);
 +    }
 +
 +    /* Make the cell index as a function of x and y */
 +    ncz_max = 0;
 +    ncz = 0;
 +    grid->cxy_ind[0] = 0;
 +    for(i=0; i<grid->ncx*grid->ncy+1; i++)
 +    {
 +        /* We set ncz_max at the beginning of the loop instead of at
 +         * the end to skip i=grid->ncx*grid->ncy, which holds the moved
 +         * particles that do not need to be ordered on the grid.
 +         */
 +        if (ncz > ncz_max)
 +        {
 +            ncz_max = ncz;
 +        }
 +        cxy_na_i = nbs->work[0].cxy_na[i];
 +        for(thread=1; thread<nthread; thread++)
 +        {
 +            cxy_na_i += nbs->work[thread].cxy_na[i];
 +        }
 +        ncz = (cxy_na_i + grid->na_sc - 1)/grid->na_sc;
 +        if (nbat->XFormat == nbatX8)
 +        {
 +            /* Make the number of cells a multiple of 2 */
 +            ncz = (ncz + 1) & ~1;
 +        }
 +        grid->cxy_ind[i+1] = grid->cxy_ind[i] + ncz;
 +        /* Clear cxy_na, so we can reuse the array below */
 +        grid->cxy_na[i] = 0;
 +    }
 +    grid->nc = grid->cxy_ind[grid->ncx*grid->ncy] - grid->cxy_ind[0];
 +
 +    nbat->natoms = (grid->cell0 + grid->nc)*grid->na_sc;
 +
 +    if (debug)
 +    {
 +        fprintf(debug,"ns na_sc %d na_c %d super-cells: %d x %d y %d z %.1f maxz %d\n",
 +                grid->na_sc,grid->na_c,grid->nc,
 +                grid->ncx,grid->ncy,grid->nc/((double)(grid->ncx*grid->ncy)),
 +                ncz_max);
 +        if (gmx_debug_at)
 +        {
 +            i = 0;
 +            for(cy=0; cy<grid->ncy; cy++)
 +            {
 +                for(cx=0; cx<grid->ncx; cx++)
 +                {
 +                    fprintf(debug," %2d",grid->cxy_ind[i+1]-grid->cxy_ind[i]);
 +                    i++;
 +                }
 +                fprintf(debug,"\n");
 +            }
 +        }
 +    }
 +
 +    /* Make sure the work array for sorting is large enough */
 +    if (ncz_max*grid->na_sc*SGSF > nbs->work[0].sort_work_nalloc)
 +    {
 +        for(thread=0; thread<nbs->nthread_max; thread++)
 +        {
 +            nbs->work[thread].sort_work_nalloc =
 +                over_alloc_large(ncz_max*grid->na_sc*SGSF);
 +            srenew(nbs->work[thread].sort_work,
 +                   nbs->work[thread].sort_work_nalloc);
 +        }
 +    }
 +
 +    /* Now we know the dimensions we can fill the grid.
 +     * This is the first, unsorted fill. We sort the columns after this.
 +     */
 +    for(i=a0; i<a1; i++)
 +    {
 +        /* At this point nbs->cell contains the local grid x,y indices */
 +        cxy = nbs->cell[i];
 +        nbs->a[(grid->cell0 + grid->cxy_ind[cxy])*grid->na_sc + grid->cxy_na[cxy]++] = i;
 +    }
 +
 +    /* Set the cell indices for the moved particles */
 +    n0 = grid->nc*grid->na_sc;
 +    n1 = grid->nc*grid->na_sc+grid->cxy_na[grid->ncx*grid->ncy];
 +    for(i=n0; i<n1; i++)
 +    {
 +        nbs->cell[nbs->a[i]] = i;
 +    }
 +
 +    /* Sort the super-cell columns along z into the sub-cells. */
 +#pragma omp parallel for num_threads(nbs->nthread_max) schedule(static)
 +    for(thread=0; thread<nbs->nthread_max; thread++)
 +    {
 +        if (grid->bSimple)
 +        {
 +            sort_columns_simple(nbs,dd_zone,grid,a0,a1,atinfo,x,nbat,
 +                                ((thread+0)*grid->ncx*grid->ncy)/nthread,
 +                                ((thread+1)*grid->ncx*grid->ncy)/nthread,
 +                                nbs->work[thread].sort_work);
 +        }
 +        else
 +        {
 +            sort_columns_supersub(nbs,dd_zone,grid,a0,a1,atinfo,x,nbat,
 +                                  ((thread+0)*grid->ncx*grid->ncy)/nthread,
 +                                  ((thread+1)*grid->ncx*grid->ncy)/nthread,
 +                                  nbs->work[thread].sort_work);
 +        }
 +    }
 +
 +#ifdef NBNXN_SEARCH_SSE
 +    if (grid->bSimple && nbat->XFormat == nbatX8)
 +    {
 +        combine_bounding_box_pairs(grid,grid->bb);
 +    }
 +#endif
 +
 +    if (!grid->bSimple)
 +    {
 +        grid->nsubc_tot = 0;
 +        for(i=0; i<grid->nc; i++)
 +        {
 +            grid->nsubc_tot += grid->nsubc[i];
 +        }
 +    }
 +
 +    if (debug)
 +    {
 +        if (grid->bSimple)
 +        {
 +            print_bbsizes_simple(debug,nbs,grid);
 +        }
 +        else
 +        {
 +            fprintf(debug,"ns non-zero sub-cells: %d average atoms %.2f\n",
 +                    grid->nsubc_tot,(a1-a0)/(double)grid->nsubc_tot);
 +
 +            print_bbsizes_supersub(debug,nbs,grid);
 +        }
 +    }
 +}
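
The index construction above reduces to a running sum that turns per-column atom counts into cell offsets, so that cxy_ind[i+1] - cxy_ind[i] is the number of cells in column i. That step alone, as a sketch with illustrative names:

    /* Exclusive running sum over per-column atom counts, rounding
     * each column's atom count up to whole cells of na_per_cell atoms.
     * cxy_ind must have ncolumns+1 entries.
     */
    static void column_offsets(const int *cxy_na, int ncolumns,
                               int na_per_cell, int *cxy_ind)
    {
        int i;

        cxy_ind[0] = 0;
        for (i = 0; i < ncolumns; i++)
        {
            int ncz = (cxy_na[i] + na_per_cell - 1)/na_per_cell;

            cxy_ind[i + 1] = cxy_ind[i] + ncz;
        }
    }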
 +
 +static void init_buffer_flags(nbnxn_buffer_flags_t *flags,
 +                              int natoms)
 +{
 +    int b;
 +
 +    flags->nflag = (natoms + NBNXN_BUFFERFLAG_SIZE - 1)/NBNXN_BUFFERFLAG_SIZE;
 +    if (flags->nflag > flags->flag_nalloc)
 +    {
 +        flags->flag_nalloc = over_alloc_large(flags->nflag);
 +        srenew(flags->flag,flags->flag_nalloc);
 +    }
 +    for(b=0; b<flags->nflag; b++)
 +    {
 +        flags->flag[b] = 0;
 +    }
 +}
 +
 +/* Sets up a grid and puts the atoms on the grid.
 + * This function only operates on one domain of the domain decomposition.
 + * Note that without domain decomposition there is only one domain.
 + */
 +void nbnxn_put_on_grid(nbnxn_search_t nbs,
 +                       int ePBC,matrix box,
 +                       int dd_zone,
 +                       rvec corner0,rvec corner1,
 +                       int a0,int a1,
 +                       real atom_density,
 +                       const int *atinfo,
 +                       rvec *x,
 +                       int nmoved,int *move,
 +                       int nb_kernel_type,
 +                       nbnxn_atomdata_t *nbat)
 +{
 +    nbnxn_grid_t *grid;
 +    int n;
 +    int nc_max_grid,nc_max;
 +
 +    grid = &nbs->grid[dd_zone];
 +
 +    nbs_cycle_start(&nbs->cc[enbsCCgrid]);
 +
 +    grid->bSimple = nbnxn_kernel_pairlist_simple(nb_kernel_type);
 +
 +    grid->na_c      = nbnxn_kernel_to_ci_size(nb_kernel_type);
 +    grid->na_cj     = nbnxn_kernel_to_cj_size(nb_kernel_type);
 +    grid->na_sc     = (grid->bSimple ? 1 : GPU_NSUBCELL)*grid->na_c;
 +    grid->na_c_2log = get_2log(grid->na_c);
 +
 +    nbat->na_c = grid->na_c;
 +
 +    if (dd_zone == 0)
 +    {
 +        grid->cell0 = 0;
 +    }
 +    else
 +    {
 +        grid->cell0 =
 +            (nbs->grid[dd_zone-1].cell0 + nbs->grid[dd_zone-1].nc)*
 +            nbs->grid[dd_zone-1].na_sc/grid->na_sc;
 +    }
 +
 +    n = a1 - a0;
 +
 +    if (dd_zone == 0)
 +    {
 +        nbs->ePBC = ePBC;
 +        copy_mat(box,nbs->box);
 +
 +        if (atom_density >= 0)
 +        {
 +            grid->atom_density = atom_density;
 +        }
 +        else
 +        {
 +            grid->atom_density = grid_atom_density(n-nmoved,corner0,corner1);
 +        }
 +
 +        grid->cell0 = 0;
 +
 +        nbs->natoms_local    = a1 - nmoved;
 +        /* We assume that nbnxn_put_on_grid is called first
 +         * for the local atoms (dd_zone=0).
 +         */
 +        nbs->natoms_nonlocal = a1 - nmoved;
 +    }
 +    else
 +    {
 +        nbs->natoms_nonlocal = max(nbs->natoms_nonlocal,a1);
 +    }
 +
 +    nc_max_grid = set_grid_size_xy(nbs,grid,n-nmoved,corner0,corner1,
 +                                   nbs->grid[0].atom_density,
 +                                   nbat->XFormat);
 +
 +    nc_max = grid->cell0 + nc_max_grid;
 +
 +    if (a1 > nbs->cell_nalloc)
 +    {
 +        nbs->cell_nalloc = over_alloc_large(a1);
 +        srenew(nbs->cell,nbs->cell_nalloc);
 +    }
 +
 +    /* To avoid conditionals we store the moved particles at the end of a;
 +     * make sure we have enough space.
 +     */
 +    if (nc_max*grid->na_sc + nmoved > nbs->a_nalloc)
 +    {
 +        nbs->a_nalloc = over_alloc_large(nc_max*grid->na_sc + nmoved);
 +        srenew(nbs->a,nbs->a_nalloc);
 +    }
 +
 +    /* We need padding up to a multiple of the buffer flag size:
 +     * simply add one full flag block instead of rounding up.
 +     */
 +    if (nc_max*grid->na_sc + NBNXN_BUFFERFLAG_SIZE > nbat->nalloc)
 +    {
 +        nbnxn_atomdata_realloc(nbat,nc_max*grid->na_sc+NBNXN_BUFFERFLAG_SIZE);
 +    }
 +
 +    calc_cell_indices(nbs,dd_zone,grid,a0,a1,atinfo,x,move,nbat);
 +
 +    if (dd_zone == 0)
 +    {
 +        nbat->natoms_local = nbat->natoms;
 +    }
 +
 +    nbs_cycle_stop(&nbs->cc[enbsCCgrid]);
 +}
 +
 +/* Calls nbnxn_put_on_grid for all non-local domains */
 +void nbnxn_put_on_grid_nonlocal(nbnxn_search_t nbs,
 +                                const gmx_domdec_zones_t *zones,
 +                                const int *atinfo,
 +                                rvec *x,
 +                                int nb_kernel_type,
 +                                nbnxn_atomdata_t *nbat)
 +{
 +    int  zone,d;
 +    rvec c0,c1;
 +
 +    for(zone=1; zone<zones->n; zone++)
 +    {
 +        for(d=0; d<DIM; d++)
 +        {
 +            c0[d] = zones->size[zone].bb_x0[d];
 +            c1[d] = zones->size[zone].bb_x1[d];
 +        }
 +
 +        nbnxn_put_on_grid(nbs,nbs->ePBC,NULL,
 +                          zone,c0,c1,
 +                          zones->cg_range[zone],
 +                          zones->cg_range[zone+1],
 +                          -1,
 +                          atinfo,
 +                          x,
 +                          0,NULL,
 +                          nb_kernel_type,
 +                          nbat);
 +    }
 +}
 +
 +/* Add simple grid type information to the local super/sub grid */
 +void nbnxn_grid_add_simple(nbnxn_search_t nbs,
 +                           nbnxn_atomdata_t *nbat)
 +{
 +    nbnxn_grid_t *grid;
 +    float *bbcz,*bb;
 +    int ncd,sc;
 +
 +    grid = &nbs->grid[0];
 +
 +    if (grid->bSimple)
 +    {
 +        gmx_incons("nbnxn_grid_simple called with a simple grid");
 +    }
 +
 +    ncd = grid->na_sc/NBNXN_CPU_CLUSTER_I_SIZE;
 +
 +    if (grid->nc*ncd > grid->nc_nalloc_simple)
 +    {
 +        grid->nc_nalloc_simple = over_alloc_large(grid->nc*ncd);
 +        srenew(grid->bbcz_simple,grid->nc_nalloc_simple*NNBSBB_D);
 +        srenew(grid->bb_simple,grid->nc_nalloc_simple*NNBSBB_B);
 +        srenew(grid->flags_simple,grid->nc_nalloc_simple);
 +        if (nbat->XFormat)
 +        {
 +            sfree_aligned(grid->bbj);
 +            snew_aligned(grid->bbj,grid->nc_nalloc_simple/2,16);
 +        }
 +    }
 +
 +    bbcz = grid->bbcz_simple;
 +    bb   = grid->bb_simple;
 +
 +#pragma omp parallel for num_threads(gmx_omp_nthreads_get(emntPairsearch)) schedule(static)
 +    for(sc=0; sc<grid->nc; sc++)
 +    {
 +        int c,tx,na;
 +
 +        for(c=0; c<ncd; c++)
 +        {
 +            tx = sc*ncd + c;
 +
 +            na = NBNXN_CPU_CLUSTER_I_SIZE;
 +            while (na > 0 &&
 +                   nbat->type[tx*NBNXN_CPU_CLUSTER_I_SIZE+na-1] == nbat->ntype-1)
 +            {
 +                na--;
 +            }
 +
 +            if (na > 0)
 +            {
 +                switch (nbat->XFormat)
 +                {
 +                case nbatX4:
 +                    /* PACK_X4==NBNXN_CPU_CLUSTER_I_SIZE, so this is simple */
 +                    calc_bounding_box_x_x4(na,nbat->x+tx*STRIDE_P4,
 +                                           bb+tx*NNBSBB_B);
 +                    break;
 +                case nbatX8:
 +                    /* PACK_X8>NBNXN_CPU_CLUSTER_I_SIZE, more complicated */
 +                    calc_bounding_box_x_x8(na,nbat->x+X8_IND_A(tx*NBNXN_CPU_CLUSTER_I_SIZE),
 +                                           bb+tx*NNBSBB_B);
 +                    break;
 +                default:
 +                    calc_bounding_box(na,nbat->xstride,
 +                                      nbat->x+tx*NBNXN_CPU_CLUSTER_I_SIZE*nbat->xstride,
 +                                      bb+tx*NNBSBB_B);
 +                    break;
 +                }
 +                bbcz[tx*NNBSBB_D+0] = bb[tx*NNBSBB_B         +ZZ];
 +                bbcz[tx*NNBSBB_D+1] = bb[tx*NNBSBB_B+NNBSBB_C+ZZ];
 +
 +                /* No interaction optimization yet here */
 +                grid->flags_simple[tx] = NBNXN_CI_DO_LJ(0) | NBNXN_CI_DO_COUL(0);
 +            }
 +            else
 +            {
 +                grid->flags_simple[tx] = 0;
 +            }
 +        }
 +    }
 +
 +#ifdef NBNXN_SEARCH_SSE
 +    if (grid->bSimple && nbat->XFormat == nbatX8)
 +    {
 +        combine_bounding_box_pairs(grid,grid->bb_simple);
 +    }
 +#endif
 +}
 +
 +void nbnxn_get_ncells(nbnxn_search_t nbs,int *ncx,int *ncy)
 +{
 +    *ncx = nbs->grid[0].ncx;
 +    *ncy = nbs->grid[0].ncy;
 +}
 +
 +void nbnxn_get_atomorder(nbnxn_search_t nbs,int **a,int *n)
 +{
 +    const nbnxn_grid_t *grid;
 +
 +    grid = &nbs->grid[0];
 +
 +    /* Return the atom order for the home cell (index 0) */
 +    *a  = nbs->a;
 +
 +    *n = grid->cxy_ind[grid->ncx*grid->ncy]*grid->na_sc;
 +}
 +
 +void nbnxn_set_atomorder(nbnxn_search_t nbs)
 +{
 +    nbnxn_grid_t *grid;
 +    int ao,cx,cy,cxy,cz,j;
 +
 +    /* Set the atom order for the home cell (index 0) */
 +    grid = &nbs->grid[0];
 +
 +    ao = 0;
 +    for(cx=0; cx<grid->ncx; cx++)
 +    {
 +        for(cy=0; cy<grid->ncy; cy++)
 +        {
 +            cxy = cx*grid->ncy + cy;
 +            j   = grid->cxy_ind[cxy]*grid->na_sc;
 +            for(cz=0; cz<grid->cxy_na[cxy]; cz++)
 +            {
 +                nbs->a[j]     = ao;
 +                nbs->cell[ao] = j;
 +                ao++;
 +                j++;
 +            }
 +        }
 +    }
 +}
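
After gridding or nbnxn_set_atomorder(), cell and a are inverse maps for all real atoms: a[cell[i]] == i. A cheap consistency check one could run in debug builds; order_is_consistent() is hypothetical and not part of the source:

    /* Verify the cell/a inverse-map invariant for natoms real atoms.
     * Returns 1 when consistent, 0 otherwise.
     */
    static int order_is_consistent(const int *a, const int *cell, int natoms)
    {
        int i;

        for (i = 0; i < natoms; i++)
        {
            if (a[cell[i]] != i)
            {
                return 0;
            }
        }
        return 1;
    }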
 +
 +/* Determines the cell range along one dimension that
 + * the bounding box b0 - b1 sees.
 + */
 +static void get_cell_range(real b0,real b1,
 +                           int nc,real c0,real s,real invs,
 +                           real d2,real r2,int *cf,int *cl)
 +{
 +    *cf = max((int)((b0 - c0)*invs),0);
 +
 +    while (*cf > 0 && d2 + sqr((b0 - c0) - (*cf-1+1)*s) < r2)
 +    {
 +        (*cf)--;
 +    }
 +
 +    *cl = min((int)((b1 - c0)*invs),nc-1);
 +    while (*cl < nc-1 && d2 + sqr((*cl+1)*s - (b1 - c0)) < r2)
 +    {
 +        (*cl)++;
 +    }
 +}
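
The widening loops above start from the integer cell estimate and extend the range while the squared distance to the nearest edge of the next cell is still inside the cut-off; note that (*cf-1+1)*s is just (*cf)*s, the upper edge of cell *cf-1. A standalone rendering of the same logic, with sq() as a hypothetical helper:

    static float sq(float x) { return x*x; }

    /* 1D cell range [cf,cl] seen by the interval b0..b1 on a grid of
     * nc cells of size s starting at c0, with d2 the squared distance
     * already accumulated in the other dimensions and r2 the cut-off^2.
     */
    static void cell_range_1d(float b0, float b1, int nc, float c0,
                              float s, float invs, float d2, float r2,
                              int *cf, int *cl)
    {
        *cf = (int)((b0 - c0)*invs);
        if (*cf < 0) { *cf = 0; }
        while (*cf > 0 && d2 + sq((b0 - c0) - (*cf)*s) < r2)
        {
            (*cf)--;
        }

        *cl = (int)((b1 - c0)*invs);
        if (*cl > nc - 1) { *cl = nc - 1; }
        while (*cl < nc - 1 && d2 + sq((*cl + 1)*s - (b1 - c0)) < r2)
        {
            (*cl)++;
        }
    }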
 +
 +/* Reference code calculating the distance^2 between two bounding boxes */
 +static float box_dist2(float bx0,float bx1,float by0,
 +                       float by1,float bz0,float bz1,
 +                       const float *bb)
 +{
 +    float d2;
 +    float dl,dh,dm,dm0;
 +
 +    d2 = 0;
 +
 +    dl  = bx0 - bb[BBU_X];
 +    dh  = bb[BBL_X] - bx1;
 +    dm  = max(dl,dh);
 +    dm0 = max(dm,0);
 +    d2 += dm0*dm0;
 +
 +    dl  = by0 - bb[BBU_Y];
 +    dh  = bb[BBL_Y] - by1;
 +    dm  = max(dl,dh);
 +    dm0 = max(dm,0);
 +    d2 += dm0*dm0;
 +
 +    dl  = bz0 - bb[BBU_Z];
 +    dh  = bb[BBL_Z] - bz1;
 +    dm  = max(dl,dh);
 +    dm0 = max(dm,0);
 +    d2 += dm0*dm0;
 +
 +    return d2;
 +}
 +
 +/* Plain C code calculating the distance^2 between two bounding boxes */
 +static float subc_bb_dist2(int si,const float *bb_i_ci,
 +                           int csj,const float *bb_j_all)
 +{
 +    const float *bb_i,*bb_j;
 +    float d2;
 +    float dl,dh,dm,dm0;
 +
 +    bb_i = bb_i_ci  +  si*NNBSBB_B;
 +    bb_j = bb_j_all + csj*NNBSBB_B;
 +
 +    d2 = 0;
 +
 +    dl  = bb_i[BBL_X] - bb_j[BBU_X];
 +    dh  = bb_j[BBL_X] - bb_i[BBU_X];
 +    dm  = max(dl,dh);
 +    dm0 = max(dm,0);
 +    d2 += dm0*dm0;
 +
 +    dl  = bb_i[BBL_Y] - bb_j[BBU_Y];
 +    dh  = bb_j[BBL_Y] - bb_i[BBU_Y];
 +    dm  = max(dl,dh);
 +    dm0 = max(dm,0);
 +    d2 += dm0*dm0;
 +
 +    dl  = bb_i[BBL_Z] - bb_j[BBU_Z];
 +    dh  = bb_j[BBL_Z] - bb_i[BBU_Z];
 +    dm  = max(dl,dh);
 +    dm0 = max(dm,0);
 +    d2 += dm0*dm0;
 +
 +    return d2;
 +}
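
Per dimension, both distance routines above compute the gap between two intervals [l_i,h_i] and [l_j,h_j], which is max(l_i - h_j, l_j - h_i, 0); squaring and summing over x, y and z gives the squared box-box distance. The one-dimensional piece in isolation (interval_dist() is an illustrative helper):

    /* Gap between intervals [l0,h0] and [l1,h1]; 0 when they overlap */
    static float interval_dist(float l0, float h0, float l1, float h1)
    {
        float dl = l0 - h1;
        float dh = l1 - h0;
        float dm = dl > dh ? dl : dh;

        return dm > 0 ? dm : 0;
    }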
 +
 +#ifdef NBNXN_SEARCH_SSE
 +
 +/* SSE code for bb distance for bb format xyz0 */
 +static float subc_bb_dist2_sse(int na_c,
 +                               int si,const float *bb_i_ci,
 +                               int csj,const float *bb_j_all)
 +{
 +    const float *bb_i,*bb_j;
 +
 +    __m128 bb_i_SSE0,bb_i_SSE1;
 +    __m128 bb_j_SSE0,bb_j_SSE1;
 +    __m128 dl_SSE;
 +    __m128 dh_SSE;
 +    __m128 dm_SSE;
 +    __m128 dm0_SSE;
 +    __m128 d2_SSE;
 +#ifndef GMX_X86_SSE4_1
 +    float d2_array[7],*d2_align;
 +
 +    d2_align = (float *)(((size_t)(d2_array+3)) & (~((size_t)15)));
 +#else
 +    float d2;
 +#endif
 +
 +    bb_i = bb_i_ci  +  si*NNBSBB_B;
 +    bb_j = bb_j_all + csj*NNBSBB_B;
 +
 +    bb_i_SSE0 = _mm_load_ps(bb_i);
 +    bb_i_SSE1 = _mm_load_ps(bb_i+NNBSBB_C);
 +    bb_j_SSE0 = _mm_load_ps(bb_j);
 +    bb_j_SSE1 = _mm_load_ps(bb_j+NNBSBB_C);
 +
 +    dl_SSE    = _mm_sub_ps(bb_i_SSE0,bb_j_SSE1);
 +    dh_SSE    = _mm_sub_ps(bb_j_SSE0,bb_i_SSE1);
 +
 +    dm_SSE    = _mm_max_ps(dl_SSE,dh_SSE);
 +    dm0_SSE   = _mm_max_ps(dm_SSE,_mm_setzero_ps());
 +#ifndef GMX_X86_SSE4_1
 +    d2_SSE    = _mm_mul_ps(dm0_SSE,dm0_SSE);
 +
 +    _mm_store_ps(d2_align,d2_SSE);
 +
 +    return d2_align[0] + d2_align[1] + d2_align[2];
 +#else
 +    /* SSE4.1 dot product of components 0,1,2 */
 +    d2_SSE    = _mm_dp_ps(dm0_SSE,dm0_SSE,0x71);
 +
 +    _mm_store_ss(&d2,d2_SSE);
 +
 +    return d2;
 +#endif
 +}
 +
 +/* Calculate the bounding-box distances of bb_i[si,...,si+3] and store them in d2 */
 +#define SUBC_BB_DIST2_SSE_XXXX_INNER(si,bb_i,d2) \
 +{                                                \
 +    int    shi;                                  \
 +                                                 \
 +    __m128 dx_0,dy_0,dz_0;                       \
 +    __m128 dx_1,dy_1,dz_1;                       \
 +                                                 \
 +    __m128 mx,my,mz;                             \
 +    __m128 m0x,m0y,m0z;                          \
 +                                                 \
 +    __m128 d2x,d2y,d2z;                          \
 +    __m128 d2s,d2t;                              \
 +                                                 \
 +    shi = si*NNBSBB_D*DIM;                       \
 +                                                 \
 +    xi_l = _mm_load_ps(bb_i+shi+0*STRIDE_8BB);   \
 +    yi_l = _mm_load_ps(bb_i+shi+1*STRIDE_8BB);   \
 +    zi_l = _mm_load_ps(bb_i+shi+2*STRIDE_8BB);   \
 +    xi_h = _mm_load_ps(bb_i+shi+3*STRIDE_8BB);   \
 +    yi_h = _mm_load_ps(bb_i+shi+4*STRIDE_8BB);   \
 +    zi_h = _mm_load_ps(bb_i+shi+5*STRIDE_8BB);   \
 +                                                 \
 +    dx_0 = _mm_sub_ps(xi_l,xj_h);                \
 +    dy_0 = _mm_sub_ps(yi_l,yj_h);                \
 +    dz_0 = _mm_sub_ps(zi_l,zj_h);                \
 +                                                 \
 +    dx_1 = _mm_sub_ps(xj_l,xi_h);                \
 +    dy_1 = _mm_sub_ps(yj_l,yi_h);                \
 +    dz_1 = _mm_sub_ps(zj_l,zi_h);                \
 +                                                 \
 +    mx   = _mm_max_ps(dx_0,dx_1);                \
 +    my   = _mm_max_ps(dy_0,dy_1);                \
 +    mz   = _mm_max_ps(dz_0,dz_1);                \
 +                                                 \
 +    m0x  = _mm_max_ps(mx,zero);                  \
 +    m0y  = _mm_max_ps(my,zero);                  \
 +    m0z  = _mm_max_ps(mz,zero);                  \
 +                                                 \
 +    d2x  = _mm_mul_ps(m0x,m0x);                  \
 +    d2y  = _mm_mul_ps(m0y,m0y);                  \
 +    d2z  = _mm_mul_ps(m0z,m0z);                  \
 +                                                 \
 +    d2s  = _mm_add_ps(d2x,d2y);                  \
 +    d2t  = _mm_add_ps(d2s,d2z);                  \
 +                                                 \
 +    _mm_store_ps(d2+si,d2t);                     \
 +}
 +
 +/* SSE code for nsi bb distances for bb format xxxxyyyyzzzz */
 +static void subc_bb_dist2_sse_xxxx(const float *bb_j,
 +                                   int nsi,const float *bb_i,
 +                                   float *d2)
 +{
 +    __m128 xj_l,yj_l,zj_l;
 +    __m128 xj_h,yj_h,zj_h;
 +    __m128 xi_l,yi_l,zi_l;
 +    __m128 xi_h,yi_h,zi_h;
 +
 +    __m128 zero;
 +
 +    zero = _mm_setzero_ps();
 +
 +    xj_l = _mm_set1_ps(bb_j[0*STRIDE_8BB]);
 +    yj_l = _mm_set1_ps(bb_j[1*STRIDE_8BB]);
 +    zj_l = _mm_set1_ps(bb_j[2*STRIDE_8BB]);
 +    xj_h = _mm_set1_ps(bb_j[3*STRIDE_8BB]);
 +    yj_h = _mm_set1_ps(bb_j[4*STRIDE_8BB]);
 +    zj_h = _mm_set1_ps(bb_j[5*STRIDE_8BB]);
 +
 +    /* Here we "loop" over si (0,STRIDE_8BB) from 0 to nsi with step STRIDE_8BB.
 +     * But as we know the number of iterations is 1 or 2, we unroll manually.
 +     */
 +    SUBC_BB_DIST2_SSE_XXXX_INNER(0,bb_i,d2);
 +    if (STRIDE_8BB < nsi)
 +    {
 +        SUBC_BB_DIST2_SSE_XXXX_INNER(STRIDE_8BB,bb_i,d2);
 +    }
 +}
 +
 +#endif /* NBNXN_SEARCH_SSE */
 +
 +/* Plain C function which determines if any atom pair between two cells
 + * is within distance sqrt(rl2).
 + */
 +static gmx_bool subc_in_range_x(int na_c,
 +                                int si,const real *x_i,
 +                                int csj,int stride,const real *x_j,
 +                                real rl2)
 +{
 +    int  i,j,i0,j0;
 +    real d2;
 +
 +    for(i=0; i<na_c; i++)
 +    {
 +        i0 = (si*na_c + i)*DIM;
 +        for(j=0; j<na_c; j++)
 +        {
 +            j0 = (csj*na_c + j)*stride;
 +
 +            d2 = sqr(x_i[i0  ] - x_j[j0  ]) +
 +                 sqr(x_i[i0+1] - x_j[j0+1]) +
 +                 sqr(x_i[i0+2] - x_j[j0+2]);
 +
 +            if (d2 < rl2)
 +            {
 +                return TRUE;
 +            }
 +        }
 +    }
 +
 +    return FALSE;
 +}
 +
 +/* SSE function which determines if any atom pair between two cells,
 + * both with 8 atoms, is within distance sqrt(rl2).
 + */
 +static gmx_bool subc_in_range_sse8(int na_c,
 +                                   int si,const real *x_i,
 +                                   int csj,int stride,const real *x_j,
 +                                   real rl2)
 +{
 +#ifdef NBNXN_SEARCH_SSE_SINGLE
 +    __m128 ix_SSE0,iy_SSE0,iz_SSE0;
 +    __m128 ix_SSE1,iy_SSE1,iz_SSE1;
 +
 +    __m128 rc2_SSE;
 +
 +    int na_c_sse;
 +    int j0,j1;
 +
 +    rc2_SSE   = _mm_set1_ps(rl2);
 +
 +    na_c_sse = NBNXN_GPU_CLUSTER_SIZE/STRIDE_8BB;
 +    ix_SSE0 = _mm_load_ps(x_i+(si*na_c_sse*DIM+0)*STRIDE_8BB);
 +    iy_SSE0 = _mm_load_ps(x_i+(si*na_c_sse*DIM+1)*STRIDE_8BB);
 +    iz_SSE0 = _mm_load_ps(x_i+(si*na_c_sse*DIM+2)*STRIDE_8BB);
 +    ix_SSE1 = _mm_load_ps(x_i+(si*na_c_sse*DIM+3)*STRIDE_8BB);
 +    iy_SSE1 = _mm_load_ps(x_i+(si*na_c_sse*DIM+4)*STRIDE_8BB);
 +    iz_SSE1 = _mm_load_ps(x_i+(si*na_c_sse*DIM+5)*STRIDE_8BB);
 +
 +    /* We loop from the outer to the inner particles to maximize
 +     * the chance that we find a pair in range quickly and return.
 +     */
 +    j0 = csj*na_c;
 +    j1 = j0 + na_c - 1;
 +    while (j0 < j1)
 +    {
 +        __m128 jx0_SSE,jy0_SSE,jz0_SSE;
 +        __m128 jx1_SSE,jy1_SSE,jz1_SSE;
 +
 +        __m128 dx_SSE0,dy_SSE0,dz_SSE0;
 +        __m128 dx_SSE1,dy_SSE1,dz_SSE1;
 +        __m128 dx_SSE2,dy_SSE2,dz_SSE2;
 +        __m128 dx_SSE3,dy_SSE3,dz_SSE3;
 +
 +        __m128 rsq_SSE0;
 +        __m128 rsq_SSE1;
 +        __m128 rsq_SSE2;
 +        __m128 rsq_SSE3;
 +
 +        __m128 wco_SSE0;
 +        __m128 wco_SSE1;
 +        __m128 wco_SSE2;
 +        __m128 wco_SSE3;
 +        __m128 wco_any_SSE01,wco_any_SSE23,wco_any_SSE;
 +
 +        jx0_SSE = _mm_load1_ps(x_j+j0*stride+0);
 +        jy0_SSE = _mm_load1_ps(x_j+j0*stride+1);
 +        jz0_SSE = _mm_load1_ps(x_j+j0*stride+2);
 +
 +        jx1_SSE = _mm_load1_ps(x_j+j1*stride+0);
 +        jy1_SSE = _mm_load1_ps(x_j+j1*stride+1);
 +        jz1_SSE = _mm_load1_ps(x_j+j1*stride+2);
 +
 +        /* Calculate distance */
 +        dx_SSE0            = _mm_sub_ps(ix_SSE0,jx0_SSE);
 +        dy_SSE0            = _mm_sub_ps(iy_SSE0,jy0_SSE);
 +        dz_SSE0            = _mm_sub_ps(iz_SSE0,jz0_SSE);
 +        dx_SSE1            = _mm_sub_ps(ix_SSE1,jx0_SSE);
 +        dy_SSE1            = _mm_sub_ps(iy_SSE1,jy0_SSE);
 +        dz_SSE1            = _mm_sub_ps(iz_SSE1,jz0_SSE);
 +        dx_SSE2            = _mm_sub_ps(ix_SSE0,jx1_SSE);
 +        dy_SSE2            = _mm_sub_ps(iy_SSE0,jy1_SSE);
 +        dz_SSE2            = _mm_sub_ps(iz_SSE0,jz1_SSE);
 +        dx_SSE3            = _mm_sub_ps(ix_SSE1,jx1_SSE);
 +        dy_SSE3            = _mm_sub_ps(iy_SSE1,jy1_SSE);
 +        dz_SSE3            = _mm_sub_ps(iz_SSE1,jz1_SSE);
 +
 +        /* rsq = dx*dx+dy*dy+dz*dz */
 +        rsq_SSE0           = gmx_mm_calc_rsq_ps(dx_SSE0,dy_SSE0,dz_SSE0);
 +        rsq_SSE1           = gmx_mm_calc_rsq_ps(dx_SSE1,dy_SSE1,dz_SSE1);
 +        rsq_SSE2           = gmx_mm_calc_rsq_ps(dx_SSE2,dy_SSE2,dz_SSE2);
 +        rsq_SSE3           = gmx_mm_calc_rsq_ps(dx_SSE3,dy_SSE3,dz_SSE3);
 +
 +        wco_SSE0           = _mm_cmplt_ps(rsq_SSE0,rc2_SSE);
 +        wco_SSE1           = _mm_cmplt_ps(rsq_SSE1,rc2_SSE);
 +        wco_SSE2           = _mm_cmplt_ps(rsq_SSE2,rc2_SSE);
 +        wco_SSE3           = _mm_cmplt_ps(rsq_SSE3,rc2_SSE);
 +
 +        wco_any_SSE01      = _mm_or_ps(wco_SSE0,wco_SSE1);
 +        wco_any_SSE23      = _mm_or_ps(wco_SSE2,wco_SSE3);
 +        wco_any_SSE        = _mm_or_ps(wco_any_SSE01,wco_any_SSE23);
 +
 +        if (_mm_movemask_ps(wco_any_SSE))
 +        {
 +            return TRUE;
 +        }
 +
 +        j0++;
 +        j1--;
 +    }
 +    return FALSE;
 +
 +#else
 +    /* No SSE */
 +    gmx_incons("SSE function called without SSE support");
 +
 +    return TRUE;
 +#endif
 +}
 +
 +/* Returns the j sub-cell for index cj_ind */
 +static int nbl_cj(const nbnxn_pairlist_t *nbl,int cj_ind)
 +{
 +    return nbl->cj4[cj_ind>>2].cj[cj_ind & 3];
 +}
 +
 +/* Returns the i-interaction mask of the j sub-cell for index cj_ind */
 +static unsigned nbl_imask0(const nbnxn_pairlist_t *nbl,int cj_ind)
 +{
 +    return nbl->cj4[cj_ind>>2].imei[0].imask;
 +}
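
These accessors encode the cj4 packing: four j-(sub)cells per nbnxn_cj4_t, so the low two bits of cj_ind select the slot and the remaining bits the unit. Spelled out with hypothetical helpers:

    /* Split a flat j-cell list index into its cj4 unit and slot */
    static int cj4_unit(int cj_ind) { return cj_ind >> 2; } /* which cj4 struct  */
    static int cj4_slot(int cj_ind) { return cj_ind & 3; }  /* which of 4 slots  */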
 +
 +/* Ensures there is enough space for extra exclusion masks */
 +static void check_excl_space(nbnxn_pairlist_t *nbl,int extra)
 +{
 +    if (nbl->nexcl+extra > nbl->excl_nalloc)
 +    {
 +        nbl->excl_nalloc = over_alloc_small(nbl->nexcl+extra);
 +        nbnxn_realloc_void((void **)&nbl->excl,
 +                           nbl->nexcl*sizeof(*nbl->excl),
 +                           nbl->excl_nalloc*sizeof(*nbl->excl),
 +                           nbl->alloc,nbl->free);
 +    }
 +}
 +
 +/* Ensures there is enough space for ncell extra j-cells in the list */
 +static void check_subcell_list_space_simple(nbnxn_pairlist_t *nbl,
 +                                            int ncell)
 +{
 +    int cj_max;
 +
 +    cj_max = nbl->ncj + ncell;
 +
 +    if (cj_max > nbl->cj_nalloc)
 +    {
 +        nbl->cj_nalloc = over_alloc_small(cj_max);
 +        nbnxn_realloc_void((void **)&nbl->cj,
 +                           nbl->ncj*sizeof(*nbl->cj),
 +                           nbl->cj_nalloc*sizeof(*nbl->cj),
 +                           nbl->alloc,nbl->free);
 +    }
 +}
 +
 +/* Ensures there is enough space for ncell extra j-subcells in the list */
 +static void check_subcell_list_space_supersub(nbnxn_pairlist_t *nbl,
 +                                              int nsupercell)
 +{
 +    int ncj4_max,j4,w;
 +
 +#define NWARP       2
 +#define WARP_SIZE  32
 +
 +    /* We can have at most nsupercell*GPU_NSUBCELL sj lists.
 +     * We can store 4 j-subcell - i-supercell pairs in one struct;
 +     * since the shift rounds down, we need one extra entry.
 +     */
 +    ncj4_max = ((nbl->work->cj_ind + nsupercell*GPU_NSUBCELL + 4-1) >> 2);
 +
 +    if (ncj4_max > nbl->cj4_nalloc)
 +    {
 +        nbl->cj4_nalloc = over_alloc_small(ncj4_max);
 +        nbnxn_realloc_void((void **)&nbl->cj4,
 +                           nbl->work->cj4_init*sizeof(*nbl->cj4),
 +                           nbl->cj4_nalloc*sizeof(*nbl->cj4),
 +                           nbl->alloc,nbl->free);
 +    }
 +
 +    if (ncj4_max > nbl->work->cj4_init)
 +    {
 +        for(j4=nbl->work->cj4_init; j4<ncj4_max; j4++)
 +        {
 +            /* No i-subcells and no excl's in the list initially */
 +            for(w=0; w<NWARP; w++)
 +            {
 +                nbl->cj4[j4].imei[w].imask    = 0U;
 +                nbl->cj4[j4].imei[w].excl_ind = 0;
 +            }
 +        }
 +        nbl->work->cj4_init = ncj4_max;
 +    }
 +}
 +
 +/* Set all excl masks for one GPU warp to no exclusions */
 +static void set_no_excls(nbnxn_excl_t *excl)
 +{
 +    int t;
 +
 +    for(t=0; t<WARP_SIZE; t++)
 +    {
 +        /* Turn all interaction bits on */
 +        excl->pair[t] = NBNXN_INT_MASK_ALL;
 +    }
 +}
 +
 +/* Initializes a single nbnxn_pairlist_t data structure */
 +static void nbnxn_init_pairlist(nbnxn_pairlist_t *nbl,
 +                                gmx_bool bSimple,
 +                                nbnxn_alloc_t *alloc,
 +                                nbnxn_free_t  *free)
 +{
 +    if (alloc == NULL)
 +    {
 +        nbl->alloc = nbnxn_alloc_aligned;
 +    }
 +    else
 +    {
 +        nbl->alloc = alloc;
 +    }
 +    if (free == NULL)
 +    {
 +        nbl->free = nbnxn_free_aligned;
 +    }
 +    else
 +    {
 +        nbl->free = free;
 +    }
 +
 +    nbl->bSimple     = bSimple;
 +    nbl->na_sc       = 0;
 +    nbl->na_ci       = 0;
 +    nbl->na_cj       = 0;
 +    nbl->nci         = 0;
 +    nbl->ci          = NULL;
 +    nbl->ci_nalloc   = 0;
 +    nbl->ncj         = 0;
 +    nbl->cj          = NULL;
 +    nbl->cj_nalloc   = 0;
 +    nbl->ncj4        = 0;
 +    /* We need one element extra in sj, so alloc initially with 1 */
 +    nbl->cj4_nalloc  = 0;
 +    nbl->cj4         = NULL;
 +    nbl->nci_tot     = 0;
 +
 +    if (!nbl->bSimple)
 +    {
 +        nbl->excl        = NULL;
 +        nbl->excl_nalloc = 0;
 +        nbl->nexcl       = 0;
 +        check_excl_space(nbl,1);
 +        nbl->nexcl       = 1;
 +        set_no_excls(&nbl->excl[0]);
 +    }
 +
 +    snew(nbl->work,1);
 +#ifdef NBNXN_BBXXXX
-     snew_aligned(nbl->work->bb_ci,GPU_NSUBCELL*NNBSBB_B,16);
- #endif
-     snew_aligned(nbl->work->x_ci,NBNXN_NA_SC_MAX*DIM,16);
- #ifdef NBNXN_SEARCH_SSE
-     snew_aligned(nbl->work->x_ci_x86_simd128,1,16);
- #ifdef GMX_X86_AVX_256
-     snew_aligned(nbl->work->x_ci_x86_simd256,1,32);
++    snew_aligned(nbl->work->bb_ci,GPU_NSUBCELL/STRIDE_8BB*NNBSBB_XXXX,32);
 +#else
-     snew_aligned(nbl->work->d2,GPU_NSUBCELL,16);
++    snew_aligned(nbl->work->bb_ci,GPU_NSUBCELL*NNBSBB_B,32);
 +#endif
++    snew_aligned(nbl->work->x_ci,NBNXN_NA_SC_MAX*DIM,32);
++#ifdef GMX_NBNXN_SIMD
++    snew_aligned(nbl->work->x_ci_simd_4xn,1,32);
++    snew_aligned(nbl->work->x_ci_simd_2xnn,1,32);
 +#endif
- #ifdef NBNXN_SEARCH_SSE
++    snew_aligned(nbl->work->d2,GPU_NSUBCELL,32);
 +}
 +
 +void nbnxn_init_pairlist_set(nbnxn_pairlist_set_t *nbl_list,
 +                             gmx_bool bSimple, gmx_bool bCombined,
 +                             nbnxn_alloc_t *alloc,
 +                             nbnxn_free_t  *free)
 +{
 +    int i;
 +
 +    nbl_list->bSimple   = bSimple;
 +    nbl_list->bCombined = bCombined;
 +
 +    nbl_list->nnbl = gmx_omp_nthreads_get(emntNonbonded);
 +
 +    if (!nbl_list->bCombined &&
 +        nbl_list->nnbl > NBNXN_BUFFERFLAG_MAX_THREADS)
 +    {
 +        gmx_fatal(FARGS,"%d OpenMP threads were requested. Since the non-bonded force buffer reduction is prohibitively slow with more than %d threads, we do not allow this. Use %d or less OpenMP threads.",
 +                  nbl_list->nnbl,NBNXN_BUFFERFLAG_MAX_THREADS,NBNXN_BUFFERFLAG_MAX_THREADS);
 +    }
 +
 +    snew(nbl_list->nbl,nbl_list->nnbl);
 +    /* Allocate in parallel so that each thread first touches its own
 +     * list memory, which avoids interleaving between threads */
 +#pragma omp parallel for num_threads(nbl_list->nnbl) schedule(static)
 +    for(i=0; i<nbl_list->nnbl; i++)
 +    {
 +        /* Allocate the nblist data structure locally on each thread
 +         * to optimize memory access for NUMA architectures.
 +         */
 +        snew(nbl_list->nbl[i],1);
 +
 +        /* Only list 0 is used on the GPU, use normal allocation for i>0 */
 +        if (i == 0)
 +        {
 +            nbnxn_init_pairlist(nbl_list->nbl[i],nbl_list->bSimple,alloc,free);
 +        }
 +        else
 +        {
 +            nbnxn_init_pairlist(nbl_list->nbl[i],nbl_list->bSimple,NULL,NULL);
 +        }
 +    }
 +}
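
Allocating each thread's list inside the static-scheduled parallel loop exploits first-touch NUMA placement: pages typically land on the node of the thread that first writes them. The pattern in isolation, a sketch assuming OpenMP and that the same threads later operate on the same buffers:

    #include <omp.h>
    #include <stdlib.h>
    #include <string.h>

    /* Allocate and zero one buffer per thread from that thread itself,
     * so each buffer's pages are placed on the allocating thread's node.
     * buf must hold nthread pointers.
     */
    static void alloc_per_thread(void **buf, size_t nbytes, int nthread)
    {
    #pragma omp parallel num_threads(nthread)
        {
            int t = omp_get_thread_num();

            buf[t] = malloc(nbytes);
            /* The first write from the owning thread places the pages */
            memset(buf[t], 0, nbytes);
        }
    }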
 +
 +/* Print statistics of a pair list, used for debug output */
 +static void print_nblist_statistics_simple(FILE *fp,const nbnxn_pairlist_t *nbl,
 +                                           const nbnxn_search_t nbs,real rl)
 +{
 +    const nbnxn_grid_t *grid;
 +    int cs[SHIFTS];
 +    int s,i,j;
 +    int npexcl;
 +
 +    /* This code only produces correct statistics with domain decomposition */
 +    grid = &nbs->grid[0];
 +
 +    fprintf(fp,"nbl nci %d ncj %d\n",
 +            nbl->nci,nbl->ncj);
 +    fprintf(fp,"nbl na_sc %d rl %g ncp %d per cell %.1f atoms %.1f ratio %.2f\n",
 +            nbl->na_sc,rl,nbl->ncj,nbl->ncj/(double)grid->nc,
 +            nbl->ncj/(double)grid->nc*grid->na_sc,
 +            nbl->ncj/(double)grid->nc*grid->na_sc/(0.5*4.0/3.0*M_PI*rl*rl*rl*grid->nc*grid->na_sc/det(nbs->box)));
 +
 +    fprintf(fp,"nbl average j cell list length %.1f\n",
 +            0.25*nbl->ncj/(double)nbl->nci);
 +
 +    for(s=0; s<SHIFTS; s++)
 +    {
 +        cs[s] = 0;
 +    }
 +    npexcl = 0;
 +    for(i=0; i<nbl->nci; i++)
 +    {
 +        cs[nbl->ci[i].shift & NBNXN_CI_SHIFT] +=
 +            nbl->ci[i].cj_ind_end - nbl->ci[i].cj_ind_start;
 +
 +        j = nbl->ci[i].cj_ind_start;
 +        while (j < nbl->ci[i].cj_ind_end &&
 +               nbl->cj[j].excl != NBNXN_INT_MASK_ALL)
 +        {
 +            npexcl++;
 +            j++;
 +        }
 +    }
 +    fprintf(fp,"nbl cell pairs, total: %d excl: %d %.1f%%\n",
 +            nbl->ncj,npexcl,100*npexcl/(double)nbl->ncj);
 +    for(s=0; s<SHIFTS; s++)
 +    {
 +        if (cs[s] > 0)
 +        {
 +            fprintf(fp,"nbl shift %2d ncj %3d\n",s,cs[s]);
 +        }
 +    }
 +}
 +
 +/* Print statistics of a pair list, used for debug output */
 +static void print_nblist_statistics_supersub(FILE *fp,const nbnxn_pairlist_t *nbl,
 +                                             const nbnxn_search_t nbs,real rl)
 +{
 +    const nbnxn_grid_t *grid;
 +    int i,j4,j,si,b;
 +    int c[GPU_NSUBCELL+1];
 +
 +    /* This code only produces correct statistics with domain decomposition */
 +    grid = &nbs->grid[0];
 +
 +    fprintf(fp,"nbl nsci %d ncj4 %d nsi %d excl4 %d\n",
 +            nbl->nsci,nbl->ncj4,nbl->nci_tot,nbl->nexcl);
 +    fprintf(fp,"nbl na_c %d rl %g ncp %d per cell %.1f atoms %.1f ratio %.2f\n",
 +            nbl->na_ci,rl,nbl->nci_tot,nbl->nci_tot/(double)grid->nsubc_tot,
 +            nbl->nci_tot/(double)grid->nsubc_tot*grid->na_c,
 +            nbl->nci_tot/(double)grid->nsubc_tot*grid->na_c/(0.5*4.0/3.0*M_PI*rl*rl*rl*grid->nsubc_tot*grid->na_c/det(nbs->box)));
 +
 +    fprintf(fp,"nbl average j super cell list length %.1f\n",
 +            0.25*nbl->ncj4/(double)nbl->nsci);
 +    fprintf(fp,"nbl average i sub cell list length %.1f\n",
 +            nbl->nci_tot/(0.25*nbl->ncj4));
 +
 +    for(si=0; si<=GPU_NSUBCELL; si++)
 +    {
 +        c[si] = 0;
 +    }
 +    for(i=0; i<nbl->nsci; i++)
 +    {
 +        for(j4=nbl->sci[i].cj4_ind_start; j4<nbl->sci[i].cj4_ind_end; j4++)
 +        {
 +            for(j=0; j<4; j++)
 +            {
 +                b = 0;
 +                for(si=0; si<GPU_NSUBCELL; si++)
 +                {
 +                    if (nbl->cj4[j4].imei[0].imask & (1U << (j*GPU_NSUBCELL + si)))
 +                    {
 +                        b++;
 +                    }
 +                }
 +                c[b]++;
 +            }
 +        }
 +    }
 +    for(b=0; b<=GPU_NSUBCELL; b++)
 +    {
 +        fprintf(fp,"nbl j-list #i-subcell %d %7d %4.1f\n",
 +                b,c[b],100.0*c[b]/(double)(nbl->ncj4*NBNXN_GPU_JGROUP_SIZE));
 +    }
 +}
 +
 +/* Print the full pair list, used for debug output */
 +static void print_supersub_nsp(const char *fn,
 +                               const nbnxn_pairlist_t *nbl,
 +                               int iloc)
 +{
 +    char buf[STRLEN];
 +    FILE *fp;
 +    int i,nsp,j4,p;
 +
 +    sprintf(buf,"%s_%s.xvg",fn,NONLOCAL_I(iloc) ? "nl" : "l");
 +    fp = ffopen(buf,"w");
 +
 +    for(i=0; i<nbl->nci; i++)
 +    {
 +        nsp = 0;
 +        for(j4=nbl->sci[i].cj4_ind_start; j4<nbl->sci[i].cj4_ind_end; j4++)
 +        {
 +            for(p=0; p<NBNXN_GPU_JGROUP_SIZE*GPU_NSUBCELL; p++)
 +            {
 +                nsp += (nbl->cj4[j4].imei[0].imask >> p) & 1;
 +            }
 +        }
 +        fprintf(fp,"%4d %3d %3d\n",
 +                i,
 +                nsp,
 +                nbl->sci[i].cj4_ind_end-nbl->sci[i].cj4_ind_start);
 +    }
 +
 +    fclose(fp);
 +}
 +
 +/* Returns a pointer to the exclusion mask for cj4-unit cj4, warp warp */
 +static void low_get_nbl_exclusions(nbnxn_pairlist_t *nbl,int cj4,
 +                                   int warp,nbnxn_excl_t **excl)
 +{
 +    if (nbl->cj4[cj4].imei[warp].excl_ind == 0)
 +    {
 +        /* No exclusions set, make a new list entry */
 +        nbl->cj4[cj4].imei[warp].excl_ind = nbl->nexcl;
 +        nbl->nexcl++;
 +        *excl = &nbl->excl[nbl->cj4[cj4].imei[warp].excl_ind];
 +        set_no_excls(*excl);
 +    }
 +    else
 +    {
 +        /* We already have some exclusions, new ones can be added to the list */
 +        *excl = &nbl->excl[nbl->cj4[cj4].imei[warp].excl_ind];
 +    }
 +}
 +
 +/* Returns a pointer to the exclusion mask for cj4-unit cj4, warp warp,
 + * allocates extra memory, if necessary.
 + */
 +static void get_nbl_exclusions_1(nbnxn_pairlist_t *nbl,int cj4,
 +                                 int warp,nbnxn_excl_t **excl)
 +{
 +    if (nbl->cj4[cj4].imei[warp].excl_ind == 0)
 +    {
 +        /* We need to make a new list entry, check if we have space */
 +        check_excl_space(nbl,1);
 +    }
 +    low_get_nbl_exclusions(nbl,cj4,warp,excl);
 +}
 +
 +/* Returns pointers to the exclusion mask for cj4-unit cj4 for both warps,
 + * allocates extra memory, if necessary.
 + */
 +static void get_nbl_exclusions_2(nbnxn_pairlist_t *nbl,int cj4,
 +                                 nbnxn_excl_t **excl_w0,
 +                                 nbnxn_excl_t **excl_w1)
 +{
 +    /* Check for space we might need */
 +    check_excl_space(nbl,2);
 +
 +    low_get_nbl_exclusions(nbl,cj4,0,excl_w0);
 +    low_get_nbl_exclusions(nbl,cj4,1,excl_w1);
 +}
 +
 +/* Sets the self exclusions i=j and pair exclusions i>j */
 +static void set_self_and_newton_excls_supersub(nbnxn_pairlist_t *nbl,
 +                                               int cj4_ind,int sj_offset,
 +                                               int si)
 +{
 +    nbnxn_excl_t *excl[2];
 +    int  ei,ej,w;
 +
 +    /* Here we only set the self and double pair exclusions */
 +
 +    get_nbl_exclusions_2(nbl,cj4_ind,&excl[0],&excl[1]);
 +
 +    /* Only minor < major bits set */
 +    for(ej=0; ej<nbl->na_ci; ej++)
 +    {
 +        w = (ej>>2);
 +        for(ei=ej; ei<nbl->na_ci; ei++)
 +        {
 +            excl[w]->pair[(ej&(4-1))*nbl->na_ci+ei] &=
 +                ~(1U << (sj_offset*GPU_NSUBCELL+si));
 +        }
 +    }
 +}
 +
 +/* Returns a diagonal or off-diagonal interaction mask for plain C lists */
 +static unsigned int get_imask(gmx_bool rdiag,int ci,int cj)
 +{
 +    return (rdiag && ci == cj ? NBNXN_INT_MASK_DIAG : NBNXN_INT_MASK_ALL);
 +}
 +
- #ifdef GMX_X86_AVX_256
 +/* Returns a diagonal or off-diagonal interaction mask for SIMD128 lists */
 +static unsigned int get_imask_x86_simd128(gmx_bool rdiag,int ci,int cj)
 +{
 +#ifndef GMX_DOUBLE /* cj-size = 4 */
 +    return (rdiag && ci == cj ? NBNXN_INT_MASK_DIAG : NBNXN_INT_MASK_ALL);
 +#else              /* cj-size = 2 */
 +    return (rdiag && ci*2 == cj ? NBNXN_INT_MASK_DIAG_J2_0 :
 +            (rdiag && ci*2+1 == cj ? NBNXN_INT_MASK_DIAG_J2_1 :
 +             NBNXN_INT_MASK_ALL));
 +#endif
 +}
 +
- #else              /* cj-size = 2 */
 +/* Returns a diagonal or off-diagonal interaction mask for SIMD256 lists */
 +static unsigned int get_imask_x86_simd256(gmx_bool rdiag,int ci,int cj)
 +{
 +#ifndef GMX_DOUBLE /* cj-size = 8 */
 +    return (rdiag && ci == cj*2 ? NBNXN_INT_MASK_DIAG_J8_0 :
 +            (rdiag && ci == cj*2+1 ? NBNXN_INT_MASK_DIAG_J8_1 :
 +             NBNXN_INT_MASK_ALL));
- #endif /* NBNXN_SEARCH_SSE */
++#else              /* cj-size = 4 */
 +    return (rdiag && ci == cj ? NBNXN_INT_MASK_DIAG : NBNXN_INT_MASK_ALL);
 +#endif
 +}
++
++#ifdef GMX_NBNXN_SIMD
++#if GMX_NBNXN_SIMD_BITWIDTH == 128
++#define get_imask_x86_simd_4xn  get_imask_x86_simd128
++#else
++#if GMX_NBNXN_SIMD_BITWIDTH == 256
++#define get_imask_x86_simd_4xn  get_imask_x86_simd256
++#define get_imask_x86_simd_2xnn get_imask_x86_simd128
++#else
++#error "unsupported GMX_NBNXN_SIMD_BITWIDTH"
++#endif
++#endif
 +#endif
- #ifdef NBNXN_SEARCH_SSE
- /* Include make_cluster_list_x86_simd128/256 */
- #define GMX_MM128_HERE
- #include "gmx_x86_simd_macros.h"
- #define STRIDE_S  PACK_X4
- #include "nbnxn_search_x86_simd.h"
- #undef STRIDE_S
- #undef GMX_MM128_HERE
- #ifdef GMX_X86_AVX_256
- /* Include make_cluster_list_x86_simd128/256 */
- #define GMX_MM256_HERE
- #include "gmx_x86_simd_macros.h"
- #define STRIDE_S  GMX_X86_SIMD_WIDTH_HERE
- #include "nbnxn_search_x86_simd.h"
- #undef STRIDE_S
- #undef GMX_MM256_HERE
 +
 +/* Plain C code for making a pair list of cell ci vs cell cjf-cjl.
 + * Checks bounding box distances and possibly atom pair distances.
 + */
 +static void make_cluster_list_simple(const nbnxn_grid_t *gridj,
 +                                     nbnxn_pairlist_t *nbl,
 +                                     int ci,int cjf,int cjl,
 +                                     gmx_bool remove_sub_diag,
 +                                     const real *x_j,
 +                                     real rl2,float rbb2,
 +                                     int *ndistc)
 +{
 +    const nbnxn_list_work_t *work;
 +
 +    const float *bb_ci;
 +    const real  *x_ci;
 +
 +    gmx_bool   InRange;
 +    real       d2;
 +    int        cjf_gl,cjl_gl,cj;
 +
 +    work = nbl->work;
 +
 +    bb_ci = nbl->work->bb_ci;
 +    x_ci  = nbl->work->x_ci;
 +
 +    InRange = FALSE;
 +    while (!InRange && cjf <= cjl)
 +    {
 +        d2 = subc_bb_dist2(0,bb_ci,cjf,gridj->bb);
 +        *ndistc += 2;
 +
 +        /* Check if the distance is within the distance where
 +         * we use only the bounding box distance rbb,
 +         * or within the cut-off and there is at least one atom pair
 +         * within the cut-off.
 +         */
 +        if (d2 < rbb2)
 +        {
 +            InRange = TRUE;
 +        }
 +        else if (d2 < rl2)
 +        {
 +            int i,j;
 +
 +            cjf_gl = gridj->cell0 + cjf;
 +            for(i=0; i<NBNXN_CPU_CLUSTER_I_SIZE && !InRange; i++)
 +            {
 +                for(j=0; j<NBNXN_CPU_CLUSTER_I_SIZE; j++)
 +                {
 +                    InRange = InRange ||
 +                        (sqr(x_ci[i*STRIDE_XYZ+XX] - x_j[(cjf_gl*NBNXN_CPU_CLUSTER_I_SIZE+j)*STRIDE_XYZ+XX]) +
 +                         sqr(x_ci[i*STRIDE_XYZ+YY] - x_j[(cjf_gl*NBNXN_CPU_CLUSTER_I_SIZE+j)*STRIDE_XYZ+YY]) +
 +                         sqr(x_ci[i*STRIDE_XYZ+ZZ] - x_j[(cjf_gl*NBNXN_CPU_CLUSTER_I_SIZE+j)*STRIDE_XYZ+ZZ]) < rl2);
 +                }
 +            }
 +            *ndistc += NBNXN_CPU_CLUSTER_I_SIZE*NBNXN_CPU_CLUSTER_I_SIZE;
 +        }
 +        if (!InRange)
 +        {
 +            cjf++;
 +        }
 +    }
 +    if (!InRange)
 +    {
 +        return;
 +    }
 +
 +    InRange = FALSE;
 +    while (!InRange && cjl > cjf)
 +    {
 +        d2 = subc_bb_dist2(0,bb_ci,cjl,gridj->bb);
 +        *ndistc += 2;
 +
 +        /* Check if the distance is within the distance where
 +         * we use only the bounding box distance rbb,
 +         * or within the cut-off and there is at least one atom pair
 +         * within the cut-off.
 +         */
 +        if (d2 < rbb2)
 +        {
 +            InRange = TRUE;
 +        }
 +        else if (d2 < rl2)
 +        {
 +            int i,j;
 +
 +            cjl_gl = gridj->cell0 + cjl;
 +            for(i=0; i<NBNXN_CPU_CLUSTER_I_SIZE && !InRange; i++)
 +            {
 +                for(j=0; j<NBNXN_CPU_CLUSTER_I_SIZE; j++)
 +                {
 +                    InRange = InRange ||
 +                        (sqr(x_ci[i*STRIDE_XYZ+XX] - x_j[(cjl_gl*NBNXN_CPU_CLUSTER_I_SIZE+j)*STRIDE_XYZ+XX]) +
 +                         sqr(x_ci[i*STRIDE_XYZ+YY] - x_j[(cjl_gl*NBNXN_CPU_CLUSTER_I_SIZE+j)*STRIDE_XYZ+YY]) +
 +                         sqr(x_ci[i*STRIDE_XYZ+ZZ] - x_j[(cjl_gl*NBNXN_CPU_CLUSTER_I_SIZE+j)*STRIDE_XYZ+ZZ]) < rl2);
 +                }
 +            }
 +            *ndistc += NBNXN_CPU_CLUSTER_I_SIZE*NBNXN_CPU_CLUSTER_I_SIZE;
 +        }
 +        if (!InRange)
 +        {
 +            cjl--;
 +        }
 +    }
 +
 +    if (cjf <= cjl)
 +    {
 +        for(cj=cjf; cj<=cjl; cj++)
 +        {
 +            /* Store cj and the interaction mask */
 +            nbl->cj[nbl->ncj].cj   = gridj->cell0 + cj;
 +            nbl->cj[nbl->ncj].excl = get_imask(remove_sub_diag,ci,cj);
 +            nbl->ncj++;
 +        }
 +        /* Increase the closing index in the i super-cell list */
 +        nbl->ci[nbl->nci].cj_ind_end = nbl->ncj;
 +    }
 +}
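 +
 +/* Illustrative note on the routine above: the two while loops shrink the
 + * candidate j-cell range from both ends, first with the cheap bounding-box
 + * distance and, only for boxes between rbb2 and rl2, with the full
 + * cluster-pair atom check. Assuming cjf=10, cjl=14 and only cells 11-13
 + * in range, cjf advances to 11 and cjl drops to 13; all of cells 11,12,13
 + * are then stored, since only the ends of the range are pruned and the
 + * interior cells are kept without further checks.
 + */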
 +
-                                     case nbk4x4_PlainC:
++#ifdef GMX_NBNXN_SIMD_4XN
++#include "nbnxn_search_simd_4xn.h"
 +#endif
++#ifdef GMX_NBNXN_SIMD_2XNN
++#include "nbnxn_search_simd_2xnn.h"
 +#endif
 +
 +/* Plain C or SSE code for making a pair list of super-cell sci vs scj.
 + * Checks bounding box distances and possibly atom pair distances.
 + */
 +static void make_cluster_list_supersub(const nbnxn_search_t nbs,
 +                                       const nbnxn_grid_t *gridi,
 +                                       const nbnxn_grid_t *gridj,
 +                                       nbnxn_pairlist_t *nbl,
 +                                       int sci,int scj,
 +                                       gmx_bool sci_equals_scj,
 +                                       int stride,const real *x,
 +                                       real rl2,float rbb2,
 +                                       int *ndistc)
 +{
 +    int  na_c;
 +    int  npair;
 +    int  cjo,ci1,ci,cj,cj_gl;
 +    int  cj4_ind,cj_offset;
 +    unsigned imask;
 +    nbnxn_cj4_t *cj4;
 +    const float *bb_ci;
 +    const real *x_ci;
 +    float *d2l,d2;
 +    int  w;
 +#define PRUNE_LIST_CPU_ONE
 +#ifdef PRUNE_LIST_CPU_ONE
 +    int  ci_last=-1;
 +#endif
 +
 +    d2l = nbl->work->d2;
 +
 +    bb_ci = nbl->work->bb_ci;
 +    x_ci  = nbl->work->x_ci;
 +
 +    na_c = gridj->na_c;
 +
 +    for(cjo=0; cjo<gridj->nsubc[scj]; cjo++)
 +    {
 +        cj4_ind   = (nbl->work->cj_ind >> 2);
 +        cj_offset = nbl->work->cj_ind - cj4_ind*NBNXN_GPU_JGROUP_SIZE;
 +        cj4       = &nbl->cj4[cj4_ind];
 +
 +        cj = scj*GPU_NSUBCELL + cjo;
 +
 +        cj_gl = gridj->cell0*GPU_NSUBCELL + cj;
 +
 +        /* Initialize this j-subcell i-subcell list */
 +        cj4->cj[cj_offset] = cj_gl;
 +        imask              = 0;
 +
 +        if (sci_equals_scj)
 +        {
 +            ci1 = cjo + 1;
 +        }
 +        else
 +        {
 +            ci1 = gridi->nsubc[sci];
 +        }
 +
 +#ifdef NBNXN_BBXXXX
 +        /* Determine all ci1 bb distances in one call with SSE */
 +        subc_bb_dist2_sse_xxxx(gridj->bb+(cj>>STRIDE_8BB_2LOG)*NNBSBB_XXXX+(cj & (STRIDE_8BB-1)),
 +                               ci1,bb_ci,d2l);
 +        *ndistc += na_c*2;
 +#endif
 +
 +        npair = 0;
 +        /* We use a fixed upper-bound instead of ci1 to help optimization */
 +        for(ci=0; ci<GPU_NSUBCELL; ci++)
 +        {
 +            if (ci == ci1)
 +            {
 +                break;
 +            }
 +
 +#ifndef NBNXN_BBXXXX
 +            /* Determine the bb distance between ci and cj */
 +            d2l[ci] = subc_bb_dist2(ci,bb_ci,cj,gridj->bb);
 +            *ndistc += 2;
 +#endif
 +            d2 = d2l[ci];
 +
 +#ifdef PRUNE_LIST_CPU_ALL
 +            /* Check if the bounding-box distance is within rbb, where
 +             * the box distance alone suffices, or within the cut-off
 +             * while at least one atom pair is within the cut-off.
 +             * This check is very costly.
 +             */
 +            *ndistc += na_c*na_c;
 +            if (d2 < rbb2 ||
 +                (d2 < rl2 && subc_in_range_x(na_c,ci,x_ci,cj_gl,stride,x,rl2)))
 +#else
 +            /* Check if the distance between the two bounding boxes
 +             * is within the pair-list cut-off.
 +             */
 +            if (d2 < rl2)
 +#endif
 +            {
 +                /* Flag this i-subcell to be taken into account */
 +                imask |= (1U << (cj_offset*GPU_NSUBCELL+ci));
 +
 +#ifdef PRUNE_LIST_CPU_ONE
 +                ci_last = ci;
 +#endif
 +
 +                npair++;
 +            }
 +        }
 +
 +#ifdef PRUNE_LIST_CPU_ONE
 +        /* If we only found 1 pair, check if any atoms are actually
 +         * within the cut-off, so we could get rid of it.
 +         */
 +        if (npair == 1 && d2l[ci_last] >= rbb2)
 +        {
 +            /* Avoid using function pointers here, as it's slower */
 +            if (
 +#ifdef NBNXN_8BB_SSE
 +                !subc_in_range_sse8
 +#else
 +                !subc_in_range_x
 +#endif
 +                                (na_c,ci_last,x_ci,cj_gl,stride,x,rl2))
 +            {
 +                imask &= ~(1U << (cj_offset*GPU_NSUBCELL+ci_last));
 +                npair--;
 +            }
 +        }
 +#endif
 +
 +        if (npair > 0)
 +        {
 +            /* We have a useful sj entry, close it now */
 +
 +            /* Set the exclusions for the ci == sj entry.
 +             * Here we don't bother to check if this entry is actually flagged,
 +             * as it will nearly always be in the list.
 +             */
 +            if (sci_equals_scj)
 +            {
 +                set_self_and_newton_excls_supersub(nbl,cj4_ind,cj_offset,cjo);
 +            }
 +
 +            /* Copy the cluster interaction mask to the list */
 +            for(w=0; w<NWARP; w++)
 +            {
 +                cj4->imei[w].imask |= imask;
 +            }
 +
 +            nbl->work->cj_ind++;
 +
 +            /* Keep the count */
 +            nbl->nci_tot += npair;
 +
 +            /* Increase the closing index in the i super-cell list */
 +            nbl->sci[nbl->nsci].cj4_ind_end = ((nbl->work->cj_ind+4-1)>>2);
 +        }
 +    }
 +}
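 +
 +/* Note on the interaction mask built above: bit (cj_offset*GPU_NSUBCELL + ci)
 + * of imask flags i-subcell ci against the j-subcell at slot cj_offset of its
 + * cj4 group. A small worked example, assuming GPU_NSUBCELL is 8 as in the
 + * super/sub setup: cj_offset=2 and ci=5 set bit 2*8+5 = 21, i.e.
 + * imask |= (1U << 21).
 + */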
 +
 +/* Set all atom-pair exclusions from the topology stored in excl
 + * as masks in the pair-list for simple list i-entry nbl_ci
 + */
 +static void set_ci_top_excls(const nbnxn_search_t nbs,
 +                             nbnxn_pairlist_t *nbl,
 +                             gmx_bool diagRemoved,
 +                             int na_ci_2log,
 +                             int na_cj_2log,
 +                             const nbnxn_ci_t *nbl_ci,
 +                             const t_blocka *excl)
 +{
 +    const int *cell;
 +    int ci;
 +    int cj_ind_first,cj_ind_last;
 +    int cj_first,cj_last;
 +    int ndirect;
 +    int i,ai,aj,si,eind,ge,se;
 +    int found,cj_ind_0,cj_ind_1,cj_ind_m;
 +    int cj_m;
 +    gmx_bool Found_si;
 +    int si_ind;
 +    nbnxn_excl_t *nbl_excl;
 +    int inner_i,inner_e;
 +
 +    cell = nbs->cell;
 +
 +    if (nbl_ci->cj_ind_end == nbl_ci->cj_ind_start)
 +    {
 +        /* Empty list */
 +        return;
 +    }
 +
 +    ci = nbl_ci->ci;
 +
 +    cj_ind_first = nbl_ci->cj_ind_start;
 +    cj_ind_last  = nbl->ncj - 1;
 +
 +    cj_first = nbl->cj[cj_ind_first].cj;
 +    cj_last  = nbl->cj[cj_ind_last].cj;
 +
 +    /* Determine how many contiguous j-cells we have starting
 +     * from the first i-cell. This number can be used to directly
 +     * calculate j-cell indices for excluded atoms.
 +     */
 +    ndirect = 0;
 +    if (na_ci_2log == na_cj_2log)
 +    {
 +        while (cj_ind_first + ndirect <= cj_ind_last &&
 +               nbl->cj[cj_ind_first+ndirect].cj == ci + ndirect)
 +        {
 +            ndirect++;
 +        }
 +    }
 +#ifdef NBNXN_SEARCH_SSE
 +    else
 +    {
 +        while (cj_ind_first + ndirect <= cj_ind_last &&
 +               nbl->cj[cj_ind_first+ndirect].cj == ci_to_cj(na_cj_2log,ci) + ndirect)
 +        {
 +            ndirect++;
 +        }
 +    }
 +#endif
 +
 +    /* Loop over the atoms in the i super-cell */
 +    for(i=0; i<nbl->na_sc; i++)
 +    {
 +        ai = nbs->a[ci*nbl->na_sc+i];
 +        if (ai >= 0)
 +        {
 +            si  = (i>>na_ci_2log);
 +
 +            /* Loop over the topology-based exclusions for this i-atom */
 +            for(eind=excl->index[ai]; eind<excl->index[ai+1]; eind++)
 +            {
 +                aj = excl->a[eind];
 +
 +                if (aj == ai)
 +                {
 +                    /* The self exclusions are already set; save some time */
 +                    continue;
 +                }
 +
 +                ge = cell[aj];
 +
 +                /* Without shifts we only calculate interactions j>i
 +                 * for one-way pair-lists.
 +                 */
 +                if (diagRemoved && ge <= ci*nbl->na_sc + i)
 +                {
 +                    continue;
 +                }
 +
 +                se = (ge >> na_cj_2log);
 +
 +                /* Could the cluster se be in our list? */
 +                if (se >= cj_first && se <= cj_last)
 +                {
 +                    if (se < cj_first + ndirect)
 +                    {
 +                        /* We can calculate cj_ind directly from se */
 +                        found = cj_ind_first + se - cj_first;
 +                    }
 +                    else
 +                    {
 +                        /* Search for se using bisection */
 +                        found = -1;
 +                        cj_ind_0 = cj_ind_first + ndirect;
 +                        cj_ind_1 = cj_ind_last + 1;
 +                        while (found == -1 && cj_ind_0 < cj_ind_1)
 +                        {
 +                            cj_ind_m = (cj_ind_0 + cj_ind_1)>>1;
 +
 +                            cj_m = nbl->cj[cj_ind_m].cj;
 +
 +                            if (se == cj_m)
 +                            {
 +                                found = cj_ind_m;
 +                            }
 +                            else if (se < cj_m)
 +                            {
 +                                cj_ind_1 = cj_ind_m;
 +                            }
 +                            else
 +                            {
 +                                cj_ind_0 = cj_ind_m + 1;
 +                            }
 +                        }
 +                    }
 +
 +                    if (found >= 0)
 +                    {
 +                        inner_i = i  - (si << na_ci_2log);
 +                        inner_e = ge - (se << na_cj_2log);
 +
 +                        nbl->cj[found].excl &= ~(1U<<((inner_i<<na_cj_2log) + inner_e));
 +                    }
 +                }
 +            }
 +        }
 +    }
 +}
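 +
 +/* Note on the exclusion search above: within one i-entry the cj list is
 + * stored in increasing j-cluster order, which is what makes the bisection
 + * valid. An illustrative run with assumed values: searching se=7 in
 + * cj = {3,5,7,9} (cj_ind_0=0, cj_ind_1=4) probes the middle index 2 and
 + * finds cj_m == 7 immediately; when se is absent the loop ends with
 + * found == -1 and no exclusion bit is cleared.
 + */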
 +
 +/* Set all atom-pair exclusions from the topology stored in excl
 + * as masks in the pair-list for i-super-cell entry nbl_sci
 + */
 +static void set_sci_top_excls(const nbnxn_search_t nbs,
 +                              nbnxn_pairlist_t *nbl,
 +                              gmx_bool diagRemoved,
 +                              int na_c_2log,
 +                              const nbnxn_sci_t *nbl_sci,
 +                              const t_blocka *excl)
 +{
 +    const int *cell;
 +    int na_c;
 +    int sci;
 +    int cj_ind_first,cj_ind_last;
 +    int cj_first,cj_last;
 +    int ndirect;
 +    int i,ai,aj,si,eind,ge,se;
 +    int found,cj_ind_0,cj_ind_1,cj_ind_m;
 +    int cj_m;
 +    gmx_bool Found_si;
 +    int si_ind;
 +    nbnxn_excl_t *nbl_excl;
 +    int inner_i,inner_e,w;
 +
 +    cell = nbs->cell;
 +
 +    na_c = nbl->na_ci;
 +
 +    if (nbl_sci->cj4_ind_end == nbl_sci->cj4_ind_start)
 +    {
 +        /* Empty list */
 +        return;
 +    }
 +
 +    sci = nbl_sci->sci;
 +
 +    cj_ind_first = nbl_sci->cj4_ind_start*NBNXN_GPU_JGROUP_SIZE;
 +    cj_ind_last  = nbl->work->cj_ind - 1;
 +
 +    cj_first = nbl->cj4[nbl_sci->cj4_ind_start].cj[0];
 +    cj_last  = nbl_cj(nbl,cj_ind_last);
 +
 +    /* Determine how many contiguous j-clusters we have starting
 +     * from the first i-cluster. This number can be used to directly
 +     * calculate j-cluster indices for excluded atoms.
 +     */
 +    ndirect = 0;
 +    while (cj_ind_first + ndirect <= cj_ind_last &&
 +           nbl_cj(nbl,cj_ind_first+ndirect) == sci*GPU_NSUBCELL + ndirect)
 +    {
 +        ndirect++;
 +    }
 +
 +    /* Loop over the atoms in the i super-cell */
 +    for(i=0; i<nbl->na_sc; i++)
 +    {
 +        ai = nbs->a[sci*nbl->na_sc+i];
 +        if (ai >= 0)
 +        {
 +            si  = (i>>na_c_2log);
 +
 +            /* Loop over the topology-based exclusions for this i-atom */
 +            for(eind=excl->index[ai]; eind<excl->index[ai+1]; eind++)
 +            {
 +                aj = excl->a[eind];
 +
 +                if (aj == ai)
 +                {
 +                    /* The self exclusions are already set; save some time */
 +                    continue;
 +                }
 +
 +                ge = cell[aj];
 +
 +                /* Without shifts we only calculate interactions j>i
 +                 * for one-way pair-lists.
 +                 */
 +                if (diagRemoved && ge <= sci*nbl->na_sc + i)
 +                {
 +                    continue;
 +                }
 +
 +                se = ge>>na_c_2log;
 +                /* Could the cluster se be in our list? */
 +                if (se >= cj_first && se <= cj_last)
 +                {
 +                    if (se < cj_first + ndirect)
 +                    {
 +                        /* We can calculate cj_ind directly from se */
 +                        found = cj_ind_first + se - cj_first;
 +                    }
 +                    else
 +                    {
 +                        /* Search for se using bisection */
 +                        found = -1;
 +                        cj_ind_0 = cj_ind_first + ndirect;
 +                        cj_ind_1 = cj_ind_last + 1;
 +                        while (found == -1 && cj_ind_0 < cj_ind_1)
 +                        {
 +                            cj_ind_m = (cj_ind_0 + cj_ind_1)>>1;
 +
 +                            cj_m = nbl_cj(nbl,cj_ind_m);
 +
 +                            if (se == cj_m)
 +                            {
 +                                found = cj_ind_m;
 +                            }
 +                            else if (se < cj_m)
 +                            {
 +                                cj_ind_1 = cj_ind_m;
 +                            }
 +                            else
 +                            {
 +                                cj_ind_0 = cj_ind_m + 1;
 +                            }
 +                        }
 +                    }
 +
 +                    if (found >= 0)
 +                    {
 +                        inner_i = i  - si*na_c;
 +                        inner_e = ge - se*na_c;
 +
 +/* Macro for getting the index of atom a within a cluster */
 +#define AMODI(a)  ((a) & (NBNXN_CPU_CLUSTER_I_SIZE - 1))
 +/* Macro for converting an atom number to a cluster number */
 +#define A2CI(a)   ((a) >> NBNXN_CPU_CLUSTER_I_SIZE_2LOG)
 +
 +                        if (nbl_imask0(nbl,found) & (1U << (AMODI(found)*GPU_NSUBCELL + si)))
 +                        {
 +                            w       = (inner_e >> 2);
 +
 +                            get_nbl_exclusions_1(nbl,A2CI(found),w,&nbl_excl);
 +
 +                            nbl_excl->pair[AMODI(inner_e)*nbl->na_ci+inner_i] &=
 +                                ~(1U << (AMODI(found)*GPU_NSUBCELL + si));
 +                        }
 +
 +#undef AMODI
 +#undef A2CI
 +                    }
 +                }
 +            }
 +        }
 +    }
 +}
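 +
 +/* Note on the AMODI/A2CI macros used above: they assume
 + * NBNXN_CPU_CLUSTER_I_SIZE is a power of two (4 in this code), so the
 + * bit-mask and shift are exact. Illustration: AMODI(13) = 13 & 3 = 1 and
 + * A2CI(13) = 13 >> 2 = 3, i.e. index 13 is slot 1 of group 3.
 + */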
 +
 +/* Reallocate the simple ci list for at least n entries */
 +static void nb_realloc_ci(nbnxn_pairlist_t *nbl,int n)
 +{
 +    nbl->ci_nalloc = over_alloc_small(n);
 +    nbnxn_realloc_void((void **)&nbl->ci,
 +                       nbl->nci*sizeof(*nbl->ci),
 +                       nbl->ci_nalloc*sizeof(*nbl->ci),
 +                       nbl->alloc,nbl->free);
 +}
 +
 +/* Reallocate the super-cell sci list for at least n entries */
 +static void nb_realloc_sci(nbnxn_pairlist_t *nbl,int n)
 +{
 +    nbl->sci_nalloc = over_alloc_small(n);
 +    nbnxn_realloc_void((void **)&nbl->sci,
 +                       nbl->nsci*sizeof(*nbl->sci),
 +                       nbl->sci_nalloc*sizeof(*nbl->sci),
 +                       nbl->alloc,nbl->free);
 +}
 +
 +/* Make a new ci entry at index nbl->nci */
 +static void new_ci_entry(nbnxn_pairlist_t *nbl,int ci,int shift,int flags,
 +                         nbnxn_list_work_t *work)
 +{
 +    if (nbl->nci + 1 > nbl->ci_nalloc)
 +    {
 +        nb_realloc_ci(nbl,nbl->nci+1);
 +    }
 +    nbl->ci[nbl->nci].ci            = ci;
 +    nbl->ci[nbl->nci].shift         = shift;
 +    /* Store the interaction flags along with the shift */
 +    nbl->ci[nbl->nci].shift        |= flags;
 +    nbl->ci[nbl->nci].cj_ind_start  = nbl->ncj;
 +    nbl->ci[nbl->nci].cj_ind_end    = nbl->ncj;
 +}
 +
 +/* Make a new sci entry at index nbl->nsci */
 +static void new_sci_entry(nbnxn_pairlist_t *nbl,int sci,int shift,int flags,
 +                          nbnxn_list_work_t *work)
 +{
 +    if (nbl->nsci + 1 > nbl->sci_nalloc)
 +    {
 +        nb_realloc_sci(nbl,nbl->nsci+1);
 +    }
 +    nbl->sci[nbl->nsci].sci           = sci;
 +    nbl->sci[nbl->nsci].shift         = shift;
 +    nbl->sci[nbl->nsci].cj4_ind_start = nbl->ncj4;
 +    nbl->sci[nbl->nsci].cj4_ind_end   = nbl->ncj4;
 +}
 +
 +/* Sort the simple j-list cj on exclusions.
 + * Entries with exclusions will all be sorted to the beginning of the list.
 + */
 +static void sort_cj_excl(nbnxn_cj_t *cj,int ncj,
 +                         nbnxn_list_work_t *work)
 +{
 +    int jnew,j;
 +
 +    if (ncj > work->cj_nalloc)
 +    {
 +        work->cj_nalloc = over_alloc_large(ncj);
 +        srenew(work->cj,work->cj_nalloc);
 +    }
 +
 +    /* Make a list of the j-cells involving exclusions */
 +    jnew = 0;
 +    for(j=0; j<ncj; j++)
 +    {
 +        if (cj[j].excl != NBNXN_INT_MASK_ALL)
 +        {
 +            work->cj[jnew++] = cj[j];
 +        }
 +    }
 +    /* Only reorder if there are exclusions and they are not already just the first entry */
 +    if (!((jnew == 0) ||
 +          (jnew == 1 && cj[0].excl != NBNXN_INT_MASK_ALL)))
 +    {
 +        for(j=0; j<ncj; j++)
 +        {
 +            if (cj[j].excl == NBNXN_INT_MASK_ALL)
 +            {
 +                work->cj[jnew++] = cj[j];
 +            }
 +        }
 +        for(j=0; j<ncj; j++)
 +        {
 +            cj[j] = work->cj[j];
 +        }
 +    }
 +}
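 +
 +/* Illustration of the partition above, with hypothetical entries: for
 + * cj = {A, B*, C, D*}, where * marks entries with exclusions, the first
 + * pass collects {B*, D*} in the work array, the second appends {A, C},
 + * giving {B*, D*, A, C}. Order within each class is preserved, so this is
 + * a stable two-pass partition rather than a full sort.
 + */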
 +
 +/* Close this simple list i entry */
 +static void close_ci_entry_simple(nbnxn_pairlist_t *nbl)
 +{
 +    int jlen;
 +
 +    /* All content of the new ci entry has already been filled correctly;
 +     * we only need to increase the count here (for non-empty lists).
 +     */
 +    jlen = nbl->ci[nbl->nci].cj_ind_end - nbl->ci[nbl->nci].cj_ind_start;
 +    if (jlen > 0)
 +    {
 +        sort_cj_excl(nbl->cj+nbl->ci[nbl->nci].cj_ind_start,jlen,nbl->work);
 +
 +        if (nbl->ci[nbl->nci].shift & NBNXN_CI_HALF_LJ(0))
 +        {
 +            nbl->work->ncj_hlj += jlen;
 +        }
 +        else if (!(nbl->ci[nbl->nci].shift & NBNXN_CI_DO_COUL(0)))
 +        {
 +            nbl->work->ncj_noq += jlen;
 +        }
 +
 +        nbl->nci++;
 +    }
 +}
 +
 +/* Split sci entry for load balancing on the GPU.
 + * As we only know the current count on our own thread,
 + * we need to estimate the current total number of i-entries.
 + * As the lists get concatenated later, this estimate depends
 + * both on nthread and our own thread index thread.
 + */
 +static void split_sci_entry(nbnxn_pairlist_t *nbl,
 +                            int nsp_max_av,gmx_bool progBal,int nc_bal,
 +                            int thread,int nthread)
 +{
 +    int nsci_est;
 +    int nsp_max;
 +    int cj4_start,cj4_end,j4len,cj4;
 +    int sci;
 +    int nsp,nsp_sci,nsp_cj4,nsp_cj4_e,nsp_cj4_p;
 +    int p;
 +
 +    /* Estimate the total number of ci's of the nblist combined
 +     * over all threads using the target number of ci's.
 +     */
 +    nsci_est = nc_bal*thread/nthread + nbl->nsci;
 +    if (progBal)
 +    {
 +        /* The first ci blocks should be larger, to avoid overhead.
 +         * The last ci blocks should be smaller, to improve load balancing.
 +         */
 +        nsp_max = max(1,
 +                      nsp_max_av*nc_bal*3/(2*(nsci_est - 1 + nc_bal)));
 +    }
 +    else
 +    {
 +        nsp_max = nsp_max_av;
 +    }
 +
 +    cj4_start = nbl->sci[nbl->nsci-1].cj4_ind_start;
 +    cj4_end   = nbl->sci[nbl->nsci-1].cj4_ind_end;
 +    j4len = cj4_end - cj4_start;
 +
 +    if (j4len > 1 && j4len*GPU_NSUBCELL*NBNXN_GPU_JGROUP_SIZE > nsp_max)
 +    {
 +        /* Remove the last ci entry and process the cj4's again */
 +        nbl->nsci -= 1;
 +
 +        sci        = nbl->nsci;
 +        cj4        = cj4_start;
 +        nsp        = 0;
 +        nsp_sci    = 0;
 +        nsp_cj4_e  = 0;
 +        nsp_cj4    = 0;
 +        while (cj4 < cj4_end)
 +        {
 +            nsp_cj4_p = nsp_cj4;
 +            nsp_cj4   = 0;
 +            for(p=0; p<GPU_NSUBCELL*NBNXN_GPU_JGROUP_SIZE; p++)
 +            {
 +                nsp_cj4 += (nbl->cj4[cj4].imei[0].imask >> p) & 1;
 +            }
 +            nsp += nsp_cj4;
 +
 +            if (nsp > nsp_max && nsp > nsp_cj4)
 +            {
 +                nbl->sci[sci].cj4_ind_end = cj4;
 +                sci++;
 +                nbl->nsci++;
 +                if (nbl->nsci+1 > nbl->sci_nalloc)
 +                {
 +                    nb_realloc_sci(nbl,nbl->nsci+1);
 +                }
 +                nbl->sci[sci].sci           = nbl->sci[nbl->nsci-1].sci;
 +                nbl->sci[sci].shift         = nbl->sci[nbl->nsci-1].shift;
 +                nbl->sci[sci].cj4_ind_start = cj4;
 +                nsp_sci   = nsp - nsp_cj4;
 +                nsp_cj4_e = nsp_cj4_p;
 +                nsp       = nsp_cj4;
 +            }
 +
 +            cj4++;
 +        }
 +
 +        /* Put the remaining cj4's in a new ci entry */
 +        nbl->sci[sci].cj4_ind_end = cj4_end;
 +
 +        /* Possibly balance out the last two ci's
 +         * by moving the last cj4 of the second last ci.
 +         */
 +        if (nsp_sci - nsp_cj4_e >= nsp + nsp_cj4_e)
 +        {
 +            nbl->sci[sci-1].cj4_ind_end--;
 +            nbl->sci[sci].cj4_ind_start--;
 +        }
 +
 +        sci++;
 +        nbl->nsci++;
 +    }
 +}
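 +
 +/* A worked example of the progressive-balancing cap above, with assumed
 + * numbers: nsp_max_av=64, nc_bal=100 and nsci_est=50 give
 + * nsp_max = max(1, 64*100*3/(2*(50-1+100))) = 19200/298 = 64;
 + * at nsci_est=150 the same formula gives 19200/498 = 38, so later
 + * (smaller) blocks are split more aggressively, as intended.
 + */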
 +
 +/* Close this super/sub list i entry */
 +static void close_ci_entry_supersub(nbnxn_pairlist_t *nbl,
 +                                    int nsp_max_av,
 +                                    gmx_bool progBal,int nc_bal,
 +                                    int thread,int nthread)
 +{
 +    int j4len,tlen;
 +    int nb,b;
 +
 +    /* All content of the new ci entry has already been filled correctly;
 +     * we only need to increase the count here (for non-empty lists).
 +     */
 +    j4len = nbl->sci[nbl->nsci].cj4_ind_end - nbl->sci[nbl->nsci].cj4_ind_start;
 +    if (j4len > 0)
 +    {
 +        /* We can only have complete blocks of 4 j-entries in a list,
 +         * so round the count up before closing.
 +         */
 +        nbl->ncj4         = ((nbl->work->cj_ind + 4-1) >> 2);
 +        nbl->work->cj_ind = nbl->ncj4*NBNXN_GPU_JGROUP_SIZE;
 +
 +        nbl->nsci++;
 +
 +        if (nsp_max_av > 0)
 +        {
 +            split_sci_entry(nbl,nsp_max_av,progBal,nc_bal,thread,nthread);
 +        }
 +    }
 +}
 +
 +/* Syncs the working array before adding another grid pair to the list */
 +static void sync_work(nbnxn_pairlist_t *nbl)
 +{
 +    if (!nbl->bSimple)
 +    {
 +        nbl->work->cj_ind   = nbl->ncj4*NBNXN_GPU_JGROUP_SIZE;
 +        nbl->work->cj4_init = nbl->ncj4;
 +    }
 +}
 +
 +/* Clears an nbnxn_pairlist_t data structure */
 +static void clear_pairlist(nbnxn_pairlist_t *nbl)
 +{
 +    nbl->nci           = 0;
 +    nbl->nsci          = 0;
 +    nbl->ncj           = 0;
 +    nbl->ncj4          = 0;
 +    nbl->nci_tot       = 0;
 +    nbl->nexcl         = 1;
 +
 +    nbl->work->ncj_noq = 0;
 +    nbl->work->ncj_hlj = 0;
 +}
 +
 +/* Sets a simple list i-cell bounding box, including PBC shift */
 +static void set_icell_bb_simple(const float *bb,int ci,
 +                                real shx,real shy,real shz,
 +                                float *bb_ci)
 +{
 +    int ia;
 +
 +    ia = ci*NNBSBB_B;
 +    bb_ci[BBL_X] = bb[ia+BBL_X] + shx;
 +    bb_ci[BBL_Y] = bb[ia+BBL_Y] + shy;
 +    bb_ci[BBL_Z] = bb[ia+BBL_Z] + shz;
 +    bb_ci[BBU_X] = bb[ia+BBU_X] + shx;
 +    bb_ci[BBU_Y] = bb[ia+BBU_Y] + shy;
 +    bb_ci[BBU_Z] = bb[ia+BBU_Z] + shz;
 +}
 +
 +/* Sets the super-cell and sub-cell bounding boxes, including PBC shift */
 +static void set_icell_bb_supersub(const float *bb,int ci,
 +                                  real shx,real shy,real shz,
 +                                  float *bb_ci)
 +{
 +    int ia,m,i;
 +
 +#ifdef NBNXN_BBXXXX
 +    ia = ci*(GPU_NSUBCELL>>STRIDE_8BB_2LOG)*NNBSBB_XXXX;
 +    for(m=0; m<(GPU_NSUBCELL>>STRIDE_8BB_2LOG)*NNBSBB_XXXX; m+=NNBSBB_XXXX)
 +    {
 +        for(i=0; i<STRIDE_8BB; i++)
 +        {
 +            bb_ci[m+0*STRIDE_8BB+i] = bb[ia+m+0*STRIDE_8BB+i] + shx;
 +            bb_ci[m+1*STRIDE_8BB+i] = bb[ia+m+1*STRIDE_8BB+i] + shy;
 +            bb_ci[m+2*STRIDE_8BB+i] = bb[ia+m+2*STRIDE_8BB+i] + shz;
 +            bb_ci[m+3*STRIDE_8BB+i] = bb[ia+m+3*STRIDE_8BB+i] + shx;
 +            bb_ci[m+4*STRIDE_8BB+i] = bb[ia+m+4*STRIDE_8BB+i] + shy;
 +            bb_ci[m+5*STRIDE_8BB+i] = bb[ia+m+5*STRIDE_8BB+i] + shz;
 +        }
 +    }
 +#else
 +    ia = ci*GPU_NSUBCELL*NNBSBB_B;
 +    for(i=0; i<GPU_NSUBCELL*NNBSBB_B; i+=NNBSBB_B)
 +    {
 +        bb_ci[i+BBL_X] = bb[ia+i+BBL_X] + shx;
 +        bb_ci[i+BBL_Y] = bb[ia+i+BBL_Y] + shy;
 +        bb_ci[i+BBL_Z] = bb[ia+i+BBL_Z] + shz;
 +        bb_ci[i+BBU_X] = bb[ia+i+BBU_X] + shx;
 +        bb_ci[i+BBU_Y] = bb[ia+i+BBU_Y] + shy;
 +        bb_ci[i+BBU_Z] = bb[ia+i+BBU_Z] + shz;
 +    }
 +#endif
 +}
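 +
 +/* Note on the NBNXN_BBXXXX layout handled above: bounding boxes are packed
 + * per coordinate in groups of STRIDE_8BB (8) boxes
 + * (xlo[8] ylo[8] zlo[8] xhi[8] yhi[8] zhi[8]), so the SSE distance code can
 + * load one coordinate of several boxes at once; the #else branch keeps the
 + * plain per-box (xlo,ylo,zlo,xhi,yhi,zhi) layout.
 + */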
 +
 +/* Copies PBC shifted i-cell atom coordinates x,y,z to working array */
 +static void icell_set_x_simple(int ci,
 +                               real shx,real shy,real shz,
 +                               int na_c,
 +                               int stride,const real *x,
 +                               nbnxn_list_work_t *work)
 +{
 +    int  ia,i;
 +
 +    ia = ci*NBNXN_CPU_CLUSTER_I_SIZE;
 +
 +    for(i=0; i<NBNXN_CPU_CLUSTER_I_SIZE; i++)
 +    {
 +        work->x_ci[i*STRIDE_XYZ+XX] = x[(ia+i)*stride+XX] + shx;
 +        work->x_ci[i*STRIDE_XYZ+YY] = x[(ia+i)*stride+YY] + shy;
 +        work->x_ci[i*STRIDE_XYZ+ZZ] = x[(ia+i)*stride+ZZ] + shz;
 +    }
 +}
 +
 +/* Copies PBC shifted super-cell atom coordinates x,y,z to working array */
 +static void icell_set_x_supersub(int ci,
 +                                 real shx,real shy,real shz,
 +                                 int na_c,
 +                                 int stride,const real *x,
 +                                 nbnxn_list_work_t *work)
 +{
 +    int  ia,i;
 +    real *x_ci;
 +
 +    x_ci = work->x_ci;
 +
 +    ia = ci*GPU_NSUBCELL*na_c;
 +    for(i=0; i<GPU_NSUBCELL*na_c; i++)
 +    {
 +        x_ci[i*DIM + XX] = x[(ia+i)*stride + XX] + shx;
 +        x_ci[i*DIM + YY] = x[(ia+i)*stride + YY] + shy;
 +        x_ci[i*DIM + ZZ] = x[(ia+i)*stride + ZZ] + shz;
 +    }
 +}
 +
 +#ifdef NBNXN_SEARCH_SSE
 +/* Copies PBC shifted super-cell packed atom coordinates to working array */
 +static void icell_set_x_supersub_sse8(int ci,
 +                                      real shx,real shy,real shz,
 +                                      int na_c,
 +                                      int stride,const real *x,
 +                                      nbnxn_list_work_t *work)
 +{
 +    int  si,io,ia,i,j;
 +    real *x_ci;
 +
 +    x_ci = work->x_ci;
 +
 +    for(si=0; si<GPU_NSUBCELL; si++)
 +    {
 +        for(i=0; i<na_c; i+=STRIDE_8BB)
 +        {
 +            io = si*na_c + i;
 +            ia = ci*GPU_NSUBCELL*na_c + io;
 +            for(j=0; j<STRIDE_8BB; j++)
 +            {
 +                x_ci[io*DIM + j + XX*STRIDE_8BB] = x[(ia+j)*stride+XX] + shx;
 +                x_ci[io*DIM + j + YY*STRIDE_8BB] = x[(ia+j)*stride+YY] + shy;
 +                x_ci[io*DIM + j + ZZ*STRIDE_8BB] = x[(ia+j)*stride+ZZ] + shz;
 +            }
 +        }
 +    }
 +}
 +#endif
 +
 +static real nbnxn_rlist_inc_nonloc_fac = 0.6;
 +
 +/* Due to the cluster size the effective pair-list cut-off is longer than
 + * that of a simple atom pair-list. This function gives the extra distance.
 + */
 +real nbnxn_get_rlist_effective_inc(int cluster_size,real atom_density)
 +{
 +    return ((0.5 + nbnxn_rlist_inc_nonloc_fac)*sqr(((cluster_size) - 1.0)/(cluster_size))*pow((cluster_size)/(atom_density),1.0/3.0));
 +}
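 +
 +/* A worked example of the increment above, with assumed inputs:
 + * cluster_size=4 and atom_density=100 nm^-3 give
 + * (0.5+0.6)*(3/4)^2*(4/100)^(1/3) = 1.1*0.5625*0.342, which is about 0.21,
 + * i.e. the effective list cut-off is roughly 0.2 nm longer than for an
 + * atom pair-list at the same rlist.
 + */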
 +
 +/* Estimates the interaction volume^2 for non-local interactions */
 +static real nonlocal_vol2(const gmx_domdec_zones_t *zones,rvec ls,real r)
 +{
 +    int  z,d;
 +    real cl,ca,za;
 +    real vold_est;
 +    real vol2_est_tot;
 +
 +    vol2_est_tot = 0;
 +
 +    /* Here we simply add up the non-home interaction volume^2 of the
 +     * 1, 2 or 3 zones shifted along one dimension. As these volumes are not additive,
 +     * this is an overestimate, but it would only be significant in the limit
 +     * of small cells, where we anyhow need to split the lists into
 +     * as small parts as possible.
 +     */
 +
 +    for(z=0; z<zones->n; z++)
 +    {
 +        if (zones->shift[z][XX] + zones->shift[z][YY] + zones->shift[z][ZZ] == 1)
 +        {
 +            cl = 0;
 +            ca = 1;
 +            za = 1;
 +            for(d=0; d<DIM; d++)
 +            {
 +                if (zones->shift[z][d] == 0)
 +                {
 +                    cl += 0.5*ls[d];
 +                    ca *= ls[d];
 +                    za *= zones->size[z].x1[d] - zones->size[z].x0[d];
 +                }
 +            }
 +
 +            /* 4 octants of a sphere */
 +            vold_est  = 0.25*M_PI*r*r*r*r;
 +            /* 4 quarter pie slices on the edges */
 +            vold_est += 4*cl*M_PI/6.0*r*r*r;
 +            /* One rectangular volume on a face */
 +            vold_est += ca*0.5*r*r;
 +
 +            vol2_est_tot += vold_est*za;
 +        }
 +    }
 +
 +    return vol2_est_tot;
 +}
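 +
 +/* Note on the zone selection above: only zones whose shift has exactly one
 + * non-zero component (the shifts sum to 1) enter the sum, which is 1, 2 or
 + * 3 zones for a 1D, 2D or 3D domain decomposition; this is what the
 + * "1, 2 or 3" in the opening comment refers to.
 + */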
 +
 +/* Estimates the average size of a full j-list for super/sub setup */
 +static int get_nsubpair_max(const nbnxn_search_t nbs,
 +                            int iloc,
 +                            real rlist,
 +                            int min_ci_balanced)
 +{
 +    const nbnxn_grid_t *grid;
 +    rvec ls;
 +    real xy_diag2,r_eff_sup,vol_est,nsp_est,nsp_est_nl;
 +    int  nsubpair_max;
 +
 +    grid = &nbs->grid[0];
 +
 +    ls[XX] = (grid->c1[XX] - grid->c0[XX])/(grid->ncx*GPU_NSUBCELL_X);
 +    ls[YY] = (grid->c1[YY] - grid->c0[YY])/(grid->ncy*GPU_NSUBCELL_Y);
 +    ls[ZZ] = (grid->c1[ZZ] - grid->c0[ZZ])*grid->ncx*grid->ncy/(grid->nc*GPU_NSUBCELL_Z);
 +
 +    /* The average squared length of the diagonal of a sub cell */
 +    xy_diag2 = ls[XX]*ls[XX] + ls[YY]*ls[YY] + ls[ZZ]*ls[ZZ];
 +
 +    /* The formulas below are a heuristic estimate of the average nsj per si */
 +    r_eff_sup = rlist + nbnxn_rlist_inc_nonloc_fac*sqr((grid->na_c - 1.0)/grid->na_c)*sqrt(xy_diag2/3);
 +
 +    if (!nbs->DomDec || nbs->zones->n == 1)
 +    {
 +        nsp_est_nl = 0;
 +    }
 +    else
 +    {
 +        nsp_est_nl =
 +            sqr(grid->atom_density/grid->na_c)*
 +            nonlocal_vol2(nbs->zones,ls,r_eff_sup);
 +    }
 +
 +    if (LOCAL_I(iloc))
 +    {
 +        /* Sub-cell interacts with itself */
 +        vol_est  = ls[XX]*ls[YY]*ls[ZZ];
 +        /* 6/2 rectangular volume on the faces */
 +        vol_est += (ls[XX]*ls[YY] + ls[XX]*ls[ZZ] + ls[YY]*ls[ZZ])*r_eff_sup;
 +        /* 12/2 quarter pie slices on the edges */
 +        vol_est += 2*(ls[XX] + ls[YY] + ls[ZZ])*0.25*M_PI*sqr(r_eff_sup);
 +        /* 4 octants of a sphere */
 +        vol_est += 0.5*4.0/3.0*M_PI*pow(r_eff_sup,3);
 +
 +        nsp_est = grid->nsubc_tot*vol_est*grid->atom_density/grid->na_c;
 +
 +        /* Subtract the non-local pair count */
 +        nsp_est -= nsp_est_nl;
 +
 +        if (debug)
 +        {
 +            fprintf(debug,"nsp_est local %5.1f non-local %5.1f\n",
 +                    nsp_est,nsp_est_nl);
 +        }
 +    }
 +    else
 +    {
 +        nsp_est = nsp_est_nl;
 +    }
 +
 +    if (min_ci_balanced <= 0 || grid->nc >= min_ci_balanced || grid->nc == 0)
 +    {
 +        /* We don't need to worry */
 +        nsubpair_max = -1;
 +    }
 +    else
 +    {
 +        /* Thus the (average) maximum j-list size should be as follows */
 +        nsubpair_max = max(1,(int)(nsp_est/min_ci_balanced+0.5));
 +
 +        /* Since the target value is a maximum (this avoids high outliers,
 +         * which lead to load imbalance), not an average, we get more lists
 +         * than we ask for (to compensate we need to add GPU_NSUBCELL*4/4).
 +         * But more importantly, the optimal GPU performance moves
 +         * to a lower number of blocks for very small blocks.
 +         * To compensate we add the maximum pair count per cj4.
 +         */
 +        nsubpair_max += GPU_NSUBCELL*NBNXN_CPU_CLUSTER_I_SIZE;
 +    }
 +
 +    if (debug)
 +    {
 +        fprintf(debug,"nbl nsp estimate %.1f, nsubpair_max %d\n",
 +                nsp_est,nsubpair_max);
 +    }
 +
 +    return nsubpair_max;
 +}
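 +
 +/* A worked example of the estimate above, purely illustrative: with
 + * nsp_est=20000, min_ci_balanced=400 and a grid smaller than that,
 + * nsubpair_max = max(1, (int)(20000/400 + 0.5)) + GPU_NSUBCELL*NBNXN_CPU_CLUSTER_I_SIZE
 + *              = 50 + 8*4 = 82, assuming the usual sizes of 8 sub-cells
 + * per super-cell and 4 atoms per CPU i-cluster.
 + */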
 +
 +/* Debug list print function */
 +static void print_nblist_ci_cj(FILE *fp,const nbnxn_pairlist_t *nbl)
 +{
 +    int i,j;
 +
 +    for(i=0; i<nbl->nci; i++)
 +    {
 +        fprintf(fp,"ci %4d  shift %2d  ncj %3d\n",
 +                nbl->ci[i].ci,nbl->ci[i].shift,
 +                nbl->ci[i].cj_ind_end - nbl->ci[i].cj_ind_start);
 +
 +        for(j=nbl->ci[i].cj_ind_start; j<nbl->ci[i].cj_ind_end; j++)
 +        {
 +            fprintf(fp,"  cj %5d  imask %x\n",
 +                    nbl->cj[j].cj,
 +                    nbl->cj[j].excl);
 +        }
 +    }
 +}
 +
 +/* Debug list print function */
 +static void print_nblist_sci_cj(FILE *fp,const nbnxn_pairlist_t *nbl)
 +{
 +    int i,j4,j;
 +
 +    for(i=0; i<nbl->nsci; i++)
 +    {
 +        fprintf(fp,"ci %4d  shift %2d  ncj4 %2d\n",
 +                nbl->sci[i].sci,nbl->sci[i].shift,
 +                nbl->sci[i].cj4_ind_end - nbl->sci[i].cj4_ind_start);
 +
 +        for(j4=nbl->sci[i].cj4_ind_start; j4<nbl->sci[i].cj4_ind_end; j4++)
 +        {
 +            for(j=0; j<4; j++)
 +            {
 +                fprintf(fp,"  sj %5d  imask %x\n",
 +                        nbl->cj4[j4].cj[j],
 +                        nbl->cj4[j4].imei[0].imask);
 +            }
 +        }
 +    }
 +}
 +
 +/* Combine the pair lists *nbl generated on multiple threads into nblc */
 +static void combine_nblists(int nnbl,nbnxn_pairlist_t **nbl,
 +                            nbnxn_pairlist_t *nblc)
 +{
 +    int nsci,ncj4,nexcl;
 +    int n,i;
 +
 +    if (nblc->bSimple)
 +    {
 +        gmx_incons("combine_nblists does not support simple lists");
 +    }
 +
 +    nsci  = nblc->nsci;
 +    ncj4  = nblc->ncj4;
 +    nexcl = nblc->nexcl;
 +    for(i=0; i<nnbl; i++)
 +    {
 +        nsci  += nbl[i]->nsci;
 +        ncj4  += nbl[i]->ncj4;
 +        nexcl += nbl[i]->nexcl;
 +    }
 +
 +    if (nsci > nblc->sci_nalloc)
 +    {
 +        nb_realloc_sci(nblc,nsci);
 +    }
 +    if (ncj4 > nblc->cj4_nalloc)
 +    {
 +        nblc->cj4_nalloc = over_alloc_small(ncj4);
 +        nbnxn_realloc_void((void **)&nblc->cj4,
 +                           nblc->ncj4*sizeof(*nblc->cj4),
 +                           nblc->cj4_nalloc*sizeof(*nblc->cj4),
 +                           nblc->alloc,nblc->free);
 +    }
 +    if (nexcl > nblc->excl_nalloc)
 +    {
 +        nblc->excl_nalloc = over_alloc_small(nexcl);
 +        nbnxn_realloc_void((void **)&nblc->excl,
 +                           nblc->nexcl*sizeof(*nblc->excl),
 +                           nblc->excl_nalloc*sizeof(*nblc->excl),
 +                           nblc->alloc,nblc->free);
 +    }
 +
 +    /* Each thread should copy its own data to the combined arrays,
 +     * as otherwise data will go back and forth between different caches.
 +     */
 +#pragma omp parallel for num_threads(gmx_omp_nthreads_get(emntPairsearch)) schedule(static)
 +    for(n=0; n<nnbl; n++)
 +    {
 +        int sci_offset;
 +        int cj4_offset;
 +        int ci_offset;
 +        int excl_offset;
 +        int i,j4;
 +        const nbnxn_pairlist_t *nbli;
 +
 +        /* Determine the offset in the combined data for our thread */
 +        sci_offset  = nblc->nsci;
 +        cj4_offset  = nblc->ncj4;
 +        ci_offset   = nblc->nci_tot;
 +        excl_offset = nblc->nexcl;
 +
 +        for(i=0; i<n; i++)
 +        {
 +            sci_offset  += nbl[i]->nsci;
 +            cj4_offset  += nbl[i]->ncj4;
 +            ci_offset   += nbl[i]->nci_tot;
 +            excl_offset += nbl[i]->nexcl;
 +        }
 +
 +        nbli = nbl[n];
 +
 +        for(i=0; i<nbli->nsci; i++)
 +        {
 +            nblc->sci[sci_offset+i]                = nbli->sci[i];
 +            nblc->sci[sci_offset+i].cj4_ind_start += cj4_offset;
 +            nblc->sci[sci_offset+i].cj4_ind_end   += cj4_offset;
 +        }
 +
 +        for(j4=0; j4<nbli->ncj4; j4++)
 +        {
 +            nblc->cj4[cj4_offset+j4] = nbli->cj4[j4];
 +            nblc->cj4[cj4_offset+j4].imei[0].excl_ind += excl_offset;
 +            nblc->cj4[cj4_offset+j4].imei[1].excl_ind += excl_offset;
 +        }
 +
 +        for(j4=0; j4<nbli->nexcl; j4++)
 +        {
 +            nblc->excl[excl_offset+j4] = nbli->excl[j4];
 +        }
 +    }
 +
 +    for(n=0; n<nnbl; n++)
 +    {
 +        nblc->nsci    += nbl[n]->nsci;
 +        nblc->ncj4    += nbl[n]->ncj4;
 +        nblc->nci_tot += nbl[n]->nci_tot;
 +        nblc->nexcl   += nbl[n]->nexcl;
 +    }
 +}
 +
 +/* Returns the next ci to be processed by our thread */
 +static gmx_bool next_ci(const nbnxn_grid_t *grid,
 +                        int conv,
 +                        int nth,int ci_block,
 +                        int *ci_x,int *ci_y,
 +                        int *ci_b,int *ci)
 +{
 +    (*ci_b)++;
 +    (*ci)++;
 +
 +    if (*ci_b == ci_block)
 +    {
 +        /* Jump to the next block assigned to this task */
 +        *ci   += (nth - 1)*ci_block;
 +        *ci_b  = 0;
 +    }
 +
 +    if (*ci >= grid->nc*conv)
 +    {
 +        return FALSE;
 +    }
 +
 +    while (*ci >= grid->cxy_ind[*ci_x*grid->ncy + *ci_y + 1]*conv)
 +    {
 +        *ci_y += 1;
 +        if (*ci_y == grid->ncy)
 +        {
 +            *ci_x += 1;
 +            *ci_y  = 0;
 +        }
 +    }
 +
 +    return TRUE;
 +}
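 +
 +/* Illustration of the block-cyclic walk above, with assumed nth=4 and
 + * ci_block=2: thread 0 visits cells 0,1, 8,9, 16,17, ... and thread 1
 + * visits 2,3, 10,11, ...; after each block the index jumps ahead by
 + * (nth - 1)*ci_block = 6. The while loop at the end keeps (ci_x,ci_y)
 + * consistent with the new cell index.
 + */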
 +
 +/* Returns the distance^2 for which we put cell pairs in the list
 + * without checking atom pair distances. This is usually < rlist^2.
 + */
 +static float boundingbox_only_distance2(const nbnxn_grid_t *gridi,
 +                                        const nbnxn_grid_t *gridj,
 +                                        real rlist,
 +                                        gmx_bool simple)
 +{
 +    /* If the distance between two sub-cell bounding boxes is less
 +     * than this distance, do not check the distance between
 +     * all particle pairs in the sub-cell, since then it is likely
 +     * that the box pair has atom pairs within the cut-off.
 +     * We use the nblist cut-off minus 0.5 times the average x/y diagonal
 +     * spacing of the sub-cells. Around 40% of the checked pairs are pruned.
 +     * Using more than 0.5 gains at most 0.5%.
 +     * If forces are calculated more than twice, the performance gain
 +     * in the force calculation outweighs the cost of checking.
 +     * Note that with subcell lists, the atom-pair distance check
 +     * is only performed when only 1 out of 8 sub-cells is within range;
 +     * this is because the GPU is much faster than the CPU.
 +     */
 +    real bbx,bby;
 +    real rbb2;
 +
 +    bbx = 0.5*(gridi->sx + gridj->sx);
 +    bby = 0.5*(gridi->sy + gridj->sy);
 +    if (!simple)
 +    {
 +        bbx /= GPU_NSUBCELL_X;
 +        bby /= GPU_NSUBCELL_Y;
 +    }
 +
 +    rbb2 = sqr(max(0,rlist - 0.5*sqrt(bbx*bbx + bby*bby)));
 +
 +#ifndef GMX_DOUBLE
 +    return rbb2;
 +#else
 +    return (float)((1+GMX_FLOAT_EPS)*rbb2);
 +#endif
 +}
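 +
 +/* A worked example of rbb2, with assumed values: rlist=1.0 and average
 + * sub-cell x/y sizes bbx=bby=0.3 give
 + * rbb2 = (1.0 - 0.5*sqrt(0.09 + 0.09))^2, approximately (0.788)^2 = 0.62,
 + * so box pairs closer than about 0.79 are accepted without any atom-pair
 + * distance check.
 + */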
 +
 +static int get_ci_block_size(const nbnxn_grid_t *gridi,
 +                             gmx_bool bDomDec, int nth)
 +{
 +    const int ci_block_enum = 5;
 +    const int ci_block_denom = 11;
 +    const int ci_block_min_atoms = 16;
 +    int ci_block;
 +
 +    /* Here we decide how to distribute the blocks over the threads.
 +     * We use prime numbers to try to avoid the grid size becoming
 +     * a multiple of the number of threads, which would lead to some
 +     * threads getting "inner" pairs and others getting boundary pairs,
 +     * which in turn will lead to load imbalance between threads.
 +     * Set the block size as 5/11/ntask times the average number of cells
 +     * in a y,z slab. This should ensure a quite uniform distribution
 +     * of the grid parts of the different threads along all three grid
 +     * zone boundaries with 3D domain decomposition. At the same time
 +     * the blocks will not become too small.
 +     */
 +    ci_block = (gridi->nc*ci_block_enum)/(ci_block_denom*gridi->ncx*nth);
 +
 +    /* Ensure the blocks are not too small: avoids cache invalidation */
 +    if (ci_block*gridi->na_sc < ci_block_min_atoms)
 +    {
 +        ci_block = (ci_block_min_atoms + gridi->na_sc - 1)/gridi->na_sc;
 +    }
 +    
 +    /* Without domain decomposition
 +     * or with fewer than 3 blocks per task, divide into nth blocks.
 +     */
 +    if (!bDomDec || ci_block*3*nth > gridi->nc)
 +    {
 +        ci_block = (gridi->nc + nth - 1)/nth;
 +    }
 +
 +    return ci_block;
 +}
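 +
 +/* A worked example of the block size above, with assumed numbers:
 + * gridi->nc=1100, gridi->ncx=10 and nth=4 give
 + * ci_block = (1100*5)/(11*10*4) = 12 cells per block; the prime
 + * numerator and denominator 5 and 11 make it unlikely that the block
 + * size divides the column length evenly, which is the intent described
 + * in the comment above.
 + */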
 +
 +/* Generates the part of pair-list nbl assigned to our thread */
 +static void nbnxn_make_pairlist_part(const nbnxn_search_t nbs,
 +                                     const nbnxn_grid_t *gridi,
 +                                     const nbnxn_grid_t *gridj,
 +                                     nbnxn_search_work_t *work,
 +                                     const nbnxn_atomdata_t *nbat,
 +                                     const t_blocka *excl,
 +                                     real rlist,
 +                                     int nb_kernel_type,
 +                                     int ci_block,
 +                                     gmx_bool bFBufferFlag,
 +                                     int nsubpair_max,
 +                                     gmx_bool progBal,
 +                                     int min_ci_balanced,
 +                                     int th,int nth,
 +                                     nbnxn_pairlist_t *nbl)
 +{
 +    int  na_cj_2log;
 +    matrix box;
 +    real rl2;
 +    float rbb2;
 +    int  d;
 +    int  ci_b,ci,ci_x,ci_y,ci_xy,cj;
 +    ivec shp;
 +    int  tx,ty,tz;
 +    int  shift;
 +    gmx_bool bMakeList;
 +    real shx,shy,shz;
 +    int  conv_i,cell0_i;
 +    const float *bb_i,*bbcz_i,*bbcz_j;
 +    const int *flags_i;
 +    real bx0,bx1,by0,by1,bz0,bz1;
 +    real bz1_frac;
 +    real d2cx,d2z,d2z_cx,d2z_cy,d2zx,d2zxy,d2xy;
 +    int  cxf,cxl,cyf,cyf_x,cyl;
 +    int  cx,cy;
 +    int  c0,c1,cs,cf,cl;
 +    int  ndistc;
 +    int  ncpcheck;
 +    int  gridi_flag_shift=0,gridj_flag_shift=0;
 +    unsigned *gridj_flag=NULL;
 +    int  ncj_old_i,ncj_old_j;
 +
 +    nbs_cycle_start(&work->cc[enbsCCsearch]);
 +
 +    if (gridj->bSimple != nbl->bSimple)
 +    {
 +        gmx_incons("Grid incompatible with pair-list");
 +    }
 +
 +    sync_work(nbl);
 +    nbl->na_sc = gridj->na_sc;
 +    nbl->na_ci = gridj->na_c;
 +    nbl->na_cj = nbnxn_kernel_to_cj_size(nb_kernel_type);
 +    na_cj_2log = get_2log(nbl->na_cj);
 +
 +    nbl->rlist  = rlist;
 +
 +    if (bFBufferFlag)
 +    {
 +        /* Determine conversion of clusters to flag blocks */
 +        gridi_flag_shift = 0;
 +        while ((nbl->na_ci<<gridi_flag_shift) < NBNXN_BUFFERFLAG_SIZE)
 +        {
 +            gridi_flag_shift++;
 +        }
 +        gridj_flag_shift = 0;
 +        while ((nbl->na_cj<<gridj_flag_shift) < NBNXN_BUFFERFLAG_SIZE)
 +        {
 +            gridj_flag_shift++;
 +        }
 +
 +        gridj_flag = work->buffer_flags.flag;
 +    }
 +
 +    copy_mat(nbs->box,box);
 +
 +    rl2 = nbl->rlist*nbl->rlist;
 +
 +    rbb2 = boundingbox_only_distance2(gridi,gridj,nbl->rlist,nbl->bSimple);
 +
 +    if (debug)
 +    {
 +        fprintf(debug,"nbl bounding box only distance %f\n",sqrt(rbb2));
 +    }
 +
 +    /* Set the shift range */
 +    for(d=0; d<DIM; d++)
 +    {
 +        /* Check if we need periodicity shifts.
 +         * Without PBC or with domain decomposition we don't need them.
 +         */
 +        if (d >= ePBC2npbcdim(nbs->ePBC) || nbs->dd_dim[d])
 +        {
 +            shp[d] = 0;
 +        }
 +        else
 +        {
 +            if (d == XX &&
 +                box[XX][XX] - fabs(box[YY][XX]) - fabs(box[ZZ][XX]) < sqrt(rl2))
 +            {
 +                shp[d] = 2;
 +            }
 +            else
 +            {
 +                shp[d] = 1;
 +            }
 +        }
 +    }
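 +
 +    /* Note: shp[XX] == 2 above covers strongly triclinic boxes, where the
 +     * x extent minus the y/z skew contributions is shorter than the list
 +     * cut-off, so two x images can be within range; for rectangular boxes
 +     * shp[d] is simply 1 in every periodic, non-decomposed dimension.
 +     */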
 +
 +    if (nbl->bSimple && !gridi->bSimple)
 +    {
 +        conv_i  = gridi->na_sc/gridj->na_sc;
 +        bb_i    = gridi->bb_simple;
 +        bbcz_i  = gridi->bbcz_simple;
 +        flags_i = gridi->flags_simple;
 +    }
 +    else
 +    {
 +        conv_i  = 1;
 +        bb_i    = gridi->bb;
 +        bbcz_i  = gridi->bbcz;
 +        flags_i = gridi->flags;
 +    }
 +    cell0_i = gridi->cell0*conv_i;
 +
 +    bbcz_j = gridj->bbcz;
 +
 +    if (conv_i != 1)
 +    {
 +        /* Blocks of the conversion factor - 1 give a large repeat count
 +         * combined with a small block size. This should result in good
 +         * load balancing for both small and large domains.
 +         */
 +        ci_block = conv_i - 1;
 +    }
 +    if (debug)
 +    {
 +        fprintf(debug,"nbl nc_i %d col.av. %.1f ci_block %d\n",
 +                gridi->nc,gridi->nc/(double)(gridi->ncx*gridi->ncy),ci_block);
 +    }
 +
 +    ndistc = 0;
 +    ncpcheck = 0;
 +
 +    /* Initialize ci_b and ci to 1 before where we want them to start,
 +     * as they will both be incremented in next_ci.
 +     */
 +    ci_b = -1;
 +    ci   = th*ci_block - 1;
 +    ci_x = 0;
 +    ci_y = 0;
 +    while (next_ci(gridi,conv_i,nth,ci_block,&ci_x,&ci_y,&ci_b,&ci))
 +    {
 +        if (nbl->bSimple && flags_i[ci] == 0)
 +        {
 +            continue;
 +        }
 +
 +        ncj_old_i = nbl->ncj;
 +
 +        d2cx = 0;
 +        if (gridj != gridi && shp[XX] == 0)
 +        {
 +            if (nbl->bSimple)
 +            {
 +                bx1 = bb_i[ci*NNBSBB_B+NNBSBB_C+XX];
 +            }
 +            else
 +            {
 +                bx1 = gridi->c0[XX] + (ci_x+1)*gridi->sx;
 +            }
 +            if (bx1 < gridj->c0[XX])
 +            {
 +                d2cx = sqr(gridj->c0[XX] - bx1);
 +
 +                if (d2cx >= rl2)
 +                {
 +                    continue;
 +                }
 +            }
 +        }
 +
 +        ci_xy = ci_x*gridi->ncy + ci_y;
 +
 +        /* Loop over shift vectors in three dimensions */
 +        for (tz=-shp[ZZ]; tz<=shp[ZZ]; tz++)
 +        {
 +            shz = tz*box[ZZ][ZZ];
 +
 +            bz0 = bbcz_i[ci*NNBSBB_D  ] + shz;
 +            bz1 = bbcz_i[ci*NNBSBB_D+1] + shz;
 +
 +            if (tz == 0)
 +            {
 +                d2z = 0;
 +            }
 +            else if (tz < 0)
 +            {
 +                d2z = sqr(bz1);
 +            }
 +            else
 +            {
 +                d2z = sqr(bz0 - box[ZZ][ZZ]);
 +            }
 +
 +            d2z_cx = d2z + d2cx;
 +
 +            if (d2z_cx >= rl2)
 +            {
 +                continue;
 +            }
 +
 +            bz1_frac =
 +                bz1/((real)(gridi->cxy_ind[ci_xy+1] - gridi->cxy_ind[ci_xy]));
 +            if (bz1_frac < 0)
 +            {
 +                bz1_frac = 0;
 +            }
 +            /* The check with bz1_frac close to or larger than 1 comes later */
 +
 +            for (ty=-shp[YY]; ty<=shp[YY]; ty++)
 +            {
 +                shy = ty*box[YY][YY] + tz*box[ZZ][YY];
 +
 +                if (nbl->bSimple)
 +                {
 +                    by0 = bb_i[ci*NNBSBB_B         +YY] + shy;
 +                    by1 = bb_i[ci*NNBSBB_B+NNBSBB_C+YY] + shy;
 +                }
 +                else
 +                {
 +                    by0 = gridi->c0[YY] + (ci_y  )*gridi->sy + shy;
 +                    by1 = gridi->c0[YY] + (ci_y+1)*gridi->sy + shy;
 +                }
 +
 +                get_cell_range(by0,by1,
 +                               gridj->ncy,gridj->c0[YY],gridj->sy,gridj->inv_sy,
 +                               d2z_cx,rl2,
 +                               &cyf,&cyl);
 +
 +                if (cyf > cyl)
 +                {
 +                    continue;
 +                }
 +
 +                d2z_cy = d2z;
 +                if (by1 < gridj->c0[YY])
 +                {
 +                    d2z_cy += sqr(gridj->c0[YY] - by1);
 +                }
 +                else if (by0 > gridj->c1[YY])
 +                {
 +                    d2z_cy += sqr(by0 - gridj->c1[YY]);
 +                }
 +
 +                for (tx=-shp[XX]; tx<=shp[XX]; tx++)
 +                {
 +                    shift = XYZ2IS(tx,ty,tz);
 +
 +#ifdef NBNXN_SHIFT_BACKWARD
 +                    if (gridi == gridj && shift > CENTRAL)
 +                    {
 +                        continue;
 +                    }
 +#endif
 +
 +                    shx = tx*box[XX][XX] + ty*box[YY][XX] + tz*box[ZZ][XX];
 +
 +                    if (nbl->bSimple)
 +                    {
 +                        bx0 = bb_i[ci*NNBSBB_B         +XX] + shx;
 +                        bx1 = bb_i[ci*NNBSBB_B+NNBSBB_C+XX] + shx;
 +                    }
 +                    else
 +                    {
 +                        bx0 = gridi->c0[XX] + (ci_x  )*gridi->sx + shx;
 +                        bx1 = gridi->c0[XX] + (ci_x+1)*gridi->sx + shx;
 +                    }
 +
 +                    get_cell_range(bx0,bx1,
 +                                   gridj->ncx,gridj->c0[XX],gridj->sx,gridj->inv_sx,
 +                                   d2z_cy,rl2,
 +                                   &cxf,&cxl);
 +
 +                    if (cxf > cxl)
 +                    {
 +                        continue;
 +                    }
 +
 +                    if (nbl->bSimple)
 +                    {
 +                        new_ci_entry(nbl,cell0_i+ci,shift,flags_i[ci],
 +                                     nbl->work);
 +                    }
 +                    else
 +                    {
 +                        new_sci_entry(nbl,cell0_i+ci,shift,flags_i[ci],
 +                                      nbl->work);
 +                    }
 +
 +#ifndef NBNXN_SHIFT_BACKWARD
 +                    if (cxf < ci_x)
 +#else
 +                    if (shift == CENTRAL && gridi == gridj &&
 +                        cxf < ci_x)
 +#endif
 +                    {
 +                        /* Leave the pairs with i > j.
 +                         * x is the major index, so skip half of it.
 +                         */
 +                        cxf = ci_x;
 +                    }
 +
 +                    if (nbl->bSimple)
 +                    {
 +                        set_icell_bb_simple(bb_i,ci,shx,shy,shz,
 +                                            nbl->work->bb_ci);
 +                    }
 +                    else
 +                    {
 +                        set_icell_bb_supersub(bb_i,ci,shx,shy,shz,
 +                                              nbl->work->bb_ci);
 +                    }
 +
 +                    nbs->icell_set_x(cell0_i+ci,shx,shy,shz,
 +                                     gridi->na_c,nbat->xstride,nbat->x,
 +                                     nbl->work);
 +
 +                    for(cx=cxf; cx<=cxl; cx++)
 +                    {
 +                        d2zx = d2z;
 +                        if (gridj->c0[XX] + cx*gridj->sx > bx1)
 +                        {
 +                            d2zx += sqr(gridj->c0[XX] + cx*gridj->sx - bx1);
 +                        }
 +                        else if (gridj->c0[XX] + (cx+1)*gridj->sx < bx0)
 +                        {
 +                            d2zx += sqr(gridj->c0[XX] + (cx+1)*gridj->sx - bx0);
 +                        }
 +
 +#ifndef NBNXN_SHIFT_BACKWARD
 +                        if (gridi == gridj &&
 +                            cx == 0 && cyf < ci_y)
 +#else
 +                        if (gridi == gridj &&
 +                            cx == 0 && shift == CENTRAL && cyf < ci_y)
 +#endif
 +                        {
 +                            /* Leave the pairs with i > j.
 +                             * Skip half of y when i and j have the same x.
 +                             */
 +                            cyf_x = ci_y;
 +                        }
 +                        else
 +                        {
 +                            cyf_x = cyf;
 +                        }
 +
 +                        for(cy=cyf_x; cy<=cyl; cy++)
 +                        {
 +                            c0 = gridj->cxy_ind[cx*gridj->ncy+cy];
 +                            c1 = gridj->cxy_ind[cx*gridj->ncy+cy+1];
 +#ifdef NBNXN_SHIFT_BACKWARD
 +                            if (gridi == gridj &&
 +                                shift == CENTRAL && c0 < ci)
 +                            {
 +                                c0 = ci;
 +                            }
 +#endif
 +
 +                            d2zxy = d2zx;
 +                            if (gridj->c0[YY] + cy*gridj->sy > by1)
 +                            {
 +                                d2zxy += sqr(gridj->c0[YY] + cy*gridj->sy - by1);
 +                            }
 +                            else if (gridj->c0[YY] + (cy+1)*gridj->sy < by0)
 +                            {
 +                                d2zxy += sqr(gridj->c0[YY] + (cy+1)*gridj->sy - by0);
 +                            }
 +                            if (c1 > c0 && d2zxy < rl2)
 +                            {
 +                                cs = c0 + (int)(bz1_frac*(c1 - c0));
 +                                if (cs >= c1)
 +                                {
 +                                    cs = c1 - 1;
 +                                }
 +
 +                                d2xy = d2zxy - d2z;
 +
 +                                /* Find the lowest cell that can possibly
 +                                 * be within range.
 +                                 */
 +                                cf = cs;
 +                                while(cf > c0 &&
 +                                      (bbcz_j[cf*NNBSBB_D+1] >= bz0 ||
 +                                       d2xy + sqr(bbcz_j[cf*NNBSBB_D+1] - bz0) < rl2))
 +                                {
 +                                    cf--;
 +                                }
 +
 +                                /* Find the highest cell that can possibly
 +                                 * be within range.
 +                                 */
 +                                cl = cs;
 +                                while(cl < c1-1 &&
 +                                      (bbcz_j[cl*NNBSBB_D] <= bz1 ||
 +                                       d2xy + sqr(bbcz_j[cl*NNBSBB_D] - bz1) < rl2))
 +                                {
 +                                    cl++;
 +                                }
 +
 +#ifdef NBNXN_REFCODE
 +                                {
 +                                    /* Simple reference code, for debugging,
 +                                     * overrides the more complex code above.
 +                                     */
 +                                    int k;
 +                                    cf = c1;
 +                                    cl = -1;
 +                                    for(k=c0; k<c1; k++)
 +                                    {
 +                                        if (box_dist2(bx0,bx1,by0,by1,bz0,bz1,
 +                                                      bb+k*NNBSBB_B) < rl2 &&
 +                                            k < cf)
 +                                        {
 +                                            cf = k;
 +                                        }
 +                                        if (box_dist2(bx0,bx1,by0,by1,bz0,bz1,
 +                                                      bb+k*NNBSBB_B) < rl2 &&
 +                                            k > cl)
 +                                        {
 +                                            cl = k;
 +                                        }
 +                                    }
 +                                }
 +#endif
 +
 +                                if (gridi == gridj)
 +                                {
 +                                    /* We want each atom/cell pair only once,
 +                                     * only use cj >= ci.
 +                                     */
 +#ifndef NBNXN_SHIFT_BACKWARD
 +                                    cf = max(cf,ci);
 +#else
 +                                    if (shift == CENTRAL)
 +                                    {
 +                                        cf = max(cf,ci);
 +                                    }
 +#endif
 +                                }
 +
 +                                if (cf <= cl)
 +                                {
 +                                    /* For f buffer flags with simple lists */
 +                                    ncj_old_j = nbl->ncj;
 +
 +                                    switch (nb_kernel_type)
 +                                    {
- #ifdef NBNXN_SEARCH_SSE
-                                     case nbk4xN_X86_SIMD128:
++                                    case nbnxnk4x4_PlainC:
 +                                        check_subcell_list_space_simple(nbl,cl-cf+1);
 +
 +                                        make_cluster_list_simple(gridj,
 +                                                                 nbl,ci,cf,cl,
 +                                                                 (gridi == gridj && shift == CENTRAL),
 +                                                                 nbat->x,
 +                                                                 rl2,rbb2,
 +                                                                 &ndistc);
 +                                        break;
-                                         make_cluster_list_x86_simd128(gridj,
-                                                                       nbl,ci,cf,cl,
-                                                                       (gridi == gridj && shift == CENTRAL),
-                                                                       nbat->x,
-                                                                       rl2,rbb2,
-                                                                       &ndistc);
++#ifdef GMX_NBNXN_SIMD_4XN
++                                    case nbnxnk4xN_SIMD_4xN:
 +                                        check_subcell_list_space_simple(nbl,ci_to_cj(na_cj_2log,cl-cf)+2);
- #ifdef GMX_X86_AVX_256
-                                     case nbk4xN_X86_SIMD256:
++                                        make_cluster_list_simd_4xn(gridj,
++                                                                   nbl,ci,cf,cl,
++                                                                   (gridi == gridj && shift == CENTRAL),
++                                                                   nbat->x,
++                                                                   rl2,rbb2,
++                                                                   &ndistc);
 +                                        break;
-                                         make_cluster_list_x86_simd256(gridj,
-                                                                       nbl,ci,cf,cl,
-                                                                       (gridi == gridj && shift == CENTRAL),
-                                                                       nbat->x,
-                                                                       rl2,rbb2,
-                                                                       &ndistc);
++#endif
++#ifdef GMX_NBNXN_SIMD_2XNN
++                                    case nbnxnk4xN_SIMD_2xNN:
 +                                        check_subcell_list_space_simple(nbl,ci_to_cj(na_cj_2log,cl-cf)+2);
- #endif
-                                     case nbk8x8x8_PlainC:
-                                     case nbk8x8x8_CUDA:
++                                        make_cluster_list_simd_2xnn(gridj,
++                                                                    nbl,ci,cf,cl,
++                                                                    (gridi == gridj && shift == CENTRAL),
++                                                                    nbat->x,
++                                                                    rl2,rbb2,
++                                                                    &ndistc);
 +                                        break;
 +#endif
- #ifdef NBNXN_SEARCH_SSE
-         case nbk4xN_X86_SIMD128:
-             nbs->icell_set_x = icell_set_x_x86_simd128;
-             break;
- #ifdef GMX_X86_AVX_256
-         case nbk4xN_X86_SIMD256:
-             nbs->icell_set_x = icell_set_x_x86_simd256;
++                                    case nbnxnk8x8x8_PlainC:
++                                    case nbnxnk8x8x8_CUDA:
 +                                        check_subcell_list_space_supersub(nbl,cl-cf+1);
 +                                        for(cj=cf; cj<=cl; cj++)
 +                                        {
 +                                            make_cluster_list_supersub(nbs,gridi,gridj,
 +                                                                       nbl,ci,cj,
 +                                                                       (gridi == gridj && shift == CENTRAL && ci == cj),
 +                                                                       nbat->xstride,nbat->x,
 +                                                                       rl2,rbb2,
 +                                                                       &ndistc);
 +                                        }
 +                                        break;
 +                                    }
 +                                    ncpcheck += cl - cf + 1;
 +
 +                                    if (bFBufferFlag && nbl->ncj > ncj_old_j)
 +                                    {
 +                                        int cbf,cbl,cb;
 +
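 +                                        /* Mark the force-buffer blocks
 +                                         * covered by the new j-clusters
 +                                         * as written by this thread.
 +                                         */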
 +                                        cbf = nbl->cj[ncj_old_j].cj >> gridj_flag_shift;
 +                                        cbl = nbl->cj[nbl->ncj-1].cj >> gridj_flag_shift;
 +                                        for(cb=cbf; cb<=cbl; cb++)
 +                                        {
 +                                            gridj_flag[cb] = 1U<<th;
 +                                        }
 +                                    }
 +                                }
 +                            }
 +                        }
 +                    }
 +
 +                    /* Set the exclusions for this ci list */
 +                    if (nbl->bSimple)
 +                    {
 +                        set_ci_top_excls(nbs,
 +                                         nbl,
 +                                         shift == CENTRAL && gridi == gridj,
 +                                         gridj->na_c_2log,
 +                                         na_cj_2log,
 +                                         &(nbl->ci[nbl->nci]),
 +                                         excl);
 +                    }
 +                    else
 +                    {
 +                        set_sci_top_excls(nbs,
 +                                          nbl,
 +                                          shift == CENTRAL && gridi == gridj,
 +                                          gridj->na_c_2log,
 +                                          &(nbl->sci[nbl->nsci]),
 +                                          excl);
 +                    }
 +
 +                    /* Close this ci list */
 +                    if (nbl->bSimple)
 +                    {
 +                        close_ci_entry_simple(nbl);
 +                    }
 +                    else
 +                    {
 +                        close_ci_entry_supersub(nbl,
 +                                                nsubpair_max,
 +                                                progBal,min_ci_balanced,
 +                                                th,nth);
 +                    }
 +                }
 +            }
 +        }
 +
 +        if (bFBufferFlag && nbl->ncj > ncj_old_i)
 +        {
 +            work->buffer_flags.flag[(gridi->cell0+ci)>>gridi_flag_shift] = 1U<<th;
 +        }
 +    }
 +
 +    work->ndistc = ndistc;
 +
 +    nbs_cycle_stop(&work->cc[enbsCCsearch]);
 +
 +    if (debug)
 +    {
 +        fprintf(debug,"number of distance checks %d\n",ndistc);
 +        fprintf(debug,"ncpcheck %s %d\n",gridi==gridj ? "local" : "non-local",
 +                ncpcheck);
 +
 +        if (nbl->bSimple)
 +        {
 +            print_nblist_statistics_simple(debug,nbl,nbs,rlist);
 +        }
 +        else
 +        {
 +            print_nblist_statistics_supersub(debug,nbl,nbs,rlist);
 +        }
 +
 +    }
 +}
 +
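 +/* OR the per-thread buffer-usage flags of the nsrc search work structs
 + * into dest, so the force reduction knows which blocks each output
 + * buffer touches.
 + */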
 +static void reduce_buffer_flags(const nbnxn_search_t nbs,
 +                                int nsrc,
 +                                const nbnxn_buffer_flags_t *dest)
 +{
 +    int s,b;
 +    const unsigned *flag;
 +
 +    for(s=0; s<nsrc; s++)
 +    {
 +        flag = nbs->work[s].buffer_flags.flag;
 +
 +        for(b=0; b<dest->nflag; b++)
 +        {
 +            dest->flag[b] |= flag[b];
 +        }
 +    }
 +}
 +
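 +/* Print statistics for the force-buffer reduction: for each flag block,
 + * count whether it can be kept (only output 0 set), copied (exactly one
 + * other output set) or needs an actual reduction (multiple outputs set).
 + */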
 +static void print_reduction_cost(const nbnxn_buffer_flags_t *flags,int nout)
 +{
 +    int nelem,nkeep,ncopy,nred,b,c,out;
 +
 +    nelem = 0;
 +    nkeep = 0;
 +    ncopy = 0;
 +    nred  = 0;
 +    for(b=0; b<flags->nflag; b++)
 +    {
 +        if (flags->flag[b] == 1)
 +        {
 +            /* Only flag 0 is set, no copy or reduction required */
 +            nelem++;
 +            nkeep++;
 +        }
 +        else if (flags->flag[b] > 0)
 +        {
 +            c = 0;
 +            for(out=0; out<nout; out++)
 +            {
 +                if (flags->flag[b] & (1U<<out))
 +                {
 +                    c++;
 +                }
 +            }
 +            nelem += c;
 +            if (c == 1)
 +            {
 +                ncopy++;
 +            }
 +            else
 +            {
 +                nred += c;
 +            }
 +        }
 +    }
 +
 +    fprintf(debug,"nbnxn reduction: #flag %d #list %d elem %4.2f, keep %4.2f copy %4.2f red %4.2f\n",
 +            flags->nflag,nout,
 +            nelem/(double)(flags->nflag),
 +            nkeep/(double)(flags->nflag),
 +            ncopy/(double)(flags->nflag),
 +            nred/(double)(flags->nflag));
 +}
 +
 +/* Make a local or non-local pair-list, depending on iloc */
 +void nbnxn_make_pairlist(const nbnxn_search_t nbs,
 +                         nbnxn_atomdata_t *nbat,
 +                         const t_blocka *excl,
 +                         real rlist,
 +                         int min_ci_balanced,
 +                         nbnxn_pairlist_set_t *nbl_list,
 +                         int iloc,
 +                         int nb_kernel_type,
 +                         t_nrnb *nrnb)
 +{
 +    nbnxn_grid_t *gridi,*gridj;
 +    int nzi,zi,zj0,zj1,zj;
 +    int nsubpair_max;
 +    int th;
 +    int nnbl;
 +    nbnxn_pairlist_t **nbl;
 +    int ci_block;
 +    gmx_bool CombineNBLists;
 +    int np_tot,np_noq,np_hlj,nap;
 +
 +    nnbl            = nbl_list->nnbl;
 +    nbl             = nbl_list->nbl;
 +    CombineNBLists  = nbl_list->bCombined;
 +
 +    if (debug)
 +    {
 +        fprintf(debug,"ns making %d nblists\n", nnbl);
 +    }
 +
 +    nbat->bUseBufferFlags = (nbat->nout > 1);
 +    if (nbat->bUseBufferFlags && LOCAL_I(iloc))
 +    {
 +        init_buffer_flags(&nbat->buffer_flags,nbat->natoms);
 +    }
 +
 +    if (nbl_list->bSimple)
 +    {
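 +        /* The i-cluster coordinate copy used during the search has to
 +         * match the cluster layout of the kernel that will use the list.
 +         */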
 +        switch (nb_kernel_type)
 +        {
++#ifdef GMX_NBNXN_SIMD_4XN
++        case nbnxnk4xN_SIMD_4xN:
++            nbs->icell_set_x = icell_set_x_simd_4xn;
 +            break;
 +#endif
++#ifdef GMX_NBNXN_SIMD_2XNN
++        case nbnxnk4xN_SIMD_2xNN:
++            nbs->icell_set_x = icell_set_x_simd_2xnn;
++            break;
 +#endif
 +        default:
 +            nbs->icell_set_x = icell_set_x_simple;
 +            break;
 +        }
 +    }
 +    else
 +    {
 +#ifdef NBNXN_SEARCH_SSE
 +        nbs->icell_set_x = icell_set_x_supersub_sse8;
 +#else
 +        nbs->icell_set_x = icell_set_x_supersub;
 +#endif
 +    }
 +
 +    if (LOCAL_I(iloc))
 +    {
 +        /* Only zone (grid) 0 vs 0 */
 +        nzi = 1;
 +        zj0 = 0;
 +        zj1 = 1;
 +    }
 +    else
 +    {
 +        nzi = nbs->zones->nizone;
 +    }
 +
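 +    /* For super-sub (GPU-type) lists, determine the list split size
 +     * needed to generate at least min_ci_balanced lists for balancing.
 +     */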
 +    if (!nbl_list->bSimple && min_ci_balanced > 0)
 +    {
 +        nsubpair_max = get_nsubpair_max(nbs,iloc,rlist,min_ci_balanced);
 +    }
 +    else
 +    {
 +        nsubpair_max = 0;
 +    }
 +
 +    /* Clear all pair-lists */
 +    for(th=0; th<nnbl; th++)
 +    {
 +        clear_pairlist(nbl[th]);
 +    }
 +
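 +    /* Loop over all pairs of grids (DD zones) that can contain
 +     * interacting atom pairs.
 +     */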
 +    for(zi=0; zi<nzi; zi++)
 +    {
 +        gridi = &nbs->grid[zi];
 +
 +        if (NONLOCAL_I(iloc))
 +        {
 +            zj0 = nbs->zones->izone[zi].j0;
 +            zj1 = nbs->zones->izone[zi].j1;
 +            if (zi == 0)
 +            {
 +                zj0++;
 +            }
 +        }
 +        for(zj=zj0; zj<zj1; zj++)
 +        {
 +            gridj = &nbs->grid[zj];
 +
 +            if (debug)
 +            {
 +                fprintf(debug,"ns search grid %d vs %d\n",zi,zj);
 +            }
 +
 +            nbs_cycle_start(&nbs->cc[enbsCCsearch]);
 +
 +            if (nbl[0]->bSimple && !gridi->bSimple)
 +            {
 +                /* Hybrid list, determine blocking later */
 +                ci_block = 0;
 +            }
 +            else
 +            {
 +                ci_block = get_ci_block_size(gridi,nbs->DomDec,nnbl);
 +            }
 +
 +#pragma omp parallel for num_threads(nnbl) schedule(static)
 +            for(th=0; th<nnbl; th++)
 +            {
 +                if (nbat->bUseBufferFlags && zi == 0 && zj == 0)
 +                {
 +                    init_buffer_flags(&nbs->work[th].buffer_flags,nbat->natoms);
 +                }
 +
 +                if (CombineNBLists && th > 0)
 +                {
 +                    clear_pairlist(nbl[th]);
 +                }
 +
 +                /* Divide the i super cell equally over the nblists */
 +                nbnxn_make_pairlist_part(nbs,gridi,gridj,
 +                                         &nbs->work[th],nbat,excl,
 +                                         rlist,
 +                                         nb_kernel_type,
 +                                         ci_block,
 +                                         nbat->bUseBufferFlags,
 +                                         nsubpair_max,
 +                                         (LOCAL_I(iloc) || nbs->zones->n <= 2),
 +                                         min_ci_balanced,
 +                                         th,nnbl,
 +                                         nbl[th]);
 +            }
 +            nbs_cycle_stop(&nbs->cc[enbsCCsearch]);
 +
 +            np_tot = 0;
 +            np_noq = 0;
 +            np_hlj = 0;
 +            for(th=0; th<nnbl; th++)
 +            {
 +                inc_nrnb(nrnb,eNR_NBNXN_DIST2,nbs->work[th].ndistc);
 +
 +                if (nbl_list->bSimple)
 +                {
 +                    np_tot += nbl[th]->ncj;
 +                    np_noq += nbl[th]->work->ncj_noq;
 +                    np_hlj += nbl[th]->work->ncj_hlj;
 +                }
 +                else
 +                {
 +                    /* This count ignores potential subsequent pair pruning */
 +                    np_tot += nbl[th]->nci_tot;
 +                }
 +            }
 +            nap = nbl[0]->na_ci*nbl[0]->na_cj;
 +            nbl_list->natpair_ljq = (np_tot - np_noq)*nap - np_hlj*nap/2;
 +            nbl_list->natpair_lj  = np_noq*nap;
 +            nbl_list->natpair_q   = np_hlj*nap/2;
 +
 +            if (CombineNBLists && nnbl > 1)
 +            {
 +                nbs_cycle_start(&nbs->cc[enbsCCcombine]);
 +
 +                combine_nblists(nnbl-1,nbl+1,nbl[0]);
 +
 +                nbs_cycle_stop(&nbs->cc[enbsCCcombine]);
 +            }
 +        }
 +    }
 +
 +    if (nbat->bUseBufferFlags)
 +    {
 +        reduce_buffer_flags(nbs,nnbl,&nbat->buffer_flags);
 +    }
 +
 +    /*
 +    print_supersub_nsp("nsubpair",nbl[0],iloc);
 +    */
 +
 +    /* Special performance logging stuff (env.var. GMX_NBNXN_CYCLE) */
 +    if (LOCAL_I(iloc))
 +    {
 +        nbs->search_count++;
 +    }
 +    if (nbs->print_cycles &&
 +        (!nbs->DomDec || (nbs->DomDec && !LOCAL_I(iloc))) &&
 +        nbs->search_count % 100 == 0)
 +    {
 +        nbs_cycle_print(stderr,nbs);
 +    }
 +
 +    if (debug && (CombineNBLists && nnbl > 1))
 +    {
 +        if (nbl[0]->bSimple)
 +        {
 +            print_nblist_statistics_simple(debug,nbl[0],nbs,rlist);
 +        }
 +        else
 +        {
 +            print_nblist_statistics_supersub(debug,nbl[0],nbs,rlist);
 +        }
 +    }
 +
 +    if (debug)
 +    {
 +        if (gmx_debug_at)
 +        {
 +            if (nbl[0]->bSimple)
 +            {
 +                print_nblist_ci_cj(debug,nbl[0]);
 +            }
 +            else
 +            {
 +                print_nblist_sci_cj(debug,nbl[0]);
 +            }
 +        }
 +
 +        if (nbat->bUseBufferFlags)
 +        {
 +            print_reduction_cost(&nbat->buffer_flags,nnbl);
 +        }
 +    }
 +}
index 0000000000000000000000000000000000000000,04dd50105d212827f568d11f7b38cac2b5056c2a..04dd50105d212827f568d11f7b38cac2b5056c2a
mode 000000,100644..100644
--- /dev/null
index 86dbc9e58d2288dc70e8a776cf1bb753f7eae23e,0000000000000000000000000000000000000000..7e1a7fc21b56e513fe66ec5f1796ac775e4d3088
mode 100644,000000..100644
--- /dev/null
@@@ -1,2648 -1,0 +1,2648 @@@
- #include "nbnxn_kernels/nbnxn_kernel_x86_simd128.h"
- #include "nbnxn_kernels/nbnxn_kernel_x86_simd256.h"
 +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
 + *
 + *
 + *                This source code is part of
 + *
 + *                 G   R   O   M   A   C   S
 + *
 + *          GROningen MAchine for Chemical Simulations
 + *
 + *                        VERSION 3.2.0
 + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
 + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 + * Copyright (c) 2001-2004, The GROMACS development team,
 + * check out http://www.gromacs.org for more information.
 + *
 + * This program is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU General Public License
 + * as published by the Free Software Foundation; either version 2
 + * of the License, or (at your option) any later version.
 + *
 + * If you want to redistribute modifications, please consider that
 + * scientific software is very special. Version control is crucial -
 + * bugs must be traceable. We will be happy to consider code for
 + * inclusion in the official distribution, but derived work must not
 + * be called official GROMACS. Details are found in the README & COPYING
 + * files - if they are missing, get the official version at www.gromacs.org.
 + *
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the papers on the package - you can find them in the top README file.
 + *
 + * For more info, check our website at http://www.gromacs.org
 + *
 + * And Hey:
 + * GROwing Monsters And Cloning Shrimps
 + */
 +#ifdef HAVE_CONFIG_H
 +#include <config.h>
 +#endif
 +
 +#ifdef GMX_CRAY_XT3
 +#include <catamount/dclock.h>
 +#endif
 +
 +
 +#include <stdio.h>
 +#include <time.h>
 +#ifdef HAVE_SYS_TIME_H
 +#include <sys/time.h>
 +#endif
 +#include <math.h>
 +#include "typedefs.h"
 +#include "string2.h"
 +#include "gmxfio.h"
 +#include "smalloc.h"
 +#include "names.h"
 +#include "confio.h"
 +#include "mvdata.h"
 +#include "txtdump.h"
 +#include "pbc.h"
 +#include "chargegroup.h"
 +#include "vec.h"
 +#include <time.h>
 +#include "nrnb.h"
 +#include "mshift.h"
 +#include "mdrun.h"
 +#include "sim_util.h"
 +#include "update.h"
 +#include "physics.h"
 +#include "main.h"
 +#include "mdatoms.h"
 +#include "force.h"
 +#include "bondf.h"
 +#include "pme.h"
 +#include "disre.h"
 +#include "orires.h"
 +#include "network.h"
 +#include "calcmu.h"
 +#include "constr.h"
 +#include "xvgr.h"
 +#include "trnio.h"
 +#include "xtcio.h"
 +#include "copyrite.h"
 +#include "pull_rotation.h"
 +#include "gmx_random.h"
 +#include "domdec.h"
 +#include "partdec.h"
 +#include "gmx_wallcycle.h"
 +#include "genborn.h"
 +#include "nbnxn_atomdata.h"
 +#include "nbnxn_search.h"
 +#include "nbnxn_kernels/nbnxn_kernel_ref.h"
-     if (nbvg->kernel_type != nbk8x8x8_CUDA)
++#include "nbnxn_kernels/nbnxn_kernel_simd_4xn.h"
++#include "nbnxn_kernels/nbnxn_kernel_simd_2xnn.h"
 +#include "nbnxn_kernels/nbnxn_kernel_gpu_ref.h"
 +
 +#ifdef GMX_LIB_MPI
 +#include <mpi.h>
 +#endif
 +#ifdef GMX_THREAD_MPI
 +#include "tmpi.h"
 +#endif
 +
 +#include "adress.h"
 +#include "qmmm.h"
 +
 +#include "nbnxn_cuda_data_mgmt.h"
 +#include "nbnxn_cuda/nbnxn_cuda.h"
 +
 +#if 0
 +typedef struct gmx_timeprint {
 +
 +} t_gmx_timeprint;
 +#endif
 +
 +/* Portable version of ctime_r implemented in src/gmxlib/string2.c, but we do not want it declared in public installed headers */
 +char *
 +gmx_ctime_r(const time_t *clock,char *buf, int n);
 +
 +
 +double
 +gmx_gettime()
 +{
 +#ifdef HAVE_GETTIMEOFDAY
 +      struct timeval t;
 +      double seconds;
 +
 +      gettimeofday(&t,NULL);
 +
 +      seconds = (double) t.tv_sec + 1e-6*(double)t.tv_usec;
 +
 +      return seconds;
 +#else
 +      double  seconds;
 +
 +      seconds = time(NULL);
 +
 +      return seconds;
 +#endif
 +}
 +
 +
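 +/* Note: this macro shadows libc difftime(); our time values are doubles */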
 +#define difftime(end,start) ((double)(end)-(double)(start))
 +
 +void print_time(FILE *out,gmx_runtime_t *runtime,gmx_large_int_t step,
 +                t_inputrec *ir, t_commrec *cr)
 +{
 +    time_t finish;
 +    char   timebuf[STRLEN];
 +    double dt;
 +    char buf[48];
 +
 +#ifndef GMX_THREAD_MPI
 +    if (!PAR(cr))
 +#endif
 +    {
 +        fprintf(out,"\r");
 +    }
 +    fprintf(out,"step %s",gmx_step_str(step,buf));
 +    if ((step >= ir->nstlist))
 +    {
 +        runtime->last = gmx_gettime();
 +        dt = difftime(runtime->last,runtime->real);
 +        runtime->time_per_step = dt/(step - ir->init_step + 1);
 +
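 +        /* Estimate the remaining wall time from the average time per step */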
 +        dt = (ir->nsteps + ir->init_step - step)*runtime->time_per_step;
 +
 +        if (ir->nsteps >= 0)
 +        {
 +            if (dt >= 300)
 +            {
 +                finish = (time_t) (runtime->last + dt);
 +                gmx_ctime_r(&finish,timebuf,STRLEN);
 +                sprintf(buf,"%s",timebuf);
 +                buf[strlen(buf)-1]='\0';
 +                fprintf(out,", will finish %s",buf);
 +            }
 +            else
 +            {
 +                fprintf(out,", remaining runtime: %5d s          ",(int)dt);
 +            }
 +        }
 +        else
 +        {
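 +            /* delta_t/1000 is ns per step; dividing by the time per step
 +             * and scaling by the seconds per day gives ns/day.
 +             */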
 +            fprintf(out," performance: %.1f ns/day    ",
 +                    ir->delta_t/1000*24*60*60/runtime->time_per_step);
 +        }
 +    }
 +#ifndef GMX_THREAD_MPI
 +    if (PAR(cr))
 +    {
 +        fprintf(out,"\n");
 +    }
 +#endif
 +
 +    fflush(out);
 +}
 +
 +#ifdef NO_CLOCK
 +#define clock() -1
 +#endif
 +
 +static double set_proctime(gmx_runtime_t *runtime)
 +{
 +    double diff;
 +#ifdef GMX_CRAY_XT3
 +    double prev;
 +
 +    prev = runtime->proc;
 +    runtime->proc = dclock();
 +
 +    diff = runtime->proc - prev;
 +#else
 +    clock_t prev;
 +
 +    prev = runtime->proc;
 +    runtime->proc = clock();
 +
 +    diff = (double)(runtime->proc - prev)/(double)CLOCKS_PER_SEC;
 +#endif
 +    if (diff < 0)
 +    {
 +        /* The counter has probably looped, ignore this data */
 +        diff = 0;
 +    }
 +
 +    return diff;
 +}
 +
 +void runtime_start(gmx_runtime_t *runtime)
 +{
 +    runtime->real = gmx_gettime();
 +    runtime->proc          = 0;
 +    set_proctime(runtime);
 +    runtime->realtime      = 0;
 +    runtime->proctime      = 0;
 +    runtime->last          = 0;
 +    runtime->time_per_step = 0;
 +}
 +
 +void runtime_end(gmx_runtime_t *runtime)
 +{
 +    double now;
 +
 +    now = gmx_gettime();
 +
 +    runtime->proctime += set_proctime(runtime);
 +    runtime->realtime  = now - runtime->real;
 +    runtime->real      = now;
 +}
 +
 +void runtime_upd_proc(gmx_runtime_t *runtime)
 +{
 +    runtime->proctime += set_proctime(runtime);
 +}
 +
 +void print_date_and_time(FILE *fplog,int nodeid,const char *title,
 +                         const gmx_runtime_t *runtime)
 +{
 +    int i;
 +    char timebuf[STRLEN];
 +    char time_string[STRLEN];
 +    time_t tmptime;
 +
 +    if (fplog)
 +    {
 +        if (runtime != NULL)
 +        {
 +            tmptime = (time_t) runtime->real;
 +            gmx_ctime_r(&tmptime,timebuf,STRLEN);
 +        }
 +        else
 +        {
 +            tmptime = (time_t) gmx_gettime();
 +            gmx_ctime_r(&tmptime,timebuf,STRLEN);
 +        }
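 +        /* Copy only the printable part; ctime output ends in a newline */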
 +        for(i=0; timebuf[i]>=' '; i++)
 +        {
 +            time_string[i]=timebuf[i];
 +        }
 +        time_string[i]='\0';
 +
 +        fprintf(fplog,"%s on node %d %s\n",title,nodeid,time_string);
 +    }
 +}
 +
 +static void sum_forces(int start,int end,rvec f[],rvec flr[])
 +{
 +  int i;
 +
 +  if (gmx_debug_at) {
 +    pr_rvecs(debug,0,"fsr",f+start,end-start);
 +    pr_rvecs(debug,0,"flr",flr+start,end-start);
 +  }
 +  for(i=start; (i<end); i++)
 +    rvec_inc(f[i],flr[i]);
 +}
 +
 +/*
 + * calc_f_el calculates forces due to an electric field.
 + *
 + * The force is in kJ mol^-1 nm^-1: the charge (in e) times the
 + * field (in kJ mol^-1 nm^-1 e^-1).
 + *
 + * Et[] contains the parameters for the time-dependent
 + * part of the field (not yet used).
 + * Ex[] contains the parameters for
 + * the spatially dependent part of the field. You can have cool periodic
 + * fields in principle, but only a constant field is supported
 + * now.
 + * The function should return the energy due to the electric field
 + * (if any) but for now returns 0.
 + *
 + * WARNING:
 + * There can be problems with the virial.
 + * Since the field is not self-consistent this is unavoidable.
 + * For neutral molecules the virial is correct within this approximation.
 + * For neutral systems with many charged molecules the error is small.
 + * But for systems with a net charge or a few charged molecules
 + * the error can be significant when the field is high.
 + * Solution: implement a self-consistent electric field in PME.
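 + *
 + * For reference: with three parameters in Et[m] the time modulation is
 + * cos(a[0]*(t - a[1]))*exp(-(t - a[1])^2/(2*a[2]^2)), a Gaussian pulse;
 + * otherwise it is cos(a[0]*t). Ex[m].a[0] sets the amplitude in V/nm.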
 + */
 +static void calc_f_el(FILE *fp,int  start,int homenr,
 +                      real charge[],rvec x[],rvec f[],
 +                      t_cosines Ex[],t_cosines Et[],double t)
 +{
 +    rvec Ext;
 +    real t0;
 +    int  i,m;
 +
 +    for(m=0; (m<DIM); m++)
 +    {
 +        if (Et[m].n > 0)
 +        {
 +            if (Et[m].n == 3)
 +            {
 +                t0 = Et[m].a[1];
 +                Ext[m] = cos(Et[m].a[0]*(t-t0))*exp(-sqr(t-t0)/(2.0*sqr(Et[m].a[2])));
 +            }
 +            else
 +            {
 +                Ext[m] = cos(Et[m].a[0]*t);
 +            }
 +        }
 +        else
 +        {
 +            Ext[m] = 1.0;
 +        }
 +        if (Ex[m].n > 0)
 +        {
 +            /* Convert the field strength from V/nm to MD-units */
 +            Ext[m] *= Ex[m].a[0]*FIELDFAC;
 +            for(i=start; (i<start+homenr); i++)
 +                f[i][m] += charge[i]*Ext[m];
 +        }
 +        else
 +        {
 +            Ext[m] = 0;
 +        }
 +    }
 +    if (fp != NULL)
 +    {
 +        fprintf(fp,"%10g  %10g  %10g  %10g #FIELD\n",t,
 +                Ext[XX]/FIELDFAC,Ext[YY]/FIELDFAC,Ext[ZZ]/FIELDFAC);
 +    }
 +}
 +
 +static void calc_virial(FILE *fplog,int start,int homenr,rvec x[],rvec f[],
 +                      tensor vir_part,t_graph *graph,matrix box,
 +                      t_nrnb *nrnb,const t_forcerec *fr,int ePBC)
 +{
 +  int i,j;
 +  tensor virtest;
 +
 +  /* The short-range virial from surrounding boxes */
 +  clear_mat(vir_part);
 +  calc_vir(fplog,SHIFTS,fr->shift_vec,fr->fshift,vir_part,ePBC==epbcSCREW,box);
 +  inc_nrnb(nrnb,eNR_VIRIAL,SHIFTS);
 +
 +  /* Calculate partial virial, for local atoms only, based on short range.
 +   * Total virial is computed in global_stat, called from do_md
 +   */
 +  f_calc_vir(fplog,start,start+homenr,x,f,vir_part,graph,box);
 +  inc_nrnb(nrnb,eNR_VIRIAL,homenr);
 +
 +  /* Add position restraint contribution */
 +  for(i=0; i<DIM; i++) {
 +    vir_part[i][i] += fr->vir_diag_posres[i];
 +  }
 +
 +  /* Add wall contribution */
 +  for(i=0; i<DIM; i++) {
 +    vir_part[i][ZZ] += fr->vir_wall_z[i];
 +  }
 +
 +  if (debug)
 +    pr_rvecs(debug,0,"vir_part",vir_part,DIM);
 +}
 +
 +static void posres_wrapper(FILE *fplog,
 +                           int flags,
 +                           gmx_bool bSepDVDL,
 +                           t_inputrec *ir,
 +                           t_nrnb *nrnb,
 +                           gmx_localtop_t *top,
 +                           matrix box,rvec x[],
 +                           rvec f[],
 +                           gmx_enerdata_t *enerd,
 +                           real *lambda,
 +                           t_forcerec *fr)
 +{
 +    t_pbc pbc;
 +    real  v,dvdl;
 +    int   i;
 +
 +    /* Position restraints always require full pbc */
 +    set_pbc(&pbc,ir->ePBC,box);
 +    dvdl = 0;
 +    v = posres(top->idef.il[F_POSRES].nr,top->idef.il[F_POSRES].iatoms,
 +               top->idef.iparams_posres,
 +               (const rvec*)x,fr->f_novirsum,fr->vir_diag_posres,
 +               ir->ePBC==epbcNONE ? NULL : &pbc,
 +               lambda[efptRESTRAINT],&dvdl,
 +               fr->rc_scaling,fr->ePBC,fr->posres_com,fr->posres_comB);
 +    if (bSepDVDL)
 +    {
 +        fprintf(fplog,sepdvdlformat,
 +                interaction_function[F_POSRES].longname,v,dvdl);
 +    }
 +    enerd->term[F_POSRES] += v;
 +    /* If just the force constant changes, the FEP term is linear,
 +     * but if k changes, it is not.
 +     */
 +    enerd->dvdl_nonlin[efptRESTRAINT] += dvdl;
 +    inc_nrnb(nrnb,eNR_POSRES,top->idef.il[F_POSRES].nr/2);
 +
 +    if ((ir->fepvals->n_lambda > 0) && (flags & GMX_FORCE_DHDL))
 +    {
 +        for(i=0; i<enerd->n_lambda; i++)
 +        {
 +            real dvdl_dum,lambda_dum;
 +
 +            lambda_dum = (i==0 ? lambda[efptRESTRAINT] : ir->fepvals->all_lambda[efptRESTRAINT][i-1]);
 +            v = posres(top->idef.il[F_POSRES].nr,top->idef.il[F_POSRES].iatoms,
 +                       top->idef.iparams_posres,
 +                       (const rvec*)x,NULL,NULL,
 +                       ir->ePBC==epbcNONE ? NULL : &pbc,lambda_dum,&dvdl,
 +                       fr->rc_scaling,fr->ePBC,fr->posres_com,fr->posres_comB);
 +            enerd->enerpart_lambda[i] += v;
 +        }
 +    }
 +}
 +
 +static void pull_potential_wrapper(FILE *fplog,
 +                                   gmx_bool bSepDVDL,
 +                                   t_commrec *cr,
 +                                   t_inputrec *ir,
 +                                   matrix box,rvec x[],
 +                                   rvec f[],
 +                                   tensor vir_force,
 +                                   t_mdatoms *mdatoms,
 +                                   gmx_enerdata_t *enerd,
 +                                   real *lambda,
 +                                   double t)
 +{
 +    t_pbc  pbc;
 +    real   dvdl;
 +
 +    /* Calculate the center of mass forces, this requires communication,
 +     * which is why pull_potential is called close to other communication.
 +     * The virial contribution is calculated directly,
 +     * which is why we call pull_potential after calc_virial.
 +     */
 +    set_pbc(&pbc,ir->ePBC,box);
 +    dvdl = 0; 
 +    enerd->term[F_COM_PULL] +=
 +        pull_potential(ir->ePull,ir->pull,mdatoms,&pbc,
 +                       cr,t,lambda[efptRESTRAINT],x,f,vir_force,&dvdl);
 +    if (bSepDVDL)
 +    {
 +        fprintf(fplog,sepdvdlformat,"Com pull",enerd->term[F_COM_PULL],dvdl);
 +    }
 +    enerd->dvdl_lin[efptRESTRAINT] += dvdl;
 +}
 +
 +static void pme_receive_force_ener(FILE *fplog,
 +                                   gmx_bool bSepDVDL,
 +                                   t_commrec *cr,
 +                                   gmx_wallcycle_t wcycle,
 +                                   gmx_enerdata_t *enerd,
 +                                   t_forcerec *fr)
 +{
 +    real   e,v,dvdl;    
 +    float  cycles_ppdpme,cycles_seppme;
 +
 +    cycles_ppdpme = wallcycle_stop(wcycle,ewcPPDURINGPME);
 +    dd_cycles_add(cr->dd,cycles_ppdpme,ddCyclPPduringPME);
 +
 +    /* In case of node-splitting, the PP nodes receive the long-range 
 +     * forces, virial and energy from the PME nodes here.
 +     */    
 +    wallcycle_start(wcycle,ewcPP_PMEWAITRECVF);
 +    dvdl = 0;
 +    gmx_pme_receive_f(cr,fr->f_novirsum,fr->vir_el_recip,&e,&dvdl,
 +                      &cycles_seppme);
 +    if (bSepDVDL)
 +    {
 +        fprintf(fplog,sepdvdlformat,"PME mesh",e,dvdl);
 +    }
 +    enerd->term[F_COUL_RECIP] += e;
 +    enerd->dvdl_lin[efptCOUL] += dvdl;
 +    if (wcycle)
 +    {
 +        dd_cycles_add(cr->dd,cycles_seppme,ddCyclPME);
 +    }
 +    wallcycle_stop(wcycle,ewcPP_PMEWAITRECVF);
 +}
 +
 +static void print_large_forces(FILE *fp,t_mdatoms *md,t_commrec *cr,
 +                             gmx_large_int_t step,real pforce,rvec *x,rvec *f)
 +{
 +  int  i;
 +  real pf2,fn2;
 +  char buf[STEPSTRSIZE];
 +
 +  pf2 = sqr(pforce);
 +  for(i=md->start; i<md->start+md->homenr; i++) {
 +    fn2 = norm2(f[i]);
 +    /* We also catch NAN, if the compiler does not optimize this away. */
 +    if (fn2 >= pf2 || fn2 != fn2) {
 +      fprintf(fp,"step %s  atom %6d  x %8.3f %8.3f %8.3f  force %12.5e\n",
 +            gmx_step_str(step,buf),
 +            ddglatnr(cr->dd,i),x[i][XX],x[i][YY],x[i][ZZ],sqrt(fn2));
 +    }
 +  }
 +}
 +
 +static void post_process_forces(FILE *fplog,
 +                                t_commrec *cr,
 +                                gmx_large_int_t step,
 +                                t_nrnb *nrnb,gmx_wallcycle_t wcycle,
 +                                gmx_localtop_t *top,
 +                                matrix box,rvec x[],
 +                                rvec f[],
 +                                tensor vir_force,
 +                                t_mdatoms *mdatoms,
 +                                t_graph *graph,
 +                                t_forcerec *fr,gmx_vsite_t *vsite,
 +                                int flags)
 +{
 +    if (fr->bF_NoVirSum)
 +    {
 +        if (vsite)
 +        {
 +            /* Spread the mesh force on virtual sites to the other particles... 
 +             * This is parallelized. MPI communication is performed
 +             * if the constructing atoms aren't local.
 +             */
 +            wallcycle_start(wcycle,ewcVSITESPREAD);
 +            spread_vsite_f(fplog,vsite,x,fr->f_novirsum,NULL,
 +                           (flags & GMX_FORCE_VIRIAL),fr->vir_el_recip,
 +                           nrnb,
 +                           &top->idef,fr->ePBC,fr->bMolPBC,graph,box,cr);
 +            wallcycle_stop(wcycle,ewcVSITESPREAD);
 +        }
 +        if (flags & GMX_FORCE_VIRIAL)
 +        {
 +            /* Now add the forces, this is local */
 +            if (fr->bDomDec)
 +            {
 +                sum_forces(0,fr->f_novirsum_n,f,fr->f_novirsum);
 +            }
 +            else
 +            {
 +                sum_forces(mdatoms->start,mdatoms->start+mdatoms->homenr,
 +                           f,fr->f_novirsum);
 +            }
 +            if (EEL_FULL(fr->eeltype))
 +            {
 +                /* Add the mesh contribution to the virial */
 +                m_add(vir_force,fr->vir_el_recip,vir_force);
 +            }
 +            if (debug)
 +            {
 +                pr_rvecs(debug,0,"vir_force",vir_force,DIM);
 +            }
 +        }
 +    }
 +    
 +    if (fr->print_force >= 0)
 +    {
 +        print_large_forces(stderr,mdatoms,cr,step,fr->print_force,x,f);
 +    }
 +}
 +
 +static void do_nb_verlet(t_forcerec *fr,
 +                         interaction_const_t *ic,
 +                         gmx_enerdata_t *enerd,
 +                         int flags, int ilocality,
 +                         int clearF,
 +                         t_nrnb *nrnb,
 +                         gmx_wallcycle_t wcycle)
 +{
 +    int     nnbl, kernel_type, enr_nbnxn_kernel_ljc, enr_nbnxn_kernel_lj;
 +    char    *env;
 +    nonbonded_verlet_group_t  *nbvg;
 +
 +    if (!(flags & GMX_FORCE_NONBONDED))
 +    {
 +        /* skip non-bonded calculation */
 +        return;
 +    }
 +
 +    nbvg = &fr->nbv->grp[ilocality];
 +
 +    /* CUDA kernel launch overhead is already timed separately */
 +    if (fr->cutoff_scheme != ecutsVERLET)
 +    {
 +        gmx_incons("Invalid cut-off scheme passed!");
 +    }
 +
-         case nbk4x4_PlainC:
++    if (nbvg->kernel_type != nbnxnk8x8x8_CUDA)
 +    {
 +        wallcycle_sub_start(wcycle, ewcsNONBONDED);
 +    }
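 +    /* Dispatch to the kernel matching this group's pair-list layout */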
 +    switch (nbvg->kernel_type)
 +    {
-         case nbk4xN_X86_SIMD128:
-             nbnxn_kernel_x86_simd128(&nbvg->nbl_lists,
-                                      nbvg->nbat, ic,
-                                      nbvg->ewald_excl,
-                                      fr->shift_vec,
-                                      flags,
-                                      clearF,
-                                      fr->fshift[0],
-                                      enerd->grpp.ener[egCOULSR],
-                                      fr->bBHAM ?
-                                      enerd->grpp.ener[egBHAMSR] :
-                                      enerd->grpp.ener[egLJSR]);
++        case nbnxnk4x4_PlainC:
 +            nbnxn_kernel_ref(&nbvg->nbl_lists,
 +                             nbvg->nbat, ic,
 +                             fr->shift_vec,
 +                             flags,
 +                             clearF,
 +                             fr->fshift[0],
 +                             enerd->grpp.ener[egCOULSR],
 +                             fr->bBHAM ?
 +                             enerd->grpp.ener[egBHAMSR] :
 +                             enerd->grpp.ener[egLJSR]);
 +            break;
 +        
-         case nbk4xN_X86_SIMD256:
-             nbnxn_kernel_x86_simd256(&nbvg->nbl_lists,
-                                      nbvg->nbat, ic,
-                                      nbvg->ewald_excl,
-                                      fr->shift_vec,
-                                      flags,
-                                      clearF,
-                                      fr->fshift[0],
-                                      enerd->grpp.ener[egCOULSR],
-                                      fr->bBHAM ?
-                                      enerd->grpp.ener[egBHAMSR] :
-                                      enerd->grpp.ener[egLJSR]);
++        case nbnxnk4xN_SIMD_4xN:
++            nbnxn_kernel_simd_4xn(&nbvg->nbl_lists,
++                                  nbvg->nbat, ic,
++                                  nbvg->ewald_excl,
++                                  fr->shift_vec,
++                                  flags,
++                                  clearF,
++                                  fr->fshift[0],
++                                  enerd->grpp.ener[egCOULSR],
++                                  fr->bBHAM ?
++                                  enerd->grpp.ener[egBHAMSR] :
++                                  enerd->grpp.ener[egLJSR]);
 +            break;
-         case nbk8x8x8_CUDA:
++        case nbnxnk4xN_SIMD_2xNN:
++            nbnxn_kernel_simd_2xnn(&nbvg->nbl_lists,
++                                   nbvg->nbat, ic,
++                                   nbvg->ewald_excl,
++                                   fr->shift_vec,
++                                   flags,
++                                   clearF,
++                                   fr->fshift[0],
++                                   enerd->grpp.ener[egCOULSR],
++                                   fr->bBHAM ?
++                                   enerd->grpp.ener[egBHAMSR] :
++                                   enerd->grpp.ener[egLJSR]);
 +            break;
 +
-         case nbk8x8x8_PlainC:
++        case nbnxnk8x8x8_CUDA:
 +            nbnxn_cuda_launch_kernel(fr->nbv->cu_nbv, nbvg->nbat, flags, ilocality);
 +            break;
 +
-     if (nbvg->kernel_type != nbk8x8x8_CUDA)
++        case nbnxnk8x8x8_PlainC:
 +            nbnxn_kernel_gpu_ref(nbvg->nbl_lists.nbl[0],
 +                                 nbvg->nbat, ic,
 +                                 fr->shift_vec,
 +                                 flags,
 +                                 clearF,
 +                                 nbvg->nbat->out[0].f,
 +                                 fr->fshift[0],
 +                                 enerd->grpp.ener[egCOULSR],
 +                                 fr->bBHAM ?
 +                                 enerd->grpp.ener[egBHAMSR] :
 +                                 enerd->grpp.ener[egLJSR]);
 +            break;
 +
 +        default:
 +            gmx_incons("Invalid nonbonded kernel type passed!");
 +
 +    }
-     bUseOrEmulGPU = bUseGPU || (nbv->grp[0].kernel_type == nbk8x8x8_PlainC);
++    if (nbvg->kernel_type != nbnxnk8x8x8_CUDA)
 +    {
 +        wallcycle_sub_stop(wcycle, ewcsNONBONDED);
 +    }
 +
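 +    /* Pick the flop counter matching the electrostatics setting */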
 +    if (EEL_RF(ic->eeltype) || ic->eeltype == eelCUT)
 +    {
 +        enr_nbnxn_kernel_ljc = eNR_NBNXN_LJ_RF;
 +    }
 +    else if (nbvg->ewald_excl == ewaldexclTable)
 +    {
 +        enr_nbnxn_kernel_ljc = eNR_NBNXN_LJ_TAB;
 +    }
 +    else
 +    {
 +        enr_nbnxn_kernel_ljc = eNR_NBNXN_LJ_EWALD;
 +    }
 +    enr_nbnxn_kernel_lj = eNR_NBNXN_LJ;
 +    if (flags & GMX_FORCE_ENERGY)
 +    {
 +        /* In eNR_??? the nbnxn F+E kernels are always the F kernel + 1 */
 +        enr_nbnxn_kernel_ljc += 1;
 +        enr_nbnxn_kernel_lj  += 1;
 +    }
 +
 +    inc_nrnb(nrnb,enr_nbnxn_kernel_ljc,
 +             nbvg->nbl_lists.natpair_ljq);
 +    inc_nrnb(nrnb,enr_nbnxn_kernel_lj,
 +             nbvg->nbl_lists.natpair_lj);
 +    inc_nrnb(nrnb,enr_nbnxn_kernel_ljc-eNR_NBNXN_LJ_RF+eNR_NBNXN_RF,
 +             nbvg->nbl_lists.natpair_q);
 +}
 +
 +void do_force_cutsVERLET(FILE *fplog,t_commrec *cr,
 +              t_inputrec *inputrec,
 +              gmx_large_int_t step,t_nrnb *nrnb,gmx_wallcycle_t wcycle,
 +              gmx_localtop_t *top,
 +              gmx_mtop_t *mtop,
 +              gmx_groups_t *groups,
 +              matrix box,rvec x[],history_t *hist,
 +              rvec f[],
 +              tensor vir_force,
 +              t_mdatoms *mdatoms,
 +              gmx_enerdata_t *enerd,t_fcdata *fcd,
 +              real *lambda,t_graph *graph,
 +              t_forcerec *fr, interaction_const_t *ic,
 +              gmx_vsite_t *vsite,rvec mu_tot,
 +              double t,FILE *field,gmx_edsam_t ed,
 +              gmx_bool bBornRadii,
 +              int flags)
 +{
 +    int     cg0,cg1,i,j;
 +    int     start,homenr;
 +    int     nb_kernel_type;
 +    double  mu[2*DIM];
 +    gmx_bool   bSepDVDL,bStateChanged,bNS,bFillGrid,bCalcCGCM,bBS;
 +    gmx_bool   bDoLongRange,bDoForces,bSepLRF,bUseGPU,bUseOrEmulGPU;
 +    gmx_bool   bDiffKernels=FALSE;
 +    matrix  boxs;
 +    rvec    vzero,box_diag;
 +    real    e,v,dvdl;
 +    float  cycles_pme,cycles_force;
 +    nonbonded_verlet_t *nbv;
 +
 +    cycles_force = 0;
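 +    /* Force-buffer usage flags only pay off with multiple output buffers */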
 +    nbv = fr->nbv;
 +    nb_kernel_type = fr->nbv->grp[0].kernel_type;
 +
 +    start  = mdatoms->start;
 +    homenr = mdatoms->homenr;
 +
 +    bSepDVDL = (fr->bSepDVDL && do_per_step(step,inputrec->nstlog));
 +
 +    clear_mat(vir_force);
 +
 +    cg0 = 0;
 +    if (DOMAINDECOMP(cr))
 +    {
 +        cg1 = cr->dd->ncg_tot;
 +    }
 +    else
 +    {
 +        cg1 = top->cgs.nr;
 +    }
 +    if (fr->n_tpi > 0)
 +    {
 +        cg1--;
 +    }
 +
 +    bStateChanged = (flags & GMX_FORCE_STATECHANGED);
 +    bNS           = (flags & GMX_FORCE_NS) && (fr->bAllvsAll==FALSE); 
 +    bFillGrid     = (bNS && bStateChanged);
 +    bCalcCGCM     = (bFillGrid && !DOMAINDECOMP(cr));
 +    bDoLongRange  = (fr->bTwinRange && bNS && (flags & GMX_FORCE_DO_LR));
 +    bDoForces     = (flags & GMX_FORCE_FORCES);
 +    bSepLRF       = (bDoLongRange && bDoForces && (flags & GMX_FORCE_SEPLRF));
 +    bUseGPU       = fr->nbv->bUseGPU;
-             if (nbv->grp[eintNonlocal].kernel_type == nbk8x8x8_CUDA)
++    bUseOrEmulGPU = bUseGPU || (nbv->grp[0].kernel_type == nbnxnk8x8x8_PlainC);
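 +    /* nbnxnk8x8x8_PlainC is the CPU reference kernel emulating the GPU */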
 +
 +    if (bStateChanged)
 +    {
 +        update_forcerec(fplog,fr,box);
 +
 +        if (NEED_MUTOT(*inputrec))
 +        {
 +            /* Calculate total (local) dipole moment in a temporary common array.
 +             * This makes it possible to sum them over nodes faster.
 +             */
 +            calc_mu(start,homenr,
 +                    x,mdatoms->chargeA,mdatoms->chargeB,mdatoms->nChargePerturbed,
 +                    mu,mu+DIM);
 +        }
 +    }
 +
 +    if (fr->ePBC != epbcNONE) { 
 +        /* Compute shift vectors every step,
 +         * because of pressure coupling or box deformation!
 +         */
 +        if ((flags & GMX_FORCE_DYNAMICBOX) && bStateChanged)
 +            calc_shifts(box,fr->shift_vec);
 +
 +        if (bCalcCGCM) { 
 +            put_atoms_in_box_omp(fr->ePBC,box,homenr,x);
 +            inc_nrnb(nrnb,eNR_SHIFTX,homenr);
 +        } 
 +        else if (EI_ENERGY_MINIMIZATION(inputrec->eI) && graph) {
 +            unshift_self(graph,box,x);
 +        }
 +    } 
 +
 +    nbnxn_atomdata_copy_shiftvec(flags & GMX_FORCE_DYNAMICBOX,
 +                                  fr->shift_vec,nbv->grp[0].nbat);
 +
 +#ifdef GMX_MPI
 +    if (!(cr->duty & DUTY_PME)) {
 +        /* Send particle coordinates to the pme nodes.
 +         * Since this is only implemented for domain decomposition
 +         * and domain decomposition does not use the graph,
 +         * we do not need to worry about shifting.
 +         */    
 +
 +        wallcycle_start(wcycle,ewcPP_PMESENDX);
 +
 +        bBS = (inputrec->nwall == 2);
 +        if (bBS) {
 +            copy_mat(box,boxs);
 +            svmul(inputrec->wall_ewald_zfac,boxs[ZZ],boxs[ZZ]);
 +        }
 +
 +        gmx_pme_send_x(cr,bBS ? boxs : box,x,
 +                       mdatoms->nChargePerturbed,lambda[efptCOUL],
 +                       (flags & (GMX_FORCE_VIRIAL | GMX_FORCE_ENERGY)),step);
 +
 +        wallcycle_stop(wcycle,ewcPP_PMESENDX);
 +    }
 +#endif /* GMX_MPI */
 +
 +    /* do gridding for pair search */
 +    if (bNS)
 +    {
 +        if (graph && bStateChanged)
 +        {
 +            /* Calculate intramolecular shift vectors to make molecules whole */
 +            mk_mshift(fplog,graph,fr->ePBC,box,x);
 +        }
 +
 +        clear_rvec(vzero);
 +        box_diag[XX] = box[XX][XX];
 +        box_diag[YY] = box[YY][YY];
 +        box_diag[ZZ] = box[ZZ][ZZ];
 +
 +        wallcycle_start(wcycle,ewcNS);
 +        if (!fr->bDomDec)
 +        {
 +            wallcycle_sub_start(wcycle,ewcsNBS_GRID_LOCAL);
 +            nbnxn_put_on_grid(nbv->nbs,fr->ePBC,box,
 +                              0,vzero,box_diag,
 +                              0,mdatoms->homenr,-1,fr->cginfo,x,
 +                              0,NULL,
 +                              nbv->grp[eintLocal].kernel_type,
 +                              nbv->grp[eintLocal].nbat);
 +            wallcycle_sub_stop(wcycle,ewcsNBS_GRID_LOCAL);
 +        }
 +        else
 +        {
 +            wallcycle_sub_start(wcycle,ewcsNBS_GRID_NONLOCAL);
 +            nbnxn_put_on_grid_nonlocal(nbv->nbs,domdec_zones(cr->dd),
 +                                       fr->cginfo,x,
 +                                       nbv->grp[eintNonlocal].kernel_type,
 +                                       nbv->grp[eintNonlocal].nbat);
 +            wallcycle_sub_stop(wcycle,ewcsNBS_GRID_NONLOCAL);
 +        }
 +
 +        if (nbv->ngrp == 1 ||
 +            nbv->grp[eintNonlocal].nbat == nbv->grp[eintLocal].nbat)
 +        {
 +            nbnxn_atomdata_set(nbv->grp[eintLocal].nbat,eatAll,
 +                                nbv->nbs,mdatoms,fr->cginfo);
 +        }
 +        else
 +        {
 +            nbnxn_atomdata_set(nbv->grp[eintLocal].nbat,eatLocal,
 +                                nbv->nbs,mdatoms,fr->cginfo);
 +            nbnxn_atomdata_set(nbv->grp[eintNonlocal].nbat,eatAll,
 +                                nbv->nbs,mdatoms,fr->cginfo);
 +        }
 +        wallcycle_stop(wcycle, ewcNS);
 +    }
 +
 +    /* initialize the GPU atom data and copy shift vector */
 +    if (bUseGPU)
 +    {
 +        if (bNS)
 +        {
 +            wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU_NB);
 +            nbnxn_cuda_init_atomdata(nbv->cu_nbv, nbv->grp[eintLocal].nbat);
 +            wallcycle_stop(wcycle, ewcLAUNCH_GPU_NB);
 +        }
 +
 +        wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU_NB);
 +        nbnxn_cuda_upload_shiftvec(nbv->cu_nbv, nbv->grp[eintLocal].nbat);
 +        wallcycle_stop(wcycle, ewcLAUNCH_GPU_NB);
 +    }
 +
 +    /* do local pair search */
 +    if (bNS)
 +    {
 +        wallcycle_start_nocount(wcycle,ewcNS);
 +        wallcycle_sub_start(wcycle,ewcsNBS_SEARCH_LOCAL);
 +        nbnxn_make_pairlist(nbv->nbs,nbv->grp[eintLocal].nbat,
 +                            &top->excls,
 +                            ic->rlist,
 +                            nbv->min_ci_balanced,
 +                            &nbv->grp[eintLocal].nbl_lists,
 +                            eintLocal,
 +                            nbv->grp[eintLocal].kernel_type,
 +                            nrnb);
 +        wallcycle_sub_stop(wcycle,ewcsNBS_SEARCH_LOCAL);
 +
 +        if (bUseGPU)
 +        {
 +            /* initialize local pair-list on the GPU */
 +            nbnxn_cuda_init_pairlist(nbv->cu_nbv,
 +                                     nbv->grp[eintLocal].nbl_lists.nbl[0],
 +                                     eintLocal);
 +        }
 +        wallcycle_stop(wcycle, ewcNS);
 +    }
 +    else
 +    {
 +        wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
 +        wallcycle_sub_start(wcycle, ewcsNB_X_BUF_OPS);
 +        nbnxn_atomdata_copy_x_to_nbat_x(nbv->nbs,eatLocal,FALSE,x,
 +                                        nbv->grp[eintLocal].nbat);
 +        wallcycle_sub_stop(wcycle, ewcsNB_X_BUF_OPS);
 +        wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS);
 +    }
 +
 +    if (bUseGPU)
 +    {
 +        wallcycle_start(wcycle,ewcLAUNCH_GPU_NB);
 +        /* launch local nonbonded F on GPU */
 +        do_nb_verlet(fr, ic, enerd, flags, eintLocal, enbvClearFNo,
 +                     nrnb, wcycle);
 +        wallcycle_stop(wcycle,ewcLAUNCH_GPU_NB);
 +    }
 +
 +    /* Communicate coordinates and sum dipole if necessary + 
 +       do non-local pair search */
 +    if (DOMAINDECOMP(cr))
 +    {
 +        bDiffKernels = (nbv->grp[eintNonlocal].kernel_type !=
 +                        nbv->grp[eintLocal].kernel_type);
 +
 +        if (bDiffKernels)
 +        {
 +            /* With GPU+CPU non-bonded calculations we need to copy
 +             * the local coordinates to the non-local nbat struct
 +             * (in CPU format) as the non-local kernel call also
 +             * calculates the local - non-local interactions.
 +             */
 +            wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
 +            wallcycle_sub_start(wcycle, ewcsNB_X_BUF_OPS);
 +            nbnxn_atomdata_copy_x_to_nbat_x(nbv->nbs,eatLocal,TRUE,x,
 +                                             nbv->grp[eintNonlocal].nbat);
 +            wallcycle_sub_stop(wcycle, ewcsNB_X_BUF_OPS);
 +            wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS);
 +        }
 +
 +        if (bNS)
 +        {
 +            wallcycle_start_nocount(wcycle,ewcNS);
 +            wallcycle_sub_start(wcycle,ewcsNBS_SEARCH_NONLOCAL);
 +
 +            if (bDiffKernels)
 +            {
 +                nbnxn_grid_add_simple(nbv->nbs,nbv->grp[eintNonlocal].nbat);
 +            }
 +
 +            nbnxn_make_pairlist(nbv->nbs,nbv->grp[eintNonlocal].nbat,
 +                                &top->excls,
 +                                ic->rlist,
 +                                nbv->min_ci_balanced,
 +                                &nbv->grp[eintNonlocal].nbl_lists,
 +                                eintNonlocal,
 +                                nbv->grp[eintNonlocal].kernel_type,
 +                                nrnb);
 +
 +            wallcycle_sub_stop(wcycle,ewcsNBS_SEARCH_NONLOCAL);
 +
++            if (nbv->grp[eintNonlocal].kernel_type == nbnxnk8x8x8_CUDA)
 +            {
 +                /* initialize non-local pair-list on the GPU */
 +                nbnxn_cuda_init_pairlist(nbv->cu_nbv,
 +                                         nbv->grp[eintNonlocal].nbl_lists.nbl[0],
 +                                         eintNonlocal);
 +            }
 +            wallcycle_stop(wcycle,ewcNS);
 +        } 
 +        else
 +        {
 +            wallcycle_start(wcycle,ewcMOVEX);
 +            dd_move_x(cr->dd,box,x);
 +
 +            /* When we don't need the total dipole we sum it in global_stat */
 +            if (bStateChanged && NEED_MUTOT(*inputrec))
 +            {
 +                gmx_sumd(2*DIM,mu,cr);
 +            }
 +            wallcycle_stop(wcycle,ewcMOVEX);
 +
 +            wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
 +            wallcycle_sub_start(wcycle, ewcsNB_X_BUF_OPS);
 +            nbnxn_atomdata_copy_x_to_nbat_x(nbv->nbs,eatNonlocal,FALSE,x,
 +                                            nbv->grp[eintNonlocal].nbat);
 +            wallcycle_sub_stop(wcycle, ewcsNB_X_BUF_OPS);
 +            cycles_force += wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS);
 +        }
 +
 +        if (bUseGPU && !bDiffKernels)
 +        { 
 +            wallcycle_start(wcycle,ewcLAUNCH_GPU_NB);
 +            /* launch non-local nonbonded F on GPU */
 +            do_nb_verlet(fr, ic, enerd, flags, eintNonlocal, enbvClearFNo,
 +                         nrnb, wcycle);
 +            cycles_force += wallcycle_stop(wcycle,ewcLAUNCH_GPU_NB);
 +        }
 +    }
 +
 +    if (bUseGPU)
 +    {
 +        /* launch D2H copy-back F */
 +        wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU_NB);
 +        if (DOMAINDECOMP(cr) && !bDiffKernels)
 +        {
 +            nbnxn_cuda_launch_cpyback(nbv->cu_nbv, nbv->grp[eintNonlocal].nbat,
 +                                      flags, eatNonlocal);
 +        }
 +        nbnxn_cuda_launch_cpyback(nbv->cu_nbv, nbv->grp[eintLocal].nbat,
 +                                  flags, eatLocal);
 +        cycles_force += wallcycle_stop(wcycle,ewcLAUNCH_GPU_NB);
 +    }
 +
 +    if (bStateChanged && NEED_MUTOT(*inputrec))
 +    {
 +        if (PAR(cr))
 +        {
 +            gmx_sumd(2*DIM,mu,cr);
 +        } 
 +
 +        for(i=0; i<2; i++)
 +        {
 +            for(j=0;j<DIM;j++)
 +            {
 +                fr->mu_tot[i][j] = mu[i*DIM + j];
 +            }
 +        }
 +    }
 +    if (fr->efep == efepNO)
 +    {
 +        copy_rvec(fr->mu_tot[0],mu_tot);
 +    }
 +    else
 +    {
 +        for(j=0; j<DIM; j++)
 +        {
 +            mu_tot[j] =
 +                (1.0 - lambda[efptCOUL])*fr->mu_tot[0][j] +
 +                lambda[efptCOUL]*fr->mu_tot[1][j];
 +        }
 +    }
 +
 +    /* Reset energies */
 +    reset_enerdata(&(inputrec->opts),fr,bNS,enerd,MASTER(cr));
 +    clear_rvecs(SHIFTS,fr->fshift);
 +
 +    if (DOMAINDECOMP(cr))
 +    {
 +        if (!(cr->duty & DUTY_PME))
 +        {
 +            wallcycle_start(wcycle,ewcPPDURINGPME);
 +            dd_force_flop_start(cr->dd,nrnb);
 +        }
 +    }
 +    
 +    /* Start the force cycle counter.
 +     * This counter is stopped in do_forcelow_level.
 +     * No parallel communication should occur while this counter is running,
 +     * since that will interfere with the dynamic load balancing.
 +     */
 +    wallcycle_start(wcycle,ewcFORCE);
 +    if (bDoForces)
 +    {
 +        /* Reset forces for which the virial is calculated separately:
 +         * PME/Ewald forces if necessary */
 +        if (fr->bF_NoVirSum) 
 +        {
 +            if (flags & GMX_FORCE_VIRIAL)
 +            {
 +                fr->f_novirsum = fr->f_novirsum_alloc;
 +                if (fr->bDomDec)
 +                {
 +                    clear_rvecs(fr->f_novirsum_n,fr->f_novirsum);
 +                }
 +                else
 +                {
 +                    clear_rvecs(homenr,fr->f_novirsum+start);
 +                }
 +            }
 +            else
 +            {
 +                /* We are not calculating the pressure so we do not need
 +                 * a separate array for forces that do not contribute
 +                 * to the pressure.
 +                 */
 +                fr->f_novirsum = f;
 +            }
 +        }
 +
 +        /* Clear the short- and long-range forces */
 +        clear_rvecs(fr->natoms_force_constr,f);
 +        if(bSepLRF && do_per_step(step,inputrec->nstcalclr))
 +        {
 +            clear_rvecs(fr->natoms_force_constr,fr->f_twin);
 +        }
 +        
 +        clear_rvec(fr->vir_diag_posres);
 +    }
 +    if (inputrec->ePull == epullCONSTRAINT)
 +    {
 +        clear_pull_forces(inputrec->pull);
 +    }
 +
 +    /* update QMMMrec, if necessary */
 +    if(fr->bQMMM)
 +    {
 +        update_QMMMrec(cr,fr,x,mdatoms,box,top);
 +    }
 +
 +    if ((flags & GMX_FORCE_BONDED) && top->idef.il[F_POSRES].nr > 0)
 +    {
 +        posres_wrapper(fplog,flags,bSepDVDL,inputrec,nrnb,top,box,x,
 +                       f,enerd,lambda,fr);
 +    }
 +
 +    /* Compute the bonded and non-bonded energies and optionally forces */    
 +    do_force_lowlevel(fplog,step,fr,inputrec,&(top->idef),
 +                      cr,nrnb,wcycle,mdatoms,&(inputrec->opts),
 +                      x,hist,f, bSepLRF ? fr->f_twin : f,enerd,fcd,mtop,top,fr->born,
 +                      &(top->atomtypes),bBornRadii,box,
 +                      inputrec->fepvals,lambda,graph,&(top->excls),fr->mu_tot,
 +                      flags, &cycles_pme);
 +
 +    if(bSepLRF)
 +    {
 +        if (do_per_step(step,inputrec->nstcalclr))
 +        {
 +            /* Add the long range forces to the short range forces */
 +            for(i=0; i<fr->natoms_force_constr; i++)
 +            {
 +                rvec_add(fr->f_twin[i],f[i],f[i]);
 +            }
 +        }
 +    }
 +    
 +    if (!bUseOrEmulGPU)
 +    {
 +        /* Maybe we should move this into do_force_lowlevel */
 +        do_nb_verlet(fr, ic, enerd, flags, eintLocal, enbvClearFYes,
 +                     nrnb, wcycle);
 +    }
 +        
 +
 +    if (!bUseOrEmulGPU || bDiffKernels)
 +    {
 +        int aloc;
 +
 +        if (DOMAINDECOMP(cr))
 +        {
 +            do_nb_verlet(fr, ic, enerd, flags, eintNonlocal,
 +                         bDiffKernels ? enbvClearFYes : enbvClearFNo,
 +                         nrnb, wcycle);
 +        }
 +
 +        if (!bUseOrEmulGPU)
 +        {
 +            aloc = eintLocal;
 +        }
 +        else
 +        {
 +            aloc = eintNonlocal;
 +        }
 +
 +        /* Add all the non-bonded forces to the normal force array.
 +         * This can be split into a local and a non-local part when
 +         * overlapping communication with computation under domain decomposition.
 +         */
 +        cycles_force += wallcycle_stop(wcycle,ewcFORCE);
 +        wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
 +        wallcycle_sub_start(wcycle, ewcsNB_F_BUF_OPS);
 +        nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs,eatAll,nbv->grp[aloc].nbat,f);
 +        wallcycle_sub_stop(wcycle, ewcsNB_F_BUF_OPS);
 +        cycles_force += wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS);
 +        wallcycle_start_nocount(wcycle,ewcFORCE);
 +
 +        /* if there are multiple fshift output buffers reduce them */
 +        if ((flags & GMX_FORCE_VIRIAL) &&
 +            nbv->grp[aloc].nbl_lists.nnbl > 1)
 +        {
 +            nbnxn_atomdata_add_nbat_fshift_to_fshift(nbv->grp[aloc].nbat,
 +                                                      fr->fshift);
 +        }
 +    }
 +    
 +    cycles_force += wallcycle_stop(wcycle,ewcFORCE);
 +    
 +    if (ed)
 +    {
 +        do_flood(fplog,cr,x,f,ed,box,step,bNS);
 +    }
 +
 +    if (bUseOrEmulGPU && !bDiffKernels)
 +    {
 +        /* wait for non-local forces (or calculate in emulation mode) */
 +        if (DOMAINDECOMP(cr))
 +        {
 +            if (bUseGPU)
 +            {
 +                wallcycle_start(wcycle,ewcWAIT_GPU_NB_NL);
 +                nbnxn_cuda_wait_gpu(nbv->cu_nbv,
 +                                    nbv->grp[eintNonlocal].nbat,
 +                                    flags, eatNonlocal,
 +                                    enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
 +                                    fr->fshift);
 +                cycles_force += wallcycle_stop(wcycle,ewcWAIT_GPU_NB_NL);
 +            }
 +            else
 +            {
 +                wallcycle_start_nocount(wcycle,ewcFORCE);
 +                do_nb_verlet(fr, ic, enerd, flags, eintNonlocal, enbvClearFYes,
 +                             nrnb, wcycle);
 +                cycles_force += wallcycle_stop(wcycle,ewcFORCE);
 +            }            
 +            wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
 +            wallcycle_sub_start(wcycle, ewcsNB_F_BUF_OPS);
 +            /* skip the reduction if there was no non-local work to do */
 +            if (nbv->grp[eintNonlocal].nbl_lists.nbl[0]->nsci > 0)
 +            {
 +                nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs,eatNonlocal,
 +                                               nbv->grp[eintNonlocal].nbat,f);
 +            }
 +            wallcycle_sub_stop(wcycle, ewcsNB_F_BUF_OPS);
 +            cycles_force += wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS);
 +        }
 +    }
 +
 +    if (bDoForces)
 +    {
 +        /* Communicate the forces */
 +        if (PAR(cr))
 +        {
 +            wallcycle_start(wcycle,ewcMOVEF);
 +            if (DOMAINDECOMP(cr))
 +            {
 +                dd_move_f(cr->dd,f,fr->fshift);
 +                /* Do we need to communicate the separate force array
 +                 * for terms that do not contribute to the single sum virial?
 +                 * Position restraints and electric fields do not introduce
 +                 * inter-cg forces, only full electrostatics methods do.
 +                 * When we do not calculate the virial, fr->f_novirsum = f,
 +                 * so we have already communicated these forces.
 +                 */
 +                if (EEL_FULL(fr->eeltype) && cr->dd->n_intercg_excl &&
 +                    (flags & GMX_FORCE_VIRIAL))
 +                {
 +                    dd_move_f(cr->dd,fr->f_novirsum,NULL);
 +                }
 +                if (bSepLRF)
 +                {
 +                    /* We should not update the shift forces here,
 +                     * since f_twin is already included in f.
 +                     */
 +                    dd_move_f(cr->dd,fr->f_twin,NULL);
 +                }
 +            }
 +            wallcycle_stop(wcycle,ewcMOVEF);
 +        }
 +    }
 + 
 +    if (bUseOrEmulGPU)
 +    {
 +        /* wait for local forces (or calculate in emulation mode) */
 +        if (bUseGPU)
 +        {
 +            wallcycle_start(wcycle,ewcWAIT_GPU_NB_L);
 +            nbnxn_cuda_wait_gpu(nbv->cu_nbv,
 +                                nbv->grp[eintLocal].nbat,
 +                                flags, eatLocal,
 +                                enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
 +                                fr->fshift);
 +            wallcycle_stop(wcycle,ewcWAIT_GPU_NB_L);
 +
 +            /* now clear the GPU outputs while we finish the step on the CPU */
 +            nbnxn_cuda_clear_outputs(nbv->cu_nbv, flags);
 +        }
 +        else
 +        {            
 +            wallcycle_start_nocount(wcycle,ewcFORCE);
 +            do_nb_verlet(fr, ic, enerd, flags, eintLocal,
 +                         DOMAINDECOMP(cr) ? enbvClearFNo : enbvClearFYes,
 +                         nrnb, wcycle);
 +            wallcycle_stop(wcycle,ewcFORCE);
 +        }
 +        wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
 +        wallcycle_sub_start(wcycle, ewcsNB_F_BUF_OPS);
 +        if (nbv->grp[eintLocal].nbl_lists.nbl[0]->nsci > 0)
 +        {
 +            /* skip the reduction if there was no local work to do */
 +            nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs,eatLocal,
 +                                           nbv->grp[eintLocal].nbat,f);
 +        }
 +        wallcycle_sub_stop(wcycle, ewcsNB_F_BUF_OPS);
 +        wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS);
 +    }
 +    
 +    if (DOMAINDECOMP(cr))
 +    {
 +        dd_force_flop_stop(cr->dd,nrnb);
 +        if (wcycle)
 +        {
 +            dd_cycles_add(cr->dd,cycles_force-cycles_pme,ddCyclF);
 +        }
 +    }
 +
 +    if (bDoForces)
 +    {
 +        if (IR_ELEC_FIELD(*inputrec))
 +        {
 +            /* Compute forces due to electric field */
 +            calc_f_el(MASTER(cr) ? field : NULL,
 +                      start,homenr,mdatoms->chargeA,x,fr->f_novirsum,
 +                      inputrec->ex,inputrec->et,t);
 +        }
 +
 +        /* If we have NoVirSum forces, but we do not calculate the virial,
 +         * we sum fr->f_novirsum = f later.
 +         */
 +        if (vsite && !(fr->bF_NoVirSum && !(flags & GMX_FORCE_VIRIAL)))
 +        {
 +            wallcycle_start(wcycle,ewcVSITESPREAD);
 +            spread_vsite_f(fplog,vsite,x,f,fr->fshift,FALSE,NULL,nrnb,
 +                           &top->idef,fr->ePBC,fr->bMolPBC,graph,box,cr);
 +            wallcycle_stop(wcycle,ewcVSITESPREAD);
 +
 +            if (bSepLRF)
 +            {
 +                wallcycle_start(wcycle,ewcVSITESPREAD);
 +                spread_vsite_f(fplog,vsite,x,fr->f_twin,NULL,FALSE,NULL,
 +                               nrnb,
 +                               &top->idef,fr->ePBC,fr->bMolPBC,graph,box,cr);
 +                wallcycle_stop(wcycle,ewcVSITESPREAD);
 +            }
 +        }
 +
 +        if (flags & GMX_FORCE_VIRIAL)
 +        {
 +            /* Calculation of the virial must be done after vsites! */
 +            calc_virial(fplog,mdatoms->start,mdatoms->homenr,x,f,
 +                        vir_force,graph,box,nrnb,fr,inputrec->ePBC);
 +        }
 +    }
 +
 +    if (inputrec->ePull == epullUMBRELLA || inputrec->ePull == epullCONST_F)
 +    {
 +        pull_potential_wrapper(fplog,bSepDVDL,cr,inputrec,box,x,
 +                               f,vir_force,mdatoms,enerd,lambda,t);
 +    }
 +
 +    if (PAR(cr) && !(cr->duty & DUTY_PME))
 +    {
 +        /* In case of node-splitting, the PP nodes receive the long-range 
 +         * forces, virial and energy from the PME nodes here.
 +         */    
 +        pme_receive_force_ener(fplog,bSepDVDL,cr,wcycle,enerd,fr);
 +    }
 +
 +    if (bDoForces)
 +    {
 +        post_process_forces(fplog,cr,step,nrnb,wcycle,
 +                            top,box,x,f,vir_force,mdatoms,graph,fr,vsite,
 +                            flags);
 +    }
 +    
 +    /* Sum the potential energy terms from group contributions */
 +    sum_epot(&(inputrec->opts),&(enerd->grpp),enerd->term);
 +}
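
The Verlet-scheme path above overlaps asynchronous GPU work with CPU force computation, bracketing each phase with wallcycle counters so that only compute time feeds the dynamic load balancer. The standalone sketch below illustrates that launch/compute/wait pattern; the stub functions and the toy cycle counter are hypothetical stand-ins for the nbnxn_cuda_* and wallcycle_* calls, not the real API.

    #include <stdio.h>

    /* Hypothetical stand-ins for the nbnxn_cuda_* and wallcycle_* calls. */
    static void   gpu_launch_nonbonded(void) { /* asynchronous kernel launch   */ }
    static void   cpu_compute_bonded(void)   { /* bonded forces on the CPU     */ }
    static void   gpu_wait_and_reduce(void)  { /* block until D2H copy is done */ }
    static double cycles_now(void)           { static double t; return t += 1.0; }

    int main(void)
    {
        double cycles_force = 0.0, t0;

        t0 = cycles_now();
        gpu_launch_nonbonded();     /* returns immediately                   */
        cycles_force += cycles_now() - t0;

        t0 = cycles_now();
        cpu_compute_bonded();       /* overlaps with the running GPU kernel  */
        cycles_force += cycles_now() - t0;

        t0 = cycles_now();
        gpu_wait_and_reduce();      /* only the wait time is attributed here */
        cycles_force += cycles_now() - t0;

        printf("force cycles: %g\n", cycles_force);
        return 0;
    }

The essential point is that the launch call returns immediately, so the bonded CPU work is charged its true cost while the GPU kernel runs in the background.
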
 +
 +void do_force_cutsGROUP(FILE *fplog,t_commrec *cr,
 +              t_inputrec *inputrec,
 +              gmx_large_int_t step,t_nrnb *nrnb,gmx_wallcycle_t wcycle,
 +              gmx_localtop_t *top,
 +              gmx_mtop_t *mtop,
 +              gmx_groups_t *groups,
 +              matrix box,rvec x[],history_t *hist,
 +              rvec f[],
 +              tensor vir_force,
 +              t_mdatoms *mdatoms,
 +              gmx_enerdata_t *enerd,t_fcdata *fcd,
 +              real *lambda,t_graph *graph,
 +              t_forcerec *fr,gmx_vsite_t *vsite,rvec mu_tot,
 +              double t,FILE *field,gmx_edsam_t ed,
 +              gmx_bool bBornRadii,
 +              int flags)
 +{
 +    int    cg0,cg1,i,j;
 +    int    start,homenr;
 +    double mu[2*DIM];
 +    gmx_bool   bSepDVDL,bStateChanged,bNS,bFillGrid,bCalcCGCM,bBS;
 +    gmx_bool   bDoLongRangeNS,bDoForces,bDoPotential,bSepLRF;
 +    gmx_bool   bDoAdressWF;
 +    matrix boxs;
 +    rvec   vzero,box_diag;
 +    real   e,v,dvdlambda[efptNR];
 +    t_pbc  pbc;
 +    float  cycles_pme,cycles_force;
 +
 +    start  = mdatoms->start;
 +    homenr = mdatoms->homenr;
 +
 +    bSepDVDL = (fr->bSepDVDL && do_per_step(step,inputrec->nstlog));
 +
 +    clear_mat(vir_force);
 +
 +    if (PARTDECOMP(cr))
 +    {
 +        pd_cg_range(cr,&cg0,&cg1);
 +    }
 +    else
 +    {
 +        cg0 = 0;
 +        if (DOMAINDECOMP(cr))
 +        {
 +            cg1 = cr->dd->ncg_tot;
 +        }
 +        else
 +        {
 +            cg1 = top->cgs.nr;
 +        }
 +        if (fr->n_tpi > 0)
 +        {
 +            cg1--;
 +        }
 +    }
 +
 +    bStateChanged  = (flags & GMX_FORCE_STATECHANGED);
 +    bNS            = (flags & GMX_FORCE_NS) && (fr->bAllvsAll==FALSE);
 +    /* Should we update the long-range neighborlists at this step? */
 +    bDoLongRangeNS = fr->bTwinRange && bNS;
 +    /* Should we put the charge groups on the neighbor-search grid this step? */
 +    bFillGrid      = (bNS && bStateChanged);
 +    bCalcCGCM      = (bFillGrid && !DOMAINDECOMP(cr));
 +    bDoForces      = (flags & GMX_FORCE_FORCES);
 +    bDoPotential   = (flags & GMX_FORCE_ENERGY);
 +    bSepLRF        = ((inputrec->nstcalclr>1) && bDoForces &&
 +                      (flags & GMX_FORCE_SEPLRF) && (flags & GMX_FORCE_DO_LR));
 +
 +    /* should probably move this to the forcerec since it doesn't change */
 +    bDoAdressWF   = ((fr->adress_type!=eAdressOff));
 +
 +    if (bStateChanged)
 +    {
 +        update_forcerec(fplog,fr,box);
 +
 +        if (NEED_MUTOT(*inputrec))
 +        {
 +            /* Calculate total (local) dipole moment in a temporary common array.
 +             * This makes it possible to sum them over nodes faster.
 +             */
 +            calc_mu(start,homenr,
 +                    x,mdatoms->chargeA,mdatoms->chargeB,mdatoms->nChargePerturbed,
 +                    mu,mu+DIM);
 +        }
 +    }
 +
 +    if (fr->ePBC != epbcNONE) { 
 +        /* Compute shift vectors every step,
 +         * because of pressure coupling or box deformation!
 +         */
 +        if ((flags & GMX_FORCE_DYNAMICBOX) && bStateChanged)
 +            calc_shifts(box,fr->shift_vec);
 +
 +        if (bCalcCGCM) { 
 +            put_charge_groups_in_box(fplog,cg0,cg1,fr->ePBC,box,
 +                    &(top->cgs),x,fr->cg_cm);
 +            inc_nrnb(nrnb,eNR_CGCM,homenr);
 +            inc_nrnb(nrnb,eNR_RESETX,cg1-cg0);
 +        } 
 +        else if (EI_ENERGY_MINIMIZATION(inputrec->eI) && graph) {
 +            unshift_self(graph,box,x);
 +        }
 +    } 
 +    else if (bCalcCGCM) {
 +        calc_cgcm(fplog,cg0,cg1,&(top->cgs),x,fr->cg_cm);
 +        inc_nrnb(nrnb,eNR_CGCM,homenr);
 +    }
 +
 +    if (bCalcCGCM) {
 +        if (PAR(cr)) {
 +            move_cgcm(fplog,cr,fr->cg_cm);
 +        }
 +        if (gmx_debug_at)
 +            pr_rvecs(debug,0,"cgcm",fr->cg_cm,top->cgs.nr);
 +    }
 +
 +#ifdef GMX_MPI
 +    if (!(cr->duty & DUTY_PME)) {
 +        /* Send particle coordinates to the pme nodes.
 +         * Since this is only implemented for domain decomposition
 +         * and domain decomposition does not use the graph,
 +         * we do not need to worry about shifting.
 +         */    
 +
 +        wallcycle_start(wcycle,ewcPP_PMESENDX);
 +
 +        bBS = (inputrec->nwall == 2);
 +        if (bBS) {
 +            copy_mat(box,boxs);
 +            svmul(inputrec->wall_ewald_zfac,boxs[ZZ],boxs[ZZ]);
 +        }
 +
 +        gmx_pme_send_x(cr,bBS ? boxs : box,x,
 +                       mdatoms->nChargePerturbed,lambda[efptCOUL],
 +                       (flags & (GMX_FORCE_VIRIAL | GMX_FORCE_ENERGY)),step);
 +
 +        wallcycle_stop(wcycle,ewcPP_PMESENDX);
 +    }
 +#endif /* GMX_MPI */
 +
 +    /* Communicate coordinates and sum dipole if necessary */
 +    if (PAR(cr))
 +    {
 +        wallcycle_start(wcycle,ewcMOVEX);
 +        if (DOMAINDECOMP(cr))
 +        {
 +            dd_move_x(cr->dd,box,x);
 +        }
 +        else
 +        {
 +            move_x(fplog,cr,GMX_LEFT,GMX_RIGHT,x,nrnb);
 +        }
 +        wallcycle_stop(wcycle,ewcMOVEX);
 +    }
 +
 +    /* update adress weight beforehand */
 +    if(bStateChanged && bDoAdressWF)
 +    {
 +        /* need pbc for adress weight calculation with pbc_dx */
 +        set_pbc(&pbc,inputrec->ePBC,box);
 +        if(fr->adress_site == eAdressSITEcog)
 +        {
 +            update_adress_weights_cog(top->idef.iparams,top->idef.il,x,fr,mdatoms,
 +                                      inputrec->ePBC==epbcNONE ? NULL : &pbc);
 +        }
 +        else if (fr->adress_site == eAdressSITEcom)
 +        {
 +            update_adress_weights_com(fplog,cg0,cg1,&(top->cgs),x,fr,mdatoms,
 +                                      inputrec->ePBC==epbcNONE ? NULL : &pbc);
 +        }
 +        else if (fr->adress_site == eAdressSITEatomatom){
 +            update_adress_weights_atom_per_atom(cg0,cg1,&(top->cgs),x,fr,mdatoms,
 +                                                inputrec->ePBC==epbcNONE ? NULL : &pbc);
 +        }
 +        else
 +        {
 +            update_adress_weights_atom(cg0,cg1,&(top->cgs),x,fr,mdatoms,
 +                                       inputrec->ePBC==epbcNONE ? NULL : &pbc);
 +        }
 +    }
 +
 +    if (NEED_MUTOT(*inputrec))
 +    {
 +
 +        if (bStateChanged)
 +        {
 +            if (PAR(cr))
 +            {
 +                gmx_sumd(2*DIM,mu,cr);
 +            }
 +            for(i=0; i<2; i++)
 +            {
 +                for(j=0;j<DIM;j++)
 +                {
 +                    fr->mu_tot[i][j] = mu[i*DIM + j];
 +                }
 +            }
 +        }
 +        if (fr->efep == efepNO)
 +        {
 +            copy_rvec(fr->mu_tot[0],mu_tot);
 +        }
 +        else
 +        {
 +            for(j=0; j<DIM; j++)
 +            {
 +                mu_tot[j] =
 +                    (1.0 - lambda[efptCOUL])*fr->mu_tot[0][j] + lambda[efptCOUL]*fr->mu_tot[1][j];
 +            }
 +        }
 +    }
 +
 +    /* Reset energies */
 +    reset_enerdata(&(inputrec->opts),fr,bNS,enerd,MASTER(cr));
 +    clear_rvecs(SHIFTS,fr->fshift);
 +
 +    if (bNS)
 +    {
 +        wallcycle_start(wcycle,ewcNS);
 +
 +        if (graph && bStateChanged)
 +        {
 +            /* Calculate intramolecular shift vectors to make molecules whole */
 +            mk_mshift(fplog,graph,fr->ePBC,box,x);
 +        }
 +
 +        /* Do the actual neighbour searching and, if twin-range electrostatics
 +         * are in use, also the calculation of long-range forces and energies.
 +         */
 +        for (i=0;i<efptNR;i++) {dvdlambda[i] = 0;}
 +        ns(fplog,fr,x,box,
 +           groups,&(inputrec->opts),top,mdatoms,
 +           cr,nrnb,lambda,dvdlambda,&enerd->grpp,bFillGrid,
 +           bDoLongRangeNS);
 +        if (bSepDVDL)
 +        {
 +            fprintf(fplog,sepdvdlformat,"LR non-bonded",0.0,dvdlambda);
 +        }
 +        enerd->dvdl_lin[efptVDW] += dvdlambda[efptVDW];
 +        enerd->dvdl_lin[efptCOUL] += dvdlambda[efptCOUL];
 +
 +        wallcycle_stop(wcycle,ewcNS);
 +    }
 +
 +    if (inputrec->implicit_solvent && bNS)
 +    {
 +        make_gb_nblist(cr,inputrec->gb_algorithm,inputrec->rlist,
 +                       x,box,fr,&top->idef,graph,fr->born);
 +    }
 +
 +    if (DOMAINDECOMP(cr))
 +    {
 +        if (!(cr->duty & DUTY_PME))
 +        {
 +            wallcycle_start(wcycle,ewcPPDURINGPME);
 +            dd_force_flop_start(cr->dd,nrnb);
 +        }
 +    }
 +
 +    if (inputrec->bRot)
 +    {
 +        /* Enforced rotation has its own cycle counter that starts after the collective
 +         * coordinates have been communicated. It is added to ddCyclF to allow
 +         * for proper load-balancing */
 +        wallcycle_start(wcycle,ewcROT);
 +        do_rotation(cr,inputrec,box,x,t,step,wcycle,bNS);
 +        wallcycle_stop(wcycle,ewcROT);
 +    }
 +
 +    /* Start the force cycle counter.
 +     * This counter is stopped in do_forcelow_level.
 +     * No parallel communication should occur while this counter is running,
 +     * since that will interfere with the dynamic load balancing.
 +     */
 +    wallcycle_start(wcycle,ewcFORCE);
 +    
 +    if (bDoForces)
 +    {
 +        /* Reset forces for which the virial is calculated separately:
 +         * PME/Ewald forces if necessary */
 +        if (fr->bF_NoVirSum)
 +        {
 +            if (flags & GMX_FORCE_VIRIAL)
 +            {
 +                fr->f_novirsum = fr->f_novirsum_alloc;
 +                if (fr->bDomDec)
 +                {
 +                    clear_rvecs(fr->f_novirsum_n,fr->f_novirsum);
 +                }
 +                else
 +                {
 +                    clear_rvecs(homenr,fr->f_novirsum+start);
 +                }
 +            }
 +            else
 +            {
 +                /* We are not calculating the pressure so we do not need
 +                 * a separate array for forces that do not contribute
 +                 * to the pressure.
 +                 */
 +                fr->f_novirsum = f;
 +            }
 +        }
 +
 +        /* Clear the short- and long-range forces */
 +        clear_rvecs(fr->natoms_force_constr,f);
 +        if(bSepLRF && do_per_step(step,inputrec->nstcalclr))
 +        {
 +            clear_rvecs(fr->natoms_force_constr,fr->f_twin);
 +        }
 +        
 +        clear_rvec(fr->vir_diag_posres);
 +    }
 +    if (inputrec->ePull == epullCONSTRAINT)
 +    {
 +        clear_pull_forces(inputrec->pull);
 +    }
 +
 +    /* update QMMMrec, if necessary */
 +    if(fr->bQMMM)
 +    {
 +        update_QMMMrec(cr,fr,x,mdatoms,box,top);
 +    }
 +
 +    if ((flags & GMX_FORCE_BONDED) && top->idef.il[F_POSRES].nr > 0)
 +    {
 +        posres_wrapper(fplog,flags,bSepDVDL,inputrec,nrnb,top,box,x,
 +                       f,enerd,lambda,fr);
 +    }
 +
 +    if ((flags & GMX_FORCE_BONDED) && top->idef.il[F_FBPOSRES].nr > 0)
 +    {
 +        /* Flat-bottomed position restraints always require full pbc */
 +        if(!(bStateChanged && bDoAdressWF))
 +        {
 +            set_pbc(&pbc,inputrec->ePBC,box);
 +        }
 +        v = fbposres(top->idef.il[F_FBPOSRES].nr,top->idef.il[F_FBPOSRES].iatoms,
 +                     top->idef.iparams_fbposres,
 +                     (const rvec*)x,fr->f_novirsum,fr->vir_diag_posres,
 +                     inputrec->ePBC==epbcNONE ? NULL : &pbc,
 +                     fr->rc_scaling,fr->ePBC,fr->posres_com);
 +        enerd->term[F_FBPOSRES] += v;
 +        inc_nrnb(nrnb,eNR_FBPOSRES,top->idef.il[F_FBPOSRES].nr/2);
 +    }
 +
 +    /* Compute the bonded and non-bonded energies and optionally forces */
 +    do_force_lowlevel(fplog,step,fr,inputrec,&(top->idef),
 +                      cr,nrnb,wcycle,mdatoms,&(inputrec->opts),
 +                      x,hist,f, bSepLRF ? fr->f_twin : f,enerd,fcd,mtop,top,fr->born,
 +                      &(top->atomtypes),bBornRadii,box,
 +                      inputrec->fepvals,lambda,
 +                      graph,&(top->excls),fr->mu_tot,
 +                      flags,
 +                      &cycles_pme);
 +
 +    if(bSepLRF)
 +    {
 +        if (do_per_step(step,inputrec->nstcalclr))
 +        {
 +            /* Add the long range forces to the short range forces */
 +            for(i=0; i<fr->natoms_force_constr; i++)
 +            {
 +                rvec_add(fr->f_twin[i],f[i],f[i]);
 +            }
 +        }
 +    }
 +    
 +    cycles_force = wallcycle_stop(wcycle,ewcFORCE);
 +
 +    if (ed)
 +    {
 +        do_flood(fplog,cr,x,f,ed,box,step,bNS);
 +    }
 +
 +    if (DOMAINDECOMP(cr))
 +    {
 +        dd_force_flop_stop(cr->dd,nrnb);
 +        if (wcycle)
 +        {
 +            dd_cycles_add(cr->dd,cycles_force-cycles_pme,ddCyclF);
 +        }
 +    }
 +
 +    if (bDoForces)
 +    {
 +        if (IR_ELEC_FIELD(*inputrec))
 +        {
 +            /* Compute forces due to electric field */
 +            calc_f_el(MASTER(cr) ? field : NULL,
 +                      start,homenr,mdatoms->chargeA,x,fr->f_novirsum,
 +                      inputrec->ex,inputrec->et,t);
 +        }
 +
 +        if (bDoAdressWF && fr->adress_icor == eAdressICThermoForce)
 +        {
 +            /* Compute thermodynamic force in hybrid AdResS region */
 +            adress_thermo_force(start,homenr,&(top->cgs),x,fr->f_novirsum,fr,mdatoms,
 +                                inputrec->ePBC==epbcNONE ? NULL : &pbc);
 +        }
 +
 +        /* Communicate the forces */
 +        if (PAR(cr))
 +        {
 +            wallcycle_start(wcycle,ewcMOVEF);
 +            if (DOMAINDECOMP(cr))
 +            {
 +                dd_move_f(cr->dd,f,fr->fshift);
 +                /* Do we need to communicate the separate force array
 +                 * for terms that do not contribute to the single sum virial?
 +                 * Position restraints and electric fields do not introduce
 +                 * inter-cg forces, only full electrostatics methods do.
 +                 * When we do not calculate the virial, fr->f_novirsum = f,
 +                 * so we have already communicated these forces.
 +                 */
 +                if (EEL_FULL(fr->eeltype) && cr->dd->n_intercg_excl &&
 +                    (flags & GMX_FORCE_VIRIAL))
 +                {
 +                    dd_move_f(cr->dd,fr->f_novirsum,NULL);
 +                }
 +                if (bSepLRF)
 +                {
 +                    /* We should not update the shift forces here,
 +                     * since f_twin is already included in f.
 +                     */
 +                    dd_move_f(cr->dd,fr->f_twin,NULL);
 +                }
 +            }
 +            else
 +            {
 +                pd_move_f(cr,f,nrnb);
 +                if (bSepLRF)
 +                {
 +                    pd_move_f(cr,fr->f_twin,nrnb);
 +                }
 +            }
 +            wallcycle_stop(wcycle,ewcMOVEF);
 +        }
 +
 +        /* If we have NoVirSum forces, but we do not calculate the virial,
 +         * we sum fr->f_novirsum = f later.
 +         */
 +        if (vsite && !(fr->bF_NoVirSum && !(flags & GMX_FORCE_VIRIAL)))
 +        {
 +            wallcycle_start(wcycle,ewcVSITESPREAD);
 +            spread_vsite_f(fplog,vsite,x,f,fr->fshift,FALSE,NULL,nrnb,
 +                           &top->idef,fr->ePBC,fr->bMolPBC,graph,box,cr);
 +            wallcycle_stop(wcycle,ewcVSITESPREAD);
 +
 +            if (bSepLRF)
 +            {
 +                wallcycle_start(wcycle,ewcVSITESPREAD);
 +                spread_vsite_f(fplog,vsite,x,fr->f_twin,NULL,FALSE,NULL,
 +                               nrnb,
 +                               &top->idef,fr->ePBC,fr->bMolPBC,graph,box,cr);
 +                wallcycle_stop(wcycle,ewcVSITESPREAD);
 +            }
 +        }
 +
 +        if (flags & GMX_FORCE_VIRIAL)
 +        {
 +            /* Calculation of the virial must be done after vsites! */
 +            calc_virial(fplog,mdatoms->start,mdatoms->homenr,x,f,
 +                        vir_force,graph,box,nrnb,fr,inputrec->ePBC);
 +        }
 +    }
 +
 +    if (inputrec->ePull == epullUMBRELLA || inputrec->ePull == epullCONST_F)
 +    {
 +        pull_potential_wrapper(fplog,bSepDVDL,cr,inputrec,box,x,
 +                               f,vir_force,mdatoms,enerd,lambda,t);
 +    }
 +
 +    /* Add the forces from enforced rotation potentials (if any) */
 +    if (inputrec->bRot)
 +    {
 +        wallcycle_start(wcycle,ewcROTadd);
 +        enerd->term[F_COM_PULL] += add_rot_forces(inputrec->rot, f, cr,step,t);
 +        wallcycle_stop(wcycle,ewcROTadd);
 +    }
 +
 +    if (PAR(cr) && !(cr->duty & DUTY_PME))
 +    {
 +        /* In case of node-splitting, the PP nodes receive the long-range 
 +         * forces, virial and energy from the PME nodes here.
 +         */
 +        pme_receive_force_ener(fplog,bSepDVDL,cr,wcycle,enerd,fr);
 +    }
 +
 +    if (bDoForces)
 +    {
 +        post_process_forces(fplog,cr,step,nrnb,wcycle,
 +                            top,box,x,f,vir_force,mdatoms,graph,fr,vsite,
 +                            flags);
 +    }
 +
 +    /* Sum the potential energy terms from group contributions */
 +    sum_epot(&(inputrec->opts),&(enerd->grpp),enerd->term);
 +}
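
When bSepLRF is active, the group-scheme path above folds the long-range forces accumulated in fr->f_twin into the short-range array every nstcalclr steps. Below is a self-contained sketch of that element-wise reduction, with rvec reduced to a plain double[3]; the helper name add_twin_range is illustrative, not from the source.

    #include <stdio.h>

    #define DIM 3
    typedef double rvec[DIM];

    /* Fold the long-range forces f_twin into f, as done above on
     * every step that satisfies do_per_step(step, nstcalclr). */
    static void add_twin_range(int n, rvec f_twin[], rvec f[])
    {
        int i, d;

        for (i = 0; i < n; i++)
        {
            for (d = 0; d < DIM; d++)
            {
                f[i][d] += f_twin[i][d];
            }
        }
    }

    int main(void)
    {
        rvec f[2]      = { { 1.0, 0.0, 0.0 }, { 0.0, 1.0, 0.0 } };
        rvec f_twin[2] = { { 0.1, 0.0, 0.0 }, { 0.0, 0.1, 0.0 } };

        add_twin_range(2, f_twin, f);
        printf("f[0] = (%g, %g, %g)\n", f[0][0], f[0][1], f[0][2]); /* (1.1, 0, 0) */
        return 0;
    }
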
 +
 +void do_force(FILE *fplog,t_commrec *cr,
 +              t_inputrec *inputrec,
 +              gmx_large_int_t step,t_nrnb *nrnb,gmx_wallcycle_t wcycle,
 +              gmx_localtop_t *top,
 +              gmx_mtop_t *mtop,
 +              gmx_groups_t *groups,
 +              matrix box,rvec x[],history_t *hist,
 +              rvec f[],
 +              tensor vir_force,
 +              t_mdatoms *mdatoms,
 +              gmx_enerdata_t *enerd,t_fcdata *fcd,
 +              real *lambda,t_graph *graph,
 +              t_forcerec *fr,
 +              gmx_vsite_t *vsite,rvec mu_tot,
 +              double t,FILE *field,gmx_edsam_t ed,
 +              gmx_bool bBornRadii,
 +              int flags)
 +{
 +    /* modify force flag if not doing nonbonded */
 +    if (!fr->bNonbonded)
 +    {
 +        flags &= ~GMX_FORCE_NONBONDED;
 +    }
 +
 +    switch (inputrec->cutoff_scheme)
 +    {
 +        case ecutsVERLET:
 +            do_force_cutsVERLET(fplog, cr, inputrec,
 +                                step, nrnb, wcycle,
 +                                top, mtop,
 +                                groups,
 +                                box, x, hist,
 +                                f, vir_force,
 +                                mdatoms,
 +                                enerd, fcd,
 +                                lambda, graph,
 +                                fr, fr->ic, 
 +                                vsite, mu_tot,
 +                                t, field, ed,
 +                                bBornRadii,
 +                                flags);
 +            break;
 +        case ecutsGROUP:
 +             do_force_cutsGROUP(fplog, cr, inputrec,
 +                                step, nrnb, wcycle,
 +                                top, mtop,
 +                                groups,
 +                                box, x, hist,
 +                                f, vir_force,
 +                                mdatoms,
 +                                enerd, fcd,
 +                                lambda, graph,
 +                                fr, vsite, mu_tot,
 +                                t, field, ed,
 +                                bBornRadii,
 +                                flags);
 +            break;
 +        default:
 +            gmx_incons("Invalid cut-off scheme passed!");
 +    }
 +}
 +
 +
 +void do_constrain_first(FILE *fplog,gmx_constr_t constr,
 +                        t_inputrec *ir,t_mdatoms *md,
 +                        t_state *state,rvec *f,
 +                        t_graph *graph,t_commrec *cr,t_nrnb *nrnb,
 +                        t_forcerec *fr, gmx_localtop_t *top, tensor shake_vir)
 +{
 +    int    i,m,start,end;
 +    gmx_large_int_t step;
 +    real   dt=ir->delta_t;
 +    real   dvdl_dum;
 +    rvec   *savex;
 +
 +    snew(savex,state->natoms);
 +
 +    start = md->start;
 +    end   = md->homenr + start;
 +
 +    if (debug)
 +        fprintf(debug,"vcm: start=%d, homenr=%d, end=%d\n",
 +                start,md->homenr,end);
 +    /* Do a first constrain to reset particles... */
 +    step = ir->init_step;
 +    if (fplog)
 +    {
 +        char buf[STEPSTRSIZE];
 +        fprintf(fplog,"\nConstraining the starting coordinates (step %s)\n",
 +                gmx_step_str(step,buf));
 +    }
 +    dvdl_dum = 0;
 +
 +    /* constrain the current position */
 +    constrain(NULL,TRUE,FALSE,constr,&(top->idef),
 +              ir,NULL,cr,step,0,md,
 +              state->x,state->x,NULL,
 +              fr->bMolPBC,state->box,
 +              state->lambda[efptBONDED],&dvdl_dum,
 +              NULL,NULL,nrnb,econqCoord,
 +              ir->epc==epcMTTK,state->veta,state->veta);
 +    if (EI_VV(ir->eI))
 +    {
 +        /* constrain the initial velocity, and save it */
 +        /* also may be useful if we need the ekin from the halfstep for velocity verlet */
 +        /* might not yet treat veta correctly */
 +        constrain(NULL,TRUE,FALSE,constr,&(top->idef),
 +                  ir,NULL,cr,step,0,md,
 +                  state->x,state->v,state->v,
 +                  fr->bMolPBC,state->box,
 +                  state->lambda[efptBONDED],&dvdl_dum,
 +                  NULL,NULL,nrnb,econqVeloc,
 +                  ir->epc==epcMTTK,state->veta,state->veta);
 +    }
 +    /* constrain the initial velocities at t-dt/2 */
 +    if (EI_STATE_VELOCITY(ir->eI) && ir->eI!=eiVV)
 +    {
 +        for(i=start; (i<end); i++)
 +        {
 +            for(m=0; (m<DIM); m++)
 +            {
 +                /* Reverse the velocity */
 +                state->v[i][m] = -state->v[i][m];
 +                /* Store the position at t-dt in buf */
 +                savex[i][m] = state->x[i][m] + dt*state->v[i][m];
 +            }
 +        }
 +        /* Shake the positions at t=-dt with the positions at t=0
 +         * as reference coordinates.
 +         */
 +        if (fplog)
 +        {
 +            char buf[STEPSTRSIZE];
 +            fprintf(fplog,"\nConstraining the coordinates at t0-dt (step %s)\n",
 +                    gmx_step_str(step,buf));
 +        }
 +        dvdl_dum = 0;
 +        constrain(NULL,TRUE,FALSE,constr,&(top->idef),
 +                  ir,NULL,cr,step,-1,md,
 +                  state->x,savex,NULL,
 +                  fr->bMolPBC,state->box,
 +                  state->lambda[efptBONDED],&dvdl_dum,
 +                  state->v,NULL,nrnb,econqCoord,
 +                  ir->epc==epcMTTK,state->veta,state->veta);
 +        
 +        for(i=start; i<end; i++) {
 +            for(m=0; m<DIM; m++) {
 +                /* Re-reverse the velocities */
 +                state->v[i][m] = -state->v[i][m];
 +            }
 +        }
 +    }
 +    sfree(savex);
 +}
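
do_constrain_first obtains constrained positions at t = -dt by temporarily reversing the velocities, stepping forward by dt, and constraining the trial positions against the t = 0 coordinates; re-reversing then leaves velocities consistent with the constrained back-step. A toy sketch of just that arithmetic for one particle, with the constraint projection left as a hypothetical no-op stub:

    #include <stdio.h>

    #define DIM 3

    /* Hypothetical stand-in for constrain(..., econqCoord, ...); a real
     * implementation would project xtry onto the constraint manifold. */
    static void constrain_coords(const double xref[DIM], double xtry[DIM])
    {
        (void)xref;
        (void)xtry;
    }

    int main(void)
    {
        double dt = 0.002;                  /* time step in ps           */
        double x[DIM] = { 1.0, 0.0, 0.0 };  /* position at t = 0         */
        double v[DIM] = { 0.5, 0.0, 0.0 };  /* velocity at t = 0         */
        double savex[DIM];
        int    m;

        for (m = 0; m < DIM; m++)
        {
            v[m]     = -v[m];               /* reverse the velocity      */
            savex[m] = x[m] + dt*v[m];      /* trial position at t = -dt */
        }
        constrain_coords(x, savex);         /* SHAKE/LINCS against t = 0 */
        for (m = 0; m < DIM; m++)
        {
            v[m] = -v[m];                   /* re-reverse the velocity   */
        }
        printf("x(-dt) = %g, v = %g\n", savex[0], v[0]);
        return 0;
    }
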
 +
 +void calc_enervirdiff(FILE *fplog,int eDispCorr,t_forcerec *fr)
 +{
 +  double eners[2],virs[2],enersum,virsum,y0,f,g,h;
 +  double r0,r1,r,rc3,rc9,ea,eb,ec,pa,pb,pc,pd;
 +  double invscale,invscale2,invscale3;
 +  int    ri0,ri1,ri,i,offstart,offset;
 +  real   scale,*vdwtab,tabfactor,tmp;
 +
 +  fr->enershiftsix = 0;
 +  fr->enershifttwelve = 0;
 +  fr->enerdiffsix = 0;
 +  fr->enerdifftwelve = 0;
 +  fr->virdiffsix = 0;
 +  fr->virdifftwelve = 0;
 +
 +  if (eDispCorr != edispcNO) {
 +    for(i=0; i<2; i++) {
 +      eners[i] = 0;
 +      virs[i]  = 0;
 +    }
 +    if ((fr->vdwtype == evdwSWITCH) || (fr->vdwtype == evdwSHIFT)) {
 +      if (fr->rvdw_switch == 0)
 +      {
 +          gmx_fatal(FARGS,
 +                    "With dispersion correction rvdw-switch cannot be zero "
 +                    "for vdw-type = %s",evdw_names[fr->vdwtype]);
 +      }
 +
 +      scale  = fr->nblists[0].table_elec_vdw.scale;
 +      vdwtab = fr->nblists[0].table_vdw.data;
 +
 +      /* Round the cut-offs to exact table values for precision */
 +      ri0 = floor(fr->rvdw_switch*scale);
 +      ri1 = ceil(fr->rvdw*scale);
 +      r0  = ri0/scale;
 +      r1  = ri1/scale;
 +      rc3 = r0*r0*r0;
 +      rc9  = rc3*rc3*rc3;
 +
 +      if (fr->vdwtype == evdwSHIFT)
 +      {
 +          /* Determine the constant energy shift below rvdw_switch.
 +           * The table has a scale factor, since we have scaled it down to
 +           * compensate for scaling up c6/c12 with the derivative factors,
 +           * to save flops in analytical kernels.
 +           */
 +          fr->enershiftsix    = (real)(-1.0/(rc3*rc3)) - 6.0*vdwtab[8*ri0];
 +          fr->enershifttwelve = (real)( 1.0/(rc9*rc3)) - 12.0*vdwtab[8*ri0 + 4];
 +      }
 +      /* Add the constant part from 0 to rvdw_switch.
 +       * This integration from 0 to rvdw_switch overcounts the number
 +       * of interactions by 1, as it also counts the self interaction.
 +       * We will correct for this later.
 +       */
 +      eners[0] += 4.0*M_PI*fr->enershiftsix*rc3/3.0;
 +      eners[1] += 4.0*M_PI*fr->enershifttwelve*rc3/3.0;
 +
 +      invscale = 1.0/(scale);
 +      invscale2 = invscale*invscale;
 +      invscale3 = invscale*invscale2;
 +
 +      /* following summation derived from cubic spline definition,
 +      Numerical Recipes in C, second edition, pp. 113-116.  Exact
 +      for the cubic spline.  We first calculate the negative of
 +      the energy from rvdw to rvdw_switch, assuming that g(r)=1,
 +      and then add the more standard, abrupt cutoff correction to
 +      that result, yielding the long-range correction for a
 +      switched function.  We perform both the pressure and energy
 +      loops at the same time for simplicity, as the computational
 +      cost is low. */
 +
 +      for (i=0;i<2;i++) {
 +        enersum = 0.0; virsum = 0.0;
 +        if (i==0)
 +        {
 +            offstart = 0;
 +            /* Since the dispersion table has been scaled down a factor 6.0 and the repulsion
 +             * a factor 12.0 to compensate for the c6/c12 parameters inside nbfp[] being scaled
 +             * up (to save flops in kernels), we need to correct for this.
 +             */
 +            tabfactor = 6.0;
 +        }
 +        else
 +        {
 +            offstart = 4;
 +            tabfactor = 12.0;
 +        }
 +        for (ri=ri0; ri<ri1; ri++) {
 +          r = ri*invscale;
 +          ea = invscale3;
 +          eb = 2.0*invscale2*r;
 +          ec = invscale*r*r;
 +
 +          pa = invscale3;
 +          pb = 3.0*invscale2*r;
 +          pc = 3.0*invscale*r*r;
 +          pd = r*r*r;
 +
 +          /* this "8" is from the packing in the vdwtab array - perhaps should be #define'ed? */
 +          offset = 8*ri + offstart;
 +          y0 = vdwtab[offset];
 +          f  = vdwtab[offset+1];
 +          g  = vdwtab[offset+2];
 +          h  = vdwtab[offset+3];
 +
 +          enersum += y0*(ea/3 + eb/2 + ec) + f*(ea/4 + eb/3 + ec/2) + g*(ea/5 + eb/4 + ec/3) + h*(ea/6 + eb/5 + ec/4);
 +          virsum  += f*(pa/4 + pb/3 + pc/2 + pd) + 2*g*(pa/5 + pb/4 + pc/3 + pd/2) + 3*h*(pa/6 + pb/5 + pc/4 + pd/3);
 +        }
 +          
 +        enersum *= 4.0*M_PI*tabfactor;
 +        virsum  *= 4.0*M_PI*tabfactor;
 +        eners[i] -= enersum;
 +        virs[i]  -= virsum;
 +      }
 +
 +      /* now add the correction for rvdw_switch to infinity */
 +      eners[0] += -4.0*M_PI/(3.0*rc3);
 +      eners[1] +=  4.0*M_PI/(9.0*rc9);
 +      virs[0]  +=  8.0*M_PI/rc3;
 +      virs[1]  += -16.0*M_PI/(3.0*rc9);
 +    }
 +    else if ((fr->vdwtype == evdwCUT) || (fr->vdwtype == evdwUSER)) {
 +      if (fr->vdwtype == evdwUSER && fplog)
 +      {
 +          fprintf(fplog,
 +                  "WARNING: using dispersion correction with user tables\n");
 +      }
 +      rc3  = fr->rvdw*fr->rvdw*fr->rvdw;
 +      rc9  = rc3*rc3*rc3;
 +      /* Contribution beyond the cut-off */
 +      eners[0] += -4.0*M_PI/(3.0*rc3);
 +      eners[1] +=  4.0*M_PI/(9.0*rc9);
 +      if (fr->vdw_modifier==eintmodPOTSHIFT) {
 +          /* Contribution within the cut-off */
 +          eners[0] += -4.0*M_PI/(3.0*rc3);
 +          eners[1] +=  4.0*M_PI/(3.0*rc9);
 +      }
 +      /* Contribution beyond the cut-off */
 +      virs[0]  +=  8.0*M_PI/rc3;
 +      virs[1]  += -16.0*M_PI/(3.0*rc9);
 +    } else {
 +      gmx_fatal(FARGS,
 +              "Dispersion correction is not implemented for vdw-type = %s",
 +              evdw_names[fr->vdwtype]);
 +    }
 +    fr->enerdiffsix    = eners[0];
 +    fr->enerdifftwelve = eners[1];
 +    /* The 0.5 is due to the Gromacs definition of the virial */
 +    fr->virdiffsix     = 0.5*virs[0];
 +    fr->virdifftwelve  = 0.5*virs[1];
 +  }
 +}
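
For a plain cut-off, the long-range terms applied above are the analytic tail integrals 4*pi * integral from rc to infinity of r^2 * r^-n dr, giving -4*pi/(3*rc^3) for the n = 6 dispersion term and +4*pi/(9*rc^9) for the n = 12 repulsion term in the sign convention of eners[], per unit C6/C12. A standalone check of those expressions:

    #include <math.h>
    #include <stdio.h>

    int main(void)
    {
        double rc  = 1.0;                         /* cut-off radius in nm */
        double rc3 = rc*rc*rc;
        double rc9 = rc3*rc3*rc3;

        /* Tail contributions beyond the cut-off, per unit C6/C12, with the
         * same signs used for eners[] and virs[] in calc_enervirdiff(). */
        double ener6  = -4.0*M_PI/(3.0*rc3);      /* dispersion, r^-6     */
        double ener12 =  4.0*M_PI/(9.0*rc9);      /* repulsion,  r^-12    */
        double vir6   =  8.0*M_PI/rc3;
        double vir12  = -16.0*M_PI/(3.0*rc9);

        printf("E6 = %g, E12 = %g, W6 = %g, W12 = %g\n",
               ener6, ener12, vir6, vir12);
        return 0;
    }
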
 +
 +void calc_dispcorr(FILE *fplog,t_inputrec *ir,t_forcerec *fr,
 +                   gmx_large_int_t step,int natoms,
 +                   matrix box,real lambda,tensor pres,tensor virial,
 +                   real *prescorr, real *enercorr, real *dvdlcorr)
 +{
 +    gmx_bool bCorrAll,bCorrPres;
 +    real dvdlambda,invvol,dens,ninter,avcsix,avctwelve,enerdiff,svir=0,spres=0;
 +    int  m;
 +
 +    *prescorr = 0;
 +    *enercorr = 0;
 +    *dvdlcorr = 0;
 +
 +    clear_mat(virial);
 +    clear_mat(pres);
 +
 +    if (ir->eDispCorr != edispcNO) {
 +        bCorrAll  = (ir->eDispCorr == edispcAllEner ||
 +                     ir->eDispCorr == edispcAllEnerPres);
 +        bCorrPres = (ir->eDispCorr == edispcEnerPres ||
 +                     ir->eDispCorr == edispcAllEnerPres);
 +
 +        invvol = 1/det(box);
 +        if (fr->n_tpi)
 +        {
 +            /* Only correct for the interactions with the inserted molecule */
 +            dens = (natoms - fr->n_tpi)*invvol;
 +            ninter = fr->n_tpi;
 +        }
 +        else
 +        {
 +            dens = natoms*invvol;
 +            ninter = 0.5*natoms;
 +        }
 +
 +        if (ir->efep == efepNO)
 +        {
 +            avcsix    = fr->avcsix[0];
 +            avctwelve = fr->avctwelve[0];
 +        }
 +        else
 +        {
 +            avcsix    = (1 - lambda)*fr->avcsix[0]    + lambda*fr->avcsix[1];
 +            avctwelve = (1 - lambda)*fr->avctwelve[0] + lambda*fr->avctwelve[1];
 +        }
 +
 +        enerdiff = ninter*(dens*fr->enerdiffsix - fr->enershiftsix);
 +        *enercorr += avcsix*enerdiff;
 +        dvdlambda = 0.0;
 +        if (ir->efep != efepNO)
 +        {
 +            dvdlambda += (fr->avcsix[1] - fr->avcsix[0])*enerdiff;
 +        }
 +        if (bCorrAll)
 +        {
 +            enerdiff = ninter*(dens*fr->enerdifftwelve - fr->enershifttwelve);
 +            *enercorr += avctwelve*enerdiff;
 +            if (fr->efep != efepNO)
 +            {
 +                dvdlambda += (fr->avctwelve[1] - fr->avctwelve[0])*enerdiff;
 +            }
 +        }
 +
 +        if (bCorrPres)
 +        {
 +            svir = ninter*dens*avcsix*fr->virdiffsix/3.0;
 +            if (ir->eDispCorr == edispcAllEnerPres)
 +            {
 +                svir += ninter*dens*avctwelve*fr->virdifftwelve/3.0;
 +            }
 +            /* The factor 2 is because of the Gromacs virial definition */
 +            spres = -2.0*invvol*svir*PRESFAC;
 +
 +            for(m=0; m<DIM; m++) {
 +                virial[m][m] += svir;
 +                pres[m][m] += spres;
 +            }
 +            *prescorr += spres;
 +        }
 +
 +        /* Can't currently control when this prints; for now, just print when debugging */
 +        if (debug)
 +        {
 +            if (bCorrAll) {
 +                fprintf(debug,"Long Range LJ corr.: <C6> %10.4e, <C12> %10.4e\n",
 +                        avcsix,avctwelve);
 +            }
 +            if (bCorrPres)
 +            {
 +                fprintf(debug,
 +                        "Long Range LJ corr.: Epot %10g, Pres: %10g, Vir: %10g\n",
 +                        *enercorr,spres,svir);
 +            }
 +            else
 +            {
 +                fprintf(debug,"Long Range LJ corr.: Epot %10g\n",*enercorr);
 +            }
 +        }
 +
 +        if (fr->bSepDVDL && do_per_step(step,ir->nstlog))
 +        {
 +            fprintf(fplog,sepdvdlformat,"Dispersion correction",
 +                    *enercorr,dvdlambda);
 +        }
 +        if (fr->efep != efepNO)
 +        {
 +            *dvdlcorr += dvdlambda;
 +        }
 +    }
 +}
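
The pressure correction in calc_dispcorr follows directly from the scalar virial correction: spres = -2*invvol*svir*PRESFAC, the factor 2 coming from the GROMACS virial definition, applied isotropically to the diagonal of both tensors. A numeric sketch; the PRESFAC value converting kJ mol^-1 nm^-3 to bar is taken to be the 16.6054 used by GROMACS, and the box volume and svir are made-up example inputs:

    #include <stdio.h>

    #define DIM     3
    #define PRESFAC 16.6054   /* kJ mol^-1 nm^-3 -> bar */

    int main(void)
    {
        double virial[DIM][DIM] = { { 0 } };
        double pres[DIM][DIM]   = { { 0 } };
        double invvol = 1.0/27.0;     /* example: a 3x3x3 nm box          */
        double svir   = -50.0;        /* example scalar virial correction */
        double spres  = -2.0*invvol*svir*PRESFAC;
        int    m;

        for (m = 0; m < DIM; m++)
        {
            virial[m][m] += svir;     /* isotropic: diagonal terms only   */
            pres[m][m]   += spres;
        }
        printf("pressure correction: %g bar\n", pres[0][0]);
        return 0;
    }
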
 +
 +void do_pbc_first(FILE *fplog,matrix box,t_forcerec *fr,
 +                t_graph *graph,rvec x[])
 +{
 +  if (fplog)
 +    fprintf(fplog,"Removing pbc first time\n");
 +  calc_shifts(box,fr->shift_vec);
 +  if (graph) {
 +    mk_mshift(fplog,graph,fr->ePBC,box,x);
 +    if (gmx_debug_at)
 +      p_graph(debug,"do_pbc_first 1",graph);
 +    shift_self(graph,box,x);
 +    /* By doing an extra mk_mshift the molecules that are broken
 +     * because they were e.g. imported from other software
 +     * will be made whole again. Such are the healing powers
 +     * of GROMACS.
 +     */
 +    mk_mshift(fplog,graph,fr->ePBC,box,x);
 +    if (gmx_debug_at)
 +      p_graph(debug,"do_pbc_first 2",graph);
 +  }
 +  if (fplog)
 +    fprintf(fplog,"Done rmpbc\n");
 +}
 +
 +static void low_do_pbc_mtop(FILE *fplog,int ePBC,matrix box,
 +                          gmx_mtop_t *mtop,rvec x[],
 +                          gmx_bool bFirst)
 +{
 +  t_graph *graph;
 +  int mb,as,mol;
 +  gmx_molblock_t *molb;
 +
 +  if (bFirst && fplog)
 +    fprintf(fplog,"Removing pbc first time\n");
 +
 +  snew(graph,1);
 +  as = 0;
 +  for(mb=0; mb<mtop->nmolblock; mb++) {
 +    molb = &mtop->molblock[mb];
 +    if (molb->natoms_mol == 1 ||
 +        (!bFirst && mtop->moltype[molb->type].cgs.nr == 1)) {
 +      /* Just one atom or charge group in the molecule, no PBC required */
 +      as += molb->nmol*molb->natoms_mol;
 +    } else {
 +      /* Pass NULL instead of fplog to avoid graph prints for each molecule type */
 +      mk_graph_ilist(NULL,mtop->moltype[molb->type].ilist,
 +                   0,molb->natoms_mol,FALSE,FALSE,graph);
 +
 +      for(mol=0; mol<molb->nmol; mol++) {
 +        mk_mshift(fplog,graph,ePBC,box,x+as);
 +
 +        shift_self(graph,box,x+as);
 +        /* The molecule is whole now.
 +         * We don't need the second mk_mshift call as in do_pbc_first,
 +         * since we no longer need this graph.
 +         */
 +
 +        as += molb->natoms_mol;
 +      }
 +      done_graph(graph);
 +    }
 +  }
 +  sfree(graph);
 +}
 +
 +void do_pbc_first_mtop(FILE *fplog,int ePBC,matrix box,
 +                     gmx_mtop_t *mtop,rvec x[])
 +{
 +  low_do_pbc_mtop(fplog,ePBC,box,mtop,x,TRUE);
 +}
 +
 +void do_pbc_mtop(FILE *fplog,int ePBC,matrix box,
 +               gmx_mtop_t *mtop,rvec x[])
 +{
 +  low_do_pbc_mtop(fplog,ePBC,box,mtop,x,FALSE);
 +}
 +
 +void finish_run(FILE *fplog,t_commrec *cr,const char *confout,
 +                t_inputrec *inputrec,
 +                t_nrnb nrnb[],gmx_wallcycle_t wcycle,
 +                gmx_runtime_t *runtime,
 +                wallclock_gpu_t *gputimes,
 +                int omp_nth_pp,
 +                gmx_bool bWriteStat)
 +{
 +    int    i,j;
 +    t_nrnb *nrnb_tot=NULL;
 +    real   delta_t;
 +    double nbfs,mflop;
 +
 +    wallcycle_sum(cr,wcycle);
 +
 +    if (cr->nnodes > 1)
 +    {
 +        snew(nrnb_tot,1);
 +#ifdef GMX_MPI
 +        MPI_Allreduce(nrnb->n,nrnb_tot->n,eNRNB,MPI_DOUBLE,MPI_SUM,
 +                      cr->mpi_comm_mysim);
 +#endif
 +    }
 +    else
 +    {
 +        nrnb_tot = nrnb;
 +    }
 +
 +#if defined(GMX_MPI) && !defined(GMX_THREAD_MPI)
 +    if (cr->nnodes > 1)
 +    {
 +        /* reduce nodetime over all MPI processes in the current simulation */
 +        double sum;
 +        MPI_Allreduce(&runtime->proctime,&sum,1,MPI_DOUBLE,MPI_SUM,
 +                      cr->mpi_comm_mysim);
 +        runtime->proctime = sum;
 +    }
 +#endif
 +
 +    if (SIMMASTER(cr))
 +    {
 +        print_flop(fplog,nrnb_tot,&nbfs,&mflop);
 +    }
 +    if (cr->nnodes > 1)
 +    {
 +        sfree(nrnb_tot);
 +    }
 +
 +    if ((cr->duty & DUTY_PP) && DOMAINDECOMP(cr))
 +    {
 +        print_dd_statistics(cr,inputrec,fplog);
 +    }
 +
 +#ifdef GMX_MPI
 +    if (PARTDECOMP(cr))
 +    {
 +        if (MASTER(cr))
 +        {
 +            t_nrnb     *nrnb_all;
 +            int        s;
 +            MPI_Status stat;
 +
 +            snew(nrnb_all,cr->nnodes);
 +            nrnb_all[0] = *nrnb;
 +            for(s=1; s<cr->nnodes; s++)
 +            {
 +                MPI_Recv(nrnb_all[s].n,eNRNB,MPI_DOUBLE,s,0,
 +                         cr->mpi_comm_mysim,&stat);
 +            }
 +            pr_load(fplog,cr,nrnb_all);
 +            sfree(nrnb_all);
 +        }
 +        else
 +        {
 +            MPI_Send(nrnb->n,eNRNB,MPI_DOUBLE,MASTERRANK(cr),0,
 +                     cr->mpi_comm_mysim);
 +        }
 +    }
 +#endif
 +
 +    if (SIMMASTER(cr))
 +    {
 +        wallcycle_print(fplog,cr->nnodes,cr->npmenodes,runtime->realtime,
 +                        wcycle,gputimes);
 +
 +        if (EI_DYNAMICS(inputrec->eI))
 +        {
 +            delta_t = inputrec->delta_t;
 +        }
 +        else
 +        {
 +            delta_t = 0;
 +        }
 +
 +        if (fplog)
 +        {
 +            print_perf(fplog,runtime->proctime,runtime->realtime,
 +                       cr->nnodes-cr->npmenodes,
 +                       runtime->nsteps_done,delta_t,nbfs,mflop,
 +                       omp_nth_pp);
 +        }
 +        if (bWriteStat)
 +        {
 +            print_perf(stderr,runtime->proctime,runtime->realtime,
 +                       cr->nnodes-cr->npmenodes,
 +                       runtime->nsteps_done,delta_t,nbfs,mflop,
 +                       omp_nth_pp);
 +        }
 +    }
 +}
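
The flop-count and timing reductions in finish_run are plain sums over all ranks of the simulation communicator. A minimal MPI program with the same MPI_Allreduce shape, with eNRNB shrunk to a small hypothetical counter array:

    #include <stdio.h>
    #include <mpi.h>

    #define NCOUNT 4   /* hypothetical stand-in for eNRNB */

    int main(int argc, char *argv[])
    {
        double n[NCOUNT] = { 1.0, 2.0, 3.0, 4.0 };  /* this rank's counters */
        double n_tot[NCOUNT];
        int    rank;

        MPI_Init(&argc, &argv);
        MPI_Comm_rank(MPI_COMM_WORLD, &rank);

        /* Every rank contributes its counters; every rank gets the sum. */
        MPI_Allreduce(n, n_tot, NCOUNT, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);

        if (rank == 0)
        {
            printf("total of counter 0: %g\n", n_tot[0]);
        }
        MPI_Finalize();
        return 0;
    }
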
 +
 +extern void initialize_lambdas(FILE *fplog,t_inputrec *ir,int *fep_state,real *lambda,double *lam0)
 +{
 +    /* this function works, but could probably use a logic rewrite to keep all the different
 +       types of efep straight. */
 +
 +    int i;
 +    t_lambda *fep = ir->fepvals;
 +
 +    if ((ir->efep==efepNO) && (ir->bSimTemp == FALSE)) {
 +        for (i=0;i<efptNR;i++)  {
 +            lambda[i] = 0.0;
 +            if (lam0)
 +            {
 +                lam0[i] = 0.0;
 +            }
 +        }
 +        return;
 +    } else {
 +        *fep_state = fep->init_fep_state; /* this might overwrite the checkpoint
 +                                             if a checkpoint is set -- a kludge is in place
 +                                             for now to prevent this. */
 +        for (i=0;i<efptNR;i++)
 +        {
 +            /* overwrite lambda state with init_lambda for now for backwards compatibility */
 +            if (fep->init_lambda>=0) /* if it's -1, it was never initialized */
 +            {
 +                lambda[i] = fep->init_lambda;
 +                if (lam0) {
 +                    lam0[i] = lambda[i];
 +                }
 +            }
 +            else
 +            {
 +                lambda[i] = fep->all_lambda[i][*fep_state];
 +                if (lam0) {
 +                    lam0[i] = lambda[i];
 +                }
 +            }
 +        }
 +        if (ir->bSimTemp) {
 +            /* need to rescale control temperatures to match current state */
 +            for (i=0;i<ir->opts.ngtc;i++) {
 +                if (ir->opts.ref_t[i] > 0) {
 +                    ir->opts.ref_t[i] = ir->simtempvals->temperatures[*fep_state];
 +                }
 +            }
 +        }
 +    }
 +
 +    /* Send to the log the information on the current lambdas */
 +    if (fplog != NULL)
 +    {
 +        fprintf(fplog,"Initial vector of lambda components:[ ");
 +        for (i=0;i<efptNR;i++)
 +        {
 +            fprintf(fplog,"%10.4f ",lambda[i]);
 +        }
 +        fprintf(fplog,"]\n");
 +    }
 +    return;
 +}
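
The per-component selection in initialize_lambdas reduces to: a non-negative init_lambda overrides everything, otherwise each component reads the all_lambda table at the current FEP state. A compact sketch of just that choice; array sizes and values are illustrative:

    #include <stdio.h>

    #define NCOMP 3   /* hypothetical stand-in for efptNR */

    int main(void)
    {
        /* all_lambda[component][fep_state], illustrative values */
        double all_lambda[NCOMP][2] = { { 0.0, 0.5 }, { 0.0, 0.7 }, { 0.0, 1.0 } };
        double init_lambda = -1.0;    /* -1 means: never set, use the table */
        int    fep_state   = 1;
        double lambda[NCOMP];
        int    i;

        for (i = 0; i < NCOMP; i++)
        {
            lambda[i] = (init_lambda >= 0) ? init_lambda
                                           : all_lambda[i][fep_state];
        }
        printf("lambda[0] = %g\n", lambda[0]);   /* 0.5 */
        return 0;
    }
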
 +
 +
 +void init_md(FILE *fplog,
 +             t_commrec *cr,t_inputrec *ir,const output_env_t oenv,
 +             double *t,double *t0,
 +             real *lambda, int *fep_state, double *lam0,
 +             t_nrnb *nrnb,gmx_mtop_t *mtop,
 +             gmx_update_t *upd,
 +             int nfile,const t_filenm fnm[],
 +             gmx_mdoutf_t **outf,t_mdebin **mdebin,
 +             tensor force_vir,tensor shake_vir,rvec mu_tot,
 +             gmx_bool *bSimAnn,t_vcm **vcm, t_state *state, unsigned long Flags)
 +{
 +    int  i,j,n;
 +    real tmpt,mod;
 +
 +    /* Initial values */
 +    *t = *t0       = ir->init_t;
 +
 +    *bSimAnn=FALSE;
 +    for(i=0;i<ir->opts.ngtc;i++)
 +    {
 +        /* set bSimAnn if any group is being annealed */
 +        if(ir->opts.annealing[i]!=eannNO)
 +        {
 +            *bSimAnn = TRUE;
 +        }
 +    }
 +    if (*bSimAnn)
 +    {
 +        update_annealing_target_temp(&(ir->opts),ir->init_t);
 +    }
 +
 +    /* Initialize lambda variables */
 +    initialize_lambdas(fplog,ir,fep_state,lambda,lam0);
 +
 +    if (upd)
 +    {
 +        *upd = init_update(fplog,ir);
 +    }
 +
 +
 +    if (vcm != NULL)
 +    {
 +        *vcm = init_vcm(fplog,&mtop->groups,ir);
 +    }
 +
 +    if (EI_DYNAMICS(ir->eI) && !(Flags & MD_APPENDFILES))
 +    {
 +        if (ir->etc == etcBERENDSEN)
 +        {
 +            please_cite(fplog,"Berendsen84a");
 +        }
 +        if (ir->etc == etcVRESCALE)
 +        {
 +            please_cite(fplog,"Bussi2007a");
 +        }
 +    }
 +
 +    init_nrnb(nrnb);
 +
 +    if (nfile != -1)
 +    {
 +        *outf = init_mdoutf(nfile,fnm,Flags,cr,ir,oenv);
 +
 +        *mdebin = init_mdebin((Flags & MD_APPENDFILES) ? NULL : (*outf)->fp_ene,
 +                              mtop,ir, (*outf)->fp_dhdl);
 +    }
 +
 +    if (ir->bAdress)
 +    {
 +      please_cite(fplog,"Fritsch12");
 +      please_cite(fplog,"Junghans10");
 +    }
 +    /* Initialize variables */
 +    clear_mat(force_vir);
 +    clear_mat(shake_vir);
 +    clear_rvec(mu_tot);
 +
 +    debug_gmx();
 +}
 +
index a14c52231a7451208404f77473b4793f6af47a7a,0000000000000000000000000000000000000000..f2b126bc82b4a334b19f493bd6c63d742378d663
mode 100644,000000..100644
--- /dev/null
@@@ -1,1773 -1,0 +1,1773 @@@
-     set_pull_init(ir,sys,state.x,state.box,oenv,opts->pull_start);
 +/*  -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
 + *
 + * 
 + *                This source code is part of
 + * 
 + *                 G   R   O   M   A   C   S
 + * 
 + *          GROningen MAchine for Chemical Simulations
 + * 
 + *                        VERSION 3.2.03
 + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
 + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 + * Copyright (c) 2001-2004, The GROMACS development team,
 + * check out http://www.gromacs.org for more information.
 + *
 + * This program is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU General Public License
 + * as published by the Free Software Foundation; either version 2
 + * of the License, or (at your option) any later version.
 + * 
 + * If you want to redistribute modifications, please consider that
 + * scientific software is very special. Version control is crucial -
 + * bugs must be traceable. We will be happy to consider code for
 + * inclusion in the official distribution, but derived work must not
 + * be called official GROMACS. Details are found in the README & COPYING
 + * files - if they are missing, get the official version at www.gromacs.org.
 + * 
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the papers on the package - you can find them in the top README file.
 + * 
 + * For more info, check our website at http://www.gromacs.org
 + * 
 + * And Hey:
 + * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon
 + */
 +#ifdef HAVE_CONFIG_H
 +#include <config.h>
 +#endif
 +
 +#include <sys/types.h>
 +#include <math.h>
 +#include <string.h>
 +#include <errno.h>
 +#include <limits.h>
 +
 +#include "sysstuff.h"
 +#include "smalloc.h"
 +#include "macros.h"
 +#include "string2.h"
 +#include "readir.h"
 +#include "toputil.h"
 +#include "topio.h"
 +#include "confio.h"
 +#include "copyrite.h"
 +#include "readir.h"
 +#include "symtab.h"
 +#include "names.h"
 +#include "grompp.h"
 +#include "random.h"
 +#include "vec.h"
 +#include "futil.h"
 +#include "statutil.h"
 +#include "splitter.h"
 +#include "sortwater.h"
 +#include "convparm.h"
 +#include "gmx_fatal.h"
 +#include "warninp.h"
 +#include "index.h"
 +#include "gmxfio.h"
 +#include "trnio.h"
 +#include "tpxio.h"
 +#include "vsite_parm.h"
 +#include "txtdump.h"
 +#include "calcgrid.h"
 +#include "add_par.h"
 +#include "enxio.h"
 +#include "perf_est.h"
 +#include "compute_io.h"
 +#include "gpp_atomtype.h"
 +#include "gpp_tomorse.h"
 +#include "mtop_util.h"
 +#include "genborn.h"
 +#include "calc_verletbuf.h"
 +
 +static int rm_interactions(int ifunc,int nrmols,t_molinfo mols[])
 +{
 +  int  i,n;
 +  
 +  n=0;
 +  /* For all the molecule types */
 +  for(i=0; i<nrmols; i++) {
 +    n += mols[i].plist[ifunc].nr;
 +    mols[i].plist[ifunc].nr=0;
 +  }
 +  return n;
 +}
 +
 +static int check_atom_names(const char *fn1, const char *fn2, 
 +                          gmx_mtop_t *mtop, t_atoms *at)
 +{
 +  int mb,m,i,j,nmismatch;
 +  t_atoms *tat;
 +#define MAXMISMATCH 20
 +
 +  if (mtop->natoms != at->nr)
 +    gmx_incons("comparing atom names");
 +  
 +  nmismatch=0;
 +  i = 0;
 +  for(mb=0; mb<mtop->nmolblock; mb++) {
 +    tat = &mtop->moltype[mtop->molblock[mb].type].atoms;
 +    for(m=0; m<mtop->molblock[mb].nmol; m++) {
 +      for(j=0; j < tat->nr; j++) {
 +        if (strcmp( *(tat->atomname[j]) , *(at->atomname[i]) ) != 0) {
 +          if (nmismatch < MAXMISMATCH) {
 +            fprintf(stderr,
 +                    "Warning: atom name %d in %s and %s does not match (%s - %s)\n",
 +                    i+1, fn1, fn2, *(tat->atomname[j]), *(at->atomname[i]));
 +          } else if (nmismatch == MAXMISMATCH) {
 +            fprintf(stderr,"(more than %d non-matching atom names)\n",MAXMISMATCH);
 +          }
 +          nmismatch++;
 +        }
 +        i++;
 +      }
 +    }
 +  }
 +
 +  return nmismatch;
 +}
 +
 +static void check_eg_vs_cg(gmx_mtop_t *mtop)
 +{
 +  int astart,mb,m,cg,j,firstj;
 +  unsigned char firsteg,eg;
 +  gmx_moltype_t *molt;
 +  
 +  /* Go through all the charge groups and make sure all their
 +   * atoms are in the same energy group.
 +   */
 +  
 +  astart = 0;
 +  for(mb=0; mb<mtop->nmolblock; mb++) {
 +    molt = &mtop->moltype[mtop->molblock[mb].type];
 +    for(m=0; m<mtop->molblock[mb].nmol; m++) {
 +      for(cg=0; cg<molt->cgs.nr;cg++) {
 +      /* Get the energy group of the first atom in this charge group */
 +      firstj = astart + molt->cgs.index[cg];
 +      firsteg = ggrpnr(&mtop->groups,egcENER,firstj);
 +      for(j=molt->cgs.index[cg]+1;j<molt->cgs.index[cg+1];j++) {
 +        eg = ggrpnr(&mtop->groups,egcENER,astart+j);
 +        if(eg != firsteg) {
 +          gmx_fatal(FARGS,"atoms %d and %d in charge group %d of molecule type '%s' are in different energy groups",
 +                    firstj+1,astart+j+1,cg+1,*molt->name);
 +        }
 +      }
 +      }
 +      astart += molt->atoms.nr;
 +    }
 +  }  
 +}
 +
 +static void check_cg_sizes(const char *topfn,t_block *cgs,warninp_t wi)
 +{
 +    int  maxsize,cg;
 +    char warn_buf[STRLEN];
 +
 +    maxsize = 0;
 +    for(cg=0; cg<cgs->nr; cg++)
 +    {
 +        maxsize = max(maxsize,cgs->index[cg+1]-cgs->index[cg]);
 +    }
 +    
 +    if (maxsize > MAX_CHARGEGROUP_SIZE)
 +    {
 +        gmx_fatal(FARGS,"The largest charge group contains %d atoms. The maximum is %d.",maxsize,MAX_CHARGEGROUP_SIZE);
 +    }
 +    else if (maxsize > 10)
 +    {
 +        set_warning_line(wi,topfn,-1);
 +        sprintf(warn_buf,
 +                "The largest charge group contains %d atoms.\n"
 +                "Since atoms only see each other when the centers of geometry of the charge groups they belong to are within the cut-off distance, too large charge groups can lead to serious cut-off artifacts.\n"
 +                "For efficiency and accuracy, charge group should consist of a few atoms.\n"
 +                "For all-atom force fields use: CH3, CH2, CH, NH2, NH, OH, CO2, CO, etc.",
 +                maxsize);
 +        warning_note(wi,warn_buf);
 +    }
 +}
 +
 +static void check_bonds_timestep(gmx_mtop_t *mtop,double dt,warninp_t wi)
 +{
 +    /* This check is not intended to ensure accurate integration,
 +     * rather it is to signal mistakes in the mdp settings.
 +     * A common mistake is to forget to turn on constraints
 +     * for MD after energy minimization with flexible bonds.
 +     * This check can also detect too large time steps for flexible water
 +     * models, but such errors will often be masked by the constraints
 +     * mdp option, which turns flexible water into water with bond constraints
 +     * but without an angle constraint. Unfortunately, such incorrect use
 +     * of water models cannot easily be detected without checking
 +     * for specific model names.
 +     *
 +     * The stability limit of leap-frog or velocity verlet is 4.44 steps
 +     * per oscillational period.
 +     * But accurate bond distributions are lost well before that limit.
 +     * To allow relatively common schemes (although not common with Gromacs)
 +     * of dt=1 fs without constraints and dt=2 fs with only H-bond constraints,
 +     * we set the note limit to 10.
 +     */
 +    int       min_steps_warn=5;
 +    int       min_steps_note=10;
 +    t_iparams *ip;
 +    int       molt;
 +    gmx_moltype_t *moltype,*w_moltype;
 +    t_atom    *atom;
 +    t_ilist   *ilist,*ilb,*ilc,*ils;
 +    int       ftype;
 +    int       i,a1,a2,w_a1,w_a2,j;
 +    real      twopi2,limit2,fc,re,m1,m2,period2,w_period2;
 +    gmx_bool  bFound,bWater,bWarn;
 +    char      warn_buf[STRLEN];
 +
 +    ip = mtop->ffparams.iparams;
 +
 +    twopi2 = sqr(2*M_PI);
 +
 +    limit2 = sqr(min_steps_note*dt);
 +
 +    w_a1 = w_a2 = -1;
 +    w_period2 = -1.0;
 +    
 +    w_moltype = NULL;
 +    for(molt=0; molt<mtop->nmoltype; molt++)
 +    {
 +        moltype = &mtop->moltype[molt];
 +        atom  = moltype->atoms.atom;
 +        ilist = moltype->ilist;
 +        ilc = &ilist[F_CONSTR];
 +        ils = &ilist[F_SETTLE];
 +        for(ftype=0; ftype<F_NRE; ftype++)
 +        {
 +            if (!(ftype == F_BONDS || ftype == F_G96BONDS || ftype == F_HARMONIC))
 +            {
 +                continue;
 +            }
 +            
 +            ilb = &ilist[ftype];
 +            for(i=0; i<ilb->nr; i+=3)
 +            {
 +                fc = ip[ilb->iatoms[i]].harmonic.krA;
 +                re = ip[ilb->iatoms[i]].harmonic.rA;
 +                if (ftype == F_G96BONDS)
 +                {
 +                    /* Convert the G96 fc (which acts on squared distances) to a harmonic fc */
 +                    fc = 2*fc*re;
 +                }
 +                a1 = ilb->iatoms[i+1];
 +                a2 = ilb->iatoms[i+2];
 +                m1 = atom[a1].m;
 +                m2 = atom[a2].m;
 +                if (fc > 0 && m1 > 0 && m2 > 0)
 +                {
 +                    period2 = twopi2*m1*m2/((m1 + m2)*fc);
 +                }
 +                else
 +                {
 +                    period2 = GMX_FLOAT_MAX;
 +                }
 +                if (debug)
 +                {
 +                    fprintf(debug,"fc %g m1 %g m2 %g period %g\n",
 +                            fc,m1,m2,sqrt(period2));
 +                }
 +                if (period2 < limit2)
 +                {
 +                    bFound = FALSE;
 +                    for(j=0; j<ilc->nr; j+=3)
 +                    {
 +                        if ((ilc->iatoms[j+1] == a1 && ilc->iatoms[j+2] == a2) ||
 +                            (ilc->iatoms[j+1] == a2 && ilc->iatoms[j+2] == a1))
 +                            {
 +                                bFound = TRUE;
 +                            }
 +                        }
 +                    for(j=0; j<ils->nr; j+=4)
 +                    {
 +                        if ((a1 == ils->iatoms[j+1] || a1 == ils->iatoms[j+2] || a1 == ils->iatoms[j+3]) &&
 +                            (a2 == ils->iatoms[j+1] || a2 == ils->iatoms[j+2] || a2 == ils->iatoms[j+3]))
 +                        {
 +                            bFound = TRUE;
 +                        }
 +                    }
 +                    if (!bFound &&
 +                        (w_moltype == NULL || period2 < w_period2))
 +                    {
 +                        w_moltype = moltype;
 +                        w_a1      = a1;
 +                        w_a2      = a2;
 +                        w_period2 = period2;
 +                    }
 +                }
 +            }
 +        }
 +    }
 +    
 +    if (w_moltype != NULL)
 +    {
 +        bWarn = (w_period2 < sqr(min_steps_warn*dt));
 +        /* A check that would recognize most water models */
 +        bWater = ((*w_moltype->atoms.atomname[0])[0] == 'O' &&
 +                  w_moltype->atoms.nr <= 5);
 +        sprintf(warn_buf,"The bond in molecule-type %s between atoms %d %s and %d %s has an estimated oscillational period of %.1e ps, which is less than %d times the time step of %.1e ps.\n"
 +                "%s",
 +                *w_moltype->name,
 +                w_a1+1,*w_moltype->atoms.atomname[w_a1],
 +                w_a2+1,*w_moltype->atoms.atomname[w_a2],
 +                sqrt(w_period2),bWarn ? min_steps_warn : min_steps_note,dt,
 +                bWater ?
 +                "Maybe you asked for fexible water." :
 +                "Maybe you forgot to change the constraints mdp option.");
 +        if (bWarn)
 +        {
 +            warning(wi,warn_buf);
 +        }
 +        else
 +        {
 +            warning_note(wi,warn_buf);
 +        }
 +    }
 +}
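 +
 +/* Illustrative, standalone sketch (not called anywhere) of the period
 + * estimate used in check_bonds_timestep() above, for a single harmonic
 + * bond. Units follow GROMACS conventions: masses in u, fc in
 + * kJ mol^-1 nm^-2, time in ps. The O-H numbers below are example values,
 + * not taken from any force field.
 + */
 +static void bond_period_demo(void)
 +{
 +    double m1 = 15.9994, m2 = 1.008; /* example O and H masses */
 +    double fc = 4.637e5;             /* example harmonic force constant */
 +    double mu, period;
 +
 +    mu     = m1*m2/(m1 + m2);        /* reduced mass of the bonded pair */
 +    period = 2*M_PI*sqrt(mu/fc);     /* ~9e-3 ps for these numbers */
 +
 +    /* With the limits above: a note below 10 steps/period, a warning below 5 */
 +    fprintf(stderr, "period %g ps: dt below %g ps avoids the note, below %g ps the warning\n",
 +            period, period/10, period/5);
 +}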
 +
 +static void check_vel(gmx_mtop_t *mtop,rvec v[])
 +{
 +  gmx_mtop_atomloop_all_t aloop;
 +  t_atom *atom;
 +  int a;
 +
 +  aloop = gmx_mtop_atomloop_all_init(mtop);
 +  while (gmx_mtop_atomloop_all_next(aloop,&a,&atom)) {
 +    if (atom->ptype == eptShell ||
 +      atom->ptype == eptBond  ||
 +      atom->ptype == eptVSite) {
 +      clear_rvec(v[a]);
 +    }
 +  }
 +}
 +
 +static int nint_ftype(gmx_mtop_t *mtop,t_molinfo *mi,int ftype)
 +{
 +  int nint,mb;
 +
 +  nint = 0;
 +  for(mb=0; mb<mtop->nmolblock; mb++) {
 +    nint += mtop->molblock[mb].nmol*mi[mtop->molblock[mb].type].plist[ftype].nr;
 +  }
 +
 +  return nint;
 +}
 +
 +/* This routine reorders the molecule type array
 + * in the order of use in the molblocks;
 + * unused molecule types are deleted.
 + */
 +static void renumber_moltypes(gmx_mtop_t *sys,
 +                            int *nmolinfo,t_molinfo **molinfo)
 +{
 +  int *order,norder,i;
 +  int mb,mi;
 +  t_molinfo *minew;
 +
 +  snew(order,*nmolinfo);
 +  norder = 0;
 +  for(mb=0; mb<sys->nmolblock; mb++) {
 +    for(i=0; i<norder; i++) {
 +      if (order[i] == sys->molblock[mb].type) {
 +      break;
 +      }
 +    }
 +    if (i == norder) {
 +      /* This type did not occur yet, add it */
 +      order[norder] = sys->molblock[mb].type;
 +      /* Renumber the moltype in the topology */
 +      norder++;
 +    }
 +    sys->molblock[mb].type = i;
 +  }
 +  
 +  /* We still need to reorder the molinfo structs */
 +  snew(minew,norder);
 +  for(mi=0; mi<*nmolinfo; mi++) {
 +    for(i=0; i<norder; i++) {
 +      if (order[i] == mi) {
 +      break;
 +      }
 +    }
 +    if (i == norder) {
 +      done_mi(&(*molinfo)[mi]);
 +    } else {
 +      minew[i] = (*molinfo)[mi];
 +    }
 +  }
 +  sfree(*molinfo);
 +
 +  *nmolinfo = norder;
 +  *molinfo  = minew;
 +}
 +
 +static void molinfo2mtop(int nmi,t_molinfo *mi,gmx_mtop_t *mtop)
 +{
 +  int m;
 +  gmx_moltype_t *molt;
 +
 +  mtop->nmoltype = nmi;
 +  snew(mtop->moltype,nmi);
 +  for(m=0; m<nmi; m++) {
 +    molt = &mtop->moltype[m];
 +    molt->name  = mi[m].name;
 +    molt->atoms = mi[m].atoms;
 +    /* ilists are copied later */
 +    molt->cgs   = mi[m].cgs;
 +    molt->excls = mi[m].excls;
 +  }
 +}
 +
 +static void
 +new_status(const char *topfile,const char *topppfile,const char *confin,
 +           t_gromppopts *opts,t_inputrec *ir,gmx_bool bZero,
 +           gmx_bool bGenVel,gmx_bool bVerbose,t_state *state,
 +           gpp_atomtype_t atype,gmx_mtop_t *sys,
 +           int *nmi,t_molinfo **mi,t_params plist[],
 +           int *comb,double *reppow,real *fudgeQQ,
 +           gmx_bool bMorse,
 +           warninp_t wi)
 +{
 +  t_molinfo   *molinfo=NULL;
 +  int         nmolblock;
 +  gmx_molblock_t *molblock,*molbs;
 +  t_atoms     *confat;
 +  int         mb,i,nrmols,nmismatch;
 +  char        buf[STRLEN];
 +  gmx_bool        bGB=FALSE;
 +  char        warn_buf[STRLEN];
 +
 +  init_mtop(sys);
 +
 +  /* Set gmx_boolean for GB */
 +  if(ir->implicit_solvent)
 +    bGB=TRUE;
 +  
 +  /* TOPOLOGY processing */
 +  sys->name = do_top(bVerbose,topfile,topppfile,opts,bZero,&(sys->symtab),
 +                     plist,comb,reppow,fudgeQQ,
 +                     atype,&nrmols,&molinfo,ir,
 +                     &nmolblock,&molblock,bGB,
 +                     wi);
 +  
 +  sys->nmolblock = 0;
 +  snew(sys->molblock,nmolblock);
 +  
 +  sys->natoms = 0;
 +  for(mb=0; mb<nmolblock; mb++) {
 +    if (sys->nmolblock > 0 &&
 +      molblock[mb].type == sys->molblock[sys->nmolblock-1].type) {
 +      /* Merge consecutive blocks with the same molecule type */
 +      sys->molblock[sys->nmolblock-1].nmol += molblock[mb].nmol;
 +      sys->natoms += molblock[mb].nmol*sys->molblock[sys->nmolblock-1].natoms_mol;
 +    } else if (molblock[mb].nmol > 0) {
 +      /* Add a new molblock to the topology */
 +      molbs = &sys->molblock[sys->nmolblock];
 +      *molbs = molblock[mb];
 +      molbs->natoms_mol = molinfo[molbs->type].atoms.nr;
 +      molbs->nposres_xA = 0;
 +      molbs->nposres_xB = 0;
 +      sys->natoms += molbs->nmol*molbs->natoms_mol;
 +      sys->nmolblock++;
 +    }
 +  }
 +  if (sys->nmolblock == 0) {
 +    gmx_fatal(FARGS,"No molecules were defined in the system");
 +  }
 +
 +  renumber_moltypes(sys,&nrmols,&molinfo);
 +
 +  if (bMorse)
 +    convert_harmonics(nrmols,molinfo,atype);
 +
 +  if (ir->eDisre == edrNone) {
 +    i = rm_interactions(F_DISRES,nrmols,molinfo);
 +    if (i > 0) {
 +      set_warning_line(wi,"unknown",-1);
 +      sprintf(warn_buf,"disre = no, removed %d distance restraints",i);
 +      warning_note(wi,warn_buf);
 +    }
 +  }
 +  if (opts->bOrire == FALSE) {
 +    i = rm_interactions(F_ORIRES,nrmols,molinfo);
 +    if (i > 0) {
 +      set_warning_line(wi,"unknown",-1);
 +      sprintf(warn_buf,"orire = no, removed %d orientation restraints",i);
 +      warning_note(wi,warn_buf);
 +    }
 +  }
 +  
 +  /* Copy structures from molinfo to sys */
 +  molinfo2mtop(nrmols,molinfo,sys);
 +
 +  gmx_mtop_finalize(sys);
 + 
 +  /* COORDINATE file processing */
 +  if (bVerbose) 
 +    fprintf(stderr,"processing coordinates...\n");
 +
 +  get_stx_coordnum(confin,&state->natoms);
 +  if (state->natoms != sys->natoms)
 +    gmx_fatal(FARGS,"number of coordinates in coordinate file (%s, %d)\n"
 +              "             does not match topology (%s, %d)",
 +            confin,state->natoms,topfile,sys->natoms);
 +  else {
 +    /* make space for coordinates and velocities */
 +    char title[STRLEN];
 +    snew(confat,1);
 +    init_t_atoms(confat,state->natoms,FALSE);
 +    init_state(state,state->natoms,0,0,0,0);
 +    read_stx_conf(confin,title,confat,state->x,state->v,NULL,state->box);
 +    /* This call fixes the box shape for runs with pressure scaling */
 +    set_box_rel(ir,state);
 +
 +    nmismatch = check_atom_names(topfile, confin, sys, confat);
 +    free_t_atoms(confat,TRUE);
 +    sfree(confat);
 +    
 +    if (nmismatch) {
 +      sprintf(buf,"%d non-matching atom name%s\n"
 +            "atom names from %s will be used\n"
 +            "atom names from %s will be ignored\n",
 +            nmismatch,(nmismatch == 1) ? "" : "s",topfile,confin);
 +      warning(wi,buf);
 +    }    
 +    if (bVerbose) 
 +      fprintf(stderr,"double-checking input for internal consistency...\n");
 +    double_check(ir,state->box,nint_ftype(sys,molinfo,F_CONSTR),wi);
 +  }
 +
 +  if (bGenVel) {
 +    real *mass;
 +    gmx_mtop_atomloop_all_t aloop;
 +    t_atom *atom;
 +
 +    snew(mass,state->natoms);
 +    aloop = gmx_mtop_atomloop_all_init(sys);
 +    while (gmx_mtop_atomloop_all_next(aloop,&i,&atom)) {
 +      mass[i] = atom->m;
 +    }
 +
 +    if (opts->seed == -1) {
 +      opts->seed = make_seed();
 +      fprintf(stderr,"Setting gen_seed to %d\n",opts->seed);
 +    }
 +    maxwell_speed(opts->tempi,opts->seed,sys,state->v);
 +
 +    stop_cm(stdout,state->natoms,mass,state->x,state->v);
 +    sfree(mass);
 +  }
 +
 +  *nmi = nrmols;
 +  *mi  = molinfo;
 +}
 +
 +static void copy_state(const char *slog,t_trxframe *fr,
 +                       gmx_bool bReadVel,t_state *state,
 +                       double *use_time)
 +{
 +    int i;
 +
 +    if (fr->not_ok & FRAME_NOT_OK)
 +    {
 +        gmx_fatal(FARGS,"Can not start from an incomplete frame");
 +    }
 +    if (!fr->bX)
 +    {
 +        gmx_fatal(FARGS,"Did not find a frame with coordinates in file %s",
 +                  slog);
 +    }
 +
 +    for(i=0; i<state->natoms; i++)
 +    {
 +        copy_rvec(fr->x[i],state->x[i]);
 +    }
 +    if (bReadVel)
 +    {
 +        if (!fr->bV)
 +        {
 +            gmx_incons("Trajecory frame unexpectedly does not contain velocities");
 +        }
 +        for(i=0; i<state->natoms; i++)
 +        {
 +            copy_rvec(fr->v[i],state->v[i]);
 +        }
 +    }
 +    if (fr->bBox)
 +    {
 +        copy_mat(fr->box,state->box);
 +    }
 +
 +    *use_time = fr->time;
 +}
 +
 +static void cont_status(const char *slog,const char *ener,
 +                      gmx_bool bNeedVel,gmx_bool bGenVel, real fr_time,
 +                      t_inputrec *ir,t_state *state,
 +                      gmx_mtop_t *sys,
 +                        const output_env_t oenv)
 +     /* If fr_time == -1 read the last frame available which is complete */
 +{
 +    gmx_bool bReadVel;
 +    t_trxframe  fr;
 +    t_trxstatus *fp;
 +    int i;
 +    double use_time;
 +
 +    bReadVel = (bNeedVel && !bGenVel);
 +
 +    fprintf(stderr,
 +            "Reading Coordinates%s and Box size from old trajectory\n",
 +            bReadVel ? ", Velocities" : "");
 +    if (fr_time == -1)
 +    {
 +        fprintf(stderr,"Will read whole trajectory\n");
 +    }
 +    else
 +    {
 +        fprintf(stderr,"Will read till time %g\n",fr_time);
 +    }
 +    if (!bReadVel)
 +    {
 +        if (bGenVel)
 +        {
 +            fprintf(stderr,"Velocities generated: "
 +                    "ignoring velocities in input trajectory\n");
 +        }
 +        read_first_frame(oenv,&fp,slog,&fr,TRX_NEED_X);
 +    }
 +    else
 +    {
 +        read_first_frame(oenv,&fp,slog,&fr,TRX_NEED_X | TRX_NEED_V);
 +        
 +        if (!fr.bV)
 +        {
 +            fprintf(stderr,
 +                    "\n"
 +                    "WARNING: Did not find a frame with velocities in file %s,\n"
 +                    "         all velocities will be set to zero!\n\n",slog);
 +            for(i=0; i<sys->natoms; i++)
 +            {
 +                clear_rvec(state->v[i]);
 +            }
 +            close_trj(fp);
 +            /* Search for a frame without velocities */
 +            bReadVel = FALSE;
 +            read_first_frame(oenv,&fp,slog,&fr,TRX_NEED_X);
 +        }
 +    }
 +
 +    state->natoms = fr.natoms;
 +
 +    if (sys->natoms != state->natoms)
 +    {
 +        gmx_fatal(FARGS,"Number of atoms in Topology "
 +                  "is not the same as in Trajectory");
 +    }
 +    copy_state(slog,&fr,bReadVel,state,&use_time);
 +
 +    /* Find the appropriate frame */
 +    while ((fr_time == -1 || fr.time < fr_time) &&
 +           read_next_frame(oenv,fp,&fr))
 +    {
 +        copy_state(slog,&fr,bReadVel,state,&use_time);
 +    }
 +  
 +    close_trj(fp);
 +
 +    /* Set the relative box lengths for preserving the box shape.
 +     * Note that this call can lead to differences in the last bit
 +     * with respect to using tpbconv to create a [TT].tpx[tt] file.
 +     */
 +    set_box_rel(ir,state);
 +
 +    fprintf(stderr,"Using frame at t = %g ps\n",use_time);
 +    fprintf(stderr,"Starting time for run is %g ps\n",ir->init_t); 
 +  
 +    if ((ir->epc != epcNO || ir->etc == etcNOSEHOOVER) && ener)
 +    {
 +        get_enx_state(ener,use_time,&sys->groups,ir,state);
 +        preserve_box_shape(ir,state->box_rel,state->boxv);
 +    }
 +}
 +
 +static void read_posres(gmx_mtop_t *mtop,t_molinfo *molinfo,gmx_bool bTopB,
 +                        char *fn,
 +                        int rc_scaling, int ePBC, 
 +                        rvec com,
 +                        warninp_t wi)
 +{
 +  gmx_bool   bFirst = TRUE, *hadAtom;
 +  rvec   *x,*v,*xp;
 +  dvec   sum;
 +  double totmass;
 +  t_atoms dumat;
 +  matrix box,invbox;
 +  int    natoms,npbcdim=0;
 +  char   warn_buf[STRLEN],title[STRLEN];
 +  int    a,i,ai,j,k,mb,nat_molb;
 +  gmx_molblock_t *molb;
 +  t_params *pr,*prfb;
 +  t_atom *atom;
 +
 +  get_stx_coordnum(fn,&natoms);
 +  if (natoms != mtop->natoms) {
 +    sprintf(warn_buf,"The number of atoms in %s (%d) does not match the number of atoms in the topology (%d). Will assume that the first %d atoms in the topology and %s match.",fn,natoms,mtop->natoms,min(mtop->natoms,natoms),fn);
 +    warning(wi,warn_buf);
 +  }
 +  snew(x,natoms);
 +  snew(v,natoms);
 +  init_t_atoms(&dumat,natoms,FALSE);
 +  read_stx_conf(fn,title,&dumat,x,v,NULL,box);
 +  
 +  npbcdim = ePBC2npbcdim(ePBC);
 +  clear_rvec(com);
 +  if (rc_scaling != erscNO) {
 +    copy_mat(box,invbox);
 +    for(j=npbcdim; j<DIM; j++) {
 +      clear_rvec(invbox[j]);
 +      invbox[j][j] = 1;
 +    }
 +    m_inv_ur0(invbox,invbox);
 +  }
 +
 +  /* Copy the reference coordinates to mtop */
 +  clear_dvec(sum);
 +  totmass = 0;
 +  a = 0;
 +  snew(hadAtom,natoms);
 +  for(mb=0; mb<mtop->nmolblock; mb++) {
 +    molb = &mtop->molblock[mb];
 +    nat_molb = molb->nmol*mtop->moltype[molb->type].atoms.nr;
 +    pr = &(molinfo[molb->type].plist[F_POSRES]);
 +    prfb = &(molinfo[molb->type].plist[F_FBPOSRES]);
 +    if (pr->nr > 0 || prfb->nr > 0) {
 +      atom = mtop->moltype[molb->type].atoms.atom;
 +      for(i=0; (i<pr->nr); i++) {
 +      ai=pr->param[i].AI;
 +      if (ai >= natoms) {
 +        gmx_fatal(FARGS,"Position restraint atom index (%d) in moltype '%s' is larger than number of atoms in %s (%d).\n",
 +                  ai+1,*molinfo[molb->type].name,fn,natoms);
 +      }
 +    hadAtom[ai]=TRUE;
 +      if (rc_scaling == erscCOM) {
 +        /* Determine the center of mass of the posres reference coordinates */
 +        for(j=0; j<npbcdim; j++) {
 +          sum[j] += atom[ai].m*x[a+ai][j];
 +        }
 +        totmass  += atom[ai].m;
 +      }
 +      }
 +      /* Same for flat-bottomed posres, but do not count an atom twice for COM */
 +      for(i=0; (i<prfb->nr); i++) {
 +          ai=prfb->param[i].AI;
 +          if (ai >= natoms) {
 +              gmx_fatal(FARGS,"Position restraint atom index (%d) in moltype '%s' is larger than number of atoms in %s (%d).\n",
 +                        ai+1,*molinfo[molb->type].name,fn,natoms);
 +          }
 +          if (rc_scaling == erscCOM && hadAtom[ai] == FALSE) {
 +              /* Determine the center of mass of the posres reference coordinates */
 +              for(j=0; j<npbcdim; j++) {
 +                  sum[j] += atom[ai].m*x[a+ai][j];
 +              }
 +              totmass  += atom[ai].m;
 +          }
 +      }
 +      if (!bTopB) {
 +      molb->nposres_xA = nat_molb;
 +      snew(molb->posres_xA,molb->nposres_xA);
 +      for(i=0; i<nat_molb; i++) {
 +        copy_rvec(x[a+i],molb->posres_xA[i]);
 +      }
 +      } else {
 +      molb->nposres_xB = nat_molb;
 +      snew(molb->posres_xB,molb->nposres_xB);
 +      for(i=0; i<nat_molb; i++) {
 +        copy_rvec(x[a+i],molb->posres_xB[i]);
 +      }
 +      }
 +    }
 +    a += nat_molb;
 +  }
 +  if (rc_scaling == erscCOM) {
 +    if (totmass == 0)
 +      gmx_fatal(FARGS,"The total mass of the position restraint atoms is 0");
 +    for(j=0; j<npbcdim; j++)
 +      com[j] = sum[j]/totmass;
 +    fprintf(stderr,"The center of mass of the position restraint coord's is %6.3f %6.3f %6.3f\n",com[XX],com[YY],com[ZZ]);
 +  }
 +
 +  if (rc_scaling != erscNO) {
 +    for(mb=0; mb<mtop->nmolblock; mb++) {
 +      molb = &mtop->molblock[mb];
 +      nat_molb = molb->nmol*mtop->moltype[molb->type].atoms.nr;
 +      if (molb->nposres_xA > 0 || molb->nposres_xB > 0) {
 +      xp = (!bTopB ? molb->posres_xA : molb->posres_xB);
 +      for(i=0; i<nat_molb; i++) {
 +        for(j=0; j<npbcdim; j++) {
 +          if (rc_scaling == erscALL) {
 +            /* Convert from Cartesian to crystal coordinates */
 +            xp[i][j] *= invbox[j][j];
 +            for(k=j+1; k<npbcdim; k++) {
 +              xp[i][j] += invbox[k][j]*xp[i][k];
 +            }
 +          } else if (rc_scaling == erscCOM) {
 +            /* Subtract the center of mass */
 +            xp[i][j] -= com[j];
 +          }
 +        }
 +      }
 +      }
 +    }
 +
 +    if (rc_scaling == erscCOM) {
 +      /* Convert the COM from Cartesian to crystal coordinates */
 +      for(j=0; j<npbcdim; j++) {
 +      com[j] *= invbox[j][j];
 +      for(k=j+1; k<npbcdim; k++) {
 +        com[j] += invbox[k][j]*com[k];
 +      }
 +      }
 +    }
 +  }
 +  
 +  free_t_atoms(&dumat,TRUE);
 +  sfree(x);
 +  sfree(v);
 +  sfree(hadAtom);
 +}
 +
 +static void gen_posres(gmx_mtop_t *mtop,t_molinfo *mi,
 +                       char *fnA, char *fnB,
 +                       int rc_scaling, int ePBC,
 +                       rvec com, rvec comB,
 +                       warninp_t wi)
 +{
 +  int i,j;
 +
 +  read_posres  (mtop,mi,FALSE,fnA,rc_scaling,ePBC,com,wi);
 +  if (strcmp(fnA,fnB) != 0) {
 +      read_posres(mtop,mi,TRUE ,fnB,rc_scaling,ePBC,comB,wi);
 +  }
 +}
 +
 +static void set_wall_atomtype(gpp_atomtype_t at,t_gromppopts *opts,
 +                              t_inputrec *ir,warninp_t wi)
 +{
 +  int i;
 +  char warn_buf[STRLEN];
 +
 +  if (ir->nwall > 0)
 +  {
 +      fprintf(stderr,"Searching the wall atom type(s)\n");
 +  }
 +  for(i=0; i<ir->nwall; i++)
 +  {
 +      ir->wall_atomtype[i] = get_atomtype_type(opts->wall_atomtype[i],at);
 +      if (ir->wall_atomtype[i] == NOTSET)
 +      {
 +          sprintf(warn_buf,"Specified wall atom type %s is not defined",opts->wall_atomtype[i]);
 +          warning_error(wi,warn_buf);
 +      }
 +  }
 +}
 +
 +static int nrdf_internal(t_atoms *atoms)
 +{
 +  int i,nmass,nrdf;
 +
 +  nmass = 0;
 +  for(i=0; i<atoms->nr; i++) {
 +    /* Vsite ptype might not be set here yet, so also check the mass */
 +    if ((atoms->atom[i].ptype == eptAtom ||
 +       atoms->atom[i].ptype == eptNucleus)
 +      && atoms->atom[i].m > 0) {
 +      nmass++;
 +    }
 +  }
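 +  /* Internal degrees of freedom: 3N - 6 for N >= 3 atoms with mass
 +   * (global translation and rotation removed); fewer atoms are
 +   * special-cased below. */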
 +  switch (nmass) {
 +  case 0:  nrdf = 0; break;
 +  case 1:  nrdf = 0; break;
 +  case 2:  nrdf = 1; break;
 +  default: nrdf = nmass*3 - 6; break;
 +  }
 +  
 +  return nrdf;
 +}
 +
 +void
 +spline1d( double        dx,
 +               double *      y,
 +               int           n,
 +               double *      u,
 +               double *      y2 )
 +{
 +    int i;
 +    double p,q;
 +      
 +    y2[0] = 0.0;
 +    u[0]  = 0.0;
 +      
 +    for(i=1;i<n-1;i++)
 +    {
 +        p = 0.5*y2[i-1]+2.0;
 +        y2[i] = -0.5/p;
 +        q = (y[i+1]-2.0*y[i]+y[i-1])/dx;
 +        u[i] = (3.0*q/dx-0.5*u[i-1])/p;
 +    }
 +      
 +    y2[n-1] = 0.0;
 +      
 +    for(i=n-2;i>=0;i--)
 +    {
 +        y2[i] = y2[i]*y2[i+1]+u[i];
 +    }
 +}
 +
 +
 +void
 +interpolate1d( double     xmin,
 +                        double     dx,
 +                        double *   ya,
 +                        double *   y2a,
 +                        double     x,
 +                        double *   y,
 +                        double *   y1)
 +{
 +    int ix;
 +    double a,b;
 +      
 +    ix = (x-xmin)/dx;
 +      
 +    a = (xmin+(ix+1)*dx-x)/dx;
 +    b = (x-xmin-ix*dx)/dx;
 +      
 +    *y  = a*ya[ix]+b*ya[ix+1]+((a*a*a-a)*y2a[ix]+(b*b*b-b)*y2a[ix+1])*(dx*dx)/6.0;
 +    *y1 = (ya[ix+1]-ya[ix])/dx-(3.0*a*a-1.0)/6.0*dx*y2a[ix]+(3.0*b*b-1.0)/6.0*dx*y2a[ix+1];
 +}
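 +
 +/* Illustrative usage sketch for the spline pair above (not called
 + * anywhere): fit sin(x) on a uniform grid, then recover the value and
 + * first derivative at an off-grid point. The grid size and test point
 + * are arbitrary.
 + */
 +static void spline1d_demo(void)
 +{
 +    int    n = 64, i;
 +    double xmin = 0.0, dx = 2.0*M_PI/(n-1);
 +    double ya[64], y2a[64], u[64];
 +    double y, y1;
 +
 +    for (i = 0; i < n; i++)
 +    {
 +        ya[i] = sin(xmin + i*dx);
 +    }
 +    spline1d(dx, ya, n, u, y2a);                    /* second derivatives */
 +    interpolate1d(xmin, dx, ya, y2a, 1.0, &y, &y1); /* evaluate at x = 1 */
 +    /* Expect y ~ sin(1) = 0.841 and y1 ~ cos(1) = 0.540 */
 +    fprintf(stderr, "spline: y = %g, dy/dx = %g\n", y, y1);
 +}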
 +
 +
 +void
 +setup_cmap (int              grid_spacing,
 +                      int              nc,
 +                      real *           grid ,
 +                      gmx_cmap_t *     cmap_grid)
 +{
 +      double *tmp_u,*tmp_u2,*tmp_yy,*tmp_y1,*tmp_t2,*tmp_grid;
 +      
 +    int    i,j,k,ii,jj,kk,idx;
 +      int    offset;
 +    double dx,xmin,v,v1,v2,v12;
 +    double phi,psi;
 +      
 +      snew(tmp_u,2*grid_spacing);
 +      snew(tmp_u2,2*grid_spacing);
 +      snew(tmp_yy,2*grid_spacing);
 +      snew(tmp_y1,2*grid_spacing);
 +      snew(tmp_t2,2*grid_spacing*2*grid_spacing);
 +      snew(tmp_grid,2*grid_spacing*2*grid_spacing);
 +      
 +    dx = 360.0/grid_spacing;
 +    xmin = -180.0-dx*grid_spacing/2;
 +      
 +      for(kk=0;kk<nc;kk++)
 +      {
 +              /* Compute an offset depending on which cmap we are using 
 +               * Offset will be the map number multiplied by
 +               * grid_spacing * grid_spacing * 2
 +               */
 +              offset = kk * grid_spacing * grid_spacing * 2;
 +              
 +              for(i=0;i<2*grid_spacing;i++)
 +              {
 +                      ii=(i+grid_spacing-grid_spacing/2)%grid_spacing;
 +                      
 +                      for(j=0;j<2*grid_spacing;j++)
 +                      {
 +                              jj=(j+grid_spacing-grid_spacing/2)%grid_spacing;
 +                              tmp_grid[i*grid_spacing*2+j] = grid[offset+ii*grid_spacing+jj];
 +                      }
 +              }
 +              
 +              for(i=0;i<2*grid_spacing;i++)
 +              {
 +                      spline1d(dx,&(tmp_grid[2*grid_spacing*i]),2*grid_spacing,tmp_u,&(tmp_t2[2*grid_spacing*i]));
 +              }
 +              
 +              for(i=grid_spacing/2;i<grid_spacing+grid_spacing/2;i++)
 +              {
 +                      ii = i-grid_spacing/2;
 +                      phi = ii*dx-180.0;
 +                      
 +                      for(j=grid_spacing/2;j<grid_spacing+grid_spacing/2;j++)
 +                      {
 +                              jj = j-grid_spacing/2;
 +                              psi = jj*dx-180.0;
 +                              
 +                              for(k=0;k<2*grid_spacing;k++)
 +                              {
 +                                      interpolate1d(xmin,dx,&(tmp_grid[2*grid_spacing*k]),
 +                                                                &(tmp_t2[2*grid_spacing*k]),psi,&tmp_yy[k],&tmp_y1[k]);
 +                              }
 +                              
 +                              spline1d(dx,tmp_yy,2*grid_spacing,tmp_u,tmp_u2);
 +                              interpolate1d(xmin,dx,tmp_yy,tmp_u2,phi,&v,&v1);
 +                              spline1d(dx,tmp_y1,2*grid_spacing,tmp_u,tmp_u2);
 +                              interpolate1d(xmin,dx,tmp_y1,tmp_u2,phi,&v2,&v12);
 +                              
 +                              idx = ii*grid_spacing+jj;
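 +                              /* Each grid point stores 4 values:
 +                               * E, dE/dphi, dE/dpsi, d2E/(dphi dpsi) */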
 +                              cmap_grid->cmapdata[kk].cmap[idx*4] = grid[offset+ii*grid_spacing+jj];
 +                              cmap_grid->cmapdata[kk].cmap[idx*4+1] = v1;
 +                              cmap_grid->cmapdata[kk].cmap[idx*4+2] = v2;
 +                              cmap_grid->cmapdata[kk].cmap[idx*4+3] = v12;
 +                      }
 +              }
 +      }
 +}                             
 +                              
 +void init_cmap_grid(gmx_cmap_t *cmap_grid, int ngrid, int grid_spacing)
 +{
 +      int i,k,nelem;
 +      
 +      cmap_grid->ngrid        = ngrid;
 +      cmap_grid->grid_spacing = grid_spacing;
 +      nelem                   = cmap_grid->grid_spacing*cmap_grid->grid_spacing;
 +      
 +      snew(cmap_grid->cmapdata,ngrid);
 +      
 +      for(i=0;i<cmap_grid->ngrid;i++)
 +      {
 +              snew(cmap_grid->cmapdata[i].cmap,4*nelem);
 +      }
 +}
 +
 +
 +static int count_constraints(gmx_mtop_t *mtop,t_molinfo *mi,warninp_t wi)
 +{
 +  int count,count_mol,i,mb;
 +  gmx_molblock_t *molb;
 +  t_params *plist;
 +  char buf[STRLEN];
 +
 +  count = 0;
 +  for(mb=0; mb<mtop->nmolblock; mb++) {
 +    count_mol = 0;
 +    molb  = &mtop->molblock[mb];
 +    plist = mi[molb->type].plist;
 +      
 +    for(i=0; i<F_NRE; i++) {
 +      if (i == F_SETTLE)
 +      count_mol += 3*plist[i].nr;
 +      else if (interaction_function[i].flags & IF_CONSTRAINT)
 +      count_mol += plist[i].nr;
 +    }
 +      
 +    if (count_mol > nrdf_internal(&mi[molb->type].atoms)) {
 +      sprintf(buf,
 +            "Molecule type '%s' has %d constraints.\n"
 +            "For stability and efficiency there should not be more constraints than internal number of degrees of freedom: %d.\n",
 +            *mi[molb->type].name,count_mol,
 +            nrdf_internal(&mi[molb->type].atoms));
 +      warning(wi,buf);
 +    }
 +    count += molb->nmol*count_mol;
 +  }
 +
 +  return count;
 +}
 +
 +static void check_gbsa_params_charged(gmx_mtop_t *sys, gpp_atomtype_t atype)
 +{
 +    int i,nmiss,natoms,mt;
 +    real q;
 +    const t_atoms *atoms;
 +  
 +    nmiss = 0;
 +    for(mt=0;mt<sys->nmoltype;mt++)
 +    {
 +        atoms  = &sys->moltype[mt].atoms;
 +        natoms = atoms->nr;
 +
 +        for(i=0;i<natoms;i++)
 +        {
 +            q = atoms->atom[i].q;
 +            if ((get_atomtype_radius(atoms->atom[i].type,atype)    == 0  ||
 +                 get_atomtype_vol(atoms->atom[i].type,atype)       == 0  ||
 +                 get_atomtype_surftens(atoms->atom[i].type,atype)  == 0  ||
 +                 get_atomtype_gb_radius(atoms->atom[i].type,atype) == 0  ||
 +                 get_atomtype_S_hct(atoms->atom[i].type,atype)     == 0) &&
 +                q != 0)
 +            {
 +                fprintf(stderr,"\nGB parameter(s) zero for atom type '%s' while charge is %g\n",
 +                        get_atomtype_name(atoms->atom[i].type,atype),q);
 +                nmiss++;
 +            }
 +        }
 +    }
 +
 +    if (nmiss > 0)
 +    {
 +        gmx_fatal(FARGS,"Can't do GB electrostatics; the implicit_genborn_params section of the forcefield has parameters with value zero for %d atomtypes that occur as charged atoms.",nmiss);
 +    }
 +}
 +
 +
 +static void check_gbsa_params(t_inputrec *ir,gpp_atomtype_t atype)
 +{
 +    int  nmiss,i;
 +
 +    /* If we are doing GBSA, check that we got the parameters we need
 +     * This check verifies that there are GBSA parameters for all
 +     * atoms in the force field. To bypass it for testing purposes,
 +     * temporarily comment out the nmiss++ counter below.
 +     */
 +    nmiss = 0;
 +    for(i=0;i<get_atomtype_ntypes(atype);i++)
 +    {
 +        if (get_atomtype_radius(i,atype)    < 0 ||
 +            get_atomtype_vol(i,atype)       < 0 ||
 +            get_atomtype_surftens(i,atype)  < 0 ||
 +            get_atomtype_gb_radius(i,atype) < 0 ||
 +            get_atomtype_S_hct(i,atype)     < 0)
 +        {
 +            fprintf(stderr,"\nGB parameter(s) missing or negative for atom type '%s'\n",
 +                    get_atomtype_name(i,atype));
 +            nmiss++;
 +        }
 +    }
 +    
 +    if (nmiss > 0)
 +    {
 +        gmx_fatal(FARGS,"Can't do GB electrostatics; the implicit_genborn_params section of the forcefield is missing parameters for %d atomtypes or they might be negative.",nmiss);
 +    }
 +  
 +}
 +
 +static void set_verlet_buffer(const gmx_mtop_t *mtop,
 +                              t_inputrec *ir,
 +                              matrix box,
 +                              real verletbuf_drift,
 +                              warninp_t wi)
 +{
 +    real ref_T;
 +    int i;
 +    verletbuf_list_setup_t ls;
 +    real rlist_1x1;
 +    int n_nonlin_vsite;
 +    char warn_buf[STRLEN];
 +
 +    ref_T = 0;
 +    for(i=0; i<ir->opts.ngtc; i++)
 +    {
 +        if (ir->opts.ref_t[i] < 0)
 +        {
 +            warning(wi,"Some atom groups do not use temperature coupling. This cannot be accounted for in the energy drift estimation for the Verlet buffer size. The energy drift and the Verlet buffer might be underestimated.");
 +        }
 +        else
 +        {
 +            ref_T = max(ref_T,ir->opts.ref_t[i]);
 +        }
 +    }
 +
 +    printf("Determining Verlet buffer for an energy drift of %g kJ/mol/ps at %g K\n",verletbuf_drift,ref_T);
 +
 +    for(i=0; i<ir->opts.ngtc; i++)
 +    {
 +        if (ir->opts.ref_t[i] >= 0 && ir->opts.ref_t[i] != ref_T)
 +        {
 +            sprintf(warn_buf,"ref_T for group of %.1f DOFs is %g K, which is smaller than the maximum of %g K used for the buffer size calculation. The buffer size might be on the conservative (large) side.",
 +                    ir->opts.nrdf[i],ir->opts.ref_t[i],ref_T);
 +            warning_note(wi,warn_buf);
 +        }
 +    }
 +
 +    /* Calculate the buffer size for simple atom vs atoms list */
 +    ls.cluster_size_i = 1;
 +    ls.cluster_size_j = 1;
 +    calc_verlet_buffer_size(mtop,det(box),ir,verletbuf_drift,
 +                            &ls,&n_nonlin_vsite,&rlist_1x1);
 +
 +    /* Set the pair-list buffer size in ir */
 +    verletbuf_get_list_setup(FALSE,&ls);
 +    calc_verlet_buffer_size(mtop,det(box),ir,verletbuf_drift,
 +                            &ls,&n_nonlin_vsite,&ir->rlist);
 +
 +    if (n_nonlin_vsite > 0)
 +    {
 +        sprintf(warn_buf,"There are %d non-linear virtual site constructions. Their contribution to the energy drift is approximated. In most cases this does not affect the energy drift significantly.",n_nonlin_vsite);
 +        warning_note(wi,warn_buf);
 +    }
 +
 +    printf("Calculated rlist for %dx%d atom pair-list as %.3f nm, buffer size %.3f nm\n",
 +           1,1,rlist_1x1,rlist_1x1-max(ir->rvdw,ir->rcoulomb));
 +
 +    ir->rlistlong = ir->rlist;
 +    printf("Set rlist, assuming %dx%d atom pair-list, to %.3f nm, buffer size %.3f nm\n",
 +           ls.cluster_size_i,ls.cluster_size_j,
 +           ir->rlist,ir->rlist-max(ir->rvdw,ir->rcoulomb));
 +            
 +    if (sqr(ir->rlistlong) >= max_cutoff2(ir->ePBC,box))
 +    {
 +        gmx_fatal(FARGS,"The pair-list cut-off (%g nm) is longer than half the shortest box vector or longer than the smallest box diagonal element (%g nm). Increase the box size or decrease nstlist or increase verlet-buffer-drift.",ir->rlistlong,sqrt(max_cutoff2(ir->ePBC,box)));
 +    }
 +}
 +
 +int cmain (int argc, char *argv[])
 +{
 +  static const char *desc[] = {
 +    "The gromacs preprocessor",
 +    "reads a molecular topology file, checks the validity of the",
 +    "file, expands the topology from a molecular description to an atomic",
 +    "description. The topology file contains information about",
 +    "molecule types and the number of molecules, the preprocessor",
 +    "copies each molecule as needed. ",
 +    "There is no limitation on the number of molecule types. ",
 +    "Bonds and bond-angles can be converted into constraints, separately",
 +    "for hydrogens and heavy atoms.",
 +    "Then a coordinate file is read and velocities can be generated",
 +    "from a Maxwellian distribution if requested.",
 +    "[TT]grompp[tt] also reads parameters for the [TT]mdrun[tt] ",
 +    "(eg. number of MD steps, time step, cut-off), and others such as",
 +    "NEMD parameters, which are corrected so that the net acceleration",
 +    "is zero.",
 +    "Eventually a binary file is produced that can serve as the sole input",
 +    "file for the MD program.[PAR]",
 +    
 +    "[TT]grompp[tt] uses the atom names from the topology file. The atom names",
 +    "in the coordinate file (option [TT]-c[tt]) are only read to generate",
 +    "warnings when they do not match the atom names in the topology.",
 +    "Note that the atom names are irrelevant for the simulation as",
 +    "only the atom types are used for generating interaction parameters.[PAR]",
 +
 +    "[TT]grompp[tt] uses a built-in preprocessor to resolve includes, macros, ",
 +    "etc. The preprocessor supports the following keywords:[PAR]",
 +    "#ifdef VARIABLE[BR]",
 +    "#ifndef VARIABLE[BR]",
 +    "#else[BR]",
 +    "#endif[BR]",
 +    "#define VARIABLE[BR]",
 +    "#undef VARIABLE[BR]"
 +    "#include \"filename\"[BR]",
 +    "#include <filename>[PAR]",
 +    "The functioning of these statements in your topology may be modulated by",
 +    "using the following two flags in your [TT].mdp[tt] file:[PAR]",
 +    "[TT]define = -DVARIABLE1 -DVARIABLE2[BR]",
 +    "include = -I/home/john/doe[tt][BR]",
 +    "For further information a C-programming textbook may help you out.",
 +    "Specifying the [TT]-pp[tt] flag will get the pre-processed",
 +    "topology file written out so that you can verify its contents.[PAR]",
 +   
 +    /* cpp has been unnecessary for some time, hasn't it?
 +        "If your system does not have a C-preprocessor, you can still",
 +        "use [TT]grompp[tt], but you do not have access to the features ",
 +        "from the cpp. Command line options to the C-preprocessor can be given",
 +        "in the [TT].mdp[tt] file. See your local manual (man cpp).[PAR]",
 +    */
 +    
 +    "When using position restraints a file with restraint coordinates",
 +    "can be supplied with [TT]-r[tt], otherwise restraining will be done",
 +    "with respect to the conformation from the [TT]-c[tt] option.",
 +    "For free energy calculation the the coordinates for the B topology",
 +    "can be supplied with [TT]-rb[tt], otherwise they will be equal to",
 +    "those of the A topology.[PAR]",
 +    
 +    "Starting coordinates can be read from trajectory with [TT]-t[tt].",
 +    "The last frame with coordinates and velocities will be read,",
 +    "unless the [TT]-time[tt] option is used. Only if this information",
 +    "is absent will the coordinates in the [TT]-c[tt] file be used.",
 +    "Note that these velocities will not be used when [TT]gen_vel = yes[tt]",
 +    "in your [TT].mdp[tt] file. An energy file can be supplied with",
 +    "[TT]-e[tt] to read Nose-Hoover and/or Parrinello-Rahman coupling",
 +    "variables.[PAR]",
 +
 +    "[TT]grompp[tt] can be used to restart simulations (preserving",
 +    "continuity) by supplying just a checkpoint file with [TT]-t[tt].",
 +    "However, for simply changing the number of run steps to extend",
 +    "a run, using [TT]tpbconv[tt] is more convenient than [TT]grompp[tt].",
 +    "You then supply the old checkpoint file directly to [TT]mdrun[tt]",
 +    "with [TT]-cpi[tt]. If you wish to change the ensemble or things",
 +    "like output frequency, then supplying the checkpoint file to",
 +    "[TT]grompp[tt] with [TT]-t[tt] along with a new [TT].mdp[tt] file",
 +    "with [TT]-f[tt] is the recommended procedure.[PAR]",
 +
 +    "By default, all bonded interactions which have constant energy due to",
 +    "virtual site constructions will be removed. If this constant energy is",
 +    "not zero, this will result in a shift in the total energy. All bonded",
 +    "interactions can be kept by turning off [TT]-rmvsbds[tt]. Additionally,",
 +    "all constraints for distances which will be constant anyway because",
 +    "of virtual site constructions will be removed. If any constraints remain",
 +    "which involve virtual sites, a fatal error will result.[PAR]"
 +    
 +    "To verify your run input file, please take note of all warnings",
 +    "on the screen, and correct where necessary. Do also look at the contents",
 +    "of the [TT]mdout.mdp[tt] file; this contains comment lines, as well as",
 +    "the input that [TT]grompp[tt] has read. If in doubt, you can start [TT]grompp[tt]",
 +    "with the [TT]-debug[tt] option which will give you more information",
 +    "in a file called [TT]grompp.log[tt] (along with real debug info). You",
 +    "can see the contents of the run input file with the [TT]gmxdump[tt]",
 +    "program. [TT]gmxcheck[tt] can be used to compare the contents of two",
 +    "run input files.[PAR]"
 +
 +    "The [TT]-maxwarn[tt] option can be used to override warnings printed",
 +    "by [TT]grompp[tt] that otherwise halt output. In some cases, warnings are",
 +    "harmless, but usually they are not. The user is advised to carefully",
 +    "interpret the output messages before attempting to bypass them with",
 +    "this option."
 +  };
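 +  /* Illustrative .mdp lines for the preprocessor flags described above
 +   * (the define names are examples; the include path reuses the one from
 +   * the help text):
 +   *   define  = -DPOSRES -DFLEXIBLE
 +   *   include = -I/home/john/doe
 +   * Any #ifdef variable used in the topology can be set this way.
 +   */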
 +  t_gromppopts *opts;
 +  gmx_mtop_t   *sys;
 +  int          nmi;
 +  t_molinfo    *mi;
 +  gpp_atomtype_t atype;
 +  t_inputrec   *ir;
 +  int          natoms,nvsite,comb,mt;
 +  t_params     *plist;
 +  t_state      state;
 +  matrix       box;
 +  real         max_spacing,fudgeQQ;
 +  double       reppow;
 +  char         fn[STRLEN],fnB[STRLEN];
 +  const char   *mdparin;
 +  int          ntype;
 +  gmx_bool         bNeedVel,bGenVel;
 +  gmx_bool         have_atomnumber;
 +  int            n12,n13,n14;
 +  t_params     *gb_plist = NULL;
 +  gmx_genborn_t *born = NULL;
 +  output_env_t oenv;
 +  gmx_bool         bVerbose = FALSE;
 +  warninp_t    wi;
 +  char         warn_buf[STRLEN];
 +
 +  t_filenm fnm[] = {
 +    { efMDP, NULL,  NULL,        ffREAD  },
 +    { efMDP, "-po", "mdout",     ffWRITE },
 +    { efSTX, "-c",  NULL,        ffREAD  },
 +    { efSTX, "-r",  NULL,        ffOPTRD },
 +    { efSTX, "-rb", NULL,        ffOPTRD },
 +    { efNDX, NULL,  NULL,        ffOPTRD },
 +    { efTOP, NULL,  NULL,        ffREAD  },
 +    { efTOP, "-pp", "processed", ffOPTWR },
 +    { efTPX, "-o",  NULL,        ffWRITE },
 +    { efTRN, "-t",  NULL,        ffOPTRD },
 +    { efEDR, "-e",  NULL,        ffOPTRD },
 +    { efTRN, "-ref","rotref",    ffOPTRW }
 +  };
 +#define NFILE asize(fnm)
 +
 +  /* Command line options */
 +  static gmx_bool bRenum=TRUE;
 +  static gmx_bool bRmVSBds=TRUE,bZero=FALSE;
 +  static int  i,maxwarn=0;
 +  static real fr_time=-1;
 +  t_pargs pa[] = {
 +    { "-v",       FALSE, etBOOL,{&bVerbose},  
 +      "Be loud and noisy" },
 +    { "-time",    FALSE, etREAL, {&fr_time},
 +      "Take frame at or first after this time." },
 +    { "-rmvsbds",FALSE, etBOOL, {&bRmVSBds},
 +      "Remove constant bonded interactions with virtual sites" },
 +    { "-maxwarn", FALSE, etINT,  {&maxwarn},
 +      "Number of allowed warnings during input processing. Not for normal use and may generate unstable systems" },
 +    { "-zero",    FALSE, etBOOL, {&bZero},
 +      "Set parameters for bonded interactions without defaults to zero instead of generating an error" },
 +    { "-renum",   FALSE, etBOOL, {&bRenum},
 +      "Renumber atomtypes and minimize number of atomtypes" }
 +  };
 +  
 +  CopyRight(stderr,argv[0]);
 +  
 +  /* Initiate some variables */
 +  snew(ir,1);
 +  snew(opts,1);
 +  init_ir(ir,opts);
 +  
 +  /* Parse the command line */
 +  parse_common_args(&argc,argv,0,NFILE,fnm,asize(pa),pa,
 +                    asize(desc),desc,0,NULL,&oenv);
 +  
 +  wi = init_warning(TRUE,maxwarn);
 +  
 +  /* PARAMETER file processing */
 +  mdparin = opt2fn("-f",NFILE,fnm);
 +  set_warning_line(wi,mdparin,-1);    
 +  get_ir(mdparin,opt2fn("-po",NFILE,fnm),ir,opts,wi);
 +  
 +  if (bVerbose) 
 +    fprintf(stderr,"checking input for internal consistency...\n");
 +  check_ir(mdparin,ir,opts,wi);
 +
 +  if (ir->ld_seed == -1) {
 +    ir->ld_seed = make_seed();
 +    fprintf(stderr,"Setting the LD random seed to %d\n",ir->ld_seed);
 +  }
 +
 +  if (ir->expandedvals->lmc_seed == -1) {
 +    ir->expandedvals->lmc_seed = make_seed();
 +    fprintf(stderr,"Setting the lambda MC random seed to %d\n",ir->expandedvals->lmc_seed);
 +  }
 +
 +  bNeedVel = EI_STATE_VELOCITY(ir->eI);
 +  bGenVel  = (bNeedVel && opts->bGenVel);
 +  if (bGenVel && ir->bContinuation)
 +  {
 +      sprintf(warn_buf,
 +              "Generating velocities is inconsistent with attempting "
 +              "to continue a previous run. Choose only one of "
 +              "gen-vel = yes and continuation = yes.");
 +      warning_error(wi, warn_buf);
 +  }
 +
 +  snew(plist,F_NRE);
 +  init_plist(plist);
 +  snew(sys,1);
 +  atype = init_atomtype();
 +  if (debug)
 +    pr_symtab(debug,0,"Just opened",&sys->symtab);
 +    
 +  strcpy(fn,ftp2fn(efTOP,NFILE,fnm));
 +  if (!gmx_fexist(fn)) 
 +    gmx_fatal(FARGS,"%s does not exist",fn);
 +  new_status(fn,opt2fn_null("-pp",NFILE,fnm),opt2fn("-c",NFILE,fnm),
 +           opts,ir,bZero,bGenVel,bVerbose,&state,
 +           atype,sys,&nmi,&mi,plist,&comb,&reppow,&fudgeQQ,
 +           opts->bMorse,
 +           wi);
 +  
 +  if (debug)
 +    pr_symtab(debug,0,"After new_status",&sys->symtab);
 +
 +    if (ir->cutoff_scheme == ecutsVERLET)
 +    {
 +        fprintf(stderr,"Removing all charge groups because cutoff-scheme=%s\n",
 +                ecutscheme_names[ir->cutoff_scheme]);
 +
 +        /* Remove all charge groups */
 +        gmx_mtop_remove_chargegroups(sys);
 +    }
 +  
 +  if (count_constraints(sys,mi,wi) && (ir->eConstrAlg == econtSHAKE)) {
 +    if (ir->eI == eiCG || ir->eI == eiLBFGS) {
 +        sprintf(warn_buf,"Can not do %s with %s, use %s",
 +                EI(ir->eI),econstr_names[econtSHAKE],econstr_names[econtLINCS]);
 +        warning_error(wi,warn_buf);
 +    }
 +    if (ir->bPeriodicMols) {
 +        sprintf(warn_buf,"Can not do periodic molecules with %s, use %s",
 +                econstr_names[econtSHAKE],econstr_names[econtLINCS]);
 +        warning_error(wi,warn_buf);
 +    }
 +  }
 +
 +  if (EI_SD(ir->eI) && ir->etc != etcNO) {
 +      warning_note(wi,"Temperature coupling is ignored with SD integrators.");
 +  }
 +
 +  /* If we are doing QM/MM, check that we got the atom numbers */
 +  have_atomnumber = TRUE;
 +  for (i=0; i<get_atomtype_ntypes(atype); i++) {
 +    have_atomnumber = have_atomnumber && (get_atomtype_atomnumber(i,atype) >= 0);
 +  }
 +  if (!have_atomnumber && ir->bQMMM)
 +  {
 +      warning_error(wi,
 +                    "\n"
 +                    "It appears as if you are trying to run a QM/MM calculation, but the force\n"
 +                    "field you are using does not contain atom numbers fields. This is an\n"
 +                    "optional field (introduced in Gromacs 3.3) for general runs, but mandatory\n"
 +                    "for QM/MM. The good news is that it is easy to add - put the atom number as\n"
 +                    "an integer just before the mass column in ffXXXnb.itp.\n"
 +                    "NB: United atoms have the same atom numbers as normal ones.\n\n"); 
 +  }
 +
 +  if (ir->bAdress) {
 +    if ((ir->adress->const_wf>1) || (ir->adress->const_wf<0)) {
 +      warning_error(wi,"AdResS contant weighting function should be between 0 and 1\n\n");
 +    }
 +    /** \TODO check size of ex+hy width against box size */
 +  }
 + 
 +  /* Check for errors in the input now, since they might cause problems
 +   * during processing further down.
 +   */
 +  check_warning_error(wi,FARGS);
 +
 +  if (opt2bSet("-r",NFILE,fnm))
 +    sprintf(fn,"%s",opt2fn("-r",NFILE,fnm));
 +  else
 +    sprintf(fn,"%s",opt2fn("-c",NFILE,fnm));
 +  if (opt2bSet("-rb",NFILE,fnm))
 +    sprintf(fnB,"%s",opt2fn("-rb",NFILE,fnm));
 +  else
 +    strcpy(fnB,fn);
 +
 +    if (nint_ftype(sys,mi,F_POSRES) > 0 || nint_ftype(sys,mi,F_FBPOSRES) > 0)
 +    {
 +        if (bVerbose)
 +        {
 +            fprintf(stderr,"Reading position restraint coords from %s",fn);
 +            if (strcmp(fn,fnB) == 0)
 +            {
 +                fprintf(stderr,"\n");
 +            }
 +            else
 +            {
 +                fprintf(stderr," and %s\n",fnB);
 +            }
 +        }
 +        gen_posres(sys,mi,fn,fnB,
 +                   ir->refcoord_scaling,ir->ePBC,
 +                   ir->posres_com,ir->posres_comB,
 +                   wi);
 +    }
 +              
 +  nvsite = 0;
 +  /* set parameters for virtual site construction (not for vsiten) */
 +  for(mt=0; mt<sys->nmoltype; mt++) {
 +    nvsite +=
 +      set_vsites(bVerbose, &sys->moltype[mt].atoms, atype, mi[mt].plist);
 +  }
 +  /* now throw away all obsolete bonds, angles and dihedrals: */
 +  /* note: constraints are ALWAYS removed */
 +  if (nvsite) {
 +    for(mt=0; mt<sys->nmoltype; mt++) {
 +      clean_vsite_bondeds(mi[mt].plist,sys->moltype[mt].atoms.nr,bRmVSBds);
 +    }
 +  }
 +  
 +      /* If we are using CMAP, setup the pre-interpolation grid */
 +      if(plist->ncmap>0)
 +      {
 +              init_cmap_grid(&sys->ffparams.cmap_grid, plist->nc, plist->grid_spacing);
 +              setup_cmap(plist->grid_spacing, plist->nc, plist->cmap,&sys->ffparams.cmap_grid);
 +      }
 +      
 +    set_wall_atomtype(atype,opts,ir,wi);
 +  if (bRenum) {
 +    renum_atype(plist, sys, ir->wall_atomtype, atype, bVerbose);
 +    ntype = get_atomtype_ntypes(atype);
 +  }
 +
 +    if (ir->implicit_solvent != eisNO)
 +    {
 +        /* Now we have renumbered the atom types, we can check the GBSA params */
 +        check_gbsa_params(ir,atype);
 +      
 +      /* Check that all atoms that have charge and/or LJ-parameters also have 
 +       * sensible GB-parameters
 +       */
 +      check_gbsa_params_charged(sys,atype);
 +    }
 +
 +      /* PELA: Copy the atomtype data to the topology atomtype list */
 +      copy_atomtype_atomtypes(atype,&(sys->atomtypes));
 +
 +      if (debug)
 +    pr_symtab(debug,0,"After renum_atype",&sys->symtab);
 +
 +  if (bVerbose) 
 +    fprintf(stderr,"converting bonded parameters...\n");
 +      
 +  ntype = get_atomtype_ntypes(atype);
 +  convert_params(ntype, plist, mi, comb, reppow, fudgeQQ, sys);
 +      
 +  if (debug)
 +    pr_symtab(debug,0,"After convert_params",&sys->symtab);
 +
 +  /* set ptype to VSite for virtual sites */
 +  for(mt=0; mt<sys->nmoltype; mt++) {
 +    set_vsites_ptype(FALSE,&sys->moltype[mt]);
 +  }
 +  if (debug) {
 +    pr_symtab(debug,0,"After virtual sites",&sys->symtab);
 +  }
 +  /* Check velocity for virtual sites and shells */
 +  if (bGenVel) {
 +    check_vel(sys,state.v);
 +  }
 +    
 +  /* check masses */
 +  check_mol(sys,wi);
 +  
 +  for(i=0; i<sys->nmoltype; i++) {
 +      check_cg_sizes(ftp2fn(efTOP,NFILE,fnm),&sys->moltype[i].cgs,wi);
 +  }
 +
 +  if (EI_DYNAMICS(ir->eI) && ir->eI != eiBD)
 +  {
 +      check_bonds_timestep(sys,ir->delta_t,wi);
 +  }
 +
 +  if (EI_ENERGY_MINIMIZATION(ir->eI) && 0 == ir->nsteps)
 +  {
 +      warning_note(wi,"Zero-step energy minimization will alter the coordinates before calculating the energy. If you just want the energy of a single point, try zero-step MD (with unconstrained_start = yes). To do multiple single-point energy evaluations of different configurations of the same topology, use mdrun -rerun.");
 +  }
 +
 +  check_warning_error(wi,FARGS);
 +      
 +  if (bVerbose) 
 +    fprintf(stderr,"initialising group options...\n");
 +  do_index(mdparin,ftp2fn_null(efNDX,NFILE,fnm),
 +           sys,bVerbose,ir,
 +           bGenVel ? state.v : NULL,
 +           wi);
 +  
 +    if (ir->cutoff_scheme == ecutsVERLET && ir->verletbuf_drift > 0 &&
 +        ir->nstlist > 1)
 +    {
 +        if (EI_DYNAMICS(ir->eI) &&
 +            !(EI_MD(ir->eI) && ir->etc==etcNO) &&
 +            inputrec2nboundeddim(ir) == 3)
 +        {
 +            set_verlet_buffer(sys,ir,state.box,ir->verletbuf_drift,wi);
 +        }
 +    }
 +
 +  /* Init the temperature coupling state */
 +  init_gtc_state(&state,ir->opts.ngtc,0,ir->opts.nhchainlength); /* need to add nnhpres here? */
 +
 +  if (bVerbose)
 +    fprintf(stderr,"Checking consistency between energy and charge groups...\n");
 +  check_eg_vs_cg(sys);
 +  
 +  if (debug)
 +    pr_symtab(debug,0,"After index",&sys->symtab);
 +  triple_check(mdparin,ir,sys,wi);
 +  close_symtab(&sys->symtab);
 +  if (debug)
 +    pr_symtab(debug,0,"After close",&sys->symtab);
 +
 +  /* make exclusions between QM atoms */
 +  if (ir->bQMMM) {
 +    if (ir->QMMMscheme==eQMMMschemenormal && ir->ns_type == ensSIMPLE ){
 +      gmx_fatal(FARGS,"electrostatic embedding only works with grid neighboursearching, use ns-type=grid instead\n");
 +    }
 +    else {
 +     generate_qmexcl(sys,ir,wi);
 +    }
 +  }
 +
 +  if (ftp2bSet(efTRN,NFILE,fnm)) {
 +    if (bVerbose)
 +      fprintf(stderr,"getting data from old trajectory ...\n");
 +    cont_status(ftp2fn(efTRN,NFILE,fnm),ftp2fn_null(efEDR,NFILE,fnm),
 +              bNeedVel,bGenVel,fr_time,ir,&state,sys,oenv);
 +  }
 +
 +    if (ir->ePBC==epbcXY && ir->nwall!=2)
 +    {
 +        clear_rvec(state.box[ZZ]);
 +    }
 +  
 +    if (ir->cutoff_scheme != ecutsVERLET && ir->rlist > 0)
 +    {
 +        set_warning_line(wi,mdparin,-1);
 +        check_chargegroup_radii(sys,ir,state.x,wi);
 +    }
 +
 +  if (EEL_FULL(ir->coulombtype)) {
 +    /* Calculate the optimal grid dimensions */
 +    copy_mat(state.box,box);
 +    if (ir->ePBC==epbcXY && ir->nwall==2)
 +      svmul(ir->wall_ewald_zfac,box[ZZ],box[ZZ]);
 +    if (ir->nkx > 0 && ir->nky > 0 && ir->nkz > 0)
 +    {
 +        /* Mark fourier_spacing as not used */
 +        ir->fourier_spacing = 0;
 +    }
 +    else if (ir->nkx != 0 && ir->nky != 0 && ir->nkz != 0)
 +    {
 +        set_warning_line(wi,mdparin,-1);
 +        warning_error(wi,"Some of the Fourier grid sizes are set, but all of them need to be set.");
 +    }
 +    max_spacing = calc_grid(stdout,box,ir->fourier_spacing,
 +                            &(ir->nkx),&(ir->nky),&(ir->nkz));
 +  }
 +
++  /* MRS: eventually figure out better logic for initializing the fep
++   values that makes declaring the lambda and declaring the state not
++   potentially conflict if not handled correctly. */
++  if (ir->efep != efepNO)
++  {
++      state.fep_state = ir->fepvals->init_fep_state;
++      for (i=0;i<efptNR;i++)
++      {
++          /* init_lambda trumps state definitions */
++          if (ir->fepvals->init_lambda >= 0)
++          {
++              state.lambda[i] = ir->fepvals->init_lambda;
++          }
++          else
++          {
++              if (ir->fepvals->all_lambda[i] == NULL)
++              {
++                  gmx_fatal(FARGS,"Values of lambda not set for a free energy calculation!");
++              }
++              else
++              {
++                  state.lambda[i] = ir->fepvals->all_lambda[i][state.fep_state];
++              }
++          }
++      }
++  }
++
 +  if (ir->ePull != epullNO)
-   /* MRS: eventually figure out better logic for initializing the fep
-    values that makes declaring the lambda and declaring the state not
-    potentially conflict if not handled correctly. */
-   if (ir->efep != efepNO)
-   {
-       state.fep_state = ir->fepvals->init_fep_state;
-       for (i=0;i<efptNR;i++)
-       {
-           /* init_lambda trumps state definitions*/
-           if (ir->fepvals->init_lambda >= 0)
-           {
-               state.lambda[i] = ir->fepvals->init_lambda;
-           }
-           else
-           {
-               if (ir->fepvals->all_lambda[i] == NULL)
-               {
-                   gmx_fatal(FARGS,"Values of lambda not set for a free energy calculation!");
-               }
-               else
-               {
-                   state.lambda[i] = ir->fepvals->all_lambda[i][state.fep_state];
-               }
-           }
-       }
-   }
++      set_pull_init(ir,sys,state.x,state.box,state.lambda[efptMASS],oenv,opts->pull_start);
 +  
 +  if (ir->bRot)
 +  {
 +      set_reference_positions(ir->rot,sys,state.x,state.box,
 +                              opt2fn("-ref",NFILE,fnm),opt2bSet("-ref",NFILE,fnm),
 +                              wi);
 +  }
 +
 +  /*  reset_multinr(sys); */
 +  
 +  if (EEL_PME(ir->coulombtype)) {
 +      float ratio = pme_load_estimate(sys,ir,state.box);
 +      fprintf(stderr,"Estimate for the relative computational load of the PME mesh part: %.2f\n",ratio);
 +      /* With free energy we might need to do PME both for the A and B state
 +       * charges. This will double the cost, but the optimal performance will
 +       * then probably be at a slightly larger cut-off and grid spacing.
 +       */
 +      if ((ir->efep == efepNO && ratio > 1.0/2.0) ||
 +          (ir->efep != efepNO && ratio > 2.0/3.0)) {
 +          warning_note(wi,
 +                       "The optimal PME mesh load for parallel simulations is below 0.5\n"
 +                       "and for highly parallel simulations between 0.25 and 0.33;\n"
 +                       "for higher performance, increase the cut-off and the PME grid spacing.\n");
 +          if (ir->efep != efepNO) {
 +              warning_note(wi,
 +                           "For free energy simulations, the optimal load limit increases from 0.5 to 0.667\n");
 +          }
 +      }
 +  }
 +  
 +  {
 +        char warn_buf[STRLEN];
 +        double cio = compute_io(ir,sys->natoms,&sys->groups,F_NRE,1);
 +        sprintf(warn_buf,"This run will generate roughly %.0f Mb of data",cio);
 +        if (cio > 2000) {
 +            set_warning_line(wi,mdparin,-1);
 +            warning_note(wi,warn_buf);
 +        } else {
 +            printf("%s\n",warn_buf);
 +        }
 +    }
 +      
 +  if (bVerbose) 
 +    fprintf(stderr,"writing run input file...\n");
 +
 +  done_warning(wi,FARGS);
 +
 +  write_tpx_state(ftp2fn(efTPX,NFILE,fnm),ir,&state,sys);
 +  
 +  thanx(stderr);
 +  
 +  return 0;
 +}
index 3e2ecf3cf3c3b20a2768534aae05bdf9f54a0c0f,0000000000000000000000000000000000000000..44559ff4a9321ba3550f356d32ca89efc5fb648a
mode 100644,000000..100644
--- /dev/null
@@@ -1,743 -1,0 +1,744 @@@
-     if (pme_lb->cutoff_scheme == ecutsVERLET && nbv->grp[0].kernel_type == nbk8x8x8_CUDA)
 +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
 + *
 + * 
 + *                This source code is part of
 + * 
 + *                 G   R   O   M   A   C   S
 + * 
 + *          GROningen MAchine for Chemical Simulations
 + * 
 + *                        VERSION 4.6.0
 + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
 + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 + * Copyright (c) 2001-2011, The GROMACS development team,
 + * check out http://www.gromacs.org for more information.
 +
 + * This program is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU General Public License
 + * as published by the Free Software Foundation; either version 2
 + * of the License, or (at your option) any later version.
 + * 
 + * If you want to redistribute modifications, please consider that
 + * scientific software is very special. Version control is crucial -
 + * bugs must be traceable. We will be happy to consider code for
 + * inclusion in the official distribution, but derived work must not
 + * be called official GROMACS. Details are found in the README & COPYING
 + * files - if they are missing, get the official version at www.gromacs.org.
 + * 
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the papers on the package - you can find them in the top README file.
 + * 
 + * For more info, check our website at http://www.gromacs.org
 + * 
 + * And Hey:
 + * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon
 + */
 +#ifdef HAVE_CONFIG_H
 +#include <config.h>
 +#endif
 +
 +#include "smalloc.h"
 +#include "network.h"
 +#include "calcgrid.h"
 +#include "pme.h"
 +#include "vec.h"
 +#include "domdec.h"
 +#include "nbnxn_cuda_data_mgmt.h"
 +#include "force.h"
 +#include "macros.h"
 +#include "pme_loadbal.h"
 +
 +/* Parameters and setting for one PP-PME setup */
 +typedef struct {
 +    real rcut_coulomb;    /* Coulomb cut-off                              */
 +    real rlist;           /* pair-list cut-off                            */
 +    real rlistlong;       /* LR pair-list cut-off                         */
 +    int  nstcalclr;       /* frequency of evaluating long-range forces for group scheme */
 +    real spacing;         /* (largest) PME grid spacing                   */
 +    ivec grid;            /* the PME grid dimensions                      */
 +    real grid_efficiency; /* inefficiency factor (<= 1) for non-uniform grids */
 +    real ewaldcoeff;      /* the Ewald coefficient                        */
 +    gmx_pme_t pmedata;    /* the data structure used in the PME code      */
 +
 +    int  count;           /* number of times this setup has been timed    */
 +    double cycles;        /* the fastest time for this setup in cycles    */
 +} pme_setup_t;
 +
 +/* In the initial scan, step by grids that are at least a factor 0.8 coarser */
 +#define PME_LB_GRID_SCALE_FAC  0.8
 +/* In the initial scan, try to skip grids with uneven x/y/z spacing,
 + * checking if the "efficiency" is more than 5% worse than the previous grid.
 + */
 +#define PME_LB_GRID_EFFICIENCY_REL_FAC  1.05
 +/* Rerun setups that are up to 12% slower than the fastest one so far */
 +#define PME_LB_SLOW_FAC  1.12
 +/* If setups get more than 2% faster, do another round to avoid
 + * choosing a slower setup due to acceleration or fluctuations.
 + */
 +#define PME_LB_ACCEL_TOL 1.02
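 +/* How these tolerances are used below (summary): the stage-0 scan stops
 + * once a setup is slower than fastest*PME_LB_SLOW_FAC, later stages rerun
 + * only setups within PME_LB_SLOW_FAC of the fastest, and an extra stage
 + * is added when a timing beats the stored minimum by more than
 + * PME_LB_ACCEL_TOL.
 + */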
 +
 +enum { epmelblimNO, epmelblimBOX, epmelblimDD, epmelblimNR };
 +
 +const char *pmelblim_str[epmelblimNR] =
 +{ "no", "box size", "domain decompostion" };
 +
 +struct pme_load_balancing {
 +    int  nstage;        /* the current maximum number of stages */
 +
 +    real cut_spacing;   /* the minimum cutoff / PME grid spacing ratio */
 +    real rcut_vdw;      /* Vdw cutoff (does not change) */
 +    real rcut_coulomb_start; /* Initial electrostatics cutoff */
 +    int  nstcalclr_start; /* Initial frequency of long-range force evaluation */
 +    real rbuf_coulomb;  /* the Coulomb pairlist buffer size */
 +    real rbuf_vdw;      /* the VdW pairlist buffer size */
 +    matrix box_start;   /* the initial simulation box */
 +    int n;              /* the number of setups, which is also the allocation size */
 +    pme_setup_t *setup; /* the PME+cutoff setups */
 +    int cur;            /* the current setup */
 +    int fastest;        /* fastest setup found so far */
 +    int start;          /* start of setup range to consider in stage>0 */
 +    int end;            /* end   of setup range to consider in stage>0 */
 +    int elimited;       /* was the balancing limited, uses enum above */
 +    int cutoff_scheme;  /* Verlet or group cut-offs */
 +
 +    int stage;          /* the current stage */
 +};
 +
 +void pme_loadbal_init(pme_load_balancing_t *pme_lb_p,
 +                      const t_inputrec *ir,matrix box,
 +                      const interaction_const_t *ic,
 +                      gmx_pme_t pmedata)
 +{
 +    pme_load_balancing_t pme_lb;
 +    real spm,sp;
 +    int  d;
 +
 +    snew(pme_lb,1);
 +
 +    /* Any number of stages >= 2 is supported */
 +    pme_lb->nstage   = 2;
 +
 +    pme_lb->cutoff_scheme = ir->cutoff_scheme;
 +
 +    if(pme_lb->cutoff_scheme == ecutsVERLET)
 +    {
 +        pme_lb->rbuf_coulomb = ic->rlist - ic->rcoulomb;
 +        pme_lb->rbuf_vdw     = pme_lb->rbuf_coulomb;
 +    }
 +    else
 +    {
 +        if(ic->rcoulomb > ic->rlist)
 +        {
 +            pme_lb->rbuf_coulomb = ic->rlistlong - ic->rcoulomb;
 +        }
 +        else
 +        {
 +            pme_lb->rbuf_coulomb = ic->rlist - ic->rcoulomb;
 +        }
 +        if(ic->rvdw > ic->rlist)
 +        {
 +            pme_lb->rbuf_vdw = ic->rlistlong - ic->rvdw;
 +        }
 +        else
 +        {
 +            pme_lb->rbuf_vdw = ic->rlist - ic->rvdw;
 +        }
 +    }
 +
 +    copy_mat(box,pme_lb->box_start);
 +    if (ir->ePBC==epbcXY && ir->nwall==2)
 +    {
 +        svmul(ir->wall_ewald_zfac,pme_lb->box_start[ZZ],pme_lb->box_start[ZZ]);
 +    }
 +
 +    pme_lb->n = 1;
 +    snew(pme_lb->setup,pme_lb->n);
 +
 +    pme_lb->rcut_vdw              = ic->rvdw;
 +    pme_lb->rcut_coulomb_start    = ir->rcoulomb;
 +    pme_lb->nstcalclr_start       = ir->nstcalclr;
 +    
 +    pme_lb->cur = 0;
 +    pme_lb->setup[0].rcut_coulomb = ic->rcoulomb;
 +    pme_lb->setup[0].rlist        = ic->rlist;
 +    pme_lb->setup[0].rlistlong    = ic->rlistlong;
 +    pme_lb->setup[0].nstcalclr    = ir->nstcalclr;
 +    pme_lb->setup[0].grid[XX]     = ir->nkx;
 +    pme_lb->setup[0].grid[YY]     = ir->nky;
 +    pme_lb->setup[0].grid[ZZ]     = ir->nkz;
 +    pme_lb->setup[0].ewaldcoeff   = ic->ewaldcoeff;
 +
 +    pme_lb->setup[0].pmedata  = pmedata;
 +    
 +    spm = 0;
 +    for(d=0; d<DIM; d++)
 +    {
 +        sp = norm(pme_lb->box_start[d])/pme_lb->setup[0].grid[d];
 +        if (sp > spm)
 +        {
 +            spm = sp;
 +        }
 +    }
 +    pme_lb->setup[0].spacing = spm;
 +
 +    if (ir->fourier_spacing > 0)
 +    {
 +        pme_lb->cut_spacing = ir->rcoulomb/ir->fourier_spacing;
 +    }
 +    else
 +    {
 +        pme_lb->cut_spacing = ir->rcoulomb/pme_lb->setup[0].spacing;
 +    }
 +
 +    pme_lb->stage = 0;
 +
 +    pme_lb->fastest  = 0;
 +    pme_lb->start    = 0;
 +    pme_lb->end      = 0;
 +    pme_lb->elimited = epmelblimNO;
 +
 +    *pme_lb_p = pme_lb;
 +}
 +
 +static gmx_bool pme_loadbal_increase_cutoff(pme_load_balancing_t pme_lb,
 +                                            int pme_order)
 +{
 +    pme_setup_t *set;
 +    real fac,sp;
 +    real tmpr_coulomb,tmpr_vdw;
 +    int d;
 +
 +    /* Try to add a new setup with next larger cut-off to the list */
 +    pme_lb->n++;
 +    srenew(pme_lb->setup,pme_lb->n);
 +    set = &pme_lb->setup[pme_lb->n-1];
 +    set->pmedata = NULL;
 +
 +    fac = 1;
 +    do
 +    {
 +        fac *= 1.01;
 +        clear_ivec(set->grid);
 +        sp = calc_grid(NULL,pme_lb->box_start,
 +                       fac*pme_lb->setup[pme_lb->cur].spacing,
 +                       &set->grid[XX],
 +                       &set->grid[YY],
 +                       &set->grid[ZZ]);
 +
 +        /* In parallel we can't have grids smaller than 2*pme_order,
 +         * and we would anyhow not gain much speed at these grid sizes.
 +         */
 +        for(d=0; d<DIM; d++)
 +        {
 +            if (set->grid[d] <= 2*pme_order)
 +            {
 +                pme_lb->n--;
 +
 +                return FALSE;
 +            }
 +        }
 +    }
 +    while (sp <= 1.001*pme_lb->setup[pme_lb->cur].spacing);
 +
 +    set->rcut_coulomb = pme_lb->cut_spacing*sp;
 +
 +    if(pme_lb->cutoff_scheme == ecutsVERLET)
 +    {
 +        set->rlist        = set->rcut_coulomb + pme_lb->rbuf_coulomb;
 +        /* We don't use LR lists with Verlet, but setting rlistlong avoids if-statements in further checks */
 +        set->rlistlong    = set->rlist;
 +    }
 +    else
 +    {
 +        tmpr_coulomb          = set->rcut_coulomb + pme_lb->rbuf_coulomb;
 +        tmpr_vdw              = pme_lb->rcut_vdw + pme_lb->rbuf_vdw;
 +        set->rlist            = min(tmpr_coulomb,tmpr_vdw);
 +        set->rlistlong        = max(tmpr_coulomb,tmpr_vdw);
 +        
 +        /* Set the long-range update frequency */
 +        if(set->rlist == set->rlistlong)
 +        {
 +            /* No long-range interactions if the short-/long-range cutoffs are identical */
 +            set->nstcalclr = 0;
 +        }
 +        else if(pme_lb->nstcalclr_start==0 || pme_lb->nstcalclr_start==1)
 +        {
 +            /* We were not doing long-range before, but now we are since rlist!=rlistlong */
 +            set->nstcalclr = 1;
 +        }
 +        else
 +        {
 +            /* We were already doing long-range interactions from the start */
 +            if(pme_lb->rcut_vdw > pme_lb->rcut_coulomb_start)
 +            {
 +                /* We were originally doing long-range VdW-only interactions.
 +                 * If rvdw is still longer than rcoulomb we keep the original nstcalclr,
 +                 * but if the coulomb cutoff has become longer we should update the long-range
 +                 * part every step.
 +                 */
 +                set->nstcalclr = (tmpr_vdw > tmpr_coulomb) ? pme_lb->nstcalclr_start : 1;
 +            }
 +            else
 +            {
 +                /* We were not doing any long-range interaction from the start,
 +                 * since it is not possible to do twin-range coulomb for the PME interaction.
 +                 */
 +                set->nstcalclr = 1;
 +            }
 +        }
 +    }
 +    
 +    set->spacing      = sp;
 +    /* The grid efficiency is the grid size relative to a grid with uniform x/y/z spacing */
 +    set->grid_efficiency = 1;
 +    for(d=0; d<DIM; d++)
 +    {
 +        set->grid_efficiency *= (set->grid[d]*sp)/norm(pme_lb->box_start[d]);
 +    }
 +    /* The Ewald coefficient is inversely proportional to the cut-off */
 +    set->ewaldcoeff =
 +        pme_lb->setup[0].ewaldcoeff*pme_lb->setup[0].rcut_coulomb/set->rcut_coulomb;
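 +    /* Keeping ewaldcoeff*rcut_coulomb constant keeps the real-space
 +     * truncation error, which depends on erfc(ewaldcoeff*rcut), essentially
 +     * the same for all setups.
 +     */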
 +
 +    set->count   = 0;
 +    set->cycles  = 0;
 +
 +    if (debug)
 +    {
 +        fprintf(debug,"PME loadbal: grid %d %d %d, coulomb cutoff %f\n",
 +                set->grid[XX],set->grid[YY],set->grid[ZZ],set->rcut_coulomb);
 +    }
 +    return TRUE;
 +}
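 +/* Worked example (illustrative numbers, not from this code): cut_spacing
 + * is rcoulomb/spacing of the initial setup, so if the scan above settles
 + * on sp = 0.135 nm with cut_spacing = 1.0/0.120, the new cut-off becomes
 + * 0.135/0.120 = 1.125 nm, i.e. the cut-off/grid-spacing ratio is preserved.
 + */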
 +
 +static void print_grid(FILE *fp_err,FILE *fp_log,
 +                       const char *pre,
 +                       const char *desc,
 +                       const pme_setup_t *set,
 +                       double cycles)
 +{
 +    char buf[STRLEN],buft[STRLEN];
 +
 +    if (cycles >= 0)
 +    {
 +        sprintf(buft,": %.1f M-cycles",cycles*1e-6);
 +    }
 +    else
 +    {
 +        buft[0] = '\0';
 +    }
 +    sprintf(buf,"%-11s%10s pme grid %d %d %d, coulomb cutoff %.3f%s",
 +            pre,
 +            desc,set->grid[XX],set->grid[YY],set->grid[ZZ],set->rcut_coulomb,
 +            buft);
 +    if (fp_err != NULL)
 +    {
 +        fprintf(fp_err,"\r%s\n",buf);
 +    }
 +    if (fp_log != NULL)
 +    {
 +        fprintf(fp_log,"%s\n",buf);
 +    }
 +}
 +
 +static int pme_loadbal_end(pme_load_balancing_t pme_lb)
 +{
 +    /* In the initial stage only n is set; end is not set yet */
 +    if (pme_lb->end > 0)
 +    {
 +        return pme_lb->end;
 +    }
 +    else
 +    {
 +        return pme_lb->n;
 +    }
 +}
 +
 +static void print_loadbal_limited(FILE *fp_err,FILE *fp_log,
 +                                  gmx_large_int_t step,
 +                                  pme_load_balancing_t pme_lb)
 +{
 +    char buf[STRLEN],sbuf[22];
 +
 +    sprintf(buf,"step %4s: the %s limited the PME load balancing to a coulomb cut-off of %.3f",
 +            gmx_step_str(step,sbuf),
 +            pmelblim_str[pme_lb->elimited],
 +            pme_lb->setup[pme_loadbal_end(pme_lb)-1].rcut_coulomb);
 +    if (fp_err != NULL)
 +    {
 +        fprintf(fp_err,"\r%s\n",buf);
 +    }
 +    if (fp_log != NULL)
 +    {
 +        fprintf(fp_log,"%s\n",buf);
 +    }
 +}
 +
 +static void switch_to_stage1(pme_load_balancing_t pme_lb)
 +{
 +    pme_lb->start = 0;
 +    while (pme_lb->start+1 < pme_lb->n &&
 +           (pme_lb->setup[pme_lb->start].count == 0 ||
 +            pme_lb->setup[pme_lb->start].cycles >
 +            pme_lb->setup[pme_lb->fastest].cycles*PME_LB_SLOW_FAC))
 +    {
 +        pme_lb->start++;
 +    }
 +    while (pme_lb->start > 0 && pme_lb->setup[pme_lb->start-1].cycles == 0)
 +    {
 +        pme_lb->start--;
 +    }
 +
 +    pme_lb->end = pme_lb->n;
 +    if (pme_lb->setup[pme_lb->end-1].count > 0 &&
 +        pme_lb->setup[pme_lb->end-1].cycles >
 +        pme_lb->setup[pme_lb->fastest].cycles*PME_LB_SLOW_FAC)
 +    {
 +        pme_lb->end--;
 +    }
 +
 +    pme_lb->stage = 1;
 +
 +    /* Next we want to choose setup pme_lb->start, but as we will increase
 +     * pme_lb->cur by one right after returning, we subtract 1 here.
 +     */
 +    pme_lb->cur = pme_lb->start - 1;
 +}
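 +/* After stage 0, [start,end) brackets the setups worth re-timing: timed
 + * setups slower than fastest*PME_LB_SLOW_FAC at either end of the scanned
 + * range are excluded, while untimed setups adjacent to the range are kept.
 + */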
 +
 +gmx_bool pme_load_balance(pme_load_balancing_t pme_lb,
 +                          t_commrec *cr,
 +                          FILE *fp_err,
 +                          FILE *fp_log,
 +                          t_inputrec *ir,
 +                          t_state *state,
 +                          double cycles,
 +                          interaction_const_t *ic,
 +                          nonbonded_verlet_t *nbv,
 +                          gmx_pme_t *pmedata,
 +                          gmx_large_int_t step)
 +{
 +    gmx_bool OK;
 +    pme_setup_t *set;
 +    double cycles_fast;
 +    char buf[STRLEN],sbuf[22];
 +    real rtab;
 +    gmx_bool bUsesSimpleTables = TRUE;
 +
 +    if (pme_lb->stage == pme_lb->nstage)
 +    {
 +        return FALSE;
 +    }
 +
 +    if (PAR(cr))
 +    {
 +        gmx_sumd(1,&cycles,cr);
 +        cycles /= cr->nnodes;
 +    }
 +
 +    set = &pme_lb->setup[pme_lb->cur];
 +    set->count++;
 +
 +    rtab = ir->rlistlong + ir->tabext;
 +
 +    if (set->count % 2 == 1)
 +    {
 +        /* Skip the first cycle, because the first step after a switch
 +         * is much slower due to allocation and/or caching effects.
 +         */
 +        return TRUE;
 +    }
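 +    /* Net effect: odd counts (1,3,5,...) are discarded as warm-up and only
 +     * even counts are timed, with set->cycles keeping the minimum below.
 +     */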
 +
 +    sprintf(buf, "step %4s: ", gmx_step_str(step,sbuf));
 +    print_grid(fp_err,fp_log,buf,"timed with",set,cycles);
 +
 +    if (set->count <= 2)
 +    {
 +        set->cycles = cycles;
 +    }
 +    else
 +    {
 +        if (cycles*PME_LB_ACCEL_TOL < set->cycles &&
 +            pme_lb->stage == pme_lb->nstage - 1)
 +        {
 +            /* The performance went up a lot (due to e.g. DD load balancing).
 +             * Add a stage, keep the minima, but rescan all setups.
 +             */
 +            pme_lb->nstage++;
 +
 +            if (debug)
 +            {
 +                fprintf(debug,"The performance for grid %d %d %d went from %.3f to %.1f M-cycles, this is more than %f\n"
 +                        "Increased the number stages to %d"
 +                        " and ignoring the previous performance\n",
 +                        set->grid[XX],set->grid[YY],set->grid[ZZ],
 +                        cycles*1e-6,set->cycles*1e-6,PME_LB_ACCEL_TOL,
 +                        pme_lb->nstage);
 +            }
 +        }
 +        set->cycles = min(set->cycles,cycles);
 +    }
 +
 +    if (set->cycles < pme_lb->setup[pme_lb->fastest].cycles)
 +    {
 +        pme_lb->fastest = pme_lb->cur;
 +    }
 +    cycles_fast = pme_lb->setup[pme_lb->fastest].cycles;
 +
 +    /* Check in stage 0 if we should stop scanning grids.
 +     * Stop when the time is more than a factor PME_LB_SLOW_FAC slower than the fastest.
 +     */
 +    if (pme_lb->stage == 0 && pme_lb->cur > 0 &&
 +        cycles > pme_lb->setup[pme_lb->fastest].cycles*PME_LB_SLOW_FAC)
 +    {
 +        pme_lb->n = pme_lb->cur + 1;
 +        /* Done with scanning, go to stage 1 */
 +        switch_to_stage1(pme_lb);
 +    }
 +
 +    if (pme_lb->stage == 0)
 +    {
 +        int gridsize_start;
 +
 +        gridsize_start = set->grid[XX]*set->grid[YY]*set->grid[ZZ];
 +
 +        do
 +        {
 +            if (pme_lb->cur+1 < pme_lb->n)
 +            {
 +                /* We had already generated the next setup */
 +                OK = TRUE;
 +            }
 +            else
 +            {
 +                /* Find the next setup */
 +                OK = pme_loadbal_increase_cutoff(pme_lb,ir->pme_order);
 +            }
 +
 +            if (OK && ir->ePBC != epbcNONE)
 +            {
 +                OK = (sqr(pme_lb->setup[pme_lb->cur+1].rlistlong)
 +                      <= max_cutoff2(ir->ePBC,state->box));
 +                if (!OK)
 +                {
 +                    pme_lb->elimited = epmelblimBOX;
 +                }
 +            }
 +
 +            if (OK)
 +            {
 +                pme_lb->cur++;
 +
 +                if (DOMAINDECOMP(cr))
 +                {
 +                    OK = change_dd_cutoff(cr,state,ir,
 +                                          pme_lb->setup[pme_lb->cur].rlistlong);
 +                    if (!OK)
 +                    {
 +                        /* Failed: do not use this setup */
 +                        pme_lb->cur--;
 +                        pme_lb->elimited = epmelblimDD;
 +                    }
 +                }
 +            }
 +            if (!OK)
 +            {
 +                /* We hit the upper limit for the cut-off,
 +                 * the setup should not go further than cur.
 +                 */
 +                pme_lb->n = pme_lb->cur + 1;
 +                print_loadbal_limited(fp_err,fp_log,step,pme_lb);
 +                /* Switch to the next stage */
 +                switch_to_stage1(pme_lb);
 +            }
 +        }
 +        while (OK &&
 +               !(pme_lb->setup[pme_lb->cur].grid[XX]*
 +                 pme_lb->setup[pme_lb->cur].grid[YY]*
 +                 pme_lb->setup[pme_lb->cur].grid[ZZ] <
 +                 gridsize_start*PME_LB_GRID_SCALE_FAC
 +                 &&
 +                 pme_lb->setup[pme_lb->cur].grid_efficiency <
 +                 pme_lb->setup[pme_lb->cur-1].grid_efficiency*PME_LB_GRID_EFFICIENCY_REL_FAC));
 +    }
 +
 +    if (pme_lb->stage > 0 && pme_lb->end == 1)
 +    {
 +        pme_lb->cur = 0;
 +        pme_lb->stage = pme_lb->nstage;
 +    }
 +    else if (pme_lb->stage > 0 && pme_lb->end > 1)
 +    {
 +        /* If stage = nstage-1:
 +         *   scan over all setups, rerunning only those setups
 +         *   which are not much slower than the fastest
 +         * else:
 +         *   use the next setup
 +         */
 +        do
 +        {
 +            pme_lb->cur++;
 +            if (pme_lb->cur == pme_lb->end)
 +            {
 +                pme_lb->stage++;
 +                pme_lb->cur = pme_lb->start;
 +            }
 +        }
 +        while (pme_lb->stage == pme_lb->nstage - 1 &&
 +               pme_lb->setup[pme_lb->cur].count > 0 &&
 +               pme_lb->setup[pme_lb->cur].cycles > cycles_fast*PME_LB_SLOW_FAC);
 +
 +        if (pme_lb->stage == pme_lb->nstage)
 +        {
 +            /* We are done optimizing, use the fastest setup we found */
 +            pme_lb->cur = pme_lb->fastest;
 +        }
 +    }
 +
 +    if (DOMAINDECOMP(cr) && pme_lb->stage > 0)
 +    {
 +        OK = change_dd_cutoff(cr,state,ir,pme_lb->setup[pme_lb->cur].rlistlong);
 +        if (!OK)
 +        {
 +            /* Failsafe solution */
 +            if (pme_lb->cur > 1 && pme_lb->stage == pme_lb->nstage)
 +            {
 +                pme_lb->stage--;
 +            }
 +            pme_lb->fastest  = 0;
 +            pme_lb->start    = 0;
 +            pme_lb->end      = pme_lb->cur;
 +            pme_lb->cur      = pme_lb->start;
 +            pme_lb->elimited = epmelblimDD;
 +            print_loadbal_limited(fp_err,fp_log,step,pme_lb);
 +        }
 +    }
 +
 +    /* Change the Coulomb cut-off and the PME grid */
 +
 +    set = &pme_lb->setup[pme_lb->cur];
 +
 +    ic->rcoulomb   = set->rcut_coulomb;
 +    ic->rlist      = set->rlist;
 +    ic->rlistlong  = set->rlistlong;
 +    ir->nstcalclr  = set->nstcalclr;
 +    ic->ewaldcoeff = set->ewaldcoeff;
 +
 +    bUsesSimpleTables = uses_simple_tables(ir->cutoff_scheme, nbv, 0);
++    if (pme_lb->cutoff_scheme == ecutsVERLET &&
++        nbv->grp[0].kernel_type == nbnxnk8x8x8_CUDA)
 +    {
 +        nbnxn_cuda_pme_loadbal_update_param(nbv->cu_nbv,ic);
 +    }
 +    else
 +    {
 +        init_interaction_const_tables(NULL,ic,bUsesSimpleTables,
 +                                      rtab);
 +    }
 +
 +    if (pme_lb->cutoff_scheme == ecutsVERLET && nbv->ngrp > 1)
 +    {
 +        init_interaction_const_tables(NULL,ic,bUsesSimpleTables,
 +                                      rtab);
 +    }
 +
 +    if (cr->duty & DUTY_PME)
 +    {
 +        if (pme_lb->setup[pme_lb->cur].pmedata == NULL)
 +        {
 +            /* Generate a new PME data structure,
 +             * copying part of the old pointers.
 +             */
 +            gmx_pme_reinit(&set->pmedata,
 +                           cr,pme_lb->setup[0].pmedata,ir,
 +                           set->grid);
 +        }
 +        *pmedata = set->pmedata;
 +    }
 +    else
 +    {
 +        /* Tell our PME-only node to switch grid */
 +        gmx_pme_send_switch(cr, set->grid, set->ewaldcoeff);
 +    }
 +
 +    if (debug)
 +    {
 +        print_grid(NULL,debug,"","switched to",set,-1);
 +    }
 +
 +    if (pme_lb->stage == pme_lb->nstage)
 +    {
 +        print_grid(fp_err,fp_log,"","optimal",set,-1);
 +    }
 +
 +    return TRUE;
 +}
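 +/* Usage sketch (hypothetical caller; flag and timer names are assumed):
 + * the MD loop passes the cycle count of the last timed interval and stops
 + * calling once the function returns FALSE:
 + *
 + *     if (bPMETuning)
 + *     {
 + *         bPMETuning = pme_load_balance(pme_lb, cr, stderr, fplog,
 + *                                       ir, state, cycles, ic,
 + *                                       nbv, &pmedata, step);
 + *     }
 + */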
 +
 +void restart_pme_loadbal(pme_load_balancing_t pme_lb, int n)
 +{
 +    pme_lb->nstage += n;
 +}
 +
 +static int pme_grid_points(const pme_setup_t *setup)
 +{
 +    return setup->grid[XX]*setup->grid[YY]*setup->grid[ZZ];
 +}
 +
 +static void print_pme_loadbal_setting(FILE *fplog,
 +                                     char *name,
 +                                     const pme_setup_t *setup)
 +{
 +    fprintf(fplog,
 +            "   %-7s %6.3f nm %6.3f nm     %3d %3d %3d   %5.3f nm  %5.3f nm\n",
 +            name,
 +            setup->rcut_coulomb,setup->rlist,
 +            setup->grid[XX],setup->grid[YY],setup->grid[ZZ],
 +            setup->spacing,1/setup->ewaldcoeff);
 +}
 +
 +static void print_pme_loadbal_settings(pme_load_balancing_t pme_lb,
 +                                       FILE *fplog)
 +{
 +    double pp_ratio,grid_ratio;
 +
 +    pp_ratio   = pow(pme_lb->setup[pme_lb->cur].rlist/pme_lb->setup[0].rlistlong,3.0);
 +    grid_ratio = pme_grid_points(&pme_lb->setup[pme_lb->cur])/
 +        (double)pme_grid_points(&pme_lb->setup[0]);
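 +    /* Example (illustrative numbers): rlist growing from 1.0 to 1.2 nm
 +     * gives pp_ratio = (1.2/1.0)^3 = 1.73, while a grid shrinking from
 +     * 32^3 to 28^3 points gives grid_ratio = 28^3/32^3 = 0.67.
 +     */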
 +
 +    fprintf(fplog,"\n");
 +    fprintf(fplog,"       P P   -   P M E   L O A D   B A L A N C I N G\n");
 +    fprintf(fplog,"\n");
 +    /* Here we only warn when the optimal setting is the last one */
 +    if (pme_lb->elimited != epmelblimNO &&
 +        pme_lb->cur == pme_loadbal_end(pme_lb)-1)
 +    {
 +        fprintf(fplog," NOTE: The PP/PME load balancing was limited by the %s,\n",
 +                pmelblim_str[pme_lb->elimited]);
 +        fprintf(fplog,"       you might not have reached a good load balance.\n");
 +        if (pme_lb->elimited == epmelblimDD)
 +        {
 +            fprintf(fplog,"       Try different mdrun -dd settings or lower the -dds value.\n");
 +        }
 +        fprintf(fplog,"\n");
 +    }
 +    fprintf(fplog," PP/PME load balancing changed the cut-off and PME settings:\n");
 +    fprintf(fplog,"           particle-particle                    PME\n");
 +    fprintf(fplog,"            rcoulomb  rlist            grid      spacing   1/beta\n");
 +    print_pme_loadbal_setting(fplog,"initial",&pme_lb->setup[0]);
 +    print_pme_loadbal_setting(fplog,"final"  ,&pme_lb->setup[pme_lb->cur]);
 +    fprintf(fplog," cost-ratio           %4.2f             %4.2f\n",
 +            pp_ratio,grid_ratio);
 +    fprintf(fplog," (note that these numbers concern only part of the total PP and PME load)\n");
 +    fprintf(fplog,"\n");
 +}
 +
 +void pme_loadbal_done(pme_load_balancing_t pme_lb, FILE *fplog)
 +{
 +    if (fplog != NULL && (pme_lb->cur > 0 || pme_lb->elimited != epmelblimNO))
 +    {
 +        print_pme_loadbal_settings(pme_lb,fplog);
 +    }
 +
 +    /* TODO: Here we should free all pointers in pme_lb,
 +     * but as it contains pme data structures,
 +     * we need to first make pme.c free all data.
 +     */
 +}