From: Roland Schulz Date: Thu, 26 Jul 2012 23:32:43 +0000 (-0400) Subject: Merge remote-tracking branch 'origin/release-4-6' into HEAD X-Git-Url: http://biod.pnpi.spb.ru/gitweb/?a=commitdiff_plain;h=2defe580657eadf2b42860a2c45bc82be020222a;p=alexxy%2Fgromacs.git Merge remote-tracking branch 'origin/release-4-6' into HEAD Conflicts: CMakeLists.txt Change-Id: Icf9d0d92b977e35d42d55f1a077a2cdc8fbad86f --- 2defe580657eadf2b42860a2c45bc82be020222a diff --cc CMakeLists.txt index 8cac7ae235,80f305949f..0b45bc69ca --- a/CMakeLists.txt +++ b/CMakeLists.txt @@@ -1021,7 -984,7 +986,6 @@@ if(NOT CMAKE_SYSTEM_NAME STREQUAL "Darw endif() add_subdirectory(share) - add_subdirectory(man) -add_subdirectory(include) add_subdirectory(src) add_subdirectory(scripts) diff --cc src/gromacs/gmxlib/vmdio.c index df4cfe6d56,0000000000..d8611a2483 mode 100644,000000..100644 --- a/src/gromacs/gmxlib/vmdio.c +++ b/src/gromacs/gmxlib/vmdio.c @@@ -1,414 -1,0 +1,414 @@@ +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*- + * + * + * This file is part of Gromacs Copyright (c) 1991-2008 + * David van der Spoel, Erik Lindahl, Berk Hess, University of Groningen. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * To help us fund GROMACS development, we humbly ask that you cite + * the research papers on the package. Check out http://www.gromacs.org + * + * And Hey: + * Gnomes, ROck Monsters And Chili Sauce + */ +#ifdef HAVE_CONFIG_H +#include +#endif +#include "gromacs/utility/gmx_header_config.h" + + + +/* Derived from PluginMgr.C and catdcd.c */ + +/* PluginMgr.C: Copyright: */ +/*************************************************************************** + *cr + *cr (C) Copyright 1995-2009 The Board of Trustees of the + *cr University of Illinois + *cr All Rights Reserved + *cr +Developed by: Theoretical and Computational Biophysics Group + University of Illinois at Urbana-Champaign + http://www.ks.uiuc.edu/ + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the Software), to deal with +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to +do so, subject to the following conditions: + +Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimers. + +Redistributions in binary form must reproduce the above copyright notice, +this list of conditions and the following disclaimers in the documentation +and/or other materials provided with the distribution. + +Neither the names of Theoretical and Computational Biophysics Group, +University of Illinois at Urbana-Champaign, nor the names of its contributors +may be used to endorse or promote products derived from this Software without +specific prior written permission. + +THE SOFTWARE IS PROVIDED AS IS, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL +THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +OTHER DEALINGS WITH THE SOFTWARE. + ***************************************************************************/ + +/* catdcd.c: Copyright: */ +/*****************************************************************************/ +/* */ +/* (C) Copyright 2001-2005 Justin Gullingsrud and the University of Illinois.*/ +/* */ +/*****************************************************************************/ + +#include +#include +#include +#include + +/* + * Plugin header files; get plugin source from www.ks.uiuc.edu/Research/vmd" + */ +#include "external/vmd_molfile/molfile_plugin.h" +#include "external/vmd_molfile/vmddlopen.h" +#ifndef GMX_NATIVE_WINDOWS +#include +#else +#include +#include +#endif +#include "smalloc.h" +#include "futil.h" +#include "vmdio.h" + + +#include "types/simple.h" +#include "vec.h" +#include "gmxfio.h" + + +typedef int (*initfunc)(void); +typedef int (*regfunc)(void *, vmdplugin_register_cb); +typedef int (*finifunc)(void); + + + +static int register_cb(void *v, vmdplugin_t *p) { + const char *key = p->name; + t_gmxvmdplugin *vmdplugin = (t_gmxvmdplugin*)v; + + if (strcmp(key,vmdplugin->filetype)==0) + { + vmdplugin->api = (molfile_plugin_t *)p; + } + return VMDPLUGIN_SUCCESS; +} + +static int load_sharedlibrary_plugins(const char *fullpath,t_gmxvmdplugin* vmdplugin) { + /* Open the dll; try to execute the init function. */ + void *handle, *ifunc, *registerfunc; + handle = vmddlopen(fullpath); + if (!handle) { + if (debug) fprintf(debug, "\nUnable to open dynamic library %s.\n%s\n", fullpath, vmddlerror()); /*only to debug because of stdc++ erros */ + return 0; + } + + ifunc = vmddlsym(handle, "vmdplugin_init"); - if (ifunc && ((initfunc)(ifunc))()) { ++ if (!ifunc || ((initfunc)(ifunc))()) { + printf("\nvmdplugin_init() for %s returned an error; plugin(s) not loaded.\n", fullpath); + vmddlclose(handle); + return 0; + } + + registerfunc = vmddlsym(handle, "vmdplugin_register"); + if (!registerfunc) { + printf("\nDidn't find the register function in %s; plugin(s) not loaded.\n", fullpath); + vmddlclose(handle); + return 0; + } else { + /* Load plugins from the library.*/ + ((regfunc)registerfunc)(vmdplugin, register_cb); + } + + /* in case this library does not support the filetype, close it */ + if (vmdplugin->api == NULL) + { + vmddlclose(handle); + } + + return 1; +} + +/*return: 1: success, 0: last frame, -1: error*/ +gmx_bool read_next_vmd_frame(int status,t_trxframe *fr) +{ + int rc,i; + rvec vec, angle; + molfile_timestep_t ts; + + + fr->bV = fr->vmdplugin->bV; + +#ifdef GMX_DOUBLE + snew(ts.coords, fr->natoms*3); + if (fr->bV) + { + snew(ts.velocities, fr->natoms*3); + } +#else + ts.coords = (float*)fr->x; + if (fr->bV) + { + ts.velocities = (float*)fr->v; + } +#endif + + rc = fr->vmdplugin->api->read_next_timestep(fr->vmdplugin->handle, fr->natoms, &ts); + + if (rc < -1) { + fprintf(stderr, "\nError reading input file (error code %d)\n", rc); + } + if (rc < 0) + { + fr->vmdplugin->api->close_file_read(fr->vmdplugin->handle); + return 0; + } + +#ifdef GMX_DOUBLE + for (i=0;inatoms;i++) + { + fr->x[i][0] = .1*ts.coords[i*3]; + fr->x[i][1] = .1*ts.coords[i*3+1]; + fr->x[i][2] = .1*ts.coords[i*3+2]; + if (fr->bV) + { + fr->v[i][0] = .1*ts.velocities[i*3]; + fr->v[i][1] = .1*ts.velocities[i*3+1]; + fr->v[i][2] = .1*ts.velocities[i*3+2]; + 
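/* note (added comment): the 0.1 factor above converts the plugin's Angstrom coordinates/velocities to the nm units used by GROMACS */
+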
} + } + sfree(ts.coords); + if (fr->bV) + { + sfree(ts.velocities); + } +#else + for (i=0;inatoms;i++) + { + svmul(.1,fr->x[i],fr->x[i]); + if (fr->bV) + { + svmul(.1,fr->v[i],fr->v[i]); + } + } +#endif + + fr->bX = 1; + fr->bBox = 1; + vec[0] = .1*ts.A; vec[1] = .1*ts.B; vec[2] = .1*ts.C; + angle[0] = ts.alpha; angle[1] = ts.beta; angle[2] = ts.gamma; + matrix_convert(fr->box,vec,angle); + if (fr->vmdplugin->api->abiversion>10) + { + fr->bTime = TRUE; + fr->time = ts.physical_time; + } + else + { + fr->bTime = FALSE; + } + + + return 1; +} + +static int load_vmd_library(const char *fn, t_gmxvmdplugin *vmdplugin) +{ + char pathname[GMX_PATH_MAX],filename[GMX_PATH_MAX]; + const char *pathenv; + const char *err; + int i; + int ret=0; + char pathenv_buffer[GMX_PATH_MAX]; +#ifndef GMX_NATIVE_WINDOWS + glob_t globbuf; + const char *defpath_suffix = "/plugins/*/molfile"; + const char *defpathenv = GMX_VMD_PLUGIN_PATH; +#else + WIN32_FIND_DATA ffd; + HANDLE hFind = INVALID_HANDLE_VALUE; + char progfolder[GMX_PATH_MAX]; + char defpathenv[GMX_PATH_MAX]; + const char *defpath_suffix = "\\plugins\\WIN32\\molfile"; + SHGetFolderPath(NULL,CSIDL_PROGRAM_FILES,NULL,SHGFP_TYPE_CURRENT,progfolder); + sprintf(defpathenv,"%s\\University of Illinois\\VMD\\plugins\\WIN32\\molfile",progfolder); +#endif + + vmdplugin->api = NULL; + vmdplugin->filetype = strrchr(fn,'.'); + if (!vmdplugin->filetype) + { + return 0; + } + vmdplugin->filetype++; + + /* First look for an explicit path given at run time for the + * plugins, then an implicit run-time path, and finally for one + * given at configure time. This last might be hard-coded to the + * default for VMD installs. */ + pathenv = getenv("VMD_PLUGIN_PATH"); + if (pathenv==NULL) + { + pathenv = getenv("VMDDIR"); + if (NULL == pathenv) + { + printf("\nNeither VMD_PLUGIN_PATH or VMDDIR set. "); + printf("Using default location:\n%s\n",defpathenv); + pathenv=defpathenv; + } + else + { + printf("\nVMD_PLUGIN_PATH no set, but VMDDIR is set. "); +#ifdef _MSC_VER + _snprintf_s(pathenv_buffer, sizeof(pathenv_buffer), _TRUNCATE, "%s%s", pathenv, defpath_suffix); +#else + snprintf(pathenv_buffer, sizeof(pathenv_buffer), "%s%s", pathenv, defpath_suffix); +#endif + printf("Using semi-default location:\n%s\n",pathenv_buffer); + pathenv = pathenv_buffer; + } + } + strncpy(pathname,pathenv,sizeof(pathname)); +#ifndef GMX_NATIVE_WINDOWS + strcat(pathname,"/*.so"); + glob(pathname, 0, NULL, &globbuf); + if (globbuf.gl_pathc == 0) + { + printf("\nNo VMD Plugins found\n" + "Set the environment variable VMD_PLUGIN_PATH to the molfile folder within the\n" + "VMD installation.\n" + "The architecture (e.g. 32bit versus 64bit) of Gromacs and VMD has to match.\n"); + return 0; + } + for (i=0; iapi == NULL; i++) + { + /* FIXME: Undefined which plugin is chosen if more than one plugin + can read a certain file ending. Requires some additional command + line option or enviroment variable to specify which plugin should + be picked. 
+ */ + ret|=load_sharedlibrary_plugins(globbuf.gl_pathv[i],vmdplugin); + } + globfree(&globbuf); +#else + strcat(pathname,"\\*.so"); + hFind = FindFirstFile(pathname, &ffd); + if (INVALID_HANDLE_VALUE == hFind) + { + printf("\nNo VMD Plugins found\n"); + return 0; + } + do + { + sprintf(filename,"%s\\%s",pathenv,ffd.cFileName); + ret|=load_sharedlibrary_plugins(filename,vmdplugin); + } + while (FindNextFile(hFind, &ffd ) != 0 && vmdplugin->api == NULL ); + FindClose(hFind); +#endif + + if (!ret) + { + printf("\nCould not open any VMD library.\n"); + err = vmddlerror(); + if (!err) + { + printf("Compiled with dlopen?\n"); + } + else + { + printf("Last error:\n%s\n",err); + } + return 0; + } + + if (vmdplugin->api == NULL) + { + printf("\nNo plugin for %s found\n",vmdplugin->filetype); + return 0; + } + + if (vmdplugin->api->abiversion < 10) + { + printf("\nPlugin and/or VMD is too old. At least VMD 1.8.6 is required.\n"); + return 0; + } + + printf("\nUsing VMD plugin: %s (%s)\n",vmdplugin->api->name,vmdplugin->api->prettyname); + + return 1; + +} + +int read_first_vmd_frame(int *status,const char *fn,t_trxframe *fr,int flags) +{ + molfile_timestep_metadata_t *metadata=NULL; + + snew(fr->vmdplugin,1); + if (!load_vmd_library(fn,fr->vmdplugin)) + { + return 0; + } + + fr->vmdplugin->handle = fr->vmdplugin->api->open_file_read(fn, fr->vmdplugin->filetype, &fr->natoms); + + if (!fr->vmdplugin->handle) { + fprintf(stderr, "\nError: could not open file '%s' for reading.\n", + fn); + return 0; + } + + if (fr->natoms == MOLFILE_NUMATOMS_UNKNOWN) { + fprintf(stderr, "\nFormat of file %s does not record number of atoms.\n", fn); + return 0; + } else if (fr->natoms == MOLFILE_NUMATOMS_NONE) { + fprintf(stderr, "\nNo atoms found by VMD plugin in file %s.\n", fn ); + return 0; + } else if (fr->natoms < 1) { /*should not be reached*/ + fprintf(stderr, "\nUnknown number of atoms %d for VMD plugin opening file %s.\n", + fr->natoms, fn ); + return 0; + } + + snew(fr->x,fr->natoms); + + fr->vmdplugin->bV = 0; + if (fr->vmdplugin->api->abiversion > 10 && fr->vmdplugin->api->read_timestep_metadata) + { + fr->vmdplugin->api->read_timestep_metadata(fr->vmdplugin->handle, metadata); + assert(metadata); + fr->vmdplugin->bV = metadata->has_velocities; + if (fr->vmdplugin->bV) + { + snew(fr->v,fr->natoms); + } + } + else + { + fprintf(stderr, + "\nThis trajectory is being read with a VMD plug-in from before VMD" + "\nversion 1.8, or from a trajectory that lacks time step metadata." + "\nEither way, GROMACS cannot tell whether the trajectory has velocities.\n"); + } + return 1; + +} diff --cc src/gromacs/mdlib/pme.c index ac1892f3a7,0000000000..8837bb5eda mode 100644,000000..100644 --- a/src/gromacs/mdlib/pme.c +++ b/src/gromacs/mdlib/pme.c @@@ -1,4352 -1,0 +1,4364 @@@ +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*- + * + * + * This source code is part of + * + * G R O M A C S + * + * GROningen MAchine for Chemical Simulations + * + * VERSION 3.2.0 + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others. + * Copyright (c) 1991-2000, University of Groningen, The Netherlands. + * Copyright (c) 2001-2004, The GROMACS development team, + * check out http://www.gromacs.org for more information. + + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. 
+ * + * If you want to redistribute modifications, please consider that + * scientific software is very special. Version control is crucial - + * bugs must be traceable. We will be happy to consider code for + * inclusion in the official distribution, but derived work must not + * be called official GROMACS. Details are found in the README & COPYING + * files - if they are missing, get the official version at www.gromacs.org. + * + * To help us fund GROMACS development, we humbly ask that you cite + * the papers on the package - you can find them in the top README file. + * + * For more info, check our website at http://www.gromacs.org + * + * And Hey: + * GROwing Monsters And Cloning Shrimps + */ +/* IMPORTANT FOR DEVELOPERS: + * + * Triclinic pme stuff isn't entirely trivial, and we've experienced + * some bugs during development (many of them due to me). To avoid + * this in the future, please check the following things if you make + * changes in this file: + * + * 1. You should obtain identical (at least to the PME precision) + * energies, forces, and virial for + * a rectangular box and a triclinic one where the z (or y) axis is + * tilted a whole box side. For instance you could use these boxes: + * + * rectangular triclinic + * 2 0 0 2 0 0 + * 0 2 0 0 2 0 + * 0 0 6 2 2 6 + * + * 2. You should check the energy conservation in a triclinic box. + * + * It might seem an overkill, but better safe than sorry. + * /Erik 001109 + */ + +#ifdef HAVE_CONFIG_H +#include +#endif + +#ifdef GMX_LIB_MPI +#include +#endif +#ifdef GMX_THREAD_MPI +#include "tmpi.h" +#endif + +#include +#include +#include +#include +#include "typedefs.h" +#include "txtdump.h" +#include "vec.h" +#include "gmxcomplex.h" +#include "smalloc.h" +#include "futil.h" +#include "coulomb.h" +#include "gmx_fatal.h" +#include "pme.h" +#include "network.h" +#include "physics.h" +#include "nrnb.h" +#include "copyrite.h" +#include "gmx_wallcycle.h" +#include "gmx_parallel_3dfft.h" +#include "pdbio.h" +#include "gmx_cyclecounter.h" +#include "macros.h" + +/* Single precision, with SSE2 or higher available */ +#if defined(GMX_X86_SSE2) && !defined(GMX_DOUBLE) + +#include "gmx_x86_sse2.h" +#include "gmx_math_x86_sse2_single.h" + +#define PME_SSE +/* Some old AMD processors could have problems with unaligned loads+stores */ +#ifndef GMX_FAHCORE +#define PME_SSE_UNALIGNED +#endif +#endif + +#define DFT_TOL 1e-7 +/* #define PRT_FORCE */ +/* conditions for on the fly time-measurement */ +/* #define TAKETIME (step > 1 && timesteps < 10) */ +#define TAKETIME FALSE + +/* #define PME_TIME_THREADS */ + +#ifdef GMX_DOUBLE +#define mpi_type MPI_DOUBLE +#else +#define mpi_type MPI_FLOAT +#endif + +/* GMX_CACHE_SEP should be a multiple of 16 to preserve alignment */ +#define GMX_CACHE_SEP 64 + +/* We only define a maximum to be able to use local arrays without allocation. + * An order larger than 12 should never be needed, even for test cases. + * If needed it can be changed here. 
+ */ +#define PME_ORDER_MAX 12 + +/* Internal datastructures */ +typedef struct { + int send_index0; + int send_nindex; + int recv_index0; + int recv_nindex; +} pme_grid_comm_t; + +typedef struct { +#ifdef GMX_MPI + MPI_Comm mpi_comm; +#endif + int nnodes,nodeid; + int *s2g0; + int *s2g1; + int noverlap_nodes; + int *send_id,*recv_id; + pme_grid_comm_t *comm_data; + real *sendbuf; + real *recvbuf; +} pme_overlap_t; + +typedef struct { + int *n; /* Cumulative counts of the number of particles per thread */ + int nalloc; /* Allocation size of i */ + int *i; /* Particle indices ordered on thread index (n) */ +} thread_plist_t; + +typedef struct { + int n; + int *ind; + splinevec theta; + splinevec dtheta; +} splinedata_t; + +typedef struct { + int dimind; /* The index of the dimension, 0=x, 1=y */ + int nslab; + int nodeid; +#ifdef GMX_MPI + MPI_Comm mpi_comm; +#endif + + int *node_dest; /* The nodes to send x and q to with DD */ + int *node_src; /* The nodes to receive x and q from with DD */ + int *buf_index; /* Index for commnode into the buffers */ + + int maxshift; + + int npd; + int pd_nalloc; + int *pd; + int *count; /* The number of atoms to send to each node */ + int **count_thread; + int *rcount; /* The number of atoms to receive */ + + int n; + int nalloc; + rvec *x; + real *q; + rvec *f; + gmx_bool bSpread; /* These coordinates are used for spreading */ + int pme_order; + ivec *idx; + rvec *fractx; /* Fractional coordinate relative to the + * lower cell boundary + */ + int nthread; + int *thread_idx; /* Which thread should spread which charge */ + thread_plist_t *thread_plist; + splinedata_t *spline; +} pme_atomcomm_t; + +#define FLBS 3 +#define FLBSZ 4 + +typedef struct { + ivec ci; /* The spatial location of this grid */ + ivec n; /* The size of *grid, including order-1 */ + ivec offset; /* The grid offset from the full node grid */ + int order; /* PME spreading order */ + real *grid; /* The grid local thread, size n */ +} pmegrid_t; + +typedef struct { + pmegrid_t grid; /* The full node grid (non thread-local) */ + int nthread; /* The number of threads operating on this grid */ + ivec nc; /* The local spatial decomposition over the threads */ + pmegrid_t *grid_th; /* Array of grids for each thread */ + int **g2t; /* The grid to thread index */ + ivec nthread_comm; /* The number of threads to communicate with */ +} pmegrids_t; + + +typedef struct { +#ifdef PME_SSE + /* Masks for SSE aligned spreading and gathering */ + __m128 mask_SSE0[6],mask_SSE1[6]; +#else + int dummy; /* C89 requires that struct has at least one member */ +#endif +} pme_spline_work_t; + +typedef struct { + /* work data for solve_pme */ + int nalloc; + real * mhx; + real * mhy; + real * mhz; + real * m2; + real * denom; + real * tmp1_alloc; + real * tmp1; + real * eterm; + real * m2inv; + + real energy; + matrix vir; +} pme_work_t; + +typedef struct gmx_pme { + int ndecompdim; /* The number of decomposition dimensions */ + int nodeid; /* Our nodeid in mpi->mpi_comm */ + int nodeid_major; + int nodeid_minor; + int nnodes; /* The number of nodes doing PME */ + int nnodes_major; + int nnodes_minor; + + MPI_Comm mpi_comm; + MPI_Comm mpi_comm_d[2]; /* Indexed on dimension, 0=x, 1=y */ +#ifdef GMX_MPI + MPI_Datatype rvec_mpi; /* the pme vector's MPI type */ +#endif + + int nthread; /* The number of threads doing PME */ + + gmx_bool bPPnode; /* Node also does particle-particle forces */ + gmx_bool bFEP; /* Compute Free energy contribution */ + int nkx,nky,nkz; /* Grid dimensions */ + gmx_bool bP3M; /* Do P3M: 
optimize the influence function */ + int pme_order; + real epsilon_r; + + pmegrids_t pmegridA; /* Grids on which we do spreading/interpolation, includes overlap */ + pmegrids_t pmegridB; + /* The PME charge spreading grid sizes/strides, includes pme_order-1 */ + int pmegrid_nx,pmegrid_ny,pmegrid_nz; + /* pmegrid_nz might be larger than strictly necessary to ensure + * memory alignment, pmegrid_nz_base gives the real base size. + */ + int pmegrid_nz_base; + /* The local PME grid starting indices */ + int pmegrid_start_ix,pmegrid_start_iy,pmegrid_start_iz; + + /* Work data for spreading and gathering */ - pme_spline_work_t spline_work; ++ pme_spline_work_t *spline_work; + + real *fftgridA; /* Grids for FFT. With 1D FFT decomposition this can be a pointer */ + real *fftgridB; /* inside the interpolation grid, but separate for 2D PME decomp. */ + int fftgrid_nx,fftgrid_ny,fftgrid_nz; + + t_complex *cfftgridA; /* Grids for complex FFT data */ + t_complex *cfftgridB; + int cfftgrid_nx,cfftgrid_ny,cfftgrid_nz; + + gmx_parallel_3dfft_t pfft_setupA; + gmx_parallel_3dfft_t pfft_setupB; + + int *nnx,*nny,*nnz; + real *fshx,*fshy,*fshz; + + pme_atomcomm_t atc[2]; /* Indexed on decomposition index */ + matrix recipbox; + splinevec bsp_mod; + + pme_overlap_t overlap[2]; /* Indexed on dimension, 0=x, 1=y */ + + pme_atomcomm_t atc_energy; /* Only for gmx_pme_calc_energy */ + + rvec *bufv; /* Communication buffer */ + real *bufr; /* Communication buffer */ + int buf_nalloc; /* The communication buffer size */ + + /* thread local work data for solve_pme */ + pme_work_t *work; + + /* Work data for PME_redist */ + gmx_bool redist_init; + int * scounts; + int * rcounts; + int * sdispls; + int * rdispls; + int * sidx; + int * idxa; + real * redist_buf; + int redist_buf_nalloc; + + /* Work data for sum_qgrid */ + real * sum_qgrid_tmp; + real * sum_qgrid_dd_tmp; +} t_gmx_pme; + + +static void calc_interpolation_idx(gmx_pme_t pme,pme_atomcomm_t *atc, + int start,int end,int thread) +{ + int i; + int *idxptr,tix,tiy,tiz; + real *xptr,*fptr,tx,ty,tz; + real rxx,ryx,ryy,rzx,rzy,rzz; + int nx,ny,nz; + int start_ix,start_iy,start_iz; + int *g2tx,*g2ty,*g2tz; + gmx_bool bThreads; + int *thread_idx=NULL; + thread_plist_t *tpl=NULL; + int *tpl_n=NULL; + int thread_i; + + nx = pme->nkx; + ny = pme->nky; + nz = pme->nkz; + + start_ix = pme->pmegrid_start_ix; + start_iy = pme->pmegrid_start_iy; + start_iz = pme->pmegrid_start_iz; + + rxx = pme->recipbox[XX][XX]; + ryx = pme->recipbox[YY][XX]; + ryy = pme->recipbox[YY][YY]; + rzx = pme->recipbox[ZZ][XX]; + rzy = pme->recipbox[ZZ][YY]; + rzz = pme->recipbox[ZZ][ZZ]; + + g2tx = pme->pmegridA.g2t[XX]; + g2ty = pme->pmegridA.g2t[YY]; + g2tz = pme->pmegridA.g2t[ZZ]; + + bThreads = (atc->nthread > 1); + if (bThreads) + { + thread_idx = atc->thread_idx; + + tpl = &atc->thread_plist[thread]; + tpl_n = tpl->n; + for(i=0; inthread; i++) + { + tpl_n[i] = 0; + } + } + + for(i=start; ix[i]; + idxptr = atc->idx[i]; + fptr = atc->fractx[i]; + + /* Fractional coordinates along box vectors, add 2.0 to make 100% sure we are positive for triclinic boxes */ + tx = nx * ( xptr[XX] * rxx + xptr[YY] * ryx + xptr[ZZ] * rzx + 2.0 ); + ty = ny * ( xptr[YY] * ryy + xptr[ZZ] * rzy + 2.0 ); + tz = nz * ( xptr[ZZ] * rzz + 2.0 ); + + tix = (int)(tx); + tiy = (int)(ty); + tiz = (int)(tz); + + /* Because decomposition only occurs in x and y, + * we never have a fraction correction in z. 
+ */ + fptr[XX] = tx - tix + pme->fshx[tix]; + fptr[YY] = ty - tiy + pme->fshy[tiy]; + fptr[ZZ] = tz - tiz; + + idxptr[XX] = pme->nnx[tix]; + idxptr[YY] = pme->nny[tiy]; + idxptr[ZZ] = pme->nnz[tiz]; + +#ifdef DEBUG + range_check(idxptr[XX],0,pme->pmegrid_nx); + range_check(idxptr[YY],0,pme->pmegrid_ny); + range_check(idxptr[ZZ],0,pme->pmegrid_nz); +#endif + + if (bThreads) + { + thread_i = g2tx[idxptr[XX]] + g2ty[idxptr[YY]] + g2tz[idxptr[ZZ]]; + thread_idx[i] = thread_i; + tpl_n[thread_i]++; + } + } + + if (bThreads) + { + /* Make a list of particle indices sorted on thread */ + + /* Get the cumulative count */ + for(i=1; inthread; i++) + { + tpl_n[i] += tpl_n[i-1]; + } + /* The current implementation distributes particles equally + * over the threads, so we could actually allocate for that + * in pme_realloc_atomcomm_things. + */ + if (tpl_n[atc->nthread-1] > tpl->nalloc) + { + tpl->nalloc = over_alloc_large(tpl_n[atc->nthread-1]); + srenew(tpl->i,tpl->nalloc); + } + /* Set tpl_n to the cumulative start */ + for(i=atc->nthread-1; i>=1; i--) + { + tpl_n[i] = tpl_n[i-1]; + } + tpl_n[0] = 0; + + /* Fill our thread local array with indices sorted on thread */ + for(i=start; ii[tpl_n[atc->thread_idx[i]]++] = i; + } + /* Now tpl_n contains the cummulative count again */ + } +} + +static void make_thread_local_ind(pme_atomcomm_t *atc, + int thread,splinedata_t *spline) +{ + int n,t,i,start,end; + thread_plist_t *tpl; + + /* Combine the indices made by each thread into one index */ + + n = 0; + start = 0; + for(t=0; tnthread; t++) + { + tpl = &atc->thread_plist[t]; + /* Copy our part (start - end) from the list of thread t */ + if (thread > 0) + { + start = tpl->n[thread-1]; + } + end = tpl->n[thread]; + for(i=start; iind[n++] = tpl->i[i]; + } + } + + spline->n = n; +} + + +static void pme_calc_pidx(int start, int end, + matrix recipbox, rvec x[], + pme_atomcomm_t *atc, int *count) +{ + int nslab,i; + int si; + real *xptr,s; + real rxx,ryx,rzx,ryy,rzy; + int *pd; + + /* Calculate PME task index (pidx) for each grid index. + * Here we always assign equally sized slabs to each node + * for load balancing reasons (the PME grid spacing is not used). + */ + + nslab = atc->nslab; + pd = atc->pd; + + /* Reset the count */ + for(i=0; idimind == 0) + { + rxx = recipbox[XX][XX]; + ryx = recipbox[YY][XX]; + rzx = recipbox[ZZ][XX]; + /* Calculate the node index in x-dimension */ + for(i=start; inthread; + +#pragma omp parallel for num_threads(nthread) schedule(static) + for(thread=0; threadcount_thread[thread]); + } + /* Non-parallel reduction, since nslab is small */ + + for(thread=1; threadnslab; slab++) + { + atc->count_thread[0][slab] += atc->count_thread[thread][slab]; + } + } +} + +static void pme_realloc_splinedata(splinedata_t *spline, pme_atomcomm_t *atc) +{ + int i,d; + + srenew(spline->ind,atc->nalloc); + /* Initialize the index to identity so it works without threads */ + for(i=0; inalloc; i++) + { + spline->ind[i] = i; + } + + for(d=0;dtheta[d] ,atc->pme_order*atc->nalloc); + srenew(spline->dtheta[d],atc->pme_order*atc->nalloc); + } +} + +static void pme_realloc_atomcomm_things(pme_atomcomm_t *atc) +{ + int nalloc_old,i,j,nalloc_tpl; + + /* We have to avoid a NULL pointer for atc->x to avoid + * possible fatal errors in MPI routines. 
+ */ + if (atc->n > atc->nalloc || atc->nalloc == 0) + { + nalloc_old = atc->nalloc; + atc->nalloc = over_alloc_dd(max(atc->n,1)); + + if (atc->nslab > 1) { + srenew(atc->x,atc->nalloc); + srenew(atc->q,atc->nalloc); + srenew(atc->f,atc->nalloc); + for(i=nalloc_old; inalloc; i++) + { + clear_rvec(atc->f[i]); + } + } + if (atc->bSpread) { + srenew(atc->fractx,atc->nalloc); + srenew(atc->idx ,atc->nalloc); + + if (atc->nthread > 1) + { + srenew(atc->thread_idx,atc->nalloc); + } + + for(i=0; inthread; i++) + { + pme_realloc_splinedata(&atc->spline[i],atc); + } + } + } +} + +static void pmeredist_pd(gmx_pme_t pme, gmx_bool forw, + int n, gmx_bool bXF, rvec *x_f, real *charge, + pme_atomcomm_t *atc) +/* Redistribute particle data for PME calculation */ +/* domain decomposition by x coordinate */ +{ + int *idxa; + int i, ii; + + if(FALSE == pme->redist_init) { + snew(pme->scounts,atc->nslab); + snew(pme->rcounts,atc->nslab); + snew(pme->sdispls,atc->nslab); + snew(pme->rdispls,atc->nslab); + snew(pme->sidx,atc->nslab); + pme->redist_init = TRUE; + } + if (n > pme->redist_buf_nalloc) { + pme->redist_buf_nalloc = over_alloc_dd(n); + srenew(pme->redist_buf,pme->redist_buf_nalloc*DIM); + } + + pme->idxa = atc->pd; + +#ifdef GMX_MPI + if (forw && bXF) { + /* forward, redistribution from pp to pme */ + + /* Calculate send counts and exchange them with other nodes */ + for(i=0; (inslab); i++) pme->scounts[i]=0; + for(i=0; (iscounts[pme->idxa[i]]++; + MPI_Alltoall( pme->scounts, 1, MPI_INT, pme->rcounts, 1, MPI_INT, atc->mpi_comm); + + /* Calculate send and receive displacements and index into send + buffer */ + pme->sdispls[0]=0; + pme->rdispls[0]=0; + pme->sidx[0]=0; + for(i=1; inslab; i++) { + pme->sdispls[i]=pme->sdispls[i-1]+pme->scounts[i-1]; + pme->rdispls[i]=pme->rdispls[i-1]+pme->rcounts[i-1]; + pme->sidx[i]=pme->sdispls[i]; + } + /* Total # of particles to be received */ + atc->n = pme->rdispls[atc->nslab-1] + pme->rcounts[atc->nslab-1]; + + pme_realloc_atomcomm_things(atc); + + /* Copy particle coordinates into send buffer and exchange*/ + for(i=0; (isidx[pme->idxa[i]]; + pme->sidx[pme->idxa[i]]++; + pme->redist_buf[ii+XX]=x_f[i][XX]; + pme->redist_buf[ii+YY]=x_f[i][YY]; + pme->redist_buf[ii+ZZ]=x_f[i][ZZ]; + } + MPI_Alltoallv(pme->redist_buf, pme->scounts, pme->sdispls, + pme->rvec_mpi, atc->x, pme->rcounts, pme->rdispls, + pme->rvec_mpi, atc->mpi_comm); + } + if (forw) { + /* Copy charge into send buffer and exchange*/ + for(i=0; inslab; i++) pme->sidx[i]=pme->sdispls[i]; + for(i=0; (isidx[pme->idxa[i]]; + pme->sidx[pme->idxa[i]]++; + pme->redist_buf[ii]=charge[i]; + } + MPI_Alltoallv(pme->redist_buf, pme->scounts, pme->sdispls, mpi_type, + atc->q, pme->rcounts, pme->rdispls, mpi_type, + atc->mpi_comm); + } + else { /* backward, redistribution from pme to pp */ + MPI_Alltoallv(atc->f, pme->rcounts, pme->rdispls, pme->rvec_mpi, + pme->redist_buf, pme->scounts, pme->sdispls, + pme->rvec_mpi, atc->mpi_comm); + + /* Copy data from receive buffer */ + for(i=0; inslab; i++) + pme->sidx[i] = pme->sdispls[i]; + for(i=0; (isidx[pme->idxa[i]]; + x_f[i][XX] += pme->redist_buf[ii+XX]; + x_f[i][YY] += pme->redist_buf[ii+YY]; + x_f[i][ZZ] += pme->redist_buf[ii+ZZ]; + pme->sidx[pme->idxa[i]]++; + } + } +#endif +} + +static void pme_dd_sendrecv(pme_atomcomm_t *atc, + gmx_bool bBackward,int shift, + void *buf_s,int nbyte_s, + void *buf_r,int nbyte_r) +{ +#ifdef GMX_MPI + int dest,src; + MPI_Status stat; + + if (bBackward == FALSE) { + dest = atc->node_dest[shift]; + src = atc->node_src[shift]; + } else { + 
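/* backward direction (forces flowing from PME back to PP): swap the send and receive partners */
+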
dest = atc->node_src[shift]; + src = atc->node_dest[shift]; + } + + if (nbyte_s > 0 && nbyte_r > 0) { + MPI_Sendrecv(buf_s,nbyte_s,MPI_BYTE, + dest,shift, + buf_r,nbyte_r,MPI_BYTE, + src,shift, + atc->mpi_comm,&stat); + } else if (nbyte_s > 0) { + MPI_Send(buf_s,nbyte_s,MPI_BYTE, + dest,shift, + atc->mpi_comm); + } else if (nbyte_r > 0) { + MPI_Recv(buf_r,nbyte_r,MPI_BYTE, + src,shift, + atc->mpi_comm,&stat); + } +#endif +} + +static void dd_pmeredist_x_q(gmx_pme_t pme, + int n, gmx_bool bX, rvec *x, real *charge, + pme_atomcomm_t *atc) +{ + int *commnode,*buf_index; + int nnodes_comm,i,nsend,local_pos,buf_pos,node,scount,rcount; + + commnode = atc->node_dest; + buf_index = atc->buf_index; + + nnodes_comm = min(2*atc->maxshift,atc->nslab-1); + + nsend = 0; + for(i=0; icount[commnode[i]]; + } + if (bX) { + if (atc->count[atc->nodeid] + nsend != n) + gmx_fatal(FARGS,"%d particles communicated to PME node %d are more than 2/3 times the cut-off out of the domain decomposition cell of their charge group in dimension %c.\n" + "This usually means that your system is not well equilibrated.", + n - (atc->count[atc->nodeid] + nsend), + pme->nodeid,'x'+atc->dimind); + + if (nsend > pme->buf_nalloc) { + pme->buf_nalloc = over_alloc_dd(nsend); + srenew(pme->bufv,pme->buf_nalloc); + srenew(pme->bufr,pme->buf_nalloc); + } + + atc->n = atc->count[atc->nodeid]; + for(i=0; icount[commnode[i]]; + /* Communicate the count */ + if (debug) + fprintf(debug,"dimind %d PME node %d send to node %d: %d\n", + atc->dimind,atc->nodeid,commnode[i],scount); + pme_dd_sendrecv(atc,FALSE,i, + &scount,sizeof(int), + &atc->rcount[i],sizeof(int)); + atc->n += atc->rcount[i]; + } + + pme_realloc_atomcomm_things(atc); + } + + local_pos = 0; + for(i=0; ipd[i]; + if (node == atc->nodeid) { + /* Copy direct to the receive buffer */ + if (bX) { + copy_rvec(x[i],atc->x[local_pos]); + } + atc->q[local_pos] = charge[i]; + local_pos++; + } else { + /* Copy to the send buffer */ + if (bX) { + copy_rvec(x[i],pme->bufv[buf_index[node]]); + } + pme->bufr[buf_index[node]] = charge[i]; + buf_index[node]++; + } + } + + buf_pos = 0; + for(i=0; icount[commnode[i]]; + rcount = atc->rcount[i]; + if (scount > 0 || rcount > 0) { + if (bX) { + /* Communicate the coordinates */ + pme_dd_sendrecv(atc,FALSE,i, + pme->bufv[buf_pos],scount*sizeof(rvec), + atc->x[local_pos],rcount*sizeof(rvec)); + } + /* Communicate the charges */ + pme_dd_sendrecv(atc,FALSE,i, + pme->bufr+buf_pos,scount*sizeof(real), + atc->q+local_pos,rcount*sizeof(real)); + buf_pos += scount; + local_pos += atc->rcount[i]; + } + } +} + +static void dd_pmeredist_f(gmx_pme_t pme, pme_atomcomm_t *atc, + int n, rvec *f, + gmx_bool bAddF) +{ + int *commnode,*buf_index; + int nnodes_comm,local_pos,buf_pos,i,scount,rcount,node; + + commnode = atc->node_dest; + buf_index = atc->buf_index; + + nnodes_comm = min(2*atc->maxshift,atc->nslab-1); + + local_pos = atc->count[atc->nodeid]; + buf_pos = 0; + for(i=0; ircount[i]; + rcount = atc->count[commnode[i]]; + if (scount > 0 || rcount > 0) { + /* Communicate the forces */ + pme_dd_sendrecv(atc,TRUE,i, + atc->f[local_pos],scount*sizeof(rvec), + pme->bufv[buf_pos],rcount*sizeof(rvec)); + local_pos += scount; + } + buf_index[commnode[i]] = buf_pos; + buf_pos += rcount; + } + + local_pos = 0; + if (bAddF) + { + for(i=0; ipd[i]; + if (node == atc->nodeid) + { + /* Add from the local force array */ + rvec_inc(f[i],atc->f[local_pos]); + local_pos++; + } + else + { + /* Add from the receive buffer */ + rvec_inc(f[i],pme->bufv[buf_index[node]]); + 
buf_index[node]++; + } + } + } + else + { + for(i=0; ipd[i]; + if (node == atc->nodeid) + { + /* Copy from the local force array */ + copy_rvec(atc->f[local_pos],f[i]); + local_pos++; + } + else + { + /* Copy from the receive buffer */ + copy_rvec(pme->bufv[buf_index[node]],f[i]); + buf_index[node]++; + } + } + } +} + +#ifdef GMX_MPI +static void +gmx_sum_qgrid_dd(gmx_pme_t pme, real *grid, int direction) +{ + pme_overlap_t *overlap; + int send_index0,send_nindex; + int recv_index0,recv_nindex; + MPI_Status stat; + int i,j,k,ix,iy,iz,icnt; + int ipulse,send_id,recv_id,datasize; + real *p; + real *sendptr,*recvptr; + + /* Start with minor-rank communication. This is a bit of a pain since it is not contiguous */ + overlap = &pme->overlap[1]; + + for(ipulse=0;ipulsenoverlap_nodes;ipulse++) + { + /* Since we have already (un)wrapped the overlap in the z-dimension, + * we only have to communicate 0 to nkz (not pmegrid_nz). + */ + if (direction==GMX_SUM_QGRID_FORWARD) + { + send_id = overlap->send_id[ipulse]; + recv_id = overlap->recv_id[ipulse]; + send_index0 = overlap->comm_data[ipulse].send_index0; + send_nindex = overlap->comm_data[ipulse].send_nindex; + recv_index0 = overlap->comm_data[ipulse].recv_index0; + recv_nindex = overlap->comm_data[ipulse].recv_nindex; + } + else + { + send_id = overlap->recv_id[ipulse]; + recv_id = overlap->send_id[ipulse]; + send_index0 = overlap->comm_data[ipulse].recv_index0; + send_nindex = overlap->comm_data[ipulse].recv_nindex; + recv_index0 = overlap->comm_data[ipulse].send_index0; + recv_nindex = overlap->comm_data[ipulse].send_nindex; + } + + /* Copy data to contiguous send buffer */ + if (debug) + { + fprintf(debug,"PME send node %d %d -> %d grid start %d Communicating %d to %d\n", + pme->nodeid,overlap->nodeid,send_id, + pme->pmegrid_start_iy, + send_index0-pme->pmegrid_start_iy, + send_index0-pme->pmegrid_start_iy+send_nindex); + } + icnt = 0; + for(i=0;ipmegrid_nx;i++) + { + ix = i; + for(j=0;jpmegrid_start_iy; + for(k=0;knkz;k++) + { + iz = k; + overlap->sendbuf[icnt++] = grid[ix*(pme->pmegrid_ny*pme->pmegrid_nz)+iy*(pme->pmegrid_nz)+iz]; + } + } + } + + datasize = pme->pmegrid_nx * pme->nkz; + + MPI_Sendrecv(overlap->sendbuf,send_nindex*datasize,GMX_MPI_REAL, + send_id,ipulse, + overlap->recvbuf,recv_nindex*datasize,GMX_MPI_REAL, + recv_id,ipulse, + overlap->mpi_comm,&stat); + + /* Get data from contiguous recv buffer */ + if (debug) + { + fprintf(debug,"PME recv node %d %d <- %d grid start %d Communicating %d to %d\n", + pme->nodeid,overlap->nodeid,recv_id, + pme->pmegrid_start_iy, + recv_index0-pme->pmegrid_start_iy, + recv_index0-pme->pmegrid_start_iy+recv_nindex); + } + icnt = 0; + for(i=0;ipmegrid_nx;i++) + { + ix = i; + for(j=0;jpmegrid_start_iy; + for(k=0;knkz;k++) + { + iz = k; + if(direction==GMX_SUM_QGRID_FORWARD) + { + grid[ix*(pme->pmegrid_ny*pme->pmegrid_nz)+iy*(pme->pmegrid_nz)+iz] += overlap->recvbuf[icnt++]; + } + else + { + grid[ix*(pme->pmegrid_ny*pme->pmegrid_nz)+iy*(pme->pmegrid_nz)+iz] = overlap->recvbuf[icnt++]; + } + } + } + } + } + + /* Major dimension is easier, no copying required, + * but we might have to sum to separate array. + * Since we don't copy, we have to communicate up to pmegrid_nz, + * not nkz as for the minor direction. 
+ */ + overlap = &pme->overlap[0]; + + for(ipulse=0;ipulsenoverlap_nodes;ipulse++) + { + if(direction==GMX_SUM_QGRID_FORWARD) + { + send_id = overlap->send_id[ipulse]; + recv_id = overlap->recv_id[ipulse]; + send_index0 = overlap->comm_data[ipulse].send_index0; + send_nindex = overlap->comm_data[ipulse].send_nindex; + recv_index0 = overlap->comm_data[ipulse].recv_index0; + recv_nindex = overlap->comm_data[ipulse].recv_nindex; + recvptr = overlap->recvbuf; + } + else + { + send_id = overlap->recv_id[ipulse]; + recv_id = overlap->send_id[ipulse]; + send_index0 = overlap->comm_data[ipulse].recv_index0; + send_nindex = overlap->comm_data[ipulse].recv_nindex; + recv_index0 = overlap->comm_data[ipulse].send_index0; + recv_nindex = overlap->comm_data[ipulse].send_nindex; + recvptr = grid + (recv_index0-pme->pmegrid_start_ix)*(pme->pmegrid_ny*pme->pmegrid_nz); + } + + sendptr = grid + (send_index0-pme->pmegrid_start_ix)*(pme->pmegrid_ny*pme->pmegrid_nz); + datasize = pme->pmegrid_ny * pme->pmegrid_nz; + + if (debug) + { + fprintf(debug,"PME send node %d %d -> %d grid start %d Communicating %d to %d\n", + pme->nodeid,overlap->nodeid,send_id, + pme->pmegrid_start_ix, + send_index0-pme->pmegrid_start_ix, + send_index0-pme->pmegrid_start_ix+send_nindex); + fprintf(debug,"PME recv node %d %d <- %d grid start %d Communicating %d to %d\n", + pme->nodeid,overlap->nodeid,recv_id, + pme->pmegrid_start_ix, + recv_index0-pme->pmegrid_start_ix, + recv_index0-pme->pmegrid_start_ix+recv_nindex); + } + + MPI_Sendrecv(sendptr,send_nindex*datasize,GMX_MPI_REAL, + send_id,ipulse, + recvptr,recv_nindex*datasize,GMX_MPI_REAL, + recv_id,ipulse, + overlap->mpi_comm,&stat); + + /* ADD data from contiguous recv buffer */ + if(direction==GMX_SUM_QGRID_FORWARD) + { + p = grid + (recv_index0-pme->pmegrid_start_ix)*(pme->pmegrid_ny*pme->pmegrid_nz); + for(i=0;irecvbuf[i]; + } + } + } +} +#endif + + +static int +copy_pmegrid_to_fftgrid(gmx_pme_t pme, real *pmegrid, real *fftgrid) +{ + ivec local_fft_ndata,local_fft_offset,local_fft_size; + ivec local_pme_size; + int i,ix,iy,iz; + int pmeidx,fftidx; + + /* Dimensions should be identical for A/B grid, so we just use A here */ + gmx_parallel_3dfft_real_limits(pme->pfft_setupA, + local_fft_ndata, + local_fft_offset, + local_fft_size); + + local_pme_size[0] = pme->pmegrid_nx; + local_pme_size[1] = pme->pmegrid_ny; + local_pme_size[2] = pme->pmegrid_nz; + + /* The fftgrid is always 'justified' to the lower-left corner of the PME grid, + the offset is identical, and the PME grid always has more data (due to overlap) + */ + { +#ifdef DEBUG_PME + FILE *fp,*fp2; + char fn[STRLEN],format[STRLEN]; + real val; + sprintf(fn,"pmegrid%d.pdb",pme->nodeid); + fp = ffopen(fn,"w"); + sprintf(fn,"pmegrid%d.txt",pme->nodeid); + fp2 = ffopen(fn,"w"); + sprintf(format,"%s%s\n",pdbformat,"%6.2f%6.2f"); +#endif + + for(ix=0;ixpmegrid_start_ix + ix, + pme->pmegrid_start_iy + iy, + pme->pmegrid_start_iz + iz, + pmegrid[pmeidx]); +#endif + } + } + } +#ifdef DEBUG_PME + ffclose(fp); + ffclose(fp2); +#endif + } + return 0; +} + + +static gmx_cycles_t omp_cyc_start() +{ + return gmx_cycles_read(); +} + +static gmx_cycles_t omp_cyc_end(gmx_cycles_t c) +{ + return gmx_cycles_read() - c; +} + + +static int +copy_fftgrid_to_pmegrid(gmx_pme_t pme, const real *fftgrid, real *pmegrid, + int nthread,int thread) +{ + ivec local_fft_ndata,local_fft_offset,local_fft_size; + ivec local_pme_size; + int ixy0,ixy1,ixy,ix,iy,iz; + int pmeidx,fftidx; +#ifdef PME_TIME_THREADS + gmx_cycles_t c1; + static double cs1=0; + static 
int cnt=0; +#endif + +#ifdef PME_TIME_THREADS + c1 = omp_cyc_start(); +#endif + /* Dimensions should be identical for A/B grid, so we just use A here */ + gmx_parallel_3dfft_real_limits(pme->pfft_setupA, + local_fft_ndata, + local_fft_offset, + local_fft_size); + + local_pme_size[0] = pme->pmegrid_nx; + local_pme_size[1] = pme->pmegrid_ny; + local_pme_size[2] = pme->pmegrid_nz; + + /* The fftgrid is always 'justified' to the lower-left corner of the PME grid, + the offset is identical, and the PME grid always has more data (due to overlap) + */ + ixy0 = ((thread )*local_fft_ndata[XX]*local_fft_ndata[YY])/nthread; + ixy1 = ((thread+1)*local_fft_ndata[XX]*local_fft_ndata[YY])/nthread; + + for(ixy=ixy0;ixynkx; + ny = pme->nky; + nz = pme->nkz; + + pnx = pme->pmegrid_nx; + pny = pme->pmegrid_ny; + pnz = pme->pmegrid_nz; + + overlap = pme->pme_order - 1; + + /* Add periodic overlap in z */ + for(ix=0; ixpmegrid_nx; ix++) + { + for(iy=0; iypmegrid_ny; iy++) + { + for(iz=0; iznnodes_minor == 1) + { + for(ix=0; ixpmegrid_nx; ix++) + { + for(iy=0; iynnodes_major == 1) + { + ny_x = (pme->nnodes_minor == 1 ? ny : pme->pmegrid_ny); + + for(ix=0; ixnkx; + ny = pme->nky; + nz = pme->nkz; + + pnx = pme->pmegrid_nx; + pny = pme->pmegrid_ny; + pnz = pme->pmegrid_nz; + + overlap = pme->pme_order - 1; + + if (pme->nnodes_major == 1) + { + ny_x = (pme->nnodes_minor == 1 ? ny : pme->pmegrid_ny); + + for(ix=0; ixnnodes_minor == 1) + { +#pragma omp parallel for num_threads(pme->nthread) schedule(static) + for(ix=0; ixpmegrid_nx; ix++) + { + int iy,iz; + + for(iy=0; iynthread) schedule(static) + for(ix=0; ixpmegrid_nx; ix++) + { + int iy,iz; + + for(iy=0; iypmegrid_ny; iy++) + { + for(iz=0; izn[XX]; + pny = pmegrid->n[YY]; + pnz = pmegrid->n[ZZ]; + + offx = pmegrid->offset[XX]; + offy = pmegrid->offset[YY]; + offz = pmegrid->offset[ZZ]; + + ndatatot = pnx*pny*pnz; + grid = pmegrid->grid; + for(i=0;iorder; + + for(nn=0; nnn; nn++) + { + n = spline->ind[nn]; + qn = atc->q[n]; + + if (qn != 0) + { + idxptr = atc->idx[n]; + norder = nn*order; + + i0 = idxptr[XX] - offx; + j0 = idxptr[YY] - offy; + k0 = idxptr[ZZ] - offz; + + thx = spline->theta[XX] + norder; + thy = spline->theta[YY] + norder; + thz = spline->theta[ZZ] + norder; + + switch (order) { + case 4: +#ifdef PME_SSE +#ifdef PME_SSE_UNALIGNED +#define PME_SPREAD_SSE_ORDER4 +#else +#define PME_SPREAD_SSE_ALIGNED +#define PME_ORDER 4 +#endif +#include "pme_sse_single.h" +#else + DO_BSPLINE(4); +#endif + break; + case 5: +#ifdef PME_SSE +#define PME_SPREAD_SSE_ALIGNED +#define PME_ORDER 5 +#include "pme_sse_single.h" +#else + DO_BSPLINE(5); +#endif + break; + default: + DO_BSPLINE(order); + break; + } + } + } +} + +static void set_grid_alignment(int *pmegrid_nz,int pme_order) +{ +#ifdef PME_SSE + if (pme_order == 5 +#ifndef PME_SSE_UNALIGNED + || pme_order == 4 +#endif + ) + { + /* Round nz up to a multiple of 4 to ensure alignment */ + *pmegrid_nz = ((*pmegrid_nz + 3) & ~3); + } +#endif +} + +static void set_gridsize_alignment(int *gridsize,int pme_order) +{ +#ifdef PME_SSE +#ifndef PME_SSE_UNALIGNED + if (pme_order == 4) + { + /* Add extra elements to ensured aligned operations do not go + * beyond the allocated grid size. + * Note that for pme_order=5, the pme grid z-size alignment + * ensures that we will not go beyond the grid size. 
+ */ + *gridsize += 4; + } +#endif +#endif +} + +static void pmegrid_init(pmegrid_t *grid, + int cx, int cy, int cz, + int x0, int y0, int z0, + int x1, int y1, int z1, + gmx_bool set_alignment, + int pme_order, + real *ptr) +{ + int nz,gridsize; + + grid->ci[XX] = cx; + grid->ci[YY] = cy; + grid->ci[ZZ] = cz; + grid->offset[XX] = x0; + grid->offset[YY] = y0; + grid->offset[ZZ] = z0; + grid->n[XX] = x1 - x0 + pme_order - 1; + grid->n[YY] = y1 - y0 + pme_order - 1; + grid->n[ZZ] = z1 - z0 + pme_order - 1; + + nz = grid->n[ZZ]; + set_grid_alignment(&nz,pme_order); + if (set_alignment) + { + grid->n[ZZ] = nz; + } + else if (nz != grid->n[ZZ]) + { + gmx_incons("pmegrid_init call with an unaligned z size"); + } + + grid->order = pme_order; + if (ptr == NULL) + { + gridsize = grid->n[XX]*grid->n[YY]*grid->n[ZZ]; + set_gridsize_alignment(&gridsize,pme_order); + snew_aligned(grid->grid,gridsize,16); + } + else + { + grid->grid = ptr; + } +} + +static int div_round_up(int enumerator,int denominator) +{ + return (enumerator + denominator - 1)/denominator; +} + +static void make_subgrid_division(const ivec n,int ovl,int nthread, + ivec nsub) +{ + int gsize_opt,gsize; + int nsx,nsy,nsz; + char *env; + + gsize_opt = -1; + for(nsx=1; nsx<=nthread; nsx++) + { + if (nthread % nsx == 0) + { + for(nsy=1; nsy<=nthread; nsy++) + { + if (nsx*nsy <= nthread && nthread % (nsx*nsy) == 0) + { + nsz = nthread/(nsx*nsy); + + /* Determine the number of grid points per thread */ + gsize = + (div_round_up(n[XX],nsx) + ovl)* + (div_round_up(n[YY],nsy) + ovl)* + (div_round_up(n[ZZ],nsz) + ovl); + + /* Minimize the number of grids points per thread + * and, secondarily, the number of cuts in minor dimensions. + */ + if (gsize_opt == -1 || + gsize < gsize_opt || + (gsize == gsize_opt && + (nsz < nsub[ZZ] || (nsz == nsub[ZZ] && nsy < nsub[YY])))) + { + nsub[XX] = nsx; + nsub[YY] = nsy; + nsub[ZZ] = nsz; + gsize_opt = gsize; + } + } + } + } + } + + env = getenv("GMX_PME_THREAD_DIVISION"); + if (env != NULL) + { + sscanf(env,"%d %d %d",&nsub[XX],&nsub[YY],&nsub[ZZ]); + } + + if (nsub[XX]*nsub[YY]*nsub[ZZ] != nthread) + { + gmx_fatal(FARGS,"PME grid thread division (%d x %d x %d) does not match the total number of threads (%d)",nsub[XX],nsub[YY],nsub[ZZ],nthread); + } +} + +static void pmegrids_init(pmegrids_t *grids, + int nx,int ny,int nz,int nz_base, + int pme_order, + int nthread, + int overlap_x, + int overlap_y) +{ + ivec n,n_base,g0,g1; + int t,x,y,z,d,i,tfac; + int max_comm_lines; + + n[XX] = nx - (pme_order - 1); + n[YY] = ny - (pme_order - 1); + n[ZZ] = nz - (pme_order - 1); + + copy_ivec(n,n_base); + n_base[ZZ] = nz_base; + + pmegrid_init(&grids->grid,0,0,0,0,0,0,n[XX],n[YY],n[ZZ],FALSE,pme_order, + NULL); + + grids->nthread = nthread; + + make_subgrid_division(n_base,pme_order-1,grids->nthread,grids->nc); + + if (grids->nthread > 1) + { + ivec nst; + int gridsize; + real *grid_all; + + for(d=0; dnc[d]) + pme_order - 1; + } + set_grid_alignment(&nst[ZZ],pme_order); + + if (debug) + { + fprintf(debug,"pmegrid thread local division: %d x %d x %d\n", + grids->nc[XX],grids->nc[YY],grids->nc[ZZ]); + fprintf(debug,"pmegrid %d %d %d max thread pmegrid %d %d %d\n", + nx,ny,nz, + nst[XX],nst[YY],nst[ZZ]); + } + + snew(grids->grid_th,grids->nthread); + t = 0; + gridsize = nst[XX]*nst[YY]*nst[ZZ]; + set_gridsize_alignment(&gridsize,pme_order); + snew_aligned(grid_all, + grids->nthread*gridsize+(grids->nthread+1)*GMX_CACHE_SEP, + 16); + + for(x=0; xnc[XX]; x++) + { + for(y=0; ync[YY]; y++) + { + for(z=0; znc[ZZ]; z++) + { + 
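/* give thread t a contiguous sub-grid; the rounded offsets passed below make the sub-grids tile the local grid exactly */
+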
pmegrid_init(&grids->grid_th[t], + x,y,z, + (n[XX]*(x ))/grids->nc[XX], + (n[YY]*(y ))/grids->nc[YY], + (n[ZZ]*(z ))/grids->nc[ZZ], + (n[XX]*(x+1))/grids->nc[XX], + (n[YY]*(y+1))/grids->nc[YY], + (n[ZZ]*(z+1))/grids->nc[ZZ], + TRUE, + pme_order, + grid_all+GMX_CACHE_SEP+t*(gridsize+GMX_CACHE_SEP)); + t++; + } + } + } + } + + snew(grids->g2t,DIM); + tfac = 1; + for(d=DIM-1; d>=0; d--) + { + snew(grids->g2t[d],n[d]); + t = 0; + for(i=0; inc[d] && i >= (n[d]*(t+1))/grids->nc[d]) + { + t++; + } + grids->g2t[d][i] = t*tfac; + } + + tfac *= grids->nc[d]; + + switch (d) + { + case XX: max_comm_lines = overlap_x; break; + case YY: max_comm_lines = overlap_y; break; + case ZZ: max_comm_lines = pme_order - 1; break; + } + grids->nthread_comm[d] = 0; + while ((n[d]*grids->nthread_comm[d])/grids->nc[d] < max_comm_lines) + { + grids->nthread_comm[d]++; + } + if (debug != NULL) + { + fprintf(debug,"pmegrid thread grid communication range in %c: %d\n", + 'x'+d,grids->nthread_comm[d]); + } + /* It should be possible to make grids->nthread_comm[d]==grids->nc[d] + * work, but this is not a problematic restriction. + */ + if (grids->nc[d] > 1 && grids->nthread_comm[d] > grids->nc[d]) + { + gmx_fatal(FARGS,"Too many threads for PME (%d) compared to the number of grid lines, reduce the number of threads doing PME",grids->nthread); + } + } +} + + +static void pmegrids_destroy(pmegrids_t *grids) +{ + int t; + + if (grids->grid.grid != NULL) + { + sfree(grids->grid.grid); + + if (grids->nthread > 0) + { + for(t=0; tnthread; t++) + { + sfree(grids->grid_th[t].grid); + } + sfree(grids->grid_th); + } + } +} + + +static void realloc_work(pme_work_t *work,int nkx) +{ + if (nkx > work->nalloc) + { + work->nalloc = nkx; + srenew(work->mhx ,work->nalloc); + srenew(work->mhy ,work->nalloc); + srenew(work->mhz ,work->nalloc); + srenew(work->m2 ,work->nalloc); + /* Allocate an aligned pointer for SSE operations, including 3 extra + * elements at the end since SSE operates on 4 elements at a time. 
+ */ + sfree_aligned(work->denom); + sfree_aligned(work->tmp1); + sfree_aligned(work->eterm); + snew_aligned(work->denom,work->nalloc+3,16); + snew_aligned(work->tmp1 ,work->nalloc+3,16); + snew_aligned(work->eterm,work->nalloc+3,16); + srenew(work->m2inv,work->nalloc); + } +} + + +static void free_work(pme_work_t *work) +{ + sfree(work->mhx); + sfree(work->mhy); + sfree(work->mhz); + sfree(work->m2); + sfree_aligned(work->denom); + sfree_aligned(work->tmp1); + sfree_aligned(work->eterm); + sfree(work->m2inv); +} + + +#ifdef PME_SSE + /* Calculate exponentials through SSE in float precision */ +inline static void calc_exponentials(int start, int end, real f, real *d_aligned, real *r_aligned, real *e_aligned) +{ + { + const __m128 two = _mm_set_ps(2.0f,2.0f,2.0f,2.0f); + __m128 f_sse; + __m128 lu; + __m128 tmp_d1,d_inv,tmp_r,tmp_e; + int kx; + f_sse = _mm_load1_ps(&f); + for(kx=0; kxepsilon_r; + + nx = pme->nkx; + ny = pme->nky; + nz = pme->nkz; + + /* Dimensions should be identical for A/B grid, so we just use A here */ + gmx_parallel_3dfft_complex_limits(pme->pfft_setupA, + complex_order, + local_ndata, + local_offset, + local_size); + + rxx = pme->recipbox[XX][XX]; + ryx = pme->recipbox[YY][XX]; + ryy = pme->recipbox[YY][YY]; + rzx = pme->recipbox[ZZ][XX]; + rzy = pme->recipbox[ZZ][YY]; + rzz = pme->recipbox[ZZ][ZZ]; + + maxkx = (nx+1)/2; + maxky = (ny+1)/2; + maxkz = nz/2+1; + + work = &pme->work[thread]; + mhx = work->mhx; + mhy = work->mhy; + mhz = work->mhz; + m2 = work->m2; + denom = work->denom; + tmp1 = work->tmp1; + eterm = work->eterm; + m2inv = work->m2inv; + + iyz0 = local_ndata[YY]*local_ndata[ZZ]* thread /nthread; + iyz1 = local_ndata[YY]*local_ndata[ZZ]*(thread+1)/nthread; + + for(iyz=iyz0; iyzbsp_mod[YY][ky]; + + kz = iz + local_offset[ZZ]; + + mz = kz; + + bz = pme->bsp_mod[ZZ][kz]; + + /* 0.5 correction for corner points */ + corner_fac = 1; + if (kz == 0 || kz == (nz+1)/2) + { + corner_fac = 0.5; + } + + p0 = grid + iy*local_size[ZZ]*local_size[XX] + iz*local_size[XX]; + + /* We should skip the k-space point (0,0,0) */ + if (local_offset[XX] > 0 || ky > 0 || kz > 0) + { + kxstart = local_offset[XX]; + } + else + { + kxstart = local_offset[XX] + 1; + p0++; + } + kxend = local_offset[XX] + local_ndata[XX]; + + if (bEnerVir) + { + /* More expensive inner loop, especially because of the storage + * of the mh elements in array's. + * Because x is the minor grid index, all mh elements + * depend on kx for triclinic unit cells. + */ + + /* Two explicit loops to avoid a conditional inside the loop */ + for(kx=kxstart; kxbsp_mod[XX][kx]; + tmp1[kx] = -factor*m2k; + } + + for(kx=maxkx; kxbsp_mod[XX][kx]; + tmp1[kx] = -factor*m2k; + } + + for(kx=kxstart; kxre; + d2 = p0->im; + + p0->re = d1*eterm[kx]; + p0->im = d2*eterm[kx]; + + struct2 = 2.0*(d1*d1+d2*d2); + + tmp1[kx] = eterm[kx]*struct2; + } + + for(kx=kxstart; kxbsp_mod[XX][kx]; + tmp1[kx] = -factor*m2k; + } + + for(kx=maxkx; kxbsp_mod[XX][kx]; + tmp1[kx] = -factor*m2k; + } + + calc_exponentials(kxstart,kxend,elfac,denom,tmp1,eterm); + + for(kx=kxstart; kxre; + d2 = p0->im; + + p0->re = d1*eterm[kx]; + p0->im = d2*eterm[kx]; + } + } + } + + if (bEnerVir) + { + /* Update virial with local values. + * The virial is symmetric by definition. + * this virial seems ok for isotropic scaling, but I'm + * experiencing problems on semiisotropic membranes. + * IS THAT COMMENT STILL VALID??? (DvdS, 2001/02/07). 
+ */ + work->vir[XX][XX] = 0.25*virxx; + work->vir[YY][YY] = 0.25*viryy; + work->vir[ZZ][ZZ] = 0.25*virzz; + work->vir[XX][YY] = work->vir[YY][XX] = 0.25*virxy; + work->vir[XX][ZZ] = work->vir[ZZ][XX] = 0.25*virxz; + work->vir[YY][ZZ] = work->vir[ZZ][YY] = 0.25*viryz; + + /* This energy should be corrected for a charged system */ + work->energy = 0.5*energy; + } + + /* Return the loop count */ + return local_ndata[YY]*local_ndata[XX]; +} + +static void get_pme_ener_vir(const gmx_pme_t pme,int nthread, + real *mesh_energy,matrix vir) +{ + /* This function sums output over threads + * and should therefore only be called after thread synchronization. + */ + int thread; + + *mesh_energy = pme->work[0].energy; + copy_mat(pme->work[0].vir,vir); + + for(thread=1; threadwork[thread].energy; + m_add(vir,pme->work[thread].vir,vir); + } +} + +#define DO_FSPLINE(order) \ +for(ithx=0; (ithxspline_work; ++ work = pme->spline_work; + + order = pme->pme_order; + thx = spline->theta[XX]; + thy = spline->theta[YY]; + thz = spline->theta[ZZ]; + dthx = spline->dtheta[XX]; + dthy = spline->dtheta[YY]; + dthz = spline->dtheta[ZZ]; + nx = pme->nkx; + ny = pme->nky; + nz = pme->nkz; + pnx = pme->pmegrid_nx; + pny = pme->pmegrid_ny; + pnz = pme->pmegrid_nz; + + rxx = pme->recipbox[XX][XX]; + ryx = pme->recipbox[YY][XX]; + ryy = pme->recipbox[YY][YY]; + rzx = pme->recipbox[ZZ][XX]; + rzy = pme->recipbox[ZZ][YY]; + rzz = pme->recipbox[ZZ][ZZ]; + + for(nn=0; nnn; nn++) + { + n = spline->ind[nn]; + qn = scale*atc->q[n]; + + if (bClearF) + { + atc->f[n][XX] = 0; + atc->f[n][YY] = 0; + atc->f[n][ZZ] = 0; + } + if (qn != 0) + { + fx = 0; + fy = 0; + fz = 0; + idxptr = atc->idx[n]; + norder = nn*order; + + i0 = idxptr[XX]; + j0 = idxptr[YY]; + k0 = idxptr[ZZ]; + + /* Pointer arithmetic alert, next six statements */ + thx = spline->theta[XX] + norder; + thy = spline->theta[YY] + norder; + thz = spline->theta[ZZ] + norder; + dthx = spline->dtheta[XX] + norder; + dthy = spline->dtheta[YY] + norder; + dthz = spline->dtheta[ZZ] + norder; + + switch (order) { + case 4: +#ifdef PME_SSE +#ifdef PME_SSE_UNALIGNED +#define PME_GATHER_F_SSE_ORDER4 +#else +#define PME_GATHER_F_SSE_ALIGNED +#define PME_ORDER 4 +#endif +#include "pme_sse_single.h" +#else + DO_FSPLINE(4); +#endif + break; + case 5: +#ifdef PME_SSE +#define PME_GATHER_F_SSE_ALIGNED +#define PME_ORDER 5 +#include "pme_sse_single.h" +#else + DO_FSPLINE(5); +#endif + break; + default: + DO_FSPLINE(order); + break; + } + + atc->f[n][XX] += -qn*( fx*nx*rxx ); + atc->f[n][YY] += -qn*( fx*nx*ryx + fy*ny*ryy ); + atc->f[n][ZZ] += -qn*( fx*nx*rzx + fy*ny*rzy + fz*nz*rzz ); + } + } + /* Since the energy and not forces are interpolated + * the net force might not be exactly zero. + * This can be solved by also interpolating F, but + * that comes at a cost. + * A better hack is to remove the net force every + * step, but that must be done at a higher level + * since this routine doesn't see all atoms if running + * in parallel. Don't know how important it is? 
EL 990726 + */ +} + + +static real gather_energy_bsplines(gmx_pme_t pme,real *grid, + pme_atomcomm_t *atc) +{ + splinedata_t *spline; + int n,ithx,ithy,ithz,i0,j0,k0; + int index_x,index_xy; + int * idxptr; + real energy,pot,tx,ty,qn,gval; + real *thx,*thy,*thz; + int norder; + int order; + + spline = &atc->spline[0]; + + order = pme->pme_order; + + energy = 0; + for(n=0; (nn); n++) { + qn = atc->q[n]; + + if (qn != 0) { + idxptr = atc->idx[n]; + norder = n*order; + + i0 = idxptr[XX]; + j0 = idxptr[YY]; + k0 = idxptr[ZZ]; + + /* Pointer arithmetic alert, next three statements */ + thx = spline->theta[XX] + norder; + thy = spline->theta[YY] + norder; + thz = spline->theta[ZZ] + norder; + + pot = 0; + for(ithx=0; (ithxpmegrid_ny*pme->pmegrid_nz; + tx = thx[ithx]; + + for(ithy=0; (ithypmegrid_nz; + ty = thy[ithy]; + + for(ithz=0; (ithz 8) + { + gmx_fatal(FARGS,"The current P3M code only supports orders up to 8"); + } + + zarg = M_PI/n; + + maxk = (n + 1)/2; + + for(i=-maxk; i<0; i++) + { + zai = zarg*i; + sinzai = sin(zai); + infl = do_p3m_influence(sinzai,order); + bsp_mod[n+i] = infl*infl*pow(sinzai/zai,-2.0*order); + } + bsp_mod[0] = 1.0; + for(i=1; inslab; + + n = 0; + for(i=1; i<=nslab/2; i++) { + fw = (atc->nodeid + i) % nslab; + bw = (atc->nodeid - i + nslab) % nslab; + if (n < nslab - 1) { + atc->node_dest[n] = fw; + atc->node_src[n] = bw; + n++; + } + if (n < nslab - 1) { + atc->node_dest[n] = bw; + atc->node_src[n] = fw; + n++; + } + } +} + +int gmx_pme_destroy(FILE *log,gmx_pme_t *pmedata) +{ + int thread; + + if(NULL != log) + { + fprintf(log,"Destroying PME data structures.\n"); + } + + sfree((*pmedata)->nnx); + sfree((*pmedata)->nny); + sfree((*pmedata)->nnz); + + pmegrids_destroy(&(*pmedata)->pmegridA); + + sfree((*pmedata)->fftgridA); + sfree((*pmedata)->cfftgridA); + gmx_parallel_3dfft_destroy((*pmedata)->pfft_setupA); + + if ((*pmedata)->pmegridB.grid.grid != NULL) + { + pmegrids_destroy(&(*pmedata)->pmegridB); + sfree((*pmedata)->fftgridB); + sfree((*pmedata)->cfftgridB); + gmx_parallel_3dfft_destroy((*pmedata)->pfft_setupB); + } + for(thread=0; thread<(*pmedata)->nthread; thread++) + { + free_work(&(*pmedata)->work[thread]); + } + sfree((*pmedata)->work); + + sfree(*pmedata); + *pmedata = NULL; + + return 0; +} + +static int mult_up(int n,int f) +{ + return ((n + f - 1)/f)*f; +} + + +static double pme_load_imbalance(gmx_pme_t pme) +{ + int nma,nmi; + double n1,n2,n3; + + nma = pme->nnodes_major; + nmi = pme->nnodes_minor; + + n1 = mult_up(pme->nkx,nma)*mult_up(pme->nky,nmi)*pme->nkz; + n2 = mult_up(pme->nkx,nma)*mult_up(pme->nkz,nmi)*pme->nky; + n3 = mult_up(pme->nky,nma)*mult_up(pme->nkz,nmi)*pme->nkx; + + /* pme_solve is roughly double the cost of an fft */ + + return (n1 + n2 + 3*n3)/(double)(6*pme->nkx*pme->nky*pme->nkz); +} + +static void init_atomcomm(gmx_pme_t pme,pme_atomcomm_t *atc, t_commrec *cr, + int dimind,gmx_bool bSpread) +{ + int nk,k,s,thread; + + atc->dimind = dimind; + atc->nslab = 1; + atc->nodeid = 0; + atc->pd_nalloc = 0; +#ifdef GMX_MPI + if (pme->nnodes > 1) + { + atc->mpi_comm = pme->mpi_comm_d[dimind]; + MPI_Comm_size(atc->mpi_comm,&atc->nslab); + MPI_Comm_rank(atc->mpi_comm,&atc->nodeid); + } + if (debug) + { + fprintf(debug,"For PME atom communication in dimind %d: nslab %d rank %d\n",atc->dimind,atc->nslab,atc->nodeid); + } +#endif + + atc->bSpread = bSpread; + atc->pme_order = pme->pme_order; + + if (atc->nslab > 1) + { + /* These three allocations are not required for particle decomp. 
*/ + snew(atc->node_dest,atc->nslab); + snew(atc->node_src,atc->nslab); + setup_coordinate_communication(atc); + + snew(atc->count_thread,pme->nthread); + for(thread=0; threadnthread; thread++) + { + snew(atc->count_thread[thread],atc->nslab); + } + atc->count = atc->count_thread[0]; + snew(atc->rcount,atc->nslab); + snew(atc->buf_index,atc->nslab); + } + + atc->nthread = pme->nthread; + if (atc->nthread > 1) + { + snew(atc->thread_plist,atc->nthread); + } + snew(atc->spline,atc->nthread); + for(thread=0; threadnthread; thread++) + { + if (atc->nthread > 1) + { + snew(atc->thread_plist[thread].n,atc->nthread+2*GMX_CACHE_SEP); + atc->thread_plist[thread].n += GMX_CACHE_SEP; + } + } +} + +static void +init_overlap_comm(pme_overlap_t * ol, + int norder, +#ifdef GMX_MPI + MPI_Comm comm, +#endif + int nnodes, + int nodeid, + int ndata, + int commplainsize) +{ + int lbnd,rbnd,maxlr,b,i; + int exten; + int nn,nk; + pme_grid_comm_t *pgc; + gmx_bool bCont; + int fft_start,fft_end,send_index1,recv_index1; + +#ifdef GMX_MPI + ol->mpi_comm = comm; +#endif + + ol->nnodes = nnodes; + ol->nodeid = nodeid; + + /* Linear translation of the PME grid wo'nt affect reciprocal space + * calculations, so to optimize we only interpolate "upwards", + * which also means we only have to consider overlap in one direction. + * I.e., particles on this node might also be spread to grid indices + * that belong to higher nodes (modulo nnodes) + */ + + snew(ol->s2g0,ol->nnodes+1); + snew(ol->s2g1,ol->nnodes); + if (debug) { fprintf(debug,"PME slab boundaries:"); } + for(i=0; is2g0[i] = ( i *ndata + 0 )/nnodes; + ol->s2g1[i] = ((i+1)*ndata + nnodes-1)/nnodes + norder - 1; + + if (debug) + { + fprintf(debug," %3d %3d",ol->s2g0[i],ol->s2g1[i]); + } + } + ol->s2g0[nnodes] = ndata; + if (debug) { fprintf(debug,"\n"); } + + /* Determine with how many nodes we need to communicate the grid overlap */ + b = 0; + do + { + b++; + bCont = FALSE; + for(i=0; is2g1[i] > ol->s2g0[i+b]) || + (i+b >= nnodes && ol->s2g1[i] > ol->s2g0[i+b-nnodes] + ndata)) + { + bCont = TRUE; + } + } + } + while (bCont && b < nnodes); + ol->noverlap_nodes = b - 1; + + snew(ol->send_id,ol->noverlap_nodes); + snew(ol->recv_id,ol->noverlap_nodes); + for(b=0; bnoverlap_nodes; b++) + { + ol->send_id[b] = (ol->nodeid + (b + 1)) % ol->nnodes; + ol->recv_id[b] = (ol->nodeid - (b + 1) + ol->nnodes) % ol->nnodes; + } + snew(ol->comm_data, ol->noverlap_nodes); + + for(b=0; bnoverlap_nodes; b++) + { + pgc = &ol->comm_data[b]; + /* Send */ + fft_start = ol->s2g0[ol->send_id[b]]; + fft_end = ol->s2g0[ol->send_id[b]+1]; + if (ol->send_id[b] < nodeid) + { + fft_start += ndata; + fft_end += ndata; + } + send_index1 = ol->s2g1[nodeid]; + send_index1 = min(send_index1,fft_end); + pgc->send_index0 = fft_start; + pgc->send_nindex = max(0,send_index1 - pgc->send_index0); + + /* We always start receiving to the first index of our slab */ + fft_start = ol->s2g0[ol->nodeid]; + fft_end = ol->s2g0[ol->nodeid+1]; + recv_index1 = ol->s2g1[ol->recv_id[b]]; + if (ol->recv_id[b] > nodeid) + { + recv_index1 -= ndata; + } + recv_index1 = min(recv_index1,fft_end); + pgc->recv_index0 = fft_start; + pgc->recv_nindex = max(0,recv_index1 - pgc->recv_index0); + } + + /* For non-divisible grid we need pme_order iso pme_order-1 */ + snew(ol->sendbuf,norder*commplainsize); + snew(ol->recvbuf,norder*commplainsize); +} + +static void +make_gridindex5_to_localindex(int n,int local_start,int local_range, + int **global_to_local, + real **fraction_shift) +{ + int i; + int * gtl; + real * fsh; + + 
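+ /* Editor's illustration, not part of the original change:
+ * the table built below maps a global grid index i (plus wrap-around
+ * margin) to a local index as gtl[i] = (i - local_start + n) % n.
+ * For example, with n = 10 and local_start = 4, global index 3 maps to
+ * (3 - 4 + 10) % 10 = 9, i.e. it wraps to the last local slot, and
+ * fsh[] then compensates the interpolation fraction where needed.
+ */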
snew(gtl,5*n); + snew(fsh,5*n); + for(i=0; (i<5*n); i++) + { + /* Determine the global to local grid index */ + gtl[i] = (i - local_start + n) % n; + /* For coordinates that fall within the local grid the fraction + * is correct, we don't need to shift it. + */ + fsh[i] = 0; + if (local_range < n) + { + /* Due to rounding issues i could be 1 beyond the lower or + * upper boundary of the local grid. Correct the index for this. + * If we shift the index, we need to shift the fraction by + * the same amount in the other direction to not affect + * the weights. + * Note that due to this shifting the weights at the end of + * the spline might change, but that will only involve values + * between zero and values close to the precision of a real, + * which is anyhow the accuracy of the whole mesh calculation. + */ + /* With local_range=0 we should not change i=local_start */ + if (i % n != local_start) + { + if (gtl[i] == n-1) + { + gtl[i] = 0; + fsh[i] = -1; + } + else if (gtl[i] == local_range) + { + gtl[i] = local_range - 1; + fsh[i] = 1; + } + } + } + } + + *global_to_local = gtl; + *fraction_shift = fsh; +} + - static void sse_mask_init(pme_spline_work_t *work,int order) ++static pme_spline_work_t *make_pme_spline_work(int order) +{ ++ pme_spline_work_t *work; ++ +#ifdef PME_SSE + float tmp[8]; + __m128 zero_SSE; + int of,i; + ++ snew_aligned(work,1,16); ++ + zero_SSE = _mm_setzero_ps(); + ++ /* Generate bit masks to mask out the unused grid entries, ++ * as we only operate on order of the 8 grid entries that are ++ * load into 2 SSE float registers. ++ */ + for(of=0; of<8-(order-1); of++) + { + for(i=0; i<8; i++) + { + tmp[i] = (i >= of && i < of+order ? 1 : 0); + } + work->mask_SSE0[of] = _mm_loadu_ps(tmp); + work->mask_SSE1[of] = _mm_loadu_ps(tmp+4); + work->mask_SSE0[of] = _mm_cmpgt_ps(work->mask_SSE0[of],zero_SSE); + work->mask_SSE1[of] = _mm_cmpgt_ps(work->mask_SSE1[of],zero_SSE); + } ++#else ++ work = NULL; +#endif ++ ++ return work; +} + +static void +gmx_pme_check_grid_restrictions(FILE *fplog,char dim,int nnodes,int *nk) +{ + int nk_new; + + if (*nk % nnodes != 0) + { + nk_new = nnodes*(*nk/nnodes + 1); + + if (2*nk_new >= 3*(*nk)) + { + gmx_fatal(FARGS,"The PME grid size in dim %c (%d) is not divisble by the number of nodes doing PME in dim %c (%d). The grid size would have to be increased by more than 50%% to make the grid divisible. Change the total number of nodes or the number of domain decomposition cells in x or the PME grid %c dimension (and the cut-off).", + dim,*nk,dim,nnodes,dim); + } + + if (fplog != NULL) + { + fprintf(fplog,"\nNOTE: The PME grid size in dim %c (%d) is not divisble by the number of nodes doing PME in dim %c (%d). Increasing the PME grid size in dim %c to %d. This will increase the accuracy and will not decrease the performance significantly on this number of nodes. 
For optimal performance change the total number of nodes or the number of domain decomposition cells in x or the PME grid %c dimension (and the cut-off).\n\n", + dim,*nk,dim,nnodes,dim,nk_new,dim); + } + + *nk = nk_new; + } +} + +int gmx_pme_init(gmx_pme_t * pmedata, + t_commrec * cr, + int nnodes_major, + int nnodes_minor, + t_inputrec * ir, + int homenr, + gmx_bool bFreeEnergy, + gmx_bool bReproducible, + int nthread) +{ + gmx_pme_t pme=NULL; + + pme_atomcomm_t *atc; + ivec ndata; + + if (debug) + fprintf(debug,"Creating PME data structures.\n"); + snew(pme,1); + + pme->redist_init = FALSE; + pme->sum_qgrid_tmp = NULL; + pme->sum_qgrid_dd_tmp = NULL; + pme->buf_nalloc = 0; + pme->redist_buf_nalloc = 0; + + pme->nnodes = 1; + pme->bPPnode = TRUE; + + pme->nnodes_major = nnodes_major; + pme->nnodes_minor = nnodes_minor; + +#ifdef GMX_MPI + if (nnodes_major*nnodes_minor > 1) + { + pme->mpi_comm = cr->mpi_comm_mygroup; + + MPI_Comm_rank(pme->mpi_comm,&pme->nodeid); + MPI_Comm_size(pme->mpi_comm,&pme->nnodes); + if (pme->nnodes != nnodes_major*nnodes_minor) + { + gmx_incons("PME node count mismatch"); + } + } + else + { + pme->mpi_comm = MPI_COMM_NULL; + } +#endif + + if (pme->nnodes == 1) + { + pme->ndecompdim = 0; + pme->nodeid_major = 0; + pme->nodeid_minor = 0; +#ifdef GMX_MPI + pme->mpi_comm_d[0] = pme->mpi_comm_d[1] = MPI_COMM_NULL; +#endif + } + else + { + if (nnodes_minor == 1) + { +#ifdef GMX_MPI + pme->mpi_comm_d[0] = pme->mpi_comm; + pme->mpi_comm_d[1] = MPI_COMM_NULL; +#endif + pme->ndecompdim = 1; + pme->nodeid_major = pme->nodeid; + pme->nodeid_minor = 0; + + } + else if (nnodes_major == 1) + { +#ifdef GMX_MPI + pme->mpi_comm_d[0] = MPI_COMM_NULL; + pme->mpi_comm_d[1] = pme->mpi_comm; +#endif + pme->ndecompdim = 1; + pme->nodeid_major = 0; + pme->nodeid_minor = pme->nodeid; + } + else + { + if (pme->nnodes % nnodes_major != 0) + { + gmx_incons("For 2D PME decomposition, #PME nodes must be divisible by the number of nodes in the major dimension"); + } + pme->ndecompdim = 2; + +#ifdef GMX_MPI + MPI_Comm_split(pme->mpi_comm,pme->nodeid % nnodes_minor, + pme->nodeid,&pme->mpi_comm_d[0]); /* My communicator along major dimension */ + MPI_Comm_split(pme->mpi_comm,pme->nodeid/nnodes_minor, + pme->nodeid,&pme->mpi_comm_d[1]); /* My communicator along minor dimension */ + + MPI_Comm_rank(pme->mpi_comm_d[0],&pme->nodeid_major); + MPI_Comm_size(pme->mpi_comm_d[0],&pme->nnodes_major); + MPI_Comm_rank(pme->mpi_comm_d[1],&pme->nodeid_minor); + MPI_Comm_size(pme->mpi_comm_d[1],&pme->nnodes_minor); +#endif + } + pme->bPPnode = (cr->duty & DUTY_PP); + } + + pme->nthread = nthread; + + if (ir->ePBC == epbcSCREW) + { + gmx_fatal(FARGS,"pme does not (yet) work with pbc = screw"); + } + + pme->bFEP = ((ir->efep != efepNO) && bFreeEnergy); + pme->nkx = ir->nkx; + pme->nky = ir->nky; + pme->nkz = ir->nkz; + pme->bP3M = (ir->coulombtype == eelP3M_AD || getenv("GMX_PME_P3M") != NULL); + pme->pme_order = ir->pme_order; + pme->epsilon_r = ir->epsilon_r; + + if (pme->pme_order > PME_ORDER_MAX) + { + gmx_fatal(FARGS,"pme_order (%d) is larger than the maximum allowed value (%d). Modify and recompile the code if you really need such a high order.", + pme->pme_order,PME_ORDER_MAX); + } + + /* Currently pme.c supports only the fft5d FFT code. + * Therefore the grid always needs to be divisible by nnodes. + * When the old 1D code is also supported again, change this check. + * + * This check should be done before calling gmx_pme_init + * and fplog should be passed iso stderr. 
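+ * (Editor's illustration, not part of this change: the disabled check
+ * rounds the grid up via nk_new = nnodes*(*nk/nnodes + 1); e.g. a grid
+ * of nk = 125 on 8 PME nodes would become nk_new = 8*(15+1) = 128,
+ * an increase below the 50% limit, so it would only print the NOTE
+ * rather than call gmx_fatal.)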
+ * + if (pme->ndecompdim >= 2) + */ + if (pme->ndecompdim >= 1) + { + /* + gmx_pme_check_grid_restrictions(pme->nodeid==0 ? stderr : NULL, + 'x',nnodes_major,&pme->nkx); + gmx_pme_check_grid_restrictions(pme->nodeid==0 ? stderr : NULL, + 'y',nnodes_minor,&pme->nky); + */ + } + + if (pme->nkx <= pme->pme_order*(pme->nnodes_major > 1 ? 2 : 1) || + pme->nky <= pme->pme_order*(pme->nnodes_minor > 1 ? 2 : 1) || + pme->nkz <= pme->pme_order) + { + gmx_fatal(FARGS,"The pme grid dimensions need to be larger than pme_order (%d) and in parallel larger than 2*pme_ordern for x and/or y",pme->pme_order); + } + + if (pme->nnodes > 1) { + double imbal; + +#ifdef GMX_MPI + MPI_Type_contiguous(DIM, mpi_type, &(pme->rvec_mpi)); + MPI_Type_commit(&(pme->rvec_mpi)); +#endif + + /* Note that the charge spreading and force gathering, which usually + * takes about the same amount of time as FFT+solve_pme, + * is always fully load balanced + * (unless the charge distribution is inhomogeneous). + */ + + imbal = pme_load_imbalance(pme); + if (imbal >= 1.2 && pme->nodeid_major == 0 && pme->nodeid_minor == 0) + { + fprintf(stderr, + "\n" + "NOTE: The load imbalance in PME FFT and solve is %d%%.\n" + " For optimal PME load balancing\n" + " PME grid_x (%d) and grid_y (%d) should be divisible by #PME_nodes_x (%d)\n" + " and PME grid_y (%d) and grid_z (%d) should be divisible by #PME_nodes_y (%d)\n" + "\n", + (int)((imbal-1)*100 + 0.5), + pme->nkx,pme->nky,pme->nnodes_major, + pme->nky,pme->nkz,pme->nnodes_minor); + } + } + + /* For non-divisible grid we need pme_order iso pme_order-1 */ + /* In sum_qgrid_dd x overlap is copied in place: take padding into account. + * y is always copied through a buffer: we don't need padding in z, + * but we do need the overlap in x because of the communication order. + */ + init_overlap_comm(&pme->overlap[0],pme->pme_order, +#ifdef GMX_MPI + pme->mpi_comm_d[0], +#endif + pme->nnodes_major,pme->nodeid_major, + pme->nkx, + (div_round_up(pme->nky,pme->nnodes_minor)+pme->pme_order)*(pme->nkz+pme->pme_order-1)); + + init_overlap_comm(&pme->overlap[1],pme->pme_order, +#ifdef GMX_MPI + pme->mpi_comm_d[1], +#endif + pme->nnodes_minor,pme->nodeid_minor, + pme->nky, + (div_round_up(pme->nkx,pme->nnodes_major)+pme->pme_order)*pme->nkz); + + /* Check for a limitation of the (current) sum_fftgrid_dd code */ + if (pme->nthread > 1 && + (pme->overlap[0].noverlap_nodes > 1 || + pme->overlap[1].noverlap_nodes > 1)) + { + gmx_fatal(FARGS,"With threads the number of grid lines per node along x and or y should be pme_order (%d) or more or exactly pme_order-1",pme->pme_order); + } + + snew(pme->bsp_mod[XX],pme->nkx); + snew(pme->bsp_mod[YY],pme->nky); + snew(pme->bsp_mod[ZZ],pme->nkz); + + /* The required size of the interpolation grid, including overlap. + * The allocated size (pmegrid_n?) might be slightly larger. 
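+ * (Editor's example, not in the original code: with nkx = 64 split over
+ * 4 major nodes and pme_order = 4, init_overlap_comm gives
+ * s2g0 = {0,16,32,48,64} and s2g1[i] = s2g0[i+1] + 3, so each node ends
+ * up with pmegrid_nx = 19 local grid lines, and pmegrid_nz becomes
+ * nkz + pme_order - 1 before alignment.)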
+ */ + pme->pmegrid_nx = pme->overlap[0].s2g1[pme->nodeid_major] - + pme->overlap[0].s2g0[pme->nodeid_major]; + pme->pmegrid_ny = pme->overlap[1].s2g1[pme->nodeid_minor] - + pme->overlap[1].s2g0[pme->nodeid_minor]; + pme->pmegrid_nz_base = pme->nkz; + pme->pmegrid_nz = pme->pmegrid_nz_base + pme->pme_order - 1; + set_grid_alignment(&pme->pmegrid_nz,pme->pme_order); + + pme->pmegrid_start_ix = pme->overlap[0].s2g0[pme->nodeid_major]; + pme->pmegrid_start_iy = pme->overlap[1].s2g0[pme->nodeid_minor]; + pme->pmegrid_start_iz = 0; + + make_gridindex5_to_localindex(pme->nkx, + pme->pmegrid_start_ix, + pme->pmegrid_nx - (pme->pme_order-1), + &pme->nnx,&pme->fshx); + make_gridindex5_to_localindex(pme->nky, + pme->pmegrid_start_iy, + pme->pmegrid_ny - (pme->pme_order-1), + &pme->nny,&pme->fshy); + make_gridindex5_to_localindex(pme->nkz, + pme->pmegrid_start_iz, + pme->pmegrid_nz_base, + &pme->nnz,&pme->fshz); + + pmegrids_init(&pme->pmegridA, + pme->pmegrid_nx,pme->pmegrid_ny,pme->pmegrid_nz, + pme->pmegrid_nz_base, + pme->pme_order, + pme->nthread, + pme->overlap[0].s2g1[pme->nodeid_major]-pme->overlap[0].s2g0[pme->nodeid_major+1], + pme->overlap[1].s2g1[pme->nodeid_minor]-pme->overlap[1].s2g0[pme->nodeid_minor+1]); + - sse_mask_init(&pme->spline_work,pme->pme_order); ++ pme->spline_work = make_pme_spline_work(pme->pme_order); + + ndata[0] = pme->nkx; + ndata[1] = pme->nky; + ndata[2] = pme->nkz; + + /* This routine will allocate the grid data to fit the FFTs */ + gmx_parallel_3dfft_init(&pme->pfft_setupA,ndata, + &pme->fftgridA,&pme->cfftgridA, + pme->mpi_comm_d, + pme->overlap[0].s2g0,pme->overlap[1].s2g0, + bReproducible,pme->nthread); + + if (bFreeEnergy) + { + pmegrids_init(&pme->pmegridB, + pme->pmegrid_nx,pme->pmegrid_ny,pme->pmegrid_nz, + pme->pmegrid_nz_base, + pme->pme_order, + pme->nthread, + pme->nkx % pme->nnodes_major != 0, + pme->nky % pme->nnodes_minor != 0); + + gmx_parallel_3dfft_init(&pme->pfft_setupB,ndata, + &pme->fftgridB,&pme->cfftgridB, + pme->mpi_comm_d, + pme->overlap[0].s2g0,pme->overlap[1].s2g0, + bReproducible,pme->nthread); + } + else + { + pme->pmegridB.grid.grid = NULL; + pme->fftgridB = NULL; + pme->cfftgridB = NULL; + } + + if (!pme->bP3M) + { + /* Use plain SPME B-spline interpolation */ + make_bspline_moduli(pme->bsp_mod,pme->nkx,pme->nky,pme->nkz,pme->pme_order); + } + else + { + /* Use the P3M grid-optimized influence function */ + make_p3m_bspline_moduli(pme->bsp_mod,pme->nkx,pme->nky,pme->nkz,pme->pme_order); + } + + /* Use atc[0] for spreading */ + init_atomcomm(pme,&pme->atc[0],cr,nnodes_major > 1 ? 
0 : 1,TRUE); + if (pme->ndecompdim >= 2) + { + init_atomcomm(pme,&pme->atc[1],cr,1,FALSE); + } + + if (pme->nnodes == 1) { + pme->atc[0].n = homenr; + pme_realloc_atomcomm_things(&pme->atc[0]); + } + + { + int thread; + + /* Use fft5d, order after FFT is y major, z, x minor */ + + snew(pme->work,pme->nthread); + for(thread=0; threadnthread; thread++) + { + realloc_work(&pme->work[thread],pme->nkx); + } + } + + *pmedata = pme; + + return 0; +} + + +static void copy_local_grid(gmx_pme_t pme, + pmegrids_t *pmegrids,int thread,real *fftgrid) +{ + ivec local_fft_ndata,local_fft_offset,local_fft_size; + int fft_my,fft_mz; + int nsx,nsy,nsz; + ivec nf; + int offx,offy,offz,x,y,z,i0,i0t; + int d; + pmegrid_t *pmegrid; + real *grid_th; + + gmx_parallel_3dfft_real_limits(pme->pfft_setupA, + local_fft_ndata, + local_fft_offset, + local_fft_size); + fft_my = local_fft_size[YY]; + fft_mz = local_fft_size[ZZ]; + + pmegrid = &pmegrids->grid_th[thread]; + + nsx = pmegrid->n[XX]; + nsy = pmegrid->n[YY]; + nsz = pmegrid->n[ZZ]; + + for(d=0; dn[d] - (pmegrid->order - 1), + local_fft_ndata[d] - pmegrid->offset[d]); + } + + offx = pmegrid->offset[XX]; + offy = pmegrid->offset[YY]; + offz = pmegrid->offset[ZZ]; + + /* Directly copy the non-overlapping parts of the local grids. + * This also initializes the full grid. + */ + grid_th = pmegrid->grid; + for(x=0; xpfft_setupA, + local_fft_ndata, + local_fft_offset, + local_fft_size); + /* Major dimension */ + overlap = &pme->overlap[0]; + + nind = overlap->comm_data[0].send_nindex; + + for(y=0; ypfft_setupA, + local_fft_ndata, + local_fft_offset, + local_fft_size); + fft_nx = local_fft_ndata[XX]; + fft_ny = local_fft_ndata[YY]; + fft_nz = local_fft_ndata[ZZ]; + + fft_my = local_fft_size[YY]; + fft_mz = local_fft_size[ZZ]; + + /* This routine is called when all thread have finished spreading. + * Here each thread sums grid contributions calculated by other threads + * to the thread local grid volume. + * To minimize the number of grid copying operations, + * this routines sums immediately from the pmegrid to the fftgrid. + */ + + /* Determine which part of the full node grid we should operate on, + * this is our thread local part of the full grid. + */ + pmegrid = &pmegrids->grid_th[thread]; + + for(d=0; doffset[d]+pmegrid->n[d]-(pmegrid->order-1), + local_fft_ndata[d]); + } + + offx = pmegrid->offset[XX]; + offy = pmegrid->offset[YY]; + offz = pmegrid->offset[ZZ]; + + + bClearBufX = TRUE; + bClearBufY = TRUE; + bClearBufXY = TRUE; + + /* Now loop over all the thread data blocks that contribute + * to the grid region we (our thread) are operating on. + */ + /* Note that ffy_nx/y is equal to the number of grid points + * between the first point of our node grid and the one of the next node. 
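+ * (Editor's note, illustrative only: in the sx/sy/sz loops below a
+ * negative offset wraps to the last thread block; with nc[XX] = 2 and
+ * ci[XX] = 0, sx = -1 gives fx = 1 and ox -= fft_nx, and bCommX is set
+ * only when the wrap crosses a node boundary, i.e. when
+ * pme->nnodes_major > 1.)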
+ */ + for(sx=0; sx>=-pmegrids->nthread_comm[XX]; sx--) + { + fx = pmegrid->ci[XX] + sx; + ox = 0; + bCommX = FALSE; + if (fx < 0) { + fx += pmegrids->nc[XX]; + ox -= fft_nx; + bCommX = (pme->nnodes_major > 1); + } + pmegrid_g = &pmegrids->grid_th[fx*pmegrids->nc[YY]*pmegrids->nc[ZZ]]; + ox += pmegrid_g->offset[XX]; + if (!bCommX) + { + tx1 = min(ox + pmegrid_g->n[XX],ne[XX]); + } + else + { + tx1 = min(ox + pmegrid_g->n[XX],pme->pme_order); + } + + for(sy=0; sy>=-pmegrids->nthread_comm[YY]; sy--) + { + fy = pmegrid->ci[YY] + sy; + oy = 0; + bCommY = FALSE; + if (fy < 0) { + fy += pmegrids->nc[YY]; + oy -= fft_ny; + bCommY = (pme->nnodes_minor > 1); + } + pmegrid_g = &pmegrids->grid_th[fy*pmegrids->nc[ZZ]]; + oy += pmegrid_g->offset[YY]; + if (!bCommY) + { + ty1 = min(oy + pmegrid_g->n[YY],ne[YY]); + } + else + { + ty1 = min(oy + pmegrid_g->n[YY],pme->pme_order); + } + + for(sz=0; sz>=-pmegrids->nthread_comm[ZZ]; sz--) + { + fz = pmegrid->ci[ZZ] + sz; + oz = 0; + if (fz < 0) + { + fz += pmegrids->nc[ZZ]; + oz -= fft_nz; + } + pmegrid_g = &pmegrids->grid_th[fz]; + oz += pmegrid_g->offset[ZZ]; + tz1 = min(oz + pmegrid_g->n[ZZ],ne[ZZ]); + + if (sx == 0 && sy == 0 && sz == 0) + { + /* We have already added our local contribution + * before calling this routine, so skip it here. + */ + continue; + } + + thread_f = (fx*pmegrids->nc[YY] + fy)*pmegrids->nc[ZZ] + fz; + + pmegrid_f = &pmegrids->grid_th[thread_f]; + + grid_th = pmegrid_f->grid; + + nsx = pmegrid_f->n[XX]; + nsy = pmegrid_f->n[YY]; + nsz = pmegrid_f->n[ZZ]; + +#ifdef DEBUG_PME_REDUCE + printf("n%d t%d add %d %2d %2d %2d %2d %2d %2d %2d-%2d %2d-%2d, %2d-%2d %2d-%2d, %2d-%2d %2d-%2d\n", + pme->nodeid,thread,thread_f, + pme->pmegrid_start_ix, + pme->pmegrid_start_iy, + pme->pmegrid_start_iz, + sx,sy,sz, + offx-ox,tx1-ox,offx,tx1, + offy-oy,ty1-oy,offy,ty1, + offz-oz,tz1-oz,offz,tz1); +#endif + + if (!(bCommX || bCommY)) + { + /* Copy from the thread local grid to the node grid */ + for(x=offx; xpfft_setupA, + local_fft_ndata, + local_fft_offset, + local_fft_size); + + /* Currently supports only a single communication pulse */ + +/* for(ipulse=0;ipulsenoverlap_nodes;ipulse++) */ + if (pme->nnodes_minor > 1) + { + /* Major dimension */ + overlap = &pme->overlap[1]; + + if (pme->nnodes_major > 1) + { + size_yx = pme->overlap[0].comm_data[0].send_nindex; + } + else + { + size_yx = 0; + } + datasize = (local_fft_ndata[XX]+size_yx)*local_fft_ndata[ZZ]; + + ipulse = 0; + + send_id = overlap->send_id[ipulse]; + recv_id = overlap->recv_id[ipulse]; + send_nindex = overlap->comm_data[ipulse].send_nindex; + /* recv_index0 = overlap->comm_data[ipulse].recv_index0; */ + recv_index0 = 0; + recv_nindex = overlap->comm_data[ipulse].recv_nindex; + + sendptr = overlap->sendbuf; + recvptr = overlap->recvbuf; + + /* + printf("node %d comm %2d x %2d x %2d\n",pme->nodeid, + local_fft_ndata[XX]+size_yx,send_nindex,local_fft_ndata[ZZ]); + printf("node %d send %f, %f\n",pme->nodeid, + sendptr[0],sendptr[send_nindex*datasize-1]); + */ + +#ifdef GMX_MPI + MPI_Sendrecv(sendptr,send_nindex*datasize,GMX_MPI_REAL, + send_id,ipulse, + recvptr,recv_nindex*datasize,GMX_MPI_REAL, + recv_id,ipulse, + overlap->mpi_comm,&stat); +#endif + + for(x=0; xnnodes_major > 1) + { + sendptr = pme->overlap[0].sendbuf; + for(x=0; xnoverlap_nodes;ipulse++) */ + if (pme->nnodes_major > 1) + { + /* Major dimension */ + overlap = &pme->overlap[0]; + + datasize = local_fft_ndata[YY]*local_fft_ndata[ZZ]; + gridsize = local_fft_size[YY] *local_fft_size[ZZ]; + + ipulse = 0; + + send_id = 
overlap->send_id[ipulse]; + recv_id = overlap->recv_id[ipulse]; + send_nindex = overlap->comm_data[ipulse].send_nindex; + /* recv_index0 = overlap->comm_data[ipulse].recv_index0; */ + recv_index0 = 0; + recv_nindex = overlap->comm_data[ipulse].recv_nindex; + + sendptr = overlap->sendbuf; + recvptr = overlap->recvbuf; + + if (debug != NULL) + { + fprintf(debug,"PME fftgrid comm %2d x %2d x %2d\n", + send_nindex,local_fft_ndata[YY],local_fft_ndata[ZZ]); + } + +#ifdef GMX_MPI + MPI_Sendrecv(sendptr,send_nindex*datasize,GMX_MPI_REAL, + send_id,ipulse, + recvptr,recv_nindex*datasize,GMX_MPI_REAL, + recv_id,ipulse, + overlap->mpi_comm,&stat); +#endif + + for(x=0; xnthread; + assert(nthread>0); + +#ifdef PME_TIME_THREADS + c1 = omp_cyc_start(); +#endif + if (bCalcSplines) + { +#pragma omp parallel for num_threads(nthread) schedule(static) + for(thread=0; threadn* thread /nthread; + end = atc->n*(thread+1)/nthread; + + /* Compute fftgrid index for all atoms, + * with help of some extra variables. + */ + calc_interpolation_idx(pme,atc,start,end,thread); + } + } +#ifdef PME_TIME_THREADS + c1 = omp_cyc_end(c1); + cs1 += (double)c1; +#endif + +#ifdef PME_TIME_THREADS + c2 = omp_cyc_start(); +#endif +#pragma omp parallel for num_threads(nthread) schedule(static) + for(thread=0; threadnthread == 1) + { + spline = &atc->spline[0]; + + spline->n = atc->n; + + grid = &grids->grid; + } + else + { + spline = &atc->spline[thread]; + + make_thread_local_ind(atc,thread,spline); + + grid = &grids->grid_th[thread]; + } + + if (bCalcSplines) + { + make_bsplines(spline->theta,spline->dtheta,pme->pme_order, + atc->fractx,spline->n,spline->ind,atc->q,pme->bFEP); + } + + if (bSpread) + { + /* put local atoms on grid. */ +#ifdef PME_TIME_SPREAD + ct1a = omp_cyc_start(); +#endif - spread_q_bsplines_thread(grid,atc,spline,&pme->spline_work); ++ spread_q_bsplines_thread(grid,atc,spline,pme->spline_work); + + if (grids->nthread > 1) + { + copy_local_grid(pme,grids,thread,fftgrid); + } +#ifdef PME_TIME_SPREAD + ct1a = omp_cyc_end(ct1a); + cs1a[thread] += (double)ct1a; +#endif + } + } +#ifdef PME_TIME_THREADS + c2 = omp_cyc_end(c2); + cs2 += (double)c2; +#endif + + if (bSpread && grids->nthread > 1) + { +#ifdef PME_TIME_THREADS + c3 = omp_cyc_start(); +#endif +#pragma omp parallel for num_threads(grids->nthread) schedule(static) + for(thread=0; threadnthread; thread++) + { + reduce_threadgrid_overlap(pme,grids,thread, + fftgrid, + pme->overlap[0].sendbuf, + pme->overlap[1].sendbuf); +#ifdef PRINT_PME_SENDBUF + print_sendbuf(pme,pme->overlap[0].sendbuf); +#endif + } +#ifdef PME_TIME_THREADS + c3 = omp_cyc_end(c3); + cs3 += (double)c3; +#endif + + if (pme->nnodes > 1) + { + /* Communicate the overlapping part of the fftgrid */ + sum_fftgrid_dd(pme,fftgrid); + } + } + +#ifdef PME_TIME_THREADS + cnt++; + if (cnt % 20 == 0) + { + printf("idx %.2f spread %.2f red %.2f", + cs1*1e-9,cs2*1e-9,cs3*1e-9); +#ifdef PME_TIME_SPREAD + for(thread=0; threadpfft_setupA, + local_fft_ndata, + local_fft_offset, + local_fft_size); + + dump_grid(stderr, + pme->pmegrid_start_ix, + pme->pmegrid_start_iy, + pme->pmegrid_start_iz, + pme->pmegrid_nx-pme->pme_order+1, + pme->pmegrid_ny-pme->pme_order+1, + pme->pmegrid_nz-pme->pme_order+1, + local_fft_size[YY], + local_fft_size[ZZ], + fftgrid); +} + + +void gmx_pme_calc_energy(gmx_pme_t pme,int n,rvec *x,real *q,real *V) +{ + pme_atomcomm_t *atc; + pmegrids_t *grid; + + if (pme->nnodes > 1) + { + gmx_incons("gmx_pme_calc_energy called in parallel"); + } + if (pme->bFEP > 1) + { + 
gmx_incons("gmx_pme_calc_energy with free energy"); + } + + atc = &pme->atc_energy; + atc->nthread = 1; + if (atc->spline == NULL) + { + snew(atc->spline,atc->nthread); + } + atc->nslab = 1; + atc->bSpread = TRUE; + atc->pme_order = pme->pme_order; + atc->n = n; + pme_realloc_atomcomm_things(atc); + atc->x = x; + atc->q = q; + + /* We only use the A-charges grid */ + grid = &pme->pmegridA; + + spread_on_grid(pme,atc,NULL,TRUE,FALSE,pme->fftgridA); + + *V = gather_energy_bsplines(pme,grid->grid.grid,atc); +} + + +static void reset_pmeonly_counters(t_commrec *cr,gmx_wallcycle_t wcycle, + t_nrnb *nrnb,t_inputrec *ir, gmx_large_int_t step_rel) +{ + /* Reset all the counters related to performance over the run */ + wallcycle_stop(wcycle,ewcRUN); + wallcycle_reset_all(wcycle); + init_nrnb(nrnb); + ir->init_step += step_rel; + ir->nsteps -= step_rel; + wallcycle_start(wcycle,ewcRUN); +} + + +int gmx_pmeonly(gmx_pme_t pme, + t_commrec *cr, t_nrnb *nrnb, + gmx_wallcycle_t wcycle, + real ewaldcoeff, gmx_bool bGatherOnly, + t_inputrec *ir) +{ + gmx_pme_pp_t pme_pp; + int natoms; + matrix box; + rvec *x_pp=NULL,*f_pp=NULL; + real *chargeA=NULL,*chargeB=NULL; + real lambda=0; + int maxshift_x=0,maxshift_y=0; + real energy,dvdlambda; + matrix vir; + float cycles; + int count; + gmx_bool bEnerVir; + gmx_large_int_t step,step_rel; + + + pme_pp = gmx_pme_pp_init(cr); + + init_nrnb(nrnb); + + count = 0; + do /****** this is a quasi-loop over time steps! */ + { + /* Domain decomposition */ + natoms = gmx_pme_recv_q_x(pme_pp, + &chargeA,&chargeB,box,&x_pp,&f_pp, + &maxshift_x,&maxshift_y, + &pme->bFEP,&lambda, + &bEnerVir, + &step); + + if (natoms == -1) { + /* We should stop: break out of the loop */ + break; + } + + step_rel = step - ir->init_step; + + if (count == 0) + wallcycle_start(wcycle,ewcRUN); + + wallcycle_start(wcycle,ewcPMEMESH); + + dvdlambda = 0; + clear_mat(vir); + gmx_pme_do(pme,0,natoms,x_pp,f_pp,chargeA,chargeB,box, + cr,maxshift_x,maxshift_y,nrnb,wcycle,vir,ewaldcoeff, + &energy,lambda,&dvdlambda, + GMX_PME_DO_ALL_F | (bEnerVir ? 
GMX_PME_CALC_ENER_VIR : 0)); + + cycles = wallcycle_stop(wcycle,ewcPMEMESH); + + gmx_pme_send_force_vir_ener(pme_pp, + f_pp,vir,energy,dvdlambda, + cycles); + + count++; + + if (step_rel == wcycle_get_reset_counters(wcycle)) + { + /* Reset all the counters related to performance over the run */ + reset_pmeonly_counters(cr,wcycle,nrnb,ir,step_rel); + wcycle_set_reset_counters(wcycle, 0); + } + + } /***** end of quasi-loop, we stop with the break above */ + while (TRUE); + + return 0; +} + +int gmx_pme_do(gmx_pme_t pme, + int start, int homenr, + rvec x[], rvec f[], + real *chargeA, real *chargeB, + matrix box, t_commrec *cr, + int maxshift_x, int maxshift_y, + t_nrnb *nrnb, gmx_wallcycle_t wcycle, + matrix vir, real ewaldcoeff, + real *energy, real lambda, + real *dvdlambda, int flags) +{ + int q,d,i,j,ntot,npme; + int nx,ny,nz; + int n_d,local_ny; + pme_atomcomm_t *atc=NULL; + pmegrids_t *pmegrid=NULL; + real *grid=NULL; + real *ptr; + rvec *x_d,*f_d; + real *charge=NULL,*q_d; + real energy_AB[2]; + matrix vir_AB[2]; + gmx_bool bClearF; + gmx_parallel_3dfft_t pfft_setup; + real * fftgrid; + t_complex * cfftgrid; + int thread; + const gmx_bool bCalcEnerVir = flags & GMX_PME_CALC_ENER_VIR; + const gmx_bool bCalcF = flags & GMX_PME_CALC_F; + + assert(pme->nnodes > 0); + assert(pme->nnodes == 1 || pme->ndecompdim > 0); + + if (pme->nnodes > 1) { + atc = &pme->atc[0]; + atc->npd = homenr; + if (atc->npd > atc->pd_nalloc) { + atc->pd_nalloc = over_alloc_dd(atc->npd); + srenew(atc->pd,atc->pd_nalloc); + } + atc->maxshift = (atc->dimind==0 ? maxshift_x : maxshift_y); + } + else + { + /* This could be necessary for TPI */ + pme->atc[0].n = homenr; + } + + for(q=0; q<(pme->bFEP ? 2 : 1); q++) { + if (q == 0) { + pmegrid = &pme->pmegridA; + fftgrid = pme->fftgridA; + cfftgrid = pme->cfftgridA; + pfft_setup = pme->pfft_setupA; + charge = chargeA+start; + } else { + pmegrid = &pme->pmegridB; + fftgrid = pme->fftgridB; + cfftgrid = pme->cfftgridB; + pfft_setup = pme->pfft_setupB; + charge = chargeB+start; + } + grid = pmegrid->grid.grid; + /* Unpack structure */ + if (debug) { + fprintf(debug,"PME: nnodes = %d, nodeid = %d\n", + cr->nnodes,cr->nodeid); + fprintf(debug,"Grid = %p\n",(void*)grid); + if (grid == NULL) + gmx_fatal(FARGS,"No grid!"); + } + where(); + + m_inv_ur0(box,pme->recipbox); + + if (pme->nnodes == 1) { + atc = &pme->atc[0]; + if (DOMAINDECOMP(cr)) { + atc->n = homenr; + pme_realloc_atomcomm_things(atc); + } + atc->x = x; + atc->q = charge; + atc->f = f; + } else { + wallcycle_start(wcycle,ewcPME_REDISTXF); + for(d=pme->ndecompdim-1; d>=0; d--) + { + if (d == pme->ndecompdim-1) + { + n_d = homenr; + x_d = x + start; + q_d = charge; + } + else + { + n_d = pme->atc[d+1].n; + x_d = atc->x; + q_d = atc->q; + } + atc = &pme->atc[d]; + atc->npd = n_d; + if (atc->npd > atc->pd_nalloc) { + atc->pd_nalloc = over_alloc_dd(atc->npd); + srenew(atc->pd,atc->pd_nalloc); + } + atc->maxshift = (atc->dimind==0 ? 
maxshift_x : maxshift_y); + pme_calc_pidx_wrapper(n_d,pme->recipbox,x_d,atc); + where(); + + /* Redistribute x (only once) and qA or qB */ + if (DOMAINDECOMP(cr)) { + dd_pmeredist_x_q(pme, n_d, q==0, x_d, q_d, atc); + } else { + pmeredist_pd(pme, TRUE, n_d, q==0, x_d, q_d, atc); + } + } + where(); + + wallcycle_stop(wcycle,ewcPME_REDISTXF); + } + + if (debug) + fprintf(debug,"Node= %6d, pme local particles=%6d\n", + cr->nodeid,atc->n); + + if (flags & GMX_PME_SPREAD_Q) + { + wallcycle_start(wcycle,ewcPME_SPREADGATHER); + + /* Spread the charges on a grid */ + spread_on_grid(pme,&pme->atc[0],pmegrid,q==0,TRUE,fftgrid); + + if (q == 0) + { + inc_nrnb(nrnb,eNR_WEIGHTS,DIM*atc->n); + } + inc_nrnb(nrnb,eNR_SPREADQBSP, + pme->pme_order*pme->pme_order*pme->pme_order*atc->n); + + if (pme->nthread == 1) + { + wrap_periodic_pmegrid(pme,grid); + + /* sum contributions to local grid from other nodes */ +#ifdef GMX_MPI + if (pme->nnodes > 1) + { + gmx_sum_qgrid_dd(pme,grid,GMX_SUM_QGRID_FORWARD); + where(); + } +#endif + + copy_pmegrid_to_fftgrid(pme,grid,fftgrid); + } + + wallcycle_stop(wcycle,ewcPME_SPREADGATHER); + + /* + dump_local_fftgrid(pme,fftgrid); + exit(0); + */ + } + + /* Here we start a large thread parallel region */ +#pragma omp parallel for num_threads(pme->nthread) schedule(static) + for(thread=0; threadnthread; thread++) + { + if (flags & GMX_PME_SOLVE) + { + int loop_count; + + /* do 3d-fft */ + if (thread == 0) + { + wallcycle_start(wcycle,ewcPME_FFT); + } + gmx_parallel_3dfft_execute(pfft_setup,GMX_FFT_REAL_TO_COMPLEX, + fftgrid,cfftgrid,thread,wcycle); + if (thread == 0) + { + wallcycle_stop(wcycle,ewcPME_FFT); + } + where(); + + /* solve in k-space for our local cells */ + if (thread == 0) + { + wallcycle_start(wcycle,ewcPME_SOLVE); + } + loop_count = + solve_pme_yzx(pme,cfftgrid,ewaldcoeff, + box[XX][XX]*box[YY][YY]*box[ZZ][ZZ], + bCalcEnerVir, + pme->nthread,thread); + if (thread == 0) + { + wallcycle_stop(wcycle,ewcPME_SOLVE); + where(); + inc_nrnb(nrnb,eNR_SOLVEPME,loop_count); + } + } + + if (bCalcF) + { + /* do 3d-invfft */ + if (thread == 0) + { + where(); + wallcycle_start(wcycle,ewcPME_FFT); + } + gmx_parallel_3dfft_execute(pfft_setup,GMX_FFT_COMPLEX_TO_REAL, + cfftgrid,fftgrid,thread,wcycle); + if (thread == 0) + { + wallcycle_stop(wcycle,ewcPME_FFT); + + where(); + + if (pme->nodeid == 0) + { + ntot = pme->nkx*pme->nky*pme->nkz; + npme = ntot*log((real)ntot)/log(2.0); + inc_nrnb(nrnb,eNR_FFT,2*npme); + } + + wallcycle_start(wcycle,ewcPME_SPREADGATHER); + } + + copy_fftgrid_to_pmegrid(pme,fftgrid,grid,pme->nthread,thread); + } + } + /* End of thread parallel section. + * With MPI we have to synchronize here before gmx_sum_qgrid_dd. + */ + + if (bCalcF) + { + /* distribute local grid to all nodes */ +#ifdef GMX_MPI + if (pme->nnodes > 1) { + gmx_sum_qgrid_dd(pme,grid,GMX_SUM_QGRID_BACKWARD); + } +#endif + where(); + + unwrap_periodic_pmegrid(pme,grid); + + /* interpolate forces for our local atoms */ + + where(); + + /* If we are running without parallelization, + * atc->f is the actual force array, not a buffer, + * therefore we should not clear it. + */ + bClearF = (q == 0 && PAR(cr)); +#pragma omp parallel for num_threads(pme->nthread) schedule(static) + for(thread=0; threadnthread; thread++) + { + gather_f_bsplines(pme,grid,bClearF,atc, + &atc->spline[thread], + pme->bFEP ? (q==0 ? 
1.0-lambda : lambda) : 1.0); + } + + where(); + + inc_nrnb(nrnb,eNR_GATHERFBSP, + pme->pme_order*pme->pme_order*pme->pme_order*pme->atc[0].n); + wallcycle_stop(wcycle,ewcPME_SPREADGATHER); + } + + if (bCalcEnerVir) + { + /* This should only be called on the master thread + * and after the threads have synchronized. + */ + get_pme_ener_vir(pme,pme->nthread,&energy_AB[q],vir_AB[q]); + } + } /* of q-loop */ + + if (bCalcF && pme->nnodes > 1) { + wallcycle_start(wcycle,ewcPME_REDISTXF); + for(d=0; dndecompdim; d++) + { + atc = &pme->atc[d]; + if (d == pme->ndecompdim - 1) + { + n_d = homenr; + f_d = f + start; + } + else + { + n_d = pme->atc[d+1].n; + f_d = pme->atc[d+1].f; + } + if (DOMAINDECOMP(cr)) { + dd_pmeredist_f(pme,atc,n_d,f_d, + d==pme->ndecompdim-1 && pme->bPPnode); + } else { + pmeredist_pd(pme, FALSE, n_d, TRUE, f_d, NULL, atc); + } + } + + wallcycle_stop(wcycle,ewcPME_REDISTXF); + } + where(); + + if (bCalcEnerVir) + { + if (!pme->bFEP) { + *energy = energy_AB[0]; + m_add(vir,vir_AB[0],vir); + } else { + *energy = (1.0-lambda)*energy_AB[0] + lambda*energy_AB[1]; + *dvdlambda += energy_AB[1] - energy_AB[0]; + for(i=0; i