From: Roland Schulz Date: Thu, 27 Jun 2013 07:57:35 +0000 (-0400) Subject: Merge branch 'release-4-6' into master X-Git-Url: http://biod.pnpi.spb.ru/gitweb/?a=commitdiff_plain;h=cc6daae88401ad68b564de821490da31d49331c2;p=alexxy%2Fgromacs.git Merge branch 'release-4-6' into master Conflicts: CMakeLists.txt cmake/ThreadMPI.cmake src/gromacs/gmxana/calcpot.c src/gromacs/gmxana/calcpot.h src/gromacs/legacyheaders/pull_rotation.h src/tools/CMakeLists.txt Resolution was straightforward; always in favour of version already in master branch. Removed calcpot.[ch]. Change-Id: I7ad7a6d9e34f30e04f71c52d707065c6e14b68f3 --- cc6daae88401ad68b564de821490da31d49331c2 diff --cc src/gromacs/gmxana/gmx_bar.c index 45ae72b224,0000000000..2fa8ecbde3 mode 100644,000000..100644 --- a/src/gromacs/gmxana/gmx_bar.c +++ b/src/gromacs/gmxana/gmx_bar.c @@@ -1,3953 -1,0 +1,3950 @@@ +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*- + * + * + * This source code is part of + * + * G R O M A C S + * + * GROningen MAchine for Chemical Simulations + * + * VERSION 3.2.0 + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others. + * Copyright (c) 1991-2000, University of Groningen, The Netherlands. + * Copyright (c) 2001-2004, The GROMACS development team, + * check out http://www.gromacs.org for more information. + + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * If you want to redistribute modifications, please consider that + * scientific software is very special. Version control is crucial - + * bugs must be traceable. We will be happy to consider code for + * inclusion in the official distribution, but derived work must not + * be called official GROMACS. Details are found in the README & COPYING + * files - if they are missing, get the official version at www.gromacs.org. + * + * To help us fund GROMACS development, we humbly ask that you cite + * the papers on the package - you can find them in the top README file. + * + * For more info, check our website at http://www.gromacs.org + * + * And Hey: + * Green Red Orange Magenta Azure Cyan Skyblue + */ +#ifdef HAVE_CONFIG_H +#include +#endif + +#include +#include +#include +#include +#include + +#include "sysstuff.h" +#include "typedefs.h" +#include "smalloc.h" +#include "futil.h" +#include "statutil.h" +#include "copyrite.h" +#include "macros.h" +#include "enxio.h" +#include "physics.h" +#include "gmx_fatal.h" +#include "xvgr.h" +#include "gmx_ana.h" +#include "maths.h" +#include "string2.h" +#include "names.h" +#include "mdebin.h" + + +/* Structure for the names of lambda vector components */ +typedef struct lambda_components_t +{ + char **names; /* Array of strings with names for the lambda vector + components */ + int N; /* The number of components */ + int Nalloc; /* The number of allocated components */ +} lambda_components_t; + +/* Structure for a lambda vector or a dhdl derivative direction */ +typedef struct lambda_vec_t +{ + double *val; /* The lambda vector component values. Only valid if + dhdl == -1 */ + int dhdl; /* The coordinate index for the derivative described by this + structure, or -1 */ + const lambda_components_t *lc; /* the associated lambda_components + structure */ + int index; /* The state number (init-lambda-state) of this lambda + vector, if known. 
If not, it is set to -1 */ +} lambda_vec_t; + +/* the dhdl.xvg data from a simulation */ +typedef struct xvg_t +{ + const char *filename; + int ftp; /* file type */ + int nset; /* number of lambdas, including dhdl */ + int *np; /* number of data points (du or hists) per lambda */ + int np_alloc; /* number of points (du or hists) allocated */ + double temp; /* temperature */ + lambda_vec_t *lambda; /* the lambdas (of first index for y). */ + double *t; /* the times (of second index for y) */ + double **y; /* the dU values. y[0] holds the derivative, while + further ones contain the energy differences between + the native lambda and the 'foreign' lambdas. */ + lambda_vec_t native_lambda; /* the native lambda */ + + struct xvg_t *next, *prev; /*location in the global linked list of xvg_ts*/ +} xvg_t; + + +typedef struct hist_t +{ + unsigned int *bin[2]; /* the (forward + reverse) histogram values */ + double dx[2]; /* the histogram spacing. The reverse + dx is the negative of the forward dx.*/ + gmx_large_int_t x0[2]; /* the (forward + reverse) histogram start + point(s) as int */ + + int nbin[2]; /* the (forward+reverse) number of bins */ + gmx_large_int_t sum; /* the total number of counts. Must be + the same for forward + reverse. */ + int nhist; /* number of hist datas (forward or reverse) */ + + double start_time, delta_time; /* start time, end time of histogram */ +} hist_t; + + +/* an aggregate of samples for partial free energy calculation */ +typedef struct samples_t +{ + lambda_vec_t *native_lambda; /* pointer to native lambda vector */ + lambda_vec_t *foreign_lambda; /* pointer to foreign lambda vector */ + double temp; /* the temperature */ + gmx_bool derivative; /* whether this sample is a derivative */ + + /* The samples come either as either delta U lists: */ + int ndu; /* the number of delta U samples */ + double *du; /* the delta u's */ + double *t; /* the times associated with those samples, or: */ + double start_time, delta_time; /*start time and delta time for linear time*/ + + /* or as histograms: */ + hist_t *hist; /* a histogram */ + + /* allocation data: (not NULL for data 'owned' by this struct) */ + double *du_alloc, *t_alloc; /* allocated delta u arrays */ + size_t ndu_alloc, nt_alloc; /* pre-allocated sizes */ + hist_t *hist_alloc; /* allocated hist */ + + gmx_large_int_t ntot; /* total number of samples */ + const char *filename; /* the file name this sample comes from */ +} samples_t; + +/* a sample range (start to end for du-style data, or boolean + for both du-style data and histograms */ +typedef struct sample_range_t +{ + int start, end; /* start and end index for du style data */ + gmx_bool use; /* whether to use this sample */ + + samples_t *s; /* the samples this range belongs to */ +} sample_range_t; + + +/* a collection of samples for a partial free energy calculation + (i.e. 
the collection of samples from one native lambda to one + foreign lambda) */ +typedef struct sample_coll_t +{ + lambda_vec_t *native_lambda; /* these should be the same for all samples + in the histogram */ + lambda_vec_t *foreign_lambda; /* collection */ + double temp; /* the temperature */ + + int nsamples; /* the number of samples */ + samples_t **s; /* the samples themselves */ + sample_range_t *r; /* the sample ranges */ + int nsamples_alloc; /* number of allocated samples */ + + gmx_large_int_t ntot; /* total number of samples in the ranges of + this collection */ + + struct sample_coll_t *next, *prev; /* next and previous in the list */ +} sample_coll_t; + +/* all the samples associated with a lambda point */ +typedef struct lambda_data_t +{ + lambda_vec_t *lambda; /* the native lambda (at start time if dynamic) */ + double temp; /* temperature */ + + sample_coll_t *sc; /* the samples */ + + sample_coll_t sc_head; /*the pre-allocated list head for the linked list.*/ + + struct lambda_data_t *next, *prev; /* the next and prev in the list */ +} lambda_data_t; + +/* Top-level data structure of simulation data */ +typedef struct sim_data_t +{ + lambda_data_t *lb; /* a lambda data linked list */ + lambda_data_t lb_head; /* The head element of the linked list */ + + lambda_components_t lc; /* the allowed components of the lambda + vectors */ +} sim_data_t; + +/* Top-level data structure with calculated values. */ +typedef struct { + sample_coll_t *a, *b; /* the simulation data */ + + double dg; /* the free energy difference */ + double dg_err; /* the free energy difference */ + + double dg_disc_err; /* discretization error */ + double dg_histrange_err; /* histogram range error */ + + double sa; /* relative entropy of b in state a */ + double sa_err; /* error in sa */ + double sb; /* relative entropy of a in state b */ + double sb_err; /* error in sb */ + + double dg_stddev; /* expected dg stddev per sample */ + double dg_stddev_err; /* error in dg_stddev */ +} barres_t; + + +/* Initialize a lambda_components structure */ +static void lambda_components_init(lambda_components_t *lc) +{ + lc->N = 0; + lc->Nalloc = 2; + snew(lc->names, lc->Nalloc); +} + +/* Add a component to a lambda_components structure */ +static void lambda_components_add(lambda_components_t *lc, + const char *name, size_t name_length) +{ + while (lc->N + 1 > lc->Nalloc) + { + lc->Nalloc = (lc->Nalloc == 0) ? 2 : 2*lc->Nalloc; + srealloc( lc->names, lc->Nalloc ); + } + snew(lc->names[lc->N], name_length+1); + strncpy(lc->names[lc->N], name, name_length); + lc->N++; +} + +/* check whether a component with index 'index' matches the given name, or + is also NULL. Returns TRUE if this is the case. 
+ the string name does not need to end */ +static gmx_bool lambda_components_check(const lambda_components_t *lc, + int index, + const char *name, + size_t name_length) +{ + size_t len; + if (index >= lc->N) + { + return FALSE; + } + if (name == NULL && lc->names[index] == NULL) + { + return TRUE; + } + if ((name == NULL) != (lc->names[index] == NULL)) + { + return FALSE; + } + len = strlen(lc->names[index]); + if (len != name_length) + { + return FALSE; + } + if (strncmp(lc->names[index], name, name_length) == 0) + { + return TRUE; + } + return FALSE; +} + +/* Find the index of a given lambda component name, or -1 if not found */ +static int lambda_components_find(const lambda_components_t *lc, + const char *name, + size_t name_length) +{ + int i; + + for (i = 0; i < lc->N; i++) + { + if (strncmp(lc->names[i], name, name_length) == 0) + { + return i; + } + } + return -1; +} + + + +/* initialize a lambda vector */ +static void lambda_vec_init(lambda_vec_t *lv, const lambda_components_t *lc) +{ + snew(lv->val, lc->N); + lv->index = -1; + lv->dhdl = -1; + lv->lc = lc; +} + +static void lambda_vec_destroy(lambda_vec_t *lv) +{ + sfree(lv->val); +} + +static void lambda_vec_copy(lambda_vec_t *lv, const lambda_vec_t *orig) +{ + int i; + + lambda_vec_init(lv, orig->lc); + lv->dhdl = orig->dhdl; + lv->index = orig->index; + for (i = 0; i < lv->lc->N; i++) + { + lv->val[i] = orig->val[i]; + } +} + +/* write a lambda vec to a preallocated string */ +static void lambda_vec_print(const lambda_vec_t *lv, char *str, gmx_bool named) +{ + int i; + size_t np; + + str[0] = 0; /* reset the string */ + if (lv->dhdl < 0) + { + if (named) + { + str += sprintf(str, "delta H to "); + } + if (lv->lc->N > 1) + { + str += sprintf(str, "("); + } + for (i = 0; i < lv->lc->N; i++) + { + str += sprintf(str, "%g", lv->val[i]); + if (i < lv->lc->N-1) + { + str += sprintf(str, ", "); + } + } + if (lv->lc->N > 1) + { + str += sprintf(str, ")"); + } + } + else + { + /* this lambda vector describes a derivative */ + str += sprintf(str, "dH/dl"); + if (strlen(lv->lc->names[lv->dhdl]) > 0) + { + str += sprintf(str, " (%s)", lv->lc->names[lv->dhdl]); + } + } +} + +/* write a shortened version of the lambda vec to a preallocated string */ +static void lambda_vec_print_short(const lambda_vec_t *lv, char *str) +{ + int i; + size_t np; + + if (lv->index >= 0) + { + sprintf(str, "%6d", lv->index); + } + else + { + if (lv->dhdl < 0) + { + sprintf(str, "%6.3f", lv->val[0]); + } + else + { + sprintf(str, "dH/dl[%d]", lv->dhdl); + } + } +} + +/* write an intermediate version of two lambda vecs to a preallocated string */ +static void lambda_vec_print_intermediate(const lambda_vec_t *a, + const lambda_vec_t *b, char *str) +{ + int i; + size_t np; + + str[0] = 0; + if ( (a->index >= 0) && (b->index >= 0) ) + { + sprintf(str, "%6.3f", ((double)a->index+(double)b->index)/2.); + } + else + { + if ( (a->dhdl < 0) && (b->dhdl < 0) ) + { + sprintf(str, "%6.3f", (a->val[0]+b->val[0])/2.); + } + } +} + + + +/* calculate the difference in lambda vectors: c = a-b. 
+ c must be initialized already, and a and b must describe non-derivative + lambda points */ +static void lambda_vec_diff(const lambda_vec_t *a, const lambda_vec_t *b, + lambda_vec_t *c) +{ + int i; + + if ( (a->dhdl > 0) || (b->dhdl > 0) ) + { + gmx_fatal(FARGS, + "Trying to calculate the difference between derivatives instead of lambda points"); + } + if ((a->lc != b->lc) || (a->lc != c->lc) ) + { + gmx_fatal(FARGS, + "Trying to calculate the difference lambdas with differing basis set"); + } + for (i = 0; i < a->lc->N; i++) + { + c->val[i] = a->val[i] - b->val[i]; + } +} + +/* calculate and return the absolute difference in lambda vectors: c = |a-b|. + a and b must describe non-derivative lambda points */ +static double lambda_vec_abs_diff(const lambda_vec_t *a, const lambda_vec_t *b) +{ + int i; + double ret = 0.; + + if ( (a->dhdl > 0) || (b->dhdl > 0) ) + { + gmx_fatal(FARGS, + "Trying to calculate the difference between derivatives instead of lambda points"); + } + if (a->lc != b->lc) + { + gmx_fatal(FARGS, + "Trying to calculate the difference lambdas with differing basis set"); + } + for (i = 0; i < a->lc->N; i++) + { + double df = a->val[i] - b->val[i]; + ret += df*df; + } + return sqrt(ret); +} + + +/* check whether two lambda vectors are the same */ +static gmx_bool lambda_vec_same(const lambda_vec_t *a, const lambda_vec_t *b) +{ + int i; + + if (a->lc != b->lc) + { + return FALSE; + } + if (a->dhdl < 0) + { + for (i = 0; i < a->lc->N; i++) + { + if (!gmx_within_tol(a->val[i], b->val[i], 10*GMX_REAL_EPS)) + { + return FALSE; + } + } + return TRUE; + } + else + { + /* they're derivatives, so we check whether the indices match */ + return (a->dhdl == b->dhdl); + } +} + +/* Compare the sort order of two foreign lambda vectors + + returns 1 if a is 'bigger' than b, + returns 0 if they're the same, + returns -1 if a is 'smaller' than b.*/ +static gmx_bool lambda_vec_cmp_foreign(const lambda_vec_t *a, + const lambda_vec_t *b) +{ + int i; + double norm_a = 0, norm_b = 0; + gmx_bool different = FALSE; + + if (a->lc != b->lc) + { + gmx_fatal(FARGS, "Can't compare lambdas with differing basis sets"); + } + /* if either one has an index we sort based on that */ + if ((a->index >= 0) || (b->index >= 0)) + { + if (a->index == b->index) + { + return 0; + } + return (a->index > b->index) ? 1 : -1; + } + if (a->dhdl >= 0 || b->dhdl >= 0) + { + /* lambda vectors that are derivatives always sort higher than those + without derivatives */ + if ((a->dhdl >= 0) != (b->dhdl >= 0) ) + { + return (a->dhdl >= 0) ? 1 : -1; + } + return a->dhdl > b->dhdl; + } + + /* neither has an index, so we can only sort on the lambda components, + which is only valid if there is one component */ + for (i = 0; i < a->lc->N; i++) + { + if (!gmx_within_tol(a->val[i], b->val[i], 10*GMX_REAL_EPS)) + { + different = TRUE; + } + norm_a += a->val[i]*a->val[i]; + norm_b += b->val[i]*b->val[i]; + } + if (!different) + { + return 0; + } + return norm_a > norm_b; +} + +/* Compare the sort order of two native lambda vectors + + returns 1 if a is 'bigger' than b, + returns 0 if they're the same, + returns -1 if a is 'smaller' than b.*/ +static gmx_bool lambda_vec_cmp_native(const lambda_vec_t *a, + const lambda_vec_t *b) +{ + int i; + + if (a->lc != b->lc) + { + gmx_fatal(FARGS, "Can't compare lambdas with differing basis sets"); + } + /* if either one has an index we sort based on that */ + if ((a->index >= 0) || (b->index >= 0)) + { + if (a->index == b->index) + { + return 0; + } + return (a->index > b->index) ? 
1 : -1; + } + /* neither has an index, so we can only sort on the lambda components, + which is only valid if there is one component */ + if (a->lc->N > 1) + { + gmx_fatal(FARGS, + "Can't compare lambdas with no index and > 1 component"); + } + if (a->dhdl >= 0 || b->dhdl >= 0) + { + gmx_fatal(FARGS, + "Can't compare native lambdas that are derivatives"); + } + if (gmx_within_tol(a->val[0], b->val[0], 10*GMX_REAL_EPS)) + { + return 0; + } + return a->val[0] > b->val[0] ? 1 : -1; +} + + + + +static void hist_init(hist_t *h, int nhist, int *nbin) +{ + int i; + if (nhist > 2) + { + gmx_fatal(FARGS, "histogram with more than two sets of data!"); + } + for (i = 0; i < nhist; i++) + { + snew(h->bin[i], nbin[i]); + h->x0[i] = 0; + h->nbin[i] = nbin[i]; + h->start_time = h->delta_time = 0; + h->dx[i] = 0; + } + h->sum = 0; + h->nhist = nhist; +} + +static void hist_destroy(hist_t *h) +{ + sfree(h->bin); +} + + +static void xvg_init(xvg_t *ba) +{ + ba->filename = NULL; + ba->nset = 0; + ba->np_alloc = 0; + ba->np = NULL; + ba->y = NULL; +} + +static void samples_init(samples_t *s, lambda_vec_t *native_lambda, + lambda_vec_t *foreign_lambda, double temp, + gmx_bool derivative, const char *filename) +{ + s->native_lambda = native_lambda; + s->foreign_lambda = foreign_lambda; + s->temp = temp; + s->derivative = derivative; + + s->ndu = 0; + s->du = NULL; + s->t = NULL; + s->start_time = s->delta_time = 0; + s->hist = NULL; + s->du_alloc = NULL; + s->t_alloc = NULL; + s->hist_alloc = NULL; + s->ndu_alloc = 0; + s->nt_alloc = 0; + + s->ntot = 0; + s->filename = filename; +} + +static void sample_range_init(sample_range_t *r, samples_t *s) +{ + r->start = 0; + r->end = s->ndu; + r->use = TRUE; + r->s = NULL; +} + +static void sample_coll_init(sample_coll_t *sc, lambda_vec_t *native_lambda, + lambda_vec_t *foreign_lambda, double temp) +{ + sc->native_lambda = native_lambda; + sc->foreign_lambda = foreign_lambda; + sc->temp = temp; + + sc->nsamples = 0; + sc->s = NULL; + sc->r = NULL; + sc->nsamples_alloc = 0; + + sc->ntot = 0; + sc->next = sc->prev = NULL; +} + +static void sample_coll_destroy(sample_coll_t *sc) +{ + /* don't free the samples themselves */ + sfree(sc->r); + sfree(sc->s); +} + + +static void lambda_data_init(lambda_data_t *l, lambda_vec_t *native_lambda, + double temp) +{ + l->lambda = native_lambda; + l->temp = temp; + + l->next = NULL; + l->prev = NULL; + + l->sc = &(l->sc_head); + + sample_coll_init(l->sc, native_lambda, NULL, 0.); + l->sc->next = l->sc; + l->sc->prev = l->sc; +} + +static void barres_init(barres_t *br) +{ + br->dg = 0; + br->dg_err = 0; + br->sa = 0; + br->sa_err = 0; + br->sb = 0; + br->sb_err = 0; + br->dg_stddev = 0; + br->dg_stddev_err = 0; + + br->a = NULL; + br->b = NULL; +} + + +/* calculate the total number of samples in a sample collection */ +static void sample_coll_calc_ntot(sample_coll_t *sc) +{ + int i; + + sc->ntot = 0; + for (i = 0; i < sc->nsamples; i++) + { + if (sc->r[i].use) + { + if (sc->s[i]->hist) + { + sc->ntot += sc->s[i]->ntot; + } + else + { + sc->ntot += sc->r[i].end - sc->r[i].start; + } + } + } +} + + +/* find the barsamples_t associated with a lambda that corresponds to + a specific foreign lambda */ +static sample_coll_t *lambda_data_find_sample_coll(lambda_data_t *l, + lambda_vec_t *foreign_lambda) +{ + sample_coll_t *sc = l->sc->next; + + while (sc != l->sc) + { + if (lambda_vec_same(sc->foreign_lambda, foreign_lambda)) + { + return sc; + } + sc = sc->next; + } + + return NULL; +} + +/* insert li into an ordered list of lambda_colls */ 
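+/* Note: the lambda_data_t and sample_coll_t lists in this file are
+ * circular, doubly-linked lists with a pre-allocated sentinel head
+ * (lb_head, sc_head) whose next and prev point back to itself when the
+ * list is empty. The insertions below therefore need no NULL checks:
+ * a new element is spliced in before the first node that compares
+ * higher, or before the head if no such node exists. */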
+static void lambda_data_insert_sample_coll(lambda_data_t *l, sample_coll_t *sc) +{ + sample_coll_t *scn = l->sc->next; + while ( (scn != l->sc) ) + { + if (lambda_vec_cmp_foreign(scn->foreign_lambda, sc->foreign_lambda) > 0) + { + break; + } + scn = scn->next; + } + /* now insert it before the found scn */ + sc->next = scn; + sc->prev = scn->prev; + scn->prev->next = sc; + scn->prev = sc; +} + +/* insert li into an ordered list of lambdas */ +static void lambda_data_insert_lambda(lambda_data_t *head, lambda_data_t *li) +{ + lambda_data_t *lc = head->next; + while (lc != head) + { + if (lambda_vec_cmp_native(lc->lambda, li->lambda) > 0) + { + break; + } + lc = lc->next; + } + /* now insert ourselves before the found lc */ + li->next = lc; + li->prev = lc->prev; + lc->prev->next = li; + lc->prev = li; +} + +/* insert a sample and a sample_range into a sample_coll. The + samples are stored as a pointer, the range is copied. */ +static void sample_coll_insert_sample(sample_coll_t *sc, samples_t *s, + sample_range_t *r) +{ + /* first check if it belongs here */ + if (sc->temp != s->temp) + { + gmx_fatal(FARGS, "Temperatures in files %s and %s are not the same!", + s->filename, sc->next->s[0]->filename); + } + if (!lambda_vec_same(sc->native_lambda, s->native_lambda)) + { + gmx_fatal(FARGS, "Native lambda in files %s and %s are not the same (and they should be)!", + s->filename, sc->next->s[0]->filename); + } + if (!lambda_vec_same(sc->foreign_lambda, s->foreign_lambda)) + { + gmx_fatal(FARGS, "Foreign lambda in files %s and %s are not the same (and they should be)!", + s->filename, sc->next->s[0]->filename); + } + + /* check if there's room */ + if ( (sc->nsamples + 1) > sc->nsamples_alloc) + { + sc->nsamples_alloc = max(2*sc->nsamples_alloc, 2); + srenew(sc->s, sc->nsamples_alloc); + srenew(sc->r, sc->nsamples_alloc); + } + sc->s[sc->nsamples] = s; + sc->r[sc->nsamples] = *r; + sc->nsamples++; + + sample_coll_calc_ntot(sc); +} + +/* insert a sample into a lambda_list, creating the right sample_coll if + neccesary */ +static void lambda_data_list_insert_sample(lambda_data_t *head, samples_t *s) +{ + gmx_bool found = FALSE; + sample_coll_t *sc; + sample_range_t r; + + lambda_data_t *l = head->next; + + /* first search for the right lambda_data_t */ + while (l != head) + { + if (lambda_vec_same(l->lambda, s->native_lambda) ) + { + found = TRUE; + break; + } + l = l->next; + } + + if (!found) + { + snew(l, 1); /* allocate a new one */ + lambda_data_init(l, s->native_lambda, s->temp); /* initialize it */ + lambda_data_insert_lambda(head, l); /* add it to the list */ + } + + /* now look for a sample collection */ + sc = lambda_data_find_sample_coll(l, s->foreign_lambda); + if (!sc) + { + snew(sc, 1); /* allocate a new one */ + sample_coll_init(sc, s->native_lambda, s->foreign_lambda, s->temp); + lambda_data_insert_sample_coll(l, sc); + } + + /* now insert the samples into the sample coll */ + sample_range_init(&r, s); + sample_coll_insert_sample(sc, s, &r); +} + + +/* make a histogram out of a sample collection */ +static void sample_coll_make_hist(sample_coll_t *sc, int **bin, + int *nbin_alloc, int *nbin, + double *dx, double *xmin, int nbin_default) +{ + int i, j, k; + gmx_bool dx_set = FALSE; + gmx_bool xmin_set = FALSE; + + gmx_bool xmax_set = FALSE; + gmx_bool xmax_set_hard = FALSE; /* whether the xmax is bounded by the + limits of a histogram */ + double xmax = -1; + + /* first determine dx and xmin; try the histograms */ + for (i = 0; i < sc->nsamples; i++) + { + if (sc->s[i]->hist) + { + 
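+                /* Widen the combined range using each stored histogram:
+                 * keep the largest bin width as dx and the lowest start
+                 * point as xmin. A non-empty last bin means values were
+                 * accumulated into it from beyond the histogram range, so
+                 * that histogram's upper edge becomes a hard bound that
+                 * xmax must not extend beyond. */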
hist_t *hist = sc->s[i]->hist; + for (k = 0; k < hist->nhist; k++) + { + double hdx = hist->dx[k]; + double xmax_now = (hist->x0[k]+hist->nbin[k])*hdx; + + /* we use the biggest dx*/ + if ( (!dx_set) || hist->dx[0] > *dx) + { + dx_set = TRUE; + *dx = hist->dx[0]; + } + if ( (!xmin_set) || (hist->x0[k]*hdx) < *xmin) + { + xmin_set = TRUE; + *xmin = (hist->x0[k]*hdx); + } + + if ( (!xmax_set) || (xmax_now > xmax && !xmax_set_hard) ) + { + xmax_set = TRUE; + xmax = xmax_now; + if (hist->bin[k][hist->nbin[k]-1] != 0) + { + xmax_set_hard = TRUE; + } + } + if (hist->bin[k][hist->nbin[k]-1] != 0 && (xmax_now < xmax) ) + { + xmax_set_hard = TRUE; + xmax = xmax_now; + } + } + } + } + /* and the delta us */ + for (i = 0; i < sc->nsamples; i++) + { + if (sc->s[i]->ndu > 0) + { + /* determine min and max */ + int starti = sc->r[i].start; + int endi = sc->r[i].end; + double du_xmin = sc->s[i]->du[starti]; + double du_xmax = sc->s[i]->du[starti]; + for (j = starti+1; j < endi; j++) + { + if (sc->s[i]->du[j] < du_xmin) + { + du_xmin = sc->s[i]->du[j]; + } + if (sc->s[i]->du[j] > du_xmax) + { + du_xmax = sc->s[i]->du[j]; + } + } + + /* and now change the limits */ + if ( (!xmin_set) || (du_xmin < *xmin) ) + { + xmin_set = TRUE; + *xmin = du_xmin; + } + if ( (!xmax_set) || ((du_xmax > xmax) && !xmax_set_hard) ) + { + xmax_set = TRUE; + xmax = du_xmax; + } + } + } + + if (!xmax_set || !xmin_set) + { + *nbin = 0; + return; + } + + + if (!dx_set) + { + *nbin = nbin_default; + *dx = (xmax-(*xmin))/((*nbin)-2); /* -2 because we want the last bin to + be 0, and we count from 0 */ + } + else + { + *nbin = (xmax-(*xmin))/(*dx); + } + + if (*nbin > *nbin_alloc) + { + *nbin_alloc = *nbin; + srenew(*bin, *nbin_alloc); + } + + /* reset the histogram */ + for (i = 0; i < (*nbin); i++) + { + (*bin)[i] = 0; + } + + /* now add the actual data */ + for (i = 0; i < sc->nsamples; i++) + { + if (sc->s[i]->hist) + { + hist_t *hist = sc->s[i]->hist; + for (k = 0; k < hist->nhist; k++) + { + double hdx = hist->dx[k]; + double xmin_hist = hist->x0[k]*hdx; + for (j = 0; j < hist->nbin[k]; j++) + { + /* calculate the bin corresponding to the middle of the + original bin */ + double x = hdx*(j+0.5) + xmin_hist; + int binnr = (int)((x-(*xmin))/(*dx)); + + if (binnr >= *nbin || binnr < 0) + { + binnr = (*nbin)-1; + } + + (*bin)[binnr] += hist->bin[k][j]; + } + } + } + else + { + int starti = sc->r[i].start; + int endi = sc->r[i].end; + for (j = starti; j < endi; j++) + { + int binnr = (int)((sc->s[i]->du[j]-(*xmin))/(*dx)); + if (binnr >= *nbin || binnr < 0) + { + binnr = (*nbin)-1; + } + + (*bin)[binnr]++; + } + } + } +} + +/* write a collection of histograms to a file */ +void sim_data_histogram(sim_data_t *sd, const char *filename, + int nbin_default, const output_env_t oenv) +{ + char label_x[STRLEN]; + const char *dhdl = "dH/d\\lambda", *deltag = "\\DeltaH", *lambda = "\\lambda"; + const char *title = "N(\\DeltaH)"; + const char *label_y = "Samples"; + FILE *fp; + lambda_data_t *bl; + int nsets = 0; + char **setnames = NULL; + gmx_bool first_set = FALSE; + /* histogram data: */ + int *hist = NULL; + int nbin = 0; + int nbin_alloc = 0; + double dx = 0; + double min = 0; + int i; + lambda_data_t *bl_head = sd->lb; + + printf("\nWriting histogram to %s\n", filename); + sprintf(label_x, "\\DeltaH (%s)", unit_energy); + + fp = xvgropen_type(filename, title, label_x, label_y, exvggtXNY, oenv); + + /* first get all the set names */ + bl = bl_head->next; + /* iterate over all lambdas */ + while (bl != bl_head) + { + sample_coll_t *sc = 
bl->sc->next; + + /* iterate over all samples */ + while (sc != bl->sc) + { + char buf[STRLEN], buf2[STRLEN]; + + nsets++; + srenew(setnames, nsets); + snew(setnames[nsets-1], STRLEN); + if (sc->foreign_lambda->dhdl < 0) + { + lambda_vec_print(sc->native_lambda, buf, FALSE); + lambda_vec_print(sc->foreign_lambda, buf2, FALSE); + sprintf(setnames[nsets-1], "N(%s(%s=%s) | %s=%s)", + deltag, lambda, buf2, lambda, buf); + } + else + { + lambda_vec_print(sc->native_lambda, buf, FALSE); + sprintf(setnames[nsets-1], "N(%s | %s=%s)", + dhdl, lambda, buf); + } + sc = sc->next; + } + + bl = bl->next; + } + xvgr_legend(fp, nsets, (const char**)setnames, oenv); + + + /* now make the histograms */ + bl = bl_head->next; + /* iterate over all lambdas */ + while (bl != bl_head) + { + sample_coll_t *sc = bl->sc->next; + + /* iterate over all samples */ + while (sc != bl->sc) + { + if (!first_set) + { + xvgr_new_dataset(fp, 0, 0, NULL, oenv); + } + + sample_coll_make_hist(sc, &hist, &nbin_alloc, &nbin, &dx, &min, + nbin_default); + + for (i = 0; i < nbin; i++) + { + double xmin = i*dx + min; + double xmax = (i+1)*dx + min; + + fprintf(fp, "%g %d\n%g %d\n", xmin, hist[i], xmax, hist[i]); + } + + first_set = FALSE; + sc = sc->next; + } + + bl = bl->next; + } + + if (hist) + { + sfree(hist); + } + + xvgrclose(fp); +} + +/* create a collection (array) of barres_t object given a ordered linked list + of barlamda_t sample collections */ +static barres_t *barres_list_create(sim_data_t *sd, int *nres, + gmx_bool use_dhdl) +{ + lambda_data_t *bl; + int nlambda = 0; + barres_t *res; + int i; + gmx_bool dhdl = FALSE; + gmx_bool first = TRUE; + lambda_data_t *bl_head = sd->lb; + + /* first count the lambdas */ + bl = bl_head->next; + while (bl != bl_head) + { + nlambda++; + bl = bl->next; + } + snew(res, nlambda-1); + + /* next put the right samples in the res */ + *nres = 0; + bl = bl_head->next->next; /* we start with the second one. */ + while (bl != bl_head) + { + sample_coll_t *sc, *scprev; + barres_t *br = &(res[*nres]); + /* there is always a previous one. 
we search for that as a foreign + lambda: */ + scprev = lambda_data_find_sample_coll(bl->prev, bl->lambda); + sc = lambda_data_find_sample_coll(bl, bl->prev->lambda); + + barres_init(br); + + if (use_dhdl) + { + /* we use dhdl */ + + scprev = lambda_data_find_sample_coll(bl->prev, bl->prev->lambda); + sc = lambda_data_find_sample_coll(bl, bl->lambda); + + if (first) + { + printf("\nWARNING: Using the derivative data (dH/dlambda) to extrapolate delta H values.\nThis will only work if the Hamiltonian is linear in lambda.\n"); + dhdl = TRUE; + } + if (!dhdl) + { + gmx_fatal(FARGS, "Some dhdl files contain only one value (dH/dl), while others \ncontain multiple values (dH/dl and/or Delta H), will not proceed \nbecause of possible inconsistencies.\n"); + } + } + else if (!scprev && !sc) + { - gmx_fatal(FARGS, "There is no path from lambda=%g -> %g that is covered by foreign lambdas:\ncannot proceed with BAR.\nUse thermodynamic integration of dH/dl by calculating the averages of dH/dl\nwith g_analyze and integrating them.\nAlternatively, use the -extp option if (and only if) the Hamiltonian\ndepends linearly on lambda, which is NOT normally the case.\n", bl->prev->lambda, bl->lambda); ++ gmx_fatal(FARGS, "There is no path from lambda=%f -> %f that is covered by foreign lambdas:\ncannot proceed with BAR.\nUse thermodynamic integration of dH/dl by calculating the averages of dH/dl\nwith g_analyze and integrating them.\nAlternatively, use the -extp option if (and only if) the Hamiltonian\ndepends linearly on lambda, which is NOT normally the case.\n", bl->prev->lambda, bl->lambda); + } + + /* normal delta H */ + if (!scprev) + { - gmx_fatal(FARGS, "Could not find a set for foreign lambda = %g\nin the files for lambda = %g", bl->lambda, bl->prev->lambda); ++ gmx_fatal(FARGS, "Could not find a set for foreign lambda = %f\nin the files for lambda = %f", bl->lambda, bl->prev->lambda); + } + if (!sc) + { - gmx_fatal(FARGS, "Could not find a set for foreign lambda = %g\nin the files for lambda = %g", bl->prev->lambda, bl->lambda); ++ gmx_fatal(FARGS, "Could not find a set for foreign lambda = %f\nin the files for lambda = %f", bl->prev->lambda, bl->lambda); + } + br->a = scprev; + br->b = sc; + + first = FALSE; + (*nres)++; + bl = bl->next; + } + return res; +} + +/* estimate the maximum discretization error */ +static double barres_list_max_disc_err(barres_t *res, int nres) +{ + int i, j; + double disc_err = 0.; + double delta_lambda; + + for (i = 0; i < nres; i++) + { + barres_t *br = &(res[i]); + + delta_lambda = lambda_vec_abs_diff(br->b->native_lambda, + br->a->native_lambda); + + for (j = 0; j < br->a->nsamples; j++) + { + if (br->a->s[j]->hist) + { + double Wfac = 1.; + if (br->a->s[j]->derivative) + { + Wfac = delta_lambda; + } + + disc_err = max(disc_err, Wfac*br->a->s[j]->hist->dx[0]); + } + } + for (j = 0; j < br->b->nsamples; j++) + { + if (br->b->s[j]->hist) + { + double Wfac = 1.; + if (br->b->s[j]->derivative) + { + Wfac = delta_lambda; + } + disc_err = max(disc_err, Wfac*br->b->s[j]->hist->dx[0]); + } + } + } + return disc_err; +} + + +/* impose start and end times on a sample collection, updating sample_ranges */ +static void sample_coll_impose_times(sample_coll_t *sc, double begin_t, + double end_t) +{ + int i; + for (i = 0; i < sc->nsamples; i++) + { + samples_t *s = sc->s[i]; + sample_range_t *r = &(sc->r[i]); + if (s->hist) + { + double end_time = s->hist->delta_time*s->hist->sum + + s->hist->start_time; + if (s->hist->start_time < begin_t || end_time > end_t) + { + r->use = FALSE; + 
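+                /* a histogram covers a fixed time span and cannot be
+                 * trimmed to the interval, so it is dropped entirely */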
} + } + else + { + if (!s->t) + { + double end_time; + if (s->start_time < begin_t) + { + r->start = (int)((begin_t - s->start_time)/s->delta_time); + } + end_time = s->delta_time*s->ndu + s->start_time; + if (end_time > end_t) + { + r->end = (int)((end_t - s->start_time)/s->delta_time); + } + } + else + { + int j; + for (j = 0; j < s->ndu; j++) + { + if (s->t[j] < begin_t) + { + r->start = j; + } + + if (s->t[j] >= end_t) + { + r->end = j; + break; + } + } + } + if (r->start > r->end) + { + r->use = FALSE; + } + } + } + sample_coll_calc_ntot(sc); +} + +static void sim_data_impose_times(sim_data_t *sd, double begin, double end) +{ + double first_t, last_t; + double begin_t, end_t; + lambda_data_t *lc; + lambda_data_t *head = sd->lb; + int j; + + if (begin <= 0 && end < 0) + { + return; + } + + /* first determine the global start and end times */ + first_t = -1; + last_t = -1; + lc = head->next; + while (lc != head) + { + sample_coll_t *sc = lc->sc->next; + while (sc != lc->sc) + { + for (j = 0; j < sc->nsamples; j++) + { + double start_t, end_t; + + start_t = sc->s[j]->start_time; + end_t = sc->s[j]->start_time; + if (sc->s[j]->hist) + { + end_t += sc->s[j]->delta_time*sc->s[j]->hist->sum; + } + else + { + if (sc->s[j]->t) + { + end_t = sc->s[j]->t[sc->s[j]->ndu-1]; + } + else + { + end_t += sc->s[j]->delta_time*sc->s[j]->ndu; + } + } + + if (start_t < first_t || first_t < 0) + { + first_t = start_t; + } + if (end_t > last_t) + { + last_t = end_t; + } + } + sc = sc->next; + } + lc = lc->next; + } + + /* calculate the actual times */ + if (begin > 0) + { + begin_t = begin; + } + else + { + begin_t = first_t; + } + + if (end > 0) + { + end_t = end; + } + else + { + end_t = last_t; + } + printf("\n Samples in time interval: %.3f - %.3f\n", first_t, last_t); + + if (begin_t > end_t) + { + return; + } + printf("Removing samples outside of: %.3f - %.3f\n", begin_t, end_t); + + /* then impose them */ + lc = head->next; + while (lc != head) + { + sample_coll_t *sc = lc->sc->next; + while (sc != lc->sc) + { + sample_coll_impose_times(sc, begin_t, end_t); + sc = sc->next; + } + lc = lc->next; + } +} + + +/* create subsample i out of ni from an existing sample_coll */ +static gmx_bool sample_coll_create_subsample(sample_coll_t *sc, + sample_coll_t *sc_orig, + int i, int ni) +{ + int j; + int hist_start, hist_end; + + gmx_large_int_t ntot_start; + gmx_large_int_t ntot_end; + gmx_large_int_t ntot_so_far; + + *sc = *sc_orig; /* just copy all fields */ + + /* allocate proprietary memory */ + snew(sc->s, sc_orig->nsamples); + snew(sc->r, sc_orig->nsamples); + + /* copy the samples */ + for (j = 0; j < sc_orig->nsamples; j++) + { + sc->s[j] = sc_orig->s[j]; + sc->r[j] = sc_orig->r[j]; /* copy the ranges too */ + } + + /* now fix start and end fields */ + /* the casts avoid possible overflows */ + ntot_start = (gmx_large_int_t)(sc_orig->ntot*(double)i/(double)ni); + ntot_end = (gmx_large_int_t)(sc_orig->ntot*(double)(i+1)/(double)ni); + ntot_so_far = 0; + for (j = 0; j < sc->nsamples; j++) + { + gmx_large_int_t ntot_add; + gmx_large_int_t new_start, new_end; + + if (sc->r[j].use) + { + if (sc->s[j]->hist) + { + ntot_add = sc->s[j]->hist->sum; + } + else + { + ntot_add = sc->r[j].end - sc->r[j].start; + } + } + else + { + ntot_add = 0; + } + + if (!sc->s[j]->hist) + { + if (ntot_so_far < ntot_start) + { + /* adjust starting point */ + new_start = sc->r[j].start + (ntot_start - ntot_so_far); + } + else + { + new_start = sc->r[j].start; + } + /* adjust end point */ + new_end = sc->r[j].start + (ntot_end - 
ntot_so_far); + if (new_end > sc->r[j].end) + { + new_end = sc->r[j].end; + } + + /* check if we're in range at all */ + if ( (new_end < new_start) || (new_start > sc->r[j].end) ) + { + new_start = 0; + new_end = 0; + } + /* and write the new range */ + sc->r[j].start = (int)new_start; + sc->r[j].end = (int)new_end; + } + else + { + if (sc->r[j].use) + { + double overlap; + double ntot_start_norm, ntot_end_norm; + /* calculate the amount of overlap of the + desired range (ntot_start -- ntot_end) onto + the histogram range (ntot_so_far -- ntot_so_far+ntot_add)*/ + + /* first calculate normalized bounds + (where 0 is the start of the hist range, and 1 the end) */ + ntot_start_norm = (ntot_start-ntot_so_far)/(double)ntot_add; + ntot_end_norm = (ntot_end-ntot_so_far)/(double)ntot_add; + + /* now fix the boundaries */ + ntot_start_norm = min(1, max(0., ntot_start_norm)); + ntot_end_norm = max(0, min(1., ntot_end_norm)); + + /* and calculate the overlap */ + overlap = ntot_end_norm - ntot_start_norm; + + if (overlap > 0.95) /* we allow for 5% slack */ + { + sc->r[j].use = TRUE; + } + else if (overlap < 0.05) + { + sc->r[j].use = FALSE; + } + else + { + return FALSE; + } + } + } + ntot_so_far += ntot_add; + } + sample_coll_calc_ntot(sc); + + return TRUE; +} + +/* calculate minimum and maximum work values in sample collection */ +static void sample_coll_min_max(sample_coll_t *sc, double Wfac, + double *Wmin, double *Wmax) +{ + int i, j; + + *Wmin = FLT_MAX; + *Wmax = -FLT_MAX; + + for (i = 0; i < sc->nsamples; i++) + { + samples_t *s = sc->s[i]; + sample_range_t *r = &(sc->r[i]); + if (r->use) + { + if (!s->hist) + { + for (j = r->start; j < r->end; j++) + { + *Wmin = min(*Wmin, s->du[j]*Wfac); + *Wmax = max(*Wmax, s->du[j]*Wfac); + } + } + else + { + int hd = 0; /* determine the histogram direction: */ + double dx; + if ( (s->hist->nhist > 1) && (Wfac < 0) ) + { + hd = 1; + } + dx = s->hist->dx[hd]; + + for (j = s->hist->nbin[hd]-1; j >= 0; j--) + { + *Wmin = min(*Wmin, Wfac*(s->hist->x0[hd])*dx); + *Wmax = max(*Wmax, Wfac*(s->hist->x0[hd])*dx); + /* look for the highest value bin with values */ + if (s->hist->bin[hd][j] > 0) + { + *Wmin = min(*Wmin, Wfac*(j+s->hist->x0[hd]+1)*dx); + *Wmax = max(*Wmax, Wfac*(j+s->hist->x0[hd]+1)*dx); + break; + } + } + } + } + } +} + +/* Initialize a sim_data structure */ +static void sim_data_init(sim_data_t *sd) +{ + /* make linked list */ + sd->lb = &(sd->lb_head); + sd->lb->next = sd->lb; + sd->lb->prev = sd->lb; + + lambda_components_init(&(sd->lc)); +} + + +static double calc_bar_sum(int n, const double *W, double Wfac, double sbMmDG) +{ + int i; + double sum; + + sum = 0; + + for (i = 0; i < n; i++) + { + sum += 1./(1. 
+ exp(Wfac*W[i] + sbMmDG)); + } + + return sum; +} + +/* calculate the BAR average given a histogram + + if type== 0, calculate the best estimate for the average, + if type==-1, calculate the minimum possible value given the histogram + if type== 1, calculate the maximum possible value given the histogram */ +static double calc_bar_sum_hist(const hist_t *hist, double Wfac, double sbMmDG, + int type) +{ + double sum = 0.; + int i; + int max; + /* normalization factor multiplied with bin width and + number of samples (we normalize through M): */ + double normdx = 1.; + int hd = 0; /* determine the histogram direction: */ + double dx; + + if ( (hist->nhist > 1) && (Wfac < 0) ) + { + hd = 1; + } + dx = hist->dx[hd]; + max = hist->nbin[hd]-1; + if (type == 1) + { + max = hist->nbin[hd]; /* we also add whatever was out of range */ + } + + for (i = 0; i < max; i++) + { + double x = Wfac*((i+hist->x0[hd])+0.5)*dx; /* bin middle */ + double pxdx = hist->bin[0][i]*normdx; /* p(x)dx */ + + sum += pxdx/(1. + exp(x + sbMmDG)); + } + + return sum; +} + +static double calc_bar_lowlevel(sample_coll_t *ca, sample_coll_t *cb, + double temp, double tol, int type) +{ + double kT, beta, M; + double DG; + int i, j; + double Wfac1, Wfac2, Wmin, Wmax; + double DG0, DG1, DG2, dDG1; + double sum1, sum2; + double n1, n2; /* numbers of samples as doubles */ + + kT = BOLTZ*temp; + beta = 1/kT; + + /* count the numbers of samples */ + n1 = ca->ntot; + n2 = cb->ntot; + + M = log(n1/n2); + + /*if (!lambda_vec_same(ca->native_lambda, ca->foreign_lambda))*/ + if (ca->foreign_lambda->dhdl < 0) + { + /* this is the case when the delta U were calculated directly + (i.e. we're not scaling dhdl) */ + Wfac1 = beta; + Wfac2 = beta; + } + else + { + /* we're using dhdl, so delta_lambda needs to be a + multiplication factor. */ + /*double delta_lambda=cb->native_lambda-ca->native_lambda;*/ + double delta_lambda = lambda_vec_abs_diff(cb->native_lambda, + ca->native_lambda); + if (cb->native_lambda->lc->N > 1) + { + gmx_fatal(FARGS, + "Can't (yet) do multi-component dhdl interpolation"); + } + + Wfac1 = beta*delta_lambda; + Wfac2 = -beta*delta_lambda; + } + + if (beta < 1) + { + /* We print the output both in kT and kJ/mol. + * Here we determine DG in kT, so when beta < 1 + * the precision has to be increased. + */ + tol *= beta; + } + + /* Calculate minimum and maximum work to give an initial estimate of + * delta G as their average. + */ + { + double Wmin1, Wmin2, Wmax1, Wmax2; + sample_coll_min_max(ca, Wfac1, &Wmin1, &Wmax1); + sample_coll_min_max(cb, Wfac2, &Wmin2, &Wmax2); + + Wmin = min(Wmin1, Wmin2); + Wmax = max(Wmax1, Wmax2); + } + + DG0 = Wmin; + DG2 = Wmax; + + if (debug) + { + fprintf(debug, "DG %9.5f %9.5f\n", DG0, DG2); + } + /* We approximate by bisection: given our initial estimates + we keep checking whether the halfway point is greater or + smaller than what we get out of the BAR averages. + + For the comparison we can use twice the tolerance. 
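+       The difference of the two BAR sums below is monotonic in DG, so
+       the bisection converges to the unique solution.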
*/ + while (DG2 - DG0 > 2*tol) + { + DG1 = 0.5*(DG0 + DG2); + - /*printf("Wfac1=%g, Wfac2=%g, beta=%g, DG1=%g\n",Wfac1,Wfac2,beta, - DG1);*/ - + /* calculate the BAR averages */ + dDG1 = 0.; + + for (i = 0; i < ca->nsamples; i++) + { + samples_t *s = ca->s[i]; + sample_range_t *r = &(ca->r[i]); + if (r->use) + { + if (s->hist) + { + dDG1 += calc_bar_sum_hist(s->hist, Wfac1, (M-DG1), type); + } + else + { + dDG1 += calc_bar_sum(r->end - r->start, s->du + r->start, + Wfac1, (M-DG1)); + } + } + } + for (i = 0; i < cb->nsamples; i++) + { + samples_t *s = cb->s[i]; + sample_range_t *r = &(cb->r[i]); + if (r->use) + { + if (s->hist) + { + dDG1 -= calc_bar_sum_hist(s->hist, Wfac2, -(M-DG1), type); + } + else + { + dDG1 -= calc_bar_sum(r->end - r->start, s->du + r->start, + Wfac2, -(M-DG1)); + } + } + } + + if (dDG1 < 0) + { + DG0 = DG1; + } + else + { + DG2 = DG1; + } + if (debug) + { + fprintf(debug, "DG %9.5f %9.5f\n", DG0, DG2); + } + } + + return 0.5*(DG0 + DG2); +} + +static void calc_rel_entropy(sample_coll_t *ca, sample_coll_t *cb, + double temp, double dg, double *sa, double *sb) +{ + int i, j; + double W_ab = 0.; + double W_ba = 0.; + double kT, beta; + double Wfac1, Wfac2; + double n1, n2; + + kT = BOLTZ*temp; + beta = 1/kT; + + /* count the numbers of samples */ + n1 = ca->ntot; + n2 = cb->ntot; + + /* to ensure the work values are the same as during the delta_G */ + /*if (!lambda_vec_same(ca->native_lambda, ca->foreign_lambda))*/ + if (ca->foreign_lambda->dhdl < 0) + { + /* this is the case when the delta U were calculated directly + (i.e. we're not scaling dhdl) */ + Wfac1 = beta; + Wfac2 = beta; + } + else + { + /* we're using dhdl, so delta_lambda needs to be a + multiplication factor. */ + double delta_lambda = lambda_vec_abs_diff(cb->native_lambda, + ca->native_lambda); + Wfac1 = beta*delta_lambda; + Wfac2 = -beta*delta_lambda; + } + + /* first calculate the average work in both directions */ + for (i = 0; i < ca->nsamples; i++) + { + samples_t *s = ca->s[i]; + sample_range_t *r = &(ca->r[i]); + if (r->use) + { + if (!s->hist) + { + for (j = r->start; j < r->end; j++) + { + W_ab += Wfac1*s->du[j]; + } + } + else + { + /* normalization factor multiplied with bin width and + number of samples (we normalize through M): */ + double normdx = 1.; + double dx; + int hd = 0; /* histogram direction */ + if ( (s->hist->nhist > 1) && (Wfac1 < 0) ) + { + hd = 1; + } + dx = s->hist->dx[hd]; + + for (j = 0; j < s->hist->nbin[0]; j++) + { + double x = Wfac1*((j+s->hist->x0[0])+0.5)*dx; /*bin ctr*/ + double pxdx = s->hist->bin[0][j]*normdx; /* p(x)dx */ + W_ab += pxdx*x; + } + } + } + } + W_ab /= n1; + + for (i = 0; i < cb->nsamples; i++) + { + samples_t *s = cb->s[i]; + sample_range_t *r = &(cb->r[i]); + if (r->use) + { + if (!s->hist) + { + for (j = r->start; j < r->end; j++) + { + W_ba += Wfac1*s->du[j]; + } + } + else + { + /* normalization factor multiplied with bin width and + number of samples (we normalize through M): */ + double normdx = 1.; + double dx; + int hd = 0; /* histogram direction */ + if ( (s->hist->nhist > 1) && (Wfac2 < 0) ) + { + hd = 1; + } + dx = s->hist->dx[hd]; + + for (j = 0; j < s->hist->nbin[0]; j++) + { + double x = Wfac1*((j+s->hist->x0[0])+0.5)*dx; /*bin ctr*/ + double pxdx = s->hist->bin[0][j]*normdx; /* p(x)dx */ + W_ba += pxdx*x; + } + } + } + } + W_ba /= n2; + + /* then calculate the relative entropies */ + *sa = (W_ab - dg); + *sb = (W_ba + dg); +} + +static void calc_dg_stddev(sample_coll_t *ca, sample_coll_t *cb, + double temp, double dg, double *stddev) 
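+/* computes the expected standard deviation per sample of the free
+ * energy difference (cf. dg_stddev in barres_t): sigmafact accumulates
+ * the average over all samples, in both directions, of
+ * 1/(2 + 2*cosh(M +/- W - dg)), and the result then follows from
+ * Eq. 10 of the Shirts et al. reference cited at the end of this
+ * function. */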
+{ + int i, j; + double M; + double sigmafact = 0.; + double kT, beta; + double Wfac1, Wfac2; + double n1, n2; + + kT = BOLTZ*temp; + beta = 1/kT; + + /* count the numbers of samples */ + n1 = ca->ntot; + n2 = cb->ntot; + + /* to ensure the work values are the same as during the delta_G */ + /*if (!lambda_vec_same(ca->native_lambda, ca->foreign_lambda))*/ + if (ca->foreign_lambda->dhdl < 0) + { + /* this is the case when the delta U were calculated directly + (i.e. we're not scaling dhdl) */ + Wfac1 = beta; + Wfac2 = beta; + } + else + { + /* we're using dhdl, so delta_lambda needs to be a + multiplication factor. */ + double delta_lambda = lambda_vec_abs_diff(cb->native_lambda, + ca->native_lambda); + Wfac1 = beta*delta_lambda; + Wfac2 = -beta*delta_lambda; + } + + M = log(n1/n2); + + + /* calculate average in both directions */ + for (i = 0; i < ca->nsamples; i++) + { + samples_t *s = ca->s[i]; + sample_range_t *r = &(ca->r[i]); + if (r->use) + { + if (!s->hist) + { + for (j = r->start; j < r->end; j++) + { + sigmafact += 1./(2. + 2.*cosh((M + Wfac1*s->du[j] - dg))); + } + } + else + { + /* normalization factor multiplied with bin width and + number of samples (we normalize through M): */ + double normdx = 1.; + double dx; + int hd = 0; /* histogram direction */ + if ( (s->hist->nhist > 1) && (Wfac1 < 0) ) + { + hd = 1; + } + dx = s->hist->dx[hd]; + + for (j = 0; j < s->hist->nbin[0]; j++) + { + double x = Wfac1*((j+s->hist->x0[0])+0.5)*dx; /*bin ctr*/ + double pxdx = s->hist->bin[0][j]*normdx; /* p(x)dx */ + + sigmafact += pxdx/(2. + 2.*cosh((M + x - dg))); + } + } + } + } + for (i = 0; i < cb->nsamples; i++) + { + samples_t *s = cb->s[i]; + sample_range_t *r = &(cb->r[i]); + if (r->use) + { + if (!s->hist) + { + for (j = r->start; j < r->end; j++) + { + sigmafact += 1./(2. + 2.*cosh((M - Wfac2*s->du[j] - dg))); + } + } + else + { + /* normalization factor multiplied with bin width and + number of samples (we normalize through M): */ + double normdx = 1.; + double dx; + int hd = 0; /* histogram direction */ + if ( (s->hist->nhist > 1) && (Wfac2 < 0) ) + { + hd = 1; + } + dx = s->hist->dx[hd]; + + for (j = 0; j < s->hist->nbin[0]; j++) + { + double x = Wfac2*((j+s->hist->x0[0])+0.5)*dx; /*bin ctr*/ + double pxdx = s->hist->bin[0][j]*normdx; /* p(x)dx */ + + sigmafact += pxdx/(2. + 2.*cosh((M - x - dg))); + } + } + } + } + + sigmafact /= (n1 + n2); + + + /* Eq. 10 from + Shirts, Bair, Hooker & Pande, Phys. Rev. 
Lett 91, 140601 (2003): */ + *stddev = sqrt(((1./sigmafact) - ( (n1+n2)/n1 + (n1+n2)/n2 ))); +} + + + +static void calc_bar(barres_t *br, double tol, + int npee_min, int npee_max, gmx_bool *bEE, + double *partsum) +{ + int npee, p; + double dg_sig2, sa_sig2, sb_sig2, stddev_sig2; /* intermediate variance values + for calculated quantities */ + int nsample1, nsample2; + double temp = br->a->temp; + int i, j; + double dg_min, dg_max; + gmx_bool have_hist = FALSE; + + br->dg = calc_bar_lowlevel(br->a, br->b, temp, tol, 0); + + br->dg_disc_err = 0.; + br->dg_histrange_err = 0.; + + /* check if there are histograms */ + for (i = 0; i < br->a->nsamples; i++) + { + if (br->a->r[i].use && br->a->s[i]->hist) + { + have_hist = TRUE; + break; + } + } + if (!have_hist) + { + for (i = 0; i < br->b->nsamples; i++) + { + if (br->b->r[i].use && br->b->s[i]->hist) + { + have_hist = TRUE; + break; + } + } + } + + /* calculate histogram-specific errors */ + if (have_hist) + { + dg_min = calc_bar_lowlevel(br->a, br->b, temp, tol, -1); + dg_max = calc_bar_lowlevel(br->a, br->b, temp, tol, 1); + + if (fabs(dg_max - dg_min) > GMX_REAL_EPS*10) + { + /* the histogram range error is the biggest of the differences + between the best estimate and the extremes */ + br->dg_histrange_err = fabs(dg_max - dg_min); + } + br->dg_disc_err = 0.; + for (i = 0; i < br->a->nsamples; i++) + { + if (br->a->s[i]->hist) + { + br->dg_disc_err = max(br->dg_disc_err, br->a->s[i]->hist->dx[0]); + } + } + for (i = 0; i < br->b->nsamples; i++) + { + if (br->b->s[i]->hist) + { + br->dg_disc_err = max(br->dg_disc_err, br->b->s[i]->hist->dx[0]); + } + } + } + calc_rel_entropy(br->a, br->b, temp, br->dg, &(br->sa), &(br->sb)); + + calc_dg_stddev(br->a, br->b, temp, br->dg, &(br->dg_stddev) ); + + dg_sig2 = 0; + sa_sig2 = 0; + sb_sig2 = 0; + stddev_sig2 = 0; + + *bEE = TRUE; + { + sample_coll_t ca, cb; + + /* initialize the samples */ + sample_coll_init(&ca, br->a->native_lambda, br->a->foreign_lambda, + br->a->temp); + sample_coll_init(&cb, br->b->native_lambda, br->b->foreign_lambda, + br->b->temp); + + for (npee = npee_min; npee <= npee_max; npee++) + { + double dgs = 0; + double dgs2 = 0; + double dsa = 0; + double dsb = 0; + double dsa2 = 0; + double dsb2 = 0; + double dstddev = 0; + double dstddev2 = 0; + + + for (p = 0; p < npee; p++) + { + double dgp; + double stddevc; + double sac, sbc; + gmx_bool cac, cbc; + + cac = sample_coll_create_subsample(&ca, br->a, p, npee); + cbc = sample_coll_create_subsample(&cb, br->b, p, npee); + + if (!cac || !cbc) + { + printf("WARNING: histogram number incompatible with block number for averaging: can't do error estimate\n"); + *bEE = FALSE; + if (cac) + { + sample_coll_destroy(&ca); + } + if (cbc) + { + sample_coll_destroy(&cb); + } + return; + } + + dgp = calc_bar_lowlevel(&ca, &cb, temp, tol, 0); + dgs += dgp; + dgs2 += dgp*dgp; + + partsum[npee*(npee_max+1)+p] += dgp; + + calc_rel_entropy(&ca, &cb, temp, dgp, &sac, &sbc); + dsa += sac; + dsa2 += sac*sac; + dsb += sbc; + dsb2 += sbc*sbc; + calc_dg_stddev(&ca, &cb, temp, dgp, &stddevc ); + + dstddev += stddevc; + dstddev2 += stddevc*stddevc; + + sample_coll_destroy(&ca); + sample_coll_destroy(&cb); + } + dgs /= npee; + dgs2 /= npee; + dg_sig2 += (dgs2-dgs*dgs)/(npee-1); + + dsa /= npee; + dsa2 /= npee; + dsb /= npee; + dsb2 /= npee; + sa_sig2 += (dsa2-dsa*dsa)/(npee-1); + sb_sig2 += (dsb2-dsb*dsb)/(npee-1); + + dstddev /= npee; + dstddev2 /= npee; + stddev_sig2 += (dstddev2-dstddev*dstddev)/(npee-1); + } + br->dg_err = sqrt(dg_sig2/(npee_max - 
npee_min + 1)); + br->sa_err = sqrt(sa_sig2/(npee_max - npee_min + 1)); + br->sb_err = sqrt(sb_sig2/(npee_max - npee_min + 1)); + br->dg_stddev_err = sqrt(stddev_sig2/(npee_max - npee_min + 1)); + } +} + + +static double bar_err(int nbmin, int nbmax, const double *partsum) +{ + int nb, b; + double svar, s, s2, dg; + + svar = 0; + for (nb = nbmin; nb <= nbmax; nb++) + { + s = 0; + s2 = 0; + for (b = 0; b < nb; b++) + { + dg = partsum[nb*(nbmax+1)+b]; + s += dg; + s2 += dg*dg; + } + s /= nb; + s2 /= nb; + svar += (s2 - s*s)/(nb - 1); + } + + return sqrt(svar/(nbmax + 1 - nbmin)); +} + + +/* Seek the end of an identifier (consecutive non-spaces), followed by + an optional number of spaces or '='-signs. Returns a pointer to the + first non-space value found after that. Returns NULL if the string + ends before that. + */ +static const char *find_value(const char *str) +{ + gmx_bool name_end_found = FALSE; + + /* if the string is a NULL pointer, return a NULL pointer. */ + if (str == NULL) + { + return NULL; + } + while (*str != '\0') + { + /* first find the end of the name */ + if (!name_end_found) + { + if (isspace(*str) || (*str == '=') ) + { + name_end_found = TRUE; + } + } + else + { + if (!( isspace(*str) || (*str == '=') )) + { + return str; + } + } + str++; + } + return NULL; +} + + + +/* read a vector-notation description of a lambda vector */ +static gmx_bool read_lambda_compvec(const char *str, + lambda_vec_t *lv, + const lambda_components_t *lc_in, + lambda_components_t *lc_out, + const char **end, + const char *fn) +{ + gmx_bool initialize_lc = FALSE; /* whether to initialize the lambda + components, or to check them */ + gmx_bool start_reached = FALSE; /* whether the start of component names + has been reached */ + gmx_bool vector = FALSE; /* whether there are multiple components */ + int n = 0; /* current component number */ + const char *val_start = NULL; /* start of the component name, or NULL + if not in a value */ + char *strtod_end; + gmx_bool OK = TRUE; + + if (end) + { + *end = str; + } + + + if (lc_out && lc_out->N == 0) + { + initialize_lc = TRUE; + } + + if (lc_in == NULL) + { + lc_in = lc_out; + } + + while (1) + { + if (!start_reached) + { + if (isalnum(*str)) + { + vector = FALSE; + start_reached = TRUE; + val_start = str; + } + else if (*str == '(') + { + vector = TRUE; + start_reached = TRUE; + } + else if (!isspace(*str)) + { + gmx_fatal(FARGS, "Error in lambda components in %s", fn); + } + } + else + { + if (val_start) + { + if (isspace(*str) || *str == ')' || *str == ',' || *str == '\0') + { + /* end of value */ + if (lv == NULL) + { + if (initialize_lc) + { + lambda_components_add(lc_out, val_start, + (str-val_start)); + } + else + { + if (!lambda_components_check(lc_out, n, val_start, + (str-val_start))) + { + return FALSE; + } + } + } + else + { + /* add a vector component to lv */ + lv->val[n] = strtod(val_start, &strtod_end); + if (val_start == strtod_end) + { + gmx_fatal(FARGS, + "Error reading lambda vector in %s", fn); + } + } + /* reset for the next identifier */ + val_start = NULL; + n++; + if (!vector) + { + return OK; + } + } + } + else if (isalnum(*str)) + { + val_start = str; + } + if (*str == ')') + { + str++; + if (end) + { + *end = str; + } + if (!vector) + { + gmx_fatal(FARGS, "Error in lambda components in %s", fn); + } + else + { + if (n == lc_in->N) + { + return OK; + } + else if (lv == NULL) + { + return FALSE; + } + else + { + gmx_fatal(FARGS, "Incomplete lambda vector data in %s", + fn); + return FALSE; + } + + } + } + } + if (*str == 
'\0') + { + break; + } + str++; + if (end) + { + *end = str; + } + } + if (vector) + { + gmx_fatal(FARGS, "Incomplete lambda components data in %s", fn); + return FALSE; + } + return OK; +} + +/* read and check the component names from a string */ +static gmx_bool read_lambda_components(const char *str, + lambda_components_t *lc, + const char **end, + const char *fn) +{ + return read_lambda_compvec(str, NULL, NULL, lc, end, fn); +} + +/* read an initialized lambda vector from a string */ +static gmx_bool read_lambda_vector(const char *str, + lambda_vec_t *lv, + const char **end, + const char *fn) +{ + return read_lambda_compvec(str, lv, lv->lc, NULL, end, fn); +} + + + +/* deduce lambda value from legend. + fn = the file name + legend = the legend string + ba = the xvg data + lam = the initialized lambda vector + returns whether to use the data in this set. + */ +static gmx_bool legend2lambda(const char *fn, + const char *legend, + xvg_t *ba, + lambda_vec_t *lam) +{ + double lambda = 0; + const char *ptr = NULL, *ptr2 = NULL; + gmx_bool ok = FALSE; + gmx_bool bdhdl = FALSE; + const char *tostr = " to "; + + if (legend == NULL) + { + gmx_fatal(FARGS, "There is no legend in file '%s', can not deduce lambda", fn); + } + + /* look for the last 'to': */ + ptr2 = legend; + do + { + ptr2 = strstr(ptr2, tostr); + if (ptr2 != NULL) + { + ptr = ptr2; + ptr2++; + } + } + while (ptr2 != NULL && *ptr2 != '\0'); + + if (ptr) + { + ptr += strlen(tostr)-1; /* and advance past that 'to' */ + } + else + { + /* look for the = sign */ + ptr = strrchr(legend, '='); + if (!ptr) + { + /* otherwise look for the last space */ + ptr = strrchr(legend, ' '); + } + } + + if (strstr(legend, "dH")) + { + ok = TRUE; + bdhdl = TRUE; + } + else if (strchr(legend, 'D') != NULL && strchr(legend, 'H') != NULL) + { + ok = TRUE; + bdhdl = FALSE; + } + else /*if (strstr(legend, "pV"))*/ + { + return FALSE; + } + if (!ptr) + { + ok = FALSE; + } + + if (!ok) + { + gmx_fatal(FARGS, "There is no proper lambda legend in file '%s', can not deduce lambda", fn); + } + if (!bdhdl) + { + ptr = find_value(ptr); + if (!ptr || !read_lambda_vector(ptr, lam, NULL, fn)) + { + gmx_fatal(FARGS, "lambda vector '%s' %s faulty", legend, fn); + } + } + else + { + int dhdl_index; + const char *end; + char buf[STRLEN]; + + ptr = strrchr(legend, '='); + end = ptr; + if (ptr) + { + /* there must be a component name */ + ptr--; + if (ptr < legend) + { + gmx_fatal(FARGS, "dhdl legend '%s' %s faulty", legend, fn); + } + /* now backtrack to the start of the identifier */ + while (isspace(*ptr)) + { + end = ptr; + ptr--; + if (ptr < legend) + { + gmx_fatal(FARGS, "dhdl legend '%s' %s faulty", legend, fn); + } + } + while (!isspace(*ptr)) + { + ptr--; + if (ptr < legend) + { + gmx_fatal(FARGS, "dhdl legend '%s' %s faulty", legend, fn); + } + } + ptr++; + strncpy(buf, ptr, (end-ptr)); + buf[(end-ptr)] = '\0'; + dhdl_index = lambda_components_find(lam->lc, ptr, (end-ptr)); + if (dhdl_index < 0) + { + char buf[STRLEN]; + strncpy(buf, ptr, (end-ptr)); + buf[(end-ptr)] = '\0'; + gmx_fatal(FARGS, + "Did not find lambda component for '%s' in %s", + buf, fn); + } + } + else + { + if (lam->lc->N > 1) + { + gmx_fatal(FARGS, + "dhdl without component name with >1 lambda component in %s", + fn); + } + dhdl_index = 0; + } + lam->dhdl = dhdl_index; + } + return TRUE; +} + +static gmx_bool subtitle2lambda(const char *subtitle, xvg_t *ba, const char *fn, + lambda_components_t *lc) +{ + gmx_bool bFound; + const char *ptr; + char *end; + double native_lambda; + + bFound = 
FALSE; + + /* first check for a state string */ + ptr = strstr(subtitle, "state"); + if (ptr) + { + int index = -1; + const char *val_end; + + /* the new 4.6 style lambda vectors */ + ptr = find_value(ptr); + if (ptr) + { + index = strtol(ptr, &end, 10); + if (ptr == end) + { + gmx_fatal(FARGS, "Incomplete state data in %s", fn); + return FALSE; + } + ptr = end; + } + else + { + gmx_fatal(FARGS, "Incomplete state data in %s", fn); + return FALSE; + } + /* now find the lambda vector component names */ + while (*ptr != '(' && !isalnum(*ptr)) + { + ptr++; + if (*ptr == '\0') + { + gmx_fatal(FARGS, + "Incomplete lambda vector component data in %s", fn); + return FALSE; + } + } + val_end = ptr; + if (!read_lambda_components(ptr, lc, &val_end, fn)) + { + gmx_fatal(FARGS, + "lambda vector components in %s don't match those previously read", + fn); + } + ptr = find_value(val_end); + if (!ptr) + { + gmx_fatal(FARGS, "Incomplete state data in %s", fn); + return FALSE; + } + lambda_vec_init(&(ba->native_lambda), lc); + if (!read_lambda_vector(ptr, &(ba->native_lambda), NULL, fn)) + { + gmx_fatal(FARGS, "lambda vector in %s faulty", fn); + } + ba->native_lambda.index = index; + bFound = TRUE; + } + else + { + /* compatibility mode: check for lambda in other ways. */ + /* plain text lambda string */ + ptr = strstr(subtitle, "lambda"); + if (ptr == NULL) + { + /* xmgrace formatted lambda string */ + ptr = strstr(subtitle, "\\xl\\f{}"); + } + if (ptr == NULL) + { + /* xmgr formatted lambda string */ + ptr = strstr(subtitle, "\\8l\\4"); + } + if (ptr != NULL) + { + ptr = strstr(ptr, "="); + } + if (ptr != NULL) + { + bFound = (sscanf(ptr+1, "%lf", &(native_lambda)) == 1); + /* add the lambda component name as an empty string */ + if (lc->N > 0) + { + if (!lambda_components_check(lc, 0, "", 0)) + { + gmx_fatal(FARGS, + "lambda vector components in %s don't match those previously read", + fn); + } + } + else + { + lambda_components_add(lc, "", 0); + } + lambda_vec_init(&(ba->native_lambda), lc); + ba->native_lambda.val[0] = native_lambda; + } + } + + return bFound; +} + +static void filename2lambda(const char *fn, xvg_t *ba) +{ + double lambda; + const char *ptr, *digitptr; + char *endptr; + int dirsep; + ptr = fn; + /* go to the end of the path string and search backward to find the last + directory in the path which has to contain the value of lambda + */ + while (ptr[1] != '\0') + { + ptr++; + } + /* searching backward to find the second directory separator */ + dirsep = 0; + digitptr = NULL; + while (ptr >= fn) + { + if (ptr[0] != DIR_SEPARATOR && ptr[1] == DIR_SEPARATOR) + { + if (dirsep == 1) + { + break; + } + dirsep++; + } + /* save the last position of a digit between the last two + separators = in the last dirname */ + if (dirsep > 0 && isdigit(*ptr)) + { + digitptr = ptr; + } + ptr--; + } + if (!digitptr) + { + gmx_fatal(FARGS, "While trying to read the lambda value from the file path:" + " last directory in the path '%s' does not contain a number", fn); + } + if (digitptr[-1] == '-') + { + digitptr--; + } + lambda = strtod(digitptr, &endptr); + if (endptr == digitptr) + { + gmx_fatal(FARGS, "Malformed number in file path '%s'", fn); + } +} + +static void read_bar_xvg_lowlevel(const char *fn, real *temp, xvg_t *ba, + lambda_components_t *lc) +{ + int i; + char *subtitle, **legend, *ptr; + int np; + gmx_bool native_lambda_read = FALSE; + char buf[STRLEN]; + lambda_vec_t lv; + + xvg_init(ba); + + ba->filename = fn; + + np = read_xvg_legend(fn, &ba->y, &ba->nset, &subtitle, &legend); + if (!ba->y) + 
{ + gmx_fatal(FARGS, "File %s contains no usable data.", fn); + } + /* Reorder the data */ + ba->t = ba->y[0]; + for (i = 1; i < ba->nset; i++) + { + ba->y[i-1] = ba->y[i]; + } + ba->nset--; + + snew(ba->np, ba->nset); + for (i = 0; i < ba->nset; i++) + { + ba->np[i] = np; + } + + ba->temp = -1; + if (subtitle != NULL) + { + /* try to extract temperature */ + ptr = strstr(subtitle, "T ="); + if (ptr != NULL) + { + ptr += 3; + if (sscanf(ptr, "%lf", &ba->temp) == 1) + { + if (ba->temp <= 0) + { - gmx_fatal(FARGS, "Found temperature of %g in file '%s'", ++ gmx_fatal(FARGS, "Found temperature of %f in file '%s'", + ba->temp, fn); + } + } + } + } + if (ba->temp < 0) + { + if (*temp <= 0) + { + gmx_fatal(FARGS, "Did not find a temperature in the subtitle in file '%s', use the -temp option of [TT]g_bar[tt]", fn); + } + ba->temp = *temp; + } + + /* Try to deduce lambda from the subtitle */ + if (subtitle) + { + if (subtitle2lambda(subtitle, ba, fn, lc)) + { + native_lambda_read = TRUE; + } + } + snew(ba->lambda, ba->nset); + if (legend == NULL) + { + /* Check if we have a single set, no legend, nset=1 means t and dH/dl */ + if (ba->nset == 1) + { + if (!native_lambda_read) + { + /* Deduce lambda from the file name */ + filename2lambda(fn, ba); + native_lambda_read = TRUE; + } + ba->lambda[0] = ba->native_lambda; + } + else + { + gmx_fatal(FARGS, "File %s contains multiple sets but no legends, can not determine the lambda values", fn); + } + } + else + { + for (i = 0; i < ba->nset; ) + { + gmx_bool use = FALSE; + /* Read lambda from the legend */ + lambda_vec_init( &(ba->lambda[i]), lc ); + lambda_vec_copy( &(ba->lambda[i]), &(ba->native_lambda)); + use = legend2lambda(fn, legend[i], ba, &(ba->lambda[i])); + if (use) + { + lambda_vec_print(&(ba->lambda[i]), buf, FALSE); + i++; + } + else + { + int j; + printf("%s: Ignoring set '%s'.\n", fn, legend[i]); + for (j = i+1; j < ba->nset; j++) + { + ba->y[j-1] = ba->y[j]; + legend[j-1] = legend[j]; + } + ba->nset--; + } + } + } + + if (!native_lambda_read) + { + gmx_fatal(FARGS, "File %s contains multiple sets but no indication of the native lambda", fn); + } + + if (legend != NULL) + { + for (i = 0; i < ba->nset-1; i++) + { + sfree(legend[i]); + } + sfree(legend); + } +} + +static void read_bar_xvg(char *fn, real *temp, sim_data_t *sd) +{ + xvg_t *barsim; + samples_t *s; + int i; + double *lambda; + + snew(barsim, 1); + + read_bar_xvg_lowlevel(fn, temp, barsim, &(sd->lc)); + + if (barsim->nset < 1) + { + gmx_fatal(FARGS, "File '%s' contains fewer than two columns", fn); + } + + if (!gmx_within_tol(*temp, barsim->temp, GMX_FLOAT_EPS) && (*temp > 0) ) + { + gmx_fatal(FARGS, "Temperature in file %s different from earlier files or setting\n", fn); + } + *temp = barsim->temp; + + /* now create a series of samples_t */ + snew(s, barsim->nset); + for (i = 0; i < barsim->nset; i++) + { + samples_init(s+i, &(barsim->native_lambda), &(barsim->lambda[i]), + barsim->temp, lambda_vec_same(&(barsim->native_lambda), + &(barsim->lambda[i])), + fn); + s[i].du = barsim->y[i]; + s[i].ndu = barsim->np[i]; + s[i].t = barsim->t; + + lambda_data_list_insert_sample(sd->lb, s+i); + } + { + char buf[STRLEN]; + + lambda_vec_print(s[0].native_lambda, buf, FALSE); + printf("%s: %.1f - %.1f; lambda = %s\n dH/dl & foreign lambdas:\n", + fn, s[0].t[0], s[0].t[s[0].ndu-1], buf); + for (i = 0; i < barsim->nset; i++) + { + lambda_vec_print(s[i].foreign_lambda, buf, TRUE); + printf(" %s (%d pts)\n", buf, s[i].ndu); + } + } + printf("\n\n"); +} + +static void read_edr_rawdh_block(samples_t 
**smp, int *ndu, t_enxblock *blk, + double start_time, double delta_time, + lambda_vec_t *native_lambda, double temp, + double *last_t, const char *filename) +{ + int i, j; + gmx_bool allocated; + double old_foreign_lambda; + lambda_vec_t *foreign_lambda; + int type; + samples_t *s; /* convenience pointer */ + int startj; + + /* check the block types etc. */ + if ( (blk->nsub < 3) || + (blk->sub[0].type != xdr_datatype_int) || + (blk->sub[1].type != xdr_datatype_double) || + ( + (blk->sub[2].type != xdr_datatype_float) && + (blk->sub[2].type != xdr_datatype_double) + ) || + (blk->sub[0].nr < 1) || + (blk->sub[1].nr < 1) ) + { + gmx_fatal(FARGS, - "Unexpected/corrupted block data in file %s around time %g.", ++ "Unexpected/corrupted block data in file %s around time %f.", + filename, start_time); + } + + snew(foreign_lambda, 1); + lambda_vec_init(foreign_lambda, native_lambda->lc); + lambda_vec_copy(foreign_lambda, native_lambda); + type = blk->sub[0].ival[0]; + if (type == dhbtDH) + { + for (i = 0; i < native_lambda->lc->N; i++) + { + foreign_lambda->val[i] = blk->sub[1].dval[i]; + } + } + else + { + if (blk->sub[0].nr > 1) + { + foreign_lambda->dhdl = blk->sub[0].ival[1]; + } + else + { + foreign_lambda->dhdl = 0; + } + } + + if (!*smp) + { + /* initialize the samples structure if it's empty. */ + snew(*smp, 1); + samples_init(*smp, native_lambda, foreign_lambda, temp, + type == dhbtDHDL, filename); + (*smp)->start_time = start_time; + (*smp)->delta_time = delta_time; + } + + /* set convenience pointer */ + s = *smp; + + /* now double check */ + if (!lambda_vec_same(s->foreign_lambda, foreign_lambda) ) + { + char buf[STRLEN], buf2[STRLEN]; + lambda_vec_print(foreign_lambda, buf, FALSE); + lambda_vec_print(s->foreign_lambda, buf2, FALSE); + fprintf(stderr, "Got foreign lambda=%s, expected: %s\n", buf, buf2); - gmx_fatal(FARGS, "Corrupted data in file %s around t=%g.", ++ gmx_fatal(FARGS, "Corrupted data in file %s around t=%f.", + filename, start_time); + } + + /* make room for the data */ + if (s->ndu_alloc < (size_t)(s->ndu + blk->sub[2].nr) ) + { + s->ndu_alloc += (s->ndu_alloc < (size_t)blk->sub[2].nr) ? + blk->sub[2].nr*2 : s->ndu_alloc; + srenew(s->du_alloc, s->ndu_alloc); + s->du = s->du_alloc; + } + startj = s->ndu; + s->ndu += blk->sub[2].nr; + s->ntot += blk->sub[2].nr; + *ndu = blk->sub[2].nr; + + /* and copy the data*/ + for (j = 0; j < blk->sub[2].nr; j++) + { + if (blk->sub[2].type == xdr_datatype_float) + { + s->du[startj+j] = blk->sub[2].fval[j]; + } + else + { + s->du[startj+j] = blk->sub[2].dval[j]; + } + } + if (start_time + blk->sub[2].nr*delta_time > *last_t) + { + *last_t = start_time + blk->sub[2].nr*delta_time; + } +} + +static samples_t *read_edr_hist_block(int *nsamples, t_enxblock *blk, + double start_time, double delta_time, + lambda_vec_t *native_lambda, double temp, + double *last_t, const char *filename) +{ + int i, j; + samples_t *s; + int nhist; + double old_foreign_lambda; + lambda_vec_t *foreign_lambda; + int type; + int nbins[2]; + + /* check the block types etc. 
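+       As read below: sub[0] holds doubles (the foreign lambda value or
+       vector, and the bin spacing dx), sub[1] holds large ints (the
+       block type, the histogram start points x0 and, for dH/dl
+       histograms, the derivative coordinate index), and sub[2..] hold
+       the bin counts of up to two (forward and reverse) histograms.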
*/ + if ( (blk->nsub < 2) || + (blk->sub[0].type != xdr_datatype_double) || + (blk->sub[1].type != xdr_datatype_large_int) || + (blk->sub[0].nr < 2) || + (blk->sub[1].nr < 2) ) + { + gmx_fatal(FARGS, - "Unexpected/corrupted block data in file %s around time %g", ++ "Unexpected/corrupted block data in file %s around time %f", + filename, start_time); + } + + nhist = blk->nsub-2; + if (nhist == 0) + { + return NULL; + } + if (nhist > 2) + { + gmx_fatal(FARGS, - "Unexpected/corrupted block data in file %s around time %g", ++ "Unexpected/corrupted block data in file %s around time %f", + filename, start_time); + } + + snew(s, 1); + *nsamples = 1; + + snew(foreign_lambda, 1); + lambda_vec_init(foreign_lambda, native_lambda->lc); + lambda_vec_copy(foreign_lambda, native_lambda); + type = (int)(blk->sub[1].lval[1]); + if (type == dhbtDH) + { + double old_foreign_lambda; + + old_foreign_lambda = blk->sub[0].dval[0]; + if (old_foreign_lambda >= 0) + { + foreign_lambda->val[0] = old_foreign_lambda; + if (foreign_lambda->lc->N > 1) + { + gmx_fatal(FARGS, + "Single-component lambda in multi-component file %s", + filename); + } + } + else + { + for (i = 0; i < native_lambda->lc->N; i++) + { + foreign_lambda->val[i] = blk->sub[0].dval[i+2]; + } + } + } + else + { + if (foreign_lambda->lc->N > 1) + { + if (blk->sub[1].nr < 3 + nhist) + { + gmx_fatal(FARGS, + "Missing derivative coord in multi-component file %s", + filename); + } + foreign_lambda->dhdl = blk->sub[1].lval[2 + nhist]; + } + else + { + foreign_lambda->dhdl = 0; + } + } + + samples_init(s, native_lambda, foreign_lambda, temp, type == dhbtDHDL, + filename); + snew(s->hist, 1); + + for (i = 0; i < nhist; i++) + { + nbins[i] = blk->sub[i+2].nr; + } + + hist_init(s->hist, nhist, nbins); + + for (i = 0; i < nhist; i++) + { + s->hist->x0[i] = blk->sub[1].lval[2+i]; + s->hist->dx[i] = blk->sub[0].dval[1]; + if (i == 1) + { + s->hist->dx[i] = -s->hist->dx[i]; + } + } + + s->hist->start_time = start_time; + s->hist->delta_time = delta_time; + s->start_time = start_time; + s->delta_time = delta_time; + + for (i = 0; i < nhist; i++) + { + int nbin; + gmx_large_int_t sum = 0; + + for (j = 0; j < s->hist->nbin[i]; j++) + { + int binv = (int)(blk->sub[i+2].ival[j]); + + s->hist->bin[i][j] = binv; + sum += binv; + + } + if (i == 0) + { + s->ntot = sum; + s->hist->sum = sum; + } + else + { + if (s->ntot != sum) + { + gmx_fatal(FARGS, "Histogram counts don't match in %s", + filename); + } + } + } + + if (start_time + s->hist->sum*delta_time > *last_t) + { + *last_t = start_time + s->hist->sum*delta_time; + } + return s; +} + + +static void read_barsim_edr(char *fn, real *temp, sim_data_t *sd) +{ + int i, j; + ener_file_t fp; + t_enxframe *fr; + int nre; + gmx_enxnm_t *enm = NULL; + double first_t = -1; + double last_t = -1; + samples_t **samples_rawdh = NULL; /* contains samples for raw delta_h */ + int *nhists = NULL; /* array to keep count & print at end */ + int *npts = NULL; /* array to keep count & print at end */ + lambda_vec_t **lambdas = NULL; /* array to keep count & print at end */ + lambda_vec_t *native_lambda; + double end_time; /* the end time of the last batch of samples */ + int nsamples = 0; + lambda_vec_t start_lambda; + + fp = open_enx(fn, "r"); + do_enxnms(fp, &nre, &enm); + snew(fr, 1); + + snew(native_lambda, 1); + start_lambda.lc = NULL; + + while (do_enx(fp, fr)) + { + /* count the data blocks */ + int nblocks_raw = 0; + int nblocks_hist = 0; + int nlam = 0; + int k; + /* DHCOLL block information: */ + double start_time = 0, delta_time 
= 0, old_start_lambda = 0, delta_lambda = 0;
+        double rtemp = 0;
+
+        /* count the blocks and handle collection information: */
+        for (i = 0; i < fr->nblock; i++)
+        {
+            if (fr->block[i].id == enxDHHIST)
+            {
+                nblocks_hist++;
+            }
+            if (fr->block[i].id == enxDH)
+            {
+                nblocks_raw++;
+            }
+            if (fr->block[i].id == enxDHCOLL)
+            {
+                nlam++;
+                if ( (fr->block[i].nsub < 1) ||
+                     (fr->block[i].sub[0].type != xdr_datatype_double) ||
+                     (fr->block[i].sub[0].nr < 5))
+                {
+                    gmx_fatal(FARGS, "Unexpected block data in file %s", fn);
+                }
+
+                /* read the data from the DHCOLL block */
+                rtemp            = fr->block[i].sub[0].dval[0];
+                start_time       = fr->block[i].sub[0].dval[1];
+                delta_time       = fr->block[i].sub[0].dval[2];
+                old_start_lambda = fr->block[i].sub[0].dval[3];
+                delta_lambda     = fr->block[i].sub[0].dval[4];
+
+                if (delta_lambda > 0)
+                {
+                    gmx_fatal(FARGS, "Lambda values not constant in %s: can't apply BAR method", fn);
+                }
+                if ( ( *temp != rtemp) && (*temp > 0) )
+                {
+                    gmx_fatal(FARGS, "Temperature in file %s different from earlier files or setting\n", fn);
+                }
+                *temp = rtemp;
+
+                if (old_start_lambda >= 0)
+                {
+                    if (sd->lc.N > 0)
+                    {
+                        if (!lambda_components_check(&(sd->lc), 0, "", 0))
+                        {
+                            gmx_fatal(FARGS,
+                                      "lambda vector components in %s don't match those previously read",
+                                      fn);
+                        }
+                    }
+                    else
+                    {
+                        lambda_components_add(&(sd->lc), "", 0);
+                    }
+                    if (!start_lambda.lc)
+                    {
+                        lambda_vec_init(&start_lambda, &(sd->lc));
+                    }
+                    start_lambda.val[0] = old_start_lambda;
+                }
+                else
+                {
+                    /* read lambda vector */
+                    int      n_lambda_vec;
+                    gmx_bool check = (sd->lc.N > 0);
+                    if (fr->block[i].nsub < 2)
+                    {
+                        gmx_fatal(FARGS,
-                                  "No lambda vector, but start_lambda=%g\n",
++                                 "No lambda vector, but start_lambda=%f\n",
+                                  old_start_lambda);
+                    }
+                    n_lambda_vec = fr->block[i].sub[1].ival[1];
+                    for (j = 0; j < n_lambda_vec; j++)
+                    {
+                        const char *name =
+                            efpt_singular_names[fr->block[i].sub[1].ival[1+j]];
+                        if (check)
+                        {
+                            /* check the components */
+                            lambda_components_check(&(sd->lc), j, name,
+                                                    strlen(name));
+                        }
+                        else
+                        {
+                            lambda_components_add(&(sd->lc), name,
+                                                  strlen(name));
+                        }
+                    }
+                    lambda_vec_init(&start_lambda, &(sd->lc));
+                    start_lambda.index = fr->block[i].sub[1].ival[0];
+                    for (j = 0; j < n_lambda_vec; j++)
+                    {
+                        start_lambda.val[j] = fr->block[i].sub[0].dval[5+j];
+                    }
+                }
+                if (first_t < 0)
+                {
+                    first_t = start_time;
+                }
+            }
+        }
+
+        if (nlam != 1)
+        {
+            gmx_fatal(FARGS, "Did not find delta H information in file %s", fn);
+        }
+        if (nblocks_raw > 0 && nblocks_hist > 0)
+        {
+            gmx_fatal(FARGS, "Can't handle both raw delta U data and histograms in the same file %s", fn);
+        }
+
+        if (nsamples > 0)
+        {
+            /* check the native lambda */
+            if (!lambda_vec_same(&start_lambda, native_lambda) )
+            {
-                gmx_fatal(FARGS, "Native lambda not constant in file %s: started at %g, and becomes %g at time %g",
-                          fn, native_lambda, start_lambda, start_time);
++                char buf[STRLEN], buf2[STRLEN];
++
++                /* the lambdas are vectors, not doubles, so they must be
++                   formatted before printing */
++                lambda_vec_print(native_lambda, buf, FALSE);
++                lambda_vec_print(&start_lambda, buf2, FALSE);
++                gmx_fatal(FARGS, "Native lambda not constant in file %s: started at %s, and becomes %s at time %f",
++                          fn, buf, buf2, start_time);
+            }
+            /* check the number of samples against the previous number */
+            if ( ((nblocks_raw+nblocks_hist) != nsamples) || (nlam != 1 ) )
+            {
+                gmx_fatal(FARGS, "Unexpected block count in %s: was %d, now %d\n",
+                          fn, nsamples+1, nblocks_raw+nblocks_hist+nlam);
+            }
+            /* check whether the last iteration's end time matches the
+               current start time */
+            if ( (fabs(last_t - start_time) > 2*delta_time) && last_t >= 0)
+            {
+                /* it didn't. 
We need to store our samples and reallocate */ + for (i = 0; i < nsamples; i++) + { + if (samples_rawdh[i]) + { + /* insert it into the existing list */ + lambda_data_list_insert_sample(sd->lb, + samples_rawdh[i]); + /* and make sure we'll allocate a new one this time + around */ + samples_rawdh[i] = NULL; + } + } + } + } + else + { + /* this is the first round; allocate the associated data + structures */ + /*native_lambda=start_lambda;*/ + lambda_vec_init(native_lambda, &(sd->lc)); + lambda_vec_copy(native_lambda, &start_lambda); + nsamples = nblocks_raw+nblocks_hist; + snew(nhists, nsamples); + snew(npts, nsamples); + snew(lambdas, nsamples); + snew(samples_rawdh, nsamples); + for (i = 0; i < nsamples; i++) + { + nhists[i] = 0; + npts[i] = 0; + lambdas[i] = NULL; + samples_rawdh[i] = NULL; /* init to NULL so we know which + ones contain values */ + } + } + + /* and read them */ + k = 0; /* counter for the lambdas, etc. arrays */ + for (i = 0; i < fr->nblock; i++) + { + if (fr->block[i].id == enxDH) + { + int type = (fr->block[i].sub[0].ival[0]); + if (type == dhbtDH || type == dhbtDHDL) + { + int ndu; + read_edr_rawdh_block(&(samples_rawdh[k]), + &ndu, + &(fr->block[i]), + start_time, delta_time, + native_lambda, rtemp, + &last_t, fn); + npts[k] += ndu; + if (samples_rawdh[k]) + { + lambdas[k] = samples_rawdh[k]->foreign_lambda; + } + k++; + } + } + else if (fr->block[i].id == enxDHHIST) + { + int type = (int)(fr->block[i].sub[1].lval[1]); + if (type == dhbtDH || type == dhbtDHDL) + { + int j; + int nb = 0; + samples_t *s; /* this is where the data will go */ + s = read_edr_hist_block(&nb, &(fr->block[i]), + start_time, delta_time, + native_lambda, rtemp, + &last_t, fn); + nhists[k] += nb; + if (nb > 0) + { + lambdas[k] = s->foreign_lambda; + } + k++; + /* and insert the new sample immediately */ + for (j = 0; j < nb; j++) + { + lambda_data_list_insert_sample(sd->lb, s+j); + } + } + } + } + } + /* Now store all our extant sample collections */ + for (i = 0; i < nsamples; i++) + { + if (samples_rawdh[i]) + { + /* insert it into the existing list */ + lambda_data_list_insert_sample(sd->lb, samples_rawdh[i]); + } + } + + + { + char buf[STRLEN]; + printf("\n"); + lambda_vec_print(native_lambda, buf, FALSE); + printf("%s: %.1f - %.1f; lambda = %s\n foreign lambdas:\n", + fn, first_t, last_t, buf); + for (i = 0; i < nsamples; i++) + { + if (lambdas[i]) + { + lambda_vec_print(lambdas[i], buf, TRUE); + if (nhists[i] > 0) + { + printf(" %s (%d hists)\n", buf, nhists[i]); + } + else + { + printf(" %s (%d pts)\n", buf, npts[i]); + } + } + } + } + printf("\n\n"); + sfree(npts); + sfree(nhists); + sfree(lambdas); +} + + +int gmx_bar(int argc, char *argv[]) +{ + static const char *desc[] = { + "[TT]g_bar[tt] calculates free energy difference estimates through ", + "Bennett's acceptance ratio method (BAR). It also automatically", + "adds series of individual free energies obtained with BAR into", + "a combined free energy estimate.[PAR]", + + "Every individual BAR free energy difference relies on two ", + "simulations at different states: say state A and state B, as", + "controlled by a parameter, [GRK]lambda[grk] (see the [TT].mdp[tt] parameter", + "[TT]init_lambda[tt]). The BAR method calculates a ratio of weighted", + "average of the Hamiltonian difference of state B given state A and", + "vice versa.", + "The energy differences to the other state must be calculated", + "explicitly during the simulation. 
This can be done with", + "the [TT].mdp[tt] option [TT]foreign_lambda[tt].[PAR]", + + "Input option [TT]-f[tt] expects multiple [TT]dhdl.xvg[tt] files. ", + "Two types of input files are supported:[BR]", + "[TT]*[tt] Files with more than one [IT]y[it]-value. ", + "The files should have columns ", + "with dH/d[GRK]lambda[grk] and [GRK]Delta[grk][GRK]lambda[grk]. ", + "The [GRK]lambda[grk] values are inferred ", + "from the legends: [GRK]lambda[grk] of the simulation from the legend of ", + "dH/d[GRK]lambda[grk] and the foreign [GRK]lambda[grk] values from the ", + "legends of Delta H", + "[BR]", + "[TT]*[tt] Files with only one [IT]y[it]-value. Using the", + "[TT]-extp[tt] option for these files, it is assumed", + "that the [IT]y[it]-value is dH/d[GRK]lambda[grk] and that the ", + "Hamiltonian depends linearly on [GRK]lambda[grk]. ", + "The [GRK]lambda[grk] value of the simulation is inferred from the ", + "subtitle (if present), otherwise from a number in the subdirectory ", + "in the file name.[PAR]", + + "The [GRK]lambda[grk] of the simulation is parsed from ", + "[TT]dhdl.xvg[tt] file's legend containing the string 'dH', the ", + "foreign [GRK]lambda[grk] values from the legend containing the ", + "capitalized letters 'D' and 'H'. The temperature is parsed from ", + "the legend line containing 'T ='.[PAR]", + + "The input option [TT]-g[tt] expects multiple [TT].edr[tt] files. ", + "These can contain either lists of energy differences (see the ", + "[TT].mdp[tt] option [TT]separate_dhdl_file[tt]), or a series of ", + "histograms (see the [TT].mdp[tt] options [TT]dh_hist_size[tt] and ", + "[TT]dh_hist_spacing[tt]).", "The temperature and [GRK]lambda[grk] ", + "values are automatically deduced from the [TT]ener.edr[tt] file.[PAR]", + + "In addition to the [TT].mdp[tt] option [TT]foreign_lambda[tt], ", + "the energy difference can also be extrapolated from the ", + "dH/d[GRK]lambda[grk] values. This is done with the[TT]-extp[tt]", + "option, which assumes that the system's Hamiltonian depends linearly", + "on [GRK]lambda[grk], which is not normally the case.[PAR]", + + "The free energy estimates are determined using BAR with bisection, ", + "with the precision of the output set with [TT]-prec[tt]. ", + "An error estimate taking into account time correlations ", + "is made by splitting the data into blocks and determining ", + "the free energy differences over those blocks and assuming ", + "the blocks are independent. ", + "The final error estimate is determined from the average variance ", + "over 5 blocks. A range of block numbers for error estimation can ", + "be provided with the options [TT]-nbmin[tt] and [TT]-nbmax[tt].[PAR]", + + "[TT]g_bar[tt] tries to aggregate samples with the same 'native' and ", + "'foreign' [GRK]lambda[grk] values, but always assumes independent ", + "samples. [BB]Note[bb] that when aggregating energy ", + "differences/derivatives with different sampling intervals, this is ", + "almost certainly not correct. Usually subsequent energies are ", + "correlated and different time intervals mean different degrees ", + "of correlation between samples.[PAR]", + + "The results are split in two parts: the last part contains the final ", + "results in kJ/mol, together with the error estimate for each part ", + "and the total. The first part contains detailed free energy ", + "difference estimates and phase space overlap measures in units of ", + "kT (together with their computed error estimate). 
The printed ",
+        "values are:[BR]",
+        "[TT]*[tt] lam_A: the [GRK]lambda[grk] values for point A.[BR]",
+        "[TT]*[tt] lam_B: the [GRK]lambda[grk] values for point B.[BR]",
+        "[TT]*[tt] DG: the free energy estimate.[BR]",
+        "[TT]*[tt] s_A: an estimate of the relative entropy of B in A.[BR]",
-        "[TT]*[tt] s_A: an estimate of the relative entropy of A in B.[BR]",
++       "[TT]*[tt] s_B: an estimate of the relative entropy of A in B.[BR]",
+        "[TT]*[tt] stdev: an estimate of the expected per-sample standard deviation.[PAR]",
+
+        "The relative entropy of both states in each other's ensemble can be ",
+        "interpreted as a measure of phase space overlap: ",
+        "the relative entropy s_A of the work samples of lambda_B in the ",
+        "ensemble of lambda_A (and vice versa for s_B) is a ",
+        "measure of the 'distance' between Boltzmann distributions of ",
+        "the two states, that goes to zero for identical distributions. See ",
+        "Wu & Kofke, J. Chem. Phys. 123 084109 (2005) for more information.",
+        "[PAR]",
+        "The estimate of the expected per-sample standard deviation is given ",
+        "in Bennett's original BAR paper: Bennett, J. Comp. Phys. 22, p 245 (1976).",
+        "Eq. 10 therein gives an estimate of the quality of sampling (not directly",
+        "of the actual statistical error, because it assumes independent samples).[PAR]",
+
+        "To get a visual estimate of the phase space overlap, use the ",
+        "[TT]-oh[tt] option to write series of histograms, together with the ",
+        "[TT]-nbin[tt] option.[PAR]"
+    };
+    static real begin = 0, end = -1, temp = -1;
+    int nd = 2, nbmin = 5, nbmax = 5;
+    int nbin = 100;
+    gmx_bool use_dhdl = FALSE;
+    gmx_bool calc_s, calc_v;
+    t_pargs pa[] = {
+        { "-b", FALSE, etREAL, {&begin}, "Begin time for BAR" },
+        { "-e", FALSE, etREAL, {&end}, "End time for BAR" },
+        { "-temp", FALSE, etREAL, {&temp}, "Temperature (K)" },
+        { "-prec", FALSE, etINT, {&nd}, "The number of digits after the decimal point" },
+        { "-nbmin", FALSE, etINT, {&nbmin}, "Minimum number of blocks for error estimation" },
+        { "-nbmax", FALSE, etINT, {&nbmax}, "Maximum number of blocks for error estimation" },
+        { "-nbin", FALSE, etINT, {&nbin}, "Number of bins for histogram output"},
+        { "-extp", FALSE, etBOOL, {&use_dhdl}, "Whether to linearly extrapolate dH/dl values to use as energies"}
+    };
+
+    t_filenm fnm[] = {
+        { efXVG, "-f", "dhdl", ffOPTRDMULT },
+        { efEDR, "-g", "ener", ffOPTRDMULT },
+        { efXVG, "-o", "bar", ffOPTWR },
+        { efXVG, "-oi", "barint", ffOPTWR },
+        { efXVG, "-oh", "histogram", ffOPTWR }
+    };
+#define NFILE asize(fnm)
+
+    int f, i, j;
+    int nf = 0; /* file counter */
+    int nbs;
+    int nfile_tot; /* total number of input files */
+    int nxvgfile = 0;
+    int nedrfile = 0;
+    char **fxvgnms;
+    char **fedrnms;
+    sim_data_t sim_data; /* the simulation data */
+    barres_t *results; /* the results */
+    int nresults; /* number of results in results array */
+
+    double *partsum;
+    double prec, dg_tot, dg, sig, dg_tot_max, dg_tot_min;
+    FILE *fpb, *fpi;
+    char dgformat[20], xvg2format[STRLEN], xvg3format[STRLEN];
+    char buf[STRLEN], buf2[STRLEN];
+    char ktformat[STRLEN], sktformat[STRLEN];
+    char kteformat[STRLEN], skteformat[STRLEN];
+    output_env_t oenv;
+    double kT, beta;
+    gmx_bool result_OK = TRUE, bEE = TRUE;
+
+    gmx_bool disc_err = FALSE;
+    double sum_disc_err = 0.; /* discretization error */
+    gmx_bool histrange_err = FALSE;
+    double sum_histrange_err = 0.; /* histogram range error */
+    double stat_err = 0.; /* statistical error */
+
+    parse_common_args(&argc, argv,
+                      PCA_CAN_VIEW,
+                      NFILE, fnm, asize(pa), pa, asize(desc), desc, 0, NULL, &oenv);
+
+    if (opt2bSet("-f", NFILE, fnm))
+    {
+        nxvgfile = opt2fns(&fxvgnms, "-f", NFILE, fnm);
+    }
+    if (opt2bSet("-g", NFILE, fnm))
+    {
+        nedrfile = opt2fns(&fedrnms, "-g", NFILE, fnm);
+    }
+
+    sim_data_init(&sim_data);
+#if 0
+    /* make linked list */
+    lb = &lambda_head;
+    lambda_data_init(lb, 0, 0);
+    lb->next = lb;
+    lb->prev = lb;
+#endif
+
+
+    nfile_tot = nxvgfile + nedrfile;
+
+    if (nfile_tot == 0)
+    {
+        gmx_fatal(FARGS, "No input files!");
+    }
+
+    if (nd < 0)
+    {
+        gmx_fatal(FARGS, "Can not have negative number of digits");
+    }
+    prec = pow(10, -nd);
+
+    snew(partsum, (nbmax+1)*(nbmax+1));
+    nf = 0;
+
+    /* read in all files. First xvg files */
+    for (f = 0; f < nxvgfile; f++)
+    {
+        read_bar_xvg(fxvgnms[f], &temp, &sim_data);
+        nf++;
+    }
+    /* then .edr files */
+    for (f = 0; f < nedrfile; f++)
+    {
+        read_barsim_edr(fedrnms[f], &temp, &sim_data);
+        nf++;
+    }
+
+    /* fix the times to allow for equilibration */
+    sim_data_impose_times(&sim_data, begin, end);
+
+    if (opt2bSet("-oh", NFILE, fnm))
+    {
+        sim_data_histogram(&sim_data, opt2fn("-oh", NFILE, fnm), nbin, oenv);
+    }
+
+    /* assemble the output structures from the lambdas */
+    results = barres_list_create(&sim_data, &nresults, use_dhdl);
+
+    sum_disc_err = barres_list_max_disc_err(results, nresults);
+
+    if (nresults == 0)
+    {
+        printf("\nNo results to calculate.\n");
+        return 0;
+    }
+
+    if (sum_disc_err > prec)
+    {
+        prec = sum_disc_err;
+        nd = ceil(-log10(prec));
+        printf("WARNING: setting the precision to %g because that is the minimum\n reasonable number, given the expected discretization error.\n", prec);
+    }
+
+
+    /*sprintf(lamformat,"%%6.3f");*/
+    sprintf( dgformat, "%%%d.%df", 3+nd, nd);
+    /* the format strings of the results in kT */
+    sprintf( ktformat, "%%%d.%df", 5+nd, nd);
+    sprintf( sktformat, "%%%ds", 6+nd);
+    /* the format strings of the errors in kT */
+    sprintf( kteformat, "%%%d.%df", 3+nd, nd);
+    sprintf( skteformat, "%%%ds", 4+nd);
+    sprintf(xvg2format, "%s %s\n", "%s", dgformat);
+    sprintf(xvg3format, "%s %s %s\n", "%s", dgformat, dgformat);
+
+
+
+    fpb = NULL;
+    if (opt2bSet("-o", NFILE, fnm))
+    {
+        sprintf(buf, "%s (%s)", "\\DeltaG", "kT");
+        fpb = xvgropen_type(opt2fn("-o", NFILE, fnm), "Free energy differences",
+                            "\\lambda", buf, exvggtXYDY, oenv);
+    }
+
+    fpi = NULL;
+    if (opt2bSet("-oi", NFILE, fnm))
+    {
+        sprintf(buf, "%s (%s)", "\\DeltaG", "kT");
+        fpi = xvgropen(opt2fn("-oi", NFILE, fnm), "Free energy integral",
+                       "\\lambda", buf, oenv);
+    }
+
+
+
+    if (nbmin > nbmax)
+    {
+        nbmin = nbmax;
+    }
+
+    /* first calculate results */
+    bEE = TRUE;
+    disc_err = FALSE;
+    for (f = 0; f < nresults; f++)
+    {
+        /* Determine the free energy difference with a factor of 10
+         * more accuracy than requested for printing.
+         */
+        calc_bar(&(results[f]), 0.1*prec, nbmin, nbmax,
+                 &bEE, partsum);
+
+        if (results[f].dg_disc_err > prec/10.)
+        {
+            disc_err = TRUE;
+        }
+        if (results[f].dg_histrange_err > prec/10.)
+        {
+            histrange_err = TRUE;
+        }
+    }
+
+    /* print results in kT */
+    kT = BOLTZ*temp;
+    beta = 1/kT;
+
+    printf("\nTemperature: %g K\n", temp);
+
+    printf("\nDetailed results in kT (see help for explanation):\n\n");
+    printf("%6s ", " lam_A");
+    printf("%6s ", " lam_B");
+    printf(sktformat, "DG ");
+    if (bEE)
+    {
+        printf(skteformat, "+/- ");
+    }
+    if (disc_err)
+    {
+        printf(skteformat, "disc ");
+    }
+    if (histrange_err)
+    {
+        printf(skteformat, "range ");
+    }
+    printf(sktformat, "s_A ");
+    if (bEE)
+    {
+        printf(skteformat, "+/- " );
+    }
+    printf(sktformat, "s_B ");
+    if (bEE)
+    {
+        printf(skteformat, "+/- " );
+    }
+    printf(sktformat, "stdev ");
+    if (bEE)
+    {
+        printf(skteformat, "+/- ");
+    }
+    printf("\n");
+    for (f = 0; f < nresults; f++)
+    {
+        lambda_vec_print_short(results[f].a->native_lambda, buf);
+        printf("%s ", buf);
+        lambda_vec_print_short(results[f].b->native_lambda, buf);
+        printf("%s ", buf);
+        printf(ktformat, results[f].dg);
+        printf(" ");
+        if (bEE)
+        {
+            printf(kteformat, results[f].dg_err);
+            printf(" ");
+        }
+        if (disc_err)
+        {
+            printf(kteformat, results[f].dg_disc_err);
+            printf(" ");
+        }
+        if (histrange_err)
+        {
+            printf(kteformat, results[f].dg_histrange_err);
+            printf(" ");
+        }
+        printf(ktformat, results[f].sa);
+        printf(" ");
+        if (bEE)
+        {
+            printf(kteformat, results[f].sa_err);
+            printf(" ");
+        }
+        printf(ktformat, results[f].sb);
+        printf(" ");
+        if (bEE)
+        {
+            printf(kteformat, results[f].sb_err);
+            printf(" ");
+        }
+        printf(ktformat, results[f].dg_stddev);
+        printf(" ");
+        if (bEE)
+        {
+            printf(kteformat, results[f].dg_stddev_err);
+        }
+        printf("\n");
+
+        /* Check for negative relative entropy with a 95% certainty. */
+        if (results[f].sa < -2*results[f].sa_err ||
+            results[f].sb < -2*results[f].sb_err)
+        {
+            result_OK = FALSE;
+        }
+    }
+
+    if (!result_OK)
+    {
+        printf("\nWARNING: Some of these results violate the Second Law of "
+               "Thermodynamics: \n"
+               " This can be the result of severe undersampling, or "
+               "(more likely)\n"
+               " there is something wrong with the simulations.\n");
+    }
+
+
+    /* final results in kJ/mol */
+    printf("\n\nFinal results in kJ/mol:\n\n");
+    dg_tot = 0;
+    for (f = 0; f < nresults; f++)
+    {
+
+        if (fpi != NULL)
+        {
+            lambda_vec_print_short(results[f].a->native_lambda, buf);
+            fprintf(fpi, xvg2format, buf, dg_tot);
+        }
+
+
+        if (fpb != NULL)
+        {
+            lambda_vec_print_intermediate(results[f].a->native_lambda,
+                                          results[f].b->native_lambda,
+                                          buf);
+
+            fprintf(fpb, xvg3format, buf, results[f].dg, results[f].dg_err);
+        }
+
+        printf("point ");
+        lambda_vec_print_short(results[f].a->native_lambda, buf);
+        lambda_vec_print_short(results[f].b->native_lambda, buf2);
+        printf("%s - %s", buf, buf2);
+        printf(", DG ");
+
+        printf(dgformat, results[f].dg*kT);
+        if (bEE)
+        {
+            printf(" +/- ");
+            printf(dgformat, results[f].dg_err*kT);
+        }
+        if (histrange_err)
+        {
+            printf(" (max. range err. 
= "); + printf(dgformat, results[f].dg_histrange_err*kT); + printf(")"); + sum_histrange_err += results[f].dg_histrange_err*kT; + } + + printf("\n"); + dg_tot += results[f].dg; + } + printf("\n"); + printf("total "); + lambda_vec_print_short(results[0].a->native_lambda, buf); + lambda_vec_print_short(results[nresults-1].b->native_lambda, buf2); + printf("%s - %s", buf, buf2); + printf(", DG "); + + printf(dgformat, dg_tot*kT); + if (bEE) + { + stat_err = bar_err(nbmin, nbmax, partsum)*kT; + printf(" +/- "); + printf(dgformat, max(max(stat_err, sum_disc_err), sum_histrange_err)); + } + printf("\n"); + if (disc_err) + { + printf("\nmaximum discretization error = "); + printf(dgformat, sum_disc_err); + if (bEE && stat_err < sum_disc_err) + { + printf("WARNING: discretization error (%g) is larger than statistical error.\n Decrease histogram spacing for more accurate results\n", stat_err); + } + } + if (histrange_err) + { + printf("\nmaximum histogram range error = "); + printf(dgformat, sum_histrange_err); + if (bEE && stat_err < sum_histrange_err) + { + printf("WARNING: histogram range error (%g) is larger than statistical error.\n Increase histogram range for more accurate results\n", stat_err); + } + + } + printf("\n"); + + + if (fpi != NULL) + { + lambda_vec_print_short(results[nresults-1].b->native_lambda, buf); + fprintf(fpi, xvg2format, buf, dg_tot); + ffclose(fpi); + } + + do_view(oenv, opt2fn_null("-o", NFILE, fnm), "-xydy"); + do_view(oenv, opt2fn_null("-oi", NFILE, fnm), "-xydy"); + + thanx(stderr); + + return 0; +} diff --cc src/gromacs/gmxana/gmx_genion.c index 4a24ea2038,0000000000..2302ac8f1d mode 100644,000000..100644 --- a/src/gromacs/gmxana/gmx_genion.c +++ b/src/gromacs/gmxana/gmx_genion.c @@@ -1,661 -1,0 +1,559 @@@ +/* + * + * This source code is part of + * + * G R O M A C S + * + * GROningen MAchine for Chemical Simulations + * + * VERSION 3.2.0 + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others. + * Copyright (c) 1991-2000, University of Groningen, The Netherlands. + * Copyright (c) 2001-2004, The GROMACS development team, + * check out http://www.gromacs.org for more information. + + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * If you want to redistribute modifications, please consider that + * scientific software is very special. Version control is crucial - + * bugs must be traceable. We will be happy to consider code for + * inclusion in the official distribution, but derived work must not + * be called official GROMACS. Details are found in the README & COPYING + * files - if they are missing, get the official version at www.gromacs.org. + * + * To help us fund GROMACS development, we humbly ask that you cite + * the papers on the package - you can find them in the top README file. 
+ * + * For more info, check our website at http://www.gromacs.org + * + * And Hey: + * Green Red Orange Magenta Azure Cyan Skyblue + */ +#ifdef HAVE_CONFIG_H +#include +#endif + +#include +#include "copyrite.h" +#include "string2.h" +#include "smalloc.h" +#include "sysstuff.h" +#include "confio.h" +#include "statutil.h" +#include "pbc.h" +#include "force.h" +#include "gmx_fatal.h" +#include "futil.h" +#include "maths.h" +#include "macros.h" - #include "physics.h" +#include "vec.h" +#include "tpxio.h" +#include "mdrun.h" - #include "calcpot.h" +#include "main.h" +#include "random.h" +#include "index.h" +#include "mtop_util.h" +#include "gmx_ana.h" + +static int greatest_common_divisor(int p, int q) +{ + int tmp; + while (q != 0) + { + tmp = q; + q = p % q; + p = tmp; + } + return p; +} + +static void insert_ion(int nsa, int *nwater, + gmx_bool bSet[], int repl[], atom_id index[], - real pot[], rvec x[], t_pbc *pbc, ++ rvec x[], t_pbc *pbc, + int sign, int q, const char *ionname, - t_mdatoms *mdatoms, - real rmin, gmx_bool bRandom, int *seed) ++ t_atoms *atoms, ++ real rmin, int *seed) +{ - int i, ii, ei, owater, wlast, m, nw; - real extr_e, poti, rmin2; - rvec xei, dx; - gmx_bool bSub = FALSE; ++ int i, ei,nw; ++ real rmin2; ++ rvec dx; + gmx_large_int_t maxrand; + + ei = -1; + nw = *nwater; + maxrand = nw; + maxrand *= 1000; - if (bRandom) ++ ++ do + { - do - { - ei = nw*rando(seed); - maxrand--; - } - while (bSet[ei] && (maxrand > 0)); - if (bSet[ei]) - { - gmx_fatal(FARGS, "No more replaceable solvent!"); - } ++ ei = nw*rando(seed); ++ maxrand--; + } - else ++ while (bSet[ei] && (maxrand > 0)); ++ if (bSet[ei]) + { - extr_e = 0; - for (i = 0; (i < nw); i++) - { - if (!bSet[i]) - { - ii = index[nsa*i]; - poti = pot[ii]; - if (q > 0) - { - if ((poti <= extr_e) || !bSub) - { - extr_e = poti; - ei = i; - bSub = TRUE; - } - } - else - { - if ((poti >= extr_e) || !bSub) - { - extr_e = poti; - ei = i; - bSub = TRUE; - } - } - } - } - if (ei == -1) - { - gmx_fatal(FARGS, "No more replaceable solvent!"); - } ++ gmx_fatal(FARGS, "No more replaceable solvent!"); + } ++ + fprintf(stderr, "Replacing solvent molecule %d (atom %d) with %s\n", + ei, index[nsa*ei], ionname); + + /* Replace solvent molecule charges with ion charge */ + bSet[ei] = TRUE; + repl[ei] = sign; - mdatoms->chargeA[index[nsa*ei]] = q; ++ ++ atoms->atom[index[nsa*ei]].q = q; + for (i = 1; i < nsa; i++) + { - mdatoms->chargeA[index[nsa*ei+i]] = 0; ++ atoms->atom[index[nsa*ei+i]].q = 0; + } + + /* Mark all solvent molecules within rmin as unavailable for substitution */ + if (rmin > 0) + { + rmin2 = rmin*rmin; + for (i = 0; (i < nw); i++) + { + if (!bSet[i]) + { + pbc_dx(pbc, x[index[nsa*ei]], x[index[nsa*i]], dx); + if (iprod(dx, dx) < rmin2) + { + bSet[i] = TRUE; + } + } + } + } +} + ++ +static char *aname(const char *mname) +{ + char *str; + int i; + + str = strdup(mname); + i = strlen(str)-1; + while (i > 1 && (isdigit(str[i]) || (str[i] == '+') || (str[i] == '-'))) + { + str[i] = '\0'; + i--; + } + + return str; +} + +void sort_ions(int nsa, int nw, int repl[], atom_id index[], + t_atoms *atoms, rvec x[], + const char *p_name, const char *n_name) +{ + int i, j, k, r, np, nn, starta, startr, npi, nni; + rvec *xt; + char **pptr = NULL, **nptr = NULL, **paptr = NULL, **naptr = NULL; + + snew(xt, atoms->nr); + + /* Put all the solvent in front and count the added ions */ + np = 0; + nn = 0; + j = index[0]; + for (i = 0; i < nw; i++) + { + r = repl[i]; + if (r == 0) + { + for (k = 0; k < nsa; k++) + { + copy_rvec(x[index[nsa*i+k]], 
xt[j++]);
+            }
+        }
+        else if (r > 0)
+        {
+            np++;
+        }
+        else if (r < 0)
+        {
+            nn++;
+        }
+    }
+
+    if (np+nn > 0)
+    {
+        /* Put the positive and negative ions at the end */
+        starta = index[nsa*(nw - np - nn)];
+        startr = atoms->atom[starta].resind;
+
+        if (np)
+        {
+            snew(pptr, 1);
+            pptr[0] = strdup(p_name);
+            snew(paptr, 1);
+            paptr[0] = aname(p_name);
+        }
+        if (nn)
+        {
+            snew(nptr, 1);
+            nptr[0] = strdup(n_name);
+            snew(naptr, 1);
+            naptr[0] = aname(n_name);
+        }
+        npi = 0;
+        nni = 0;
+        for (i = 0; i < nw; i++)
+        {
+            r = repl[i];
+            if (r > 0)
+            {
+                j = starta+npi;
+                k = startr+npi;
+                copy_rvec(x[index[nsa*i]], xt[j]);
+                atoms->atomname[j] = paptr;
+                atoms->atom[j].resind = k;
+                atoms->resinfo[k].name = pptr;
+                npi++;
+            }
+            else if (r < 0)
+            {
+                j = starta+np+nni;
+                k = startr+np+nni;
+                copy_rvec(x[index[nsa*i]], xt[j]);
+                atoms->atomname[j] = naptr;
+                atoms->atom[j].resind = k;
+                atoms->resinfo[k].name = nptr;
+                nni++;
+            }
+        }
+        for (i = index[nsa*nw-1]+1; i < atoms->nr; i++)
+        {
+            j = i-(nsa-1)*(np+nn);
+            atoms->atomname[j] = atoms->atomname[i];
+            atoms->atom[j] = atoms->atom[i];
+            copy_rvec(x[i], xt[j]);
+        }
+        atoms->nr -= (nsa-1)*(np+nn);
+
+        /* Copy the new positions back */
+        for (i = index[0]; i < atoms->nr; i++)
+        {
+            copy_rvec(xt[i], x[i]);
+        }
+        sfree(xt);
+    }
+}
+
+static void update_topol(const char *topinout, int p_num, int n_num,
+                         const char *p_name, const char *n_name, char *grpname)
+{
+#define TEMP_FILENM "temp.top"
+    FILE *fpin, *fpout;
+    char buf[STRLEN], buf2[STRLEN], *temp, **mol_line = NULL;
+    int line, i, nsol, nmol_line, sol_line, nsol_last;
+    gmx_bool bMolecules;
+
+    printf("\nProcessing topology\n");
+    fpin = ffopen(topinout, "r");
+    fpout = ffopen(TEMP_FILENM, "w");
+
+    line = 0;
+    bMolecules = FALSE;
+    nmol_line = 0;
+    sol_line = -1;
+    nsol_last = -1;
+    while (fgets(buf, STRLEN, fpin))
+    {
+        line++;
+        strcpy(buf2, buf);
+        if ((temp = strchr(buf2, '\n')) != NULL)
+        {
+            temp[0] = '\0';
+        }
+        ltrim(buf2);
+        if (buf2[0] == '[')
+        {
+            buf2[0] = ' ';
+            if ((temp = strchr(buf2, '\n')) != NULL)
+            {
+                temp[0] = '\0';
+            }
+            rtrim(buf2);
+            if (buf2[strlen(buf2)-1] == ']')
+            {
+                buf2[strlen(buf2)-1] = '\0';
+                ltrim(buf2);
+                rtrim(buf2);
+                bMolecules = (gmx_strcasecmp(buf2, "molecules") == 0);
+            }
+            fprintf(fpout, "%s", buf);
+        }
+        else if (!bMolecules)
+        {
+            fprintf(fpout, "%s", buf);
+        }
+        else
+        {
+            /* Check if this is a line with solvent molecules */
+            sscanf(buf, "%s", buf2);
+            if (gmx_strcasecmp(buf2, grpname) == 0)
+            {
+                sol_line = nmol_line;
+                sscanf(buf, "%*s %d", &nsol_last);
+            }
+            /* Store this molecules section line */
+            srenew(mol_line, nmol_line+1);
+            mol_line[nmol_line] = strdup(buf);
+            nmol_line++;
+        }
+    }
+    ffclose(fpin);
+
+    if (sol_line == -1)
+    {
+        ffclose(fpout);
+        gmx_fatal(FARGS, "No line with moleculetype '%s' found in the [ molecules ] section of file '%s'", grpname, topinout);
+    }
+    if (nsol_last < p_num+n_num)
+    {
+        ffclose(fpout);
+        gmx_fatal(FARGS, "The last entry for moleculetype '%s' in the [ molecules ] section of file '%s' has fewer solvent molecules (%d) than were replaced (%d)", grpname, topinout, nsol_last, p_num+n_num);
+    }
+
+    /* Print all the molecule entries */
+    for (i = 0; i < nmol_line; i++)
+    {
+        if (i != sol_line)
+        {
+            fprintf(fpout, "%s", mol_line[i]);
+        }
+        else
+        {
+            printf("Replacing %d solvent molecules in topology file (%s) "
+                   "by %d %s and %d %s ions.\n",
+                   p_num+n_num, topinout, p_num, p_name, n_num, n_name);
+            nsol_last -= p_num + n_num;
+            if (nsol_last > 0)
+            {
+                fprintf(fpout, "%-10s %d\n", grpname, nsol_last);
+            }
+            if (p_num > 0)
+            {
+                fprintf(fpout, "%-15s %d\n", p_name, p_num);
+            }
+            if (n_num > 0)
+            {
+                fprintf(fpout, "%-15s %d\n", n_name, n_num);
+            }
+        }
+    }
+    ffclose(fpout);
+    /* use ffopen to generate backup of topinout */
+    fpout = ffopen(topinout, "w");
+    ffclose(fpout);
+    rename(TEMP_FILENM, topinout);
+#undef TEMP_FILENM
+}
+
+int gmx_genion(int argc, char *argv[])
+{
+    const char *desc[] = {
-        "[TT]genion[tt] replaces solvent molecules by monoatomic ions at",
-        "the position of the first atoms with the most favorable electrostatic",
-        "potential or at random. The potential is calculated on all atoms, using",
-        "normal GROMACS particle-based methods (in contrast to other methods",
-        "based on solving the Poisson-Boltzmann equation).",
-        "The potential is recalculated after every ion insertion.",
-        "If specified in the run input file, a reaction field, shift function",
-        "or user function can be used. For the user function a table file",
-        "can be specified with the option [TT]-table[tt].",
++       "[TT]genion[tt] randomly replaces solvent molecules with monoatomic ions.",
+        "The group of solvent molecules should be continuous and all molecules",
+        "should have the same number of atoms.",
+        "The user should add the ion molecules to the topology file or use",
+        "the [TT]-p[tt] option to automatically modify the topology.[PAR]",
+        "The ion molecule type, residue and atom names in all force fields",
+        "are the capitalized element names without sign. This molecule name",
+        "should be given with [TT]-pname[tt] or [TT]-nname[tt], and the",
+        "[TT][molecules][tt] section of your topology updated accordingly,",
+        "either by hand or with [TT]-p[tt]. Do not use an atom name instead!",
+        "[PAR]Ions which can have multiple charge states get the multiplicity",
+        "added, without sign, for the uncommon states only.[PAR]",
-        "With the option [TT]-pot[tt] the potential can be written as B-factors",
-        "in a [TT].pdb[tt] file (for visualisation using e.g. Rasmol).",
-        "The unit of the potential is 1000 kJ/(mol e), the scaling be changed",
-        "with the [TT]-scale[tt] option.[PAR]",
+        "For larger ions, e.g. sulfate, we recommend using [TT]genbox[tt]."
+    };
+    const char *bugs[] = {
-        "Calculation of the potential is not reliable, therefore the [TT]-random[tt] option is now turned on by default.",
-        "If you specify a salt concentration existing ions are not taken into account. In effect you therefore specify the amount of salt to be added."
++       "If you specify a salt concentration, existing ions are not taken into "
++       "account. In effect you therefore specify the amount of salt to be added.",
+    };
+    static int p_num = 0, n_num = 0, p_q = 1, n_q = -1;
+    static const char *p_name = "NA", *n_name = "CL";
-    static real rmin = 0.6, scale = 0.001, conc = 0;
++   static real rmin = 0.6, conc = 0;
+    static int seed = 1993;
-    static gmx_bool bRandom = TRUE, bNeutral = FALSE;
++   static gmx_bool bNeutral = FALSE;
+    static t_pargs pa[] = {
+        { "-np", FALSE, etINT, {&p_num}, "Number of positive ions" },
+        { "-pname", FALSE, etSTR, {&p_name}, "Name of the positive ion" },
+        { "-pq", FALSE, etINT, {&p_q}, "Charge of the positive ion" },
+        { "-nn", FALSE, etINT, {&n_num}, "Number of negative ions" },
+        { "-nname", FALSE, etSTR, {&n_name}, "Name of the negative ion" },
+        { "-nq", FALSE, etINT, {&n_q}, "Charge of the negative ion" },
+        { "-rmin", FALSE, etREAL, {&rmin}, "Minimum distance between ions" },
-        { "-random", FALSE, etBOOL, {&bRandom}, "Use random placement of ions instead of based on potential. 
The rmin option should still work" }, + { "-seed", FALSE, etINT, {&seed}, "Seed for random number generator" }, - { "-scale", FALSE, etREAL, {&scale}, "Scaling factor for the potential for [TT]-pot[tt]" }, + { "-conc", FALSE, etREAL, {&conc}, + "Specify salt concentration (mol/liter). This will add sufficient ions to reach up to the specified concentration as computed from the volume of the cell in the input [TT].tpr[tt] file. Overrides the [TT]-np[tt] and [TT]-nn[tt] options." }, + { "-neutral", FALSE, etBOOL, {&bNeutral}, "This option will add enough ions to neutralize the system. These ions are added on top of those specified with [TT]-np[tt]/[TT]-nn[tt] or [TT]-conc[tt]. "} + }; - gmx_mtop_t *mtop; - gmx_localtop_t *top; - t_inputrec inputrec; - t_commrec *cr; - t_mdatoms *mdatoms; - gmx_enerdata_t enerd; - t_graph *graph; - t_forcerec *fr; ++ t_topology top; + rvec *x, *v; - real *pot, vol, qtot; ++ real vol, qtot; + matrix box; + t_atoms atoms; + t_pbc pbc; - int *repl; ++ int *repl, ePBC; + atom_id *index; - char *grpname; - gmx_bool *bSet, bPDB; ++ char *grpname, title[STRLEN]; ++ gmx_bool *bSet; + int i, nw, nwa, nsa, nsalt, iqtot; - FILE *fplog; + output_env_t oenv; + t_filenm fnm[] = { + { efTPX, NULL, NULL, ffREAD }, - { efXVG, "-table", "table", ffOPTRD }, + { efNDX, NULL, NULL, ffOPTRD }, + { efSTO, "-o", NULL, ffWRITE }, - { efLOG, "-g", "genion", ffWRITE }, - { efPDB, "-pot", "pot", ffOPTWR }, + { efTOP, "-p", "topol", ffOPTRW } + }; +#define NFILE asize(fnm) + + parse_common_args(&argc, argv, PCA_BE_NICE, NFILE, fnm, asize(pa), pa, + asize(desc), desc, asize(bugs), bugs, &oenv); - bPDB = ftp2bSet(efPDB, NFILE, fnm); - if (bRandom && bPDB) - { - fprintf(stderr, "Not computing potential with random option!\n"); - bPDB = FALSE; - } + + /* Check input for something sensible */ + if ((p_num < 0) || (n_num < 0)) + { + gmx_fatal(FARGS, "Negative number of ions to add?"); + } + - snew(mtop, 1); - snew(top, 1); - fplog = init_calcpot(ftp2fn(efLOG, NFILE, fnm), ftp2fn(efTPX, NFILE, fnm), - opt2fn("-table", NFILE, fnm), mtop, top, &inputrec, &cr, - &graph, &mdatoms, &fr, &enerd, &pot, box, &x, oenv); ++ if (conc > 0 && (p_num > 0 || n_num > 0)) ++ { ++ fprintf(stderr, "WARNING: -conc specified, overriding -nn and -np.\n"); ++ } + - atoms = gmx_mtop_global_atoms(mtop); ++ /* Read atom positions and charges */ ++ read_tps_conf(ftp2fn(efTPX, NFILE, fnm), title, &top, &ePBC, &x, &v, box, FALSE); ++ atoms = top.atoms; + ++ /* Compute total charge */ + qtot = 0; + for (i = 0; (i < atoms.nr); i++) + { + qtot += atoms.atom[i].q; + } + iqtot = gmx_nint(qtot); + + + if (conc > 0) + { + /* Compute number of ions to be added */ + vol = det(box); + nsalt = gmx_nint(conc*vol*AVOGADRO/1e24); + p_num = abs(nsalt*n_q); + n_num = abs(nsalt*p_q); + } + if (bNeutral) + { + int qdelta = p_num*p_q + n_num*n_q + iqtot; + + /* Check if the system is neutralizable + * is (qdelta == p_q*p_num + n_q*n_num) solvable for p_num and n_num? 
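+         * An integer solution exists exactly when qdelta is divisible by
+         * gcd(n_q, p_q) (Bezout's identity), which is what the check below
+         * tests; the while loops then add positive or negative ions until
+         * the residual charge reaches zero.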
*/ + int gcd = greatest_common_divisor(n_q, p_q); + if ((qdelta % gcd) != 0) + { + gmx_fatal(FARGS, "Can't neutralize this system using -nq %d and" + " -pq %d.\n", n_q, p_q); + } + + while (qdelta != 0) + { + while (qdelta < 0) + { + p_num++; + qdelta += p_q; + } + while (qdelta > 0) + { + n_num++; + qdelta += n_q; + } + } + } + + if ((p_num == 0) && (n_num == 0)) + { - if (!bPDB) - { - fprintf(stderr, "No ions to add and no potential to calculate.\n"); - exit(0); - } - nw = 0; - nsa = 0; /* to keep gcc happy */ ++ fprintf(stderr, "No ions to add.\n"); ++ exit(0); + } + else + { + printf("Will try to add %d %s ions and %d %s ions.\n", + p_num, p_name, n_num, n_name); + printf("Select a continuous group of solvent molecules\n"); + get_index(&atoms, ftp2fn_null(efNDX, NFILE, fnm), 1, &nwa, &index, &grpname); + for (i = 1; i < nwa; i++) + { + if (index[i] != index[i-1]+1) + { + gmx_fatal(FARGS, "The solvent group %s is not continuous: " + "index[%d]=%d, index[%d]=%d", + grpname, i, index[i-1]+1, i+1, index[i]+1); + } + } + nsa = 1; + while ((nsa < nwa) && + (atoms.atom[index[nsa]].resind == + atoms.atom[index[nsa-1]].resind)) + { + nsa++; + } + if (nwa % nsa) + { + gmx_fatal(FARGS, "Your solvent group size (%d) is not a multiple of %d", + nwa, nsa); + } + nw = nwa/nsa; + fprintf(stderr, "Number of (%d-atomic) solvent molecules: %d\n", nsa, nw); + if (p_num+n_num > nw) + { + gmx_fatal(FARGS, "Not enough solvent for adding ions"); + } + } + + if (opt2bSet("-p", NFILE, fnm)) + { + update_topol(opt2fn("-p", NFILE, fnm), p_num, n_num, p_name, n_name, grpname); + } + + snew(bSet, nw); + snew(repl, nw); + + snew(v, atoms.nr); + snew(atoms.pdbinfo, atoms.nr); + - set_pbc(&pbc, inputrec.ePBC, box); ++ set_pbc(&pbc, ePBC, box); + + /* Now loop over the ions that have to be placed */ - do ++ while (p_num-- > 0) + { - if (!bRandom) - { - calc_pot(fplog, cr, mtop, &inputrec, top, x, fr, &enerd, mdatoms, pot, box, graph); - if (bPDB || debug) - { - char buf[STRLEN]; - - if (debug) - { - sprintf(buf, "%d_%s", p_num+n_num, ftp2fn(efPDB, NFILE, fnm)); - } - else - { - strcpy(buf, ftp2fn(efPDB, NFILE, fnm)); - } - for (i = 0; (i < atoms.nr); i++) - { - atoms.pdbinfo[i].bfac = pot[i]*scale; - } - write_sto_conf(buf, "Potential calculated by genion", - &atoms, x, v, inputrec.ePBC, box); - bPDB = FALSE; - } - } - if ((p_num > 0) && (p_num >= n_num)) - { - insert_ion(nsa, &nw, bSet, repl, index, pot, x, &pbc, - 1, p_q, p_name, mdatoms, rmin, bRandom, &seed); - p_num--; - } - else if (n_num > 0) - { - insert_ion(nsa, &nw, bSet, repl, index, pot, x, &pbc, - -1, n_q, n_name, mdatoms, rmin, bRandom, &seed); - n_num--; - } ++ insert_ion(nsa, &nw, bSet, repl, index, x, &pbc, ++ 1, p_q, p_name, &atoms, rmin, &seed); ++ } ++ while (n_num-- > 0) ++ { ++ insert_ion(nsa, &nw, bSet, repl, index, x, &pbc, ++ -1, n_q, n_name, &atoms, rmin, &seed); + } - while (p_num+n_num > 0); + fprintf(stderr, "\n"); + + if (nw) + { + sort_ions(nsa, nw, repl, index, &atoms, x, p_name, n_name); + } + + sfree(atoms.pdbinfo); + atoms.pdbinfo = NULL; - write_sto_conf(ftp2fn(efSTO, NFILE, fnm), *mtop->name, &atoms, x, NULL, - inputrec.ePBC, box); ++ write_sto_conf(ftp2fn(efSTO, NFILE, fnm), *top.name, &atoms, x, NULL, ePBC, ++ box); + + thanx(stderr); + - gmx_log_close(fplog); - + return 0; +} diff --cc src/gromacs/gmxlib/copyrite.c index 7c64b5cdc9,0000000000..fef70c2bc4 mode 100644,000000..100644 --- a/src/gromacs/gmxlib/copyrite.c +++ b/src/gromacs/gmxlib/copyrite.c @@@ -1,702 -1,0 +1,708 @@@ +/* + * + * This source code is part of + * + * G R 
O M A C S + * + * GROningen MAchine for Chemical Simulations + * + * VERSION 3.2.0 + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others. + * Copyright (c) 1991-2000, University of Groningen, The Netherlands. + * Copyright (c) 2001-2004, The GROMACS development team, + * check out http://www.gromacs.org for more information. + + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * If you want to redistribute modifications, please consider that + * scientific software is very special. Version control is crucial - + * bugs must be traceable. We will be happy to consider code for + * inclusion in the official distribution, but derived work must not + * be called official GROMACS. Details are found in the README & COPYING + * files - if they are missing, get the official version at www.gromacs.org. + * + * To help us fund GROMACS development, we humbly ask that you cite + * the papers on the package - you can find them in the top README file. + * + * For more info, check our website at http://www.gromacs.org + * + * And Hey: + * GROningen Mixture of Alchemy and Childrens' Stories + */ +#include "copyrite.h" + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include +#include +#include +#include + +#ifdef HAVE_LIBMKL +#include +#endif + +/* This file is completely threadsafe - keep it that way! */ + +#include "gromacs/legacyheaders/futil.h" +#include "gromacs/legacyheaders/macros.h" +#include "gromacs/legacyheaders/random.h" +#include "gromacs/legacyheaders/smalloc.h" +#include "gromacs/legacyheaders/statutil.h" +#include "gromacs/legacyheaders/strdb.h" +#include "gromacs/legacyheaders/string2.h" +#include "gromacs/legacyheaders/vec.h" + +#include "gromacs/fft/fft.h" + +#include "buildinfo.h" + +static gmx_bool be_cool(void) +{ + /* Yes, it is bad to check the environment variable every call, + * but we dont call this routine often, and it avoids using + * a mutex for locking the variable... 
+ */ +#ifdef GMX_COOL_QUOTES + return (getenv("GMX_NO_QUOTES") == NULL); +#else + /*be uncool*/ + return FALSE; +#endif +} + +static void space(FILE *out, int n) +{ + fprintf(out, "%*s", n, ""); +} + +static void sp_print(FILE *out, const char *s) +{ + int slen; + + slen = strlen(s); + space(out, (80-slen)/2); + fprintf(out, "%s\n", s); +} + +static void ster_print(FILE *out, const char *s) +{ + int slen; + char buf[128]; + + snprintf(buf, 128, ":-) %s (-:", s); + slen = strlen(buf); + space(out, (80-slen)/2); + fprintf(out, "%s\n", buf); +} + + +static void pukeit(const char *db, const char *defstring, char *retstring, + int retsize, int *cqnum) +{ + FILE *fp; + char **help; + int i, nhlp; + int seed; + + if (be_cool() && ((fp = low_libopen(db, FALSE)) != NULL)) + { + nhlp = fget_lines(fp, &help); + /* for libraries we can use the low-level close routines */ + ffclose(fp); + seed = time(NULL); + *cqnum = nhlp*rando(&seed); + if (strlen(help[*cqnum]) >= STRLEN) + { + help[*cqnum][STRLEN-1] = '\0'; + } + strncpy(retstring, help[*cqnum], retsize); + for (i = 0; (i < nhlp); i++) + { + sfree(help[i]); + } + sfree(help); + } + else + { + strncpy(retstring, defstring, retsize); + } +} + +void bromacs(char *retstring, int retsize) +{ + int dum; + + pukeit("bromacs.dat", + "Groningen Machine for Chemical Simulation", + retstring, retsize, &dum); +} + +void cool_quote(char *retstring, int retsize, int *cqnum) +{ + char *tmpstr; + char *s, *ptr; + int tmpcq, *p; + + if (cqnum != NULL) + { + p = cqnum; + } + else + { + p = &tmpcq; + } + + /* protect audience from explicit lyrics */ + snew(tmpstr, retsize+1); + pukeit("gurgle.dat", "Thanx for Using GROMACS - Have a Nice Day", + tmpstr, retsize-2, p); + + if ((ptr = strchr(tmpstr, '_')) != NULL) + { + *ptr = '\0'; + ptr++; + sprintf(retstring, "\"%s\" %s", tmpstr, ptr); + } + else + { + strcpy(retstring, tmpstr); + } + sfree(tmpstr); +} + +void CopyRight(FILE *out, const char *szProgram) +{ + static const char * CopyrightText[] = { + "Written by Emile Apol, Rossen Apostolov, Herman J.C. Berendsen,", + "Aldert van Buuren, Pär Bjelkmar, Rudi van Drunen, Anton Feenstra, ", + "Gerrit Groenhof, Peter Kasson, Per Larsson, Pieter Meulenhoff, ", + "Teemu Murtola, Szilard Pall, Sander Pronk, Roland Schulz, ", + "Michael Shirts, Alfons Sijbers, Peter Tieleman,\n", + "Berk Hess, David van der Spoel, and Erik Lindahl.\n", + "Copyright (c) 1991-2000, University of Groningen, The Netherlands.", + "Copyright (c) 2001-2010, The GROMACS development team at", + "Uppsala University & The Royal Institute of Technology, Sweden.", + "check out http://www.gromacs.org for more information.\n" + }; + + static const char * LicenseText[] = { + "This program is free software; you can redistribute it and/or", + "modify it under the terms of the GNU Lesser General Public License", + "as published by the Free Software Foundation; either version 2.1", + "of the License, or (at your option) any later version." + }; + + /* Dont change szProgram arbitrarily - it must be argv[0], i.e. the + * name of a file. Otherwise, we won't be able to find the library dir. + */ ++ +#define NCR (int)asize(CopyrightText) +/* TODO: Is this exception still needed? 
*/ +#ifdef GMX_FAHCORE +#define NLICENSE 0 /*FAH has an exception permission from LGPL to allow digital signatures in Gromacs*/ +#else +#define NLICENSE (int)asize(LicenseText) +#endif + + char buf[256], tmpstr[1024]; + int i; + +#ifdef GMX_FAHCORE + set_program_name("Gromacs"); +#else + set_program_name(szProgram); +#endif + + ster_print(out, "G R O M A C S"); + fprintf(out, "\n"); + + bromacs(tmpstr, 1023); + sp_print(out, tmpstr); + fprintf(out, "\n"); + + ster_print(out, GromacsVersion()); + fprintf(out, "\n"); + ++ if (getenv("GMX_NO_CREDITS")) ++ { ++ return; ++ } ++ + /* fprintf(out,"\n");*/ + + /* sp_print(out,"PLEASE NOTE: THIS IS A BETA VERSION\n"); + + fprintf(out,"\n"); */ + + for (i = 0; (i < NCR); i++) + { + sp_print(out, CopyrightText[i]); + } + for (i = 0; (i < NLICENSE); i++) + { + sp_print(out, LicenseText[i]); + } + + fprintf(out, "\n"); + + snprintf(buf, 256, "%s", Program()); +#ifdef GMX_DOUBLE + strcat(buf, " (double precision)"); +#endif + ster_print(out, buf); + fprintf(out, "\n"); +} + + +void thanx(FILE *fp) +{ + char cq[1024]; + int cqnum; + + /* protect the audience from suggestive discussions */ + cool_quote(cq, 1023, &cqnum); + + if (be_cool()) + { + fprintf(fp, "\ngcq#%d: %s\n\n", cqnum, cq); + } + else + { + fprintf(fp, "\n%s\n\n", cq); + } +} + +typedef struct { + const char *key; + const char *author; + const char *title; + const char *journal; + int volume, year; + const char *pages; +} t_citerec; + +void please_cite(FILE *fp, const char *key) +{ + static const t_citerec citedb[] = { + { "Allen1987a", + "M. P. Allen and D. J. Tildesley", + "Computer simulation of liquids", + "Oxford Science Publications", + 1, 1987, "1" }, + { "Berendsen95a", + "H. J. C. Berendsen, D. van der Spoel and R. van Drunen", + "GROMACS: A message-passing parallel molecular dynamics implementation", + "Comp. Phys. Comm.", + 91, 1995, "43-56" }, + { "Berendsen84a", + "H. J. C. Berendsen, J. P. M. Postma, A. DiNola and J. R. Haak", + "Molecular dynamics with coupling to an external bath", + "J. Chem. Phys.", + 81, 1984, "3684-3690" }, + { "Ryckaert77a", + "J. P. Ryckaert and G. Ciccotti and H. J. C. Berendsen", + "Numerical Integration of the Cartesian Equations of Motion of a System with Constraints; Molecular Dynamics of n-Alkanes", + "J. Comp. Phys.", + 23, 1977, "327-341" }, + { "Miyamoto92a", + "S. Miyamoto and P. A. Kollman", + "SETTLE: An Analytical Version of the SHAKE and RATTLE Algorithms for Rigid Water Models", + "J. Comp. Chem.", + 13, 1992, "952-962" }, + { "Cromer1968a", + "D. T. Cromer & J. B. Mann", + "X-ray scattering factors computed from numerical Hartree-Fock wave functions", + "Acta Cryst. A", + 24, 1968, "321" }, + { "Barth95a", + "E. Barth and K. Kuczera and B. Leimkuhler and R. D. Skeel", + "Algorithms for Constrained Molecular Dynamics", + "J. Comp. Chem.", + 16, 1995, "1192-1209" }, + { "Essmann95a", + "U. Essmann, L. Perera, M. L. Berkowitz, T. Darden, H. Lee and L. G. Pedersen ", + "A smooth particle mesh Ewald method", + "J. Chem. Phys.", + 103, 1995, "8577-8592" }, + { "Torda89a", + "A. E. Torda and R. M. Scheek and W. F. van Gunsteren", + "Time-dependent distance restraints in molecular dynamics simulations", + "Chem. Phys. Lett.", + 157, 1989, "289-294" }, + { "Tironi95a", + "I. G. Tironi and R. Sperb and P. E. Smith and W. F. van Gunsteren", + "Generalized reaction field method for molecular dynamics simulations", + "J. Chem. Phys", + 102, 1995, "5451-5459" }, + { "Hess97a", + "B. Hess and H. Bekker and H. J. C. Berendsen and J. G. E. M. 
Fraaije", + "LINCS: A Linear Constraint Solver for molecular simulations", + "J. Comp. Chem.", + 18, 1997, "1463-1472" }, + { "Hess2008a", + "B. Hess", + "P-LINCS: A Parallel Linear Constraint Solver for molecular simulation", + "J. Chem. Theory Comput.", + 4, 2008, "116-122" }, + { "Hess2008b", + "B. Hess and C. Kutzner and D. van der Spoel and E. Lindahl", + "GROMACS 4: Algorithms for highly efficient, load-balanced, and scalable molecular simulation", + "J. Chem. Theory Comput.", + 4, 2008, "435-447" }, + { "Hub2010", + "J. S. Hub, B. L. de Groot and D. van der Spoel", + "g_wham - A free weighted histogram analysis implementation including robust error and autocorrelation estimates", + "J. Chem. Theory Comput.", + 6, 2010, "3713-3720"}, + { "In-Chul99a", + "Y. In-Chul and M. L. Berkowitz", + "Ewald summation for systems with slab geometry", + "J. Chem. Phys.", + 111, 1999, "3155-3162" }, + { "DeGroot97a", + "B. L. de Groot and D. M. F. van Aalten and R. M. Scheek and A. Amadei and G. Vriend and H. J. C. Berendsen", + "Prediction of Protein Conformational Freedom From Distance Constrains", + "Proteins", + 29, 1997, "240-251" }, + { "Spoel98a", + "D. van der Spoel and P. J. van Maaren and H. J. C. Berendsen", + "A systematic study of water models for molecular simulation. Derivation of models optimized for use with a reaction-field.", + "J. Chem. Phys.", + 108, 1998, "10220-10230" }, + { "Wishart98a", + "D. S. Wishart and A. M. Nip", + "Protein Chemical Shift Analysis: A Practical Guide", + "Biochem. Cell Biol.", + 76, 1998, "153-163" }, + { "Maiorov95", + "V. N. Maiorov and G. M. Crippen", + "Size-Independent Comparison of Protein Three-Dimensional Structures", + "PROTEINS: Struct. Funct. Gen.", + 22, 1995, "273-283" }, + { "Feenstra99", + "K. A. Feenstra and B. Hess and H. J. C. Berendsen", + "Improving Efficiency of Large Time-scale Molecular Dynamics Simulations of Hydrogen-rich Systems", + "J. Comput. Chem.", + 20, 1999, "786-798" }, + { "Timneanu2004a", + "N. Timneanu and C. Caleman and J. Hajdu and D. van der Spoel", + "Auger Electron Cascades in Water and Ice", + "Chem. Phys.", + 299, 2004, "277-283" }, + { "Pascal2011a", + "T. A. Pascal and S. T. Lin and W. A. Goddard III", + "Thermodynamics of liquids: standard molar entropies and heat capacities of common solvents from 2PT molecular dynamics", + "Phys. Chem. Chem. Phys.", + 13, 2011, "169-181" }, + { "Caleman2011b", + "C. Caleman and P. J. van Maaren and M. Hong and J. S. Hub and L. T. da Costa and D. van der Spoel", + "Force Field Benchmark of Organic Liquids: Density, Enthalpy of Vaporization, Heat Capacities, Surface Tension, Isothermal Compressibility, Volumetric Expansion Coefficient, and Dielectric Constant", + "J. Chem. Theo. Comp.", + 8, 2012, "61" }, + { "Lindahl2001a", + "E. Lindahl and B. Hess and D. van der Spoel", + "GROMACS 3.0: A package for molecular simulation and trajectory analysis", + "J. Mol. Mod.", + 7, 2001, "306-317" }, + { "Wang2001a", + "J. Wang and W. Wang and S. Huo and M. Lee and P. A. Kollman", + "Solvation model based on weighted solvent accessible surface area", + "J. Phys. Chem. B", + 105, 2001, "5055-5067" }, + { "Eisenberg86a", + "D. Eisenberg and A. D. McLachlan", + "Solvation energy in protein folding and binding", + "Nature", + 319, 1986, "199-203" }, + { "Bondi1964a", + "A. Bondi", + "van der Waals Volumes and Radii", + "J. Phys. 
Chem.", + 68, 1964, "441-451" }, + { "Eisenhaber95", + "Frank Eisenhaber and Philip Lijnzaad and Patrick Argos and Chris Sander and Michael Scharf", + "The Double Cube Lattice Method: Efficient Approaches to Numerical Integration of Surface Area and Volume and to Dot Surface Contouring of Molecular Assemblies", + "J. Comp. Chem.", + 16, 1995, "273-284" }, + { "Hess2002", + "B. Hess, H. Saint-Martin and H.J.C. Berendsen", + "Flexible constraints: an adiabatic treatment of quantum degrees of freedom, with application to the flexible and polarizable MCDHO model for water", + "J. Chem. Phys.", + 116, 2002, "9602-9610" }, + { "Hetenyi2002b", + "Csaba Hetenyi and David van der Spoel", + "Efficient docking of peptides to proteins without prior knowledge of the binding site.", + "Prot. Sci.", + 11, 2002, "1729-1737" }, + { "Hess2003", + "B. Hess and R.M. Scheek", + "Orientation restraints in molecular dynamics simulations using time and ensemble averaging", + "J. Magn. Res.", + 164, 2003, "19-27" }, + { "Rappe1991a", + "A. K. Rappe and W. A. Goddard III", + "Charge Equillibration for Molecular Dynamics Simulations", + "J. Phys. Chem.", + 95, 1991, "3358-3363" }, + { "Mu2005a", + "Y. Mu, P. H. Nguyen and G. Stock", + "Energy landscape of a small peptide revelaed by dihedral angle principal component analysis", + "Prot. Struct. Funct. Bioinf.", + 58, 2005, "45-52" }, + { "Okabe2001a", + "T. Okabe and M. Kawata and Y. Okamoto and M. Mikami", + "Replica-exchange {M}onte {C}arlo method for the isobaric-isothermal ensemble", + "Chem. Phys. Lett.", + 335, 2001, "435-439" }, + { "Hukushima96a", + "K. Hukushima and K. Nemoto", + "Exchange Monte Carlo Method and Application to Spin Glass Simulations", + "J. Phys. Soc. Jpn.", + 65, 1996, "1604-1608" }, + { "Tropp80a", + "J. Tropp", + "Dipolar Relaxation and Nuclear Overhauser effects in nonrigid molecules: The effect of fluctuating internuclear distances", + "J. Chem. Phys.", + 72, 1980, "6035-6043" }, + { "Bultinck2002a", + "P. Bultinck and W. Langenaeker and P. Lahorte and F. De Proft and P. Geerlings and M. Waroquier and J. P. Tollenaere", + "The electronegativity equalization method I: Parametrization and validation for atomic charge calculations", + "J. Phys. Chem. A", + 106, 2002, "7887-7894" }, + { "Yang2006b", + "Q. Y. Yang and K. A. Sharp", + "Atomic charge parameters for the finite difference Poisson-Boltzmann method using electronegativity neutralization", + "J. Chem. Theory Comput.", + 2, 2006, "1152-1167" }, + { "Spoel2005a", + "D. van der Spoel, E. Lindahl, B. Hess, G. Groenhof, A. E. Mark and H. J. C. Berendsen", + "GROMACS: Fast, Flexible and Free", + "J. Comp. Chem.", + 26, 2005, "1701-1719" }, + { "Spoel2006b", + "D. van der Spoel, P. J. van Maaren, P. Larsson and N. Timneanu", + "Thermodynamics of hydrogen bonding in hydrophilic and hydrophobic media", + "J. Phys. Chem. B", + 110, 2006, "4393-4398" }, + { "Spoel2006d", + "D. van der Spoel and M. M. Seibert", + "Protein folding kinetics and thermodynamics from atomistic simulations", + "Phys. Rev. Letters", + 96, 2006, "238102" }, + { "Palmer94a", + "B. J. Palmer", + "Transverse-current autocorrelation-function calculations of the shear viscosity for molecular liquids", + "Phys. Rev. E", + 49, 1994, "359-366" }, + { "Bussi2007a", + "G. Bussi, D. Donadio and M. Parrinello", + "Canonical sampling through velocity rescaling", + "J. Chem. Phys.", + 126, 2007, "014101" }, + { "Hub2006", + "J. S. Hub and B. L. de Groot", + "Does CO2 permeate through Aquaporin-1?", + "Biophys. 
J.", + 91, 2006, "842-848" }, + { "Hub2008", + "J. S. Hub and B. L. de Groot", + "Mechanism of selectivity in aquaporins and aquaglyceroporins", + "PNAS", + 105, 2008, "1198-1203" }, + { "Friedrich2009", + "M. S. Friedrichs, P. Eastman, V. Vaidyanathan, M. Houston, S. LeGrand, A. L. Beberg, D. L. Ensign, C. M. Bruns, and V. S. Pande", + "Accelerating Molecular Dynamic Simulation on Graphics Processing Units", + "J. Comp. Chem.", + 30, 2009, "864-872" }, + { "Engin2010", + "O. Engin, A. Villa, M. Sayar and B. Hess", + "Driving Forces for Adsorption of Amphiphilic Peptides to Air-Water Interface", + "J. Phys. Chem. B", + 114, 2010, "11093" }, + { "Fritsch12", + "S. Fritsch, C. Junghans and K. Kremer", + "Adaptive molecular simulation study on structure formation of toluene around C60 using Gromacs", + "J. Chem. Theo. Comp.", + 8, 2012, "398" }, + { "Junghans10", + "C. Junghans and S. Poblete", + "A reference implementation of the adaptive resolution scheme in ESPResSo", + "Comp. Phys. Comm.", + 181, 2010, "1449" }, + { "Wang2010", + "H. Wang, F. Dommert, C.Holm", + "Optimizing working parameters of the smooth particle mesh Ewald algorithm in terms of accuracy and efficiency", + "J. Chem. Phys. B", + 133, 2010, "034117" }, + { "Sugita1999a", + "Y. Sugita, Y. Okamoto", + "Replica-exchange molecular dynamics method for protein folding", + "Chem. Phys. Lett.", + 314, 1999, "141-151" }, + { "Kutzner2011", + "C. Kutzner and J. Czub and H. Grubmuller", + "Keep it Flexible: Driving Macromolecular Rotary Motions in Atomistic Simulations with GROMACS", + "J. Chem. Theory Comput.", + 7, 2011, "1381-1393" }, + { "Hoefling2011", + "M. Hoefling, N. Lima, D. Haenni, C.A.M. Seidel, B. Schuler, H. Grubmuller", + "Structural Heterogeneity and Quantitative FRET Efficiency Distributions of Polyprolines through a Hybrid Atomistic Simulation and Monte Carlo Approach", + "PLoS ONE", + 6, 2011, "e19791" }, + { "Hockney1988", + "R. W. Hockney and J. W. Eastwood", + "Computer simulation using particles", + "IOP, Bristol", + 1, 1988, "1" }, + { "Ballenegger2012", + "V. Ballenegger, J.J. Cerda, and C. Holm", + "How to Convert SPME to P3M: Influence Functions and Error Estimates", + "J. Chem. Theory Comput.", + 8, 2012, "936-947" }, + { "Garmay2012", + "Garmay Yu, Shvetsov A, Karelov D, Lebedev D, Radulescu A, Petukhov M, Isaev-Ivanov V", + "Correlated motion of protein subdomains and large-scale conformational flexibility of RecA protein filament", + "Journal of Physics: Conference Series", + 340, 2012, "012094" } + }; +#define NSTR (int)asize(citedb) + + int j, index; + char *author; + char *title; +#define LINE_WIDTH 79 + + if (fp == NULL) + { + return; + } + + for (index = 0; (index < NSTR) && (strcmp(citedb[index].key, key) != 0); index++) + { + ; + } + + fprintf(fp, "\n++++ PLEASE READ AND CITE THE FOLLOWING REFERENCE ++++\n"); + if (index < NSTR) + { + /* Insert newlines */ + author = wrap_lines(citedb[index].author, LINE_WIDTH, 0, FALSE); + title = wrap_lines(citedb[index].title, LINE_WIDTH, 0, FALSE); + fprintf(fp, "%s\n%s\n%s %d (%d) pp. %s\n", + author, title, citedb[index].journal, + citedb[index].volume, citedb[index].year, + citedb[index].pages); + sfree(author); + sfree(title); + } + else + { + fprintf(fp, "Entry %s not found in citation database\n", key); + } + fprintf(fp, "-------- -------- --- Thank You --- -------- --------\n\n"); + fflush(fp); +} + +#ifdef GMX_GIT_VERSION_INFO +/* Version information generated at compile time. 
*/ +#include "gromacs/utility/gitversion.h" +#else +/* Fall back to statically defined version. */ +static const char _gmx_ver_string[] = "VERSION " VERSION; +#endif + +const char *GromacsVersion() +{ + return _gmx_ver_string; +} + +void gmx_print_version_info_gpu(FILE *fp); + +void gmx_print_version_info(FILE *fp) +{ + fprintf(fp, "Gromacs version: %s\n", _gmx_ver_string); +#ifdef GMX_GIT_VERSION_INFO + fprintf(fp, "GIT SHA1 hash: %s\n", _gmx_full_git_hash); + /* Only print out the branch information if present. + * The generating script checks whether the branch point actually + * coincides with the hash reported above, and produces an empty string + * in such cases. */ + if (_gmx_central_base_hash[0] != 0) + { + fprintf(fp, "Branched from: %s\n", _gmx_central_base_hash); + } +#endif + +#ifdef GMX_DOUBLE + fprintf(fp, "Precision: double\n"); +#else + fprintf(fp, "Precision: single\n"); +#endif + fprintf(fp, "Memory model: %lu bit\n", 8*sizeof(void *)); + +#ifdef GMX_THREAD_MPI + fprintf(fp, "MPI library: thread_mpi\n"); +#elif defined(GMX_MPI) + fprintf(fp, "MPI library: MPI\n"); +#else + fprintf(fp, "MPI library: none\n"); +#endif +#ifdef GMX_OPENMP + fprintf(fp, "OpenMP support: enabled\n"); +#else + fprintf(fp, "OpenMP support: disabled\n"); +#endif +#ifdef GMX_GPU + fprintf(fp, "GPU support: enabled\n"); +#else + fprintf(fp, "GPU support: disabled\n"); +#endif + /* A preprocessor trick to avoid duplicating logic from vec.h */ +#define gmx_stringify2(x) #x +#define gmx_stringify(x) gmx_stringify2(x) + fprintf(fp, "invsqrt routine: %s\n", gmx_stringify(gmx_invsqrt(x))); + fprintf(fp, "CPU acceleration: %s\n", GMX_CPU_ACCELERATION_STRING); + + fprintf(fp, "FFT library: %s\n", gmx_fft_get_version_info()); +#ifdef GMX_LARGEFILES + fprintf(fp, "Large file support: enabled\n"); +#else + fprintf(fp, "Large file support: disabled\n"); +#endif +#ifdef HAVE_RDTSCP + fprintf(fp, "RDTSCP usage: enabled\n"); +#else + fprintf(fp, "RDTSCP usage: disabled\n"); +#endif + + fprintf(fp, "Built on: %s\n", BUILD_TIME); + fprintf(fp, "Built by: %s\n", BUILD_USER); + fprintf(fp, "Build OS/arch: %s\n", BUILD_HOST); + fprintf(fp, "Build CPU vendor: %s\n", BUILD_CPU_VENDOR); + fprintf(fp, "Build CPU brand: %s\n", BUILD_CPU_BRAND); + fprintf(fp, "Build CPU family: %d Model: %d Stepping: %d\n", + BUILD_CPU_FAMILY, BUILD_CPU_MODEL, BUILD_CPU_STEPPING); + /* TODO: The below strings can be quite long, so it would be nice to wrap + * them. Can wait for later, as the master branch has ready code to do all + * that. */ + fprintf(fp, "Build CPU features: %s\n", BUILD_CPU_FEATURES); + fprintf(fp, "C compiler: %s\n", BUILD_C_COMPILER); + fprintf(fp, "C compiler flags: %s\n", BUILD_CFLAGS); + fprintf(fp, "C++ compiler: %s\n", BUILD_CXX_COMPILER); + fprintf(fp, "C++ compiler flags: %s\n", BUILD_CXXFLAGS); +#ifdef HAVE_LIBMKL + /* MKL might be used for LAPACK/BLAS even if FFTs use FFTW, so keep it separate */ + fprintf(fp, "Linked with Intel MKL version %d.%d.%d.\n", + __INTEL_MKL__, __INTEL_MKL_MINOR__, __INTEL_MKL_UPDATE__); +#endif +#ifdef GMX_GPU + gmx_print_version_info_gpu(fp); +#endif + +} diff --cc src/gromacs/gmxlib/gmx_cpuid.c index 93a2e27181,0000000000..c12ce798a4 mode 100644,000000..100644 --- a/src/gromacs/gmxlib/gmx_cpuid.c +++ b/src/gromacs/gmxlib/gmx_cpuid.c @@@ -1,1172 -1,0 +1,1186 @@@ +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*- + * + * + * This file is part of GROMACS. 
+ * Copyright (c) 2012-
+ *
+ * Written by the Gromacs development team under coordination of
+ * David van der Spoel, Berk Hess, and Erik Lindahl.
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org
+ *
+ * And Hey:
+ * Gnomes, ROck Monsters And Chili Sauce
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#ifdef HAVE_SCHED_H
+#define _GNU_SOURCE
+#include <sched.h>
+#endif
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <ctype.h>
+#ifdef _MSC_VER
+/* MSVC definition for __cpuid() */
+#include <intrin.h>
+/* sysinfo functions */
+#include <windows.h>
+#endif
+#ifdef HAVE_UNISTD_H
+/* sysconf() definition */
+#include <unistd.h>
+#endif
+
+#include "gmx_cpuid.h"
+
+
+
+/* For convenience, and to enable configure-time invocation, we keep all architectures
+ * in a single file, but to avoid repeated ifdefs we set the overall architecture here.
+ */
+#if defined (__i386__) || defined (__x86_64__) || defined (_M_IX86) || defined (_M_X64)
+/* OK, it is x86, but can we execute cpuid? */
+#if defined(GMX_X86_GCC_INLINE_ASM) || ( defined(_MSC_VER) && ( (_MSC_VER > 1500) || (_MSC_VER==1500 & _MSC_FULL_VER >= 150030729)))
+# define GMX_CPUID_X86
+#endif
+#endif
+
+/* Global constant character strings corresponding to our enumerated types */
+const char *
+gmx_cpuid_vendor_string[GMX_CPUID_NVENDORS] =
+{
+    "CannotDetect",
+    "Unknown",
+    "GenuineIntel",
+    "AuthenticAMD",
+    "Fujitsu",
+    "IBM"
+};
+
+const char *
+gmx_cpuid_feature_string[GMX_CPUID_NFEATURES] =
+{
+    "CannotDetect",
+    "aes",
+    "apic",
+    "avx",
+    "avx2",
+    "clfsh",
+    "cmov",
+    "cx8",
+    "cx16",
+    "f16c",
+    "fma",
+    "fma4",
+    "htt",
+    "lahf_lm",
+    "misalignsse",
+    "mmx",
+    "msr",
+    "nonstop_tsc",
+    "pcid",
+    "pclmuldq",
+    "pdcm",
+    "pdpe1gb",
+    "popcnt",
+    "pse",
+    "rdrnd",
+    "rdtscp",
+    "sse2",
+    "sse3",
+    "sse4a",
+    "sse4.1",
+    "sse4.2",
+    "ssse3",
+    "tdt",
+    "x2apic",
+    "xop"
+};
+
+const char *
+gmx_cpuid_acceleration_string[GMX_CPUID_NACCELERATIONS] =
+{
+    "CannotDetect",
+    "None",
+    "SSE2",
+    "SSE4.1",
+    "AVX_128_FMA",
+    "AVX_256",
+    "Sparc64 HPC-ACE"
+};
+
+/* Max length of brand string */
+#define GMX_CPUID_BRAND_MAXLEN 256
+
+
+/* Contents of the abstract datatype */
+struct gmx_cpuid
+{
+    enum gmx_cpuid_vendor      vendor;
+    char                       brand[GMX_CPUID_BRAND_MAXLEN];
+    int                        family;
+    int                        model;
+    int                        stepping;
+    /* Not using gmx_bool here, since this file must be possible to compile without simple.h */
+    char                       feature[GMX_CPUID_NFEATURES];
+
+    /* Basic CPU topology information. For x86 this is a bit complicated since the topology differs between
+     * operating systems and sometimes even settings. For most other architectures you can likely just check
+     * the documentation and then write static information to these arrays rather than detecting on-the-fly.
+     */
+    int                        have_cpu_topology;
+    int                        nproc;               /* total number of logical processors from OS */
+    int                        npackages;
+    int                        ncores_per_package;
+    int                        nhwthreads_per_core;
+    int *                      package_id;
+    int *                      core_id;             /* Local core id in each package */
+    int *                      hwthread_id;         /* Local hwthread id in each core */
+    int *                      locality_order;      /* Processor indices sorted in locality order */
+};
+
+
+/* Simple routines to access the data structure.
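+ * Illustrative use of the accessors together with init/done (all of these
+ * are defined in this file; error handling omitted for brevity):
+ *
+ *     gmx_cpuid_t cpuid;
+ *     gmx_cpuid_init(&cpuid);
+ *     printf("%s, family %d model %d stepping %d\n",
+ *            gmx_cpuid_brand(cpuid), gmx_cpuid_family(cpuid),
+ *            gmx_cpuid_model(cpuid), gmx_cpuid_stepping(cpuid));
+ *     gmx_cpuid_done(cpuid);
+ *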
The initialization routine is + * further down since that needs to call other static routines in this file. + */ +enum gmx_cpuid_vendor +gmx_cpuid_vendor (gmx_cpuid_t cpuid) +{ + return cpuid->vendor; +} + + +const char * +gmx_cpuid_brand (gmx_cpuid_t cpuid) +{ + return cpuid->brand; +} + +int +gmx_cpuid_family (gmx_cpuid_t cpuid) +{ + return cpuid->family; +} + +int +gmx_cpuid_model (gmx_cpuid_t cpuid) +{ + return cpuid->model; +} + +int +gmx_cpuid_stepping (gmx_cpuid_t cpuid) +{ + return cpuid->stepping; +} + +int +gmx_cpuid_feature (gmx_cpuid_t cpuid, + enum gmx_cpuid_feature feature) +{ + return (cpuid->feature[feature] != 0); +} + + + + +/* What type of acceleration was compiled in, if any? + * This is set from Cmake. Note that the SSE2 and SSE4_1 macros are set for + * AVX too, so it is important that they appear last in the list. + */ +#ifdef GMX_X86_AVX_256 +static const +enum gmx_cpuid_acceleration + compiled_acc = GMX_CPUID_ACCELERATION_X86_AVX_256; +#elif defined GMX_X86_AVX_128_FMA +static const +enum gmx_cpuid_acceleration + compiled_acc = GMX_CPUID_ACCELERATION_X86_AVX_128_FMA; +#elif defined GMX_X86_SSE4_1 +static const +enum gmx_cpuid_acceleration + compiled_acc = GMX_CPUID_ACCELERATION_X86_SSE4_1; +#elif defined GMX_X86_SSE2 +static const +enum gmx_cpuid_acceleration + compiled_acc = GMX_CPUID_ACCELERATION_X86_SSE2; +#elif defined GMX_CPU_ACCELERATION_SPARC64_HPC_ACE +static const +enum gmx_cpuid_acceleration + compiled_acc = GMX_CPUID_ACCELERATION_SPARC64_HPC_ACE; +#else +static const +enum gmx_cpuid_acceleration + compiled_acc = GMX_CPUID_ACCELERATION_NONE; +#endif + + +#ifdef GMX_CPUID_X86 + +/* Execute CPUID on x86 class CPUs. level sets function to exec, and the + * contents of register output is returned. See Intel/AMD docs for details. + * + * This version supports extended information where we can also have an input + * value in the ecx register. This is ignored for most levels, but some of them + * (e.g. level 0xB on Intel) use it. + */ +static int +execute_x86cpuid(unsigned int level, + unsigned int ecxval, + unsigned int * eax, + unsigned int * ebx, + unsigned int * ecx, + unsigned int * edx) +{ + int rc = 0; + + /* Currently CPUID is only supported (1) if we can use an instruction on MSVC, or (2) + * if the compiler handles GNU-style inline assembly. + */ + +#if (defined _MSC_VER) + int CPUInfo[4]; + +#if (_MSC_VER > 1500) || (_MSC_VER == 1500 & _MSC_FULL_VER >= 150030729) + /* MSVC 9.0 SP1 or later */ + __cpuidex(CPUInfo, level, ecxval); + rc = 0; +#else + __cpuid(CPUInfo, level); + /* Set an error code if the user wanted a non-zero ecxval, since we did not have cpuidex */ + rc = (ecxval > 0) ? -1 : 0; +#endif + *eax = CPUInfo[0]; + *ebx = CPUInfo[1]; + *ecx = CPUInfo[2]; + *edx = CPUInfo[3]; + +#elif (defined GMX_X86_GCC_INLINE_ASM) + /* for now this means GMX_X86_GCC_INLINE_ASM should be defined, + * but there might be more options added in the future. + */ + *eax = level; + *ecx = ecxval; + *ebx = 0; + *edx = 0; +#if defined(__i386__) && defined(__PIC__) + /* Avoid clobbering the global offset table in 32-bit pic code (ebx register) */ + __asm__ __volatile__ ("xchgl %%ebx, %1 \n\t" + "cpuid \n\t" + "xchgl %%ebx, %1 \n\t" + : "+a" (*eax), "+r" (*ebx), "+c" (*ecx), "+d" (*edx)); +#else + /* i386 without PIC, or x86-64. Things are easy and we can clobber any reg we want :-) */ + __asm__ __volatile__ ("cpuid \n\t" + : "+a" (*eax), "+b" (*ebx), "+c" (*ecx), "+d" (*edx)); +#endif + rc = 0; +#else + /* Death and horror! 
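+     * (For reference, the typical call pattern used throughout this file is
+     *      unsigned int eax, ebx, ecx, edx;
+     *      execute_x86cpuid(0x0, 0, &eax, &ebx, &ecx, &edx);
+     *  after which ebx:edx:ecx spell out the 12-character vendor string,
+     *  e.g. "GenuineIntel" - see cpuid_check_vendor() below.)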
+ * Apparently this is an x86 platform where we don't know how to call cpuid. + * + * This is REALLY bad, since we will lose all Gromacs acceleration. + */ + *eax = 0; + *ebx = 0; + *ecx = 0; + *edx = 0; + + rc = -1; +#endif + return rc; +} + + +/* Identify CPU features common to Intel & AMD - mainly brand string, + * version and some features. Vendor has already been detected outside this. + */ +static int +cpuid_check_common_x86(gmx_cpuid_t cpuid) +{ + int fn, max_stdfn, max_extfn; + unsigned int eax, ebx, ecx, edx; + char str[GMX_CPUID_BRAND_MAXLEN]; + char * p; + + /* Find largest standard/extended function input value */ + execute_x86cpuid(0x0, 0, &eax, &ebx, &ecx, &edx); + max_stdfn = eax; + execute_x86cpuid(0x80000000, 0, &eax, &ebx, &ecx, &edx); + max_extfn = eax; + + p = str; + if (max_extfn >= 0x80000005) + { + /* Get CPU brand string */ + for (fn = 0x80000002; fn < 0x80000005; fn++) + { + execute_x86cpuid(fn, 0, &eax, &ebx, &ecx, &edx); + memcpy(p, &eax, 4); + memcpy(p+4, &ebx, 4); + memcpy(p+8, &ecx, 4); + memcpy(p+12, &edx, 4); + p += 16; + } + *p = '\0'; + + /* Remove empty initial space */ + p = str; + while (isspace(*(p))) + { + p++; + } + strncpy(cpuid->brand, p, GMX_CPUID_BRAND_MAXLEN); + } + else + { + strncpy(cpuid->brand, "Unknown CPU brand", GMX_CPUID_BRAND_MAXLEN); + } + + /* Find basic CPU properties */ + if (max_stdfn >= 1) + { + execute_x86cpuid(0x1, 0, &eax, &ebx, &ecx, &edx); + + cpuid->family = ((eax & 0x0FF00000) >> 20) + ((eax & 0x00000F00) >> 8); + /* Note that extended model should be shifted left 4, so only shift right 12 iso 16. */ + cpuid->model = ((eax & 0x000F0000) >> 12) + ((eax & 0x000000F0) >> 4); + cpuid->stepping = (eax & 0x0000000F); + + /* Feature flags common to AMD and intel */ + cpuid->feature[GMX_CPUID_FEATURE_X86_SSE3] = (ecx & (1 << 0)) != 0; + cpuid->feature[GMX_CPUID_FEATURE_X86_PCLMULDQ] = (ecx & (1 << 1)) != 0; + cpuid->feature[GMX_CPUID_FEATURE_X86_SSSE3] = (ecx & (1 << 9)) != 0; + cpuid->feature[GMX_CPUID_FEATURE_X86_FMA] = (ecx & (1 << 12)) != 0; + cpuid->feature[GMX_CPUID_FEATURE_X86_CX16] = (ecx & (1 << 13)) != 0; + cpuid->feature[GMX_CPUID_FEATURE_X86_SSE4_1] = (ecx & (1 << 19)) != 0; + cpuid->feature[GMX_CPUID_FEATURE_X86_SSE4_2] = (ecx & (1 << 20)) != 0; + cpuid->feature[GMX_CPUID_FEATURE_X86_POPCNT] = (ecx & (1 << 23)) != 0; + cpuid->feature[GMX_CPUID_FEATURE_X86_AES] = (ecx & (1 << 25)) != 0; + cpuid->feature[GMX_CPUID_FEATURE_X86_AVX] = (ecx & (1 << 28)) != 0; + cpuid->feature[GMX_CPUID_FEATURE_X86_F16C] = (ecx & (1 << 29)) != 0; + cpuid->feature[GMX_CPUID_FEATURE_X86_RDRND] = (ecx & (1 << 30)) != 0; + + cpuid->feature[GMX_CPUID_FEATURE_X86_PSE] = (edx & (1 << 3)) != 0; + cpuid->feature[GMX_CPUID_FEATURE_X86_MSR] = (edx & (1 << 5)) != 0; + cpuid->feature[GMX_CPUID_FEATURE_X86_CX8] = (edx & (1 << 8)) != 0; + cpuid->feature[GMX_CPUID_FEATURE_X86_APIC] = (edx & (1 << 9)) != 0; + cpuid->feature[GMX_CPUID_FEATURE_X86_CMOV] = (edx & (1 << 15)) != 0; + cpuid->feature[GMX_CPUID_FEATURE_X86_CLFSH] = (edx & (1 << 19)) != 0; + cpuid->feature[GMX_CPUID_FEATURE_X86_MMX] = (edx & (1 << 23)) != 0; + cpuid->feature[GMX_CPUID_FEATURE_X86_SSE2] = (edx & (1 << 26)) != 0; + cpuid->feature[GMX_CPUID_FEATURE_X86_HTT] = (edx & (1 << 28)) != 0; + } + else + { + cpuid->family = -1; + cpuid->model = -1; + cpuid->stepping = -1; + } + + if (max_extfn >= 0x80000001) + { + execute_x86cpuid(0x80000001, 0, &eax, &ebx, &ecx, &edx); + cpuid->feature[GMX_CPUID_FEATURE_X86_LAHF_LM] = (ecx & (1 << 0)) != 0; + cpuid->feature[GMX_CPUID_FEATURE_X86_PDPE1GB] = (edx 
& (1 << 26)) != 0; + cpuid->feature[GMX_CPUID_FEATURE_X86_RDTSCP] = (edx & (1 << 27)) != 0; + } + + if (max_extfn >= 0x80000007) + { + execute_x86cpuid(0x80000007, 0, &eax, &ebx, &ecx, &edx); + cpuid->feature[GMX_CPUID_FEATURE_X86_NONSTOP_TSC] = (edx & (1 << 8)) != 0; + } + return 0; +} + +/* This routine returns the number of unique different elements found in the array, + * and renumbers these starting from 0. For example, the array {0,1,2,8,9,10,8,9,10,0,1,2} + * will be rewritten to {0,1,2,3,4,5,3,4,5,0,1,2}, and it returns 6 for the + * number of unique elements. + */ +static int +cpuid_renumber_elements(int *data, int n) +{ + int *unique; + int i, j, nunique, found; + + unique = malloc(sizeof(int)*n); + + nunique = 0; + for (i = 0; i < n; i++) + { + for (j = 0, found = 0; j < nunique && !found; j++) + { + found = (data[i] == unique[j]); + } + if (!found) + { + /* Insert in sorted order! */ + for (j = nunique++; j > 0 && unique[j-1] > data[i]; j--) + { + unique[j] = unique[j-1]; + } + unique[j] = data[i]; + } + } + /* renumber */ + for (i = 0; i < n; i++) + { + for (j = 0; j < nunique; j++) + { + if (data[i] == unique[j]) + { + data[i] = j; + } + } + } + return nunique; +} + +/* APIC IDs, or everything you wanted to know about your x86 cores but were afraid to ask... + * + * Raw APIC IDs are unfortunately somewhat dirty. For technical reasons they are assigned + * in power-of-2 chunks, and even then there are no guarantees about specific numbers - all + * we know is that the part for each thread/core/package is unique, and how many bits are + * reserved for that part. + * This routine does internal renumbering so we get continuous indices, and also + * decodes the actual number of packages,cores-per-package and hwthreads-per-core. ++ * Returns: 0 on success, non-zero on failure. + */ - static void ++static int +cpuid_x86_decode_apic_id(gmx_cpuid_t cpuid, int *apic_id, int core_bits, int hwthread_bits) +{ + int i, idx; + int hwthread_mask, core_mask_after_shift; + + cpuid->hwthread_id = malloc(sizeof(int)*cpuid->nproc); + cpuid->core_id = malloc(sizeof(int)*cpuid->nproc); + cpuid->package_id = malloc(sizeof(int)*cpuid->nproc); + cpuid->locality_order = malloc(sizeof(int)*cpuid->nproc); + + hwthread_mask = (1 << hwthread_bits) - 1; + core_mask_after_shift = (1 << core_bits) - 1; + + for (i = 0; i < cpuid->nproc; i++) + { + cpuid->hwthread_id[i] = apic_id[i] & hwthread_mask; + cpuid->core_id[i] = (apic_id[i] >> hwthread_bits) & core_mask_after_shift; + cpuid->package_id[i] = apic_id[i] >> (core_bits + hwthread_bits); + } + + cpuid->npackages = cpuid_renumber_elements(cpuid->package_id, cpuid->nproc); + cpuid->ncores_per_package = cpuid_renumber_elements(cpuid->core_id, cpuid->nproc); + cpuid->nhwthreads_per_core = cpuid_renumber_elements(cpuid->hwthread_id, cpuid->nproc); + ++ /* now check for consistency */ ++ if ( (cpuid->npackages * cpuid->ncores_per_package * ++ cpuid->nhwthreads_per_core) != cpuid->nproc ) ++ { ++ /* the packages/cores-per-package/hwthreads-per-core counts are ++ inconsistent. */ ++ return -1; ++ } ++ + /* Create a locality order array, i.e. first all resources in package0, which in turn + * are sorted so we first have all resources in core0, where threads are sorted in order, etc. 
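+ * Worked example (hypothetical box with 2 packages, 2 cores/package and
+ * 2 hwthreads/core, i.e. core_bits = 1 and hwthread_bits = 1):
+ *     apic_id 5 = 0b101  ->  hwthread 1, core 0, package 1,
+ * and its locality index becomes
+ *     idx = (package*ncores_per_package + core)*nhwthreads_per_core + hwthread
+ *         = (1*2 + 0)*2 + 1 = 5.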
+ */
++
+    for (i = 0; i < cpuid->nproc; i++)
+    {
+        idx = (cpuid->package_id[i]*cpuid->ncores_per_package + cpuid->core_id[i])*cpuid->nhwthreads_per_core + cpuid->hwthread_id[i];
+        cpuid->locality_order[idx] = i;
+    }
++    return 0;
+}
+
+
+/* Detection of AMD-specific CPU features */
+static int
+cpuid_check_amd_x86(gmx_cpuid_t cpuid)
+{
-    int max_stdfn, max_extfn;
++    int max_stdfn, max_extfn, ret;
+    unsigned int eax, ebx, ecx, edx;
+    int hwthread_bits, core_bits;
+    int * apic_id;
+
+    cpuid_check_common_x86(cpuid);
+
+    execute_x86cpuid(0x0, 0, &eax, &ebx, &ecx, &edx);
+    max_stdfn = eax;
+
+    execute_x86cpuid(0x80000000, 0, &eax, &ebx, &ecx, &edx);
+    max_extfn = eax;
+
+    if (max_extfn >= 0x80000001)
+    {
+        execute_x86cpuid(0x80000001, 0, &eax, &ebx, &ecx, &edx);
+
+        cpuid->feature[GMX_CPUID_FEATURE_X86_SSE4A] = (ecx & (1 << 6)) != 0;
+        cpuid->feature[GMX_CPUID_FEATURE_X86_MISALIGNSSE] = (ecx & (1 << 7)) != 0;
+        cpuid->feature[GMX_CPUID_FEATURE_X86_XOP] = (ecx & (1 << 11)) != 0;
+        cpuid->feature[GMX_CPUID_FEATURE_X86_FMA4] = (ecx & (1 << 16)) != 0;
+    }
+
+    /* Query APIC information on AMD */
+    if (max_extfn >= 0x80000008)
+    {
+#if (defined HAVE_SCHED_H && defined HAVE_SCHED_SETAFFINITY && defined HAVE_SYSCONF && defined __linux__)
+        /* Linux */
+        unsigned int i;
+        cpu_set_t cpuset, save_cpuset;
+        cpuid->nproc = sysconf(_SC_NPROCESSORS_ONLN);
+        apic_id = malloc(sizeof(int)*cpuid->nproc);
+        sched_getaffinity(0, sizeof(cpu_set_t), &save_cpuset);
+        /* Get APIC id from each core */
+        CPU_ZERO(&cpuset);
+        for (i = 0; i < cpuid->nproc; i++)
+        {
+            CPU_SET(i, &cpuset);
+            sched_setaffinity(0, sizeof(cpu_set_t), &cpuset);
+            execute_x86cpuid(0x1, 0, &eax, &ebx, &ecx, &edx);
+            apic_id[i] = ebx >> 24;
+            CPU_CLR(i, &cpuset);
+        }
+        /* Reset affinity to the value it had when calling this routine */
+        sched_setaffinity(0, sizeof(cpu_set_t), &save_cpuset);
+#define CPUID_HAVE_APIC
+#elif defined GMX_NATIVE_WINDOWS
+        /* Windows */
+        DWORD_PTR i;
+        SYSTEM_INFO sysinfo;
+        unsigned int save_affinity, affinity;
+        GetSystemInfo( &sysinfo );
+        cpuid->nproc = sysinfo.dwNumberOfProcessors;
+        apic_id = malloc(sizeof(int)*cpuid->nproc);
+        /* Get previous affinity mask */
+        save_affinity = SetThreadAffinityMask(GetCurrentThread(), 1);
+        for (i = 0; i < cpuid->nproc; i++)
+        {
+            SetThreadAffinityMask(GetCurrentThread(), (((DWORD_PTR)1)<<i));
+            execute_x86cpuid(0x1, 0, &eax, &ebx, &ecx, &edx);
+            apic_id[i] = ebx >> 24;
+        }
+        SetThreadAffinityMask(GetCurrentThread(), save_affinity);
+#define CPUID_HAVE_APIC
+#endif
+#ifdef CPUID_HAVE_APIC
+        /* AMD does not support SMT yet - there are no hwthread bits in apic ID */
+        hwthread_bits = 0;
+        /* Get number of core bits in apic ID - try modern extended method first */
+        execute_x86cpuid(0x80000008, 0, &eax, &ebx, &ecx, &edx);
+        core_bits = (ecx >> 12) & 0xf;
+        if (core_bits == 0)
+        {
+            /* Legacy method for old single/dual core AMD CPUs */
+            int i = ecx & 0xF;
+            for (core_bits = 0; (i>>core_bits) > 0; core_bits++)
+            {
+                ;
+            }
+        }
-        cpuid_x86_decode_apic_id(cpuid, apic_id, core_bits, hwthread_bits);
-        cpuid->have_cpu_topology = 1;
++        ret = cpuid_x86_decode_apic_id(cpuid, apic_id, core_bits,
++                                       hwthread_bits);
++        cpuid->have_cpu_topology = (ret == 0);
+#endif
+    }
+    return 0;
+}
+
+/* Detection of Intel-specific CPU features */
+static int
+cpuid_check_intel_x86(gmx_cpuid_t cpuid)
+{
-    unsigned int max_stdfn, max_extfn;
++    unsigned int max_stdfn, max_extfn, ret;
+    unsigned int eax, ebx, ecx, edx;
+    unsigned int max_logical_cores, max_physical_cores;
+    int hwthread_bits, core_bits;
+    int * apic_id;
+
+    cpuid_check_common_x86(cpuid);
+
+    execute_x86cpuid(0x0, 0, &eax,
&ebx, &ecx, &edx); + max_stdfn = eax; + + execute_x86cpuid(0x80000000, 0, &eax, &ebx, &ecx, &edx); + max_extfn = eax; + + if (max_stdfn >= 1) + { + execute_x86cpuid(0x1, 0, &eax, &ebx, &ecx, &edx); + cpuid->feature[GMX_CPUID_FEATURE_X86_PDCM] = (ecx & (1 << 15)) != 0; + cpuid->feature[GMX_CPUID_FEATURE_X86_PCID] = (ecx & (1 << 17)) != 0; + cpuid->feature[GMX_CPUID_FEATURE_X86_X2APIC] = (ecx & (1 << 21)) != 0; + cpuid->feature[GMX_CPUID_FEATURE_X86_TDT] = (ecx & (1 << 24)) != 0; + } + + if (max_stdfn >= 7) + { + execute_x86cpuid(0x7, 0, &eax, &ebx, &ecx, &edx); + cpuid->feature[GMX_CPUID_FEATURE_X86_AVX2] = (ebx & (1 << 5)) != 0; + } + + /* Check whether Hyper-Threading is enabled, not only supported */ + if (cpuid->feature[GMX_CPUID_FEATURE_X86_HTT] && max_stdfn >= 4) + { + execute_x86cpuid(0x1, 0, &eax, &ebx, &ecx, &edx); + max_logical_cores = (ebx >> 16) & 0x0FF; + execute_x86cpuid(0x4, 0, &eax, &ebx, &ecx, &edx); + max_physical_cores = ((eax >> 26) & 0x3F) + 1; + + /* Clear HTT flag if we only have 1 logical core per physical */ + if (max_logical_cores/max_physical_cores < 2) + { + cpuid->feature[GMX_CPUID_FEATURE_X86_HTT] = 0; + } + } + + if (max_stdfn >= 0xB) + { + /* Query x2 APIC information from cores */ +#if (defined HAVE_SCHED_H && defined HAVE_SCHED_SETAFFINITY && defined HAVE_SYSCONF && defined __linux__) + /* Linux */ + unsigned int i; + cpu_set_t cpuset, save_cpuset; + cpuid->nproc = sysconf(_SC_NPROCESSORS_ONLN); + apic_id = malloc(sizeof(int)*cpuid->nproc); + sched_getaffinity(0, sizeof(cpu_set_t), &save_cpuset); + /* Get x2APIC ID from each hardware thread */ + CPU_ZERO(&cpuset); + for (i = 0; i < cpuid->nproc; i++) + { + CPU_SET(i, &cpuset); + sched_setaffinity(0, sizeof(cpu_set_t), &cpuset); + execute_x86cpuid(0xB, 0, &eax, &ebx, &ecx, &edx); + apic_id[i] = edx; + CPU_CLR(i, &cpuset); + } + /* Reset affinity to the value it had when calling this routine */ + sched_setaffinity(0, sizeof(cpu_set_t), &save_cpuset); +#define CPUID_HAVE_APIC +#elif defined GMX_NATIVE_WINDOWS + /* Windows */ + DWORD_PTR i; + SYSTEM_INFO sysinfo; + unsigned int save_affinity, affinity; + GetSystemInfo( &sysinfo ); + cpuid->nproc = sysinfo.dwNumberOfProcessors; + apic_id = malloc(sizeof(int)*cpuid->nproc); + /* Get previous affinity mask */ + save_affinity = SetThreadAffinityMask(GetCurrentThread(), 1); + for (i = 0; i < cpuid->nproc; i++) + { + SetThreadAffinityMask(GetCurrentThread(), (((DWORD_PTR)1)<have_cpu_topology = 1; ++ ret = cpuid_x86_decode_apic_id(cpuid, apic_id, core_bits, ++ hwthread_bits); ++ cpuid->have_cpu_topology = (ret == 0); +#endif + } + return 0; +} +#endif /* GMX_CPUID_X86 */ + + + + +static void +chomp_substring_before_colon(const char *in, char *s, int maxlength) +{ + char *p; + strncpy(s,in,maxlength); + p = strchr(s,':'); + if(p!=NULL) + { + *p='\0'; + while(isspace(*(--p)) && (p>=s)) + { + *p='\0'; + } + } + else + { + *s='\0'; + } +} + +static void +chomp_substring_after_colon(const char *in, char *s, int maxlength) +{ + char *p; + if( (p = strchr(in,':'))!=NULL) + { + p++; + while(isspace(*p)) p++; + strncpy(s,p,maxlength); + p = s+strlen(s); + while(isspace(*(--p)) && (p>=s)) + { + *p='\0'; + } + } + else + { + *s='\0'; + } +} + +/* Try to find the vendor of the current CPU, so we know what specific + * detection routine to call. 
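+ * On x86 this executes CPUID level 0; on other Linux platforms it falls
+ * back to scanning /proc/cpuinfo for a line such as (illustrative):
+ *     vendor_id       : GenuineIntel
+ * which the chomp_substring_*_colon() helpers above split into a key and
+ * a value that is matched against gmx_cpuid_vendor_string[].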
+ */ +static enum gmx_cpuid_vendor +cpuid_check_vendor(void) +{ + enum gmx_cpuid_vendor i, vendor; + /* Register data used on x86 */ + unsigned int eax, ebx, ecx, edx; + char vendorstring[13]; + FILE * fp; + char buffer[255],buffer2[255]; + + /* Set default first */ + vendor = GMX_CPUID_VENDOR_UNKNOWN; + +#ifdef GMX_CPUID_X86 + execute_x86cpuid(0x0, 0, &eax, &ebx, &ecx, &edx); + + memcpy(vendorstring, &ebx, 4); + memcpy(vendorstring+4, &edx, 4); + memcpy(vendorstring+8, &ecx, 4); + + vendorstring[12] = '\0'; + + for (i = GMX_CPUID_VENDOR_UNKNOWN; i < GMX_CPUID_NVENDORS; i++) + { + if (!strncmp(vendorstring, gmx_cpuid_vendor_string[i], 12)) + { + vendor = i; + } + } +#elif defined(__linux__) || defined(__linux) + /* General Linux. Try to get CPU vendor from /proc/cpuinfo */ + if( (fp = fopen("/proc/cpuinfo","r")) != NULL) + { + while( (vendor == GMX_CPUID_VENDOR_UNKNOWN) && (fgets(buffer,sizeof(buffer),fp) != NULL)) + { + chomp_substring_before_colon(buffer,buffer2,sizeof(buffer2)); + /* Intel/AMD use "vendor_id", IBM "vendor". Fujitsu "manufacture". Add others if you have them! */ + if( !strcmp(buffer2,"vendor_id") || !strcmp(buffer2,"vendor") || !strcmp(buffer2,"manufacture") ) + { + chomp_substring_after_colon(buffer,buffer2,sizeof(buffer2)); + for(i=GMX_CPUID_VENDOR_UNKNOWN; ihave_cpu_topology) + { + *nprocessors = cpuid->nproc; + *npackages = cpuid->npackages; + *ncores_per_package = cpuid->ncores_per_package; + *nhwthreads_per_core = cpuid->nhwthreads_per_core; + *package_id = cpuid->package_id; + *core_id = cpuid->core_id; + *hwthread_id = cpuid->hwthread_id; + *locality_order = cpuid->locality_order; + rc = 0; + } + else + { + rc = -1; + } + return rc; +} + + +enum gmx_cpuid_x86_smt +gmx_cpuid_x86_smt(gmx_cpuid_t cpuid) +{ + enum gmx_cpuid_x86_smt rc; + + if (cpuid->have_cpu_topology) + { + rc = (cpuid->nhwthreads_per_core > 1) ? GMX_CPUID_X86_SMT_ENABLED : GMX_CPUID_X86_SMT_DISABLED; + } + else if (cpuid->vendor == GMX_CPUID_VENDOR_AMD || gmx_cpuid_feature(cpuid, GMX_CPUID_FEATURE_X86_HTT) == 0) + { + rc = GMX_CPUID_X86_SMT_DISABLED; + } + else + { + rc = GMX_CPUID_X86_SMT_CANNOTDETECT; + } + return rc; +} + + +int +gmx_cpuid_init (gmx_cpuid_t * pcpuid) +{ + gmx_cpuid_t cpuid; + int i; + FILE * fp; + char buffer[255],buffer2[255]; + int found_brand; + + cpuid = malloc(sizeof(*cpuid)); + + *pcpuid = cpuid; + + for (i = 0; i < GMX_CPUID_NFEATURES; i++) + { + cpuid->feature[i] = 0; + } + + cpuid->have_cpu_topology = 0; + cpuid->nproc = 0; + cpuid->npackages = 0; + cpuid->ncores_per_package = 0; + cpuid->nhwthreads_per_core = 0; + cpuid->package_id = NULL; + cpuid->core_id = NULL; + cpuid->hwthread_id = NULL; + cpuid->locality_order = NULL; + + cpuid->vendor = cpuid_check_vendor(); + + switch (cpuid->vendor) + { +#ifdef GMX_CPUID_X86 + case GMX_CPUID_VENDOR_INTEL: + cpuid_check_intel_x86(cpuid); + break; + case GMX_CPUID_VENDOR_AMD: + cpuid_check_amd_x86(cpuid); + break; +#endif + default: + /* Default value */ + strncpy(cpuid->brand,"Unknown CPU brand",GMX_CPUID_BRAND_MAXLEN); +#if defined(__linux__) || defined(__linux) + /* General Linux. Try to get CPU type from /proc/cpuinfo */ + if( (fp = fopen("/proc/cpuinfo","r")) != NULL) + { + found_brand = 0; + while( (found_brand==0) && (fgets(buffer,sizeof(buffer),fp) !=NULL)) + { + chomp_substring_before_colon(buffer,buffer2,sizeof(buffer2)); + /* Intel uses "model name", Fujitsu and IBM "cpu". 
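+                 * An illustrative /proc/cpuinfo line that this matches:
+                 *     model name      : Intel(R) Core(TM) i7-2600 CPU @ 3.40GHz
+                 * The text after the colon is copied into cpuid->brand.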
*/ + if( !strcmp(buffer2,"model name") || !strcmp(buffer2,"cpu")) + { + chomp_substring_after_colon(buffer,cpuid->brand,GMX_CPUID_BRAND_MAXLEN); + found_brand = 1; + } + } + } + fclose(fp); +#endif + cpuid->family = 0; + cpuid->model = 0; + cpuid->stepping = 0; + + for(i=0; ifeature[i]=0; + } + cpuid->feature[GMX_CPUID_FEATURE_CANNOTDETECT] = 1; + break; + } + return 0; +} + + + +void +gmx_cpuid_done (gmx_cpuid_t cpuid) +{ + free(cpuid); +} + + +int +gmx_cpuid_formatstring (gmx_cpuid_t cpuid, + char * str, + int n) +{ + int c; + int i; + enum gmx_cpuid_feature feature; + +#ifdef _MSC_VER + _snprintf(str, n, + "Vendor: %s\n" + "Brand: %s\n" + "Family: %2d Model: %2d Stepping: %2d\n" + "Features:", + gmx_cpuid_vendor_string[gmx_cpuid_vendor(cpuid)], + gmx_cpuid_brand(cpuid), + gmx_cpuid_family(cpuid), gmx_cpuid_model(cpuid), gmx_cpuid_stepping(cpuid)); +#else + snprintf(str, n, + "Vendor: %s\n" + "Brand: %s\n" + "Family: %2d Model: %2d Stepping: %2d\n" + "Features:", + gmx_cpuid_vendor_string[gmx_cpuid_vendor(cpuid)], + gmx_cpuid_brand(cpuid), + gmx_cpuid_family(cpuid), gmx_cpuid_model(cpuid), gmx_cpuid_stepping(cpuid)); +#endif + + str[n-1] = '\0'; + c = strlen(str); + n -= c; + str += c; + + for (feature = GMX_CPUID_FEATURE_CANNOTDETECT; feature < GMX_CPUID_NFEATURES; feature++) + { + if (gmx_cpuid_feature(cpuid, feature) == 1) + { +#ifdef _MSC_VER + _snprintf(str, n, " %s", gmx_cpuid_feature_string[feature]); +#else + snprintf(str, n, " %s", gmx_cpuid_feature_string[feature]); +#endif + str[n-1] = '\0'; + c = strlen(str); + n -= c; + str += c; + } + } +#ifdef _MSC_VER + _snprintf(str, n, "\n"); +#else + snprintf(str, n, "\n"); +#endif + str[n-1] = '\0'; + + return 0; +} + + + +enum gmx_cpuid_acceleration +gmx_cpuid_acceleration_suggest (gmx_cpuid_t cpuid) +{ + enum gmx_cpuid_acceleration tmpacc; + + tmpacc = GMX_CPUID_ACCELERATION_NONE; + + if (gmx_cpuid_vendor(cpuid) == GMX_CPUID_VENDOR_INTEL) + { + if (gmx_cpuid_feature(cpuid, GMX_CPUID_FEATURE_X86_AVX)) + { + tmpacc = GMX_CPUID_ACCELERATION_X86_AVX_256; + } + else if (gmx_cpuid_feature(cpuid, GMX_CPUID_FEATURE_X86_SSE4_1)) + { + tmpacc = GMX_CPUID_ACCELERATION_X86_SSE4_1; + } + else if (gmx_cpuid_feature(cpuid, GMX_CPUID_FEATURE_X86_SSE2)) + { + tmpacc = GMX_CPUID_ACCELERATION_X86_SSE2; + } + } + else if (gmx_cpuid_vendor(cpuid) == GMX_CPUID_VENDOR_AMD) + { + if (gmx_cpuid_feature(cpuid, GMX_CPUID_FEATURE_X86_AVX)) + { + tmpacc = GMX_CPUID_ACCELERATION_X86_AVX_128_FMA; + } + else if (gmx_cpuid_feature(cpuid, GMX_CPUID_FEATURE_X86_SSE4_1)) + { + tmpacc = GMX_CPUID_ACCELERATION_X86_SSE4_1; + } + else if (gmx_cpuid_feature(cpuid, GMX_CPUID_FEATURE_X86_SSE2)) + { + tmpacc = GMX_CPUID_ACCELERATION_X86_SSE2; + } + } + else if(gmx_cpuid_vendor(cpuid)==GMX_CPUID_VENDOR_FUJITSU) + { + if(strstr(gmx_cpuid_brand(cpuid),"SPARC64")) + { + tmpacc = GMX_CPUID_ACCELERATION_SPARC64_HPC_ACE; + } + } + return tmpacc; +} + + + +int +gmx_cpuid_acceleration_check(gmx_cpuid_t cpuid, + FILE * log) +{ + int rc; + char str[1024]; + enum gmx_cpuid_acceleration acc; + + acc = gmx_cpuid_acceleration_suggest(cpuid); + + rc = (acc != compiled_acc); + + gmx_cpuid_formatstring(cpuid, str, 1023); + str[1023] = '\0'; + + if (log != NULL) + { + fprintf(log, + "\nDetecting CPU-specific acceleration.\nPresent hardware specification:\n" + "%s" + "Acceleration most likely to fit this hardware: %s\n" + "Acceleration selected at GROMACS compile time: %s\n\n", + str, + gmx_cpuid_acceleration_string[acc], + gmx_cpuid_acceleration_string[compiled_acc]); + } + + if (rc != 0) + { + 
if (log != NULL) + { + fprintf(log, "\nBinary not matching hardware - you might be losing performance.\n" + "Acceleration most likely to fit this hardware: %s\n" + "Acceleration selected at GROMACS compile time: %s\n\n", + gmx_cpuid_acceleration_string[acc], + gmx_cpuid_acceleration_string[compiled_acc]); + } + printf("Compiled acceleration: %s (Gromacs could use %s on this machine, which is better)\n", + gmx_cpuid_acceleration_string[compiled_acc], + gmx_cpuid_acceleration_string[acc]); + } + return rc; +} + + +#ifdef GMX_CPUID_STANDALONE +/* Stand-alone program to enable queries of CPU features from Cmake. + * Note that you need to check inline ASM capabilities before compiling and set + * -DGMX_X86_GCC_INLINE_ASM for the cpuid instruction to work... + */ +int +main(int argc, char **argv) +{ + gmx_cpuid_t cpuid; + enum gmx_cpuid_acceleration acc; + int i, cnt; + + if (argc < 2) + { + fprintf(stdout, + "Usage:\n\n%s [flags]\n\n" + "Available flags:\n" + "-vendor Print CPU vendor.\n" + "-brand Print CPU brand string.\n" + "-family Print CPU family version.\n" + "-model Print CPU model version.\n" + "-stepping Print CPU stepping version.\n" + "-features Print CPU feature flags.\n" + "-acceleration Print suggested GROMACS acceleration.\n", + argv[0]); + exit(0); + } + + gmx_cpuid_init(&cpuid); + + if (!strncmp(argv[1], "-vendor", 3)) + { + printf("%s\n", gmx_cpuid_vendor_string[cpuid->vendor]); + } + else if (!strncmp(argv[1], "-brand", 3)) + { + printf("%s\n", cpuid->brand); + } + else if (!strncmp(argv[1], "-family", 3)) + { + printf("%d\n", cpuid->family); + } + else if (!strncmp(argv[1], "-model", 3)) + { + printf("%d\n", cpuid->model); + } + else if (!strncmp(argv[1], "-stepping", 3)) + { + printf("%d\n", cpuid->stepping); + } + else if (!strncmp(argv[1], "-features", 3)) + { + cnt = 0; + for (i = 0; i < GMX_CPUID_NFEATURES; i++) + { + if (cpuid->feature[i] == 1) + { + if (cnt++ > 0) + { + printf(" "); + } + printf("%s", gmx_cpuid_feature_string[i]); + } + } + printf("\n"); + } + else if (!strncmp(argv[1], "-acceleration", 3)) + { + acc = gmx_cpuid_acceleration_suggest(cpuid); + fprintf(stdout, "%s\n", gmx_cpuid_acceleration_string[acc]); + } + + gmx_cpuid_done(cpuid); + + + return 0; +} + +#endif diff --cc src/gromacs/legacyheaders/gmx_math_x86_sse4_1_double.h index 7aeb7cb841,0000000000..2498f11c4e mode 100644,000000..100644 --- a/src/gromacs/legacyheaders/gmx_math_x86_sse4_1_double.h +++ b/src/gromacs/legacyheaders/gmx_math_x86_sse4_1_double.h @@@ -1,1482 -1,0 +1,1477 @@@ +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*- + * + * + * This file is part of GROMACS. + * Copyright (c) 2012- + * + * Written by the Gromacs development team under coordination of + * David van der Spoel, Berk Hess, and Erik Lindahl. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * To help us fund GROMACS development, we humbly ask that you cite + * the research papers on the package. 
Check out http://www.gromacs.org
+ *
+ * And Hey:
+ * Gnomes, ROck Monsters And Chili Sauce
+ */
+#ifndef _gmx_math_x86_sse4_1_double_h_
+#define _gmx_math_x86_sse4_1_double_h_
+
+#include <math.h>
+#include <smmintrin.h>
+
+#include "gmx_x86_sse4_1.h"
+
+
+
+#ifndef M_PI
+# define M_PI 3.14159265358979323846264338327950288
+#endif
+
+/************************
+ *                      *
+ * Simple math routines *
+ *                      *
+ ************************/
+
+/* 1.0/sqrt(x) */
+static gmx_inline __m128d
+gmx_mm_invsqrt_pd(__m128d x)
+{
+    const __m128d half  = _mm_set1_pd(0.5);
+    const __m128d three = _mm_set1_pd(3.0);
+
+    /* Lookup instruction only exists in single precision, convert back and forth... */
+    __m128d lu = _mm_cvtps_pd(_mm_rsqrt_ps( _mm_cvtpd_ps(x)));
+
+    /* Perform two Newton-Raphson steps to recover full double precision */
+    lu = _mm_mul_pd(half, _mm_mul_pd(_mm_sub_pd(three, _mm_mul_pd(_mm_mul_pd(lu, lu), x)), lu));
+    return _mm_mul_pd(half, _mm_mul_pd(_mm_sub_pd(three, _mm_mul_pd(_mm_mul_pd(lu, lu), x)), lu));
+}
+
+/* 1.0/sqrt(x), done for a pair of arguments to improve throughput */
+static void
+gmx_mm_invsqrt_pair_pd(__m128d x1, __m128d x2, __m128d *invsqrt1, __m128d *invsqrt2)
+{
+    const __m128d half   = _mm_set1_pd(0.5);
+    const __m128d three  = _mm_set1_pd(3.0);
+    const __m128  halff  = _mm_set1_ps(0.5f);
+    const __m128  threef = _mm_set1_ps(3.0f);
+
+    __m128  xf, luf;
+    __m128d lu1, lu2;
+
+    /* Do first N-R step in float for 2x throughput */
+    xf  = _mm_shuffle_ps(_mm_cvtpd_ps(x1), _mm_cvtpd_ps(x2), _MM_SHUFFLE(1, 0, 1, 0));
+    luf = _mm_rsqrt_ps(xf);
+    luf = _mm_mul_ps(halff, _mm_mul_ps(_mm_sub_ps(threef, _mm_mul_ps(_mm_mul_ps(luf, luf), xf)), luf));
+
+    lu2 = _mm_cvtps_pd(_mm_shuffle_ps(luf, luf, _MM_SHUFFLE(3, 2, 3, 2)));
+    lu1 = _mm_cvtps_pd(luf);
+
+    *invsqrt1 = _mm_mul_pd(half, _mm_mul_pd(_mm_sub_pd(three, _mm_mul_pd(_mm_mul_pd(lu1, lu1), x1)), lu1));
+    *invsqrt2 = _mm_mul_pd(half, _mm_mul_pd(_mm_sub_pd(three, _mm_mul_pd(_mm_mul_pd(lu2, lu2), x2)), lu2));
+}
+
+/* sqrt(x) - Do NOT use this (but rather invsqrt) if you actually need 1.0/sqrt(x) */
+static gmx_inline __m128d
+gmx_mm_sqrt_pd(__m128d x)
+{
+    __m128d mask;
+    __m128d res;
+
+    mask = _mm_cmpeq_pd(x, _mm_setzero_pd());
+    res  = _mm_andnot_pd(mask, gmx_mm_invsqrt_pd(x));
+
+    res  = _mm_mul_pd(x, res);
+
+    return res;
+}
+
+/* 1.0/x */
+static gmx_inline __m128d
+gmx_mm_inv_pd(__m128d x)
+{
+    const __m128d two = _mm_set1_pd(2.0);
+
+    /* Lookup instruction only exists in single precision, convert back and forth... */
+    __m128d lu = _mm_cvtps_pd(_mm_rcp_ps( _mm_cvtpd_ps(x)));
+
+    /* Perform two N-R steps for double precision */
+    lu = _mm_mul_pd(lu, _mm_sub_pd(two, _mm_mul_pd(x, lu)));
+    return _mm_mul_pd(lu, _mm_sub_pd(two, _mm_mul_pd(x, lu)));
+}
+
+static gmx_inline __m128d
+gmx_mm_abs_pd(__m128d x)
+{
+    const __m128d signmask = gmx_mm_castsi128_pd( _mm_set_epi32(0x7FFFFFFF, 0xFFFFFFFF, 0x7FFFFFFF, 0xFFFFFFFF) );
+
+    return _mm_and_pd(x, signmask);
+}
+
+
+/*
+ * 2^x function.
+ *
+ * The approximation on [-0.5,0.5] is a rational Padé approximation, 1+2*P(x^2)/(Q(x^2)-P(x^2)),
+ * according to the same algorithm as used in the Cephes/netlib math routines.
+ */
+static __m128d
+gmx_mm_exp2_pd(__m128d x)
+{
+    /* Lower bound: We do not allow numbers that would lead to an IEEE fp representation exponent smaller than -1022.
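+     * Overall strategy of the routine: split x = intpart + z with |z| <= 0.5,
+     * build 2^intpart by writing intpart+1023 straight into the IEEE-754
+     * exponent field (the _mm_slli_epi64(...,52) below), and evaluate 2^z by
+     * the rational approximation; e.g. 2^3.3 = 2^3 * 2^0.3.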
*/ + const __m128d arglimit = _mm_set1_pd(1022.0); + const __m128i expbase = _mm_set1_epi32(1023); + + const __m128d P2 = _mm_set1_pd(2.30933477057345225087e-2); + const __m128d P1 = _mm_set1_pd(2.02020656693165307700e1); + const __m128d P0 = _mm_set1_pd(1.51390680115615096133e3); + /* Q2 == 1.0 */ + const __m128d Q1 = _mm_set1_pd(2.33184211722314911771e2); + const __m128d Q0 = _mm_set1_pd(4.36821166879210612817e3); + const __m128d one = _mm_set1_pd(1.0); + const __m128d two = _mm_set1_pd(2.0); + + __m128d valuemask; + __m128i iexppart; + __m128d fexppart; + __m128d intpart; + __m128d z, z2; + __m128d PolyP, PolyQ; + + iexppart = _mm_cvtpd_epi32(x); + intpart = _mm_round_pd(x, _MM_FROUND_TO_NEAREST_INT); + + /* The two lowest elements of iexppart now contains 32-bit numbers with a correctly biased exponent. + * To be able to shift it into the exponent for a double precision number we first need to + * shuffle so that the lower half contains the first element, and the upper half the second. + * This should really be done as a zero-extension, but since the next instructions will shift + * the registers left by 52 bits it doesn't matter what we put there - it will be shifted out. + * (thus we just use element 2 from iexppart). + */ + iexppart = _mm_shuffle_epi32(iexppart, _MM_SHUFFLE(2, 1, 2, 0)); + + /* Do the shift operation on the 64-bit registers */ + iexppart = _mm_add_epi32(iexppart, expbase); + iexppart = _mm_slli_epi64(iexppart, 52); + + valuemask = _mm_cmpge_pd(arglimit, gmx_mm_abs_pd(x)); + fexppart = _mm_and_pd(valuemask, gmx_mm_castsi128_pd(iexppart)); + + z = _mm_sub_pd(x, intpart); + z2 = _mm_mul_pd(z, z); + + PolyP = _mm_mul_pd(P2, z2); + PolyP = _mm_add_pd(PolyP, P1); + PolyQ = _mm_add_pd(z2, Q1); + PolyP = _mm_mul_pd(PolyP, z2); + PolyQ = _mm_mul_pd(PolyQ, z2); + PolyP = _mm_add_pd(PolyP, P0); + PolyQ = _mm_add_pd(PolyQ, Q0); + PolyP = _mm_mul_pd(PolyP, z); + + z = _mm_mul_pd(PolyP, gmx_mm_inv_pd(_mm_sub_pd(PolyQ, PolyP))); + z = _mm_add_pd(one, _mm_mul_pd(two, z)); + + z = _mm_mul_pd(z, fexppart); + + return z; +} + +/* Exponential function. This could be calculated from 2^x as Exp(x)=2^(y), where y=log2(e)*x, + * but there will then be a small rounding error since we lose some precision due to the + * multiplication. This will then be magnified a lot by the exponential. + * + * Instead, we calculate the fractional part directly as a Padé approximation of + * Exp(z) on [-0.5,0.5]. We use extended precision arithmetics to calculate the fraction + * remaining after 2^y, which avoids the precision-loss. + */ +static __m128d +gmx_mm_exp_pd(__m128d exparg) +{ + const __m128d argscale = _mm_set1_pd(1.4426950408889634073599); + /* Lower bound: We do not allow numbers that would lead to an IEEE fp representation exponent smaller than -126. 
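+     * (For double precision the limiting unbiased exponent is in fact -1022,
+     *  which is why arglimit is 1022.0 below. argscale is log2(e), so
+     *  exp(x) = 2^(x*argscale); invargscale0 + invargscale1 together equal
+     *  ln(2) split into high and low parts, so that z = x - intpart*ln(2)
+     *  can be formed without cancellation error.)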
*/ + const __m128d arglimit = _mm_set1_pd(1022.0); + const __m128i expbase = _mm_set1_epi32(1023); + + const __m128d invargscale0 = _mm_set1_pd(6.93145751953125e-1); + const __m128d invargscale1 = _mm_set1_pd(1.42860682030941723212e-6); + + const __m128d P2 = _mm_set1_pd(1.26177193074810590878e-4); + const __m128d P1 = _mm_set1_pd(3.02994407707441961300e-2); + /* P0 == 1.0 */ + const __m128d Q3 = _mm_set1_pd(3.00198505138664455042E-6); + const __m128d Q2 = _mm_set1_pd(2.52448340349684104192E-3); + const __m128d Q1 = _mm_set1_pd(2.27265548208155028766E-1); + /* Q0 == 2.0 */ + const __m128d one = _mm_set1_pd(1.0); + const __m128d two = _mm_set1_pd(2.0); + + __m128d valuemask; + __m128i iexppart; + __m128d fexppart; + __m128d intpart; + __m128d x, z, z2; + __m128d PolyP, PolyQ; + + x = _mm_mul_pd(exparg, argscale); + + iexppart = _mm_cvtpd_epi32(x); + intpart = _mm_round_pd(x, _MM_FROUND_TO_NEAREST_INT); + + /* The two lowest elements of iexppart now contains 32-bit numbers with a correctly biased exponent. + * To be able to shift it into the exponent for a double precision number we first need to + * shuffle so that the lower half contains the first element, and the upper half the second. + * This should really be done as a zero-extension, but since the next instructions will shift + * the registers left by 52 bits it doesn't matter what we put there - it will be shifted out. + * (thus we just use element 2 from iexppart). + */ + iexppart = _mm_shuffle_epi32(iexppart, _MM_SHUFFLE(2, 1, 2, 0)); + + /* Do the shift operation on the 64-bit registers */ + iexppart = _mm_add_epi32(iexppart, expbase); + iexppart = _mm_slli_epi64(iexppart, 52); + + valuemask = _mm_cmpge_pd(arglimit, gmx_mm_abs_pd(x)); + fexppart = _mm_and_pd(valuemask, gmx_mm_castsi128_pd(iexppart)); + + z = _mm_sub_pd(exparg, _mm_mul_pd(invargscale0, intpart)); + z = _mm_sub_pd(z, _mm_mul_pd(invargscale1, intpart)); + + z2 = _mm_mul_pd(z, z); + + PolyQ = _mm_mul_pd(Q3, z2); + PolyQ = _mm_add_pd(PolyQ, Q2); + PolyP = _mm_mul_pd(P2, z2); + PolyQ = _mm_mul_pd(PolyQ, z2); + PolyP = _mm_add_pd(PolyP, P1); + PolyQ = _mm_add_pd(PolyQ, Q1); + PolyP = _mm_mul_pd(PolyP, z2); + PolyQ = _mm_mul_pd(PolyQ, z2); + PolyP = _mm_add_pd(PolyP, one); + PolyQ = _mm_add_pd(PolyQ, two); + + PolyP = _mm_mul_pd(PolyP, z); + + z = _mm_mul_pd(PolyP, gmx_mm_inv_pd(_mm_sub_pd(PolyQ, PolyP))); + z = _mm_add_pd(one, _mm_mul_pd(two, z)); + + z = _mm_mul_pd(z, fexppart); + + return z; +} + + + +static __m128d +gmx_mm_log_pd(__m128d x) +{ + /* Same algorithm as cephes library */ + const __m128d expmask = gmx_mm_castsi128_pd( _mm_set_epi32(0x7FF00000, 0x00000000, 0x7FF00000, 0x00000000) ); + + const __m128i expbase_m1 = _mm_set1_epi32(1023-1); /* We want non-IEEE format */ + + const __m128d half = _mm_set1_pd(0.5); + const __m128d one = _mm_set1_pd(1.0); + const __m128d two = _mm_set1_pd(2.0); + const __m128d invsq2 = _mm_set1_pd(1.0/sqrt(2.0)); + + const __m128d corr1 = _mm_set1_pd(-2.121944400546905827679e-4); + const __m128d corr2 = _mm_set1_pd(0.693359375); + + const __m128d P5 = _mm_set1_pd(1.01875663804580931796e-4); + const __m128d P4 = _mm_set1_pd(4.97494994976747001425e-1); + const __m128d P3 = _mm_set1_pd(4.70579119878881725854e0); + const __m128d P2 = _mm_set1_pd(1.44989225341610930846e1); + const __m128d P1 = _mm_set1_pd(1.79368678507819816313e1); + const __m128d P0 = _mm_set1_pd(7.70838733755885391666e0); + + const __m128d Q4 = _mm_set1_pd(1.12873587189167450590e1); + const __m128d Q3 = _mm_set1_pd(4.52279145837532221105e1); + const __m128d Q2 = 
_mm_set1_pd(8.29875266912776603211e1); + const __m128d Q1 = _mm_set1_pd(7.11544750618563894466e1); + const __m128d Q0 = _mm_set1_pd(2.31251620126765340583e1); + + const __m128d R2 = _mm_set1_pd(-7.89580278884799154124e-1); + const __m128d R1 = _mm_set1_pd(1.63866645699558079767e1); + const __m128d R0 = _mm_set1_pd(-6.41409952958715622951e1); + + const __m128d S2 = _mm_set1_pd(-3.56722798256324312549E1); + const __m128d S1 = _mm_set1_pd(3.12093766372244180303E2); + const __m128d S0 = _mm_set1_pd(-7.69691943550460008604E2); + + __m128d fexp; + __m128i iexp; + + __m128d mask1, mask2; + __m128d corr, t1, t2, q; + __m128d zA, yA, xA, zB, yB, xB, z; + __m128d polyR, polyS; + __m128d polyP1, polyP2, polyQ1, polyQ2; + + /* Separate x into exponent and mantissa, with a mantissa in the range [0.5..1[ (not IEEE754 standard!) */ + fexp = _mm_and_pd(x, expmask); + iexp = gmx_mm_castpd_si128(fexp); + iexp = _mm_srli_epi64(iexp, 52); + iexp = _mm_sub_epi32(iexp, expbase_m1); + iexp = _mm_shuffle_epi32(iexp, _MM_SHUFFLE(1, 1, 2, 0) ); + fexp = _mm_cvtepi32_pd(iexp); + + x = _mm_andnot_pd(expmask, x); + x = _mm_or_pd(x, one); + x = _mm_mul_pd(x, half); + + mask1 = _mm_cmpgt_pd(gmx_mm_abs_pd(fexp), two); + mask2 = _mm_cmplt_pd(x, invsq2); + + fexp = _mm_sub_pd(fexp, _mm_and_pd(mask2, one)); + + /* If mask1 is set ('A') */ + zA = _mm_sub_pd(x, half); + t1 = _mm_blendv_pd( zA, x, mask2 ); + zA = _mm_sub_pd(t1, half); + t2 = _mm_blendv_pd( x, zA, mask2 ); + yA = _mm_mul_pd(half, _mm_add_pd(t2, one)); + + xA = _mm_mul_pd(zA, gmx_mm_inv_pd(yA)); + zA = _mm_mul_pd(xA, xA); + + /* EVALUATE POLY */ + polyR = _mm_mul_pd(R2, zA); + polyR = _mm_add_pd(polyR, R1); + polyR = _mm_mul_pd(polyR, zA); + polyR = _mm_add_pd(polyR, R0); + + polyS = _mm_add_pd(zA, S2); + polyS = _mm_mul_pd(polyS, zA); + polyS = _mm_add_pd(polyS, S1); + polyS = _mm_mul_pd(polyS, zA); + polyS = _mm_add_pd(polyS, S0); + + q = _mm_mul_pd(polyR, gmx_mm_inv_pd(polyS)); + zA = _mm_mul_pd(_mm_mul_pd(xA, zA), q); + + zA = _mm_add_pd(zA, _mm_mul_pd(corr1, fexp)); + zA = _mm_add_pd(zA, xA); + zA = _mm_add_pd(zA, _mm_mul_pd(corr2, fexp)); + + /* If mask1 is not set ('B') */ + corr = _mm_and_pd(mask2, x); + xB = _mm_add_pd(x, corr); + xB = _mm_sub_pd(xB, one); + zB = _mm_mul_pd(xB, xB); + + polyP1 = _mm_mul_pd(P5, zB); + polyP2 = _mm_mul_pd(P4, zB); + polyP1 = _mm_add_pd(polyP1, P3); + polyP2 = _mm_add_pd(polyP2, P2); + polyP1 = _mm_mul_pd(polyP1, zB); + polyP2 = _mm_mul_pd(polyP2, zB); + polyP1 = _mm_add_pd(polyP1, P1); + polyP2 = _mm_add_pd(polyP2, P0); + polyP1 = _mm_mul_pd(polyP1, xB); + polyP1 = _mm_add_pd(polyP1, polyP2); + + polyQ2 = _mm_mul_pd(Q4, zB); + polyQ1 = _mm_add_pd(zB, Q3); + polyQ2 = _mm_add_pd(polyQ2, Q2); + polyQ1 = _mm_mul_pd(polyQ1, zB); + polyQ2 = _mm_mul_pd(polyQ2, zB); + polyQ1 = _mm_add_pd(polyQ1, Q1); + polyQ2 = _mm_add_pd(polyQ2, Q0); + polyQ1 = _mm_mul_pd(polyQ1, xB); + polyQ1 = _mm_add_pd(polyQ1, polyQ2); + + fexp = _mm_and_pd(fexp, _mm_cmpneq_pd(fexp, _mm_setzero_pd())); + + q = _mm_mul_pd(polyP1, gmx_mm_inv_pd(polyQ1)); + yB = _mm_mul_pd(_mm_mul_pd(xB, zB), q); + + yB = _mm_add_pd(yB, _mm_mul_pd(corr1, fexp)); + yB = _mm_sub_pd(yB, _mm_mul_pd(half, zB)); + zB = _mm_add_pd(xB, yB); + zB = _mm_add_pd(zB, _mm_mul_pd(corr2, fexp)); + + z = _mm_blendv_pd( zB, zA, mask1 ); + + return z; +} + + +static __m128d +gmx_mm_erf_pd(__m128d x) +{ + /* Coefficients for minimax approximation of erf(x)=x*(CAoffset + P(x^2)/Q(x^2)) in range [-0.75,0.75] */ + const __m128d CAP4 = _mm_set1_pd(-0.431780540597889301512e-4); + const __m128d CAP3 = 
_mm_set1_pd(-0.00578562306260059236059); + const __m128d CAP2 = _mm_set1_pd(-0.028593586920219752446); + const __m128d CAP1 = _mm_set1_pd(-0.315924962948621698209); + const __m128d CAP0 = _mm_set1_pd(0.14952975608477029151); + + const __m128d CAQ5 = _mm_set1_pd(-0.374089300177174709737e-5); + const __m128d CAQ4 = _mm_set1_pd(0.00015126584532155383535); + const __m128d CAQ3 = _mm_set1_pd(0.00536692680669480725423); + const __m128d CAQ2 = _mm_set1_pd(0.0668686825594046122636); + const __m128d CAQ1 = _mm_set1_pd(0.402604990869284362773); + /* CAQ0 == 1.0 */ + const __m128d CAoffset = _mm_set1_pd(0.9788494110107421875); + + /* Coefficients for minimax approximation of erfc(x)=exp(-x^2)*x*(P(x-1)/Q(x-1)) in range [1.0,4.5] */ + const __m128d CBP6 = _mm_set1_pd(2.49650423685462752497647637088e-10); + const __m128d CBP5 = _mm_set1_pd(0.00119770193298159629350136085658); + const __m128d CBP4 = _mm_set1_pd(0.0164944422378370965881008942733); + const __m128d CBP3 = _mm_set1_pd(0.0984581468691775932063932439252); + const __m128d CBP2 = _mm_set1_pd(0.317364595806937763843589437418); + const __m128d CBP1 = _mm_set1_pd(0.554167062641455850932670067075); + const __m128d CBP0 = _mm_set1_pd(0.427583576155807163756925301060); + const __m128d CBQ7 = _mm_set1_pd(0.00212288829699830145976198384930); + const __m128d CBQ6 = _mm_set1_pd(0.0334810979522685300554606393425); + const __m128d CBQ5 = _mm_set1_pd(0.2361713785181450957579508850717); + const __m128d CBQ4 = _mm_set1_pd(0.955364736493055670530981883072); + const __m128d CBQ3 = _mm_set1_pd(2.36815675631420037315349279199); + const __m128d CBQ2 = _mm_set1_pd(3.55261649184083035537184223542); + const __m128d CBQ1 = _mm_set1_pd(2.93501136050160872574376997993); + /* CBQ0 == 1.0 */ + + /* Coefficients for minimax approximation of erfc(x)=exp(-x^2)/x*(P(1/x)/Q(1/x)) in range [4.5,inf] */ + const __m128d CCP6 = _mm_set1_pd(-2.8175401114513378771); + const __m128d CCP5 = _mm_set1_pd(-3.22729451764143718517); + const __m128d CCP4 = _mm_set1_pd(-2.5518551727311523996); + const __m128d CCP3 = _mm_set1_pd(-0.687717681153649930619); + const __m128d CCP2 = _mm_set1_pd(-0.212652252872804219852); + const __m128d CCP1 = _mm_set1_pd(0.0175389834052493308818); + const __m128d CCP0 = _mm_set1_pd(0.00628057170626964891937); + + const __m128d CCQ6 = _mm_set1_pd(5.48409182238641741584); + const __m128d CCQ5 = _mm_set1_pd(13.5064170191802889145); + const __m128d CCQ4 = _mm_set1_pd(22.9367376522880577224); + const __m128d CCQ3 = _mm_set1_pd(15.930646027911794143); + const __m128d CCQ2 = _mm_set1_pd(11.0567237927800161565); + const __m128d CCQ1 = _mm_set1_pd(2.79257750980575282228); + /* CCQ0 == 1.0 */ + const __m128d CCoffset = _mm_set1_pd(0.5579090118408203125); + + const __m128d one = _mm_set1_pd(1.0); + const __m128d two = _mm_set1_pd(2.0); + + const __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000, 0x00000000, 0x80000000, 0x00000000) ); + + __m128d xabs, x2, x4, t, t2, w, w2; + __m128d PolyAP0, PolyAP1, PolyAQ0, PolyAQ1; + __m128d PolyBP0, PolyBP1, PolyBQ0, PolyBQ1; + __m128d PolyCP0, PolyCP1, PolyCQ0, PolyCQ1; + __m128d res_erf, res_erfcB, res_erfcC, res_erfc, res; + __m128d mask, expmx2; + + /* Calculate erf() */ + xabs = gmx_mm_abs_pd(x); + x2 = _mm_mul_pd(x, x); + x4 = _mm_mul_pd(x2, x2); + + PolyAP0 = _mm_mul_pd(CAP4, x4); + PolyAP1 = _mm_mul_pd(CAP3, x4); + PolyAP0 = _mm_add_pd(PolyAP0, CAP2); + PolyAP1 = _mm_add_pd(PolyAP1, CAP1); + PolyAP0 = _mm_mul_pd(PolyAP0, x4); + PolyAP1 = _mm_mul_pd(PolyAP1, x2); + PolyAP0 = _mm_add_pd(PolyAP0, CAP0); + PolyAP0 = 
_mm_add_pd(PolyAP0, PolyAP1); + + PolyAQ1 = _mm_mul_pd(CAQ5, x4); + PolyAQ0 = _mm_mul_pd(CAQ4, x4); + PolyAQ1 = _mm_add_pd(PolyAQ1, CAQ3); + PolyAQ0 = _mm_add_pd(PolyAQ0, CAQ2); + PolyAQ1 = _mm_mul_pd(PolyAQ1, x4); + PolyAQ0 = _mm_mul_pd(PolyAQ0, x4); + PolyAQ1 = _mm_add_pd(PolyAQ1, CAQ1); + PolyAQ0 = _mm_add_pd(PolyAQ0, one); + PolyAQ1 = _mm_mul_pd(PolyAQ1, x2); + PolyAQ0 = _mm_add_pd(PolyAQ0, PolyAQ1); + + res_erf = _mm_mul_pd(PolyAP0, gmx_mm_inv_pd(PolyAQ0)); + res_erf = _mm_add_pd(CAoffset, res_erf); + res_erf = _mm_mul_pd(x, res_erf); + + /* Calculate erfc() in range [1,4.5] */ + t = _mm_sub_pd(xabs, one); + t2 = _mm_mul_pd(t, t); + + PolyBP0 = _mm_mul_pd(CBP6, t2); + PolyBP1 = _mm_mul_pd(CBP5, t2); + PolyBP0 = _mm_add_pd(PolyBP0, CBP4); + PolyBP1 = _mm_add_pd(PolyBP1, CBP3); + PolyBP0 = _mm_mul_pd(PolyBP0, t2); + PolyBP1 = _mm_mul_pd(PolyBP1, t2); + PolyBP0 = _mm_add_pd(PolyBP0, CBP2); + PolyBP1 = _mm_add_pd(PolyBP1, CBP1); + PolyBP0 = _mm_mul_pd(PolyBP0, t2); + PolyBP1 = _mm_mul_pd(PolyBP1, t); + PolyBP0 = _mm_add_pd(PolyBP0, CBP0); + PolyBP0 = _mm_add_pd(PolyBP0, PolyBP1); + + PolyBQ1 = _mm_mul_pd(CBQ7, t2); + PolyBQ0 = _mm_mul_pd(CBQ6, t2); + PolyBQ1 = _mm_add_pd(PolyBQ1, CBQ5); + PolyBQ0 = _mm_add_pd(PolyBQ0, CBQ4); + PolyBQ1 = _mm_mul_pd(PolyBQ1, t2); + PolyBQ0 = _mm_mul_pd(PolyBQ0, t2); + PolyBQ1 = _mm_add_pd(PolyBQ1, CBQ3); + PolyBQ0 = _mm_add_pd(PolyBQ0, CBQ2); + PolyBQ1 = _mm_mul_pd(PolyBQ1, t2); + PolyBQ0 = _mm_mul_pd(PolyBQ0, t2); + PolyBQ1 = _mm_add_pd(PolyBQ1, CBQ1); + PolyBQ0 = _mm_add_pd(PolyBQ0, one); + PolyBQ1 = _mm_mul_pd(PolyBQ1, t); + PolyBQ0 = _mm_add_pd(PolyBQ0, PolyBQ1); + + res_erfcB = _mm_mul_pd(PolyBP0, gmx_mm_inv_pd(PolyBQ0)); + + res_erfcB = _mm_mul_pd(res_erfcB, xabs); + + /* Calculate erfc() in range [4.5,inf] */ + w = gmx_mm_inv_pd(xabs); + w2 = _mm_mul_pd(w, w); + + PolyCP0 = _mm_mul_pd(CCP6, w2); + PolyCP1 = _mm_mul_pd(CCP5, w2); + PolyCP0 = _mm_add_pd(PolyCP0, CCP4); + PolyCP1 = _mm_add_pd(PolyCP1, CCP3); + PolyCP0 = _mm_mul_pd(PolyCP0, w2); + PolyCP1 = _mm_mul_pd(PolyCP1, w2); + PolyCP0 = _mm_add_pd(PolyCP0, CCP2); + PolyCP1 = _mm_add_pd(PolyCP1, CCP1); + PolyCP0 = _mm_mul_pd(PolyCP0, w2); + PolyCP1 = _mm_mul_pd(PolyCP1, w); + PolyCP0 = _mm_add_pd(PolyCP0, CCP0); + PolyCP0 = _mm_add_pd(PolyCP0, PolyCP1); + + PolyCQ0 = _mm_mul_pd(CCQ6, w2); + PolyCQ1 = _mm_mul_pd(CCQ5, w2); + PolyCQ0 = _mm_add_pd(PolyCQ0, CCQ4); + PolyCQ1 = _mm_add_pd(PolyCQ1, CCQ3); + PolyCQ0 = _mm_mul_pd(PolyCQ0, w2); + PolyCQ1 = _mm_mul_pd(PolyCQ1, w2); + PolyCQ0 = _mm_add_pd(PolyCQ0, CCQ2); + PolyCQ1 = _mm_add_pd(PolyCQ1, CCQ1); + PolyCQ0 = _mm_mul_pd(PolyCQ0, w2); + PolyCQ1 = _mm_mul_pd(PolyCQ1, w); + PolyCQ0 = _mm_add_pd(PolyCQ0, one); + PolyCQ0 = _mm_add_pd(PolyCQ0, PolyCQ1); + + expmx2 = gmx_mm_exp_pd( _mm_or_pd(signbit, x2) ); + + res_erfcC = _mm_mul_pd(PolyCP0, gmx_mm_inv_pd(PolyCQ0)); + res_erfcC = _mm_add_pd(res_erfcC, CCoffset); + res_erfcC = _mm_mul_pd(res_erfcC, w); + + mask = _mm_cmpgt_pd(xabs, _mm_set1_pd(4.5)); + res_erfc = _mm_blendv_pd(res_erfcB, res_erfcC, mask); + + res_erfc = _mm_mul_pd(res_erfc, expmx2); + + /* erfc(x<0) = 2-erfc(|x|) */ + mask = _mm_cmplt_pd(x, _mm_setzero_pd()); + res_erfc = _mm_blendv_pd(res_erfc, _mm_sub_pd(two, res_erfc), mask); + + /* Select erf() or erfc() */ + mask = _mm_cmplt_pd(xabs, one); + res = _mm_blendv_pd(_mm_sub_pd(one, res_erfc), res_erf, mask); + + return res; +} + + +static __m128d +gmx_mm_erfc_pd(__m128d x) +{ + /* Coefficients for minimax approximation of erf(x)=x*(CAoffset + P(x^2)/Q(x^2)) in range [-0.75,0.75] */ + 
const __m128d CAP4 = _mm_set1_pd(-0.431780540597889301512e-4); + const __m128d CAP3 = _mm_set1_pd(-0.00578562306260059236059); + const __m128d CAP2 = _mm_set1_pd(-0.028593586920219752446); + const __m128d CAP1 = _mm_set1_pd(-0.315924962948621698209); + const __m128d CAP0 = _mm_set1_pd(0.14952975608477029151); + + const __m128d CAQ5 = _mm_set1_pd(-0.374089300177174709737e-5); + const __m128d CAQ4 = _mm_set1_pd(0.00015126584532155383535); + const __m128d CAQ3 = _mm_set1_pd(0.00536692680669480725423); + const __m128d CAQ2 = _mm_set1_pd(0.0668686825594046122636); + const __m128d CAQ1 = _mm_set1_pd(0.402604990869284362773); + /* CAQ0 == 1.0 */ + const __m128d CAoffset = _mm_set1_pd(0.9788494110107421875); + + /* Coefficients for minimax approximation of erfc(x)=exp(-x^2)*x*(P(x-1)/Q(x-1)) in range [1.0,4.5] */ + const __m128d CBP6 = _mm_set1_pd(2.49650423685462752497647637088e-10); + const __m128d CBP5 = _mm_set1_pd(0.00119770193298159629350136085658); + const __m128d CBP4 = _mm_set1_pd(0.0164944422378370965881008942733); + const __m128d CBP3 = _mm_set1_pd(0.0984581468691775932063932439252); + const __m128d CBP2 = _mm_set1_pd(0.317364595806937763843589437418); + const __m128d CBP1 = _mm_set1_pd(0.554167062641455850932670067075); + const __m128d CBP0 = _mm_set1_pd(0.427583576155807163756925301060); + const __m128d CBQ7 = _mm_set1_pd(0.00212288829699830145976198384930); + const __m128d CBQ6 = _mm_set1_pd(0.0334810979522685300554606393425); + const __m128d CBQ5 = _mm_set1_pd(0.2361713785181450957579508850717); + const __m128d CBQ4 = _mm_set1_pd(0.955364736493055670530981883072); + const __m128d CBQ3 = _mm_set1_pd(2.36815675631420037315349279199); + const __m128d CBQ2 = _mm_set1_pd(3.55261649184083035537184223542); + const __m128d CBQ1 = _mm_set1_pd(2.93501136050160872574376997993); + /* CBQ0 == 1.0 */ + + /* Coefficients for minimax approximation of erfc(x)=exp(-x^2)/x*(P(1/x)/Q(1/x)) in range [4.5,inf] */ + const __m128d CCP6 = _mm_set1_pd(-2.8175401114513378771); + const __m128d CCP5 = _mm_set1_pd(-3.22729451764143718517); + const __m128d CCP4 = _mm_set1_pd(-2.5518551727311523996); + const __m128d CCP3 = _mm_set1_pd(-0.687717681153649930619); + const __m128d CCP2 = _mm_set1_pd(-0.212652252872804219852); + const __m128d CCP1 = _mm_set1_pd(0.0175389834052493308818); + const __m128d CCP0 = _mm_set1_pd(0.00628057170626964891937); + + const __m128d CCQ6 = _mm_set1_pd(5.48409182238641741584); + const __m128d CCQ5 = _mm_set1_pd(13.5064170191802889145); + const __m128d CCQ4 = _mm_set1_pd(22.9367376522880577224); + const __m128d CCQ3 = _mm_set1_pd(15.930646027911794143); + const __m128d CCQ2 = _mm_set1_pd(11.0567237927800161565); + const __m128d CCQ1 = _mm_set1_pd(2.79257750980575282228); + /* CCQ0 == 1.0 */ + const __m128d CCoffset = _mm_set1_pd(0.5579090118408203125); + + const __m128d one = _mm_set1_pd(1.0); + const __m128d two = _mm_set1_pd(2.0); + + const __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000, 0x00000000, 0x80000000, 0x00000000) ); + + __m128d xabs, x2, x4, t, t2, w, w2; + __m128d PolyAP0, PolyAP1, PolyAQ0, PolyAQ1; + __m128d PolyBP0, PolyBP1, PolyBQ0, PolyBQ1; + __m128d PolyCP0, PolyCP1, PolyCQ0, PolyCQ1; + __m128d res_erf, res_erfcB, res_erfcC, res_erfc, res; + __m128d mask, expmx2; + + /* Calculate erf() */ + xabs = gmx_mm_abs_pd(x); + x2 = _mm_mul_pd(x, x); + x4 = _mm_mul_pd(x2, x2); + + PolyAP0 = _mm_mul_pd(CAP4, x4); + PolyAP1 = _mm_mul_pd(CAP3, x4); + PolyAP0 = _mm_add_pd(PolyAP0, CAP2); + PolyAP1 = _mm_add_pd(PolyAP1, CAP1); + PolyAP0 = _mm_mul_pd(PolyAP0, x4); + 
PolyAP1 = _mm_mul_pd(PolyAP1, x2); + PolyAP0 = _mm_add_pd(PolyAP0, CAP0); + PolyAP0 = _mm_add_pd(PolyAP0, PolyAP1); + + PolyAQ1 = _mm_mul_pd(CAQ5, x4); + PolyAQ0 = _mm_mul_pd(CAQ4, x4); + PolyAQ1 = _mm_add_pd(PolyAQ1, CAQ3); + PolyAQ0 = _mm_add_pd(PolyAQ0, CAQ2); + PolyAQ1 = _mm_mul_pd(PolyAQ1, x4); + PolyAQ0 = _mm_mul_pd(PolyAQ0, x4); + PolyAQ1 = _mm_add_pd(PolyAQ1, CAQ1); + PolyAQ0 = _mm_add_pd(PolyAQ0, one); + PolyAQ1 = _mm_mul_pd(PolyAQ1, x2); + PolyAQ0 = _mm_add_pd(PolyAQ0, PolyAQ1); + + res_erf = _mm_mul_pd(PolyAP0, gmx_mm_inv_pd(PolyAQ0)); + res_erf = _mm_add_pd(CAoffset, res_erf); + res_erf = _mm_mul_pd(x, res_erf); + + /* Calculate erfc() in range [1,4.5] */ + t = _mm_sub_pd(xabs, one); + t2 = _mm_mul_pd(t, t); + + PolyBP0 = _mm_mul_pd(CBP6, t2); + PolyBP1 = _mm_mul_pd(CBP5, t2); + PolyBP0 = _mm_add_pd(PolyBP0, CBP4); + PolyBP1 = _mm_add_pd(PolyBP1, CBP3); + PolyBP0 = _mm_mul_pd(PolyBP0, t2); + PolyBP1 = _mm_mul_pd(PolyBP1, t2); + PolyBP0 = _mm_add_pd(PolyBP0, CBP2); + PolyBP1 = _mm_add_pd(PolyBP1, CBP1); + PolyBP0 = _mm_mul_pd(PolyBP0, t2); + PolyBP1 = _mm_mul_pd(PolyBP1, t); + PolyBP0 = _mm_add_pd(PolyBP0, CBP0); + PolyBP0 = _mm_add_pd(PolyBP0, PolyBP1); + + PolyBQ1 = _mm_mul_pd(CBQ7, t2); + PolyBQ0 = _mm_mul_pd(CBQ6, t2); + PolyBQ1 = _mm_add_pd(PolyBQ1, CBQ5); + PolyBQ0 = _mm_add_pd(PolyBQ0, CBQ4); + PolyBQ1 = _mm_mul_pd(PolyBQ1, t2); + PolyBQ0 = _mm_mul_pd(PolyBQ0, t2); + PolyBQ1 = _mm_add_pd(PolyBQ1, CBQ3); + PolyBQ0 = _mm_add_pd(PolyBQ0, CBQ2); + PolyBQ1 = _mm_mul_pd(PolyBQ1, t2); + PolyBQ0 = _mm_mul_pd(PolyBQ0, t2); + PolyBQ1 = _mm_add_pd(PolyBQ1, CBQ1); + PolyBQ0 = _mm_add_pd(PolyBQ0, one); + PolyBQ1 = _mm_mul_pd(PolyBQ1, t); + PolyBQ0 = _mm_add_pd(PolyBQ0, PolyBQ1); + + res_erfcB = _mm_mul_pd(PolyBP0, gmx_mm_inv_pd(PolyBQ0)); + + res_erfcB = _mm_mul_pd(res_erfcB, xabs); + + /* Calculate erfc() in range [4.5,inf] */ + w = gmx_mm_inv_pd(xabs); + w2 = _mm_mul_pd(w, w); + + PolyCP0 = _mm_mul_pd(CCP6, w2); + PolyCP1 = _mm_mul_pd(CCP5, w2); + PolyCP0 = _mm_add_pd(PolyCP0, CCP4); + PolyCP1 = _mm_add_pd(PolyCP1, CCP3); + PolyCP0 = _mm_mul_pd(PolyCP0, w2); + PolyCP1 = _mm_mul_pd(PolyCP1, w2); + PolyCP0 = _mm_add_pd(PolyCP0, CCP2); + PolyCP1 = _mm_add_pd(PolyCP1, CCP1); + PolyCP0 = _mm_mul_pd(PolyCP0, w2); + PolyCP1 = _mm_mul_pd(PolyCP1, w); + PolyCP0 = _mm_add_pd(PolyCP0, CCP0); + PolyCP0 = _mm_add_pd(PolyCP0, PolyCP1); + + PolyCQ0 = _mm_mul_pd(CCQ6, w2); + PolyCQ1 = _mm_mul_pd(CCQ5, w2); + PolyCQ0 = _mm_add_pd(PolyCQ0, CCQ4); + PolyCQ1 = _mm_add_pd(PolyCQ1, CCQ3); + PolyCQ0 = _mm_mul_pd(PolyCQ0, w2); + PolyCQ1 = _mm_mul_pd(PolyCQ1, w2); + PolyCQ0 = _mm_add_pd(PolyCQ0, CCQ2); + PolyCQ1 = _mm_add_pd(PolyCQ1, CCQ1); + PolyCQ0 = _mm_mul_pd(PolyCQ0, w2); + PolyCQ1 = _mm_mul_pd(PolyCQ1, w); + PolyCQ0 = _mm_add_pd(PolyCQ0, one); + PolyCQ0 = _mm_add_pd(PolyCQ0, PolyCQ1); + + expmx2 = gmx_mm_exp_pd( _mm_or_pd(signbit, x2) ); + + res_erfcC = _mm_mul_pd(PolyCP0, gmx_mm_inv_pd(PolyCQ0)); + res_erfcC = _mm_add_pd(res_erfcC, CCoffset); + res_erfcC = _mm_mul_pd(res_erfcC, w); + + mask = _mm_cmpgt_pd(xabs, _mm_set1_pd(4.5)); + res_erfc = _mm_blendv_pd(res_erfcB, res_erfcC, mask); + + res_erfc = _mm_mul_pd(res_erfc, expmx2); + + /* erfc(x<0) = 2-erfc(|x|) */ + mask = _mm_cmplt_pd(x, _mm_setzero_pd()); + res_erfc = _mm_blendv_pd(res_erfc, _mm_sub_pd(two, res_erfc), mask); + + /* Select erf() or erfc() */ + mask = _mm_cmplt_pd(xabs, one); + res = _mm_blendv_pd(res_erfc, _mm_sub_pd(one, res_erf), mask); + + return res; +} + + +/* Calculate the force correction due to PME analytically. 
+ *
+ * This routine is meant to enable analytical evaluation of the
+ * direct-space PME electrostatic force to avoid tables.
+ *
+ * The direct-space potential should be Erfc(beta*r)/r, but there
+ * are some problems evaluating that:
+ *
+ * First, the error function is difficult (read: expensive) to
+ * approximate accurately for intermediate to large arguments, and
+ * this happens already in ranges of beta*r that occur in simulations.
+ * Second, we now try to avoid calculating potentials in GROMACS and
+ * use forces directly.
+ *
+ * We can simplify things slightly by noting that the PME part is really
+ * a correction to the normal Coulomb force since Erfc(z)=1-Erf(z), i.e.
+ *
+ * V= 1/r - Erf(beta*r)/r
+ *
+ * The first term we already have from the inverse square root, so
+ * that we can leave out of this routine.
+ *
+ * For pme tolerances of 1e-3 to 1e-8 and cutoffs of 0.5nm to 1.8nm,
+ * the argument beta*r will be in the range 0.15 to ~4. Use your
+ * favorite plotting program to realize how well-behaved Erf(z)/z is
+ * in this range!
+ *
+ * We approximate f(z)=erf(z)/z with a rational minimax polynomial.
+ * However, it turns out it is more efficient to approximate f(z)/z and
+ * then only use even powers. This is another minor optimization, since
+ * we actually WANT f(z)/z, because it is going to be multiplied by
+ * the vector between the two atoms to get the vectorial force. The
+ * fastest flops are the ones we can avoid calculating!
+ *
+ * So, here's how it should be used:
+ *
+ * 1. Calculate r^2.
+ * 2. Multiply by beta^2, so you get z^2=beta^2*r^2.
+ * 3. Evaluate this routine with z^2 as the argument.
+ * 4. The return value is the expression:
+ *
+ *
+ *       2*exp(-z^2)     erf(z)
+ *       ------------ - --------
+ *       sqrt(Pi)*z^2     z^3
+ *
+ * 5. Multiply the entire expression by beta^3. This will get you
+ *
+ *       beta^3*2*exp(-z^2)     beta^3*erf(z)
+ *       ------------------  - ---------------
+ *          sqrt(Pi)*z^2            z^3
+ *
+ *    or, switching back to r (z=r*beta):
+ *
+ *       2*beta*exp(-r^2*beta^2)   erf(r*beta)
+ *       ----------------------- - -----------
+ *            sqrt(Pi)*r^2             r^3
+ *
+ *
+ *    With a bit of math exercise you should be able to confirm that
+ *    this is exactly D[Erf[beta*r]/r,r] divided by r another time.
+ *
+ * 6. Add the result to 1/r^3, multiply by the product of the charges,
+ *    and you have your force (divided by r). A final multiplication
+ *    with the vector connecting the two particles and you have your
+ *    vectorial force to add to the particles.
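+ *
+ * As a hedged illustration only (the variable names are hypothetical and
+ * not part of this header), the recipe above could look like this for one
+ * SSE pair interaction, with rsq = r^2, rinv = 1/r from gmx_mm_invsqrt_pd,
+ * qq the charge product, and beta2/beta3 holding beta^2 and beta^3:
+ *
+ *     __m128d z2    = _mm_mul_pd(beta2, rsq);                   // steps 1-2
+ *     __m128d corr  = gmx_mm_pmecorrF_pd(z2);                   // steps 3-4
+ *     __m128d rinv3 = _mm_mul_pd(rinv, _mm_mul_pd(rinv, rinv));
+ *     __m128d fscal = _mm_mul_pd(qq,
+ *                         _mm_add_pd(rinv3,
+ *                                    _mm_mul_pd(beta3, corr))); // steps 5-6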
+ *
+ */
+static __m128d
+gmx_mm_pmecorrF_pd(__m128d z2)
+{
+    const __m128d FN10 = _mm_set1_pd(-8.0072854618360083154e-14);
+    const __m128d FN9  = _mm_set1_pd(1.1859116242260148027e-11);
+    const __m128d FN8  = _mm_set1_pd(-8.1490406329798423616e-10);
+    const __m128d FN7  = _mm_set1_pd(3.4404793543907847655e-8);
+    const __m128d FN6  = _mm_set1_pd(-9.9471420832602741006e-7);
+    const __m128d FN5  = _mm_set1_pd(0.000020740315999115847456);
+    const __m128d FN4  = _mm_set1_pd(-0.00031991745139313364005);
+    const __m128d FN3  = _mm_set1_pd(0.0035074449373659008203);
+    const __m128d FN2  = _mm_set1_pd(-0.031750380176100813405);
+    const __m128d FN1  = _mm_set1_pd(0.13884101728898463426);
+    const __m128d FN0  = _mm_set1_pd(-0.75225277815249618847);
+
+    const __m128d FD5  = _mm_set1_pd(0.000016009278224355026701);
+    const __m128d FD4  = _mm_set1_pd(0.00051055686934806966046);
+    const __m128d FD3  = _mm_set1_pd(0.0081803507497974289008);
+    const __m128d FD2  = _mm_set1_pd(0.077181146026670287235);
+    const __m128d FD1  = _mm_set1_pd(0.41543303143712535988);
+    const __m128d FD0  = _mm_set1_pd(1.0);
+
+    __m128d       z4;
+    __m128d       polyFN0, polyFN1, polyFD0, polyFD1;
+
+    z4        = _mm_mul_pd(z2, z2);
+
+    polyFD1   = _mm_mul_pd(FD5, z4);
+    polyFD0   = _mm_mul_pd(FD4, z4);
+    polyFD1   = _mm_add_pd(polyFD1, FD3);
+    polyFD0   = _mm_add_pd(polyFD0, FD2);
+    polyFD1   = _mm_mul_pd(polyFD1, z4);
+    polyFD0   = _mm_mul_pd(polyFD0, z4);
+    polyFD1   = _mm_add_pd(polyFD1, FD1);
+    polyFD0   = _mm_add_pd(polyFD0, FD0);
+    polyFD1   = _mm_mul_pd(polyFD1, z2);
+    polyFD0   = _mm_add_pd(polyFD0, polyFD1);
+
+    polyFD0   = gmx_mm_inv_pd(polyFD0);
+
+    polyFN0   = _mm_mul_pd(FN10, z4);
+    polyFN1   = _mm_mul_pd(FN9, z4);
+    polyFN0   = _mm_add_pd(polyFN0, FN8);
+    polyFN1   = _mm_add_pd(polyFN1, FN7);
+    polyFN0   = _mm_mul_pd(polyFN0, z4);
+    polyFN1   = _mm_mul_pd(polyFN1, z4);
+    polyFN0   = _mm_add_pd(polyFN0, FN6);
+    polyFN1   = _mm_add_pd(polyFN1, FN5);
+    polyFN0   = _mm_mul_pd(polyFN0, z4);
+    polyFN1   = _mm_mul_pd(polyFN1, z4);
+    polyFN0   = _mm_add_pd(polyFN0, FN4);
+    polyFN1   = _mm_add_pd(polyFN1, FN3);
+    polyFN0   = _mm_mul_pd(polyFN0, z4);
+    polyFN1   = _mm_mul_pd(polyFN1, z4);
+    polyFN0   = _mm_add_pd(polyFN0, FN2);
+    polyFN1   = _mm_add_pd(polyFN1, FN1);
+    polyFN0   = _mm_mul_pd(polyFN0, z4);
+    polyFN1   = _mm_mul_pd(polyFN1, z2);
+    polyFN0   = _mm_add_pd(polyFN0, FN0);
+    polyFN0   = _mm_add_pd(polyFN0, polyFN1);
+
+    return _mm_mul_pd(polyFN0, polyFD0);
+}
+
+
+
+
+/* Calculate the potential correction due to PME analytically.
+ *
+ * See gmx_mm_pmecorrF_pd() above for details about the approximation.
+ *
+ * This routine calculates Erf(z)/z, although you should provide z^2
+ * as the input argument.
+ *
+ * Here's how it should be used:
+ *
+ * 1. Calculate r^2.
+ * 2. Multiply by beta^2, so you get z^2=beta^2*r^2.
+ * 3. Evaluate this routine with z^2 as the argument.
+ * 4. The return value is the expression:
+ *
+ *
+ *        erf(z)
+ *       --------
+ *           z
+ *
+ * 5. Multiply the entire expression by beta and switch back to r (z=r*beta):
+ *
+ *       erf(r*beta)
+ *       -----------
+ *            r
+ *
+ * 6. Subtract the result from 1/r, multiply by the product of the charges,
+ *    and you have your potential.
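+ *
+ * Again a hedged sketch only, with the same hypothetical variable names as
+ * in the force-correction comment above and beta holding the splitting
+ * parameter itself:
+ *
+ *     __m128d z2    = _mm_mul_pd(beta2, rsq);                   // steps 1-2
+ *     __m128d corrV = gmx_mm_pmecorrV_pd(z2);                   // steps 3-4, erf(z)/z
+ *     __m128d velec = _mm_mul_pd(qq,
+ *                         _mm_sub_pd(rinv,
+ *                                    _mm_mul_pd(beta, corrV))); // steps 5-6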
+ * + */ +static __m128d +gmx_mm_pmecorrV_pd(__m128d z2) +{ + const __m128d VN9 = _mm_set1_pd(-9.3723776169321855475e-13); + const __m128d VN8 = _mm_set1_pd(1.2280156762674215741e-10); + const __m128d VN7 = _mm_set1_pd(-7.3562157912251309487e-9); + const __m128d VN6 = _mm_set1_pd(2.6215886208032517509e-7); + const __m128d VN5 = _mm_set1_pd(-4.9532491651265819499e-6); + const __m128d VN4 = _mm_set1_pd(0.00025907400778966060389); + const __m128d VN3 = _mm_set1_pd(0.0010585044856156469792); + const __m128d VN2 = _mm_set1_pd(0.045247661136833092885); + const __m128d VN1 = _mm_set1_pd(0.11643931522926034421); + const __m128d VN0 = _mm_set1_pd(1.1283791671726767970); + + const __m128d VD5 = _mm_set1_pd(0.000021784709867336150342); + const __m128d VD4 = _mm_set1_pd(0.00064293662010911388448); + const __m128d VD3 = _mm_set1_pd(0.0096311444822588683504); + const __m128d VD2 = _mm_set1_pd(0.085608012351550627051); + const __m128d VD1 = _mm_set1_pd(0.43652499166614811084); + const __m128d VD0 = _mm_set1_pd(1.0); + + __m128d z4; + __m128d polyVN0, polyVN1, polyVD0, polyVD1; + + z4 = _mm_mul_pd(z2, z2); + + polyVD1 = _mm_mul_pd(VD5, z4); + polyVD0 = _mm_mul_pd(VD4, z4); + polyVD1 = _mm_add_pd(polyVD1, VD3); + polyVD0 = _mm_add_pd(polyVD0, VD2); + polyVD1 = _mm_mul_pd(polyVD1, z4); + polyVD0 = _mm_mul_pd(polyVD0, z4); + polyVD1 = _mm_add_pd(polyVD1, VD1); + polyVD0 = _mm_add_pd(polyVD0, VD0); + polyVD1 = _mm_mul_pd(polyVD1, z2); + polyVD0 = _mm_add_pd(polyVD0, polyVD1); + + polyVD0 = gmx_mm_inv_pd(polyVD0); + + polyVN1 = _mm_mul_pd(VN9, z4); + polyVN0 = _mm_mul_pd(VN8, z4); + polyVN1 = _mm_add_pd(polyVN1, VN7); + polyVN0 = _mm_add_pd(polyVN0, VN6); + polyVN1 = _mm_mul_pd(polyVN1, z4); + polyVN0 = _mm_mul_pd(polyVN0, z4); + polyVN1 = _mm_add_pd(polyVN1, VN5); + polyVN0 = _mm_add_pd(polyVN0, VN4); + polyVN1 = _mm_mul_pd(polyVN1, z4); + polyVN0 = _mm_mul_pd(polyVN0, z4); + polyVN1 = _mm_add_pd(polyVN1, VN3); + polyVN0 = _mm_add_pd(polyVN0, VN2); + polyVN1 = _mm_mul_pd(polyVN1, z4); + polyVN0 = _mm_mul_pd(polyVN0, z4); + polyVN1 = _mm_add_pd(polyVN1, VN1); + polyVN0 = _mm_add_pd(polyVN0, VN0); + polyVN1 = _mm_mul_pd(polyVN1, z2); + polyVN0 = _mm_add_pd(polyVN0, polyVN1); + + return _mm_mul_pd(polyVN0, polyVD0); +} + + +static int +gmx_mm_sincos_pd(__m128d x, + __m128d *sinval, + __m128d *cosval) +{ +#ifdef _MSC_VER + __declspec(align(16)) + const double sintable[34] = + { + 1.00000000000000000e+00, 0.00000000000000000e+00, + 9.95184726672196929e-01, 9.80171403295606036e-02, + 9.80785280403230431e-01, 1.95090322016128248e-01, + 9.56940335732208824e-01, 2.90284677254462331e-01, + 9.23879532511286738e-01, 3.82683432365089782e-01, + 8.81921264348355050e-01, 4.71396736825997642e-01, + 8.31469612302545236e-01, 5.55570233019602178e-01, + 7.73010453362736993e-01, 6.34393284163645488e-01, + 7.07106781186547573e-01, 7.07106781186547462e-01, + 6.34393284163645599e-01, 7.73010453362736882e-01, + 5.55570233019602289e-01, 8.31469612302545125e-01, + 4.71396736825997809e-01, 8.81921264348354939e-01, + 3.82683432365089837e-01, 9.23879532511286738e-01, + 2.90284677254462276e-01, 9.56940335732208935e-01, + 1.95090322016128304e-01, 9.80785280403230431e-01, + 9.80171403295607702e-02, 9.95184726672196818e-01, + 0.0, 1.00000000000000000e+00 + }; +#else + const __m128d sintable[17] = + { + _mm_set_pd( 0.0, 1.0 ), + _mm_set_pd( sin( 1.0 * (M_PI/2.0) / 16.0), cos( 1.0 * (M_PI/2.0) / 16.0) ), + _mm_set_pd( sin( 2.0 * (M_PI/2.0) / 16.0), cos( 2.0 * (M_PI/2.0) / 16.0) ), + _mm_set_pd( sin( 3.0 * (M_PI/2.0) / 16.0), cos( 3.0 * 
(M_PI/2.0) / 16.0) ),
+        _mm_set_pd( sin(  4.0 * (M_PI/2.0) / 16.0), cos(  4.0 * (M_PI/2.0) / 16.0) ),
+        _mm_set_pd( sin(  5.0 * (M_PI/2.0) / 16.0), cos(  5.0 * (M_PI/2.0) / 16.0) ),
+        _mm_set_pd( sin(  6.0 * (M_PI/2.0) / 16.0), cos(  6.0 * (M_PI/2.0) / 16.0) ),
+        _mm_set_pd( sin(  7.0 * (M_PI/2.0) / 16.0), cos(  7.0 * (M_PI/2.0) / 16.0) ),
+        _mm_set_pd( sin(  8.0 * (M_PI/2.0) / 16.0), cos(  8.0 * (M_PI/2.0) / 16.0) ),
+        _mm_set_pd( sin(  9.0 * (M_PI/2.0) / 16.0), cos(  9.0 * (M_PI/2.0) / 16.0) ),
+        _mm_set_pd( sin( 10.0 * (M_PI/2.0) / 16.0), cos( 10.0 * (M_PI/2.0) / 16.0) ),
+        _mm_set_pd( sin( 11.0 * (M_PI/2.0) / 16.0), cos( 11.0 * (M_PI/2.0) / 16.0) ),
+        _mm_set_pd( sin( 12.0 * (M_PI/2.0) / 16.0), cos( 12.0 * (M_PI/2.0) / 16.0) ),
+        _mm_set_pd( sin( 13.0 * (M_PI/2.0) / 16.0), cos( 13.0 * (M_PI/2.0) / 16.0) ),
+        _mm_set_pd( sin( 14.0 * (M_PI/2.0) / 16.0), cos( 14.0 * (M_PI/2.0) / 16.0) ),
+        _mm_set_pd( sin( 15.0 * (M_PI/2.0) / 16.0), cos( 15.0 * (M_PI/2.0) / 16.0) ),
+        _mm_set_pd( 1.0, 0.0 )
+    };
+#endif
+
+    const __m128d signmask       = gmx_mm_castsi128_pd( _mm_set_epi32(0x7FFFFFFF, 0xFFFFFFFF, 0x7FFFFFFF, 0xFFFFFFFF) );
-    const __m128i signbit_epi32  = _mm_set1_epi32(0x80000000);
+
+    const __m128d tabscale      = _mm_set1_pd(32.0/M_PI);
+    const __m128d invtabscale0  = _mm_set1_pd(9.81747508049011230469e-02);
+    const __m128d invtabscale1  = _mm_set1_pd(1.96197799156550576057e-08);
+    const __m128i ione          = _mm_set1_epi32(1);
+    const __m128i i32           = _mm_set1_epi32(32);
+    const __m128i i16           = _mm_set1_epi32(16);
+    const __m128i tabmask       = _mm_set1_epi32(0x3F);
+    const __m128d sinP7         = _mm_set1_pd(-1.0/5040.0);
+    const __m128d sinP5         = _mm_set1_pd(1.0/120.0);
+    const __m128d sinP3         = _mm_set1_pd(-1.0/6.0);
+    const __m128d sinP1         = _mm_set1_pd(1.0);
+
+    const __m128d cosP6         = _mm_set1_pd(-1.0/720.0);
+    const __m128d cosP4         = _mm_set1_pd(1.0/24.0);
+    const __m128d cosP2         = _mm_set1_pd(-1.0/2.0);
+    const __m128d cosP0         = _mm_set1_pd(1.0);
+
+    __m128d       scalex;
+    __m128i       tabidx, corridx;
+    __m128d       xabs, z, z2, polySin, polyCos;
+    __m128d       xpoint;
+    __m128d       ypoint0, ypoint1;
+
+    __m128d       sinpoint, cospoint;
+    __m128d       xsign, ssign, csign;
+    __m128i       imask, sswapsign, cswapsign;
-    __m128d       minusone;
+
+    xsign    = _mm_andnot_pd(signmask, x);
+    xabs     = _mm_and_pd(x, signmask);
+
+    scalex   = _mm_mul_pd(tabscale, xabs);
+    tabidx   = _mm_cvtpd_epi32(scalex);
+
+    xpoint   = _mm_round_pd(scalex, _MM_FROUND_TO_NEAREST_INT);
+
+    /* Extended precision arithmetics */
+    z        = _mm_sub_pd(xabs, _mm_mul_pd(invtabscale0, xpoint));
+    z        = _mm_sub_pd(z, _mm_mul_pd(invtabscale1, xpoint));
+
+    /* Range reduction to 0..2*Pi */
+    tabidx   = _mm_and_si128(tabidx, tabmask);
+
+    /* tabidx is now in range [0..63] */
+    imask     = _mm_cmpgt_epi32(tabidx, i32);
+    sswapsign = imask;
+    cswapsign = imask;
+    corridx   = _mm_and_si128(imask, i32);
+    tabidx    = _mm_sub_epi32(tabidx, corridx);
+
+    /* tabidx is now in range [0..32] */
+    imask     = _mm_cmpgt_epi32(tabidx, i16);
+    cswapsign = _mm_xor_si128(cswapsign, imask);
+    corridx   = _mm_sub_epi32(i32, tabidx);
+    tabidx    = _mm_blendv_epi8(tabidx, corridx, imask);
+    /* tabidx is now in range [0..16] */
+    ssign     = _mm_cvtepi32_pd( _mm_or_si128( sswapsign, ione ) );
+    csign     = _mm_cvtepi32_pd( _mm_or_si128( cswapsign, ione ) );
+
+#ifdef _MSC_VER
+    ypoint0  = _mm_load_pd(sintable + 2*_mm_extract_epi32(tabidx, 0));
+    ypoint1  = _mm_load_pd(sintable + 2*_mm_extract_epi32(tabidx, 1));
+#else
+    ypoint0  = sintable[_mm_extract_epi32(tabidx, 0)];
+    ypoint1  = sintable[_mm_extract_epi32(tabidx, 1)];
+#endif
+    sinpoint = _mm_unpackhi_pd(ypoint0, ypoint1);
+    cospoint
= _mm_unpacklo_pd(ypoint0, ypoint1); + + sinpoint = _mm_mul_pd(sinpoint, ssign); + cospoint = _mm_mul_pd(cospoint, csign); + + z2 = _mm_mul_pd(z, z); + + polySin = _mm_mul_pd(sinP7, z2); + polySin = _mm_add_pd(polySin, sinP5); + polySin = _mm_mul_pd(polySin, z2); + polySin = _mm_add_pd(polySin, sinP3); + polySin = _mm_mul_pd(polySin, z2); + polySin = _mm_add_pd(polySin, sinP1); + polySin = _mm_mul_pd(polySin, z); + + polyCos = _mm_mul_pd(cosP6, z2); + polyCos = _mm_add_pd(polyCos, cosP4); + polyCos = _mm_mul_pd(polyCos, z2); + polyCos = _mm_add_pd(polyCos, cosP2); + polyCos = _mm_mul_pd(polyCos, z2); + polyCos = _mm_add_pd(polyCos, cosP0); + + *sinval = _mm_xor_pd(_mm_add_pd( _mm_mul_pd(sinpoint, polyCos), _mm_mul_pd(cospoint, polySin) ), xsign); + *cosval = _mm_sub_pd( _mm_mul_pd(cospoint, polyCos), _mm_mul_pd(sinpoint, polySin) ); + + return 0; +} + +/* + * IMPORTANT: Do NOT call both sin & cos if you need both results, since each of them + * will then call the sincos() routine and waste a factor 2 in performance! + */ +static __m128d +gmx_mm_sin_pd(__m128d x) +{ + __m128d s, c; + gmx_mm_sincos_pd(x, &s, &c); + return s; +} + +/* + * IMPORTANT: Do NOT call both sin & cos if you need both results, since each of them + * will then call the sincos() routine and waste a factor 2 in performance! + */ +static __m128d +gmx_mm_cos_pd(__m128d x) +{ + __m128d s, c; + gmx_mm_sincos_pd(x, &s, &c); + return c; +} + + + +static __m128d +gmx_mm_tan_pd(__m128d x) +{ + __m128d sinval, cosval; + __m128d tanval; + + gmx_mm_sincos_pd(x, &sinval, &cosval); + + tanval = _mm_mul_pd(sinval, gmx_mm_inv_pd(cosval)); + + return tanval; +} + + + +static __m128d +gmx_mm_asin_pd(__m128d x) +{ + /* Same algorithm as cephes library */ + const __m128d signmask = gmx_mm_castsi128_pd( _mm_set_epi32(0x7FFFFFFF, 0xFFFFFFFF, 0x7FFFFFFF, 0xFFFFFFFF) ); + const __m128d limit1 = _mm_set1_pd(0.625); + const __m128d limit2 = _mm_set1_pd(1e-8); + const __m128d one = _mm_set1_pd(1.0); - const __m128d halfpi = _mm_set1_pd(M_PI/2.0); + const __m128d quarterpi = _mm_set1_pd(M_PI/4.0); + const __m128d morebits = _mm_set1_pd(6.123233995736765886130e-17); + + const __m128d P5 = _mm_set1_pd(4.253011369004428248960e-3); + const __m128d P4 = _mm_set1_pd(-6.019598008014123785661e-1); + const __m128d P3 = _mm_set1_pd(5.444622390564711410273e0); + const __m128d P2 = _mm_set1_pd(-1.626247967210700244449e1); + const __m128d P1 = _mm_set1_pd(1.956261983317594739197e1); + const __m128d P0 = _mm_set1_pd(-8.198089802484824371615e0); + + const __m128d Q4 = _mm_set1_pd(-1.474091372988853791896e1); + const __m128d Q3 = _mm_set1_pd(7.049610280856842141659e1); + const __m128d Q2 = _mm_set1_pd(-1.471791292232726029859e2); + const __m128d Q1 = _mm_set1_pd(1.395105614657485689735e2); + const __m128d Q0 = _mm_set1_pd(-4.918853881490881290097e1); + + const __m128d R4 = _mm_set1_pd(2.967721961301243206100e-3); + const __m128d R3 = _mm_set1_pd(-5.634242780008963776856e-1); + const __m128d R2 = _mm_set1_pd(6.968710824104713396794e0); + const __m128d R1 = _mm_set1_pd(-2.556901049652824852289e1); + const __m128d R0 = _mm_set1_pd(2.853665548261061424989e1); + + const __m128d S3 = _mm_set1_pd(-2.194779531642920639778e1); + const __m128d S2 = _mm_set1_pd(1.470656354026814941758e2); + const __m128d S1 = _mm_set1_pd(-3.838770957603691357202e2); + const __m128d S0 = _mm_set1_pd(3.424398657913078477438e2); + + __m128d sign; + __m128d mask; + __m128d xabs; - __m128d zz, ww, z, q, w, y, zz2, ww2; ++ __m128d zz, ww, z, q, w, zz2, ww2; + __m128d PA, PB; + __m128d QA, QB; + 
__m128d RA, RB; + __m128d SA, SB; + __m128d nom, denom; + + sign = _mm_andnot_pd(signmask, x); + xabs = _mm_and_pd(x, signmask); + + mask = _mm_cmpgt_pd(xabs, limit1); + + zz = _mm_sub_pd(one, xabs); + ww = _mm_mul_pd(xabs, xabs); + zz2 = _mm_mul_pd(zz, zz); + ww2 = _mm_mul_pd(ww, ww); + + /* R */ + RA = _mm_mul_pd(R4, zz2); + RB = _mm_mul_pd(R3, zz2); + RA = _mm_add_pd(RA, R2); + RB = _mm_add_pd(RB, R1); + RA = _mm_mul_pd(RA, zz2); + RB = _mm_mul_pd(RB, zz); + RA = _mm_add_pd(RA, R0); + RA = _mm_add_pd(RA, RB); + + /* S, SA = zz2 */ + SB = _mm_mul_pd(S3, zz2); + SA = _mm_add_pd(zz2, S2); + SB = _mm_add_pd(SB, S1); + SA = _mm_mul_pd(SA, zz2); + SB = _mm_mul_pd(SB, zz); + SA = _mm_add_pd(SA, S0); + SA = _mm_add_pd(SA, SB); + + /* P */ + PA = _mm_mul_pd(P5, ww2); + PB = _mm_mul_pd(P4, ww2); + PA = _mm_add_pd(PA, P3); + PB = _mm_add_pd(PB, P2); + PA = _mm_mul_pd(PA, ww2); + PB = _mm_mul_pd(PB, ww2); + PA = _mm_add_pd(PA, P1); + PB = _mm_add_pd(PB, P0); + PA = _mm_mul_pd(PA, ww); + PA = _mm_add_pd(PA, PB); + + /* Q, QA = ww2 */ + QB = _mm_mul_pd(Q4, ww2); + QA = _mm_add_pd(ww2, Q3); + QB = _mm_add_pd(QB, Q2); + QA = _mm_mul_pd(QA, ww2); + QB = _mm_mul_pd(QB, ww2); + QA = _mm_add_pd(QA, Q1); + QB = _mm_add_pd(QB, Q0); + QA = _mm_mul_pd(QA, ww); + QA = _mm_add_pd(QA, QB); + + RA = _mm_mul_pd(RA, zz); + PA = _mm_mul_pd(PA, ww); + + nom = _mm_blendv_pd( PA, RA, mask ); + denom = _mm_blendv_pd( QA, SA, mask ); + + q = _mm_mul_pd( nom, gmx_mm_inv_pd(denom) ); + + zz = _mm_add_pd(zz, zz); + zz = gmx_mm_sqrt_pd(zz); + z = _mm_sub_pd(quarterpi, zz); + zz = _mm_mul_pd(zz, q); + zz = _mm_sub_pd(zz, morebits); + z = _mm_sub_pd(z, zz); + z = _mm_add_pd(z, quarterpi); + + w = _mm_mul_pd(xabs, q); + w = _mm_add_pd(w, xabs); + + z = _mm_blendv_pd( w, z, mask ); + + mask = _mm_cmpgt_pd(xabs, limit2); + z = _mm_blendv_pd( xabs, z, mask ); + + z = _mm_xor_pd(z, sign); + + return z; +} + + +static __m128d +gmx_mm_acos_pd(__m128d x) +{ - const __m128d signmask = gmx_mm_castsi128_pd( _mm_set_epi32(0x7FFFFFFF, 0xFFFFFFFF, 0x7FFFFFFF, 0xFFFFFFFF) ); + const __m128d one = _mm_set1_pd(1.0); + const __m128d half = _mm_set1_pd(0.5); - const __m128d pi = _mm_set1_pd(M_PI); + const __m128d quarterpi0 = _mm_set1_pd(7.85398163397448309616e-1); + const __m128d quarterpi1 = _mm_set1_pd(6.123233995736765886130e-17); + + + __m128d mask1; + + __m128d z, z1, z2; + + mask1 = _mm_cmpgt_pd(x, half); + z1 = _mm_mul_pd(half, _mm_sub_pd(one, x)); + z1 = gmx_mm_sqrt_pd(z1); + z = _mm_blendv_pd( x, z1, mask1 ); + + z = gmx_mm_asin_pd(z); + + z1 = _mm_add_pd(z, z); + + z2 = _mm_sub_pd(quarterpi0, z); + z2 = _mm_add_pd(z2, quarterpi1); + z2 = _mm_add_pd(z2, quarterpi0); + + z = _mm_blendv_pd(z2, z1, mask1); + + return z; +} + +static __m128d +gmx_mm_atan_pd(__m128d x) +{ + /* Same algorithm as cephes library */ + const __m128d signmask = gmx_mm_castsi128_pd( _mm_set_epi32(0x7FFFFFFF, 0xFFFFFFFF, 0x7FFFFFFF, 0xFFFFFFFF) ); + const __m128d limit1 = _mm_set1_pd(0.66); + const __m128d limit2 = _mm_set1_pd(2.41421356237309504880); + const __m128d quarterpi = _mm_set1_pd(M_PI/4.0); + const __m128d halfpi = _mm_set1_pd(M_PI/2.0); + const __m128d mone = _mm_set1_pd(-1.0); + const __m128d morebits1 = _mm_set1_pd(0.5*6.123233995736765886130E-17); + const __m128d morebits2 = _mm_set1_pd(6.123233995736765886130E-17); + + const __m128d P4 = _mm_set1_pd(-8.750608600031904122785E-1); + const __m128d P3 = _mm_set1_pd(-1.615753718733365076637E1); + const __m128d P2 = _mm_set1_pd(-7.500855792314704667340E1); + const __m128d P1 = 
_mm_set1_pd(-1.228866684490136173410E2); + const __m128d P0 = _mm_set1_pd(-6.485021904942025371773E1); + + const __m128d Q4 = _mm_set1_pd(2.485846490142306297962E1); + const __m128d Q3 = _mm_set1_pd(1.650270098316988542046E2); + const __m128d Q2 = _mm_set1_pd(4.328810604912902668951E2); + const __m128d Q1 = _mm_set1_pd(4.853903996359136964868E2); + const __m128d Q0 = _mm_set1_pd(1.945506571482613964425E2); + + __m128d sign; + __m128d mask1, mask2; + __m128d y, t1, t2; + __m128d z, z2; + __m128d P_A, P_B, Q_A, Q_B; + + sign = _mm_andnot_pd(signmask, x); + x = _mm_and_pd(x, signmask); + + mask1 = _mm_cmpgt_pd(x, limit1); + mask2 = _mm_cmpgt_pd(x, limit2); + + t1 = _mm_mul_pd(_mm_add_pd(x, mone), gmx_mm_inv_pd(_mm_sub_pd(x, mone))); + t2 = _mm_mul_pd(mone, gmx_mm_inv_pd(x)); + + y = _mm_and_pd(mask1, quarterpi); + y = _mm_or_pd( _mm_and_pd(mask2, halfpi), _mm_andnot_pd(mask2, y) ); + + x = _mm_or_pd( _mm_and_pd(mask1, t1), _mm_andnot_pd(mask1, x) ); + x = _mm_or_pd( _mm_and_pd(mask2, t2), _mm_andnot_pd(mask2, x) ); + + z = _mm_mul_pd(x, x); + z2 = _mm_mul_pd(z, z); + + P_A = _mm_mul_pd(P4, z2); + P_B = _mm_mul_pd(P3, z2); + P_A = _mm_add_pd(P_A, P2); + P_B = _mm_add_pd(P_B, P1); + P_A = _mm_mul_pd(P_A, z2); + P_B = _mm_mul_pd(P_B, z); + P_A = _mm_add_pd(P_A, P0); + P_A = _mm_add_pd(P_A, P_B); + + /* Q_A = z2 */ + Q_B = _mm_mul_pd(Q4, z2); + Q_A = _mm_add_pd(z2, Q3); + Q_B = _mm_add_pd(Q_B, Q2); + Q_A = _mm_mul_pd(Q_A, z2); + Q_B = _mm_mul_pd(Q_B, z2); + Q_A = _mm_add_pd(Q_A, Q1); + Q_B = _mm_add_pd(Q_B, Q0); + Q_A = _mm_mul_pd(Q_A, z); + Q_A = _mm_add_pd(Q_A, Q_B); + + z = _mm_mul_pd(z, P_A); + z = _mm_mul_pd(z, gmx_mm_inv_pd(Q_A)); + z = _mm_mul_pd(z, x); + z = _mm_add_pd(z, x); + + t1 = _mm_and_pd(mask1, morebits1); + t1 = _mm_or_pd( _mm_and_pd(mask2, morebits2), _mm_andnot_pd(mask2, t1) ); + + z = _mm_add_pd(z, t1); + y = _mm_add_pd(y, z); + + y = _mm_xor_pd(y, sign); + + return y; +} + + +static __m128d +gmx_mm_atan2_pd(__m128d y, __m128d x) +{ + const __m128d pi = _mm_set1_pd(M_PI); + const __m128d minuspi = _mm_set1_pd(-M_PI); + const __m128d halfpi = _mm_set1_pd(M_PI/2.0); + const __m128d minushalfpi = _mm_set1_pd(-M_PI/2.0); + + __m128d z, z1, z3, z4; + __m128d w; + __m128d maskx_lt, maskx_eq; + __m128d masky_lt, masky_eq; + __m128d mask1, mask2, mask3, mask4, maskall; + + maskx_lt = _mm_cmplt_pd(x, _mm_setzero_pd()); + masky_lt = _mm_cmplt_pd(y, _mm_setzero_pd()); + maskx_eq = _mm_cmpeq_pd(x, _mm_setzero_pd()); + masky_eq = _mm_cmpeq_pd(y, _mm_setzero_pd()); + + z = _mm_mul_pd(y, gmx_mm_inv_pd(x)); + z = gmx_mm_atan_pd(z); + + mask1 = _mm_and_pd(maskx_eq, masky_lt); + mask2 = _mm_andnot_pd(maskx_lt, masky_eq); + mask3 = _mm_andnot_pd( _mm_or_pd(masky_lt, masky_eq), maskx_eq); + mask4 = _mm_and_pd(masky_eq, maskx_lt); + + maskall = _mm_or_pd( _mm_or_pd(mask1, mask2), _mm_or_pd(mask3, mask4) ); + + z = _mm_andnot_pd(maskall, z); + z1 = _mm_and_pd(mask1, minushalfpi); + z3 = _mm_and_pd(mask3, halfpi); + z4 = _mm_and_pd(mask4, pi); + + z = _mm_or_pd( _mm_or_pd(z, z1), _mm_or_pd(z3, z4) ); + + w = _mm_blendv_pd(pi, minuspi, masky_lt); + w = _mm_and_pd(w, maskx_lt); + + w = _mm_andnot_pd(maskall, w); + + z = _mm_add_pd(z, w); + + return z; +} + +#endif /*_gmx_math_x86_sse4_1_double_h_ */ diff --cc src/gromacs/legacyheaders/gmx_math_x86_sse4_1_single.h index 1d0018d2b5,0000000000..ae0659047f mode 100644,000000..100644 --- a/src/gromacs/legacyheaders/gmx_math_x86_sse4_1_single.h +++ b/src/gromacs/legacyheaders/gmx_math_x86_sse4_1_single.h @@@ -1,1165 -1,0 +1,1165 @@@ +/* -*- mode: c; 
tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
+ *
+ *
+ * This file is part of GROMACS.
+ * Copyright (c) 2012-
+ *
+ * Written by the Gromacs development team under coordination of
+ * David van der Spoel, Berk Hess, and Erik Lindahl.
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org
+ *
+ * And Hey:
+ * Gnomes, ROck Monsters And Chili Sauce
+ */
+#ifndef _gmx_math_x86_sse4_1_single_h_
+#define _gmx_math_x86_sse4_1_single_h_
+
+#include <math.h>
+#include <stdio.h>
+
+#include "gmx_x86_sse4_1.h"
+
+
+
+#ifndef M_PI
+#    define M_PI 3.14159265358979323846264338327950288
+#endif
+
+
+
+
+/************************
+ *                      *
+ * Simple math routines *
+ *                      *
+ ************************/
+
+/* 1.0/sqrt(x) */
+static gmx_inline __m128
+gmx_mm_invsqrt_ps(__m128 x)
+{
+    const __m128 half  = _mm_set_ps(0.5, 0.5, 0.5, 0.5);
+    const __m128 three = _mm_set_ps(3.0, 3.0, 3.0, 3.0);
+
+    __m128       lu    = _mm_rsqrt_ps(x);
+
+    /* One Newton-Raphson iteration, lu*(3 - x*lu*lu)/2, refines the
+     * ~12-bit _mm_rsqrt_ps() estimate to nearly full single precision. */
+    return _mm_mul_ps(half, _mm_mul_ps(_mm_sub_ps(three, _mm_mul_ps(_mm_mul_ps(lu, lu), x)), lu));
+}
+
+/* sqrt(x) - Do NOT use this (but rather invsqrt) if you actually need 1.0/sqrt(x) */
+static gmx_inline __m128
+gmx_mm_sqrt_ps(__m128 x)
+{
+    __m128 mask;
+    __m128 res;
+
+    mask = _mm_cmpeq_ps(x, _mm_setzero_ps());
+    res  = _mm_andnot_ps(mask, gmx_mm_invsqrt_ps(x));
+
+    res  = _mm_mul_ps(x, res);
+
+    return res;
+}
+
+/* 1.0/x */
+static gmx_inline __m128
+gmx_mm_inv_ps(__m128 x)
+{
+    const __m128 two = _mm_set_ps(2.0f, 2.0f, 2.0f, 2.0f);
+
+    __m128       lu  = _mm_rcp_ps(x);
+
+    /* One Newton-Raphson iteration, lu*(2 - x*lu), refines the
+     * ~12-bit _mm_rcp_ps() estimate. */
+    return _mm_mul_ps(lu, _mm_sub_ps(two, _mm_mul_ps(lu, x)));
+}
+
+static gmx_inline __m128
+gmx_mm_abs_ps(__m128 x)
+{
+    const __m128 signmask = gmx_mm_castsi128_ps( _mm_set1_epi32(0x7FFFFFFF) );
+
+    return _mm_and_ps(x, signmask);
+}
+
+
+
+static __m128
+gmx_mm_log_ps(__m128 x)
+{
+    /* Same algorithm as cephes library */
+    const __m128  expmask    = gmx_mm_castsi128_ps( _mm_set_epi32(0x7F800000, 0x7F800000, 0x7F800000, 0x7F800000) );
+    const __m128i expbase_m1 = _mm_set1_epi32(127-1); /* We want non-IEEE format */
+    const __m128  half       = _mm_set1_ps(0.5f);
+    const __m128  one        = _mm_set1_ps(1.0f);
+    const __m128  invsq2     = _mm_set1_ps(1.0f/sqrt(2.0f));
+    const __m128  corr1      = _mm_set1_ps(-2.12194440e-4f);
+    const __m128  corr2      = _mm_set1_ps(0.693359375f);
+
+    const __m128  CA_1       = _mm_set1_ps(0.070376836292f);
+    const __m128  CB_0       = _mm_set1_ps(1.6714950086782716f);
+    const __m128  CB_1       = _mm_set1_ps(-2.452088066061482f);
+    const __m128  CC_0       = _mm_set1_ps(1.5220770854701728f);
+    const __m128  CC_1       = _mm_set1_ps(-1.3422238433233642f);
+    const __m128  CD_0       = _mm_set1_ps(1.386218787509749f);
+    const __m128  CD_1       = _mm_set1_ps(0.35075468953796346f);
+    const __m128  CE_0       = _mm_set1_ps(1.3429983063133937f);
+    const __m128  CE_1       = _mm_set1_ps(1.807420826584643f);
+
-    __m128        fexp, fexp1;
++    __m128        fexp;
+    __m128i       iexp;
+    __m128        mask;
-    __m128        x1, x2;
++    __m128        x2;
+    __m128        y;
+    __m128        pA, pB, pC, pD, pE, tB, tC, tD, tE;
+
+    /* Separate x into exponent and mantissa, with a mantissa in the range [0.5..1[ (not IEEE754 standard!)
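+     * (for example, 6.0 = 0.75*2^3 is separated into fexp = 3 and mantissa 0.75)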
*/
+    fexp  = _mm_and_ps(x, expmask);
+    iexp  = gmx_mm_castps_si128(fexp);
+    iexp  = _mm_srli_epi32(iexp, 23);
+    iexp  = _mm_sub_epi32(iexp, expbase_m1);
+
+    x     = _mm_andnot_ps(expmask, x);
+    x     = _mm_or_ps(x, one);
+    x     = _mm_mul_ps(x, half);
+
+    mask  = _mm_cmplt_ps(x, invsq2);
+
+    x     = _mm_add_ps(x, _mm_and_ps(mask, x));
+    x     = _mm_sub_ps(x, one);
+    iexp  = _mm_add_epi32(iexp, gmx_mm_castps_si128(mask)); /* 0xFFFFFFFF = -1 as int */
+
+    x2    = _mm_mul_ps(x, x);
+
+    pA    = _mm_mul_ps(CA_1, x);
+    pB    = _mm_mul_ps(CB_1, x);
+    pC    = _mm_mul_ps(CC_1, x);
+    pD    = _mm_mul_ps(CD_1, x);
+    pE    = _mm_mul_ps(CE_1, x);
+    tB    = _mm_add_ps(CB_0, x2);
+    tC    = _mm_add_ps(CC_0, x2);
+    tD    = _mm_add_ps(CD_0, x2);
+    tE    = _mm_add_ps(CE_0, x2);
+    pB    = _mm_add_ps(pB, tB);
+    pC    = _mm_add_ps(pC, tC);
+    pD    = _mm_add_ps(pD, tD);
+    pE    = _mm_add_ps(pE, tE);
+
+    pA    = _mm_mul_ps(pA, pB);
+    pC    = _mm_mul_ps(pC, pD);
+    pE    = _mm_mul_ps(pE, x2);
+    pA    = _mm_mul_ps(pA, pC);
+    y     = _mm_mul_ps(pA, pE);
+
+    fexp  = _mm_cvtepi32_ps(iexp);
+    y     = _mm_add_ps(y, _mm_mul_ps(fexp, corr1));
+
+    y     = _mm_sub_ps(y, _mm_mul_ps(half, x2));
+    x2    = _mm_add_ps(x, y);
+
+    x2    = _mm_add_ps(x2, _mm_mul_ps(fexp, corr2));
+
+    return x2;
+}
+
+
+/*
+ * 2^x function.
+ *
+ * The 2^w term is calculated from a (6,0)-th order (no denominator) minimax polynomial on the interval
+ * [-0.5,0.5]. The coefficients were derived in Mathematica using the command:
+ *
+ * MiniMaxApproximation[(2^x), {x, {-0.5, 0.5}, 6, 0}, WorkingPrecision -> 15]
+ *
+ * The largest-magnitude exponent we can represent in IEEE single-precision binary format
+ * is 2^-126 for small numbers and 2^127 for large ones. To avoid wrap-around problems, we set the
+ * result to zero if the argument falls outside this range. For small numbers this is just fine, but
+ * for large numbers you could be fancy and return the smallest/largest IEEE single-precision
+ * number instead. That would take a few extra cycles and not really help, since something is
+ * wrong if you are using single precision to work with numbers that cannot really be represented
+ * in single precision.
+ *
+ * The accuracy is at least 23 bits.
+ */
+static __m128
+gmx_mm_exp2_ps(__m128 x)
+{
+    /* Lower bound: We do not allow numbers that would lead to an IEEE fp representation exponent smaller than -126.
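+     * After rounding to the nearest integer, e.g. x = 8.3 is evaluated as
+     * 2^8 * 2^0.3: the 2^8 factor is assembled directly in the exponent bits
+     * and 2^0.3 comes from the polynomial below. Arguments with |x| > 126
+     * instead return zero through valuemask.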
*/ + const __m128 arglimit = _mm_set1_ps(126.0f); + + const __m128i expbase = _mm_set1_epi32(127); + const __m128 CA6 = _mm_set1_ps(1.535336188319500E-004); + const __m128 CA5 = _mm_set1_ps(1.339887440266574E-003); + const __m128 CA4 = _mm_set1_ps(9.618437357674640E-003); + const __m128 CA3 = _mm_set1_ps(5.550332471162809E-002); + const __m128 CA2 = _mm_set1_ps(2.402264791363012E-001); + const __m128 CA1 = _mm_set1_ps(6.931472028550421E-001); + const __m128 CA0 = _mm_set1_ps(1.0f); + + __m128 valuemask; + __m128i iexppart; + __m128 fexppart; + __m128 intpart; + __m128 x2; + __m128 p0, p1; + + iexppart = _mm_cvtps_epi32(x); + intpart = _mm_round_ps(x, _MM_FROUND_TO_NEAREST_INT); + iexppart = _mm_slli_epi32(_mm_add_epi32(iexppart, expbase), 23); + valuemask = _mm_cmpge_ps(arglimit, gmx_mm_abs_ps(x)); + fexppart = _mm_and_ps(valuemask, gmx_mm_castsi128_ps(iexppart)); + + x = _mm_sub_ps(x, intpart); + x2 = _mm_mul_ps(x, x); + + p0 = _mm_mul_ps(CA6, x2); + p1 = _mm_mul_ps(CA5, x2); + p0 = _mm_add_ps(p0, CA4); + p1 = _mm_add_ps(p1, CA3); + p0 = _mm_mul_ps(p0, x2); + p1 = _mm_mul_ps(p1, x2); + p0 = _mm_add_ps(p0, CA2); + p1 = _mm_add_ps(p1, CA1); + p0 = _mm_mul_ps(p0, x2); + p1 = _mm_mul_ps(p1, x); + p0 = _mm_add_ps(p0, CA0); + p0 = _mm_add_ps(p0, p1); + x = _mm_mul_ps(p0, fexppart); + + return x; +} + + +/* Exponential function. This could be calculated from 2^x as Exp(x)=2^(y), where y=log2(e)*x, + * but there will then be a small rounding error since we lose some precision due to the + * multiplication. This will then be magnified a lot by the exponential. + * + * Instead, we calculate the fractional part directly as a minimax approximation of + * Exp(z) on [-0.5,0.5]. We use extended precision arithmetics to calculate the fraction + * remaining after 2^y, which avoids the precision-loss. + * The final result is correct to within 1 LSB over the entire argument range. + */ +static __m128 +gmx_mm_exp_ps(__m128 x) +{ + const __m128 argscale = _mm_set1_ps(1.44269504088896341f); + /* Lower bound: Disallow numbers that would lead to an IEEE fp exponent reaching +-127. 
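+     * With arglimit = 126 this flushes exp() arguments of magnitude beyond
+     * roughly 126*ln(2) ~= 87 to zero via valuemask below.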
*/ + const __m128 arglimit = _mm_set1_ps(126.0f); + const __m128i expbase = _mm_set1_epi32(127); + + const __m128 invargscale0 = _mm_set1_ps(0.693359375f); + const __m128 invargscale1 = _mm_set1_ps(-2.12194440e-4f); + + const __m128 CC5 = _mm_set1_ps(1.9875691500e-4f); + const __m128 CC4 = _mm_set1_ps(1.3981999507e-3f); + const __m128 CC3 = _mm_set1_ps(8.3334519073e-3f); + const __m128 CC2 = _mm_set1_ps(4.1665795894e-2f); + const __m128 CC1 = _mm_set1_ps(1.6666665459e-1f); + const __m128 CC0 = _mm_set1_ps(5.0000001201e-1f); + const __m128 one = _mm_set1_ps(1.0f); + + __m128 y, x2; + __m128 p0, p1; + __m128 valuemask; + __m128i iexppart; + __m128 fexppart; + __m128 intpart; + + y = _mm_mul_ps(x, argscale); + + iexppart = _mm_cvtps_epi32(y); + intpart = _mm_round_ps(y, _MM_FROUND_TO_NEAREST_INT); + + iexppart = _mm_slli_epi32(_mm_add_epi32(iexppart, expbase), 23); + valuemask = _mm_cmpge_ps(arglimit, gmx_mm_abs_ps(y)); + fexppart = _mm_and_ps(valuemask, gmx_mm_castsi128_ps(iexppart)); + + /* Extended precision arithmetics */ + x = _mm_sub_ps(x, _mm_mul_ps(invargscale0, intpart)); + x = _mm_sub_ps(x, _mm_mul_ps(invargscale1, intpart)); + + x2 = _mm_mul_ps(x, x); + + p1 = _mm_mul_ps(CC5, x2); + p0 = _mm_mul_ps(CC4, x2); + p1 = _mm_add_ps(p1, CC3); + p0 = _mm_add_ps(p0, CC2); + p1 = _mm_mul_ps(p1, x2); + p0 = _mm_mul_ps(p0, x2); + p1 = _mm_add_ps(p1, CC1); + p0 = _mm_add_ps(p0, CC0); + p1 = _mm_mul_ps(p1, x); + p0 = _mm_add_ps(p0, p1); + p0 = _mm_mul_ps(p0, x2); + x = _mm_add_ps(x, one); + x = _mm_add_ps(x, p0); + + x = _mm_mul_ps(x, fexppart); + + return x; +} + +/* FULL precision. Only errors in LSB */ +static __m128 +gmx_mm_erf_ps(__m128 x) +{ + /* Coefficients for minimax approximation of erf(x)=x*P(x^2) in range [-1,1] */ + const __m128 CA6 = _mm_set1_ps(7.853861353153693e-5f); + const __m128 CA5 = _mm_set1_ps(-8.010193625184903e-4f); + const __m128 CA4 = _mm_set1_ps(5.188327685732524e-3f); + const __m128 CA3 = _mm_set1_ps(-2.685381193529856e-2f); + const __m128 CA2 = _mm_set1_ps(1.128358514861418e-1f); + const __m128 CA1 = _mm_set1_ps(-3.761262582423300e-1f); + const __m128 CA0 = _mm_set1_ps(1.128379165726710f); + /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*P((1/(x-1))^2) in range [0.67,2] */ + const __m128 CB9 = _mm_set1_ps(-0.0018629930017603923f); + const __m128 CB8 = _mm_set1_ps(0.003909821287598495f); + const __m128 CB7 = _mm_set1_ps(-0.0052094582210355615f); + const __m128 CB6 = _mm_set1_ps(0.005685614362160572f); + const __m128 CB5 = _mm_set1_ps(-0.0025367682853477272f); + const __m128 CB4 = _mm_set1_ps(-0.010199799682318782f); + const __m128 CB3 = _mm_set1_ps(0.04369575504816542f); + const __m128 CB2 = _mm_set1_ps(-0.11884063474674492f); + const __m128 CB1 = _mm_set1_ps(0.2732120154030589f); + const __m128 CB0 = _mm_set1_ps(0.42758357702025784f); + /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*(1/x)*P((1/x)^2) in range [2,9.19] */ + const __m128 CC10 = _mm_set1_ps(-0.0445555913112064f); + const __m128 CC9 = _mm_set1_ps(0.21376355144663348f); + const __m128 CC8 = _mm_set1_ps(-0.3473187200259257f); + const __m128 CC7 = _mm_set1_ps(0.016690861551248114f); + const __m128 CC6 = _mm_set1_ps(0.7560973182491192f); + const __m128 CC5 = _mm_set1_ps(-1.2137903600145787f); + const __m128 CC4 = _mm_set1_ps(0.8411872321232948f); + const __m128 CC3 = _mm_set1_ps(-0.08670413896296343f); + const __m128 CC2 = _mm_set1_ps(-0.27124782687240334f); + const __m128 CC1 = _mm_set1_ps(-0.0007502488047806069f); + const __m128 CC0 = _mm_set1_ps(0.5642114853803148f); + + /* 
Coefficients for expansion of exp(x) in [0,0.1] */ + /* CD0 and CD1 are both 1.0, so no need to declare them separately */ + const __m128 CD2 = _mm_set1_ps(0.5000066608081202f); + const __m128 CD3 = _mm_set1_ps(0.1664795422874624f); + const __m128 CD4 = _mm_set1_ps(0.04379839977652482f); + + const __m128 sieve = gmx_mm_castsi128_ps( _mm_set1_epi32(0xfffff000) ); + const __m128 signbit = gmx_mm_castsi128_ps( _mm_set1_epi32(0x80000000) ); + const __m128 one = _mm_set1_ps(1.0f); + const __m128 two = _mm_set1_ps(2.0f); + + __m128 x2, x4, y; + __m128 z, q, t, t2, w, w2; + __m128 pA0, pA1, pB0, pB1, pC0, pC1; + __m128 expmx2, corr; + __m128 res_erf, res_erfc, res; + __m128 mask; + + /* Calculate erf() */ + x2 = _mm_mul_ps(x, x); + x4 = _mm_mul_ps(x2, x2); + + pA0 = _mm_mul_ps(CA6, x4); + pA1 = _mm_mul_ps(CA5, x4); + pA0 = _mm_add_ps(pA0, CA4); + pA1 = _mm_add_ps(pA1, CA3); + pA0 = _mm_mul_ps(pA0, x4); + pA1 = _mm_mul_ps(pA1, x4); + pA0 = _mm_add_ps(pA0, CA2); + pA1 = _mm_add_ps(pA1, CA1); + pA0 = _mm_mul_ps(pA0, x4); + pA1 = _mm_mul_ps(pA1, x2); + pA0 = _mm_add_ps(pA0, pA1); + pA0 = _mm_add_ps(pA0, CA0); + + res_erf = _mm_mul_ps(x, pA0); + + /* Calculate erfc */ + + y = gmx_mm_abs_ps(x); + t = gmx_mm_inv_ps(y); + w = _mm_sub_ps(t, one); + t2 = _mm_mul_ps(t, t); + w2 = _mm_mul_ps(w, w); + /* + * We cannot simply calculate exp(-x2) directly in single precision, since + * that will lose a couple of bits of precision due to the multiplication. + * Instead, we introduce x=z+w, where the last 12 bits of precision are in w. + * Then we get exp(-x2) = exp(-z2)*exp((z-x)*(z+x)). + * + * The only drawback with this is that it requires TWO separate exponential + * evaluations, which would be horrible performance-wise. However, the argument + * for the second exp() call is always small, so there we simply use a + * low-order minimax expansion on [0,0.1]. 
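+     *
+     * In scalar terms, with z equal to y masked down to its leading mantissa
+     * bits and q = (z-y)*(z+y) = z^2-y^2, the identity used below is
+     *
+     *    exp(-y^2) = exp(-z^2)*exp(q),
+     *
+     * where z*z is exact in single precision and |q| is small enough that the
+     * short CD polynomial suffices for exp(q).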
+ */ + + z = _mm_and_ps(y, sieve); + q = _mm_mul_ps( _mm_sub_ps(z, y), _mm_add_ps(z, y) ); + + corr = _mm_mul_ps(CD4, q); + corr = _mm_add_ps(corr, CD3); + corr = _mm_mul_ps(corr, q); + corr = _mm_add_ps(corr, CD2); + corr = _mm_mul_ps(corr, q); + corr = _mm_add_ps(corr, one); + corr = _mm_mul_ps(corr, q); + corr = _mm_add_ps(corr, one); + + expmx2 = gmx_mm_exp_ps( _mm_or_ps( signbit, _mm_mul_ps(z, z) ) ); + expmx2 = _mm_mul_ps(expmx2, corr); + + pB1 = _mm_mul_ps(CB9, w2); + pB0 = _mm_mul_ps(CB8, w2); + pB1 = _mm_add_ps(pB1, CB7); + pB0 = _mm_add_ps(pB0, CB6); + pB1 = _mm_mul_ps(pB1, w2); + pB0 = _mm_mul_ps(pB0, w2); + pB1 = _mm_add_ps(pB1, CB5); + pB0 = _mm_add_ps(pB0, CB4); + pB1 = _mm_mul_ps(pB1, w2); + pB0 = _mm_mul_ps(pB0, w2); + pB1 = _mm_add_ps(pB1, CB3); + pB0 = _mm_add_ps(pB0, CB2); + pB1 = _mm_mul_ps(pB1, w2); + pB0 = _mm_mul_ps(pB0, w2); + pB1 = _mm_add_ps(pB1, CB1); + pB1 = _mm_mul_ps(pB1, w); + pB0 = _mm_add_ps(pB0, pB1); + pB0 = _mm_add_ps(pB0, CB0); + + pC0 = _mm_mul_ps(CC10, t2); + pC1 = _mm_mul_ps(CC9, t2); + pC0 = _mm_add_ps(pC0, CC8); + pC1 = _mm_add_ps(pC1, CC7); + pC0 = _mm_mul_ps(pC0, t2); + pC1 = _mm_mul_ps(pC1, t2); + pC0 = _mm_add_ps(pC0, CC6); + pC1 = _mm_add_ps(pC1, CC5); + pC0 = _mm_mul_ps(pC0, t2); + pC1 = _mm_mul_ps(pC1, t2); + pC0 = _mm_add_ps(pC0, CC4); + pC1 = _mm_add_ps(pC1, CC3); + pC0 = _mm_mul_ps(pC0, t2); + pC1 = _mm_mul_ps(pC1, t2); + pC0 = _mm_add_ps(pC0, CC2); + pC1 = _mm_add_ps(pC1, CC1); + pC0 = _mm_mul_ps(pC0, t2); + pC1 = _mm_mul_ps(pC1, t); + pC0 = _mm_add_ps(pC0, pC1); + pC0 = _mm_add_ps(pC0, CC0); + pC0 = _mm_mul_ps(pC0, t); + + /* SELECT pB0 or pC0 for erfc() */ + mask = _mm_cmplt_ps(two, y); + res_erfc = _mm_blendv_ps(pB0, pC0, mask); + res_erfc = _mm_mul_ps(res_erfc, expmx2); + + /* erfc(x<0) = 2-erfc(|x|) */ + mask = _mm_cmplt_ps(x, _mm_setzero_ps()); + res_erfc = _mm_blendv_ps(res_erfc, _mm_sub_ps(two, res_erfc), mask); + + /* Select erf() or erfc() */ + mask = _mm_cmplt_ps(y, _mm_set1_ps(0.75f)); + res = _mm_blendv_ps(_mm_sub_ps(one, res_erfc), res_erf, mask); + + return res; +} + + +/* FULL precision. 
Only errors in LSB */ +static __m128 +gmx_mm_erfc_ps(__m128 x) +{ + /* Coefficients for minimax approximation of erf(x)=x*P(x^2) in range [-1,1] */ + const __m128 CA6 = _mm_set1_ps(7.853861353153693e-5f); + const __m128 CA5 = _mm_set1_ps(-8.010193625184903e-4f); + const __m128 CA4 = _mm_set1_ps(5.188327685732524e-3f); + const __m128 CA3 = _mm_set1_ps(-2.685381193529856e-2f); + const __m128 CA2 = _mm_set1_ps(1.128358514861418e-1f); + const __m128 CA1 = _mm_set1_ps(-3.761262582423300e-1f); + const __m128 CA0 = _mm_set1_ps(1.128379165726710f); + /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*P((1/(x-1))^2) in range [0.67,2] */ + const __m128 CB9 = _mm_set1_ps(-0.0018629930017603923f); + const __m128 CB8 = _mm_set1_ps(0.003909821287598495f); + const __m128 CB7 = _mm_set1_ps(-0.0052094582210355615f); + const __m128 CB6 = _mm_set1_ps(0.005685614362160572f); + const __m128 CB5 = _mm_set1_ps(-0.0025367682853477272f); + const __m128 CB4 = _mm_set1_ps(-0.010199799682318782f); + const __m128 CB3 = _mm_set1_ps(0.04369575504816542f); + const __m128 CB2 = _mm_set1_ps(-0.11884063474674492f); + const __m128 CB1 = _mm_set1_ps(0.2732120154030589f); + const __m128 CB0 = _mm_set1_ps(0.42758357702025784f); + /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*(1/x)*P((1/x)^2) in range [2,9.19] */ + const __m128 CC10 = _mm_set1_ps(-0.0445555913112064f); + const __m128 CC9 = _mm_set1_ps(0.21376355144663348f); + const __m128 CC8 = _mm_set1_ps(-0.3473187200259257f); + const __m128 CC7 = _mm_set1_ps(0.016690861551248114f); + const __m128 CC6 = _mm_set1_ps(0.7560973182491192f); + const __m128 CC5 = _mm_set1_ps(-1.2137903600145787f); + const __m128 CC4 = _mm_set1_ps(0.8411872321232948f); + const __m128 CC3 = _mm_set1_ps(-0.08670413896296343f); + const __m128 CC2 = _mm_set1_ps(-0.27124782687240334f); + const __m128 CC1 = _mm_set1_ps(-0.0007502488047806069f); + const __m128 CC0 = _mm_set1_ps(0.5642114853803148f); + + /* Coefficients for expansion of exp(x) in [0,0.1] */ + /* CD0 and CD1 are both 1.0, so no need to declare them separately */ + const __m128 CD2 = _mm_set1_ps(0.5000066608081202f); + const __m128 CD3 = _mm_set1_ps(0.1664795422874624f); + const __m128 CD4 = _mm_set1_ps(0.04379839977652482f); + + const __m128 sieve = gmx_mm_castsi128_ps( _mm_set1_epi32(0xfffff000) ); + const __m128 signbit = gmx_mm_castsi128_ps( _mm_set1_epi32(0x80000000) ); + const __m128 one = _mm_set1_ps(1.0f); + const __m128 two = _mm_set1_ps(2.0f); + + __m128 x2, x4, y; + __m128 z, q, t, t2, w, w2; + __m128 pA0, pA1, pB0, pB1, pC0, pC1; + __m128 expmx2, corr; + __m128 res_erf, res_erfc, res; + __m128 mask; + + /* Calculate erf() */ + x2 = _mm_mul_ps(x, x); + x4 = _mm_mul_ps(x2, x2); + + pA0 = _mm_mul_ps(CA6, x4); + pA1 = _mm_mul_ps(CA5, x4); + pA0 = _mm_add_ps(pA0, CA4); + pA1 = _mm_add_ps(pA1, CA3); + pA0 = _mm_mul_ps(pA0, x4); + pA1 = _mm_mul_ps(pA1, x4); + pA0 = _mm_add_ps(pA0, CA2); + pA1 = _mm_add_ps(pA1, CA1); + pA0 = _mm_mul_ps(pA0, x4); + pA1 = _mm_mul_ps(pA1, x2); + pA0 = _mm_add_ps(pA0, pA1); + pA0 = _mm_add_ps(pA0, CA0); + + res_erf = _mm_mul_ps(x, pA0); + + /* Calculate erfc */ + y = gmx_mm_abs_ps(x); + t = gmx_mm_inv_ps(y); + w = _mm_sub_ps(t, one); + t2 = _mm_mul_ps(t, t); + w2 = _mm_mul_ps(w, w); + /* + * We cannot simply calculate exp(-x2) directly in single precision, since + * that will lose a couple of bits of precision due to the multiplication. + * Instead, we introduce x=z+w, where the last 12 bits of precision are in w. + * Then we get exp(-x2) = exp(-z2)*exp((z-x)*(z+x)). 
+ * + * The only drawback with this is that it requires TWO separate exponential + * evaluations, which would be horrible performance-wise. However, the argument + * for the second exp() call is always small, so there we simply use a + * low-order minimax expansion on [0,0.1]. + */ + + z = _mm_and_ps(y, sieve); + q = _mm_mul_ps( _mm_sub_ps(z, y), _mm_add_ps(z, y) ); + + corr = _mm_mul_ps(CD4, q); + corr = _mm_add_ps(corr, CD3); + corr = _mm_mul_ps(corr, q); + corr = _mm_add_ps(corr, CD2); + corr = _mm_mul_ps(corr, q); + corr = _mm_add_ps(corr, one); + corr = _mm_mul_ps(corr, q); + corr = _mm_add_ps(corr, one); + + expmx2 = gmx_mm_exp_ps( _mm_or_ps( signbit, _mm_mul_ps(z, z) ) ); + expmx2 = _mm_mul_ps(expmx2, corr); + + pB1 = _mm_mul_ps(CB9, w2); + pB0 = _mm_mul_ps(CB8, w2); + pB1 = _mm_add_ps(pB1, CB7); + pB0 = _mm_add_ps(pB0, CB6); + pB1 = _mm_mul_ps(pB1, w2); + pB0 = _mm_mul_ps(pB0, w2); + pB1 = _mm_add_ps(pB1, CB5); + pB0 = _mm_add_ps(pB0, CB4); + pB1 = _mm_mul_ps(pB1, w2); + pB0 = _mm_mul_ps(pB0, w2); + pB1 = _mm_add_ps(pB1, CB3); + pB0 = _mm_add_ps(pB0, CB2); + pB1 = _mm_mul_ps(pB1, w2); + pB0 = _mm_mul_ps(pB0, w2); + pB1 = _mm_add_ps(pB1, CB1); + pB1 = _mm_mul_ps(pB1, w); + pB0 = _mm_add_ps(pB0, pB1); + pB0 = _mm_add_ps(pB0, CB0); + + pC0 = _mm_mul_ps(CC10, t2); + pC1 = _mm_mul_ps(CC9, t2); + pC0 = _mm_add_ps(pC0, CC8); + pC1 = _mm_add_ps(pC1, CC7); + pC0 = _mm_mul_ps(pC0, t2); + pC1 = _mm_mul_ps(pC1, t2); + pC0 = _mm_add_ps(pC0, CC6); + pC1 = _mm_add_ps(pC1, CC5); + pC0 = _mm_mul_ps(pC0, t2); + pC1 = _mm_mul_ps(pC1, t2); + pC0 = _mm_add_ps(pC0, CC4); + pC1 = _mm_add_ps(pC1, CC3); + pC0 = _mm_mul_ps(pC0, t2); + pC1 = _mm_mul_ps(pC1, t2); + pC0 = _mm_add_ps(pC0, CC2); + pC1 = _mm_add_ps(pC1, CC1); + pC0 = _mm_mul_ps(pC0, t2); + pC1 = _mm_mul_ps(pC1, t); + pC0 = _mm_add_ps(pC0, pC1); + pC0 = _mm_add_ps(pC0, CC0); + pC0 = _mm_mul_ps(pC0, t); + + /* SELECT pB0 or pC0 for erfc() */ + mask = _mm_cmplt_ps(two, y); + res_erfc = _mm_blendv_ps(pB0, pC0, mask); + res_erfc = _mm_mul_ps(res_erfc, expmx2); + + /* erfc(x<0) = 2-erfc(|x|) */ + mask = _mm_cmplt_ps(x, _mm_setzero_ps()); + res_erfc = _mm_blendv_ps(res_erfc, _mm_sub_ps(two, res_erfc), mask); + + /* Select erf() or erfc() */ + mask = _mm_cmplt_ps(y, _mm_set1_ps(0.75f)); + res = _mm_blendv_ps(res_erfc, _mm_sub_ps(one, res_erf), mask); + + return res; +} + + +/* Calculate the force correction due to PME analytically. + * + * This routine is meant to enable analytical evaluation of the + * direct-space PME electrostatic force to avoid tables. + * + * The direct-space potential should be Erfc(beta*r)/r, but there + * are some problems evaluating that: + * + * First, the error function is difficult (read: expensive) to + * approximate accurately for intermediate to large arguments, and + * this happens already in ranges of beta*r that occur in simulations. + * Second, we now try to avoid calculating potentials in Gromacs but + * use forces directly. + * + * We can simplify things slightly by noting that the PME part is really + * a correction to the normal Coulomb force since Erfc(z)=1-Erf(z), i.e. + * + * V= 1/r - Erf(beta*r)/r + * + * The first term we already have from the inverse square root, so + * that we can leave out of this routine. + * + * For pme tolerances of 1e-3 to 1e-8 and cutoffs of 0.5nm to 1.8nm, + * the argument beta*r will be in the range 0.15 to ~4. Use your + * favorite plotting program to realize how well-behaved Erf(z)/z is + * in this range! + * + * We approximate f(z)=erf(z)/z with a rational minimax polynomial. 
+ * However, it turns out it is more efficient to approximate f(z)/z and + * then only use even powers. This is another minor optimization, since + * we actually WANT f(z)/z, because it is going to be multiplied by + * the vector between the two atoms to get the vectorial force. The + * fastest flops are the ones we can avoid calculating! + * + * So, here's how it should be used: + * + * 1. Calculate r^2. + * 2. Multiply by beta^2, so you get z^2=beta^2*r^2. + * 3. Evaluate this routine with z^2 as the argument. + * 4. The return value is the expression: + * + * + * 2*exp(-z^2) erf(z) + * ------------ - -------- + * sqrt(Pi)*z^2 z^3 + * + * 5. Multiply the entire expression by beta^3. This will get you + * + * beta^3*2*exp(-z^2) beta^3*erf(z) + * ------------------ - --------------- + * sqrt(Pi)*z^2 z^3 + * + * or, switching back to r (z=r*beta): + * + * 2*beta*exp(-r^2*beta^2) erf(r*beta) + * ----------------------- - ----------- + * sqrt(Pi)*r^2 r^3 + * + * + * With a bit of math exercise you should be able to confirm that + * this is exactly D[Erf[beta*r]/r,r] divided by r another time. + * + * 6. Add the result to 1/r^3, multiply by the product of the charges, + * and you have your force (divided by r). A final multiplication + * with the vector connecting the two particles and you have your + * vectorial force to add to the particles. + * + */ +static gmx_inline __m128 +gmx_mm_pmecorrF_ps(__m128 z2) +{ + const __m128 FN6 = _mm_set1_ps(-1.7357322914161492954e-8f); + const __m128 FN5 = _mm_set1_ps(1.4703624142580877519e-6f); + const __m128 FN4 = _mm_set1_ps(-0.000053401640219807709149f); + const __m128 FN3 = _mm_set1_ps(0.0010054721316683106153f); + const __m128 FN2 = _mm_set1_ps(-0.019278317264888380590f); + const __m128 FN1 = _mm_set1_ps(0.069670166153766424023f); + const __m128 FN0 = _mm_set1_ps(-0.75225204789749321333f); + + const __m128 FD4 = _mm_set1_ps(0.0011193462567257629232f); + const __m128 FD3 = _mm_set1_ps(0.014866955030185295499f); + const __m128 FD2 = _mm_set1_ps(0.11583842382862377919f); + const __m128 FD1 = _mm_set1_ps(0.50736591960530292870f); + const __m128 FD0 = _mm_set1_ps(1.0f); + + __m128 z4; + __m128 polyFN0, polyFN1, polyFD0, polyFD1; + + z4 = _mm_mul_ps(z2, z2); + + polyFD0 = _mm_mul_ps(FD4, z4); + polyFD1 = _mm_mul_ps(FD3, z4); + polyFD0 = _mm_add_ps(polyFD0, FD2); + polyFD1 = _mm_add_ps(polyFD1, FD1); + polyFD0 = _mm_mul_ps(polyFD0, z4); + polyFD1 = _mm_mul_ps(polyFD1, z2); + polyFD0 = _mm_add_ps(polyFD0, FD0); + polyFD0 = _mm_add_ps(polyFD0, polyFD1); + + polyFD0 = gmx_mm_inv_ps(polyFD0); + + polyFN0 = _mm_mul_ps(FN6, z4); + polyFN1 = _mm_mul_ps(FN5, z4); + polyFN0 = _mm_add_ps(polyFN0, FN4); + polyFN1 = _mm_add_ps(polyFN1, FN3); + polyFN0 = _mm_mul_ps(polyFN0, z4); + polyFN1 = _mm_mul_ps(polyFN1, z4); + polyFN0 = _mm_add_ps(polyFN0, FN2); + polyFN1 = _mm_add_ps(polyFN1, FN1); + polyFN0 = _mm_mul_ps(polyFN0, z4); + polyFN1 = _mm_mul_ps(polyFN1, z2); + polyFN0 = _mm_add_ps(polyFN0, FN0); + polyFN0 = _mm_add_ps(polyFN0, polyFN1); + + return _mm_mul_ps(polyFN0, polyFD0); +} + + +/* Calculate the potential correction due to PME analytically. + * + * See gmx_mm_pmecorrF_ps() for details about the approximation. + * + * This routine calculates Erf(z)/z, although you should provide z^2 + * as the input argument. + * + * Here's how it should be used: + * + * 1. Calculate r^2. + * 2. Multiply by beta^2, so you get z^2=beta^2*r^2. + * 3. Evaluate this routine with z^2 as the argument. + * 4. 
The return value is the expression: + * + * + * erf(z) + * -------- + * z + * + * 5. Multiply the entire expression by beta and switch back to r (z=r*beta): + * + * erf(r*beta) + * ----------- + * r + * + * 6. Subtract the result from 1/r, multiply by the product of the charges, + * and you have your potential. + */ +static gmx_inline __m128 +gmx_mm_pmecorrV_ps(__m128 z2) +{ + const __m128 VN6 = _mm_set1_ps(1.9296833005951166339e-8f); + const __m128 VN5 = _mm_set1_ps(-1.4213390571557850962e-6f); + const __m128 VN4 = _mm_set1_ps(0.000041603292906656984871f); + const __m128 VN3 = _mm_set1_ps(-0.00013134036773265025626f); + const __m128 VN2 = _mm_set1_ps(0.038657983986041781264f); + const __m128 VN1 = _mm_set1_ps(0.11285044772717598220f); + const __m128 VN0 = _mm_set1_ps(1.1283802385263030286f); + + const __m128 VD3 = _mm_set1_ps(0.0066752224023576045451f); + const __m128 VD2 = _mm_set1_ps(0.078647795836373922256f); + const __m128 VD1 = _mm_set1_ps(0.43336185284710920150f); + const __m128 VD0 = _mm_set1_ps(1.0f); + + __m128 z4; + __m128 polyVN0, polyVN1, polyVD0, polyVD1; + + z4 = _mm_mul_ps(z2, z2); + + polyVD1 = _mm_mul_ps(VD3, z4); + polyVD0 = _mm_mul_ps(VD2, z4); + polyVD1 = _mm_add_ps(polyVD1, VD1); + polyVD0 = _mm_add_ps(polyVD0, VD0); + polyVD1 = _mm_mul_ps(polyVD1, z2); + polyVD0 = _mm_add_ps(polyVD0, polyVD1); + + polyVD0 = gmx_mm_inv_ps(polyVD0); + + polyVN0 = _mm_mul_ps(VN6, z4); + polyVN1 = _mm_mul_ps(VN5, z4); + polyVN0 = _mm_add_ps(polyVN0, VN4); + polyVN1 = _mm_add_ps(polyVN1, VN3); + polyVN0 = _mm_mul_ps(polyVN0, z4); + polyVN1 = _mm_mul_ps(polyVN1, z4); + polyVN0 = _mm_add_ps(polyVN0, VN2); + polyVN1 = _mm_add_ps(polyVN1, VN1); + polyVN0 = _mm_mul_ps(polyVN0, z4); + polyVN1 = _mm_mul_ps(polyVN1, z2); + polyVN0 = _mm_add_ps(polyVN0, VN0); + polyVN0 = _mm_add_ps(polyVN0, polyVN1); + + return _mm_mul_ps(polyVN0, polyVD0); +} + + +static int +gmx_mm_sincos_ps(__m128 x, + __m128 *sinval, + __m128 *cosval) +{ + const __m128 two_over_pi = _mm_set1_ps(2.0/M_PI); + const __m128 half = _mm_set1_ps(0.5); + const __m128 one = _mm_set1_ps(1.0); + + const __m128i izero = _mm_set1_epi32(0); + const __m128i ione = _mm_set1_epi32(1); + const __m128i itwo = _mm_set1_epi32(2); + const __m128i ithree = _mm_set1_epi32(3); + const __m128 signbit = gmx_mm_castsi128_ps( _mm_set1_epi32(0x80000000) ); + + const __m128 CA1 = _mm_set1_ps(1.5703125f); + const __m128 CA2 = _mm_set1_ps(4.837512969970703125e-4f); + const __m128 CA3 = _mm_set1_ps(7.54978995489188216e-8f); + + const __m128 CC0 = _mm_set1_ps(-0.0013602249f); + const __m128 CC1 = _mm_set1_ps(0.0416566950f); + const __m128 CC2 = _mm_set1_ps(-0.4999990225f); + const __m128 CS0 = _mm_set1_ps(-0.0001950727f); + const __m128 CS1 = _mm_set1_ps(0.0083320758f); + const __m128 CS2 = _mm_set1_ps(-0.1666665247f); + + __m128 y, y2; + __m128 z; + __m128i iz; + __m128i offset_sin, offset_cos; + __m128 tmp1, tmp2; + __m128 mask_sin, mask_cos; + __m128 tmp_sin, tmp_cos; + + y = _mm_mul_ps(x, two_over_pi); + y = _mm_add_ps(y, _mm_or_ps(_mm_and_ps(y, signbit), half)); + + iz = _mm_cvttps_epi32(y); + z = _mm_round_ps(y, _MM_FROUND_TO_ZERO); + + offset_sin = _mm_and_si128(iz, ithree); + offset_cos = _mm_add_epi32(iz, ione); + + /* Extended precision arithmetic to achieve full precision */ + y = _mm_mul_ps(z, CA1); + tmp1 = _mm_mul_ps(z, CA2); + tmp2 = _mm_mul_ps(z, CA3); + y = _mm_sub_ps(x, y); + y = _mm_sub_ps(y, tmp1); + y = _mm_sub_ps(y, tmp2); + + y2 = _mm_mul_ps(y, y); + + tmp1 = _mm_mul_ps(CC0, y2); + tmp1 = _mm_add_ps(tmp1, CC1); + tmp2 = 
_mm_mul_ps(CS0, y2); + tmp2 = _mm_add_ps(tmp2, CS1); + tmp1 = _mm_mul_ps(tmp1, y2); + tmp1 = _mm_add_ps(tmp1, CC2); + tmp2 = _mm_mul_ps(tmp2, y2); + tmp2 = _mm_add_ps(tmp2, CS2); + + tmp1 = _mm_mul_ps(tmp1, y2); + tmp1 = _mm_add_ps(tmp1, one); + + tmp2 = _mm_mul_ps(tmp2, _mm_mul_ps(y, y2)); + tmp2 = _mm_add_ps(tmp2, y); + + mask_sin = gmx_mm_castsi128_ps(_mm_cmpeq_epi32( _mm_and_si128(offset_sin, ione), izero)); + mask_cos = gmx_mm_castsi128_ps(_mm_cmpeq_epi32( _mm_and_si128(offset_cos, ione), izero)); + + tmp_sin = _mm_blendv_ps(tmp1, tmp2, mask_sin); + tmp_cos = _mm_blendv_ps(tmp1, tmp2, mask_cos); + + mask_sin = gmx_mm_castsi128_ps(_mm_cmpeq_epi32( _mm_and_si128(offset_sin, itwo), izero)); + mask_cos = gmx_mm_castsi128_ps(_mm_cmpeq_epi32( _mm_and_si128(offset_cos, itwo), izero)); + + tmp1 = _mm_xor_ps(signbit, tmp_sin); + tmp2 = _mm_xor_ps(signbit, tmp_cos); + + *sinval = _mm_blendv_ps(tmp1, tmp_sin, mask_sin); + *cosval = _mm_blendv_ps(tmp2, tmp_cos, mask_cos); + + return 0; +} + +/* + * IMPORTANT: Do NOT call both sin & cos if you need both results, since each of them + * will then call the sincos() routine and waste a factor 2 in performance! + */ +static __m128 +gmx_mm_sin_ps(__m128 x) +{ + __m128 s, c; + gmx_mm_sincos_ps(x, &s, &c); + return s; +} + +/* + * IMPORTANT: Do NOT call both sin & cos if you need both results, since each of them + * will then call the sincos() routine and waste a factor 2 in performance! + */ +static __m128 +gmx_mm_cos_ps(__m128 x) +{ + __m128 s, c; + gmx_mm_sincos_ps(x, &s, &c); + return c; +} + + +static __m128 +gmx_mm_tan_ps(__m128 x) +{ + __m128 sinval, cosval; + __m128 tanval; + + gmx_mm_sincos_ps(x, &sinval, &cosval); + + tanval = _mm_mul_ps(sinval, gmx_mm_inv_ps(cosval)); + + return tanval; +} + + +static __m128 +gmx_mm_asin_ps(__m128 x) +{ + /* Same algorithm as cephes library */ + const __m128 signmask = gmx_mm_castsi128_ps( _mm_set1_epi32(0x7FFFFFFF) ); + const __m128 limitlow = _mm_set1_ps(1e-4f); + const __m128 half = _mm_set1_ps(0.5f); + const __m128 one = _mm_set1_ps(1.0f); + const __m128 halfpi = _mm_set1_ps(M_PI/2.0f); + + const __m128 CC5 = _mm_set1_ps(4.2163199048E-2f); + const __m128 CC4 = _mm_set1_ps(2.4181311049E-2f); + const __m128 CC3 = _mm_set1_ps(4.5470025998E-2f); + const __m128 CC2 = _mm_set1_ps(7.4953002686E-2f); + const __m128 CC1 = _mm_set1_ps(1.6666752422E-1f); + + __m128 sign; + __m128 mask; + __m128 xabs; + __m128 z, z1, z2, q, q1, q2; + __m128 pA, pB; + + sign = _mm_andnot_ps(signmask, x); + xabs = _mm_and_ps(x, signmask); + + mask = _mm_cmpgt_ps(xabs, half); + + z1 = _mm_mul_ps(half, _mm_sub_ps(one, xabs)); + q1 = _mm_mul_ps(z1, gmx_mm_invsqrt_ps(z1)); + q1 = _mm_andnot_ps(_mm_cmpeq_ps(xabs, one), q1); + + q2 = xabs; + z2 = _mm_mul_ps(q2, q2); + + z = _mm_or_ps( _mm_and_ps(mask, z1), _mm_andnot_ps(mask, z2) ); + q = _mm_or_ps( _mm_and_ps(mask, q1), _mm_andnot_ps(mask, q2) ); + + z2 = _mm_mul_ps(z, z); + + pA = _mm_mul_ps(CC5, z2); + pB = _mm_mul_ps(CC4, z2); + + pA = _mm_add_ps(pA, CC3); + pB = _mm_add_ps(pB, CC2); + + pA = _mm_mul_ps(pA, z2); + pB = _mm_mul_ps(pB, z2); + + pA = _mm_add_ps(pA, CC1); + pA = _mm_mul_ps(pA, z); + + z = _mm_add_ps(pA, pB); + z = _mm_mul_ps(z, q); + z = _mm_add_ps(z, q); + + q2 = _mm_sub_ps(halfpi, z); + q2 = _mm_sub_ps(q2, z); + + z = _mm_or_ps( _mm_and_ps(mask, q2), _mm_andnot_ps(mask, z) ); + + mask = _mm_cmpgt_ps(xabs, limitlow); + z = _mm_or_ps( _mm_and_ps(mask, z), _mm_andnot_ps(mask, xabs) ); + + z = _mm_xor_ps(z, sign); + + return z; +} + + +static __m128 +gmx_mm_acos_ps(__m128 x) +{ 
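+    /* Editor's note: computed via asin(). For |x| <= 0.5 the blends below select acos(x) = pi/2 - asin(x); + * for |x| > 0.5 they select acos(x) = 2*asin(sqrt((1-|x|)/2)) when x is positive, mirrored to + * pi - 2*asin(sqrt((1-|x|)/2)) when x is negative. + */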
+ const __m128 signmask = gmx_mm_castsi128_ps( _mm_set1_epi32(0x7FFFFFFF) ); + const __m128 one_ps = _mm_set1_ps(1.0f); + const __m128 half_ps = _mm_set1_ps(0.5f); + const __m128 pi_ps = _mm_set1_ps(M_PI); + const __m128 halfpi_ps = _mm_set1_ps(M_PI/2.0f); + + __m128 mask1; + __m128 mask2; + __m128 xabs; + __m128 z, z1, z2, z3; + + xabs = _mm_and_ps(x, signmask); + mask1 = _mm_cmpgt_ps(xabs, half_ps); + mask2 = _mm_cmpgt_ps(x, _mm_setzero_ps()); + + z = _mm_mul_ps(half_ps, _mm_sub_ps(one_ps, xabs)); + z = _mm_mul_ps(z, gmx_mm_invsqrt_ps(z)); + z = _mm_andnot_ps(_mm_cmpeq_ps(xabs, one_ps), z); + + z = _mm_blendv_ps(x, z, mask1); + z = gmx_mm_asin_ps(z); + + z2 = _mm_add_ps(z, z); + z1 = _mm_sub_ps(pi_ps, z2); + z3 = _mm_sub_ps(halfpi_ps, z); + + z = _mm_blendv_ps(z1, z2, mask2); + z = _mm_blendv_ps(z3, z, mask1); + + return z; +} + + +static __m128 +gmx_mm_atan_ps(__m128 x) +{ + /* Same algorithm as cephes library */ + const __m128 signmask = gmx_mm_castsi128_ps( _mm_set1_epi32(0x7FFFFFFF) ); + const __m128 limit1 = _mm_set1_ps(0.414213562373095f); + const __m128 limit2 = _mm_set1_ps(2.414213562373095f); + const __m128 quarterpi = _mm_set1_ps(0.785398163397448f); + const __m128 halfpi = _mm_set1_ps(1.570796326794896f); + const __m128 mone = _mm_set1_ps(-1.0f); + const __m128 CC3 = _mm_set1_ps(-3.33329491539E-1f); + const __m128 CC5 = _mm_set1_ps(1.99777106478E-1f); + const __m128 CC7 = _mm_set1_ps(-1.38776856032E-1); + const __m128 CC9 = _mm_set1_ps(8.05374449538e-2f); + + __m128 sign; + __m128 mask1, mask2; + __m128 y, z1, z2; + __m128 x2, x4; + __m128 sum1, sum2; + + sign = _mm_andnot_ps(signmask, x); + x = _mm_and_ps(x, signmask); + + mask1 = _mm_cmpgt_ps(x, limit1); + mask2 = _mm_cmpgt_ps(x, limit2); + + z1 = _mm_mul_ps(_mm_add_ps(x, mone), gmx_mm_inv_ps(_mm_sub_ps(x, mone))); + z2 = _mm_mul_ps(mone, gmx_mm_inv_ps(x)); + + y = _mm_and_ps(mask1, quarterpi); + y = _mm_blendv_ps(y, halfpi, mask2); + + x = _mm_blendv_ps(x, z1, mask1); + x = _mm_blendv_ps(x, z2, mask2); + + x2 = _mm_mul_ps(x, x); + x4 = _mm_mul_ps(x2, x2); + + sum1 = _mm_mul_ps(CC9, x4); + sum2 = _mm_mul_ps(CC7, x4); + sum1 = _mm_add_ps(sum1, CC5); + sum2 = _mm_add_ps(sum2, CC3); + sum1 = _mm_mul_ps(sum1, x4); + sum2 = _mm_mul_ps(sum2, x2); + + sum1 = _mm_add_ps(sum1, sum2); + sum1 = _mm_sub_ps(sum1, mone); + sum1 = _mm_mul_ps(sum1, x); + y = _mm_add_ps(y, sum1); + + y = _mm_xor_ps(y, sign); + + return y; +} + + +static __m128 +gmx_mm_atan2_ps(__m128 y, __m128 x) +{ + const __m128 pi = _mm_set1_ps(M_PI); + const __m128 minuspi = _mm_set1_ps(-M_PI); + const __m128 halfpi = _mm_set1_ps(M_PI/2.0); + const __m128 minushalfpi = _mm_set1_ps(-M_PI/2.0); + + __m128 z, z1, z3, z4; + __m128 w; + __m128 maskx_lt, maskx_eq; + __m128 masky_lt, masky_eq; + __m128 mask1, mask2, mask3, mask4, maskall; + + maskx_lt = _mm_cmplt_ps(x, _mm_setzero_ps()); + masky_lt = _mm_cmplt_ps(y, _mm_setzero_ps()); + maskx_eq = _mm_cmpeq_ps(x, _mm_setzero_ps()); + masky_eq = _mm_cmpeq_ps(y, _mm_setzero_ps()); + + z = _mm_mul_ps(y, gmx_mm_inv_ps(x)); + z = gmx_mm_atan_ps(z); + + mask1 = _mm_and_ps(maskx_eq, masky_lt); + mask2 = _mm_andnot_ps(maskx_lt, masky_eq); + mask3 = _mm_andnot_ps( _mm_or_ps(masky_lt, masky_eq), maskx_eq); + mask4 = _mm_and_ps(masky_eq, maskx_lt); + + maskall = _mm_or_ps( _mm_or_ps(mask1, mask2), _mm_or_ps(mask3, mask4) ); + + z = _mm_andnot_ps(maskall, z); + z1 = _mm_and_ps(mask1, minushalfpi); + z3 = _mm_and_ps(mask3, halfpi); + z4 = _mm_and_ps(mask4, pi); + + z = _mm_or_ps( _mm_or_ps(z, z1), _mm_or_ps(z3, z4) ); + + mask1 = 
_mm_andnot_ps(masky_lt, maskx_lt); + mask2 = _mm_and_ps(maskx_lt, masky_lt); + + w = _mm_or_ps( _mm_and_ps(mask1, pi), _mm_and_ps(mask2, minuspi) ); + w = _mm_andnot_ps(maskall, w); + + z = _mm_add_ps(z, w); + + return z; +} + + + +#endif /* _gmx_math_x86_sse4_1_single_h_ */ diff --cc src/gromacs/mdlib/domdec.c index f7b0479d70,0000000000..a713fafc3d mode 100644,000000..100644 --- a/src/gromacs/mdlib/domdec.c +++ b/src/gromacs/mdlib/domdec.c @@@ -1,9723 -1,0 +1,9726 @@@ +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*- + * + * + * This file is part of Gromacs Copyright (c) 1991-2008 + * David van der Spoel, Erik Lindahl, Berk Hess, University of Groningen. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * To help us fund GROMACS development, we humbly ask that you cite + * the research papers on the package. Check out http://www.gromacs.org + * + * And Hey: + * Gnomes, ROck Monsters And Chili Sauce + */ + +#ifdef HAVE_CONFIG_H +#include <config.h> +#endif + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <math.h> +#include <assert.h> +#include "typedefs.h" +#include "smalloc.h" +#include "gmx_fatal.h" +#include "gmx_fatal_collective.h" +#include "vec.h" +#include "domdec.h" +#include "domdec_network.h" +#include "nrnb.h" +#include "pbc.h" +#include "chargegroup.h" +#include "constr.h" +#include "mdatoms.h" +#include "names.h" +#include "pdbio.h" +#include "futil.h" +#include "force.h" +#include "pme.h" +#include "pull.h" +#include "pull_rotation.h" +#include "gmx_wallcycle.h" +#include "mdrun.h" +#include "nsgrid.h" +#include "shellfc.h" +#include "mtop_util.h" +#include "gmxfio.h" +#include "gmx_ga2la.h" +#include "gmx_sort.h" +#include "macros.h" +#include "nbnxn_search.h" +#include "bondf.h" +#include "gmx_omp_nthreads.h" + +#include "gromacs/utility/gmxmpi.h" + +#define DDRANK(dd, rank) (rank) +#define DDMASTERRANK(dd) (dd->masterrank) + +typedef struct gmx_domdec_master +{ + /* The cell boundaries */ + real **cell_x; + /* The global charge group division */ + int *ncg; /* Number of home charge groups for each node */ + int *index; /* Index of nnodes+1 into cg */ + int *cg; /* Global charge group index */ + int *nat; /* Number of home atoms for each node. */ + int *ibuf; /* Buffer for communication */ + rvec *vbuf; /* Buffer for state scattering and gathering */ +} gmx_domdec_master_t; + +typedef struct +{ + /* The numbers of charge groups to send and receive for each cell + * that requires communication, the last entry contains the total + * number of atoms that needs to be communicated. + */ + int nsend[DD_MAXIZONE+2]; + int nrecv[DD_MAXIZONE+2]; + /* The charge groups to send */ + int *index; + int nalloc; + /* The atom range for non-in-place communication */ + int cell2at0[DD_MAXIZONE]; + int cell2at1[DD_MAXIZONE]; +} gmx_domdec_ind_t; + +typedef struct +{ + int np; /* Number of grid pulses in this dimension */ + int np_dlb; /* For dlb, for use with edlbAUTO */ + gmx_domdec_ind_t *ind; /* The indices to communicate, size np */ + int np_nalloc; + gmx_bool bInPlace; /* Can we communicate in place? */ +} gmx_domdec_comm_dim_t; + +typedef struct +{ + gmx_bool *bCellMin; /* Temp. var.: is this cell size at the limit */ + real *cell_f; /* State var.: cell boundaries, box relative */ + real *old_cell_f; /* Temp. 
var.: old cell size */ + real *cell_f_max0; /* State var.: max lower boundary, incl neighbors */ + real *cell_f_min1; /* State var.: min upper boundary, incl neighbors */ + real *bound_min; /* Temp. var.: lower limit for cell boundary */ + real *bound_max; /* Temp. var.: upper limit for cell boundary */ + gmx_bool bLimited; /* State var.: is DLB limited in this dim and row */ + real *buf_ncd; /* Temp. var. */ +} gmx_domdec_root_t; + +#define DD_NLOAD_MAX 9 + +/* Here floats are accurate enough, since these variables + * only influence the load balancing, not the actual MD results. + */ +typedef struct +{ + int nload; + float *load; + float sum; + float max; + float sum_m; + float cvol_min; + float mdf; + float pme; + int flags; +} gmx_domdec_load_t; + +typedef struct +{ + int nsc; + int ind_gl; + int ind; +} gmx_cgsort_t; + +typedef struct +{ + gmx_cgsort_t *sort; + gmx_cgsort_t *sort2; + int sort_nalloc; + gmx_cgsort_t *sort_new; + int sort_new_nalloc; + int *ibuf; + int ibuf_nalloc; +} gmx_domdec_sort_t; + +typedef struct +{ + rvec *v; + int nalloc; +} vec_rvec_t; + +/* This enum determines the order of the coordinates. + * ddnatHOME and ddnatZONE should be first and second, + * the others can be ordered as wanted. + */ +enum { + ddnatHOME, ddnatZONE, ddnatVSITE, ddnatCON, ddnatNR +}; + +enum { + edlbAUTO, edlbNO, edlbYES, edlbNR +}; +const char *edlb_names[edlbNR] = { "auto", "no", "yes" }; + +typedef struct +{ + int dim; /* The dimension */ + gmx_bool dim_match; /* Tells if DD and PME dims match */ + int nslab; /* The number of PME slabs in this dimension */ + real *slb_dim_f; /* Cell sizes for determining the PME comm. with SLB */ + int *pp_min; /* The minimum pp node location, size nslab */ + int *pp_max; /* The maximum pp node location, size nslab */ + int maxshift; /* The maximum shift for coordinate redistribution in PME */ +} gmx_ddpme_t; + +typedef struct +{ + real min0; /* The minimum bottom of this zone */ + real max1; /* The maximum top of this zone */ + real min1; /* The minimum top of this zone */ + real mch0; /* The maximum bottom communication height for this zone */ + real mch1; /* The maximum top communication height for this zone */ + real p1_0; /* The bottom value of the first cell in this zone */ + real p1_1; /* The top value of the first cell in this zone */ +} gmx_ddzone_t; + +typedef struct +{ + gmx_domdec_ind_t ind; + int *ibuf; + int ibuf_nalloc; + vec_rvec_t vbuf; + int nsend; + int nat; + int nsend_zone; +} dd_comm_setup_work_t; + +typedef struct gmx_domdec_comm +{ + /* All arrays are indexed with 0 to dd->ndim (not Cartesian indexing), + * unless stated otherwise. + */ + + /* The number of decomposition dimensions for PME, 0: no PME */ + int npmedecompdim; + /* The number of nodes doing PME (PP/PME or only PME) */ + int npmenodes; + int npmenodes_x; + int npmenodes_y; + /* The communication setup including the PME only nodes */ + gmx_bool bCartesianPP_PME; + ivec ntot; + int cartpmedim; + int *pmenodes; /* size npmenodes */ + int *ddindex2simnodeid; /* size npmenodes, only with bCartesianPP + * but with bCartesianPP_PME */ + gmx_ddpme_t ddpme[2]; + + /* The DD particle-particle nodes only */ + gmx_bool bCartesianPP; + int *ddindex2ddnodeid; /* size npmenode, only with bCartesianPP_PME */ + + /* The global charge groups */ + t_block cgs_gl; + + /* Should we sort the cgs */ + int nstSortCG; + gmx_domdec_sort_t *sort; + + /* Are there charge groups? */ + gmx_bool bCGs; + + /* Are there bonded and multi-body interactions between charge groups? 
*/ + gmx_bool bInterCGBondeds; + gmx_bool bInterCGMultiBody; + + /* Data for the optional bonded interaction atom communication range */ + gmx_bool bBondComm; + t_blocka *cglink; + char *bLocalCG; + + /* The DLB option */ + int eDLB; + /* Are we actually using DLB? */ + gmx_bool bDynLoadBal; + + /* Cell sizes for static load balancing, first index cartesian */ + real **slb_frac; + + /* The width of the communicated boundaries */ + real cutoff_mbody; + real cutoff; + /* The minimum cell size (including triclinic correction) */ + rvec cellsize_min; + /* For dlb, for use with edlbAUTO */ + rvec cellsize_min_dlb; + /* The lower limit for the DD cell size with DLB */ + real cellsize_limit; + /* Effectively no NB cut-off limit with DLB for systems without PBC? */ + gmx_bool bVacDLBNoLimit; + + /* With PME load balancing we set limits on DLB */ + gmx_bool bPMELoadBalDLBLimits; + /* DLB needs to take into account that we want to allow this maximum + * cut-off (for PME load balancing), this could limit cell boundaries. + */ + real PMELoadBal_max_cutoff; + + /* tric_dir is only stored here because dd_get_ns_ranges needs it */ + ivec tric_dir; + /* box0 and box_size are required with dims without pbc and -gcom */ + rvec box0; + rvec box_size; + + /* The cell boundaries */ + rvec cell_x0; + rvec cell_x1; + + /* The old location of the cell boundaries, to check cg displacements */ + rvec old_cell_x0; + rvec old_cell_x1; + + /* The communication setup and charge group boundaries for the zones */ + gmx_domdec_zones_t zones; + + /* The zone limits for DD dimensions 1 and 2 (not 0), determined from + * cell boundaries of neighboring cells for dynamic load balancing. + */ + gmx_ddzone_t zone_d1[2]; + gmx_ddzone_t zone_d2[2][2]; + + /* The coordinate/force communication setup and indices */ + gmx_domdec_comm_dim_t cd[DIM]; + /* The maximum number of cells to communicate with in one dimension */ + int maxpulse; + + /* Which cg distribution is stored on the master node */ + int master_cg_ddp_count; + + /* The number of cg's received from the direct neighbors */ + int zone_ncg1[DD_MAXZONE]; + + /* The atom counts, the range for each type t is nat[t-1] <= at < nat[t] */ + int nat[ddnatNR]; + + /* Array for signalling if atoms have moved to another domain */ + int *moved; + int moved_nalloc; + + /* Communication buffer for general use */ + int *buf_int; + int nalloc_int; + + /* Communication buffer for general use */ + vec_rvec_t vbuf; + + /* Temporary storage for thread parallel communication setup */ + int nth; + dd_comm_setup_work_t *dth; + + /* Communication buffers only used with multiple grid pulses */ + int *buf_int2; + int nalloc_int2; + vec_rvec_t vbuf2; + + /* Communication buffers for local redistribution */ + int **cggl_flag; + int cggl_flag_nalloc[DIM*2]; + rvec **cgcm_state; + int cgcm_state_nalloc[DIM*2]; + + /* Cell sizes for dynamic load balancing */ + gmx_domdec_root_t **root; + real *cell_f_row; + real cell_f0[DIM]; + real cell_f1[DIM]; + real cell_f_max0[DIM]; + real cell_f_min1[DIM]; + + /* Stuff for load communication */ + gmx_bool bRecordLoad; + gmx_domdec_load_t *load; +#ifdef GMX_MPI + MPI_Comm *mpi_comm_load; +#endif + + /* Maximum DLB scaling per load balancing step in percent */ + int dlb_scale_lim; + + /* Cycle counters */ + float cycl[ddCyclNr]; + int cycl_n[ddCyclNr]; + float cycl_max[ddCyclNr]; + /* Flop counter (0=no, 1=yes, 2=with (eFlop-1)*5% noise) */ + int eFlop; + double flop; + int flop_n; + /* How often did we have load measurements */ + int n_load_have; + /* How often 
have we collected the load measurements */ + int n_load_collect; + + /* Statistics */ + double sum_nat[ddnatNR-ddnatZONE]; + int ndecomp; + int nload; + double load_step; + double load_sum; + double load_max; + ivec load_lim; + double load_mdf; + double load_pme; + + /* The last partition step */ + gmx_large_int_t partition_step; + + /* Debugging */ + int nstDDDump; + int nstDDDumpGrid; + int DD_debug; +} gmx_domdec_comm_t; + +/* The size per charge group of the cggl_flag buffer in gmx_domdec_comm_t */ +#define DD_CGIBS 2 + +/* The flags for the cggl_flag buffer in gmx_domdec_comm_t */ +#define DD_FLAG_NRCG 65535 +#define DD_FLAG_FW(d) (1<<(16+(d)*2)) +#define DD_FLAG_BW(d) (1<<(16+(d)*2+1)) + +/* Zone permutation required to obtain consecutive charge groups + * for neighbor searching. + */ +static const int zone_perm[3][4] = { {0, 0, 0, 0}, {1, 0, 0, 0}, {3, 0, 1, 2} }; + +/* dd_zo and dd_zp3/dd_zp2 are set up such that i zones with non-zero + * components see only j zones with that component 0. + */ + +/* The DD zone order */ +static const ivec dd_zo[DD_MAXZONE] = +{{0, 0, 0}, {1, 0, 0}, {1, 1, 0}, {0, 1, 0}, {0, 1, 1}, {0, 0, 1}, {1, 0, 1}, {1, 1, 1}}; + +/* The 3D setup */ +#define dd_z3n 8 +#define dd_zp3n 4 +static const ivec dd_zp3[dd_zp3n] = {{0, 0, 8}, {1, 3, 6}, {2, 5, 6}, {3, 5, 7}}; + +/* The 2D setup */ +#define dd_z2n 4 +#define dd_zp2n 2 +static const ivec dd_zp2[dd_zp2n] = {{0, 0, 4}, {1, 3, 4}}; + +/* The 1D setup */ +#define dd_z1n 2 +#define dd_zp1n 1 +static const ivec dd_zp1[dd_zp1n] = {{0, 0, 2}}; + +/* Factors used to avoid problems due to rounding issues */ +#define DD_CELL_MARGIN 1.0001 +#define DD_CELL_MARGIN2 1.00005 +/* Factor to account for pressure scaling during nstlist steps */ +#define DD_PRES_SCALE_MARGIN 1.02 + +/* Allowed performance loss before we DLB or warn */ +#define DD_PERF_LOSS 0.05 + +#define DD_CELL_F_SIZE(dd, di) ((dd)->nc[(dd)->dim[(di)]]+1+(di)*2+1+(di)) + +/* Use separate MPI send and receive commands + * when nnodes <= GMX_DD_NNODES_SENDRECV. + * This saves memory (and some copying for small nnodes). + * For high parallelization scatter and gather calls are used. + */ +#define GMX_DD_NNODES_SENDRECV 4 + + +/* + #define dd_index(n,i) ((((i)[ZZ]*(n)[YY] + (i)[YY])*(n)[XX]) + (i)[XX]) + + static void index2xyz(ivec nc,int ind,ivec xyz) + { + xyz[XX] = ind % nc[XX]; + xyz[YY] = (ind / nc[XX]) % nc[YY]; + xyz[ZZ] = ind / (nc[YY]*nc[XX]); + } + */ + +/* This order is required to minimize the coordinate communication in PME + * which uses decomposition in the x direction. 
+ */ +#define dd_index(n, i) ((((i)[XX]*(n)[YY] + (i)[YY])*(n)[ZZ]) + (i)[ZZ]) + +static void ddindex2xyz(ivec nc, int ind, ivec xyz) +{ + xyz[XX] = ind / (nc[YY]*nc[ZZ]); + xyz[YY] = (ind / nc[ZZ]) % nc[YY]; + xyz[ZZ] = ind % nc[ZZ]; +} + +static int ddcoord2ddnodeid(gmx_domdec_t *dd, ivec c) +{ + int ddindex; + int ddnodeid = -1; + + ddindex = dd_index(dd->nc, c); + if (dd->comm->bCartesianPP_PME) + { + ddnodeid = dd->comm->ddindex2ddnodeid[ddindex]; + } + else if (dd->comm->bCartesianPP) + { +#ifdef GMX_MPI + MPI_Cart_rank(dd->mpi_comm_all, c, &ddnodeid); +#endif + } + else + { + ddnodeid = ddindex; + } + + return ddnodeid; +} + +static gmx_bool dynamic_dd_box(gmx_ddbox_t *ddbox, t_inputrec *ir) +{ + return (ddbox->nboundeddim < DIM || DYNAMIC_BOX(*ir)); +} + +int ddglatnr(gmx_domdec_t *dd, int i) +{ + int atnr; + + if (dd == NULL) + { + atnr = i + 1; + } + else + { + if (i >= dd->comm->nat[ddnatNR-1]) + { + gmx_fatal(FARGS, "glatnr called with %d, which is larger than the local number of atoms (%d)", i, dd->comm->nat[ddnatNR-1]); + } + atnr = dd->gatindex[i] + 1; + } + + return atnr; +} + +t_block *dd_charge_groups_global(gmx_domdec_t *dd) +{ + return &dd->comm->cgs_gl; +} + +static void vec_rvec_init(vec_rvec_t *v) +{ + v->nalloc = 0; + v->v = NULL; +} + +static void vec_rvec_check_alloc(vec_rvec_t *v, int n) +{ + if (n > v->nalloc) + { + v->nalloc = over_alloc_dd(n); + srenew(v->v, v->nalloc); + } +} + +void dd_store_state(gmx_domdec_t *dd, t_state *state) +{ + int i; + + if (state->ddp_count != dd->ddp_count) + { + gmx_incons("The state does not match the domain decomposition state"); + } + + state->ncg_gl = dd->ncg_home; + if (state->ncg_gl > state->cg_gl_nalloc) + { + state->cg_gl_nalloc = over_alloc_dd(state->ncg_gl); + srenew(state->cg_gl, state->cg_gl_nalloc); + } + for (i = 0; i < state->ncg_gl; i++) + { + state->cg_gl[i] = dd->index_gl[i]; + } + + state->ddp_count_cg_gl = dd->ddp_count; +} + +gmx_domdec_zones_t *domdec_zones(gmx_domdec_t *dd) +{ + return &dd->comm->zones; +} + +void dd_get_ns_ranges(gmx_domdec_t *dd, int icg, + int *jcg0, int *jcg1, ivec shift0, ivec shift1) +{ + gmx_domdec_zones_t *zones; + int izone, d, dim; + + zones = &dd->comm->zones; + + izone = 0; + while (icg >= zones->izone[izone].cg1) + { + izone++; + } + + if (izone == 0) + { + *jcg0 = icg; + } + else if (izone < zones->nizone) + { + *jcg0 = zones->izone[izone].jcg0; + } + else + { + gmx_fatal(FARGS, "DD icg %d out of range: izone (%d) >= nizone (%d)", + icg, izone, zones->nizone); + } + + *jcg1 = zones->izone[izone].jcg1; + + for (d = 0; d < dd->ndim; d++) + { + dim = dd->dim[d]; + shift0[dim] = zones->izone[izone].shift0[dim]; + shift1[dim] = zones->izone[izone].shift1[dim]; + if (dd->comm->tric_dir[dim] || (dd->bGridJump && d > 0)) + { + /* A conservative approach, this can be optimized */ + shift0[dim] -= 1; + shift1[dim] += 1; + } + } +} + +int dd_natoms_vsite(gmx_domdec_t *dd) +{ + return dd->comm->nat[ddnatVSITE]; +} + +void dd_get_constraint_range(gmx_domdec_t *dd, int *at_start, int *at_end) +{ + *at_start = dd->comm->nat[ddnatCON-1]; + *at_end = dd->comm->nat[ddnatCON]; +} + +void dd_move_x(gmx_domdec_t *dd, matrix box, rvec x[]) +{ + int nzone, nat_tot, n, d, p, i, j, at0, at1, zone; + int *index, *cgindex; + gmx_domdec_comm_t *comm; + gmx_domdec_comm_dim_t *cd; + gmx_domdec_ind_t *ind; + rvec shift = {0, 0, 0}, *buf, *rbuf; + gmx_bool bPBC, bScrew; + + comm = dd->comm; + + cgindex = dd->cgindex; + + buf = comm->vbuf.v; + + nzone = 1; + nat_tot = dd->nat_home; + for (d = 0; d < dd->ndim; d++) 
+ { + bPBC = (dd->ci[dd->dim[d]] == 0); + bScrew = (bPBC && dd->bScrewPBC && dd->dim[d] == XX); + if (bPBC) + { + copy_rvec(box[dd->dim[d]], shift); + } + cd = &comm->cd[d]; + for (p = 0; p < cd->np; p++) + { + ind = &cd->ind[p]; + index = ind->index; + n = 0; + if (!bPBC) + { + for (i = 0; i < ind->nsend[nzone]; i++) + { + at0 = cgindex[index[i]]; + at1 = cgindex[index[i]+1]; + for (j = at0; j < at1; j++) + { + copy_rvec(x[j], buf[n]); + n++; + } + } + } + else if (!bScrew) + { + for (i = 0; i < ind->nsend[nzone]; i++) + { + at0 = cgindex[index[i]]; + at1 = cgindex[index[i]+1]; + for (j = at0; j < at1; j++) + { + /* We need to shift the coordinates */ + rvec_add(x[j], shift, buf[n]); + n++; + } + } + } + else + { + for (i = 0; i < ind->nsend[nzone]; i++) + { + at0 = cgindex[index[i]]; + at1 = cgindex[index[i]+1]; + for (j = at0; j < at1; j++) + { + /* Shift x */ + buf[n][XX] = x[j][XX] + shift[XX]; + /* Rotate y and z. + * This operation requires a special shift force + * treatment, which is performed in calc_vir. + */ + buf[n][YY] = box[YY][YY] - x[j][YY]; + buf[n][ZZ] = box[ZZ][ZZ] - x[j][ZZ]; + n++; + } + } + } + + if (cd->bInPlace) + { + rbuf = x + nat_tot; + } + else + { + rbuf = comm->vbuf2.v; + } + /* Send and receive the coordinates */ + dd_sendrecv_rvec(dd, d, dddirBackward, + buf, ind->nsend[nzone+1], + rbuf, ind->nrecv[nzone+1]); + if (!cd->bInPlace) + { + j = 0; + for (zone = 0; zone < nzone; zone++) + { + for (i = ind->cell2at0[zone]; i < ind->cell2at1[zone]; i++) + { + copy_rvec(rbuf[j], x[i]); + j++; + } + } + } + nat_tot += ind->nrecv[nzone+1]; + } + nzone += nzone; + } +} + +void dd_move_f(gmx_domdec_t *dd, rvec f[], rvec *fshift) +{ + int nzone, nat_tot, n, d, p, i, j, at0, at1, zone; + int *index, *cgindex; + gmx_domdec_comm_t *comm; + gmx_domdec_comm_dim_t *cd; + gmx_domdec_ind_t *ind; + rvec *buf, *sbuf; + ivec vis; + int is; + gmx_bool bPBC, bScrew; + + comm = dd->comm; + + cgindex = dd->cgindex; + + buf = comm->vbuf.v; + + n = 0; + nzone = comm->zones.n/2; + nat_tot = dd->nat_tot; + for (d = dd->ndim-1; d >= 0; d--) + { + bPBC = (dd->ci[dd->dim[d]] == 0); + bScrew = (bPBC && dd->bScrewPBC && dd->dim[d] == XX); + if (fshift == NULL && !bScrew) + { + bPBC = FALSE; + } + /* Determine which shift vector we need */ + clear_ivec(vis); + vis[dd->dim[d]] = 1; + is = IVEC2IS(vis); + + cd = &comm->cd[d]; + for (p = cd->np-1; p >= 0; p--) + { + ind = &cd->ind[p]; + nat_tot -= ind->nrecv[nzone+1]; + if (cd->bInPlace) + { + sbuf = f + nat_tot; + } + else + { + sbuf = comm->vbuf2.v; + j = 0; + for (zone = 0; zone < nzone; zone++) + { + for (i = ind->cell2at0[zone]; i < ind->cell2at1[zone]; i++) + { + copy_rvec(f[i], sbuf[j]); + j++; + } + } + } + /* Communicate the forces */ + dd_sendrecv_rvec(dd, d, dddirForward, + sbuf, ind->nrecv[nzone+1], + buf, ind->nsend[nzone+1]); + index = ind->index; + /* Add the received forces */ + n = 0; + if (!bPBC) + { + for (i = 0; i < ind->nsend[nzone]; i++) + { + at0 = cgindex[index[i]]; + at1 = cgindex[index[i]+1]; + for (j = at0; j < at1; j++) + { + rvec_inc(f[j], buf[n]); + n++; + } + } + } + else if (!bScrew) + { + for (i = 0; i < ind->nsend[nzone]; i++) + { + at0 = cgindex[index[i]]; + at1 = cgindex[index[i]+1]; + for (j = at0; j < at1; j++) + { + rvec_inc(f[j], buf[n]); + /* Add this force to the shift force */ + rvec_inc(fshift[is], buf[n]); + n++; + } + } + } + else + { + for (i = 0; i < ind->nsend[nzone]; i++) + { + at0 = cgindex[index[i]]; + at1 = cgindex[index[i]+1]; + for (j = at0; j < at1; j++) + { + /* Rotate the force */ + f[j][XX] 
+= buf[n][XX]; + f[j][YY] -= buf[n][YY]; + f[j][ZZ] -= buf[n][ZZ]; + if (fshift) + { + /* Add this force to the shift force */ + rvec_inc(fshift[is], buf[n]); + } + n++; + } + } + } + } + nzone /= 2; + } +} + +void dd_atom_spread_real(gmx_domdec_t *dd, real v[]) +{ + int nzone, nat_tot, n, d, p, i, j, at0, at1, zone; + int *index, *cgindex; + gmx_domdec_comm_t *comm; + gmx_domdec_comm_dim_t *cd; + gmx_domdec_ind_t *ind; + real *buf, *rbuf; + + comm = dd->comm; + + cgindex = dd->cgindex; + + buf = &comm->vbuf.v[0][0]; + + nzone = 1; + nat_tot = dd->nat_home; + for (d = 0; d < dd->ndim; d++) + { + cd = &comm->cd[d]; + for (p = 0; p < cd->np; p++) + { + ind = &cd->ind[p]; + index = ind->index; + n = 0; + for (i = 0; i < ind->nsend[nzone]; i++) + { + at0 = cgindex[index[i]]; + at1 = cgindex[index[i]+1]; + for (j = at0; j < at1; j++) + { + buf[n] = v[j]; + n++; + } + } + + if (cd->bInPlace) + { + rbuf = v + nat_tot; + } + else + { + rbuf = &comm->vbuf2.v[0][0]; + } + /* Send and receive the coordinates */ + dd_sendrecv_real(dd, d, dddirBackward, + buf, ind->nsend[nzone+1], + rbuf, ind->nrecv[nzone+1]); + if (!cd->bInPlace) + { + j = 0; + for (zone = 0; zone < nzone; zone++) + { + for (i = ind->cell2at0[zone]; i < ind->cell2at1[zone]; i++) + { + v[i] = rbuf[j]; + j++; + } + } + } + nat_tot += ind->nrecv[nzone+1]; + } + nzone += nzone; + } +} + +void dd_atom_sum_real(gmx_domdec_t *dd, real v[]) +{ + int nzone, nat_tot, n, d, p, i, j, at0, at1, zone; + int *index, *cgindex; + gmx_domdec_comm_t *comm; + gmx_domdec_comm_dim_t *cd; + gmx_domdec_ind_t *ind; + real *buf, *sbuf; + + comm = dd->comm; + + cgindex = dd->cgindex; + + buf = &comm->vbuf.v[0][0]; + + n = 0; + nzone = comm->zones.n/2; + nat_tot = dd->nat_tot; + for (d = dd->ndim-1; d >= 0; d--) + { + cd = &comm->cd[d]; + for (p = cd->np-1; p >= 0; p--) + { + ind = &cd->ind[p]; + nat_tot -= ind->nrecv[nzone+1]; + if (cd->bInPlace) + { + sbuf = v + nat_tot; + } + else + { + sbuf = &comm->vbuf2.v[0][0]; + j = 0; + for (zone = 0; zone < nzone; zone++) + { + for (i = ind->cell2at0[zone]; i < ind->cell2at1[zone]; i++) + { + sbuf[j] = v[i]; + j++; + } + } + } + /* Communicate the forces */ + dd_sendrecv_real(dd, d, dddirForward, + sbuf, ind->nrecv[nzone+1], + buf, ind->nsend[nzone+1]); + index = ind->index; + /* Add the received forces */ + n = 0; + for (i = 0; i < ind->nsend[nzone]; i++) + { + at0 = cgindex[index[i]]; + at1 = cgindex[index[i]+1]; + for (j = at0; j < at1; j++) + { + v[j] += buf[n]; + n++; + } + } + } + nzone /= 2; + } +} + +static void print_ddzone(FILE *fp, int d, int i, int j, gmx_ddzone_t *zone) +{ + fprintf(fp, "zone d0 %d d1 %d d2 %d min0 %6.3f max1 %6.3f mch0 %6.3f mch1 %6.3f p1_0 %6.3f p1_1 %6.3f\n", + d, i, j, + zone->min0, zone->max1, + zone->mch0, zone->mch1, + zone->p1_0, zone->p1_1); +} + + +#define DDZONECOMM_MAXZONE 5 +#define DDZONECOMM_BUFSIZE 3 + +static void dd_sendrecv_ddzone(const gmx_domdec_t *dd, + int ddimind, int direction, + gmx_ddzone_t *buf_s, int n_s, + gmx_ddzone_t *buf_r, int n_r) +{ +#define ZBS DDZONECOMM_BUFSIZE + rvec vbuf_s[DDZONECOMM_MAXZONE*ZBS]; + rvec vbuf_r[DDZONECOMM_MAXZONE*ZBS]; + int i; + + for (i = 0; i < n_s; i++) + { + vbuf_s[i*ZBS ][0] = buf_s[i].min0; + vbuf_s[i*ZBS ][1] = buf_s[i].max1; + vbuf_s[i*ZBS ][2] = buf_s[i].min1; + vbuf_s[i*ZBS+1][0] = buf_s[i].mch0; + vbuf_s[i*ZBS+1][1] = buf_s[i].mch1; + vbuf_s[i*ZBS+1][2] = 0; + vbuf_s[i*ZBS+2][0] = buf_s[i].p1_0; + vbuf_s[i*ZBS+2][1] = buf_s[i].p1_1; + vbuf_s[i*ZBS+2][2] = 0; + } + + dd_sendrecv_rvec(dd, ddimind, direction, + vbuf_s, 
n_s*ZBS, + vbuf_r, n_r*ZBS); + + for (i = 0; i < n_r; i++) + { + buf_r[i].min0 = vbuf_r[i*ZBS ][0]; + buf_r[i].max1 = vbuf_r[i*ZBS ][1]; + buf_r[i].min1 = vbuf_r[i*ZBS ][2]; + buf_r[i].mch0 = vbuf_r[i*ZBS+1][0]; + buf_r[i].mch1 = vbuf_r[i*ZBS+1][1]; + buf_r[i].p1_0 = vbuf_r[i*ZBS+2][0]; + buf_r[i].p1_1 = vbuf_r[i*ZBS+2][1]; + } + +#undef ZBS +} + +static void dd_move_cellx(gmx_domdec_t *dd, gmx_ddbox_t *ddbox, + rvec cell_ns_x0, rvec cell_ns_x1) +{ + int d, d1, dim, dim1, pos, buf_size, i, j, k, p, npulse, npulse_min; + gmx_ddzone_t *zp; + gmx_ddzone_t buf_s[DDZONECOMM_MAXZONE]; + gmx_ddzone_t buf_r[DDZONECOMM_MAXZONE]; + gmx_ddzone_t buf_e[DDZONECOMM_MAXZONE]; + rvec extr_s[2], extr_r[2]; + rvec dh; + real dist_d, c = 0, det; + gmx_domdec_comm_t *comm; + gmx_bool bPBC, bUse; + + comm = dd->comm; + + for (d = 1; d < dd->ndim; d++) + { + dim = dd->dim[d]; + zp = (d == 1) ? &comm->zone_d1[0] : &comm->zone_d2[0][0]; + zp->min0 = cell_ns_x0[dim]; + zp->max1 = cell_ns_x1[dim]; + zp->min1 = cell_ns_x1[dim]; + zp->mch0 = cell_ns_x0[dim]; + zp->mch1 = cell_ns_x1[dim]; + zp->p1_0 = cell_ns_x0[dim]; + zp->p1_1 = cell_ns_x1[dim]; + } + + for (d = dd->ndim-2; d >= 0; d--) + { + dim = dd->dim[d]; + bPBC = (dim < ddbox->npbcdim); + + /* Use an rvec to store three reals */ + extr_s[d][0] = comm->cell_f0[d+1]; + extr_s[d][1] = comm->cell_f1[d+1]; + extr_s[d][2] = comm->cell_f1[d+1]; + + pos = 0; + /* Store the extremes in the backward sending buffer, + * so they get updated separately from the forward communication. + */ + for (d1 = d; d1 < dd->ndim-1; d1++) + { + /* We invert the order to be able to use the same loop for buf_e */ + buf_s[pos].min0 = extr_s[d1][1]; + buf_s[pos].max1 = extr_s[d1][0]; + buf_s[pos].min1 = extr_s[d1][2]; + buf_s[pos].mch0 = 0; + buf_s[pos].mch1 = 0; + /* Store the cell corner of the dimension we communicate along */ + buf_s[pos].p1_0 = comm->cell_x0[dim]; + buf_s[pos].p1_1 = 0; + pos++; + } + + buf_s[pos] = (dd->ndim == 2) ? comm->zone_d1[0] : comm->zone_d2[0][0]; + pos++; + + if (dd->ndim == 3 && d == 0) + { + buf_s[pos] = comm->zone_d2[0][1]; + pos++; + buf_s[pos] = comm->zone_d1[0]; + pos++; + } + + /* We only need to communicate the extremes + * in the forward direction + */ + npulse = comm->cd[d].np; + if (bPBC) + { + /* Take the minimum to avoid double communication */ + npulse_min = min(npulse, dd->nc[dim]-1-npulse); + } + else + { + /* Without PBC we should really not communicate over + * the boundaries, but implementing that complicates + * the communication setup and therefore we simply + * do all communication, but ignore some data. 
+ */ + npulse_min = npulse; + } + for (p = 0; p < npulse_min; p++) + { + /* Communicate the extremes forward */ + bUse = (bPBC || dd->ci[dim] > 0); + + dd_sendrecv_rvec(dd, d, dddirForward, + extr_s+d, dd->ndim-d-1, + extr_r+d, dd->ndim-d-1); + + if (bUse) + { + for (d1 = d; d1 < dd->ndim-1; d1++) + { + extr_s[d1][0] = max(extr_s[d1][0], extr_r[d1][0]); + extr_s[d1][1] = min(extr_s[d1][1], extr_r[d1][1]); + extr_s[d1][2] = min(extr_s[d1][2], extr_r[d1][2]); + } + } + } + + buf_size = pos; + for (p = 0; p < npulse; p++) + { + /* Communicate all the zone information backward */ + bUse = (bPBC || dd->ci[dim] < dd->nc[dim] - 1); + + dd_sendrecv_ddzone(dd, d, dddirBackward, + buf_s, buf_size, + buf_r, buf_size); + + clear_rvec(dh); + if (p > 0) + { + for (d1 = d+1; d1 < dd->ndim; d1++) + { + /* Determine the decrease of maximum required + * communication height along d1 due to the distance along d, + * this avoids a lot of useless atom communication. + */ + dist_d = comm->cell_x1[dim] - buf_r[0].p1_0; + + if (ddbox->tric_dir[dim]) + { + /* c is the off-diagonal coupling between the cell planes + * along directions d and d1. + */ + c = ddbox->v[dim][dd->dim[d1]][dim]; + } + else + { + c = 0; + } + det = (1 + c*c)*comm->cutoff*comm->cutoff - dist_d*dist_d; + if (det > 0) + { + dh[d1] = comm->cutoff - (c*dist_d + sqrt(det))/(1 + c*c); + } + else + { + /* A negative value signals out of range */ + dh[d1] = -1; + } + } + } + + /* Accumulate the extremes over all pulses */ + for (i = 0; i < buf_size; i++) + { + if (p == 0) + { + buf_e[i] = buf_r[i]; + } + else + { + if (bUse) + { + buf_e[i].min0 = min(buf_e[i].min0, buf_r[i].min0); + buf_e[i].max1 = max(buf_e[i].max1, buf_r[i].max1); + buf_e[i].min1 = min(buf_e[i].min1, buf_r[i].min1); + } + + if (dd->ndim == 3 && d == 0 && i == buf_size - 1) + { + d1 = 1; + } + else + { + d1 = d + 1; + } + if (bUse && dh[d1] >= 0) + { + buf_e[i].mch0 = max(buf_e[i].mch0, buf_r[i].mch0-dh[d1]); + buf_e[i].mch1 = max(buf_e[i].mch1, buf_r[i].mch1-dh[d1]); + } + } + /* Copy the received buffer to the send buffer, + * to pass the data through with the next pulse. 
+ */ + buf_s[i] = buf_r[i]; + } + if (((bPBC || dd->ci[dim]+npulse < dd->nc[dim]) && p == npulse-1) || + (!bPBC && dd->ci[dim]+1+p == dd->nc[dim]-1)) + { + /* Store the extremes */ + pos = 0; + + for (d1 = d; d1 < dd->ndim-1; d1++) + { + extr_s[d1][1] = min(extr_s[d1][1], buf_e[pos].min0); + extr_s[d1][0] = max(extr_s[d1][0], buf_e[pos].max1); + extr_s[d1][2] = min(extr_s[d1][2], buf_e[pos].min1); + pos++; + } + + if (d == 1 || (d == 0 && dd->ndim == 3)) + { + for (i = d; i < 2; i++) + { + comm->zone_d2[1-d][i] = buf_e[pos]; + pos++; + } + } + if (d == 0) + { + comm->zone_d1[1] = buf_e[pos]; + pos++; + } + } + } + } + + if (dd->ndim >= 2) + { + dim = dd->dim[1]; + for (i = 0; i < 2; i++) + { + if (debug) + { + print_ddzone(debug, 1, i, 0, &comm->zone_d1[i]); + } + cell_ns_x0[dim] = min(cell_ns_x0[dim], comm->zone_d1[i].min0); + cell_ns_x1[dim] = max(cell_ns_x1[dim], comm->zone_d1[i].max1); + } + } + if (dd->ndim >= 3) + { + dim = dd->dim[2]; + for (i = 0; i < 2; i++) + { + for (j = 0; j < 2; j++) + { + if (debug) + { + print_ddzone(debug, 2, i, j, &comm->zone_d2[i][j]); + } + cell_ns_x0[dim] = min(cell_ns_x0[dim], comm->zone_d2[i][j].min0); + cell_ns_x1[dim] = max(cell_ns_x1[dim], comm->zone_d2[i][j].max1); + } + } + } + for (d = 1; d < dd->ndim; d++) + { + comm->cell_f_max0[d] = extr_s[d-1][0]; + comm->cell_f_min1[d] = extr_s[d-1][1]; + if (debug) + { + fprintf(debug, "Cell fraction d %d, max0 %f, min1 %f\n", + d, comm->cell_f_max0[d], comm->cell_f_min1[d]); + } + } +} + +static void dd_collect_cg(gmx_domdec_t *dd, + t_state *state_local) +{ + gmx_domdec_master_t *ma = NULL; + int buf2[2], *ibuf, i, ncg_home = 0, *cg = NULL, nat_home = 0; + t_block *cgs_gl; + + if (state_local->ddp_count == dd->comm->master_cg_ddp_count) + { + /* The master has the correct distribution */ + return; + } + + if (state_local->ddp_count == dd->ddp_count) + { + ncg_home = dd->ncg_home; + cg = dd->index_gl; + nat_home = dd->nat_home; + } + else if (state_local->ddp_count_cg_gl == state_local->ddp_count) + { + cgs_gl = &dd->comm->cgs_gl; + + ncg_home = state_local->ncg_gl; + cg = state_local->cg_gl; + nat_home = 0; + for (i = 0; i < ncg_home; i++) + { + nat_home += cgs_gl->index[cg[i]+1] - cgs_gl->index[cg[i]]; + } + } + else + { + gmx_incons("Attempted to collect a vector for a state for which the charge group distribution is unknown"); + } + + buf2[0] = dd->ncg_home; + buf2[1] = dd->nat_home; + if (DDMASTER(dd)) + { + ma = dd->ma; + ibuf = ma->ibuf; + } + else + { + ibuf = NULL; + } + /* Collect the charge group and atom counts on the master */ + dd_gather(dd, 2*sizeof(int), buf2, ibuf); + + if (DDMASTER(dd)) + { + ma->index[0] = 0; + for (i = 0; i < dd->nnodes; i++) + { + ma->ncg[i] = ma->ibuf[2*i]; + ma->nat[i] = ma->ibuf[2*i+1]; + ma->index[i+1] = ma->index[i] + ma->ncg[i]; + + } + /* Make byte counts and indices */ + for (i = 0; i < dd->nnodes; i++) + { + ma->ibuf[i] = ma->ncg[i]*sizeof(int); + ma->ibuf[dd->nnodes+i] = ma->index[i]*sizeof(int); + } + if (debug) + { + fprintf(debug, "Initial charge group distribution: "); + for (i = 0; i < dd->nnodes; i++) + { + fprintf(debug, " %d", ma->ncg[i]); + } + fprintf(debug, "\n"); + } + } + + /* Collect the charge group indices on the master */ + dd_gatherv(dd, + dd->ncg_home*sizeof(int), dd->index_gl, + DDMASTER(dd) ? ma->ibuf : NULL, + DDMASTER(dd) ? ma->ibuf+dd->nnodes : NULL, + DDMASTER(dd) ? 
ma->cg : NULL); + + dd->comm->master_cg_ddp_count = state_local->ddp_count; +} + +static void dd_collect_vec_sendrecv(gmx_domdec_t *dd, + rvec *lv, rvec *v) +{ + gmx_domdec_master_t *ma; + int n, i, c, a, nalloc = 0; + rvec *buf = NULL; + t_block *cgs_gl; + + ma = dd->ma; + + if (!DDMASTER(dd)) + { +#ifdef GMX_MPI + MPI_Send(lv, dd->nat_home*sizeof(rvec), MPI_BYTE, DDMASTERRANK(dd), + dd->rank, dd->mpi_comm_all); +#endif + } + else + { + /* Copy the master coordinates to the global array */ + cgs_gl = &dd->comm->cgs_gl; + + n = DDMASTERRANK(dd); + a = 0; + for (i = ma->index[n]; i < ma->index[n+1]; i++) + { + for (c = cgs_gl->index[ma->cg[i]]; c < cgs_gl->index[ma->cg[i]+1]; c++) + { + copy_rvec(lv[a++], v[c]); + } + } + + for (n = 0; n < dd->nnodes; n++) + { + if (n != dd->rank) + { + if (ma->nat[n] > nalloc) + { + nalloc = over_alloc_dd(ma->nat[n]); + srenew(buf, nalloc); + } +#ifdef GMX_MPI + MPI_Recv(buf, ma->nat[n]*sizeof(rvec), MPI_BYTE, DDRANK(dd, n), + n, dd->mpi_comm_all, MPI_STATUS_IGNORE); +#endif + a = 0; + for (i = ma->index[n]; i < ma->index[n+1]; i++) + { + for (c = cgs_gl->index[ma->cg[i]]; c < cgs_gl->index[ma->cg[i]+1]; c++) + { + copy_rvec(buf[a++], v[c]); + } + } + } + } + sfree(buf); + } +} + +static void get_commbuffer_counts(gmx_domdec_t *dd, + int **counts, int **disps) +{ + gmx_domdec_master_t *ma; + int n; + + ma = dd->ma; + + /* Make the rvec count and displacement arrays */ + *counts = ma->ibuf; + *disps = ma->ibuf + dd->nnodes; + for (n = 0; n < dd->nnodes; n++) + { + (*counts)[n] = ma->nat[n]*sizeof(rvec); + (*disps)[n] = (n == 0 ? 0 : (*disps)[n-1] + (*counts)[n-1]); + } +} + +static void dd_collect_vec_gatherv(gmx_domdec_t *dd, + rvec *lv, rvec *v) +{ + gmx_domdec_master_t *ma; + int *rcounts = NULL, *disps = NULL; + int n, i, c, a; + rvec *buf = NULL; + t_block *cgs_gl; + + ma = dd->ma; + + if (DDMASTER(dd)) + { + get_commbuffer_counts(dd, &rcounts, &disps); + + buf = ma->vbuf; + } + + dd_gatherv(dd, dd->nat_home*sizeof(rvec), lv, rcounts, disps, buf); + + if (DDMASTER(dd)) + { + cgs_gl = &dd->comm->cgs_gl; + + a = 0; + for (n = 0; n < dd->nnodes; n++) + { + for (i = ma->index[n]; i < ma->index[n+1]; i++) + { + for (c = cgs_gl->index[ma->cg[i]]; c < cgs_gl->index[ma->cg[i]+1]; c++) + { + copy_rvec(buf[a++], v[c]); + } + } + } + } +} + +void dd_collect_vec(gmx_domdec_t *dd, + t_state *state_local, rvec *lv, rvec *v) +{ + gmx_domdec_master_t *ma; + int n, i, c, a, nalloc = 0; + rvec *buf = NULL; + + dd_collect_cg(dd, state_local); + + if (dd->nnodes <= GMX_DD_NNODES_SENDRECV) + { + dd_collect_vec_sendrecv(dd, lv, v); + } + else + { + dd_collect_vec_gatherv(dd, lv, v); + } +} + + +void dd_collect_state(gmx_domdec_t *dd, + t_state *state_local, t_state *state) +{ + int est, i, j, nh; + + nh = state->nhchainlength; + + if (DDMASTER(dd)) + { + for (i = 0; i < efptNR; i++) + { + state->lambda[i] = state_local->lambda[i]; + } + state->fep_state = state_local->fep_state; + state->veta = state_local->veta; + state->vol0 = state_local->vol0; + copy_mat(state_local->box, state->box); + copy_mat(state_local->boxv, state->boxv); + copy_mat(state_local->svir_prev, state->svir_prev); + copy_mat(state_local->fvir_prev, state->fvir_prev); + copy_mat(state_local->pres_prev, state->pres_prev); + + + for (i = 0; i < state_local->ngtc; i++) + { + for (j = 0; j < nh; j++) + { + state->nosehoover_xi[i*nh+j] = state_local->nosehoover_xi[i*nh+j]; + state->nosehoover_vxi[i*nh+j] = state_local->nosehoover_vxi[i*nh+j]; + } + state->therm_integral[i] = state_local->therm_integral[i]; + } 
+ for (i = 0; i < state_local->nnhpres; i++) + { + for (j = 0; j < nh; j++) + { + state->nhpres_xi[i*nh+j] = state_local->nhpres_xi[i*nh+j]; + state->nhpres_vxi[i*nh+j] = state_local->nhpres_vxi[i*nh+j]; + } + } + } + for (est = 0; est < estNR; est++) + { + if (EST_DISTR(est) && (state_local->flags & (1<<est))) + { + switch (est) + { + case estX: + dd_collect_vec(dd, state_local, state_local->x, state->x); + break; + case estV: + dd_collect_vec(dd, state_local, state_local->v, state->v); + break; + case estSDX: + dd_collect_vec(dd, state_local, state_local->sd_X, state->sd_X); + break; + case estCGP: + dd_collect_vec(dd, state_local, state_local->cg_p, state->cg_p); + break; + case estLD_RNG: + if (state->nrngi == 1) + { + if (DDMASTER(dd)) + { + for (i = 0; i < state_local->nrng; i++) + { + state->ld_rng[i] = state_local->ld_rng[i]; + } + } + } + else + { + dd_gather(dd, state_local->nrng*sizeof(state->ld_rng[0]), + state_local->ld_rng, state->ld_rng); + } + break; + case estLD_RNGI: + if (state->nrngi == 1) + { + if (DDMASTER(dd)) + { + state->ld_rngi[0] = state_local->ld_rngi[0]; + } + } + else + { + dd_gather(dd, sizeof(state->ld_rngi[0]), + state_local->ld_rngi, state->ld_rngi); + } + break; + case estDISRE_INITF: + case estDISRE_RM3TAV: + case estORIRE_INITF: + case estORIRE_DTAV: + break; + default: + gmx_incons("Unknown state entry encountered in dd_collect_state"); + } + } + } +} + +static void dd_realloc_state(t_state *state, rvec **f, int nalloc) +{ + int est; + + if (debug) + { + fprintf(debug, "Reallocating state: currently %d, required %d, allocating %d\n", state->nalloc, nalloc, over_alloc_dd(nalloc)); + } + + state->nalloc = over_alloc_dd(nalloc); + + for (est = 0; est < estNR; est++) + { + if (EST_DISTR(est) && (state->flags & (1<<est))) + { + switch (est) + { + case estX: + srenew(state->x, state->nalloc); + break; + case estV: + srenew(state->v, state->nalloc); + break; + case estSDX: + srenew(state->sd_X, state->nalloc); + break; + case estCGP: + srenew(state->cg_p, state->nalloc); + break; + case estLD_RNG: + case estLD_RNGI: + case estDISRE_INITF: + case estDISRE_RM3TAV: + case estORIRE_INITF: + case estORIRE_DTAV: + /* No reallocation required */ + break; + default: + gmx_incons("Unknown state entry encountered in dd_realloc_state"); + } + } + } + + if (f != NULL) + { + srenew(*f, state->nalloc); + } +} + +static void dd_check_alloc_ncg(t_forcerec *fr, t_state *state, rvec **f, + int nalloc) +{ + if (nalloc > fr->cg_nalloc) + { + if (debug) + { + fprintf(debug, "Reallocating forcerec: currently %d, required %d, allocating %d\n", fr->cg_nalloc, nalloc, over_alloc_dd(nalloc)); + } + fr->cg_nalloc = over_alloc_dd(nalloc); + srenew(fr->cginfo, fr->cg_nalloc); + if (fr->cutoff_scheme == ecutsGROUP) + { + srenew(fr->cg_cm, fr->cg_nalloc); + } + } + if (fr->cutoff_scheme == ecutsVERLET && nalloc > state->nalloc) + { + /* We don't use charge groups, we use x in state to set up + * the atom communication. 
+ */ + dd_realloc_state(state, f, nalloc); + } +} + +static void dd_distribute_vec_sendrecv(gmx_domdec_t *dd, t_block *cgs, + rvec *v, rvec *lv) +{ + gmx_domdec_master_t *ma; + int n, i, c, a, nalloc = 0; + rvec *buf = NULL; + + if (DDMASTER(dd)) + { + ma = dd->ma; + + for (n = 0; n < dd->nnodes; n++) + { + if (n != dd->rank) + { + if (ma->nat[n] > nalloc) + { + nalloc = over_alloc_dd(ma->nat[n]); + srenew(buf, nalloc); + } + /* Use lv as a temporary buffer */ + a = 0; + for (i = ma->index[n]; i < ma->index[n+1]; i++) + { + for (c = cgs->index[ma->cg[i]]; c < cgs->index[ma->cg[i]+1]; c++) + { + copy_rvec(v[c], buf[a++]); + } + } + if (a != ma->nat[n]) + { + gmx_fatal(FARGS, "Internal error a (%d) != nat (%d)", + a, ma->nat[n]); + } + +#ifdef GMX_MPI + MPI_Send(buf, ma->nat[n]*sizeof(rvec), MPI_BYTE, + DDRANK(dd, n), n, dd->mpi_comm_all); +#endif + } + } + sfree(buf); + n = DDMASTERRANK(dd); + a = 0; + for (i = ma->index[n]; i < ma->index[n+1]; i++) + { + for (c = cgs->index[ma->cg[i]]; c < cgs->index[ma->cg[i]+1]; c++) + { + copy_rvec(v[c], lv[a++]); + } + } + } + else + { +#ifdef GMX_MPI + MPI_Recv(lv, dd->nat_home*sizeof(rvec), MPI_BYTE, DDMASTERRANK(dd), + MPI_ANY_TAG, dd->mpi_comm_all, MPI_STATUS_IGNORE); +#endif + } +} + +static void dd_distribute_vec_scatterv(gmx_domdec_t *dd, t_block *cgs, + rvec *v, rvec *lv) +{ + gmx_domdec_master_t *ma; + int *scounts = NULL, *disps = NULL; + int n, i, c, a, nalloc = 0; + rvec *buf = NULL; + + if (DDMASTER(dd)) + { + ma = dd->ma; + + get_commbuffer_counts(dd, &scounts, &disps); + + buf = ma->vbuf; + a = 0; + for (n = 0; n < dd->nnodes; n++) + { + for (i = ma->index[n]; i < ma->index[n+1]; i++) + { + for (c = cgs->index[ma->cg[i]]; c < cgs->index[ma->cg[i]+1]; c++) + { + copy_rvec(v[c], buf[a++]); + } + } + } + } + + dd_scatterv(dd, scounts, disps, buf, dd->nat_home*sizeof(rvec), lv); +} + +static void dd_distribute_vec(gmx_domdec_t *dd, t_block *cgs, rvec *v, rvec *lv) +{ + if (dd->nnodes <= GMX_DD_NNODES_SENDRECV) + { + dd_distribute_vec_sendrecv(dd, cgs, v, lv); + } + else + { + dd_distribute_vec_scatterv(dd, cgs, v, lv); + } +} + +static void dd_distribute_state(gmx_domdec_t *dd, t_block *cgs, + t_state *state, t_state *state_local, + rvec **f) +{ + int i, j, nh; + + nh = state->nhchainlength; + + if (DDMASTER(dd)) + { + for (i = 0; i < efptNR; i++) + { + state_local->lambda[i] = state->lambda[i]; + } + state_local->fep_state = state->fep_state; + state_local->veta = state->veta; + state_local->vol0 = state->vol0; + copy_mat(state->box, state_local->box); + copy_mat(state->box_rel, state_local->box_rel); + copy_mat(state->boxv, state_local->boxv); + copy_mat(state->svir_prev, state_local->svir_prev); + copy_mat(state->fvir_prev, state_local->fvir_prev); + for (i = 0; i < state_local->ngtc; i++) + { + for (j = 0; j < nh; j++) + { + state_local->nosehoover_xi[i*nh+j] = state->nosehoover_xi[i*nh+j]; + state_local->nosehoover_vxi[i*nh+j] = state->nosehoover_vxi[i*nh+j]; + } + state_local->therm_integral[i] = state->therm_integral[i]; + } + for (i = 0; i < state_local->nnhpres; i++) + { + for (j = 0; j < nh; j++) + { + state_local->nhpres_xi[i*nh+j] = state->nhpres_xi[i*nh+j]; + state_local->nhpres_vxi[i*nh+j] = state->nhpres_vxi[i*nh+j]; + } + } + } + dd_bcast(dd, ((efptNR)*sizeof(real)), state_local->lambda); + dd_bcast(dd, sizeof(int), &state_local->fep_state); + dd_bcast(dd, sizeof(real), &state_local->veta); + dd_bcast(dd, sizeof(real), &state_local->vol0); + dd_bcast(dd, sizeof(state_local->box), state_local->box); + dd_bcast(dd, 
             sizeof(state_local->box_rel), state_local->box_rel);
+    dd_bcast(dd, sizeof(state_local->boxv), state_local->boxv);
+    dd_bcast(dd, sizeof(state_local->svir_prev), state_local->svir_prev);
+    dd_bcast(dd, sizeof(state_local->fvir_prev), state_local->fvir_prev);
+    dd_bcast(dd, ((state_local->ngtc*nh)*sizeof(double)), state_local->nosehoover_xi);
+    dd_bcast(dd, ((state_local->ngtc*nh)*sizeof(double)), state_local->nosehoover_vxi);
+    dd_bcast(dd, state_local->ngtc*sizeof(double), state_local->therm_integral);
+    dd_bcast(dd, ((state_local->nnhpres*nh)*sizeof(double)), state_local->nhpres_xi);
+    dd_bcast(dd, ((state_local->nnhpres*nh)*sizeof(double)), state_local->nhpres_vxi);
+
+    if (dd->nat_home > state_local->nalloc)
+    {
+        dd_realloc_state(state_local, f, dd->nat_home);
+    }
+    for (i = 0; i < estNR; i++)
+    {
+        if (EST_DISTR(i) && (state_local->flags & (1<<i)))
+        {
+            switch (i)
+            {
+                case estX:
+                    dd_distribute_vec(dd, cgs, state->x, state_local->x);
+                    break;
+                case estV:
+                    dd_distribute_vec(dd, cgs, state->v, state_local->v);
+                    break;
+                case estSDX:
+                    dd_distribute_vec(dd, cgs, state->sd_X, state_local->sd_X);
+                    break;
+                case estCGP:
+                    dd_distribute_vec(dd, cgs, state->cg_p, state_local->cg_p);
+                    break;
+                case estLD_RNG:
+                    if (state->nrngi == 1)
+                    {
+                        dd_bcastc(dd,
+                                  state_local->nrng*sizeof(state_local->ld_rng[0]),
+                                  state->ld_rng, state_local->ld_rng);
+                    }
+                    else
+                    {
+                        dd_scatter(dd,
+                                   state_local->nrng*sizeof(state_local->ld_rng[0]),
+                                   state->ld_rng, state_local->ld_rng);
+                    }
+                    break;
+                case estLD_RNGI:
+                    if (state->nrngi == 1)
+                    {
+                        dd_bcastc(dd, sizeof(state_local->ld_rngi[0]),
+                                  state->ld_rngi, state_local->ld_rngi);
+                    }
+                    else
+                    {
+                        dd_scatter(dd, sizeof(state_local->ld_rngi[0]),
+                                   state->ld_rngi, state_local->ld_rngi);
+                    }
+                    break;
+                case estDISRE_INITF:
+                case estDISRE_RM3TAV:
+                case estORIRE_INITF:
+                case estORIRE_DTAV:
+                    /* Not implemented yet */
+                    break;
+                default:
+                    gmx_incons("Unknown state entry encountered in dd_distribute_state");
+            }
+        }
+    }
+}
+
+static char dim2char(int dim)
+{
+    char c = '?';
+
+    switch (dim)
+    {
+        case XX: c = 'X'; break;
+        case YY: c = 'Y'; break;
+        case ZZ: c = 'Z'; break;
+        default: gmx_fatal(FARGS, "Unknown dim %d", dim);
+    }
+
+    return c;
+}
+
+static void write_dd_grid_pdb(const char *fn, gmx_large_int_t step,
+                              gmx_domdec_t *dd, matrix box, gmx_ddbox_t *ddbox)
+{
+    rvec   grid_s[2], *grid_r = NULL, cx, r;
+    char   fname[STRLEN], format[STRLEN], buf[22];
+    FILE  *out;
+    int    a, i, d, z, y, x;
+    matrix tric;
+    real   vol;
+
+    copy_rvec(dd->comm->cell_x0, grid_s[0]);
+    copy_rvec(dd->comm->cell_x1, grid_s[1]);
+
+    if (DDMASTER(dd))
+    {
+        snew(grid_r, 2*dd->nnodes);
+    }
+
+    dd_gather(dd, 2*sizeof(rvec), grid_s[0], DDMASTER(dd) ? grid_r[0] : NULL);
+
+    if (DDMASTER(dd))
+    {
+        for (d = 0; d < DIM; d++)
+        {
+            for (i = 0; i < DIM; i++)
+            {
+                if (d == i)
+                {
+                    tric[d][i] = 1;
+                }
+                else
+                {
+                    if (d < ddbox->npbcdim && dd->nc[d] > 1)
+                    {
+                        tric[d][i] = box[i][d]/box[i][i];
+                    }
+                    else
+                    {
+                        tric[d][i] = 0;
+                    }
+                }
+            }
+        }
+        sprintf(fname, "%s_%s.pdb", fn, gmx_step_str(step, buf));
+        sprintf(format, "%s%s\n", get_pdbformat(), "%6.2f%6.2f");
+        out = gmx_fio_fopen(fname, "w");
+        gmx_write_pdb_box(out, dd->bScrewPBC ? epbcSCREW : epbcXYZ, box);
+        a = 1;
+        for (i = 0; i < dd->nnodes; i++)
+        {
+            vol = dd->nnodes/(box[XX][XX]*box[YY][YY]*box[ZZ][ZZ]);
+            for (d = 0; d < DIM; d++)
+            {
+                vol *= grid_r[i*2+1][d] - grid_r[i*2][d];
+            }
+            for (z = 0; z < 2; z++)
+            {
+                for (y = 0; y < 2; y++)
+                {
+                    for (x = 0; x < 2; x++)
+                    {
+                        cx[XX] = grid_r[i*2+x][XX];
+                        cx[YY] = grid_r[i*2+y][YY];
+                        cx[ZZ] = grid_r[i*2+z][ZZ];
+                        mvmul(tric, cx, r);
+                        fprintf(out, format, "ATOM", a++, "CA", "GLY", ' ', 1+i,
+                                10*r[XX], 10*r[YY], 10*r[ZZ], 1.0, vol);
+                    }
+                }
+            }
+            for (d = 0; d < DIM; d++)
+            {
+                for (x = 0; x < 4; x++)
+                {
+                    switch (d)
+                    {
+                        case 0: y = 1 + i*8 + 2*x; break;
+                        case 1: y = 1 + i*8 + 2*x - (x % 2); break;
+                        case 2: y = 1 + i*8 + x; break;
+                    }
+                    fprintf(out, "%6s%5d%5d\n", "CONECT", y, y+(1<<d));
+                }
+            }
+        }
+        gmx_fio_fclose(out);
+        sfree(grid_r);
+    }
+}
+
+void write_dd_pdb(const char *fn, gmx_large_int_t step, const char *title,
+                  gmx_mtop_t *mtop, t_commrec *cr,
+                  int natoms, rvec x[], matrix box)
+{
+    char          fname[STRLEN], format[STRLEN], format4[STRLEN], buf[22];
+    FILE         *out;
+    int           i, ii, resnr, c;
+    char         *atomname, *resname;
+    real          b;
+    gmx_domdec_t *dd;
+
+    dd = cr->dd;
+    if (natoms == -1)
+    {
+        natoms = dd->comm->nat[ddnatVSITE];
+    }
+
+    sprintf(fname, "%s_%s_n%d.pdb", fn, gmx_step_str(step, buf), cr->sim_nodeid);
+
+    sprintf(format, "%s%s\n", get_pdbformat(), "%6.2f%6.2f");
+    sprintf(format4, "%s%s\n", get_pdbformat4(), "%6.2f%6.2f");
+
+    out = gmx_fio_fopen(fname, "w");
+
+    fprintf(out, "TITLE     %s\n", title);
+    gmx_write_pdb_box(out, dd->bScrewPBC ? epbcSCREW : epbcXYZ, box);
+    for (i = 0; i < natoms; i++)
+    {
+        ii = dd->gatindex[i];
+        gmx_mtop_atominfo_global(mtop, ii, &atomname, &resnr, &resname);
+        if (i < dd->comm->nat[ddnatZONE])
+        {
+            c = 0;
+            while (i >= dd->cgindex[dd->comm->zones.cg_range[c+1]])
+            {
+                c++;
+            }
+            b = c;
+        }
+        else if (i < dd->comm->nat[ddnatVSITE])
+        {
+            b = dd->comm->zones.n;
+        }
+        else
+        {
+            b = dd->comm->zones.n + 1;
+        }
+        fprintf(out, strlen(atomname) < 4 ? format : format4,
+                "ATOM", (ii+1)%100000,
+                atomname, resname, ' ', resnr%10000, ' ',
+                10*x[i][XX], 10*x[i][YY], 10*x[i][ZZ], 1.0, b);
+    }
+    fprintf(out, "TER\n");
+
+    gmx_fio_fclose(out);
+}
+
+real dd_cutoff_mbody(gmx_domdec_t *dd)
+{
+    gmx_domdec_comm_t *comm;
+    int                di;
+    real               r;
+
+    comm = dd->comm;
+
+    r = -1;
+    if (comm->bInterCGBondeds)
+    {
+        if (comm->cutoff_mbody > 0)
+        {
+            r = comm->cutoff_mbody;
+        }
+        else
+        {
+            /* cutoff_mbody=0 means we do not have DLB */
+            r = comm->cellsize_min[dd->dim[0]];
+            for (di = 1; di < dd->ndim; di++)
+            {
+                r = min(r, comm->cellsize_min[dd->dim[di]]);
+            }
+            if (comm->bBondComm)
+            {
+                r = max(r, comm->cutoff_mbody);
+            }
+            else
+            {
+                r = min(r, comm->cutoff);
+            }
+        }
+    }
+
+    return r;
+}
+
+real dd_cutoff_twobody(gmx_domdec_t *dd)
+{
+    real r_mb;
+
+    r_mb = dd_cutoff_mbody(dd);
+
+    return max(dd->comm->cutoff, r_mb);
+}
+
+
+static void dd_cart_coord2pmecoord(gmx_domdec_t *dd, ivec coord, ivec coord_pme)
+{
+    int nc, ntot;
+
+    nc   = dd->nc[dd->comm->cartpmedim];
+    ntot = dd->comm->ntot[dd->comm->cartpmedim];
+    copy_ivec(coord, coord_pme);
+    coord_pme[dd->comm->cartpmedim] =
+        nc + (coord[dd->comm->cartpmedim]*(ntot - nc) + (ntot - nc)/2)/nc;
+}
+
+static int low_ddindex2pmeindex(int ndd, int npme, int ddindex)
+{
+    /* Here we assign a PME node to communicate with this DD node
+     * by assuming that the major index of both is x.
+     * We add cr->npmenodes/2 to obtain an even distribution.
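
The integer arithmetic in low_ddindex2pmeindex is easiest to check with concrete numbers. A standalone sketch; the rank counts below are made-up test values, not anything from this patch:

    #include <stdio.h>

    /* PP rank 'ddindex' out of ndd ranks is served by PME rank
     * (ddindex*npme + npme/2)/ndd; the npme/2 term centres the rounding. */
    static int pp2pme(int ndd, int npme, int ddindex)
    {
        return (ddindex*npme + npme/2)/ndd;
    }

    int main(void)
    {
        int ndd = 8, npme = 3, i;

        for (i = 0; i < ndd; i++)
        {
            printf("PP %d -> PME %d\n", i, pp2pme(ndd, npme, i));
        }
        return 0;
    }

For ndd = 8 and npme = 3 this prints the assignment 0,0,0,1,1,2,2,2: the PP ranks are spread over the PME ranks as evenly as integer division allows.
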
+ */ + return (ddindex*npme + npme/2)/ndd; +} + +static int ddindex2pmeindex(const gmx_domdec_t *dd, int ddindex) +{ + return low_ddindex2pmeindex(dd->nnodes, dd->comm->npmenodes, ddindex); +} + +static int cr_ddindex2pmeindex(const t_commrec *cr, int ddindex) +{ + return low_ddindex2pmeindex(cr->dd->nnodes, cr->npmenodes, ddindex); +} + +static int *dd_pmenodes(t_commrec *cr) +{ + int *pmenodes; + int n, i, p0, p1; + + snew(pmenodes, cr->npmenodes); + n = 0; + for (i = 0; i < cr->dd->nnodes; i++) + { + p0 = cr_ddindex2pmeindex(cr, i); + p1 = cr_ddindex2pmeindex(cr, i+1); + if (i+1 == cr->dd->nnodes || p1 > p0) + { + if (debug) + { + fprintf(debug, "pmenode[%d] = %d\n", n, i+1+n); + } + pmenodes[n] = i + 1 + n; + n++; + } + } + + return pmenodes; +} + +static int gmx_ddcoord2pmeindex(t_commrec *cr, int x, int y, int z) +{ + gmx_domdec_t *dd; + ivec coords, coords_pme, nc; + int slab; + + dd = cr->dd; + /* + if (dd->comm->bCartesian) { + gmx_ddindex2xyz(dd->nc,ddindex,coords); + dd_coords2pmecoords(dd,coords,coords_pme); + copy_ivec(dd->ntot,nc); + nc[dd->cartpmedim] -= dd->nc[dd->cartpmedim]; + coords_pme[dd->cartpmedim] -= dd->nc[dd->cartpmedim]; + + slab = (coords_pme[XX]*nc[YY] + coords_pme[YY])*nc[ZZ] + coords_pme[ZZ]; + } else { + slab = (ddindex*cr->npmenodes + cr->npmenodes/2)/dd->nnodes; + } + */ + coords[XX] = x; + coords[YY] = y; + coords[ZZ] = z; + slab = ddindex2pmeindex(dd, dd_index(dd->nc, coords)); + + return slab; +} + +static int ddcoord2simnodeid(t_commrec *cr, int x, int y, int z) +{ + gmx_domdec_comm_t *comm; + ivec coords; + int ddindex, nodeid = -1; + + comm = cr->dd->comm; + + coords[XX] = x; + coords[YY] = y; + coords[ZZ] = z; + if (comm->bCartesianPP_PME) + { +#ifdef GMX_MPI + MPI_Cart_rank(cr->mpi_comm_mysim, coords, &nodeid); +#endif + } + else + { + ddindex = dd_index(cr->dd->nc, coords); + if (comm->bCartesianPP) + { + nodeid = comm->ddindex2simnodeid[ddindex]; + } + else + { + if (comm->pmenodes) + { + nodeid = ddindex + gmx_ddcoord2pmeindex(cr, x, y, z); + } + else + { + nodeid = ddindex; + } + } + } + + return nodeid; +} + +static int dd_simnode2pmenode(t_commrec *cr, int sim_nodeid) +{ + gmx_domdec_t *dd; + gmx_domdec_comm_t *comm; + ivec coord, coord_pme; + int i; + int pmenode = -1; + + dd = cr->dd; + comm = dd->comm; + + /* This assumes a uniform x domain decomposition grid cell size */ + if (comm->bCartesianPP_PME) + { +#ifdef GMX_MPI + MPI_Cart_coords(cr->mpi_comm_mysim, sim_nodeid, DIM, coord); + if (coord[comm->cartpmedim] < dd->nc[comm->cartpmedim]) + { + /* This is a PP node */ + dd_cart_coord2pmecoord(dd, coord, coord_pme); + MPI_Cart_rank(cr->mpi_comm_mysim, coord_pme, &pmenode); + } +#endif + } + else if (comm->bCartesianPP) + { + if (sim_nodeid < dd->nnodes) + { + pmenode = dd->nnodes + ddindex2pmeindex(dd, sim_nodeid); + } + } + else + { + /* This assumes DD cells with identical x coordinates + * are numbered sequentially. 
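
For the non-Cartesian case handled next, the PME peer of a PP rank is found by scanning the ordered list of PME-only ranks for the first entry above it. A minimal sketch of that lookup with illustrative names; the real code walks dd->comm->pmenodes in the same way:

    /* pmenodes[] holds the global ranks of the npme PME-only nodes in
     * increasing order. Returns the PME peer of 'sim_nodeid', or -1. */
    int peer_pmenode(const int *pmenodes, int npme, int sim_nodeid)
    {
        int i = 0;

        while (i < npme && sim_nodeid > pmenodes[i])
        {
            i++;
        }
        /* A PME-only rank appears in the list itself and has no peer */
        return (i < npme && sim_nodeid < pmenodes[i]) ? pmenodes[i] : -1;
    }
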
+ */ + if (dd->comm->pmenodes == NULL) + { + if (sim_nodeid < dd->nnodes) + { + /* The DD index equals the nodeid */ + pmenode = dd->nnodes + ddindex2pmeindex(dd, sim_nodeid); + } + } + else + { + i = 0; + while (sim_nodeid > dd->comm->pmenodes[i]) + { + i++; + } + if (sim_nodeid < dd->comm->pmenodes[i]) + { + pmenode = dd->comm->pmenodes[i]; + } + } + } + + return pmenode; +} + +void get_pme_nnodes(const gmx_domdec_t *dd, + int *npmenodes_x, int *npmenodes_y) +{ + if (dd != NULL) + { + *npmenodes_x = dd->comm->npmenodes_x; + *npmenodes_y = dd->comm->npmenodes_y; + } + else + { + *npmenodes_x = 1; + *npmenodes_y = 1; + } +} + +gmx_bool gmx_pmeonlynode(t_commrec *cr, int sim_nodeid) +{ + gmx_bool bPMEOnlyNode; + + if (DOMAINDECOMP(cr)) + { + bPMEOnlyNode = (dd_simnode2pmenode(cr, sim_nodeid) == -1); + } + else + { + bPMEOnlyNode = FALSE; + } + + return bPMEOnlyNode; +} + +void get_pme_ddnodes(t_commrec *cr, int pmenodeid, + int *nmy_ddnodes, int **my_ddnodes, int *node_peer) +{ + gmx_domdec_t *dd; + int x, y, z; + ivec coord, coord_pme; + + dd = cr->dd; + + snew(*my_ddnodes, (dd->nnodes+cr->npmenodes-1)/cr->npmenodes); + + *nmy_ddnodes = 0; + for (x = 0; x < dd->nc[XX]; x++) + { + for (y = 0; y < dd->nc[YY]; y++) + { + for (z = 0; z < dd->nc[ZZ]; z++) + { + if (dd->comm->bCartesianPP_PME) + { + coord[XX] = x; + coord[YY] = y; + coord[ZZ] = z; + dd_cart_coord2pmecoord(dd, coord, coord_pme); + if (dd->ci[XX] == coord_pme[XX] && + dd->ci[YY] == coord_pme[YY] && + dd->ci[ZZ] == coord_pme[ZZ]) + { + (*my_ddnodes)[(*nmy_ddnodes)++] = ddcoord2simnodeid(cr, x, y, z); + } + } + else + { + /* The slab corresponds to the nodeid in the PME group */ + if (gmx_ddcoord2pmeindex(cr, x, y, z) == pmenodeid) + { + (*my_ddnodes)[(*nmy_ddnodes)++] = ddcoord2simnodeid(cr, x, y, z); + } + } + } + } + } + + /* The last PP-only node is the peer node */ + *node_peer = (*my_ddnodes)[*nmy_ddnodes-1]; + + if (debug) + { + fprintf(debug, "Receive coordinates from PP nodes:"); + for (x = 0; x < *nmy_ddnodes; x++) + { + fprintf(debug, " %d", (*my_ddnodes)[x]); + } + fprintf(debug, "\n"); + } +} + +static gmx_bool receive_vir_ener(t_commrec *cr) +{ + gmx_domdec_comm_t *comm; + int pmenode, coords[DIM], rank; + gmx_bool bReceive; + + bReceive = TRUE; + if (cr->npmenodes < cr->dd->nnodes) + { + comm = cr->dd->comm; + if (comm->bCartesianPP_PME) + { + pmenode = dd_simnode2pmenode(cr, cr->sim_nodeid); +#ifdef GMX_MPI + MPI_Cart_coords(cr->mpi_comm_mysim, cr->sim_nodeid, DIM, coords); + coords[comm->cartpmedim]++; + if (coords[comm->cartpmedim] < cr->dd->nc[comm->cartpmedim]) + { + MPI_Cart_rank(cr->mpi_comm_mysim, coords, &rank); + if (dd_simnode2pmenode(cr, rank) == pmenode) + { + /* This is not the last PP node for pmenode */ + bReceive = FALSE; + } + } +#endif + } + else + { + pmenode = dd_simnode2pmenode(cr, cr->sim_nodeid); + if (cr->sim_nodeid+1 < cr->nnodes && + dd_simnode2pmenode(cr, cr->sim_nodeid+1) == pmenode) + { + /* This is not the last PP node for pmenode */ + bReceive = FALSE; + } + } + } + + return bReceive; +} + +static void set_zones_ncg_home(gmx_domdec_t *dd) +{ + gmx_domdec_zones_t *zones; + int i; + + zones = &dd->comm->zones; + + zones->cg_range[0] = 0; + for (i = 1; i < zones->n+1; i++) + { + zones->cg_range[i] = dd->ncg_home; + } ++ /* zone_ncg1[0] should always be equal to ncg_home */ ++ dd->comm->zone_ncg1[0] = dd->ncg_home; +} + +static void rebuild_cgindex(gmx_domdec_t *dd, + const int *gcgs_index, t_state *state) +{ + int nat, i, *ind, *dd_cg_gl, *cgindex, cg_gl; + + ind = state->cg_gl; + dd_cg_gl 
= dd->index_gl; + cgindex = dd->cgindex; + nat = 0; + cgindex[0] = nat; + for (i = 0; i < state->ncg_gl; i++) + { + cgindex[i] = nat; + cg_gl = ind[i]; + dd_cg_gl[i] = cg_gl; + nat += gcgs_index[cg_gl+1] - gcgs_index[cg_gl]; + } + cgindex[i] = nat; + + dd->ncg_home = state->ncg_gl; + dd->nat_home = nat; + + set_zones_ncg_home(dd); +} + +static int ddcginfo(const cginfo_mb_t *cginfo_mb, int cg) +{ + while (cg >= cginfo_mb->cg_end) + { + cginfo_mb++; + } + + return cginfo_mb->cginfo[(cg - cginfo_mb->cg_start) % cginfo_mb->cg_mod]; +} + +static void dd_set_cginfo(int *index_gl, int cg0, int cg1, + t_forcerec *fr, char *bLocalCG) +{ + cginfo_mb_t *cginfo_mb; + int *cginfo; + int cg; + + if (fr != NULL) + { + cginfo_mb = fr->cginfo_mb; + cginfo = fr->cginfo; + + for (cg = cg0; cg < cg1; cg++) + { + cginfo[cg] = ddcginfo(cginfo_mb, index_gl[cg]); + } + } + + if (bLocalCG != NULL) + { + for (cg = cg0; cg < cg1; cg++) + { + bLocalCG[index_gl[cg]] = TRUE; + } + } +} + +static void make_dd_indices(gmx_domdec_t *dd, + const int *gcgs_index, int cg_start) +{ + int nzone, zone, zone1, cg0, cg1, cg1_p1, cg, cg_gl, a, a_gl; + int *zone2cg, *zone_ncg1, *index_gl, *gatindex; + gmx_ga2la_t *ga2la; + char *bLocalCG; + gmx_bool bCGs; + + bLocalCG = dd->comm->bLocalCG; + + if (dd->nat_tot > dd->gatindex_nalloc) + { + dd->gatindex_nalloc = over_alloc_dd(dd->nat_tot); + srenew(dd->gatindex, dd->gatindex_nalloc); + } + + nzone = dd->comm->zones.n; + zone2cg = dd->comm->zones.cg_range; + zone_ncg1 = dd->comm->zone_ncg1; + index_gl = dd->index_gl; + gatindex = dd->gatindex; + bCGs = dd->comm->bCGs; + + if (zone2cg[1] != dd->ncg_home) + { + gmx_incons("dd->ncg_zone is not up to date"); + } + + /* Make the local to global and global to local atom index */ + a = dd->cgindex[cg_start]; + for (zone = 0; zone < nzone; zone++) + { + if (zone == 0) + { + cg0 = cg_start; + } + else + { + cg0 = zone2cg[zone]; + } + cg1 = zone2cg[zone+1]; + cg1_p1 = cg0 + zone_ncg1[zone]; + + for (cg = cg0; cg < cg1; cg++) + { + zone1 = zone; + if (cg >= cg1_p1) + { + /* Signal that this cg is from more than one pulse away */ + zone1 += nzone; + } + cg_gl = index_gl[cg]; + if (bCGs) + { + for (a_gl = gcgs_index[cg_gl]; a_gl < gcgs_index[cg_gl+1]; a_gl++) + { + gatindex[a] = a_gl; + ga2la_set(dd->ga2la, a_gl, a, zone1); + a++; + } + } + else + { + gatindex[a] = cg_gl; + ga2la_set(dd->ga2la, cg_gl, a, zone1); + a++; + } + } + } +} + +static int check_bLocalCG(gmx_domdec_t *dd, int ncg_sys, const char *bLocalCG, + const char *where) +{ + int ncg, i, ngl, nerr; + + nerr = 0; + if (bLocalCG == NULL) + { + return nerr; + } + for (i = 0; i < dd->ncg_tot; i++) + { + if (!bLocalCG[dd->index_gl[i]]) + { + fprintf(stderr, + "DD node %d, %s: cg %d, global cg %d is not marked in bLocalCG (ncg_home %d)\n", dd->rank, where, i+1, dd->index_gl[i]+1, dd->ncg_home); + nerr++; + } + } + ngl = 0; + for (i = 0; i < ncg_sys; i++) + { + if (bLocalCG[i]) + { + ngl++; + } + } + if (ngl != dd->ncg_tot) + { + fprintf(stderr, "DD node %d, %s: In bLocalCG %d cgs are marked as local, whereas there are %d\n", dd->rank, where, ngl, dd->ncg_tot); + nerr++; + } + + return nerr; +} + +static void check_index_consistency(gmx_domdec_t *dd, + int natoms_sys, int ncg_sys, + const char *where) +{ + int nerr, ngl, i, a, cell; + int *have; + + nerr = 0; + + if (dd->comm->DD_debug > 1) + { + snew(have, natoms_sys); + for (a = 0; a < dd->nat_tot; a++) + { + if (have[dd->gatindex[a]] > 0) + { + fprintf(stderr, "DD node %d: global atom %d occurs twice: index %d and %d\n", dd->rank, 
dd->gatindex[a]+1, have[dd->gatindex[a]], a+1); + } + else + { + have[dd->gatindex[a]] = a + 1; + } + } + sfree(have); + } + + snew(have, dd->nat_tot); + + ngl = 0; + for (i = 0; i < natoms_sys; i++) + { + if (ga2la_get(dd->ga2la, i, &a, &cell)) + { + if (a >= dd->nat_tot) + { + fprintf(stderr, "DD node %d: global atom %d marked as local atom %d, which is larger than nat_tot (%d)\n", dd->rank, i+1, a+1, dd->nat_tot); + nerr++; + } + else + { + have[a] = 1; + if (dd->gatindex[a] != i) + { + fprintf(stderr, "DD node %d: global atom %d marked as local atom %d, which has global atom index %d\n", dd->rank, i+1, a+1, dd->gatindex[a]+1); + nerr++; + } + } + ngl++; + } + } + if (ngl != dd->nat_tot) + { + fprintf(stderr, + "DD node %d, %s: %d global atom indices, %d local atoms\n", + dd->rank, where, ngl, dd->nat_tot); + } + for (a = 0; a < dd->nat_tot; a++) + { + if (have[a] == 0) + { + fprintf(stderr, + "DD node %d, %s: local atom %d, global %d has no global index\n", + dd->rank, where, a+1, dd->gatindex[a]+1); + } + } + sfree(have); + + nerr += check_bLocalCG(dd, ncg_sys, dd->comm->bLocalCG, where); + + if (nerr > 0) + { + gmx_fatal(FARGS, "DD node %d, %s: %d atom/cg index inconsistencies", + dd->rank, where, nerr); + } +} + +static void clear_dd_indices(gmx_domdec_t *dd, int cg_start, int a_start) +{ + int i; + char *bLocalCG; + + if (a_start == 0) + { + /* Clear the whole list without searching */ + ga2la_clear(dd->ga2la); + } + else + { + for (i = a_start; i < dd->nat_tot; i++) + { + ga2la_del(dd->ga2la, dd->gatindex[i]); + } + } + + bLocalCG = dd->comm->bLocalCG; + if (bLocalCG) + { + for (i = cg_start; i < dd->ncg_tot; i++) + { + bLocalCG[dd->index_gl[i]] = FALSE; + } + } + + dd_clear_local_vsite_indices(dd); + + if (dd->constraints) + { + dd_clear_local_constraint_indices(dd); + } +} + +/* This function should be used for moving the domain boudaries during DLB, + * for obtaining the minimum cell size. It checks the initially set limit + * comm->cellsize_min, for bonded and initial non-bonded cut-offs, + * and, possibly, a longer cut-off limit set for PME load balancing. + */ +static real cellsize_min_dlb(gmx_domdec_comm_t *comm, int dim_ind, int dim) +{ + real cellsize_min; + + cellsize_min = comm->cellsize_min[dim]; + + if (!comm->bVacDLBNoLimit) + { + /* The cut-off might have changed, e.g. by PME load balacning, + * from the value used to set comm->cellsize_min, so check it. + */ + cellsize_min = max(cellsize_min, comm->cutoff/comm->cd[dim_ind].np_dlb); + + if (comm->bPMELoadBalDLBLimits) + { + /* Check for the cut-off limit set by the PME load balancing */ + cellsize_min = max(cellsize_min, comm->PMELoadBal_max_cutoff/comm->cd[dim_ind].np_dlb); + } + } + + return cellsize_min; +} + +static real grid_jump_limit(gmx_domdec_comm_t *comm, real cutoff, + int dim_ind) +{ + real grid_jump_limit; + + /* The distance between the boundaries of cells at distance + * x+-1,y+-1 or y+-1,z+-1 is limited by the cut-off restrictions + * and by the fact that cells should not be shifted by more than + * half their size, such that cg's only shift by one cell + * at redecomposition. 
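
Restated compactly, the bound described above is the larger of the hard minimum cell size and the communication distance divided by the number of pulses, which is what grid_jump_limit computes next for the PBC-limited case. A sketch with illustrative names:

    /* Staggering between neighbouring cell rows may not exceed what one
     * communication pulse covers, nor undercut the minimum cell size. */
    double jump_limit(double cellsize_limit, double cutoff, int npulse)
    {
        double per_pulse = cutoff/npulse;

        return (per_pulse > cellsize_limit) ? per_pulse : cellsize_limit;
    }
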
+ */ + grid_jump_limit = comm->cellsize_limit; + if (!comm->bVacDLBNoLimit) + { + if (comm->bPMELoadBalDLBLimits) + { + cutoff = max(cutoff, comm->PMELoadBal_max_cutoff); + } + grid_jump_limit = max(grid_jump_limit, + cutoff/comm->cd[dim_ind].np); + } + + return grid_jump_limit; +} + +static gmx_bool check_grid_jump(gmx_large_int_t step, + gmx_domdec_t *dd, + real cutoff, + gmx_ddbox_t *ddbox, + gmx_bool bFatal) +{ + gmx_domdec_comm_t *comm; + int d, dim; + real limit, bfac; + gmx_bool bInvalid; + + bInvalid = FALSE; + + comm = dd->comm; + + for (d = 1; d < dd->ndim; d++) + { + dim = dd->dim[d]; + limit = grid_jump_limit(comm, cutoff, d); + bfac = ddbox->box_size[dim]; + if (ddbox->tric_dir[dim]) + { + bfac *= ddbox->skew_fac[dim]; + } + if ((comm->cell_f1[d] - comm->cell_f_max0[d])*bfac < limit || + (comm->cell_f0[d] - comm->cell_f_min1[d])*bfac > -limit) + { + bInvalid = TRUE; + + if (bFatal) + { + char buf[22]; + + /* This error should never be triggered under normal + * circumstances, but you never know ... + */ + gmx_fatal(FARGS, "Step %s: The domain decomposition grid has shifted too much in the %c-direction around cell %d %d %d. This should not have happened. Running with less nodes might avoid this issue.", + gmx_step_str(step, buf), + dim2char(dim), dd->ci[XX], dd->ci[YY], dd->ci[ZZ]); + } + } + } + + return bInvalid; +} + +static int dd_load_count(gmx_domdec_comm_t *comm) +{ + return (comm->eFlop ? comm->flop_n : comm->cycl_n[ddCyclF]); +} + +static float dd_force_load(gmx_domdec_comm_t *comm) +{ + float load; + + if (comm->eFlop) + { + load = comm->flop; + if (comm->eFlop > 1) + { + load *= 1.0 + (comm->eFlop - 1)*(0.1*rand()/RAND_MAX - 0.05); + } + } + else + { + load = comm->cycl[ddCyclF]; + if (comm->cycl_n[ddCyclF] > 1) + { + /* Subtract the maximum of the last n cycle counts + * to get rid of possible high counts due to other soures, + * for instance system activity, that would otherwise + * affect the dynamic load balancing. + */ + load -= comm->cycl_max[ddCyclF]; + } + } + + return load; +} + +static void set_slb_pme_dim_f(gmx_domdec_t *dd, int dim, real **dim_f) +{ + gmx_domdec_comm_t *comm; + int i; + + comm = dd->comm; + + snew(*dim_f, dd->nc[dim]+1); + (*dim_f)[0] = 0; + for (i = 1; i < dd->nc[dim]; i++) + { + if (comm->slb_frac[dim]) + { + (*dim_f)[i] = (*dim_f)[i-1] + comm->slb_frac[dim][i-1]; + } + else + { + (*dim_f)[i] = (real)i/(real)dd->nc[dim]; + } + } + (*dim_f)[dd->nc[dim]] = 1; +} + +static void init_ddpme(gmx_domdec_t *dd, gmx_ddpme_t *ddpme, int dimind) +{ + int pmeindex, slab, nso, i; + ivec xyz; + + if (dimind == 0 && dd->dim[0] == YY && dd->comm->npmenodes_x == 1) + { + ddpme->dim = YY; + } + else + { + ddpme->dim = dimind; + } + ddpme->dim_match = (ddpme->dim == dd->dim[dimind]); + + ddpme->nslab = (ddpme->dim == 0 ? + dd->comm->npmenodes_x : + dd->comm->npmenodes_y); + + if (ddpme->nslab <= 1) + { + return; + } + + nso = dd->comm->npmenodes/ddpme->nslab; + /* Determine for each PME slab the PP location range for dimension dim */ + snew(ddpme->pp_min, ddpme->nslab); + snew(ddpme->pp_max, ddpme->nslab); + for (slab = 0; slab < ddpme->nslab; slab++) + { + ddpme->pp_min[slab] = dd->nc[dd->dim[dimind]] - 1; + ddpme->pp_max[slab] = 0; + } + for (i = 0; i < dd->nnodes; i++) + { + ddindex2xyz(dd->nc, i, xyz); + /* For y only use our y/z slab. + * This assumes that the PME x grid size matches the DD grid size. 
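
The boundary construction used by set_slb_pme_dim_f above turns n per-cell fractions into n+1 cumulative boundaries in [0,1], falling back to a uniform grid when no fractions were given. A standalone sketch with made-up fractions:

    #include <stdio.h>

    /* frac may be NULL (uniform grid); bound must hold n+1 values. */
    static void cell_boundaries(const double *frac, int n, double *bound)
    {
        int i;

        bound[0] = 0;
        for (i = 1; i < n; i++)
        {
            bound[i] = frac ? bound[i-1] + frac[i-1] : (double)i/n;
        }
        bound[n] = 1;
    }

    int main(void)
    {
        double frac[4] = { 0.4, 0.3, 0.2, 0.1 };
        double b[5];
        int    i;

        cell_boundaries(frac, 4, b);
        for (i = 0; i <= 4; i++)
        {
            printf("%.2f ", b[i]); /* prints: 0.00 0.40 0.70 0.90 1.00 */
        }
        printf("\n");
        return 0;
    }
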
+ */ + if (dimind == 0 || xyz[XX] == dd->ci[XX]) + { + pmeindex = ddindex2pmeindex(dd, i); + if (dimind == 0) + { + slab = pmeindex/nso; + } + else + { + slab = pmeindex % ddpme->nslab; + } + ddpme->pp_min[slab] = min(ddpme->pp_min[slab], xyz[dimind]); + ddpme->pp_max[slab] = max(ddpme->pp_max[slab], xyz[dimind]); + } + } + + set_slb_pme_dim_f(dd, ddpme->dim, &ddpme->slb_dim_f); +} + +int dd_pme_maxshift_x(gmx_domdec_t *dd) +{ + if (dd->comm->ddpme[0].dim == XX) + { + return dd->comm->ddpme[0].maxshift; + } + else + { + return 0; + } +} + +int dd_pme_maxshift_y(gmx_domdec_t *dd) +{ + if (dd->comm->ddpme[0].dim == YY) + { + return dd->comm->ddpme[0].maxshift; + } + else if (dd->comm->npmedecompdim >= 2 && dd->comm->ddpme[1].dim == YY) + { + return dd->comm->ddpme[1].maxshift; + } + else + { + return 0; + } +} + +static void set_pme_maxshift(gmx_domdec_t *dd, gmx_ddpme_t *ddpme, + gmx_bool bUniform, gmx_ddbox_t *ddbox, real *cell_f) +{ + gmx_domdec_comm_t *comm; + int nc, ns, s; + int *xmin, *xmax; + real range, pme_boundary; + int sh; + + comm = dd->comm; + nc = dd->nc[ddpme->dim]; + ns = ddpme->nslab; + + if (!ddpme->dim_match) + { + /* PP decomposition is not along dim: the worst situation */ + sh = ns/2; + } + else if (ns <= 3 || (bUniform && ns == nc)) + { + /* The optimal situation */ + sh = 1; + } + else + { + /* We need to check for all pme nodes which nodes they + * could possibly need to communicate with. + */ + xmin = ddpme->pp_min; + xmax = ddpme->pp_max; + /* Allow for atoms to be maximally 2/3 times the cut-off + * out of their DD cell. This is a reasonable balance between + * between performance and support for most charge-group/cut-off + * combinations. + */ + range = 2.0/3.0*comm->cutoff/ddbox->box_size[ddpme->dim]; + /* Avoid extra communication when we are exactly at a boundary */ + range *= 0.999; + + sh = 1; + for (s = 0; s < ns; s++) + { + /* PME slab s spreads atoms between box frac. 
s/ns and (s+1)/ns */ + pme_boundary = (real)s/ns; + while (sh+1 < ns && + ((s-(sh+1) >= 0 && + cell_f[xmax[s-(sh+1) ]+1] + range > pme_boundary) || + (s-(sh+1) < 0 && + cell_f[xmax[s-(sh+1)+ns]+1] - 1 + range > pme_boundary))) + { + sh++; + } + pme_boundary = (real)(s+1)/ns; + while (sh+1 < ns && + ((s+(sh+1) < ns && + cell_f[xmin[s+(sh+1) ] ] - range < pme_boundary) || + (s+(sh+1) >= ns && + cell_f[xmin[s+(sh+1)-ns] ] + 1 - range < pme_boundary))) + { + sh++; + } + } + } + + ddpme->maxshift = sh; + + if (debug) + { + fprintf(debug, "PME slab communication range for dim %d is %d\n", + ddpme->dim, ddpme->maxshift); + } +} + +static void check_box_size(gmx_domdec_t *dd, gmx_ddbox_t *ddbox) +{ + int d, dim; + + for (d = 0; d < dd->ndim; d++) + { + dim = dd->dim[d]; + if (dim < ddbox->nboundeddim && + ddbox->box_size[dim]*ddbox->skew_fac[dim] < + dd->nc[dim]*dd->comm->cellsize_limit*DD_CELL_MARGIN) + { + gmx_fatal(FARGS, "The %c-size of the box (%f) times the triclinic skew factor (%f) is smaller than the number of DD cells (%d) times the smallest allowed cell size (%f)\n", + dim2char(dim), ddbox->box_size[dim], ddbox->skew_fac[dim], + dd->nc[dim], dd->comm->cellsize_limit); + } + } +} + +static void set_dd_cell_sizes_slb(gmx_domdec_t *dd, gmx_ddbox_t *ddbox, + gmx_bool bMaster, ivec npulse) +{ + gmx_domdec_comm_t *comm; + int d, j; + rvec cellsize_min; + real *cell_x, cell_dx, cellsize; + + comm = dd->comm; + + for (d = 0; d < DIM; d++) + { + cellsize_min[d] = ddbox->box_size[d]*ddbox->skew_fac[d]; + npulse[d] = 1; + if (dd->nc[d] == 1 || comm->slb_frac[d] == NULL) + { + /* Uniform grid */ + cell_dx = ddbox->box_size[d]/dd->nc[d]; + if (bMaster) + { + for (j = 0; j < dd->nc[d]+1; j++) + { + dd->ma->cell_x[d][j] = ddbox->box0[d] + j*cell_dx; + } + } + else + { + comm->cell_x0[d] = ddbox->box0[d] + (dd->ci[d] )*cell_dx; + comm->cell_x1[d] = ddbox->box0[d] + (dd->ci[d]+1)*cell_dx; + } + cellsize = cell_dx*ddbox->skew_fac[d]; + while (cellsize*npulse[d] < comm->cutoff && npulse[d] < dd->nc[d]-1) + { + npulse[d]++; + } + cellsize_min[d] = cellsize; + } + else + { + /* Statically load balanced grid */ + /* Also when we are not doing a master distribution we determine + * all cell borders in a loop to obtain identical values + * to the master distribution case and to determine npulse. + */ + if (bMaster) + { + cell_x = dd->ma->cell_x[d]; + } + else + { + snew(cell_x, dd->nc[d]+1); + } + cell_x[0] = ddbox->box0[d]; + for (j = 0; j < dd->nc[d]; j++) + { + cell_dx = ddbox->box_size[d]*comm->slb_frac[d][j]; + cell_x[j+1] = cell_x[j] + cell_dx; + cellsize = cell_dx*ddbox->skew_fac[d]; + while (cellsize*npulse[d] < comm->cutoff && + npulse[d] < dd->nc[d]-1) + { + npulse[d]++; + } + cellsize_min[d] = min(cellsize_min[d], cellsize); + } + if (!bMaster) + { + comm->cell_x0[d] = cell_x[dd->ci[d]]; + comm->cell_x1[d] = cell_x[dd->ci[d]+1]; + sfree(cell_x); + } + } + /* The following limitation is to avoid that a cell would receive + * some of its own home charge groups back over the periodic boundary. + * Double charge groups cause trouble with the global indices. + */ + if (d < ddbox->npbcdim && + dd->nc[d] > 1 && npulse[d] >= dd->nc[d]) + { + gmx_fatal_collective(FARGS, NULL, dd, + "The box size in direction %c (%f) times the triclinic skew factor (%f) is too small for a cut-off of %f with %d domain decomposition cells, use 1 or more than %d %s or increase the box size in this direction", + dim2char(d), ddbox->box_size[d], ddbox->skew_fac[d], + comm->cutoff, + dd->nc[d], dd->nc[d], + dd->nnodes > dd->nc[d] ? 
"cells" : "processors"); + } + } + + if (!comm->bDynLoadBal) + { + copy_rvec(cellsize_min, comm->cellsize_min); + } + + for (d = 0; d < comm->npmedecompdim; d++) + { + set_pme_maxshift(dd, &comm->ddpme[d], + comm->slb_frac[dd->dim[d]] == NULL, ddbox, + comm->ddpme[d].slb_dim_f); + } +} + + +static void dd_cell_sizes_dlb_root_enforce_limits(gmx_domdec_t *dd, + int d, int dim, gmx_domdec_root_t *root, + gmx_ddbox_t *ddbox, + gmx_bool bUniform, gmx_large_int_t step, real cellsize_limit_f, int range[]) +{ + gmx_domdec_comm_t *comm; + int ncd, i, j, nmin, nmin_old; + gmx_bool bLimLo, bLimHi; + real *cell_size; + real fac, halfway, cellsize_limit_f_i, region_size; + gmx_bool bPBC, bLastHi = FALSE; + int nrange[] = {range[0], range[1]}; + + region_size = root->cell_f[range[1]]-root->cell_f[range[0]]; + + comm = dd->comm; + + ncd = dd->nc[dim]; + + bPBC = (dim < ddbox->npbcdim); + + cell_size = root->buf_ncd; + + if (debug) + { + fprintf(debug, "enforce_limits: %d %d\n", range[0], range[1]); + } + + /* First we need to check if the scaling does not make cells + * smaller than the smallest allowed size. + * We need to do this iteratively, since if a cell is too small, + * it needs to be enlarged, which makes all the other cells smaller, + * which could in turn make another cell smaller than allowed. + */ + for (i = range[0]; i < range[1]; i++) + { + root->bCellMin[i] = FALSE; + } + nmin = 0; + do + { + nmin_old = nmin; + /* We need the total for normalization */ + fac = 0; + for (i = range[0]; i < range[1]; i++) + { + if (root->bCellMin[i] == FALSE) + { + fac += cell_size[i]; + } + } + fac = ( region_size - nmin*cellsize_limit_f)/fac; /* substracting cells already set to cellsize_limit_f */ + /* Determine the cell boundaries */ + for (i = range[0]; i < range[1]; i++) + { + if (root->bCellMin[i] == FALSE) + { + cell_size[i] *= fac; + if (!bPBC && (i == 0 || i == dd->nc[dim] -1)) + { + cellsize_limit_f_i = 0; + } + else + { + cellsize_limit_f_i = cellsize_limit_f; + } + if (cell_size[i] < cellsize_limit_f_i) + { + root->bCellMin[i] = TRUE; + cell_size[i] = cellsize_limit_f_i; + nmin++; + } + } + root->cell_f[i+1] = root->cell_f[i] + cell_size[i]; + } + } + while (nmin > nmin_old); + + i = range[1]-1; + cell_size[i] = root->cell_f[i+1] - root->cell_f[i]; + /* For this check we should not use DD_CELL_MARGIN, + * but a slightly smaller factor, + * since rounding could get use below the limit. + */ + if (bPBC && cell_size[i] < cellsize_limit_f*DD_CELL_MARGIN2/DD_CELL_MARGIN) + { + char buf[22]; + gmx_fatal(FARGS, "Step %s: the dynamic load balancing could not balance dimension %c: box size %f, triclinic skew factor %f, #cells %d, minimum cell size %f\n", + gmx_step_str(step, buf), + dim2char(dim), ddbox->box_size[dim], ddbox->skew_fac[dim], + ncd, comm->cellsize_min[dim]); + } + + root->bLimited = (nmin > 0) || (range[0] > 0) || (range[1] < ncd); + + if (!bUniform) + { + /* Check if the boundary did not displace more than halfway + * each of the cells it bounds, as this could cause problems, + * especially when the differences between cell sizes are large. + * If changes are applied, they will not make cells smaller + * than the cut-off, as we check all the boundaries which + * might be affected by a change and if the old state was ok, + * the cells will at most be shrunk back to their old size. 
+ */ + for (i = range[0]+1; i < range[1]; i++) + { + halfway = 0.5*(root->old_cell_f[i] + root->old_cell_f[i-1]); + if (root->cell_f[i] < halfway) + { + root->cell_f[i] = halfway; + /* Check if the change also causes shifts of the next boundaries */ + for (j = i+1; j < range[1]; j++) + { + if (root->cell_f[j] < root->cell_f[j-1] + cellsize_limit_f) + { + root->cell_f[j] = root->cell_f[j-1] + cellsize_limit_f; + } + } + } + halfway = 0.5*(root->old_cell_f[i] + root->old_cell_f[i+1]); + if (root->cell_f[i] > halfway) + { + root->cell_f[i] = halfway; + /* Check if the change also causes shifts of the next boundaries */ + for (j = i-1; j >= range[0]+1; j--) + { + if (root->cell_f[j] > root->cell_f[j+1] - cellsize_limit_f) + { + root->cell_f[j] = root->cell_f[j+1] - cellsize_limit_f; + } + } + } + } + } + + /* nrange is defined as [lower, upper) range for new call to enforce_limits */ + /* find highest violation of LimLo (a) and the following violation of LimHi (thus the lowest following) (b) + * then call enforce_limits for (oldb,a), (a,b). In the next step: (b,nexta). oldb and nexta can be the boundaries. + * for a and b nrange is used */ + if (d > 0) + { + /* Take care of the staggering of the cell boundaries */ + if (bUniform) + { + for (i = range[0]; i < range[1]; i++) + { + root->cell_f_max0[i] = root->cell_f[i]; + root->cell_f_min1[i] = root->cell_f[i+1]; + } + } + else + { + for (i = range[0]+1; i < range[1]; i++) + { + bLimLo = (root->cell_f[i] < root->bound_min[i]); + bLimHi = (root->cell_f[i] > root->bound_max[i]); + if (bLimLo && bLimHi) + { + /* Both limits violated, try the best we can */ + /* For this case we split the original range (range) in two parts and care about the other limitiations in the next iteration. */ + root->cell_f[i] = 0.5*(root->bound_min[i] + root->bound_max[i]); + nrange[0] = range[0]; + nrange[1] = i; + dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange); + + nrange[0] = i; + nrange[1] = range[1]; + dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange); + + return; + } + else if (bLimLo) + { + /* root->cell_f[i] = root->bound_min[i]; */ + nrange[1] = i; /* only store violation location. 
There could be a LimLo violation following with an higher index */ + bLastHi = FALSE; + } + else if (bLimHi && !bLastHi) + { + bLastHi = TRUE; + if (nrange[1] < range[1]) /* found a LimLo before */ + { + root->cell_f[nrange[1]] = root->bound_min[nrange[1]]; + dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange); + nrange[0] = nrange[1]; + } + root->cell_f[i] = root->bound_max[i]; + nrange[1] = i; + dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange); + nrange[0] = i; + nrange[1] = range[1]; + } + } + if (nrange[1] < range[1]) /* found last a LimLo */ + { + root->cell_f[nrange[1]] = root->bound_min[nrange[1]]; + dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange); + nrange[0] = nrange[1]; + nrange[1] = range[1]; + dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange); + } + else if (nrange[0] > range[0]) /* found at least one LimHi */ + { + dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange); + } + } + } +} + + +static void set_dd_cell_sizes_dlb_root(gmx_domdec_t *dd, + int d, int dim, gmx_domdec_root_t *root, + gmx_ddbox_t *ddbox, gmx_bool bDynamicBox, + gmx_bool bUniform, gmx_large_int_t step) +{ + gmx_domdec_comm_t *comm; + int ncd, d1, i, j, pos; + real *cell_size; + real load_aver, load_i, imbalance, change, change_max, sc; + real cellsize_limit_f, dist_min_f, dist_min_f_hard, space; + real change_limit; + real relax = 0.5; + gmx_bool bPBC; + int range[] = { 0, 0 }; + + comm = dd->comm; + + /* Convert the maximum change from the input percentage to a fraction */ + change_limit = comm->dlb_scale_lim*0.01; + + ncd = dd->nc[dim]; + + bPBC = (dim < ddbox->npbcdim); + + cell_size = root->buf_ncd; + + /* Store the original boundaries */ + for (i = 0; i < ncd+1; i++) + { + root->old_cell_f[i] = root->cell_f[i]; + } + if (bUniform) + { + for (i = 0; i < ncd; i++) + { + cell_size[i] = 1.0/ncd; + } + } + else if (dd_load_count(comm)) + { + load_aver = comm->load[d].sum_m/ncd; + change_max = 0; + for (i = 0; i < ncd; i++) + { + /* Determine the relative imbalance of cell i */ + load_i = comm->load[d].load[i*comm->load[d].nload+2]; + imbalance = (load_i - load_aver)/(load_aver > 0 ? load_aver : 1); + /* Determine the change of the cell size using underrelaxation */ + change = -relax*imbalance; + change_max = max(change_max, max(change, -change)); + } + /* Limit the amount of scaling. + * We need to use the same rescaling for all cells in one row, + * otherwise the load balancing might not converge. + */ + sc = relax; + if (change_max > change_limit) + { + sc *= change_limit/change_max; + } + for (i = 0; i < ncd; i++) + { + /* Determine the relative imbalance of cell i */ + load_i = comm->load[d].load[i*comm->load[d].nload+2]; + imbalance = (load_i - load_aver)/(load_aver > 0 ? 
load_aver : 1); + /* Determine the change of the cell size using underrelaxation */ + change = -sc*imbalance; + cell_size[i] = (root->cell_f[i+1]-root->cell_f[i])*(1 + change); + } + } + + cellsize_limit_f = cellsize_min_dlb(comm, d, dim)/ddbox->box_size[dim]; + cellsize_limit_f *= DD_CELL_MARGIN; + dist_min_f_hard = grid_jump_limit(comm, comm->cutoff, d)/ddbox->box_size[dim]; + dist_min_f = dist_min_f_hard * DD_CELL_MARGIN; + if (ddbox->tric_dir[dim]) + { + cellsize_limit_f /= ddbox->skew_fac[dim]; + dist_min_f /= ddbox->skew_fac[dim]; + } + if (bDynamicBox && d > 0) + { + dist_min_f *= DD_PRES_SCALE_MARGIN; + } + if (d > 0 && !bUniform) + { + /* Make sure that the grid is not shifted too much */ + for (i = 1; i < ncd; i++) + { + if (root->cell_f_min1[i] - root->cell_f_max0[i-1] < 2 * dist_min_f_hard) + { + gmx_incons("Inconsistent DD boundary staggering limits!"); + } + root->bound_min[i] = root->cell_f_max0[i-1] + dist_min_f; + space = root->cell_f[i] - (root->cell_f_max0[i-1] + dist_min_f); + if (space > 0) + { + root->bound_min[i] += 0.5*space; + } + root->bound_max[i] = root->cell_f_min1[i] - dist_min_f; + space = root->cell_f[i] - (root->cell_f_min1[i] - dist_min_f); + if (space < 0) + { + root->bound_max[i] += 0.5*space; + } + if (debug) + { + fprintf(debug, + "dim %d boundary %d %.3f < %.3f < %.3f < %.3f < %.3f\n", + d, i, + root->cell_f_max0[i-1] + dist_min_f, + root->bound_min[i], root->cell_f[i], root->bound_max[i], + root->cell_f_min1[i] - dist_min_f); + } + } + } + range[1] = ncd; + root->cell_f[0] = 0; + root->cell_f[ncd] = 1; + dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, range); + + + /* After the checks above, the cells should obey the cut-off + * restrictions, but it does not hurt to check. + */ + for (i = 0; i < ncd; i++) + { + if (debug) + { + fprintf(debug, "Relative bounds dim %d cell %d: %f %f\n", + dim, i, root->cell_f[i], root->cell_f[i+1]); + } + + if ((bPBC || (i != 0 && i != dd->nc[dim]-1)) && + root->cell_f[i+1] - root->cell_f[i] < + cellsize_limit_f/DD_CELL_MARGIN) + { + char buf[22]; + fprintf(stderr, + "\nWARNING step %s: direction %c, cell %d too small: %f\n", + gmx_step_str(step, buf), dim2char(dim), i, + (root->cell_f[i+1] - root->cell_f[i]) + *ddbox->box_size[dim]*ddbox->skew_fac[dim]); + } + } + + pos = ncd + 1; + /* Store the cell boundaries of the lower dimensions at the end */ + for (d1 = 0; d1 < d; d1++) + { + root->cell_f[pos++] = comm->cell_f0[d1]; + root->cell_f[pos++] = comm->cell_f1[d1]; + } + + if (d < comm->npmedecompdim) + { + /* The master determines the maximum shift for + * the coordinate communication between separate PME nodes. 
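
The resizing logic in set_dd_cell_sizes_dlb_root above reduces to: scale each cell by (1 - relax*imbalance), with one common damping for the whole row so the balancing converges, capped so no cell changes by more than the configured limit. A standalone sketch; the function and parameter names are illustrative:

    /* Underrelaxed rebalance: load[] are measured per-cell loads,
     * old_size[] the current relative sizes, cap the max fractional
     * change per step (e.g. 0.1 for 10%). */
    void rebalance(const double *load, const double *old_size,
                   double *new_size, int n, double relax, double cap)
    {
        double aver = 0, change_max = 0;
        int    i;

        for (i = 0; i < n; i++)
        {
            aver += load[i];
        }
        aver /= n;
        for (i = 0; i < n; i++)
        {
            double imb    = (load[i] - aver)/(aver > 0 ? aver : 1);
            double change = -relax*imb;

            if (change > change_max)
            {
                change_max = change;
            }
            if (-change > change_max)
            {
                change_max = -change;
            }
        }
        /* One common rescaling for the whole row, capped at 'cap' */
        if (change_max > cap)
        {
            relax *= cap/change_max;
        }
        for (i = 0; i < n; i++)
        {
            double imb = (load[i] - aver)/(aver > 0 ? aver : 1);

            new_size[i] = old_size[i]*(1 - relax*imb);
        }
    }
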
+ */ + set_pme_maxshift(dd, &comm->ddpme[d], bUniform, ddbox, root->cell_f); + } + root->cell_f[pos++] = comm->ddpme[0].maxshift; + if (d >= 1) + { + root->cell_f[pos++] = comm->ddpme[1].maxshift; + } +} + +static void relative_to_absolute_cell_bounds(gmx_domdec_t *dd, + gmx_ddbox_t *ddbox, int dimind) +{ + gmx_domdec_comm_t *comm; + int dim; + + comm = dd->comm; + + /* Set the cell dimensions */ + dim = dd->dim[dimind]; + comm->cell_x0[dim] = comm->cell_f0[dimind]*ddbox->box_size[dim]; + comm->cell_x1[dim] = comm->cell_f1[dimind]*ddbox->box_size[dim]; + if (dim >= ddbox->nboundeddim) + { + comm->cell_x0[dim] += ddbox->box0[dim]; + comm->cell_x1[dim] += ddbox->box0[dim]; + } +} + +static void distribute_dd_cell_sizes_dlb(gmx_domdec_t *dd, + int d, int dim, real *cell_f_row, + gmx_ddbox_t *ddbox) +{ + gmx_domdec_comm_t *comm; + int d1, dim1, pos; + + comm = dd->comm; + +#ifdef GMX_MPI + /* Each node would only need to know two fractions, + * but it is probably cheaper to broadcast the whole array. + */ + MPI_Bcast(cell_f_row, DD_CELL_F_SIZE(dd, d)*sizeof(real), MPI_BYTE, + 0, comm->mpi_comm_load[d]); +#endif + /* Copy the fractions for this dimension from the buffer */ + comm->cell_f0[d] = cell_f_row[dd->ci[dim] ]; + comm->cell_f1[d] = cell_f_row[dd->ci[dim]+1]; + /* The whole array was communicated, so set the buffer position */ + pos = dd->nc[dim] + 1; + for (d1 = 0; d1 <= d; d1++) + { + if (d1 < d) + { + /* Copy the cell fractions of the lower dimensions */ + comm->cell_f0[d1] = cell_f_row[pos++]; + comm->cell_f1[d1] = cell_f_row[pos++]; + } + relative_to_absolute_cell_bounds(dd, ddbox, d1); + } + /* Convert the communicated shift from float to int */ + comm->ddpme[0].maxshift = (int)(cell_f_row[pos++] + 0.5); + if (d >= 1) + { + comm->ddpme[1].maxshift = (int)(cell_f_row[pos++] + 0.5); + } +} + +static void set_dd_cell_sizes_dlb_change(gmx_domdec_t *dd, + gmx_ddbox_t *ddbox, gmx_bool bDynamicBox, + gmx_bool bUniform, gmx_large_int_t step) +{ + gmx_domdec_comm_t *comm; + int d, dim, d1; + gmx_bool bRowMember, bRowRoot; + real *cell_f_row; + + comm = dd->comm; + + for (d = 0; d < dd->ndim; d++) + { + dim = dd->dim[d]; + bRowMember = TRUE; + bRowRoot = TRUE; + for (d1 = d; d1 < dd->ndim; d1++) + { + if (dd->ci[dd->dim[d1]] > 0) + { + if (d1 > d) + { + bRowMember = FALSE; + } + bRowRoot = FALSE; + } + } + if (bRowMember) + { + if (bRowRoot) + { + set_dd_cell_sizes_dlb_root(dd, d, dim, comm->root[d], + ddbox, bDynamicBox, bUniform, step); + cell_f_row = comm->root[d]->cell_f; + } + else + { + cell_f_row = comm->cell_f_row; + } + distribute_dd_cell_sizes_dlb(dd, d, dim, cell_f_row, ddbox); + } + } +} + +static void set_dd_cell_sizes_dlb_nochange(gmx_domdec_t *dd, gmx_ddbox_t *ddbox) +{ + int d; + + /* This function assumes the box is static and should therefore + * not be called when the box has changed since the last + * call to dd_partition_system. 
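
distribute_dd_cell_sizes_dlb above uses the one-root-broadcasts-all pattern: the row root computes the full cell_f array and every rank picks out its own two fractions after the broadcast. A minimal MPI sketch of the same pattern; the names, sizes, and the use of MPI_DOUBLE instead of a raw byte count are assumptions of this sketch:

    #include <mpi.h>
    #include <stdio.h>

    /* Root (rank 0 of row_comm) holds ncells+1 boundaries in cell_f;
     * after the broadcast every rank reads its own pair. */
    static void share_cell_f(MPI_Comm row_comm, double *cell_f, int ncells,
                             int my_cell, double *f0, double *f1)
    {
        MPI_Bcast(cell_f, ncells + 1, MPI_DOUBLE, 0, row_comm);
        *f0 = cell_f[my_cell];
        *f1 = cell_f[my_cell + 1];
    }

    int main(int argc, char **argv)
    {
        double cell_f[5] = { 0, 0.25, 0.5, 0.75, 1.0 };
        double f0, f1;
        int    rank;

        MPI_Init(&argc, &argv);
        MPI_Comm_rank(MPI_COMM_WORLD, &rank);
        /* Pretend the whole world is one row and rank maps to a cell */
        share_cell_f(MPI_COMM_WORLD, cell_f, 4, rank % 4, &f0, &f1);
        printf("rank %d: cell [%g, %g]\n", rank, f0, f1);
        MPI_Finalize();
        return 0;
    }

As the comment in the code itself notes, each rank only needs two fractions, but broadcasting the whole short array is simpler and usually no more expensive than individual sends.
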
+ */ + for (d = 0; d < dd->ndim; d++) + { + relative_to_absolute_cell_bounds(dd, ddbox, d); + } +} + + + +static void set_dd_cell_sizes_dlb(gmx_domdec_t *dd, + gmx_ddbox_t *ddbox, gmx_bool bDynamicBox, + gmx_bool bUniform, gmx_bool bDoDLB, gmx_large_int_t step, + gmx_wallcycle_t wcycle) +{ + gmx_domdec_comm_t *comm; + int dim; + + comm = dd->comm; + + if (bDoDLB) + { + wallcycle_start(wcycle, ewcDDCOMMBOUND); + set_dd_cell_sizes_dlb_change(dd, ddbox, bDynamicBox, bUniform, step); + wallcycle_stop(wcycle, ewcDDCOMMBOUND); + } + else if (bDynamicBox) + { + set_dd_cell_sizes_dlb_nochange(dd, ddbox); + } + + /* Set the dimensions for which no DD is used */ + for (dim = 0; dim < DIM; dim++) + { + if (dd->nc[dim] == 1) + { + comm->cell_x0[dim] = 0; + comm->cell_x1[dim] = ddbox->box_size[dim]; + if (dim >= ddbox->nboundeddim) + { + comm->cell_x0[dim] += ddbox->box0[dim]; + comm->cell_x1[dim] += ddbox->box0[dim]; + } + } + } +} + +static void realloc_comm_ind(gmx_domdec_t *dd, ivec npulse) +{ + int d, np, i; + gmx_domdec_comm_dim_t *cd; + + for (d = 0; d < dd->ndim; d++) + { + cd = &dd->comm->cd[d]; + np = npulse[dd->dim[d]]; + if (np > cd->np_nalloc) + { + if (debug) + { + fprintf(debug, "(Re)allocing cd for %c to %d pulses\n", + dim2char(dd->dim[d]), np); + } + if (DDMASTER(dd) && cd->np_nalloc > 0) + { + fprintf(stderr, "\nIncreasing the number of cell to communicate in dimension %c to %d for the first time\n", dim2char(dd->dim[d]), np); + } + srenew(cd->ind, np); + for (i = cd->np_nalloc; i < np; i++) + { + cd->ind[i].index = NULL; + cd->ind[i].nalloc = 0; + } + cd->np_nalloc = np; + } + cd->np = np; + } +} + + +static void set_dd_cell_sizes(gmx_domdec_t *dd, + gmx_ddbox_t *ddbox, gmx_bool bDynamicBox, + gmx_bool bUniform, gmx_bool bDoDLB, gmx_large_int_t step, + gmx_wallcycle_t wcycle) +{ + gmx_domdec_comm_t *comm; + int d; + ivec npulse; + + comm = dd->comm; + + /* Copy the old cell boundaries for the cg displacement check */ + copy_rvec(comm->cell_x0, comm->old_cell_x0); + copy_rvec(comm->cell_x1, comm->old_cell_x1); + + if (comm->bDynLoadBal) + { + if (DDMASTER(dd)) + { + check_box_size(dd, ddbox); + } + set_dd_cell_sizes_dlb(dd, ddbox, bDynamicBox, bUniform, bDoDLB, step, wcycle); + } + else + { + set_dd_cell_sizes_slb(dd, ddbox, FALSE, npulse); + realloc_comm_ind(dd, npulse); + } + + if (debug) + { + for (d = 0; d < DIM; d++) + { + fprintf(debug, "cell_x[%d] %f - %f skew_fac %f\n", + d, comm->cell_x0[d], comm->cell_x1[d], ddbox->skew_fac[d]); + } + } +} + +static void comm_dd_ns_cell_sizes(gmx_domdec_t *dd, + gmx_ddbox_t *ddbox, + rvec cell_ns_x0, rvec cell_ns_x1, + gmx_large_int_t step) +{ + gmx_domdec_comm_t *comm; + int dim_ind, dim; + + comm = dd->comm; + + for (dim_ind = 0; dim_ind < dd->ndim; dim_ind++) + { + dim = dd->dim[dim_ind]; + + /* Without PBC we don't have restrictions on the outer cells */ + if (!(dim >= ddbox->npbcdim && + (dd->ci[dim] == 0 || dd->ci[dim] == dd->nc[dim] - 1)) && + comm->bDynLoadBal && + (comm->cell_x1[dim] - comm->cell_x0[dim])*ddbox->skew_fac[dim] < + comm->cellsize_min[dim]) + { + char buf[22]; + gmx_fatal(FARGS, "Step %s: The %c-size (%f) times the triclinic skew factor (%f) is smaller than the smallest allowed cell size (%f) for domain decomposition grid cell %d %d %d", + gmx_step_str(step, buf), dim2char(dim), + comm->cell_x1[dim] - comm->cell_x0[dim], + ddbox->skew_fac[dim], + dd->comm->cellsize_min[dim], + dd->ci[XX], dd->ci[YY], dd->ci[ZZ]); + } + } + + if ((dd->bGridJump && dd->ndim > 1) || ddbox->nboundeddim < DIM) + { + /* Communicate the 
boundaries and update cell_ns_x0/1 */ + dd_move_cellx(dd, ddbox, cell_ns_x0, cell_ns_x1); + if (dd->bGridJump && dd->ndim > 1) + { + check_grid_jump(step, dd, dd->comm->cutoff, ddbox, TRUE); + } + } +} + +static void make_tric_corr_matrix(int npbcdim, matrix box, matrix tcm) +{ + if (YY < npbcdim) + { + tcm[YY][XX] = -box[YY][XX]/box[YY][YY]; + } + else + { + tcm[YY][XX] = 0; + } + if (ZZ < npbcdim) + { + tcm[ZZ][XX] = -(box[ZZ][YY]*tcm[YY][XX] + box[ZZ][XX])/box[ZZ][ZZ]; + tcm[ZZ][YY] = -box[ZZ][YY]/box[ZZ][ZZ]; + } + else + { + tcm[ZZ][XX] = 0; + tcm[ZZ][YY] = 0; + } +} + +static void check_screw_box(matrix box) +{ + /* Mathematical limitation */ + if (box[YY][XX] != 0 || box[ZZ][XX] != 0) + { + gmx_fatal(FARGS, "With screw pbc the unit cell can not have non-zero off-diagonal x-components"); + } + + /* Limitation due to the asymmetry of the eighth shell method */ + if (box[ZZ][YY] != 0) + { + gmx_fatal(FARGS, "pbc=screw with non-zero box_zy is not supported"); + } +} + +static void distribute_cg(FILE *fplog, gmx_large_int_t step, + matrix box, ivec tric_dir, t_block *cgs, rvec pos[], + gmx_domdec_t *dd) +{ + gmx_domdec_master_t *ma; + int **tmp_ind = NULL, *tmp_nalloc = NULL; + int i, icg, j, k, k0, k1, d, npbcdim; + matrix tcm; + rvec box_size, cg_cm; + ivec ind; + real nrcg, inv_ncg, pos_d; + atom_id *cgindex; + gmx_bool bUnbounded, bScrew; + + ma = dd->ma; + + if (tmp_ind == NULL) + { + snew(tmp_nalloc, dd->nnodes); + snew(tmp_ind, dd->nnodes); + for (i = 0; i < dd->nnodes; i++) + { + tmp_nalloc[i] = over_alloc_large(cgs->nr/dd->nnodes+1); + snew(tmp_ind[i], tmp_nalloc[i]); + } + } + + /* Clear the count */ + for (i = 0; i < dd->nnodes; i++) + { + ma->ncg[i] = 0; + ma->nat[i] = 0; + } + + make_tric_corr_matrix(dd->npbcdim, box, tcm); + + cgindex = cgs->index; + + /* Compute the center of geometry for all charge groups */ + for (icg = 0; icg < cgs->nr; icg++) + { + k0 = cgindex[icg]; + k1 = cgindex[icg+1]; + nrcg = k1 - k0; + if (nrcg == 1) + { + copy_rvec(pos[k0], cg_cm); + } + else + { + inv_ncg = 1.0/nrcg; + + clear_rvec(cg_cm); + for (k = k0; (k < k1); k++) + { + rvec_inc(cg_cm, pos[k]); + } + for (d = 0; (d < DIM); d++) + { + cg_cm[d] *= inv_ncg; + } + } + /* Put the charge group in the box and determine the cell index */ + for (d = DIM-1; d >= 0; d--) + { + pos_d = cg_cm[d]; + if (d < dd->npbcdim) + { + bScrew = (dd->bScrewPBC && d == XX); + if (tric_dir[d] && dd->nc[d] > 1) + { + /* Use triclinic coordintates for this dimension */ + for (j = d+1; j < DIM; j++) + { + pos_d += cg_cm[j]*tcm[j][d]; + } + } + while (pos_d >= box[d][d]) + { + pos_d -= box[d][d]; + rvec_dec(cg_cm, box[d]); + if (bScrew) + { + cg_cm[YY] = box[YY][YY] - cg_cm[YY]; + cg_cm[ZZ] = box[ZZ][ZZ] - cg_cm[ZZ]; + } + for (k = k0; (k < k1); k++) + { + rvec_dec(pos[k], box[d]); + if (bScrew) + { + pos[k][YY] = box[YY][YY] - pos[k][YY]; + pos[k][ZZ] = box[ZZ][ZZ] - pos[k][ZZ]; + } + } + } + while (pos_d < 0) + { + pos_d += box[d][d]; + rvec_inc(cg_cm, box[d]); + if (bScrew) + { + cg_cm[YY] = box[YY][YY] - cg_cm[YY]; + cg_cm[ZZ] = box[ZZ][ZZ] - cg_cm[ZZ]; + } + for (k = k0; (k < k1); k++) + { + rvec_inc(pos[k], box[d]); + if (bScrew) + { + pos[k][YY] = box[YY][YY] - pos[k][YY]; + pos[k][ZZ] = box[ZZ][ZZ] - pos[k][ZZ]; + } + } + } + } + /* This could be done more efficiently */ + ind[d] = 0; + while (ind[d]+1 < dd->nc[d] && pos_d >= ma->cell_x[d][ind[d]+1]) + { + ind[d]++; + } + } + i = dd_index(dd->nc, ind); + if (ma->ncg[i] == tmp_nalloc[i]) + { + tmp_nalloc[i] = over_alloc_large(ma->ncg[i]+1); + srenew(tmp_ind[i], 
tmp_nalloc[i]); + } + tmp_ind[i][ma->ncg[i]] = icg; + ma->ncg[i]++; + ma->nat[i] += cgindex[icg+1] - cgindex[icg]; + } + + k1 = 0; + for (i = 0; i < dd->nnodes; i++) + { + ma->index[i] = k1; + for (k = 0; k < ma->ncg[i]; k++) + { + ma->cg[k1++] = tmp_ind[i][k]; + } + } + ma->index[dd->nnodes] = k1; + + for (i = 0; i < dd->nnodes; i++) + { + sfree(tmp_ind[i]); + } + sfree(tmp_ind); + sfree(tmp_nalloc); + + if (fplog) + { + char buf[22]; + fprintf(fplog, "Charge group distribution at step %s:", + gmx_step_str(step, buf)); + for (i = 0; i < dd->nnodes; i++) + { + fprintf(fplog, " %d", ma->ncg[i]); + } + fprintf(fplog, "\n"); + } +} + +static void get_cg_distribution(FILE *fplog, gmx_large_int_t step, gmx_domdec_t *dd, + t_block *cgs, matrix box, gmx_ddbox_t *ddbox, + rvec pos[]) +{ + gmx_domdec_master_t *ma = NULL; + ivec npulse; + int i, cg_gl; + int *ibuf, buf2[2] = { 0, 0 }; + gmx_bool bMaster = DDMASTER(dd); + if (bMaster) + { + ma = dd->ma; + + if (dd->bScrewPBC) + { + check_screw_box(box); + } + + set_dd_cell_sizes_slb(dd, ddbox, TRUE, npulse); + + distribute_cg(fplog, step, box, ddbox->tric_dir, cgs, pos, dd); + for (i = 0; i < dd->nnodes; i++) + { + ma->ibuf[2*i] = ma->ncg[i]; + ma->ibuf[2*i+1] = ma->nat[i]; + } + ibuf = ma->ibuf; + } + else + { + ibuf = NULL; + } + dd_scatter(dd, 2*sizeof(int), ibuf, buf2); + + dd->ncg_home = buf2[0]; + dd->nat_home = buf2[1]; + dd->ncg_tot = dd->ncg_home; + dd->nat_tot = dd->nat_home; + if (dd->ncg_home > dd->cg_nalloc || dd->cg_nalloc == 0) + { + dd->cg_nalloc = over_alloc_dd(dd->ncg_home); + srenew(dd->index_gl, dd->cg_nalloc); + srenew(dd->cgindex, dd->cg_nalloc+1); + } + if (bMaster) + { + for (i = 0; i < dd->nnodes; i++) + { + ma->ibuf[i] = ma->ncg[i]*sizeof(int); + ma->ibuf[dd->nnodes+i] = ma->index[i]*sizeof(int); + } + } + + dd_scatterv(dd, + DDMASTER(dd) ? ma->ibuf : NULL, + DDMASTER(dd) ? ma->ibuf+dd->nnodes : NULL, + DDMASTER(dd) ? 
ma->cg : NULL, + dd->ncg_home*sizeof(int), dd->index_gl); + + /* Determine the home charge group sizes */ + dd->cgindex[0] = 0; + for (i = 0; i < dd->ncg_home; i++) + { + cg_gl = dd->index_gl[i]; + dd->cgindex[i+1] = + dd->cgindex[i] + cgs->index[cg_gl+1] - cgs->index[cg_gl]; + } + + if (debug) + { + fprintf(debug, "Home charge groups:\n"); + for (i = 0; i < dd->ncg_home; i++) + { + fprintf(debug, " %d", dd->index_gl[i]); + if (i % 10 == 9) + { + fprintf(debug, "\n"); + } + } + fprintf(debug, "\n"); + } +} + +static int compact_and_copy_vec_at(int ncg, int *move, + int *cgindex, + int nvec, int vec, + rvec *src, gmx_domdec_comm_t *comm, + gmx_bool bCompact) +{ + int m, icg, i, i0, i1, nrcg; + int home_pos; + int pos_vec[DIM*2]; + + home_pos = 0; + + for (m = 0; m < DIM*2; m++) + { + pos_vec[m] = 0; + } + + i0 = 0; + for (icg = 0; icg < ncg; icg++) + { + i1 = cgindex[icg+1]; + m = move[icg]; + if (m == -1) + { + if (bCompact) + { + /* Compact the home array in place */ + for (i = i0; i < i1; i++) + { + copy_rvec(src[i], src[home_pos++]); + } + } + } + else + { + /* Copy to the communication buffer */ + nrcg = i1 - i0; + pos_vec[m] += 1 + vec*nrcg; + for (i = i0; i < i1; i++) + { + copy_rvec(src[i], comm->cgcm_state[m][pos_vec[m]++]); + } + pos_vec[m] += (nvec - vec - 1)*nrcg; + } + if (!bCompact) + { + home_pos += i1 - i0; + } + i0 = i1; + } + + return home_pos; +} + +static int compact_and_copy_vec_cg(int ncg, int *move, + int *cgindex, + int nvec, rvec *src, gmx_domdec_comm_t *comm, + gmx_bool bCompact) +{ + int m, icg, i0, i1, nrcg; + int home_pos; + int pos_vec[DIM*2]; + + home_pos = 0; + + for (m = 0; m < DIM*2; m++) + { + pos_vec[m] = 0; + } + + i0 = 0; + for (icg = 0; icg < ncg; icg++) + { + i1 = cgindex[icg+1]; + m = move[icg]; + if (m == -1) + { + if (bCompact) + { + /* Compact the home array in place */ + copy_rvec(src[icg], src[home_pos++]); + } + } + else + { + nrcg = i1 - i0; + /* Copy to the communication buffer */ + copy_rvec(src[icg], comm->cgcm_state[m][pos_vec[m]]); + pos_vec[m] += 1 + nrcg*nvec; + } + i0 = i1; + } + if (!bCompact) + { + home_pos = ncg; + } + + return home_pos; +} + +static int compact_ind(int ncg, int *move, + int *index_gl, int *cgindex, + int *gatindex, + gmx_ga2la_t ga2la, char *bLocalCG, + int *cginfo) +{ + int cg, nat, a0, a1, a, a_gl; + int home_pos; + + home_pos = 0; + nat = 0; + for (cg = 0; cg < ncg; cg++) + { + a0 = cgindex[cg]; + a1 = cgindex[cg+1]; + if (move[cg] == -1) + { + /* Compact the home arrays in place. + * Anything that can be done here avoids access to global arrays. 
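
compact_and_copy_vec_cg above packs the charge groups that stay home to the front of the array in place, while everything else is copied into per-direction send buffers. The home-compaction half in isolation; the send-buffer half is omitted and the names are illustrative:

    /* Keep entries whose move flag is -1, packed to the front of v.
     * Returns the new home count. */
    int compact_home(double (*v)[3], const int *move, int n)
    {
        int i, home = 0;

        for (i = 0; i < n; i++)
        {
            if (move[i] == -1)
            {
                v[home][0] = v[i][0];
                v[home][1] = v[i][1];
                v[home][2] = v[i][2];
                home++;
            }
        }
        return home;
    }
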
+ */ + cgindex[home_pos] = nat; + for (a = a0; a < a1; a++) + { + a_gl = gatindex[a]; + gatindex[nat] = a_gl; + /* The cell number stays 0, so we don't need to set it */ + ga2la_change_la(ga2la, a_gl, nat); + nat++; + } + index_gl[home_pos] = index_gl[cg]; + cginfo[home_pos] = cginfo[cg]; + /* The charge group remains local, so bLocalCG does not change */ + home_pos++; + } + else + { + /* Clear the global indices */ + for (a = a0; a < a1; a++) + { + ga2la_del(ga2la, gatindex[a]); + } + if (bLocalCG) + { + bLocalCG[index_gl[cg]] = FALSE; + } + } + } + cgindex[home_pos] = nat; + + return home_pos; +} + +static void clear_and_mark_ind(int ncg, int *move, + int *index_gl, int *cgindex, int *gatindex, + gmx_ga2la_t ga2la, char *bLocalCG, + int *cell_index) +{ + int cg, a0, a1, a; + + for (cg = 0; cg < ncg; cg++) + { + if (move[cg] >= 0) + { + a0 = cgindex[cg]; + a1 = cgindex[cg+1]; + /* Clear the global indices */ + for (a = a0; a < a1; a++) + { + ga2la_del(ga2la, gatindex[a]); + } + if (bLocalCG) + { + bLocalCG[index_gl[cg]] = FALSE; + } + /* Signal that this cg has moved using the ns cell index. + * Here we set it to -1. fill_grid will change it + * from -1 to NSGRID_SIGNAL_MOVED_FAC*grid->ncells. + */ + cell_index[cg] = -1; + } + } +} + +static void print_cg_move(FILE *fplog, + gmx_domdec_t *dd, + gmx_large_int_t step, int cg, int dim, int dir, + gmx_bool bHaveLimitdAndCMOld, real limitd, + rvec cm_old, rvec cm_new, real pos_d) +{ + gmx_domdec_comm_t *comm; + char buf[22]; + + comm = dd->comm; + + fprintf(fplog, "\nStep %s:\n", gmx_step_str(step, buf)); + if (bHaveLimitdAndCMOld) + { + fprintf(fplog, "The charge group starting at atom %d moved more than the distance allowed by the domain decomposition (%f) in direction %c\n", + ddglatnr(dd, dd->cgindex[cg]), limitd, dim2char(dim)); + } + else + { + fprintf(fplog, "The charge group starting at atom %d moved than the distance allowed by the domain decomposition in direction %c\n", + ddglatnr(dd, dd->cgindex[cg]), dim2char(dim)); + } + fprintf(fplog, "distance out of cell %f\n", + dir == 1 ? 
            pos_d - comm->cell_x1[dim] : pos_d - comm->cell_x0[dim]);
+    if (bHaveLimitdAndCMOld)
+    {
+        fprintf(fplog, "Old coordinates: %8.3f %8.3f %8.3f\n",
+                cm_old[XX], cm_old[YY], cm_old[ZZ]);
+    }
+    fprintf(fplog, "New coordinates: %8.3f %8.3f %8.3f\n",
+            cm_new[XX], cm_new[YY], cm_new[ZZ]);
+    fprintf(fplog, "Old cell boundaries in direction %c: %8.3f %8.3f\n",
+            dim2char(dim),
+            comm->old_cell_x0[dim], comm->old_cell_x1[dim]);
+    fprintf(fplog, "New cell boundaries in direction %c: %8.3f %8.3f\n",
+            dim2char(dim),
+            comm->cell_x0[dim], comm->cell_x1[dim]);
+}
+
+static void cg_move_error(FILE *fplog,
+                          gmx_domdec_t *dd,
+                          gmx_large_int_t step, int cg, int dim, int dir,
+                          gmx_bool bHaveLimitdAndCMOld, real limitd,
+                          rvec cm_old, rvec cm_new, real pos_d)
+{
+    if (fplog)
+    {
+        print_cg_move(fplog, dd, step, cg, dim, dir,
+                      bHaveLimitdAndCMOld, limitd, cm_old, cm_new, pos_d);
+    }
+    print_cg_move(stderr, dd, step, cg, dim, dir,
+                  bHaveLimitdAndCMOld, limitd, cm_old, cm_new, pos_d);
+    gmx_fatal(FARGS,
+              "A charge group moved too far between two domain decomposition steps\n"
+              "This usually means that your system is not well equilibrated");
+}
+
+static void rotate_state_atom(t_state *state, int a)
+{
+    int est;
+
+    for (est = 0; est < estNR; est++)
+    {
+        if (EST_DISTR(est) && (state->flags & (1<<est)))
+        {
+            switch (est)
+            {
+                case estX:
+                    state->x[a][YY] = state->box[YY][YY] - state->x[a][YY];
+                    state->x[a][ZZ] = state->box[ZZ][ZZ] - state->x[a][ZZ];
+                    break;
+                case estV:
+                    state->v[a][YY] = -state->v[a][YY];
+                    state->v[a][ZZ] = -state->v[a][ZZ];
+                    break;
+                case estSDX:
+                    state->sd_X[a][YY] = -state->sd_X[a][YY];
+                    state->sd_X[a][ZZ] = -state->sd_X[a][ZZ];
+                    break;
+                case estCGP:
+                    state->cg_p[a][YY] = -state->cg_p[a][YY];
+                    state->cg_p[a][ZZ] = -state->cg_p[a][ZZ];
+                    break;
+                case estDISRE_INITF:
+                case estDISRE_RM3TAV:
+                case estORIRE_INITF:
+                case estORIRE_DTAV:
+                    /* These are distances, so not affected by rotation */
+                    break;
+                default:
+                    gmx_incons("Unknown state entry encountered in rotate_state_atom");
+            }
+        }
+    }
+}
+
+static int *get_moved(gmx_domdec_comm_t *comm, int natoms)
+{
+    if (natoms > comm->moved_nalloc)
+    {
+        /* Contents should be preserved here */
+        comm->moved_nalloc = over_alloc_dd(natoms);
+        srenew(comm->moved, comm->moved_nalloc);
+    }
+
+    return comm->moved;
+}
+
+static void calc_cg_move(FILE *fplog, gmx_large_int_t step,
+                         gmx_domdec_t *dd,
+                         t_state *state,
+                         ivec tric_dir, matrix tcm,
+                         rvec cell_x0, rvec cell_x1,
+                         rvec limitd, rvec limit0, rvec limit1,
+                         const int *cgindex,
+                         int cg_start, int cg_end,
+                         rvec *cg_cm,
+                         int *move)
+{
+    int      npbcdim;
+    int      c, i, cg, k, k0, k1, d, dim, dim2, dir, d2, d3, d4, cell_d;
+    int      mc, cdd, nrcg, ncg_recv, nat_recv, nvs, nvr, nvec, vec;
+    int      flag;
+    gmx_bool bScrew;
+    ivec     dev;
+    real     inv_ncg, pos_d;
+    rvec     cm_new;
+
+    npbcdim = dd->npbcdim;
+
+    for (cg = cg_start; cg < cg_end; cg++)
+    {
+        k0   = cgindex[cg];
+        k1   = cgindex[cg+1];
+        nrcg = k1 - k0;
+        if (nrcg == 1)
+        {
+            copy_rvec(state->x[k0], cm_new);
+        }
+        else
+        {
+            inv_ncg = 1.0/nrcg;
+
+            clear_rvec(cm_new);
+            for (k = k0; (k < k1); k++)
+            {
+                rvec_inc(cm_new, state->x[k]);
+            }
+            for (d = 0; (d < DIM); d++)
+            {
+                cm_new[d] = inv_ncg*cm_new[d];
+            }
+        }
+
+        clear_ivec(dev);
+        /* Do pbc and check DD cell boundary crossings */
+        for (d = DIM-1; d >= 0; d--)
+        {
+            if (dd->nc[d] > 1)
+            {
+                bScrew = (dd->bScrewPBC && d == XX);
+                /* Determine the location of this cg in lattice coordinates */
+                pos_d = cm_new[d];
+                if (tric_dir[d])
+                {
+                    for (d2 = d+1; d2 < DIM; d2++)
+                    {
+                        pos_d += cm_new[d2]*tcm[d2][d];
+                    }
+                }
+                /* Put the charge group in the triclinic unit-cell */
+static void calc_cg_move(FILE *fplog, gmx_large_int_t step,
+                         gmx_domdec_t *dd,
+                         t_state *state,
+                         ivec tric_dir, matrix tcm,
+                         rvec cell_x0, rvec cell_x1,
+                         rvec limitd, rvec limit0, rvec limit1,
+                         const int *cgindex,
+                         int cg_start, int cg_end,
+                         rvec *cg_cm,
+                         int *move)
+{
+    int      npbcdim;
+    int      c, i, cg, k, k0, k1, d, dim, dim2, dir, d2, d3, d4, cell_d;
+    int      mc, cdd, nrcg, ncg_recv, nat_recv, nvs, nvr, nvec, vec;
+    int      flag;
+    gmx_bool bScrew;
+    ivec     dev;
+    real     inv_ncg, pos_d;
+    rvec     cm_new;
+
+    npbcdim = dd->npbcdim;
+
+    for (cg = cg_start; cg < cg_end; cg++)
+    {
+        k0   = cgindex[cg];
+        k1   = cgindex[cg+1];
+        nrcg = k1 - k0;
+        if (nrcg == 1)
+        {
+            copy_rvec(state->x[k0], cm_new);
+        }
+        else
+        {
+            inv_ncg = 1.0/nrcg;
+
+            clear_rvec(cm_new);
+            for (k = k0; (k < k1); k++)
+            {
+                rvec_inc(cm_new, state->x[k]);
+            }
+            for (d = 0; (d < DIM); d++)
+            {
+                cm_new[d] = inv_ncg*cm_new[d];
+            }
+        }
+
+        clear_ivec(dev);
+        /* Do pbc and check DD cell boundary crossings */
+        for (d = DIM-1; d >= 0; d--)
+        {
+            if (dd->nc[d] > 1)
+            {
+                bScrew = (dd->bScrewPBC && d == XX);
+                /* Determine the location of this cg in lattice coordinates */
+                pos_d = cm_new[d];
+                if (tric_dir[d])
+                {
+                    for (d2 = d+1; d2 < DIM; d2++)
+                    {
+                        pos_d += cm_new[d2]*tcm[d2][d];
+                    }
+                }
+                /* Put the charge group in the triclinic unit-cell */
+                if (pos_d >= cell_x1[d])
+                {
+                    if (pos_d >= limit1[d])
+                    {
+                        cg_move_error(fplog, dd, step, cg, d, 1, TRUE, limitd[d],
+                                      cg_cm[cg], cm_new, pos_d);
+                    }
+                    dev[d] = 1;
+                    if (dd->ci[d] == dd->nc[d] - 1)
+                    {
+                        rvec_dec(cm_new, state->box[d]);
+                        if (bScrew)
+                        {
+                            cm_new[YY] = state->box[YY][YY] - cm_new[YY];
+                            cm_new[ZZ] = state->box[ZZ][ZZ] - cm_new[ZZ];
+                        }
+                        for (k = k0; (k < k1); k++)
+                        {
+                            rvec_dec(state->x[k], state->box[d]);
+                            if (bScrew)
+                            {
+                                rotate_state_atom(state, k);
+                            }
+                        }
+                    }
+                }
+                else if (pos_d < cell_x0[d])
+                {
+                    if (pos_d < limit0[d])
+                    {
+                        cg_move_error(fplog, dd, step, cg, d, -1, TRUE, limitd[d],
+                                      cg_cm[cg], cm_new, pos_d);
+                    }
+                    dev[d] = -1;
+                    if (dd->ci[d] == 0)
+                    {
+                        rvec_inc(cm_new, state->box[d]);
+                        if (bScrew)
+                        {
+                            cm_new[YY] = state->box[YY][YY] - cm_new[YY];
+                            cm_new[ZZ] = state->box[ZZ][ZZ] - cm_new[ZZ];
+                        }
+                        for (k = k0; (k < k1); k++)
+                        {
+                            rvec_inc(state->x[k], state->box[d]);
+                            if (bScrew)
+                            {
+                                rotate_state_atom(state, k);
+                            }
+                        }
+                    }
+                }
+            }
+            else if (d < npbcdim)
+            {
+                /* Put the charge group in the rectangular unit-cell */
+                while (cm_new[d] >= state->box[d][d])
+                {
+                    rvec_dec(cm_new, state->box[d]);
+                    for (k = k0; (k < k1); k++)
+                    {
+                        rvec_dec(state->x[k], state->box[d]);
+                    }
+                }
+                while (cm_new[d] < 0)
+                {
+                    rvec_inc(cm_new, state->box[d]);
+                    for (k = k0; (k < k1); k++)
+                    {
+                        rvec_inc(state->x[k], state->box[d]);
+                    }
+                }
+            }
+        }
+
+        copy_rvec(cm_new, cg_cm[cg]);
+
+        /* Determine where this cg should go */
+        flag = 0;
+        mc   = -1;
+        for (d = 0; d < dd->ndim; d++)
+        {
+            dim = dd->dim[d];
+            if (dev[dim] == 1)
+            {
+                flag |= DD_FLAG_FW(d);
+                if (mc == -1)
+                {
+                    mc = d*2;
+                }
+            }
+            else if (dev[dim] == -1)
+            {
+                flag |= DD_FLAG_BW(d);
+                if (mc == -1)
+                {
+                    if (dd->nc[dim] > 2)
+                    {
+                        mc = d*2 + 1;
+                    }
+                    else
+                    {
+                        mc = d*2;
+                    }
+                }
+            }
+        }
+        /* Temporarily store the flag in move */
+        move[cg] = mc + flag;
+    }
+}
+
+static void dd_redistribute_cg(FILE *fplog, gmx_large_int_t step,
+                               gmx_domdec_t *dd, ivec tric_dir,
+                               t_state *state, rvec **f,
+                               t_forcerec *fr, t_mdatoms *md,
+                               gmx_bool bCompact,
+                               t_nrnb *nrnb,
+                               int *ncg_stay_home,
+                               int *ncg_moved)
+{
+    int                *move;
+    int                 npbcdim;
+    int                 ncg[DIM*2], nat[DIM*2];
+    int                 c, i, cg, k, k0, k1, d, dim, dim2, dir, d2, d3, d4, cell_d;
+    int                 mc, cdd, nrcg, ncg_recv, nat_recv, nvs, nvr, nvec, vec;
+    int                 sbuf[2], rbuf[2];
+    int                 home_pos_cg, home_pos_at, buf_pos;
+    int                 flag;
+    gmx_bool            bV = FALSE, bSDX = FALSE, bCGP = FALSE;
+    gmx_bool            bScrew;
+    ivec                dev;
+    real                inv_ncg, pos_d;
+    matrix              tcm;
+    rvec               *cg_cm = NULL, cell_x0, cell_x1, limitd, limit0, limit1, cm_new;
+    atom_id            *cgindex;
+    cginfo_mb_t        *cginfo_mb;
+    gmx_domdec_comm_t  *comm;
+    int                *moved;
+    int                 nthread, thread;
+
+    if (dd->bScrewPBC)
+    {
+        check_screw_box(state->box);
+    }
+
+    comm = dd->comm;
+    if (fr->cutoff_scheme == ecutsGROUP)
+    {
+        cg_cm = fr->cg_cm;
+    }
+
+    for (i = 0; i < estNR; i++)
+    {
+        if (EST_DISTR(i))
+        {
+            switch (i)
+            {
+                case estX: /* Always present */ break;
+                case estV:   bV   = (state->flags & (1<<i)); break;
+                case estSDX: bSDX = (state->flags & (1<<i)); break;
+                case estCGP: bCGP = (state->flags & (1<<i)); break;
+                case estLD_RNG:
+                case estLD_RNGI:
+                case estDISRE_INITF:
+                case estDISRE_RM3TAV:
+                case estORIRE_INITF:
+                case estORIRE_DTAV:
+                    /* No processing required */
+                    break;
+                default:
+                    gmx_incons("Unknown state entry encountered in dd_redistribute_cg");
+            }
+        }
+    }
+
+    if (dd->ncg_tot > comm->nalloc_int)
+    {
+        comm->nalloc_int = over_alloc_dd(dd->ncg_tot);
+        srenew(comm->buf_int, comm->nalloc_int);
+    }
+    move = comm->buf_int;
+
+    /* Clear the count */
+    for (c = 0; c < dd->ndim*2; c++)
+    {
+        ncg[c] = 0;
+        nat[c] = 0;
+    }
+
+    npbcdim = dd->npbcdim;
+
+    for (d = 0; (d < DIM); d++)
+    {
+        limitd[d] = dd->comm->cellsize_min[d];
+        if (d >= npbcdim && dd->ci[d] == 0)
+        {
+            cell_x0[d] = -GMX_FLOAT_MAX;
+        }
+        else
+        {
+            cell_x0[d] = comm->cell_x0[d];
+        }
+        if (d >= npbcdim && dd->ci[d] ==
dd->nc[d] - 1) + { + cell_x1[d] = GMX_FLOAT_MAX; + } + else + { + cell_x1[d] = comm->cell_x1[d]; + } + if (d < npbcdim) + { + limit0[d] = comm->old_cell_x0[d] - limitd[d]; + limit1[d] = comm->old_cell_x1[d] + limitd[d]; + } + else + { + /* We check after communication if a charge group moved + * more than one cell. Set the pre-comm check limit to float_max. + */ + limit0[d] = -GMX_FLOAT_MAX; + limit1[d] = GMX_FLOAT_MAX; + } + } + + make_tric_corr_matrix(npbcdim, state->box, tcm); + + cgindex = dd->cgindex; + + nthread = gmx_omp_nthreads_get(emntDomdec); + + /* Compute the center of geometry for all home charge groups + * and put them in the box and determine where they should go. + */ +#pragma omp parallel for num_threads(nthread) schedule(static) + for (thread = 0; thread < nthread; thread++) + { + calc_cg_move(fplog, step, dd, state, tric_dir, tcm, + cell_x0, cell_x1, limitd, limit0, limit1, + cgindex, + ( thread *dd->ncg_home)/nthread, + ((thread+1)*dd->ncg_home)/nthread, + fr->cutoff_scheme == ecutsGROUP ? cg_cm : state->x, + move); + } + + for (cg = 0; cg < dd->ncg_home; cg++) + { + if (move[cg] >= 0) + { + mc = move[cg]; + flag = mc & ~DD_FLAG_NRCG; + mc = mc & DD_FLAG_NRCG; + move[cg] = mc; + + if (ncg[mc]+1 > comm->cggl_flag_nalloc[mc]) + { + comm->cggl_flag_nalloc[mc] = over_alloc_dd(ncg[mc]+1); + srenew(comm->cggl_flag[mc], comm->cggl_flag_nalloc[mc]*DD_CGIBS); + } + comm->cggl_flag[mc][ncg[mc]*DD_CGIBS ] = dd->index_gl[cg]; + /* We store the cg size in the lower 16 bits + * and the place where the charge group should go + * in the next 6 bits. This saves some communication volume. + */ + nrcg = cgindex[cg+1] - cgindex[cg]; + comm->cggl_flag[mc][ncg[mc]*DD_CGIBS+1] = nrcg | flag; + ncg[mc] += 1; + nat[mc] += nrcg; + } + } + + inc_nrnb(nrnb, eNR_CGCM, dd->nat_home); + inc_nrnb(nrnb, eNR_RESETX, dd->ncg_home); + + *ncg_moved = 0; + for (i = 0; i < dd->ndim*2; i++) + { + *ncg_moved += ncg[i]; + } + + nvec = 1; + if (bV) + { + nvec++; + } + if (bSDX) + { + nvec++; + } + if (bCGP) + { + nvec++; + } + + /* Make sure the communication buffers are large enough */ + for (mc = 0; mc < dd->ndim*2; mc++) + { + nvr = ncg[mc] + nat[mc]*nvec; + if (nvr > comm->cgcm_state_nalloc[mc]) + { + comm->cgcm_state_nalloc[mc] = over_alloc_dd(nvr); + srenew(comm->cgcm_state[mc], comm->cgcm_state_nalloc[mc]); + } + } + + switch (fr->cutoff_scheme) + { + case ecutsGROUP: + /* Recalculating cg_cm might be cheaper than communicating, + * but that could give rise to rounding issues. + */ + home_pos_cg = + compact_and_copy_vec_cg(dd->ncg_home, move, cgindex, + nvec, cg_cm, comm, bCompact); + break; + case ecutsVERLET: + /* Without charge groups we send the moved atom coordinates + * over twice. This is so the code below can be used without + * many conditionals for both for with and without charge groups. 
+ */ + home_pos_cg = + compact_and_copy_vec_cg(dd->ncg_home, move, cgindex, + nvec, state->x, comm, FALSE); + if (bCompact) + { + home_pos_cg -= *ncg_moved; + } + break; + default: + gmx_incons("unimplemented"); + home_pos_cg = 0; + } + + vec = 0; + home_pos_at = + compact_and_copy_vec_at(dd->ncg_home, move, cgindex, + nvec, vec++, state->x, comm, bCompact); + if (bV) + { + compact_and_copy_vec_at(dd->ncg_home, move, cgindex, + nvec, vec++, state->v, comm, bCompact); + } + if (bSDX) + { + compact_and_copy_vec_at(dd->ncg_home, move, cgindex, + nvec, vec++, state->sd_X, comm, bCompact); + } + if (bCGP) + { + compact_and_copy_vec_at(dd->ncg_home, move, cgindex, + nvec, vec++, state->cg_p, comm, bCompact); + } + + if (bCompact) + { + compact_ind(dd->ncg_home, move, + dd->index_gl, dd->cgindex, dd->gatindex, + dd->ga2la, comm->bLocalCG, + fr->cginfo); + } + else + { + if (fr->cutoff_scheme == ecutsVERLET) + { + moved = get_moved(comm, dd->ncg_home); + + for (k = 0; k < dd->ncg_home; k++) + { + moved[k] = 0; + } + } + else + { + moved = fr->ns.grid->cell_index; + } + + clear_and_mark_ind(dd->ncg_home, move, + dd->index_gl, dd->cgindex, dd->gatindex, + dd->ga2la, comm->bLocalCG, + moved); + } + + cginfo_mb = fr->cginfo_mb; + + *ncg_stay_home = home_pos_cg; + for (d = 0; d < dd->ndim; d++) + { + dim = dd->dim[d]; + ncg_recv = 0; + nat_recv = 0; + nvr = 0; + for (dir = 0; dir < (dd->nc[dim] == 2 ? 1 : 2); dir++) + { + cdd = d*2 + dir; + /* Communicate the cg and atom counts */ + sbuf[0] = ncg[cdd]; + sbuf[1] = nat[cdd]; + if (debug) + { + fprintf(debug, "Sending ddim %d dir %d: ncg %d nat %d\n", + d, dir, sbuf[0], sbuf[1]); + } + dd_sendrecv_int(dd, d, dir, sbuf, 2, rbuf, 2); + + if ((ncg_recv+rbuf[0])*DD_CGIBS > comm->nalloc_int) + { + comm->nalloc_int = over_alloc_dd((ncg_recv+rbuf[0])*DD_CGIBS); + srenew(comm->buf_int, comm->nalloc_int); + } + + /* Communicate the charge group indices, sizes and flags */ + dd_sendrecv_int(dd, d, dir, + comm->cggl_flag[cdd], sbuf[0]*DD_CGIBS, + comm->buf_int+ncg_recv*DD_CGIBS, rbuf[0]*DD_CGIBS); + + nvs = ncg[cdd] + nat[cdd]*nvec; + i = rbuf[0] + rbuf[1] *nvec; + vec_rvec_check_alloc(&comm->vbuf, nvr+i); + + /* Communicate cgcm and state */ + dd_sendrecv_rvec(dd, d, dir, + comm->cgcm_state[cdd], nvs, + comm->vbuf.v+nvr, i); + ncg_recv += rbuf[0]; + nat_recv += rbuf[1]; + nvr += i; + } + + /* Process the received charge groups */ + buf_pos = 0; + for (cg = 0; cg < ncg_recv; cg++) + { + flag = comm->buf_int[cg*DD_CGIBS+1]; + + if (dim >= npbcdim && dd->nc[dim] > 2) + { + /* No pbc in this dim and more than one domain boundary. + * We do a separate check if a charge group didn't move too far. + */ + if (((flag & DD_FLAG_FW(d)) && + comm->vbuf.v[buf_pos][dim] > cell_x1[dim]) || + ((flag & DD_FLAG_BW(d)) && + comm->vbuf.v[buf_pos][dim] < cell_x0[dim])) + { + cg_move_error(fplog, dd, step, cg, dim, + (flag & DD_FLAG_FW(d)) ? 1 : 0, + FALSE, 0, + comm->vbuf.v[buf_pos], + comm->vbuf.v[buf_pos], + comm->vbuf.v[buf_pos][dim]); + } + } + + mc = -1; + if (d < dd->ndim-1) + { + /* Check which direction this cg should go */ + for (d2 = d+1; (d2 < dd->ndim && mc == -1); d2++) + { + if (dd->bGridJump) + { + /* The cell boundaries for dimension d2 are not equal + * for each cell row of the lower dimension(s), + * therefore we might need to redetermine where + * this cg should go. + */ + dim2 = dd->dim[d2]; + /* If this cg crosses the box boundary in dimension d2 + * we can use the communicated flag, so we do not + * have to worry about pbc. 
+ */ + if (!((dd->ci[dim2] == dd->nc[dim2]-1 && + (flag & DD_FLAG_FW(d2))) || + (dd->ci[dim2] == 0 && + (flag & DD_FLAG_BW(d2))))) + { + /* Clear the two flags for this dimension */ + flag &= ~(DD_FLAG_FW(d2) | DD_FLAG_BW(d2)); + /* Determine the location of this cg + * in lattice coordinates + */ + pos_d = comm->vbuf.v[buf_pos][dim2]; + if (tric_dir[dim2]) + { + for (d3 = dim2+1; d3 < DIM; d3++) + { + pos_d += + comm->vbuf.v[buf_pos][d3]*tcm[d3][dim2]; + } + } + /* Check of we are not at the box edge. + * pbc is only handled in the first step above, + * but this check could move over pbc while + * the first step did not due to different rounding. + */ + if (pos_d >= cell_x1[dim2] && + dd->ci[dim2] != dd->nc[dim2]-1) + { + flag |= DD_FLAG_FW(d2); + } + else if (pos_d < cell_x0[dim2] && + dd->ci[dim2] != 0) + { + flag |= DD_FLAG_BW(d2); + } + comm->buf_int[cg*DD_CGIBS+1] = flag; + } + } + /* Set to which neighboring cell this cg should go */ + if (flag & DD_FLAG_FW(d2)) + { + mc = d2*2; + } + else if (flag & DD_FLAG_BW(d2)) + { + if (dd->nc[dd->dim[d2]] > 2) + { + mc = d2*2+1; + } + else + { + mc = d2*2; + } + } + } + } + + nrcg = flag & DD_FLAG_NRCG; + if (mc == -1) + { + if (home_pos_cg+1 > dd->cg_nalloc) + { + dd->cg_nalloc = over_alloc_dd(home_pos_cg+1); + srenew(dd->index_gl, dd->cg_nalloc); + srenew(dd->cgindex, dd->cg_nalloc+1); + } + /* Set the global charge group index and size */ + dd->index_gl[home_pos_cg] = comm->buf_int[cg*DD_CGIBS]; + dd->cgindex[home_pos_cg+1] = dd->cgindex[home_pos_cg] + nrcg; + /* Copy the state from the buffer */ + dd_check_alloc_ncg(fr, state, f, home_pos_cg+1); + if (fr->cutoff_scheme == ecutsGROUP) + { + cg_cm = fr->cg_cm; + copy_rvec(comm->vbuf.v[buf_pos], cg_cm[home_pos_cg]); + } + buf_pos++; + + /* Set the cginfo */ + fr->cginfo[home_pos_cg] = ddcginfo(cginfo_mb, + dd->index_gl[home_pos_cg]); + if (comm->bLocalCG) + { + comm->bLocalCG[dd->index_gl[home_pos_cg]] = TRUE; + } + + if (home_pos_at+nrcg > state->nalloc) + { + dd_realloc_state(state, f, home_pos_at+nrcg); + } + for (i = 0; i < nrcg; i++) + { + copy_rvec(comm->vbuf.v[buf_pos++], + state->x[home_pos_at+i]); + } + if (bV) + { + for (i = 0; i < nrcg; i++) + { + copy_rvec(comm->vbuf.v[buf_pos++], + state->v[home_pos_at+i]); + } + } + if (bSDX) + { + for (i = 0; i < nrcg; i++) + { + copy_rvec(comm->vbuf.v[buf_pos++], + state->sd_X[home_pos_at+i]); + } + } + if (bCGP) + { + for (i = 0; i < nrcg; i++) + { + copy_rvec(comm->vbuf.v[buf_pos++], + state->cg_p[home_pos_at+i]); + } + } + home_pos_cg += 1; + home_pos_at += nrcg; + } + else + { + /* Reallocate the buffers if necessary */ + if (ncg[mc]+1 > comm->cggl_flag_nalloc[mc]) + { + comm->cggl_flag_nalloc[mc] = over_alloc_dd(ncg[mc]+1); + srenew(comm->cggl_flag[mc], comm->cggl_flag_nalloc[mc]*DD_CGIBS); + } + nvr = ncg[mc] + nat[mc]*nvec; + if (nvr + 1 + nrcg*nvec > comm->cgcm_state_nalloc[mc]) + { + comm->cgcm_state_nalloc[mc] = over_alloc_dd(nvr + 1 + nrcg*nvec); + srenew(comm->cgcm_state[mc], comm->cgcm_state_nalloc[mc]); + } + /* Copy from the receive to the send buffers */ + memcpy(comm->cggl_flag[mc] + ncg[mc]*DD_CGIBS, + comm->buf_int + cg*DD_CGIBS, + DD_CGIBS*sizeof(int)); + memcpy(comm->cgcm_state[mc][nvr], + comm->vbuf.v[buf_pos], + (1+nrcg*nvec)*sizeof(rvec)); + buf_pos += 1 + nrcg*nvec; + ncg[mc] += 1; + nat[mc] += nrcg; + } + } + } + + /* With sorting (!bCompact) the indices are now only partially up to date + * and ncg_home and nat_home are not the real count, since there are + * "holes" in the arrays for the charge groups that moved to 
neighbors. + */ + if (fr->cutoff_scheme == ecutsVERLET) + { + moved = get_moved(comm, home_pos_cg); + + for (i = dd->ncg_home; i < home_pos_cg; i++) + { + moved[i] = 0; + } + } + dd->ncg_home = home_pos_cg; + dd->nat_home = home_pos_at; + + if (debug) + { + fprintf(debug, + "Finished repartitioning: cgs moved out %d, new home %d\n", + *ncg_moved, dd->ncg_home-*ncg_moved); + + } +} + +void dd_cycles_add(gmx_domdec_t *dd, float cycles, int ddCycl) +{ + dd->comm->cycl[ddCycl] += cycles; + dd->comm->cycl_n[ddCycl]++; + if (cycles > dd->comm->cycl_max[ddCycl]) + { + dd->comm->cycl_max[ddCycl] = cycles; + } +} + +static double force_flop_count(t_nrnb *nrnb) +{ + int i; + double sum; + const char *name; + + sum = 0; + for (i = 0; i < eNR_NBKERNEL_FREE_ENERGY; i++) + { + /* To get closer to the real timings, we half the count + * for the normal loops and again half it for water loops. + */ + name = nrnb_str(i); + if (strstr(name, "W3") != NULL || strstr(name, "W4") != NULL) + { + sum += nrnb->n[i]*0.25*cost_nrnb(i); + } + else + { + sum += nrnb->n[i]*0.50*cost_nrnb(i); + } + } + for (i = eNR_NBKERNEL_FREE_ENERGY; i <= eNR_NB14; i++) + { + name = nrnb_str(i); + if (strstr(name, "W3") != NULL || strstr(name, "W4") != NULL) + { + sum += nrnb->n[i]*cost_nrnb(i); + } + } + for (i = eNR_BONDS; i <= eNR_WALLS; i++) + { + sum += nrnb->n[i]*cost_nrnb(i); + } + + return sum; +} + +void dd_force_flop_start(gmx_domdec_t *dd, t_nrnb *nrnb) +{ + if (dd->comm->eFlop) + { + dd->comm->flop -= force_flop_count(nrnb); + } +} +void dd_force_flop_stop(gmx_domdec_t *dd, t_nrnb *nrnb) +{ + if (dd->comm->eFlop) + { + dd->comm->flop += force_flop_count(nrnb); + dd->comm->flop_n++; + } +} + +static void clear_dd_cycle_counts(gmx_domdec_t *dd) +{ + int i; + + for (i = 0; i < ddCyclNr; i++) + { + dd->comm->cycl[i] = 0; + dd->comm->cycl_n[i] = 0; + dd->comm->cycl_max[i] = 0; + } + dd->comm->flop = 0; + dd->comm->flop_n = 0; +} + +static void get_load_distribution(gmx_domdec_t *dd, gmx_wallcycle_t wcycle) +{ + gmx_domdec_comm_t *comm; + gmx_domdec_load_t *load; + gmx_domdec_root_t *root = NULL; + int d, dim, cid, i, pos; + float cell_frac = 0, sbuf[DD_NLOAD_MAX]; + gmx_bool bSepPME; + + if (debug) + { + fprintf(debug, "get_load_distribution start\n"); + } + + wallcycle_start(wcycle, ewcDDCOMMLOAD); + + comm = dd->comm; + + bSepPME = (dd->pme_nodeid >= 0); + + for (d = dd->ndim-1; d >= 0; d--) + { + dim = dd->dim[d]; + /* Check if we participate in the communication in this dimension */ + if (d == dd->ndim-1 || + (dd->ci[dd->dim[d+1]] == 0 && dd->ci[dd->dim[dd->ndim-1]] == 0)) + { + load = &comm->load[d]; + if (dd->bGridJump) + { + cell_frac = comm->cell_f1[d] - comm->cell_f0[d]; + } + pos = 0; + if (d == dd->ndim-1) + { + sbuf[pos++] = dd_force_load(comm); + sbuf[pos++] = sbuf[0]; + if (dd->bGridJump) + { + sbuf[pos++] = sbuf[0]; + sbuf[pos++] = cell_frac; + if (d > 0) + { + sbuf[pos++] = comm->cell_f_max0[d]; + sbuf[pos++] = comm->cell_f_min1[d]; + } + } + if (bSepPME) + { + sbuf[pos++] = comm->cycl[ddCyclPPduringPME]; + sbuf[pos++] = comm->cycl[ddCyclPME]; + } + } + else + { + sbuf[pos++] = comm->load[d+1].sum; + sbuf[pos++] = comm->load[d+1].max; + if (dd->bGridJump) + { + sbuf[pos++] = comm->load[d+1].sum_m; + sbuf[pos++] = comm->load[d+1].cvol_min*cell_frac; + sbuf[pos++] = comm->load[d+1].flags; + if (d > 0) + { + sbuf[pos++] = comm->cell_f_max0[d]; + sbuf[pos++] = comm->cell_f_min1[d]; + } + } + if (bSepPME) + { + sbuf[pos++] = comm->load[d+1].mdf; + sbuf[pos++] = comm->load[d+1].pme; + } + } + load->nload = pos; + /* 
Communicate a row in DD direction d.
+             * The communicators are set up such that the root always has rank 0.
+             */
+#ifdef GMX_MPI
+            MPI_Gather(sbuf, load->nload*sizeof(float), MPI_BYTE,
+                       load->load, load->nload*sizeof(float), MPI_BYTE,
+                       0, comm->mpi_comm_load[d]);
+#endif
+            if (dd->ci[dim] == dd->master_ci[dim])
+            {
+                /* We are the root, process this row */
+                if (comm->bDynLoadBal)
+                {
+                    root = comm->root[d];
+                }
+                load->sum      = 0;
+                load->max      = 0;
+                load->sum_m    = 0;
+                load->cvol_min = 1;
+                load->flags    = 0;
+                load->mdf      = 0;
+                load->pme      = 0;
+                pos            = 0;
+                for (i = 0; i < dd->nc[dim]; i++)
+                {
+                    load->sum += load->load[pos++];
+                    load->max  = max(load->max, load->load[pos]);
+                    pos++;
+                    if (dd->bGridJump)
+                    {
+                        if (root->bLimited)
+                        {
+                            /* This direction could not be load balanced properly,
+                             * therefore we need to use the maximum instead of
+                             * the average load.
+                             */
+                            load->sum_m = max(load->sum_m, load->load[pos]);
+                        }
+                        else
+                        {
+                            load->sum_m += load->load[pos];
+                        }
+                        pos++;
+                        load->cvol_min = min(load->cvol_min, load->load[pos]);
+                        pos++;
+                        if (d < dd->ndim-1)
+                        {
+                            load->flags = (int)(load->load[pos++] + 0.5);
+                        }
+                        if (d > 0)
+                        {
+                            root->cell_f_max0[i] = load->load[pos++];
+                            root->cell_f_min1[i] = load->load[pos++];
+                        }
+                    }
+                    if (bSepPME)
+                    {
+                        load->mdf = max(load->mdf, load->load[pos]);
+                        pos++;
+                        load->pme = max(load->pme, load->load[pos]);
+                        pos++;
+                    }
+                }
+                if (comm->bDynLoadBal && root->bLimited)
+                {
+                    load->sum_m *= dd->nc[dim];
+                    load->flags |= (1<<d);
+                }
+            }
+        }
+    }
+
+    if (DDMASTER(dd))
+    {
+        comm->nload     += dd_load_count(comm);
+        comm->load_step += comm->cycl[ddCyclStep];
+        comm->load_sum  += comm->load[0].sum;
+        comm->load_max  += comm->load[0].max;
+        if (comm->bDynLoadBal)
+        {
+            for (d = 0; d < dd->ndim; d++)
+            {
+                if (comm->load[0].flags & (1<<d))
+                {
+                    comm->load_lim[d]++;
+                }
+            }
+        }
+        if (bSepPME)
+        {
+            comm->load_mdf += comm->load[0].mdf;
+            comm->load_pme += comm->load[0].pme;
+        }
+    }
+
+    wallcycle_stop(wcycle, ewcDDCOMMLOAD);
+
+    if (debug)
+    {
+        fprintf(debug, "get_load_distribution finished\n");
+    }
+}
+
+static float dd_force_imb_perf_loss(gmx_domdec_t *dd)
+{
+    /* Return the relative performance loss on the total run time
+     * due to the force calculation load imbalance.
+     */
+    if (dd->comm->nload > 0)
+    {
+        return
+            (dd->comm->load_max*dd->nnodes - dd->comm->load_sum)/
+            (dd->comm->load_step*dd->nnodes);
+    }
+    else
+    {
+        return 0;
+    }
+}
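
/* The statistics printed by print_dd_load_av() below follow directly from the
 * loads accumulated above: the imbalance is max/average - 1 over the PP
 * ranks, and the lost fraction is the per-step idle time (max - average)
 * relative to the total step time, as in dd_force_imb_perf_loss(). A worked
 * example with made-up numbers (not taken from any real run):
 */
#include <stdio.h>

int main(void)
{
    double nnodes    = 4;    /* hypothetical number of PP ranks   */
    double load_sum  = 360;  /* sum of the per-rank force loads   */
    double load_max  = 120;  /* accumulated per-step maximum load */
    double load_step = 200;  /* accumulated total step time       */

    double imbal = load_max*nnodes/load_sum - 1;
    double lossf = (load_max*nnodes - load_sum)/(load_step*nnodes);

    /* The average load is 90, so the busiest rank is 33 % above average
     * and (120 - 90)/200 = 15 % of the run time is spent waiting.
     */
    printf("imbalance %.2f, lost fraction %.3f\n", imbal, lossf);
    return 0;
}
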
+static void print_dd_load_av(FILE *fplog, gmx_domdec_t *dd)
+{
+    char               buf[STRLEN];
+    int                npp, npme, nnodes, d, limp;
+    float              imbal, pme_f_ratio, lossf, lossp = 0;
+    gmx_bool           bLim;
+    gmx_domdec_comm_t *comm;
+
+    comm = dd->comm;
+    if (DDMASTER(dd) && comm->nload > 0)
+    {
+        npp    = dd->nnodes;
+        npme   = (dd->pme_nodeid >= 0) ? comm->npmenodes : 0;
+        nnodes = npp + npme;
+        imbal  = comm->load_max*npp/comm->load_sum - 1;
+        lossf  = dd_force_imb_perf_loss(dd);
+        sprintf(buf, " Average load imbalance: %.1f %%\n", imbal*100);
+        fprintf(fplog, "%s", buf);
+        fprintf(stderr, "\n");
+        fprintf(stderr, "%s", buf);
+        sprintf(buf, " Part of the total run time spent waiting due to load imbalance: %.1f %%\n", lossf*100);
+        fprintf(fplog, "%s", buf);
+        fprintf(stderr, "%s", buf);
+        bLim = FALSE;
+        if (comm->bDynLoadBal)
+        {
+            sprintf(buf, " Steps where the load balancing was limited by -rdd, -rcon and/or -dds:");
+            for (d = 0; d < dd->ndim; d++)
+            {
+                limp = (200*comm->load_lim[d]+1)/(2*comm->nload);
+                sprintf(buf+strlen(buf), " %c %d %%", dim2char(dd->dim[d]), limp);
+                if (limp >= 50)
+                {
+                    bLim = TRUE;
+                }
+            }
+            sprintf(buf+strlen(buf), "\n");
+            fprintf(fplog, "%s", buf);
+            fprintf(stderr, "%s", buf);
+        }
+        if (npme > 0)
+        {
+            pme_f_ratio = comm->load_pme/comm->load_mdf;
+            lossp       = (comm->load_pme - comm->load_mdf)/comm->load_step;
+            if (lossp <= 0)
+            {
+                lossp *= (float)npme/(float)nnodes;
+            }
+            else
+            {
+                lossp *= (float)npp/(float)nnodes;
+            }
+            sprintf(buf, " Average PME mesh/force load: %5.3f\n", pme_f_ratio);
+            fprintf(fplog, "%s", buf);
+            fprintf(stderr, "%s", buf);
+            sprintf(buf, " Part of the total run time spent waiting due to PP/PME imbalance: %.1f %%\n", fabs(lossp)*100);
+            fprintf(fplog, "%s", buf);
+            fprintf(stderr, "%s", buf);
+        }
+        fprintf(fplog, "\n");
+        fprintf(stderr, "\n");
+
+        if (lossf >= DD_PERF_LOSS)
+        {
+            sprintf(buf,
+                    "NOTE: %.1f %% of the available CPU time was lost due to load imbalance\n"
+                    "      in the domain decomposition.\n", lossf*100);
+            if (!comm->bDynLoadBal)
+            {
+                sprintf(buf+strlen(buf), "      You might want to use dynamic load balancing (option -dlb.)\n");
+            }
+            else if (bLim)
+            {
+                sprintf(buf+strlen(buf), "      You might want to decrease the cell size limit (options -rdd, -rcon and/or -dds).\n");
+            }
+            fprintf(fplog, "%s\n", buf);
+            fprintf(stderr, "%s\n", buf);
+        }
+        if (npme > 0 && fabs(lossp) >= DD_PERF_LOSS)
+        {
+            sprintf(buf,
+                    "NOTE: %.1f %% performance was lost because the PME nodes\n"
+                    "      had %s work to do than the PP nodes.\n"
+                    "      You might want to %s the number of PME nodes\n"
+                    "      or %s the cut-off and the grid spacing.\n",
+                    fabs(lossp*100),
+                    (lossp < 0) ? "less" : "more",
+                    (lossp < 0) ? "decrease" : "increase",
+                    (lossp < 0) ? "decrease" : "increase");
+            fprintf(fplog, "%s\n", buf);
+            fprintf(stderr, "%s\n", buf);
+        }
+    }
+}
+
+static float dd_vol_min(gmx_domdec_t *dd)
+{
+    return dd->comm->load[0].cvol_min*dd->nnodes;
+}
+
+static gmx_bool dd_load_flags(gmx_domdec_t *dd)
+{
+    return dd->comm->load[0].flags;
+}
+
+static float dd_f_imbal(gmx_domdec_t *dd)
+{
+    return dd->comm->load[0].max*dd->nnodes/dd->comm->load[0].sum - 1;
+}
+
+float dd_pme_f_ratio(gmx_domdec_t *dd)
+{
+    if (dd->comm->cycl_n[ddCyclPME] > 0)
+    {
+        return dd->comm->load[0].pme/dd->comm->load[0].mdf;
+    }
+    else
+    {
+        return -1.0;
+    }
+}
+
+static void dd_print_load(FILE *fplog, gmx_domdec_t *dd, gmx_large_int_t step)
+{
+    int  flags, d;
+    char buf[22];
+
+    flags = dd_load_flags(dd);
+    if (flags)
+    {
+        fprintf(fplog,
+                "DD load balancing is limited by minimum cell size in dimension");
+        for (d = 0; d < dd->ndim; d++)
+        {
+            if (flags & (1<<d))
+            {
+                fprintf(fplog, " %c", dim2char(dd->dim[d]));
+            }
+        }
+        fprintf(fplog, "\n");
+    }
+    fprintf(fplog, "DD step %s", gmx_step_str(step, buf));
+    if (dd->comm->bDynLoadBal)
+    {
+        fprintf(fplog, " vol min/aver %5.3f%c",
+                dd_vol_min(dd), flags ? '!'
: ' '); + } + fprintf(fplog, " load imb.: force %4.1f%%", dd_f_imbal(dd)*100); + if (dd->comm->cycl_n[ddCyclPME]) + { + fprintf(fplog, " pme mesh/force %5.3f", dd_pme_f_ratio(dd)); + } + fprintf(fplog, "\n\n"); +} + +static void dd_print_load_verbose(gmx_domdec_t *dd) +{ + if (dd->comm->bDynLoadBal) + { + fprintf(stderr, "vol %4.2f%c ", + dd_vol_min(dd), dd_load_flags(dd) ? '!' : ' '); + } + fprintf(stderr, "imb F %2d%% ", (int)(dd_f_imbal(dd)*100+0.5)); + if (dd->comm->cycl_n[ddCyclPME]) + { + fprintf(stderr, "pme/F %4.2f ", dd_pme_f_ratio(dd)); + } +} + +#ifdef GMX_MPI +static void make_load_communicator(gmx_domdec_t *dd, int dim_ind, ivec loc) +{ + MPI_Comm c_row; + int dim, i, rank; + ivec loc_c; + gmx_domdec_root_t *root; + gmx_bool bPartOfGroup = FALSE; + + dim = dd->dim[dim_ind]; + copy_ivec(loc, loc_c); + for (i = 0; i < dd->nc[dim]; i++) + { + loc_c[dim] = i; + rank = dd_index(dd->nc, loc_c); + if (rank == dd->rank) + { + /* This process is part of the group */ + bPartOfGroup = TRUE; + } + } + MPI_Comm_split(dd->mpi_comm_all, bPartOfGroup ? 0 : MPI_UNDEFINED, dd->rank, + &c_row); + if (bPartOfGroup) + { + dd->comm->mpi_comm_load[dim_ind] = c_row; + if (dd->comm->eDLB != edlbNO) + { + if (dd->ci[dim] == dd->master_ci[dim]) + { + /* This is the root process of this row */ + snew(dd->comm->root[dim_ind], 1); + root = dd->comm->root[dim_ind]; + snew(root->cell_f, DD_CELL_F_SIZE(dd, dim_ind)); + snew(root->old_cell_f, dd->nc[dim]+1); + snew(root->bCellMin, dd->nc[dim]); + if (dim_ind > 0) + { + snew(root->cell_f_max0, dd->nc[dim]); + snew(root->cell_f_min1, dd->nc[dim]); + snew(root->bound_min, dd->nc[dim]); + snew(root->bound_max, dd->nc[dim]); + } + snew(root->buf_ncd, dd->nc[dim]); + } + else + { + /* This is not a root process, we only need to receive cell_f */ + snew(dd->comm->cell_f_row, DD_CELL_F_SIZE(dd, dim_ind)); + } + } + if (dd->ci[dim] == dd->master_ci[dim]) + { + snew(dd->comm->load[dim_ind].load, dd->nc[dim]*DD_NLOAD_MAX); + } + } +} +#endif + +static void make_load_communicators(gmx_domdec_t *dd) +{ +#ifdef GMX_MPI + int dim0, dim1, i, j; + ivec loc; + + if (debug) + { + fprintf(debug, "Making load communicators\n"); + } + + snew(dd->comm->load, dd->ndim); + snew(dd->comm->mpi_comm_load, dd->ndim); + + clear_ivec(loc); + make_load_communicator(dd, 0, loc); + if (dd->ndim > 1) + { + dim0 = dd->dim[0]; + for (i = 0; i < dd->nc[dim0]; i++) + { + loc[dim0] = i; + make_load_communicator(dd, 1, loc); + } + } + if (dd->ndim > 2) + { + dim0 = dd->dim[0]; + for (i = 0; i < dd->nc[dim0]; i++) + { + loc[dim0] = i; + dim1 = dd->dim[1]; + for (j = 0; j < dd->nc[dim1]; j++) + { + loc[dim1] = j; + make_load_communicator(dd, 2, loc); + } + } + } + + if (debug) + { + fprintf(debug, "Finished making load communicators\n"); + } +#endif +} + +void setup_dd_grid(FILE *fplog, gmx_domdec_t *dd) +{ + gmx_bool bZYX; + int d, dim, i, j, m; + ivec tmp, s; + int nzone, nzonep; + ivec dd_zp[DD_MAXIZONE]; + gmx_domdec_zones_t *zones; + gmx_domdec_ns_ranges_t *izone; + + for (d = 0; d < dd->ndim; d++) + { + dim = dd->dim[d]; + copy_ivec(dd->ci, tmp); + tmp[dim] = (tmp[dim] + 1) % dd->nc[dim]; + dd->neighbor[d][0] = ddcoord2ddnodeid(dd, tmp); + copy_ivec(dd->ci, tmp); + tmp[dim] = (tmp[dim] - 1 + dd->nc[dim]) % dd->nc[dim]; + dd->neighbor[d][1] = ddcoord2ddnodeid(dd, tmp); + if (debug) + { + fprintf(debug, "DD rank %d neighbor ranks in dir %d are + %d - %d\n", + dd->rank, dim, + dd->neighbor[d][0], + dd->neighbor[d][1]); + } + } + + if (fplog) + { + fprintf(fplog, "\nMaking %dD domain decomposition 
grid %d x %d x %d, home cell index %d %d %d\n\n", + dd->ndim, + dd->nc[XX], dd->nc[YY], dd->nc[ZZ], + dd->ci[XX], dd->ci[YY], dd->ci[ZZ]); + } + switch (dd->ndim) + { + case 3: + nzone = dd_z3n; + nzonep = dd_zp3n; + for (i = 0; i < nzonep; i++) + { + copy_ivec(dd_zp3[i], dd_zp[i]); + } + break; + case 2: + nzone = dd_z2n; + nzonep = dd_zp2n; + for (i = 0; i < nzonep; i++) + { + copy_ivec(dd_zp2[i], dd_zp[i]); + } + break; + case 1: + nzone = dd_z1n; + nzonep = dd_zp1n; + for (i = 0; i < nzonep; i++) + { + copy_ivec(dd_zp1[i], dd_zp[i]); + } + break; + default: + gmx_fatal(FARGS, "Can only do 1, 2 or 3D domain decomposition"); + nzone = 0; + nzonep = 0; + } + + zones = &dd->comm->zones; + + for (i = 0; i < nzone; i++) + { + m = 0; + clear_ivec(zones->shift[i]); + for (d = 0; d < dd->ndim; d++) + { + zones->shift[i][dd->dim[d]] = dd_zo[i][m++]; + } + } + + zones->n = nzone; + for (i = 0; i < nzone; i++) + { + for (d = 0; d < DIM; d++) + { + s[d] = dd->ci[d] - zones->shift[i][d]; + if (s[d] < 0) + { + s[d] += dd->nc[d]; + } + else if (s[d] >= dd->nc[d]) + { + s[d] -= dd->nc[d]; + } + } + } + zones->nizone = nzonep; + for (i = 0; i < zones->nizone; i++) + { + if (dd_zp[i][0] != i) + { + gmx_fatal(FARGS, "Internal inconsistency in the dd grid setup"); + } + izone = &zones->izone[i]; + izone->j0 = dd_zp[i][1]; + izone->j1 = dd_zp[i][2]; + for (dim = 0; dim < DIM; dim++) + { + if (dd->nc[dim] == 1) + { + /* All shifts should be allowed */ + izone->shift0[dim] = -1; + izone->shift1[dim] = 1; + } + else + { + /* + izone->shift0[d] = 0; + izone->shift1[d] = 0; + for(j=izone->j0; jj1; j++) { + if (dd->shift[j][d] > dd->shift[i][d]) + izone->shift0[d] = -1; + if (dd->shift[j][d] < dd->shift[i][d]) + izone->shift1[d] = 1; + } + */ + + int shift_diff; + + /* Assume the shift are not more than 1 cell */ + izone->shift0[dim] = 1; + izone->shift1[dim] = -1; + for (j = izone->j0; j < izone->j1; j++) + { + shift_diff = zones->shift[j][dim] - zones->shift[i][dim]; + if (shift_diff < izone->shift0[dim]) + { + izone->shift0[dim] = shift_diff; + } + if (shift_diff > izone->shift1[dim]) + { + izone->shift1[dim] = shift_diff; + } + } + } + } + } + + if (dd->comm->eDLB != edlbNO) + { + snew(dd->comm->root, dd->ndim); + } + + if (dd->comm->bRecordLoad) + { + make_load_communicators(dd); + } +} + +static void make_pp_communicator(FILE *fplog, t_commrec *cr, int reorder) +{ + gmx_domdec_t *dd; + gmx_domdec_comm_t *comm; + int i, rank, *buf; + ivec periods; +#ifdef GMX_MPI + MPI_Comm comm_cart; +#endif + + dd = cr->dd; + comm = dd->comm; + +#ifdef GMX_MPI + if (comm->bCartesianPP) + { + /* Set up cartesian communication for the particle-particle part */ + if (fplog) + { + fprintf(fplog, "Will use a Cartesian communicator: %d x %d x %d\n", + dd->nc[XX], dd->nc[YY], dd->nc[ZZ]); + } + + for (i = 0; i < DIM; i++) + { + periods[i] = TRUE; + } + MPI_Cart_create(cr->mpi_comm_mygroup, DIM, dd->nc, periods, reorder, + &comm_cart); + /* We overwrite the old communicator with the new cartesian one */ + cr->mpi_comm_mygroup = comm_cart; + } + + dd->mpi_comm_all = cr->mpi_comm_mygroup; + MPI_Comm_rank(dd->mpi_comm_all, &dd->rank); + + if (comm->bCartesianPP_PME) + { + /* Since we want to use the original cartesian setup for sim, + * and not the one after split, we need to make an index. 
+ */ + snew(comm->ddindex2ddnodeid, dd->nnodes); + comm->ddindex2ddnodeid[dd_index(dd->nc, dd->ci)] = dd->rank; + gmx_sumi(dd->nnodes, comm->ddindex2ddnodeid, cr); + /* Get the rank of the DD master, + * above we made sure that the master node is a PP node. + */ + if (MASTER(cr)) + { + rank = dd->rank; + } + else + { + rank = 0; + } + MPI_Allreduce(&rank, &dd->masterrank, 1, MPI_INT, MPI_SUM, dd->mpi_comm_all); + } + else if (comm->bCartesianPP) + { + if (cr->npmenodes == 0) + { + /* The PP communicator is also + * the communicator for this simulation + */ + cr->mpi_comm_mysim = cr->mpi_comm_mygroup; + } + cr->nodeid = dd->rank; + + MPI_Cart_coords(dd->mpi_comm_all, dd->rank, DIM, dd->ci); + + /* We need to make an index to go from the coordinates + * to the nodeid of this simulation. + */ + snew(comm->ddindex2simnodeid, dd->nnodes); + snew(buf, dd->nnodes); + if (cr->duty & DUTY_PP) + { + buf[dd_index(dd->nc, dd->ci)] = cr->sim_nodeid; + } + /* Communicate the ddindex to simulation nodeid index */ + MPI_Allreduce(buf, comm->ddindex2simnodeid, dd->nnodes, MPI_INT, MPI_SUM, + cr->mpi_comm_mysim); + sfree(buf); + + /* Determine the master coordinates and rank. + * The DD master should be the same node as the master of this sim. + */ + for (i = 0; i < dd->nnodes; i++) + { + if (comm->ddindex2simnodeid[i] == 0) + { + ddindex2xyz(dd->nc, i, dd->master_ci); + MPI_Cart_rank(dd->mpi_comm_all, dd->master_ci, &dd->masterrank); + } + } + if (debug) + { + fprintf(debug, "The master rank is %d\n", dd->masterrank); + } + } + else + { + /* No Cartesian communicators */ + /* We use the rank in dd->comm->all as DD index */ + ddindex2xyz(dd->nc, dd->rank, dd->ci); + /* The simulation master nodeid is 0, so the DD master rank is also 0 */ + dd->masterrank = 0; + clear_ivec(dd->master_ci); + } +#endif + + if (fplog) + { + fprintf(fplog, + "Domain decomposition nodeid %d, coordinates %d %d %d\n\n", + dd->rank, dd->ci[XX], dd->ci[YY], dd->ci[ZZ]); + } + if (debug) + { + fprintf(debug, + "Domain decomposition nodeid %d, coordinates %d %d %d\n\n", + dd->rank, dd->ci[XX], dd->ci[YY], dd->ci[ZZ]); + } +} + +static void receive_ddindex2simnodeid(t_commrec *cr) +{ + gmx_domdec_t *dd; + + gmx_domdec_comm_t *comm; + int *buf; + + dd = cr->dd; + comm = dd->comm; + +#ifdef GMX_MPI + if (!comm->bCartesianPP_PME && comm->bCartesianPP) + { + snew(comm->ddindex2simnodeid, dd->nnodes); + snew(buf, dd->nnodes); + if (cr->duty & DUTY_PP) + { + buf[dd_index(dd->nc, dd->ci)] = cr->sim_nodeid; + } +#ifdef GMX_MPI + /* Communicate the ddindex to simulation nodeid index */ + MPI_Allreduce(buf, comm->ddindex2simnodeid, dd->nnodes, MPI_INT, MPI_SUM, + cr->mpi_comm_mysim); +#endif + sfree(buf); + } +#endif +} + +static gmx_domdec_master_t *init_gmx_domdec_master_t(gmx_domdec_t *dd, + int ncg, int natoms) +{ + gmx_domdec_master_t *ma; + int i; + + snew(ma, 1); + + snew(ma->ncg, dd->nnodes); + snew(ma->index, dd->nnodes+1); + snew(ma->cg, ncg); + snew(ma->nat, dd->nnodes); + snew(ma->ibuf, dd->nnodes*2); + snew(ma->cell_x, DIM); + for (i = 0; i < DIM; i++) + { + snew(ma->cell_x[i], dd->nc[i]+1); + } + + if (dd->nnodes <= GMX_DD_NNODES_SENDRECV) + { + ma->vbuf = NULL; + } + else + { + snew(ma->vbuf, natoms); + } + + return ma; +} + +static void split_communicator(FILE *fplog, t_commrec *cr, int dd_node_order, + int reorder) +{ + gmx_domdec_t *dd; + gmx_domdec_comm_t *comm; + int i, rank; + gmx_bool bDiv[DIM]; + ivec periods; +#ifdef GMX_MPI + MPI_Comm comm_cart; +#endif + + dd = cr->dd; + comm = dd->comm; + + if (comm->bCartesianPP) + { + 
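        /* Worked example of the divisibility test below (made-up numbers, not
         * from this run): a 3 x 2 x 2 PP grid has dd->nnodes = 12; with 6
         * PME-only nodes, (6*2) % 12 == 0 in y, so the PME nodes can be
         * stacked as complete planes along y and bDiv[YY] comes out TRUE,
         * while 4 PME-only nodes give (4*2) % 12 == 8 and bDiv[YY] FALSE.
         */
        {
            int ex_nnodes = 12, ex_npme = 6, ex_nc_y = 2;
            int ex_bdiv_y = ((ex_npme*ex_nc_y) % ex_nnodes == 0);

            (void)ex_bdiv_y; /* evaluates to 1 (TRUE) for these numbers */
        }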
for (i = 1; i < DIM; i++) + { + bDiv[i] = ((cr->npmenodes*dd->nc[i]) % (dd->nnodes) == 0); + } + if (bDiv[YY] || bDiv[ZZ]) + { + comm->bCartesianPP_PME = TRUE; + /* If we have 2D PME decomposition, which is always in x+y, + * we stack the PME only nodes in z. + * Otherwise we choose the direction that provides the thinnest slab + * of PME only nodes as this will have the least effect + * on the PP communication. + * But for the PME communication the opposite might be better. + */ + if (bDiv[ZZ] && (comm->npmenodes_y > 1 || + !bDiv[YY] || + dd->nc[YY] > dd->nc[ZZ])) + { + comm->cartpmedim = ZZ; + } + else + { + comm->cartpmedim = YY; + } + comm->ntot[comm->cartpmedim] + += (cr->npmenodes*dd->nc[comm->cartpmedim])/dd->nnodes; + } + else if (fplog) + { + fprintf(fplog, "#pmenodes (%d) is not a multiple of nx*ny (%d*%d) or nx*nz (%d*%d)\n", cr->npmenodes, dd->nc[XX], dd->nc[YY], dd->nc[XX], dd->nc[ZZ]); + fprintf(fplog, + "Will not use a Cartesian communicator for PP <-> PME\n\n"); + } + } + +#ifdef GMX_MPI + if (comm->bCartesianPP_PME) + { + if (fplog) + { + fprintf(fplog, "Will use a Cartesian communicator for PP <-> PME: %d x %d x %d\n", comm->ntot[XX], comm->ntot[YY], comm->ntot[ZZ]); + } + + for (i = 0; i < DIM; i++) + { + periods[i] = TRUE; + } + MPI_Cart_create(cr->mpi_comm_mysim, DIM, comm->ntot, periods, reorder, + &comm_cart); + + MPI_Comm_rank(comm_cart, &rank); + if (MASTERNODE(cr) && rank != 0) + { + gmx_fatal(FARGS, "MPI rank 0 was renumbered by MPI_Cart_create, we do not allow this"); + } + + /* With this assigment we loose the link to the original communicator + * which will usually be MPI_COMM_WORLD, unless have multisim. + */ + cr->mpi_comm_mysim = comm_cart; + cr->sim_nodeid = rank; + + MPI_Cart_coords(cr->mpi_comm_mysim, cr->sim_nodeid, DIM, dd->ci); + + if (fplog) + { + fprintf(fplog, "Cartesian nodeid %d, coordinates %d %d %d\n\n", + cr->sim_nodeid, dd->ci[XX], dd->ci[YY], dd->ci[ZZ]); + } + + if (dd->ci[comm->cartpmedim] < dd->nc[comm->cartpmedim]) + { + cr->duty = DUTY_PP; + } + if (cr->npmenodes == 0 || + dd->ci[comm->cartpmedim] >= dd->nc[comm->cartpmedim]) + { + cr->duty = DUTY_PME; + } + + /* Split the sim communicator into PP and PME only nodes */ + MPI_Comm_split(cr->mpi_comm_mysim, + cr->duty, + dd_index(comm->ntot, dd->ci), + &cr->mpi_comm_mygroup); + } + else + { + switch (dd_node_order) + { + case ddnoPP_PME: + if (fplog) + { + fprintf(fplog, "Order of the nodes: PP first, PME last\n"); + } + break; + case ddnoINTERLEAVE: + /* Interleave the PP-only and PME-only nodes, + * as on clusters with dual-core machines this will double + * the communication bandwidth of the PME processes + * and thus speed up the PP <-> PME and inter PME communication. + */ + if (fplog) + { + fprintf(fplog, "Interleaving PP and PME nodes\n"); + } + comm->pmenodes = dd_pmenodes(cr); + break; + case ddnoCARTESIAN: + break; + default: + gmx_fatal(FARGS, "Unknown dd_node_order=%d", dd_node_order); + } + + if (dd_simnode2pmenode(cr, cr->sim_nodeid) == -1) + { + cr->duty = DUTY_PME; + } + else + { + cr->duty = DUTY_PP; + } + + /* Split the sim communicator into PP and PME only nodes */ + MPI_Comm_split(cr->mpi_comm_mysim, + cr->duty, + cr->nodeid, + &cr->mpi_comm_mygroup); + MPI_Comm_rank(cr->mpi_comm_mygroup, &cr->nodeid); + } +#endif + + if (fplog) + { + fprintf(fplog, "This is a %s only node\n\n", + (cr->duty & DUTY_PP) ? 
"particle-particle" : "PME-mesh"); + } +} + +void make_dd_communicators(FILE *fplog, t_commrec *cr, int dd_node_order) +{ + gmx_domdec_t *dd; + gmx_domdec_comm_t *comm; + int CartReorder; + + dd = cr->dd; + comm = dd->comm; + + copy_ivec(dd->nc, comm->ntot); + + comm->bCartesianPP = (dd_node_order == ddnoCARTESIAN); + comm->bCartesianPP_PME = FALSE; + + /* Reorder the nodes by default. This might change the MPI ranks. + * Real reordering is only supported on very few architectures, + * Blue Gene is one of them. + */ + CartReorder = (getenv("GMX_NO_CART_REORDER") == NULL); + + if (cr->npmenodes > 0) + { + /* Split the communicator into a PP and PME part */ + split_communicator(fplog, cr, dd_node_order, CartReorder); + if (comm->bCartesianPP_PME) + { + /* We (possibly) reordered the nodes in split_communicator, + * so it is no longer required in make_pp_communicator. + */ + CartReorder = FALSE; + } + } + else + { + /* All nodes do PP and PME */ +#ifdef GMX_MPI + /* We do not require separate communicators */ + cr->mpi_comm_mygroup = cr->mpi_comm_mysim; +#endif + } + + if (cr->duty & DUTY_PP) + { + /* Copy or make a new PP communicator */ + make_pp_communicator(fplog, cr, CartReorder); + } + else + { + receive_ddindex2simnodeid(cr); + } + + if (!(cr->duty & DUTY_PME)) + { + /* Set up the commnuication to our PME node */ + dd->pme_nodeid = dd_simnode2pmenode(cr, cr->sim_nodeid); + dd->pme_receive_vir_ener = receive_vir_ener(cr); + if (debug) + { + fprintf(debug, "My pme_nodeid %d receive ener %d\n", + dd->pme_nodeid, dd->pme_receive_vir_ener); + } + } + else + { + dd->pme_nodeid = -1; + } + + if (DDMASTER(dd)) + { + dd->ma = init_gmx_domdec_master_t(dd, + comm->cgs_gl.nr, + comm->cgs_gl.index[comm->cgs_gl.nr]); + } +} + +static real *get_slb_frac(FILE *fplog, const char *dir, int nc, const char *size_string) +{ + real *slb_frac, tot; + int i, n; + double dbl; + + slb_frac = NULL; + if (nc > 1 && size_string != NULL) + { + if (fplog) + { + fprintf(fplog, "Using static load balancing for the %s direction\n", + dir); + } + snew(slb_frac, nc); + tot = 0; + for (i = 0; i < nc; i++) + { + dbl = 0; + sscanf(size_string, "%lf%n", &dbl, &n); + if (dbl == 0) + { + gmx_fatal(FARGS, "Incorrect or not enough DD cell size entries for direction %s: '%s'", dir, size_string); + } + slb_frac[i] = dbl; + size_string += n; + tot += slb_frac[i]; + } + /* Normalize */ + if (fplog) + { + fprintf(fplog, "Relative cell sizes:"); + } + for (i = 0; i < nc; i++) + { + slb_frac[i] /= tot; + if (fplog) + { + fprintf(fplog, " %5.3f", slb_frac[i]); + } + } + if (fplog) + { + fprintf(fplog, "\n"); + } + } + + return slb_frac; +} + +static int multi_body_bondeds_count(gmx_mtop_t *mtop) +{ + int n, nmol, ftype; + gmx_mtop_ilistloop_t iloop; + t_ilist *il; + + n = 0; + iloop = gmx_mtop_ilistloop_init(mtop); + while (gmx_mtop_ilistloop_next(iloop, &il, &nmol)) + { + for (ftype = 0; ftype < F_NRE; ftype++) + { + if ((interaction_function[ftype].flags & IF_BOND) && + NRAL(ftype) > 2) + { + n += nmol*il[ftype].nr/(1 + NRAL(ftype)); + } + } + } + + return n; +} + +static int dd_nst_env(FILE *fplog, const char *env_var, int def) +{ + char *val; + int nst; + + nst = def; + val = getenv(env_var); + if (val) + { + if (sscanf(val, "%d", &nst) <= 0) + { + nst = 1; + } + if (fplog) + { + fprintf(fplog, "Found env.var. 
%s = %s, using value %d\n", + env_var, val, nst); + } + } + + return nst; +} + +static void dd_warning(t_commrec *cr, FILE *fplog, const char *warn_string) +{ + if (MASTER(cr)) + { + fprintf(stderr, "\n%s\n", warn_string); + } + if (fplog) + { + fprintf(fplog, "\n%s\n", warn_string); + } +} + +static void check_dd_restrictions(t_commrec *cr, gmx_domdec_t *dd, + t_inputrec *ir, FILE *fplog) +{ + if (ir->ePBC == epbcSCREW && + (dd->nc[XX] == 1 || dd->nc[YY] > 1 || dd->nc[ZZ] > 1)) + { + gmx_fatal(FARGS, "With pbc=%s can only do domain decomposition in the x-direction", epbc_names[ir->ePBC]); + } + + if (ir->ns_type == ensSIMPLE) + { + gmx_fatal(FARGS, "Domain decomposition does not support simple neighbor searching, use grid searching or use particle decomposition"); + } + + if (ir->nstlist == 0) + { + gmx_fatal(FARGS, "Domain decomposition does not work with nstlist=0"); + } + + if (ir->comm_mode == ecmANGULAR && ir->ePBC != epbcNONE) + { + dd_warning(cr, fplog, "comm-mode angular will give incorrect results when the comm group partially crosses a periodic boundary"); + } +} + +static real average_cellsize_min(gmx_domdec_t *dd, gmx_ddbox_t *ddbox) +{ + int di, d; + real r; + + r = ddbox->box_size[XX]; + for (di = 0; di < dd->ndim; di++) + { + d = dd->dim[di]; + /* Check using the initial average cell size */ + r = min(r, ddbox->box_size[d]*ddbox->skew_fac[d]/dd->nc[d]); + } + + return r; +} + +static int check_dlb_support(FILE *fplog, t_commrec *cr, + const char *dlb_opt, gmx_bool bRecordLoad, + unsigned long Flags, t_inputrec *ir) +{ + gmx_domdec_t *dd; + int eDLB = -1; + char buf[STRLEN]; + + switch (dlb_opt[0]) + { + case 'a': eDLB = edlbAUTO; break; + case 'n': eDLB = edlbNO; break; + case 'y': eDLB = edlbYES; break; + default: gmx_incons("Unknown dlb_opt"); + } + + if (Flags & MD_RERUN) + { + return edlbNO; + } + + if (!EI_DYNAMICS(ir->eI)) + { + if (eDLB == edlbYES) + { + sprintf(buf, "NOTE: dynamic load balancing is only supported with dynamics, not with integrator '%s'\n", EI(ir->eI)); + dd_warning(cr, fplog, buf); + } + + return edlbNO; + } + + if (!bRecordLoad) + { + dd_warning(cr, fplog, "NOTE: Cycle counting is not supported on this architecture, will not use dynamic load balancing\n"); + + return edlbNO; + } + + if (Flags & MD_REPRODUCIBLE) + { + switch (eDLB) + { + case edlbNO: + break; + case edlbAUTO: + dd_warning(cr, fplog, "NOTE: reproducibility requested, will not use dynamic load balancing\n"); + eDLB = edlbNO; + break; + case edlbYES: + dd_warning(cr, fplog, "WARNING: reproducibility requested with dynamic load balancing, the simulation will NOT be binary reproducible\n"); + break; + default: + gmx_fatal(FARGS, "Death horror: undefined case (%d) for load balancing choice", eDLB); + break; + } + } + + return eDLB; +} + +static void set_dd_dim(FILE *fplog, gmx_domdec_t *dd) +{ + int dim; + + dd->ndim = 0; + if (getenv("GMX_DD_ORDER_ZYX") != NULL) + { + /* Decomposition order z,y,x */ + if (fplog) + { + fprintf(fplog, "Using domain decomposition order z, y, x\n"); + } + for (dim = DIM-1; dim >= 0; dim--) + { + if (dd->nc[dim] > 1) + { + dd->dim[dd->ndim++] = dim; + } + } + } + else + { + /* Decomposition order x,y,z */ + for (dim = 0; dim < DIM; dim++) + { + if (dd->nc[dim] > 1) + { + dd->dim[dd->ndim++] = dim; + } + } + } +} + +static gmx_domdec_comm_t *init_dd_comm() +{ + gmx_domdec_comm_t *comm; + int i; + + snew(comm, 1); + snew(comm->cggl_flag, DIM*2); + snew(comm->cgcm_state, DIM*2); + for (i = 0; i < DIM*2; i++) + { + comm->cggl_flag_nalloc[i] = 0; + 
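        /* The DIM*2 slots pair up per decomposition dimension and direction:
         * slot d*2 is the buffer towards the forward neighbor and slot
         * d*2 + 1 the one towards the backward neighbor, matching the
         * mc = d*2 and mc = d*2 + 1 indexing in dd_redistribute_cg() above.
         */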
comm->cgcm_state_nalloc[i] = 0; + } + + comm->nalloc_int = 0; + comm->buf_int = NULL; + + vec_rvec_init(&comm->vbuf); + + comm->n_load_have = 0; + comm->n_load_collect = 0; + + for (i = 0; i < ddnatNR-ddnatZONE; i++) + { + comm->sum_nat[i] = 0; + } + comm->ndecomp = 0; + comm->nload = 0; + comm->load_step = 0; + comm->load_sum = 0; + comm->load_max = 0; + clear_ivec(comm->load_lim); + comm->load_mdf = 0; + comm->load_pme = 0; + + return comm; +} + +gmx_domdec_t *init_domain_decomposition(FILE *fplog, t_commrec *cr, + unsigned long Flags, + ivec nc, + real comm_distance_min, real rconstr, + const char *dlb_opt, real dlb_scale, + const char *sizex, const char *sizey, const char *sizez, + gmx_mtop_t *mtop, t_inputrec *ir, + matrix box, rvec *x, + gmx_ddbox_t *ddbox, + int *npme_x, int *npme_y) +{ + gmx_domdec_t *dd; + gmx_domdec_comm_t *comm; + int recload; + int d, i, j; + real r_2b, r_mb, r_bonded = -1, r_bonded_limit = -1, limit, acs; + gmx_bool bC; + char buf[STRLEN]; + + if (fplog) + { + fprintf(fplog, + "\nInitializing Domain Decomposition on %d nodes\n", cr->nnodes); + } + + snew(dd, 1); + + dd->comm = init_dd_comm(); + comm = dd->comm; + snew(comm->cggl_flag, DIM*2); + snew(comm->cgcm_state, DIM*2); + + dd->npbcdim = ePBC2npbcdim(ir->ePBC); + dd->bScrewPBC = (ir->ePBC == epbcSCREW); + + dd->bSendRecv2 = dd_nst_env(fplog, "GMX_DD_SENDRECV2", 0); + comm->dlb_scale_lim = dd_nst_env(fplog, "GMX_DLB_MAX", 10); + comm->eFlop = dd_nst_env(fplog, "GMX_DLB_FLOP", 0); + recload = dd_nst_env(fplog, "GMX_DD_LOAD", 1); + comm->nstSortCG = dd_nst_env(fplog, "GMX_DD_SORT", 1); + comm->nstDDDump = dd_nst_env(fplog, "GMX_DD_DUMP", 0); + comm->nstDDDumpGrid = dd_nst_env(fplog, "GMX_DD_DUMP_GRID", 0); + comm->DD_debug = dd_nst_env(fplog, "GMX_DD_DEBUG", 0); + + dd->pme_recv_f_alloc = 0; + dd->pme_recv_f_buf = NULL; + + if (dd->bSendRecv2 && fplog) + { + fprintf(fplog, "Will use two sequential MPI_Sendrecv calls instead of two simultaneous non-blocking MPI_Irecv and MPI_Isend pairs for constraint and vsite communication\n"); + } + if (comm->eFlop) + { + if (fplog) + { + fprintf(fplog, "Will load balance based on FLOP count\n"); + } + if (comm->eFlop > 1) + { + srand(1+cr->nodeid); + } + comm->bRecordLoad = TRUE; + } + else + { + comm->bRecordLoad = (wallcycle_have_counter() && recload > 0); + + } + + comm->eDLB = check_dlb_support(fplog, cr, dlb_opt, comm->bRecordLoad, Flags, ir); + + comm->bDynLoadBal = (comm->eDLB == edlbYES); + if (fplog) + { + fprintf(fplog, "Dynamic load balancing: %s\n", edlb_names[comm->eDLB]); + } + dd->bGridJump = comm->bDynLoadBal; + comm->bPMELoadBalDLBLimits = FALSE; + + if (comm->nstSortCG) + { + if (fplog) + { + if (comm->nstSortCG == 1) + { + fprintf(fplog, "Will sort the charge groups at every domain (re)decomposition\n"); + } + else + { + fprintf(fplog, "Will sort the charge groups every %d steps\n", + comm->nstSortCG); + } + } + snew(comm->sort, 1); + } + else + { + if (fplog) + { + fprintf(fplog, "Will not sort the charge groups\n"); + } + } + + comm->bCGs = (ncg_mtop(mtop) < mtop->natoms); + + comm->bInterCGBondeds = (ncg_mtop(mtop) > mtop->mols.nr); + if (comm->bInterCGBondeds) + { + comm->bInterCGMultiBody = (multi_body_bondeds_count(mtop) > 0); + } + else + { + comm->bInterCGMultiBody = FALSE; + } + + dd->bInterCGcons = inter_charge_group_constraints(mtop); + dd->bInterCGsettles = inter_charge_group_settles(mtop); + + if (ir->rlistlong == 0) + { + /* Set the cut-off to some very large value, + * so we don't need if statements everywhere in the code. 
+ * We use sqrt, since the cut-off is squared in some places. + */ + comm->cutoff = GMX_CUTOFF_INF; + } + else + { + comm->cutoff = ir->rlistlong; + } + comm->cutoff_mbody = 0; + + comm->cellsize_limit = 0; + comm->bBondComm = FALSE; + + if (comm->bInterCGBondeds) + { + if (comm_distance_min > 0) + { + comm->cutoff_mbody = comm_distance_min; + if (Flags & MD_DDBONDCOMM) + { + comm->bBondComm = (comm->cutoff_mbody > comm->cutoff); + } + else + { + comm->cutoff = max(comm->cutoff, comm->cutoff_mbody); + } + r_bonded_limit = comm->cutoff_mbody; + } + else if (ir->bPeriodicMols) + { + /* Can not easily determine the required cut-off */ + dd_warning(cr, fplog, "NOTE: Periodic molecules are present in this system. Because of this, the domain decomposition algorithm cannot easily determine the minimum cell size that it requires for treating bonded interactions. Instead, domain decomposition will assume that half the non-bonded cut-off will be a suitable lower bound.\n"); + comm->cutoff_mbody = comm->cutoff/2; + r_bonded_limit = comm->cutoff_mbody; + } + else + { + if (MASTER(cr)) + { + dd_bonded_cg_distance(fplog, dd, mtop, ir, x, box, + Flags & MD_DDBONDCHECK, &r_2b, &r_mb); + } + gmx_bcast(sizeof(r_2b), &r_2b, cr); + gmx_bcast(sizeof(r_mb), &r_mb, cr); + + /* We use an initial margin of 10% for the minimum cell size, + * except when we are just below the non-bonded cut-off. + */ + if (Flags & MD_DDBONDCOMM) + { + if (max(r_2b, r_mb) > comm->cutoff) + { + r_bonded = max(r_2b, r_mb); + r_bonded_limit = 1.1*r_bonded; + comm->bBondComm = TRUE; + } + else + { + r_bonded = r_mb; + r_bonded_limit = min(1.1*r_bonded, comm->cutoff); + } + /* We determine cutoff_mbody later */ + } + else + { + /* No special bonded communication, + * simply increase the DD cut-off. + */ + r_bonded_limit = 1.1*max(r_2b, r_mb); + comm->cutoff_mbody = r_bonded_limit; + comm->cutoff = max(comm->cutoff, comm->cutoff_mbody); + } + } + comm->cellsize_limit = max(comm->cellsize_limit, r_bonded_limit); + if (fplog) + { + fprintf(fplog, + "Minimum cell size due to bonded interactions: %.3f nm\n", + comm->cellsize_limit); + } + } + + if (dd->bInterCGcons && rconstr <= 0) + { + /* There is a cell size limit due to the constraints (P-LINCS) */ + rconstr = constr_r_max(fplog, mtop, ir); + if (fplog) + { + fprintf(fplog, + "Estimated maximum distance required for P-LINCS: %.3f nm\n", + rconstr); + if (rconstr > comm->cellsize_limit) + { + fprintf(fplog, "This distance will limit the DD cell size, you can override this with -rcon\n"); + } + } + } + else if (rconstr > 0 && fplog) + { + /* Here we do not check for dd->bInterCGcons, + * because one can also set a cell size limit for virtual sites only + * and at this point we don't know yet if there are intercg v-sites. 
+ */ + fprintf(fplog, + "User supplied maximum distance required for P-LINCS: %.3f nm\n", + rconstr); + } + comm->cellsize_limit = max(comm->cellsize_limit, rconstr); + + comm->cgs_gl = gmx_mtop_global_cgs(mtop); + + if (nc[XX] > 0) + { + copy_ivec(nc, dd->nc); + set_dd_dim(fplog, dd); + set_ddbox_cr(cr, &dd->nc, ir, box, &comm->cgs_gl, x, ddbox); + + if (cr->npmenodes == -1) + { + cr->npmenodes = 0; + } + acs = average_cellsize_min(dd, ddbox); + if (acs < comm->cellsize_limit) + { + if (fplog) + { + fprintf(fplog, "ERROR: The initial cell size (%f) is smaller than the cell size limit (%f)\n", acs, comm->cellsize_limit); + } + gmx_fatal_collective(FARGS, cr, NULL, + "The initial cell size (%f) is smaller than the cell size limit (%f), change options -dd, -rdd or -rcon, see the log file for details", + acs, comm->cellsize_limit); + } + } + else + { + set_ddbox_cr(cr, NULL, ir, box, &comm->cgs_gl, x, ddbox); + + /* We need to choose the optimal DD grid and possibly PME nodes */ + limit = dd_choose_grid(fplog, cr, dd, ir, mtop, box, ddbox, + comm->eDLB != edlbNO, dlb_scale, + comm->cellsize_limit, comm->cutoff, + comm->bInterCGBondeds, comm->bInterCGMultiBody); + + if (dd->nc[XX] == 0) + { + bC = (dd->bInterCGcons && rconstr > r_bonded_limit); + sprintf(buf, "Change the number of nodes or mdrun option %s%s%s", + !bC ? "-rdd" : "-rcon", + comm->eDLB != edlbNO ? " or -dds" : "", + bC ? " or your LINCS settings" : ""); + + gmx_fatal_collective(FARGS, cr, NULL, + "There is no domain decomposition for %d nodes that is compatible with the given box and a minimum cell size of %g nm\n" + "%s\n" + "Look in the log file for details on the domain decomposition", + cr->nnodes-cr->npmenodes, limit, buf); + } + set_dd_dim(fplog, dd); + } + + if (fplog) + { + fprintf(fplog, + "Domain decomposition grid %d x %d x %d, separate PME nodes %d\n", + dd->nc[XX], dd->nc[YY], dd->nc[ZZ], cr->npmenodes); + } + + dd->nnodes = dd->nc[XX]*dd->nc[YY]*dd->nc[ZZ]; + if (cr->nnodes - dd->nnodes != cr->npmenodes) + { + gmx_fatal_collective(FARGS, cr, NULL, + "The size of the domain decomposition grid (%d) does not match the number of nodes (%d). The total number of nodes is %d", + dd->nnodes, cr->nnodes - cr->npmenodes, cr->nnodes); + } + if (cr->npmenodes > dd->nnodes) + { + gmx_fatal_collective(FARGS, cr, NULL, + "The number of separate PME nodes (%d) is larger than the number of PP nodes (%d), this is not supported.", cr->npmenodes, dd->nnodes); + } + if (cr->npmenodes > 0) + { + comm->npmenodes = cr->npmenodes; + } + else + { + comm->npmenodes = dd->nnodes; + } + + if (EEL_PME(ir->coulombtype)) + { + /* The following choices should match those + * in comm_cost_est in domdec_setup.c. + * Note that here the checks have to take into account + * that the decomposition might occur in a different order than xyz + * (for instance through the env.var. GMX_DD_ORDER_ZYX), + * in which case they will not match those in comm_cost_est, + * but since that is mainly for testing purposes that's fine. + */ + if (dd->ndim >= 2 && dd->dim[0] == XX && dd->dim[1] == YY && + comm->npmenodes > dd->nc[XX] && comm->npmenodes % dd->nc[XX] == 0 && + getenv("GMX_PMEONEDD") == NULL) + { + comm->npmedecompdim = 2; + comm->npmenodes_x = dd->nc[XX]; + comm->npmenodes_y = comm->npmenodes/comm->npmenodes_x; + } + else + { + /* In case nc is 1 in both x and y we could still choose to + * decompose pme in y instead of x, but we use x for simplicity. 
+             */
+            comm->npmedecompdim = 1;
+            if (dd->dim[0] == YY)
+            {
+                comm->npmenodes_x = 1;
+                comm->npmenodes_y = comm->npmenodes;
+            }
+            else
+            {
+                comm->npmenodes_x = comm->npmenodes;
+                comm->npmenodes_y = 1;
+            }
+        }
+        if (fplog)
+        {
+            fprintf(fplog, "PME domain decomposition: %d x %d x %d\n",
+                    comm->npmenodes_x, comm->npmenodes_y, 1);
+        }
+    }
+    else
+    {
+        comm->npmedecompdim = 0;
+        comm->npmenodes_x   = 0;
+        comm->npmenodes_y   = 0;
+    }
+
+    /* Technically we don't need both of these,
+     * but it simplifies code not having to recalculate it.
+     */
+    *npme_x = comm->npmenodes_x;
+    *npme_y = comm->npmenodes_y;
+
+    snew(comm->slb_frac, DIM);
+    if (comm->eDLB == edlbNO)
+    {
+        comm->slb_frac[XX] = get_slb_frac(fplog, "x", dd->nc[XX], sizex);
+        comm->slb_frac[YY] = get_slb_frac(fplog, "y", dd->nc[YY], sizey);
+        comm->slb_frac[ZZ] = get_slb_frac(fplog, "z", dd->nc[ZZ], sizez);
+    }
+
+    if (comm->bInterCGBondeds && comm->cutoff_mbody == 0)
+    {
+        if (comm->bBondComm || comm->eDLB != edlbNO)
+        {
+            /* Set the bonded communication distance to halfway
+             * the minimum and the maximum,
+             * since the extra communication cost is nearly zero.
+             */
+            acs                = average_cellsize_min(dd, ddbox);
+            comm->cutoff_mbody = 0.5*(r_bonded + acs);
+            if (comm->eDLB != edlbNO)
+            {
+                /* Check if this does not limit the scaling */
+                comm->cutoff_mbody = min(comm->cutoff_mbody, dlb_scale*acs);
+            }
+            if (!comm->bBondComm)
+            {
+                /* Without bBondComm do not go beyond the n.b. cut-off */
+                comm->cutoff_mbody = min(comm->cutoff_mbody, comm->cutoff);
+                if (comm->cellsize_limit >= comm->cutoff)
+                {
+                    /* We don't lose a lot of efficiency
+                     * when increasing it to the n.b. cut-off.
+                     * It can even be slightly faster, because we need
+                     * fewer checks for the communication setup.
+                     */
+                    comm->cutoff_mbody = comm->cutoff;
+                }
+            }
+            /* Check if we did not end up below our original limit */
+            comm->cutoff_mbody = max(comm->cutoff_mbody, r_bonded_limit);
+
+            if (comm->cutoff_mbody > comm->cellsize_limit)
+            {
+                comm->cellsize_limit = comm->cutoff_mbody;
+            }
+        }
+        /* Without DLB and cutoff_mbody<cutoff, cutoff_mbody is dynamic */
+    }
+
+    if (debug)
+    {
+        fprintf(debug, "Bonded atom communication beyond the cut-off: %d\n"
+                "cellsize limit %f\n",
+                comm->bBondComm, comm->cellsize_limit);
+    }
+
+    if (MASTER(cr))
+    {
+        check_dd_restrictions(cr, dd, ir, fplog);
+    }
+
+    comm->partition_step = INT_MIN;
+    dd->ddp_count        = 0;
+
+    clear_dd_cycle_counts(dd);
+
+    return dd;
+}
+
+static void set_dlb_limits(gmx_domdec_t *dd)
+{
+    int d;
+
+    for (d = 0; d < dd->ndim; d++)
+    {
+        dd->comm->cd[d].np                 = dd->comm->cd[d].np_dlb;
+        dd->comm->cellsize_min[dd->dim[d]] =
+            dd->comm->cellsize_min_dlb[dd->dim[d]];
+    }
+}
+
+
+static void turn_on_dlb(FILE *fplog, t_commrec *cr, gmx_large_int_t step)
+{
+    gmx_domdec_t      *dd;
+    gmx_domdec_comm_t *comm;
+    real               cellsize_min;
+    int                d, nc, i;
+    char               buf[STRLEN];
+
+    dd   = cr->dd;
+    comm = dd->comm;
+
+    if (fplog)
+    {
+        fprintf(fplog, "At step %s the performance loss due to force load imbalance is %.1f %%\n", gmx_step_str(step, buf), dd_force_imb_perf_loss(dd)*100);
+    }
+
+    cellsize_min = comm->cellsize_min[dd->dim[0]];
+    for (d = 1; d < dd->ndim; d++)
+    {
+        cellsize_min = min(cellsize_min, comm->cellsize_min[dd->dim[d]]);
+    }
+
+    if (cellsize_min < comm->cellsize_limit*1.05)
+    {
+        dd_warning(cr, fplog, "NOTE: the minimum cell size is smaller than 1.05 times the cell size limit, will not turn on dynamic load balancing\n");
+
+        /* Change DLB from "auto" to "no".
*/ + comm->eDLB = edlbNO; + + return; + } + + dd_warning(cr, fplog, "NOTE: Turning on dynamic load balancing\n"); + comm->bDynLoadBal = TRUE; + dd->bGridJump = TRUE; + + set_dlb_limits(dd); + + /* We can set the required cell size info here, + * so we do not need to communicate this. + * The grid is completely uniform. + */ + for (d = 0; d < dd->ndim; d++) + { + if (comm->root[d]) + { + comm->load[d].sum_m = comm->load[d].sum; + + nc = dd->nc[dd->dim[d]]; + for (i = 0; i < nc; i++) + { + comm->root[d]->cell_f[i] = i/(real)nc; + if (d > 0) + { + comm->root[d]->cell_f_max0[i] = i /(real)nc; + comm->root[d]->cell_f_min1[i] = (i+1)/(real)nc; + } + } + comm->root[d]->cell_f[nc] = 1.0; + } + } +} + +static char *init_bLocalCG(gmx_mtop_t *mtop) +{ + int ncg, cg; + char *bLocalCG; + + ncg = ncg_mtop(mtop); + snew(bLocalCG, ncg); + for (cg = 0; cg < ncg; cg++) + { + bLocalCG[cg] = FALSE; + } + + return bLocalCG; +} + +void dd_init_bondeds(FILE *fplog, + gmx_domdec_t *dd, gmx_mtop_t *mtop, + gmx_vsite_t *vsite, gmx_constr_t constr, + t_inputrec *ir, gmx_bool bBCheck, cginfo_mb_t *cginfo_mb) +{ + gmx_domdec_comm_t *comm; + gmx_bool bBondComm; + int d; + + dd_make_reverse_top(fplog, dd, mtop, vsite, constr, ir, bBCheck); + + comm = dd->comm; + + if (comm->bBondComm) + { + /* Communicate atoms beyond the cut-off for bonded interactions */ + comm = dd->comm; + + comm->cglink = make_charge_group_links(mtop, dd, cginfo_mb); + + comm->bLocalCG = init_bLocalCG(mtop); + } + else + { + /* Only communicate atoms based on cut-off */ + comm->cglink = NULL; + comm->bLocalCG = NULL; + } +} + +static void print_dd_settings(FILE *fplog, gmx_domdec_t *dd, + t_inputrec *ir, + gmx_bool bDynLoadBal, real dlb_scale, + gmx_ddbox_t *ddbox) +{ + gmx_domdec_comm_t *comm; + int d; + ivec np; + real limit, shrink; + char buf[64]; + + if (fplog == NULL) + { + return; + } + + comm = dd->comm; + + if (bDynLoadBal) + { + fprintf(fplog, "The maximum number of communication pulses is:"); + for (d = 0; d < dd->ndim; d++) + { + fprintf(fplog, " %c %d", dim2char(dd->dim[d]), comm->cd[d].np_dlb); + } + fprintf(fplog, "\n"); + fprintf(fplog, "The minimum size for domain decomposition cells is %.3f nm\n", comm->cellsize_limit); + fprintf(fplog, "The requested allowed shrink of DD cells (option -dds) is: %.2f\n", dlb_scale); + fprintf(fplog, "The allowed shrink of domain decomposition cells is:"); + for (d = 0; d < DIM; d++) + { + if (dd->nc[d] > 1) + { + if (d >= ddbox->npbcdim && dd->nc[d] == 2) + { + shrink = 0; + } + else + { + shrink = + comm->cellsize_min_dlb[d]/ + (ddbox->box_size[d]*ddbox->skew_fac[d]/dd->nc[d]); + } + fprintf(fplog, " %c %.2f", dim2char(d), shrink); + } + } + fprintf(fplog, "\n"); + } + else + { + set_dd_cell_sizes_slb(dd, ddbox, FALSE, np); + fprintf(fplog, "The initial number of communication pulses is:"); + for (d = 0; d < dd->ndim; d++) + { + fprintf(fplog, " %c %d", dim2char(dd->dim[d]), np[dd->dim[d]]); + } + fprintf(fplog, "\n"); + fprintf(fplog, "The initial domain decomposition cell size is:"); + for (d = 0; d < DIM; d++) + { + if (dd->nc[d] > 1) + { + fprintf(fplog, " %c %.2f nm", + dim2char(d), dd->comm->cellsize_min[d]); + } + } + fprintf(fplog, "\n\n"); + } + + if (comm->bInterCGBondeds || dd->vsite_comm || dd->constraint_comm) + { + fprintf(fplog, "The maximum allowed distance for charge groups involved in interactions is:\n"); + fprintf(fplog, "%40s %-7s %6.3f nm\n", + "non-bonded interactions", "", comm->cutoff); + + if (bDynLoadBal) + { + limit = dd->comm->cellsize_limit; + } + else + { + if 
(dynamic_dd_box(ddbox, ir)) + { + fprintf(fplog, "(the following are initial values, they could change due to box deformation)\n"); + } + limit = dd->comm->cellsize_min[XX]; + for (d = 1; d < DIM; d++) + { + limit = min(limit, dd->comm->cellsize_min[d]); + } + } + + if (comm->bInterCGBondeds) + { + fprintf(fplog, "%40s %-7s %6.3f nm\n", + "two-body bonded interactions", "(-rdd)", + max(comm->cutoff, comm->cutoff_mbody)); + fprintf(fplog, "%40s %-7s %6.3f nm\n", + "multi-body bonded interactions", "(-rdd)", + (comm->bBondComm || dd->bGridJump) ? comm->cutoff_mbody : min(comm->cutoff, limit)); + } + if (dd->vsite_comm) + { + fprintf(fplog, "%40s %-7s %6.3f nm\n", + "virtual site constructions", "(-rcon)", limit); + } + if (dd->constraint_comm) + { + sprintf(buf, "atoms separated by up to %d constraints", + 1+ir->nProjOrder); + fprintf(fplog, "%40s %-7s %6.3f nm\n", + buf, "(-rcon)", limit); + } + fprintf(fplog, "\n"); + } + + fflush(fplog); +} + +static void set_cell_limits_dlb(gmx_domdec_t *dd, + real dlb_scale, + const t_inputrec *ir, + const gmx_ddbox_t *ddbox) +{ + gmx_domdec_comm_t *comm; + int d, dim, npulse, npulse_d_max, npulse_d; + gmx_bool bNoCutOff; + + comm = dd->comm; + + bNoCutOff = (ir->rvdw == 0 || ir->rcoulomb == 0); + + /* Determine the maximum number of comm. pulses in one dimension */ + + comm->cellsize_limit = max(comm->cellsize_limit, comm->cutoff_mbody); + + /* Determine the maximum required number of grid pulses */ + if (comm->cellsize_limit >= comm->cutoff) + { + /* Only a single pulse is required */ + npulse = 1; + } + else if (!bNoCutOff && comm->cellsize_limit > 0) + { + /* We round down slightly here to avoid overhead due to the latency + * of extra communication calls when the cut-off + * would be only slightly longer than the cell size. + * Later cellsize_limit is redetermined, + * so we can not miss interactions due to this rounding. 
+ */ + npulse = (int)(0.96 + comm->cutoff/comm->cellsize_limit); + } + else + { + /* There is no cell size limit */ + npulse = max(dd->nc[XX]-1, max(dd->nc[YY]-1, dd->nc[ZZ]-1)); + } + + if (!bNoCutOff && npulse > 1) + { + /* See if we can do with less pulses, based on dlb_scale */ + npulse_d_max = 0; + for (d = 0; d < dd->ndim; d++) + { + dim = dd->dim[d]; + npulse_d = (int)(1 + dd->nc[dim]*comm->cutoff + /(ddbox->box_size[dim]*ddbox->skew_fac[dim]*dlb_scale)); + npulse_d_max = max(npulse_d_max, npulse_d); + } + npulse = min(npulse, npulse_d_max); + } + + /* This env var can override npulse */ + d = dd_nst_env(debug, "GMX_DD_NPULSE", 0); + if (d > 0) + { + npulse = d; + } + + comm->maxpulse = 1; + comm->bVacDLBNoLimit = (ir->ePBC == epbcNONE); + for (d = 0; d < dd->ndim; d++) + { + comm->cd[d].np_dlb = min(npulse, dd->nc[dd->dim[d]]-1); + comm->cd[d].np_nalloc = comm->cd[d].np_dlb; + snew(comm->cd[d].ind, comm->cd[d].np_nalloc); + comm->maxpulse = max(comm->maxpulse, comm->cd[d].np_dlb); + if (comm->cd[d].np_dlb < dd->nc[dd->dim[d]]-1) + { + comm->bVacDLBNoLimit = FALSE; + } + } + + /* cellsize_limit is set for LINCS in init_domain_decomposition */ + if (!comm->bVacDLBNoLimit) + { + comm->cellsize_limit = max(comm->cellsize_limit, + comm->cutoff/comm->maxpulse); + } + comm->cellsize_limit = max(comm->cellsize_limit, comm->cutoff_mbody); + /* Set the minimum cell size for each DD dimension */ + for (d = 0; d < dd->ndim; d++) + { + if (comm->bVacDLBNoLimit || + comm->cd[d].np_dlb*comm->cellsize_limit >= comm->cutoff) + { + comm->cellsize_min_dlb[dd->dim[d]] = comm->cellsize_limit; + } + else + { + comm->cellsize_min_dlb[dd->dim[d]] = + comm->cutoff/comm->cd[d].np_dlb; + } + } + if (comm->cutoff_mbody <= 0) + { + comm->cutoff_mbody = min(comm->cutoff, comm->cellsize_limit); + } + if (comm->bDynLoadBal) + { + set_dlb_limits(dd); + } +} + +gmx_bool dd_bonded_molpbc(gmx_domdec_t *dd, int ePBC) +{ + /* If each molecule is a single charge group + * or we use domain decomposition for each periodic dimension, + * we do not need to take pbc into account for the bonded interactions. + */ + return (ePBC != epbcNONE && dd->comm->bInterCGBondeds && + !(dd->nc[XX] > 1 && + dd->nc[YY] > 1 && + (dd->nc[ZZ] > 1 || ePBC == epbcXY))); +} + +void set_dd_parameters(FILE *fplog, gmx_domdec_t *dd, real dlb_scale, + t_inputrec *ir, t_forcerec *fr, + gmx_ddbox_t *ddbox) +{ + gmx_domdec_comm_t *comm; + int natoms_tot; + real vol_frac; + + comm = dd->comm; + + /* Initialize the thread data. + * This can not be done in init_domain_decomposition, + * as the numbers of threads is determined later. 
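The pulse-count estimate in set_cell_limits_dlb() above rounds cutoff/cellsize_limit up, but with a 0.96 rather than a 1.0 offset, so a cut-off only marginally longer than a multiple of the cell size does not cost an extra communication pulse. A minimal sketch of that formula with illustrative numbers (not the GROMACS API):

#include <stdio.h>

/* Sketch: number of neighbor-cell pulses so that npulse*cellsize
 * covers the cut-off; the 0.96 offset rounds down slightly when the
 * cut-off barely exceeds a multiple of the cell size. */
static int estimate_npulse(double cutoff, double cellsize_limit)
{
    if (cellsize_limit >= cutoff)
    {
        return 1; /* a single pulse always suffices */
    }
    return (int)(0.96 + cutoff/cellsize_limit);
}

int main(void)
{
    printf("%d\n", estimate_npulse(1.2, 1.3)); /* 1: cell larger than cut-off */
    printf("%d\n", estimate_npulse(1.2, 0.7)); /* 2 */
    printf("%d\n", estimate_npulse(1.4, 0.7)); /* 2, not 3: 0.96 avoids an extra pulse */
    return 0;
}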
+ */ + comm->nth = gmx_omp_nthreads_get(emntDomdec); + if (comm->nth > 1) + { + snew(comm->dth, comm->nth); + } + + if (EEL_PME(ir->coulombtype)) + { + init_ddpme(dd, &comm->ddpme[0], 0); + if (comm->npmedecompdim >= 2) + { + init_ddpme(dd, &comm->ddpme[1], 1); + } + } + else + { + comm->npmenodes = 0; + if (dd->pme_nodeid >= 0) + { + gmx_fatal_collective(FARGS, NULL, dd, + "Can not have separate PME nodes without PME electrostatics"); + } + } + + if (debug) + { + fprintf(debug, "The DD cut-off is %f\n", comm->cutoff); + } + if (comm->eDLB != edlbNO) + { + set_cell_limits_dlb(dd, dlb_scale, ir, ddbox); + } + + print_dd_settings(fplog, dd, ir, comm->bDynLoadBal, dlb_scale, ddbox); + if (comm->eDLB == edlbAUTO) + { + if (fplog) + { + fprintf(fplog, "When dynamic load balancing gets turned on, these settings will change to:\n"); + } + print_dd_settings(fplog, dd, ir, TRUE, dlb_scale, ddbox); + } + + if (ir->ePBC == epbcNONE) + { + vol_frac = 1 - 1/(double)dd->nnodes; + } + else + { + vol_frac = + (1 + comm_box_frac(dd->nc, comm->cutoff, ddbox))/(double)dd->nnodes; + } + if (debug) + { + fprintf(debug, "Volume fraction for all DD zones: %f\n", vol_frac); + } + natoms_tot = comm->cgs_gl.index[comm->cgs_gl.nr]; + + dd->ga2la = ga2la_init(natoms_tot, vol_frac*natoms_tot); +} + +static gmx_bool test_dd_cutoff(t_commrec *cr, + t_state *state, t_inputrec *ir, + real cutoff_req) +{ + gmx_domdec_t *dd; + gmx_ddbox_t ddbox; + int d, dim, np; + real inv_cell_size; + int LocallyLimited; + + dd = cr->dd; + + set_ddbox(dd, FALSE, cr, ir, state->box, + TRUE, &dd->comm->cgs_gl, state->x, &ddbox); + + LocallyLimited = 0; + + for (d = 0; d < dd->ndim; d++) + { + dim = dd->dim[d]; + + inv_cell_size = DD_CELL_MARGIN*dd->nc[dim]/ddbox.box_size[dim]; + if (dynamic_dd_box(&ddbox, ir)) + { + inv_cell_size *= DD_PRES_SCALE_MARGIN; + } + + np = 1 + (int)(cutoff_req*inv_cell_size*ddbox.skew_fac[dim]); + + if (dd->comm->eDLB != edlbNO && dim < ddbox.npbcdim && + dd->comm->cd[d].np_dlb > 0) + { + if (np > dd->comm->cd[d].np_dlb) + { + return FALSE; + } + + /* If a current local cell size is smaller than the requested + * cut-off, we could still fix it, but this gets very complicated. + * Without fixing here, we might actually need more checks. + */ + if ((dd->comm->cell_x1[dim] - dd->comm->cell_x0[dim])*ddbox.skew_fac[dim]*dd->comm->cd[d].np_dlb < cutoff_req) + { + LocallyLimited = 1; + } + } + } + + if (dd->comm->eDLB != edlbNO) + { + /* If DLB is not active yet, we don't need to check the grid jumps. + * Actually we shouldn't, because then the grid jump data is not set. 
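test_dd_cutoff() above accepts a requested cut-off only if the number of pulses it would need fits within the pulses allocated for DLB. A simplified sketch of that per-dimension test, with the margin and skew factors dropped (hypothetical names, illustrative values; not the actual GROMACS call):

#include <stdio.h>

/* Sketch: a cut-off request is representable when the pulses it needs
 * do not exceed the np_dlb pulses allocated per dimension. */
static int cutoff_fits(double cutoff_req, double cell_size, int np_dlb)
{
    int np = 1 + (int)(cutoff_req/cell_size); /* pulses needed */

    return np <= np_dlb;
}

int main(void)
{
    printf("%d\n", cutoff_fits(1.8, 1.0, 2)); /* 1: needs 2 pulses, 2 allowed */
    printf("%d\n", cutoff_fits(2.2, 1.0, 2)); /* 0: needs 3 pulses */
    return 0;
}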
+ */ + if (dd->comm->bDynLoadBal && + check_grid_jump(0, dd, cutoff_req, &ddbox, FALSE)) + { + LocallyLimited = 1; + } + + gmx_sumi(1, &LocallyLimited, cr); + + if (LocallyLimited > 0) + { + return FALSE; + } + } + + return TRUE; +} + +gmx_bool change_dd_cutoff(t_commrec *cr, t_state *state, t_inputrec *ir, + real cutoff_req) +{ + gmx_bool bCutoffAllowed; + + bCutoffAllowed = test_dd_cutoff(cr, state, ir, cutoff_req); + + if (bCutoffAllowed) + { + cr->dd->comm->cutoff = cutoff_req; + } + + return bCutoffAllowed; +} + +void change_dd_dlb_cutoff_limit(t_commrec *cr) +{ + gmx_domdec_comm_t *comm; + + comm = cr->dd->comm; + + /* Turn on the DLB limiting (might have been on already) */ + comm->bPMELoadBalDLBLimits = TRUE; + + /* Change the cut-off limit */ + comm->PMELoadBal_max_cutoff = comm->cutoff; +} + +static void merge_cg_buffers(int ncell, + gmx_domdec_comm_dim_t *cd, int pulse, + int *ncg_cell, + int *index_gl, int *recv_i, + rvec *cg_cm, rvec *recv_vr, + int *cgindex, + cginfo_mb_t *cginfo_mb, int *cginfo) +{ + gmx_domdec_ind_t *ind, *ind_p; + int p, cell, c, cg, cg0, cg1, cg_gl, nat; + int shift, shift_at; + + ind = &cd->ind[pulse]; + + /* First correct the already stored data */ + shift = ind->nrecv[ncell]; + for (cell = ncell-1; cell >= 0; cell--) + { + shift -= ind->nrecv[cell]; + if (shift > 0) + { + /* Move the cg's present from previous grid pulses */ + cg0 = ncg_cell[ncell+cell]; + cg1 = ncg_cell[ncell+cell+1]; + cgindex[cg1+shift] = cgindex[cg1]; + for (cg = cg1-1; cg >= cg0; cg--) + { + index_gl[cg+shift] = index_gl[cg]; + copy_rvec(cg_cm[cg], cg_cm[cg+shift]); + cgindex[cg+shift] = cgindex[cg]; + cginfo[cg+shift] = cginfo[cg]; + } + /* Correct the already stored send indices for the shift */ + for (p = 1; p <= pulse; p++) + { + ind_p = &cd->ind[p]; + cg0 = 0; + for (c = 0; c < cell; c++) + { + cg0 += ind_p->nsend[c]; + } + cg1 = cg0 + ind_p->nsend[cell]; + for (cg = cg0; cg < cg1; cg++) + { + ind_p->index[cg] += shift; + } + } + } + } + + /* Merge in the communicated buffers */ + shift = 0; + shift_at = 0; + cg0 = 0; + for (cell = 0; cell < ncell; cell++) + { + cg1 = ncg_cell[ncell+cell+1] + shift; + if (shift_at > 0) + { + /* Correct the old cg indices */ + for (cg = ncg_cell[ncell+cell]; cg < cg1; cg++) + { + cgindex[cg+1] += shift_at; + } + } + for (cg = 0; cg < ind->nrecv[cell]; cg++) + { + /* Copy this charge group from the buffer */ + index_gl[cg1] = recv_i[cg0]; + copy_rvec(recv_vr[cg0], cg_cm[cg1]); + /* Add it to the cgindex */ + cg_gl = index_gl[cg1]; + cginfo[cg1] = ddcginfo(cginfo_mb, cg_gl); + nat = GET_CGINFO_NATOMS(cginfo[cg1]); + cgindex[cg1+1] = cgindex[cg1] + nat; + cg0++; + cg1++; + shift_at += nat; + } + shift += ind->nrecv[cell]; + ncg_cell[ncell+cell+1] = cg1; + } +} + +static void make_cell2at_index(gmx_domdec_comm_dim_t *cd, + int nzone, int cg0, const int *cgindex) +{ + int cg, zone, p; + + /* Store the atom block boundaries for easy copying of communication buffers + */ + cg = cg0; + for (zone = 0; zone < nzone; zone++) + { + for (p = 0; p < cd->np; p++) + { + cd->ind[p].cell2at0[zone] = cgindex[cg]; + cg += cd->ind[p].nrecv[zone]; + cd->ind[p].cell2at1[zone] = cgindex[cg]; + } + } +} + +static gmx_bool missing_link(t_blocka *link, int cg_gl, char *bLocalCG) +{ + int i; + gmx_bool bMiss; + + bMiss = FALSE; + for (i = link->index[cg_gl]; i < link->index[cg_gl+1]; i++) + { + if (!bLocalCG[link->a[i]]) + { + bMiss = TRUE; + } + } + + return bMiss; +} + +/* Domain corners for communication, a maximum of 4 i-zones see a j domain */ +typedef struct { + 
real c[DIM][4]; /* the corners for the non-bonded communication */ + real cr0; /* corner for rounding */ + real cr1[4]; /* corners for rounding */ + real bc[DIM]; /* corners for bounded communication */ + real bcr1; /* corner for rounding for bonded communication */ +} dd_corners_t; + +/* Determine the corners of the domain(s) we are communicating with */ +static void +set_dd_corners(const gmx_domdec_t *dd, + int dim0, int dim1, int dim2, + gmx_bool bDistMB, + dd_corners_t *c) +{ + const gmx_domdec_comm_t *comm; + const gmx_domdec_zones_t *zones; + int i, j; + + comm = dd->comm; + + zones = &comm->zones; + + /* Keep the compiler happy */ + c->cr0 = 0; + c->bcr1 = 0; + + /* The first dimension is equal for all cells */ + c->c[0][0] = comm->cell_x0[dim0]; + if (bDistMB) + { + c->bc[0] = c->c[0][0]; + } + if (dd->ndim >= 2) + { + dim1 = dd->dim[1]; + /* This cell row is only seen from the first row */ + c->c[1][0] = comm->cell_x0[dim1]; + /* All rows can see this row */ + c->c[1][1] = comm->cell_x0[dim1]; + if (dd->bGridJump) + { + c->c[1][1] = max(comm->cell_x0[dim1], comm->zone_d1[1].mch0); + if (bDistMB) + { + /* For the multi-body distance we need the maximum */ + c->bc[1] = max(comm->cell_x0[dim1], comm->zone_d1[1].p1_0); + } + } + /* Set the upper-right corner for rounding */ + c->cr0 = comm->cell_x1[dim0]; + + if (dd->ndim >= 3) + { + dim2 = dd->dim[2]; + for (j = 0; j < 4; j++) + { + c->c[2][j] = comm->cell_x0[dim2]; + } + if (dd->bGridJump) + { + /* Use the maximum of the i-cells that see a j-cell */ + for (i = 0; i < zones->nizone; i++) + { + for (j = zones->izone[i].j0; j < zones->izone[i].j1; j++) + { + if (j >= 4) + { + c->c[2][j-4] = + max(c->c[2][j-4], + comm->zone_d2[zones->shift[i][dim0]][zones->shift[i][dim1]].mch0); + } + } + } + if (bDistMB) + { + /* For the multi-body distance we need the maximum */ + c->bc[2] = comm->cell_x0[dim2]; + for (i = 0; i < 2; i++) + { + for (j = 0; j < 2; j++) + { + c->bc[2] = max(c->bc[2], comm->zone_d2[i][j].p1_0); + } + } + } + } + + /* Set the upper-right corner for rounding */ + /* Cell (0,0,0) and cell (1,0,0) can see cell 4 (0,1,1) + * Only cell (0,0,0) can see cell 7 (1,1,1) + */ + c->cr1[0] = comm->cell_x1[dim1]; + c->cr1[3] = comm->cell_x1[dim1]; + if (dd->bGridJump) + { + c->cr1[0] = max(comm->cell_x1[dim1], comm->zone_d1[1].mch1); + if (bDistMB) + { + /* For the multi-body distance we need the maximum */ + c->bcr1 = max(comm->cell_x1[dim1], comm->zone_d1[1].p1_1); + } + } + } + } +} + +/* Determine which cg's we need to send in this pulse from this zone */ +static void +get_zone_pulse_cgs(gmx_domdec_t *dd, + int zonei, int zone, + int cg0, int cg1, + const int *index_gl, + const int *cgindex, + int dim, int dim_ind, + int dim0, int dim1, int dim2, + real r_comm2, real r_bcomm2, + matrix box, + ivec tric_dist, + rvec *normal, + real skew_fac2_d, real skew_fac_01, + rvec *v_d, rvec *v_0, rvec *v_1, + const dd_corners_t *c, + rvec sf2_round, + gmx_bool bDistBonded, + gmx_bool bBondComm, + gmx_bool bDist2B, + gmx_bool bDistMB, + rvec *cg_cm, + int *cginfo, + gmx_domdec_ind_t *ind, + int **ibuf, int *ibuf_nalloc, + vec_rvec_t *vbuf, + int *nsend_ptr, + int *nat_ptr, + int *nsend_z_ptr) +{ + gmx_domdec_comm_t *comm; + gmx_bool bScrew; + gmx_bool bDistMB_pulse; + int cg, i; + real r2, rb2, r, tric_sh; + rvec rn, rb; + int dimd; + int nsend_z, nsend, nat; + + comm = dd->comm; + + bScrew = (dd->bScrewPBC && dim == XX); + + bDistMB_pulse = (bDistMB && bDistBonded); + + nsend_z = 0; + nsend = *nsend_ptr; + nat = *nat_ptr; + + for (cg = cg0; cg 
< cg1; cg++) + { + r2 = 0; + rb2 = 0; + if (tric_dist[dim_ind] == 0) + { + /* Rectangular direction, easy */ + r = cg_cm[cg][dim] - c->c[dim_ind][zone]; + if (r > 0) + { + r2 += r*r; + } + if (bDistMB_pulse) + { + r = cg_cm[cg][dim] - c->bc[dim_ind]; + if (r > 0) + { + rb2 += r*r; + } + } + /* Rounding gives at most a 16% reduction + * in communicated atoms + */ + if (dim_ind >= 1 && (zonei == 1 || zonei == 2)) + { + r = cg_cm[cg][dim0] - c->cr0; + /* This is the first dimension, so always r >= 0 */ + r2 += r*r; + if (bDistMB_pulse) + { + rb2 += r*r; + } + } + if (dim_ind == 2 && (zonei == 2 || zonei == 3)) + { + r = cg_cm[cg][dim1] - c->cr1[zone]; + if (r > 0) + { + r2 += r*r; + } + if (bDistMB_pulse) + { + r = cg_cm[cg][dim1] - c->bcr1; + if (r > 0) + { + rb2 += r*r; + } + } + } + } + else + { + /* Triclinic direction, more complicated */ + clear_rvec(rn); + clear_rvec(rb); + /* Rounding, conservative as the skew_fac multiplication + * will slightly underestimate the distance. + */ + if (dim_ind >= 1 && (zonei == 1 || zonei == 2)) + { + rn[dim0] = cg_cm[cg][dim0] - c->cr0; + for (i = dim0+1; i < DIM; i++) + { + rn[dim0] -= cg_cm[cg][i]*v_0[i][dim0]; + } + r2 = rn[dim0]*rn[dim0]*sf2_round[dim0]; + if (bDistMB_pulse) + { + rb[dim0] = rn[dim0]; + rb2 = r2; + } + /* Take care that the cell planes along dim0 might not + * be orthogonal to those along dim1 and dim2. + */ + for (i = 1; i <= dim_ind; i++) + { + dimd = dd->dim[i]; + if (normal[dim0][dimd] > 0) + { + rn[dimd] -= rn[dim0]*normal[dim0][dimd]; + if (bDistMB_pulse) + { + rb[dimd] -= rb[dim0]*normal[dim0][dimd]; + } + } + } + } + if (dim_ind == 2 && (zonei == 2 || zonei == 3)) + { + rn[dim1] += cg_cm[cg][dim1] - c->cr1[zone]; + tric_sh = 0; + for (i = dim1+1; i < DIM; i++) + { + tric_sh -= cg_cm[cg][i]*v_1[i][dim1]; + } + rn[dim1] += tric_sh; + if (rn[dim1] > 0) + { + r2 += rn[dim1]*rn[dim1]*sf2_round[dim1]; + /* Take care of coupling of the distances + * to the planes along dim0 and dim1 through dim2. + */ + r2 -= rn[dim0]*rn[dim1]*skew_fac_01; + /* Take care that the cell planes along dim1 + * might not be orthogonal to that along dim2. + */ + if (normal[dim1][dim2] > 0) + { + rn[dim2] -= rn[dim1]*normal[dim1][dim2]; + } + } + if (bDistMB_pulse) + { + rb[dim1] += + cg_cm[cg][dim1] - c->bcr1 + tric_sh; + if (rb[dim1] > 0) + { + rb2 += rb[dim1]*rb[dim1]*sf2_round[dim1]; + /* Take care of coupling of the distances + * to the planes along dim0 and dim1 through dim2. + */ + rb2 -= rb[dim0]*rb[dim1]*skew_fac_01; + /* Take care that the cell planes along dim1 + * might not be orthogonal to that along dim2. + */ + if (normal[dim1][dim2] > 0) + { + rb[dim2] -= rb[dim1]*normal[dim1][dim2]; + } + } + } + } + /* The distance along the communication direction */ + rn[dim] += cg_cm[cg][dim] - c->c[dim_ind][zone]; + tric_sh = 0; + for (i = dim+1; i < DIM; i++) + { + tric_sh -= cg_cm[cg][i]*v_d[i][dim]; + } + rn[dim] += tric_sh; + if (rn[dim] > 0) + { + r2 += rn[dim]*rn[dim]*skew_fac2_d; + /* Take care of coupling of the distances + * to the planes along dim0 and dim1 through dim2. + */ + if (dim_ind == 1 && zonei == 1) + { + r2 -= rn[dim0]*rn[dim]*skew_fac_01; + } + } + if (bDistMB_pulse) + { + clear_rvec(rb); + rb[dim] += cg_cm[cg][dim] - c->bc[dim_ind] + tric_sh; + if (rb[dim] > 0) + { + rb2 += rb[dim]*rb[dim]*skew_fac2_d; + /* Take care of coupling of the distances + * to the planes along dim0 and dim1 through dim2. 
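The rectangular branch of get_zone_pulse_cgs() above accumulates a squared distance to the zone corner, counting only the coordinates that lie beyond each corner plane. A self-contained sketch of that test (hypothetical names; the 2-D values are made up):

#include <stdio.h>

/* Sketch of the rectangular corner test: only components beyond the
 * corner plane contribute to the squared distance, which is compared
 * against the squared communication cut-off. */
static int within_comm_range(const double x[3], const double corner[3],
                             int ndim, double r_comm2)
{
    double r2 = 0;
    int    d;

    for (d = 0; d < ndim; d++)
    {
        double r = x[d] - corner[d];

        if (r > 0)
        {
            r2 += r*r; /* this coordinate is beyond the corner plane */
        }
    }
    return r2 < r_comm2;
}

int main(void)
{
    double x[3]      = {1.1, 0.4, 0.0};
    double corner[3] = {1.0, 0.8, 0.0};

    /* only x contributes: 0.1^2 = 0.01 < 0.3^2 = 0.09, so communicate */
    printf("send: %d\n", within_comm_range(x, corner, 2, 0.3*0.3));
    return 0;
}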
+ */ + if (dim_ind == 1 && zonei == 1) + { + rb2 -= rb[dim0]*rb[dim]*skew_fac_01; + } + } + } + } + + if (r2 < r_comm2 || + (bDistBonded && + ((bDistMB && rb2 < r_bcomm2) || + (bDist2B && r2 < r_bcomm2)) && + (!bBondComm || + (GET_CGINFO_BOND_INTER(cginfo[cg]) && + missing_link(comm->cglink, index_gl[cg], + comm->bLocalCG))))) + { + /* Make an index to the local charge groups */ + if (nsend+1 > ind->nalloc) + { + ind->nalloc = over_alloc_large(nsend+1); + srenew(ind->index, ind->nalloc); + } + if (nsend+1 > *ibuf_nalloc) + { + *ibuf_nalloc = over_alloc_large(nsend+1); + srenew(*ibuf, *ibuf_nalloc); + } + ind->index[nsend] = cg; + (*ibuf)[nsend] = index_gl[cg]; + nsend_z++; + vec_rvec_check_alloc(vbuf, nsend+1); + + if (dd->ci[dim] == 0) + { + /* Correct cg_cm for pbc */ + rvec_add(cg_cm[cg], box[dim], vbuf->v[nsend]); + if (bScrew) + { + vbuf->v[nsend][YY] = box[YY][YY] - vbuf->v[nsend][YY]; + vbuf->v[nsend][ZZ] = box[ZZ][ZZ] - vbuf->v[nsend][ZZ]; + } + } + else + { + copy_rvec(cg_cm[cg], vbuf->v[nsend]); + } + nsend++; + nat += cgindex[cg+1] - cgindex[cg]; + } + } + + *nsend_ptr = nsend; + *nat_ptr = nat; + *nsend_z_ptr = nsend_z; +} + +static void setup_dd_communication(gmx_domdec_t *dd, + matrix box, gmx_ddbox_t *ddbox, + t_forcerec *fr, t_state *state, rvec **f) +{ + int dim_ind, dim, dim0, dim1, dim2, dimd, p, nat_tot; + int nzone, nzone_send, zone, zonei, cg0, cg1; + int c, i, j, cg, cg_gl, nrcg; + int *zone_cg_range, pos_cg, *index_gl, *cgindex, *recv_i; + gmx_domdec_comm_t *comm; + gmx_domdec_zones_t *zones; + gmx_domdec_comm_dim_t *cd; + gmx_domdec_ind_t *ind; + cginfo_mb_t *cginfo_mb; + gmx_bool bBondComm, bDist2B, bDistMB, bDistBonded; + real r_mb, r_comm2, r_scomm2, r_bcomm2, r_0, r_1, r2inc, inv_ncg; + dd_corners_t corners; + ivec tric_dist; + rvec *cg_cm, *normal, *v_d, *v_0 = NULL, *v_1 = NULL, *recv_vr; + real skew_fac2_d, skew_fac_01; + rvec sf2_round; + int nsend, nat; + int th; + + if (debug) + { + fprintf(debug, "Setting up DD communication\n"); + } + + comm = dd->comm; + + switch (fr->cutoff_scheme) + { + case ecutsGROUP: + cg_cm = fr->cg_cm; + break; + case ecutsVERLET: + cg_cm = state->x; + break; + default: + gmx_incons("unimplemented"); + cg_cm = NULL; + } + + for (dim_ind = 0; dim_ind < dd->ndim; dim_ind++) + { + dim = dd->dim[dim_ind]; + + /* Check if we need to use triclinic distances */ + tric_dist[dim_ind] = 0; + for (i = 0; i <= dim_ind; i++) + { + if (ddbox->tric_dir[dd->dim[i]]) + { + tric_dist[dim_ind] = 1; + } + } + } + + bBondComm = comm->bBondComm; + + /* Do we need to determine extra distances for multi-body bondeds? */ + bDistMB = (comm->bInterCGMultiBody && dd->bGridJump && dd->ndim > 1); + + /* Do we need to determine extra distances for only two-body bondeds? */ + bDist2B = (bBondComm && !bDistMB); + + r_comm2 = sqr(comm->cutoff); + r_bcomm2 = sqr(comm->cutoff_mbody); + + if (debug) + { + fprintf(debug, "bBondComm %d, r_bc %f\n", bBondComm, sqrt(r_bcomm2)); + } + + zones = &comm->zones; + + dim0 = dd->dim[0]; + dim1 = (dd->ndim >= 2 ? dd->dim[1] : -1); + dim2 = (dd->ndim >= 3 ? dd->dim[2] : -1); + + set_dd_corners(dd, dim0, dim1, dim2, bDistMB, &corners); + + /* Triclinic stuff */ + normal = ddbox->normal; + skew_fac_01 = 0; + if (dd->ndim >= 2) + { + v_0 = ddbox->v[dim0]; + if (ddbox->tric_dir[dim0] && ddbox->tric_dir[dim1]) + { + /* Determine the coupling coefficient for the distances + * to the cell planes along dim0 and dim1 through dim2. + * This is required for correct rounding. 
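The send path above grows its index and coordinate buffers with over_alloc_large() + srenew() rather than by one element at a time. A sketch of that geometric-growth pattern in plain C (the 1.19 factor mirrors GROMACS's OVER_ALLOC_FAC, the additive constant here is made up, and error handling is omitted):

#include <stdio.h>
#include <stdlib.h>

/* Sketch: grow capacity geometrically so repeated appends amortize to
 * O(n) reallocation cost instead of O(n^2). */
static int *grow_ibuf(int *buf, int needed, int *nalloc)
{
    if (needed > *nalloc)
    {
        *nalloc = (int)(1.19*needed) + 100; /* geometric headroom */
        buf     = realloc(buf, *nalloc*sizeof(*buf));
    }
    return buf;
}

int main(void)
{
    int *buf = NULL, nalloc = 0, n;

    for (n = 1; n <= 1000; n++)
    {
        buf      = grow_ibuf(buf, n, &nalloc); /* reallocates only a few times */
        buf[n-1] = n;
    }
    printf("final capacity: %d\n", nalloc);
    free(buf);
    return 0;
}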
+ */ + skew_fac_01 = + ddbox->v[dim0][dim1+1][dim0]*ddbox->v[dim1][dim1+1][dim1]; + if (debug) + { + fprintf(debug, "\nskew_fac_01 %f\n", skew_fac_01); + } + } + } + if (dd->ndim >= 3) + { + v_1 = ddbox->v[dim1]; + } + + zone_cg_range = zones->cg_range; + index_gl = dd->index_gl; + cgindex = dd->cgindex; + cginfo_mb = fr->cginfo_mb; + + zone_cg_range[0] = 0; + zone_cg_range[1] = dd->ncg_home; + comm->zone_ncg1[0] = dd->ncg_home; + pos_cg = dd->ncg_home; + + nat_tot = dd->nat_home; + nzone = 1; + for (dim_ind = 0; dim_ind < dd->ndim; dim_ind++) + { + dim = dd->dim[dim_ind]; + cd = &comm->cd[dim_ind]; + + if (dim >= ddbox->npbcdim && dd->ci[dim] == 0) + { + /* No pbc in this dimension, the first node should not comm. */ + nzone_send = 0; + } + else + { + nzone_send = nzone; + } + + v_d = ddbox->v[dim]; + skew_fac2_d = sqr(ddbox->skew_fac[dim]); + + cd->bInPlace = TRUE; + for (p = 0; p < cd->np; p++) + { + /* Only atoms communicated in the first pulse are used + * for multi-body bonded interactions or for bBondComm. + */ + bDistBonded = ((bDistMB || bDist2B) && p == 0); + + ind = &cd->ind[p]; + nsend = 0; + nat = 0; + for (zone = 0; zone < nzone_send; zone++) + { + if (tric_dist[dim_ind] && dim_ind > 0) + { + /* Determine slightly more optimized skew_fac's + * for rounding. + * This reduces the number of communicated atoms + * by about 10% for 3D DD of rhombic dodecahedra. + */ + for (dimd = 0; dimd < dim; dimd++) + { + sf2_round[dimd] = 1; + if (ddbox->tric_dir[dimd]) + { + for (i = dd->dim[dimd]+1; i < DIM; i++) + { + /* If we are shifted in dimension i + * and the cell plane is tilted forward + * in dimension i, skip this coupling. + */ + if (!(zones->shift[nzone+zone][i] && + ddbox->v[dimd][i][dimd] >= 0)) + { + sf2_round[dimd] += + sqr(ddbox->v[dimd][i][dimd]); + } + } + sf2_round[dimd] = 1/sf2_round[dimd]; + } + } + } + + zonei = zone_perm[dim_ind][zone]; + if (p == 0) + { + /* Here we permutate the zones to obtain a convenient order + * for neighbor searching + */ + cg0 = zone_cg_range[zonei]; + cg1 = zone_cg_range[zonei+1]; + } + else + { + /* Look only at the cg's received in the previous grid pulse + */ + cg1 = zone_cg_range[nzone+zone+1]; + cg0 = cg1 - cd->ind[p-1].nrecv[zone]; + } + +#pragma omp parallel for num_threads(comm->nth) schedule(static) + for (th = 0; th < comm->nth; th++) + { + gmx_domdec_ind_t *ind_p; + int **ibuf_p, *ibuf_nalloc_p; + vec_rvec_t *vbuf_p; + int *nsend_p, *nat_p; + int *nsend_zone_p; + int cg0_th, cg1_th; + + if (th == 0) + { + /* Thread 0 writes in the comm buffers */ + ind_p = ind; + ibuf_p = &comm->buf_int; + ibuf_nalloc_p = &comm->nalloc_int; + vbuf_p = &comm->vbuf; + nsend_p = &nsend; + nat_p = &nat; + nsend_zone_p = &ind->nsend[zone]; + } + else + { + /* Other threads write into temp buffers */ + ind_p = &comm->dth[th].ind; + ibuf_p = &comm->dth[th].ibuf; + ibuf_nalloc_p = &comm->dth[th].ibuf_nalloc; + vbuf_p = &comm->dth[th].vbuf; + nsend_p = &comm->dth[th].nsend; + nat_p = &comm->dth[th].nat; + nsend_zone_p = &comm->dth[th].nsend_zone; + + comm->dth[th].nsend = 0; + comm->dth[th].nat = 0; + comm->dth[th].nsend_zone = 0; + } + + if (comm->nth == 1) + { + cg0_th = cg0; + cg1_th = cg1; + } + else + { + cg0_th = cg0 + ((cg1 - cg0)* th )/comm->nth; + cg1_th = cg0 + ((cg1 - cg0)*(th+1))/comm->nth; + } + + /* Get the cg's for this pulse in this zone */ + get_zone_pulse_cgs(dd, zonei, zone, cg0_th, cg1_th, + index_gl, cgindex, + dim, dim_ind, dim0, dim1, dim2, + r_comm2, r_bcomm2, + box, tric_dist, + normal, skew_fac2_d, skew_fac_01, + v_d, v_0, v_1, 
&corners, sf2_round, + bDistBonded, bBondComm, + bDist2B, bDistMB, + cg_cm, fr->cginfo, + ind_p, + ibuf_p, ibuf_nalloc_p, + vbuf_p, + nsend_p, nat_p, + nsend_zone_p); + } + + /* Append data of threads>=1 to the communication buffers */ + for (th = 1; th < comm->nth; th++) + { + dd_comm_setup_work_t *dth; + int i, ns1; + + dth = &comm->dth[th]; + + ns1 = nsend + dth->nsend_zone; + if (ns1 > ind->nalloc) + { + ind->nalloc = over_alloc_dd(ns1); + srenew(ind->index, ind->nalloc); + } + if (ns1 > comm->nalloc_int) + { + comm->nalloc_int = over_alloc_dd(ns1); + srenew(comm->buf_int, comm->nalloc_int); + } + if (ns1 > comm->vbuf.nalloc) + { + comm->vbuf.nalloc = over_alloc_dd(ns1); + srenew(comm->vbuf.v, comm->vbuf.nalloc); + } + + for (i = 0; i < dth->nsend_zone; i++) + { + ind->index[nsend] = dth->ind.index[i]; + comm->buf_int[nsend] = dth->ibuf[i]; + copy_rvec(dth->vbuf.v[i], + comm->vbuf.v[nsend]); + nsend++; + } + nat += dth->nat; + ind->nsend[zone] += dth->nsend_zone; + } + } + /* Clear the counts in case we do not have pbc */ + for (zone = nzone_send; zone < nzone; zone++) + { + ind->nsend[zone] = 0; + } + ind->nsend[nzone] = nsend; + ind->nsend[nzone+1] = nat; + /* Communicate the number of cg's and atoms to receive */ + dd_sendrecv_int(dd, dim_ind, dddirBackward, + ind->nsend, nzone+2, + ind->nrecv, nzone+2); + + /* The rvec buffer is also required for atom buffers of size nsend + * in dd_move_x and dd_move_f. + */ + vec_rvec_check_alloc(&comm->vbuf, ind->nsend[nzone+1]); + + if (p > 0) + { + /* We can receive in place if only the last zone is not empty */ + for (zone = 0; zone < nzone-1; zone++) + { + if (ind->nrecv[zone] > 0) + { + cd->bInPlace = FALSE; + } + } + if (!cd->bInPlace) + { + /* The int buffer is only required here for the cg indices */ + if (ind->nrecv[nzone] > comm->nalloc_int2) + { + comm->nalloc_int2 = over_alloc_dd(ind->nrecv[nzone]); + srenew(comm->buf_int2, comm->nalloc_int2); + } + /* The rvec buffer is also required for atom buffers + * of size nrecv in dd_move_x and dd_move_f. + */ + i = max(cd->ind[0].nrecv[nzone+1], ind->nrecv[nzone+1]); + vec_rvec_check_alloc(&comm->vbuf2, i); + } + } + + /* Make space for the global cg indices */ + if (pos_cg + ind->nrecv[nzone] > dd->cg_nalloc + || dd->cg_nalloc == 0) + { + dd->cg_nalloc = over_alloc_dd(pos_cg + ind->nrecv[nzone]); + srenew(index_gl, dd->cg_nalloc); + srenew(cgindex, dd->cg_nalloc+1); + } + /* Communicate the global cg indices */ + if (cd->bInPlace) + { + recv_i = index_gl + pos_cg; + } + else + { + recv_i = comm->buf_int2; + } + dd_sendrecv_int(dd, dim_ind, dddirBackward, + comm->buf_int, nsend, + recv_i, ind->nrecv[nzone]); + + /* Make space for cg_cm */ + dd_check_alloc_ncg(fr, state, f, pos_cg + ind->nrecv[nzone]); + if (fr->cutoff_scheme == ecutsGROUP) + { + cg_cm = fr->cg_cm; + } + else + { + cg_cm = state->x; + } + /* Communicate cg_cm */ + if (cd->bInPlace) + { + recv_vr = cg_cm + pos_cg; + } + else + { + recv_vr = comm->vbuf2.v; + } + dd_sendrecv_rvec(dd, dim_ind, dddirBackward, + comm->vbuf.v, nsend, + recv_vr, ind->nrecv[nzone]); + + /* Make the charge group index */ + if (cd->bInPlace) + { + zone = (p == 0 ? 0 : nzone - 1); + while (zone < nzone) + { + for (cg = 0; cg < ind->nrecv[zone]; cg++) + { + cg_gl = index_gl[pos_cg]; + fr->cginfo[pos_cg] = ddcginfo(cginfo_mb, cg_gl); + nrcg = GET_CGINFO_NATOMS(fr->cginfo[pos_cg]); + cgindex[pos_cg+1] = cgindex[pos_cg] + nrcg; + if (bBondComm) + { + /* Update the charge group presence, + * so we can use it in the next pass of the loop. 
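The OpenMP section above splits the charge-group range [cg0,cg1) over comm->nth threads with pure integer arithmetic. A standalone sketch showing that the resulting ranges are contiguous, non-overlapping, and differ in size by at most one:

#include <stdio.h>

/* Sketch of the thread-range split: thread th of nth gets the
 * half-open range [cg0_th, cg1_th). */
static void thread_range(int cg0, int cg1, int th, int nth,
                         int *cg0_th, int *cg1_th)
{
    *cg0_th = cg0 + ((cg1 - cg0)*th    )/nth;
    *cg1_th = cg0 + ((cg1 - cg0)*(th+1))/nth;
}

int main(void)
{
    int th, b0, b1;

    /* 10 charge groups over 3 threads: sizes 3, 3, 4; no gaps, no overlap */
    for (th = 0; th < 3; th++)
    {
        thread_range(0, 10, th, 3, &b0, &b1);
        printf("thread %d: [%d,%d)\n", th, b0, b1);
    }
    return 0;
}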
+ */ + comm->bLocalCG[cg_gl] = TRUE; + } + pos_cg++; + } + if (p == 0) + { + comm->zone_ncg1[nzone+zone] = ind->nrecv[zone]; + } + zone++; + zone_cg_range[nzone+zone] = pos_cg; + } + } + else + { + /* This part of the code is never executed with bBondComm. */ + merge_cg_buffers(nzone, cd, p, zone_cg_range, + index_gl, recv_i, cg_cm, recv_vr, + cgindex, fr->cginfo_mb, fr->cginfo); + pos_cg += ind->nrecv[nzone]; + } + nat_tot += ind->nrecv[nzone+1]; + } + if (!cd->bInPlace) + { + /* Store the atom block for easy copying of communication buffers */ + make_cell2at_index(cd, nzone, zone_cg_range[nzone], cgindex); + } + nzone += nzone; + } + dd->index_gl = index_gl; + dd->cgindex = cgindex; + + dd->ncg_tot = zone_cg_range[zones->n]; + dd->nat_tot = nat_tot; + comm->nat[ddnatHOME] = dd->nat_home; + for (i = ddnatZONE; i < ddnatNR; i++) + { + comm->nat[i] = dd->nat_tot; + } + + if (!bBondComm) + { + /* We don't need to update cginfo, since that was alrady done above. + * So we pass NULL for the forcerec. + */ + dd_set_cginfo(dd->index_gl, dd->ncg_home, dd->ncg_tot, + NULL, comm->bLocalCG); + } + + if (debug) + { + fprintf(debug, "Finished setting up DD communication, zones:"); + for (c = 0; c < zones->n; c++) + { + fprintf(debug, " %d", zones->cg_range[c+1]-zones->cg_range[c]); + } + fprintf(debug, "\n"); + } +} + +static void set_cg_boundaries(gmx_domdec_zones_t *zones) +{ + int c; + + for (c = 0; c < zones->nizone; c++) + { + zones->izone[c].cg1 = zones->cg_range[c+1]; + zones->izone[c].jcg0 = zones->cg_range[zones->izone[c].j0]; + zones->izone[c].jcg1 = zones->cg_range[zones->izone[c].j1]; + } +} + +static void set_zones_size(gmx_domdec_t *dd, + matrix box, const gmx_ddbox_t *ddbox, + int zone_start, int zone_end) +{ + gmx_domdec_comm_t *comm; + gmx_domdec_zones_t *zones; + gmx_bool bDistMB; + int z, zi, zj0, zj1, d, dim; + real rcs, rcmbs; + int i, j; + real size_j, add_tric; + real vol; + + comm = dd->comm; + + zones = &comm->zones; + + /* Do we need to determine extra distances for multi-body bondeds? */ + bDistMB = (comm->bInterCGMultiBody && dd->bGridJump && dd->ndim > 1); + + for (z = zone_start; z < zone_end; z++) + { + /* Copy cell limits to zone limits. + * Valid for non-DD dims and non-shifted dims. + */ + copy_rvec(comm->cell_x0, zones->size[z].x0); + copy_rvec(comm->cell_x1, zones->size[z].x1); + } + + for (d = 0; d < dd->ndim; d++) + { + dim = dd->dim[d]; + + for (z = 0; z < zones->n; z++) + { + /* With a staggered grid we have different sizes + * for non-shifted dimensions. 
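The receive loop above extends cgindex[] so that cgindex[i+1] = cgindex[i] + natoms(i), i.e. a prefix sum over group sizes, making the atoms of group i the half-open range [cgindex[i], cgindex[i+1]). A tiny self-contained illustration with made-up group sizes:

#include <stdio.h>

int main(void)
{
    int natoms[4]  = {3, 1, 4, 2}; /* illustrative charge-group sizes */
    int cgindex[5] = {0};
    int i;

    for (i = 0; i < 4; i++)
    {
        cgindex[i+1] = cgindex[i] + natoms[i]; /* running atom offset */
    }
    for (i = 0; i < 4; i++)
    {
        printf("group %d: atoms [%d,%d)\n", i, cgindex[i], cgindex[i+1]);
    }
    /* the closing entry is the total atom count, here cgindex[4] == 10 */
    return 0;
}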
+ */
+            if (dd->bGridJump && zones->shift[z][dim] == 0)
+            {
+                if (d == 1)
+                {
+                    zones->size[z].x0[dim] = comm->zone_d1[zones->shift[z][dd->dim[d-1]]].min0;
+                    zones->size[z].x1[dim] = comm->zone_d1[zones->shift[z][dd->dim[d-1]]].max1;
+                }
+                else if (d == 2)
+                {
+                    zones->size[z].x0[dim] = comm->zone_d2[zones->shift[z][dd->dim[d-2]]][zones->shift[z][dd->dim[d-1]]].min0;
+                    zones->size[z].x1[dim] = comm->zone_d2[zones->shift[z][dd->dim[d-2]]][zones->shift[z][dd->dim[d-1]]].max1;
+                }
+            }
+        }
+
+        rcs   = comm->cutoff;
+        rcmbs = comm->cutoff_mbody;
+        if (ddbox->tric_dir[dim])
+        {
+            rcs   /= ddbox->skew_fac[dim];
+            rcmbs /= ddbox->skew_fac[dim];
+        }
+
+        /* Set the lower limit for the shifted zone dimensions */
+        for (z = zone_start; z < zone_end; z++)
+        {
+            if (zones->shift[z][dim] > 0)
+            {
+                dim = dd->dim[d];
+                if (!dd->bGridJump || d == 0)
+                {
+                    zones->size[z].x0[dim] = comm->cell_x1[dim];
+                    zones->size[z].x1[dim] = comm->cell_x1[dim] + rcs;
+                }
+                else
+                {
+                    /* Here we take the lower limit of the zone from
+                     * the lowest domain of the zone below.
+                     */
+                    if (z < 4)
+                    {
+                        zones->size[z].x0[dim] =
+                            comm->zone_d1[zones->shift[z][dd->dim[d-1]]].min1;
+                    }
+                    else
+                    {
+                        if (d == 1)
+                        {
+                            zones->size[z].x0[dim] =
+                                zones->size[zone_perm[2][z-4]].x0[dim];
+                        }
+                        else
+                        {
+                            zones->size[z].x0[dim] =
+                                comm->zone_d2[zones->shift[z][dd->dim[d-2]]][zones->shift[z][dd->dim[d-1]]].min1;
+                        }
+                    }
+                    /* A temporary limit, updated below */
+                    zones->size[z].x1[dim] = zones->size[z].x0[dim];
+
+                    if (bDistMB)
+                    {
+                        for (zi = 0; zi < zones->nizone; zi++)
+                        {
+                            if (zones->shift[zi][dim] == 0)
+                            {
+                                /* This takes the whole zone into account.
+                                 * With multiple pulses this will lead
+                                 * to a larger zone than strictly necessary.
+                                 */
+                                zones->size[z].x1[dim] = max(zones->size[z].x1[dim],
+                                                             zones->size[zi].x1[dim]+rcmbs);
+                            }
+                        }
+                    }
+                }
+            }
+        }
+
+        /* Loop over the i-zones to set the upper limit of each
+         * j-zone they see.
+         */
+        for (zi = 0; zi < zones->nizone; zi++)
+        {
+            if (zones->shift[zi][dim] == 0)
+            {
+                for (z = zones->izone[zi].j0; z < zones->izone[zi].j1; z++)
+                {
+                    if (zones->shift[z][dim] > 0)
+                    {
+                        zones->size[z].x1[dim] = max(zones->size[z].x1[dim],
+                                                     zones->size[zi].x1[dim]+rcs);
+                    }
+                }
+            }
+        }
+    }
+
+    for (z = zone_start; z < zone_end; z++)
+    {
+        /* Initialization only required to keep the compiler happy */
+        rvec corner_min = {0, 0, 0}, corner_max = {0, 0, 0}, corner;
+        int  nc, c;
+
+        /* To determine the bounding box for a zone we need to find
+         * the extremes of 4, 2 or 1 corners.
+         */
+        nc = 1 << (ddbox->npbcdim - 1);
+
+        for (c = 0; c < nc; c++)
+        {
+            /* Set up a zone corner at x=0, ignoring triclinic couplings */
+            corner[XX] = 0;
+            if ((c & 1) == 0)
+            {
+                corner[YY] = zones->size[z].x0[YY];
+            }
+            else
+            {
+                corner[YY] = zones->size[z].x1[YY];
+            }
+            if ((c & 2) == 0)
+            {
+                corner[ZZ] = zones->size[z].x0[ZZ];
+            }
+            else
+            {
+                corner[ZZ] = zones->size[z].x1[ZZ];
+            }
+            if (dd->ndim == 1 && box[ZZ][YY] != 0)
+            {
+                /* With 1D domain decomposition the cg's are not in
+                 * the triclinic box, but triclinic x-y and rectangular y-z.
+                 * Shift y back, so it will later end up at 0.
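The bounding-box loop above enumerates the 1 << (npbcdim - 1) zone corners with bit tests: c & 1 picks the low or high y limit and c & 2 the z limit. A standalone sketch with illustrative limits:

#include <stdio.h>

int main(void)
{
    double y[2] = {0.0, 1.5}; /* zone y limits, x0[YY] and x1[YY] */
    double z[2] = {0.2, 1.0}; /* zone z limits, x0[ZZ] and x1[ZZ] */
    int    npbcdim = 3;
    int    nc      = 1 << (npbcdim - 1); /* 4 corners with full pbc */
    int    c;

    for (c = 0; c < nc; c++)
    {
        /* bit 0 selects the y limit, bit 1 the z limit */
        printf("corner %d: y=%.1f z=%.1f\n",
               c, y[(c & 1) != 0], z[(c & 2) != 0]);
    }
    return 0;
}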
+ */ + corner[YY] -= corner[ZZ]*box[ZZ][YY]/box[ZZ][ZZ]; + } + /* Apply the triclinic couplings */ + for (i = YY; i < ddbox->npbcdim; i++) + { + for (j = XX; j < i; j++) + { + corner[j] += corner[i]*box[i][j]/box[i][i]; + } + } + if (c == 0) + { + copy_rvec(corner, corner_min); + copy_rvec(corner, corner_max); + } + else + { + for (i = 0; i < DIM; i++) + { + corner_min[i] = min(corner_min[i], corner[i]); + corner_max[i] = max(corner_max[i], corner[i]); + } + } + } + /* Copy the extreme cornes without offset along x */ + for (i = 0; i < DIM; i++) + { + zones->size[z].bb_x0[i] = corner_min[i]; + zones->size[z].bb_x1[i] = corner_max[i]; + } + /* Add the offset along x */ + zones->size[z].bb_x0[XX] += zones->size[z].x0[XX]; + zones->size[z].bb_x1[XX] += zones->size[z].x1[XX]; + } + + if (zone_start == 0) + { + vol = 1; + for (dim = 0; dim < DIM; dim++) + { + vol *= zones->size[0].x1[dim] - zones->size[0].x0[dim]; + } + zones->dens_zone0 = (zones->cg_range[1] - zones->cg_range[0])/vol; + } + + if (debug) + { + for (z = zone_start; z < zone_end; z++) + { + fprintf(debug, "zone %d %6.3f - %6.3f %6.3f - %6.3f %6.3f - %6.3f\n", + z, + zones->size[z].x0[XX], zones->size[z].x1[XX], + zones->size[z].x0[YY], zones->size[z].x1[YY], + zones->size[z].x0[ZZ], zones->size[z].x1[ZZ]); + fprintf(debug, "zone %d bb %6.3f - %6.3f %6.3f - %6.3f %6.3f - %6.3f\n", + z, + zones->size[z].bb_x0[XX], zones->size[z].bb_x1[XX], + zones->size[z].bb_x0[YY], zones->size[z].bb_x1[YY], + zones->size[z].bb_x0[ZZ], zones->size[z].bb_x1[ZZ]); + } + } +} + +static int comp_cgsort(const void *a, const void *b) +{ + int comp; + + gmx_cgsort_t *cga, *cgb; + cga = (gmx_cgsort_t *)a; + cgb = (gmx_cgsort_t *)b; + + comp = cga->nsc - cgb->nsc; + if (comp == 0) + { + comp = cga->ind_gl - cgb->ind_gl; + } + + return comp; +} + +static void order_int_cg(int n, const gmx_cgsort_t *sort, + int *a, int *buf) +{ + int i; + + /* Order the data */ + for (i = 0; i < n; i++) + { + buf[i] = a[sort[i].ind]; + } + + /* Copy back to the original array */ + for (i = 0; i < n; i++) + { + a[i] = buf[i]; + } +} + +static void order_vec_cg(int n, const gmx_cgsort_t *sort, + rvec *v, rvec *buf) +{ + int i; + + /* Order the data */ + for (i = 0; i < n; i++) + { + copy_rvec(v[sort[i].ind], buf[i]); + } + + /* Copy back to the original array */ + for (i = 0; i < n; i++) + { + copy_rvec(buf[i], v[i]); + } +} + +static void order_vec_atom(int ncg, const int *cgindex, const gmx_cgsort_t *sort, + rvec *v, rvec *buf) +{ + int a, atot, cg, cg0, cg1, i; + + if (cgindex == NULL) + { + /* Avoid the useless loop of the atoms within a cg */ + order_vec_cg(ncg, sort, v, buf); + + return; + } + + /* Order the data */ + a = 0; + for (cg = 0; cg < ncg; cg++) + { + cg0 = cgindex[sort[cg].ind]; + cg1 = cgindex[sort[cg].ind+1]; + for (i = cg0; i < cg1; i++) + { + copy_rvec(v[i], buf[a]); + a++; + } + } + atot = a; + + /* Copy back to the original array */ + for (a = 0; a < atot; a++) + { + copy_rvec(buf[a], v[a]); + } +} + +static void ordered_sort(int nsort2, gmx_cgsort_t *sort2, + int nsort_new, gmx_cgsort_t *sort_new, + gmx_cgsort_t *sort1) +{ + int i1, i2, i_new; + + /* The new indices are not very ordered, so we qsort them */ + qsort_threadsafe(sort_new, nsort_new, sizeof(sort_new[0]), comp_cgsort); + + /* sort2 is already ordered, so now we can merge the two arrays */ + i1 = 0; + i2 = 0; + i_new = 0; + while (i2 < nsort2 || i_new < nsort_new) + { + if (i2 == nsort2) + { + sort1[i1++] = sort_new[i_new++]; + } + else if (i_new == nsort_new) + { + sort1[i1++] = sort2[i2++]; 
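comp_cgsort() above orders on the ns grid cell and breaks ties on the global index, which makes the ordering total and deterministic, a property the exact restarts mentioned further down rely on. A self-contained sketch with a stand-in struct (cg_key_t is not the GROMACS type):

#include <stdio.h>
#include <stdlib.h>

/* Sketch of the two-key comparator: grid cell first, global index as
 * tie-break. */
typedef struct { int nsc, ind_gl; } cg_key_t;

static int comp_key(const void *a, const void *b)
{
    const cg_key_t *ka = (const cg_key_t *)a;
    const cg_key_t *kb = (const cg_key_t *)b;
    int             comp = ka->nsc - kb->nsc;

    return (comp != 0) ? comp : ka->ind_gl - kb->ind_gl;
}

int main(void)
{
    cg_key_t k[3] = {{2, 7}, {1, 9}, {2, 3}};
    int      i;

    qsort(k, 3, sizeof(k[0]), comp_key);
    for (i = 0; i < 3; i++)
    {
        printf("(%d,%d)\n", k[i].nsc, k[i].ind_gl); /* (1,9) (2,3) (2,7) */
    }
    return 0;
}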
+ } + else if (sort2[i2].nsc < sort_new[i_new].nsc || + (sort2[i2].nsc == sort_new[i_new].nsc && + sort2[i2].ind_gl < sort_new[i_new].ind_gl)) + { + sort1[i1++] = sort2[i2++]; + } + else + { + sort1[i1++] = sort_new[i_new++]; + } + } +} + +static int dd_sort_order(gmx_domdec_t *dd, t_forcerec *fr, int ncg_home_old) +{ + gmx_domdec_sort_t *sort; + gmx_cgsort_t *cgsort, *sort_i; + int ncg_new, nsort2, nsort_new, i, *a, moved, *ibuf; + int sort_last, sort_skip; + + sort = dd->comm->sort; + + a = fr->ns.grid->cell_index; + + moved = NSGRID_SIGNAL_MOVED_FAC*fr->ns.grid->ncells; + + if (ncg_home_old >= 0) + { + /* The charge groups that remained in the same ns grid cell + * are completely ordered. So we can sort efficiently by sorting + * the charge groups that did move into the stationary list. + */ + ncg_new = 0; + nsort2 = 0; + nsort_new = 0; + for (i = 0; i < dd->ncg_home; i++) + { + /* Check if this cg did not move to another node */ + if (a[i] < moved) + { + if (i >= ncg_home_old || a[i] != sort->sort[i].nsc) + { + /* This cg is new on this node or moved ns grid cell */ + if (nsort_new >= sort->sort_new_nalloc) + { + sort->sort_new_nalloc = over_alloc_dd(nsort_new+1); + srenew(sort->sort_new, sort->sort_new_nalloc); + } + sort_i = &(sort->sort_new[nsort_new++]); + } + else + { + /* This cg did not move */ + sort_i = &(sort->sort2[nsort2++]); + } + /* Sort on the ns grid cell indices + * and the global topology index. + * index_gl is irrelevant with cell ns, + * but we set it here anyhow to avoid a conditional. + */ + sort_i->nsc = a[i]; + sort_i->ind_gl = dd->index_gl[i]; + sort_i->ind = i; + ncg_new++; + } + } + if (debug) + { + fprintf(debug, "ordered sort cgs: stationary %d moved %d\n", + nsort2, nsort_new); + } + /* Sort efficiently */ + ordered_sort(nsort2, sort->sort2, nsort_new, sort->sort_new, + sort->sort); + } + else + { + cgsort = sort->sort; + ncg_new = 0; + for (i = 0; i < dd->ncg_home; i++) + { + /* Sort on the ns grid cell indices + * and the global topology index + */ + cgsort[i].nsc = a[i]; + cgsort[i].ind_gl = dd->index_gl[i]; + cgsort[i].ind = i; + if (cgsort[i].nsc < moved) + { + ncg_new++; + } + } + if (debug) + { + fprintf(debug, "qsort cgs: %d new home %d\n", dd->ncg_home, ncg_new); + } + /* Determine the order of the charge groups using qsort */ + qsort_threadsafe(cgsort, dd->ncg_home, sizeof(cgsort[0]), comp_cgsort); + } + + return ncg_new; +} + +static int dd_sort_order_nbnxn(gmx_domdec_t *dd, t_forcerec *fr) +{ + gmx_cgsort_t *sort; + int ncg_new, i, *a, na; + + sort = dd->comm->sort->sort; + + nbnxn_get_atomorder(fr->nbv->nbs, &a, &na); + + ncg_new = 0; + for (i = 0; i < na; i++) + { + if (a[i] >= 0) + { + sort[ncg_new].ind = a[i]; + ncg_new++; + } + } + + return ncg_new; +} + +static void dd_sort_state(gmx_domdec_t *dd, int ePBC, + rvec *cgcm, t_forcerec *fr, t_state *state, + int ncg_home_old) +{ + gmx_domdec_sort_t *sort; + gmx_cgsort_t *cgsort, *sort_i; + int *cgindex; + int ncg_new, i, *ibuf, cgsize; + rvec *vbuf; + + sort = dd->comm->sort; + + if (dd->ncg_home > sort->sort_nalloc) + { + sort->sort_nalloc = over_alloc_dd(dd->ncg_home); + srenew(sort->sort, sort->sort_nalloc); + srenew(sort->sort2, sort->sort_nalloc); + } + cgsort = sort->sort; + + switch (fr->cutoff_scheme) + { + case ecutsGROUP: + ncg_new = dd_sort_order(dd, fr, ncg_home_old); + break; + case ecutsVERLET: + ncg_new = dd_sort_order_nbnxn(dd, fr); + break; + default: + gmx_incons("unimplemented"); + ncg_new = 0; + } + + /* We alloc with the old size, since cgindex is still old */ + 
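dd_sort_order() above exploits that charge groups which kept their grid cell are already in sorted order: only the (usually few) moved ones need a qsort, after which a single linear merge restores a fully sorted array. A standalone sketch of that merge over plain int keys:

#include <stdio.h>

/* Sketch: merge an already-sorted "stationary" list with a freshly
 * qsorted "moved" list in one linear pass. */
static void merge_sorted(const int *a, int na, const int *b, int nb, int *out)
{
    int ia = 0, ib = 0, io = 0;

    while (ia < na || ib < nb)
    {
        if (ib == nb || (ia < na && a[ia] <= b[ib]))
        {
            out[io++] = a[ia++]; /* take from the stationary list */
        }
        else
        {
            out[io++] = b[ib++]; /* take from the newly sorted list */
        }
    }
}

int main(void)
{
    int stationary[4] = {1, 4, 6, 9};
    int moved[3]      = {2, 5, 8};   /* already qsorted */
    int merged[7];
    int i;

    merge_sorted(stationary, 4, moved, 3, merged);
    for (i = 0; i < 7; i++)
    {
        printf("%d ", merged[i]);    /* 1 2 4 5 6 8 9 */
    }
    printf("\n");
    return 0;
}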
vec_rvec_check_alloc(&dd->comm->vbuf, dd->cgindex[dd->ncg_home]);
+    vbuf = dd->comm->vbuf.v;
+
+    if (dd->comm->bCGs)
+    {
+        cgindex = dd->cgindex;
+    }
+    else
+    {
+        cgindex = NULL;
+    }
+
+    /* Remove the charge groups which are no longer at home here */
+    dd->ncg_home = ncg_new;
+    if (debug)
+    {
+        fprintf(debug, "Set the new home charge group count to %d\n",
+                dd->ncg_home);
+    }
+
+    /* Reorder the state */
+    for (i = 0; i < estNR; i++)
+    {
+        if (EST_DISTR(i) && (state->flags & (1<<i)))
+        {
+            switch (i)
+            {
+                case estX:
+                    order_vec_atom(dd->ncg_home, cgindex, cgsort, state->x, vbuf);
+                    break;
+                case estV:
+                    order_vec_atom(dd->ncg_home, cgindex, cgsort, state->v, vbuf);
+                    break;
+                case estSDX:
+                    order_vec_atom(dd->ncg_home, cgindex, cgsort, state->sd_X, vbuf);
+                    break;
+                case estCGP:
+                    order_vec_atom(dd->ncg_home, cgindex, cgsort, state->cg_p, vbuf);
+                    break;
+                case estLD_RNG:
+                case estLD_RNGI:
+                case estDISRE_INITF:
+                case estDISRE_RM3TAV:
+                case estORIRE_INITF:
+                case estORIRE_DTAV:
+                    /* No ordering required */
+                    break;
+                default:
+                    gmx_incons("Unknown state entry encountered in dd_sort_state");
+                    break;
+            }
+        }
+    }
+    if (fr->cutoff_scheme == ecutsGROUP)
+    {
+        /* Reorder cgcm */
+        order_vec_cg(dd->ncg_home, cgsort, cgcm, vbuf);
+    }
+
+    if (dd->ncg_home+1 > sort->ibuf_nalloc)
+    {
+        sort->ibuf_nalloc = over_alloc_dd(dd->ncg_home+1);
+        srenew(sort->ibuf, sort->ibuf_nalloc);
+    }
+    ibuf = sort->ibuf;
+    /* Reorder the global cg index */
+    order_int_cg(dd->ncg_home, cgsort, dd->index_gl, ibuf);
+    /* Reorder the cginfo */
+    order_int_cg(dd->ncg_home, cgsort, fr->cginfo, ibuf);
+    /* Rebuild the local cg index */
+    if (dd->comm->bCGs)
+    {
+        ibuf[0] = 0;
+        for (i = 0; i < dd->ncg_home; i++)
+        {
+            cgsize    = dd->cgindex[cgsort[i].ind+1] - dd->cgindex[cgsort[i].ind];
+            ibuf[i+1] = ibuf[i] + cgsize;
+        }
+        for (i = 0; i < dd->ncg_home+1; i++)
+        {
+            dd->cgindex[i] = ibuf[i];
+        }
+    }
+    else
+    {
+        for (i = 0; i < dd->ncg_home+1; i++)
+        {
+            dd->cgindex[i] = i;
+        }
+    }
+    /* Set the home atom number */
+    dd->nat_home = dd->cgindex[dd->ncg_home];
+
+    if (fr->cutoff_scheme == ecutsVERLET)
+    {
+        /* The atoms are now exactly in grid order, update the grid order */
+        nbnxn_set_atomorder(fr->nbv->nbs);
+    }
+    else
+    {
+        /* Copy the sorted ns cell indices back to the ns grid struct */
+        for (i = 0; i < dd->ncg_home; i++)
+        {
+            fr->ns.grid->cell_index[i] = cgsort[i].nsc;
+        }
+        fr->ns.grid->nr = dd->ncg_home;
+    }
+}
+
+static void add_dd_statistics(gmx_domdec_t *dd)
+{
+    gmx_domdec_comm_t *comm;
+    int                ddnat;
+
+    comm = dd->comm;
+
+    for (ddnat = ddnatZONE; ddnat < ddnatNR; ddnat++)
+    {
+        comm->sum_nat[ddnat-ddnatZONE] +=
+            comm->nat[ddnat] - comm->nat[ddnat-1];
+    }
+    comm->ndecomp++;
+}
+
+void reset_dd_statistics_counters(gmx_domdec_t *dd)
+{
+    gmx_domdec_comm_t *comm;
+    int                ddnat;
+
+    comm = dd->comm;
+
+    /* Reset all the statistics and counters for total run counting */
+    for (ddnat = ddnatZONE; ddnat < ddnatNR; ddnat++)
+    {
+        comm->sum_nat[ddnat-ddnatZONE] = 0;
+    }
+    comm->ndecomp   = 0;
+    comm->nload     = 0;
+    comm->load_step = 0;
+    comm->load_sum  = 0;
+    comm->load_max  = 0;
+    clear_ivec(comm->load_lim);
+    comm->load_mdf = 0;
+    comm->load_pme = 0;
+}
+
+void print_dd_statistics(t_commrec *cr, t_inputrec *ir, FILE *fplog)
+{
+    gmx_domdec_comm_t *comm;
+    int                ddnat;
+    double             av;
+
+    comm = cr->dd->comm;
+
+    gmx_sumd(ddnatNR-ddnatZONE, comm->sum_nat, cr);
+
+    if (fplog == NULL)
+    {
+        return;
+    }
+
+    fprintf(fplog, "\n    D O M A I N   D E C O M P O S I T I O N   S T A T I S T I C S\n\n");
+
+    for (ddnat = ddnatZONE; ddnat < ddnatNR; ddnat++)
+    {
+        av = comm->sum_nat[ddnat-ddnatZONE]/comm->ndecomp;
+        switch (ddnat)
+        {
+            case ddnatZONE:
+                fprintf(fplog,
+                        " av. #atoms communicated per step for force:  %d x %.1f\n",
+                        2, av);
+                break;
+            case ddnatVSITE:
+                if (cr->dd->vsite_comm)
+                {
+                    fprintf(fplog,
+                            " av. #atoms communicated per step for vsites: %d x %.1f\n",
+                            (EEL_PME(ir->coulombtype) || ir->coulombtype == eelEWALD) ? 3 : 2,
+                            av);
+                }
+                break;
+            case ddnatCON:
+                if (cr->dd->constraint_comm)
+                {
+                    fprintf(fplog,
+                            " av. #atoms communicated per step for LINCS:  %d x %.1f\n",
+                            1 + ir->nLincsIter, av);
+                }
+                break;
+            default:
+                gmx_incons(" Unknown type for DD statistics");
+        }
+    }
+    fprintf(fplog, "\n");
+
+    if (comm->bRecordLoad && EI_DYNAMICS(ir->eI))
+    {
+        print_dd_load_av(fplog, cr->dd);
+    }
+}
+
+void dd_partition_system(FILE            *fplog,
+                         gmx_large_int_t  step,
+                         t_commrec       *cr,
+                         gmx_bool         bMasterState,
+                         int              nstglobalcomm,
+                         t_state         *state_global,
+                         gmx_mtop_t      *top_global,
+                         t_inputrec      *ir,
+                         t_state         *state_local,
+                         rvec           **f,
+                         t_mdatoms       *mdatoms,
+                         gmx_localtop_t  *top_local,
+                         t_forcerec      *fr,
+                         gmx_vsite_t     *vsite,
+                         gmx_shellfc_t    shellfc,
+                         gmx_constr_t     constr,
+                         t_nrnb          *nrnb,
+                         gmx_wallcycle_t  wcycle,
+                         gmx_bool         bVerbose)
+{
+    gmx_domdec_t      *dd;
+    gmx_domdec_comm_t *comm;
+    gmx_ddbox_t        ddbox = {0};
+    t_block           *cgs_gl;
+    gmx_large_int_t    step_pcoupl;
+    rvec               cell_ns_x0, cell_ns_x1;
-    int                i, j, n, cg0 = 0, ncg_home_old = -1, ncg_moved, nat_f_novirsum;
++   int                i, j, n, ncgindex_set, ncg_home_old = -1, ncg_moved, nat_f_novirsum;
+    gmx_bool           bBoxChanged, bNStGlobalComm, bDoDLB, bCheckDLB, bTurnOnDLB, bLogLoad;
+    gmx_bool           bRedist, bSortCG, bResortAll;
+    ivec               ncells_old = {0, 0, 0}, ncells_new = {0, 0, 0}, np;
+    real               grid_density;
+    char               sbuf[22];
+
+    dd   = cr->dd;
+    comm = dd->comm;
+
+    bBoxChanged = (bMasterState || DEFORM(*ir));
+    if (ir->epc != epcNO)
+    {
+        /* With nstpcouple > 1 pressure coupling happens
+         * one step after calculating the pressure.
+         * Box scaling happens at the end of the MD step,
+         * after the DD partitioning.
+         * We therefore have to do DLB in the first partitioning
+         * after an MD step where P-coupling occurred.
+         * We need to determine the last step in which p-coupling occurred.
+         * MRS -- need to validate this for vv?
+         */
+        n = ir->nstpcouple;
+        if (n == 1)
+        {
+            step_pcoupl = step - 1;
+        }
+        else
+        {
+            step_pcoupl = ((step - 1)/n)*n + 1;
+        }
+        if (step_pcoupl >= comm->partition_step)
+        {
+            bBoxChanged = TRUE;
+        }
+    }
+
+    bNStGlobalComm = (step % nstglobalcomm == 0);
+
+    if (!comm->bDynLoadBal)
+    {
+        bDoDLB = FALSE;
+    }
+    else
+    {
+        /* Should we do dynamic load balancing this step?
+         * Since it requires (possibly expensive) global communication,
+         * we might want to do DLB less frequently.
+         */
+        if (bBoxChanged || ir->epc != epcNO)
+        {
+            bDoDLB = bBoxChanged;
+        }
+        else
+        {
+            bDoDLB = bNStGlobalComm;
+        }
+    }
+
+    /* Check if we have recorded loads on the nodes */
+    if (comm->bRecordLoad && dd_load_count(comm))
+    {
+        if (comm->eDLB == edlbAUTO && !comm->bDynLoadBal)
+        {
+            /* Check if we should use DLB at the second partitioning
+             * and every 100 partitionings,
+             * so the extra communication cost is negligible.
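For nstpcouple n > 1, the expression ((step - 1)/n)*n + 1 above rounds down to the most recent step at which pressure coupling was applied (steps 1, n+1, 2n+1, ... in this convention). A small sketch verifying that behaviour (long long standing in for gmx_large_int_t):

#include <stdio.h>

/* Sketch: last step at which P-coupling occurred, given the interval n. */
static long long last_pcoupl_step(long long step, int n)
{
    return (n == 1) ? step - 1 : ((step - 1)/n)*n + 1;
}

int main(void)
{
    printf("%lld\n", last_pcoupl_step(12, 5)); /* 11: coupling at 1, 6, 11 */
    printf("%lld\n", last_pcoupl_step(11, 5)); /* 11 */
    printf("%lld\n", last_pcoupl_step(10, 5)); /* 6 */
    return 0;
}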
+ */ + n = max(100, nstglobalcomm); + bCheckDLB = (comm->n_load_collect == 0 || + comm->n_load_have % n == n-1); + } + else + { + bCheckDLB = FALSE; + } + + /* Print load every nstlog, first and last step to the log file */ + bLogLoad = ((ir->nstlog > 0 && step % ir->nstlog == 0) || + comm->n_load_collect == 0 || + (ir->nsteps >= 0 && + (step + ir->nstlist > ir->init_step + ir->nsteps))); + + /* Avoid extra communication due to verbose screen output + * when nstglobalcomm is set. + */ + if (bDoDLB || bLogLoad || bCheckDLB || + (bVerbose && (ir->nstlist == 0 || nstglobalcomm <= ir->nstlist))) + { + get_load_distribution(dd, wcycle); + if (DDMASTER(dd)) + { + if (bLogLoad) + { + dd_print_load(fplog, dd, step-1); + } + if (bVerbose) + { + dd_print_load_verbose(dd); + } + } + comm->n_load_collect++; + + if (bCheckDLB) + { + /* Since the timings are node dependent, the master decides */ + if (DDMASTER(dd)) + { + bTurnOnDLB = + (dd_force_imb_perf_loss(dd) >= DD_PERF_LOSS); + if (debug) + { + fprintf(debug, "step %s, imb loss %f\n", + gmx_step_str(step, sbuf), + dd_force_imb_perf_loss(dd)); + } + } + dd_bcast(dd, sizeof(bTurnOnDLB), &bTurnOnDLB); + if (bTurnOnDLB) + { + turn_on_dlb(fplog, cr, step); + bDoDLB = TRUE; + } + } + } + comm->n_load_have++; + } + + cgs_gl = &comm->cgs_gl; + + bRedist = FALSE; + if (bMasterState) + { + /* Clear the old state */ + clear_dd_indices(dd, 0, 0); ++ ncgindex_set = 0; + + set_ddbox(dd, bMasterState, cr, ir, state_global->box, + TRUE, cgs_gl, state_global->x, &ddbox); + + get_cg_distribution(fplog, step, dd, cgs_gl, + state_global->box, &ddbox, state_global->x); + + dd_distribute_state(dd, cgs_gl, + state_global, state_local, f); + + dd_make_local_cgs(dd, &top_local->cgs); + + /* Ensure that we have space for the new distribution */ + dd_check_alloc_ncg(fr, state_local, f, dd->ncg_home); + + if (fr->cutoff_scheme == ecutsGROUP) + { + calc_cgcm(fplog, 0, dd->ncg_home, + &top_local->cgs, state_local->x, fr->cg_cm); + } + + inc_nrnb(nrnb, eNR_CGCM, dd->nat_home); + + dd_set_cginfo(dd->index_gl, 0, dd->ncg_home, fr, comm->bLocalCG); - - cg0 = 0; + } + else if (state_local->ddp_count != dd->ddp_count) + { + if (state_local->ddp_count > dd->ddp_count) + { + gmx_fatal(FARGS, "Internal inconsistency state_local->ddp_count (%d) > dd->ddp_count (%d)", state_local->ddp_count, dd->ddp_count); + } + + if (state_local->ddp_count_cg_gl != state_local->ddp_count) + { + gmx_fatal(FARGS, "Internal inconsistency state_local->ddp_count_cg_gl (%d) != state_local->ddp_count (%d)", state_local->ddp_count_cg_gl, state_local->ddp_count); + } + + /* Clear the old state */ + clear_dd_indices(dd, 0, 0); + + /* Build the new indices */ + rebuild_cgindex(dd, cgs_gl->index, state_local); + make_dd_indices(dd, cgs_gl->index, 0); ++ ncgindex_set = dd->ncg_home; + + if (fr->cutoff_scheme == ecutsGROUP) + { + /* Redetermine the cg COMs */ + calc_cgcm(fplog, 0, dd->ncg_home, + &top_local->cgs, state_local->x, fr->cg_cm); + } + + inc_nrnb(nrnb, eNR_CGCM, dd->nat_home); + + dd_set_cginfo(dd->index_gl, 0, dd->ncg_home, fr, comm->bLocalCG); + + set_ddbox(dd, bMasterState, cr, ir, state_local->box, + TRUE, &top_local->cgs, state_local->x, &ddbox); + + bRedist = comm->bDynLoadBal; + } + else + { + /* We have the full state, only redistribute the cgs */ + + /* Clear the non-home indices */ + clear_dd_indices(dd, dd->ncg_home, dd->nat_home); ++ ncgindex_set = 0; + + /* Avoid global communication for dim's without pbc and -gcom */ + if (!bNStGlobalComm) + { + copy_rvec(comm->box0, ddbox.box0 ); + 
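The test n_load_have % n == n-1 above fires on every n-th load collection, while n_load_collect == 0 additionally forces a check at the very first one. A standalone illustration with made-up counts:

#include <stdio.h>

int main(void)
{
    int n = 100;             /* check interval, as in max(100, nstglobalcomm) */
    int n_load_collect = 3;  /* nonzero: not the first collection */
    int n_load_have;

    for (n_load_have = 0; n_load_have < 300; n_load_have++)
    {
        if (n_load_collect == 0 || n_load_have % n == n - 1)
        {
            printf("check at n_load_have = %d\n", n_load_have); /* 99, 199, 299 */
        }
    }
    return 0;
}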
copy_rvec(comm->box_size, ddbox.box_size); + } + set_ddbox(dd, bMasterState, cr, ir, state_local->box, + bNStGlobalComm, &top_local->cgs, state_local->x, &ddbox); + + bBoxChanged = TRUE; + bRedist = TRUE; + } + /* For dim's without pbc and -gcom */ + copy_rvec(ddbox.box0, comm->box0 ); + copy_rvec(ddbox.box_size, comm->box_size); + + set_dd_cell_sizes(dd, &ddbox, dynamic_dd_box(&ddbox, ir), bMasterState, bDoDLB, + step, wcycle); + + if (comm->nstDDDumpGrid > 0 && step % comm->nstDDDumpGrid == 0) + { + write_dd_grid_pdb("dd_grid", step, dd, state_local->box, &ddbox); + } + + /* Check if we should sort the charge groups */ + if (comm->nstSortCG > 0) + { + bSortCG = (bMasterState || + (bRedist && (step % comm->nstSortCG == 0))); + } + else + { + bSortCG = FALSE; + } + + ncg_home_old = dd->ncg_home; + + ncg_moved = 0; + if (bRedist) + { + wallcycle_sub_start(wcycle, ewcsDD_REDIST); + + dd_redistribute_cg(fplog, step, dd, ddbox.tric_dir, + state_local, f, fr, mdatoms, - !bSortCG, nrnb, &cg0, &ncg_moved); ++ !bSortCG, nrnb, &ncgindex_set, &ncg_moved); + + wallcycle_sub_stop(wcycle, ewcsDD_REDIST); + } + + get_nsgrid_boundaries(ddbox.nboundeddim, state_local->box, + dd, &ddbox, + &comm->cell_x0, &comm->cell_x1, + dd->ncg_home, fr->cg_cm, + cell_ns_x0, cell_ns_x1, &grid_density); + + if (bBoxChanged) + { + comm_dd_ns_cell_sizes(dd, &ddbox, cell_ns_x0, cell_ns_x1, step); + } + + switch (fr->cutoff_scheme) + { + case ecutsGROUP: + copy_ivec(fr->ns.grid->n, ncells_old); + grid_first(fplog, fr->ns.grid, dd, &ddbox, fr->ePBC, + state_local->box, cell_ns_x0, cell_ns_x1, + fr->rlistlong, grid_density); + break; + case ecutsVERLET: + nbnxn_get_ncells(fr->nbv->nbs, &ncells_old[XX], &ncells_old[YY]); + break; + default: + gmx_incons("unimplemented"); + } + /* We need to store tric_dir for dd_get_ns_ranges called from ns.c */ + copy_ivec(ddbox.tric_dir, comm->tric_dir); + + if (bSortCG) + { + wallcycle_sub_start(wcycle, ewcsDD_GRID); + + /* Sort the state on charge group position. + * This enables exact restarts from this step. + * It also improves performance by about 15% with larger numbers + * of atoms per node. + */ + + /* Fill the ns grid with the home cell, + * so we can sort with the indices. + */ + set_zones_ncg_home(dd); + + switch (fr->cutoff_scheme) + { + case ecutsVERLET: + set_zones_size(dd, state_local->box, &ddbox, 0, 1); + + nbnxn_put_on_grid(fr->nbv->nbs, fr->ePBC, state_local->box, + 0, + comm->zones.size[0].bb_x0, + comm->zones.size[0].bb_x1, + 0, dd->ncg_home, + comm->zones.dens_zone0, + fr->cginfo, + state_local->x, + ncg_moved, bRedist ? comm->moved : NULL, + fr->nbv->grp[eintLocal].kernel_type, + fr->nbv->grp[eintLocal].nbat); + + nbnxn_get_ncells(fr->nbv->nbs, &ncells_new[XX], &ncells_new[YY]); + break; + case ecutsGROUP: + fill_grid(fplog, &comm->zones, fr->ns.grid, dd->ncg_home, + 0, dd->ncg_home, fr->cg_cm); + + copy_ivec(fr->ns.grid->n, ncells_new); + break; + default: + gmx_incons("unimplemented"); + } + + bResortAll = bMasterState; + + /* Check if we can user the old order and ns grid cell indices + * of the charge groups to sort the charge groups efficiently. + */ + if (ncells_new[XX] != ncells_old[XX] || + ncells_new[YY] != ncells_old[YY] || + ncells_new[ZZ] != ncells_old[ZZ]) + { + bResortAll = TRUE; + } + + if (debug) + { + fprintf(debug, "Step %s, sorting the %d home charge groups\n", + gmx_step_str(step, sbuf), dd->ncg_home); + } + dd_sort_state(dd, ir->ePBC, fr->cg_cm, fr, state_local, + bResortAll ? 
-1 : ncg_home_old);
+        /* Rebuild all the indices */
-        cg0 = 0;
+        ga2la_clear(dd->ga2la);
++       ncgindex_set = 0;
+
+        wallcycle_sub_stop(wcycle, ewcsDD_GRID);
+    }
+
+    wallcycle_sub_start(wcycle, ewcsDD_SETUPCOMM);
+
+    /* Set up the communication and communicate the coordinates */
+    setup_dd_communication(dd, state_local->box, &ddbox, fr, state_local, f);
+
+    /* Set the indices */
-    make_dd_indices(dd, cgs_gl->index, cg0);
++   make_dd_indices(dd, cgs_gl->index, ncgindex_set);
+
+    /* Set the charge group boundaries for neighbor searching */
+    set_cg_boundaries(&comm->zones);
+
+    if (fr->cutoff_scheme == ecutsVERLET)
+    {
+        set_zones_size(dd, state_local->box, &ddbox,
+                       bSortCG ? 1 : 0, comm->zones.n);
+    }
+
+    wallcycle_sub_stop(wcycle, ewcsDD_SETUPCOMM);
+
+    /*
+       write_dd_pdb("dd_home",step,"dump",top_global,cr,
+                    -1,state_local->x,state_local->box);
+     */
+
+    wallcycle_sub_start(wcycle, ewcsDD_MAKETOP);
+
+    /* Extract a local topology from the global topology */
+    for (i = 0; i < dd->ndim; i++)
+    {
+        np[dd->dim[i]] = comm->cd[i].np;
+    }
+    dd_make_local_top(fplog, dd, &comm->zones, dd->npbcdim, state_local->box,
+                      comm->cellsize_min, np,
+                      fr,
+                      fr->cutoff_scheme == ecutsGROUP ? fr->cg_cm : state_local->x,
+                      vsite, top_global, top_local);
+
+    wallcycle_sub_stop(wcycle, ewcsDD_MAKETOP);
+
+    wallcycle_sub_start(wcycle, ewcsDD_MAKECONSTR);
+
+    /* Set up the special atom communication */
+    n = comm->nat[ddnatZONE];
+    for (i = ddnatZONE+1; i < ddnatNR; i++)
+    {
+        switch (i)
+        {
+            case ddnatVSITE:
+                if (vsite && vsite->n_intercg_vsite)
+                {
+                    n = dd_make_local_vsites(dd, n, top_local->idef.il);
+                }
+                break;
+            case ddnatCON:
+                if (dd->bInterCGcons || dd->bInterCGsettles)
+                {
+                    /* Only for inter-cg constraints do we need special code */
+                    n = dd_make_local_constraints(dd, n, top_global, fr->cginfo,
+                                                  constr, ir->nProjOrder,
+                                                  top_local->idef.il);
+                }
+                break;
+            default:
+                gmx_incons("Unknown special atom type setup");
+        }
+        comm->nat[i] = n;
+    }
+
+    wallcycle_sub_stop(wcycle, ewcsDD_MAKECONSTR);
+
+    wallcycle_sub_start(wcycle, ewcsDD_TOPOTHER);
+
+    /* Make space for the extra coordinates for virtual site
+     * or constraint communication.
+     */
+    state_local->natoms = comm->nat[ddnatNR-1];
+    if (state_local->natoms > state_local->nalloc)
+    {
+        dd_realloc_state(state_local, f, state_local->natoms);
+    }
+
+    if (fr->bF_NoVirSum)
+    {
+        if (vsite && vsite->n_intercg_vsite)
+        {
+            nat_f_novirsum = comm->nat[ddnatVSITE];
+        }
+        else
+        {
+            if (EEL_FULL(ir->coulombtype) && dd->n_intercg_excl > 0)
+            {
+                nat_f_novirsum = dd->nat_tot;
+            }
+            else
+            {
+                nat_f_novirsum = dd->nat_home;
+            }
+        }
+    }
+    else
+    {
+        nat_f_novirsum = 0;
+    }
+
+    /* Set the number of atoms required for the force calculation.
+     * Forces need to be constrained when using a twin-range setup
+     * or with energy minimization. For simple simulations we could
+     * avoid some allocation, zeroing and copying, but this is
+     * probably not worth the complications and checking.
+     */
+    forcerec_set_ranges(fr, dd->ncg_home, dd->ncg_tot,
+                        dd->nat_tot, comm->nat[ddnatCON], nat_f_novirsum);
+
+    /* We make all mdatoms up to nat_tot_con.
+     * We could save some work by only setting invmass
+     * between nat_tot and nat_tot_con.
+ */ + /* This call also sets the new number of home particles to dd->nat_home */ + atoms2md(top_global, ir, + comm->nat[ddnatCON], dd->gatindex, 0, dd->nat_home, mdatoms); + + /* Now we have the charges we can sort the FE interactions */ + dd_sort_local_top(dd, mdatoms, top_local); + + if (vsite != NULL) + { + /* Now we have updated mdatoms, we can do the last vsite bookkeeping */ + split_vsites_over_threads(top_local->idef.il, mdatoms, FALSE, vsite); + } + + if (shellfc) + { + /* Make the local shell stuff, currently no communication is done */ + make_local_shells(cr, mdatoms, shellfc); + } + + if (ir->implicit_solvent) + { + make_local_gb(cr, fr->born, ir->gb_algorithm); + } + + init_bonded_thread_force_reduction(fr, &top_local->idef); + + if (!(cr->duty & DUTY_PME)) + { + /* Send the charges to our PME only node */ + gmx_pme_send_q(cr, mdatoms->nChargePerturbed, + mdatoms->chargeA, mdatoms->chargeB, + dd_pme_maxshift_x(dd), dd_pme_maxshift_y(dd)); + } + + if (constr) + { + set_constraints(constr, top_local, ir, mdatoms, cr); + } + + if (ir->ePull != epullNO) + { + /* Update the local pull groups */ + dd_make_local_pull_groups(dd, ir->pull, mdatoms); + } + + if (ir->bRot) + { + /* Update the local rotation groups */ + dd_make_local_rotation_groups(dd, ir->rot); + } + + + add_dd_statistics(dd); + + /* Make sure we only count the cycles for this DD partitioning */ + clear_dd_cycle_counts(dd); + + /* Because the order of the atoms might have changed since + * the last vsite construction, we need to communicate the constructing + * atom coordinates again (for spreading the forces this MD step). + */ + dd_move_x_vsites(dd, state_local->box, state_local->x); + + wallcycle_sub_stop(wcycle, ewcsDD_TOPOTHER); + + if (comm->nstDDDump > 0 && step % comm->nstDDDump == 0) + { + dd_move_x(dd, state_local->box, state_local->x); + write_dd_pdb("dd_dump", step, "dump", top_global, cr, + -1, state_local->x, state_local->box); + } + + /* Store the partitioning step */ + comm->partition_step = step; + + /* Increase the DD partitioning counter */ + dd->ddp_count++; + /* The state currently matches this DD partitioning count, store it */ + state_local->ddp_count = dd->ddp_count; + if (bMasterState) + { + /* The DD master node knows the complete cg distribution, + * store the count so we can possibly skip the cg info communication. + */ + comm->master_cg_ddp_count = (bSortCG ? 0 : dd->ddp_count); + } + + if (comm->DD_debug > 0) + { + /* Set the env var GMX_DD_DEBUG if you suspect corrupted indices */ + check_index_consistency(dd, top_global->natoms, ncg_mtop(top_global), + "after partitioning"); + } +} diff --cc src/programs/mdrun/md.c index 99683e8df2,0000000000..c75b17e76c mode 100644,000000..100644 --- a/src/programs/mdrun/md.c +++ b/src/programs/mdrun/md.c @@@ -1,2236 -1,0 +1,2236 @@@ +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*- + * + * + * This source code is part of + * + * G R O M A C S + * + * GROningen MAchine for Chemical Simulations + * + * VERSION 3.2.0 + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others. + * Copyright (c) 1991-2000, University of Groningen, The Netherlands. + * Copyright (c) 2001-2004, The GROMACS development team, + * check out http://www.gromacs.org for more information. 
+
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * If you want to redistribute modifications, please consider that
+ * scientific software is very special. Version control is crucial -
+ * bugs must be traceable. We will be happy to consider code for
+ * inclusion in the official distribution, but derived work must not
+ * be called official GROMACS. Details are found in the README & COPYING
+ * files - if they are missing, get the official version at www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the papers on the package - you can find them in the top README file.
+ *
+ * For more info, check our website at http://www.gromacs.org
+ *
+ * And Hey:
+ * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include "typedefs.h"
+#include "smalloc.h"
+#include "sysstuff.h"
+#include "vec.h"
+#include "statutil.h"
+#include "vcm.h"
+#include "mdebin.h"
+#include "nrnb.h"
+#include "calcmu.h"
+#include "index.h"
+#include "vsite.h"
+#include "update.h"
+#include "ns.h"
+#include "trnio.h"
+#include "xtcio.h"
+#include "mdrun.h"
+#include "md_support.h"
+#include "md_logging.h"
+#include "confio.h"
+#include "network.h"
+#include "pull.h"
+#include "xvgr.h"
+#include "physics.h"
+#include "names.h"
+#include "xmdrun.h"
+#include "ionize.h"
+#include "disre.h"
+#include "orires.h"
+#include "pme.h"
+#include "mdatoms.h"
+#include "repl_ex.h"
+#include "qmmm.h"
+#include "domdec.h"
+#include "domdec_network.h"
+#include "partdec.h"
+#include "topsort.h"
+#include "coulomb.h"
+#include "constr.h"
+#include "shellfc.h"
+#include "compute_io.h"
+#include "mvdata.h"
+#include "checkpoint.h"
+#include "mtop_util.h"
+#include "sighandler.h"
+#include "txtdump.h"
+#include "string2.h"
+#include "pme_loadbal.h"
+#include "bondf.h"
+#include "membed.h"
+#include "types/nlistheuristics.h"
+#include "types/iteratedconstraints.h"
+#include "nbnxn_cuda_data_mgmt.h"
+
+#include "gromacs/utility/gmxmpi.h"
+
+#ifdef GMX_FAHCORE
+#include "corewrap.h"
+#endif
+
+static void reset_all_counters(FILE *fplog, t_commrec *cr,
+ gmx_large_int_t step,
+ gmx_large_int_t *step_rel, t_inputrec *ir,
+ gmx_wallcycle_t wcycle, t_nrnb *nrnb,
+ gmx_runtime_t *runtime,
+ nbnxn_cuda_ptr_t cu_nbv)
+{
+ char sbuf[STEPSTRSIZE];
+
+ /* Reset all the counters related to performance over the run */
+ md_print_warn(cr, fplog, "step %s: resetting all time and cycle counters\n",
+ gmx_step_str(step, sbuf));
+
+ if (cu_nbv)
+ {
+ nbnxn_cuda_reset_timings(cu_nbv);
+ }
+
+ wallcycle_stop(wcycle, ewcRUN);
+ wallcycle_reset_all(wcycle);
+ if (DOMAINDECOMP(cr))
+ {
+ reset_dd_statistics_counters(cr->dd);
+ }
+ init_nrnb(nrnb);
+ ir->init_step += *step_rel;
+ ir->nsteps -= *step_rel;
+ *step_rel = 0;
+ wallcycle_start(wcycle, ewcRUN);
+ runtime_start(runtime);
+ print_date_and_time(fplog, cr->nodeid, "Restarted time", runtime);
+}
+
+double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[],
+ const output_env_t oenv, gmx_bool bVerbose, gmx_bool bCompact,
+ int nstglobalcomm,
+ gmx_vsite_t *vsite, gmx_constr_t constr,
+ int stepout, t_inputrec *ir,
+ gmx_mtop_t *top_global,
+ t_fcdata *fcd,
+ t_state *state_global,
+ t_mdatoms *mdatoms,
+ t_nrnb *nrnb, gmx_wallcycle_t wcycle,
+ gmx_edsam_t ed, t_forcerec *fr,
+ int
repl_ex_nst, int repl_ex_nex, int repl_ex_seed, gmx_membed_t membed, + real cpt_period, real max_hours, + const char *deviceOptions, + unsigned long Flags, + gmx_runtime_t *runtime) +{ + gmx_mdoutf_t *outf; + gmx_large_int_t step, step_rel; + double run_time; + double t, t0, lam0[efptNR]; + gmx_bool bGStatEveryStep, bGStat, bCalcVir, bCalcEner; + gmx_bool bNS, bNStList, bSimAnn, bStopCM, bRerunMD, bNotLastFrame = FALSE, + bFirstStep, bStateFromCP, bStateFromTPX, bInitStep, bLastStep, + bBornRadii, bStartingFromCpt; + gmx_bool bDoDHDL = FALSE, bDoFEP = FALSE, bDoExpanded = FALSE; + gmx_bool do_ene, do_log, do_verbose, bRerunWarnNoV = TRUE, + bForceUpdate = FALSE, bCPT; + int mdof_flags; + gmx_bool bMasterState; + int force_flags, cglo_flags; + tensor force_vir, shake_vir, total_vir, tmp_vir, pres; + int i, m; + t_trxstatus *status; + rvec mu_tot; + t_vcm *vcm; + t_state *bufstate = NULL; + matrix *scale_tot, pcoupl_mu, M, ebox; + gmx_nlheur_t nlh; + t_trxframe rerun_fr; + gmx_repl_ex_t repl_ex = NULL; + int nchkpt = 1; + gmx_localtop_t *top; + t_mdebin *mdebin = NULL; + df_history_t df_history; + t_state *state = NULL; + rvec *f_global = NULL; + int n_xtc = -1; + rvec *x_xtc = NULL; + gmx_enerdata_t *enerd; + rvec *f = NULL; + gmx_global_stat_t gstat; + gmx_update_t upd = NULL; + t_graph *graph = NULL; + globsig_t gs; + gmx_rng_t mcrng = NULL; + gmx_bool bFFscan; + gmx_groups_t *groups; + gmx_ekindata_t *ekind, *ekind_save; + gmx_shellfc_t shellfc; + int count, nconverged = 0; + real timestep = 0; + double tcount = 0; + gmx_bool bIonize = FALSE; + gmx_bool bTCR = FALSE, bConverged = TRUE, bOK, bSumEkinhOld, bExchanged; + gmx_bool bAppend; + gmx_bool bResetCountersHalfMaxH = FALSE; + gmx_bool bVV, bIterativeCase, bFirstIterate, bTemp, bPres, bTrotter; + gmx_bool bUpdateDoLR; + real mu_aver = 0, dvdl_constr; + int a0, a1, gnx = 0, ii; + atom_id *grpindex = NULL; + char *grpname; + t_coupl_rec *tcr = NULL; + rvec *xcopy = NULL, *vcopy = NULL, *cbuf = NULL; + matrix boxcopy = {{0}}, lastbox; + tensor tmpvir; + real fom, oldfom, veta_save, pcurr, scalevir, tracevir; + real vetanew = 0; + int lamnew = 0; + /* for FEP */ + int nstfep; + real rate; + double cycles; + real saved_conserved_quantity = 0; + real last_ekin = 0; + int iter_i; + t_extmass MassQ; + int **trotter_seq; + char sbuf[STEPSTRSIZE], sbuf2[STEPSTRSIZE]; + int handled_stop_condition = gmx_stop_cond_none; /* compare to get_stop_condition*/ + gmx_iterate_t iterate; + gmx_large_int_t multisim_nsteps = -1; /* number of steps to do before first multisim + simulation stops. If equal to zero, don't + communicate any more between multisims.*/ + /* PME load balancing data for GPU kernels */ + pme_load_balancing_t pme_loadbal = NULL; + double cycles_pmes; + gmx_bool bPMETuneTry = FALSE, bPMETuneRunning = FALSE; + +#ifdef GMX_FAHCORE + /* Temporary addition for FAHCORE checkpointing */ + int chkpt_ret; +#endif + + /* Check for special mdrun options */ + bRerunMD = (Flags & MD_RERUN); + bIonize = (Flags & MD_IONIZE); + bFFscan = (Flags & MD_FFSCAN); + bAppend = (Flags & MD_APPENDFILES); + if (Flags & MD_RESETCOUNTERSHALFWAY) + { + if (ir->nsteps > 0) + { + /* Signal to reset the counters half the simulation steps. */ + wcycle_set_reset_counters(wcycle, ir->nsteps/2); + } + /* Signal to reset the counters halfway the simulation time. 
*/
+ bResetCountersHalfMaxH = (max_hours > 0);
+ }
+
+ /* md-vv uses averaged full step velocities for T-control
+ md-vv-avek uses averaged half step velocities for T-control (but full step ekin for P control)
+ md uses averaged half step kinetic energies to determine temperature unless defined otherwise by GMX_EKIN_AVE_VEL; */
+ bVV = EI_VV(ir->eI);
+ if (bVV) /* to store the initial velocities while computing virial */
+ {
+ snew(cbuf, top_global->natoms);
+ }
+ /* all the iterative cases - only if there are constraints */
+ bIterativeCase = ((IR_NPH_TROTTER(ir) || IR_NPT_TROTTER(ir)) && (constr) && (!bRerunMD));
+ gmx_iterate_init(&iterate, FALSE); /* The default value of iterate->bIterationActive is set to
+ false in this step. The correct value, true or false,
+ is set at each step, as it depends on the frequency of temperature
+ and pressure control.*/
+ bTrotter = (bVV && (IR_NPT_TROTTER(ir) || IR_NPH_TROTTER(ir) || IR_NVT_TROTTER(ir)));
+
+ if (bRerunMD)
+ {
+ /* Since we don't know if the frames read are related in any way,
+ * rebuild the neighborlist at every step.
+ */
+ ir->nstlist = 1;
+ ir->nstcalcenergy = 1;
+ nstglobalcomm = 1;
+ }
+
+ check_ir_old_tpx_versions(cr, fplog, ir, top_global);
+
+ nstglobalcomm = check_nstglobalcomm(fplog, cr, nstglobalcomm, ir);
+ bGStatEveryStep = (nstglobalcomm == 1);
+
+ if (!bGStatEveryStep && ir->nstlist == -1 && fplog != NULL)
+ {
+ fprintf(fplog,
+ "To reduce the energy communication with nstlist = -1\n"
+ "the neighbor list validity should not be checked at every step,\n"
+ "this means that exact integration is not guaranteed.\n"
+ "The neighbor list validity is checked after:\n"
+ " - 2*std.dev.(n.list life time) steps.\n"
+ "In most cases this will result in exact integration.\n"
+ "This reduces the energy communication by a factor of 2 to 3.\n"
+ "If you want less energy communication, set nstlist > 3.\n\n");
+ }
+
+ if (bRerunMD || bFFscan)
+ {
+ ir->nstxtcout = 0;
+ }
+ groups = &top_global->groups;
+
+ /* Initial values */
+ init_md(fplog, cr, ir, oenv, &t, &t0, state_global->lambda,
+ &(state_global->fep_state), lam0,
+ nrnb, top_global, &upd,
+ nfile, fnm, &outf, &mdebin,
+ force_vir, shake_vir, mu_tot, &bSimAnn, &vcm, state_global, Flags);
+
+ clear_mat(total_vir);
+ clear_mat(pres);
+ /* Energy terms and groups */
+ snew(enerd, 1);
+ init_enerdata(top_global->groups.grps[egcENER].nr, ir->fepvals->n_lambda,
+ enerd);
+ if (DOMAINDECOMP(cr))
+ {
+ f = NULL;
+ }
+ else
+ {
+ snew(f, top_global->natoms);
+ }
+
+ /* lambda Monte Carlo random number generator */
+ if (ir->bExpanded)
+ {
+ mcrng = gmx_rng_init(ir->expandedvals->lmc_seed);
+ }
+ /* copy the state into df_history */
+ copy_df_history(&df_history, &state_global->dfhist);
+
+ /* Kinetic energy data */
+ snew(ekind, 1);
+ init_ekindata(fplog, top_global, &(ir->opts), ekind);
+ /* needed for iteration of constraints */
+ snew(ekind_save, 1);
+ init_ekindata(fplog, top_global, &(ir->opts), ekind_save);
+ /* Copy the cos acceleration to the groups struct */
+ ekind->cosacc.cos_accel = ir->cos_accel;
+
+ gstat = global_stat_init(ir);
+ debug_gmx();
+
+ /* Check for polarizable models and flexible constraints */
+ shellfc = init_shell_flexcon(fplog,
+ top_global, n_flexible_constraints(constr),
+ (ir->bContinuation ||
+ (DOMAINDECOMP(cr) && !MASTER(cr))) ? NULL : state_global->x);
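/* [Illustrative aside, not part of this patch] The comment earlier in this
 * function distinguishes T-control from full-step velocities (md-vv) from the
 * leap-frog average of half-step kinetic energies (md). The two estimators,
 * sketched for a 1-D particle set (all names assumed): */
static double ekin_full_step(const double *v, const double *m, int n)
{
    double e = 0;
    int    i;
    for (i = 0; i < n; i++)
    {
        e += 0.5*m[i]*v[i]*v[i];   /* from v(t): the AveVel estimate */
    }
    return e;
}

static double ekin_half_step_avg(double ekinh_old, double ekinh_new)
{
    /* leap-frog: average Ekin(t-dt/2) and Ekin(t+dt/2) to estimate Ekin(t) */
    return 0.5*(ekinh_old + ekinh_new);
}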
+ if (DEFORM(*ir))
+ {
+#ifdef GMX_THREAD_MPI
+ tMPI_Thread_mutex_lock(&deform_init_box_mutex);
+#endif
+ set_deform_reference_box(upd,
+ deform_init_init_step_tpx,
+ deform_init_box_tpx);
+#ifdef GMX_THREAD_MPI
+ tMPI_Thread_mutex_unlock(&deform_init_box_mutex);
+#endif
+ }
+
+ {
+ double io = compute_io(ir, top_global->natoms, groups, mdebin->ebin->nener, 1);
+ if ((io > 2000) && MASTER(cr))
+ {
+ fprintf(stderr,
+ "\nWARNING: This run will generate roughly %.0f Mb of data\n\n",
+ io);
+ }
+ }
+
+ if (DOMAINDECOMP(cr))
+ {
+ top = dd_init_local_top(top_global);
+
+ snew(state, 1);
+ dd_init_local_state(cr->dd, state_global, state);
+
+ if (DDMASTER(cr->dd) && ir->nstfout)
+ {
+ snew(f_global, state_global->natoms);
+ }
+ }
+ else
+ {
+ if (PAR(cr))
+ {
+ /* Initialize the particle decomposition and split the topology */
+ top = split_system(fplog, top_global, ir, cr);
+
+ pd_cg_range(cr, &fr->cg0, &fr->hcg);
+ pd_at_range(cr, &a0, &a1);
+ }
+ else
+ {
+ top = gmx_mtop_generate_local_top(top_global, ir);
+
+ a0 = 0;
+ a1 = top_global->natoms;
+ }
+
+ forcerec_set_excl_load(fr, top, cr);
+
+ state = partdec_init_local_state(cr, state_global);
+ f_global = f;
+
+ atoms2md(top_global, ir, 0, NULL, a0, a1-a0, mdatoms);
+
+ if (vsite)
+ {
+ set_vsite_top(vsite, top, mdatoms, cr);
+ }
+
+ if (ir->ePBC != epbcNONE && !fr->bMolPBC)
+ {
+ graph = mk_graph(fplog, &(top->idef), 0, top_global->natoms, FALSE, FALSE);
+ }
+
+ if (shellfc)
+ {
+ make_local_shells(cr, mdatoms, shellfc);
+ }
+
+ init_bonded_thread_force_reduction(fr, &top->idef);
+
+ if (ir->pull && PAR(cr))
+ {
+ dd_make_local_pull_groups(NULL, ir->pull, mdatoms);
+ }
+ }
+
+ if (DOMAINDECOMP(cr))
+ {
+ /* Distribute the charge groups over the nodes from the master node */
+ dd_partition_system(fplog, ir->init_step, cr, TRUE, 1,
+ state_global, top_global, ir,
+ state, &f, mdatoms, top, fr,
+ vsite, shellfc, constr,
+ nrnb, wcycle, FALSE);
+
+ }
+
+ update_mdatoms(mdatoms, state->lambda[efptMASS]);
+
+ if (opt2bSet("-cpi", nfile, fnm))
+ {
+ bStateFromCP = gmx_fexist_master(opt2fn_master("-cpi", nfile, fnm, cr), cr);
+ }
+ else
+ {
+ bStateFromCP = FALSE;
+ }
+
+ if (MASTER(cr))
+ {
+ if (bStateFromCP)
+ {
+ /* Update mdebin with energy history if appending to output files */
+ if (Flags & MD_APPENDFILES)
+ {
+ restore_energyhistory_from_state(mdebin, &state_global->enerhist);
+ }
+ else
+ {
+ /* We might have read an energy history from checkpoint,
+ * free the allocated memory and reset the counts.
+ */
+ done_energyhistory(&state_global->enerhist);
+ init_energyhistory(&state_global->enerhist);
+ }
+ }
+ /* Set the initial energy history in state by updating once */
+ update_energyhistory(&state_global->enerhist, mdebin);
+ }
+
+ if ((state->flags & (1<<estLD_RNG)) && (Flags & MD_READ_RNG))
+ {
+ /* Set the random state if we read a checkpoint file */
+ set_stochd_state(upd, state);
+ }
+ if (state->flags & (1<<estMC_RNG))
+ {
+ set_mc_state(mcrng, state);
+ }
+
+ /* Initialize constraints */
+ if (constr)
+ {
+ if (!DOMAINDECOMP(cr))
+ {
+ set_constraints(constr, top, ir, mdatoms, cr);
+ }
+ }
+
+ /* Check whether we have to GCT stuff */
+ bTCR = ftp2bSet(efGCT, nfile, fnm);
+ if (bTCR)
+ {
+ if (MASTER(cr))
+ {
+ fprintf(stderr, "Will do General Coupling Theory!\n");
+ }
+ gnx = top_global->mols.nr;
+ snew(grpindex, gnx);
+ for (i = 0; (i < gnx); i++)
+ {
+ grpindex[i] = i;
+ }
+ }
+
+ if (repl_ex_nst > 0)
+ {
+ /* We need to be sure replica exchange can only occur
+ * when the energies are current */
+ check_nst_param(fplog, cr, "nstcalcenergy", ir->nstcalcenergy,
+ "repl_ex_nst", &repl_ex_nst);
+ /* This check needs to happen before inter-simulation
+ * signals are initialized, too */
+ }
+ if (repl_ex_nst > 0 && MASTER(cr))
+ {
+ repl_ex = init_replica_exchange(fplog, cr->ms, state_global, ir,
+ repl_ex_nst, repl_ex_nex, repl_ex_seed);
+ }
+
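/* [Illustrative aside, not part of this patch] check_nst_param() above makes
 * repl_ex_nst a multiple of nstcalcenergy so exchanges only happen on steps
 * with current energies. A sketch of such an interval adjustment (hypothetical
 * helper, not the GROMACS implementation): */
#include <stdio.h>

static int round_up_to_multiple(int nst, int base)
{
    if (base <= 0 || nst <= 0)
    {
        return nst;
    }
    if (nst % base != 0)
    {
        int fixed = ((nst + base - 1)/base)*base;  /* smallest multiple >= nst */
        fprintf(stderr, "adjusting interval %d -> %d (multiple of %d)\n",
                nst, fixed, base);
        return fixed;
    }
    return nst;
}

+ /* PME tuning is only supported with GPUs or PME nodes and not with rerun.
+ * With perturbed charges with soft-core we should not change the cut-off.
+ */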
+ if ((Flags & MD_TUNEPME) &&
+ EEL_PME(fr->eeltype) &&
+ ( (fr->cutoff_scheme == ecutsVERLET && fr->nbv->bUseGPU) || !(cr->duty & DUTY_PME)) &&
+ !(ir->efep != efepNO && mdatoms->nChargePerturbed > 0 && ir->fepvals->bScCoul) &&
+ !bRerunMD)
+ {
+ pme_loadbal_init(&pme_loadbal, ir, state->box, fr->ic, fr->pmedata);
+ cycles_pmes = 0;
+ if (cr->duty & DUTY_PME)
+ {
+ /* Start tuning right away, as we can't measure the load */
+ bPMETuneRunning = TRUE;
+ }
+ else
+ {
+ /* Separate PME nodes, we can measure the PP/PME load balance */
+ bPMETuneTry = TRUE;
+ }
+ }
+
+ if (!ir->bContinuation && !bRerunMD)
+ {
+ if (mdatoms->cFREEZE && (state->flags & (1<<estV)))
+ {
+ /* Set the velocities of frozen particles to zero */
+ for (i = mdatoms->start; i < mdatoms->start+mdatoms->homenr; i++)
+ {
+ for (m = 0; m < DIM; m++)
+ {
+ if (ir->opts.nFreeze[mdatoms->cFREEZE[i]][m])
+ {
+ state->v[i][m] = 0;
+ }
+ }
+ }
+ }
+
+ if (constr)
+ {
+ /* Constrain the initial coordinates and velocities */
+ do_constrain_first(fplog, constr, ir, mdatoms, state, f,
+ graph, cr, nrnb, fr, top, shake_vir);
+ }
+ if (vsite)
+ {
+ /* Construct the virtual sites for the initial configuration */
+ construct_vsites(fplog, vsite, state->x, nrnb, ir->delta_t, NULL,
+ top->idef.iparams, top->idef.il,
+ fr->ePBC, fr->bMolPBC, graph, cr, state->box);
+ }
+ }
+
+ debug_gmx();
+
+ /* set free energy calculation frequency as the minimum of nstdhdl, nstexpanded, and nstrepl_ex_nst*/
+ nstfep = ir->fepvals->nstdhdl;
+ if (ir->bExpanded && (nstfep > ir->expandedvals->nstexpanded))
+ {
+ nstfep = ir->expandedvals->nstexpanded;
+ }
+ if (repl_ex_nst > 0 && nstfep > repl_ex_nst)
+ {
+ nstfep = repl_ex_nst;
+ }
+
+ /* I'm assuming we need global communication the first time! MRS */
+ cglo_flags = (CGLO_TEMPERATURE | CGLO_GSTAT
+ | ((ir->comm_mode != ecmNO) ? CGLO_STOPCM : 0)
+ | (bVV ? CGLO_PRESSURE : 0)
+ | (bVV ? CGLO_CONSTRAINT : 0)
+ | (bRerunMD ? CGLO_RERUNMD : 0)
+ | ((Flags & MD_READ_EKIN) ? CGLO_READEKIN : 0));
+
+ bSumEkinhOld = FALSE;
+ compute_globals(fplog, gstat, cr, ir, fr, ekind, state, state_global, mdatoms, nrnb, vcm,
+ NULL, enerd, force_vir, shake_vir, total_vir, pres, mu_tot,
+ constr, NULL, FALSE, state->box,
+ top_global, &pcurr, top_global->natoms, &bSumEkinhOld, cglo_flags);
+ if (ir->eI == eiVVAK)
+ {
+ /* a second call to get the half step temperature initialized as well */
+ /* we do the same call as above, but turn the pressure off -- internally to
+ compute_globals, this is recognized as a velocity verlet half-step
+ kinetic energy calculation. This minimizes excess variables, but
+ perhaps loses some logic?*/
+
+ compute_globals(fplog, gstat, cr, ir, fr, ekind, state, state_global, mdatoms, nrnb, vcm,
+ NULL, enerd, force_vir, shake_vir, total_vir, pres, mu_tot,
+ constr, NULL, FALSE, state->box,
+ top_global, &pcurr, top_global->natoms, &bSumEkinhOld,
+ cglo_flags &~(CGLO_STOPCM | CGLO_PRESSURE));
+ }
+
+ /* Calculate the initial half step temperature, and save the ekinh_old */
+ if (!(Flags & MD_STARTFROMCPT))
+ {
+ for (i = 0; (i < ir->opts.ngtc); i++)
+ {
+ copy_mat(ekind->tcstat[i].ekinh, ekind->tcstat[i].ekinh_old);
+ }
+ }
+ if (ir->eI != eiVV)
+ {
+ enerd->term[F_TEMP] *= 2; /* result of averages being done over previous and current step,
+ and there is no previous step */
+ }
+
+ /* if using an iterative algorithm, we need to create a working directory for the state.
*/ + if (bIterativeCase) + { + bufstate = init_bufstate(state); + } + if (bFFscan) + { + snew(xcopy, state->natoms); + snew(vcopy, state->natoms); + copy_rvecn(state->x, xcopy, 0, state->natoms); + copy_rvecn(state->v, vcopy, 0, state->natoms); + copy_mat(state->box, boxcopy); + } + + /* need to make an initiation call to get the Trotter variables set, as well as other constants for non-trotter + temperature control */ + trotter_seq = init_npt_vars(ir, state, &MassQ, bTrotter); + + if (MASTER(cr)) + { + if (constr && !ir->bContinuation && ir->eConstrAlg == econtLINCS) + { + fprintf(fplog, + "RMS relative constraint deviation after constraining: %.2e\n", + constr_rmsd(constr, FALSE)); + } + if (EI_STATE_VELOCITY(ir->eI)) + { + fprintf(fplog, "Initial temperature: %g K\n", enerd->term[F_TEMP]); + } + if (bRerunMD) + { + fprintf(stderr, "starting md rerun '%s', reading coordinates from" + " input trajectory '%s'\n\n", + *(top_global->name), opt2fn("-rerun", nfile, fnm)); + if (bVerbose) + { + fprintf(stderr, "Calculated time to finish depends on nsteps from " + "run input file,\nwhich may not correspond to the time " + "needed to process input trajectory.\n\n"); + } + } + else + { + char tbuf[20]; + fprintf(stderr, "starting mdrun '%s'\n", + *(top_global->name)); + if (ir->nsteps >= 0) + { + sprintf(tbuf, "%8.1f", (ir->init_step+ir->nsteps)*ir->delta_t); + } + else + { + sprintf(tbuf, "%s", "infinite"); + } + if (ir->init_step > 0) + { + fprintf(stderr, "%s steps, %s ps (continuing from step %s, %8.1f ps).\n", + gmx_step_str(ir->init_step+ir->nsteps, sbuf), tbuf, + gmx_step_str(ir->init_step, sbuf2), + ir->init_step*ir->delta_t); + } + else + { + fprintf(stderr, "%s steps, %s ps.\n", + gmx_step_str(ir->nsteps, sbuf), tbuf); + } + } + fprintf(fplog, "\n"); + } + + /* Set and write start time */ + runtime_start(runtime); + print_date_and_time(fplog, cr->nodeid, "Started mdrun", runtime); + wallcycle_start(wcycle, ewcRUN); + if (fplog) + { + fprintf(fplog, "\n"); + } + + /* safest point to do file checkpointing is here. More general point would be immediately before integrator call */ +#ifdef GMX_FAHCORE + chkpt_ret = fcCheckPointParallel( cr->nodeid, + NULL, 0); + if (chkpt_ret == 0) + { + gmx_fatal( 3, __FILE__, __LINE__, "Checkpoint error on step %d\n", 0 ); + } +#endif + + debug_gmx(); + /*********************************************************** + * + * Loop over MD steps + * + ************************************************************/ + + /* if rerunMD then read coordinates and velocities from input trajectory */ + if (bRerunMD) + { + if (getenv("GMX_FORCE_UPDATE")) + { + bForceUpdate = TRUE; + } + + rerun_fr.natoms = 0; + if (MASTER(cr)) + { + bNotLastFrame = read_first_frame(oenv, &status, + opt2fn("-rerun", nfile, fnm), + &rerun_fr, TRX_NEED_X | TRX_READ_V); + if (rerun_fr.natoms != top_global->natoms) + { + gmx_fatal(FARGS, + "Number of atoms in trajectory (%d) does not match the " + "run input file (%d)\n", + rerun_fr.natoms, top_global->natoms); + } + if (ir->ePBC != epbcNONE) + { + if (!rerun_fr.bBox) + { + gmx_fatal(FARGS, "Rerun trajectory frame step %d time %f does not contain a box, while pbc is used", rerun_fr.step, rerun_fr.time); + } + if (max_cutoff2(ir->ePBC, rerun_fr.box) < sqr(fr->rlistlong)) + { + gmx_fatal(FARGS, "Rerun trajectory frame step %d time %f has too small box dimensions", rerun_fr.step, rerun_fr.time); + } + } + } + + if (PAR(cr)) + { + rerun_parallel_comm(cr, &rerun_fr, &bNotLastFrame); + } + + if (ir->ePBC != epbcNONE) + { + /* Set the shift vectors. 
+ * Necessary here when we have a static box different from the tpr box.
+ */
+ calc_shifts(rerun_fr.box, fr->shift_vec);
+ }
+ }
+
+ /* loop over MD steps or if rerunMD to end of input trajectory */
+ bFirstStep = TRUE;
+ /* Skip the first Nose-Hoover integration when we get the state from tpx */
+ bStateFromTPX = !bStateFromCP;
+ bInitStep = bFirstStep && (bStateFromTPX || bVV);
+ bStartingFromCpt = (Flags & MD_STARTFROMCPT) && bInitStep;
+ bLastStep = FALSE;
+ bSumEkinhOld = FALSE;
+ bExchanged = FALSE;
+
+ init_global_signals(&gs, cr, ir, repl_ex_nst);
+
+ step = ir->init_step;
+ step_rel = 0;
+
+ if (ir->nstlist == -1)
+ {
+ init_nlistheuristics(&nlh, bGStatEveryStep, step);
+ }
+
+ if (MULTISIM(cr) && (repl_ex_nst <= 0 ))
+ {
+ /* check how many steps are left in other sims */
+ multisim_nsteps = get_multisim_nsteps(cr, ir->nsteps);
+ }
+
+
+ /* and stop now if we should */
+ bLastStep = (bRerunMD || (ir->nsteps >= 0 && step_rel > ir->nsteps) ||
+ ((multisim_nsteps >= 0) && (step_rel >= multisim_nsteps )));
+ while (!bLastStep || (bRerunMD && bNotLastFrame))
+ {
+
+ wallcycle_start(wcycle, ewcSTEP);
+
+ if (bRerunMD)
+ {
+ if (rerun_fr.bStep)
+ {
+ step = rerun_fr.step;
+ step_rel = step - ir->init_step;
+ }
+ if (rerun_fr.bTime)
+ {
+ t = rerun_fr.time;
+ }
+ else
+ {
+ t = step;
+ }
+ }
+ else
+ {
+ bLastStep = (step_rel == ir->nsteps);
+ t = t0 + step*ir->delta_t;
+ }
+
+ if (ir->efep != efepNO || ir->bSimTemp)
+ {
+ /* find and set the current lambdas. If rerunning, we either read in a state, or a lambda value,
+ requiring different logic. */
+
+ set_current_lambdas(step, ir->fepvals, bRerunMD, &rerun_fr, state_global, state, lam0);
+ bDoDHDL = do_per_step(step, ir->fepvals->nstdhdl);
+ bDoFEP = (do_per_step(step, nstfep) && (ir->efep != efepNO));
+ bDoExpanded = (do_per_step(step, ir->expandedvals->nstexpanded) && (ir->bExpanded) && (step > 0));
+ }
+
+ if (bSimAnn)
+ {
+ update_annealing_target_temp(&(ir->opts), t);
+ }
+
+ if (bRerunMD)
+ {
+ if (!(DOMAINDECOMP(cr) && !MASTER(cr)))
+ {
+ for (i = 0; i < state_global->natoms; i++)
+ {
+ copy_rvec(rerun_fr.x[i], state_global->x[i]);
+ }
+ if (rerun_fr.bV)
+ {
+ for (i = 0; i < state_global->natoms; i++)
+ {
+ copy_rvec(rerun_fr.v[i], state_global->v[i]);
+ }
+ }
+ else
+ {
+ for (i = 0; i < state_global->natoms; i++)
+ {
+ clear_rvec(state_global->v[i]);
+ }
+ if (bRerunWarnNoV)
+ {
+ fprintf(stderr, "\nWARNING: Some frames do not contain velocities.\n"
+ " Ekin, temperature and pressure are incorrect,\n"
+ " the virial will be incorrect when constraints are present.\n"
+ "\n");
+ bRerunWarnNoV = FALSE;
+ }
+ }
+ }
+ copy_mat(rerun_fr.box, state_global->box);
+ copy_mat(state_global->box, state->box);
+
+ if (vsite && (Flags & MD_RERUN_VSITE))
+ {
+ if (DOMAINDECOMP(cr))
+ {
+ gmx_fatal(FARGS, "Vsite recalculation with -rerun is not implemented for domain decomposition, use particle decomposition");
+ }
+ if (graph)
+ {
+ /* Following is necessary because the graph may get out of sync
+ * with the coordinates if we only have every N'th coordinate set
+ */
+ mk_mshift(fplog, graph, fr->ePBC, state->box, state->x);
+ shift_self(graph, state->box, state->x);
+ }
+ construct_vsites(fplog, vsite, state->x, nrnb, ir->delta_t, state->v,
+ top->idef.iparams, top->idef.il,
+ fr->ePBC, fr->bMolPBC, graph, cr, state->box);
+ if (graph)
+ {
+ unshift_self(graph, state->box, state->x);
+ }
+ }
+ }
+
+ /* Stop Center of Mass motion */
+ bStopCM = (ir->comm_mode != ecmNO && do_per_step(step, ir->nstcomm));
+
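/* [Illustrative aside, not part of this patch] Most per-step decisions in this
 * loop (NS, COM removal, output, FEP) are driven by do_per_step(step, nst)
 * tests. The predicate is simply "every nst steps"; a simplified sketch of its
 * semantics (the real helper lives elsewhere in GROMACS): */
static int do_per_step_sketch(long step, long nst)
{
    return (nst > 0) && (step % nst == 0);   /* act on step 0, nst, 2*nst, ... */
}
/* e.g. a neighbour-search step would be do_per_step_sketch(step, nstlist) */

+ /* Copy back starting coordinates in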
case we're doing a forcefield scan */ + if (bFFscan) + { + for (ii = 0; (ii < state->natoms); ii++) + { + copy_rvec(xcopy[ii], state->x[ii]); + copy_rvec(vcopy[ii], state->v[ii]); + } + copy_mat(boxcopy, state->box); + } + + if (bRerunMD) + { + /* for rerun MD always do Neighbour Searching */ + bNS = (bFirstStep || ir->nstlist != 0); + bNStList = bNS; + } + else + { + /* Determine whether or not to do Neighbour Searching and LR */ + bNStList = (ir->nstlist > 0 && step % ir->nstlist == 0); + + bNS = (bFirstStep || bExchanged || bNStList || bDoFEP || + (ir->nstlist == -1 && nlh.nabnsb > 0)); + + if (bNS && ir->nstlist == -1) + { + set_nlistheuristics(&nlh, bFirstStep || bExchanged || bDoFEP, step); + } + } + + /* check whether we should stop because another simulation has + stopped. */ + if (MULTISIM(cr)) + { + if ( (multisim_nsteps >= 0) && (step_rel >= multisim_nsteps) && + (multisim_nsteps != ir->nsteps) ) + { + if (bNS) + { + if (MASTER(cr)) + { + fprintf(stderr, + "Stopping simulation %d because another one has finished\n", + cr->ms->sim); + } + bLastStep = TRUE; + gs.sig[eglsCHKPT] = 1; + } + } + } + + /* < 0 means stop at next step, > 0 means stop at next NS step */ - if ( (gs.set[eglsSTOPCOND] < 0 ) || - ( (gs.set[eglsSTOPCOND] > 0 ) && ( bNS || ir->nstlist == 0)) ) ++ if ( (gs.set[eglsSTOPCOND] < 0) || ++ ( (gs.set[eglsSTOPCOND] > 0) && (bNStList || ir->nstlist == 0) ) ) + { + bLastStep = TRUE; + } + + /* Determine whether or not to update the Born radii if doing GB */ + bBornRadii = bFirstStep; + if (ir->implicit_solvent && (step % ir->nstgbradii == 0)) + { + bBornRadii = TRUE; + } + + do_log = do_per_step(step, ir->nstlog) || bFirstStep || bLastStep; + do_verbose = bVerbose && + (step % stepout == 0 || bFirstStep || bLastStep); + + if (bNS && !(bFirstStep && ir->bContinuation && !bRerunMD)) + { + if (bRerunMD) + { + bMasterState = TRUE; + } + else + { + bMasterState = FALSE; + /* Correct the new box if it is too skewed */ + if (DYNAMIC_BOX(*ir)) + { + if (correct_box(fplog, step, state->box, graph)) + { + bMasterState = TRUE; + } + } + if (DOMAINDECOMP(cr) && bMasterState) + { + dd_collect_state(cr->dd, state, state_global); + } + } + + if (DOMAINDECOMP(cr)) + { + /* Repartition the domain decomposition */ + wallcycle_start(wcycle, ewcDOMDEC); + dd_partition_system(fplog, step, cr, + bMasterState, nstglobalcomm, + state_global, top_global, ir, + state, &f, mdatoms, top, fr, + vsite, shellfc, constr, + nrnb, wcycle, + do_verbose && !bPMETuneRunning); + wallcycle_stop(wcycle, ewcDOMDEC); + /* If using an iterative integrator, reallocate space to match the decomposition */ + } + } + + if (MASTER(cr) && do_log && !bFFscan) + { + print_ebin_header(fplog, step, t, state->lambda[efptFEP]); /* can we improve the information printed here? */ + } + + if (ir->efep != efepNO) + { + update_mdatoms(mdatoms, state->lambda[efptMASS]); + } + + if ((bRerunMD && rerun_fr.bV) || bExchanged) + { + + /* We need the kinetic energy at minus the half step for determining + * the full step kinetic energy and possibly for T-coupling.*/ + /* This may not be quite working correctly yet . . . . 
*/
+ compute_globals(fplog, gstat, cr, ir, fr, ekind, state, state_global, mdatoms, nrnb, vcm,
+ wcycle, enerd, NULL, NULL, NULL, NULL, mu_tot,
+ constr, NULL, FALSE, state->box,
+ top_global, &pcurr, top_global->natoms, &bSumEkinhOld,
+ CGLO_RERUNMD | CGLO_GSTAT | CGLO_TEMPERATURE);
+ }
+ clear_mat(force_vir);
+
+ /* Ionize the atoms if necessary */
+ if (bIonize)
+ {
+ ionize(fplog, oenv, mdatoms, top_global, t, ir, state->x, state->v,
+ mdatoms->start, mdatoms->start+mdatoms->homenr, state->box, cr);
+ }
+
+ /* Update force field in ffscan program */
+ if (bFFscan)
+ {
+ if (update_forcefield(fplog,
+ nfile, fnm, fr,
+ mdatoms->nr, state->x, state->box))
+ {
+ gmx_finalize_par();
+
+ exit(0);
+ }
+ }
+
+ /* We write a checkpoint at this MD step when:
+ * either at an NS step when we signalled through gs,
+ * or at the last step (but not when we do not want confout),
+ * but never at the first step or with rerun.
+ */
+ bCPT = (((gs.set[eglsCHKPT] && (bNS || ir->nstlist == 0)) ||
+ (bLastStep && (Flags & MD_CONFOUT))) &&
+ step > ir->init_step && !bRerunMD);
+ if (bCPT)
+ {
+ gs.set[eglsCHKPT] = 0;
+ }
+
+ /* Determine the energy and pressure:
+ * at nstcalcenergy steps and at energy output steps (set below).
+ */
+ if (EI_VV(ir->eI) && (!bInitStep))
+ {
+ /* for vv, the first half of the integration actually corresponds
+ to the previous step. bCalcEner is only required to be evaluated on the 'next' step,
+ but the virial needs to be calculated on both the current step and the 'next' step. Future
+ reorganization may be able to get rid of one of the bCalcVir=TRUE steps. */
+
+ bCalcEner = do_per_step(step-1, ir->nstcalcenergy);
+ bCalcVir = bCalcEner ||
+ (ir->epc != epcNO && (do_per_step(step, ir->nstpcouple) || do_per_step(step-1, ir->nstpcouple)));
+ }
+ else
+ {
+ bCalcEner = do_per_step(step, ir->nstcalcenergy);
+ bCalcVir = bCalcEner ||
+ (ir->epc != epcNO && do_per_step(step, ir->nstpcouple));
+ }
+
+ /* Do we need global communication ? */
+ bGStat = (bCalcVir || bCalcEner || bStopCM ||
+ do_per_step(step, nstglobalcomm) || (bVV && IR_NVT_TROTTER(ir) && do_per_step(step-1, nstglobalcomm)) ||
+ (ir->nstlist == -1 && !bRerunMD && step >= nlh.step_nscheck));
+
+ do_ene = (do_per_step(step, ir->nstenergy) || bLastStep);
+
+ if (do_ene || do_log)
+ {
+ bCalcVir = TRUE;
+ bCalcEner = TRUE;
+ bGStat = TRUE;
+ }
+
+ /* these CGLO_ options remain the same throughout the iteration */
+ cglo_flags = ((bRerunMD ? CGLO_RERUNMD : 0) |
+ (bGStat ? CGLO_GSTAT : 0)
+ );
+
+ force_flags = (GMX_FORCE_STATECHANGED |
+ ((DYNAMIC_BOX(*ir) || bRerunMD) ? GMX_FORCE_DYNAMICBOX : 0) |
+ GMX_FORCE_ALLFORCES |
+ GMX_FORCE_SEPLRF |
+ (bCalcVir ? GMX_FORCE_VIRIAL : 0) |
+ (bCalcEner ? GMX_FORCE_ENERGY : 0) |
+ (bDoFEP ? GMX_FORCE_DHDL : 0)
+ );
+
+ if (fr->bTwinRange)
+ {
+ if (do_per_step(step, ir->nstcalclr))
+ {
+ force_flags |= GMX_FORCE_DO_LR;
+ }
+ }
+
+ if (shellfc)
+ {
+ /* Now is the time to relax the shells */
+ count = relax_shell_flexcon(fplog, cr, bVerbose, bFFscan ? step+1 : step,
+ ir, bNS, force_flags,
+ bStopCM, top, top_global,
+ constr, enerd, fcd,
+ state, f, force_vir, mdatoms,
+ nrnb, wcycle, graph, groups,
+ shellfc, fr, bBornRadii, t, mu_tot,
+ state->natoms, &bConverged, vsite,
+ outf->fp_field);
+ tcount += count;
+
+ if (bConverged)
+ {
+ nconverged++;
+ }
+ }
+ else
+ {
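/* [Illustrative aside, not part of this patch] force_flags above is assembled
 * as a bitmask so that one integer tells do_force() what to compute this step.
 * The pattern, with hypothetical flag values (the real GMX_FORCE_* constants
 * are defined elsewhere): */
enum {
    FLAG_VIRIAL = 1<<0,  /* accumulate the virial this step   */
    FLAG_ENERGY = 1<<1,  /* accumulate energy terms this step */
    FLAG_DHDL   = 1<<2   /* accumulate dH/dlambda this step   */
};

static int build_flags(int bCalcVir, int bCalcEner, int bDoFEP)
{
    return (bCalcVir  ? FLAG_VIRIAL : 0) |
           (bCalcEner ? FLAG_ENERGY : 0) |
           (bDoFEP    ? FLAG_DHDL   : 0);
}
/* consumers then test individual bits: if (flags & FLAG_ENERGY) { ... } */

+ /* The coordinates (x) are shifted (to get whole molecules)
+ * in do_force.
+ * This is parallelized as well, and does communication too.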
+ * Check comments in sim_util.c + */ + do_force(fplog, cr, ir, step, nrnb, wcycle, top, top_global, groups, + state->box, state->x, &state->hist, + f, force_vir, mdatoms, enerd, fcd, + state->lambda, graph, + fr, vsite, mu_tot, t, outf->fp_field, ed, bBornRadii, + (bNS ? GMX_FORCE_NS : 0) | force_flags); + } + + if (bTCR) + { + mu_aver = calc_mu_aver(cr, state->x, mdatoms->chargeA, + mu_tot, &top_global->mols, mdatoms, gnx, grpindex); + } + + if (bTCR && bFirstStep) + { + tcr = init_coupling(fplog, nfile, fnm, cr, fr, mdatoms, &(top->idef)); + fprintf(fplog, "Done init_coupling\n"); + fflush(fplog); + } + + if (bVV && !bStartingFromCpt && !bRerunMD) + /* ############### START FIRST UPDATE HALF-STEP FOR VV METHODS############### */ + { + if (ir->eI == eiVV && bInitStep) + { + /* if using velocity verlet with full time step Ekin, + * take the first half step only to compute the + * virial for the first step. From there, + * revert back to the initial coordinates + * so that the input is actually the initial step. + */ + copy_rvecn(state->v, cbuf, 0, state->natoms); /* should make this better for parallelizing? */ + } + else + { + /* this is for NHC in the Ekin(t+dt/2) version of vv */ + trotter_update(ir, step, ekind, enerd, state, total_vir, mdatoms, &MassQ, trotter_seq, ettTSEQ1); + } + + /* If we are using twin-range interactions where the long-range component + * is only evaluated every nstcalclr>1 steps, we should do a special update + * step to combine the long-range forces on these steps. + * For nstcalclr=1 this is not done, since the forces would have been added + * directly to the short-range forces already. + */ + bUpdateDoLR = (fr->bTwinRange && do_per_step(step, ir->nstcalclr)); + + update_coords(fplog, step, ir, mdatoms, state, fr->bMolPBC, + f, bUpdateDoLR, fr->f_twin, fcd, + ekind, M, wcycle, upd, bInitStep, etrtVELOCITY1, + cr, nrnb, constr, &top->idef); + + if (bIterativeCase && do_per_step(step-1, ir->nstpcouple) && !bInitStep) + { + gmx_iterate_init(&iterate, TRUE); + } + /* for iterations, we save these vectors, as we will be self-consistently iterating + the calculations */ + + /*#### UPDATE EXTENDED VARIABLES IN TROTTER FORMULATION */ + + /* save the state */ + if (iterate.bIterationActive) + { + copy_coupling_state(state, bufstate, ekind, ekind_save, &(ir->opts)); + } + + bFirstIterate = TRUE; + while (bFirstIterate || iterate.bIterationActive) + { + if (iterate.bIterationActive) + { + copy_coupling_state(bufstate, state, ekind_save, ekind, &(ir->opts)); + if (bFirstIterate && bTrotter) + { + /* The first time through, we need a decent first estimate + of veta(t+dt) to compute the constraints. Do + this by computing the box volume part of the + trotter integration at this time. Nothing else + should be changed by this routine here. If + !(first time), we start with the previous value + of veta. */ + + veta_save = state->veta; + trotter_update(ir, step, ekind, enerd, state, total_vir, mdatoms, &MassQ, trotter_seq, ettTSEQ0); + vetanew = state->veta; + state->veta = veta_save; + } + } + + bOK = TRUE; + if (!bRerunMD || rerun_fr.bV || bForceUpdate) /* Why is rerun_fr.bV here? Unclear. 
*/
+ {
+ update_constraints(fplog, step, NULL, ir, ekind, mdatoms,
+ state, fr->bMolPBC, graph, f,
+ &top->idef, shake_vir, NULL,
+ cr, nrnb, wcycle, upd, constr,
+ bInitStep, TRUE, bCalcVir, vetanew);
+
+ if (!bOK && !bFFscan)
+ {
+ gmx_fatal(FARGS, "Constraint error: Shake, Lincs or Settle could not solve the constraints");
+ }
+
+ }
+ else if (graph)
+ {
+ /* Need to unshift here if a do_force has been
+ called in the previous step */
+ unshift_self(graph, state->box, state->x);
+ }
+
+ /* if VV, compute the pressure and constraints */
+ /* For VV2, we strictly only need this if using pressure
+ * control, but we really would like to have accurate pressures
+ * printed out.
+ * Think about ways around this in the future?
+ * For now, keep this choice in comments.
+ */
+ /*bPres = (ir->eI==eiVV || IR_NPT_TROTTER(ir)); */
+ /*bTemp = ((ir->eI==eiVV &&(!bInitStep)) || (ir->eI==eiVVAK && IR_NPT_TROTTER(ir)));*/
+ bPres = TRUE;
+ bTemp = ((ir->eI == eiVV && (!bInitStep)) || (ir->eI == eiVVAK));
+ if (bCalcEner && ir->eI == eiVVAK) /*MRS: 7/9/2010 -- this still doesn't fix it?*/
+ {
+ bSumEkinhOld = TRUE;
+ }
+ /* for vv, the first half of the integration actually corresponds to the previous step.
+ So we need information from the last step in the first half of the integration */
+ if (bGStat || do_per_step(step-1, nstglobalcomm))
+ {
+ compute_globals(fplog, gstat, cr, ir, fr, ekind, state, state_global, mdatoms, nrnb, vcm,
+ wcycle, enerd, force_vir, shake_vir, total_vir, pres, mu_tot,
+ constr, NULL, FALSE, state->box,
+ top_global, &pcurr, top_global->natoms, &bSumEkinhOld,
+ cglo_flags
+ | CGLO_ENERGY
+ | (bTemp ? CGLO_TEMPERATURE : 0)
+ | (bPres ? CGLO_PRESSURE : 0)
+ | (bPres ? CGLO_CONSTRAINT : 0)
+ | ((iterate.bIterationActive) ? CGLO_ITERATE : 0)
+ | (bFirstIterate ? CGLO_FIRSTITERATE : 0)
+ | CGLO_SCALEEKIN
+ );
+ /* explanation of above:
+ a) We compute Ekin at the full time step
+ if 1) we are using the AveVel Ekin, and it's not the
+ initial step, or 2) if we are using AveEkin, but need the full
+ time step kinetic energy for the pressure (always true now, since we want accurate statistics).
+ b) If we are using EkinAveEkin for the kinetic energy for the temperature control, we still feed in
+ EkinAveVel because it's needed for the pressure */
+ }
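/* [Illustrative aside, not part of this patch] The trotter_update() calls in
 * this region apply individual factors of a symmetric Trotter splitting of the
 * Liouville operator (the ettTSEQ* sequences select which factors run when).
 * Schematically, in the style of the MTK/Tuckerman schemes: */

    e^{iL\,\Delta t} \approx
        e^{iL_{\mathrm{T}}\,\Delta t/2}\;
        e^{iL_{v}\,\Delta t/2}\;
        e^{iL_{x}\,\Delta t}\;
        e^{iL_{v}\,\Delta t/2}\;
        e^{iL_{\mathrm{T}}\,\Delta t/2}

/* where iL_x propagates positions, iL_v the velocities, and iL_T the
 * thermostat/barostat variables; each exponential factor corresponds to one
 * update call in the code below. */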
+ /* temperature scaling and pressure scaling to produce the extended variables at t+dt */
+ if (!bInitStep)
+ {
+ if (bTrotter)
+ {
+ m_add(force_vir, shake_vir, total_vir); /* we need the un-dispersion corrected total vir here */
+ trotter_update(ir, step, ekind, enerd, state, total_vir, mdatoms, &MassQ, trotter_seq, ettTSEQ2);
+ }
+ else
+ {
+ if (bExchanged)
+ {
+
+ /* We need the kinetic energy at minus the half step for determining
+ * the full step kinetic energy and possibly for T-coupling.*/
+ /* This may not be quite working correctly yet . . . .
+ */
+ compute_globals(fplog, gstat, cr, ir, fr, ekind, state, state_global, mdatoms, nrnb, vcm,
+ wcycle, enerd, NULL, NULL, NULL, NULL, mu_tot,
+ constr, NULL, FALSE, state->box,
+ top_global, &pcurr, top_global->natoms, &bSumEkinhOld,
+ CGLO_RERUNMD | CGLO_GSTAT | CGLO_TEMPERATURE);
+ }
+ }
+ }
+
+ if (iterate.bIterationActive &&
+ done_iterating(cr, fplog, step, &iterate, bFirstIterate,
+ state->veta, &vetanew))
+ {
+ break;
+ }
+ bFirstIterate = FALSE;
+ }
+
+ if (bTrotter && !bInitStep)
+ {
+ copy_mat(shake_vir, state->svir_prev);
+ copy_mat(force_vir, state->fvir_prev);
+ if (IR_NVT_TROTTER(ir) && ir->eI == eiVV)
+ {
+ /* update temperature and kinetic energy now that step is over - this is the v(t+dt) point */
+ enerd->term[F_TEMP] = sum_ekin(&(ir->opts), ekind, NULL, (ir->eI == eiVV), FALSE, FALSE);
+ enerd->term[F_EKIN] = trace(ekind->ekin);
+ }
+ }
+ /* if it's the initial step, we performed this first step just to get the constraint virial */
+ if (bInitStep && ir->eI == eiVV)
+ {
+ copy_rvecn(cbuf, state->v, 0, state->natoms);
+ }
+ }
+
+ /* MRS -- now done iterating -- compute the conserved quantity */
+ if (bVV)
+ {
+ saved_conserved_quantity = compute_conserved_from_auxiliary(ir, state, &MassQ);
+ if (ir->eI == eiVV)
+ {
+ last_ekin = enerd->term[F_EKIN];
+ }
+ if ((ir->eDispCorr != edispcEnerPres) && (ir->eDispCorr != edispcAllEnerPres))
+ {
+ saved_conserved_quantity -= enerd->term[F_DISPCORR];
+ }
+ /* sum up the foreign energy and dhdl terms for vv. currently done every step so that dhdl is correct in the .edr */
+ if (!bRerunMD)
+ {
+ sum_dhdl(enerd, state->lambda, ir->fepvals);
+ }
+ }
+
+ /* ######## END FIRST UPDATE STEP ############## */
+ /* ######## If doing VV, we now have v(dt) ###### */
+ if (bDoExpanded)
+ {
+ /* perform extended ensemble sampling in lambda - we don't
+ actually move to the new state before outputting
+ statistics, but if performing simulated tempering, we
+ do update the velocities and the tau_t. */
+
+ lamnew = ExpandedEnsembleDynamics(fplog, ir, enerd, state, &MassQ, &df_history, step, mcrng, state->v, mdatoms);
+ }
+ /* ################## START TRAJECTORY OUTPUT ################# */
+
+ /* Now we have the energies and forces corresponding to the
+ * coordinates at time t. We must output all of this before
+ * the update.
+ * for RerunMD t is read from input trajectory
+ */
+ mdof_flags = 0;
+ if (do_per_step(step, ir->nstxout))
+ {
+ mdof_flags |= MDOF_X;
+ }
+ if (do_per_step(step, ir->nstvout))
+ {
+ mdof_flags |= MDOF_V;
+ }
+ if (do_per_step(step, ir->nstfout))
+ {
+ mdof_flags |= MDOF_F;
+ }
+ if (do_per_step(step, ir->nstxtcout))
+ {
+ mdof_flags |= MDOF_XTC;
+ }
+ if (bCPT)
+ {
+ mdof_flags |= MDOF_CPT;
+ }
+
+#if defined(GMX_FAHCORE) || defined(GMX_WRITELASTSTEP)
+ if (bLastStep)
+ {
+ /* Enforce writing positions and velocities at end of run */
+ mdof_flags |= (MDOF_X | MDOF_V);
+ }
+#endif
+#ifdef GMX_FAHCORE
+ if (MASTER(cr))
+ {
+ fcReportProgress( ir->nsteps, step );
+ }
+
+ /* sync bCPT and fc record-keeping */
+ if (bCPT && MASTER(cr))
+ {
+ fcRequestCheckPoint();
+ }
+#endif
+
+ if (mdof_flags != 0)
+ {
+ wallcycle_start(wcycle, ewcTRAJ);
+ if (bCPT)
+ {
+ if (state->flags & (1<<estLD_RNG))
+ {
+ get_stochd_state(upd, state);
+ }
+ if (state->flags & (1<<estMC_RNG))
+ {
+ get_mc_state(mcrng, state);
+ }
+ if (MASTER(cr))
+ {
+ if (bSumEkinhOld)
+ {
+ state_global->ekinstate.bUpToDate = FALSE;
+ }
+ else
+ {
+ update_ekinstate(&state_global->ekinstate, ekind);
+ state_global->ekinstate.bUpToDate = TRUE;
+ }
+ update_energyhistory(&state_global->enerhist, mdebin);
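/* [Illustrative aside, not part of this patch] When bCPT is set, the RNG and
 * kinetic-energy state are pulled into state_global before write_traj() emits
 * the checkpoint. The core pattern is an atomic snapshot: serialize to a
 * temporary file, then rename. A sketch with a hypothetical state struct: */
#include <stdio.h>

typedef struct { long step; double t; unsigned int rng[2]; } chk_t;

static int write_checkpoint_sketch(const char *fn, const chk_t *s)
{
    char  tmp[1024];
    FILE *fp;

    snprintf(tmp, sizeof(tmp), "%s.tmp", fn);
    fp = fopen(tmp, "wb");
    if (fp == NULL)
    {
        return -1;
    }
    if (fwrite(s, sizeof(*s), 1, fp) != 1)
    {
        fclose(fp);
        return -1;               /* incomplete snapshot is never installed */
    }
    fclose(fp);
    return rename(tmp, fn);      /* atomic replace on POSIX filesystems    */
}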
+ if (ir->efep != efepNO || ir->bSimTemp)
+ {
+ state_global->fep_state = state->fep_state; /* MRS: seems kludgy. The code should be
+ structured so this isn't necessary.
+ Note this reassignment is only necessary
+ for single threads.*/
+ copy_df_history(&state_global->dfhist, &df_history);
+ }
+ }
+ }
+ write_traj(fplog, cr, outf, mdof_flags, top_global,
+ step, t, state, state_global, f, f_global, &n_xtc, &x_xtc);
+ if (bCPT)
+ {
+ nchkpt++;
+ bCPT = FALSE;
+ }
+ debug_gmx();
+ if (bLastStep && step_rel == ir->nsteps &&
+ (Flags & MD_CONFOUT) && MASTER(cr) &&
+ !bRerunMD && !bFFscan)
+ {
+ /* x and v have been collected in write_traj,
+ * because a checkpoint file will always be written
+ * at the last step.
+ */
+ fprintf(stderr, "\nWriting final coordinates.\n");
+ if (fr->bMolPBC)
+ {
+ /* Make molecules whole only for confout writing */
+ do_pbc_mtop(fplog, ir->ePBC, state->box, top_global, state_global->x);
+ }
+ write_sto_conf_mtop(ftp2fn(efSTO, nfile, fnm),
+ *top_global->name, top_global,
+ state_global->x, state_global->v,
+ ir->ePBC, state->box);
+ debug_gmx();
+ }
+ wallcycle_stop(wcycle, ewcTRAJ);
+ }
+
+ /* kludge -- virial is lost with restart for NPT control. Must restart */
+ if (bStartingFromCpt && bVV)
+ {
+ copy_mat(state->svir_prev, shake_vir);
+ copy_mat(state->fvir_prev, force_vir);
+ }
+ /* ################## END TRAJECTORY OUTPUT ################ */
+
+ /* Determine the wallclock run time up till now */
+ run_time = gmx_gettime() - (double)runtime->real;
+
+ /* Check whether everything is still allright */
+ if (((int)gmx_get_stop_condition() > handled_stop_condition)
+#ifdef GMX_THREAD_MPI
+ && MASTER(cr)
+#endif
+ )
+ {
+ /* this is just to make gs.sig compatible with the hack
+ of sending signals around by MPI_Reduce together with
+ other floats */
+ if (gmx_get_stop_condition() == gmx_stop_cond_next_ns)
+ {
+ gs.sig[eglsSTOPCOND] = 1;
+ }
+ if (gmx_get_stop_condition() == gmx_stop_cond_next)
+ {
+ gs.sig[eglsSTOPCOND] = -1;
+ }
+ /* < 0 means stop at next step, > 0 means stop at next NS step */
+ if (fplog)
+ {
+ fprintf(fplog,
+ "\n\nReceived the %s signal, stopping at the next %sstep\n\n",
+ gmx_get_signal_name(),
+ gs.sig[eglsSTOPCOND] == 1 ? "NS " : "");
+ fflush(fplog);
+ }
+ fprintf(stderr,
+ "\n\nReceived the %s signal, stopping at the next %sstep\n\n",
+ gmx_get_signal_name(),
+ gs.sig[eglsSTOPCOND] == 1 ? "NS " : "");
+ fflush(stderr);
+ handled_stop_condition = (int)gmx_get_stop_condition();
+ }
+ else if (MASTER(cr) && (bNS || ir->nstlist <= 0) &&
+ (max_hours > 0 && run_time > max_hours*60.0*60.0*0.99) &&
+ gs.sig[eglsSTOPCOND] == 0 && gs.set[eglsSTOPCOND] == 0)
+ {
+ /* Signal to terminate the run */
+ gs.sig[eglsSTOPCOND] = 1;
+ if (fplog)
+ {
+ fprintf(fplog, "\nStep %s: Run time exceeded %.3f hours, will terminate the run\n", gmx_step_str(step, sbuf), max_hours*0.99);
+ }
+ fprintf(stderr, "\nStep %s: Run time exceeded %.3f hours, will terminate the run\n", gmx_step_str(step, sbuf), max_hours*0.99);
+ }
+
+ if (bResetCountersHalfMaxH && MASTER(cr) &&
+ run_time > max_hours*60.0*60.0*0.495)
+ {
+ gs.sig[eglsRESETCOUNTERS] = 1;
+ }
+
+ if (ir->nstlist == -1 && !bRerunMD)
+ {
+ /* When bGStatEveryStep=FALSE, global_stat is only called
+ * when we check the atom displacements, not at NS steps.
+ * This means that also the bonded interaction count check is not
+ * performed immediately after NS. Therefore a few MD steps could
+ * be performed with missing interactions.
+ * But wrong energies are never written to file,
+ * since energies are only written after global_stat
+ * has been called.
+ */
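/* [Illustrative aside, not part of this patch] As the comment above notes, the
 * stop/checkpoint signals piggy-back on the same floating-point global sum
 * that reduces the energies, so no extra communication round is needed. The
 * encode/decode side of that trick, without MPI (conventions assumed): */
static void encode_signals(const int *sig, double *buf, int nsig)
{
    int i;
    for (i = 0; i < nsig; i++)
    {
        buf[i] = (double)sig[i];   /* append signals to the reduction buffer */
    }
}

static void decode_signals(const double *buf, int *set, int nsig)
{
    int i;
    for (i = 0; i < nsig; i++)
    {
        /* after summing over ranks, any rank signalling +/-1 shows up here */
        set[i] = (buf[i] > 0.5) ? 1 : (buf[i] < -0.5 ? -1 : 0);
    }
}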
+ if (step >= nlh.step_nscheck)
+ {
+ nlh.nabnsb = natoms_beyond_ns_buffer(ir, fr, &top->cgs,
+ nlh.scale_tot, state->x);
+ }
+ else
+ {
+ /* This is not necessarily true,
+ * but step_nscheck is determined quite conservatively.
+ */
+ nlh.nabnsb = 0;
+ }
+ }
+
+ /* In parallel we only have to check for checkpointing in steps
+ * where we do global communication,
+ * otherwise the other nodes don't know.
+ */
+ if (MASTER(cr) && ((bGStat || !PAR(cr)) &&
+ cpt_period >= 0 &&
+ (cpt_period == 0 ||
+ run_time >= nchkpt*cpt_period*60.0)) &&
+ gs.set[eglsCHKPT] == 0)
+ {
+ gs.sig[eglsCHKPT] = 1;
+ }
+
+ /* at the start of step, randomize or scale the velocities (trotter done elsewhere) */
+ if (EI_VV(ir->eI))
+ {
+ if (!bInitStep)
+ {
+ update_tcouple(fplog, step, ir, state, ekind, wcycle, upd, &MassQ, mdatoms);
+ }
+ if (ETC_ANDERSEN(ir->etc)) /* keep this outside of update_tcouple because of the extra info required to pass */
+ {
+ gmx_bool bIfRandomize;
+ bIfRandomize = update_randomize_velocities(ir, step, mdatoms, state, upd, &top->idef, constr);
+ /* if we have constraints, we have to remove the kinetic energy parallel to the bonds */
+ if (constr && bIfRandomize)
+ {
+ update_constraints(fplog, step, NULL, ir, ekind, mdatoms,
+ state, fr->bMolPBC, graph, f,
+ &top->idef, tmp_vir, NULL,
+ cr, nrnb, wcycle, upd, constr,
+ bInitStep, TRUE, bCalcVir, vetanew);
+ }
+ }
+ }
+
+ if (bIterativeCase && do_per_step(step, ir->nstpcouple))
+ {
+ gmx_iterate_init(&iterate, TRUE);
+ /* for iterations, we save these vectors, as we will be redoing the calculations */
+ copy_coupling_state(state, bufstate, ekind, ekind_save, &(ir->opts));
+ }
+
+ bFirstIterate = TRUE;
+ while (bFirstIterate || iterate.bIterationActive)
+ {
+ /* We now restore these vectors to redo the calculation with improved extended variables */
+ if (iterate.bIterationActive)
+ {
+ copy_coupling_state(bufstate, state, ekind_save, ekind, &(ir->opts));
+ }
+
+ /* We make the decision to break or not -after- the calculation of Ekin and Pressure,
+ so scroll down for that logic */
+
+ /* ######### START SECOND UPDATE STEP ################# */
+ /* Box is changed in update() when we do pressure coupling,
+ * but we should still use the old box for energy corrections and when
+ * writing it to the energy file, so it matches the trajectory files for
+ * the same timestep above. Make a copy in a separate array.
+ */
+ copy_mat(state->box, lastbox);
+
+ bOK = TRUE;
+ dvdl_constr = 0;
+
+ if (!(bRerunMD && !rerun_fr.bV && !bForceUpdate))
+ {
+ wallcycle_start(wcycle, ewcUPDATE);
+ /* UPDATE PRESSURE VARIABLES IN TROTTER FORMULATION WITH CONSTRAINTS */
+ if (bTrotter)
+ {
+ if (iterate.bIterationActive)
+ {
+ if (bFirstIterate)
+ {
+ scalevir = 1;
+ }
+ else
+ {
+ /* we use a new value of scalevir to converge the iterations faster */
+ scalevir = tracevir/trace(shake_vir);
+ }
+ msmul(shake_vir, scalevir, shake_vir);
+ m_add(force_vir, shake_vir, total_vir);
+ clear_mat(shake_vir);
+ }
+ trotter_update(ir, step, ekind, enerd, state, total_vir, mdatoms, &MassQ, trotter_seq, ettTSEQ3);
+ /* We can only do Berendsen coupling after we have summed
+ * the kinetic energy or virial. Since this happens
+ * in global_state after update, we should only do it at
+ * step % nstlist = 1 with bGStatEveryStep=FALSE.
+ */
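/* [Illustrative aside, not part of this patch] update_tcouple() applies the
 * chosen thermostat; for Berendsen weak coupling the per-step velocity
 * scaling factor is the textbook expression (sketch, no GROMACS types): */
#include <math.h>

static double berendsen_lambda(double T_inst, double T_ref, double dt, double tau)
{
    /* lambda = sqrt(1 + (dt/tau) * (T_ref/T_inst - 1)) */
    if (T_inst <= 0)
    {
        return 1.0;   /* guard against an undefined instantaneous T */
    }
    return sqrt(1.0 + (dt/tau)*(T_ref/T_inst - 1.0));
}
/* velocities are then scaled as v[i] *= lambda on each coupling step */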
+ }
+ else
+ {
+ update_tcouple(fplog, step, ir, state, ekind, wcycle, upd, &MassQ, mdatoms);
+ update_pcouple(fplog, step, ir, state, pcoupl_mu, M, wcycle,
+ upd, bInitStep);
+ }
+
+ if (bVV)
+ {
+ bUpdateDoLR = (fr->bTwinRange && do_per_step(step, ir->nstcalclr));
+
+ /* velocity half-step update */
+ update_coords(fplog, step, ir, mdatoms, state, fr->bMolPBC, f,
+ bUpdateDoLR, fr->f_twin, fcd,
+ ekind, M, wcycle, upd, FALSE, etrtVELOCITY2,
+ cr, nrnb, constr, &top->idef);
+ }
+
+ /* Above, initialize just copies ekinh into ekin,
+ * it doesn't copy position (for VV),
+ * and entire integrator for MD.
+ */
+
+ if (ir->eI == eiVVAK)
+ {
+ copy_rvecn(state->x, cbuf, 0, state->natoms);
+ }
+ bUpdateDoLR = (fr->bTwinRange && do_per_step(step, ir->nstcalclr));
+
+ update_coords(fplog, step, ir, mdatoms, state, fr->bMolPBC, f,
+ bUpdateDoLR, fr->f_twin, fcd,
+ ekind, M, wcycle, upd, bInitStep, etrtPOSITION, cr, nrnb, constr, &top->idef);
+ wallcycle_stop(wcycle, ewcUPDATE);
+
+ update_constraints(fplog, step, &dvdl_constr, ir, ekind, mdatoms, state,
+ fr->bMolPBC, graph, f,
+ &top->idef, shake_vir, force_vir,
+ cr, nrnb, wcycle, upd, constr,
+ bInitStep, FALSE, bCalcVir, state->veta);
+
+ if (ir->eI == eiVVAK)
+ {
+ /* erase F_EKIN and F_TEMP here? */
+ /* just compute the kinetic energy at the half step to perform a trotter step */
+ compute_globals(fplog, gstat, cr, ir, fr, ekind, state, state_global, mdatoms, nrnb, vcm,
+ wcycle, enerd, force_vir, shake_vir, total_vir, pres, mu_tot,
+ constr, NULL, FALSE, lastbox,
+ top_global, &pcurr, top_global->natoms, &bSumEkinhOld,
+ cglo_flags | CGLO_TEMPERATURE
+ );
+ wallcycle_start(wcycle, ewcUPDATE);
+ trotter_update(ir, step, ekind, enerd, state, total_vir, mdatoms, &MassQ, trotter_seq, ettTSEQ4);
+ /* now we know the scaling, we can compute the positions again */
+ copy_rvecn(cbuf, state->x, 0, state->natoms);
+
+ bUpdateDoLR = (fr->bTwinRange && do_per_step(step, ir->nstcalclr));
+
+ update_coords(fplog, step, ir, mdatoms, state, fr->bMolPBC, f,
+ bUpdateDoLR, fr->f_twin, fcd,
+ ekind, M, wcycle, upd, bInitStep, etrtPOSITION, cr, nrnb, constr, &top->idef);
+ wallcycle_stop(wcycle, ewcUPDATE);
+
+ /* do we need an extra constraint here? just need to copy out of state->v to upd->xp? */
+ /* are the small terms in the shake_vir here due
+ * to numerical errors, or are they important
+ * physically? I'm thinking they are just errors, but not completely sure.
+ * For now, will call without actually constraining, constr=NULL*/
+ update_constraints(fplog, step, NULL, ir, ekind, mdatoms,
+ state, fr->bMolPBC, graph, f,
+ &top->idef, tmp_vir, force_vir,
+ cr, nrnb, wcycle, upd, NULL,
+ bInitStep, FALSE, bCalcVir,
+ state->veta);
+ }
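/* [Illustrative aside, not part of this patch] update_constraints() above is
 * where SHAKE/LINCS/SETTLE enforce the bond constraints after the position
 * update. For a single bond the SHAKE correction can be written directly
 * (sketch in 1-D for brevity, all names assumed): */
static void shake_bond_1d(double *x1, double *x2, double m1, double m2,
                          double d0, double tol, int maxit)
{
    int it;
    for (it = 0; it < maxit; it++)
    {
        double d    = *x2 - *x1;
        double diff = d*d - d0*d0;              /* constraint violation      */
        double g;
        if (diff > -tol && diff < tol)
        {
            break;                              /* converged                 */
        }
        g    = diff/(2.0*d*(1.0/m1 + 1.0/m2));  /* Lagrange-multiplier step  */
        *x1 += g/m1;                            /* displace in proportion to */
        *x2 -= g/m2;                            /* the inverse masses        */
    }
}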
+ if (!bOK && !bFFscan)
+ {
+ gmx_fatal(FARGS, "Constraint error: Shake, Lincs or Settle could not solve the constraints");
+ }
+
+ if (fr->bSepDVDL && fplog && do_log)
+ {
+ fprintf(fplog, sepdvdlformat, "Constraint dV/dl", 0.0, dvdl_constr);
+ }
+ if (bVV)
+ {
+ /* this factor of 2 correction is necessary
+ because half of the constraint force is removed
+ in the vv step, so we have to double it. See
+ the Redmine issue #1255. It is not yet clear
+ if the factor of 2 is exact, or just a very
+ good approximation, and this will be
+ investigated. The next step is to see if this
+ can be done adding a dhdl contribution from the
+ rattle step, but this is somewhat more
+ complicated with the current code. Will be
+ investigated, hopefully for 4.6.3. However,
+ this current solution is much better than
+ having it completely wrong.
+ */
+ enerd->term[F_DVDL_CONSTR] += 2*dvdl_constr;
+ }
+ else
+ {
+ enerd->term[F_DVDL_CONSTR] += dvdl_constr;
+ }
+ }
+ else if (graph)
+ {
+ /* Need to unshift here */
+ unshift_self(graph, state->box, state->x);
+ }
+
+ if (vsite != NULL)
+ {
+ wallcycle_start(wcycle, ewcVSITECONSTR);
+ if (graph != NULL)
+ {
+ shift_self(graph, state->box, state->x);
+ }
+ construct_vsites(fplog, vsite, state->x, nrnb, ir->delta_t, state->v,
+ top->idef.iparams, top->idef.il,
+ fr->ePBC, fr->bMolPBC, graph, cr, state->box);
+
+ if (graph != NULL)
+ {
+ unshift_self(graph, state->box, state->x);
+ }
+ wallcycle_stop(wcycle, ewcVSITECONSTR);
+ }
+
+ /* ############## IF NOT VV, Calculate globals HERE, also iterate constraints ############ */
+ /* With Leap-Frog we can skip compute_globals at
+ * non-communication steps, but we need to calculate
+ * the kinetic energy one step before communication.
+ */
+ if (bGStat || (!EI_VV(ir->eI) && do_per_step(step+1, nstglobalcomm)))
+ {
+ if (ir->nstlist == -1 && bFirstIterate)
+ {
+ gs.sig[eglsNABNSB] = nlh.nabnsb;
+ }
+ compute_globals(fplog, gstat, cr, ir, fr, ekind, state, state_global, mdatoms, nrnb, vcm,
+ wcycle, enerd, force_vir, shake_vir, total_vir, pres, mu_tot,
+ constr,
+ bFirstIterate ? &gs : NULL,
+ (step_rel % gs.nstms == 0) &&
+ (multisim_nsteps < 0 || (step_rel < multisim_nsteps)),
+ lastbox,
+ top_global, &pcurr, top_global->natoms, &bSumEkinhOld,
+ cglo_flags
+ | (!EI_VV(ir->eI) || bRerunMD ? CGLO_ENERGY : 0)
+ | (!EI_VV(ir->eI) && bStopCM ? CGLO_STOPCM : 0)
+ | (!EI_VV(ir->eI) ? CGLO_TEMPERATURE : 0)
+ | (!EI_VV(ir->eI) || bRerunMD ? CGLO_PRESSURE : 0)
+ | (iterate.bIterationActive ? CGLO_ITERATE : 0)
+ | (bFirstIterate ? CGLO_FIRSTITERATE : 0)
+ | CGLO_CONSTRAINT
+ );
+ if (ir->nstlist == -1 && bFirstIterate)
+ {
+ nlh.nabnsb = gs.set[eglsNABNSB];
+ gs.set[eglsNABNSB] = 0;
+ }
+ }
+ /* bIterate is set to keep it from eliminating the old ekin kinetic energy terms */
+ /* ############# END CALC EKIN AND PRESSURE ################# */
+
+ /* Note: this is OK, but there are some numerical precision issues with using the convergence of
+ the virial that should probably be addressed eventually. state->veta has better properties,
+ but what we actually need entering the new cycle is the new shake_vir value. Ideally, we could
+ generate the new shake_vir, but test the veta value for convergence. This will take some thought. */
+
+ if (iterate.bIterationActive &&
+ done_iterating(cr, fplog, step, &iterate, bFirstIterate,
+ trace(shake_vir), &tracevir))
+ {
+ break;
+ }
+ bFirstIterate = FALSE;
+ }
+
+ if (!bVV || bRerunMD)
+ {
+ /* sum up the foreign energy and dhdl terms for md and sd. currently done every step so that dhdl is correct in the .edr */
+ sum_dhdl(enerd, state->lambda, ir->fepvals);
+ }
+ update_box(fplog, step, ir, mdatoms, state, graph, f,
+ ir->nstlist == -1 ? &nlh.scale_tot : NULL, pcoupl_mu, nrnb, wcycle, upd, bInitStep, FALSE);
+
+ /* ################# END UPDATE STEP 2 ################# */
+ /* #### We now have r(t+dt) and v(t+dt/2) ############# */
+
+ /* The coordinates (x) were unshifted in update */
+ if (bFFscan && (shellfc == NULL || bConverged))
+ {
+ if (print_forcefield(fplog, enerd->term, mdatoms->homenr,
+ f, NULL, xcopy,
+ &(top_global->mols), mdatoms->massT, pres))
+ {
+ gmx_finalize_par();
+
+ fprintf(stderr, "\n");
+ exit(0);
+ }
+ }
+ if (!bGStat)
+ {
+ /* We will not sum ekinh_old,
+ * so signal that we still have to do it.
+        /* bIterate is set to keep it from eliminating the old kinetic energy (ekinh) terms */
+        /* #############  END CALC EKIN AND PRESSURE ################# */
+
+        /* Note: this is OK, but there are some numerical precision issues with using the convergence of
+           the virial that should probably be addressed eventually. state->veta has better properties,
+           but what we actually need entering the new cycle is the new shake_vir value. Ideally, we could
+           generate the new shake_vir, but test the veta value for convergence. This will take some thought. */
+
+        if (iterate.bIterationActive &&
+            done_iterating(cr, fplog, step, &iterate, bFirstIterate,
+                           trace(shake_vir), &tracevir))
+        {
+            break;
+        }
+        bFirstIterate = FALSE;
+    }
+
+    if (!bVV || bRerunMD)
+    {
+        /* Sum up the foreign energy and dhdl terms for md and sd.
+           Currently done every step so that dhdl is correct in the .edr */
+        sum_dhdl(enerd, state->lambda, ir->fepvals);
+    }
+    update_box(fplog, step, ir, mdatoms, state, graph, f,
+               ir->nstlist == -1 ? &nlh.scale_tot : NULL, pcoupl_mu, nrnb, wcycle, upd, bInitStep, FALSE);
+
+    /* ################# END UPDATE STEP 2 ################# */
+    /* #### We now have r(t+dt) and v(t+dt/2)  ############# */
+
+    /* The coordinates (x) were unshifted in update */
+    if (bFFscan && (shellfc == NULL || bConverged))
+    {
+        if (print_forcefield(fplog, enerd->term, mdatoms->homenr,
+                             f, NULL, xcopy,
+                             &(top_global->mols), mdatoms->massT, pres))
+        {
+            gmx_finalize_par();
+
+            fprintf(stderr, "\n");
+            exit(0);
+        }
+    }
+    if (!bGStat)
+    {
+        /* We will not sum ekinh_old,
+         * so signal that we still have to do it.
+         */
+        bSumEkinhOld = TRUE;
+    }
+
+    if (bTCR)
+    {
+        /* Only do GCT when the relaxation of shells (minimization) has converged,
+         * otherwise we might be coupling to bogus energies.
+         * In parallel we must always do this, because the other sims might
+         * update the FF.
+         */
+
+        /* Since this is called with the new coordinates state->x, I assume
+         * we want the new box state->box too. / EL 20040121
+         */
+        do_coupling(fplog, oenv, nfile, fnm, tcr, t, step, enerd->term, fr,
+                    ir, MASTER(cr),
+                    mdatoms, &(top->idef), mu_aver,
+                    top_global->mols.nr, cr,
+                    state->box, total_vir, pres,
+                    mu_tot, state->x, f, bConverged);
+        debug_gmx();
+    }
+
+    /* #########  BEGIN PREPARING EDR OUTPUT  ########### */
+
+    /* use the velocity determined directly at the last step, not the average over the half steps */
+    if (bTrotter && ir->eI == eiVV)
+    {
+        enerd->term[F_EKIN] = last_ekin;
+    }
+    enerd->term[F_ETOT] = enerd->term[F_EPOT] + enerd->term[F_EKIN];
+
+    if (bVV)
+    {
+        enerd->term[F_ECONSERVED] = enerd->term[F_ETOT] + saved_conserved_quantity;
+    }
+    else
+    {
+        enerd->term[F_ECONSERVED] = enerd->term[F_ETOT] + compute_conserved_from_auxiliary(ir, state, &MassQ);
+    }
+    /* Check for excessively large energies */
+    if (bIonize)
+    {
+#ifdef GMX_DOUBLE
+        real etot_max = 1e200;
+#else
+        real etot_max = 1e30;
+#endif
+        if (fabs(enerd->term[F_ETOT]) > etot_max)
+        {
+            fprintf(stderr, "Energy too large (%g), giving up\n",
+                    enerd->term[F_ETOT]);
+        }
+    }
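+
+    /* The two thresholds above sit far below the floating-point limits
+     * (FLT_MAX is about 3.4e38, DBL_MAX about 1.8e308), so a total energy
+     * beyond them flags a blown-up system well before the value degenerates
+     * to Inf or NaN; in single precision the test is effectively
+     *     fabs(enerd->term[F_ETOT]) > 1e30
+     */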
+    /* #########  END PREPARING EDR OUTPUT  ########### */
+
+    /* Time for performance */
+    if (((step % stepout) == 0) || bLastStep)
+    {
+        runtime_upd_proc(runtime);
+    }
+
+    /* Output stuff */
+    if (MASTER(cr))
+    {
+        gmx_bool do_dr, do_or;
+
+        if (fplog && do_log && bDoExpanded)
+        {
+            /* only needed if doing expanded ensemble */
+            PrintFreeEnergyInfoToFile(fplog, ir->fepvals, ir->expandedvals, ir->bSimTemp ? ir->simtempvals : NULL,
+                                      &df_history, state->fep_state, ir->nstlog, step);
+        }
+        if (!(bStartingFromCpt && (EI_VV(ir->eI))))
+        {
+            if (bCalcEner)
+            {
+                upd_mdebin(mdebin, bDoDHDL, TRUE,
+                           t, mdatoms->tmass, enerd, state,
+                           ir->fepvals, ir->expandedvals, lastbox,
+                           shake_vir, force_vir, total_vir, pres,
+                           ekind, mu_tot, constr);
+            }
+            else
+            {
+                upd_mdebin_step(mdebin);
+            }
+
+            do_dr = do_per_step(step, ir->nstdisreout);
+            do_or = do_per_step(step, ir->nstorireout);
+
+            print_ebin(outf->fp_ene, do_ene, do_dr, do_or, do_log ? fplog : NULL,
+                       step, t,
+                       eprNORMAL, bCompact, mdebin, fcd, groups, &(ir->opts));
+        }
+        if (ir->ePull != epullNO)
+        {
+            pull_print_output(ir->pull, step, t);
+        }
+
+        if (do_per_step(step, ir->nstlog))
+        {
+            if (fflush(fplog) != 0)
+            {
+                gmx_fatal(FARGS, "Cannot flush logfile - maybe you are out of disk space?");
+            }
+        }
+    }
+    if (bDoExpanded)
+    {
+        /* Have to do this part after outputting the logfile and the edr file */
+        state->fep_state = lamnew;
+        for (i = 0; i < efptNR; i++)
+        {
+            state_global->lambda[i] = ir->fepvals->all_lambda[i][lamnew];
+        }
+    }
+    /* Remaining runtime */
+    if (MULTIMASTER(cr) && (do_verbose || gmx_got_usr_signal()) && !bPMETuneRunning)
+    {
+        if (shellfc)
+        {
+            fprintf(stderr, "\n");
+        }
+        print_time(stderr, runtime, step, ir, cr);
+    }
+
+    /* Replica exchange */
+    bExchanged = FALSE;
+    if ((repl_ex_nst > 0) && (step > 0) && !bLastStep &&
+        do_per_step(step, repl_ex_nst))
+    {
+        bExchanged = replica_exchange(fplog, cr, repl_ex,
+                                      state_global, enerd,
+                                      state, step, t);
+
+        if (bExchanged && DOMAINDECOMP(cr))
+        {
+            dd_partition_system(fplog, step, cr, TRUE, 1,
+                                state_global, top_global, ir,
+                                state, &f, mdatoms, top, fr,
+                                vsite, shellfc, constr,
+                                nrnb, wcycle, FALSE);
+        }
+    }
+
+    bFirstStep       = FALSE;
+    bInitStep        = FALSE;
+    bStartingFromCpt = FALSE;
+
+    /* #######  SET VARIABLES FOR NEXT ITERATION IF THEY STILL NEED IT ###### */
+    /* With all integrators, except VV, we need to retain the pressure
+     * at the current step for coupling at the next step.
+     */
+    if ((state->flags & (1<<estPRES_PREV)) &&
+        (bGStatEveryStep ||
+         (ir->nstpcouple > 0 && step % ir->nstpcouple == 0)))
+    {
+        /* Store the pressure in t_state for pressure coupling
+         * at the next MD step.
+         */
+        copy_mat(pres, state->pres_prev);
+    }
+
+    /* #######  END SET VARIABLES FOR NEXT ITERATION ###### */
+
+    if ((membed != NULL) && (!bLastStep))
+    {
+        rescale_membed(step_rel, membed, state_global->x);
+    }
+
+    if (bRerunMD)
+    {
+        if (MASTER(cr))
+        {
+            /* read next frame from input trajectory */
+            bNotLastFrame = read_next_frame(oenv, status, &rerun_fr);
+        }
+
+        if (PAR(cr))
+        {
+            rerun_parallel_comm(cr, &rerun_fr, &bNotLastFrame);
+        }
+    }
+
+    if (!bRerunMD || !rerun_fr.bStep)
+    {
+        /* increase the MD step number */
+        step++;
+        step_rel++;
+    }
+
+    cycles = wallcycle_stop(wcycle, ewcSTEP);
+    if (DOMAINDECOMP(cr) && wcycle)
+    {
+        dd_cycles_add(cr->dd, cycles, ddCyclStep);
+    }
+
+    if (bPMETuneRunning || bPMETuneTry)
+    {
+        /* PME grid + cut-off optimization with GPUs or PME nodes */
+
+        /* Count the total cycles over the last steps */
+        cycles_pmes += cycles;
+
+        /* We can only switch cut-off at NS steps */
+        if (step % ir->nstlist == 0)
+        {
+            if (bPMETuneTry)
+            {
+                if (DDMASTER(cr->dd))
+                {
+                    /* PME node load is too high, start tuning */
+                    bPMETuneRunning = (dd_pme_f_ratio(cr->dd) >= 1.05);
+                }
+                dd_bcast(cr->dd, sizeof(gmx_bool), &bPMETuneRunning);
+
+                if (bPMETuneRunning || step_rel > ir->nstlist*50)
+                {
+                    bPMETuneTry = FALSE;
+                }
+            }
+            if (bPMETuneRunning)
+            {
+                /* init_step might not be a multiple of nstlist,
+                 * but the first cycle is always skipped anyhow.
+                 */
+                bPMETuneRunning =
+                    pme_load_balance(pme_loadbal, cr,
+                                     (bVerbose && MASTER(cr)) ? stderr : NULL,
+                                     fplog,
+                                     ir, state, cycles_pmes,
+                                     fr->ic, fr->nbv, &fr->pmedata,
+                                     step);
+
+                /* Update constants in forcerec/inputrec to keep them in sync with fr->ic */
+                fr->ewaldcoeff = fr->ic->ewaldcoeff;
+                fr->rlist      = fr->ic->rlist;
+                fr->rlistlong  = fr->ic->rlistlong;
+                fr->rcoulomb   = fr->ic->rcoulomb;
+                fr->rvdw       = fr->ic->rvdw;
+            }
+            cycles_pmes = 0;
+        }
+    }
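+
+    /* A hedged summary of the tuning above: the load balancing shifts work
+     * between real space (a larger rcoulomb means more short-range pairs)
+     * and reciprocal space (a coarser PME grid means cheaper FFTs) while
+     * keeping the Ewald accuracy roughly fixed; each candidate setting is
+     * timed over the cycles accumulated in cycles_pmes between
+     * neighbour-search steps, and the fastest one is kept. An illustrative
+     * scaling (an assumption, not the exact tuning rule):
+     *     new_grid_spacing = old_grid_spacing*(new_rcoulomb/old_rcoulomb);
+     * This is also why the cut-offs cached in fr must be copied back from
+     * fr->ic after every switch.
+     */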
+    if (step_rel == wcycle_get_reset_counters(wcycle) ||
+        gs.set[eglsRESETCOUNTERS] != 0)
+    {
+        /* Reset all the counters related to performance over the run */
+        reset_all_counters(fplog, cr, step, &step_rel, ir, wcycle, nrnb, runtime,
+                           fr->nbv != NULL && fr->nbv->bUseGPU ? fr->nbv->cu_nbv : NULL);
+        wcycle_set_reset_counters(wcycle, -1);
+        if (!(cr->duty & DUTY_PME))
+        {
+            /* Tell our PME node to reset its counters */
+            gmx_pme_send_resetcounters(cr, step);
+        }
+        /* Correct max_hours for the elapsed time */
+        max_hours                -= run_time/(60.0*60.0);
+        bResetCountersHalfMaxH    = FALSE;
+        gs.set[eglsRESETCOUNTERS] = 0;
+    }
+
+}
+/* End of main MD loop */
+debug_gmx();
+
+/* Stop measuring the run time */
+runtime_end(runtime);
+
+if (bRerunMD && MASTER(cr))
+{
+    close_trj(status);
+}
+
+if (!(cr->duty & DUTY_PME))
+{
+    /* Tell the PME-only node to finish */
+    gmx_pme_send_finish(cr);
+}
+
+if (MASTER(cr))
+{
+    if (ir->nstcalcenergy > 0 && !bRerunMD)
+    {
+        print_ebin(outf->fp_ene, FALSE, FALSE, FALSE, fplog, step, t,
+                   eprAVER, FALSE, mdebin, fcd, groups, &(ir->opts));
+    }
+}
+
+done_mdoutf(outf);
+
+debug_gmx();
+
+if (ir->nstlist == -1 && nlh.nns > 0 && fplog)
+{
+    fprintf(fplog, "Average neighborlist lifetime: %.1f steps, std.dev.: %.1f steps\n", nlh.s1/nlh.nns, sqrt(nlh.s2/nlh.nns - sqr(nlh.s1/nlh.nns)));
+    fprintf(fplog, "Average number of atoms that crossed the half buffer length: %.1f\n\n", nlh.ab/nlh.nns);
+}
+
+if (pme_loadbal != NULL)
+{
+    pme_loadbal_done(pme_loadbal, cr, fplog,
+                     fr->nbv != NULL && fr->nbv->bUseGPU);
+}
+
+if (shellfc && fplog)
+{
+    fprintf(fplog, "Fraction of iterations that converged: %.2f %%\n",
+            (nconverged*100.0)/step_rel);
+    fprintf(fplog, "Average number of force evaluations per MD step: %.2f\n\n",
+            tcount/step_rel);
+}
+
+if (repl_ex_nst > 0 && MASTER(cr))
+{
+    print_replica_exchange_statistics(fplog, repl_ex);
+}
+
+runtime->nsteps_done = step_rel;
+
+return 0;
+}
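+
+/* The neighborlist-lifetime report above derives a mean and standard
+ * deviation from three running sums: with n = nlh.nns samples, s1 = sum(x)
+ * and s2 = sum(x^2), it prints mean = s1/n and std = sqrt(s2/n - (s1/n)^2).
+ * A minimal, self-contained sketch of that scheme (the names below are
+ * illustrative and not taken from the GROMACS sources):
+ */
+#include <math.h>
+
+typedef struct running_stats_t
+{
+    double n;   /* number of samples            */
+    double s1;  /* running sum of the samples   */
+    double s2;  /* running sum of their squares */
+} running_stats_t;
+
+static void stats_add(running_stats_t *st, double x)
+{
+    st->n  += 1;
+    st->s1 += x;
+    st->s2 += x*x;
+}
+
+static double stats_mean(const running_stats_t *st)
+{
+    return st->s1/st->n;
+}
+
+static double stats_std(const running_stats_t *st)
+{
+    double m = st->s1/st->n;
+
+    /* population standard deviation; this one-pass formula can lose
+     * precision when std << mean, which is acceptable for a diagnostic */
+    return sqrt(st->s2/st->n - m*m);
+}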