src/gromacs/mdlib/csettle.cpp

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
   5  * Copyright (c) 2001-2004, The GROMACS development team.
   6  * Copyright (c) 2013,2014,2015,2016,2017, by the GROMACS development team, led by
   7  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   8  * and including many others, as listed in the AUTHORS file in the
   9  * top-level source directory and at http://www.gromacs.org.
  10  *
  11  * GROMACS is free software; you can redistribute it and/or
  12  * modify it under the terms of the GNU Lesser General Public License
  13  * as published by the Free Software Foundation; either version 2.1
  14  * of the License, or (at your option) any later version.
  15  *
  16  * GROMACS is distributed in the hope that it will be useful,
  17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  19  * Lesser General Public License for more details.
  20  *
  21  * You should have received a copy of the GNU Lesser General Public
  22  * License along with GROMACS; if not, see
  23  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  24  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  25  *
  26  * If you want to redistribute modifications to GROMACS, please
  27  * consider that scientific software is very special. Version
  28  * control is crucial - bugs must be traceable. We will be happy to
  29  * consider code for inclusion in the official distribution, but
  30  * derived work must not be called official GROMACS. Details are found
  31  * in the README & COPYING files - if they are missing, get the
  32  * official version at http://www.gromacs.org.
  33  *
  34  * To help us fund GROMACS development, we humbly ask that you cite
  35  * the research papers on the package. Check out http://www.gromacs.org.
  36  */
  37 #include "gmxpre.h"
  38
  39 #include <assert.h>
  40 #include <stdio.h>
  41
  42 #include <cmath>
  43
  44 #include <algorithm>
  45
  46 #include "gromacs/math/functions.h"
  47 #include "gromacs/math/invertmatrix.h"
  48 #include "gromacs/math/vec.h"
  49 #include "gromacs/mdlib/constr.h"
  50 #include "gromacs/mdtypes/mdatom.h"
  51 #include "gromacs/pbcutil/ishift.h"
  52 #include "gromacs/pbcutil/pbc.h"
  53 #include "gromacs/pbcutil/pbc-simd.h"
  54 #include "gromacs/simd/simd.h"
  55 #include "gromacs/simd/simd_math.h"
  56 #include "gromacs/topology/mtop_util.h"
  57 #include "gromacs/utility/fatalerror.h"
  58 #include "gromacs/utility/gmxassert.h"
  59 #include "gromacs/utility/smalloc.h"
  60
  61 using namespace gmx; // TODO: Remove when this file is moved into gmx namespace
  62
  63 typedef struct
  64 {
  65     real   mO;
  66     real   mH;
  67     real   wh;
  68     real   dOH;
  69     real   dHH;
  70     real   ra;
  71     real   rb;
  72     real   rc;
  73     real   irc2;
  74     /* For projection */
  75     real   imO;
  76     real   imH;
  77     real   invdOH;
  78     real   invdHH;
  79     matrix invmat;
  80 } settleparam_t;
  81
  82 typedef struct gmx_settledata
  83 {
  84     settleparam_t massw;    /* Parameters for SETTLE for coordinates */
  85     settleparam_t mass1;    /* Parameters with all masses 1, for forces */
  86
  87     int           nsettle;  /* The number of settles on our rank */
  88     int          *ow1;      /* Index to OW1 atoms, size nsettle + SIMD padding */
  89     int          *hw2;      /* Index to HW2 atoms, size nsettle + SIMD padding */
  90     int          *hw3;      /* Index to HW3 atoms, size nsettle + SIMD padding */
  91     real         *virfac;   /* Virial factor 0 or 1, size nsettle + SIMD pad. */
  92     int           nalloc;   /* Allocation size of ow1, hw2, hw3, virfac */
  93
  94     bool          bUseSimd; /* Use SIMD intrinsics code, if possible */
  95 } t_gmx_settledata;
  96
  97
  98 static void init_proj_matrix(real invmO, real invmH, real dOH, real dHH,
  99                              matrix inverseCouplingMatrix)
 100 {
 101     /* We normalize the inverse masses with invmO for the matrix inversion.
 102      * so we can keep using masses of almost zero for frozen particles,
 103      * without running out of the float range in invertMatrix.
 104      */
 105     double invmORelative = 1.0;
 106     double invmHRelative = invmH/static_cast<double>(invmO);
 107     double distanceRatio = dHH/static_cast<double>(dOH);
 108
 109     /* Construct the constraint coupling matrix */
 110     matrix mat;
 111     mat[0][0] = invmORelative + invmHRelative;
 112     mat[0][1] = invmORelative*(1.0 - 0.5*gmx::square(distanceRatio));
 113     mat[0][2] = invmHRelative*0.5*distanceRatio;
 114     mat[1][1] = mat[0][0];
 115     mat[1][2] = mat[0][2];
 116     mat[2][2] = invmHRelative + invmHRelative;
 117     mat[1][0] = mat[0][1];
 118     mat[2][0] = mat[0][2];
 119     mat[2][1] = mat[1][2];
 120
 121     invertMatrix(mat, inverseCouplingMatrix);
 122
 123     msmul(inverseCouplingMatrix, 1/invmO, inverseCouplingMatrix);
 124 }
 125
 126 static void settleparam_init(settleparam_t *p,
 127                              real mO, real mH, real invmO, real invmH,
 128                              real dOH, real dHH)
 129 {
 130     /* We calculate parameters in double precision to minimize errors.
 131      * The velocity correction applied during SETTLE coordinate constraining
 132      * introduces a systematic error of approximately 1 bit per atom,
 133      * depending on what the compiler does with the code.
 134      */
 135     double wohh;
 136
 137     p->mO     = mO;
 138     p->mH     = mH;
 139     wohh      = mO + 2.0*mH;
 140     p->wh     = mH/wohh;
 141     p->dOH    = dOH;
 142     p->dHH    = dHH;
 143     double rc = dHH/2.0;
 144     double ra = 2.0*mH*std::sqrt(dOH*dOH - rc*rc)/wohh;
 145     p->rb     = std::sqrt(dOH*dOH - rc*rc) - ra;
 146     p->rc     = rc;
 147     p->ra     = ra;
 148     p->irc2   = 1.0/dHH;
 149
 150     /* For projection: inverse masses and coupling matrix inversion */
 151     p->imO    = invmO;
 152     p->imH    = invmH;
 153
 154     p->invdOH = 1.0/dOH;
 155     p->invdHH = 1.0/dHH;
 156
 157     init_proj_matrix(invmO, invmH, dOH, dHH, p->invmat);
 158
 159     if (debug)
 160     {
 161         fprintf(debug, "wh =%g, rc = %g, ra = %g\n",
 162                 p->wh, p->rc, p->ra);
 163         fprintf(debug, "rb = %g, irc2 = %g, dHH = %g, dOH = %g\n",
 164                 p->rb, p->irc2, p->dHH, p->dOH);
 165     }
 166 }
 167
 168 gmx_settledata_t settle_init(const gmx_mtop_t *mtop)
 169 {
 170     /* Check that we have only one settle type */
 171     int                   settle_type = -1;
 172     gmx_mtop_ilistloop_t  iloop       = gmx_mtop_ilistloop_init(mtop);
 173     t_ilist              *ilist;
 174     int                   nmol;
 175     const int             nral1       = 1 + NRAL(F_SETTLE);
 176     while (gmx_mtop_ilistloop_next(iloop, &ilist, &nmol))
 177     {
 178         for (int i = 0; i < ilist[F_SETTLE].nr; i += nral1)
 179         {
 180             if (settle_type == -1)
 181             {
 182                 settle_type = ilist[F_SETTLE].iatoms[i];
 183             }
 184             else if (ilist[F_SETTLE].iatoms[i] != settle_type)
 185             {
 186                 gmx_fatal(FARGS,
 187                           "The [molecules] section of your topology specifies more than one block of\n"
 188                           "a [moleculetype] with a [settles] block. Only one such is allowed.\n"
 189                           "If you are trying to partition your solvent into different *groups*\n"
 190                           "(e.g. for freezing, T-coupling, etc.), you are using the wrong approach. Index\n"
 191                           "files specify groups. Otherwise, you may wish to change the least-used\n"
 192                           "block of molecules with SETTLE constraints into 3 normal constraints.");
 193             }
 194         }
 195     }
 196     GMX_RELEASE_ASSERT(settle_type >= 0, "settle_init called without settles");
 197
 198     gmx_settledata_t settled;
 199
 200     snew(settled, 1);
 201
 202     /* We will not initialize the normal SETTLE parameters here yet,
 203      * since the atom (inv)masses can depend on the integrator and
 204      * free-energy perturbation. We set mO=-1 to trigger later initialization.
 205      */
 206     settled->massw.mO = -1;
 207
 208     real dOH = mtop->ffparams.iparams[settle_type].settle.doh;
 209     real dHH = mtop->ffparams.iparams[settle_type].settle.dhh;
 210     settleparam_init(&settled->mass1, 1.0, 1.0, 1.0, 1.0, dOH, dHH);
 211
 212     settled->ow1    = nullptr;
 213     settled->hw2    = nullptr;
 214     settled->hw3    = nullptr;
 215     settled->virfac = nullptr;
 216     settled->nalloc = 0;
 217
 218     /* Without SIMD configured, this bool is not used */
 219     settled->bUseSimd = (getenv("GMX_DISABLE_SIMD_KERNELS") == nullptr);
 220
 221     return settled;
 222 }
 223
 224 void settle_free(gmx_settledata_t settled)
 225 {
 226     sfree_aligned(settled->ow1);
 227     sfree_aligned(settled->hw2);
 228     sfree_aligned(settled->hw3);
 229     sfree_aligned(settled->virfac);
 230     sfree(settled);
 231 }
 232
 233 void settle_set_constraints(gmx_settledata_t  settled,
 234                             const t_ilist    *il_settle,
 235                             const t_mdatoms  *mdatoms)
 236 {
 237 #if GMX_SIMD_HAVE_REAL
 238     const int pack_size = GMX_SIMD_REAL_WIDTH;
 239 #else
 240     const int pack_size = 1;
 241 #endif
 242
 243     const int nral1     = 1 + NRAL(F_SETTLE);
 244     int       nsettle   = il_settle->nr/nral1;
 245     settled->nsettle = nsettle;
 246
 247     if (nsettle > 0)
 248     {
 249         const t_iatom *iatoms = il_settle->iatoms;
 250
 251         /* Here we initialize the normal SETTLE parameters */
 252         if (settled->massw.mO < 0)
 253         {
 254             int firstO = iatoms[1];
 255             int firstH = iatoms[2];
 256             settleparam_init(&settled->massw,
 257                              mdatoms->massT[firstO],
 258                              mdatoms->massT[firstH],
 259                              mdatoms->invmass[firstO],
 260                              mdatoms->invmass[firstH],
 261                              settled->mass1.dOH,
 262                              settled->mass1.dHH);
 263         }
 264
 265         if (nsettle + pack_size > settled->nalloc)
 266         {
 267             settled->nalloc = over_alloc_dd(nsettle + pack_size);
 268             sfree_aligned(settled->ow1);
 269             sfree_aligned(settled->hw2);
 270             sfree_aligned(settled->hw3);
 271             sfree_aligned(settled->virfac);
 272             snew_aligned(settled->ow1, settled->nalloc, 64);
 273             snew_aligned(settled->hw2, settled->nalloc, 64);
 274             snew_aligned(settled->hw3, settled->nalloc, 64);
 275             snew_aligned(settled->virfac, settled->nalloc, 64);
 276         }
 277
 278         for (int i = 0; i < nsettle; i++)
 279         {
 280             settled->ow1[i]    = iatoms[i*nral1 + 1];
 281             settled->hw2[i]    = iatoms[i*nral1 + 2];
 282             settled->hw3[i]    = iatoms[i*nral1 + 3];
 283             /* We should avoid double counting of virial contributions for
 284              * SETTLEs that appear in multiple DD domains, so we only count
 285              * the contribution on the home range of the oxygen atom.
 286              */
 287             settled->virfac[i] = (iatoms[i*nral1 + 1] < mdatoms->homenr ? 1 : 0);
 288         }
 289
 290         /* Pack the index array to the full SIMD width with copies from
 291          * the last normal entry, but with no virial contribution.
 292          */
 293         int end_packed = ((nsettle + pack_size - 1)/pack_size)*pack_size;
 294         for (int i = nsettle; i < end_packed; i++)
 295         {
 296             settled->ow1[i]    = settled->ow1[nsettle - 1];
 297             settled->hw2[i]    = settled->hw2[nsettle - 1];
 298             settled->hw3[i]    = settled->hw3[nsettle - 1];
 299             settled->virfac[i] = 0;
 300         }
 301     }
 302 }
 303
 304 void settle_proj(gmx_settledata_t settled, int econq,
 305                  int nsettle, t_iatom iatoms[],
 306                  const t_pbc *pbc,
 307                  rvec x[],
 308                  rvec *der, rvec *derp,
 309                  int calcvir_atom_end, tensor vir_r_m_dder)
 310 {
 311     /* Settle for projection out constraint components
 312      * of derivatives of the coordinates.
 313      * Berk Hess 2008-1-10
 314      */
 315
 316     settleparam_t *p;
 317     real           imO, imH, dOH, dHH, invdOH, invdHH;
 318     matrix         invmat;
 319     int            i, m, m2, ow1, hw2, hw3;
 320     rvec           roh2, roh3, rhh, dc, fc;
 321
 322     calcvir_atom_end *= DIM;
 323
 324     if (econq == econqForce)
 325     {
 326         p = &settled->mass1;
 327     }
 328     else
 329     {
 330         p = &settled->massw;
 331     }
 332     imO    = p->imO;
 333     imH    = p->imH;
 334     copy_mat(p->invmat, invmat);
 335     dOH    = p->dOH;
 336     dHH    = p->dHH;
 337     invdOH = p->invdOH;
 338     invdHH = p->invdHH;
 339
 340 #ifdef PRAGMAS
 341 #pragma ivdep
 342 #endif
 343
 344     const int nral1 = 1 + NRAL(F_SETTLE);
 345
 346     for (i = 0; i < nsettle; i++)
 347     {
 348         ow1 = iatoms[i*nral1 + 1];
 349         hw2 = iatoms[i*nral1 + 2];
 350         hw3 = iatoms[i*nral1 + 3];
 351
 352         if (pbc == nullptr)
 353         {
 354             rvec_sub(x[ow1], x[hw2], roh2);
 355             rvec_sub(x[ow1], x[hw3], roh3);
 356             rvec_sub(x[hw2], x[hw3], rhh);
 357         }
 358         else
 359         {
 360             pbc_dx_aiuc(pbc, x[ow1], x[hw2], roh2);
 361             pbc_dx_aiuc(pbc, x[ow1], x[hw3], roh3);
 362             pbc_dx_aiuc(pbc, x[hw2], x[hw3], rhh);
 363         }
 364         svmul(invdOH, roh2, roh2);
 365         svmul(invdOH, roh3, roh3);
 366         svmul(invdHH, rhh, rhh);
 367         /* 18 flops */
 368
 369         /* Determine the projections of der on the bonds */
 370         clear_rvec(dc);
 371         for (m = 0; m < DIM; m++)
 372         {
 373             dc[0] += (der[ow1][m] - der[hw2][m])*roh2[m];
 374             dc[1] += (der[ow1][m] - der[hw3][m])*roh3[m];
 375             dc[2] += (der[hw2][m] - der[hw3][m])*rhh [m];
 376         }
 377         /* 27 flops */
 378
 379         /* Determine the correction for the three bonds */
 380         mvmul(invmat, dc, fc);
 381         /* 15 flops */
 382
 383         /* Subtract the corrections from derp */
 384         for (m = 0; m < DIM; m++)
 385         {
 386             derp[ow1][m] -= imO*( fc[0]*roh2[m] + fc[1]*roh3[m]);
 387             derp[hw2][m] -= imH*(-fc[0]*roh2[m] + fc[2]*rhh [m]);
 388             derp[hw3][m] -= imH*(-fc[1]*roh3[m] - fc[2]*rhh [m]);
 389         }
 390
 391         /* 45 flops */
 392
 393         if (ow1 < calcvir_atom_end)
 394         {
 395             /* Determining r \dot m der is easy,
 396              * since fc contains the mass weighted corrections for der.
 397              */
 398
 399             for (m = 0; m < DIM; m++)
 400             {
 401                 for (m2 = 0; m2 < DIM; m2++)
 402                 {
 403                     vir_r_m_dder[m][m2] +=
 404                         dOH*roh2[m]*roh2[m2]*fc[0] +
 405                         dOH*roh3[m]*roh3[m2]*fc[1] +
 406                         dHH*rhh [m]*rhh [m2]*fc[2];
 407                 }
 408             }
 409         }
 410     }
 411 }
 412
 413
 414 /* The actual settle code, templated for real/SimdReal and for optimization */
 415 template<typename T, typename TypeBool, int packSize,
 416          typename TypePbc,
 417          bool bCorrectVelocity,
 418          bool bCalcVirial>
 419 static void settleTemplate(const gmx_settledata_t settled,
 420                            int settleStart, int settleEnd,
 421                            const TypePbc pbc,
 422                            const real *x, real *xprime,
 423                            real invdt, real * gmx_restrict v,
 424                            tensor vir_r_m_dr,
 425                            bool *bErrorHasOccurred)
 426 {
 427     /* ******************************************************************* */
 428     /*                                                                  ** */
 429     /*    Original code by Shuichi Miyamoto, last update Oct. 1, 1992   ** */
 430     /*                                                                  ** */
 431     /*    Algorithm changes by Berk Hess:                               ** */
 432     /*    2004-07-15 Convert COM to double precision to avoid drift     ** */
 433     /*    2006-10-16 Changed velocity update to use differences         ** */
 434     /*    2012-09-24 Use oxygen as reference instead of COM             ** */
 435     /*    2016-02    Complete rewrite of the code for SIMD              ** */
 436     /*                                                                  ** */
 437     /*    Reference for the SETTLE algorithm                            ** */
 438     /*           S. Miyamoto et al., J. Comp. Chem., 13, 952 (1992).    ** */
 439     /*                                                                  ** */
 440     /* ******************************************************************* */
 441
 442     assert(settleStart % packSize == 0);
 443     assert(settleEnd   % packSize == 0);
 444
 445     TypeBool       bError = TypeBool(false);
 446
 447     settleparam_t *p    = &settled->massw;
 448     T              wh   = T(p->wh);
 449     T              rc   = T(p->rc);
 450     T              ra   = T(p->ra);
 451     T              rb   = T(p->rb);
 452     T              irc2 = T(p->irc2);
 453     T              mO   = T(p->mO);
 454     T              mH   = T(p->mH);
 455
 456     T              almost_zero = T(1e-12);
 457
 458     T              sum_r_m_dr[DIM][DIM];
 459
 460     if (bCalcVirial)
 461     {
 462         for (int d2 = 0; d2 < DIM; d2++)
 463         {
 464             for (int d = 0; d < DIM; d++)
 465             {
 466                 sum_r_m_dr[d2][d] = T(0);
 467             }
 468         }
 469     }
 470
 471     for (int i = settleStart; i < settleEnd; i += packSize)
 472     {
 473         /* Here we pad up to packSize with copies from the last valid entry.
 474          * This gives correct results, since we store (not increment) all
 475          * output, so we store the same output multiple times.
 476          */
 477         const int *ow1 = settled->ow1 + i;
 478         const int *hw2 = settled->hw2 + i;
 479         const int *hw3 = settled->hw3 + i;
 480
 481         T          x_ow1[DIM], x_hw2[DIM], x_hw3[DIM];
 482
 483         gatherLoadUTranspose<3>(x, ow1, &x_ow1[XX], &x_ow1[YY], &x_ow1[ZZ]);
 484         gatherLoadUTranspose<3>(x, hw2, &x_hw2[XX], &x_hw2[YY], &x_hw2[ZZ]);
 485         gatherLoadUTranspose<3>(x, hw3, &x_hw3[XX], &x_hw3[YY], &x_hw3[ZZ]);
 486
 487         T xprime_ow1[DIM], xprime_hw2[DIM], xprime_hw3[DIM];
 488
 489         gatherLoadUTranspose<3>(xprime, ow1, &xprime_ow1[XX], &xprime_ow1[YY], &xprime_ow1[ZZ]);
 490         gatherLoadUTranspose<3>(xprime, hw2, &xprime_hw2[XX], &xprime_hw2[YY], &xprime_hw2[ZZ]);
 491         gatherLoadUTranspose<3>(xprime, hw3, &xprime_hw3[XX], &xprime_hw3[YY], &xprime_hw3[ZZ]);
 492
 493         T dist21[DIM], dist31[DIM];
 494         T doh2[DIM], doh3[DIM];
 495         T sh_hw2[DIM], sh_hw3[DIM];
 496
 497         pbc_dx_aiuc(pbc, x_hw2, x_ow1, dist21);
 498
 499         pbc_dx_aiuc(pbc, x_hw3, x_ow1, dist31);
 500
 501         /* Tedious way of doing pbc */
 502         pbc_dx_aiuc(pbc, xprime_hw2, xprime_ow1, doh2);
 503         for (int d = 0; d < DIM; d++)
 504         {
 505             sh_hw2[d]     = xprime_hw2[d] - (xprime_ow1[d] + doh2[d]);
 506             xprime_hw2[d] = xprime_hw2[d] - sh_hw2[d];
 507         }
 508         pbc_dx_aiuc(pbc, xprime_hw3, xprime_ow1, doh3);
 509         for (int d = 0; d < DIM; d++)
 510         {
 511             sh_hw3[d]     = xprime_hw3[d] - (xprime_ow1[d] + doh3[d]);
 512             xprime_hw3[d] = xprime_hw3[d] - sh_hw3[d];
 513         }
 514
 515         /* Not calculating the center of mass using the oxygen position
 516          * and the O-H distances, as done below, will make SETTLE
 517          * the largest source of energy drift for simulations of water,
 518          * as then the oxygen coordinate is multiplied by 0.89 at every step,
 519          * which can then transfer a systematic rounding to the oxygen velocity.
 520          */
 521         T a1[DIM], com[DIM];
 522         for (int d = 0; d < DIM; d++)
 523         {
 524             a1[d]  = -(doh2[d] + doh3[d]) * wh;
 525             com[d] = xprime_ow1[d] - a1[d];
 526         }
 527         T b1[DIM];
 528         for (int d = 0; d < DIM; d++)
 529         {
 530             b1[d]  = xprime_hw2[d] - com[d];
 531         }
 532         T c1[DIM];
 533         for (int d = 0; d < DIM; d++)
 534         {
 535             c1[d]  = xprime_hw3[d] - com[d];
 536         }
 537         /* 15 flops */
 538
 539         T xakszd = dist21[YY] * dist31[ZZ] - dist21[ZZ] * dist31[YY];
 540         T yakszd = dist21[ZZ] * dist31[XX] - dist21[XX] * dist31[ZZ];
 541         T zakszd = dist21[XX] * dist31[YY] - dist21[YY] * dist31[XX];
 542         T xaksxd = a1[YY] * zakszd - a1[ZZ] * yakszd;
 543         T yaksxd = a1[ZZ] * xakszd - a1[XX] * zakszd;
 544         T zaksxd = a1[XX] * yakszd - a1[YY] * xakszd;
 545         T xaksyd = yakszd * zaksxd - zakszd * yaksxd;
 546         T yaksyd = zakszd * xaksxd - xakszd * zaksxd;
 547         T zaksyd = xakszd * yaksxd - yakszd * xaksxd;
 548         /* 27 flops */
 549
 550         T axlng = gmx::invsqrt(xaksxd * xaksxd + yaksxd * yaksxd + zaksxd * zaksxd);
 551         T aylng = gmx::invsqrt(xaksyd * xaksyd + yaksyd * yaksyd + zaksyd * zaksyd);
 552         T azlng = gmx::invsqrt(xakszd * xakszd + yakszd * yakszd + zakszd * zakszd);
 553
 554         T trns1[DIM], trns2[DIM], trns3[DIM];
 555
 556         trns1[XX] = xaksxd * axlng;
 557         trns2[XX] = yaksxd * axlng;
 558         trns3[XX] = zaksxd * axlng;
 559         trns1[YY] = xaksyd * aylng;
 560         trns2[YY] = yaksyd * aylng;
 561         trns3[YY] = zaksyd * aylng;
 562         trns1[ZZ] = xakszd * azlng;
 563         trns2[ZZ] = yakszd * azlng;
 564         trns3[ZZ] = zakszd * azlng;
 565         /* 24 flops */
 566
 567         T b0d[2], c0d[2];
 568
 569         for (int d = 0; d < 2; d++)
 570         {
 571             b0d[d] = trns1[d] * dist21[XX] + trns2[d] * dist21[YY] + trns3[d] * dist21[ZZ];
 572             c0d[d] = trns1[d] * dist31[XX] + trns2[d] * dist31[YY] + trns3[d] * dist31[ZZ];
 573         }
 574
 575         T a1d_z, b1d[DIM], c1d[DIM];
 576
 577         a1d_z = trns1[ZZ] * a1[XX] + trns2[ZZ] * a1[YY] + trns3[ZZ] * a1[ZZ];
 578         for (int d = 0; d < DIM; d++)
 579         {
 580             b1d[d] = trns1[d] * b1[XX] + trns2[d] * b1[YY] + trns3[d] * b1[ZZ];
 581             c1d[d] = trns1[d] * c1[XX] + trns2[d] * c1[YY] + trns3[d] * c1[ZZ];
 582         }
 583         /* 65 flops */
 584
 585         T tmp, tmp2;
 586
 587         T sinphi = a1d_z * gmx::invsqrt(ra*ra);
 588         tmp2     = 1.0 - sinphi * sinphi;
 589
 590         /* If tmp2 gets close to or beyond zero we have severly distorted
 591          * water molecules and we should terminate the simulation.
 592          * Below we take the max with almost_zero to continue the loop.
 593          */
 594         bError   = bError || (tmp2 <= almost_zero);
 595
 596         tmp2     = max(tmp2, almost_zero);
 597         tmp      = gmx::invsqrt(tmp2);
 598         T cosphi = tmp2*tmp;
 599         T sinpsi = (b1d[ZZ] - c1d[ZZ]) * irc2 * tmp;
 600         tmp2     = 1.0 - sinpsi * sinpsi;
 601
 602         T cospsi = tmp2*gmx::invsqrt(tmp2);
 603         /* 46 flops */
 604
 605         T a2d_y  =  ra * cosphi;
 606         T b2d_x  = -rc * cospsi;
 607         T t1     = -rb * cosphi;
 608         T t2     =  rc * sinpsi * sinphi;
 609         T b2d_y  =  t1 - t2;
 610         T c2d_y  =  t1 + t2;
 611         /* 7 flops */
 612
 613         /*     --- Step3  al,be,ga            --- */
 614         T alpha  = b2d_x * (b0d[XX] - c0d[XX]) + b0d[YY] * b2d_y + c0d[YY] * c2d_y;
 615         T beta   = b2d_x * (c0d[YY] - b0d[YY]) + b0d[XX] * b2d_y + c0d[XX] * c2d_y;
 616         T gamma  = b0d[XX] * b1d[YY] - b1d[XX] * b0d[YY] + c0d[XX] * c1d[YY] - c1d[XX] * c0d[YY];
 617         T al2be2 = alpha * alpha + beta * beta;
 618         tmp2     = (al2be2 - gamma * gamma);
 619         T sinthe = (alpha * gamma - beta * tmp2*gmx::invsqrt(tmp2)) * gmx::invsqrt(al2be2*al2be2);
 620         /* 47 flops */
 621
 622         /*  --- Step4  A3' --- */
 623         tmp2     = 1.0 - sinthe * sinthe;
 624         T costhe = tmp2*gmx::invsqrt(tmp2);
 625
 626         T a3d[DIM], b3d[DIM], c3d[DIM];
 627
 628         a3d[XX]  = -a2d_y * sinthe;
 629         a3d[YY]  = a2d_y * costhe;
 630         a3d[ZZ]  = a1d_z;
 631         b3d[XX]  = b2d_x * costhe - b2d_y * sinthe;
 632         b3d[YY]  = b2d_x * sinthe + b2d_y * costhe;
 633         b3d[ZZ]  = b1d[ZZ];
 634         c3d[XX]  = -b2d_x * costhe - c2d_y * sinthe;
 635         c3d[YY]  = -b2d_x * sinthe + c2d_y * costhe;
 636         c3d[ZZ]  = c1d[ZZ];
 637         /* 26 flops */
 638
 639         /*    --- Step5  A3 --- */
 640         T a3[DIM], b3[DIM], c3[DIM];
 641
 642         a3[XX] = trns1[XX]*a3d[XX] + trns1[YY]*a3d[YY] + trns1[ZZ]*a3d[ZZ];
 643         a3[YY] = trns2[XX]*a3d[XX] + trns2[YY]*a3d[YY] + trns2[ZZ]*a3d[ZZ];
 644         a3[ZZ] = trns3[XX]*a3d[XX] + trns3[YY]*a3d[YY] + trns3[ZZ]*a3d[ZZ];
 645         b3[XX] = trns1[XX]*b3d[XX] + trns1[YY]*b3d[YY] + trns1[ZZ]*b3d[ZZ];
 646         b3[YY] = trns2[XX]*b3d[XX] + trns2[YY]*b3d[YY] + trns2[ZZ]*b3d[ZZ];
 647         b3[ZZ] = trns3[XX]*b3d[XX] + trns3[YY]*b3d[YY] + trns3[ZZ]*b3d[ZZ];
 648         c3[XX] = trns1[XX]*c3d[XX] + trns1[YY]*c3d[YY] + trns1[ZZ]*c3d[ZZ];
 649         c3[YY] = trns2[XX]*c3d[XX] + trns2[YY]*c3d[YY] + trns2[ZZ]*c3d[ZZ];
 650         c3[ZZ] = trns3[XX]*c3d[XX] + trns3[YY]*c3d[YY] + trns3[ZZ]*c3d[ZZ];
 651         /* 45 flops */
 652
 653         /* Compute and store the corrected new coordinate */
 654         for (int d = 0; d < DIM; d++)
 655         {
 656             xprime_ow1[d] = com[d] + a3[d];
 657         }
 658         for (int d = 0; d < DIM; d++)
 659         {
 660             xprime_hw2[d] = com[d] + b3[d] + sh_hw2[d];;
 661         }
 662         for (int d = 0; d < DIM; d++)
 663         {
 664             xprime_hw3[d] = com[d] + c3[d] + sh_hw3[d];
 665         }
 666         /* 9 flops + 6 pbc flops */
 667
 668         transposeScatterStoreU<3>(xprime, ow1, xprime_ow1[XX], xprime_ow1[YY], xprime_ow1[ZZ]);
 669         transposeScatterStoreU<3>(xprime, hw2, xprime_hw2[XX], xprime_hw2[YY], xprime_hw2[ZZ]);
 670         transposeScatterStoreU<3>(xprime, hw3, xprime_hw3[XX], xprime_hw3[YY], xprime_hw3[ZZ]);
 671
 672         // cppcheck-suppress duplicateExpression
 673         if (bCorrectVelocity || bCalcVirial)
 674         {
 675             T da[DIM], db[DIM], dc[DIM];
 676             for (int d = 0; d < DIM; d++)
 677             {
 678                 da[d] = a3[d] - a1[d];
 679             }
 680             for (int d = 0; d < DIM; d++)
 681             {
 682                 db[d] = b3[d] - b1[d];
 683             }
 684             for (int d = 0; d < DIM; d++)
 685             {
 686                 dc[d] = c3[d] - c1[d];
 687             }
 688             /* 9 flops */
 689
 690             if (bCorrectVelocity)
 691             {
 692                 T v_ow1[DIM], v_hw2[DIM], v_hw3[DIM];
 693
 694                 gatherLoadUTranspose<3>(v, ow1, &v_ow1[XX], &v_ow1[YY], &v_ow1[ZZ]);
 695                 gatherLoadUTranspose<3>(v, hw2, &v_hw2[XX], &v_hw2[YY], &v_hw2[ZZ]);
 696                 gatherLoadUTranspose<3>(v, hw3, &v_hw3[XX], &v_hw3[YY], &v_hw3[ZZ]);
 697
 698                 /* Add the position correction divided by dt to the velocity */
 699                 for (int d = 0; d < DIM; d++)
 700                 {
 701                     v_ow1[d] = gmx::fma(da[d], invdt, v_ow1[d]);
 702                 }
 703                 for (int d = 0; d < DIM; d++)
 704                 {
 705                     v_hw2[d] = gmx::fma(db[d], invdt, v_hw2[d]);
 706                 }
 707                 for (int d = 0; d < DIM; d++)
 708                 {
 709                     v_hw3[d] = gmx::fma(dc[d], invdt, v_hw3[d]);
 710                 }
 711                 /* 3*6 flops */
 712
 713                 transposeScatterStoreU<3>(v, ow1, v_ow1[XX], v_ow1[YY], v_ow1[ZZ]);
 714                 transposeScatterStoreU<3>(v, hw2, v_hw2[XX], v_hw2[YY], v_hw2[ZZ]);
 715                 transposeScatterStoreU<3>(v, hw3, v_hw3[XX], v_hw3[YY], v_hw3[ZZ]);
 716             }
 717
 718             if (bCalcVirial)
 719             {
 720                 /* Filter out the non-local settles */
 721                 T filter = load<T>(settled->virfac + i);
 722                 T mOf    = filter*mO;
 723                 T mHf    = filter*mH;
 724
 725                 T mdo[DIM], mdb[DIM], mdc[DIM];
 726
 727                 for (int d = 0; d < DIM; d++)
 728                 {
 729                     mdb[d] = mHf*db[d];
 730                     mdc[d] = mHf*dc[d];
 731                     mdo[d] = mOf*da[d] + mdb[d] + mdc[d];
 732                 }
 733
 734                 for (int d2 = 0; d2 < DIM; d2++)
 735                 {
 736                     for (int d = 0; d < DIM; d++)
 737                     {
 738                         sum_r_m_dr[d2][d] = sum_r_m_dr[d2][d] -
 739                             (x_ow1[d2]*mdo[d] +
 740                              dist21[d2]*mdb[d] +
 741                              dist31[d2]*mdc[d]);
 742                     }
 743                 }
 744                 /* 71 flops */
 745             }
 746         }
 747     }
 748
 749     if (bCalcVirial)
 750     {
 751         for (int d2 = 0; d2 < DIM; d2++)
 752         {
 753             for (int d = 0; d < DIM; d++)
 754             {
 755                 vir_r_m_dr[d2][d] += reduce(sum_r_m_dr[d2][d]);
 756             }
 757         }
 758     }
 759
 760     *bErrorHasOccurred = anyTrue(bError);
 761 }
 762
 763 /* Wrapper template function that divides the settles over threads
 764  * and instantiates the core template with instantiated booleans.
 765  */
 766 template<typename T, typename TypeBool, int packSize, typename TypePbc>
 767 static void settleTemplateWrapper(gmx_settledata_t settled,
 768                                   int nthread, int thread,
 769                                   TypePbc pbc,
 770                                   const real x[], real xprime[],
 771                                   real invdt, real *v,
 772                                   bool bCalcVirial, tensor vir_r_m_dr,
 773                                   bool *bErrorHasOccurred)
 774 {
 775     /* We need to assign settles to threads in groups of pack_size */
 776     int numSettlePacks = (settled->nsettle + packSize - 1)/packSize;
 777     /* Round the end value up to give thread 0 more work */
 778     int settleStart    = ((numSettlePacks* thread      + nthread - 1)/nthread)*packSize;
 779     int settleEnd      = ((numSettlePacks*(thread + 1) + nthread - 1)/nthread)*packSize;
 780
 781     if (v != nullptr)
 782     {
 783         if (!bCalcVirial)
 784         {
 785             settleTemplate<T, TypeBool, packSize,
 786                            TypePbc,
 787                            true,
 788                            false>
 789                 (settled, settleStart, settleEnd,
 790                 pbc,
 791                 x, xprime,
 792                 invdt, v,
 793                 nullptr,
 794                 bErrorHasOccurred);
 795         }
 796         else
 797         {
 798             settleTemplate<T, TypeBool, packSize,
 799                            TypePbc,
 800                            true,
 801                            true>
 802                 (settled, settleStart, settleEnd,
 803                 pbc,
 804                 x, xprime,
 805                 invdt, v,
 806                 vir_r_m_dr,
 807                 bErrorHasOccurred);
 808         }
 809     }
 810     else
 811     {
 812         if (!bCalcVirial)
 813         {
 814             settleTemplate<T, TypeBool, packSize,
 815                            TypePbc,
 816                            false,
 817                            false>
 818                 (settled, settleStart, settleEnd,
 819                 pbc,
 820                 x, xprime,
 821                 invdt, v,
 822                 nullptr,
 823                 bErrorHasOccurred);
 824         }
 825         else
 826         {
 827             settleTemplate<T, TypeBool, packSize,
 828                            TypePbc,
 829                            false,
 830                            true>
 831                 (settled, settleStart, settleEnd,
 832                 pbc,
 833                 x, xprime,
 834                 invdt, v,
 835                 vir_r_m_dr,
 836                 bErrorHasOccurred);
 837         }
 838     }
 839 }
 840
 841 void csettle(gmx_settledata_t settled,
 842              int nthread, int thread,
 843              const t_pbc *pbc,
 844              const real x[], real xprime[],
 845              real invdt, real *v,
 846              bool bCalcVirial, tensor vir_r_m_dr,
 847              bool *bErrorHasOccurred)
 848 {
 849 #if GMX_SIMD_HAVE_REAL
 850     if (settled->bUseSimd)
 851     {
 852         /* Convert the pbc struct for SIMD */
 853         GMX_ALIGNED(real, GMX_SIMD_REAL_WIDTH) pbcSimd[9*GMX_SIMD_REAL_WIDTH];
 854         set_pbc_simd(pbc, pbcSimd);
 855
 856         settleTemplateWrapper<SimdReal, SimdBool, GMX_SIMD_REAL_WIDTH,
 857                               const real *>(settled,
 858                                             nthread, thread,
 859                                             pbcSimd,
 860                                             x, xprime,
 861                                             invdt,
 862                                             v,
 863                                             bCalcVirial, vir_r_m_dr,
 864                                             bErrorHasOccurred);
 865     }
 866     else
 867 #endif
 868     {
 869         /* This construct is needed because pbc_dx_aiuc doesn't accept pbc=NULL */
 870         t_pbc        pbcNo;
 871         const t_pbc *pbcNonNull;
 872
 873         if (pbc != nullptr)
 874         {
 875             pbcNonNull = pbc;
 876         }
 877         else
 878         {
 879             set_pbc(&pbcNo, epbcNONE, nullptr);
 880             pbcNonNull = &pbcNo;
 881         }
 882
 883         settleTemplateWrapper<real, bool, 1,
 884                               const t_pbc *>(settled,
 885                                              nthread, thread,
 886                                              pbcNonNull,
 887                                              x, xprime,
 888                                              invdt,
 889                                              v,
 890                                              bCalcVirial, vir_r_m_dr,
 891                                              bErrorHasOccurred);
 892     }
 893 }