src/gromacs/mdlib/settle.cpp

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
   5  * Copyright (c) 2001-2004, The GROMACS development team.
   6  * Copyright (c) 2013,2014,2015,2016,2017,2018,2019, by the GROMACS development team, led by
   7  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   8  * and including many others, as listed in the AUTHORS file in the
   9  * top-level source directory and at http://www.gromacs.org.
  10  *
  11  * GROMACS is free software; you can redistribute it and/or
  12  * modify it under the terms of the GNU Lesser General Public License
  13  * as published by the Free Software Foundation; either version 2.1
  14  * of the License, or (at your option) any later version.
  15  *
  16  * GROMACS is distributed in the hope that it will be useful,
  17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  19  * Lesser General Public License for more details.
  20  *
  21  * You should have received a copy of the GNU Lesser General Public
  22  * License along with GROMACS; if not, see
  23  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  24  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  25  *
  26  * If you want to redistribute modifications to GROMACS, please
  27  * consider that scientific software is very special. Version
  28  * control is crucial - bugs must be traceable. We will be happy to
  29  * consider code for inclusion in the official distribution, but
  30  * derived work must not be called official GROMACS. Details are found
  31  * in the README & COPYING files - if they are missing, get the
  32  * official version at http://www.gromacs.org.
  33  *
  34  * To help us fund GROMACS development, we humbly ask that you cite
  35  * the research papers on the package. Check out http://www.gromacs.org.
  36  */
  37 /*! \internal \file
  38  * \brief Defines SETTLE code.
  39  *
  40  * \author Berk Hess <hess@kth.se>
  41  * \author Mark Abraham <mark.j.abraham@gmail.com>
  42  * \ingroup module_mdlib
  43  */
  44 #include "gmxpre.h"
  45
  46 #include "settle.h"
  47
  48 #include <cassert>
  49 #include <cmath>
  50 #include <cstdio>
  51
  52 #include <algorithm>
  53
  54 #include "gromacs/math/functions.h"
  55 #include "gromacs/math/invertmatrix.h"
  56 #include "gromacs/math/vec.h"
  57 #include "gromacs/mdlib/constr.h"
  58 #include "gromacs/mdtypes/mdatom.h"
  59 #include "gromacs/pbcutil/ishift.h"
  60 #include "gromacs/pbcutil/pbc.h"
  61 #include "gromacs/pbcutil/pbc_simd.h"
  62 #include "gromacs/simd/simd.h"
  63 #include "gromacs/simd/simd_math.h"
  64 #include "gromacs/topology/idef.h"
  65 #include "gromacs/topology/ifunc.h"
  66 #include "gromacs/topology/mtop_util.h"
  67 #include "gromacs/utility/fatalerror.h"
  68 #include "gromacs/utility/gmxassert.h"
  69 #include "gromacs/utility/smalloc.h"
  70
  71 namespace gmx
  72 {
  73
  74 struct settleparam_t
  75 {
  76     real mO;
  77     real mH;
  78     real wh;
  79     real dOH;
  80     real dHH;
  81     real ra;
  82     real rb;
  83     real rc;
  84     real irc2;
  85     /* For projection */
  86     real   imO;
  87     real   imH;
  88     real   invdOH;
  89     real   invdHH;
  90     matrix invmat;
  91 };
  92
  93 struct settledata
  94 {
  95     settleparam_t massw; /* Parameters for SETTLE for coordinates */
  96     settleparam_t mass1; /* Parameters with all masses 1, for forces */
  97
  98     int   nsettle; /* The number of settles on our rank */
  99     int*  ow1;     /* Index to OW1 atoms, size nsettle + SIMD padding */
 100     int*  hw2;     /* Index to HW2 atoms, size nsettle + SIMD padding */
 101     int*  hw3;     /* Index to HW3 atoms, size nsettle + SIMD padding */
 102     real* virfac;  /* Virial factor 0 or 1, size nsettle + SIMD pad. */
 103     int   nalloc;  /* Allocation size of ow1, hw2, hw3, virfac */
 104
 105     bool bUseSimd; /* Use SIMD intrinsics code, if possible */
 106 };
 107
 108
 109 //! Initializes a projection matrix.
 110 static void init_proj_matrix(real invmO, real invmH, real dOH, real dHH, matrix inverseCouplingMatrix)
 111 {
 112     /* We normalize the inverse masses with invmO for the matrix inversion.
 113      * so we can keep using masses of almost zero for frozen particles,
 114      * without running out of the float range in invertMatrix.
 115      */
 116     double invmORelative = 1.0;
 117     double invmHRelative = invmH / static_cast<double>(invmO);
 118     double distanceRatio = dHH / static_cast<double>(dOH);
 119
 120     /* Construct the constraint coupling matrix */
 121     matrix mat;
 122     mat[0][0] = invmORelative + invmHRelative;
 123     mat[0][1] = invmORelative * (1.0 - 0.5 * gmx::square(distanceRatio));
 124     mat[0][2] = invmHRelative * 0.5 * distanceRatio;
 125     mat[1][1] = mat[0][0];
 126     mat[1][2] = mat[0][2];
 127     mat[2][2] = invmHRelative + invmHRelative;
 128     mat[1][0] = mat[0][1];
 129     mat[2][0] = mat[0][2];
 130     mat[2][1] = mat[1][2];
 131
 132     invertMatrix(mat, inverseCouplingMatrix);
 133
 134     msmul(inverseCouplingMatrix, 1 / invmO, inverseCouplingMatrix);
 135 }
 136
 137 //! Initializes settle parameters.
 138 static void settleparam_init(settleparam_t* p, real mO, real mH, real invmO, real invmH, real dOH, real dHH)
 139 {
 140     /* We calculate parameters in double precision to minimize errors.
 141      * The velocity correction applied during SETTLE coordinate constraining
 142      * introduces a systematic error of approximately 1 bit per atom,
 143      * depending on what the compiler does with the code.
 144      */
 145     double wohh;
 146
 147     p->mO     = mO;
 148     p->mH     = mH;
 149     wohh      = mO + 2.0 * mH;
 150     p->wh     = mH / wohh;
 151     p->dOH    = dOH;
 152     p->dHH    = dHH;
 153     double rc = dHH / 2.0;
 154     double ra = 2.0 * mH * std::sqrt(dOH * dOH - rc * rc) / wohh;
 155     p->rb     = std::sqrt(dOH * dOH - rc * rc) - ra;
 156     p->rc     = rc;
 157     p->ra     = ra;
 158     p->irc2   = 1.0 / dHH;
 159
 160     /* For projection: inverse masses and coupling matrix inversion */
 161     p->imO = invmO;
 162     p->imH = invmH;
 163
 164     p->invdOH = 1.0 / dOH;
 165     p->invdHH = 1.0 / dHH;
 166
 167     init_proj_matrix(invmO, invmH, dOH, dHH, p->invmat);
 168
 169     if (debug)
 170     {
 171         fprintf(debug, "wh =%g, rc = %g, ra = %g\n", p->wh, p->rc, p->ra);
 172         fprintf(debug, "rb = %g, irc2 = %g, dHH = %g, dOH = %g\n", p->rb, p->irc2, p->dHH, p->dOH);
 173     }
 174 }
 175
 176 settledata* settle_init(const gmx_mtop_t& mtop)
 177 {
 178     /* Check that we have only one settle type */
 179     int                  settle_type = -1;
 180     gmx_mtop_ilistloop_t iloop       = gmx_mtop_ilistloop_init(mtop);
 181     int                  nmol;
 182     const int            nral1 = 1 + NRAL(F_SETTLE);
 183     while (const InteractionLists* ilists = gmx_mtop_ilistloop_next(iloop, &nmol))
 184     {
 185         const InteractionList& ilist = (*ilists)[F_SETTLE];
 186         for (int i = 0; i < ilist.size(); i += nral1)
 187         {
 188             if (settle_type == -1)
 189             {
 190                 settle_type = ilist.iatoms[i];
 191             }
 192             else if (ilist.iatoms[i] != settle_type)
 193             {
 194                 gmx_fatal(FARGS,
 195                           "The [molecules] section of your topology specifies more than one block "
 196                           "of\n"
 197                           "a [moleculetype] with a [settles] block. Only one such is allowed.\n"
 198                           "If you are trying to partition your solvent into different *groups*\n"
 199                           "(e.g. for freezing, T-coupling, etc.), you are using the wrong "
 200                           "approach. Index\n"
 201                           "files specify groups. Otherwise, you may wish to change the least-used\n"
 202                           "block of molecules with SETTLE constraints into 3 normal constraints.");
 203             }
 204         }
 205     }
 206     GMX_RELEASE_ASSERT(settle_type >= 0, "settle_init called without settles");
 207
 208     settledata* settled;
 209
 210     snew(settled, 1);
 211
 212     /* We will not initialize the normal SETTLE parameters here yet,
 213      * since the atom (inv)masses can depend on the integrator and
 214      * free-energy perturbation. We set mO=-1 to trigger later initialization.
 215      */
 216     settled->massw.mO = -1;
 217
 218     real dOH = mtop.ffparams.iparams[settle_type].settle.doh;
 219     real dHH = mtop.ffparams.iparams[settle_type].settle.dhh;
 220     settleparam_init(&settled->mass1, 1.0, 1.0, 1.0, 1.0, dOH, dHH);
 221
 222     settled->ow1    = nullptr;
 223     settled->hw2    = nullptr;
 224     settled->hw3    = nullptr;
 225     settled->virfac = nullptr;
 226     settled->nalloc = 0;
 227
 228     /* Without SIMD configured, this bool is not used */
 229     settled->bUseSimd = (getenv("GMX_DISABLE_SIMD_KERNELS") == nullptr);
 230
 231     return settled;
 232 }
 233
 234 void settle_free(settledata* settled)
 235 {
 236     sfree_aligned(settled->ow1);
 237     sfree_aligned(settled->hw2);
 238     sfree_aligned(settled->hw3);
 239     sfree_aligned(settled->virfac);
 240     sfree(settled);
 241 }
 242
 243 void settle_set_constraints(settledata* settled, const t_ilist* il_settle, const t_mdatoms& mdatoms)
 244 {
 245 #if GMX_SIMD_HAVE_REAL
 246     const int pack_size = GMX_SIMD_REAL_WIDTH;
 247 #else
 248     const int pack_size = 1;
 249 #endif
 250
 251     const int nral1   = 1 + NRAL(F_SETTLE);
 252     int       nsettle = il_settle->nr / nral1;
 253     settled->nsettle  = nsettle;
 254
 255     if (nsettle > 0)
 256     {
 257         const t_iatom* iatoms = il_settle->iatoms;
 258
 259         /* Here we initialize the normal SETTLE parameters */
 260         if (settled->massw.mO < 0)
 261         {
 262             int firstO = iatoms[1];
 263             int firstH = iatoms[2];
 264             settleparam_init(&settled->massw, mdatoms.massT[firstO], mdatoms.massT[firstH],
 265                              mdatoms.invmass[firstO], mdatoms.invmass[firstH], settled->mass1.dOH,
 266                              settled->mass1.dHH);
 267         }
 268
 269         if (nsettle + pack_size > settled->nalloc)
 270         {
 271             settled->nalloc = over_alloc_dd(nsettle + pack_size);
 272             sfree_aligned(settled->ow1);
 273             sfree_aligned(settled->hw2);
 274             sfree_aligned(settled->hw3);
 275             sfree_aligned(settled->virfac);
 276             snew_aligned(settled->ow1, settled->nalloc, 64);
 277             snew_aligned(settled->hw2, settled->nalloc, 64);
 278             snew_aligned(settled->hw3, settled->nalloc, 64);
 279             snew_aligned(settled->virfac, settled->nalloc, 64);
 280         }
 281
 282         for (int i = 0; i < nsettle; i++)
 283         {
 284             settled->ow1[i] = iatoms[i * nral1 + 1];
 285             settled->hw2[i] = iatoms[i * nral1 + 2];
 286             settled->hw3[i] = iatoms[i * nral1 + 3];
 287             /* We should avoid double counting of virial contributions for
 288              * SETTLEs that appear in multiple DD domains, so we only count
 289              * the contribution on the home range of the oxygen atom.
 290              */
 291             settled->virfac[i] = (iatoms[i * nral1 + 1] < mdatoms.homenr ? 1 : 0);
 292         }
 293
 294         /* Pack the index array to the full SIMD width with copies from
 295          * the last normal entry, but with no virial contribution.
 296          */
 297         int end_packed = ((nsettle + pack_size - 1) / pack_size) * pack_size;
 298         for (int i = nsettle; i < end_packed; i++)
 299         {
 300             settled->ow1[i]    = settled->ow1[nsettle - 1];
 301             settled->hw2[i]    = settled->hw2[nsettle - 1];
 302             settled->hw3[i]    = settled->hw3[nsettle - 1];
 303             settled->virfac[i] = 0;
 304         }
 305     }
 306 }
 307
 308 void settle_proj(settledata*        settled,
 309                  ConstraintVariable econq,
 310                  int                nsettle,
 311                  const t_iatom      iatoms[],
 312                  const t_pbc*       pbc,
 313                  const rvec         x[],
 314                  rvec*              der,
 315                  rvec*              derp,
 316                  int                calcvir_atom_end,
 317                  tensor             vir_r_m_dder)
 318 {
 319     /* Settle for projection out constraint components
 320      * of derivatives of the coordinates.
 321      * Berk Hess 2008-1-10
 322      */
 323
 324     settleparam_t* p;
 325     real           imO, imH, dOH, dHH, invdOH, invdHH;
 326     matrix         invmat;
 327     int            i, m, m2, ow1, hw2, hw3;
 328     rvec           roh2, roh3, rhh, dc, fc;
 329
 330     calcvir_atom_end *= DIM;
 331
 332     if (econq == ConstraintVariable::Force)
 333     {
 334         p = &settled->mass1;
 335     }
 336     else
 337     {
 338         p = &settled->massw;
 339     }
 340     imO = p->imO;
 341     imH = p->imH;
 342     copy_mat(p->invmat, invmat);
 343     dOH    = p->dOH;
 344     dHH    = p->dHH;
 345     invdOH = p->invdOH;
 346     invdHH = p->invdHH;
 347
 348     const int nral1 = 1 + NRAL(F_SETTLE);
 349
 350     for (i = 0; i < nsettle; i++)
 351     {
 352         ow1 = iatoms[i * nral1 + 1];
 353         hw2 = iatoms[i * nral1 + 2];
 354         hw3 = iatoms[i * nral1 + 3];
 355
 356         if (pbc == nullptr)
 357         {
 358             rvec_sub(x[ow1], x[hw2], roh2);
 359             rvec_sub(x[ow1], x[hw3], roh3);
 360             rvec_sub(x[hw2], x[hw3], rhh);
 361         }
 362         else
 363         {
 364             pbc_dx_aiuc(pbc, x[ow1], x[hw2], roh2);
 365             pbc_dx_aiuc(pbc, x[ow1], x[hw3], roh3);
 366             pbc_dx_aiuc(pbc, x[hw2], x[hw3], rhh);
 367         }
 368         svmul(invdOH, roh2, roh2);
 369         svmul(invdOH, roh3, roh3);
 370         svmul(invdHH, rhh, rhh);
 371         /* 18 flops */
 372
 373         /* Determine the projections of der on the bonds */
 374         clear_rvec(dc);
 375         for (m = 0; m < DIM; m++)
 376         {
 377             dc[0] += (der[ow1][m] - der[hw2][m]) * roh2[m];
 378             dc[1] += (der[ow1][m] - der[hw3][m]) * roh3[m];
 379             dc[2] += (der[hw2][m] - der[hw3][m]) * rhh[m];
 380         }
 381         /* 27 flops */
 382
 383         /* Determine the correction for the three bonds */
 384         mvmul(invmat, dc, fc);
 385         /* 15 flops */
 386
 387         /* Subtract the corrections from derp */
 388         for (m = 0; m < DIM; m++)
 389         {
 390             derp[ow1][m] -= imO * (fc[0] * roh2[m] + fc[1] * roh3[m]);
 391             derp[hw2][m] -= imH * (-fc[0] * roh2[m] + fc[2] * rhh[m]);
 392             derp[hw3][m] -= imH * (-fc[1] * roh3[m] - fc[2] * rhh[m]);
 393         }
 394
 395         /* 45 flops */
 396
 397         if (ow1 < calcvir_atom_end)
 398         {
 399             /* Determining r \dot m der is easy,
 400              * since fc contains the mass weighted corrections for der.
 401              */
 402
 403             for (m = 0; m < DIM; m++)
 404             {
 405                 for (m2 = 0; m2 < DIM; m2++)
 406                 {
 407                     vir_r_m_dder[m][m2] += dOH * roh2[m] * roh2[m2] * fc[0]
 408                                            + dOH * roh3[m] * roh3[m2] * fc[1]
 409                                            + dHH * rhh[m] * rhh[m2] * fc[2];
 410                 }
 411             }
 412         }
 413     }
 414 }
 415
 416
 417 /*! \brief The actual settle code, templated for real/SimdReal and for optimization */
 418 template<typename T, typename TypeBool, int packSize, typename TypePbc, bool bCorrectVelocity, bool bCalcVirial>
 419 static void settleTemplate(const settledata* settled,
 420                            int               settleStart,
 421                            int               settleEnd,
 422                            const TypePbc     pbc,
 423                            const real*       x,
 424                            real*             xprime,
 425                            real              invdt,
 426                            real* gmx_restrict v,
 427                            tensor             vir_r_m_dr,
 428                            bool*              bErrorHasOccurred)
 429 {
 430     /* ******************************************************************* */
 431     /*                                                                  ** */
 432     /*    Original code by Shuichi Miyamoto, last update Oct. 1, 1992   ** */
 433     /*                                                                  ** */
 434     /*    Algorithm changes by Berk Hess:                               ** */
 435     /*    2004-07-15 Convert COM to double precision to avoid drift     ** */
 436     /*    2006-10-16 Changed velocity update to use differences         ** */
 437     /*    2012-09-24 Use oxygen as reference instead of COM             ** */
 438     /*    2016-02    Complete rewrite of the code for SIMD              ** */
 439     /*                                                                  ** */
 440     /*    Reference for the SETTLE algorithm                            ** */
 441     /*           S. Miyamoto et al., J. Comp. Chem., 13, 952 (1992).    ** */
 442     /*                                                                  ** */
 443     /* ******************************************************************* */
 444
 445     assert(settleStart % packSize == 0);
 446     assert(settleEnd % packSize == 0);
 447
 448     TypeBool bError = TypeBool(false);
 449
 450     const settleparam_t* p    = &settled->massw;
 451     T                    wh   = T(p->wh);
 452     T                    rc   = T(p->rc);
 453     T                    ra   = T(p->ra);
 454     T                    rb   = T(p->rb);
 455     T                    irc2 = T(p->irc2);
 456     T                    mO   = T(p->mO);
 457     T                    mH   = T(p->mH);
 458
 459     T almost_zero = T(1e-12);
 460
 461     T sum_r_m_dr[DIM][DIM];
 462
 463     if (bCalcVirial)
 464     {
 465         for (int d2 = 0; d2 < DIM; d2++)
 466         {
 467             for (int d = 0; d < DIM; d++)
 468             {
 469                 sum_r_m_dr[d2][d] = T(0);
 470             }
 471         }
 472     }
 473
 474     for (int i = settleStart; i < settleEnd; i += packSize)
 475     {
 476         /* Here we pad up to packSize with copies from the last valid entry.
 477          * This gives correct results, since we store (not increment) all
 478          * output, so we store the same output multiple times.
 479          */
 480         const int* ow1 = settled->ow1 + i;
 481         const int* hw2 = settled->hw2 + i;
 482         const int* hw3 = settled->hw3 + i;
 483
 484         T x_ow1[DIM], x_hw2[DIM], x_hw3[DIM];
 485
 486         gatherLoadUTranspose<3>(x, ow1, &x_ow1[XX], &x_ow1[YY], &x_ow1[ZZ]);
 487         gatherLoadUTranspose<3>(x, hw2, &x_hw2[XX], &x_hw2[YY], &x_hw2[ZZ]);
 488         gatherLoadUTranspose<3>(x, hw3, &x_hw3[XX], &x_hw3[YY], &x_hw3[ZZ]);
 489
 490         T xprime_ow1[DIM], xprime_hw2[DIM], xprime_hw3[DIM];
 491
 492         gatherLoadUTranspose<3>(xprime, ow1, &xprime_ow1[XX], &xprime_ow1[YY], &xprime_ow1[ZZ]);
 493         gatherLoadUTranspose<3>(xprime, hw2, &xprime_hw2[XX], &xprime_hw2[YY], &xprime_hw2[ZZ]);
 494         gatherLoadUTranspose<3>(xprime, hw3, &xprime_hw3[XX], &xprime_hw3[YY], &xprime_hw3[ZZ]);
 495
 496         T dist21[DIM], dist31[DIM];
 497         T doh2[DIM], doh3[DIM];
 498         T sh_hw2[DIM], sh_hw3[DIM];
 499
 500         pbc_dx_aiuc(pbc, x_hw2, x_ow1, dist21);
 501
 502         pbc_dx_aiuc(pbc, x_hw3, x_ow1, dist31);
 503
 504         /* Tedious way of doing pbc */
 505         pbc_dx_aiuc(pbc, xprime_hw2, xprime_ow1, doh2);
 506         for (int d = 0; d < DIM; d++)
 507         {
 508             sh_hw2[d]     = xprime_hw2[d] - (xprime_ow1[d] + doh2[d]);
 509             xprime_hw2[d] = xprime_hw2[d] - sh_hw2[d];
 510         }
 511         pbc_dx_aiuc(pbc, xprime_hw3, xprime_ow1, doh3);
 512         for (int d = 0; d < DIM; d++)
 513         {
 514             sh_hw3[d]     = xprime_hw3[d] - (xprime_ow1[d] + doh3[d]);
 515             xprime_hw3[d] = xprime_hw3[d] - sh_hw3[d];
 516         }
 517
 518         /* Not calculating the center of mass using the oxygen position
 519          * and the O-H distances, as done below, will make SETTLE
 520          * the largest source of energy drift for simulations of water,
 521          * as then the oxygen coordinate is multiplied by 0.89 at every step,
 522          * which can then transfer a systematic rounding to the oxygen velocity.
 523          */
 524         T a1[DIM], com[DIM];
 525         for (int d = 0; d < DIM; d++)
 526         {
 527             a1[d]  = -(doh2[d] + doh3[d]) * wh;
 528             com[d] = xprime_ow1[d] - a1[d];
 529         }
 530         T b1[DIM];
 531         for (int d = 0; d < DIM; d++)
 532         {
 533             b1[d] = xprime_hw2[d] - com[d];
 534         }
 535         T c1[DIM];
 536         for (int d = 0; d < DIM; d++)
 537         {
 538             c1[d] = xprime_hw3[d] - com[d];
 539         }
 540         /* 15 flops */
 541
 542         T xakszd = dist21[YY] * dist31[ZZ] - dist21[ZZ] * dist31[YY];
 543         T yakszd = dist21[ZZ] * dist31[XX] - dist21[XX] * dist31[ZZ];
 544         T zakszd = dist21[XX] * dist31[YY] - dist21[YY] * dist31[XX];
 545         T xaksxd = a1[YY] * zakszd - a1[ZZ] * yakszd;
 546         T yaksxd = a1[ZZ] * xakszd - a1[XX] * zakszd;
 547         T zaksxd = a1[XX] * yakszd - a1[YY] * xakszd;
 548         T xaksyd = yakszd * zaksxd - zakszd * yaksxd;
 549         T yaksyd = zakszd * xaksxd - xakszd * zaksxd;
 550         T zaksyd = xakszd * yaksxd - yakszd * xaksxd;
 551         /* 27 flops */
 552
 553         T axlng = gmx::invsqrt(xaksxd * xaksxd + yaksxd * yaksxd + zaksxd * zaksxd);
 554         T aylng = gmx::invsqrt(xaksyd * xaksyd + yaksyd * yaksyd + zaksyd * zaksyd);
 555         T azlng = gmx::invsqrt(xakszd * xakszd + yakszd * yakszd + zakszd * zakszd);
 556
 557         T trns1[DIM], trns2[DIM], trns3[DIM];
 558
 559         trns1[XX] = xaksxd * axlng;
 560         trns2[XX] = yaksxd * axlng;
 561         trns3[XX] = zaksxd * axlng;
 562         trns1[YY] = xaksyd * aylng;
 563         trns2[YY] = yaksyd * aylng;
 564         trns3[YY] = zaksyd * aylng;
 565         trns1[ZZ] = xakszd * azlng;
 566         trns2[ZZ] = yakszd * azlng;
 567         trns3[ZZ] = zakszd * azlng;
 568         /* 24 flops */
 569
 570         T b0d[2], c0d[2];
 571
 572         for (int d = 0; d < 2; d++)
 573         {
 574             b0d[d] = trns1[d] * dist21[XX] + trns2[d] * dist21[YY] + trns3[d] * dist21[ZZ];
 575             c0d[d] = trns1[d] * dist31[XX] + trns2[d] * dist31[YY] + trns3[d] * dist31[ZZ];
 576         }
 577
 578         T a1d_z, b1d[DIM], c1d[DIM];
 579
 580         a1d_z = trns1[ZZ] * a1[XX] + trns2[ZZ] * a1[YY] + trns3[ZZ] * a1[ZZ];
 581         for (int d = 0; d < DIM; d++)
 582         {
 583             b1d[d] = trns1[d] * b1[XX] + trns2[d] * b1[YY] + trns3[d] * b1[ZZ];
 584             c1d[d] = trns1[d] * c1[XX] + trns2[d] * c1[YY] + trns3[d] * c1[ZZ];
 585         }
 586         /* 65 flops */
 587
 588         T tmp, tmp2;
 589
 590         T sinphi = a1d_z * gmx::invsqrt(ra * ra);
 591         tmp2     = 1.0 - sinphi * sinphi;
 592
 593         /* If tmp2 gets close to or beyond zero we have severly distorted
 594          * water molecules and we should terminate the simulation.
 595          * Below we take the max with almost_zero to continue the loop.
 596          */
 597         bError = bError || (tmp2 <= almost_zero);
 598
 599         tmp2     = max(tmp2, almost_zero);
 600         tmp      = gmx::invsqrt(tmp2);
 601         T cosphi = tmp2 * tmp;
 602         T sinpsi = (b1d[ZZ] - c1d[ZZ]) * irc2 * tmp;
 603         tmp2     = 1.0 - sinpsi * sinpsi;
 604
 605         T cospsi = tmp2 * gmx::invsqrt(tmp2);
 606         /* 46 flops */
 607
 608         T a2d_y = ra * cosphi;
 609         T b2d_x = -rc * cospsi;
 610         T t1    = -rb * cosphi;
 611         T t2    = rc * sinpsi * sinphi;
 612         T b2d_y = t1 - t2;
 613         T c2d_y = t1 + t2;
 614         /* 7 flops */
 615
 616         /*     --- Step3  al,be,ga            --- */
 617         T alpha  = b2d_x * (b0d[XX] - c0d[XX]) + b0d[YY] * b2d_y + c0d[YY] * c2d_y;
 618         T beta   = b2d_x * (c0d[YY] - b0d[YY]) + b0d[XX] * b2d_y + c0d[XX] * c2d_y;
 619         T gamma  = b0d[XX] * b1d[YY] - b1d[XX] * b0d[YY] + c0d[XX] * c1d[YY] - c1d[XX] * c0d[YY];
 620         T al2be2 = alpha * alpha + beta * beta;
 621         tmp2     = (al2be2 - gamma * gamma);
 622         T sinthe = (alpha * gamma - beta * tmp2 * gmx::invsqrt(tmp2)) * gmx::invsqrt(al2be2 * al2be2);
 623         /* 47 flops */
 624
 625         /*  --- Step4  A3' --- */
 626         tmp2     = 1.0 - sinthe * sinthe;
 627         T costhe = tmp2 * gmx::invsqrt(tmp2);
 628
 629         T a3d[DIM], b3d[DIM], c3d[DIM];
 630
 631         a3d[XX] = -a2d_y * sinthe;
 632         a3d[YY] = a2d_y * costhe;
 633         a3d[ZZ] = a1d_z;
 634         b3d[XX] = b2d_x * costhe - b2d_y * sinthe;
 635         b3d[YY] = b2d_x * sinthe + b2d_y * costhe;
 636         b3d[ZZ] = b1d[ZZ];
 637         c3d[XX] = -b2d_x * costhe - c2d_y * sinthe;
 638         c3d[YY] = -b2d_x * sinthe + c2d_y * costhe;
 639         c3d[ZZ] = c1d[ZZ];
 640         /* 26 flops */
 641
 642         /*    --- Step5  A3 --- */
 643         T a3[DIM], b3[DIM], c3[DIM];
 644
 645         a3[XX] = trns1[XX] * a3d[XX] + trns1[YY] * a3d[YY] + trns1[ZZ] * a3d[ZZ];
 646         a3[YY] = trns2[XX] * a3d[XX] + trns2[YY] * a3d[YY] + trns2[ZZ] * a3d[ZZ];
 647         a3[ZZ] = trns3[XX] * a3d[XX] + trns3[YY] * a3d[YY] + trns3[ZZ] * a3d[ZZ];
 648         b3[XX] = trns1[XX] * b3d[XX] + trns1[YY] * b3d[YY] + trns1[ZZ] * b3d[ZZ];
 649         b3[YY] = trns2[XX] * b3d[XX] + trns2[YY] * b3d[YY] + trns2[ZZ] * b3d[ZZ];
 650         b3[ZZ] = trns3[XX] * b3d[XX] + trns3[YY] * b3d[YY] + trns3[ZZ] * b3d[ZZ];
 651         c3[XX] = trns1[XX] * c3d[XX] + trns1[YY] * c3d[YY] + trns1[ZZ] * c3d[ZZ];
 652         c3[YY] = trns2[XX] * c3d[XX] + trns2[YY] * c3d[YY] + trns2[ZZ] * c3d[ZZ];
 653         c3[ZZ] = trns3[XX] * c3d[XX] + trns3[YY] * c3d[YY] + trns3[ZZ] * c3d[ZZ];
 654         /* 45 flops */
 655
 656         /* Compute and store the corrected new coordinate */
 657         for (int d = 0; d < DIM; d++)
 658         {
 659             xprime_ow1[d] = com[d] + a3[d];
 660         }
 661         for (int d = 0; d < DIM; d++)
 662         {
 663             xprime_hw2[d] = com[d] + b3[d] + sh_hw2[d];
 664         }
 665         for (int d = 0; d < DIM; d++)
 666         {
 667             xprime_hw3[d] = com[d] + c3[d] + sh_hw3[d];
 668         }
 669         /* 9 flops + 6 pbc flops */
 670
 671         transposeScatterStoreU<3>(xprime, ow1, xprime_ow1[XX], xprime_ow1[YY], xprime_ow1[ZZ]);
 672         transposeScatterStoreU<3>(xprime, hw2, xprime_hw2[XX], xprime_hw2[YY], xprime_hw2[ZZ]);
 673         transposeScatterStoreU<3>(xprime, hw3, xprime_hw3[XX], xprime_hw3[YY], xprime_hw3[ZZ]);
 674
 675         if (bCorrectVelocity || bCalcVirial)
 676         {
 677             T da[DIM], db[DIM], dc[DIM];
 678             for (int d = 0; d < DIM; d++)
 679             {
 680                 da[d] = a3[d] - a1[d];
 681             }
 682             for (int d = 0; d < DIM; d++)
 683             {
 684                 db[d] = b3[d] - b1[d];
 685             }
 686             for (int d = 0; d < DIM; d++)
 687             {
 688                 dc[d] = c3[d] - c1[d];
 689             }
 690             /* 9 flops */
 691
 692             if (bCorrectVelocity)
 693             {
 694                 T v_ow1[DIM], v_hw2[DIM], v_hw3[DIM];
 695
 696                 gatherLoadUTranspose<3>(v, ow1, &v_ow1[XX], &v_ow1[YY], &v_ow1[ZZ]);
 697                 gatherLoadUTranspose<3>(v, hw2, &v_hw2[XX], &v_hw2[YY], &v_hw2[ZZ]);
 698                 gatherLoadUTranspose<3>(v, hw3, &v_hw3[XX], &v_hw3[YY], &v_hw3[ZZ]);
 699
 700                 /* Add the position correction divided by dt to the velocity */
 701                 for (int d = 0; d < DIM; d++)
 702                 {
 703                     v_ow1[d] = gmx::fma(da[d], invdt, v_ow1[d]);
 704                 }
 705                 for (int d = 0; d < DIM; d++)
 706                 {
 707                     v_hw2[d] = gmx::fma(db[d], invdt, v_hw2[d]);
 708                 }
 709                 for (int d = 0; d < DIM; d++)
 710                 {
 711                     v_hw3[d] = gmx::fma(dc[d], invdt, v_hw3[d]);
 712                 }
 713                 /* 3*6 flops */
 714
 715                 transposeScatterStoreU<3>(v, ow1, v_ow1[XX], v_ow1[YY], v_ow1[ZZ]);
 716                 transposeScatterStoreU<3>(v, hw2, v_hw2[XX], v_hw2[YY], v_hw2[ZZ]);
 717                 transposeScatterStoreU<3>(v, hw3, v_hw3[XX], v_hw3[YY], v_hw3[ZZ]);
 718             }
 719
 720             if (bCalcVirial)
 721             {
 722                 /* Filter out the non-local settles */
 723                 T filter = load<T>(settled->virfac + i);
 724                 T mOf    = filter * mO;
 725                 T mHf    = filter * mH;
 726
 727                 T mdo[DIM], mdb[DIM], mdc[DIM];
 728
 729                 for (int d = 0; d < DIM; d++)
 730                 {
 731                     mdb[d] = mHf * db[d];
 732                     mdc[d] = mHf * dc[d];
 733                     mdo[d] = mOf * da[d] + mdb[d] + mdc[d];
 734                 }
 735
 736                 for (int d2 = 0; d2 < DIM; d2++)
 737                 {
 738                     for (int d = 0; d < DIM; d++)
 739                     {
 740                         sum_r_m_dr[d2][d] =
 741                                 sum_r_m_dr[d2][d]
 742                                 - (x_ow1[d2] * mdo[d] + dist21[d2] * mdb[d] + dist31[d2] * mdc[d]);
 743                     }
 744                 }
 745                 /* 71 flops */
 746             }
 747         }
 748     }
 749
 750     if (bCalcVirial)
 751     {
 752         for (int d2 = 0; d2 < DIM; d2++)
 753         {
 754             for (int d = 0; d < DIM; d++)
 755             {
 756                 vir_r_m_dr[d2][d] += reduce(sum_r_m_dr[d2][d]);
 757             }
 758         }
 759     }
 760
 761     *bErrorHasOccurred = anyTrue(bError);
 762 }
 763
 764 /*! \brief Wrapper template function that divides the settles over threads
 765  * and instantiates the core template with instantiated booleans.
 766  */
 767 template<typename T, typename TypeBool, int packSize, typename TypePbc>
 768 static void settleTemplateWrapper(settledata* settled,
 769                                   int         nthread,
 770                                   int         thread,
 771                                   TypePbc     pbc,
 772                                   const real  x[],
 773                                   real        xprime[],
 774                                   real        invdt,
 775                                   real*       v,
 776                                   bool        bCalcVirial,
 777                                   tensor      vir_r_m_dr,
 778                                   bool*       bErrorHasOccurred)
 779 {
 780     /* We need to assign settles to threads in groups of pack_size */
 781     int numSettlePacks = (settled->nsettle + packSize - 1) / packSize;
 782     /* Round the end value up to give thread 0 more work */
 783     int settleStart = ((numSettlePacks * thread + nthread - 1) / nthread) * packSize;
 784     int settleEnd   = ((numSettlePacks * (thread + 1) + nthread - 1) / nthread) * packSize;
 785
 786     if (v != nullptr)
 787     {
 788         if (!bCalcVirial)
 789         {
 790             settleTemplate<T, TypeBool, packSize, TypePbc, true, false>(
 791                     settled, settleStart, settleEnd, pbc, x, xprime, invdt, v, nullptr, bErrorHasOccurred);
 792         }
 793         else
 794         {
 795             settleTemplate<T, TypeBool, packSize, TypePbc, true, true>(
 796                     settled, settleStart, settleEnd, pbc, x, xprime, invdt, v, vir_r_m_dr, bErrorHasOccurred);
 797         }
 798     }
 799     else
 800     {
 801         if (!bCalcVirial)
 802         {
 803             settleTemplate<T, TypeBool, packSize, TypePbc, false, false>(
 804                     settled, settleStart, settleEnd, pbc, x, xprime, invdt, v, nullptr, bErrorHasOccurred);
 805         }
 806         else
 807         {
 808             settleTemplate<T, TypeBool, packSize, TypePbc, false, true>(
 809                     settled, settleStart, settleEnd, pbc, x, xprime, invdt, v, vir_r_m_dr, bErrorHasOccurred);
 810         }
 811     }
 812 }
 813
 814 void csettle(settledata*  settled,
 815              int          nthread,
 816              int          thread,
 817              const t_pbc* pbc,
 818              const real   x[],
 819              real         xprime[],
 820              real         invdt,
 821              real*        v,
 822              bool         bCalcVirial,
 823              tensor       vir_r_m_dr,
 824              bool*        bErrorHasOccurred)
 825 {
 826 #if GMX_SIMD_HAVE_REAL
 827     if (settled->bUseSimd)
 828     {
 829         /* Convert the pbc struct for SIMD */
 830         alignas(GMX_SIMD_ALIGNMENT) real pbcSimd[9 * GMX_SIMD_REAL_WIDTH];
 831         set_pbc_simd(pbc, pbcSimd);
 832
 833         settleTemplateWrapper<SimdReal, SimdBool, GMX_SIMD_REAL_WIDTH, const real*>(
 834                 settled, nthread, thread, pbcSimd, x, xprime, invdt, v, bCalcVirial, vir_r_m_dr,
 835                 bErrorHasOccurred);
 836     }
 837     else
 838 #endif
 839     {
 840         /* This construct is needed because pbc_dx_aiuc doesn't accept pbc=NULL */
 841         t_pbc        pbcNo;
 842         const t_pbc* pbcNonNull;
 843
 844         if (pbc != nullptr)
 845         {
 846             pbcNonNull = pbc;
 847         }
 848         else
 849         {
 850             set_pbc(&pbcNo, epbcNONE, nullptr);
 851             pbcNonNull = &pbcNo;
 852         }
 853
 854         settleTemplateWrapper<real, bool, 1, const t_pbc*>(settled, nthread, thread, pbcNonNull, x,
 855                                                            xprime, invdt, v, bCalcVirial,
 856                                                            vir_r_m_dr, bErrorHasOccurred);
 857     }
 858 }
 859
 860 } // namespace gmx