src/gromacs/mdlib/settle.cpp

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
   5  * Copyright (c) 2001-2004, The GROMACS development team.
   6  * Copyright (c) 2013,2014,2015,2016,2017 by the GROMACS development team.
   7  * Copyright (c) 2018,2019,2020, by the GROMACS development team, led by
   8  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   9  * and including many others, as listed in the AUTHORS file in the
  10  * top-level source directory and at http://www.gromacs.org.
  11  *
  12  * GROMACS is free software; you can redistribute it and/or
  13  * modify it under the terms of the GNU Lesser General Public License
  14  * as published by the Free Software Foundation; either version 2.1
  15  * of the License, or (at your option) any later version.
  16  *
  17  * GROMACS is distributed in the hope that it will be useful,
  18  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  19  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  20  * Lesser General Public License for more details.
  21  *
  22  * You should have received a copy of the GNU Lesser General Public
  23  * License along with GROMACS; if not, see
  24  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  25  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  26  *
  27  * If you want to redistribute modifications to GROMACS, please
  28  * consider that scientific software is very special. Version
  29  * control is crucial - bugs must be traceable. We will be happy to
  30  * consider code for inclusion in the official distribution, but
  31  * derived work must not be called official GROMACS. Details are found
  32  * in the README & COPYING files - if they are missing, get the
  33  * official version at http://www.gromacs.org.
  34  *
  35  * To help us fund GROMACS development, we humbly ask that you cite
  36  * the research papers on the package. Check out http://www.gromacs.org.
  37  */
  38 /*! \internal \file
  39  * \brief Defines SETTLE code.
  40  *
  41  * \author Berk Hess <hess@kth.se>
  42  * \author Mark Abraham <mark.j.abraham@gmail.com>
  43  * \ingroup module_mdlib
  44  */
  45 #include "gmxpre.h"
  46
  47 #include "settle.h"
  48
  49 #include <cassert>
  50 #include <cmath>
  51 #include <cstdio>
  52
  53 #include <algorithm>
  54
  55 #include "gromacs/math/arrayrefwithpadding.h"
  56 #include "gromacs/math/functions.h"
  57 #include "gromacs/math/invertmatrix.h"
  58 #include "gromacs/math/vec.h"
  59 #include "gromacs/mdlib/constr.h"
  60 #include "gromacs/mdtypes/mdatom.h"
  61 #include "gromacs/pbcutil/ishift.h"
  62 #include "gromacs/pbcutil/pbc.h"
  63 #include "gromacs/pbcutil/pbc_simd.h"
  64 #include "gromacs/simd/simd.h"
  65 #include "gromacs/simd/simd_math.h"
  66 #include "gromacs/topology/idef.h"
  67 #include "gromacs/topology/ifunc.h"
  68 #include "gromacs/topology/mtop_util.h"
  69 #include "gromacs/utility/fatalerror.h"
  70 #include "gromacs/utility/gmxassert.h"
  71
  72 namespace gmx
  73 {
  74
  75 /*! \brief Initializes a projection matrix.
  76  *
  77  * \param[in]  invmO                  Reciprocal oxygen mass
  78  * \param[in]  invmH                  Reciprocal hydrogen mass
  79  * \param[in]  dOH                    Target O-H bond length
  80  * \param[in]  dHH                    Target H-H bond length
  81  * \param[out] inverseCouplingMatrix  Inverse bond coupling matrix for the projection version of SETTLE
  82  */
  83 static void initializeProjectionMatrix(const real invmO,
  84                                        const real invmH,
  85                                        const real dOH,
  86                                        const real dHH,
  87                                        matrix     inverseCouplingMatrix)
  88 {
  89     // We normalize the inverse masses with invmO for the matrix inversion.
  90     // so we can keep using masses of almost zero for frozen particles,
  91     // without running out of the float range in invertMatrix.
  92     double invmORelative = 1.0;
  93     double invmHRelative = invmH / static_cast<double>(invmO);
  94     double distanceRatio = dHH / static_cast<double>(dOH);
  95
  96     /* Construct the constraint coupling matrix */
  97     matrix mat;
  98     mat[0][0] = invmORelative + invmHRelative;
  99     mat[0][1] = invmORelative * (1.0 - 0.5 * gmx::square(distanceRatio));
 100     mat[0][2] = invmHRelative * 0.5 * distanceRatio;
 101     mat[1][1] = mat[0][0];
 102     mat[1][2] = mat[0][2];
 103     mat[2][2] = invmHRelative + invmHRelative;
 104     mat[1][0] = mat[0][1];
 105     mat[2][0] = mat[0][2];
 106     mat[2][1] = mat[1][2];
 107
 108     invertMatrix(mat, inverseCouplingMatrix);
 109
 110     msmul(inverseCouplingMatrix, 1 / invmO, inverseCouplingMatrix);
 111 }
 112
 113 SettleParameters
 114 settleParameters(const real mO, const real mH, const real invmO, const real invmH, const real dOH, const real dHH)
 115 {
 116     SettleParameters params;
 117
 118     // We calculate parameters in double precision to minimize errors.
 119     // The velocity correction applied during SETTLE coordinate constraining
 120     // introduces a systematic error of approximately 1 bit per atom,
 121     // depending on what the compiler does with the code.
 122     double wohh;
 123
 124     params.mO   = mO;
 125     params.mH   = mH;
 126     wohh        = mO + 2.0 * mH;
 127     params.wh   = mH / wohh;
 128     params.dOH  = dOH;
 129     params.dHH  = dHH;
 130     double rc   = dHH / 2.0;
 131     double ra   = 2.0 * mH * std::sqrt(dOH * dOH - rc * rc) / wohh;
 132     params.rb   = std::sqrt(dOH * dOH - rc * rc) - ra;
 133     params.rc   = rc;
 134     params.ra   = ra;
 135     params.irc2 = 1.0 / dHH;
 136
 137     // For projection: inverse masses and coupling matrix inversion
 138     params.imO = invmO;
 139     params.imH = invmH;
 140
 141     params.invdOH = 1.0 / dOH;
 142     params.invdHH = 1.0 / dHH;
 143
 144     initializeProjectionMatrix(invmO, invmH, dOH, dHH, params.invmat);
 145
 146     if (debug)
 147     {
 148         fprintf(debug, "wh =%g, rc = %g, ra = %g\n", params.wh, params.rc, params.ra);
 149         fprintf(debug, "rb = %g, irc2 = %g, dHH = %g, dOH = %g\n", params.rb, params.irc2, params.dHH, params.dOH);
 150     }
 151
 152     return params;
 153 }
 154
 155 SettleData::SettleData(const gmx_mtop_t& mtop) :
 156     useSimd_(getenv("GMX_DISABLE_SIMD_KERNELS") == nullptr)
 157 {
 158     /* Check that we have only one settle type */
 159     int                  settle_type = -1;
 160     gmx_mtop_ilistloop_t iloop       = gmx_mtop_ilistloop_init(mtop);
 161     int                  nmol;
 162     const int            nral1 = 1 + NRAL(F_SETTLE);
 163     while (const InteractionLists* ilists = gmx_mtop_ilistloop_next(iloop, &nmol))
 164     {
 165         const InteractionList& ilist = (*ilists)[F_SETTLE];
 166         for (int i = 0; i < ilist.size(); i += nral1)
 167         {
 168             if (settle_type == -1)
 169             {
 170                 settle_type = ilist.iatoms[i];
 171             }
 172             else if (ilist.iatoms[i] != settle_type)
 173             {
 174                 gmx_fatal(FARGS,
 175                           "The [molecules] section of your topology specifies more than one block "
 176                           "of\n"
 177                           "a [moleculetype] with a [settles] block. Only one such is allowed.\n"
 178                           "If you are trying to partition your solvent into different *groups*\n"
 179                           "(e.g. for freezing, T-coupling, etc.), you are using the wrong "
 180                           "approach. Index\n"
 181                           "files specify groups. Otherwise, you may wish to change the least-used\n"
 182                           "block of molecules with SETTLE constraints into 3 normal constraints.");
 183             }
 184         }
 185     }
 186     GMX_RELEASE_ASSERT(settle_type >= 0, "settle_init called without settles");
 187
 188     /* We will not initialize the normal SETTLE parameters here yet,
 189      * since the atom (inv)masses can depend on the integrator and
 190      * free-energy perturbation. We set mO=-1 to trigger later initialization.
 191      */
 192     parametersMassWeighted_.mO = -1;
 193
 194     real dOH              = mtop.ffparams.iparams[settle_type].settle.doh;
 195     real dHH              = mtop.ffparams.iparams[settle_type].settle.dhh;
 196     parametersAllMasses1_ = settleParameters(1.0, 1.0, 1.0, 1.0, dOH, dHH);
 197 }
 198
 199 void SettleData::setConstraints(const InteractionList& il_settle,
 200                                 const int              numHomeAtoms,
 201                                 const real*            masses,
 202                                 const real*            inverseMasses)
 203 {
 204 #if GMX_SIMD_HAVE_REAL
 205     const int pack_size = GMX_SIMD_REAL_WIDTH;
 206 #else
 207     const int pack_size = 1;
 208 #endif
 209
 210     const int nral1   = 1 + NRAL(F_SETTLE);
 211     int       nsettle = il_settle.size() / nral1;
 212     numSettles_       = nsettle;
 213
 214     if (nsettle > 0)
 215     {
 216         ArrayRef<const int> iatoms = il_settle.iatoms;
 217
 218         /* Here we initialize the normal SETTLE parameters */
 219         if (parametersMassWeighted_.mO < 0)
 220         {
 221             int firstO              = iatoms[1];
 222             int firstH              = iatoms[2];
 223             parametersMassWeighted_ = settleParameters(masses[firstO],
 224                                                        masses[firstH],
 225                                                        inverseMasses[firstO],
 226                                                        inverseMasses[firstH],
 227                                                        parametersAllMasses1_.dOH,
 228                                                        parametersAllMasses1_.dHH);
 229         }
 230
 231         const int paddedSize = ((nsettle + pack_size - 1) / pack_size) * pack_size;
 232         ow1_.resize(paddedSize);
 233         hw2_.resize(paddedSize);
 234         hw3_.resize(paddedSize);
 235         virfac_.resize(paddedSize);
 236
 237         for (int i = 0; i < nsettle; i++)
 238         {
 239             ow1_[i] = iatoms[i * nral1 + 1];
 240             hw2_[i] = iatoms[i * nral1 + 2];
 241             hw3_[i] = iatoms[i * nral1 + 3];
 242             /* We should avoid double counting of virial contributions for
 243              * SETTLEs that appear in multiple DD domains, so we only count
 244              * the contribution on the home range of the oxygen atom.
 245              */
 246             virfac_[i] = (iatoms[i * nral1 + 1] < numHomeAtoms ? 1 : 0);
 247         }
 248
 249         /* Pad the index array to the full SIMD width with copies from
 250          * the last normal entry, but with no virial contribution.
 251          */
 252         for (int i = nsettle; i < paddedSize; i++)
 253         {
 254             ow1_[i]    = ow1_[nsettle - 1];
 255             hw2_[i]    = hw2_[nsettle - 1];
 256             hw3_[i]    = hw3_[nsettle - 1];
 257             virfac_[i] = 0;
 258         }
 259     }
 260 }
 261
 262 void settle_proj(const SettleData&    settled,
 263                  ConstraintVariable   econq,
 264                  int                  nsettle,
 265                  const t_iatom        iatoms[],
 266                  const t_pbc*         pbc,
 267                  ArrayRef<const RVec> x,
 268                  ArrayRef<RVec>       der,
 269                  ArrayRef<RVec>       derp,
 270                  int                  calcvir_atom_end,
 271                  tensor               vir_r_m_dder)
 272 {
 273     /* Settle for projection out constraint components
 274      * of derivatives of the coordinates.
 275      * Berk Hess 2008-1-10
 276      */
 277
 278     const SettleParameters* p;
 279     real                    imO, imH, dOH, dHH, invdOH, invdHH;
 280     matrix                  invmat;
 281     int                     i, m, m2, ow1, hw2, hw3;
 282     rvec                    roh2, roh3, rhh, dc, fc;
 283
 284     calcvir_atom_end *= DIM;
 285
 286     if (econq == ConstraintVariable::Force)
 287     {
 288         p = &settled.parametersAllMasses1();
 289     }
 290     else
 291     {
 292         p = &settled.parametersMassWeighted();
 293     }
 294     imO = p->imO;
 295     imH = p->imH;
 296     copy_mat(p->invmat, invmat);
 297     dOH    = p->dOH;
 298     dHH    = p->dHH;
 299     invdOH = p->invdOH;
 300     invdHH = p->invdHH;
 301
 302     const int nral1 = 1 + NRAL(F_SETTLE);
 303
 304     for (i = 0; i < nsettle; i++)
 305     {
 306         ow1 = iatoms[i * nral1 + 1];
 307         hw2 = iatoms[i * nral1 + 2];
 308         hw3 = iatoms[i * nral1 + 3];
 309
 310         if (pbc == nullptr)
 311         {
 312             rvec_sub(x[ow1], x[hw2], roh2);
 313             rvec_sub(x[ow1], x[hw3], roh3);
 314             rvec_sub(x[hw2], x[hw3], rhh);
 315         }
 316         else
 317         {
 318             pbc_dx_aiuc(pbc, x[ow1], x[hw2], roh2);
 319             pbc_dx_aiuc(pbc, x[ow1], x[hw3], roh3);
 320             pbc_dx_aiuc(pbc, x[hw2], x[hw3], rhh);
 321         }
 322         svmul(invdOH, roh2, roh2);
 323         svmul(invdOH, roh3, roh3);
 324         svmul(invdHH, rhh, rhh);
 325         /* 18 flops */
 326
 327         /* Determine the projections of der on the bonds */
 328         clear_rvec(dc);
 329         for (m = 0; m < DIM; m++)
 330         {
 331             dc[0] += (der[ow1][m] - der[hw2][m]) * roh2[m];
 332             dc[1] += (der[ow1][m] - der[hw3][m]) * roh3[m];
 333             dc[2] += (der[hw2][m] - der[hw3][m]) * rhh[m];
 334         }
 335         /* 27 flops */
 336
 337         /* Determine the correction for the three bonds */
 338         mvmul(invmat, dc, fc);
 339         /* 15 flops */
 340
 341         /* Subtract the corrections from derp */
 342         for (m = 0; m < DIM; m++)
 343         {
 344             derp[ow1][m] -= imO * (fc[0] * roh2[m] + fc[1] * roh3[m]);
 345             derp[hw2][m] -= imH * (-fc[0] * roh2[m] + fc[2] * rhh[m]);
 346             derp[hw3][m] -= imH * (-fc[1] * roh3[m] - fc[2] * rhh[m]);
 347         }
 348
 349         /* 45 flops */
 350
 351         if (ow1 < calcvir_atom_end)
 352         {
 353             /* Determining r \dot m der is easy,
 354              * since fc contains the mass weighted corrections for der.
 355              */
 356
 357             for (m = 0; m < DIM; m++)
 358             {
 359                 for (m2 = 0; m2 < DIM; m2++)
 360                 {
 361                     vir_r_m_dder[m][m2] += dOH * roh2[m] * roh2[m2] * fc[0]
 362                                            + dOH * roh3[m] * roh3[m2] * fc[1]
 363                                            + dHH * rhh[m] * rhh[m2] * fc[2];
 364                 }
 365             }
 366         }
 367     }
 368 }
 369
 370
 371 /*! \brief The actual settle code, templated for real/SimdReal and for optimization */
 372 template<typename T, typename TypeBool, int packSize, typename TypePbc, bool bCorrectVelocity, bool bCalcVirial>
 373 static void settleTemplate(const SettleData& settled,
 374                            int               settleStart,
 375                            int               settleEnd,
 376                            const TypePbc     pbc,
 377                            const real*       x,
 378                            real*             xprime,
 379                            real              invdt,
 380                            real* gmx_restrict v,
 381                            tensor             vir_r_m_dr,
 382                            bool*              bErrorHasOccurred)
 383 {
 384     /* ******************************************************************* */
 385     /*                                                                  ** */
 386     /*    Original code by Shuichi Miyamoto, last update Oct. 1, 1992   ** */
 387     /*                                                                  ** */
 388     /*    Algorithm changes by Berk Hess:                               ** */
 389     /*    2004-07-15 Convert COM to double precision to avoid drift     ** */
 390     /*    2006-10-16 Changed velocity update to use differences         ** */
 391     /*    2012-09-24 Use oxygen as reference instead of COM             ** */
 392     /*    2016-02    Complete rewrite of the code for SIMD              ** */
 393     /*    2020-06    Completely remove use of COM to minimize drift     ** */
 394     /*                                                                  ** */
 395     /*    Reference for the SETTLE algorithm                            ** */
 396     /*           S. Miyamoto et al., J. Comp. Chem., 13, 952 (1992).    ** */
 397     /*                                                                  ** */
 398     /* ******************************************************************* */
 399
 400     assert(settleStart % packSize == 0);
 401     assert(settleEnd % packSize == 0);
 402
 403     TypeBool bError = TypeBool(false);
 404
 405     const SettleParameters* p    = &settled.parametersMassWeighted();
 406     T                       wh   = T(p->wh);
 407     T                       rc   = T(p->rc);
 408     T                       ra   = T(p->ra);
 409     T                       rb   = T(p->rb);
 410     T                       irc2 = T(p->irc2);
 411     T                       mO   = T(p->mO);
 412     T                       mH   = T(p->mH);
 413
 414     T almost_zero = T(1e-12);
 415
 416     T sum_r_m_dr[DIM][DIM];
 417
 418     if (bCalcVirial)
 419     {
 420         for (int d2 = 0; d2 < DIM; d2++)
 421         {
 422             for (int d = 0; d < DIM; d++)
 423             {
 424                 sum_r_m_dr[d2][d] = T(0);
 425             }
 426         }
 427     }
 428
 429     for (int i = settleStart; i < settleEnd; i += packSize)
 430     {
 431         /* Here we pad up to packSize with copies from the last valid entry.
 432          * This gives correct results, since we store (not increment) all
 433          * output, so we store the same output multiple times.
 434          */
 435         const int* ow1 = settled.ow1() + i;
 436         const int* hw2 = settled.hw2() + i;
 437         const int* hw3 = settled.hw3() + i;
 438
 439         T x_ow1[DIM], x_hw2[DIM], x_hw3[DIM];
 440
 441         gatherLoadUTranspose<3>(x, ow1, &x_ow1[XX], &x_ow1[YY], &x_ow1[ZZ]);
 442         gatherLoadUTranspose<3>(x, hw2, &x_hw2[XX], &x_hw2[YY], &x_hw2[ZZ]);
 443         gatherLoadUTranspose<3>(x, hw3, &x_hw3[XX], &x_hw3[YY], &x_hw3[ZZ]);
 444
 445         T xprime_ow1[DIM], xprime_hw2[DIM], xprime_hw3[DIM];
 446
 447         gatherLoadUTranspose<3>(xprime, ow1, &xprime_ow1[XX], &xprime_ow1[YY], &xprime_ow1[ZZ]);
 448         gatherLoadUTranspose<3>(xprime, hw2, &xprime_hw2[XX], &xprime_hw2[YY], &xprime_hw2[ZZ]);
 449         gatherLoadUTranspose<3>(xprime, hw3, &xprime_hw3[XX], &xprime_hw3[YY], &xprime_hw3[ZZ]);
 450
 451         T dist21[DIM], dist31[DIM];
 452         T doh2[DIM], doh3[DIM];
 453
 454         pbc_dx_aiuc(pbc, x_hw2, x_ow1, dist21);
 455
 456         pbc_dx_aiuc(pbc, x_hw3, x_ow1, dist31);
 457
 458         pbc_dx_aiuc(pbc, xprime_hw2, xprime_ow1, doh2);
 459
 460         pbc_dx_aiuc(pbc, xprime_hw3, xprime_ow1, doh3);
 461         /* 4 * 18 flops (would be 4 * 3 without PBC) */
 462
 463         /* Note that we completely avoid computing the center of mass and
 464          * only use distances. This minimizes energy drift and also makes
 465          * the computation slightly cheaper.
 466          * Straightforward computation of the COM, as in the original algorithm,
 467          * makes SETTLE the largest source of energy drift for simulations of water,
 468          * as then the oxygen coordinate is multiplied by 0.89 at every step,
 469          * which can then transfer a systematic rounding to the oxygen velocity.
 470          * For some time we computed the COM using offsets from the oxygen, this
 471          * significantly reduces the energy drift, but not using the COM at all,
 472          * as we do now, is optimal.
 473          */
 474         T a1[DIM];
 475         for (int d = 0; d < DIM; d++)
 476         {
 477             a1[d] = -(doh2[d] + doh3[d]) * wh;
 478         }
 479         T b1[DIM];
 480         for (int d = 0; d < DIM; d++)
 481         {
 482             b1[d] = doh2[d] + a1[d];
 483         }
 484         T c1[DIM];
 485         for (int d = 0; d < DIM; d++)
 486         {
 487             c1[d] = doh3[d] + a1[d];
 488         }
 489         /* 12 flops */
 490
 491         T xakszd = dist21[YY] * dist31[ZZ] - dist21[ZZ] * dist31[YY];
 492         T yakszd = dist21[ZZ] * dist31[XX] - dist21[XX] * dist31[ZZ];
 493         T zakszd = dist21[XX] * dist31[YY] - dist21[YY] * dist31[XX];
 494         T xaksxd = a1[YY] * zakszd - a1[ZZ] * yakszd;
 495         T yaksxd = a1[ZZ] * xakszd - a1[XX] * zakszd;
 496         T zaksxd = a1[XX] * yakszd - a1[YY] * xakszd;
 497         T xaksyd = yakszd * zaksxd - zakszd * yaksxd;
 498         T yaksyd = zakszd * xaksxd - xakszd * zaksxd;
 499         T zaksyd = xakszd * yaksxd - yakszd * xaksxd;
 500         /* 27 flops */
 501
 502         T axlng = gmx::invsqrt(xaksxd * xaksxd + yaksxd * yaksxd + zaksxd * zaksxd);
 503         T aylng = gmx::invsqrt(xaksyd * xaksyd + yaksyd * yaksyd + zaksyd * zaksyd);
 504         T azlng = gmx::invsqrt(xakszd * xakszd + yakszd * yakszd + zakszd * zakszd);
 505
 506         T trns1[DIM], trns2[DIM], trns3[DIM];
 507
 508         trns1[XX] = xaksxd * axlng;
 509         trns2[XX] = yaksxd * axlng;
 510         trns3[XX] = zaksxd * axlng;
 511         trns1[YY] = xaksyd * aylng;
 512         trns2[YY] = yaksyd * aylng;
 513         trns3[YY] = zaksyd * aylng;
 514         trns1[ZZ] = xakszd * azlng;
 515         trns2[ZZ] = yakszd * azlng;
 516         trns3[ZZ] = zakszd * azlng;
 517         /* 24 flops */
 518
 519         T b0d[2], c0d[2];
 520
 521         for (int d = 0; d < 2; d++)
 522         {
 523             b0d[d] = trns1[d] * dist21[XX] + trns2[d] * dist21[YY] + trns3[d] * dist21[ZZ];
 524             c0d[d] = trns1[d] * dist31[XX] + trns2[d] * dist31[YY] + trns3[d] * dist31[ZZ];
 525         }
 526
 527         T a1d_z, b1d[DIM], c1d[DIM];
 528
 529         a1d_z = trns1[ZZ] * a1[XX] + trns2[ZZ] * a1[YY] + trns3[ZZ] * a1[ZZ];
 530         for (int d = 0; d < DIM; d++)
 531         {
 532             b1d[d] = trns1[d] * b1[XX] + trns2[d] * b1[YY] + trns3[d] * b1[ZZ];
 533             c1d[d] = trns1[d] * c1[XX] + trns2[d] * c1[YY] + trns3[d] * c1[ZZ];
 534         }
 535         /* 65 flops */
 536
 537         T tmp, tmp2;
 538
 539         T sinphi = a1d_z * gmx::invsqrt(ra * ra);
 540         tmp2     = 1.0 - sinphi * sinphi;
 541
 542         /* If tmp2 gets close to or beyond zero we have severly distorted
 543          * water molecules and we should terminate the simulation.
 544          * Below we take the max with almost_zero to continue the loop.
 545          */
 546         bError = bError || (tmp2 <= almost_zero);
 547
 548         tmp2     = max(tmp2, almost_zero);
 549         tmp      = gmx::invsqrt(tmp2);
 550         T cosphi = tmp2 * tmp;
 551         T sinpsi = (b1d[ZZ] - c1d[ZZ]) * irc2 * tmp;
 552         tmp2     = 1.0 - sinpsi * sinpsi;
 553
 554         T cospsi = tmp2 * gmx::invsqrt(tmp2);
 555         /* 46 flops */
 556
 557         T a2d_y = ra * cosphi;
 558         T b2d_x = -rc * cospsi;
 559         T t1    = -rb * cosphi;
 560         T t2    = rc * sinpsi * sinphi;
 561         T b2d_y = t1 - t2;
 562         T c2d_y = t1 + t2;
 563         /* 7 flops */
 564
 565         /*     --- Step3  al,be,ga            --- */
 566         T alpha  = b2d_x * (b0d[XX] - c0d[XX]) + b0d[YY] * b2d_y + c0d[YY] * c2d_y;
 567         T beta   = b2d_x * (c0d[YY] - b0d[YY]) + b0d[XX] * b2d_y + c0d[XX] * c2d_y;
 568         T gamma  = b0d[XX] * b1d[YY] - b1d[XX] * b0d[YY] + c0d[XX] * c1d[YY] - c1d[XX] * c0d[YY];
 569         T al2be2 = alpha * alpha + beta * beta;
 570         tmp2     = (al2be2 - gamma * gamma);
 571         T sinthe = (alpha * gamma - beta * tmp2 * gmx::invsqrt(tmp2)) * gmx::invsqrt(al2be2 * al2be2);
 572         /* 47 flops */
 573
 574         /*  --- Step4  A3' --- */
 575         tmp2     = 1.0 - sinthe * sinthe;
 576         T costhe = tmp2 * gmx::invsqrt(tmp2);
 577
 578         T a3d[DIM], b3d[DIM], c3d[DIM];
 579
 580         a3d[XX] = -a2d_y * sinthe;
 581         a3d[YY] = a2d_y * costhe;
 582         a3d[ZZ] = a1d_z;
 583         b3d[XX] = b2d_x * costhe - b2d_y * sinthe;
 584         b3d[YY] = b2d_x * sinthe + b2d_y * costhe;
 585         b3d[ZZ] = b1d[ZZ];
 586         c3d[XX] = -b2d_x * costhe - c2d_y * sinthe;
 587         c3d[YY] = -b2d_x * sinthe + c2d_y * costhe;
 588         c3d[ZZ] = c1d[ZZ];
 589         /* 26 flops */
 590
 591         /*    --- Step5  A3 --- */
 592         T a3[DIM], b3[DIM], c3[DIM];
 593
 594         a3[XX] = trns1[XX] * a3d[XX] + trns1[YY] * a3d[YY] + trns1[ZZ] * a3d[ZZ];
 595         a3[YY] = trns2[XX] * a3d[XX] + trns2[YY] * a3d[YY] + trns2[ZZ] * a3d[ZZ];
 596         a3[ZZ] = trns3[XX] * a3d[XX] + trns3[YY] * a3d[YY] + trns3[ZZ] * a3d[ZZ];
 597         b3[XX] = trns1[XX] * b3d[XX] + trns1[YY] * b3d[YY] + trns1[ZZ] * b3d[ZZ];
 598         b3[YY] = trns2[XX] * b3d[XX] + trns2[YY] * b3d[YY] + trns2[ZZ] * b3d[ZZ];
 599         b3[ZZ] = trns3[XX] * b3d[XX] + trns3[YY] * b3d[YY] + trns3[ZZ] * b3d[ZZ];
 600         c3[XX] = trns1[XX] * c3d[XX] + trns1[YY] * c3d[YY] + trns1[ZZ] * c3d[ZZ];
 601         c3[YY] = trns2[XX] * c3d[XX] + trns2[YY] * c3d[YY] + trns2[ZZ] * c3d[ZZ];
 602         c3[ZZ] = trns3[XX] * c3d[XX] + trns3[YY] * c3d[YY] + trns3[ZZ] * c3d[ZZ];
 603         /* 45 flops */
 604
 605         /* Compute and store the corrected new coordinate */
 606         T dxOw1[DIM];
 607         for (int d = 0; d < DIM; d++)
 608         {
 609             dxOw1[d]      = a3[d] - a1[d];
 610             xprime_ow1[d] = xprime_ow1[d] + dxOw1[d];
 611         }
 612         T dxHw2[DIM];
 613         for (int d = 0; d < DIM; d++)
 614         {
 615             dxHw2[d]      = b3[d] - b1[d];
 616             xprime_hw2[d] = xprime_hw2[d] + dxHw2[d];
 617         }
 618         T dxHw3[DIM];
 619         for (int d = 0; d < DIM; d++)
 620         {
 621             dxHw3[d]      = c3[d] - c1[d];
 622             xprime_hw3[d] = xprime_hw3[d] + dxHw3[d];
 623         }
 624         /* 9 + 9 flops */
 625
 626         transposeScatterStoreU<3>(xprime, ow1, xprime_ow1[XX], xprime_ow1[YY], xprime_ow1[ZZ]);
 627         transposeScatterStoreU<3>(xprime, hw2, xprime_hw2[XX], xprime_hw2[YY], xprime_hw2[ZZ]);
 628         transposeScatterStoreU<3>(xprime, hw3, xprime_hw3[XX], xprime_hw3[YY], xprime_hw3[ZZ]);
 629
 630         if (bCorrectVelocity)
 631         {
 632             T v_ow1[DIM], v_hw2[DIM], v_hw3[DIM];
 633
 634             gatherLoadUTranspose<3>(v, ow1, &v_ow1[XX], &v_ow1[YY], &v_ow1[ZZ]);
 635             gatherLoadUTranspose<3>(v, hw2, &v_hw2[XX], &v_hw2[YY], &v_hw2[ZZ]);
 636             gatherLoadUTranspose<3>(v, hw3, &v_hw3[XX], &v_hw3[YY], &v_hw3[ZZ]);
 637
 638             /* Add the position correction divided by dt to the velocity */
 639             for (int d = 0; d < DIM; d++)
 640             {
 641                 v_ow1[d] = gmx::fma(dxOw1[d], invdt, v_ow1[d]);
 642             }
 643             for (int d = 0; d < DIM; d++)
 644             {
 645                 v_hw2[d] = gmx::fma(dxHw2[d], invdt, v_hw2[d]);
 646             }
 647             for (int d = 0; d < DIM; d++)
 648             {
 649                 v_hw3[d] = gmx::fma(dxHw3[d], invdt, v_hw3[d]);
 650             }
 651             /* 3*6 flops */
 652
 653             transposeScatterStoreU<3>(v, ow1, v_ow1[XX], v_ow1[YY], v_ow1[ZZ]);
 654             transposeScatterStoreU<3>(v, hw2, v_hw2[XX], v_hw2[YY], v_hw2[ZZ]);
 655             transposeScatterStoreU<3>(v, hw3, v_hw3[XX], v_hw3[YY], v_hw3[ZZ]);
 656         }
 657
 658         if (bCalcVirial)
 659         {
 660             /* Filter out the non-local settles */
 661             T filter = load<T>(settled.virfac() + i);
 662             T mOf    = filter * mO;
 663             T mHf    = filter * mH;
 664
 665             T mdo[DIM], mdb[DIM], mdc[DIM];
 666
 667             for (int d = 0; d < DIM; d++)
 668             {
 669                 mdb[d] = mHf * dxHw2[d];
 670                 mdc[d] = mHf * dxHw3[d];
 671                 mdo[d] = mOf * dxOw1[d] + mdb[d] + mdc[d];
 672             }
 673
 674             for (int d2 = 0; d2 < DIM; d2++)
 675             {
 676                 for (int d = 0; d < DIM; d++)
 677                 {
 678                     sum_r_m_dr[d2][d] =
 679                             sum_r_m_dr[d2][d]
 680                             - (x_ow1[d2] * mdo[d] + dist21[d2] * mdb[d] + dist31[d2] * mdc[d]);
 681                 }
 682             }
 683             /* 71 flops */
 684         }
 685     }
 686
 687     if (bCalcVirial)
 688     {
 689         for (int d2 = 0; d2 < DIM; d2++)
 690         {
 691             for (int d = 0; d < DIM; d++)
 692             {
 693                 vir_r_m_dr[d2][d] += reduce(sum_r_m_dr[d2][d]);
 694             }
 695         }
 696     }
 697
 698     *bErrorHasOccurred = anyTrue(bError);
 699 }
 700
 701 /*! \brief Wrapper template function that divides the settles over threads
 702  * and instantiates the core template with instantiated booleans.
 703  */
 704 template<typename T, typename TypeBool, int packSize, typename TypePbc>
 705 static void settleTemplateWrapper(const SettleData& settled,
 706                                   int               nthread,
 707                                   int               thread,
 708                                   TypePbc           pbc,
 709                                   const real        x[],
 710                                   real              xprime[],
 711                                   real              invdt,
 712                                   real*             v,
 713                                   bool              bCalcVirial,
 714                                   tensor            vir_r_m_dr,
 715                                   bool*             bErrorHasOccurred)
 716 {
 717     /* We need to assign settles to threads in groups of pack_size */
 718     int numSettlePacks = (settled.numSettles() + packSize - 1) / packSize;
 719     /* Round the end value up to give thread 0 more work */
 720     int settleStart = ((numSettlePacks * thread + nthread - 1) / nthread) * packSize;
 721     int settleEnd   = ((numSettlePacks * (thread + 1) + nthread - 1) / nthread) * packSize;
 722
 723     if (v != nullptr)
 724     {
 725         if (!bCalcVirial)
 726         {
 727             settleTemplate<T, TypeBool, packSize, TypePbc, true, false>(
 728                     settled, settleStart, settleEnd, pbc, x, xprime, invdt, v, nullptr, bErrorHasOccurred);
 729         }
 730         else
 731         {
 732             settleTemplate<T, TypeBool, packSize, TypePbc, true, true>(
 733                     settled, settleStart, settleEnd, pbc, x, xprime, invdt, v, vir_r_m_dr, bErrorHasOccurred);
 734         }
 735     }
 736     else
 737     {
 738         if (!bCalcVirial)
 739         {
 740             settleTemplate<T, TypeBool, packSize, TypePbc, false, false>(
 741                     settled, settleStart, settleEnd, pbc, x, xprime, invdt, v, nullptr, bErrorHasOccurred);
 742         }
 743         else
 744         {
 745             settleTemplate<T, TypeBool, packSize, TypePbc, false, true>(
 746                     settled, settleStart, settleEnd, pbc, x, xprime, invdt, v, vir_r_m_dr, bErrorHasOccurred);
 747         }
 748     }
 749 }
 750
 751 void csettle(const SettleData&               settled,
 752              int                             nthread,
 753              int                             thread,
 754              const t_pbc*                    pbc,
 755              ArrayRefWithPadding<const RVec> x,
 756              ArrayRefWithPadding<RVec>       xprime,
 757              real                            invdt,
 758              ArrayRefWithPadding<RVec>       v,
 759              bool                            bCalcVirial,
 760              tensor                          vir_r_m_dr,
 761              bool*                           bErrorHasOccurred)
 762 {
 763     const real* xPtr      = as_rvec_array(x.paddedArrayRef().data())[0];
 764     real*       xprimePtr = as_rvec_array(xprime.paddedArrayRef().data())[0];
 765     real*       vPtr      = as_rvec_array(v.paddedArrayRef().data())[0];
 766
 767 #if GMX_SIMD_HAVE_REAL
 768     if (settled.useSimd())
 769     {
 770         /* Convert the pbc struct for SIMD */
 771         alignas(GMX_SIMD_ALIGNMENT) real pbcSimd[9 * GMX_SIMD_REAL_WIDTH];
 772         set_pbc_simd(pbc, pbcSimd);
 773
 774         settleTemplateWrapper<SimdReal, SimdBool, GMX_SIMD_REAL_WIDTH, const real*>(
 775                 settled, nthread, thread, pbcSimd, xPtr, xprimePtr, invdt, vPtr, bCalcVirial, vir_r_m_dr, bErrorHasOccurred);
 776     }
 777     else
 778 #endif
 779     {
 780         /* This construct is needed because pbc_dx_aiuc doesn't accept pbc=NULL */
 781         t_pbc        pbcNo;
 782         const t_pbc* pbcNonNull;
 783
 784         if (pbc != nullptr)
 785         {
 786             pbcNonNull = pbc;
 787         }
 788         else
 789         {
 790             set_pbc(&pbcNo, PbcType::No, nullptr);
 791             pbcNonNull = &pbcNo;
 792         }
 793
 794         settleTemplateWrapper<real, bool, 1, const t_pbc*>(
 795                 settled, nthread, thread, pbcNonNull, &xPtr[0], &xprimePtr[0], invdt, &vPtr[0], bCalcVirial, vir_r_m_dr, bErrorHasOccurred);
 796     }
 797 }
 798
 799 } // namespace gmx