src/gromacs/mdlib/settle.cpp

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
   5  * Copyright (c) 2001-2004, The GROMACS development team.
   6  * Copyright (c) 2013,2014,2015,2016,2017,2018, by the GROMACS development team, led by
   7  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   8  * and including many others, as listed in the AUTHORS file in the
   9  * top-level source directory and at http://www.gromacs.org.
  10  *
  11  * GROMACS is free software; you can redistribute it and/or
  12  * modify it under the terms of the GNU Lesser General Public License
  13  * as published by the Free Software Foundation; either version 2.1
  14  * of the License, or (at your option) any later version.
  15  *
  16  * GROMACS is distributed in the hope that it will be useful,
  17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  19  * Lesser General Public License for more details.
  20  *
  21  * You should have received a copy of the GNU Lesser General Public
  22  * License along with GROMACS; if not, see
  23  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  24  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  25  *
  26  * If you want to redistribute modifications to GROMACS, please
  27  * consider that scientific software is very special. Version
  28  * control is crucial - bugs must be traceable. We will be happy to
  29  * consider code for inclusion in the official distribution, but
  30  * derived work must not be called official GROMACS. Details are found
  31  * in the README & COPYING files - if they are missing, get the
  32  * official version at http://www.gromacs.org.
  33  *
  34  * To help us fund GROMACS development, we humbly ask that you cite
  35  * the research papers on the package. Check out http://www.gromacs.org.
  36  */
  37 /*! \internal \file
  38  * \brief Defines SETTLE code.
  39  *
  40  * \author Berk Hess <hess@kth.se>
  41  * \author Mark Abraham <mark.j.abraham@gmail.com>
  42  * \ingroup module_mdlib
  43  */
  44 #include "gmxpre.h"
  45
  46 #include "settle.h"
  47
  48 #include <assert.h>
  49 #include <stdio.h>
  50
  51 #include <cmath>
  52
  53 #include <algorithm>
  54
  55 #include "gromacs/math/functions.h"
  56 #include "gromacs/math/invertmatrix.h"
  57 #include "gromacs/math/vec.h"
  58 #include "gromacs/mdlib/constr.h"
  59 #include "gromacs/mdtypes/mdatom.h"
  60 #include "gromacs/pbcutil/ishift.h"
  61 #include "gromacs/pbcutil/pbc.h"
  62 #include "gromacs/pbcutil/pbc-simd.h"
  63 #include "gromacs/simd/simd.h"
  64 #include "gromacs/simd/simd_math.h"
  65 #include "gromacs/topology/idef.h"
  66 #include "gromacs/topology/ifunc.h"
  67 #include "gromacs/topology/mtop_util.h"
  68 #include "gromacs/utility/fatalerror.h"
  69 #include "gromacs/utility/gmxassert.h"
  70 #include "gromacs/utility/smalloc.h"
  71
  72 namespace gmx
  73 {
  74
  75 struct settleparam_t
  76 {
  77     real   mO;
  78     real   mH;
  79     real   wh;
  80     real   dOH;
  81     real   dHH;
  82     real   ra;
  83     real   rb;
  84     real   rc;
  85     real   irc2;
  86     /* For projection */
  87     real   imO;
  88     real   imH;
  89     real   invdOH;
  90     real   invdHH;
  91     matrix invmat;
  92 };
  93
  94 struct settledata
  95 {
  96     settleparam_t massw;    /* Parameters for SETTLE for coordinates */
  97     settleparam_t mass1;    /* Parameters with all masses 1, for forces */
  98
  99     int           nsettle;  /* The number of settles on our rank */
 100     int          *ow1;      /* Index to OW1 atoms, size nsettle + SIMD padding */
 101     int          *hw2;      /* Index to HW2 atoms, size nsettle + SIMD padding */
 102     int          *hw3;      /* Index to HW3 atoms, size nsettle + SIMD padding */
 103     real         *virfac;   /* Virial factor 0 or 1, size nsettle + SIMD pad. */
 104     int           nalloc;   /* Allocation size of ow1, hw2, hw3, virfac */
 105
 106     bool          bUseSimd; /* Use SIMD intrinsics code, if possible */
 107 };
 108
 109
 110 //! Initializes a projection matrix.
 111 static void init_proj_matrix(real invmO, real invmH, real dOH, real dHH,
 112                              matrix inverseCouplingMatrix)
 113 {
 114     /* We normalize the inverse masses with invmO for the matrix inversion.
 115      * so we can keep using masses of almost zero for frozen particles,
 116      * without running out of the float range in invertMatrix.
 117      */
 118     double invmORelative = 1.0;
 119     double invmHRelative = invmH/static_cast<double>(invmO);
 120     double distanceRatio = dHH/static_cast<double>(dOH);
 121
 122     /* Construct the constraint coupling matrix */
 123     matrix mat;
 124     mat[0][0] = invmORelative + invmHRelative;
 125     mat[0][1] = invmORelative*(1.0 - 0.5*gmx::square(distanceRatio));
 126     mat[0][2] = invmHRelative*0.5*distanceRatio;
 127     mat[1][1] = mat[0][0];
 128     mat[1][2] = mat[0][2];
 129     mat[2][2] = invmHRelative + invmHRelative;
 130     mat[1][0] = mat[0][1];
 131     mat[2][0] = mat[0][2];
 132     mat[2][1] = mat[1][2];
 133
 134     invertMatrix(mat, inverseCouplingMatrix);
 135
 136     msmul(inverseCouplingMatrix, 1/invmO, inverseCouplingMatrix);
 137 }
 138
 139 //! Initializes settle parameters.
 140 static void settleparam_init(settleparam_t *p,
 141                              real mO, real mH, real invmO, real invmH,
 142                              real dOH, real dHH)
 143 {
 144     /* We calculate parameters in double precision to minimize errors.
 145      * The velocity correction applied during SETTLE coordinate constraining
 146      * introduces a systematic error of approximately 1 bit per atom,
 147      * depending on what the compiler does with the code.
 148      */
 149     double wohh;
 150
 151     p->mO     = mO;
 152     p->mH     = mH;
 153     wohh      = mO + 2.0*mH;
 154     p->wh     = mH/wohh;
 155     p->dOH    = dOH;
 156     p->dHH    = dHH;
 157     double rc = dHH/2.0;
 158     double ra = 2.0*mH*std::sqrt(dOH*dOH - rc*rc)/wohh;
 159     p->rb     = std::sqrt(dOH*dOH - rc*rc) - ra;
 160     p->rc     = rc;
 161     p->ra     = ra;
 162     p->irc2   = 1.0/dHH;
 163
 164     /* For projection: inverse masses and coupling matrix inversion */
 165     p->imO    = invmO;
 166     p->imH    = invmH;
 167
 168     p->invdOH = 1.0/dOH;
 169     p->invdHH = 1.0/dHH;
 170
 171     init_proj_matrix(invmO, invmH, dOH, dHH, p->invmat);
 172
 173     if (debug)
 174     {
 175         fprintf(debug, "wh =%g, rc = %g, ra = %g\n",
 176                 p->wh, p->rc, p->ra);
 177         fprintf(debug, "rb = %g, irc2 = %g, dHH = %g, dOH = %g\n",
 178                 p->rb, p->irc2, p->dHH, p->dOH);
 179     }
 180 }
 181
 182 settledata *settle_init(const gmx_mtop_t &mtop)
 183 {
 184     /* Check that we have only one settle type */
 185     int                   settle_type = -1;
 186     gmx_mtop_ilistloop_t  iloop       = gmx_mtop_ilistloop_init(mtop);
 187     const t_ilist        *ilist;
 188     int                   nmol;
 189     const int             nral1       = 1 + NRAL(F_SETTLE);
 190     while (gmx_mtop_ilistloop_next(iloop, &ilist, &nmol))
 191     {
 192         for (int i = 0; i < ilist[F_SETTLE].nr; i += nral1)
 193         {
 194             if (settle_type == -1)
 195             {
 196                 settle_type = ilist[F_SETTLE].iatoms[i];
 197             }
 198             else if (ilist[F_SETTLE].iatoms[i] != settle_type)
 199             {
 200                 gmx_fatal(FARGS,
 201                           "The [molecules] section of your topology specifies more than one block of\n"
 202                           "a [moleculetype] with a [settles] block. Only one such is allowed.\n"
 203                           "If you are trying to partition your solvent into different *groups*\n"
 204                           "(e.g. for freezing, T-coupling, etc.), you are using the wrong approach. Index\n"
 205                           "files specify groups. Otherwise, you may wish to change the least-used\n"
 206                           "block of molecules with SETTLE constraints into 3 normal constraints.");
 207             }
 208         }
 209     }
 210     GMX_RELEASE_ASSERT(settle_type >= 0, "settle_init called without settles");
 211
 212     settledata *settled;
 213
 214     snew(settled, 1);
 215
 216     /* We will not initialize the normal SETTLE parameters here yet,
 217      * since the atom (inv)masses can depend on the integrator and
 218      * free-energy perturbation. We set mO=-1 to trigger later initialization.
 219      */
 220     settled->massw.mO = -1;
 221
 222     real dOH = mtop.ffparams.iparams[settle_type].settle.doh;
 223     real dHH = mtop.ffparams.iparams[settle_type].settle.dhh;
 224     settleparam_init(&settled->mass1, 1.0, 1.0, 1.0, 1.0, dOH, dHH);
 225
 226     settled->ow1    = nullptr;
 227     settled->hw2    = nullptr;
 228     settled->hw3    = nullptr;
 229     settled->virfac = nullptr;
 230     settled->nalloc = 0;
 231
 232     /* Without SIMD configured, this bool is not used */
 233     settled->bUseSimd = (getenv("GMX_DISABLE_SIMD_KERNELS") == nullptr);
 234
 235     return settled;
 236 }
 237
 238 void settle_free(settledata *settled)
 239 {
 240     sfree_aligned(settled->ow1);
 241     sfree_aligned(settled->hw2);
 242     sfree_aligned(settled->hw3);
 243     sfree_aligned(settled->virfac);
 244     sfree(settled);
 245 }
 246
 247 void settle_set_constraints(settledata       *settled,
 248                             const t_ilist    *il_settle,
 249                             const t_mdatoms  &mdatoms)
 250 {
 251 #if GMX_SIMD_HAVE_REAL
 252     const int pack_size = GMX_SIMD_REAL_WIDTH;
 253 #else
 254     const int pack_size = 1;
 255 #endif
 256
 257     const int nral1     = 1 + NRAL(F_SETTLE);
 258     int       nsettle   = il_settle->nr/nral1;
 259     settled->nsettle = nsettle;
 260
 261     if (nsettle > 0)
 262     {
 263         const t_iatom *iatoms = il_settle->iatoms;
 264
 265         /* Here we initialize the normal SETTLE parameters */
 266         if (settled->massw.mO < 0)
 267         {
 268             int firstO = iatoms[1];
 269             int firstH = iatoms[2];
 270             settleparam_init(&settled->massw,
 271                              mdatoms.massT[firstO],
 272                              mdatoms.massT[firstH],
 273                              mdatoms.invmass[firstO],
 274                              mdatoms.invmass[firstH],
 275                              settled->mass1.dOH,
 276                              settled->mass1.dHH);
 277         }
 278
 279         if (nsettle + pack_size > settled->nalloc)
 280         {
 281             settled->nalloc = over_alloc_dd(nsettle + pack_size);
 282             sfree_aligned(settled->ow1);
 283             sfree_aligned(settled->hw2);
 284             sfree_aligned(settled->hw3);
 285             sfree_aligned(settled->virfac);
 286             snew_aligned(settled->ow1, settled->nalloc, 64);
 287             snew_aligned(settled->hw2, settled->nalloc, 64);
 288             snew_aligned(settled->hw3, settled->nalloc, 64);
 289             snew_aligned(settled->virfac, settled->nalloc, 64);
 290         }
 291
 292         for (int i = 0; i < nsettle; i++)
 293         {
 294             settled->ow1[i]    = iatoms[i*nral1 + 1];
 295             settled->hw2[i]    = iatoms[i*nral1 + 2];
 296             settled->hw3[i]    = iatoms[i*nral1 + 3];
 297             /* We should avoid double counting of virial contributions for
 298              * SETTLEs that appear in multiple DD domains, so we only count
 299              * the contribution on the home range of the oxygen atom.
 300              */
 301             settled->virfac[i] = (iatoms[i*nral1 + 1] < mdatoms.homenr ? 1 : 0);
 302         }
 303
 304         /* Pack the index array to the full SIMD width with copies from
 305          * the last normal entry, but with no virial contribution.
 306          */
 307         int end_packed = ((nsettle + pack_size - 1)/pack_size)*pack_size;
 308         for (int i = nsettle; i < end_packed; i++)
 309         {
 310             settled->ow1[i]    = settled->ow1[nsettle - 1];
 311             settled->hw2[i]    = settled->hw2[nsettle - 1];
 312             settled->hw3[i]    = settled->hw3[nsettle - 1];
 313             settled->virfac[i] = 0;
 314         }
 315     }
 316 }
 317
 318 void settle_proj(settledata *settled, ConstraintVariable econq,
 319                  int nsettle, t_iatom iatoms[],
 320                  const t_pbc *pbc,
 321                  const rvec x[],
 322                  rvec *der, rvec *derp,
 323                  int calcvir_atom_end, tensor vir_r_m_dder)
 324 {
 325     /* Settle for projection out constraint components
 326      * of derivatives of the coordinates.
 327      * Berk Hess 2008-1-10
 328      */
 329
 330     settleparam_t *p;
 331     real           imO, imH, dOH, dHH, invdOH, invdHH;
 332     matrix         invmat;
 333     int            i, m, m2, ow1, hw2, hw3;
 334     rvec           roh2, roh3, rhh, dc, fc;
 335
 336     calcvir_atom_end *= DIM;
 337
 338     if (econq == ConstraintVariable::Force)
 339     {
 340         p = &settled->mass1;
 341     }
 342     else
 343     {
 344         p = &settled->massw;
 345     }
 346     imO    = p->imO;
 347     imH    = p->imH;
 348     copy_mat(p->invmat, invmat);
 349     dOH    = p->dOH;
 350     dHH    = p->dHH;
 351     invdOH = p->invdOH;
 352     invdHH = p->invdHH;
 353
 354     const int nral1 = 1 + NRAL(F_SETTLE);
 355
 356     for (i = 0; i < nsettle; i++)
 357     {
 358         ow1 = iatoms[i*nral1 + 1];
 359         hw2 = iatoms[i*nral1 + 2];
 360         hw3 = iatoms[i*nral1 + 3];
 361
 362         if (pbc == nullptr)
 363         {
 364             rvec_sub(x[ow1], x[hw2], roh2);
 365             rvec_sub(x[ow1], x[hw3], roh3);
 366             rvec_sub(x[hw2], x[hw3], rhh);
 367         }
 368         else
 369         {
 370             pbc_dx_aiuc(pbc, x[ow1], x[hw2], roh2);
 371             pbc_dx_aiuc(pbc, x[ow1], x[hw3], roh3);
 372             pbc_dx_aiuc(pbc, x[hw2], x[hw3], rhh);
 373         }
 374         svmul(invdOH, roh2, roh2);
 375         svmul(invdOH, roh3, roh3);
 376         svmul(invdHH, rhh, rhh);
 377         /* 18 flops */
 378
 379         /* Determine the projections of der on the bonds */
 380         clear_rvec(dc);
 381         for (m = 0; m < DIM; m++)
 382         {
 383             dc[0] += (der[ow1][m] - der[hw2][m])*roh2[m];
 384             dc[1] += (der[ow1][m] - der[hw3][m])*roh3[m];
 385             dc[2] += (der[hw2][m] - der[hw3][m])*rhh [m];
 386         }
 387         /* 27 flops */
 388
 389         /* Determine the correction for the three bonds */
 390         mvmul(invmat, dc, fc);
 391         /* 15 flops */
 392
 393         /* Subtract the corrections from derp */
 394         for (m = 0; m < DIM; m++)
 395         {
 396             derp[ow1][m] -= imO*( fc[0]*roh2[m] + fc[1]*roh3[m]);
 397             derp[hw2][m] -= imH*(-fc[0]*roh2[m] + fc[2]*rhh [m]);
 398             derp[hw3][m] -= imH*(-fc[1]*roh3[m] - fc[2]*rhh [m]);
 399         }
 400
 401         /* 45 flops */
 402
 403         if (ow1 < calcvir_atom_end)
 404         {
 405             /* Determining r \dot m der is easy,
 406              * since fc contains the mass weighted corrections for der.
 407              */
 408
 409             for (m = 0; m < DIM; m++)
 410             {
 411                 for (m2 = 0; m2 < DIM; m2++)
 412                 {
 413                     vir_r_m_dder[m][m2] +=
 414                         dOH*roh2[m]*roh2[m2]*fc[0] +
 415                         dOH*roh3[m]*roh3[m2]*fc[1] +
 416                         dHH*rhh [m]*rhh [m2]*fc[2];
 417                 }
 418             }
 419         }
 420     }
 421 }
 422
 423
 424 /*! \brief The actual settle code, templated for real/SimdReal and for optimization */
 425 template<typename T, typename TypeBool, int packSize,
 426          typename TypePbc,
 427          bool bCorrectVelocity,
 428          bool bCalcVirial>
 429 static void settleTemplate(const settledata *settled,
 430                            int settleStart, int settleEnd,
 431                            const TypePbc pbc,
 432                            const real *x, real *xprime,
 433                            real invdt, real * gmx_restrict v,
 434                            tensor vir_r_m_dr,
 435                            bool *bErrorHasOccurred)
 436 {
 437     /* ******************************************************************* */
 438     /*                                                                  ** */
 439     /*    Original code by Shuichi Miyamoto, last update Oct. 1, 1992   ** */
 440     /*                                                                  ** */
 441     /*    Algorithm changes by Berk Hess:                               ** */
 442     /*    2004-07-15 Convert COM to double precision to avoid drift     ** */
 443     /*    2006-10-16 Changed velocity update to use differences         ** */
 444     /*    2012-09-24 Use oxygen as reference instead of COM             ** */
 445     /*    2016-02    Complete rewrite of the code for SIMD              ** */
 446     /*                                                                  ** */
 447     /*    Reference for the SETTLE algorithm                            ** */
 448     /*           S. Miyamoto et al., J. Comp. Chem., 13, 952 (1992).    ** */
 449     /*                                                                  ** */
 450     /* ******************************************************************* */
 451
 452     assert(settleStart % packSize == 0);
 453     assert(settleEnd   % packSize == 0);
 454
 455     TypeBool             bError = TypeBool(false);
 456
 457     const settleparam_t *p    = &settled->massw;
 458     T                    wh   = T(p->wh);
 459     T                    rc   = T(p->rc);
 460     T                    ra   = T(p->ra);
 461     T                    rb   = T(p->rb);
 462     T                    irc2 = T(p->irc2);
 463     T                    mO   = T(p->mO);
 464     T                    mH   = T(p->mH);
 465
 466     T                    almost_zero = T(1e-12);
 467
 468     T                    sum_r_m_dr[DIM][DIM];
 469
 470     if (bCalcVirial)
 471     {
 472         for (int d2 = 0; d2 < DIM; d2++)
 473         {
 474             for (int d = 0; d < DIM; d++)
 475             {
 476                 sum_r_m_dr[d2][d] = T(0);
 477             }
 478         }
 479     }
 480
 481     for (int i = settleStart; i < settleEnd; i += packSize)
 482     {
 483         /* Here we pad up to packSize with copies from the last valid entry.
 484          * This gives correct results, since we store (not increment) all
 485          * output, so we store the same output multiple times.
 486          */
 487         const int *ow1 = settled->ow1 + i;
 488         const int *hw2 = settled->hw2 + i;
 489         const int *hw3 = settled->hw3 + i;
 490
 491         T          x_ow1[DIM], x_hw2[DIM], x_hw3[DIM];
 492
 493         gatherLoadUTranspose<3>(x, ow1, &x_ow1[XX], &x_ow1[YY], &x_ow1[ZZ]);
 494         gatherLoadUTranspose<3>(x, hw2, &x_hw2[XX], &x_hw2[YY], &x_hw2[ZZ]);
 495         gatherLoadUTranspose<3>(x, hw3, &x_hw3[XX], &x_hw3[YY], &x_hw3[ZZ]);
 496
 497         T xprime_ow1[DIM], xprime_hw2[DIM], xprime_hw3[DIM];
 498
 499         gatherLoadUTranspose<3>(xprime, ow1, &xprime_ow1[XX], &xprime_ow1[YY], &xprime_ow1[ZZ]);
 500         gatherLoadUTranspose<3>(xprime, hw2, &xprime_hw2[XX], &xprime_hw2[YY], &xprime_hw2[ZZ]);
 501         gatherLoadUTranspose<3>(xprime, hw3, &xprime_hw3[XX], &xprime_hw3[YY], &xprime_hw3[ZZ]);
 502
 503         T dist21[DIM], dist31[DIM];
 504         T doh2[DIM], doh3[DIM];
 505         T sh_hw2[DIM], sh_hw3[DIM];
 506
 507         pbc_dx_aiuc(pbc, x_hw2, x_ow1, dist21);
 508
 509         pbc_dx_aiuc(pbc, x_hw3, x_ow1, dist31);
 510
 511         /* Tedious way of doing pbc */
 512         pbc_dx_aiuc(pbc, xprime_hw2, xprime_ow1, doh2);
 513         for (int d = 0; d < DIM; d++)
 514         {
 515             sh_hw2[d]     = xprime_hw2[d] - (xprime_ow1[d] + doh2[d]);
 516             xprime_hw2[d] = xprime_hw2[d] - sh_hw2[d];
 517         }
 518         pbc_dx_aiuc(pbc, xprime_hw3, xprime_ow1, doh3);
 519         for (int d = 0; d < DIM; d++)
 520         {
 521             sh_hw3[d]     = xprime_hw3[d] - (xprime_ow1[d] + doh3[d]);
 522             xprime_hw3[d] = xprime_hw3[d] - sh_hw3[d];
 523         }
 524
 525         /* Not calculating the center of mass using the oxygen position
 526          * and the O-H distances, as done below, will make SETTLE
 527          * the largest source of energy drift for simulations of water,
 528          * as then the oxygen coordinate is multiplied by 0.89 at every step,
 529          * which can then transfer a systematic rounding to the oxygen velocity.
 530          */
 531         T a1[DIM], com[DIM];
 532         for (int d = 0; d < DIM; d++)
 533         {
 534             a1[d]  = -(doh2[d] + doh3[d]) * wh;
 535             com[d] = xprime_ow1[d] - a1[d];
 536         }
 537         T b1[DIM];
 538         for (int d = 0; d < DIM; d++)
 539         {
 540             b1[d]  = xprime_hw2[d] - com[d];
 541         }
 542         T c1[DIM];
 543         for (int d = 0; d < DIM; d++)
 544         {
 545             c1[d]  = xprime_hw3[d] - com[d];
 546         }
 547         /* 15 flops */
 548
 549         T xakszd = dist21[YY] * dist31[ZZ] - dist21[ZZ] * dist31[YY];
 550         T yakszd = dist21[ZZ] * dist31[XX] - dist21[XX] * dist31[ZZ];
 551         T zakszd = dist21[XX] * dist31[YY] - dist21[YY] * dist31[XX];
 552         T xaksxd = a1[YY] * zakszd - a1[ZZ] * yakszd;
 553         T yaksxd = a1[ZZ] * xakszd - a1[XX] * zakszd;
 554         T zaksxd = a1[XX] * yakszd - a1[YY] * xakszd;
 555         T xaksyd = yakszd * zaksxd - zakszd * yaksxd;
 556         T yaksyd = zakszd * xaksxd - xakszd * zaksxd;
 557         T zaksyd = xakszd * yaksxd - yakszd * xaksxd;
 558         /* 27 flops */
 559
 560         T axlng = gmx::invsqrt(xaksxd * xaksxd + yaksxd * yaksxd + zaksxd * zaksxd);
 561         T aylng = gmx::invsqrt(xaksyd * xaksyd + yaksyd * yaksyd + zaksyd * zaksyd);
 562         T azlng = gmx::invsqrt(xakszd * xakszd + yakszd * yakszd + zakszd * zakszd);
 563
 564         T trns1[DIM], trns2[DIM], trns3[DIM];
 565
 566         trns1[XX] = xaksxd * axlng;
 567         trns2[XX] = yaksxd * axlng;
 568         trns3[XX] = zaksxd * axlng;
 569         trns1[YY] = xaksyd * aylng;
 570         trns2[YY] = yaksyd * aylng;
 571         trns3[YY] = zaksyd * aylng;
 572         trns1[ZZ] = xakszd * azlng;
 573         trns2[ZZ] = yakszd * azlng;
 574         trns3[ZZ] = zakszd * azlng;
 575         /* 24 flops */
 576
 577         T b0d[2], c0d[2];
 578
 579         for (int d = 0; d < 2; d++)
 580         {
 581             b0d[d] = trns1[d] * dist21[XX] + trns2[d] * dist21[YY] + trns3[d] * dist21[ZZ];
 582             c0d[d] = trns1[d] * dist31[XX] + trns2[d] * dist31[YY] + trns3[d] * dist31[ZZ];
 583         }
 584
 585         T a1d_z, b1d[DIM], c1d[DIM];
 586
 587         a1d_z = trns1[ZZ] * a1[XX] + trns2[ZZ] * a1[YY] + trns3[ZZ] * a1[ZZ];
 588         for (int d = 0; d < DIM; d++)
 589         {
 590             b1d[d] = trns1[d] * b1[XX] + trns2[d] * b1[YY] + trns3[d] * b1[ZZ];
 591             c1d[d] = trns1[d] * c1[XX] + trns2[d] * c1[YY] + trns3[d] * c1[ZZ];
 592         }
 593         /* 65 flops */
 594
 595         T tmp, tmp2;
 596
 597         T sinphi = a1d_z * gmx::invsqrt(ra*ra);
 598         tmp2     = 1.0 - sinphi * sinphi;
 599
 600         /* If tmp2 gets close to or beyond zero we have severly distorted
 601          * water molecules and we should terminate the simulation.
 602          * Below we take the max with almost_zero to continue the loop.
 603          */
 604         bError   = bError || (tmp2 <= almost_zero);
 605
 606         tmp2     = max(tmp2, almost_zero);
 607         tmp      = gmx::invsqrt(tmp2);
 608         T cosphi = tmp2*tmp;
 609         T sinpsi = (b1d[ZZ] - c1d[ZZ]) * irc2 * tmp;
 610         tmp2     = 1.0 - sinpsi * sinpsi;
 611
 612         T cospsi = tmp2*gmx::invsqrt(tmp2);
 613         /* 46 flops */
 614
 615         T a2d_y  =  ra * cosphi;
 616         T b2d_x  = -rc * cospsi;
 617         T t1     = -rb * cosphi;
 618         T t2     =  rc * sinpsi * sinphi;
 619         T b2d_y  =  t1 - t2;
 620         T c2d_y  =  t1 + t2;
 621         /* 7 flops */
 622
 623         /*     --- Step3  al,be,ga            --- */
 624         T alpha  = b2d_x * (b0d[XX] - c0d[XX]) + b0d[YY] * b2d_y + c0d[YY] * c2d_y;
 625         T beta   = b2d_x * (c0d[YY] - b0d[YY]) + b0d[XX] * b2d_y + c0d[XX] * c2d_y;
 626         T gamma  = b0d[XX] * b1d[YY] - b1d[XX] * b0d[YY] + c0d[XX] * c1d[YY] - c1d[XX] * c0d[YY];
 627         T al2be2 = alpha * alpha + beta * beta;
 628         tmp2     = (al2be2 - gamma * gamma);
 629         T sinthe = (alpha * gamma - beta * tmp2*gmx::invsqrt(tmp2)) * gmx::invsqrt(al2be2*al2be2);
 630         /* 47 flops */
 631
 632         /*  --- Step4  A3' --- */
 633         tmp2     = 1.0 - sinthe * sinthe;
 634         T costhe = tmp2*gmx::invsqrt(tmp2);
 635
 636         T a3d[DIM], b3d[DIM], c3d[DIM];
 637
 638         a3d[XX]  = -a2d_y * sinthe;
 639         a3d[YY]  = a2d_y * costhe;
 640         a3d[ZZ]  = a1d_z;
 641         b3d[XX]  = b2d_x * costhe - b2d_y * sinthe;
 642         b3d[YY]  = b2d_x * sinthe + b2d_y * costhe;
 643         b3d[ZZ]  = b1d[ZZ];
 644         c3d[XX]  = -b2d_x * costhe - c2d_y * sinthe;
 645         c3d[YY]  = -b2d_x * sinthe + c2d_y * costhe;
 646         c3d[ZZ]  = c1d[ZZ];
 647         /* 26 flops */
 648
 649         /*    --- Step5  A3 --- */
 650         T a3[DIM], b3[DIM], c3[DIM];
 651
 652         a3[XX] = trns1[XX]*a3d[XX] + trns1[YY]*a3d[YY] + trns1[ZZ]*a3d[ZZ];
 653         a3[YY] = trns2[XX]*a3d[XX] + trns2[YY]*a3d[YY] + trns2[ZZ]*a3d[ZZ];
 654         a3[ZZ] = trns3[XX]*a3d[XX] + trns3[YY]*a3d[YY] + trns3[ZZ]*a3d[ZZ];
 655         b3[XX] = trns1[XX]*b3d[XX] + trns1[YY]*b3d[YY] + trns1[ZZ]*b3d[ZZ];
 656         b3[YY] = trns2[XX]*b3d[XX] + trns2[YY]*b3d[YY] + trns2[ZZ]*b3d[ZZ];
 657         b3[ZZ] = trns3[XX]*b3d[XX] + trns3[YY]*b3d[YY] + trns3[ZZ]*b3d[ZZ];
 658         c3[XX] = trns1[XX]*c3d[XX] + trns1[YY]*c3d[YY] + trns1[ZZ]*c3d[ZZ];
 659         c3[YY] = trns2[XX]*c3d[XX] + trns2[YY]*c3d[YY] + trns2[ZZ]*c3d[ZZ];
 660         c3[ZZ] = trns3[XX]*c3d[XX] + trns3[YY]*c3d[YY] + trns3[ZZ]*c3d[ZZ];
 661         /* 45 flops */
 662
 663         /* Compute and store the corrected new coordinate */
 664         for (int d = 0; d < DIM; d++)
 665         {
 666             xprime_ow1[d] = com[d] + a3[d];
 667         }
 668         for (int d = 0; d < DIM; d++)
 669         {
 670             xprime_hw2[d] = com[d] + b3[d] + sh_hw2[d];;
 671         }
 672         for (int d = 0; d < DIM; d++)
 673         {
 674             xprime_hw3[d] = com[d] + c3[d] + sh_hw3[d];
 675         }
 676         /* 9 flops + 6 pbc flops */
 677
 678         transposeScatterStoreU<3>(xprime, ow1, xprime_ow1[XX], xprime_ow1[YY], xprime_ow1[ZZ]);
 679         transposeScatterStoreU<3>(xprime, hw2, xprime_hw2[XX], xprime_hw2[YY], xprime_hw2[ZZ]);
 680         transposeScatterStoreU<3>(xprime, hw3, xprime_hw3[XX], xprime_hw3[YY], xprime_hw3[ZZ]);
 681
 682         // cppcheck-suppress duplicateExpression
 683         if (bCorrectVelocity || bCalcVirial)
 684         {
 685             T da[DIM], db[DIM], dc[DIM];
 686             for (int d = 0; d < DIM; d++)
 687             {
 688                 da[d] = a3[d] - a1[d];
 689             }
 690             for (int d = 0; d < DIM; d++)
 691             {
 692                 db[d] = b3[d] - b1[d];
 693             }
 694             for (int d = 0; d < DIM; d++)
 695             {
 696                 dc[d] = c3[d] - c1[d];
 697             }
 698             /* 9 flops */
 699
 700             if (bCorrectVelocity)
 701             {
 702                 T v_ow1[DIM], v_hw2[DIM], v_hw3[DIM];
 703
 704                 gatherLoadUTranspose<3>(v, ow1, &v_ow1[XX], &v_ow1[YY], &v_ow1[ZZ]);
 705                 gatherLoadUTranspose<3>(v, hw2, &v_hw2[XX], &v_hw2[YY], &v_hw2[ZZ]);
 706                 gatherLoadUTranspose<3>(v, hw3, &v_hw3[XX], &v_hw3[YY], &v_hw3[ZZ]);
 707
 708                 /* Add the position correction divided by dt to the velocity */
 709                 for (int d = 0; d < DIM; d++)
 710                 {
 711                     v_ow1[d] = gmx::fma(da[d], invdt, v_ow1[d]);
 712                 }
 713                 for (int d = 0; d < DIM; d++)
 714                 {
 715                     v_hw2[d] = gmx::fma(db[d], invdt, v_hw2[d]);
 716                 }
 717                 for (int d = 0; d < DIM; d++)
 718                 {
 719                     v_hw3[d] = gmx::fma(dc[d], invdt, v_hw3[d]);
 720                 }
 721                 /* 3*6 flops */
 722
 723                 transposeScatterStoreU<3>(v, ow1, v_ow1[XX], v_ow1[YY], v_ow1[ZZ]);
 724                 transposeScatterStoreU<3>(v, hw2, v_hw2[XX], v_hw2[YY], v_hw2[ZZ]);
 725                 transposeScatterStoreU<3>(v, hw3, v_hw3[XX], v_hw3[YY], v_hw3[ZZ]);
 726             }
 727
 728             if (bCalcVirial)
 729             {
 730                 /* Filter out the non-local settles */
 731                 T filter = load<T>(settled->virfac + i);
 732                 T mOf    = filter*mO;
 733                 T mHf    = filter*mH;
 734
 735                 T mdo[DIM], mdb[DIM], mdc[DIM];
 736
 737                 for (int d = 0; d < DIM; d++)
 738                 {
 739                     mdb[d] = mHf*db[d];
 740                     mdc[d] = mHf*dc[d];
 741                     mdo[d] = mOf*da[d] + mdb[d] + mdc[d];
 742                 }
 743
 744                 for (int d2 = 0; d2 < DIM; d2++)
 745                 {
 746                     for (int d = 0; d < DIM; d++)
 747                     {
 748                         sum_r_m_dr[d2][d] = sum_r_m_dr[d2][d] -
 749                             (x_ow1[d2]*mdo[d] +
 750                              dist21[d2]*mdb[d] +
 751                              dist31[d2]*mdc[d]);
 752                     }
 753                 }
 754                 /* 71 flops */
 755             }
 756         }
 757     }
 758
 759     if (bCalcVirial)
 760     {
 761         for (int d2 = 0; d2 < DIM; d2++)
 762         {
 763             for (int d = 0; d < DIM; d++)
 764             {
 765                 vir_r_m_dr[d2][d] += reduce(sum_r_m_dr[d2][d]);
 766             }
 767         }
 768     }
 769
 770     *bErrorHasOccurred = anyTrue(bError);
 771 }
 772
 773 /*! \brief Wrapper template function that divides the settles over threads
 774  * and instantiates the core template with instantiated booleans.
 775  */
 776 template<typename T, typename TypeBool, int packSize, typename TypePbc>
 777 static void settleTemplateWrapper(settledata *settled,
 778                                   int nthread, int thread,
 779                                   TypePbc pbc,
 780                                   const real x[], real xprime[],
 781                                   real invdt, real *v,
 782                                   bool bCalcVirial, tensor vir_r_m_dr,
 783                                   bool *bErrorHasOccurred)
 784 {
 785     /* We need to assign settles to threads in groups of pack_size */
 786     int numSettlePacks = (settled->nsettle + packSize - 1)/packSize;
 787     /* Round the end value up to give thread 0 more work */
 788     int settleStart    = ((numSettlePacks* thread      + nthread - 1)/nthread)*packSize;
 789     int settleEnd      = ((numSettlePacks*(thread + 1) + nthread - 1)/nthread)*packSize;
 790
 791     if (v != nullptr)
 792     {
 793         if (!bCalcVirial)
 794         {
 795             settleTemplate<T, TypeBool, packSize,
 796                            TypePbc,
 797                            true,
 798                            false>
 799                 (settled, settleStart, settleEnd,
 800                 pbc,
 801                 x, xprime,
 802                 invdt, v,
 803                 nullptr,
 804                 bErrorHasOccurred);
 805         }
 806         else
 807         {
 808             settleTemplate<T, TypeBool, packSize,
 809                            TypePbc,
 810                            true,
 811                            true>
 812                 (settled, settleStart, settleEnd,
 813                 pbc,
 814                 x, xprime,
 815                 invdt, v,
 816                 vir_r_m_dr,
 817                 bErrorHasOccurred);
 818         }
 819     }
 820     else
 821     {
 822         if (!bCalcVirial)
 823         {
 824             settleTemplate<T, TypeBool, packSize,
 825                            TypePbc,
 826                            false,
 827                            false>
 828                 (settled, settleStart, settleEnd,
 829                 pbc,
 830                 x, xprime,
 831                 invdt, v,
 832                 nullptr,
 833                 bErrorHasOccurred);
 834         }
 835         else
 836         {
 837             settleTemplate<T, TypeBool, packSize,
 838                            TypePbc,
 839                            false,
 840                            true>
 841                 (settled, settleStart, settleEnd,
 842                 pbc,
 843                 x, xprime,
 844                 invdt, v,
 845                 vir_r_m_dr,
 846                 bErrorHasOccurred);
 847         }
 848     }
 849 }
 850
 851 void csettle(settledata *settled,
 852              int nthread, int thread,
 853              const t_pbc *pbc,
 854              const real x[], real xprime[],
 855              real invdt, real *v,
 856              bool bCalcVirial, tensor vir_r_m_dr,
 857              bool *bErrorHasOccurred)
 858 {
 859 #if GMX_SIMD_HAVE_REAL
 860     if (settled->bUseSimd)
 861     {
 862         /* Convert the pbc struct for SIMD */
 863         alignas(GMX_SIMD_ALIGNMENT) real    pbcSimd[9*GMX_SIMD_REAL_WIDTH];
 864         set_pbc_simd(pbc, pbcSimd);
 865
 866         settleTemplateWrapper<SimdReal, SimdBool, GMX_SIMD_REAL_WIDTH,
 867                               const real *>(settled,
 868                                             nthread, thread,
 869                                             pbcSimd,
 870                                             x, xprime,
 871                                             invdt,
 872                                             v,
 873                                             bCalcVirial, vir_r_m_dr,
 874                                             bErrorHasOccurred);
 875     }
 876     else
 877 #endif
 878     {
 879         /* This construct is needed because pbc_dx_aiuc doesn't accept pbc=NULL */
 880         t_pbc        pbcNo;
 881         const t_pbc *pbcNonNull;
 882
 883         if (pbc != nullptr)
 884         {
 885             pbcNonNull = pbc;
 886         }
 887         else
 888         {
 889             set_pbc(&pbcNo, epbcNONE, nullptr);
 890             pbcNonNull = &pbcNo;
 891         }
 892
 893         settleTemplateWrapper<real, bool, 1,
 894                               const t_pbc *>(settled,
 895                                              nthread, thread,
 896                                              pbcNonNull,
 897                                              x, xprime,
 898                                              invdt,
 899                                              v,
 900                                              bCalcVirial, vir_r_m_dr,
 901                                              bErrorHasOccurred);
 902     }
 903 }
 904
 905 } // namespace