Change gmx_moltype_t.ilist to InteractionLists
[alexxy/gromacs.git] / src / gromacs / mdlib / settle.cpp
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
5  * Copyright (c) 2001-2004, The GROMACS development team.
6  * Copyright (c) 2013,2014,2015,2016,2017,2018, by the GROMACS development team, led by
7  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
8  * and including many others, as listed in the AUTHORS file in the
9  * top-level source directory and at http://www.gromacs.org.
10  *
11  * GROMACS is free software; you can redistribute it and/or
12  * modify it under the terms of the GNU Lesser General Public License
13  * as published by the Free Software Foundation; either version 2.1
14  * of the License, or (at your option) any later version.
15  *
16  * GROMACS is distributed in the hope that it will be useful,
17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
19  * Lesser General Public License for more details.
20  *
21  * You should have received a copy of the GNU Lesser General Public
22  * License along with GROMACS; if not, see
23  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
24  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
25  *
26  * If you want to redistribute modifications to GROMACS, please
27  * consider that scientific software is very special. Version
28  * control is crucial - bugs must be traceable. We will be happy to
29  * consider code for inclusion in the official distribution, but
30  * derived work must not be called official GROMACS. Details are found
31  * in the README & COPYING files - if they are missing, get the
32  * official version at http://www.gromacs.org.
33  *
34  * To help us fund GROMACS development, we humbly ask that you cite
35  * the research papers on the package. Check out http://www.gromacs.org.
36  */
37 /*! \internal \file
38  * \brief Defines SETTLE code.
39  *
40  * \author Berk Hess <hess@kth.se>
41  * \author Mark Abraham <mark.j.abraham@gmail.com>
42  * \ingroup module_mdlib
43  */
44 #include "gmxpre.h"
45
46 #include "settle.h"
47
48 #include <cassert>
49 #include <cmath>
50 #include <cstdio>
51
52 #include <algorithm>
53
54 #include "gromacs/math/functions.h"
55 #include "gromacs/math/invertmatrix.h"
56 #include "gromacs/math/vec.h"
57 #include "gromacs/mdlib/constr.h"
58 #include "gromacs/mdtypes/mdatom.h"
59 #include "gromacs/pbcutil/ishift.h"
60 #include "gromacs/pbcutil/pbc.h"
61 #include "gromacs/pbcutil/pbc-simd.h"
62 #include "gromacs/simd/simd.h"
63 #include "gromacs/simd/simd_math.h"
64 #include "gromacs/topology/idef.h"
65 #include "gromacs/topology/ifunc.h"
66 #include "gromacs/topology/mtop_util.h"
67 #include "gromacs/utility/fatalerror.h"
68 #include "gromacs/utility/gmxassert.h"
69 #include "gromacs/utility/smalloc.h"
70
71 namespace gmx
72 {
73
74 struct settleparam_t
75 {
76     real   mO;
77     real   mH;
78     real   wh;
79     real   dOH;
80     real   dHH;
81     real   ra;
82     real   rb;
83     real   rc;
84     real   irc2;
85     /* For projection */
86     real   imO;
87     real   imH;
88     real   invdOH;
89     real   invdHH;
90     matrix invmat;
91 };
92
93 struct settledata
94 {
95     settleparam_t massw;    /* Parameters for SETTLE for coordinates */
96     settleparam_t mass1;    /* Parameters with all masses 1, for forces */
97
98     int           nsettle;  /* The number of settles on our rank */
99     int          *ow1;      /* Index to OW1 atoms, size nsettle + SIMD padding */
100     int          *hw2;      /* Index to HW2 atoms, size nsettle + SIMD padding */
101     int          *hw3;      /* Index to HW3 atoms, size nsettle + SIMD padding */
102     real         *virfac;   /* Virial factor 0 or 1, size nsettle + SIMD pad. */
103     int           nalloc;   /* Allocation size of ow1, hw2, hw3, virfac */
104
105     bool          bUseSimd; /* Use SIMD intrinsics code, if possible */
106 };
107
108
109 //! Initializes a projection matrix.
110 static void init_proj_matrix(real invmO, real invmH, real dOH, real dHH,
111                              matrix inverseCouplingMatrix)
112 {
113     /* We normalize the inverse masses with invmO for the matrix inversion.
114      * so we can keep using masses of almost zero for frozen particles,
115      * without running out of the float range in invertMatrix.
116      */
117     double invmORelative = 1.0;
118     double invmHRelative = invmH/static_cast<double>(invmO);
119     double distanceRatio = dHH/static_cast<double>(dOH);
120
121     /* Construct the constraint coupling matrix */
122     matrix mat;
123     mat[0][0] = invmORelative + invmHRelative;
124     mat[0][1] = invmORelative*(1.0 - 0.5*gmx::square(distanceRatio));
125     mat[0][2] = invmHRelative*0.5*distanceRatio;
126     mat[1][1] = mat[0][0];
127     mat[1][2] = mat[0][2];
128     mat[2][2] = invmHRelative + invmHRelative;
129     mat[1][0] = mat[0][1];
130     mat[2][0] = mat[0][2];
131     mat[2][1] = mat[1][2];
132
133     invertMatrix(mat, inverseCouplingMatrix);
134
135     msmul(inverseCouplingMatrix, 1/invmO, inverseCouplingMatrix);
136 }
137
138 //! Initializes settle parameters.
139 static void settleparam_init(settleparam_t *p,
140                              real mO, real mH, real invmO, real invmH,
141                              real dOH, real dHH)
142 {
143     /* We calculate parameters in double precision to minimize errors.
144      * The velocity correction applied during SETTLE coordinate constraining
145      * introduces a systematic error of approximately 1 bit per atom,
146      * depending on what the compiler does with the code.
147      */
148     double wohh;
149
150     p->mO     = mO;
151     p->mH     = mH;
152     wohh      = mO + 2.0*mH;
153     p->wh     = mH/wohh;
154     p->dOH    = dOH;
155     p->dHH    = dHH;
156     double rc = dHH/2.0;
157     double ra = 2.0*mH*std::sqrt(dOH*dOH - rc*rc)/wohh;
158     p->rb     = std::sqrt(dOH*dOH - rc*rc) - ra;
159     p->rc     = rc;
160     p->ra     = ra;
161     p->irc2   = 1.0/dHH;
162
163     /* For projection: inverse masses and coupling matrix inversion */
164     p->imO    = invmO;
165     p->imH    = invmH;
166
167     p->invdOH = 1.0/dOH;
168     p->invdHH = 1.0/dHH;
169
170     init_proj_matrix(invmO, invmH, dOH, dHH, p->invmat);
171
172     if (debug)
173     {
174         fprintf(debug, "wh =%g, rc = %g, ra = %g\n",
175                 p->wh, p->rc, p->ra);
176         fprintf(debug, "rb = %g, irc2 = %g, dHH = %g, dOH = %g\n",
177                 p->rb, p->irc2, p->dHH, p->dOH);
178     }
179 }
180
181 settledata *settle_init(const gmx_mtop_t &mtop)
182 {
183     /* Check that we have only one settle type */
184     int                    settle_type = -1;
185     gmx_mtop_ilistloop_t   iloop       = gmx_mtop_ilistloop_init(mtop);
186     int                    nmol;
187     const int              nral1       = 1 + NRAL(F_SETTLE);
188     while (const InteractionLists *ilists = gmx_mtop_ilistloop_next(iloop, &nmol))
189     {
190         const InteractionList &ilist = (*ilists)[F_SETTLE];
191         for (int i = 0; i < ilist.size(); i += nral1)
192         {
193             if (settle_type == -1)
194             {
195                 settle_type = ilist.iatoms[i];
196             }
197             else if (ilist.iatoms[i] != settle_type)
198             {
199                 gmx_fatal(FARGS,
200                           "The [molecules] section of your topology specifies more than one block of\n"
201                           "a [moleculetype] with a [settles] block. Only one such is allowed.\n"
202                           "If you are trying to partition your solvent into different *groups*\n"
203                           "(e.g. for freezing, T-coupling, etc.), you are using the wrong approach. Index\n"
204                           "files specify groups. Otherwise, you may wish to change the least-used\n"
205                           "block of molecules with SETTLE constraints into 3 normal constraints.");
206             }
207         }
208     }
209     GMX_RELEASE_ASSERT(settle_type >= 0, "settle_init called without settles");
210
211     settledata *settled;
212
213     snew(settled, 1);
214
215     /* We will not initialize the normal SETTLE parameters here yet,
216      * since the atom (inv)masses can depend on the integrator and
217      * free-energy perturbation. We set mO=-1 to trigger later initialization.
218      */
219     settled->massw.mO = -1;
220
221     real dOH = mtop.ffparams.iparams[settle_type].settle.doh;
222     real dHH = mtop.ffparams.iparams[settle_type].settle.dhh;
223     settleparam_init(&settled->mass1, 1.0, 1.0, 1.0, 1.0, dOH, dHH);
224
225     settled->ow1    = nullptr;
226     settled->hw2    = nullptr;
227     settled->hw3    = nullptr;
228     settled->virfac = nullptr;
229     settled->nalloc = 0;
230
231     /* Without SIMD configured, this bool is not used */
232     settled->bUseSimd = (getenv("GMX_DISABLE_SIMD_KERNELS") == nullptr);
233
234     return settled;
235 }
236
237 void settle_free(settledata *settled)
238 {
239     sfree_aligned(settled->ow1);
240     sfree_aligned(settled->hw2);
241     sfree_aligned(settled->hw3);
242     sfree_aligned(settled->virfac);
243     sfree(settled);
244 }
245
246 void settle_set_constraints(settledata       *settled,
247                             const t_ilist    *il_settle,
248                             const t_mdatoms  &mdatoms)
249 {
250 #if GMX_SIMD_HAVE_REAL
251     const int pack_size = GMX_SIMD_REAL_WIDTH;
252 #else
253     const int pack_size = 1;
254 #endif
255
256     const int nral1     = 1 + NRAL(F_SETTLE);
257     int       nsettle   = il_settle->nr/nral1;
258     settled->nsettle = nsettle;
259
260     if (nsettle > 0)
261     {
262         const t_iatom *iatoms = il_settle->iatoms;
263
264         /* Here we initialize the normal SETTLE parameters */
265         if (settled->massw.mO < 0)
266         {
267             int firstO = iatoms[1];
268             int firstH = iatoms[2];
269             settleparam_init(&settled->massw,
270                              mdatoms.massT[firstO],
271                              mdatoms.massT[firstH],
272                              mdatoms.invmass[firstO],
273                              mdatoms.invmass[firstH],
274                              settled->mass1.dOH,
275                              settled->mass1.dHH);
276         }
277
278         if (nsettle + pack_size > settled->nalloc)
279         {
280             settled->nalloc = over_alloc_dd(nsettle + pack_size);
281             sfree_aligned(settled->ow1);
282             sfree_aligned(settled->hw2);
283             sfree_aligned(settled->hw3);
284             sfree_aligned(settled->virfac);
285             snew_aligned(settled->ow1, settled->nalloc, 64);
286             snew_aligned(settled->hw2, settled->nalloc, 64);
287             snew_aligned(settled->hw3, settled->nalloc, 64);
288             snew_aligned(settled->virfac, settled->nalloc, 64);
289         }
290
291         for (int i = 0; i < nsettle; i++)
292         {
293             settled->ow1[i]    = iatoms[i*nral1 + 1];
294             settled->hw2[i]    = iatoms[i*nral1 + 2];
295             settled->hw3[i]    = iatoms[i*nral1 + 3];
296             /* We should avoid double counting of virial contributions for
297              * SETTLEs that appear in multiple DD domains, so we only count
298              * the contribution on the home range of the oxygen atom.
299              */
300             settled->virfac[i] = (iatoms[i*nral1 + 1] < mdatoms.homenr ? 1 : 0);
301         }
302
303         /* Pack the index array to the full SIMD width with copies from
304          * the last normal entry, but with no virial contribution.
305          */
306         int end_packed = ((nsettle + pack_size - 1)/pack_size)*pack_size;
307         for (int i = nsettle; i < end_packed; i++)
308         {
309             settled->ow1[i]    = settled->ow1[nsettle - 1];
310             settled->hw2[i]    = settled->hw2[nsettle - 1];
311             settled->hw3[i]    = settled->hw3[nsettle - 1];
312             settled->virfac[i] = 0;
313         }
314     }
315 }
316
317 void settle_proj(settledata *settled, ConstraintVariable econq,
318                  int nsettle, const t_iatom iatoms[],
319                  const t_pbc *pbc,
320                  const rvec x[],
321                  rvec *der, rvec *derp,
322                  int calcvir_atom_end, tensor vir_r_m_dder)
323 {
324     /* Settle for projection out constraint components
325      * of derivatives of the coordinates.
326      * Berk Hess 2008-1-10
327      */
328
329     settleparam_t *p;
330     real           imO, imH, dOH, dHH, invdOH, invdHH;
331     matrix         invmat;
332     int            i, m, m2, ow1, hw2, hw3;
333     rvec           roh2, roh3, rhh, dc, fc;
334
335     calcvir_atom_end *= DIM;
336
337     if (econq == ConstraintVariable::Force)
338     {
339         p = &settled->mass1;
340     }
341     else
342     {
343         p = &settled->massw;
344     }
345     imO    = p->imO;
346     imH    = p->imH;
347     copy_mat(p->invmat, invmat);
348     dOH    = p->dOH;
349     dHH    = p->dHH;
350     invdOH = p->invdOH;
351     invdHH = p->invdHH;
352
353     const int nral1 = 1 + NRAL(F_SETTLE);
354
355     for (i = 0; i < nsettle; i++)
356     {
357         ow1 = iatoms[i*nral1 + 1];
358         hw2 = iatoms[i*nral1 + 2];
359         hw3 = iatoms[i*nral1 + 3];
360
361         if (pbc == nullptr)
362         {
363             rvec_sub(x[ow1], x[hw2], roh2);
364             rvec_sub(x[ow1], x[hw3], roh3);
365             rvec_sub(x[hw2], x[hw3], rhh);
366         }
367         else
368         {
369             pbc_dx_aiuc(pbc, x[ow1], x[hw2], roh2);
370             pbc_dx_aiuc(pbc, x[ow1], x[hw3], roh3);
371             pbc_dx_aiuc(pbc, x[hw2], x[hw3], rhh);
372         }
373         svmul(invdOH, roh2, roh2);
374         svmul(invdOH, roh3, roh3);
375         svmul(invdHH, rhh, rhh);
376         /* 18 flops */
377
378         /* Determine the projections of der on the bonds */
379         clear_rvec(dc);
380         for (m = 0; m < DIM; m++)
381         {
382             dc[0] += (der[ow1][m] - der[hw2][m])*roh2[m];
383             dc[1] += (der[ow1][m] - der[hw3][m])*roh3[m];
384             dc[2] += (der[hw2][m] - der[hw3][m])*rhh [m];
385         }
386         /* 27 flops */
387
388         /* Determine the correction for the three bonds */
389         mvmul(invmat, dc, fc);
390         /* 15 flops */
391
392         /* Subtract the corrections from derp */
393         for (m = 0; m < DIM; m++)
394         {
395             derp[ow1][m] -= imO*( fc[0]*roh2[m] + fc[1]*roh3[m]);
396             derp[hw2][m] -= imH*(-fc[0]*roh2[m] + fc[2]*rhh [m]);
397             derp[hw3][m] -= imH*(-fc[1]*roh3[m] - fc[2]*rhh [m]);
398         }
399
400         /* 45 flops */
401
402         if (ow1 < calcvir_atom_end)
403         {
404             /* Determining r \dot m der is easy,
405              * since fc contains the mass weighted corrections for der.
406              */
407
408             for (m = 0; m < DIM; m++)
409             {
410                 for (m2 = 0; m2 < DIM; m2++)
411                 {
412                     vir_r_m_dder[m][m2] +=
413                         dOH*roh2[m]*roh2[m2]*fc[0] +
414                         dOH*roh3[m]*roh3[m2]*fc[1] +
415                         dHH*rhh [m]*rhh [m2]*fc[2];
416                 }
417             }
418         }
419     }
420 }
421
422
423 /*! \brief The actual settle code, templated for real/SimdReal and for optimization */
424 template<typename T, typename TypeBool, int packSize,
425          typename TypePbc,
426          bool bCorrectVelocity,
427          bool bCalcVirial>
428 static void settleTemplate(const settledata *settled,
429                            int settleStart, int settleEnd,
430                            const TypePbc pbc,
431                            const real *x, real *xprime,
432                            real invdt, real * gmx_restrict v,
433                            tensor vir_r_m_dr,
434                            bool *bErrorHasOccurred)
435 {
436     /* ******************************************************************* */
437     /*                                                                  ** */
438     /*    Original code by Shuichi Miyamoto, last update Oct. 1, 1992   ** */
439     /*                                                                  ** */
440     /*    Algorithm changes by Berk Hess:                               ** */
441     /*    2004-07-15 Convert COM to double precision to avoid drift     ** */
442     /*    2006-10-16 Changed velocity update to use differences         ** */
443     /*    2012-09-24 Use oxygen as reference instead of COM             ** */
444     /*    2016-02    Complete rewrite of the code for SIMD              ** */
445     /*                                                                  ** */
446     /*    Reference for the SETTLE algorithm                            ** */
447     /*           S. Miyamoto et al., J. Comp. Chem., 13, 952 (1992).    ** */
448     /*                                                                  ** */
449     /* ******************************************************************* */
450
451     assert(settleStart % packSize == 0);
452     assert(settleEnd   % packSize == 0);
453
454     TypeBool             bError = TypeBool(false);
455
456     const settleparam_t *p    = &settled->massw;
457     T                    wh   = T(p->wh);
458     T                    rc   = T(p->rc);
459     T                    ra   = T(p->ra);
460     T                    rb   = T(p->rb);
461     T                    irc2 = T(p->irc2);
462     T                    mO   = T(p->mO);
463     T                    mH   = T(p->mH);
464
465     T                    almost_zero = T(1e-12);
466
467     T                    sum_r_m_dr[DIM][DIM];
468
469     if (bCalcVirial)
470     {
471         for (int d2 = 0; d2 < DIM; d2++)
472         {
473             for (int d = 0; d < DIM; d++)
474             {
475                 sum_r_m_dr[d2][d] = T(0);
476             }
477         }
478     }
479
480     for (int i = settleStart; i < settleEnd; i += packSize)
481     {
482         /* Here we pad up to packSize with copies from the last valid entry.
483          * This gives correct results, since we store (not increment) all
484          * output, so we store the same output multiple times.
485          */
486         const int *ow1 = settled->ow1 + i;
487         const int *hw2 = settled->hw2 + i;
488         const int *hw3 = settled->hw3 + i;
489
490         T          x_ow1[DIM], x_hw2[DIM], x_hw3[DIM];
491
492         gatherLoadUTranspose<3>(x, ow1, &x_ow1[XX], &x_ow1[YY], &x_ow1[ZZ]);
493         gatherLoadUTranspose<3>(x, hw2, &x_hw2[XX], &x_hw2[YY], &x_hw2[ZZ]);
494         gatherLoadUTranspose<3>(x, hw3, &x_hw3[XX], &x_hw3[YY], &x_hw3[ZZ]);
495
496         T xprime_ow1[DIM], xprime_hw2[DIM], xprime_hw3[DIM];
497
498         gatherLoadUTranspose<3>(xprime, ow1, &xprime_ow1[XX], &xprime_ow1[YY], &xprime_ow1[ZZ]);
499         gatherLoadUTranspose<3>(xprime, hw2, &xprime_hw2[XX], &xprime_hw2[YY], &xprime_hw2[ZZ]);
500         gatherLoadUTranspose<3>(xprime, hw3, &xprime_hw3[XX], &xprime_hw3[YY], &xprime_hw3[ZZ]);
501
502         T dist21[DIM], dist31[DIM];
503         T doh2[DIM], doh3[DIM];
504         T sh_hw2[DIM], sh_hw3[DIM];
505
506         pbc_dx_aiuc(pbc, x_hw2, x_ow1, dist21);
507
508         pbc_dx_aiuc(pbc, x_hw3, x_ow1, dist31);
509
510         /* Tedious way of doing pbc */
511         pbc_dx_aiuc(pbc, xprime_hw2, xprime_ow1, doh2);
512         for (int d = 0; d < DIM; d++)
513         {
514             sh_hw2[d]     = xprime_hw2[d] - (xprime_ow1[d] + doh2[d]);
515             xprime_hw2[d] = xprime_hw2[d] - sh_hw2[d];
516         }
517         pbc_dx_aiuc(pbc, xprime_hw3, xprime_ow1, doh3);
518         for (int d = 0; d < DIM; d++)
519         {
520             sh_hw3[d]     = xprime_hw3[d] - (xprime_ow1[d] + doh3[d]);
521             xprime_hw3[d] = xprime_hw3[d] - sh_hw3[d];
522         }
523
524         /* Not calculating the center of mass using the oxygen position
525          * and the O-H distances, as done below, will make SETTLE
526          * the largest source of energy drift for simulations of water,
527          * as then the oxygen coordinate is multiplied by 0.89 at every step,
528          * which can then transfer a systematic rounding to the oxygen velocity.
529          */
530         T a1[DIM], com[DIM];
531         for (int d = 0; d < DIM; d++)
532         {
533             a1[d]  = -(doh2[d] + doh3[d]) * wh;
534             com[d] = xprime_ow1[d] - a1[d];
535         }
536         T b1[DIM];
537         for (int d = 0; d < DIM; d++)
538         {
539             b1[d]  = xprime_hw2[d] - com[d];
540         }
541         T c1[DIM];
542         for (int d = 0; d < DIM; d++)
543         {
544             c1[d]  = xprime_hw3[d] - com[d];
545         }
546         /* 15 flops */
547
548         T xakszd = dist21[YY] * dist31[ZZ] - dist21[ZZ] * dist31[YY];
549         T yakszd = dist21[ZZ] * dist31[XX] - dist21[XX] * dist31[ZZ];
550         T zakszd = dist21[XX] * dist31[YY] - dist21[YY] * dist31[XX];
551         T xaksxd = a1[YY] * zakszd - a1[ZZ] * yakszd;
552         T yaksxd = a1[ZZ] * xakszd - a1[XX] * zakszd;
553         T zaksxd = a1[XX] * yakszd - a1[YY] * xakszd;
554         T xaksyd = yakszd * zaksxd - zakszd * yaksxd;
555         T yaksyd = zakszd * xaksxd - xakszd * zaksxd;
556         T zaksyd = xakszd * yaksxd - yakszd * xaksxd;
557         /* 27 flops */
558
559         T axlng = gmx::invsqrt(xaksxd * xaksxd + yaksxd * yaksxd + zaksxd * zaksxd);
560         T aylng = gmx::invsqrt(xaksyd * xaksyd + yaksyd * yaksyd + zaksyd * zaksyd);
561         T azlng = gmx::invsqrt(xakszd * xakszd + yakszd * yakszd + zakszd * zakszd);
562
563         T trns1[DIM], trns2[DIM], trns3[DIM];
564
565         trns1[XX] = xaksxd * axlng;
566         trns2[XX] = yaksxd * axlng;
567         trns3[XX] = zaksxd * axlng;
568         trns1[YY] = xaksyd * aylng;
569         trns2[YY] = yaksyd * aylng;
570         trns3[YY] = zaksyd * aylng;
571         trns1[ZZ] = xakszd * azlng;
572         trns2[ZZ] = yakszd * azlng;
573         trns3[ZZ] = zakszd * azlng;
574         /* 24 flops */
575
576         T b0d[2], c0d[2];
577
578         for (int d = 0; d < 2; d++)
579         {
580             b0d[d] = trns1[d] * dist21[XX] + trns2[d] * dist21[YY] + trns3[d] * dist21[ZZ];
581             c0d[d] = trns1[d] * dist31[XX] + trns2[d] * dist31[YY] + trns3[d] * dist31[ZZ];
582         }
583
584         T a1d_z, b1d[DIM], c1d[DIM];
585
586         a1d_z = trns1[ZZ] * a1[XX] + trns2[ZZ] * a1[YY] + trns3[ZZ] * a1[ZZ];
587         for (int d = 0; d < DIM; d++)
588         {
589             b1d[d] = trns1[d] * b1[XX] + trns2[d] * b1[YY] + trns3[d] * b1[ZZ];
590             c1d[d] = trns1[d] * c1[XX] + trns2[d] * c1[YY] + trns3[d] * c1[ZZ];
591         }
592         /* 65 flops */
593
594         T tmp, tmp2;
595
596         T sinphi = a1d_z * gmx::invsqrt(ra*ra);
597         tmp2     = 1.0 - sinphi * sinphi;
598
599         /* If tmp2 gets close to or beyond zero we have severly distorted
600          * water molecules and we should terminate the simulation.
601          * Below we take the max with almost_zero to continue the loop.
602          */
603         bError   = bError || (tmp2 <= almost_zero);
604
605         tmp2     = max(tmp2, almost_zero);
606         tmp      = gmx::invsqrt(tmp2);
607         T cosphi = tmp2*tmp;
608         T sinpsi = (b1d[ZZ] - c1d[ZZ]) * irc2 * tmp;
609         tmp2     = 1.0 - sinpsi * sinpsi;
610
611         T cospsi = tmp2*gmx::invsqrt(tmp2);
612         /* 46 flops */
613
614         T a2d_y  =  ra * cosphi;
615         T b2d_x  = -rc * cospsi;
616         T t1     = -rb * cosphi;
617         T t2     =  rc * sinpsi * sinphi;
618         T b2d_y  =  t1 - t2;
619         T c2d_y  =  t1 + t2;
620         /* 7 flops */
621
622         /*     --- Step3  al,be,ga            --- */
623         T alpha  = b2d_x * (b0d[XX] - c0d[XX]) + b0d[YY] * b2d_y + c0d[YY] * c2d_y;
624         T beta   = b2d_x * (c0d[YY] - b0d[YY]) + b0d[XX] * b2d_y + c0d[XX] * c2d_y;
625         T gamma  = b0d[XX] * b1d[YY] - b1d[XX] * b0d[YY] + c0d[XX] * c1d[YY] - c1d[XX] * c0d[YY];
626         T al2be2 = alpha * alpha + beta * beta;
627         tmp2     = (al2be2 - gamma * gamma);
628         T sinthe = (alpha * gamma - beta * tmp2*gmx::invsqrt(tmp2)) * gmx::invsqrt(al2be2*al2be2);
629         /* 47 flops */
630
631         /*  --- Step4  A3' --- */
632         tmp2     = 1.0 - sinthe * sinthe;
633         T costhe = tmp2*gmx::invsqrt(tmp2);
634
635         T a3d[DIM], b3d[DIM], c3d[DIM];
636
637         a3d[XX]  = -a2d_y * sinthe;
638         a3d[YY]  = a2d_y * costhe;
639         a3d[ZZ]  = a1d_z;
640         b3d[XX]  = b2d_x * costhe - b2d_y * sinthe;
641         b3d[YY]  = b2d_x * sinthe + b2d_y * costhe;
642         b3d[ZZ]  = b1d[ZZ];
643         c3d[XX]  = -b2d_x * costhe - c2d_y * sinthe;
644         c3d[YY]  = -b2d_x * sinthe + c2d_y * costhe;
645         c3d[ZZ]  = c1d[ZZ];
646         /* 26 flops */
647
648         /*    --- Step5  A3 --- */
649         T a3[DIM], b3[DIM], c3[DIM];
650
651         a3[XX] = trns1[XX]*a3d[XX] + trns1[YY]*a3d[YY] + trns1[ZZ]*a3d[ZZ];
652         a3[YY] = trns2[XX]*a3d[XX] + trns2[YY]*a3d[YY] + trns2[ZZ]*a3d[ZZ];
653         a3[ZZ] = trns3[XX]*a3d[XX] + trns3[YY]*a3d[YY] + trns3[ZZ]*a3d[ZZ];
654         b3[XX] = trns1[XX]*b3d[XX] + trns1[YY]*b3d[YY] + trns1[ZZ]*b3d[ZZ];
655         b3[YY] = trns2[XX]*b3d[XX] + trns2[YY]*b3d[YY] + trns2[ZZ]*b3d[ZZ];
656         b3[ZZ] = trns3[XX]*b3d[XX] + trns3[YY]*b3d[YY] + trns3[ZZ]*b3d[ZZ];
657         c3[XX] = trns1[XX]*c3d[XX] + trns1[YY]*c3d[YY] + trns1[ZZ]*c3d[ZZ];
658         c3[YY] = trns2[XX]*c3d[XX] + trns2[YY]*c3d[YY] + trns2[ZZ]*c3d[ZZ];
659         c3[ZZ] = trns3[XX]*c3d[XX] + trns3[YY]*c3d[YY] + trns3[ZZ]*c3d[ZZ];
660         /* 45 flops */
661
662         /* Compute and store the corrected new coordinate */
663         for (int d = 0; d < DIM; d++)
664         {
665             xprime_ow1[d] = com[d] + a3[d];
666         }
667         for (int d = 0; d < DIM; d++)
668         {
669             xprime_hw2[d] = com[d] + b3[d] + sh_hw2[d];;
670         }
671         for (int d = 0; d < DIM; d++)
672         {
673             xprime_hw3[d] = com[d] + c3[d] + sh_hw3[d];
674         }
675         /* 9 flops + 6 pbc flops */
676
677         transposeScatterStoreU<3>(xprime, ow1, xprime_ow1[XX], xprime_ow1[YY], xprime_ow1[ZZ]);
678         transposeScatterStoreU<3>(xprime, hw2, xprime_hw2[XX], xprime_hw2[YY], xprime_hw2[ZZ]);
679         transposeScatterStoreU<3>(xprime, hw3, xprime_hw3[XX], xprime_hw3[YY], xprime_hw3[ZZ]);
680
681         if (bCorrectVelocity || bCalcVirial)
682         {
683             T da[DIM], db[DIM], dc[DIM];
684             for (int d = 0; d < DIM; d++)
685             {
686                 da[d] = a3[d] - a1[d];
687             }
688             for (int d = 0; d < DIM; d++)
689             {
690                 db[d] = b3[d] - b1[d];
691             }
692             for (int d = 0; d < DIM; d++)
693             {
694                 dc[d] = c3[d] - c1[d];
695             }
696             /* 9 flops */
697
698             if (bCorrectVelocity)
699             {
700                 T v_ow1[DIM], v_hw2[DIM], v_hw3[DIM];
701
702                 gatherLoadUTranspose<3>(v, ow1, &v_ow1[XX], &v_ow1[YY], &v_ow1[ZZ]);
703                 gatherLoadUTranspose<3>(v, hw2, &v_hw2[XX], &v_hw2[YY], &v_hw2[ZZ]);
704                 gatherLoadUTranspose<3>(v, hw3, &v_hw3[XX], &v_hw3[YY], &v_hw3[ZZ]);
705
706                 /* Add the position correction divided by dt to the velocity */
707                 for (int d = 0; d < DIM; d++)
708                 {
709                     v_ow1[d] = gmx::fma(da[d], invdt, v_ow1[d]);
710                 }
711                 for (int d = 0; d < DIM; d++)
712                 {
713                     v_hw2[d] = gmx::fma(db[d], invdt, v_hw2[d]);
714                 }
715                 for (int d = 0; d < DIM; d++)
716                 {
717                     v_hw3[d] = gmx::fma(dc[d], invdt, v_hw3[d]);
718                 }
719                 /* 3*6 flops */
720
721                 transposeScatterStoreU<3>(v, ow1, v_ow1[XX], v_ow1[YY], v_ow1[ZZ]);
722                 transposeScatterStoreU<3>(v, hw2, v_hw2[XX], v_hw2[YY], v_hw2[ZZ]);
723                 transposeScatterStoreU<3>(v, hw3, v_hw3[XX], v_hw3[YY], v_hw3[ZZ]);
724             }
725
726             if (bCalcVirial)
727             {
728                 /* Filter out the non-local settles */
729                 T filter = load<T>(settled->virfac + i);
730                 T mOf    = filter*mO;
731                 T mHf    = filter*mH;
732
733                 T mdo[DIM], mdb[DIM], mdc[DIM];
734
735                 for (int d = 0; d < DIM; d++)
736                 {
737                     mdb[d] = mHf*db[d];
738                     mdc[d] = mHf*dc[d];
739                     mdo[d] = mOf*da[d] + mdb[d] + mdc[d];
740                 }
741
742                 for (int d2 = 0; d2 < DIM; d2++)
743                 {
744                     for (int d = 0; d < DIM; d++)
745                     {
746                         sum_r_m_dr[d2][d] = sum_r_m_dr[d2][d] -
747                             (x_ow1[d2]*mdo[d] +
748                              dist21[d2]*mdb[d] +
749                              dist31[d2]*mdc[d]);
750                     }
751                 }
752                 /* 71 flops */
753             }
754         }
755     }
756
757     if (bCalcVirial)
758     {
759         for (int d2 = 0; d2 < DIM; d2++)
760         {
761             for (int d = 0; d < DIM; d++)
762             {
763                 vir_r_m_dr[d2][d] += reduce(sum_r_m_dr[d2][d]);
764             }
765         }
766     }
767
768     *bErrorHasOccurred = anyTrue(bError);
769 }
770
771 /*! \brief Wrapper template function that divides the settles over threads
772  * and instantiates the core template with instantiated booleans.
773  */
774 template<typename T, typename TypeBool, int packSize, typename TypePbc>
775 static void settleTemplateWrapper(settledata *settled,
776                                   int nthread, int thread,
777                                   TypePbc pbc,
778                                   const real x[], real xprime[],
779                                   real invdt, real *v,
780                                   bool bCalcVirial, tensor vir_r_m_dr,
781                                   bool *bErrorHasOccurred)
782 {
783     /* We need to assign settles to threads in groups of pack_size */
784     int numSettlePacks = (settled->nsettle + packSize - 1)/packSize;
785     /* Round the end value up to give thread 0 more work */
786     int settleStart    = ((numSettlePacks* thread      + nthread - 1)/nthread)*packSize;
787     int settleEnd      = ((numSettlePacks*(thread + 1) + nthread - 1)/nthread)*packSize;
788
789     if (v != nullptr)
790     {
791         if (!bCalcVirial)
792         {
793             settleTemplate<T, TypeBool, packSize,
794                            TypePbc,
795                            true,
796                            false>
797                 (settled, settleStart, settleEnd,
798                 pbc,
799                 x, xprime,
800                 invdt, v,
801                 nullptr,
802                 bErrorHasOccurred);
803         }
804         else
805         {
806             settleTemplate<T, TypeBool, packSize,
807                            TypePbc,
808                            true,
809                            true>
810                 (settled, settleStart, settleEnd,
811                 pbc,
812                 x, xprime,
813                 invdt, v,
814                 vir_r_m_dr,
815                 bErrorHasOccurred);
816         }
817     }
818     else
819     {
820         if (!bCalcVirial)
821         {
822             settleTemplate<T, TypeBool, packSize,
823                            TypePbc,
824                            false,
825                            false>
826                 (settled, settleStart, settleEnd,
827                 pbc,
828                 x, xprime,
829                 invdt, v,
830                 nullptr,
831                 bErrorHasOccurred);
832         }
833         else
834         {
835             settleTemplate<T, TypeBool, packSize,
836                            TypePbc,
837                            false,
838                            true>
839                 (settled, settleStart, settleEnd,
840                 pbc,
841                 x, xprime,
842                 invdt, v,
843                 vir_r_m_dr,
844                 bErrorHasOccurred);
845         }
846     }
847 }
848
849 void csettle(settledata *settled,
850              int nthread, int thread,
851              const t_pbc *pbc,
852              const real x[], real xprime[],
853              real invdt, real *v,
854              bool bCalcVirial, tensor vir_r_m_dr,
855              bool *bErrorHasOccurred)
856 {
857 #if GMX_SIMD_HAVE_REAL
858     if (settled->bUseSimd)
859     {
860         /* Convert the pbc struct for SIMD */
861         alignas(GMX_SIMD_ALIGNMENT) real    pbcSimd[9*GMX_SIMD_REAL_WIDTH];
862         set_pbc_simd(pbc, pbcSimd);
863
864         settleTemplateWrapper<SimdReal, SimdBool, GMX_SIMD_REAL_WIDTH,
865                               const real *>(settled,
866                                             nthread, thread,
867                                             pbcSimd,
868                                             x, xprime,
869                                             invdt,
870                                             v,
871                                             bCalcVirial, vir_r_m_dr,
872                                             bErrorHasOccurred);
873     }
874     else
875 #endif
876     {
877         /* This construct is needed because pbc_dx_aiuc doesn't accept pbc=NULL */
878         t_pbc        pbcNo;
879         const t_pbc *pbcNonNull;
880
881         if (pbc != nullptr)
882         {
883             pbcNonNull = pbc;
884         }
885         else
886         {
887             set_pbc(&pbcNo, epbcNONE, nullptr);
888             pbcNonNull = &pbcNo;
889         }
890
891         settleTemplateWrapper<real, bool, 1,
892                               const t_pbc *>(settled,
893                                              nthread, thread,
894                                              pbcNonNull,
895                                              x, xprime,
896                                              invdt,
897                                              v,
898                                              bCalcVirial, vir_r_m_dr,
899                                              bErrorHasOccurred);
900     }
901 }
902
903 }  // namespace gmx