Add cool quote
[alexxy/gromacs.git] / src / gromacs / mdlib / csettle.cpp
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
5  * Copyright (c) 2001-2004, The GROMACS development team.
6  * Copyright (c) 2013,2014,2015,2016,2017, by the GROMACS development team, led by
7  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
8  * and including many others, as listed in the AUTHORS file in the
9  * top-level source directory and at http://www.gromacs.org.
10  *
11  * GROMACS is free software; you can redistribute it and/or
12  * modify it under the terms of the GNU Lesser General Public License
13  * as published by the Free Software Foundation; either version 2.1
14  * of the License, or (at your option) any later version.
15  *
16  * GROMACS is distributed in the hope that it will be useful,
17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
19  * Lesser General Public License for more details.
20  *
21  * You should have received a copy of the GNU Lesser General Public
22  * License along with GROMACS; if not, see
23  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
24  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
25  *
26  * If you want to redistribute modifications to GROMACS, please
27  * consider that scientific software is very special. Version
28  * control is crucial - bugs must be traceable. We will be happy to
29  * consider code for inclusion in the official distribution, but
30  * derived work must not be called official GROMACS. Details are found
31  * in the README & COPYING files - if they are missing, get the
32  * official version at http://www.gromacs.org.
33  *
34  * To help us fund GROMACS development, we humbly ask that you cite
35  * the research papers on the package. Check out http://www.gromacs.org.
36  */
37 #include "gmxpre.h"
38
39 #include <assert.h>
40 #include <stdio.h>
41
42 #include <cmath>
43
44 #include <algorithm>
45
46 #include "gromacs/math/functions.h"
47 #include "gromacs/math/invertmatrix.h"
48 #include "gromacs/math/vec.h"
49 #include "gromacs/mdlib/constr.h"
50 #include "gromacs/mdtypes/mdatom.h"
51 #include "gromacs/pbcutil/ishift.h"
52 #include "gromacs/pbcutil/pbc.h"
53 #include "gromacs/pbcutil/pbc-simd.h"
54 #include "gromacs/simd/simd.h"
55 #include "gromacs/simd/simd_math.h"
56 #include "gromacs/topology/mtop_util.h"
57 #include "gromacs/utility/fatalerror.h"
58 #include "gromacs/utility/gmxassert.h"
59 #include "gromacs/utility/smalloc.h"
60
61 using namespace gmx; // TODO: Remove when this file is moved into gmx namespace
62
63 typedef struct
64 {
65     real   mO;
66     real   mH;
67     real   wh;
68     real   dOH;
69     real   dHH;
70     real   ra;
71     real   rb;
72     real   rc;
73     real   irc2;
74     /* For projection */
75     real   imO;
76     real   imH;
77     real   invdOH;
78     real   invdHH;
79     matrix invmat;
80 } settleparam_t;
81
82 typedef struct gmx_settledata
83 {
84     settleparam_t massw;    /* Parameters for SETTLE for coordinates */
85     settleparam_t mass1;    /* Parameters with all masses 1, for forces */
86
87     int           nsettle;  /* The number of settles on our rank */
88     int          *ow1;      /* Index to OW1 atoms, size nsettle + SIMD padding */
89     int          *hw2;      /* Index to HW2 atoms, size nsettle + SIMD padding */
90     int          *hw3;      /* Index to HW3 atoms, size nsettle + SIMD padding */
91     real         *virfac;   /* Virial factor 0 or 1, size nsettle + SIMD pad. */
92     int           nalloc;   /* Allocation size of ow1, hw2, hw3, virfac */
93
94     bool          bUseSimd; /* Use SIMD intrinsics code, if possible */
95 } t_gmx_settledata;
96
97
98 static void init_proj_matrix(real invmO, real invmH, real dOH, real dHH,
99                              matrix inverseCouplingMatrix)
100 {
101     /* We normalize the inverse masses with invmO for the matrix inversion.
102      * so we can keep using masses of almost zero for frozen particles,
103      * without running out of the float range in invertMatrix.
104      */
105     double invmORelative = 1.0;
106     double invmHRelative = invmH/static_cast<double>(invmO);
107     double distanceRatio = dHH/static_cast<double>(dOH);
108
109     /* Construct the constraint coupling matrix */
110     matrix mat;
111     mat[0][0] = invmORelative + invmHRelative;
112     mat[0][1] = invmORelative*(1.0 - 0.5*gmx::square(distanceRatio));
113     mat[0][2] = invmHRelative*0.5*distanceRatio;
114     mat[1][1] = mat[0][0];
115     mat[1][2] = mat[0][2];
116     mat[2][2] = invmHRelative + invmHRelative;
117     mat[1][0] = mat[0][1];
118     mat[2][0] = mat[0][2];
119     mat[2][1] = mat[1][2];
120
121     invertMatrix(mat, inverseCouplingMatrix);
122
123     msmul(inverseCouplingMatrix, 1/invmO, inverseCouplingMatrix);
124 }
125
126 static void settleparam_init(settleparam_t *p,
127                              real mO, real mH, real invmO, real invmH,
128                              real dOH, real dHH)
129 {
130     /* We calculate parameters in double precision to minimize errors.
131      * The velocity correction applied during SETTLE coordinate constraining
132      * introduces a systematic error of approximately 1 bit per atom,
133      * depending on what the compiler does with the code.
134      */
135     double wohh;
136
137     p->mO     = mO;
138     p->mH     = mH;
139     wohh      = mO + 2.0*mH;
140     p->wh     = mH/wohh;
141     p->dOH    = dOH;
142     p->dHH    = dHH;
143     double rc = dHH/2.0;
144     double ra = 2.0*mH*std::sqrt(dOH*dOH - rc*rc)/wohh;
145     p->rb     = std::sqrt(dOH*dOH - rc*rc) - ra;
146     p->rc     = rc;
147     p->ra     = ra;
148     p->irc2   = 1.0/dHH;
149
150     /* For projection: inverse masses and coupling matrix inversion */
151     p->imO    = invmO;
152     p->imH    = invmH;
153
154     p->invdOH = 1.0/dOH;
155     p->invdHH = 1.0/dHH;
156
157     init_proj_matrix(invmO, invmH, dOH, dHH, p->invmat);
158
159     if (debug)
160     {
161         fprintf(debug, "wh =%g, rc = %g, ra = %g\n",
162                 p->wh, p->rc, p->ra);
163         fprintf(debug, "rb = %g, irc2 = %g, dHH = %g, dOH = %g\n",
164                 p->rb, p->irc2, p->dHH, p->dOH);
165     }
166 }
167
168 gmx_settledata_t settle_init(const gmx_mtop_t *mtop)
169 {
170     /* Check that we have only one settle type */
171     int                   settle_type = -1;
172     gmx_mtop_ilistloop_t  iloop       = gmx_mtop_ilistloop_init(mtop);
173     t_ilist              *ilist;
174     int                   nmol;
175     const int             nral1       = 1 + NRAL(F_SETTLE);
176     while (gmx_mtop_ilistloop_next(iloop, &ilist, &nmol))
177     {
178         for (int i = 0; i < ilist[F_SETTLE].nr; i += nral1)
179         {
180             if (settle_type == -1)
181             {
182                 settle_type = ilist[F_SETTLE].iatoms[i];
183             }
184             else if (ilist[F_SETTLE].iatoms[i] != settle_type)
185             {
186                 gmx_fatal(FARGS,
187                           "The [molecules] section of your topology specifies more than one block of\n"
188                           "a [moleculetype] with a [settles] block. Only one such is allowed.\n"
189                           "If you are trying to partition your solvent into different *groups*\n"
190                           "(e.g. for freezing, T-coupling, etc.), you are using the wrong approach. Index\n"
191                           "files specify groups. Otherwise, you may wish to change the least-used\n"
192                           "block of molecules with SETTLE constraints into 3 normal constraints.");
193             }
194         }
195     }
196     GMX_RELEASE_ASSERT(settle_type >= 0, "settle_init called without settles");
197
198     gmx_settledata_t settled;
199
200     snew(settled, 1);
201
202     /* We will not initialize the normal SETTLE parameters here yet,
203      * since the atom (inv)masses can depend on the integrator and
204      * free-energy perturbation. We set mO=-1 to trigger later initialization.
205      */
206     settled->massw.mO = -1;
207
208     real dOH = mtop->ffparams.iparams[settle_type].settle.doh;
209     real dHH = mtop->ffparams.iparams[settle_type].settle.dhh;
210     settleparam_init(&settled->mass1, 1.0, 1.0, 1.0, 1.0, dOH, dHH);
211
212     settled->ow1    = nullptr;
213     settled->hw2    = nullptr;
214     settled->hw3    = nullptr;
215     settled->virfac = nullptr;
216     settled->nalloc = 0;
217
218     /* Without SIMD configured, this bool is not used */
219     settled->bUseSimd = (getenv("GMX_DISABLE_SIMD_KERNELS") == nullptr);
220
221     return settled;
222 }
223
224 void settle_free(gmx_settledata_t settled)
225 {
226     sfree_aligned(settled->ow1);
227     sfree_aligned(settled->hw2);
228     sfree_aligned(settled->hw3);
229     sfree_aligned(settled->virfac);
230     sfree(settled);
231 }
232
233 void settle_set_constraints(gmx_settledata_t  settled,
234                             const t_ilist    *il_settle,
235                             const t_mdatoms  *mdatoms)
236 {
237 #if GMX_SIMD_HAVE_REAL
238     const int pack_size = GMX_SIMD_REAL_WIDTH;
239 #else
240     const int pack_size = 1;
241 #endif
242
243     const int nral1     = 1 + NRAL(F_SETTLE);
244     int       nsettle   = il_settle->nr/nral1;
245     settled->nsettle = nsettle;
246
247     if (nsettle > 0)
248     {
249         const t_iatom *iatoms = il_settle->iatoms;
250
251         /* Here we initialize the normal SETTLE parameters */
252         if (settled->massw.mO < 0)
253         {
254             int firstO = iatoms[1];
255             int firstH = iatoms[2];
256             settleparam_init(&settled->massw,
257                              mdatoms->massT[firstO],
258                              mdatoms->massT[firstH],
259                              mdatoms->invmass[firstO],
260                              mdatoms->invmass[firstH],
261                              settled->mass1.dOH,
262                              settled->mass1.dHH);
263         }
264
265         if (nsettle + pack_size > settled->nalloc)
266         {
267             settled->nalloc = over_alloc_dd(nsettle + pack_size);
268             sfree_aligned(settled->ow1);
269             sfree_aligned(settled->hw2);
270             sfree_aligned(settled->hw3);
271             sfree_aligned(settled->virfac);
272             snew_aligned(settled->ow1, settled->nalloc, 64);
273             snew_aligned(settled->hw2, settled->nalloc, 64);
274             snew_aligned(settled->hw3, settled->nalloc, 64);
275             snew_aligned(settled->virfac, settled->nalloc, 64);
276         }
277
278         for (int i = 0; i < nsettle; i++)
279         {
280             settled->ow1[i]    = iatoms[i*nral1 + 1];
281             settled->hw2[i]    = iatoms[i*nral1 + 2];
282             settled->hw3[i]    = iatoms[i*nral1 + 3];
283             /* We should avoid double counting of virial contributions for
284              * SETTLEs that appear in multiple DD domains, so we only count
285              * the contribution on the home range of the oxygen atom.
286              */
287             settled->virfac[i] = (iatoms[i*nral1 + 1] < mdatoms->homenr ? 1 : 0);
288         }
289
290         /* Pack the index array to the full SIMD width with copies from
291          * the last normal entry, but with no virial contribution.
292          */
293         int end_packed = ((nsettle + pack_size - 1)/pack_size)*pack_size;
294         for (int i = nsettle; i < end_packed; i++)
295         {
296             settled->ow1[i]    = settled->ow1[nsettle - 1];
297             settled->hw2[i]    = settled->hw2[nsettle - 1];
298             settled->hw3[i]    = settled->hw3[nsettle - 1];
299             settled->virfac[i] = 0;
300         }
301     }
302 }
303
304 void settle_proj(gmx_settledata_t settled, int econq,
305                  int nsettle, t_iatom iatoms[],
306                  const t_pbc *pbc,
307                  rvec x[],
308                  rvec *der, rvec *derp,
309                  int calcvir_atom_end, tensor vir_r_m_dder)
310 {
311     /* Settle for projection out constraint components
312      * of derivatives of the coordinates.
313      * Berk Hess 2008-1-10
314      */
315
316     settleparam_t *p;
317     real           imO, imH, dOH, dHH, invdOH, invdHH;
318     matrix         invmat;
319     int            i, m, m2, ow1, hw2, hw3;
320     rvec           roh2, roh3, rhh, dc, fc;
321
322     calcvir_atom_end *= DIM;
323
324     if (econq == econqForce)
325     {
326         p = &settled->mass1;
327     }
328     else
329     {
330         p = &settled->massw;
331     }
332     imO    = p->imO;
333     imH    = p->imH;
334     copy_mat(p->invmat, invmat);
335     dOH    = p->dOH;
336     dHH    = p->dHH;
337     invdOH = p->invdOH;
338     invdHH = p->invdHH;
339
340 #ifdef PRAGMAS
341 #pragma ivdep
342 #endif
343
344     const int nral1 = 1 + NRAL(F_SETTLE);
345
346     for (i = 0; i < nsettle; i++)
347     {
348         ow1 = iatoms[i*nral1 + 1];
349         hw2 = iatoms[i*nral1 + 2];
350         hw3 = iatoms[i*nral1 + 3];
351
352         if (pbc == nullptr)
353         {
354             rvec_sub(x[ow1], x[hw2], roh2);
355             rvec_sub(x[ow1], x[hw3], roh3);
356             rvec_sub(x[hw2], x[hw3], rhh);
357         }
358         else
359         {
360             pbc_dx_aiuc(pbc, x[ow1], x[hw2], roh2);
361             pbc_dx_aiuc(pbc, x[ow1], x[hw3], roh3);
362             pbc_dx_aiuc(pbc, x[hw2], x[hw3], rhh);
363         }
364         svmul(invdOH, roh2, roh2);
365         svmul(invdOH, roh3, roh3);
366         svmul(invdHH, rhh, rhh);
367         /* 18 flops */
368
369         /* Determine the projections of der on the bonds */
370         clear_rvec(dc);
371         for (m = 0; m < DIM; m++)
372         {
373             dc[0] += (der[ow1][m] - der[hw2][m])*roh2[m];
374             dc[1] += (der[ow1][m] - der[hw3][m])*roh3[m];
375             dc[2] += (der[hw2][m] - der[hw3][m])*rhh [m];
376         }
377         /* 27 flops */
378
379         /* Determine the correction for the three bonds */
380         mvmul(invmat, dc, fc);
381         /* 15 flops */
382
383         /* Subtract the corrections from derp */
384         for (m = 0; m < DIM; m++)
385         {
386             derp[ow1][m] -= imO*( fc[0]*roh2[m] + fc[1]*roh3[m]);
387             derp[hw2][m] -= imH*(-fc[0]*roh2[m] + fc[2]*rhh [m]);
388             derp[hw3][m] -= imH*(-fc[1]*roh3[m] - fc[2]*rhh [m]);
389         }
390
391         /* 45 flops */
392
393         if (ow1 < calcvir_atom_end)
394         {
395             /* Determining r \dot m der is easy,
396              * since fc contains the mass weighted corrections for der.
397              */
398
399             for (m = 0; m < DIM; m++)
400             {
401                 for (m2 = 0; m2 < DIM; m2++)
402                 {
403                     vir_r_m_dder[m][m2] +=
404                         dOH*roh2[m]*roh2[m2]*fc[0] +
405                         dOH*roh3[m]*roh3[m2]*fc[1] +
406                         dHH*rhh [m]*rhh [m2]*fc[2];
407                 }
408             }
409         }
410     }
411 }
412
413
414 /* The actual settle code, templated for real/SimdReal and for optimization */
415 template<typename T, typename TypeBool, int packSize,
416          typename TypePbc,
417          bool bCorrectVelocity,
418          bool bCalcVirial>
419 static void settleTemplate(const gmx_settledata_t settled,
420                            int settleStart, int settleEnd,
421                            const TypePbc pbc,
422                            const real *x, real *xprime,
423                            real invdt, real * gmx_restrict v,
424                            tensor vir_r_m_dr,
425                            bool *bErrorHasOccurred)
426 {
427     /* ******************************************************************* */
428     /*                                                                  ** */
429     /*    Original code by Shuichi Miyamoto, last update Oct. 1, 1992   ** */
430     /*                                                                  ** */
431     /*    Algorithm changes by Berk Hess:                               ** */
432     /*    2004-07-15 Convert COM to double precision to avoid drift     ** */
433     /*    2006-10-16 Changed velocity update to use differences         ** */
434     /*    2012-09-24 Use oxygen as reference instead of COM             ** */
435     /*    2016-02    Complete rewrite of the code for SIMD              ** */
436     /*                                                                  ** */
437     /*    Reference for the SETTLE algorithm                            ** */
438     /*           S. Miyamoto et al., J. Comp. Chem., 13, 952 (1992).    ** */
439     /*                                                                  ** */
440     /* ******************************************************************* */
441
442     assert(settleStart % packSize == 0);
443     assert(settleEnd   % packSize == 0);
444
445     TypeBool       bError = TypeBool(false);
446
447     settleparam_t *p    = &settled->massw;
448     T              wh   = T(p->wh);
449     T              rc   = T(p->rc);
450     T              ra   = T(p->ra);
451     T              rb   = T(p->rb);
452     T              irc2 = T(p->irc2);
453     T              mO   = T(p->mO);
454     T              mH   = T(p->mH);
455
456     T              almost_zero = T(1e-12);
457
458     T              sum_r_m_dr[DIM][DIM];
459
460     if (bCalcVirial)
461     {
462         for (int d2 = 0; d2 < DIM; d2++)
463         {
464             for (int d = 0; d < DIM; d++)
465             {
466                 sum_r_m_dr[d2][d] = T(0);
467             }
468         }
469     }
470
471     for (int i = settleStart; i < settleEnd; i += packSize)
472     {
473         /* Here we pad up to packSize with copies from the last valid entry.
474          * This gives correct results, since we store (not increment) all
475          * output, so we store the same output multiple times.
476          */
477         const int *ow1 = settled->ow1 + i;
478         const int *hw2 = settled->hw2 + i;
479         const int *hw3 = settled->hw3 + i;
480
481         T          x_ow1[DIM], x_hw2[DIM], x_hw3[DIM];
482
483         gatherLoadUTranspose<3>(x, ow1, &x_ow1[XX], &x_ow1[YY], &x_ow1[ZZ]);
484         gatherLoadUTranspose<3>(x, hw2, &x_hw2[XX], &x_hw2[YY], &x_hw2[ZZ]);
485         gatherLoadUTranspose<3>(x, hw3, &x_hw3[XX], &x_hw3[YY], &x_hw3[ZZ]);
486
487         T xprime_ow1[DIM], xprime_hw2[DIM], xprime_hw3[DIM];
488
489         gatherLoadUTranspose<3>(xprime, ow1, &xprime_ow1[XX], &xprime_ow1[YY], &xprime_ow1[ZZ]);
490         gatherLoadUTranspose<3>(xprime, hw2, &xprime_hw2[XX], &xprime_hw2[YY], &xprime_hw2[ZZ]);
491         gatherLoadUTranspose<3>(xprime, hw3, &xprime_hw3[XX], &xprime_hw3[YY], &xprime_hw3[ZZ]);
492
493         T dist21[DIM], dist31[DIM];
494         T doh2[DIM], doh3[DIM];
495         T sh_hw2[DIM], sh_hw3[DIM];
496
497         pbc_dx_aiuc(pbc, x_hw2, x_ow1, dist21);
498
499         pbc_dx_aiuc(pbc, x_hw3, x_ow1, dist31);
500
501         /* Tedious way of doing pbc */
502         pbc_dx_aiuc(pbc, xprime_hw2, xprime_ow1, doh2);
503         for (int d = 0; d < DIM; d++)
504         {
505             sh_hw2[d]     = xprime_hw2[d] - (xprime_ow1[d] + doh2[d]);
506             xprime_hw2[d] = xprime_hw2[d] - sh_hw2[d];
507         }
508         pbc_dx_aiuc(pbc, xprime_hw3, xprime_ow1, doh3);
509         for (int d = 0; d < DIM; d++)
510         {
511             sh_hw3[d]     = xprime_hw3[d] - (xprime_ow1[d] + doh3[d]);
512             xprime_hw3[d] = xprime_hw3[d] - sh_hw3[d];
513         }
514
515         /* Not calculating the center of mass using the oxygen position
516          * and the O-H distances, as done below, will make SETTLE
517          * the largest source of energy drift for simulations of water,
518          * as then the oxygen coordinate is multiplied by 0.89 at every step,
519          * which can then transfer a systematic rounding to the oxygen velocity.
520          */
521         T a1[DIM], com[DIM];
522         for (int d = 0; d < DIM; d++)
523         {
524             a1[d]  = -(doh2[d] + doh3[d]) * wh;
525             com[d] = xprime_ow1[d] - a1[d];
526         }
527         T b1[DIM];
528         for (int d = 0; d < DIM; d++)
529         {
530             b1[d]  = xprime_hw2[d] - com[d];
531         }
532         T c1[DIM];
533         for (int d = 0; d < DIM; d++)
534         {
535             c1[d]  = xprime_hw3[d] - com[d];
536         }
537         /* 15 flops */
538
539         T xakszd = dist21[YY] * dist31[ZZ] - dist21[ZZ] * dist31[YY];
540         T yakszd = dist21[ZZ] * dist31[XX] - dist21[XX] * dist31[ZZ];
541         T zakszd = dist21[XX] * dist31[YY] - dist21[YY] * dist31[XX];
542         T xaksxd = a1[YY] * zakszd - a1[ZZ] * yakszd;
543         T yaksxd = a1[ZZ] * xakszd - a1[XX] * zakszd;
544         T zaksxd = a1[XX] * yakszd - a1[YY] * xakszd;
545         T xaksyd = yakszd * zaksxd - zakszd * yaksxd;
546         T yaksyd = zakszd * xaksxd - xakszd * zaksxd;
547         T zaksyd = xakszd * yaksxd - yakszd * xaksxd;
548         /* 27 flops */
549
550         T axlng = gmx::invsqrt(xaksxd * xaksxd + yaksxd * yaksxd + zaksxd * zaksxd);
551         T aylng = gmx::invsqrt(xaksyd * xaksyd + yaksyd * yaksyd + zaksyd * zaksyd);
552         T azlng = gmx::invsqrt(xakszd * xakszd + yakszd * yakszd + zakszd * zakszd);
553
554         T trns1[DIM], trns2[DIM], trns3[DIM];
555
556         trns1[XX] = xaksxd * axlng;
557         trns2[XX] = yaksxd * axlng;
558         trns3[XX] = zaksxd * axlng;
559         trns1[YY] = xaksyd * aylng;
560         trns2[YY] = yaksyd * aylng;
561         trns3[YY] = zaksyd * aylng;
562         trns1[ZZ] = xakszd * azlng;
563         trns2[ZZ] = yakszd * azlng;
564         trns3[ZZ] = zakszd * azlng;
565         /* 24 flops */
566
567         T b0d[2], c0d[2];
568
569         for (int d = 0; d < 2; d++)
570         {
571             b0d[d] = trns1[d] * dist21[XX] + trns2[d] * dist21[YY] + trns3[d] * dist21[ZZ];
572             c0d[d] = trns1[d] * dist31[XX] + trns2[d] * dist31[YY] + trns3[d] * dist31[ZZ];
573         }
574
575         T a1d_z, b1d[DIM], c1d[DIM];
576
577         a1d_z = trns1[ZZ] * a1[XX] + trns2[ZZ] * a1[YY] + trns3[ZZ] * a1[ZZ];
578         for (int d = 0; d < DIM; d++)
579         {
580             b1d[d] = trns1[d] * b1[XX] + trns2[d] * b1[YY] + trns3[d] * b1[ZZ];
581             c1d[d] = trns1[d] * c1[XX] + trns2[d] * c1[YY] + trns3[d] * c1[ZZ];
582         }
583         /* 65 flops */
584
585         T tmp, tmp2;
586
587         T sinphi = a1d_z * gmx::invsqrt(ra*ra);
588         tmp2     = 1.0 - sinphi * sinphi;
589
590         /* If tmp2 gets close to or beyond zero we have severly distorted
591          * water molecules and we should terminate the simulation.
592          * Below we take the max with almost_zero to continue the loop.
593          */
594         bError   = bError || (tmp2 <= almost_zero);
595
596         tmp2     = max(tmp2, almost_zero);
597         tmp      = gmx::invsqrt(tmp2);
598         T cosphi = tmp2*tmp;
599         T sinpsi = (b1d[ZZ] - c1d[ZZ]) * irc2 * tmp;
600         tmp2     = 1.0 - sinpsi * sinpsi;
601
602         T cospsi = tmp2*gmx::invsqrt(tmp2);
603         /* 46 flops */
604
605         T a2d_y  =  ra * cosphi;
606         T b2d_x  = -rc * cospsi;
607         T t1     = -rb * cosphi;
608         T t2     =  rc * sinpsi * sinphi;
609         T b2d_y  =  t1 - t2;
610         T c2d_y  =  t1 + t2;
611         /* 7 flops */
612
613         /*     --- Step3  al,be,ga            --- */
614         T alpha  = b2d_x * (b0d[XX] - c0d[XX]) + b0d[YY] * b2d_y + c0d[YY] * c2d_y;
615         T beta   = b2d_x * (c0d[YY] - b0d[YY]) + b0d[XX] * b2d_y + c0d[XX] * c2d_y;
616         T gamma  = b0d[XX] * b1d[YY] - b1d[XX] * b0d[YY] + c0d[XX] * c1d[YY] - c1d[XX] * c0d[YY];
617         T al2be2 = alpha * alpha + beta * beta;
618         tmp2     = (al2be2 - gamma * gamma);
619         T sinthe = (alpha * gamma - beta * tmp2*gmx::invsqrt(tmp2)) * gmx::invsqrt(al2be2*al2be2);
620         /* 47 flops */
621
622         /*  --- Step4  A3' --- */
623         tmp2     = 1.0 - sinthe * sinthe;
624         T costhe = tmp2*gmx::invsqrt(tmp2);
625
626         T a3d[DIM], b3d[DIM], c3d[DIM];
627
628         a3d[XX]  = -a2d_y * sinthe;
629         a3d[YY]  = a2d_y * costhe;
630         a3d[ZZ]  = a1d_z;
631         b3d[XX]  = b2d_x * costhe - b2d_y * sinthe;
632         b3d[YY]  = b2d_x * sinthe + b2d_y * costhe;
633         b3d[ZZ]  = b1d[ZZ];
634         c3d[XX]  = -b2d_x * costhe - c2d_y * sinthe;
635         c3d[YY]  = -b2d_x * sinthe + c2d_y * costhe;
636         c3d[ZZ]  = c1d[ZZ];
637         /* 26 flops */
638
639         /*    --- Step5  A3 --- */
640         T a3[DIM], b3[DIM], c3[DIM];
641
642         a3[XX] = trns1[XX]*a3d[XX] + trns1[YY]*a3d[YY] + trns1[ZZ]*a3d[ZZ];
643         a3[YY] = trns2[XX]*a3d[XX] + trns2[YY]*a3d[YY] + trns2[ZZ]*a3d[ZZ];
644         a3[ZZ] = trns3[XX]*a3d[XX] + trns3[YY]*a3d[YY] + trns3[ZZ]*a3d[ZZ];
645         b3[XX] = trns1[XX]*b3d[XX] + trns1[YY]*b3d[YY] + trns1[ZZ]*b3d[ZZ];
646         b3[YY] = trns2[XX]*b3d[XX] + trns2[YY]*b3d[YY] + trns2[ZZ]*b3d[ZZ];
647         b3[ZZ] = trns3[XX]*b3d[XX] + trns3[YY]*b3d[YY] + trns3[ZZ]*b3d[ZZ];
648         c3[XX] = trns1[XX]*c3d[XX] + trns1[YY]*c3d[YY] + trns1[ZZ]*c3d[ZZ];
649         c3[YY] = trns2[XX]*c3d[XX] + trns2[YY]*c3d[YY] + trns2[ZZ]*c3d[ZZ];
650         c3[ZZ] = trns3[XX]*c3d[XX] + trns3[YY]*c3d[YY] + trns3[ZZ]*c3d[ZZ];
651         /* 45 flops */
652
653         /* Compute and store the corrected new coordinate */
654         for (int d = 0; d < DIM; d++)
655         {
656             xprime_ow1[d] = com[d] + a3[d];
657         }
658         for (int d = 0; d < DIM; d++)
659         {
660             xprime_hw2[d] = com[d] + b3[d] + sh_hw2[d];;
661         }
662         for (int d = 0; d < DIM; d++)
663         {
664             xprime_hw3[d] = com[d] + c3[d] + sh_hw3[d];
665         }
666         /* 9 flops + 6 pbc flops */
667
668         transposeScatterStoreU<3>(xprime, ow1, xprime_ow1[XX], xprime_ow1[YY], xprime_ow1[ZZ]);
669         transposeScatterStoreU<3>(xprime, hw2, xprime_hw2[XX], xprime_hw2[YY], xprime_hw2[ZZ]);
670         transposeScatterStoreU<3>(xprime, hw3, xprime_hw3[XX], xprime_hw3[YY], xprime_hw3[ZZ]);
671
672         // cppcheck-suppress duplicateExpression
673         if (bCorrectVelocity || bCalcVirial)
674         {
675             T da[DIM], db[DIM], dc[DIM];
676             for (int d = 0; d < DIM; d++)
677             {
678                 da[d] = a3[d] - a1[d];
679             }
680             for (int d = 0; d < DIM; d++)
681             {
682                 db[d] = b3[d] - b1[d];
683             }
684             for (int d = 0; d < DIM; d++)
685             {
686                 dc[d] = c3[d] - c1[d];
687             }
688             /* 9 flops */
689
690             if (bCorrectVelocity)
691             {
692                 T v_ow1[DIM], v_hw2[DIM], v_hw3[DIM];
693
694                 gatherLoadUTranspose<3>(v, ow1, &v_ow1[XX], &v_ow1[YY], &v_ow1[ZZ]);
695                 gatherLoadUTranspose<3>(v, hw2, &v_hw2[XX], &v_hw2[YY], &v_hw2[ZZ]);
696                 gatherLoadUTranspose<3>(v, hw3, &v_hw3[XX], &v_hw3[YY], &v_hw3[ZZ]);
697
698                 /* Add the position correction divided by dt to the velocity */
699                 for (int d = 0; d < DIM; d++)
700                 {
701                     v_ow1[d] = gmx::fma(da[d], invdt, v_ow1[d]);
702                 }
703                 for (int d = 0; d < DIM; d++)
704                 {
705                     v_hw2[d] = gmx::fma(db[d], invdt, v_hw2[d]);
706                 }
707                 for (int d = 0; d < DIM; d++)
708                 {
709                     v_hw3[d] = gmx::fma(dc[d], invdt, v_hw3[d]);
710                 }
711                 /* 3*6 flops */
712
713                 transposeScatterStoreU<3>(v, ow1, v_ow1[XX], v_ow1[YY], v_ow1[ZZ]);
714                 transposeScatterStoreU<3>(v, hw2, v_hw2[XX], v_hw2[YY], v_hw2[ZZ]);
715                 transposeScatterStoreU<3>(v, hw3, v_hw3[XX], v_hw3[YY], v_hw3[ZZ]);
716             }
717
718             if (bCalcVirial)
719             {
720                 /* Filter out the non-local settles */
721                 T filter = load<T>(settled->virfac + i);
722                 T mOf    = filter*mO;
723                 T mHf    = filter*mH;
724
725                 T mdo[DIM], mdb[DIM], mdc[DIM];
726
727                 for (int d = 0; d < DIM; d++)
728                 {
729                     mdb[d] = mHf*db[d];
730                     mdc[d] = mHf*dc[d];
731                     mdo[d] = mOf*da[d] + mdb[d] + mdc[d];
732                 }
733
734                 for (int d2 = 0; d2 < DIM; d2++)
735                 {
736                     for (int d = 0; d < DIM; d++)
737                     {
738                         sum_r_m_dr[d2][d] = sum_r_m_dr[d2][d] -
739                             (x_ow1[d2]*mdo[d] +
740                              dist21[d2]*mdb[d] +
741                              dist31[d2]*mdc[d]);
742                     }
743                 }
744                 /* 71 flops */
745             }
746         }
747     }
748
749     if (bCalcVirial)
750     {
751         for (int d2 = 0; d2 < DIM; d2++)
752         {
753             for (int d = 0; d < DIM; d++)
754             {
755                 vir_r_m_dr[d2][d] += reduce(sum_r_m_dr[d2][d]);
756             }
757         }
758     }
759
760     *bErrorHasOccurred = anyTrue(bError);
761 }
762
763 /* Wrapper template function that divides the settles over threads
764  * and instantiates the core template with instantiated booleans.
765  */
766 template<typename T, typename TypeBool, int packSize, typename TypePbc>
767 static void settleTemplateWrapper(gmx_settledata_t settled,
768                                   int nthread, int thread,
769                                   TypePbc pbc,
770                                   const real x[], real xprime[],
771                                   real invdt, real *v,
772                                   bool bCalcVirial, tensor vir_r_m_dr,
773                                   bool *bErrorHasOccurred)
774 {
775     /* We need to assign settles to threads in groups of pack_size */
776     int numSettlePacks = (settled->nsettle + packSize - 1)/packSize;
777     /* Round the end value up to give thread 0 more work */
778     int settleStart    = ((numSettlePacks* thread      + nthread - 1)/nthread)*packSize;
779     int settleEnd      = ((numSettlePacks*(thread + 1) + nthread - 1)/nthread)*packSize;
780
781     if (v != nullptr)
782     {
783         if (!bCalcVirial)
784         {
785             settleTemplate<T, TypeBool, packSize,
786                            TypePbc,
787                            true,
788                            false>
789                 (settled, settleStart, settleEnd,
790                 pbc,
791                 x, xprime,
792                 invdt, v,
793                 nullptr,
794                 bErrorHasOccurred);
795         }
796         else
797         {
798             settleTemplate<T, TypeBool, packSize,
799                            TypePbc,
800                            true,
801                            true>
802                 (settled, settleStart, settleEnd,
803                 pbc,
804                 x, xprime,
805                 invdt, v,
806                 vir_r_m_dr,
807                 bErrorHasOccurred);
808         }
809     }
810     else
811     {
812         if (!bCalcVirial)
813         {
814             settleTemplate<T, TypeBool, packSize,
815                            TypePbc,
816                            false,
817                            false>
818                 (settled, settleStart, settleEnd,
819                 pbc,
820                 x, xprime,
821                 invdt, v,
822                 nullptr,
823                 bErrorHasOccurred);
824         }
825         else
826         {
827             settleTemplate<T, TypeBool, packSize,
828                            TypePbc,
829                            false,
830                            true>
831                 (settled, settleStart, settleEnd,
832                 pbc,
833                 x, xprime,
834                 invdt, v,
835                 vir_r_m_dr,
836                 bErrorHasOccurred);
837         }
838     }
839 }
840
841 void csettle(gmx_settledata_t settled,
842              int nthread, int thread,
843              const t_pbc *pbc,
844              const real x[], real xprime[],
845              real invdt, real *v,
846              bool bCalcVirial, tensor vir_r_m_dr,
847              bool *bErrorHasOccurred)
848 {
849 #if GMX_SIMD_HAVE_REAL
850     if (settled->bUseSimd)
851     {
852         /* Convert the pbc struct for SIMD */
853         GMX_ALIGNED(real, GMX_SIMD_REAL_WIDTH) pbcSimd[9*GMX_SIMD_REAL_WIDTH];
854         set_pbc_simd(pbc, pbcSimd);
855
856         settleTemplateWrapper<SimdReal, SimdBool, GMX_SIMD_REAL_WIDTH,
857                               const real *>(settled,
858                                             nthread, thread,
859                                             pbcSimd,
860                                             x, xprime,
861                                             invdt,
862                                             v,
863                                             bCalcVirial, vir_r_m_dr,
864                                             bErrorHasOccurred);
865     }
866     else
867 #endif
868     {
869         /* This construct is needed because pbc_dx_aiuc doesn't accept pbc=NULL */
870         t_pbc        pbcNo;
871         const t_pbc *pbcNonNull;
872
873         if (pbc != nullptr)
874         {
875             pbcNonNull = pbc;
876         }
877         else
878         {
879             set_pbc(&pbcNo, epbcNONE, nullptr);
880             pbcNonNull = &pbcNo;
881         }
882
883         settleTemplateWrapper<real, bool, 1,
884                               const t_pbc *>(settled,
885                                              nthread, thread,
886                                              pbcNonNull,
887                                              x, xprime,
888                                              invdt,
889                                              v,
890                                              bCalcVirial, vir_r_m_dr,
891                                              bErrorHasOccurred);
892     }
893 }