74e9d498c3722ad6350ba6f0175a5b82289b1bf7
[alexxy/gromacs.git] / src / gromacs / mdlib / csettle.cpp
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
5  * Copyright (c) 2001-2004, The GROMACS development team.
6  * Copyright (c) 2013,2014,2015,2016, by the GROMACS development team, led by
7  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
8  * and including many others, as listed in the AUTHORS file in the
9  * top-level source directory and at http://www.gromacs.org.
10  *
11  * GROMACS is free software; you can redistribute it and/or
12  * modify it under the terms of the GNU Lesser General Public License
13  * as published by the Free Software Foundation; either version 2.1
14  * of the License, or (at your option) any later version.
15  *
16  * GROMACS is distributed in the hope that it will be useful,
17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
19  * Lesser General Public License for more details.
20  *
21  * You should have received a copy of the GNU Lesser General Public
22  * License along with GROMACS; if not, see
23  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
24  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
25  *
26  * If you want to redistribute modifications to GROMACS, please
27  * consider that scientific software is very special. Version
28  * control is crucial - bugs must be traceable. We will be happy to
29  * consider code for inclusion in the official distribution, but
30  * derived work must not be called official GROMACS. Details are found
31  * in the README & COPYING files - if they are missing, get the
32  * official version at http://www.gromacs.org.
33  *
34  * To help us fund GROMACS development, we humbly ask that you cite
35  * the research papers on the package. Check out http://www.gromacs.org.
36  */
37 #include "gmxpre.h"
38
39 #include <assert.h>
40 #include <math.h>
41 #include <stdio.h>
42
43 #include <algorithm>
44
45 #include "gromacs/math/functions.h"
46 #include "gromacs/math/invertmatrix.h"
47 #include "gromacs/math/vec.h"
48 #include "gromacs/mdlib/constr.h"
49 #include "gromacs/mdtypes/mdatom.h"
50 #include "gromacs/pbcutil/ishift.h"
51 #include "gromacs/pbcutil/pbc.h"
52 #include "gromacs/pbcutil/pbc-simd.h"
53 #include "gromacs/simd/simd.h"
54 #include "gromacs/simd/simd_math.h"
55 #include "gromacs/topology/mtop_util.h"
56 #include "gromacs/utility/fatalerror.h"
57 #include "gromacs/utility/gmxassert.h"
58 #include "gromacs/utility/smalloc.h"
59
60 using namespace gmx; // TODO: Remove when this file is moved into gmx namespace
61
62 typedef struct
63 {
64     real   mO;
65     real   mH;
66     real   wh;
67     real   dOH;
68     real   dHH;
69     real   ra;
70     real   rb;
71     real   rc;
72     real   irc2;
73     /* For projection */
74     real   imO;
75     real   imH;
76     real   invdOH;
77     real   invdHH;
78     matrix invmat;
79 } settleparam_t;
80
81 typedef struct gmx_settledata
82 {
83     settleparam_t massw;    /* Parameters for SETTLE for coordinates */
84     settleparam_t mass1;    /* Parameters with all masses 1, for forces */
85
86     int           nsettle;  /* The number of settles on our rank */
87     int          *ow1;      /* Index to OW1 atoms, size nsettle + SIMD padding */
88     int          *hw2;      /* Index to HW2 atoms, size nsettle + SIMD padding */
89     int          *hw3;      /* Index to HW3 atoms, size nsettle + SIMD padding */
90     real         *virfac;   /* Virial factor 0 or 1, size nsettle + SIMD pad. */
91     int           nalloc;   /* Allocation size of ow1, hw2, hw3, virfac */
92
93     bool          bUseSimd; /* Use SIMD intrinsics code, if possible */
94 } t_gmx_settledata;
95
96
97 static void init_proj_matrix(real invmO, real invmH, real dOH, real dHH,
98                              matrix inverseCouplingMatrix)
99 {
100     /* We normalize the inverse masses with invmO for the matrix inversion.
101      * so we can keep using masses of almost zero for frozen particles,
102      * without running out of the float range in invertMatrix.
103      */
104     double invmORelative = 1.0;
105     double invmHRelative = invmH/static_cast<double>(invmO);
106     double distanceRatio = dHH/static_cast<double>(dOH);
107
108     /* Construct the constraint coupling matrix */
109     matrix mat;
110     mat[0][0] = invmORelative + invmHRelative;
111     mat[0][1] = invmORelative*(1.0 - 0.5*gmx::square(distanceRatio));
112     mat[0][2] = invmHRelative*0.5*distanceRatio;
113     mat[1][1] = mat[0][0];
114     mat[1][2] = mat[0][2];
115     mat[2][2] = invmHRelative + invmHRelative;
116     mat[1][0] = mat[0][1];
117     mat[2][0] = mat[0][2];
118     mat[2][1] = mat[1][2];
119
120     invertMatrix(mat, inverseCouplingMatrix);
121
122     msmul(inverseCouplingMatrix, 1/invmO, inverseCouplingMatrix);
123 }
124
125 static void settleparam_init(settleparam_t *p,
126                              real mO, real mH, real invmO, real invmH,
127                              real dOH, real dHH)
128 {
129     /* We calculate parameters in double precision to minimize errors.
130      * The velocity correction applied during SETTLE coordinate constraining
131      * introduces a systematic error of approximately 1 bit per atom,
132      * depending on what the compiler does with the code.
133      */
134     double wohh;
135
136     p->mO     = mO;
137     p->mH     = mH;
138     wohh      = mO + 2.0*mH;
139     p->wh     = mH/wohh;
140     p->dOH    = dOH;
141     p->dHH    = dHH;
142     double rc = dHH/2.0;
143     double ra = 2.0*mH*std::sqrt(dOH*dOH - rc*rc)/wohh;
144     p->rb     = std::sqrt(dOH*dOH - rc*rc) - ra;
145     p->rc     = rc;
146     p->ra     = ra;
147     p->irc2   = 1.0/dHH;
148
149     /* For projection: inverse masses and coupling matrix inversion */
150     p->imO    = invmO;
151     p->imH    = invmH;
152
153     p->invdOH = 1.0/dOH;
154     p->invdHH = 1.0/dHH;
155
156     init_proj_matrix(invmO, invmH, dOH, dHH, p->invmat);
157
158     if (debug)
159     {
160         fprintf(debug, "wh =%g, rc = %g, ra = %g\n",
161                 p->wh, p->rc, p->ra);
162         fprintf(debug, "rb = %g, irc2 = %g, dHH = %g, dOH = %g\n",
163                 p->rb, p->irc2, p->dHH, p->dOH);
164     }
165 }
166
167 gmx_settledata_t settle_init(const gmx_mtop_t *mtop)
168 {
169     /* Check that we have only one settle type */
170     int                   settle_type = -1;
171     gmx_mtop_ilistloop_t  iloop       = gmx_mtop_ilistloop_init(mtop);
172     t_ilist              *ilist;
173     int                   nmol;
174     const int             nral1       = 1 + NRAL(F_SETTLE);
175     while (gmx_mtop_ilistloop_next(iloop, &ilist, &nmol))
176     {
177         for (int i = 0; i < ilist[F_SETTLE].nr; i += nral1)
178         {
179             if (settle_type == -1)
180             {
181                 settle_type = ilist[F_SETTLE].iatoms[i];
182             }
183             else if (ilist[F_SETTLE].iatoms[i] != settle_type)
184             {
185                 gmx_fatal(FARGS,
186                           "The [molecules] section of your topology specifies more than one block of\n"
187                           "a [moleculetype] with a [settles] block. Only one such is allowed.\n"
188                           "If you are trying to partition your solvent into different *groups*\n"
189                           "(e.g. for freezing, T-coupling, etc.), you are using the wrong approach. Index\n"
190                           "files specify groups. Otherwise, you may wish to change the least-used\n"
191                           "block of molecules with SETTLE constraints into 3 normal constraints.");
192             }
193         }
194     }
195     GMX_RELEASE_ASSERT(settle_type >= 0, "settle_init called without settles");
196
197     gmx_settledata_t settled;
198
199     snew(settled, 1);
200
201     /* We will not initialize the normal SETTLE parameters here yet,
202      * since the atom (inv)masses can depend on the integrator and
203      * free-energy perturbation. We set mO=-1 to trigger later initialization.
204      */
205     settled->massw.mO = -1;
206
207     real dOH = mtop->ffparams.iparams[settle_type].settle.doh;
208     real dHH = mtop->ffparams.iparams[settle_type].settle.dhh;
209     settleparam_init(&settled->mass1, 1.0, 1.0, 1.0, 1.0, dOH, dHH);
210
211     settled->ow1    = NULL;
212     settled->hw2    = NULL;
213     settled->hw3    = NULL;
214     settled->virfac = NULL;
215     settled->nalloc = 0;
216
217     /* Without SIMD configured, this bool is not used */
218     settled->bUseSimd = (getenv("GMX_DISABLE_SIMD_KERNELS") == NULL);
219
220     return settled;
221 }
222
223 void settle_free(gmx_settledata_t settled)
224 {
225     sfree_aligned(settled->ow1);
226     sfree_aligned(settled->hw2);
227     sfree_aligned(settled->hw3);
228     sfree_aligned(settled->virfac);
229     sfree(settled);
230 }
231
232 void settle_set_constraints(gmx_settledata_t  settled,
233                             const t_ilist    *il_settle,
234                             const t_mdatoms  *mdatoms)
235 {
236 #if GMX_SIMD_HAVE_REAL
237     const int pack_size = GMX_SIMD_REAL_WIDTH;
238 #else
239     const int pack_size = 1;
240 #endif
241
242     const int nral1     = 1 + NRAL(F_SETTLE);
243     int       nsettle   = il_settle->nr/nral1;
244     settled->nsettle = nsettle;
245
246     if (nsettle > 0)
247     {
248         const t_iatom *iatoms = il_settle->iatoms;
249
250         /* Here we initialize the normal SETTLE parameters */
251         if (settled->massw.mO < 0)
252         {
253             int firstO = iatoms[1];
254             int firstH = iatoms[2];
255             settleparam_init(&settled->massw,
256                              mdatoms->massT[firstO],
257                              mdatoms->massT[firstH],
258                              mdatoms->invmass[firstO],
259                              mdatoms->invmass[firstH],
260                              settled->mass1.dOH,
261                              settled->mass1.dHH);
262         }
263
264         if (nsettle + pack_size > settled->nalloc)
265         {
266             settled->nalloc = over_alloc_dd(nsettle + pack_size);
267             sfree_aligned(settled->ow1);
268             sfree_aligned(settled->hw2);
269             sfree_aligned(settled->hw3);
270             sfree_aligned(settled->virfac);
271             snew_aligned(settled->ow1, settled->nalloc, 64);
272             snew_aligned(settled->hw2, settled->nalloc, 64);
273             snew_aligned(settled->hw3, settled->nalloc, 64);
274             snew_aligned(settled->virfac, settled->nalloc, 64);
275         }
276
277         for (int i = 0; i < nsettle; i++)
278         {
279             settled->ow1[i]    = iatoms[i*nral1 + 1];
280             settled->hw2[i]    = iatoms[i*nral1 + 2];
281             settled->hw3[i]    = iatoms[i*nral1 + 3];
282             /* We should avoid double counting of virial contributions for
283              * SETTLEs that appear in multiple DD domains, so we only count
284              * the contribution on the home range of the oxygen atom.
285              */
286             settled->virfac[i] = (iatoms[i*nral1 + 1] < mdatoms->homenr ? 1 : 0);
287         }
288
289         /* Pack the index array to the full SIMD width with copies from
290          * the last normal entry, but with no virial contribution.
291          */
292         int end_packed = ((nsettle + pack_size - 1)/pack_size)*pack_size;
293         for (int i = nsettle; i < end_packed; i++)
294         {
295             settled->ow1[i]    = settled->ow1[nsettle - 1];
296             settled->hw2[i]    = settled->hw2[nsettle - 1];
297             settled->hw3[i]    = settled->hw3[nsettle - 1];
298             settled->virfac[i] = 0;
299         }
300     }
301 }
302
303 void settle_proj(gmx_settledata_t settled, int econq,
304                  int nsettle, t_iatom iatoms[],
305                  const t_pbc *pbc,
306                  rvec x[],
307                  rvec *der, rvec *derp,
308                  int calcvir_atom_end, tensor vir_r_m_dder)
309 {
310     /* Settle for projection out constraint components
311      * of derivatives of the coordinates.
312      * Berk Hess 2008-1-10
313      */
314
315     settleparam_t *p;
316     real           imO, imH, dOH, dHH, invdOH, invdHH;
317     matrix         invmat;
318     int            i, m, m2, ow1, hw2, hw3;
319     rvec           roh2, roh3, rhh, dc, fc;
320
321     calcvir_atom_end *= DIM;
322
323     if (econq == econqForce)
324     {
325         p = &settled->mass1;
326     }
327     else
328     {
329         p = &settled->massw;
330     }
331     imO    = p->imO;
332     imH    = p->imH;
333     copy_mat(p->invmat, invmat);
334     dOH    = p->dOH;
335     dHH    = p->dHH;
336     invdOH = p->invdOH;
337     invdHH = p->invdHH;
338
339 #ifdef PRAGMAS
340 #pragma ivdep
341 #endif
342
343     const int nral1 = 1 + NRAL(F_SETTLE);
344
345     for (i = 0; i < nsettle; i++)
346     {
347         ow1 = iatoms[i*nral1 + 1];
348         hw2 = iatoms[i*nral1 + 2];
349         hw3 = iatoms[i*nral1 + 3];
350
351         if (pbc == NULL)
352         {
353             rvec_sub(x[ow1], x[hw2], roh2);
354             rvec_sub(x[ow1], x[hw3], roh3);
355             rvec_sub(x[hw2], x[hw3], rhh);
356         }
357         else
358         {
359             pbc_dx_aiuc(pbc, x[ow1], x[hw2], roh2);
360             pbc_dx_aiuc(pbc, x[ow1], x[hw3], roh3);
361             pbc_dx_aiuc(pbc, x[hw2], x[hw3], rhh);
362         }
363         svmul(invdOH, roh2, roh2);
364         svmul(invdOH, roh3, roh3);
365         svmul(invdHH, rhh, rhh);
366         /* 18 flops */
367
368         /* Determine the projections of der on the bonds */
369         clear_rvec(dc);
370         for (m = 0; m < DIM; m++)
371         {
372             dc[0] += (der[ow1][m] - der[hw2][m])*roh2[m];
373             dc[1] += (der[ow1][m] - der[hw3][m])*roh3[m];
374             dc[2] += (der[hw2][m] - der[hw3][m])*rhh [m];
375         }
376         /* 27 flops */
377
378         /* Determine the correction for the three bonds */
379         mvmul(invmat, dc, fc);
380         /* 15 flops */
381
382         /* Subtract the corrections from derp */
383         for (m = 0; m < DIM; m++)
384         {
385             derp[ow1][m] -= imO*( fc[0]*roh2[m] + fc[1]*roh3[m]);
386             derp[hw2][m] -= imH*(-fc[0]*roh2[m] + fc[2]*rhh [m]);
387             derp[hw3][m] -= imH*(-fc[1]*roh3[m] - fc[2]*rhh [m]);
388         }
389
390         /* 45 flops */
391
392         if (ow1 < calcvir_atom_end)
393         {
394             /* Determining r \dot m der is easy,
395              * since fc contains the mass weighted corrections for der.
396              */
397
398             for (m = 0; m < DIM; m++)
399             {
400                 for (m2 = 0; m2 < DIM; m2++)
401                 {
402                     vir_r_m_dder[m][m2] +=
403                         dOH*roh2[m]*roh2[m2]*fc[0] +
404                         dOH*roh3[m]*roh3[m2]*fc[1] +
405                         dHH*rhh [m]*rhh [m2]*fc[2];
406                 }
407             }
408         }
409     }
410 }
411
412
413 /* The actual settle code, templated for real/SimdReal and for optimization */
414 template<typename T, typename TypeBool, int packSize,
415          typename TypePbc,
416          bool bCorrectVelocity,
417          bool bCalcVirial>
418 static void settleTemplate(const gmx_settledata_t settled,
419                            int settleStart, int settleEnd,
420                            const TypePbc pbc,
421                            const real *x, real *xprime,
422                            real invdt, real * gmx_restrict v,
423                            tensor vir_r_m_dr,
424                            bool *bErrorHasOccurred)
425 {
426     /* ******************************************************************* */
427     /*                                                                  ** */
428     /*    Original code by Shuichi Miyamoto, last update Oct. 1, 1992   ** */
429     /*                                                                  ** */
430     /*    Algorithm changes by Berk Hess:                               ** */
431     /*    2004-07-15 Convert COM to double precision to avoid drift     ** */
432     /*    2006-10-16 Changed velocity update to use differences         ** */
433     /*    2012-09-24 Use oxygen as reference instead of COM             ** */
434     /*    2016-02    Complete rewrite of the code for SIMD              ** */
435     /*                                                                  ** */
436     /*    Reference for the SETTLE algorithm                            ** */
437     /*           S. Miyamoto et al., J. Comp. Chem., 13, 952 (1992).    ** */
438     /*                                                                  ** */
439     /* ******************************************************************* */
440
441     assert(settleStart % packSize == 0);
442     assert(settleEnd   % packSize == 0);
443
444     TypeBool       bError = TypeBool(false);
445
446     settleparam_t *p    = &settled->massw;
447     T              wh   = T(p->wh);
448     T              rc   = T(p->rc);
449     T              ra   = T(p->ra);
450     T              rb   = T(p->rb);
451     T              irc2 = T(p->irc2);
452     T              mO   = T(p->mO);
453     T              mH   = T(p->mH);
454
455     T              almost_zero = T(1e-12);
456
457     T              sum_r_m_dr[DIM][DIM];
458
459     if (bCalcVirial)
460     {
461         for (int d2 = 0; d2 < DIM; d2++)
462         {
463             for (int d = 0; d < DIM; d++)
464             {
465                 sum_r_m_dr[d2][d] = T(0);
466             }
467         }
468     }
469
470     for (int i = settleStart; i < settleEnd; i += packSize)
471     {
472         /* Here we pad up to packSize with copies from the last valid entry.
473          * This gives correct results, since we store (not increment) all
474          * output, so we store the same output multiple times.
475          */
476         const int *ow1 = settled->ow1 + i;
477         const int *hw2 = settled->hw2 + i;
478         const int *hw3 = settled->hw3 + i;
479
480         T          x_ow1[DIM], x_hw2[DIM], x_hw3[DIM];
481
482         gatherLoadUTranspose<3>(x, ow1, &x_ow1[XX], &x_ow1[YY], &x_ow1[ZZ]);
483         gatherLoadUTranspose<3>(x, hw2, &x_hw2[XX], &x_hw2[YY], &x_hw2[ZZ]);
484         gatherLoadUTranspose<3>(x, hw3, &x_hw3[XX], &x_hw3[YY], &x_hw3[ZZ]);
485
486         T xprime_ow1[DIM], xprime_hw2[DIM], xprime_hw3[DIM];
487
488         gatherLoadUTranspose<3>(xprime, ow1, &xprime_ow1[XX], &xprime_ow1[YY], &xprime_ow1[ZZ]);
489         gatherLoadUTranspose<3>(xprime, hw2, &xprime_hw2[XX], &xprime_hw2[YY], &xprime_hw2[ZZ]);
490         gatherLoadUTranspose<3>(xprime, hw3, &xprime_hw3[XX], &xprime_hw3[YY], &xprime_hw3[ZZ]);
491
492         T dist21[DIM], dist31[DIM];
493         T doh2[DIM], doh3[DIM];
494         T sh_hw2[DIM], sh_hw3[DIM];
495
496         pbc_dx_aiuc(pbc, x_hw2, x_ow1, dist21);
497
498         pbc_dx_aiuc(pbc, x_hw3, x_ow1, dist31);
499
500         /* Tedious way of doing pbc */
501         pbc_dx_aiuc(pbc, xprime_hw2, xprime_ow1, doh2);
502         for (int d = 0; d < DIM; d++)
503         {
504             sh_hw2[d]     = xprime_hw2[d] - (xprime_ow1[d] + doh2[d]);
505             xprime_hw2[d] = xprime_hw2[d] - sh_hw2[d];
506         }
507         pbc_dx_aiuc(pbc, xprime_hw3, xprime_ow1, doh3);
508         for (int d = 0; d < DIM; d++)
509         {
510             sh_hw3[d]     = xprime_hw3[d] - (xprime_ow1[d] + doh3[d]);
511             xprime_hw3[d] = xprime_hw3[d] - sh_hw3[d];
512         }
513
514         /* Not calculating the center of mass using the oxygen position
515          * and the O-H distances, as done below, will make SETTLE
516          * the largest source of energy drift for simulations of water,
517          * as then the oxygen coordinate is multiplied by 0.89 at every step,
518          * which can then transfer a systematic rounding to the oxygen velocity.
519          */
520         T a1[DIM], com[DIM];
521         for (int d = 0; d < DIM; d++)
522         {
523             a1[d]  = -(doh2[d] + doh3[d]) * wh;
524             com[d] = xprime_ow1[d] - a1[d];
525         }
526         T b1[DIM];
527         for (int d = 0; d < DIM; d++)
528         {
529             b1[d]  = xprime_hw2[d] - com[d];
530         }
531         T c1[DIM];
532         for (int d = 0; d < DIM; d++)
533         {
534             c1[d]  = xprime_hw3[d] - com[d];
535         }
536         /* 15 flops */
537
538         T xakszd = dist21[YY] * dist31[ZZ] - dist21[ZZ] * dist31[YY];
539         T yakszd = dist21[ZZ] * dist31[XX] - dist21[XX] * dist31[ZZ];
540         T zakszd = dist21[XX] * dist31[YY] - dist21[YY] * dist31[XX];
541         T xaksxd = a1[YY] * zakszd - a1[ZZ] * yakszd;
542         T yaksxd = a1[ZZ] * xakszd - a1[XX] * zakszd;
543         T zaksxd = a1[XX] * yakszd - a1[YY] * xakszd;
544         T xaksyd = yakszd * zaksxd - zakszd * yaksxd;
545         T yaksyd = zakszd * xaksxd - xakszd * zaksxd;
546         T zaksyd = xakszd * yaksxd - yakszd * xaksxd;
547         /* 27 flops */
548
549         T axlng = gmx::invsqrt(xaksxd * xaksxd + yaksxd * yaksxd + zaksxd * zaksxd);
550         T aylng = gmx::invsqrt(xaksyd * xaksyd + yaksyd * yaksyd + zaksyd * zaksyd);
551         T azlng = gmx::invsqrt(xakszd * xakszd + yakszd * yakszd + zakszd * zakszd);
552
553         T trns1[DIM], trns2[DIM], trns3[DIM];
554
555         trns1[XX] = xaksxd * axlng;
556         trns2[XX] = yaksxd * axlng;
557         trns3[XX] = zaksxd * axlng;
558         trns1[YY] = xaksyd * aylng;
559         trns2[YY] = yaksyd * aylng;
560         trns3[YY] = zaksyd * aylng;
561         trns1[ZZ] = xakszd * azlng;
562         trns2[ZZ] = yakszd * azlng;
563         trns3[ZZ] = zakszd * azlng;
564         /* 24 flops */
565
566         T b0d[2], c0d[2];
567
568         for (int d = 0; d < 2; d++)
569         {
570             b0d[d] = trns1[d] * dist21[XX] + trns2[d] * dist21[YY] + trns3[d] * dist21[ZZ];
571             c0d[d] = trns1[d] * dist31[XX] + trns2[d] * dist31[YY] + trns3[d] * dist31[ZZ];
572         }
573
574         T a1d_z, b1d[DIM], c1d[DIM];
575
576         a1d_z = trns1[ZZ] * a1[XX] + trns2[ZZ] * a1[YY] + trns3[ZZ] * a1[ZZ];
577         for (int d = 0; d < DIM; d++)
578         {
579             b1d[d] = trns1[d] * b1[XX] + trns2[d] * b1[YY] + trns3[d] * b1[ZZ];
580             c1d[d] = trns1[d] * c1[XX] + trns2[d] * c1[YY] + trns3[d] * c1[ZZ];
581         }
582         /* 65 flops */
583
584         T tmp, tmp2;
585
586         T sinphi = a1d_z * gmx::invsqrt(ra*ra);
587         tmp2     = 1.0 - sinphi * sinphi;
588
589         /* If tmp2 gets close to or beyond zero we have severly distorted
590          * water molecules and we should terminate the simulation.
591          * Below we take the max with almost_zero to continue the loop.
592          */
593         bError   = bError || (tmp2 <= almost_zero);
594
595         tmp2     = max(tmp2, almost_zero);
596         tmp      = gmx::invsqrt(tmp2);
597         T cosphi = tmp2*tmp;
598         T sinpsi = (b1d[ZZ] - c1d[ZZ]) * irc2 * tmp;
599         tmp2     = 1.0 - sinpsi * sinpsi;
600
601         T cospsi = tmp2*gmx::invsqrt(tmp2);
602         /* 46 flops */
603
604         T a2d_y  =  ra * cosphi;
605         T b2d_x  = -rc * cospsi;
606         T t1     = -rb * cosphi;
607         T t2     =  rc * sinpsi * sinphi;
608         T b2d_y  =  t1 - t2;
609         T c2d_y  =  t1 + t2;
610         /* 7 flops */
611
612         /*     --- Step3  al,be,ga            --- */
613         T alpha  = b2d_x * (b0d[XX] - c0d[XX]) + b0d[YY] * b2d_y + c0d[YY] * c2d_y;
614         T beta   = b2d_x * (c0d[YY] - b0d[YY]) + b0d[XX] * b2d_y + c0d[XX] * c2d_y;
615         T gamma  = b0d[XX] * b1d[YY] - b1d[XX] * b0d[YY] + c0d[XX] * c1d[YY] - c1d[XX] * c0d[YY];
616         T al2be2 = alpha * alpha + beta * beta;
617         tmp2     = (al2be2 - gamma * gamma);
618         T sinthe = (alpha * gamma - beta * tmp2*gmx::invsqrt(tmp2)) * gmx::invsqrt(al2be2*al2be2);
619         /* 47 flops */
620
621         /*  --- Step4  A3' --- */
622         tmp2     = 1.0 - sinthe * sinthe;
623         T costhe = tmp2*gmx::invsqrt(tmp2);
624
625         T a3d[DIM], b3d[DIM], c3d[DIM];
626
627         a3d[XX]  = -a2d_y * sinthe;
628         a3d[YY]  = a2d_y * costhe;
629         a3d[ZZ]  = a1d_z;
630         b3d[XX]  = b2d_x * costhe - b2d_y * sinthe;
631         b3d[YY]  = b2d_x * sinthe + b2d_y * costhe;
632         b3d[ZZ]  = b1d[ZZ];
633         c3d[XX]  = -b2d_x * costhe - c2d_y * sinthe;
634         c3d[YY]  = -b2d_x * sinthe + c2d_y * costhe;
635         c3d[ZZ]  = c1d[ZZ];
636         /* 26 flops */
637
638         /*    --- Step5  A3 --- */
639         T a3[DIM], b3[DIM], c3[DIM];
640
641         a3[XX] = trns1[XX]*a3d[XX] + trns1[YY]*a3d[YY] + trns1[ZZ]*a3d[ZZ];
642         a3[YY] = trns2[XX]*a3d[XX] + trns2[YY]*a3d[YY] + trns2[ZZ]*a3d[ZZ];
643         a3[ZZ] = trns3[XX]*a3d[XX] + trns3[YY]*a3d[YY] + trns3[ZZ]*a3d[ZZ];
644         b3[XX] = trns1[XX]*b3d[XX] + trns1[YY]*b3d[YY] + trns1[ZZ]*b3d[ZZ];
645         b3[YY] = trns2[XX]*b3d[XX] + trns2[YY]*b3d[YY] + trns2[ZZ]*b3d[ZZ];
646         b3[ZZ] = trns3[XX]*b3d[XX] + trns3[YY]*b3d[YY] + trns3[ZZ]*b3d[ZZ];
647         c3[XX] = trns1[XX]*c3d[XX] + trns1[YY]*c3d[YY] + trns1[ZZ]*c3d[ZZ];
648         c3[YY] = trns2[XX]*c3d[XX] + trns2[YY]*c3d[YY] + trns2[ZZ]*c3d[ZZ];
649         c3[ZZ] = trns3[XX]*c3d[XX] + trns3[YY]*c3d[YY] + trns3[ZZ]*c3d[ZZ];
650         /* 45 flops */
651
652         /* Compute and store the corrected new coordinate */
653         for (int d = 0; d < DIM; d++)
654         {
655             xprime_ow1[d] = com[d] + a3[d];
656         }
657         for (int d = 0; d < DIM; d++)
658         {
659             xprime_hw2[d] = com[d] + b3[d] + sh_hw2[d];;
660         }
661         for (int d = 0; d < DIM; d++)
662         {
663             xprime_hw3[d] = com[d] + c3[d] + sh_hw3[d];
664         }
665         /* 9 flops + 6 pbc flops */
666
667         transposeScatterStoreU<3>(xprime, ow1, xprime_ow1[XX], xprime_ow1[YY], xprime_ow1[ZZ]);
668         transposeScatterStoreU<3>(xprime, hw2, xprime_hw2[XX], xprime_hw2[YY], xprime_hw2[ZZ]);
669         transposeScatterStoreU<3>(xprime, hw3, xprime_hw3[XX], xprime_hw3[YY], xprime_hw3[ZZ]);
670
671         // cppcheck-suppress duplicateExpression
672         if (bCorrectVelocity || bCalcVirial)
673         {
674             T da[DIM], db[DIM], dc[DIM];
675             for (int d = 0; d < DIM; d++)
676             {
677                 da[d] = a3[d] - a1[d];
678             }
679             for (int d = 0; d < DIM; d++)
680             {
681                 db[d] = b3[d] - b1[d];
682             }
683             for (int d = 0; d < DIM; d++)
684             {
685                 dc[d] = c3[d] - c1[d];
686             }
687             /* 9 flops */
688
689             if (bCorrectVelocity)
690             {
691                 T v_ow1[DIM], v_hw2[DIM], v_hw3[DIM];
692
693                 gatherLoadUTranspose<3>(v, ow1, &v_ow1[XX], &v_ow1[YY], &v_ow1[ZZ]);
694                 gatherLoadUTranspose<3>(v, hw2, &v_hw2[XX], &v_hw2[YY], &v_hw2[ZZ]);
695                 gatherLoadUTranspose<3>(v, hw3, &v_hw3[XX], &v_hw3[YY], &v_hw3[ZZ]);
696
697                 /* Add the position correction divided by dt to the velocity */
698                 for (int d = 0; d < DIM; d++)
699                 {
700                     v_ow1[d] = gmx::fma(da[d], invdt, v_ow1[d]);
701                 }
702                 for (int d = 0; d < DIM; d++)
703                 {
704                     v_hw2[d] = gmx::fma(db[d], invdt, v_hw2[d]);
705                 }
706                 for (int d = 0; d < DIM; d++)
707                 {
708                     v_hw3[d] = gmx::fma(dc[d], invdt, v_hw3[d]);
709                 }
710                 /* 3*6 flops */
711
712                 transposeScatterStoreU<3>(v, ow1, v_ow1[XX], v_ow1[YY], v_ow1[ZZ]);
713                 transposeScatterStoreU<3>(v, hw2, v_hw2[XX], v_hw2[YY], v_hw2[ZZ]);
714                 transposeScatterStoreU<3>(v, hw3, v_hw3[XX], v_hw3[YY], v_hw3[ZZ]);
715             }
716
717             if (bCalcVirial)
718             {
719                 /* Filter out the non-local settles */
720                 T filter = load(settled->virfac + i);
721                 T mOf    = filter*mO;
722                 T mHf    = filter*mH;
723
724                 T mdo[DIM], mdb[DIM], mdc[DIM];
725
726                 for (int d = 0; d < DIM; d++)
727                 {
728                     mdb[d] = mHf*db[d];
729                     mdc[d] = mHf*dc[d];
730                     mdo[d] = mOf*da[d] + mdb[d] + mdc[d];
731                 }
732
733                 for (int d2 = 0; d2 < DIM; d2++)
734                 {
735                     for (int d = 0; d < DIM; d++)
736                     {
737                         sum_r_m_dr[d2][d] = sum_r_m_dr[d2][d] -
738                             (x_ow1[d2]*mdo[d] +
739                              dist21[d2]*mdb[d] +
740                              dist31[d2]*mdc[d]);
741                     }
742                 }
743                 /* 71 flops */
744             }
745         }
746     }
747
748     if (bCalcVirial)
749     {
750         for (int d2 = 0; d2 < DIM; d2++)
751         {
752             for (int d = 0; d < DIM; d++)
753             {
754                 vir_r_m_dr[d2][d] += reduce(sum_r_m_dr[d2][d]);
755             }
756         }
757     }
758
759     *bErrorHasOccurred = anyTrue(bError);
760 }
761
762 /* Wrapper template function that divides the settles over threads
763  * and instantiates the core template with instantiated booleans.
764  */
765 template<typename T, typename TypeBool, int packSize, typename TypePbc>
766 static void settleTemplateWrapper(gmx_settledata_t settled,
767                                   int nthread, int thread,
768                                   TypePbc pbc,
769                                   const real x[], real xprime[],
770                                   real invdt, real *v,
771                                   bool bCalcVirial, tensor vir_r_m_dr,
772                                   bool *bErrorHasOccurred)
773 {
774     /* We need to assign settles to threads in groups of pack_size */
775     int numSettlePacks = (settled->nsettle + packSize - 1)/packSize;
776     /* Round the end value up to give thread 0 more work */
777     int settleStart    = ((numSettlePacks* thread      + nthread - 1)/nthread)*packSize;
778     int settleEnd      = ((numSettlePacks*(thread + 1) + nthread - 1)/nthread)*packSize;
779
780     if (v != NULL)
781     {
782         if (!bCalcVirial)
783         {
784             settleTemplate<T, TypeBool, packSize,
785                            TypePbc,
786                            true,
787                            false>
788                 (settled, settleStart, settleEnd,
789                 pbc,
790                 x, xprime,
791                 invdt, v,
792                 NULL,
793                 bErrorHasOccurred);
794         }
795         else
796         {
797             settleTemplate<T, TypeBool, packSize,
798                            TypePbc,
799                            true,
800                            true>
801                 (settled, settleStart, settleEnd,
802                 pbc,
803                 x, xprime,
804                 invdt, v,
805                 vir_r_m_dr,
806                 bErrorHasOccurred);
807         }
808     }
809     else
810     {
811         if (!bCalcVirial)
812         {
813             settleTemplate<T, TypeBool, packSize,
814                            TypePbc,
815                            false,
816                            false>
817                 (settled, settleStart, settleEnd,
818                 pbc,
819                 x, xprime,
820                 invdt, v,
821                 NULL,
822                 bErrorHasOccurred);
823         }
824         else
825         {
826             settleTemplate<T, TypeBool, packSize,
827                            TypePbc,
828                            false,
829                            true>
830                 (settled, settleStart, settleEnd,
831                 pbc,
832                 x, xprime,
833                 invdt, v,
834                 vir_r_m_dr,
835                 bErrorHasOccurred);
836         }
837     }
838 }
839
840 void csettle(gmx_settledata_t settled,
841              int nthread, int thread,
842              const t_pbc *pbc,
843              const real x[], real xprime[],
844              real invdt, real *v,
845              bool bCalcVirial, tensor vir_r_m_dr,
846              bool *bErrorHasOccurred)
847 {
848 #if GMX_SIMD_HAVE_REAL
849     if (settled->bUseSimd)
850     {
851         /* Convert the pbc struct for SIMD */
852         GMX_ALIGNED(real, GMX_SIMD_REAL_WIDTH) pbcSimd[9*GMX_SIMD_REAL_WIDTH];
853         set_pbc_simd(pbc, pbcSimd);
854
855         settleTemplateWrapper<SimdReal, SimdBool, GMX_SIMD_REAL_WIDTH,
856                               const real *>(settled,
857                                             nthread, thread,
858                                             pbcSimd,
859                                             x, xprime,
860                                             invdt,
861                                             v,
862                                             bCalcVirial, vir_r_m_dr,
863                                             bErrorHasOccurred);
864     }
865     else
866 #endif
867     {
868         /* This construct is needed because pbc_dx_aiuc doesn't accept pbc=NULL */
869         t_pbc        pbcNo;
870         const t_pbc *pbcNonNull;
871
872         if (pbc != NULL)
873         {
874             pbcNonNull = pbc;
875         }
876         else
877         {
878             set_pbc(&pbcNo, epbcNONE, NULL);
879             pbcNonNull = &pbcNo;
880         }
881
882         settleTemplateWrapper<real, bool, 1,
883                               const t_pbc *>(settled,
884                                              nthread, thread,
885                                              pbcNonNull,
886                                              x, xprime,
887                                              invdt,
888                                              v,
889                                              bCalcVirial, vir_r_m_dr,
890                                              bErrorHasOccurred);
891     }
892 }