Remove COM from SETTLE
authorBerk Hess <hess@kth.se>
Fri, 5 Jun 2020 11:53:37 +0000 (11:53 +0000)
committerPaul Bauer <paul.bauer.q@gmail.com>
Fri, 5 Jun 2020 11:53:37 +0000 (11:53 +0000)
SETTLE computed new postions of the atoms using the center of mass
of the molecule. But this adds rouding errors which lead to extra
energy drift. The use of the COM is now completely avoided, which
significantly improves energy conservation when coordinates are large
and also slighlty improves performance.
Corrected and updated the flop accounting for SETTLE.

docs/reference-manual/algorithms/constraint-algorithms.rst
docs/release-notes/2021/major/features.rst
src/gromacs/gmxlib/nrnb.cpp
src/gromacs/mdlib/settle.cpp
src/gromacs/mdlib/settle_gpu.cu

index baa12db8311e9cc421e4e9d6a312b32ce49e069f..cf5f70f94ecfebf8e9de05a487224d685bf8fdac 100644 (file)
@@ -60,7 +60,17 @@ SETTLE
 For the special case of rigid
 water molecules, that often make up more than 80% of the simulation
 system we have implemented the SETTLE algorithm \ :ref:`47 <refMiyamoto92>`
-(sec. :ref:`constraintalg`).
+(sec. :ref:`constraintalg`). The implementation of SETTLE in |Gromacs|
+is a slight modification of the original algorithm, in that it completely
+avoids the calculation of the center of mass of the water molecule.
+Apart from saving a few operations, the main gain of this is a reduction
+in rouding errors. For large coordinates, the floating pointing precision of constrained
+distances is reduced, which leads to an energy drift which usually depends
+quadratically on the coordinate. For SETTLE this dependence is now linear, which enables
+accurate integration of systems in single precision up to 1000 nm in size.
+But note that the drift due to SHAKE and LINCS still has a quadratic
+dependence, which limits the size of systems with normal constraints
+in single precision to 100 to 200 nm.
 
 For velocity Verlet, an additional round of constraining must be done,
 to constrain the velocities of the second velocity half step, removing
index 5626dc8d5462550503337c0106ea3f60b9df0848..82a35ba5d15d17f57408bf2c4b1c61809d7d5bd4 100644 (file)
@@ -7,3 +7,13 @@ New and improved features
    Also, please use the syntax :issue:`number` to reference issues on GitLab, without the
    a space between the colon and number!
 
+Lower energy drift due to SETTLE
+""""""""""""""""""""""""""""""""
+
+|Gromacs| already applied an improvement to the center of mass calculation in
+SETTLE to reduce energy drift in single precision. Now the center of mass
+calculation is completely avoided, which significantly reduces the energy
+drift when large coordinate values are present. This allows for accurate
+simulations of systems with SETTLE up to 1000 nm in size (but note that
+constraining with LINCS and SHAKE still introduces significant drift,
+which limits the system size to 100 to 200 nm).
index 0abe2ee2294e82858ef61a8c6edf1b8cf14a7f68..1fca45d7cddb063fb7e15a2a27810514ac6dcfa5 100644 (file)
@@ -171,10 +171,10 @@ static const t_nrnb_data nbdata[eNRNB] = {
     { "Lincs", 60 },
     { "Lincs-Mat", 4 },
     { "Shake", 30 },
-    { "Constraint-V", 8 },
+    { "Constraint-V", 9 },
     { "Shake-Init", 10 },
     { "Constraint-Vir", 24 },
-    { "Settle", 323 },
+    { "Settle", 370 },
     { "Virtual Site 2", 23 },
     { "Virtual Site 2fd", 63 },
     { "Virtual Site 3", 37 },
index fce4cce684c93ebfc2ba613c4048c791d1fed605..508bafc750c96cdce13d95da186ed5db65dd6c6b 100644 (file)
@@ -388,6 +388,7 @@ static void settleTemplate(const SettleData& settled,
     /*    2006-10-16 Changed velocity update to use differences         ** */
     /*    2012-09-24 Use oxygen as reference instead of COM             ** */
     /*    2016-02    Complete rewrite of the code for SIMD              ** */
+    /*    2020-06    Completely remove use of COM to minimize drift     ** */
     /*                                                                  ** */
     /*    Reference for the SETTLE algorithm                            ** */
     /*           S. Miyamoto et al., J. Comp. Chem., 13, 952 (1992).    ** */
@@ -447,49 +448,43 @@ static void settleTemplate(const SettleData& settled,
 
         T dist21[DIM], dist31[DIM];
         T doh2[DIM], doh3[DIM];
-        T sh_hw2[DIM], sh_hw3[DIM];
 
         pbc_dx_aiuc(pbc, x_hw2, x_ow1, dist21);
 
         pbc_dx_aiuc(pbc, x_hw3, x_ow1, dist31);
 
-        /* Tedious way of doing pbc */
         pbc_dx_aiuc(pbc, xprime_hw2, xprime_ow1, doh2);
-        for (int d = 0; d < DIM; d++)
-        {
-            sh_hw2[d]     = xprime_hw2[d] - (xprime_ow1[d] + doh2[d]);
-            xprime_hw2[d] = xprime_hw2[d] - sh_hw2[d];
-        }
+
         pbc_dx_aiuc(pbc, xprime_hw3, xprime_ow1, doh3);
-        for (int d = 0; d < DIM; d++)
-        {
-            sh_hw3[d]     = xprime_hw3[d] - (xprime_ow1[d] + doh3[d]);
-            xprime_hw3[d] = xprime_hw3[d] - sh_hw3[d];
-        }
+        /* 4 * 18 flops (would be 4 * 3 without PBC) */
 
-        /* Not calculating the center of mass using the oxygen position
-         * and the O-H distances, as done below, will make SETTLE
-         * the largest source of energy drift for simulations of water,
+        /* Note that we completely avoid computing the center of mass and
+         * only use distances. This minimizes energy drift and also makes
+         * the computation slightly cheaper.
+         * Straightforward computation of the COM, as in the original algorithm,
+         * makes SETTLE the largest source of energy drift for simulations of water,
          * as then the oxygen coordinate is multiplied by 0.89 at every step,
          * which can then transfer a systematic rounding to the oxygen velocity.
+         * For some time we computed the COM using offsets from the oxygen, this
+         * significantly reduces the energy drift, but not using the COM at all,
+         * as we do now, is optimal.
          */
-        T a1[DIM], com[DIM];
+        T a1[DIM];
         for (int d = 0; d < DIM; d++)
         {
-            a1[d]  = -(doh2[d] + doh3[d]) * wh;
-            com[d] = xprime_ow1[d] - a1[d];
+            a1[d] = -(doh2[d] + doh3[d]) * wh;
         }
         T b1[DIM];
         for (int d = 0; d < DIM; d++)
         {
-            b1[d] = xprime_hw2[d] - com[d];
+            b1[d] = doh2[d] + a1[d];
         }
         T c1[DIM];
         for (int d = 0; d < DIM; d++)
         {
-            c1[d] = xprime_hw3[d] - com[d];
+            c1[d] = doh3[d] + a1[d];
         }
-        /* 15 flops */
+        /* 12 flops */
 
         T xakszd = dist21[YY] * dist31[ZZ] - dist21[ZZ] * dist31[YY];
         T yakszd = dist21[ZZ] * dist31[XX] - dist21[XX] * dist31[ZZ];
@@ -606,96 +601,84 @@ static void settleTemplate(const SettleData& settled,
         /* 45 flops */
 
         /* Compute and store the corrected new coordinate */
+        T dxOw1[DIM];
         for (int d = 0; d < DIM; d++)
         {
-            xprime_ow1[d] = com[d] + a3[d];
+            dxOw1[d]      = a3[d] - a1[d];
+            xprime_ow1[d] = xprime_ow1[d] + dxOw1[d];
         }
+        T dxHw2[DIM];
         for (int d = 0; d < DIM; d++)
         {
-            xprime_hw2[d] = com[d] + b3[d] + sh_hw2[d];
+            dxHw2[d]      = b3[d] - b1[d];
+            xprime_hw2[d] = xprime_hw2[d] + dxHw2[d];
         }
+        T dxHw3[DIM];
         for (int d = 0; d < DIM; d++)
         {
-            xprime_hw3[d] = com[d] + c3[d] + sh_hw3[d];
+            dxHw3[d]      = c3[d] - c1[d];
+            xprime_hw3[d] = xprime_hw3[d] + dxHw3[d];
         }
-        /* 9 flops + 6 pbc flops */
+        /* 9 + 9 flops */
 
         transposeScatterStoreU<3>(xprime, ow1, xprime_ow1[XX], xprime_ow1[YY], xprime_ow1[ZZ]);
         transposeScatterStoreU<3>(xprime, hw2, xprime_hw2[XX], xprime_hw2[YY], xprime_hw2[ZZ]);
         transposeScatterStoreU<3>(xprime, hw3, xprime_hw3[XX], xprime_hw3[YY], xprime_hw3[ZZ]);
 
-        if (bCorrectVelocity || bCalcVirial)
+        if (bCorrectVelocity)
         {
-            T da[DIM], db[DIM], dc[DIM];
+            T v_ow1[DIM], v_hw2[DIM], v_hw3[DIM];
+
+            gatherLoadUTranspose<3>(v, ow1, &v_ow1[XX], &v_ow1[YY], &v_ow1[ZZ]);
+            gatherLoadUTranspose<3>(v, hw2, &v_hw2[XX], &v_hw2[YY], &v_hw2[ZZ]);
+            gatherLoadUTranspose<3>(v, hw3, &v_hw3[XX], &v_hw3[YY], &v_hw3[ZZ]);
+
+            /* Add the position correction divided by dt to the velocity */
             for (int d = 0; d < DIM; d++)
             {
-                da[d] = a3[d] - a1[d];
+                v_ow1[d] = gmx::fma(dxOw1[d], invdt, v_ow1[d]);
             }
             for (int d = 0; d < DIM; d++)
             {
-                db[d] = b3[d] - b1[d];
+                v_hw2[d] = gmx::fma(dxHw2[d], invdt, v_hw2[d]);
             }
             for (int d = 0; d < DIM; d++)
             {
-                dc[d] = c3[d] - c1[d];
+                v_hw3[d] = gmx::fma(dxHw3[d], invdt, v_hw3[d]);
             }
-            /* 9 flops */
+            /* 3*6 flops */
 
-            if (bCorrectVelocity)
-            {
-                T v_ow1[DIM], v_hw2[DIM], v_hw3[DIM];
+            transposeScatterStoreU<3>(v, ow1, v_ow1[XX], v_ow1[YY], v_ow1[ZZ]);
+            transposeScatterStoreU<3>(v, hw2, v_hw2[XX], v_hw2[YY], v_hw2[ZZ]);
+            transposeScatterStoreU<3>(v, hw3, v_hw3[XX], v_hw3[YY], v_hw3[ZZ]);
+        }
 
-                gatherLoadUTranspose<3>(v, ow1, &v_ow1[XX], &v_ow1[YY], &v_ow1[ZZ]);
-                gatherLoadUTranspose<3>(v, hw2, &v_hw2[XX], &v_hw2[YY], &v_hw2[ZZ]);
-                gatherLoadUTranspose<3>(v, hw3, &v_hw3[XX], &v_hw3[YY], &v_hw3[ZZ]);
+        if (bCalcVirial)
+        {
+            /* Filter out the non-local settles */
+            T filter = load<T>(settled.virfac() + i);
+            T mOf    = filter * mO;
+            T mHf    = filter * mH;
 
-                /* Add the position correction divided by dt to the velocity */
-                for (int d = 0; d < DIM; d++)
-                {
-                    v_ow1[d] = gmx::fma(da[d], invdt, v_ow1[d]);
-                }
-                for (int d = 0; d < DIM; d++)
-                {
-                    v_hw2[d] = gmx::fma(db[d], invdt, v_hw2[d]);
-                }
-                for (int d = 0; d < DIM; d++)
-                {
-                    v_hw3[d] = gmx::fma(dc[d], invdt, v_hw3[d]);
-                }
-                /* 3*6 flops */
+            T mdo[DIM], mdb[DIM], mdc[DIM];
 
-                transposeScatterStoreU<3>(v, ow1, v_ow1[XX], v_ow1[YY], v_ow1[ZZ]);
-                transposeScatterStoreU<3>(v, hw2, v_hw2[XX], v_hw2[YY], v_hw2[ZZ]);
-                transposeScatterStoreU<3>(v, hw3, v_hw3[XX], v_hw3[YY], v_hw3[ZZ]);
+            for (int d = 0; d < DIM; d++)
+            {
+                mdb[d] = mHf * dxHw2[d];
+                mdc[d] = mHf * dxHw3[d];
+                mdo[d] = mOf * dxOw1[d] + mdb[d] + mdc[d];
             }
 
-            if (bCalcVirial)
+            for (int d2 = 0; d2 < DIM; d2++)
             {
-                /* Filter out the non-local settles */
-                T filter = load<T>(settled.virfac() + i);
-                T mOf    = filter * mO;
-                T mHf    = filter * mH;
-
-                T mdo[DIM], mdb[DIM], mdc[DIM];
-
                 for (int d = 0; d < DIM; d++)
                 {
-                    mdb[d] = mHf * db[d];
-                    mdc[d] = mHf * dc[d];
-                    mdo[d] = mOf * da[d] + mdb[d] + mdc[d];
-                }
-
-                for (int d2 = 0; d2 < DIM; d2++)
-                {
-                    for (int d = 0; d < DIM; d++)
-                    {
-                        sum_r_m_dr[d2][d] =
-                                sum_r_m_dr[d2][d]
-                                - (x_ow1[d2] * mdo[d] + dist21[d2] * mdb[d] + dist31[d2] * mdc[d]);
-                    }
+                    sum_r_m_dr[d2][d] =
+                            sum_r_m_dr[d2][d]
+                            - (x_ow1[d2] * mdo[d] + dist21[d2] * mdb[d] + dist31[d2] * mdc[d]);
                 }
-                /* 71 flops */
             }
+            /* 71 flops */
         }
     }
 
index 804fab5e51c25778142c9e7be5229e5885295669..8b95a9fec5df227b842e8cda895897cc705103f7 100644 (file)
@@ -112,6 +112,7 @@ __launch_bounds__(c_maxThreadsPerBlock) __global__
     /*    2006-10-16 Changed velocity update to use differences         ** */
     /*    2012-09-24 Use oxygen as reference instead of COM             ** */
     /*    2016-02    Complete rewrite of the code for SIMD              ** */
+    /*    2020-06    Completely remove use of COM to minimize drift     ** */
     /*                                                                  ** */
     /*    Reference for the SETTLE algorithm                            ** */
     /*           S. Miyamoto et al., J. Comp. Chem., 13, 952 (1992).    ** */
@@ -142,20 +143,13 @@ __launch_bounds__(c_maxThreadsPerBlock) __global__
         float3 dist31 = pbcDxAiuc(pbcAiuc, x_hw3, x_ow1);
         float3 doh2   = pbcDxAiuc(pbcAiuc, xprime_hw2, xprime_ow1);
 
-        float3 sh_hw2 = xprime_hw2 - (xprime_ow1 + doh2);
-        xprime_hw2    = xprime_hw2 - sh_hw2;
-
         float3 doh3 = pbcDxAiuc(pbcAiuc, xprime_hw3, xprime_ow1);
 
-        float3 sh_hw3 = xprime_hw3 - (xprime_ow1 + doh3);
-        xprime_hw3    = xprime_hw3 - sh_hw3;
-
-        float3 a1  = (-doh2 - doh3) * pars.wh;
-        float3 com = xprime_ow1 - a1;
+        float3 a1 = (-doh2 - doh3) * pars.wh;
 
-        float3 b1 = xprime_hw2 - com;
+        float3 b1 = doh2 + a1;
 
-        float3 c1 = xprime_hw3 - com;
+        float3 c1 = doh3 + a1;
 
         float xakszd = dist21.y * dist31.z - dist21.z * dist31.y;
         float yakszd = dist21.z * dist31.x - dist21.x * dist31.z;
@@ -274,59 +268,48 @@ __launch_bounds__(c_maxThreadsPerBlock) __global__
 
 
         /* Compute and store the corrected new coordinate */
-        xprime_ow1 = com + a3;
-        xprime_hw2 = com + b3 + sh_hw2;
-        xprime_hw3 = com + c3 + sh_hw3;
+        const float3 dxOw1 = a3 - a1;
+        const float3 dxHw2 = b3 - b1;
+        const float3 dxHw3 = c3 - c1;
 
-        gm_xprime[indices.x] = xprime_ow1;
-        gm_xprime[indices.y] = xprime_hw2;
-        gm_xprime[indices.z] = xprime_hw3;
+        gm_xprime[indices.x] = xprime_ow1 + dxOw1;
+        gm_xprime[indices.y] = xprime_hw2 + dxHw2;
+        gm_xprime[indices.z] = xprime_hw3 + dxHw3;
 
-
-        if (updateVelocities || computeVirial)
+        if (updateVelocities)
         {
+            float3 v_ow1 = gm_v[indices.x];
+            float3 v_hw2 = gm_v[indices.y];
+            float3 v_hw3 = gm_v[indices.z];
+
+            /* Add the position correction divided by dt to the velocity */
+            v_ow1 = dxOw1 * invdt + v_ow1;
+            v_hw2 = dxHw2 * invdt + v_hw2;
+            v_hw3 = dxHw3 * invdt + v_hw3;
+
+            gm_v[indices.x] = v_ow1;
+            gm_v[indices.y] = v_hw2;
+            gm_v[indices.z] = v_hw3;
+        }
 
-            float3 da = a3 - a1;
-            float3 db = b3 - b1;
-            float3 dc = c3 - c1;
-
-            if (updateVelocities)
-            {
-
-                float3 v_ow1 = gm_v[indices.x];
-                float3 v_hw2 = gm_v[indices.y];
-                float3 v_hw3 = gm_v[indices.z];
-
-                /* Add the position correction divided by dt to the velocity */
-                v_ow1 = da * invdt + v_ow1;
-                v_hw2 = db * invdt + v_hw2;
-                v_hw3 = dc * invdt + v_hw3;
-
-                gm_v[indices.x] = v_ow1;
-                gm_v[indices.y] = v_hw2;
-                gm_v[indices.z] = v_hw3;
-            }
-
-            if (computeVirial)
-            {
-
-                float3 mdb = pars.mH * db;
-                float3 mdc = pars.mH * dc;
-                float3 mdo = pars.mO * da + mdb + mdc;
-
-                sm_threadVirial[0 * blockDim.x + threadIdx.x] =
-                        -(x_ow1.x * mdo.x + dist21.x * mdb.x + dist31.x * mdc.x);
-                sm_threadVirial[1 * blockDim.x + threadIdx.x] =
-                        -(x_ow1.x * mdo.y + dist21.x * mdb.y + dist31.x * mdc.y);
-                sm_threadVirial[2 * blockDim.x + threadIdx.x] =
-                        -(x_ow1.x * mdo.z + dist21.x * mdb.z + dist31.x * mdc.z);
-                sm_threadVirial[3 * blockDim.x + threadIdx.x] =
-                        -(x_ow1.y * mdo.y + dist21.y * mdb.y + dist31.y * mdc.y);
-                sm_threadVirial[4 * blockDim.x + threadIdx.x] =
-                        -(x_ow1.y * mdo.z + dist21.y * mdb.z + dist31.y * mdc.z);
-                sm_threadVirial[5 * blockDim.x + threadIdx.x] =
-                        -(x_ow1.z * mdo.z + dist21.z * mdb.z + dist31.z * mdc.z);
-            }
+        if (computeVirial)
+        {
+            float3 mdb = pars.mH * dxHw2;
+            float3 mdc = pars.mH * dxHw3;
+            float3 mdo = pars.mO * dxOw1 + mdb + mdc;
+
+            sm_threadVirial[0 * blockDim.x + threadIdx.x] =
+                    -(x_ow1.x * mdo.x + dist21.x * mdb.x + dist31.x * mdc.x);
+            sm_threadVirial[1 * blockDim.x + threadIdx.x] =
+                    -(x_ow1.x * mdo.y + dist21.x * mdb.y + dist31.x * mdc.y);
+            sm_threadVirial[2 * blockDim.x + threadIdx.x] =
+                    -(x_ow1.x * mdo.z + dist21.x * mdb.z + dist31.x * mdc.z);
+            sm_threadVirial[3 * blockDim.x + threadIdx.x] =
+                    -(x_ow1.y * mdo.y + dist21.y * mdb.y + dist31.y * mdc.y);
+            sm_threadVirial[4 * blockDim.x + threadIdx.x] =
+                    -(x_ow1.y * mdo.z + dist21.y * mdb.z + dist31.y * mdc.z);
+            sm_threadVirial[5 * blockDim.x + threadIdx.x] =
+                    -(x_ow1.z * mdo.z + dist21.z * mdb.z + dist31.z * mdc.z);
         }
     }
     else