src/gromacs/mdlib/clincs.cpp

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
   5  * Copyright (c) 2001-2004, The GROMACS development team.
   6  * Copyright (c) 2013,2014,2015, by the GROMACS development team, led by
   7  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   8  * and including many others, as listed in the AUTHORS file in the
   9  * top-level source directory and at http://www.gromacs.org.
  10  *
  11  * GROMACS is free software; you can redistribute it and/or
  12  * modify it under the terms of the GNU Lesser General Public License
  13  * as published by the Free Software Foundation; either version 2.1
  14  * of the License, or (at your option) any later version.
  15  *
  16  * GROMACS is distributed in the hope that it will be useful,
  17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  19  * Lesser General Public License for more details.
  20  *
  21  * You should have received a copy of the GNU Lesser General Public
  22  * License along with GROMACS; if not, see
  23  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  24  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  25  *
  26  * If you want to redistribute modifications to GROMACS, please
  27  * consider that scientific software is very special. Version
  28  * control is crucial - bugs must be traceable. We will be happy to
  29  * consider code for inclusion in the official distribution, but
  30  * derived work must not be called official GROMACS. Details are found
  31  * in the README & COPYING files - if they are missing, get the
  32  * official version at http://www.gromacs.org.
  33  *
  34  * To help us fund GROMACS development, we humbly ask that you cite
  35  * the research papers on the package. Check out http://www.gromacs.org.
  36  */
  37 /* This file is completely threadsafe - keep it that way! */
  38 #include "gmxpre.h"
  39
  40 #include "config.h"
  41
  42 #include <assert.h>
  43 #include <math.h>
  44 #include <stdlib.h>
  45
  46 #include <algorithm>
  47
  48 #include "gromacs/domdec/domdec.h"
  49 #include "gromacs/fileio/gmxfio.h"
  50 #include "gromacs/legacyheaders/constr.h"
  51 #include "gromacs/legacyheaders/copyrite.h"
  52 #include "gromacs/legacyheaders/gmx_omp_nthreads.h"
  53 #include "gromacs/legacyheaders/mdrun.h"
  54 #include "gromacs/legacyheaders/nrnb.h"
  55 #include "gromacs/legacyheaders/types/commrec.h"
  56 #include "gromacs/math/units.h"
  57 #include "gromacs/math/vec.h"
  58 #include "gromacs/pbcutil/pbc-simd.h"
  59 #include "gromacs/pbcutil/pbc.h"
  60 #include "gromacs/simd/simd.h"
  61 #include "gromacs/simd/simd_math.h"
  62 #include "gromacs/simd/vector_operations.h"
  63 #include "gromacs/topology/block.h"
  64 #include "gromacs/topology/mtop_util.h"
  65 #include "gromacs/utility/bitmask.h"
  66 #include "gromacs/utility/fatalerror.h"
  67 #include "gromacs/utility/gmxomp.h"
  68 #include "gromacs/utility/smalloc.h"
  69
  70 /* MSVC 2010 produces buggy SIMD PBC code, disable SIMD for MSVC <= 2010 */
  71 #if defined GMX_SIMD_HAVE_REAL && !(defined _MSC_VER && _MSC_VER < 1700)
  72 #define LINCS_SIMD
  73 #endif
  74
  75 typedef struct {
  76     int    b0;         /* first constraint for this thread */
  77     int    b1;         /* b1-1 is the last constraint for this thread */
  78     int    nind;       /* number of indices */
  79     int   *ind;        /* constraint index for updating atom data */
  80     int    nind_r;     /* number of indices */
  81     int   *ind_r;      /* constraint index for updating atom data */
  82     int    ind_nalloc; /* allocation size of ind and ind_r */
  83     tensor vir_r_m_dr; /* temporary variable for virial calculation */
  84     real   dhdlambda;  /* temporary variable for lambda derivative */
  85     real  *simd_buf;   /* Aligned work array for SIMD */
  86 } lincs_thread_t;
  87
  88 typedef struct gmx_lincsdata {
  89     int             ncg;          /* the global number of constraints */
  90     int             ncg_flex;     /* the global number of flexible constraints */
  91     int             ncg_triangle; /* the global number of constraints in triangles */
  92     int             nIter;        /* the number of iterations */
  93     int             nOrder;       /* the order of the matrix expansion */
  94     int             nc;           /* the number of constraints */
  95     int             nc_alloc;     /* the number we allocated memory for */
  96     int             ncc;          /* the number of constraint connections */
  97     int             ncc_alloc;    /* the number we allocated memory for */
  98     real            matlam;       /* the FE lambda value used for filling blc and blmf */
  99     real           *bllen0;       /* the reference distance in topology A */
 100     real           *ddist;        /* the reference distance in top B - the r.d. in top A */
 101     int            *bla;          /* the atom pairs involved in the constraints */
 102     real           *blc;          /* 1/sqrt(invmass1 + invmass2) */
 103     real           *blc1;         /* as blc, but with all masses 1 */
 104     int            *blnr;         /* index into blbnb and blmf */
 105     int            *blbnb;        /* list of constraint connections */
 106     int             ntriangle;    /* the local number of constraints in triangles */
 107     int            *triangle;     /* the list of triangle constraints */
 108     int            *tri_bits;     /* the bits tell if the matrix element should be used */
 109     int             ncc_triangle; /* the number of constraint connections in triangles */
 110     gmx_bool        bCommIter;    /* communicate before each LINCS interation */
 111     real           *blmf;         /* matrix of mass factors for constraint connections */
 112     real           *blmf1;        /* as blmf, but with all masses 1 */
 113     real           *bllen;        /* the reference bond length */
 114     int             nth;          /* The number of threads doing LINCS */
 115     lincs_thread_t *th;           /* LINCS thread division */
 116     gmx_bitmask_t  *atf;          /* atom flags for thread parallelization */
 117     int             atf_nalloc;   /* allocation size of atf */
 118     /* arrays for temporary storage in the LINCS algorithm */
 119     rvec           *tmpv;
 120     real           *tmpncc;
 121     real           *tmp1;
 122     real           *tmp2;
 123     real           *tmp3;
 124     real           *tmp4;
 125     real           *mlambda; /* the Lagrange multipliers * -1 */
 126     /* storage for the constraint RMS relative deviation output */
 127     real            rmsd_data[3];
 128 } t_gmx_lincsdata;
 129
 130 /* Define simd_width for memory allocation used for SIMD code */
 131 #ifdef LINCS_SIMD
 132 static const int simd_width = GMX_SIMD_REAL_WIDTH;
 133 #else
 134 static const int simd_width = 1;
 135 #endif
 136 /* We can't use small memory alignments on many systems, so use min 64 bytes*/
 137 static const int align_bytes = std::max<int>(64, simd_width*sizeof(real));
 138
 139 real *lincs_rmsd_data(struct gmx_lincsdata *lincsd)
 140 {
 141     return lincsd->rmsd_data;
 142 }
 143
 144 real lincs_rmsd(struct gmx_lincsdata *lincsd, gmx_bool bSD2)
 145 {
 146     if (lincsd->rmsd_data[0] > 0)
 147     {
 148         return sqrt(lincsd->rmsd_data[bSD2 ? 2 : 1]/lincsd->rmsd_data[0]);
 149     }
 150     else
 151     {
 152         return 0;
 153     }
 154 }
 155
 156 /* Do a set of nrec LINCS matrix multiplications.
 157  * This function will return with up to date thread-local
 158  * constraint data, without an OpenMP barrier.
 159  */
 160 static void lincs_matrix_expand(const struct gmx_lincsdata *lincsd,
 161                                 int b0, int b1,
 162                                 const real *blcc,
 163                                 real *rhs1, real *rhs2, real *sol)
 164 {
 165     int        nrec, rec, b, j, n, nr0, nr1;
 166     real       mvb, *swap;
 167     int        ntriangle, tb, bits;
 168     const int *blnr     = lincsd->blnr, *blbnb = lincsd->blbnb;
 169     const int *triangle = lincsd->triangle, *tri_bits = lincsd->tri_bits;
 170
 171     ntriangle = lincsd->ntriangle;
 172     nrec      = lincsd->nOrder;
 173
 174     for (rec = 0; rec < nrec; rec++)
 175     {
 176 #pragma omp barrier
 177         for (b = b0; b < b1; b++)
 178         {
 179             mvb = 0;
 180             for (n = blnr[b]; n < blnr[b+1]; n++)
 181             {
 182                 j   = blbnb[n];
 183                 mvb = mvb + blcc[n]*rhs1[j];
 184             }
 185             rhs2[b] = mvb;
 186             sol[b]  = sol[b] + mvb;
 187         }
 188         swap = rhs1;
 189         rhs1 = rhs2;
 190         rhs2 = swap;
 191     } /* nrec*(ncons+2*nrtot) flops */
 192
 193     if (ntriangle > 0)
 194     {
 195         /* Perform an extra nrec recursions for only the constraints
 196          * involved in rigid triangles.
 197          * In this way their accuracy should come close to those of the other
 198          * constraints, since traingles of constraints can produce eigenvalues
 199          * around 0.7, while the effective eigenvalue for bond constraints
 200          * is around 0.4 (and 0.7*0.7=0.5).
 201          */
 202         /* We need to copy the temporary array, since only the elements
 203          * for constraints involved in triangles are updated and then
 204          * the pointers are swapped. This saves copying the whole array.
 205          * We need a barrier as other threads might still be reading from rhs2.
 206          */
 207 #pragma omp barrier
 208         for (b = b0; b < b1; b++)
 209         {
 210             rhs2[b] = rhs1[b];
 211         }
 212 #pragma omp barrier
 213 #pragma omp master
 214         {
 215             for (rec = 0; rec < nrec; rec++)
 216             {
 217                 for (tb = 0; tb < ntriangle; tb++)
 218                 {
 219                     b    = triangle[tb];
 220                     bits = tri_bits[tb];
 221                     mvb  = 0;
 222                     nr0  = blnr[b];
 223                     nr1  = blnr[b+1];
 224                     for (n = nr0; n < nr1; n++)
 225                     {
 226                         if (bits & (1<<(n-nr0)))
 227                         {
 228                             j   = blbnb[n];
 229                             mvb = mvb + blcc[n]*rhs1[j];
 230                         }
 231                     }
 232                     rhs2[b] = mvb;
 233                     sol[b]  = sol[b] + mvb;
 234                 }
 235                 swap = rhs1;
 236                 rhs1 = rhs2;
 237                 rhs2 = swap;
 238             }
 239         } /* flops count is missing here */
 240
 241         /* We need a barrier here as the calling routine will continue
 242          * to operate on the thread-local constraints without barrier.
 243          */
 244 #pragma omp barrier
 245     }
 246 }
 247
 248 static void lincs_update_atoms_noind(int ncons, const int *bla,
 249                                      real prefac,
 250                                      const real *fac, rvec *r,
 251                                      const real *invmass,
 252                                      rvec *x)
 253 {
 254     int  b, i, j;
 255     real mvb, im1, im2, tmp0, tmp1, tmp2;
 256
 257     if (invmass != NULL)
 258     {
 259         for (b = 0; b < ncons; b++)
 260         {
 261             i        = bla[2*b];
 262             j        = bla[2*b+1];
 263             mvb      = prefac*fac[b];
 264             im1      = invmass[i];
 265             im2      = invmass[j];
 266             tmp0     = r[b][0]*mvb;
 267             tmp1     = r[b][1]*mvb;
 268             tmp2     = r[b][2]*mvb;
 269             x[i][0] -= tmp0*im1;
 270             x[i][1] -= tmp1*im1;
 271             x[i][2] -= tmp2*im1;
 272             x[j][0] += tmp0*im2;
 273             x[j][1] += tmp1*im2;
 274             x[j][2] += tmp2*im2;
 275         } /* 16 ncons flops */
 276     }
 277     else
 278     {
 279         for (b = 0; b < ncons; b++)
 280         {
 281             i        = bla[2*b];
 282             j        = bla[2*b+1];
 283             mvb      = prefac*fac[b];
 284             tmp0     = r[b][0]*mvb;
 285             tmp1     = r[b][1]*mvb;
 286             tmp2     = r[b][2]*mvb;
 287             x[i][0] -= tmp0;
 288             x[i][1] -= tmp1;
 289             x[i][2] -= tmp2;
 290             x[j][0] += tmp0;
 291             x[j][1] += tmp1;
 292             x[j][2] += tmp2;
 293         }
 294     }
 295 }
 296
 297 static void lincs_update_atoms_ind(int ncons, const int *ind, const int *bla,
 298                                    real prefac,
 299                                    const real *fac, rvec *r,
 300                                    const real *invmass,
 301                                    rvec *x)
 302 {
 303     int  bi, b, i, j;
 304     real mvb, im1, im2, tmp0, tmp1, tmp2;
 305
 306     if (invmass != NULL)
 307     {
 308         for (bi = 0; bi < ncons; bi++)
 309         {
 310             b        = ind[bi];
 311             i        = bla[2*b];
 312             j        = bla[2*b+1];
 313             mvb      = prefac*fac[b];
 314             im1      = invmass[i];
 315             im2      = invmass[j];
 316             tmp0     = r[b][0]*mvb;
 317             tmp1     = r[b][1]*mvb;
 318             tmp2     = r[b][2]*mvb;
 319             x[i][0] -= tmp0*im1;
 320             x[i][1] -= tmp1*im1;
 321             x[i][2] -= tmp2*im1;
 322             x[j][0] += tmp0*im2;
 323             x[j][1] += tmp1*im2;
 324             x[j][2] += tmp2*im2;
 325         } /* 16 ncons flops */
 326     }
 327     else
 328     {
 329         for (bi = 0; bi < ncons; bi++)
 330         {
 331             b        = ind[bi];
 332             i        = bla[2*b];
 333             j        = bla[2*b+1];
 334             mvb      = prefac*fac[b];
 335             tmp0     = r[b][0]*mvb;
 336             tmp1     = r[b][1]*mvb;
 337             tmp2     = r[b][2]*mvb;
 338             x[i][0] -= tmp0;
 339             x[i][1] -= tmp1;
 340             x[i][2] -= tmp2;
 341             x[j][0] += tmp0;
 342             x[j][1] += tmp1;
 343             x[j][2] += tmp2;
 344         } /* 16 ncons flops */
 345     }
 346 }
 347
 348 static void lincs_update_atoms(struct gmx_lincsdata *li, int th,
 349                                real prefac,
 350                                const real *fac, rvec *r,
 351                                const real *invmass,
 352                                rvec *x)
 353 {
 354     if (li->nth == 1)
 355     {
 356         /* Single thread, we simply update for all constraints */
 357         lincs_update_atoms_noind(li->nc, li->bla, prefac, fac, r, invmass, x);
 358     }
 359     else
 360     {
 361         /* Update the atom vector components for our thread local
 362          * constraints that only access our local atom range.
 363          * This can be done without a barrier.
 364          */
 365         lincs_update_atoms_ind(li->th[th].nind, li->th[th].ind,
 366                                li->bla, prefac, fac, r, invmass, x);
 367
 368         if (li->th[li->nth].nind > 0)
 369         {
 370             /* Update the constraints that operate on atoms
 371              * in multiple thread atom blocks on the master thread.
 372              */
 373 #pragma omp barrier
 374 #pragma omp master
 375             {
 376                 lincs_update_atoms_ind(li->th[li->nth].nind,
 377                                        li->th[li->nth].ind,
 378                                        li->bla, prefac, fac, r, invmass, x);
 379             }
 380         }
 381     }
 382 }
 383
 384 #ifdef LINCS_SIMD
 385 /* Calculate distances between indexed atom coordinate pairs
 386  * and store the result in 3 SIMD registers through an aligned buffer.
 387  * Start from index is and load GMX_SIMD_REAL_WIDTH elements.
 388  * Note that pair_index must contain valid indices up to is+GMX_SIMD_REAL_WIDTH.
 389  */
 390 static gmx_inline void gmx_simdcall
 391 rvec_dist_to_simd(const rvec      *x,
 392                   int              is,
 393                   const int       *pair_index,
 394                   real            *buf,
 395                   gmx_simd_real_t *dx,
 396                   gmx_simd_real_t *dy,
 397                   gmx_simd_real_t *dz)
 398 {
 399     int s, m;
 400
 401     for (s = 0; s < GMX_SIMD_REAL_WIDTH; s++)
 402     {
 403         /* Store the non PBC corrected distances packed and aligned */
 404         for (m = 0; m < DIM; m++)
 405         {
 406             buf[m*GMX_SIMD_REAL_WIDTH + s] =
 407                 x[pair_index[2*(is+s)]][m] - x[pair_index[2*(is+s) + 1]][m];
 408         }
 409     }
 410     *dx = gmx_simd_load_r(buf + 0*GMX_SIMD_REAL_WIDTH);
 411     *dy = gmx_simd_load_r(buf + 1*GMX_SIMD_REAL_WIDTH);
 412     *dz = gmx_simd_load_r(buf + 2*GMX_SIMD_REAL_WIDTH);
 413 }
 414
 415 /* Store a SIMD vector in GMX_SIMD_REAL_WIDTH rvecs through an aligned buffer */
 416 static gmx_inline void gmx_simdcall
 417 copy_simd_vec_to_rvec(gmx_simd_real_t  x,
 418                       gmx_simd_real_t  y,
 419                       gmx_simd_real_t  z,
 420                       real            *buf,
 421                       rvec            *v,
 422                       int              is)
 423 {
 424     int s, m;
 425
 426     gmx_simd_store_r(buf + 0*GMX_SIMD_REAL_WIDTH, x);
 427     gmx_simd_store_r(buf + 1*GMX_SIMD_REAL_WIDTH, y);
 428     gmx_simd_store_r(buf + 2*GMX_SIMD_REAL_WIDTH, z);
 429
 430     for (s = 0; s < GMX_SIMD_REAL_WIDTH; s++)
 431     {
 432         for (m = 0; m < DIM; m++)
 433         {
 434             v[is + s][m] = buf[m*GMX_SIMD_REAL_WIDTH + s];
 435         }
 436     }
 437 }
 438
 439 /* Calculate the constraint distance vectors r to project on from x.
 440  * Determine the right-hand side of the matrix equation using quantity f.
 441  * This function only differs from calc_dr_x_xp_simd below in that
 442  * no constraint length is subtracted and no PBC is used for f.
 443  */
 444 static void gmx_simdcall
 445 calc_dr_x_f_simd(int                       b0,
 446                  int                       b1,
 447                  const int *               bla,
 448                  const rvec * gmx_restrict x,
 449                  const rvec * gmx_restrict f,
 450                  const real * gmx_restrict blc,
 451                  const pbc_simd_t *        pbc_simd,
 452                  real * gmx_restrict       vbuf1,
 453                  real * gmx_restrict       vbuf2,
 454                  rvec * gmx_restrict       r,
 455                  real * gmx_restrict       rhs,
 456                  real * gmx_restrict       sol)
 457 {
 458     int bs;
 459
 460     assert(b0 % GMX_SIMD_REAL_WIDTH == 0);
 461
 462     for (bs = b0; bs < b1; bs += GMX_SIMD_REAL_WIDTH)
 463     {
 464         gmx_simd_real_t rx_S, ry_S, rz_S, n2_S, il_S;
 465         gmx_simd_real_t fx_S, fy_S, fz_S, ip_S, rhs_S;
 466
 467         rvec_dist_to_simd(x, bs, bla, vbuf1, &rx_S, &ry_S, &rz_S);
 468
 469         pbc_correct_dx_simd(&rx_S, &ry_S, &rz_S, pbc_simd);
 470
 471         n2_S  = gmx_simd_norm2_r(rx_S, ry_S, rz_S);
 472         il_S  = gmx_simd_invsqrt_r(n2_S);
 473
 474         rx_S  = gmx_simd_mul_r(rx_S, il_S);
 475         ry_S  = gmx_simd_mul_r(ry_S, il_S);
 476         rz_S  = gmx_simd_mul_r(rz_S, il_S);
 477
 478         copy_simd_vec_to_rvec(rx_S, ry_S, rz_S, vbuf1, r, bs);
 479
 480         rvec_dist_to_simd(f, bs, bla, vbuf2, &fx_S, &fy_S, &fz_S);
 481
 482         ip_S  = gmx_simd_iprod_r(rx_S, ry_S, rz_S,
 483                                  fx_S, fy_S, fz_S);
 484
 485         rhs_S = gmx_simd_mul_r(gmx_simd_load_r(blc + bs), ip_S);
 486
 487         gmx_simd_store_r(rhs + bs, rhs_S);
 488         gmx_simd_store_r(sol + bs, rhs_S);
 489     }
 490 }
 491 #endif /* LINCS_SIMD */
 492
 493 /* LINCS projection, works on derivatives of the coordinates */
 494 static void do_lincsp(rvec *x, rvec *f, rvec *fp, t_pbc *pbc,
 495                       struct gmx_lincsdata *lincsd, int th,
 496                       real *invmass,
 497                       int econq, gmx_bool bCalcDHDL,
 498                       gmx_bool bCalcVir, tensor rmdf)
 499 {
 500     int      b0, b1, b;
 501     int     *bla, *blnr, *blbnb;
 502     rvec    *r;
 503     real    *blc, *blmf, *blcc, *rhs1, *rhs2, *sol;
 504
 505     b0 = lincsd->th[th].b0;
 506     b1 = lincsd->th[th].b1;
 507
 508     bla    = lincsd->bla;
 509     r      = lincsd->tmpv;
 510     blnr   = lincsd->blnr;
 511     blbnb  = lincsd->blbnb;
 512     if (econq != econqForce)
 513     {
 514         /* Use mass-weighted parameters */
 515         blc  = lincsd->blc;
 516         blmf = lincsd->blmf;
 517     }
 518     else
 519     {
 520         /* Use non mass-weighted parameters */
 521         blc  = lincsd->blc1;
 522         blmf = lincsd->blmf1;
 523     }
 524     blcc   = lincsd->tmpncc;
 525     rhs1   = lincsd->tmp1;
 526     rhs2   = lincsd->tmp2;
 527     sol    = lincsd->tmp3;
 528
 529 #ifdef LINCS_SIMD
 530
 531     /* This SIMD code does the same as the plain-C code after the #else.
 532      * The only difference is that we always call pbc code, as with SIMD
 533      * the overhead of pbc computation (when not needed) is small.
 534      */
 535     pbc_simd_t pbc_simd;
 536
 537     /* Convert the pbc struct for SIMD */
 538     set_pbc_simd(pbc, &pbc_simd);
 539
 540     /* Compute normalized x i-j vectors, store in r.
 541      * Compute the inner product of r and xp i-j and store in rhs1.
 542      */
 543     calc_dr_x_f_simd(b0, b1, bla, x, f, blc,
 544                      &pbc_simd,
 545                      lincsd->th[th].simd_buf,
 546                      lincsd->th[th].simd_buf + GMX_SIMD_REAL_WIDTH*DIM,
 547                      r, rhs1, sol);
 548
 549 #else /* LINCS_SIMD */
 550
 551     /* Compute normalized i-j vectors */
 552     if (pbc)
 553     {
 554         for (b = b0; b < b1; b++)
 555         {
 556             rvec dx;
 557
 558             pbc_dx_aiuc(pbc, x[bla[2*b]], x[bla[2*b+1]], dx);
 559             unitv(dx, r[b]);
 560         }
 561     }
 562     else
 563     {
 564         for (b = b0; b < b1; b++)
 565         {
 566             rvec dx;
 567
 568             rvec_sub(x[bla[2*b]], x[bla[2*b+1]], dx);
 569             unitv(dx, r[b]);
 570         } /* 16 ncons flops */
 571     }
 572
 573     for (b = b0; b < b1; b++)
 574     {
 575         int  i, j;
 576         real mvb;
 577
 578         i       = bla[2*b];
 579         j       = bla[2*b+1];
 580         mvb     = blc[b]*(r[b][0]*(f[i][0] - f[j][0]) +
 581                           r[b][1]*(f[i][1] - f[j][1]) +
 582                           r[b][2]*(f[i][2] - f[j][2]));
 583         rhs1[b] = mvb;
 584         sol[b]  = mvb;
 585         /* 7 flops */
 586     }
 587
 588 #endif /* LINCS_SIMD */
 589
 590     /* We need a barrier, since the matrix construction below
 591      * can access entries in r of other threads.
 592      */
 593 #pragma omp barrier
 594
 595     /* Construct the (sparse) LINCS matrix */
 596     for (b = b0; b < b1; b++)
 597     {
 598         int n;
 599
 600         for (n = blnr[b]; n < blnr[b+1]; n++)
 601         {
 602             blcc[n] = blmf[n]*iprod(r[b], r[blbnb[n]]);
 603         } /* 6 nr flops */
 604     }
 605     /* Together: 23*ncons + 6*nrtot flops */
 606
 607     lincs_matrix_expand(lincsd, b0, b1, blcc, rhs1, rhs2, sol);
 608     /* nrec*(ncons+2*nrtot) flops */
 609
 610     if (econq == econqDeriv_FlexCon)
 611     {
 612         /* We only want to constraint the flexible constraints,
 613          * so we mask out the normal ones by setting sol to 0.
 614          */
 615         for (b = b0; b < b1; b++)
 616         {
 617             if (!(lincsd->bllen0[b] == 0 && lincsd->ddist[b] == 0))
 618             {
 619                 sol[b] = 0;
 620             }
 621         }
 622     }
 623
 624     /* We multiply sol by blc, so we can use lincs_update_atoms for OpenMP */
 625     for (b = b0; b < b1; b++)
 626     {
 627         sol[b] *= blc[b];
 628     }
 629
 630     /* When constraining forces, we should not use mass weighting,
 631      * so we pass invmass=NULL, which results in the use of 1 for all atoms.
 632      */
 633     lincs_update_atoms(lincsd, th, 1.0, sol, r,
 634                        (econq != econqForce) ? invmass : NULL, fp);
 635
 636     if (bCalcDHDL)
 637     {
 638         real dhdlambda;
 639
 640         dhdlambda = 0;
 641         for (b = b0; b < b1; b++)
 642         {
 643             dhdlambda -= sol[b]*lincsd->ddist[b];
 644         }
 645
 646         lincsd->th[th].dhdlambda = dhdlambda;
 647     }
 648
 649     if (bCalcVir)
 650     {
 651         /* Constraint virial,
 652          * determines sum r_bond x delta f,
 653          * where delta f is the constraint correction
 654          * of the quantity that is being constrained.
 655          */
 656         for (b = b0; b < b1; b++)
 657         {
 658             real mvb, tmp1;
 659             int  i, j;
 660
 661             mvb = lincsd->bllen[b]*sol[b];
 662             for (i = 0; i < DIM; i++)
 663             {
 664                 tmp1 = mvb*r[b][i];
 665                 for (j = 0; j < DIM; j++)
 666                 {
 667                     rmdf[i][j] += tmp1*r[b][j];
 668                 }
 669             }
 670         } /* 23 ncons flops */
 671     }
 672 }
 673
 674 #ifdef LINCS_SIMD
 675 /* Calculate the constraint distance vectors r to project on from x.
 676  * Determine the right-hand side of the matrix equation using coordinates xp.
 677  */
 678 static void gmx_simdcall
 679 calc_dr_x_xp_simd(int                       b0,
 680                   int                       b1,
 681                   const int *               bla,
 682                   const rvec * gmx_restrict x,
 683                   const rvec * gmx_restrict xp,
 684                   const real * gmx_restrict bllen,
 685                   const real * gmx_restrict blc,
 686                   const pbc_simd_t *        pbc_simd,
 687                   real * gmx_restrict       vbuf1,
 688                   real * gmx_restrict       vbuf2,
 689                   rvec * gmx_restrict       r,
 690                   real * gmx_restrict       rhs,
 691                   real * gmx_restrict       sol)
 692 {
 693     int bs;
 694
 695     assert(b0 % GMX_SIMD_REAL_WIDTH == 0);
 696
 697     for (bs = b0; bs < b1; bs += GMX_SIMD_REAL_WIDTH)
 698     {
 699         gmx_simd_real_t rx_S, ry_S, rz_S, n2_S, il_S;
 700         gmx_simd_real_t rxp_S, ryp_S, rzp_S, ip_S, rhs_S;
 701
 702         rvec_dist_to_simd(x, bs, bla, vbuf1, &rx_S, &ry_S, &rz_S);
 703
 704         pbc_correct_dx_simd(&rx_S, &ry_S, &rz_S, pbc_simd);
 705
 706         n2_S  = gmx_simd_norm2_r(rx_S, ry_S, rz_S);
 707         il_S  = gmx_simd_invsqrt_r(n2_S);
 708
 709         rx_S  = gmx_simd_mul_r(rx_S, il_S);
 710         ry_S  = gmx_simd_mul_r(ry_S, il_S);
 711         rz_S  = gmx_simd_mul_r(rz_S, il_S);
 712
 713         copy_simd_vec_to_rvec(rx_S, ry_S, rz_S, vbuf1, r, bs);
 714
 715         rvec_dist_to_simd(xp, bs, bla, vbuf2, &rxp_S, &ryp_S, &rzp_S);
 716
 717         pbc_correct_dx_simd(&rxp_S, &ryp_S, &rzp_S, pbc_simd);
 718
 719         ip_S  = gmx_simd_iprod_r(rx_S, ry_S, rz_S,
 720                                  rxp_S, ryp_S, rzp_S);
 721
 722         rhs_S = gmx_simd_mul_r(gmx_simd_load_r(blc + bs),
 723                                gmx_simd_sub_r(ip_S, gmx_simd_load_r(bllen + bs)));
 724
 725         gmx_simd_store_r(rhs + bs, rhs_S);
 726         gmx_simd_store_r(sol + bs, rhs_S);
 727     }
 728 }
 729 #endif /* LINCS_SIMD */
 730
 731 /* Determine the distances and right-hand side for the next iteration */
 732 static void calc_dist_iter(int                       b0,
 733                            int                       b1,
 734                            const int *               bla,
 735                            const rvec * gmx_restrict xp,
 736                            const real * gmx_restrict bllen,
 737                            const real * gmx_restrict blc,
 738                            const t_pbc *             pbc,
 739                            real                      wfac,
 740                            real * gmx_restrict       rhs,
 741                            real * gmx_restrict       sol,
 742                            gmx_bool *                bWarn)
 743 {
 744     int b;
 745
 746     for (b = b0; b < b1; b++)
 747     {
 748         real len, len2, dlen2, mvb;
 749         rvec dx;
 750
 751         len = bllen[b];
 752         if (pbc)
 753         {
 754             pbc_dx_aiuc(pbc, xp[bla[2*b]], xp[bla[2*b+1]], dx);
 755         }
 756         else
 757         {
 758             rvec_sub(xp[bla[2*b]], xp[bla[2*b+1]], dx);
 759         }
 760         len2  = len*len;
 761         dlen2 = 2*len2 - norm2(dx);
 762         if (dlen2 < wfac*len2)
 763         {
 764             /* not race free - see detailed comment in caller */
 765             *bWarn = TRUE;
 766         }
 767         if (dlen2 > 0)
 768         {
 769             mvb = blc[b]*(len - dlen2*gmx_invsqrt(dlen2));
 770         }
 771         else
 772         {
 773             mvb = blc[b]*len;
 774         }
 775         rhs[b]  = mvb;
 776         sol[b]  = mvb;
 777     } /* 20*ncons flops */
 778 }
 779
 780 #ifdef LINCS_SIMD
 781 /* As the function above, but using SIMD intrinsics */
 782 static void gmx_simdcall
 783 calc_dist_iter_simd(int                       b0,
 784                     int                       b1,
 785                     const int *               bla,
 786                     const rvec * gmx_restrict x,
 787                     const real * gmx_restrict bllen,
 788                     const real * gmx_restrict blc,
 789                     const pbc_simd_t *        pbc_simd,
 790                     real                      wfac,
 791                     real * gmx_restrict       vbuf,
 792                     real * gmx_restrict       rhs,
 793                     real * gmx_restrict       sol,
 794                     gmx_bool *                bWarn)
 795 {
 796     gmx_simd_real_t min_S  = gmx_simd_set1_r(GMX_REAL_MIN);
 797     gmx_simd_real_t two_S  = gmx_simd_set1_r(2.0);
 798     gmx_simd_real_t wfac_S = gmx_simd_set1_r(wfac);
 799     gmx_simd_bool_t warn_S;
 800
 801     int             bs;
 802
 803     assert(b0 % GMX_SIMD_REAL_WIDTH == 0);
 804
 805     /* Initialize all to FALSE */
 806     warn_S = gmx_simd_cmplt_r(two_S, gmx_simd_setzero_r());
 807
 808     for (bs = b0; bs < b1; bs += GMX_SIMD_REAL_WIDTH)
 809     {
 810         gmx_simd_real_t rx_S, ry_S, rz_S, n2_S;
 811         gmx_simd_real_t len_S, len2_S, dlen2_S, lc_S, blc_S;
 812
 813         rvec_dist_to_simd(x, bs, bla, vbuf, &rx_S, &ry_S, &rz_S);
 814
 815         pbc_correct_dx_simd(&rx_S, &ry_S, &rz_S, pbc_simd);
 816
 817         n2_S    = gmx_simd_norm2_r(rx_S, ry_S, rz_S);
 818
 819         len_S   = gmx_simd_load_r(bllen + bs);
 820         len2_S  = gmx_simd_mul_r(len_S, len_S);
 821
 822         dlen2_S = gmx_simd_fmsub_r(two_S, len2_S, n2_S);
 823
 824         warn_S  = gmx_simd_or_b(warn_S,
 825                                 gmx_simd_cmplt_r(dlen2_S,
 826                                                  gmx_simd_mul_r(wfac_S, len2_S)));
 827
 828         /* Avoid 1/0 by taking the max with REAL_MIN.
 829          * Note: when dlen2 is close to zero (90 degree constraint rotation),
 830          * the accuracy of the algorithm is no longer relevant.
 831          */
 832         dlen2_S = gmx_simd_max_r(dlen2_S, min_S);
 833
 834         lc_S    = gmx_simd_fnmadd_r(dlen2_S, gmx_simd_invsqrt_r(dlen2_S), len_S);
 835
 836         blc_S   = gmx_simd_load_r(blc + bs);
 837
 838         lc_S    = gmx_simd_mul_r(blc_S, lc_S);
 839
 840         gmx_simd_store_r(rhs + bs, lc_S);
 841         gmx_simd_store_r(sol + bs, lc_S);
 842     }
 843
 844     if (gmx_simd_anytrue_b(warn_S))
 845     {
 846         *bWarn = TRUE;
 847     }
 848 }
 849 #endif /* LINCS_SIMD */
 850
 851 static void do_lincs(rvec *x, rvec *xp, matrix box, t_pbc *pbc,
 852                      struct gmx_lincsdata *lincsd, int th,
 853                      const real *invmass,
 854                      t_commrec *cr,
 855                      gmx_bool bCalcDHDL,
 856                      real wangle, gmx_bool *bWarn,
 857                      real invdt, rvec * gmx_restrict v,
 858                      gmx_bool bCalcVir, tensor vir_r_m_dr)
 859 {
 860     int      b0, b1, b, i, j, n, iter;
 861     int     *bla, *blnr, *blbnb;
 862     rvec    *r;
 863     real    *blc, *blmf, *bllen, *blcc, *rhs1, *rhs2, *sol, *blc_sol, *mlambda;
 864     int     *nlocat;
 865
 866     b0 = lincsd->th[th].b0;
 867     b1 = lincsd->th[th].b1;
 868
 869     bla     = lincsd->bla;
 870     r       = lincsd->tmpv;
 871     blnr    = lincsd->blnr;
 872     blbnb   = lincsd->blbnb;
 873     blc     = lincsd->blc;
 874     blmf    = lincsd->blmf;
 875     bllen   = lincsd->bllen;
 876     blcc    = lincsd->tmpncc;
 877     rhs1    = lincsd->tmp1;
 878     rhs2    = lincsd->tmp2;
 879     sol     = lincsd->tmp3;
 880     blc_sol = lincsd->tmp4;
 881     mlambda = lincsd->mlambda;
 882
 883     if (DOMAINDECOMP(cr) && cr->dd->constraints)
 884     {
 885         nlocat = dd_constraints_nlocalatoms(cr->dd);
 886     }
 887     else
 888     {
 889         nlocat = NULL;
 890     }
 891
 892 #ifdef LINCS_SIMD
 893
 894     /* This SIMD code does the same as the plain-C code after the #else.
 895      * The only difference is that we always call pbc code, as with SIMD
 896      * the overhead of pbc computation (when not needed) is small.
 897      */
 898     pbc_simd_t pbc_simd;
 899
 900     /* Convert the pbc struct for SIMD */
 901     set_pbc_simd(pbc, &pbc_simd);
 902
 903     /* Compute normalized x i-j vectors, store in r.
 904      * Compute the inner product of r and xp i-j and store in rhs1.
 905      */
 906     calc_dr_x_xp_simd(b0, b1, bla, x, xp, bllen, blc,
 907                       &pbc_simd,
 908                       lincsd->th[th].simd_buf,
 909                       lincsd->th[th].simd_buf + GMX_SIMD_REAL_WIDTH*DIM,
 910                       r, rhs1, sol);
 911
 912 #else /* LINCS_SIMD */
 913
 914     if (pbc)
 915     {
 916         /* Compute normalized i-j vectors */
 917         for (b = b0; b < b1; b++)
 918         {
 919             rvec dx;
 920             real mvb;
 921
 922             pbc_dx_aiuc(pbc, x[bla[2*b]], x[bla[2*b+1]], dx);
 923             unitv(dx, r[b]);
 924
 925             pbc_dx_aiuc(pbc, xp[bla[2*b]], xp[bla[2*b+1]], dx);
 926             mvb     = blc[b]*(iprod(r[b], dx) - bllen[b]);
 927             rhs1[b] = mvb;
 928             sol[b]  = mvb;
 929         }
 930     }
 931     else
 932     {
 933         /* Compute normalized i-j vectors */
 934         for (b = b0; b < b1; b++)
 935         {
 936             real tmp0, tmp1, tmp2, rlen, mvb;
 937
 938             i       = bla[2*b];
 939             j       = bla[2*b+1];
 940             tmp0    = x[i][0] - x[j][0];
 941             tmp1    = x[i][1] - x[j][1];
 942             tmp2    = x[i][2] - x[j][2];
 943             rlen    = gmx_invsqrt(tmp0*tmp0 + tmp1*tmp1 + tmp2*tmp2);
 944             r[b][0] = rlen*tmp0;
 945             r[b][1] = rlen*tmp1;
 946             r[b][2] = rlen*tmp2;
 947             /* 16 ncons flops */
 948
 949             i       = bla[2*b];
 950             j       = bla[2*b+1];
 951             mvb     = blc[b]*(r[b][0]*(xp[i][0] - xp[j][0]) +
 952                               r[b][1]*(xp[i][1] - xp[j][1]) +
 953                               r[b][2]*(xp[i][2] - xp[j][2]) - bllen[b]);
 954             rhs1[b] = mvb;
 955             sol[b]  = mvb;
 956             /* 10 flops */
 957         }
 958         /* Together: 26*ncons + 6*nrtot flops */
 959     }
 960
 961 #endif /* LINCS_SIMD */
 962
 963     /* We need a barrier, since the matrix construction below
 964      * can access entries in r of other threads.
 965      */
 966 #pragma omp barrier
 967
 968     /* Construct the (sparse) LINCS matrix */
 969     for (b = b0; b < b1; b++)
 970     {
 971         for (n = blnr[b]; n < blnr[b+1]; n++)
 972         {
 973             blcc[n] = blmf[n]*iprod(r[b], r[blbnb[n]]);
 974         }
 975     }
 976     /* Together: 26*ncons + 6*nrtot flops */
 977
 978     lincs_matrix_expand(lincsd, b0, b1, blcc, rhs1, rhs2, sol);
 979     /* nrec*(ncons+2*nrtot) flops */
 980
 981 #ifndef LINCS_SIMD
 982     for (b = b0; b < b1; b++)
 983     {
 984         mlambda[b] = blc[b]*sol[b];
 985     }
 986 #else
 987     for (b = b0; b < b1; b += GMX_SIMD_REAL_WIDTH)
 988     {
 989         gmx_simd_store_r(mlambda + b,
 990                          gmx_simd_mul_r(gmx_simd_load_r(blc + b),
 991                                         gmx_simd_load_r(sol + b)));
 992     }
 993 #endif
 994
 995     /* Update the coordinates */
 996     lincs_update_atoms(lincsd, th, 1.0, mlambda, r, invmass, xp);
 997
 998     /*
 999      ********  Correction for centripetal effects  ********
1000      */
1001
1002     real wfac;
1003
1004     wfac = cos(DEG2RAD*wangle);
1005     wfac = wfac*wfac;
1006
1007     for (iter = 0; iter < lincsd->nIter; iter++)
1008     {
1009         if ((lincsd->bCommIter && DOMAINDECOMP(cr) && cr->dd->constraints))
1010         {
1011 #pragma omp barrier
1012 #pragma omp master
1013             {
1014                 /* Communicate the corrected non-local coordinates */
1015                 if (DOMAINDECOMP(cr))
1016                 {
1017                     dd_move_x_constraints(cr->dd, box, xp, NULL, FALSE);
1018                 }
1019             }
1020         }
1021
1022 #pragma omp barrier
1023
1024 #ifdef LINCS_SIMD
1025         calc_dist_iter_simd(b0, b1, bla, xp, bllen, blc, &pbc_simd, wfac,
1026                             lincsd->th[th].simd_buf, rhs1, sol, bWarn);
1027 #else
1028         calc_dist_iter(b0, b1, bla, xp, bllen, blc, pbc, wfac,
1029                        rhs1, sol, bWarn);
1030         /* 20*ncons flops */
1031 #endif
1032
1033         lincs_matrix_expand(lincsd, b0, b1, blcc, rhs1, rhs2, sol);
1034         /* nrec*(ncons+2*nrtot) flops */
1035
1036 #ifndef LINCS_SIMD
1037         for (b = b0; b < b1; b++)
1038         {
1039             real mvb;
1040
1041             mvb         = blc[b]*sol[b];
1042             blc_sol[b]  = mvb;
1043             mlambda[b] += mvb;
1044         }
1045 #else
1046         for (b = b0; b < b1; b += GMX_SIMD_REAL_WIDTH)
1047         {
1048             gmx_simd_real_t mvb;
1049
1050             mvb = gmx_simd_mul_r(gmx_simd_load_r(blc + b),
1051                                  gmx_simd_load_r(sol + b));
1052             gmx_simd_store_r(blc_sol + b, mvb);
1053             gmx_simd_store_r(mlambda + b,
1054                              gmx_simd_add_r(gmx_simd_load_r(mlambda + b), mvb));
1055         }
1056 #endif
1057
1058         /* Update the coordinates */
1059         lincs_update_atoms(lincsd, th, 1.0, blc_sol, r, invmass, xp);
1060     }
1061     /* nit*ncons*(37+9*nrec) flops */
1062
1063     if (v != NULL)
1064     {
1065         /* Update the velocities */
1066         lincs_update_atoms(lincsd, th, invdt, mlambda, r, invmass, v);
1067         /* 16 ncons flops */
1068     }
1069
1070     if (nlocat != NULL && (bCalcDHDL || bCalcVir))
1071     {
1072         /* In lincs_update_atoms threads might cross-read mlambda */
1073 #pragma omp barrier
1074
1075         /* Only account for local atoms */
1076         for (b = b0; b < b1; b++)
1077         {
1078             mlambda[b] *= 0.5*nlocat[b];
1079         }
1080     }
1081
1082     if (bCalcDHDL)
1083     {
1084         real dhdl;
1085
1086         dhdl = 0;
1087         for (b = b0; b < b1; b++)
1088         {
1089             /* Note that this this is dhdl*dt^2, the dt^2 factor is corrected
1090              * later after the contributions are reduced over the threads.
1091              */
1092             dhdl -= lincsd->mlambda[b]*lincsd->ddist[b];
1093         }
1094         lincsd->th[th].dhdlambda = dhdl;
1095     }
1096
1097     if (bCalcVir)
1098     {
1099         /* Constraint virial */
1100         for (b = b0; b < b1; b++)
1101         {
1102             real tmp0, tmp1;
1103
1104             tmp0 = -bllen[b]*mlambda[b];
1105             for (i = 0; i < DIM; i++)
1106             {
1107                 tmp1 = tmp0*r[b][i];
1108                 for (j = 0; j < DIM; j++)
1109                 {
1110                     vir_r_m_dr[i][j] -= tmp1*r[b][j];
1111                 }
1112             }
1113         } /* 22 ncons flops */
1114     }
1115
1116     /* Total:
1117      * 26*ncons + 6*nrtot + nrec*(ncons+2*nrtot)
1118      * + nit * (20*ncons + nrec*(ncons+2*nrtot) + 17 ncons)
1119      *
1120      * (26+nrec)*ncons + (6+2*nrec)*nrtot
1121      * + nit * ((37+nrec)*ncons + 2*nrec*nrtot)
1122      * if nit=1
1123      * (63+nrec)*ncons + (6+4*nrec)*nrtot
1124      */
1125 }
1126
1127 void set_lincs_matrix(struct gmx_lincsdata *li, real *invmass, real lambda)
1128 {
1129     int        i, a1, a2, n, k, sign, center;
1130     int        end, nk, kk;
1131     const real invsqrt2 = 0.7071067811865475244;
1132
1133     for (i = 0; (i < li->nc); i++)
1134     {
1135         a1          = li->bla[2*i];
1136         a2          = li->bla[2*i+1];
1137         li->blc[i]  = gmx_invsqrt(invmass[a1] + invmass[a2]);
1138         li->blc1[i] = invsqrt2;
1139     }
1140
1141     /* Construct the coupling coefficient matrix blmf */
1142     li->ntriangle    = 0;
1143     li->ncc_triangle = 0;
1144     for (i = 0; (i < li->nc); i++)
1145     {
1146         a1 = li->bla[2*i];
1147         a2 = li->bla[2*i+1];
1148         for (n = li->blnr[i]; (n < li->blnr[i+1]); n++)
1149         {
1150             k = li->blbnb[n];
1151             if (a1 == li->bla[2*k] || a2 == li->bla[2*k+1])
1152             {
1153                 sign = -1;
1154             }
1155             else
1156             {
1157                 sign = 1;
1158             }
1159             if (a1 == li->bla[2*k] || a1 == li->bla[2*k+1])
1160             {
1161                 center = a1;
1162                 end    = a2;
1163             }
1164             else
1165             {
1166                 center = a2;
1167                 end    = a1;
1168             }
1169             li->blmf[n]  = sign*invmass[center]*li->blc[i]*li->blc[k];
1170             li->blmf1[n] = sign*0.5;
1171             if (li->ncg_triangle > 0)
1172             {
1173                 /* Look for constraint triangles */
1174                 for (nk = li->blnr[k]; (nk < li->blnr[k+1]); nk++)
1175                 {
1176                     kk = li->blbnb[nk];
1177                     if (kk != i && kk != k &&
1178                         (li->bla[2*kk] == end || li->bla[2*kk+1] == end))
1179                     {
1180                         if (li->ntriangle == 0 ||
1181                             li->triangle[li->ntriangle-1] < i)
1182                         {
1183                             /* Add this constraint to the triangle list */
1184                             li->triangle[li->ntriangle] = i;
1185                             li->tri_bits[li->ntriangle] = 0;
1186                             li->ntriangle++;
1187                             if (li->blnr[i+1] - li->blnr[i] > static_cast<int>(sizeof(li->tri_bits[0])*8 - 1))
1188                             {
1189                                 gmx_fatal(FARGS, "A constraint is connected to %d constraints, this is more than the %d allowed for constraints participating in triangles",
1190                                           li->blnr[i+1] - li->blnr[i],
1191                                           sizeof(li->tri_bits[0])*8-1);
1192                             }
1193                         }
1194                         li->tri_bits[li->ntriangle-1] |= (1<<(n-li->blnr[i]));
1195                         li->ncc_triangle++;
1196                     }
1197                 }
1198             }
1199         }
1200     }
1201
1202     if (debug)
1203     {
1204         fprintf(debug, "Of the %d constraints %d participate in triangles\n",
1205                 li->nc, li->ntriangle);
1206         fprintf(debug, "There are %d couplings of which %d in triangles\n",
1207                 li->ncc, li->ncc_triangle);
1208     }
1209
1210     /* Set matlam,
1211      * so we know with which lambda value the masses have been set.
1212      */
1213     li->matlam = lambda;
1214 }
1215
1216 static int count_triangle_constraints(t_ilist *ilist, t_blocka *at2con)
1217 {
1218     int      ncon1, ncon_tot;
1219     int      c0, a00, a01, n1, c1, a10, a11, ac1, n2, c2, a20, a21;
1220     int      ncon_triangle;
1221     gmx_bool bTriangle;
1222     t_iatom *ia1, *ia2, *iap;
1223
1224     ncon1    = ilist[F_CONSTR].nr/3;
1225     ncon_tot = ncon1 + ilist[F_CONSTRNC].nr/3;
1226
1227     ia1 = ilist[F_CONSTR].iatoms;
1228     ia2 = ilist[F_CONSTRNC].iatoms;
1229
1230     ncon_triangle = 0;
1231     for (c0 = 0; c0 < ncon_tot; c0++)
1232     {
1233         bTriangle = FALSE;
1234         iap       = constr_iatomptr(ncon1, ia1, ia2, c0);
1235         a00       = iap[1];
1236         a01       = iap[2];
1237         for (n1 = at2con->index[a01]; n1 < at2con->index[a01+1]; n1++)
1238         {
1239             c1 = at2con->a[n1];
1240             if (c1 != c0)
1241             {
1242                 iap = constr_iatomptr(ncon1, ia1, ia2, c1);
1243                 a10 = iap[1];
1244                 a11 = iap[2];
1245                 if (a10 == a01)
1246                 {
1247                     ac1 = a11;
1248                 }
1249                 else
1250                 {
1251                     ac1 = a10;
1252                 }
1253                 for (n2 = at2con->index[ac1]; n2 < at2con->index[ac1+1]; n2++)
1254                 {
1255                     c2 = at2con->a[n2];
1256                     if (c2 != c0 && c2 != c1)
1257                     {
1258                         iap = constr_iatomptr(ncon1, ia1, ia2, c2);
1259                         a20 = iap[1];
1260                         a21 = iap[2];
1261                         if (a20 == a00 || a21 == a00)
1262                         {
1263                             bTriangle = TRUE;
1264                         }
1265                     }
1266                 }
1267             }
1268         }
1269         if (bTriangle)
1270         {
1271             ncon_triangle++;
1272         }
1273     }
1274
1275     return ncon_triangle;
1276 }
1277
1278 static gmx_bool more_than_two_sequential_constraints(const t_ilist  *ilist,
1279                                                      const t_blocka *at2con)
1280 {
1281     t_iatom  *ia1, *ia2, *iap;
1282     int       ncon1, ncon_tot, c;
1283     int       a1, a2;
1284     gmx_bool  bMoreThanTwoSequentialConstraints;
1285
1286     ncon1    = ilist[F_CONSTR].nr/3;
1287     ncon_tot = ncon1 + ilist[F_CONSTRNC].nr/3;
1288
1289     ia1 = ilist[F_CONSTR].iatoms;
1290     ia2 = ilist[F_CONSTRNC].iatoms;
1291
1292     bMoreThanTwoSequentialConstraints = FALSE;
1293     for (c = 0; c < ncon_tot && !bMoreThanTwoSequentialConstraints; c++)
1294     {
1295         iap = constr_iatomptr(ncon1, ia1, ia2, c);
1296         a1  = iap[1];
1297         a2  = iap[2];
1298         /* Check if this constraint has constraints connected at both atoms */
1299         if (at2con->index[a1+1] - at2con->index[a1] > 1 &&
1300             at2con->index[a2+1] - at2con->index[a2] > 1)
1301         {
1302             bMoreThanTwoSequentialConstraints = TRUE;
1303         }
1304     }
1305
1306     return bMoreThanTwoSequentialConstraints;
1307 }
1308
1309 static int int_comp(const void *a, const void *b)
1310 {
1311     return (*(int *)a) - (*(int *)b);
1312 }
1313
1314 gmx_lincsdata_t init_lincs(FILE *fplog, gmx_mtop_t *mtop,
1315                            int nflexcon_global, t_blocka *at2con,
1316                            gmx_bool bPLINCS, int nIter, int nProjOrder)
1317 {
1318     struct gmx_lincsdata *li;
1319     int                   mb;
1320     gmx_moltype_t        *molt;
1321
1322     if (fplog)
1323     {
1324         fprintf(fplog, "\nInitializing%s LINear Constraint Solver\n",
1325                 bPLINCS ? " Parallel" : "");
1326     }
1327
1328     snew(li, 1);
1329
1330     li->ncg      =
1331         gmx_mtop_ftype_count(mtop, F_CONSTR) +
1332         gmx_mtop_ftype_count(mtop, F_CONSTRNC);
1333     li->ncg_flex = nflexcon_global;
1334
1335     li->nIter  = nIter;
1336     li->nOrder = nProjOrder;
1337
1338     li->ncg_triangle = 0;
1339     li->bCommIter    = FALSE;
1340     for (mb = 0; mb < mtop->nmolblock; mb++)
1341     {
1342         molt              = &mtop->moltype[mtop->molblock[mb].type];
1343         li->ncg_triangle +=
1344             mtop->molblock[mb].nmol*
1345             count_triangle_constraints(molt->ilist,
1346                                        &at2con[mtop->molblock[mb].type]);
1347         if (bPLINCS && li->bCommIter == FALSE)
1348         {
1349             /* Check if we need to communicate not only before LINCS,
1350              * but also before each iteration.
1351              * The check for only two sequential constraints is only
1352              * useful for the common case of H-bond only constraints.
1353              * With more effort we could also make it useful for small
1354              * molecules with nr. sequential constraints <= nOrder-1.
1355              */
1356             li->bCommIter = (li->nOrder < 1 || more_than_two_sequential_constraints(molt->ilist, &at2con[mtop->molblock[mb].type]));
1357         }
1358     }
1359     if (debug && bPLINCS)
1360     {
1361         fprintf(debug, "PLINCS communication before each iteration: %d\n",
1362                 li->bCommIter);
1363     }
1364
1365     /* LINCS can run on any number of threads.
1366      * Currently the number is fixed for the whole simulation,
1367      * but it could be set in set_lincs().
1368      */
1369     li->nth = gmx_omp_nthreads_get(emntLINCS);
1370     if (debug)
1371     {
1372         fprintf(debug, "LINCS: using %d threads\n", li->nth);
1373     }
1374     if (li->nth == 1)
1375     {
1376         snew(li->th, 1);
1377     }
1378     else
1379     {
1380         /* Allocate an extra elements for "thread-overlap" constraints */
1381         snew(li->th, li->nth+1);
1382     }
1383     int th;
1384 #pragma omp parallel for num_threads(li->nth)
1385     for (th = 0; th < li->nth; th++)
1386     {
1387         /* Per thread SIMD load buffer for loading 2 simd_width rvecs */
1388         snew_aligned(li->th[th].simd_buf, 2*simd_width*DIM,
1389                      align_bytes);
1390     }
1391
1392     if (bPLINCS || li->ncg_triangle > 0)
1393     {
1394         please_cite(fplog, "Hess2008a");
1395     }
1396     else
1397     {
1398         please_cite(fplog, "Hess97a");
1399     }
1400
1401     if (fplog)
1402     {
1403         fprintf(fplog, "The number of constraints is %d\n", li->ncg);
1404         if (bPLINCS)
1405         {
1406             fprintf(fplog, "There are inter charge-group constraints,\n"
1407                     "will communicate selected coordinates each lincs iteration\n");
1408         }
1409         if (li->ncg_triangle > 0)
1410         {
1411             fprintf(fplog,
1412                     "%d constraints are involved in constraint triangles,\n"
1413                     "will apply an additional matrix expansion of order %d for couplings\n"
1414                     "between constraints inside triangles\n",
1415                     li->ncg_triangle, li->nOrder);
1416         }
1417     }
1418
1419     return li;
1420 }
1421
1422 /* Sets up the work division over the threads */
1423 static void lincs_thread_setup(struct gmx_lincsdata *li, int natoms)
1424 {
1425     lincs_thread_t *li_m;
1426     int             th;
1427     gmx_bitmask_t  *atf;
1428     int             a;
1429
1430     if (natoms > li->atf_nalloc)
1431     {
1432         li->atf_nalloc = over_alloc_large(natoms);
1433         srenew(li->atf, li->atf_nalloc);
1434     }
1435
1436     atf = li->atf;
1437     /* Clear the atom flags */
1438     for (a = 0; a < natoms; a++)
1439     {
1440         bitmask_clear(&atf[a]);
1441     }
1442
1443     if (li->nth > BITMASK_SIZE)
1444     {
1445         gmx_fatal(FARGS, "More than %d threads is not supported for LINCS.", BITMASK_SIZE);
1446     }
1447
1448     int block = simd_width;
1449
1450     for (th = 0; th < li->nth; th++)
1451     {
1452         lincs_thread_t *li_th;
1453         int             b;
1454
1455         li_th = &li->th[th];
1456
1457         /* The constraints are divided equally over the threads,
1458          * in chunks of size block to ensure alignment for SIMD.
1459          */
1460         li_th->b0 = (((li->nc + block - 1)* th   )/(block*li->nth))*block;
1461         li_th->b1 = (((li->nc + block - 1)*(th+1))/(block*li->nth))*block;
1462         li_th->b1 = std::min<int>(li_th->b1, li->nc);
1463
1464         /* For each atom set a flag for constraints from each */
1465         for (b = li_th->b0; b < li_th->b1; b++)
1466         {
1467             bitmask_set_bit(&atf[li->bla[b*2]], th);
1468             bitmask_set_bit(&atf[li->bla[b*2+1]], th);
1469         }
1470     }
1471
1472 #pragma omp parallel for num_threads(li->nth) schedule(static)
1473     for (th = 0; th < li->nth; th++)
1474     {
1475         lincs_thread_t *li_th;
1476         gmx_bitmask_t   mask;
1477         int             b;
1478
1479         li_th = &li->th[th];
1480
1481         if (li_th->b1 - li_th->b0 > li_th->ind_nalloc)
1482         {
1483             li_th->ind_nalloc = over_alloc_large(li_th->b1-li_th->b0);
1484             srenew(li_th->ind, li_th->ind_nalloc);
1485             srenew(li_th->ind_r, li_th->ind_nalloc);
1486         }
1487
1488         bitmask_init_low_bits(&mask, th);
1489
1490         li_th->nind   = 0;
1491         li_th->nind_r = 0;
1492         for (b = li_th->b0; b < li_th->b1; b++)
1493         {
1494             /* We let the constraint with the lowest thread index
1495              * operate on atoms with constraints from multiple threads.
1496              */
1497             if (bitmask_is_disjoint(atf[li->bla[b*2]], mask) &&
1498                 bitmask_is_disjoint(atf[li->bla[b*2+1]], mask))
1499             {
1500                 /* Add the constraint to the local atom update index */
1501                 li_th->ind[li_th->nind++] = b;
1502             }
1503             else
1504             {
1505                 /* Add the constraint to the rest block */
1506                 li_th->ind_r[li_th->nind_r++] = b;
1507             }
1508         }
1509     }
1510
1511     /* We need to copy all constraints which have not be assigned
1512      * to a thread to a separate list which will be handled by one thread.
1513      */
1514     li_m = &li->th[li->nth];
1515
1516     li_m->nind = 0;
1517     for (th = 0; th < li->nth; th++)
1518     {
1519         lincs_thread_t *li_th;
1520         int             b;
1521
1522         li_th   = &li->th[th];
1523
1524         if (li_m->nind + li_th->nind_r > li_m->ind_nalloc)
1525         {
1526             li_m->ind_nalloc = over_alloc_large(li_m->nind+li_th->nind_r);
1527             srenew(li_m->ind, li_m->ind_nalloc);
1528         }
1529
1530         for (b = 0; b < li_th->nind_r; b++)
1531         {
1532             li_m->ind[li_m->nind++] = li_th->ind_r[b];
1533         }
1534
1535         if (debug)
1536         {
1537             fprintf(debug, "LINCS thread %d: %d constraints\n",
1538                     th, li_th->nind);
1539         }
1540     }
1541
1542     if (debug)
1543     {
1544         fprintf(debug, "LINCS thread r: %d constraints\n",
1545                 li_m->nind);
1546     }
1547 }
1548
1549 /* There is no realloc with alignment, so here we make one for reals.
1550  * Note that this function does not preserve the contents of the memory.
1551  */
1552 static void resize_real_aligned(real **ptr, int nelem)
1553 {
1554     sfree_aligned(*ptr);
1555     snew_aligned(*ptr, nelem, align_bytes);
1556 }
1557
1558 void set_lincs(t_idef *idef, t_mdatoms *md,
1559                gmx_bool bDynamics, t_commrec *cr,
1560                struct gmx_lincsdata *li)
1561 {
1562     int          start, natoms, nflexcon;
1563     t_blocka     at2con;
1564     t_iatom     *iatom;
1565     int          i, k, ncc_alloc, ncon_tot, con, nconnect, concon;
1566     int          type, a1, a2;
1567     real         lenA = 0, lenB;
1568
1569     li->nc  = 0;
1570     li->ncc = 0;
1571     /* Zero the thread index ranges.
1572      * Otherwise without local constraints we could return with old ranges.
1573      */
1574     for (i = 0; i < li->nth; i++)
1575     {
1576         li->th[i].b0   = 0;
1577         li->th[i].b1   = 0;
1578         li->th[i].nind = 0;
1579     }
1580     if (li->nth > 1)
1581     {
1582         li->th[li->nth].nind = 0;
1583     }
1584
1585     /* This is the local topology, so there are only F_CONSTR constraints */
1586     if (idef->il[F_CONSTR].nr == 0)
1587     {
1588         /* There are no constraints,
1589          * we do not need to fill any data structures.
1590          */
1591         return;
1592     }
1593
1594     if (debug)
1595     {
1596         fprintf(debug, "Building the LINCS connectivity\n");
1597     }
1598
1599     if (DOMAINDECOMP(cr))
1600     {
1601         if (cr->dd->constraints)
1602         {
1603             dd_get_constraint_range(cr->dd, &start, &natoms);
1604         }
1605         else
1606         {
1607             natoms = cr->dd->nat_home;
1608         }
1609         start = 0;
1610     }
1611     else
1612     {
1613         start  = 0;
1614         natoms = md->homenr;
1615     }
1616     at2con = make_at2con(start, natoms, idef->il, idef->iparams, bDynamics,
1617                          &nflexcon);
1618
1619     ncon_tot = idef->il[F_CONSTR].nr/3;
1620
1621     /* Ensure we have enough space for aligned loads beyond ncon_tot */
1622     if (ncon_tot + simd_width > li->nc_alloc || li->nc_alloc == 0)
1623     {
1624         li->nc_alloc = over_alloc_dd(ncon_tot + simd_width);
1625         resize_real_aligned(&li->bllen0, li->nc_alloc);
1626         resize_real_aligned(&li->ddist, li->nc_alloc);
1627         srenew(li->bla, 2*li->nc_alloc);
1628         resize_real_aligned(&li->blc, li->nc_alloc);
1629         resize_real_aligned(&li->blc1, li->nc_alloc);
1630         srenew(li->blnr, li->nc_alloc+1);
1631         resize_real_aligned(&li->bllen, li->nc_alloc);
1632         srenew(li->tmpv, li->nc_alloc);
1633         resize_real_aligned(&li->tmp1, li->nc_alloc);
1634         resize_real_aligned(&li->tmp2, li->nc_alloc);
1635         resize_real_aligned(&li->tmp3, li->nc_alloc);
1636         resize_real_aligned(&li->tmp4, li->nc_alloc);
1637         resize_real_aligned(&li->mlambda, li->nc_alloc);
1638         if (li->ncg_triangle > 0)
1639         {
1640             /* This is allocating too much, but it is difficult to improve */
1641             srenew(li->triangle, li->nc_alloc);
1642             srenew(li->tri_bits, li->nc_alloc);
1643         }
1644     }
1645
1646     iatom = idef->il[F_CONSTR].iatoms;
1647
1648     ncc_alloc   = li->ncc_alloc;
1649     li->blnr[0] = 0;
1650
1651     con           = 0;
1652     nconnect      = 0;
1653     li->blnr[con] = nconnect;
1654     for (i = 0; i < ncon_tot; i++)
1655     {
1656         type   = iatom[3*i];
1657         a1     = iatom[3*i+1];
1658         a2     = iatom[3*i+2];
1659         lenA   = idef->iparams[type].constr.dA;
1660         lenB   = idef->iparams[type].constr.dB;
1661         /* Skip the flexible constraints when not doing dynamics */
1662         if (bDynamics || lenA != 0 || lenB != 0)
1663         {
1664             li->bllen0[con]  = lenA;
1665             li->ddist[con]   = lenB - lenA;
1666             /* Set the length to the topology A length */
1667             li->bllen[con]   = li->bllen0[con];
1668             li->bla[2*con]   = a1;
1669             li->bla[2*con+1] = a2;
1670             /* Construct the constraint connection matrix blbnb */
1671             for (k = at2con.index[a1-start]; k < at2con.index[a1-start+1]; k++)
1672             {
1673                 concon = at2con.a[k];
1674                 if (concon != i)
1675                 {
1676                     if (nconnect >= ncc_alloc)
1677                     {
1678                         ncc_alloc = over_alloc_small(nconnect+1);
1679                         srenew(li->blbnb, ncc_alloc);
1680                     }
1681                     li->blbnb[nconnect++] = concon;
1682                 }
1683             }
1684             for (k = at2con.index[a2-start]; k < at2con.index[a2-start+1]; k++)
1685             {
1686                 concon = at2con.a[k];
1687                 if (concon != i)
1688                 {
1689                     if (nconnect+1 > ncc_alloc)
1690                     {
1691                         ncc_alloc = over_alloc_small(nconnect+1);
1692                         srenew(li->blbnb, ncc_alloc);
1693                     }
1694                     li->blbnb[nconnect++] = concon;
1695                 }
1696             }
1697             li->blnr[con+1] = nconnect;
1698
1699             if (cr->dd == NULL)
1700             {
1701                 /* Order the blbnb matrix to optimize memory access */
1702                 qsort(&(li->blbnb[li->blnr[con]]), li->blnr[con+1]-li->blnr[con],
1703                       sizeof(li->blbnb[0]), int_comp);
1704             }
1705             /* Increase the constraint count */
1706             con++;
1707         }
1708     }
1709     if (simd_width > 1)
1710     {
1711         /* Copy the last atom pair indices and lengths for constraints
1712          * up to a multiple of simd_width, such that we can do all
1713          * SIMD operations without have to worry about end effects.
1714          */
1715         int con_roundup, i;
1716
1717         con_roundup = ((con + simd_width - 1)/simd_width)*simd_width;
1718         for (i = con; i < con_roundup; i++)
1719         {
1720             li->bla[i*2    ] = li->bla[(con - 1)*2    ];
1721             li->bla[i*2 + 1] = li->bla[(con - 1)*2 + 1];
1722             li->bllen[i]     = li->bllen[con - 1];
1723         }
1724     }
1725
1726     done_blocka(&at2con);
1727
1728     /* This is the real number of constraints,
1729      * without dynamics the flexible constraints are not present.
1730      */
1731     li->nc = con;
1732
1733     li->ncc = li->blnr[con];
1734     if (cr->dd == NULL)
1735     {
1736         /* Since the matrix is static, we can free some memory */
1737         ncc_alloc = li->ncc;
1738         srenew(li->blbnb, ncc_alloc);
1739     }
1740
1741     if (ncc_alloc > li->ncc_alloc)
1742     {
1743         li->ncc_alloc = ncc_alloc;
1744         srenew(li->blmf, li->ncc_alloc);
1745         srenew(li->blmf1, li->ncc_alloc);
1746         srenew(li->tmpncc, li->ncc_alloc);
1747     }
1748
1749     if (debug)
1750     {
1751         fprintf(debug, "Number of constraints is %d, couplings %d\n",
1752                 li->nc, li->ncc);
1753     }
1754
1755     if (li->nth == 1)
1756     {
1757         li->th[0].b0 = 0;
1758         li->th[0].b1 = li->nc;
1759     }
1760     else
1761     {
1762         lincs_thread_setup(li, md->nr);
1763     }
1764
1765     set_lincs_matrix(li, md->invmass, md->lambda);
1766 }
1767
1768 static void lincs_warning(FILE *fplog,
1769                           gmx_domdec_t *dd, rvec *x, rvec *xprime, t_pbc *pbc,
1770                           int ncons, int *bla, real *bllen, real wangle,
1771                           int maxwarn, int *warncount)
1772 {
1773     int  b, i, j;
1774     rvec v0, v1;
1775     real wfac, d0, d1, cosine;
1776     char buf[STRLEN];
1777
1778     wfac = cos(DEG2RAD*wangle);
1779
1780     sprintf(buf, "bonds that rotated more than %g degrees:\n"
1781             " atom 1 atom 2  angle  previous, current, constraint length\n",
1782             wangle);
1783     fprintf(stderr, "%s", buf);
1784     if (fplog)
1785     {
1786         fprintf(fplog, "%s", buf);
1787     }
1788
1789     for (b = 0; b < ncons; b++)
1790     {
1791         i = bla[2*b];
1792         j = bla[2*b+1];
1793         if (pbc)
1794         {
1795             pbc_dx_aiuc(pbc, x[i], x[j], v0);
1796             pbc_dx_aiuc(pbc, xprime[i], xprime[j], v1);
1797         }
1798         else
1799         {
1800             rvec_sub(x[i], x[j], v0);
1801             rvec_sub(xprime[i], xprime[j], v1);
1802         }
1803         d0     = norm(v0);
1804         d1     = norm(v1);
1805         cosine = iprod(v0, v1)/(d0*d1);
1806         if (cosine < wfac)
1807         {
1808             sprintf(buf, " %6d %6d  %5.1f  %8.4f %8.4f    %8.4f\n",
1809                     ddglatnr(dd, i), ddglatnr(dd, j),
1810                     RAD2DEG*acos(cosine), d0, d1, bllen[b]);
1811             fprintf(stderr, "%s", buf);
1812             if (fplog)
1813             {
1814                 fprintf(fplog, "%s", buf);
1815             }
1816             if (!gmx_isfinite(d1))
1817             {
1818                 gmx_fatal(FARGS, "Bond length not finite.");
1819             }
1820
1821             (*warncount)++;
1822         }
1823     }
1824     if (*warncount > maxwarn)
1825     {
1826         too_many_constraint_warnings(econtLINCS, *warncount);
1827     }
1828 }
1829
1830 static void cconerr(gmx_domdec_t *dd,
1831                     int ncons, int *bla, real *bllen, rvec *x, t_pbc *pbc,
1832                     real *ncons_loc, real *ssd, real *max, int *imax)
1833 {
1834     real       len, d, ma, ssd2, r2;
1835     int       *nlocat, count, b, im;
1836     rvec       dx;
1837
1838     if (dd && dd->constraints)
1839     {
1840         nlocat = dd_constraints_nlocalatoms(dd);
1841     }
1842     else
1843     {
1844         nlocat = 0;
1845     }
1846
1847     ma    = 0;
1848     ssd2  = 0;
1849     im    = 0;
1850     count = 0;
1851     for (b = 0; b < ncons; b++)
1852     {
1853         if (pbc)
1854         {
1855             pbc_dx_aiuc(pbc, x[bla[2*b]], x[bla[2*b+1]], dx);
1856         }
1857         else
1858         {
1859             rvec_sub(x[bla[2*b]], x[bla[2*b+1]], dx);
1860         }
1861         r2  = norm2(dx);
1862         len = r2*gmx_invsqrt(r2);
1863         d   = fabs(len/bllen[b]-1);
1864         if (d > ma && (nlocat == NULL || nlocat[b]))
1865         {
1866             ma = d;
1867             im = b;
1868         }
1869         if (nlocat == NULL)
1870         {
1871             ssd2 += d*d;
1872             count++;
1873         }
1874         else
1875         {
1876             ssd2  += nlocat[b]*d*d;
1877             count += nlocat[b];
1878         }
1879     }
1880
1881     *ncons_loc = (nlocat ? 0.5 : 1)*count;
1882     *ssd       = (nlocat ? 0.5 : 1)*ssd2;
1883     *max       = ma;
1884     *imax      = im;
1885 }
1886
1887 gmx_bool constrain_lincs(FILE *fplog, gmx_bool bLog, gmx_bool bEner,
1888                          t_inputrec *ir,
1889                          gmx_int64_t step,
1890                          struct gmx_lincsdata *lincsd, t_mdatoms *md,
1891                          t_commrec *cr,
1892                          rvec *x, rvec *xprime, rvec *min_proj,
1893                          matrix box, t_pbc *pbc,
1894                          real lambda, real *dvdlambda,
1895                          real invdt, rvec *v,
1896                          gmx_bool bCalcVir, tensor vir_r_m_dr,
1897                          int econq,
1898                          t_nrnb *nrnb,
1899                          int maxwarn, int *warncount)
1900 {
1901     gmx_bool  bCalcDHDL;
1902     char      buf[STRLEN], buf2[22], buf3[STRLEN];
1903     int       i, p_imax;
1904     real      ncons_loc, p_ssd, p_max = 0;
1905     rvec      dx;
1906     gmx_bool  bOK, bWarn;
1907
1908     bOK = TRUE;
1909
1910     /* This boolean should be set by a flag passed to this routine.
1911      * We can also easily check if any constraint length is changed,
1912      * if not dH/dlambda=0 and we can also set the boolean to FALSE.
1913      */
1914     bCalcDHDL = (ir->efep != efepNO && dvdlambda != NULL);
1915
1916     if (lincsd->nc == 0 && cr->dd == NULL)
1917     {
1918         if (bLog || bEner)
1919         {
1920             lincsd->rmsd_data[0] = 0;
1921             if (ir->eI == eiSD2 && v == NULL)
1922             {
1923                 i = 2;
1924             }
1925             else
1926             {
1927                 i = 1;
1928             }
1929             lincsd->rmsd_data[i] = 0;
1930         }
1931
1932         return bOK;
1933     }
1934
1935     if (econq == econqCoord)
1936     {
1937         /* We can't use bCalcDHDL here, since NULL can be passed for dvdlambda
1938          * also with efep!=fepNO.
1939          */
1940         if (ir->efep != efepNO)
1941         {
1942             if (md->nMassPerturbed && lincsd->matlam != md->lambda)
1943             {
1944                 set_lincs_matrix(lincsd, md->invmass, md->lambda);
1945             }
1946
1947             for (i = 0; i < lincsd->nc; i++)
1948             {
1949                 lincsd->bllen[i] = lincsd->bllen0[i] + lambda*lincsd->ddist[i];
1950             }
1951         }
1952
1953         if (lincsd->ncg_flex)
1954         {
1955             /* Set the flexible constraint lengths to the old lengths */
1956             if (pbc != NULL)
1957             {
1958                 for (i = 0; i < lincsd->nc; i++)
1959                 {
1960                     if (lincsd->bllen[i] == 0)
1961                     {
1962                         pbc_dx_aiuc(pbc, x[lincsd->bla[2*i]], x[lincsd->bla[2*i+1]], dx);
1963                         lincsd->bllen[i] = norm(dx);
1964                     }
1965                 }
1966             }
1967             else
1968             {
1969                 for (i = 0; i < lincsd->nc; i++)
1970                 {
1971                     if (lincsd->bllen[i] == 0)
1972                     {
1973                         lincsd->bllen[i] =
1974                             sqrt(distance2(x[lincsd->bla[2*i]],
1975                                            x[lincsd->bla[2*i+1]]));
1976                     }
1977                 }
1978             }
1979         }
1980
1981         if (bLog && fplog)
1982         {
1983             cconerr(cr->dd, lincsd->nc, lincsd->bla, lincsd->bllen, xprime, pbc,
1984                     &ncons_loc, &p_ssd, &p_max, &p_imax);
1985         }
1986
1987         /* This bWarn var can be updated by multiple threads
1988          * at the same time. But as we only need to detect
1989          * if a warning occured or not, this is not an issue.
1990          */
1991         bWarn = FALSE;
1992
1993         /* The OpenMP parallel region of constrain_lincs for coords */
1994 #pragma omp parallel num_threads(lincsd->nth)
1995         {
1996             int th = gmx_omp_get_thread_num();
1997
1998             clear_mat(lincsd->th[th].vir_r_m_dr);
1999
2000             do_lincs(x, xprime, box, pbc, lincsd, th,
2001                      md->invmass, cr,
2002                      bCalcDHDL,
2003                      ir->LincsWarnAngle, &bWarn,
2004                      invdt, v, bCalcVir,
2005                      th == 0 ? vir_r_m_dr : lincsd->th[th].vir_r_m_dr);
2006         }
2007
2008         if (bLog && fplog && lincsd->nc > 0)
2009         {
2010             fprintf(fplog, "   Rel. Constraint Deviation:  RMS         MAX     between atoms\n");
2011             fprintf(fplog, "       Before LINCS          %.6f    %.6f %6d %6d\n",
2012                     sqrt(p_ssd/ncons_loc), p_max,
2013                     ddglatnr(cr->dd, lincsd->bla[2*p_imax]),
2014                     ddglatnr(cr->dd, lincsd->bla[2*p_imax+1]));
2015         }
2016         if (bLog || bEner)
2017         {
2018             cconerr(cr->dd, lincsd->nc, lincsd->bla, lincsd->bllen, xprime, pbc,
2019                     &ncons_loc, &p_ssd, &p_max, &p_imax);
2020             /* Check if we are doing the second part of SD */
2021             if (ir->eI == eiSD2 && v == NULL)
2022             {
2023                 i = 2;
2024             }
2025             else
2026             {
2027                 i = 1;
2028             }
2029             lincsd->rmsd_data[0] = ncons_loc;
2030             lincsd->rmsd_data[i] = p_ssd;
2031         }
2032         else
2033         {
2034             lincsd->rmsd_data[0] = 0;
2035             lincsd->rmsd_data[1] = 0;
2036             lincsd->rmsd_data[2] = 0;
2037         }
2038         if (bLog && fplog && lincsd->nc > 0)
2039         {
2040             fprintf(fplog,
2041                     "        After LINCS          %.6f    %.6f %6d %6d\n\n",
2042                     sqrt(p_ssd/ncons_loc), p_max,
2043                     ddglatnr(cr->dd, lincsd->bla[2*p_imax]),
2044                     ddglatnr(cr->dd, lincsd->bla[2*p_imax+1]));
2045         }
2046
2047         if (bWarn)
2048         {
2049             if (maxwarn >= 0)
2050             {
2051                 cconerr(cr->dd, lincsd->nc, lincsd->bla, lincsd->bllen, xprime, pbc,
2052                         &ncons_loc, &p_ssd, &p_max, &p_imax);
2053                 if (MULTISIM(cr))
2054                 {
2055                     sprintf(buf3, " in simulation %d", cr->ms->sim);
2056                 }
2057                 else
2058                 {
2059                     buf3[0] = 0;
2060                 }
2061                 sprintf(buf, "\nStep %s, time %g (ps)  LINCS WARNING%s\n"
2062                         "relative constraint deviation after LINCS:\n"
2063                         "rms %.6f, max %.6f (between atoms %d and %d)\n",
2064                         gmx_step_str(step, buf2), ir->init_t+step*ir->delta_t,
2065                         buf3,
2066                         sqrt(p_ssd/ncons_loc), p_max,
2067                         ddglatnr(cr->dd, lincsd->bla[2*p_imax]),
2068                         ddglatnr(cr->dd, lincsd->bla[2*p_imax+1]));
2069                 if (fplog)
2070                 {
2071                     fprintf(fplog, "%s", buf);
2072                 }
2073                 fprintf(stderr, "%s", buf);
2074                 lincs_warning(fplog, cr->dd, x, xprime, pbc,
2075                               lincsd->nc, lincsd->bla, lincsd->bllen,
2076                               ir->LincsWarnAngle, maxwarn, warncount);
2077             }
2078             bOK = (p_max < 0.5);
2079         }
2080
2081         if (lincsd->ncg_flex)
2082         {
2083             for (i = 0; (i < lincsd->nc); i++)
2084             {
2085                 if (lincsd->bllen0[i] == 0 && lincsd->ddist[i] == 0)
2086                 {
2087                     lincsd->bllen[i] = 0;
2088                 }
2089             }
2090         }
2091     }
2092     else
2093     {
2094         /* The OpenMP parallel region of constrain_lincs for derivatives */
2095 #pragma omp parallel num_threads(lincsd->nth)
2096         {
2097             int th = gmx_omp_get_thread_num();
2098
2099             do_lincsp(x, xprime, min_proj, pbc, lincsd, th,
2100                       md->invmass, econq, bCalcDHDL,
2101                       bCalcVir, th == 0 ? vir_r_m_dr : lincsd->th[th].vir_r_m_dr);
2102         }
2103     }
2104
2105     if (bCalcDHDL)
2106     {
2107         /* Reduce the dH/dlambda contributions over the threads */
2108         real dhdlambda;
2109         int  th;
2110
2111         dhdlambda = 0;
2112         for (th = 0; th < lincsd->nth; th++)
2113         {
2114             dhdlambda += lincsd->th[th].dhdlambda;
2115         }
2116         if (econqCoord)
2117         {
2118             /* dhdlambda contains dH/dlambda*dt^2, correct for this */
2119             /* TODO This should probably use invdt, so that sd integrator scaling works properly */
2120             dhdlambda /= ir->delta_t*ir->delta_t;
2121         }
2122         *dvdlambda += dhdlambda;
2123     }
2124
2125     if (bCalcVir && lincsd->nth > 1)
2126     {
2127         for (i = 1; i < lincsd->nth; i++)
2128         {
2129             m_add(vir_r_m_dr, lincsd->th[i].vir_r_m_dr, vir_r_m_dr);
2130         }
2131     }
2132
2133     /* count assuming nit=1 */
2134     inc_nrnb(nrnb, eNR_LINCS, lincsd->nc);
2135     inc_nrnb(nrnb, eNR_LINCSMAT, (2+lincsd->nOrder)*lincsd->ncc);
2136     if (lincsd->ntriangle > 0)
2137     {
2138         inc_nrnb(nrnb, eNR_LINCSMAT, lincsd->nOrder*lincsd->ncc_triangle);
2139     }
2140     if (v)
2141     {
2142         inc_nrnb(nrnb, eNR_CONSTR_V, lincsd->nc*2);
2143     }
2144     if (bCalcVir)
2145     {
2146         inc_nrnb(nrnb, eNR_CONSTR_VIR, lincsd->nc);
2147     }
2148
2149     return bOK;
2150 }