src/gromacs/mdlib/clincs.c

   1 /* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
   2  *
   3  *
   4  *                This source code is part of
   5  *
   6  *                 G   R   O   M   A   C   S
   7  *
   8  *          GROningen MAchine for Chemical Simulations
   9  *
  10  *                        VERSION 3.2.0
  11  * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
  12  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
  13  * Copyright (c) 2001-2004, The GROMACS development team,
  14  * check out http://www.gromacs.org for more information.
  15  *
  16  * This program is free software; you can redistribute it and/or
  17  * modify it under the terms of the GNU General Public License
  18  * as published by the Free Software Foundation; either version 2
  19  * of the License, or (at your option) any later version.
  20  *
  21  * If you want to redistribute modifications, please consider that
  22  * scientific software is very special. Version control is crucial -
  23  * bugs must be traceable. We will be happy to consider code for
  24  * inclusion in the official distribution, but derived work must not
  25  * be called official GROMACS. Details are found in the README & COPYING
  26  * files - if they are missing, get the official version at www.gromacs.org.
  27  *
  28  * To help us fund GROMACS development, we humbly ask that you cite
  29  * the papers on the package - you can find them in the top README file.
  30  *
  31  * For more info, check our website at http://www.gromacs.org
  32  *
  33  * And Hey:
  34  * GROwing Monsters And Cloning Shrimps
  35  */
  36 /* This file is completely threadsafe - keep it that way! */
  37 #ifdef HAVE_CONFIG_H
  38 #include <config.h>
  39 #endif
  40
  41 #include <math.h>
  42 #include "main.h"
  43 #include "constr.h"
  44 #include "copyrite.h"
  45 #include "physics.h"
  46 #include "vec.h"
  47 #include "pbc.h"
  48 #include "smalloc.h"
  49 #include "mdrun.h"
  50 #include "nrnb.h"
  51 #include "domdec.h"
  52 #include "partdec.h"
  53 #include "mtop_util.h"
  54 #include "gmxfio.h"
  55 #include "gmx_omp_nthreads.h"
  56 #include "gmx_omp.h"
  57
  58 typedef struct {
  59     int    b0;         /* first constraint for this thread */
  60     int    b1;         /* b1-1 is the last constraint for this thread */
  61     int    nind;       /* number of indices */
  62     int   *ind;        /* constraint index for updating atom data */
  63     int    nind_r;     /* number of indices */
  64     int   *ind_r;      /* constraint index for updating atom data */
  65     int    ind_nalloc; /* allocation size of ind and ind_r */
  66     tensor vir_r_m_dr; /* temporary variable for virial calculation */
  67 } lincs_thread_t;
  68
  69 typedef struct gmx_lincsdata {
  70     int             ncg;          /* the global number of constraints */
  71     int             ncg_flex;     /* the global number of flexible constraints */
  72     int             ncg_triangle; /* the global number of constraints in triangles */
  73     int             nIter;        /* the number of iterations */
  74     int             nOrder;       /* the order of the matrix expansion */
  75     int             nc;           /* the number of constraints */
  76     int             nc_alloc;     /* the number we allocated memory for */
  77     int             ncc;          /* the number of constraint connections */
  78     int             ncc_alloc;    /* the number we allocated memory for */
  79     real            matlam;       /* the FE lambda value used for filling blc and blmf */
  80     real           *bllen0;       /* the reference distance in topology A */
  81     real           *ddist;        /* the reference distance in top B - the r.d. in top A */
  82     int            *bla;          /* the atom pairs involved in the constraints */
  83     real           *blc;          /* 1/sqrt(invmass1 + invmass2) */
  84     real           *blc1;         /* as blc, but with all masses 1 */
  85     int            *blnr;         /* index into blbnb and blmf */
  86     int            *blbnb;        /* list of constraint connections */
  87     int             ntriangle;    /* the local number of constraints in triangles */
  88     int            *triangle;     /* the list of triangle constraints */
  89     int            *tri_bits;     /* the bits tell if the matrix element should be used */
  90     int             ncc_triangle; /* the number of constraint connections in triangles */
  91     gmx_bool        bCommIter;    /* communicate before each LINCS interation */
  92     real           *blmf;         /* matrix of mass factors for constraint connections */
  93     real           *blmf1;        /* as blmf, but with all masses 1 */
  94     real           *bllen;        /* the reference bond length */
  95     int             nth;          /* The number of threads doing LINCS */
  96     lincs_thread_t *th;           /* LINCS thread division */
  97     unsigned       *atf;          /* atom flags for thread parallelization */
  98     int             atf_nalloc;   /* allocation size of atf */
  99     /* arrays for temporary storage in the LINCS algorithm */
 100     rvec           *tmpv;
 101     real           *tmpncc;
 102     real           *tmp1;
 103     real           *tmp2;
 104     real           *tmp3;
 105     real           *tmp4;
 106     real           *mlambda; /* the Lagrange multipliers * -1 */
 107     /* storage for the constraint RMS relative deviation output */
 108     real            rmsd_data[3];
 109 } t_gmx_lincsdata;
 110
 111 real *lincs_rmsd_data(struct gmx_lincsdata *lincsd)
 112 {
 113     return lincsd->rmsd_data;
 114 }
 115
 116 real lincs_rmsd(struct gmx_lincsdata *lincsd, gmx_bool bSD2)
 117 {
 118     if (lincsd->rmsd_data[0] > 0)
 119     {
 120         return sqrt(lincsd->rmsd_data[bSD2 ? 2 : 1]/lincsd->rmsd_data[0]);
 121     }
 122     else
 123     {
 124         return 0;
 125     }
 126 }
 127
 128 /* Do a set of nrec LINCS matrix multiplications.
 129  * This function will return with up to date thread-local
 130  * constraint data, without an OpenMP barrier.
 131  */
 132 static void lincs_matrix_expand(const struct gmx_lincsdata *lincsd,
 133                                 int b0, int b1,
 134                                 const real *blcc,
 135                                 real *rhs1, real *rhs2, real *sol)
 136 {
 137     int        nrec, rec, b, j, n, nr0, nr1;
 138     real       mvb, *swap;
 139     int        ntriangle, tb, bits;
 140     const int *blnr     = lincsd->blnr, *blbnb = lincsd->blbnb;
 141     const int *triangle = lincsd->triangle, *tri_bits = lincsd->tri_bits;
 142
 143     ntriangle = lincsd->ntriangle;
 144     nrec      = lincsd->nOrder;
 145
 146     for (rec = 0; rec < nrec; rec++)
 147     {
 148 #pragma omp barrier
 149         for (b = b0; b < b1; b++)
 150         {
 151             mvb = 0;
 152             for (n = blnr[b]; n < blnr[b+1]; n++)
 153             {
 154                 j   = blbnb[n];
 155                 mvb = mvb + blcc[n]*rhs1[j];
 156             }
 157             rhs2[b] = mvb;
 158             sol[b]  = sol[b] + mvb;
 159         }
 160         swap = rhs1;
 161         rhs1 = rhs2;
 162         rhs2 = swap;
 163     } /* nrec*(ncons+2*nrtot) flops */
 164
 165     if (ntriangle > 0)
 166     {
 167         /* Perform an extra nrec recursions for only the constraints
 168          * involved in rigid triangles.
 169          * In this way their accuracy should come close to those of the other
 170          * constraints, since traingles of constraints can produce eigenvalues
 171          * around 0.7, while the effective eigenvalue for bond constraints
 172          * is around 0.4 (and 0.7*0.7=0.5).
 173          */
 174         /* We need to copy the temporary array, since only the elements
 175          * for constraints involved in triangles are updated and then
 176          * the pointers are swapped. This saving copying the whole arrary.
 177          * We need barrier as other threads might still be reading from rhs2.
 178          */
 179 #pragma omp barrier
 180         for (b = b0; b < b1; b++)
 181         {
 182             rhs2[b] = rhs1[b];
 183         }
 184 #pragma omp barrier
 185 #pragma omp master
 186         {
 187             for (rec = 0; rec < nrec; rec++)
 188             {
 189                 for (tb = 0; tb < ntriangle; tb++)
 190                 {
 191                     b    = triangle[tb];
 192                     bits = tri_bits[tb];
 193                     mvb  = 0;
 194                     nr0  = blnr[b];
 195                     nr1  = blnr[b+1];
 196                     for (n = nr0; n < nr1; n++)
 197                     {
 198                         if (bits & (1<<(n-nr0)))
 199                         {
 200                             j   = blbnb[n];
 201                             mvb = mvb + blcc[n]*rhs1[j];
 202                         }
 203                     }
 204                     rhs2[b] = mvb;
 205                     sol[b]  = sol[b] + mvb;
 206                 }
 207                 swap = rhs1;
 208                 rhs1 = rhs2;
 209                 rhs2 = swap;
 210             }
 211         } /* flops count is missing here */
 212
 213         /* We need a barrier here as the calling routine will continue
 214          * to operate on the thread-local constraints without barrier.
 215          */
 216 #pragma omp barrier
 217     }
 218 }
 219
 220 static void lincs_update_atoms_noind(int ncons, const int *bla,
 221                                      real prefac,
 222                                      const real *fac, rvec *r,
 223                                      const real *invmass,
 224                                      rvec *x)
 225 {
 226     int  b, i, j;
 227     real mvb, im1, im2, tmp0, tmp1, tmp2;
 228
 229     if (invmass != NULL)
 230     {
 231         for (b = 0; b < ncons; b++)
 232         {
 233             i        = bla[2*b];
 234             j        = bla[2*b+1];
 235             mvb      = prefac*fac[b];
 236             im1      = invmass[i];
 237             im2      = invmass[j];
 238             tmp0     = r[b][0]*mvb;
 239             tmp1     = r[b][1]*mvb;
 240             tmp2     = r[b][2]*mvb;
 241             x[i][0] -= tmp0*im1;
 242             x[i][1] -= tmp1*im1;
 243             x[i][2] -= tmp2*im1;
 244             x[j][0] += tmp0*im2;
 245             x[j][1] += tmp1*im2;
 246             x[j][2] += tmp2*im2;
 247         } /* 16 ncons flops */
 248     }
 249     else
 250     {
 251         for (b = 0; b < ncons; b++)
 252         {
 253             i        = bla[2*b];
 254             j        = bla[2*b+1];
 255             mvb      = prefac*fac[b];
 256             tmp0     = r[b][0]*mvb;
 257             tmp1     = r[b][1]*mvb;
 258             tmp2     = r[b][2]*mvb;
 259             x[i][0] -= tmp0;
 260             x[i][1] -= tmp1;
 261             x[i][2] -= tmp2;
 262             x[j][0] += tmp0;
 263             x[j][1] += tmp1;
 264             x[j][2] += tmp2;
 265         }
 266     }
 267 }
 268
 269 static void lincs_update_atoms_ind(int ncons, const int *ind, const int *bla,
 270                                    real prefac,
 271                                    const real *fac, rvec *r,
 272                                    const real *invmass,
 273                                    rvec *x)
 274 {
 275     int  bi, b, i, j;
 276     real mvb, im1, im2, tmp0, tmp1, tmp2;
 277
 278     if (invmass != NULL)
 279     {
 280         for (bi = 0; bi < ncons; bi++)
 281         {
 282             b        = ind[bi];
 283             i        = bla[2*b];
 284             j        = bla[2*b+1];
 285             mvb      = prefac*fac[b];
 286             im1      = invmass[i];
 287             im2      = invmass[j];
 288             tmp0     = r[b][0]*mvb;
 289             tmp1     = r[b][1]*mvb;
 290             tmp2     = r[b][2]*mvb;
 291             x[i][0] -= tmp0*im1;
 292             x[i][1] -= tmp1*im1;
 293             x[i][2] -= tmp2*im1;
 294             x[j][0] += tmp0*im2;
 295             x[j][1] += tmp1*im2;
 296             x[j][2] += tmp2*im2;
 297         } /* 16 ncons flops */
 298     }
 299     else
 300     {
 301         for (bi = 0; bi < ncons; bi++)
 302         {
 303             b        = ind[bi];
 304             i        = bla[2*b];
 305             j        = bla[2*b+1];
 306             mvb      = prefac*fac[b];
 307             tmp0     = r[b][0]*mvb;
 308             tmp1     = r[b][1]*mvb;
 309             tmp2     = r[b][2]*mvb;
 310             x[i][0] -= tmp0;
 311             x[i][1] -= tmp1;
 312             x[i][2] -= tmp2;
 313             x[j][0] += tmp0;
 314             x[j][1] += tmp1;
 315             x[j][2] += tmp2;
 316         } /* 16 ncons flops */
 317     }
 318 }
 319
 320 static void lincs_update_atoms(struct gmx_lincsdata *li, int th,
 321                                real prefac,
 322                                const real *fac, rvec *r,
 323                                const real *invmass,
 324                                rvec *x)
 325 {
 326     if (li->nth == 1)
 327     {
 328         /* Single thread, we simply update for all constraints */
 329         lincs_update_atoms_noind(li->nc, li->bla, prefac, fac, r, invmass, x);
 330     }
 331     else
 332     {
 333         /* Update the atom vector components for our thread local
 334          * constraints that only access our local atom range.
 335          * This can be done without a barrier.
 336          */
 337         lincs_update_atoms_ind(li->th[th].nind, li->th[th].ind,
 338                                li->bla, prefac, fac, r, invmass, x);
 339
 340         if (li->th[li->nth].nind > 0)
 341         {
 342             /* Update the constraints that operate on atoms
 343              * in multiple thread atom blocks on the master thread.
 344              */
 345 #pragma omp barrier
 346 #pragma omp master
 347             {
 348                 lincs_update_atoms_ind(li->th[li->nth].nind,
 349                                        li->th[li->nth].ind,
 350                                        li->bla, prefac, fac, r, invmass, x);
 351             }
 352         }
 353     }
 354 }
 355
 356 /* LINCS projection, works on derivatives of the coordinates */
 357 static void do_lincsp(rvec *x, rvec *f, rvec *fp, t_pbc *pbc,
 358                       struct gmx_lincsdata *lincsd, int th,
 359                       real *invmass,
 360                       int econq, real *dvdlambda,
 361                       gmx_bool bCalcVir, tensor rmdf)
 362 {
 363     int      b0, b1, b, i, j, k, n;
 364     real     tmp0, tmp1, tmp2, im1, im2, mvb, rlen, len, wfac, lam;
 365     rvec     dx;
 366     int     *bla, *blnr, *blbnb;
 367     rvec    *r;
 368     real    *blc, *blmf, *blcc, *rhs1, *rhs2, *sol;
 369
 370     b0 = lincsd->th[th].b0;
 371     b1 = lincsd->th[th].b1;
 372
 373     bla    = lincsd->bla;
 374     r      = lincsd->tmpv;
 375     blnr   = lincsd->blnr;
 376     blbnb  = lincsd->blbnb;
 377     if (econq != econqForce)
 378     {
 379         /* Use mass-weighted parameters */
 380         blc  = lincsd->blc;
 381         blmf = lincsd->blmf;
 382     }
 383     else
 384     {
 385         /* Use non mass-weighted parameters */
 386         blc  = lincsd->blc1;
 387         blmf = lincsd->blmf1;
 388     }
 389     blcc   = lincsd->tmpncc;
 390     rhs1   = lincsd->tmp1;
 391     rhs2   = lincsd->tmp2;
 392     sol    = lincsd->tmp3;
 393
 394     /* Compute normalized i-j vectors */
 395     if (pbc)
 396     {
 397         for (b = b0; b < b1; b++)
 398         {
 399             pbc_dx_aiuc(pbc, x[bla[2*b]], x[bla[2*b+1]], dx);
 400             unitv(dx, r[b]);
 401         }
 402     }
 403     else
 404     {
 405         for (b = b0; b < b1; b++)
 406         {
 407             rvec_sub(x[bla[2*b]], x[bla[2*b+1]], dx);
 408             unitv(dx, r[b]);
 409         } /* 16 ncons flops */
 410     }
 411
 412 #pragma omp barrier
 413     for (b = b0; b < b1; b++)
 414     {
 415         tmp0 = r[b][0];
 416         tmp1 = r[b][1];
 417         tmp2 = r[b][2];
 418         i    = bla[2*b];
 419         j    = bla[2*b+1];
 420         for (n = blnr[b]; n < blnr[b+1]; n++)
 421         {
 422             k       = blbnb[n];
 423             blcc[n] = blmf[n]*(tmp0*r[k][0] + tmp1*r[k][1] + tmp2*r[k][2]);
 424         } /* 6 nr flops */
 425         mvb = blc[b]*(tmp0*(f[i][0] - f[j][0]) +
 426                       tmp1*(f[i][1] - f[j][1]) +
 427                       tmp2*(f[i][2] - f[j][2]));
 428         rhs1[b] = mvb;
 429         sol[b]  = mvb;
 430         /* 7 flops */
 431     }
 432     /* Together: 23*ncons + 6*nrtot flops */
 433
 434     lincs_matrix_expand(lincsd, b0, b1, blcc, rhs1, rhs2, sol);
 435     /* nrec*(ncons+2*nrtot) flops */
 436
 437     if (econq == econqDeriv_FlexCon)
 438     {
 439         /* We only want to constraint the flexible constraints,
 440          * so we mask out the normal ones by setting sol to 0.
 441          */
 442         for (b = b0; b < b1; b++)
 443         {
 444             if (!(lincsd->bllen0[b] == 0 && lincsd->ddist[b] == 0))
 445             {
 446                 sol[b] = 0;
 447             }
 448         }
 449     }
 450
 451     /* We multiply sol by blc, so we can use lincs_update_atoms for OpenMP */
 452     for (b = b0; b < b1; b++)
 453     {
 454         sol[b] *= blc[b];
 455     }
 456
 457     /* When constraining forces, we should not use mass weighting,
 458      * so we pass invmass=NULL, which results in the use of 1 for all atoms.
 459      */
 460     lincs_update_atoms(lincsd, th, 1.0, sol, r,
 461                        (econq != econqForce) ? invmass : NULL, fp);
 462
 463     if (dvdlambda != NULL)
 464     {
 465 #pragma omp barrier
 466         for (b = b0; b < b1; b++)
 467         {
 468             *dvdlambda -= sol[b]*lincsd->ddist[b];
 469         }
 470         /* 10 ncons flops */
 471     }
 472
 473     if (bCalcVir)
 474     {
 475         /* Constraint virial,
 476          * determines sum r_bond x delta f,
 477          * where delta f is the constraint correction
 478          * of the quantity that is being constrained.
 479          */
 480         for (b = b0; b < b1; b++)
 481         {
 482             mvb = lincsd->bllen[b]*sol[b];
 483             for (i = 0; i < DIM; i++)
 484             {
 485                 tmp1 = mvb*r[b][i];
 486                 for (j = 0; j < DIM; j++)
 487                 {
 488                     rmdf[i][j] += tmp1*r[b][j];
 489                 }
 490             }
 491         } /* 23 ncons flops */
 492     }
 493 }
 494
 495 static void do_lincs(rvec *x, rvec *xp, matrix box, t_pbc *pbc,
 496                      struct gmx_lincsdata *lincsd, int th,
 497                      real *invmass,
 498                      t_commrec *cr,
 499                      gmx_bool bCalcLambda,
 500                      real wangle, int *warn,
 501                      real invdt, rvec *v,
 502                      gmx_bool bCalcVir, tensor vir_r_m_dr)
 503 {
 504     int      b0, b1, b, i, j, k, n, iter;
 505     real     tmp0, tmp1, tmp2, im1, im2, mvb, rlen, len, len2, dlen2, wfac;
 506     rvec     dx;
 507     int     *bla, *blnr, *blbnb;
 508     rvec    *r;
 509     real    *blc, *blmf, *bllen, *blcc, *rhs1, *rhs2, *sol, *blc_sol, *mlambda;
 510     int     *nlocat;
 511
 512     b0 = lincsd->th[th].b0;
 513     b1 = lincsd->th[th].b1;
 514
 515     bla     = lincsd->bla;
 516     r       = lincsd->tmpv;
 517     blnr    = lincsd->blnr;
 518     blbnb   = lincsd->blbnb;
 519     blc     = lincsd->blc;
 520     blmf    = lincsd->blmf;
 521     bllen   = lincsd->bllen;
 522     blcc    = lincsd->tmpncc;
 523     rhs1    = lincsd->tmp1;
 524     rhs2    = lincsd->tmp2;
 525     sol     = lincsd->tmp3;
 526     blc_sol = lincsd->tmp4;
 527     mlambda = lincsd->mlambda;
 528
 529     if (DOMAINDECOMP(cr) && cr->dd->constraints)
 530     {
 531         nlocat = dd_constraints_nlocalatoms(cr->dd);
 532     }
 533     else if (PARTDECOMP(cr))
 534     {
 535         nlocat = pd_constraints_nlocalatoms(cr->pd);
 536     }
 537     else
 538     {
 539         nlocat = NULL;
 540     }
 541
 542     if (pbc)
 543     {
 544         /* Compute normalized i-j vectors */
 545         for (b = b0; b < b1; b++)
 546         {
 547             pbc_dx_aiuc(pbc, x[bla[2*b]], x[bla[2*b+1]], dx);
 548             unitv(dx, r[b]);
 549         }
 550 #pragma omp barrier
 551         for (b = b0; b < b1; b++)
 552         {
 553             for (n = blnr[b]; n < blnr[b+1]; n++)
 554             {
 555                 blcc[n] = blmf[n]*iprod(r[b], r[blbnb[n]]);
 556             }
 557             pbc_dx_aiuc(pbc, xp[bla[2*b]], xp[bla[2*b+1]], dx);
 558             mvb     = blc[b]*(iprod(r[b], dx) - bllen[b]);
 559             rhs1[b] = mvb;
 560             sol[b]  = mvb;
 561         }
 562     }
 563     else
 564     {
 565         /* Compute normalized i-j vectors */
 566         for (b = b0; b < b1; b++)
 567         {
 568             i       = bla[2*b];
 569             j       = bla[2*b+1];
 570             tmp0    = x[i][0] - x[j][0];
 571             tmp1    = x[i][1] - x[j][1];
 572             tmp2    = x[i][2] - x[j][2];
 573             rlen    = gmx_invsqrt(tmp0*tmp0+tmp1*tmp1+tmp2*tmp2);
 574             r[b][0] = rlen*tmp0;
 575             r[b][1] = rlen*tmp1;
 576             r[b][2] = rlen*tmp2;
 577         } /* 16 ncons flops */
 578
 579 #pragma omp barrier
 580         for (b = b0; b < b1; b++)
 581         {
 582             tmp0 = r[b][0];
 583             tmp1 = r[b][1];
 584             tmp2 = r[b][2];
 585             len  = bllen[b];
 586             i    = bla[2*b];
 587             j    = bla[2*b+1];
 588             for (n = blnr[b]; n < blnr[b+1]; n++)
 589             {
 590                 k       = blbnb[n];
 591                 blcc[n] = blmf[n]*(tmp0*r[k][0] + tmp1*r[k][1] + tmp2*r[k][2]);
 592             } /* 6 nr flops */
 593             mvb = blc[b]*(tmp0*(xp[i][0] - xp[j][0]) +
 594                           tmp1*(xp[i][1] - xp[j][1]) +
 595                           tmp2*(xp[i][2] - xp[j][2]) - len);
 596             rhs1[b] = mvb;
 597             sol[b]  = mvb;
 598             /* 10 flops */
 599         }
 600         /* Together: 26*ncons + 6*nrtot flops */
 601     }
 602
 603     lincs_matrix_expand(lincsd, b0, b1, blcc, rhs1, rhs2, sol);
 604     /* nrec*(ncons+2*nrtot) flops */
 605
 606     for (b = b0; b < b1; b++)
 607     {
 608         mlambda[b] = blc[b]*sol[b];
 609     }
 610
 611     /* Update the coordinates */
 612     lincs_update_atoms(lincsd, th, 1.0, mlambda, r, invmass, xp);
 613
 614     /*
 615      ********  Correction for centripetal effects  ********
 616      */
 617
 618     wfac = cos(DEG2RAD*wangle);
 619     wfac = wfac*wfac;
 620
 621     for (iter = 0; iter < lincsd->nIter; iter++)
 622     {
 623         if ((lincsd->bCommIter && DOMAINDECOMP(cr) && cr->dd->constraints) ||
 624             PARTDECOMP(cr))
 625         {
 626 #pragma omp barrier
 627 #pragma omp master
 628             {
 629                 /* Communicate the corrected non-local coordinates */
 630                 if (DOMAINDECOMP(cr))
 631                 {
 632                     dd_move_x_constraints(cr->dd, box, xp, NULL);
 633                 }
 634                 else
 635                 {
 636                     pd_move_x_constraints(cr, xp, NULL);
 637                 }
 638             }
 639         }
 640
 641 #pragma omp barrier
 642         for (b = b0; b < b1; b++)
 643         {
 644             len = bllen[b];
 645             if (pbc)
 646             {
 647                 pbc_dx_aiuc(pbc, xp[bla[2*b]], xp[bla[2*b+1]], dx);
 648             }
 649             else
 650             {
 651                 rvec_sub(xp[bla[2*b]], xp[bla[2*b+1]], dx);
 652             }
 653             len2  = len*len;
 654             dlen2 = 2*len2 - norm2(dx);
 655             if (dlen2 < wfac*len2 && (nlocat == NULL || nlocat[b]))
 656             {
 657                 *warn = b;
 658             }
 659             if (dlen2 > 0)
 660             {
 661                 mvb = blc[b]*(len - dlen2*gmx_invsqrt(dlen2));
 662             }
 663             else
 664             {
 665                 mvb = blc[b]*len;
 666             }
 667             rhs1[b] = mvb;
 668             sol[b]  = mvb;
 669         } /* 20*ncons flops */
 670
 671         lincs_matrix_expand(lincsd, b0, b1, blcc, rhs1, rhs2, sol);
 672         /* nrec*(ncons+2*nrtot) flops */
 673
 674         for (b = b0; b < b1; b++)
 675         {
 676             mvb         = blc[b]*sol[b];
 677             blc_sol[b]  = mvb;
 678             mlambda[b] += mvb;
 679         }
 680
 681         /* Update the coordinates */
 682         lincs_update_atoms(lincsd, th, 1.0, blc_sol, r, invmass, xp);
 683     }
 684     /* nit*ncons*(37+9*nrec) flops */
 685
 686     if (v != NULL)
 687     {
 688         /* Update the velocities */
 689         lincs_update_atoms(lincsd, th, invdt, mlambda, r, invmass, v);
 690         /* 16 ncons flops */
 691     }
 692
 693     if (nlocat != NULL && bCalcLambda)
 694     {
 695         /* In lincs_update_atoms thread might cross-read mlambda */
 696 #pragma omp barrier
 697
 698         /* Only account for local atoms */
 699         for (b = b0; b < b1; b++)
 700         {
 701             mlambda[b] *= 0.5*nlocat[b];
 702         }
 703     }
 704
 705     if (bCalcVir)
 706     {
 707         /* Constraint virial */
 708         for (b = b0; b < b1; b++)
 709         {
 710             tmp0 = -bllen[b]*mlambda[b];
 711             for (i = 0; i < DIM; i++)
 712             {
 713                 tmp1 = tmp0*r[b][i];
 714                 for (j = 0; j < DIM; j++)
 715                 {
 716                     vir_r_m_dr[i][j] -= tmp1*r[b][j];
 717                 }
 718             }
 719         } /* 22 ncons flops */
 720     }
 721
 722     /* Total:
 723      * 26*ncons + 6*nrtot + nrec*(ncons+2*nrtot)
 724      * + nit * (20*ncons + nrec*(ncons+2*nrtot) + 17 ncons)
 725      *
 726      * (26+nrec)*ncons + (6+2*nrec)*nrtot
 727      * + nit * ((37+nrec)*ncons + 2*nrec*nrtot)
 728      * if nit=1
 729      * (63+nrec)*ncons + (6+4*nrec)*nrtot
 730      */
 731 }
 732
 733 void set_lincs_matrix(struct gmx_lincsdata *li, real *invmass, real lambda)
 734 {
 735     int        i, a1, a2, n, k, sign, center;
 736     int        end, nk, kk;
 737     const real invsqrt2 = 0.7071067811865475244;
 738
 739     for (i = 0; (i < li->nc); i++)
 740     {
 741         a1          = li->bla[2*i];
 742         a2          = li->bla[2*i+1];
 743         li->blc[i]  = gmx_invsqrt(invmass[a1] + invmass[a2]);
 744         li->blc1[i] = invsqrt2;
 745     }
 746
 747     /* Construct the coupling coefficient matrix blmf */
 748     li->ntriangle    = 0;
 749     li->ncc_triangle = 0;
 750     for (i = 0; (i < li->nc); i++)
 751     {
 752         a1 = li->bla[2*i];
 753         a2 = li->bla[2*i+1];
 754         for (n = li->blnr[i]; (n < li->blnr[i+1]); n++)
 755         {
 756             k = li->blbnb[n];
 757             if (a1 == li->bla[2*k] || a2 == li->bla[2*k+1])
 758             {
 759                 sign = -1;
 760             }
 761             else
 762             {
 763                 sign = 1;
 764             }
 765             if (a1 == li->bla[2*k] || a1 == li->bla[2*k+1])
 766             {
 767                 center = a1;
 768                 end    = a2;
 769             }
 770             else
 771             {
 772                 center = a2;
 773                 end    = a1;
 774             }
 775             li->blmf[n]  = sign*invmass[center]*li->blc[i]*li->blc[k];
 776             li->blmf1[n] = sign*0.5;
 777             if (li->ncg_triangle > 0)
 778             {
 779                 /* Look for constraint triangles */
 780                 for (nk = li->blnr[k]; (nk < li->blnr[k+1]); nk++)
 781                 {
 782                     kk = li->blbnb[nk];
 783                     if (kk != i && kk != k &&
 784                         (li->bla[2*kk] == end || li->bla[2*kk+1] == end))
 785                     {
 786                         if (li->ntriangle == 0 ||
 787                             li->triangle[li->ntriangle-1] < i)
 788                         {
 789                             /* Add this constraint to the triangle list */
 790                             li->triangle[li->ntriangle] = i;
 791                             li->tri_bits[li->ntriangle] = 0;
 792                             li->ntriangle++;
 793                             if (li->blnr[i+1] - li->blnr[i] > sizeof(li->tri_bits[0])*8 - 1)
 794                             {
 795                                 gmx_fatal(FARGS, "A constraint is connected to %d constraints, this is more than the %d allowed for constraints participating in triangles",
 796                                           li->blnr[i+1] - li->blnr[i],
 797                                           sizeof(li->tri_bits[0])*8-1);
 798                             }
 799                         }
 800                         li->tri_bits[li->ntriangle-1] |= (1<<(n-li->blnr[i]));
 801                         li->ncc_triangle++;
 802                     }
 803                 }
 804             }
 805         }
 806     }
 807
 808     if (debug)
 809     {
 810         fprintf(debug, "Of the %d constraints %d participate in triangles\n",
 811                 li->nc, li->ntriangle);
 812         fprintf(debug, "There are %d couplings of which %d in triangles\n",
 813                 li->ncc, li->ncc_triangle);
 814     }
 815
 816     /* Set matlam,
 817      * so we know with which lambda value the masses have been set.
 818      */
 819     li->matlam = lambda;
 820 }
 821
 822 static int count_triangle_constraints(t_ilist *ilist, t_blocka *at2con)
 823 {
 824     int      ncon1, ncon_tot;
 825     int      c0, a00, a01, n1, c1, a10, a11, ac1, n2, c2, a20, a21;
 826     int      ncon_triangle;
 827     gmx_bool bTriangle;
 828     t_iatom *ia1, *ia2, *iap;
 829
 830     ncon1    = ilist[F_CONSTR].nr/3;
 831     ncon_tot = ncon1 + ilist[F_CONSTRNC].nr/3;
 832
 833     ia1 = ilist[F_CONSTR].iatoms;
 834     ia2 = ilist[F_CONSTRNC].iatoms;
 835
 836     ncon_triangle = 0;
 837     for (c0 = 0; c0 < ncon_tot; c0++)
 838     {
 839         bTriangle = FALSE;
 840         iap       = constr_iatomptr(ncon1, ia1, ia2, c0);
 841         a00       = iap[1];
 842         a01       = iap[2];
 843         for (n1 = at2con->index[a01]; n1 < at2con->index[a01+1]; n1++)
 844         {
 845             c1 = at2con->a[n1];
 846             if (c1 != c0)
 847             {
 848                 iap = constr_iatomptr(ncon1, ia1, ia2, c1);
 849                 a10 = iap[1];
 850                 a11 = iap[2];
 851                 if (a10 == a01)
 852                 {
 853                     ac1 = a11;
 854                 }
 855                 else
 856                 {
 857                     ac1 = a10;
 858                 }
 859                 for (n2 = at2con->index[ac1]; n2 < at2con->index[ac1+1]; n2++)
 860                 {
 861                     c2 = at2con->a[n2];
 862                     if (c2 != c0 && c2 != c1)
 863                     {
 864                         iap = constr_iatomptr(ncon1, ia1, ia2, c2);
 865                         a20 = iap[1];
 866                         a21 = iap[2];
 867                         if (a20 == a00 || a21 == a00)
 868                         {
 869                             bTriangle = TRUE;
 870                         }
 871                     }
 872                 }
 873             }
 874         }
 875         if (bTriangle)
 876         {
 877             ncon_triangle++;
 878         }
 879     }
 880
 881     return ncon_triangle;
 882 }
 883
 884 static gmx_bool more_than_two_sequential_constraints(const t_ilist  *ilist,
 885                                                      const t_blocka *at2con)
 886 {
 887     t_iatom  *ia1, *ia2, *iap;
 888     int       ncon1, ncon_tot, c;
 889     int       a1, a2;
 890     gmx_bool  bMoreThanTwoSequentialConstraints;
 891
 892     ncon1    = ilist[F_CONSTR].nr/3;
 893     ncon_tot = ncon1 + ilist[F_CONSTRNC].nr/3;
 894
 895     ia1 = ilist[F_CONSTR].iatoms;
 896     ia2 = ilist[F_CONSTRNC].iatoms;
 897
 898     bMoreThanTwoSequentialConstraints = FALSE;
 899     for (c = 0; c < ncon_tot && !bMoreThanTwoSequentialConstraints; c++)
 900     {
 901         iap = constr_iatomptr(ncon1, ia1, ia2, c);
 902         a1  = iap[1];
 903         a2  = iap[2];
 904         /* Check if this constraint has constraints connected at both atoms */
 905         if (at2con->index[a1+1] - at2con->index[a1] > 1 &&
 906             at2con->index[a2+1] - at2con->index[a2] > 1)
 907         {
 908             bMoreThanTwoSequentialConstraints = TRUE;
 909         }
 910     }
 911
 912     return bMoreThanTwoSequentialConstraints;
 913 }
 914
 915 static int int_comp(const void *a, const void *b)
 916 {
 917     return (*(int *)a) - (*(int *)b);
 918 }
 919
 920 gmx_lincsdata_t init_lincs(FILE *fplog, gmx_mtop_t *mtop,
 921                            int nflexcon_global, t_blocka *at2con,
 922                            gmx_bool bPLINCS, int nIter, int nProjOrder)
 923 {
 924     struct gmx_lincsdata *li;
 925     int                   mb;
 926     gmx_moltype_t        *molt;
 927
 928     if (fplog)
 929     {
 930         fprintf(fplog, "\nInitializing%s LINear Constraint Solver\n",
 931                 bPLINCS ? " Parallel" : "");
 932     }
 933
 934     snew(li, 1);
 935
 936     li->ncg      =
 937         gmx_mtop_ftype_count(mtop, F_CONSTR) +
 938         gmx_mtop_ftype_count(mtop, F_CONSTRNC);
 939     li->ncg_flex = nflexcon_global;
 940
 941     li->nIter  = nIter;
 942     li->nOrder = nProjOrder;
 943
 944     li->ncg_triangle = 0;
 945     li->bCommIter    = FALSE;
 946     for (mb = 0; mb < mtop->nmolblock; mb++)
 947     {
 948         molt              = &mtop->moltype[mtop->molblock[mb].type];
 949         li->ncg_triangle +=
 950             mtop->molblock[mb].nmol*
 951             count_triangle_constraints(molt->ilist,
 952                                        &at2con[mtop->molblock[mb].type]);
 953         if (bPLINCS && li->bCommIter == FALSE)
 954         {
 955             /* Check if we need to communicate not only before LINCS,
 956              * but also before each iteration.
 957              * The check for only two sequential constraints is only
 958              * useful for the common case of H-bond only constraints.
 959              * With more effort we could also make it useful for small
 960              * molecules with nr. sequential constraints <= nOrder-1.
 961              */
 962             li->bCommIter = (li->nOrder < 1 || more_than_two_sequential_constraints(molt->ilist, &at2con[mtop->molblock[mb].type]));
 963         }
 964     }
 965     if (debug && bPLINCS)
 966     {
 967         fprintf(debug, "PLINCS communication before each iteration: %d\n",
 968                 li->bCommIter);
 969     }
 970
 971     /* LINCS can run on any number of threads.
 972      * Currently the number is fixed for the whole simulation,
 973      * but it could be set in set_lincs().
 974      */
 975     li->nth = gmx_omp_nthreads_get(emntLINCS);
 976     if (li->nth == 1)
 977     {
 978         snew(li->th, 1);
 979     }
 980     else
 981     {
 982         /* Allocate an extra elements for "thread-overlap" constraints */
 983         snew(li->th, li->nth+1);
 984     }
 985     if (debug)
 986     {
 987         fprintf(debug, "LINCS: using %d threads\n", li->nth);
 988     }
 989
 990     if (bPLINCS || li->ncg_triangle > 0)
 991     {
 992         please_cite(fplog, "Hess2008a");
 993     }
 994     else
 995     {
 996         please_cite(fplog, "Hess97a");
 997     }
 998
 999     if (fplog)
1000     {
1001         fprintf(fplog, "The number of constraints is %d\n", li->ncg);
1002         if (bPLINCS)
1003         {
1004             fprintf(fplog, "There are inter charge-group constraints,\n"
1005                     "will communicate selected coordinates each lincs iteration\n");
1006         }
1007         if (li->ncg_triangle > 0)
1008         {
1009             fprintf(fplog,
1010                     "%d constraints are involved in constraint triangles,\n"
1011                     "will apply an additional matrix expansion of order %d for couplings\n"
1012                     "between constraints inside triangles\n",
1013                     li->ncg_triangle, li->nOrder);
1014         }
1015     }
1016
1017     return li;
1018 }
1019
1020 /* Sets up the work division over the threads */
1021 static void lincs_thread_setup(struct gmx_lincsdata *li, int natoms)
1022 {
1023     lincs_thread_t *li_m;
1024     int             th;
1025     unsigned       *atf;
1026     int             a;
1027
1028     if (natoms > li->atf_nalloc)
1029     {
1030         li->atf_nalloc = over_alloc_large(natoms);
1031         srenew(li->atf, li->atf_nalloc);
1032     }
1033
1034     atf = li->atf;
1035     /* Clear the atom flags */
1036     for (a = 0; a < natoms; a++)
1037     {
1038         atf[a] = 0;
1039     }
1040
1041     for (th = 0; th < li->nth; th++)
1042     {
1043         lincs_thread_t *li_th;
1044         int             b;
1045
1046         li_th = &li->th[th];
1047
1048         /* The constraints are divided equally over the threads */
1049         li_th->b0 = (li->nc* th   )/li->nth;
1050         li_th->b1 = (li->nc*(th+1))/li->nth;
1051
1052         if (th < sizeof(*atf)*8)
1053         {
1054             /* For each atom set a flag for constraints from each */
1055             for (b = li_th->b0; b < li_th->b1; b++)
1056             {
1057                 atf[li->bla[b*2]  ] |= (1U<<th);
1058                 atf[li->bla[b*2+1]] |= (1U<<th);
1059             }
1060         }
1061     }
1062
1063 #pragma omp parallel for num_threads(li->nth) schedule(static)
1064     for (th = 0; th < li->nth; th++)
1065     {
1066         lincs_thread_t *li_th;
1067         unsigned        mask;
1068         int             b;
1069
1070         li_th = &li->th[th];
1071
1072         if (li_th->b1 - li_th->b0 > li_th->ind_nalloc)
1073         {
1074             li_th->ind_nalloc = over_alloc_large(li_th->b1-li_th->b0);
1075             srenew(li_th->ind, li_th->ind_nalloc);
1076             srenew(li_th->ind_r, li_th->ind_nalloc);
1077         }
1078
1079         if (th < sizeof(*atf)*8)
1080         {
1081             mask = (1U<<th) - 1U;
1082
1083             li_th->nind   = 0;
1084             li_th->nind_r = 0;
1085             for (b = li_th->b0; b < li_th->b1; b++)
1086             {
1087                 /* We let the constraint with the lowest thread index
1088                  * operate on atoms with constraints from multiple threads.
1089                  */
1090                 if (((atf[li->bla[b*2]]   & mask) == 0) &&
1091                     ((atf[li->bla[b*2+1]] & mask) == 0))
1092                 {
1093                     /* Add the constraint to the local atom update index */
1094                     li_th->ind[li_th->nind++] = b;
1095                 }
1096                 else
1097                 {
1098                     /* Add the constraint to the rest block */
1099                     li_th->ind_r[li_th->nind_r++] = b;
1100                 }
1101             }
1102         }
1103         else
1104         {
1105             /* We are out of bits, assign all constraints to rest */
1106             for (b = li_th->b0; b < li_th->b1; b++)
1107             {
1108                 li_th->ind_r[li_th->nind_r++] = b;
1109             }
1110         }
1111     }
1112
1113     /* We need to copy all constraints which have not be assigned
1114      * to a thread to a separate list which will be handled by one thread.
1115      */
1116     li_m = &li->th[li->nth];
1117
1118     li_m->nind = 0;
1119     for (th = 0; th < li->nth; th++)
1120     {
1121         lincs_thread_t *li_th;
1122         int             b;
1123
1124         li_th   = &li->th[th];
1125
1126         if (li_m->nind + li_th->nind_r > li_m->ind_nalloc)
1127         {
1128             li_m->ind_nalloc = over_alloc_large(li_m->nind+li_th->nind_r);
1129             srenew(li_m->ind, li_m->ind_nalloc);
1130         }
1131
1132         for (b = 0; b < li_th->nind_r; b++)
1133         {
1134             li_m->ind[li_m->nind++] = li_th->ind_r[b];
1135         }
1136
1137         if (debug)
1138         {
1139             fprintf(debug, "LINCS thread %d: %d constraints\n",
1140                     th, li_th->nind);
1141         }
1142     }
1143
1144     if (debug)
1145     {
1146         fprintf(debug, "LINCS thread r: %d constraints\n",
1147                 li_m->nind);
1148     }
1149 }
1150
1151
1152 void set_lincs(t_idef *idef, t_mdatoms *md,
1153                gmx_bool bDynamics, t_commrec *cr,
1154                struct gmx_lincsdata *li)
1155 {
1156     int          start, natoms, nflexcon;
1157     t_blocka     at2con;
1158     t_iatom     *iatom;
1159     int          i, k, ncc_alloc, ni, con, nconnect, concon;
1160     int          type, a1, a2;
1161     real         lenA = 0, lenB;
1162     gmx_bool     bLocal;
1163
1164     li->nc  = 0;
1165     li->ncc = 0;
1166     /* Zero the thread index ranges.
1167      * Otherwise without local constraints we could return with old ranges.
1168      */
1169     for (i = 0; i < li->nth; i++)
1170     {
1171         li->th[i].b0   = 0;
1172         li->th[i].b1   = 0;
1173         li->th[i].nind = 0;
1174     }
1175     if (li->nth > 1)
1176     {
1177         li->th[li->nth].nind = 0;
1178     }
1179
1180     /* This is the local topology, so there are only F_CONSTR constraints */
1181     if (idef->il[F_CONSTR].nr == 0)
1182     {
1183         /* There are no constraints,
1184          * we do not need to fill any data structures.
1185          */
1186         return;
1187     }
1188
1189     if (debug)
1190     {
1191         fprintf(debug, "Building the LINCS connectivity\n");
1192     }
1193
1194     if (DOMAINDECOMP(cr))
1195     {
1196         if (cr->dd->constraints)
1197         {
1198             dd_get_constraint_range(cr->dd, &start, &natoms);
1199         }
1200         else
1201         {
1202             natoms = cr->dd->nat_home;
1203         }
1204         start = 0;
1205     }
1206     else if (PARTDECOMP(cr))
1207     {
1208         pd_get_constraint_range(cr->pd, &start, &natoms);
1209     }
1210     else
1211     {
1212         start  = md->start;
1213         natoms = md->homenr;
1214     }
1215     at2con = make_at2con(start, natoms, idef->il, idef->iparams, bDynamics,
1216                          &nflexcon);
1217
1218
1219     if (idef->il[F_CONSTR].nr/3 > li->nc_alloc || li->nc_alloc == 0)
1220     {
1221         li->nc_alloc = over_alloc_dd(idef->il[F_CONSTR].nr/3);
1222         srenew(li->bllen0, li->nc_alloc);
1223         srenew(li->ddist, li->nc_alloc);
1224         srenew(li->bla, 2*li->nc_alloc);
1225         srenew(li->blc, li->nc_alloc);
1226         srenew(li->blc1, li->nc_alloc);
1227         srenew(li->blnr, li->nc_alloc+1);
1228         srenew(li->bllen, li->nc_alloc);
1229         srenew(li->tmpv, li->nc_alloc);
1230         srenew(li->tmp1, li->nc_alloc);
1231         srenew(li->tmp2, li->nc_alloc);
1232         srenew(li->tmp3, li->nc_alloc);
1233         srenew(li->tmp4, li->nc_alloc);
1234         srenew(li->mlambda, li->nc_alloc);
1235         if (li->ncg_triangle > 0)
1236         {
1237             /* This is allocating too much, but it is difficult to improve */
1238             srenew(li->triangle, li->nc_alloc);
1239             srenew(li->tri_bits, li->nc_alloc);
1240         }
1241     }
1242
1243     iatom = idef->il[F_CONSTR].iatoms;
1244
1245     ncc_alloc   = li->ncc_alloc;
1246     li->blnr[0] = 0;
1247
1248     ni = idef->il[F_CONSTR].nr/3;
1249
1250     con           = 0;
1251     nconnect      = 0;
1252     li->blnr[con] = nconnect;
1253     for (i = 0; i < ni; i++)
1254     {
1255         bLocal = TRUE;
1256         type   = iatom[3*i];
1257         a1     = iatom[3*i+1];
1258         a2     = iatom[3*i+2];
1259         lenA   = idef->iparams[type].constr.dA;
1260         lenB   = idef->iparams[type].constr.dB;
1261         /* Skip the flexible constraints when not doing dynamics */
1262         if (bDynamics || lenA != 0 || lenB != 0)
1263         {
1264             li->bllen0[con]  = lenA;
1265             li->ddist[con]   = lenB - lenA;
1266             /* Set the length to the topology A length */
1267             li->bllen[con]   = li->bllen0[con];
1268             li->bla[2*con]   = a1;
1269             li->bla[2*con+1] = a2;
1270             /* Construct the constraint connection matrix blbnb */
1271             for (k = at2con.index[a1-start]; k < at2con.index[a1-start+1]; k++)
1272             {
1273                 concon = at2con.a[k];
1274                 if (concon != i)
1275                 {
1276                     if (nconnect >= ncc_alloc)
1277                     {
1278                         ncc_alloc = over_alloc_small(nconnect+1);
1279                         srenew(li->blbnb, ncc_alloc);
1280                     }
1281                     li->blbnb[nconnect++] = concon;
1282                 }
1283             }
1284             for (k = at2con.index[a2-start]; k < at2con.index[a2-start+1]; k++)
1285             {
1286                 concon = at2con.a[k];
1287                 if (concon != i)
1288                 {
1289                     if (nconnect+1 > ncc_alloc)
1290                     {
1291                         ncc_alloc = over_alloc_small(nconnect+1);
1292                         srenew(li->blbnb, ncc_alloc);
1293                     }
1294                     li->blbnb[nconnect++] = concon;
1295                 }
1296             }
1297             li->blnr[con+1] = nconnect;
1298
1299             if (cr->dd == NULL)
1300             {
1301                 /* Order the blbnb matrix to optimize memory access */
1302                 qsort(&(li->blbnb[li->blnr[con]]), li->blnr[con+1]-li->blnr[con],
1303                       sizeof(li->blbnb[0]), int_comp);
1304             }
1305             /* Increase the constraint count */
1306             con++;
1307         }
1308     }
1309
1310     done_blocka(&at2con);
1311
1312     /* This is the real number of constraints,
1313      * without dynamics the flexible constraints are not present.
1314      */
1315     li->nc = con;
1316
1317     li->ncc = li->blnr[con];
1318     if (cr->dd == NULL)
1319     {
1320         /* Since the matrix is static, we can free some memory */
1321         ncc_alloc = li->ncc;
1322         srenew(li->blbnb, ncc_alloc);
1323     }
1324
1325     if (ncc_alloc > li->ncc_alloc)
1326     {
1327         li->ncc_alloc = ncc_alloc;
1328         srenew(li->blmf, li->ncc_alloc);
1329         srenew(li->blmf1, li->ncc_alloc);
1330         srenew(li->tmpncc, li->ncc_alloc);
1331     }
1332
1333     if (debug)
1334     {
1335         fprintf(debug, "Number of constraints is %d, couplings %d\n",
1336                 li->nc, li->ncc);
1337     }
1338
1339     if (li->nth == 1)
1340     {
1341         li->th[0].b0 = 0;
1342         li->th[0].b1 = li->nc;
1343     }
1344     else
1345     {
1346         lincs_thread_setup(li, md->nr);
1347     }
1348
1349     set_lincs_matrix(li, md->invmass, md->lambda);
1350 }
1351
1352 static void lincs_warning(FILE *fplog,
1353                           gmx_domdec_t *dd, rvec *x, rvec *xprime, t_pbc *pbc,
1354                           int ncons, int *bla, real *bllen, real wangle,
1355                           int maxwarn, int *warncount)
1356 {
1357     int  b, i, j;
1358     rvec v0, v1;
1359     real wfac, d0, d1, cosine;
1360     char buf[STRLEN];
1361
1362     wfac = cos(DEG2RAD*wangle);
1363
1364     sprintf(buf, "bonds that rotated more than %g degrees:\n"
1365             " atom 1 atom 2  angle  previous, current, constraint length\n",
1366             wangle);
1367     fprintf(stderr, "%s", buf);
1368     if (fplog)
1369     {
1370         fprintf(fplog, "%s", buf);
1371     }
1372
1373     for (b = 0; b < ncons; b++)
1374     {
1375         i = bla[2*b];
1376         j = bla[2*b+1];
1377         if (pbc)
1378         {
1379             pbc_dx_aiuc(pbc, x[i], x[j], v0);
1380             pbc_dx_aiuc(pbc, xprime[i], xprime[j], v1);
1381         }
1382         else
1383         {
1384             rvec_sub(x[i], x[j], v0);
1385             rvec_sub(xprime[i], xprime[j], v1);
1386         }
1387         d0     = norm(v0);
1388         d1     = norm(v1);
1389         cosine = iprod(v0, v1)/(d0*d1);
1390         if (cosine < wfac)
1391         {
1392             sprintf(buf, " %6d %6d  %5.1f  %8.4f %8.4f    %8.4f\n",
1393                     ddglatnr(dd, i), ddglatnr(dd, j),
1394                     RAD2DEG*acos(cosine), d0, d1, bllen[b]);
1395             fprintf(stderr, "%s", buf);
1396             if (fplog)
1397             {
1398                 fprintf(fplog, "%s", buf);
1399             }
1400             if (!gmx_isfinite(d1))
1401             {
1402                 gmx_fatal(FARGS, "Bond length not finite.");
1403             }
1404
1405             (*warncount)++;
1406         }
1407     }
1408     if (*warncount > maxwarn)
1409     {
1410         too_many_constraint_warnings(econtLINCS, *warncount);
1411     }
1412 }
1413
1414 static void cconerr(gmx_domdec_t *dd,
1415                     int ncons, int *bla, real *bllen, rvec *x, t_pbc *pbc,
1416                     real *ncons_loc, real *ssd, real *max, int *imax)
1417 {
1418     real       len, d, ma, ssd2, r2;
1419     int       *nlocat, count, b, im;
1420     rvec       dx;
1421
1422     if (dd && dd->constraints)
1423     {
1424         nlocat = dd_constraints_nlocalatoms(dd);
1425     }
1426     else
1427     {
1428         nlocat = 0;
1429     }
1430
1431     ma    = 0;
1432     ssd2  = 0;
1433     im    = 0;
1434     count = 0;
1435     for (b = 0; b < ncons; b++)
1436     {
1437         if (pbc)
1438         {
1439             pbc_dx_aiuc(pbc, x[bla[2*b]], x[bla[2*b+1]], dx);
1440         }
1441         else
1442         {
1443             rvec_sub(x[bla[2*b]], x[bla[2*b+1]], dx);
1444         }
1445         r2  = norm2(dx);
1446         len = r2*gmx_invsqrt(r2);
1447         d   = fabs(len/bllen[b]-1);
1448         if (d > ma && (nlocat == NULL || nlocat[b]))
1449         {
1450             ma = d;
1451             im = b;
1452         }
1453         if (nlocat == NULL)
1454         {
1455             ssd2 += d*d;
1456             count++;
1457         }
1458         else
1459         {
1460             ssd2  += nlocat[b]*d*d;
1461             count += nlocat[b];
1462         }
1463     }
1464
1465     *ncons_loc = (nlocat ? 0.5 : 1)*count;
1466     *ssd       = (nlocat ? 0.5 : 1)*ssd2;
1467     *max       = ma;
1468     *imax      = im;
1469 }
1470
1471 gmx_bool constrain_lincs(FILE *fplog, gmx_bool bLog, gmx_bool bEner,
1472                          t_inputrec *ir,
1473                          gmx_large_int_t step,
1474                          struct gmx_lincsdata *lincsd, t_mdatoms *md,
1475                          t_commrec *cr,
1476                          rvec *x, rvec *xprime, rvec *min_proj,
1477                          matrix box, t_pbc *pbc,
1478                          real lambda, real *dvdlambda,
1479                          real invdt, rvec *v,
1480                          gmx_bool bCalcVir, tensor vir_r_m_dr,
1481                          int econq,
1482                          t_nrnb *nrnb,
1483                          int maxwarn, int *warncount)
1484 {
1485     char      buf[STRLEN], buf2[22], buf3[STRLEN];
1486     int       i, warn, p_imax, error;
1487     real      ncons_loc, p_ssd, p_max = 0;
1488     rvec      dx;
1489     gmx_bool  bOK;
1490
1491     bOK = TRUE;
1492
1493     if (lincsd->nc == 0 && cr->dd == NULL)
1494     {
1495         if (bLog || bEner)
1496         {
1497             lincsd->rmsd_data[0] = 0;
1498             if (ir->eI == eiSD2 && v == NULL)
1499             {
1500                 i = 2;
1501             }
1502             else
1503             {
1504                 i = 1;
1505             }
1506             lincsd->rmsd_data[i] = 0;
1507         }
1508
1509         return bOK;
1510     }
1511
1512     if (econq == econqCoord)
1513     {
1514         if (ir->efep != efepNO)
1515         {
1516             if (md->nMassPerturbed && lincsd->matlam != md->lambda)
1517             {
1518                 set_lincs_matrix(lincsd, md->invmass, md->lambda);
1519             }
1520
1521             for (i = 0; i < lincsd->nc; i++)
1522             {
1523                 lincsd->bllen[i] = lincsd->bllen0[i] + lambda*lincsd->ddist[i];
1524             }
1525         }
1526
1527         if (lincsd->ncg_flex)
1528         {
1529             /* Set the flexible constraint lengths to the old lengths */
1530             if (pbc != NULL)
1531             {
1532                 for (i = 0; i < lincsd->nc; i++)
1533                 {
1534                     if (lincsd->bllen[i] == 0)
1535                     {
1536                         pbc_dx_aiuc(pbc, x[lincsd->bla[2*i]], x[lincsd->bla[2*i+1]], dx);
1537                         lincsd->bllen[i] = norm(dx);
1538                     }
1539                 }
1540             }
1541             else
1542             {
1543                 for (i = 0; i < lincsd->nc; i++)
1544                 {
1545                     if (lincsd->bllen[i] == 0)
1546                     {
1547                         lincsd->bllen[i] =
1548                             sqrt(distance2(x[lincsd->bla[2*i]],
1549                                            x[lincsd->bla[2*i+1]]));
1550                     }
1551                 }
1552             }
1553         }
1554
1555         if (bLog && fplog)
1556         {
1557             cconerr(cr->dd, lincsd->nc, lincsd->bla, lincsd->bllen, xprime, pbc,
1558                     &ncons_loc, &p_ssd, &p_max, &p_imax);
1559         }
1560
1561         /* This warn var can be updated by multiple threads
1562          * at the same time. But as we only need to detect
1563          * if a warning occured or not, this is not an issue.
1564          */
1565         warn = -1;
1566
1567         /* The OpenMP parallel region of constrain_lincs for coords */
1568 #pragma omp parallel num_threads(lincsd->nth)
1569         {
1570             int th = gmx_omp_get_thread_num();
1571
1572             clear_mat(lincsd->th[th].vir_r_m_dr);
1573
1574             do_lincs(x, xprime, box, pbc, lincsd, th,
1575                      md->invmass, cr,
1576                      bCalcVir || (ir->efep != efepNO),
1577                      ir->LincsWarnAngle, &warn,
1578                      invdt, v, bCalcVir,
1579                      th == 0 ? vir_r_m_dr : lincsd->th[th].vir_r_m_dr);
1580         }
1581
1582         if (ir->efep != efepNO)
1583         {
1584             real dt_2, dvdl = 0;
1585
1586             dt_2 = 1.0/(ir->delta_t*ir->delta_t);
1587             for (i = 0; (i < lincsd->nc); i++)
1588             {
1589                 dvdl -= lincsd->mlambda[i]*dt_2*lincsd->ddist[i];
1590             }
1591             *dvdlambda += dvdl;
1592         }
1593
1594         if (bLog && fplog && lincsd->nc > 0)
1595         {
1596             fprintf(fplog, "   Rel. Constraint Deviation:  RMS         MAX     between atoms\n");
1597             fprintf(fplog, "       Before LINCS          %.6f    %.6f %6d %6d\n",
1598                     sqrt(p_ssd/ncons_loc), p_max,
1599                     ddglatnr(cr->dd, lincsd->bla[2*p_imax]),
1600                     ddglatnr(cr->dd, lincsd->bla[2*p_imax+1]));
1601         }
1602         if (bLog || bEner)
1603         {
1604             cconerr(cr->dd, lincsd->nc, lincsd->bla, lincsd->bllen, xprime, pbc,
1605                     &ncons_loc, &p_ssd, &p_max, &p_imax);
1606             /* Check if we are doing the second part of SD */
1607             if (ir->eI == eiSD2 && v == NULL)
1608             {
1609                 i = 2;
1610             }
1611             else
1612             {
1613                 i = 1;
1614             }
1615             lincsd->rmsd_data[0] = ncons_loc;
1616             lincsd->rmsd_data[i] = p_ssd;
1617         }
1618         else
1619         {
1620             lincsd->rmsd_data[0] = 0;
1621             lincsd->rmsd_data[1] = 0;
1622             lincsd->rmsd_data[2] = 0;
1623         }
1624         if (bLog && fplog && lincsd->nc > 0)
1625         {
1626             fprintf(fplog,
1627                     "        After LINCS          %.6f    %.6f %6d %6d\n\n",
1628                     sqrt(p_ssd/ncons_loc), p_max,
1629                     ddglatnr(cr->dd, lincsd->bla[2*p_imax]),
1630                     ddglatnr(cr->dd, lincsd->bla[2*p_imax+1]));
1631         }
1632
1633         if (warn >= 0)
1634         {
1635             if (maxwarn >= 0)
1636             {
1637                 cconerr(cr->dd, lincsd->nc, lincsd->bla, lincsd->bllen, xprime, pbc,
1638                         &ncons_loc, &p_ssd, &p_max, &p_imax);
1639                 if (MULTISIM(cr))
1640                 {
1641                     sprintf(buf3, " in simulation %d", cr->ms->sim);
1642                 }
1643                 else
1644                 {
1645                     buf3[0] = 0;
1646                 }
1647                 sprintf(buf, "\nStep %s, time %g (ps)  LINCS WARNING%s\n"
1648                         "relative constraint deviation after LINCS:\n"
1649                         "rms %.6f, max %.6f (between atoms %d and %d)\n",
1650                         gmx_step_str(step, buf2), ir->init_t+step*ir->delta_t,
1651                         buf3,
1652                         sqrt(p_ssd/ncons_loc), p_max,
1653                         ddglatnr(cr->dd, lincsd->bla[2*p_imax]),
1654                         ddglatnr(cr->dd, lincsd->bla[2*p_imax+1]));
1655                 if (fplog)
1656                 {
1657                     fprintf(fplog, "%s", buf);
1658                 }
1659                 fprintf(stderr, "%s", buf);
1660                 lincs_warning(fplog, cr->dd, x, xprime, pbc,
1661                               lincsd->nc, lincsd->bla, lincsd->bllen,
1662                               ir->LincsWarnAngle, maxwarn, warncount);
1663             }
1664             bOK = (p_max < 0.5);
1665         }
1666
1667         if (lincsd->ncg_flex)
1668         {
1669             for (i = 0; (i < lincsd->nc); i++)
1670             {
1671                 if (lincsd->bllen0[i] == 0 && lincsd->ddist[i] == 0)
1672                 {
1673                     lincsd->bllen[i] = 0;
1674                 }
1675             }
1676         }
1677     }
1678     else
1679     {
1680         /* The OpenMP parallel region of constrain_lincs for derivatives */
1681 #pragma omp parallel num_threads(lincsd->nth)
1682         {
1683             int th = gmx_omp_get_thread_num();
1684
1685             do_lincsp(x, xprime, min_proj, pbc, lincsd, th,
1686                       md->invmass, econq, ir->efep != efepNO ? dvdlambda : NULL,
1687                       bCalcVir, th == 0 ? vir_r_m_dr : lincsd->th[th].vir_r_m_dr);
1688         }
1689     }
1690
1691     if (bCalcVir && lincsd->nth > 1)
1692     {
1693         for (i = 1; i < lincsd->nth; i++)
1694         {
1695             m_add(vir_r_m_dr, lincsd->th[i].vir_r_m_dr, vir_r_m_dr);
1696         }
1697     }
1698
1699     /* count assuming nit=1 */
1700     inc_nrnb(nrnb, eNR_LINCS, lincsd->nc);
1701     inc_nrnb(nrnb, eNR_LINCSMAT, (2+lincsd->nOrder)*lincsd->ncc);
1702     if (lincsd->ntriangle > 0)
1703     {
1704         inc_nrnb(nrnb, eNR_LINCSMAT, lincsd->nOrder*lincsd->ncc_triangle);
1705     }
1706     if (v)
1707     {
1708         inc_nrnb(nrnb, eNR_CONSTR_V, lincsd->nc*2);
1709     }
1710     if (bCalcVir)
1711     {
1712         inc_nrnb(nrnb, eNR_CONSTR_VIR, lincsd->nc);
1713     }
1714
1715     return bOK;
1716 }