#endif
typedef struct {
- int b0; /* first constraint for this thread */
- int b1; /* b1-1 is the last constraint for this thread */
+ int b0; /* first constraint for this task */
+ int b1; /* b1-1 is the last constraint for this task */
+ int ntriangle; /* the number of constraints in triangles */
+ int *triangle; /* the list of triangle constraints */
+ int *tri_bits; /* the bits tell if the matrix element should be used */
+ int tri_alloc; /* allocation size of triangle and tri_bits */
int nind; /* number of indices */
int *ind; /* constraint index for updating atom data */
int nind_r; /* number of indices */
tensor vir_r_m_dr; /* temporary variable for virial calculation */
real dhdlambda; /* temporary variable for lambda derivative */
real *simd_buf; /* Aligned work array for SIMD */
-} lincs_thread_t;
+} lincs_task_t;
typedef struct gmx_lincsdata {
int ncg; /* the global number of constraints */
int ncg_triangle; /* the global number of constraints in triangles */
int nIter; /* the number of iterations */
int nOrder; /* the order of the matrix expansion */
- int nc; /* the number of constraints */
+ int max_connect; /* the maximum number of constraints connected to a single atom */
+
+ int nc_real; /* the number of real constraints */
+ int nc; /* the number of constraints including padding for SIMD */
int nc_alloc; /* the number we allocated memory for */
int ncc; /* the number of constraint connections */
int ncc_alloc; /* the number we allocated memory for */
real matlam; /* the FE lambda value used for filling blc and blmf */
+ int *con_index; /* mapping from topology to LINCS constraints */
real *bllen0; /* the reference distance in topology A */
real *ddist; /* the reference distance in top B - the r.d. in top A */
int *bla; /* the atom pairs involved in the constraints */
int *blnr; /* index into blbnb and blmf */
int *blbnb; /* list of constraint connections */
int ntriangle; /* the local number of constraints in triangles */
- int *triangle; /* the list of triangle constraints */
- int *tri_bits; /* the bits tell if the matrix element should be used */
int ncc_triangle; /* the number of constraint connections in triangles */
gmx_bool bCommIter; /* communicate before each LINCS iteration */
real *blmf; /* matrix of mass factors for constraint connections */
real *blmf1; /* as blmf, but with all masses 1 */
real *bllen; /* the reference bond length */
- int nth; /* The number of threads doing LINCS */
- lincs_thread_t *th; /* LINCS thread division */
+ int *nlocat; /* the local atom count per constraint, can be NULL */
+
+ int ntask; /* The number of tasks = #threads for LINCS */
+ lincs_task_t *task; /* LINCS thread division */
gmx_bitmask_t *atf; /* atom flags for thread parallelization */
int atf_nalloc; /* allocation size of atf */
+ gmx_bool bTaskDep; /* are the LINCS tasks interdependent? */
/* arrays for temporary storage in the LINCS algorithm */
rvec *tmpv;
real *tmpncc;
* constraint data, without an OpenMP barrier.
*/
static void lincs_matrix_expand(const struct gmx_lincsdata *lincsd,
- int b0, int b1,
+ const lincs_task_t *li_task,
const real *blcc,
real *rhs1, real *rhs2, real *sol)
{
- int nrec, rec, b, j, n, nr0, nr1;
- real mvb, *swap;
- int ntriangle, tb, bits;
- const int *blnr = lincsd->blnr, *blbnb = lincsd->blbnb;
- const int *triangle = lincsd->triangle, *tri_bits = lincsd->tri_bits;
+ int b0, b1, nrec, rec;
+ const int *blnr = lincsd->blnr;
+ const int *blbnb = lincsd->blbnb;
- ntriangle = lincsd->ntriangle;
- nrec = lincsd->nOrder;
+ b0 = li_task->b0;
+ b1 = li_task->b1;
+ nrec = lincsd->nOrder;
for (rec = 0; rec < nrec; rec++)
{
+ int b;
+
+ if (lincsd->bTaskDep)
+ {
#pragma omp barrier
+ }
for (b = b0; b < b1; b++)
{
+ real mvb;
+ int n;
+
mvb = 0;
for (n = blnr[b]; n < blnr[b+1]; n++)
{
- j = blbnb[n];
- mvb = mvb + blcc[n]*rhs1[j];
+ mvb = mvb + blcc[n]*rhs1[blbnb[n]];
}
rhs2[b] = mvb;
sol[b] = sol[b] + mvb;
}
+
+ real *swap;
+
swap = rhs1;
rhs1 = rhs2;
rhs2 = swap;
} /* nrec*(ncons+2*nrtot) flops */
- if (ntriangle > 0)
+ if (lincsd->ntriangle > 0)
{
/* Perform an extra nrec recursions for only the constraints
* involved in rigid triangles.
* around 0.7, while the effective eigenvalue for bond constraints
* is around 0.4 (and 0.7*0.7=0.5).
*/
- /* We need to copy the temporary array, since only the elements
- * for constraints involved in triangles are updated and then
- * the pointers are swapped. This saves copying the whole array.
- * We need a barrier as other threads might still be reading from rhs2.
- */
-#pragma omp barrier
- for (b = b0; b < b1; b++)
+
+ if (lincsd->bTaskDep)
{
- rhs2[b] = rhs1[b];
- }
+ /* We need a barrier here, since other threads might still be
+ * reading the contents of rhs1 and/o rhs2.
+ * We could avoid this barrier by introducing two extra rhs
+ * arrays for the triangle constraints only.
+ */
#pragma omp barrier
-#pragma omp master
+ }
+
+ /* Constraints involved in a triangle are ensured to be in the same
+ * LINCS task. This means no barriers are required during the extra
+ * iterations for the triangle constraints.
+ */
+ const int *triangle = li_task->triangle;
+ const int *tri_bits = li_task->tri_bits;
+
+ for (rec = 0; rec < nrec; rec++)
{
- for (rec = 0; rec < nrec; rec++)
+ int tb;
+
+ for (tb = 0; tb < li_task->ntriangle; tb++)
{
- for (tb = 0; tb < ntriangle; tb++)
+ int b, bits, nr0, nr1, n;
+ real mvb;
+
+ b = triangle[tb];
+ bits = tri_bits[tb];
+ mvb = 0;
+ nr0 = blnr[b];
+ nr1 = blnr[b+1];
+ for (n = nr0; n < nr1; n++)
{
- b = triangle[tb];
- bits = tri_bits[tb];
- mvb = 0;
- nr0 = blnr[b];
- nr1 = blnr[b+1];
- for (n = nr0; n < nr1; n++)
+ if (bits & (1 << (n - nr0)))
{
- if (bits & (1<<(n-nr0)))
- {
- j = blbnb[n];
- mvb = mvb + blcc[n]*rhs1[j];
- }
+ mvb = mvb + blcc[n]*rhs1[blbnb[n]];
}
- rhs2[b] = mvb;
- sol[b] = sol[b] + mvb;
}
- swap = rhs1;
- rhs1 = rhs2;
- rhs2 = swap;
+ rhs2[b] = mvb;
+ sol[b] = sol[b] + mvb;
}
- } /* flops count is missing here */
- /* We need a barrier here as the calling routine will continue
- * to operate on the thread-local constraints without barrier.
- */
-#pragma omp barrier
+ real *swap;
+
+ swap = rhs1;
+ rhs1 = rhs2;
+ rhs2 = swap;
+ } /* nrec*(ntriangle + ncc_triangle*2) flops */
}
}
const real *invmass,
rvec *x)
{
- if (li->nth == 1)
+ if (li->ntask == 1)
{
/* Single thread, we simply update for all constraints */
- lincs_update_atoms_noind(li->nc, li->bla, prefac, fac, r, invmass, x);
+ lincs_update_atoms_noind(li->nc_real,
+ li->bla, prefac, fac, r, invmass, x);
}
else
{
* constraints that only access our local atom range.
* This can be done without a barrier.
*/
- lincs_update_atoms_ind(li->th[th].nind, li->th[th].ind,
+ lincs_update_atoms_ind(li->task[th].nind, li->task[th].ind,
li->bla, prefac, fac, r, invmass, x);
- if (li->th[li->nth].nind > 0)
+ if (li->task[li->ntask].nind > 0)
{
/* Update the constraints that operate on atoms
* in multiple thread atom blocks on the master thread.
#pragma omp barrier
#pragma omp master
{
- lincs_update_atoms_ind(li->th[li->nth].nind,
- li->th[li->nth].ind,
+ lincs_update_atoms_ind(li->task[li->ntask].nind,
+ li->task[li->ntask].ind,
li->bla, prefac, fac, r, invmass, x);
}
}
rvec *r;
real *blc, *blmf, *blcc, *rhs1, *rhs2, *sol;
- b0 = lincsd->th[th].b0;
- b1 = lincsd->th[th].b1;
+ b0 = lincsd->task[th].b0;
+ b1 = lincsd->task[th].b1;
bla = lincsd->bla;
r = lincsd->tmpv;
*/
calc_dr_x_f_simd(b0, b1, bla, x, f, blc,
&pbc_simd,
- lincsd->th[th].simd_buf,
- lincsd->th[th].simd_buf + GMX_SIMD_REAL_WIDTH*DIM,
+ lincsd->task[th].simd_buf,
+ lincsd->task[th].simd_buf + GMX_SIMD_REAL_WIDTH*DIM,
r, rhs1, sol);
#else /* LINCS_SIMD */
#endif /* LINCS_SIMD */
- /* We need a barrier, since the matrix construction below
- * can access entries in r of other threads.
- */
+ if (lincsd->bTaskDep)
+ {
+ /* We need a barrier, since the matrix construction below
+ * can access entries in r of other threads.
+ */
#pragma omp barrier
+ }
/* Construct the (sparse) LINCS matrix */
for (b = b0; b < b1; b++)
}
/* Together: 23*ncons + 6*nrtot flops */
- lincs_matrix_expand(lincsd, b0, b1, blcc, rhs1, rhs2, sol);
+ lincs_matrix_expand(lincsd, &lincsd->task[th], blcc, rhs1, rhs2, sol);
/* nrec*(ncons+2*nrtot) flops */
if (econq == econqDeriv_FlexCon)
dhdlambda -= sol[b]*lincsd->ddist[b];
}
- lincsd->th[th].dhdlambda = dhdlambda;
+ lincsd->task[th].dhdlambda = dhdlambda;
}
if (bCalcVir)
real *blc, *blmf, *bllen, *blcc, *rhs1, *rhs2, *sol, *blc_sol, *mlambda;
int *nlocat;
- b0 = lincsd->th[th].b0;
- b1 = lincsd->th[th].b1;
+ b0 = lincsd->task[th].b0;
+ b1 = lincsd->task[th].b1;
bla = lincsd->bla;
r = lincsd->tmpv;
sol = lincsd->tmp3;
blc_sol = lincsd->tmp4;
mlambda = lincsd->mlambda;
-
- if (DOMAINDECOMP(cr) && cr->dd->constraints)
- {
- nlocat = dd_constraints_nlocalatoms(cr->dd);
- }
- else
- {
- nlocat = NULL;
- }
+ nlocat = lincsd->nlocat;
#ifdef LINCS_SIMD
*/
calc_dr_x_xp_simd(b0, b1, bla, x, xp, bllen, blc,
&pbc_simd,
- lincsd->th[th].simd_buf,
- lincsd->th[th].simd_buf + GMX_SIMD_REAL_WIDTH*DIM,
+ lincsd->task[th].simd_buf,
+ lincsd->task[th].simd_buf + GMX_SIMD_REAL_WIDTH*DIM,
r, rhs1, sol);
#else /* LINCS_SIMD */
#endif /* LINCS_SIMD */
- /* We need a barrier, since the matrix construction below
- * can access entries in r of other threads.
- */
+ if (lincsd->bTaskDep)
+ {
+ /* We need a barrier, since the matrix construction below
+ * can access entries in r of other threads.
+ */
#pragma omp barrier
+ }
/* Construct the (sparse) LINCS matrix */
for (b = b0; b < b1; b++)
}
/* Together: 26*ncons + 6*nrtot flops */
- lincs_matrix_expand(lincsd, b0, b1, blcc, rhs1, rhs2, sol);
+ lincs_matrix_expand(lincsd, &lincsd->task[th], blcc, rhs1, rhs2, sol);
/* nrec*(ncons+2*nrtot) flops */
#ifndef LINCS_SIMD
dd_move_x_constraints(cr->dd, box, xp, NULL, FALSE);
}
}
+#pragma omp barrier
}
-
+ else if (lincsd->bTaskDep)
+ {
#pragma omp barrier
+ }
#ifdef LINCS_SIMD
calc_dist_iter_simd(b0, b1, bla, xp, bllen, blc, &pbc_simd, wfac,
- lincsd->th[th].simd_buf, rhs1, sol, bWarn);
+ lincsd->task[th].simd_buf, rhs1, sol, bWarn);
#else
calc_dist_iter(b0, b1, bla, xp, bllen, blc, pbc, wfac,
rhs1, sol, bWarn);
/* 20*ncons flops */
#endif
- lincs_matrix_expand(lincsd, b0, b1, blcc, rhs1, rhs2, sol);
+ lincs_matrix_expand(lincsd, &lincsd->task[th], blcc, rhs1, rhs2, sol);
/* nrec*(ncons+2*nrtot) flops */
#ifndef LINCS_SIMD
if (nlocat != NULL && (bCalcDHDL || bCalcVir))
{
- /* In lincs_update_atoms threads might cross-read mlambda */
+ if (lincsd->bTaskDep)
+ {
+ /* In lincs_update_atoms threads might cross-read mlambda */
#pragma omp barrier
+ }
/* Only account for local atoms */
for (b = b0; b < b1; b++)
*/
dhdl -= lincsd->mlambda[b]*lincsd->ddist[b];
}
- lincsd->th[th].dhdlambda = dhdl;
+ lincsd->task[th].dhdlambda = dhdl;
}
if (bCalcVir)
*/
}
-void set_lincs_matrix(struct gmx_lincsdata *li, real *invmass, real lambda)
+/* Sets the elements in the LINCS matrix for task li_task */
+static void set_lincs_matrix_task(struct gmx_lincsdata *li,
+ lincs_task_t *li_task,
+ const real *invmass,
+ int *ncc_triangle)
{
- int i, a1, a2, n, k, sign, center;
- int end, nk, kk;
- const real invsqrt2 = 0.7071067811865475244;
-
- for (i = 0; (i < li->nc); i++)
- {
- a1 = li->bla[2*i];
- a2 = li->bla[2*i+1];
- li->blc[i] = gmx_invsqrt(invmass[a1] + invmass[a2]);
- li->blc1[i] = invsqrt2;
- }
+ int i;
/* Construct the coupling coefficient matrix blmf */
- li->ntriangle = 0;
- li->ncc_triangle = 0;
- for (i = 0; (i < li->nc); i++)
+ li_task->ntriangle = 0;
+ *ncc_triangle = 0;
+ for (i = li_task->b0; i < li_task->b1; i++)
{
+ int a1, a2, n;
+
a1 = li->bla[2*i];
a2 = li->bla[2*i+1];
for (n = li->blnr[i]; (n < li->blnr[i+1]); n++)
{
+ int k, sign, center, end;
+
k = li->blbnb[n];
+
+ /* If we are using multiple, independent tasks for LINCS,
+ * the calls to check_assign_connected should have
+ * put all connected constraints in our task.
+ */
+ assert(li->bTaskDep || (k >= li_task->b0 && k < li_task->b1));
+
if (a1 == li->bla[2*k] || a2 == li->bla[2*k+1])
{
sign = -1;
li->blmf1[n] = sign*0.5;
if (li->ncg_triangle > 0)
{
+ int nk, kk;
+
/* Look for constraint triangles */
for (nk = li->blnr[k]; (nk < li->blnr[k+1]); nk++)
{
if (kk != i && kk != k &&
(li->bla[2*kk] == end || li->bla[2*kk+1] == end))
{
- if (li->ntriangle == 0 ||
- li->triangle[li->ntriangle-1] < i)
+ /* If we are using multiple tasks for LINCS,
+ * the calls to check_assign_triangle should have
+ * put all constraints in the triangle in our task.
+ */
+ assert(k >= li_task->b0 && k < li_task->b1);
+ assert(kk >= li_task->b0 && kk < li_task->b1);
+
+ if (li_task->ntriangle == 0 ||
+ li_task->triangle[li_task->ntriangle - 1] < i)
{
/* Add this constraint to the triangle list */
- li->triangle[li->ntriangle] = i;
- li->tri_bits[li->ntriangle] = 0;
- li->ntriangle++;
- if (li->blnr[i+1] - li->blnr[i] > static_cast<int>(sizeof(li->tri_bits[0])*8 - 1))
+ li_task->triangle[li_task->ntriangle] = i;
+ li_task->tri_bits[li_task->ntriangle] = 0;
+ li_task->ntriangle++;
+ if (li->blnr[i+1] - li->blnr[i] > static_cast<int>(sizeof(li_task->tri_bits[0])*8 - 1))
{
gmx_fatal(FARGS, "A constraint is connected to %d constraints, this is more than the %d allowed for constraints participating in triangles",
li->blnr[i+1] - li->blnr[i],
- sizeof(li->tri_bits[0])*8-1);
+ sizeof(li_task->tri_bits[0])*8-1);
}
}
- li->tri_bits[li->ntriangle-1] |= (1<<(n-li->blnr[i]));
- li->ncc_triangle++;
+ li_task->tri_bits[li_task->ntriangle-1] |= (1 << (n - li->blnr[i]));
+ (*ncc_triangle)++;
}
}
}
}
}
+}
+
+/* Sets the elements in the LINCS matrix */
+void set_lincs_matrix(struct gmx_lincsdata *li, real *invmass, real lambda)
+{
+ int i;
+ const real invsqrt2 = 0.7071067811865475244;
+
+ for (i = 0; (i < li->nc); i++)
+ {
+ int a1, a2;
+
+ a1 = li->bla[2*i];
+ a2 = li->bla[2*i+1];
+ li->blc[i] = gmx_invsqrt(invmass[a1] + invmass[a2]);
+ li->blc1[i] = invsqrt2;
+ }
+
+ /* Construct the coupling coefficient matrix blmf */
+ int th, ntriangle = 0, ncc_triangle = 0;
+#pragma omp parallel for reduction(+: ntriangle, ncc_triangle) num_threads(li->ntask) schedule(static)
+ for (th = 0; th < li->ntask; th++)
+ {
+ set_lincs_matrix_task(li, &li->task[th], invmass, &ncc_triangle);
+ ntriangle = li->task[th].ntriangle;
+ }
+ li->ntriangle = ntriangle;
+ li->ncc_triangle = ncc_triangle;
if (debug)
{
gmx_bool bPLINCS, int nIter, int nProjOrder)
{
struct gmx_lincsdata *li;
- int mb;
+ int mt, mb;
gmx_moltype_t *molt;
+ gmx_bool bMoreThanTwoSeq;
if (fplog)
{
li->nIter = nIter;
li->nOrder = nProjOrder;
+ li->max_connect = 0;
+ for (mt = 0; mt < mtop->nmoltype; mt++)
+ {
+ int a;
+
+ molt = &mtop->moltype[mt];
+ for (a = 0; a < molt->atoms.nr; a++)
+ {
+ li->max_connect = std::max(li->max_connect,
+ at2con[mt].index[a + 1] - at2con[mt].index[a]);
+ }
+ }
+
li->ncg_triangle = 0;
- li->bCommIter = FALSE;
+ bMoreThanTwoSeq = FALSE;
for (mb = 0; mb < mtop->nmolblock; mb++)
{
molt = &mtop->moltype[mtop->molblock[mb].type];
+
li->ncg_triangle +=
mtop->molblock[mb].nmol*
count_triangle_constraints(molt->ilist,
&at2con[mtop->molblock[mb].type]);
- if (bPLINCS && li->bCommIter == FALSE)
- {
- /* Check if we need to communicate not only before LINCS,
- * but also before each iteration.
- * The check for only two sequential constraints is only
- * useful for the common case of H-bond only constraints.
- * With more effort we could also make it useful for small
- * molecules with nr. sequential constraints <= nOrder-1.
- */
- li->bCommIter = (li->nOrder < 1 || more_than_two_sequential_constraints(molt->ilist, &at2con[mtop->molblock[mb].type]));
+
+ if (!bMoreThanTwoSeq &&
+ more_than_two_sequential_constraints(molt->ilist, &at2con[mtop->molblock[mb].type]))
+ {
+ bMoreThanTwoSeq = TRUE;
}
}
+
+ /* Check if we need to communicate not only before LINCS,
+ * but also before each iteration.
+ * The check for only two sequential constraints is only
+ * useful for the common case of H-bond only constraints.
+ * With more effort we could also make it useful for small
+ * molecules with nr. sequential constraints <= nOrder-1.
+ */
+ li->bCommIter = (bPLINCS && (li->nOrder < 1 || bMoreThanTwoSeq));
+
if (debug && bPLINCS)
{
fprintf(debug, "PLINCS communication before each iteration: %d\n",
/* LINCS can run on any number of threads.
* Currently the number is fixed for the whole simulation,
* but it could be set in set_lincs().
+ * The current constraint to task assignment code can create independent
+ * tasks only when not more than two constraints are connected sequentially.
*/
- li->nth = gmx_omp_nthreads_get(emntLINCS);
+ li->ntask = gmx_omp_nthreads_get(emntLINCS);
+ li->bTaskDep = (li->ntask > 1 && bMoreThanTwoSeq);
if (debug)
{
- fprintf(debug, "LINCS: using %d threads\n", li->nth);
+ fprintf(debug, "LINCS: using %d threads, tasks are %sdependent\n",
+ li->ntask, li->bTaskDep ? "" : "in");
}
- if (li->nth == 1)
+ if (li->ntask == 1)
{
- snew(li->th, 1);
+ snew(li->task, 1);
}
else
{
- /* Allocate an extra elements for "thread-overlap" constraints */
- snew(li->th, li->nth+1);
+ /* Allocate an extra element for "task-overlap" constraints */
+ snew(li->task, li->ntask + 1);
}
int th;
-#pragma omp parallel for num_threads(li->nth)
- for (th = 0; th < li->nth; th++)
+#pragma omp parallel for num_threads(li->ntask)
+ for (th = 0; th < li->ntask; th++)
{
/* Per thread SIMD load buffer for loading 2 simd_width rvecs */
- snew_aligned(li->th[th].simd_buf, 2*simd_width*DIM,
+ snew_aligned(li->task[th].simd_buf, 2*simd_width*DIM,
align_bytes);
}
/* Sets up the work division over the threads */
static void lincs_thread_setup(struct gmx_lincsdata *li, int natoms)
{
- lincs_thread_t *li_m;
+ lincs_task_t *li_m;
int th;
gmx_bitmask_t *atf;
int a;
bitmask_clear(&atf[a]);
}
- if (li->nth > BITMASK_SIZE)
+ if (li->ntask > BITMASK_SIZE)
{
gmx_fatal(FARGS, "More than %d threads is not supported for LINCS.", BITMASK_SIZE);
}
- int block = simd_width;
-
- for (th = 0; th < li->nth; th++)
+ for (th = 0; th < li->ntask; th++)
{
- lincs_thread_t *li_th;
- int b;
+ lincs_task_t *li_task;
+ int b;
- li_th = &li->th[th];
-
- /* The constraints are divided equally over the threads,
- * in chunks of size block to ensure alignment for SIMD.
- */
- li_th->b0 = (((li->nc + block - 1)* th )/(block*li->nth))*block;
- li_th->b1 = (((li->nc + block - 1)*(th+1))/(block*li->nth))*block;
- li_th->b1 = std::min<int>(li_th->b1, li->nc);
+ li_task = &li->task[th];
/* For each atom set a flag for constraints from each */
- for (b = li_th->b0; b < li_th->b1; b++)
+ for (b = li_task->b0; b < li_task->b1; b++)
{
- bitmask_set_bit(&atf[li->bla[b*2]], th);
- bitmask_set_bit(&atf[li->bla[b*2+1]], th);
+ bitmask_set_bit(&atf[li->bla[b*2 ]], th);
+ bitmask_set_bit(&atf[li->bla[b*2 + 1]], th);
}
}
-#pragma omp parallel for num_threads(li->nth) schedule(static)
- for (th = 0; th < li->nth; th++)
+#pragma omp parallel for num_threads(li->ntask) schedule(static)
+ for (th = 0; th < li->ntask; th++)
{
- lincs_thread_t *li_th;
- gmx_bitmask_t mask;
- int b;
+ lincs_task_t *li_task;
+ gmx_bitmask_t mask;
+ int b;
- li_th = &li->th[th];
+ li_task = &li->task[th];
- if (li_th->b1 - li_th->b0 > li_th->ind_nalloc)
+ if (li_task->b1 - li_task->b0 > li_task->ind_nalloc)
{
- li_th->ind_nalloc = over_alloc_large(li_th->b1-li_th->b0);
- srenew(li_th->ind, li_th->ind_nalloc);
- srenew(li_th->ind_r, li_th->ind_nalloc);
+ li_task->ind_nalloc = over_alloc_large(li_task->b1-li_task->b0);
+ srenew(li_task->ind, li_task->ind_nalloc);
+ srenew(li_task->ind_r, li_task->ind_nalloc);
}
bitmask_init_low_bits(&mask, th);
- li_th->nind = 0;
- li_th->nind_r = 0;
- for (b = li_th->b0; b < li_th->b1; b++)
+ li_task->nind = 0;
+ li_task->nind_r = 0;
+ for (b = li_task->b0; b < li_task->b1; b++)
{
/* We let the constraint with the lowest thread index
* operate on atoms with constraints from multiple threads.
bitmask_is_disjoint(atf[li->bla[b*2+1]], mask))
{
/* Add the constraint to the local atom update index */
- li_th->ind[li_th->nind++] = b;
+ li_task->ind[li_task->nind++] = b;
}
else
{
/* Add the constraint to the rest block */
- li_th->ind_r[li_th->nind_r++] = b;
+ li_task->ind_r[li_task->nind_r++] = b;
}
}
}
/* We need to copy all constraints which have not be assigned
* to a thread to a separate list which will be handled by one thread.
*/
- li_m = &li->th[li->nth];
+ li_m = &li->task[li->ntask];
li_m->nind = 0;
- for (th = 0; th < li->nth; th++)
+ for (th = 0; th < li->ntask; th++)
{
- lincs_thread_t *li_th;
- int b;
+ lincs_task_t *li_task;
+ int b;
- li_th = &li->th[th];
+ li_task = &li->task[th];
- if (li_m->nind + li_th->nind_r > li_m->ind_nalloc)
+ if (li_m->nind + li_task->nind_r > li_m->ind_nalloc)
{
- li_m->ind_nalloc = over_alloc_large(li_m->nind+li_th->nind_r);
+ li_m->ind_nalloc = over_alloc_large(li_m->nind+li_task->nind_r);
srenew(li_m->ind, li_m->ind_nalloc);
}
- for (b = 0; b < li_th->nind_r; b++)
+ for (b = 0; b < li_task->nind_r; b++)
{
- li_m->ind[li_m->nind++] = li_th->ind_r[b];
+ li_m->ind[li_m->nind++] = li_task->ind_r[b];
}
if (debug)
{
fprintf(debug, "LINCS thread %d: %d constraints\n",
- th, li_th->nind);
+ th, li_task->nind);
}
}
snew_aligned(*ptr, nelem, align_bytes);
}
+static void assign_constraint(struct gmx_lincsdata *li,
+ int constraint_index,
+ int a1, int a2,
+ real lenA, real lenB,
+ const t_blocka *at2con)
+{
+ int con;
+
+ con = li->nc;
+
+ /* Make an mapping of local topology constraint index to LINCS index */
+ li->con_index[constraint_index] = con;
+
+ li->bllen0[con] = lenA;
+ li->ddist[con] = lenB - lenA;
+ /* Set the length to the topology A length */
+ li->bllen[con] = lenA;
+ li->bla[2*con] = a1;
+ li->bla[2*con+1] = a2;
+
+ /* Make space in the constraint connection matrix for constraints
+ * connected to both end of the current constraint.
+ */
+ li->ncc +=
+ at2con->index[a1 + 1] - at2con->index[a1] - 1 +
+ at2con->index[a2 + 1] - at2con->index[a2] - 1;
+
+ li->blnr[con + 1] = li->ncc;
+
+ /* Increase the constraint count */
+ li->nc++;
+}
+
+/* Check if constraint with topology index constraint_index is connected
+ * to other constraints, and if so add those connected constraints to our task.
+ */
+static void check_assign_connected(struct gmx_lincsdata *li,
+ const t_iatom *iatom,
+ const t_idef *idef,
+ int bDynamics,
+ int a1, int a2,
+ const t_blocka *at2con)
+{
+ /* Currently this function only supports constraint groups
+ * in which all constraints share at least one atom
+ * (e.g. H-bond constraints).
+ * Check both ends of the current constraint for
+ * connected constraints. We need to assign those
+ * to the same task.
+ */
+ int end;
+
+ for (end = 0; end < 2; end++)
+ {
+ int a, k;
+
+ a = (end == 0 ? a1 : a2);
+
+ for (k = at2con->index[a]; k < at2con->index[a + 1]; k++)
+ {
+ int cc;
+
+ cc = at2con->a[k];
+ /* Check if constraint cc has not yet been assigned */
+ if (li->con_index[cc] == -1)
+ {
+ int type;
+ real lenA, lenB;
+
+ type = iatom[cc*3];
+ lenA = idef->iparams[type].constr.dA;
+ lenB = idef->iparams[type].constr.dB;
+
+ if (bDynamics || lenA != 0 || lenB != 0)
+ {
+ assign_constraint(li, cc, iatom[3*cc + 1], iatom[3*cc + 2], lenA, lenB, at2con);
+ }
+ }
+ }
+ }
+}
+
+/* Check if constraint with topology index constraint_index is involved
+ * in a constraint triangle, and if so add the other two constraints
+ * in the triangle to our task.
+ */
+static void check_assign_triangle(struct gmx_lincsdata *li,
+ const t_iatom *iatom,
+ const t_idef *idef,
+ int bDynamics,
+ int constraint_index,
+ int a1, int a2,
+ const t_blocka *at2con)
+{
+ int nca, cc[32], ca[32], k;
+ int c_triangle[2] = { -1, -1 };
+
+ nca = 0;
+ for (k = at2con->index[a1]; k < at2con->index[a1 + 1]; k++)
+ {
+ int c;
+
+ c = at2con->a[k];
+ if (c != constraint_index)
+ {
+ int aa1, aa2;
+
+ aa1 = iatom[c*3 + 1];
+ aa2 = iatom[c*3 + 2];
+ if (aa1 != a1)
+ {
+ cc[nca] = c;
+ ca[nca] = aa1;
+ nca++;
+ }
+ if (aa2 != a1)
+ {
+ cc[nca] = c;
+ ca[nca] = aa2;
+ nca++;
+ }
+ }
+ }
+
+ for (k = at2con->index[a2]; k < at2con->index[a2 + 1]; k++)
+ {
+ int c;
+
+ c = at2con->a[k];
+ if (c != constraint_index)
+ {
+ int aa1, aa2, i;
+
+ aa1 = iatom[c*3 + 1];
+ aa2 = iatom[c*3 + 2];
+ if (aa1 != a2)
+ {
+ for (i = 0; i < nca; i++)
+ {
+ if (aa1 == ca[i])
+ {
+ c_triangle[0] = cc[i];
+ c_triangle[1] = c;
+ }
+ }
+ }
+ if (aa2 != a2)
+ {
+ for (i = 0; i < nca; i++)
+ {
+ if (aa2 == ca[i])
+ {
+ c_triangle[0] = cc[i];
+ c_triangle[1] = c;
+ }
+ }
+ }
+ }
+ }
+
+ if (c_triangle[0] >= 0)
+ {
+ int end;
+
+ for (end = 0; end < 2; end++)
+ {
+ /* Check if constraint c_triangle[end] has not yet been assigned */
+ if (li->con_index[c_triangle[end]] == -1)
+ {
+ int i, type;
+ real lenA, lenB;
+
+ i = c_triangle[end]*3;
+ type = iatom[i];
+ lenA = idef->iparams[type].constr.dA;
+ lenB = idef->iparams[type].constr.dB;
+
+ if (bDynamics || lenA != 0 || lenB != 0)
+ {
+ assign_constraint(li, c_triangle[end], iatom[i + 1], iatom[i + 2], lenA, lenB, at2con);
+ }
+ }
+ }
+ }
+}
+
+static void set_matrix_indices(struct gmx_lincsdata *li,
+ const lincs_task_t *li_task,
+ const t_blocka *at2con,
+ gmx_bool bSortMatrix)
+{
+ int b;
+
+ for (b = li_task->b0; b < li_task->b1; b++)
+ {
+ int a1, a2, i, k;
+
+ a1 = li->bla[b*2];
+ a2 = li->bla[b*2 + 1];
+
+ i = li->blnr[b];
+ for (k = at2con->index[a1]; k < at2con->index[a1 + 1]; k++)
+ {
+ int concon;
+
+ concon = li->con_index[at2con->a[k]];
+ if (concon != b)
+ {
+ li->blbnb[i++] = concon;
+ }
+ }
+ for (k = at2con->index[a2]; k < at2con->index[a2 + 1]; k++)
+ {
+ int concon;
+
+ concon = li->con_index[at2con->a[k]];
+ if (concon != b)
+ {
+ li->blbnb[i++] = concon;
+ }
+ }
+
+ if (bSortMatrix)
+ {
+ /* Order the blbnb matrix to optimize memory access */
+ qsort(&(li->blbnb[li->blnr[b]]), li->blnr[b + 1] - li->blnr[b],
+ sizeof(li->blbnb[0]), int_comp);
+ }
+ }
+}
+
+/* Initialize the LINCS data structures for the current local topology.
+ * All constraints are distributed over li->ntask tasks, each task's
+ * arrays are padded to a multiple of simd_width, and the constraint
+ * coupling matrix blbnb is (re)built.
+ */
 void set_lincs(t_idef *idef, t_mdatoms *md,
                gmx_bool bDynamics, t_commrec *cr,
                struct gmx_lincsdata *li)
 {
-    int start, natoms, nflexcon;
+    int natoms, nflexcon;
     t_blocka at2con;
     t_iatom *iatom;
-    int i, k, ncc_alloc, ncon_tot, con, nconnect, concon;
-    int type, a1, a2;
-    real lenA = 0, lenB;
+    int i, ncc_alloc_old, ncon_tot;
-    li->nc = 0;
-    li->ncc = 0;
+    li->nc_real = 0;
+    li->nc = 0;
+    li->ncc = 0;
     /* Zero the thread index ranges.
      * Otherwise without local constraints we could return with old ranges.
      */
-    for (i = 0; i < li->nth; i++)
+    for (i = 0; i < li->ntask; i++)
     {
-        li->th[i].b0 = 0;
-        li->th[i].b1 = 0;
-        li->th[i].nind = 0;
+        li->task[i].b0 = 0;
+        li->task[i].b1 = 0;
+        li->task[i].nind = 0;
     }
-    if (li->nth > 1)
+    if (li->ntask > 1)
     {
-        li->th[li->nth].nind = 0;
+        li->task[li->ntask].nind = 0;
     }
     /* This is the local topology, so there are only F_CONSTR constraints */
     {
         if (cr->dd->constraints)
         {
+            int start;
+
             dd_get_constraint_range(cr->dd, &start, &natoms);
         }
         else
         {
             natoms = cr->dd->nat_home;
         }
-        start = 0;
     }
     else
     {
-        start = 0;
         natoms = md->homenr;
     }
-    at2con = make_at2con(start, natoms, idef->il, idef->iparams, bDynamics,
+    at2con = make_at2con(0, natoms, idef->il, idef->iparams, bDynamics,
                          &nflexcon);
     ncon_tot = idef->il[F_CONSTR].nr/3;
-    /* Ensure we have enough space for aligned loads beyond ncon_tot */
-    if (ncon_tot + simd_width > li->nc_alloc || li->nc_alloc == 0)
+    /* Ensure we have enough padding for aligned loads for each thread */
+    if (ncon_tot + li->ntask*simd_width > li->nc_alloc || li->nc_alloc == 0)
     {
-        li->nc_alloc = over_alloc_dd(ncon_tot + simd_width);
+        li->nc_alloc = over_alloc_dd(ncon_tot + li->ntask*simd_width);
+        srenew(li->con_index, li->nc_alloc);
         resize_real_aligned(&li->bllen0, li->nc_alloc);
         resize_real_aligned(&li->ddist, li->nc_alloc);
         srenew(li->bla, 2*li->nc_alloc);
         resize_real_aligned(&li->blc, li->nc_alloc);
         resize_real_aligned(&li->blc1, li->nc_alloc);
-        srenew(li->blnr, li->nc_alloc+1);
+        srenew(li->blnr, li->nc_alloc + 1);
         resize_real_aligned(&li->bllen, li->nc_alloc);
         srenew(li->tmpv, li->nc_alloc);
+        if (DOMAINDECOMP(cr))
+        {
+            srenew(li->nlocat, li->nc_alloc);
+        }
         resize_real_aligned(&li->tmp1, li->nc_alloc);
         resize_real_aligned(&li->tmp2, li->nc_alloc);
         resize_real_aligned(&li->tmp3, li->nc_alloc);
         resize_real_aligned(&li->tmp4, li->nc_alloc);
         resize_real_aligned(&li->mlambda, li->nc_alloc);
-        if (li->ncg_triangle > 0)
-        {
-            /* This is allocating too much, but it is difficult to improve */
-            srenew(li->triangle, li->nc_alloc);
-            srenew(li->tri_bits, li->nc_alloc);
-        }
     }
     iatom = idef->il[F_CONSTR].iatoms;
-    ncc_alloc = li->ncc_alloc;
-    li->blnr[0] = 0;
-
-    con = 0;
-    nconnect = 0;
-    li->blnr[con] = nconnect;
-    for (i = 0; i < ncon_tot; i++)
-    {
-        type = iatom[3*i];
-        a1 = iatom[3*i+1];
-        a2 = iatom[3*i+2];
-        lenA = idef->iparams[type].constr.dA;
-        lenB = idef->iparams[type].constr.dB;
-        /* Skip the flexible constraints when not doing dynamics */
-        if (bDynamics || lenA != 0 || lenB != 0)
-        {
-            li->bllen0[con] = lenA;
-            li->ddist[con] = lenB - lenA;
-            /* Set the length to the topology A length */
-            li->bllen[con] = li->bllen0[con];
-            li->bla[2*con] = a1;
-            li->bla[2*con+1] = a2;
-            /* Construct the constraint connection matrix blbnb */
-            for (k = at2con.index[a1-start]; k < at2con.index[a1-start+1]; k++)
+    ncc_alloc_old = li->ncc_alloc;
+    li->blnr[0] = li->ncc;
+
+    /* Assign the constraints for li->ntask LINCS tasks.
+     * We target a uniform distribution of constraints over the tasks.
+     * Note that when flexible constraints are present, but are removed here
+     * (e.g. because we are doing EM) we get imbalance, but since that doesn't
+     * happen during normal MD, that's ok.
+     */
+    int ncon_assign, ncon_target, con, th;
+
+    /* Determine the number of constraints we need to assign here */
+    ncon_assign = ncon_tot;
+    if (!bDynamics)
+    {
+        /* With energy minimization, flexible constraints are ignored
+         * (and thus minimized, as they should be).
+         */
+        ncon_assign -= nflexcon;
+    }
+
+    /* Set the target constraint count per task to exactly uniform,
+     * this might be overridden below.
+     */
+    ncon_target = (ncon_assign + li->ntask - 1)/li->ntask;
+
+    /* Mark all constraints as unassigned by setting their index to -1 */
+    for (con = 0; con < ncon_tot; con++)
+    {
+        li->con_index[con] = -1;
+    }
+
+    con = 0;
+    for (th = 0; th < li->ntask; th++)
+    {
+        lincs_task_t *li_task;
+
+        li_task = &li->task[th];
+
+#ifdef LINCS_SIMD
+        /* With independent tasks we likely have H-bond constraints or constraint
+         * pairs. The connected constraints will be pulled into the task, so the
+         * constraints per task will often exceed ncon_target.
+         * Triangle constraints can also increase the count, but there are
+         * relatively few of those, so we usually expect to get ncon_target.
+         */
+        if (li->bTaskDep)
+        {
+            /* We round ncon_target to a multiple of GMX_SIMD_WIDTH,
+             * since otherwise a lot of operations can be wasted.
+             * There are several ways to round here, we choose the one
+             * that alternates block sizes, which helps with Intel HT.
+             */
+            ncon_target = ((ncon_assign*(th + 1))/li->ntask - li->nc_real + GMX_SIMD_REAL_WIDTH - 1) & ~(GMX_SIMD_REAL_WIDTH - 1);
+        }
+#endif
+
+        /* Continue filling the arrays where we left off with the previous task,
+         * including padding for SIMD.
+         */
+        li_task->b0 = li->nc;
+
+        while (con < ncon_tot && li->nc - li_task->b0 < ncon_target)
+        {
+            if (li->con_index[con] == -1)
             {
-                concon = at2con.a[k];
-                if (concon != i)
+                int type, a1, a2;
+                real lenA, lenB;
+
+                type = iatom[3*con];
+                a1 = iatom[3*con + 1];
+                a2 = iatom[3*con + 2];
+                lenA = idef->iparams[type].constr.dA;
+                lenB = idef->iparams[type].constr.dB;
+                /* Skip the flexible constraints when not doing dynamics */
+                if (bDynamics || lenA != 0 || lenB != 0)
                 {
-                    if (nconnect >= ncc_alloc)
+                    assign_constraint(li, con, a1, a2, lenA, lenB, &at2con);
+
+                    if (li->ntask > 1 && !li->bTaskDep)
                     {
-                        ncc_alloc = over_alloc_small(nconnect+1);
-                        srenew(li->blbnb, ncc_alloc);
+                        /* We can generate independent tasks. Check if we
+                         * need to assign connected constraints to our task.
+                         */
+                        check_assign_connected(li, iatom, idef, bDynamics,
+                                               a1, a2, &at2con);
                     }
-                    li->blbnb[nconnect++] = concon;
-                }
-            }
-            for (k = at2con.index[a2-start]; k < at2con.index[a2-start+1]; k++)
-            {
-                concon = at2con.a[k];
-                if (concon != i)
-                {
-                    if (nconnect+1 > ncc_alloc)
+                    if (li->ntask > 1 && li->ncg_triangle > 0)
                     {
-                        ncc_alloc = over_alloc_small(nconnect+1);
-                        srenew(li->blbnb, ncc_alloc);
+                        /* Ensure constraints in one triangle are assigned
+                         * to the same task.
+                         */
+                        check_assign_triangle(li, iatom, idef, bDynamics,
+                                              con, a1, a2, &at2con);
                     }
-                    li->blbnb[nconnect++] = concon;
                 }
             }
-            li->blnr[con+1] = nconnect;
-            if (cr->dd == NULL)
+            con++;
+        }
+
+        li_task->b1 = li->nc;
+
+        if (simd_width > 1)
+        {
+            /* Copy the last atom pair indices and lengths for constraints
+             * up to a multiple of simd_width, such that we can do all
+             * SIMD operations without having to worry about end effects.
+             */
+            int i, last;
+
+            li->nc = ((li_task->b1 + simd_width - 1)/simd_width)*simd_width;
+            last = li_task->b1 - 1;
+            for (i = li_task->b1; i < li->nc; i++)
             {
-                /* Order the blbnb matrix to optimize memory access */
-                qsort(&(li->blbnb[li->blnr[con]]), li->blnr[con+1]-li->blnr[con],
-                      sizeof(li->blbnb[0]), int_comp);
+                li->bla[i*2 ] = li->bla[last*2 ];
+                li->bla[i*2 + 1] = li->bla[last*2 + 1];
+                li->bllen0[i] = li->bllen0[last];
+                li->ddist[i] = li->ddist[last];
+                li->bllen[i] = li->bllen[last];
+                li->blnr[i + 1] = li->blnr[last + 1];
            }
-            /* Increase the constraint count */
-            con++;
        }
+
+        /* Keep track of how many constraints we assigned */
+        li->nc_real += li_task->b1 - li_task->b0;
+
+        if (debug)
+        {
+            fprintf(debug, "LINCS task %d constraints %d - %d\n",
+                    th, li_task->b0, li_task->b1);
+        }
+    }
+
+    /* Every real (non-flexible) constraint must now have been assigned */
+    assert(li->nc_real == ncon_assign);
+
+    gmx_bool bSortMatrix;
+
+    /* Without DD we order the blbnb matrix to optimize memory access.
+     * With DD the overhead of sorting is more than the gain during access.
+     */
+    bSortMatrix = !DOMAINDECOMP(cr);
+
+    if (li->ncc > li->ncc_alloc)
+    {
+        li->ncc_alloc = over_alloc_small(li->ncc);
+        srenew(li->blbnb, li->ncc_alloc);
    }
-    if (simd_width > 1)
+
+#pragma omp parallel for num_threads(li->ntask) schedule(static)
+    for (th = 0; th < li->ntask; th++)
    {
-        /* Copy the last atom pair indices and lengths for constraints
-         * up to a multiple of simd_width, such that we can do all
-         * SIMD operations without have to worry about end effects.
-         */
-        int con_roundup, i;
+        lincs_task_t *li_task;
+
+        li_task = &li->task[th];
-        con_roundup = ((con + simd_width - 1)/simd_width)*simd_width;
-        for (i = con; i < con_roundup; i++)
+        if (li->ncg_triangle > 0 &&
+            li_task->b1 - li_task->b0 > li_task->tri_alloc)
        {
-            li->bla[i*2 ] = li->bla[(con - 1)*2 ];
-            li->bla[i*2 + 1] = li->bla[(con - 1)*2 + 1];
-            li->bllen[i] = li->bllen[con - 1];
+            /* This is allocating too much, but it is difficult to improve */
+            li_task->tri_alloc = over_alloc_dd(li_task->b1 - li_task->b0);
+            srenew(li_task->triangle, li_task->tri_alloc);
+            srenew(li_task->tri_bits, li_task->tri_alloc);
        }
+
+        set_matrix_indices(li, li_task, &at2con, bSortMatrix);
    }
    done_blocka(&at2con);
-    /* This is the real number of constraints,
-     * without dynamics the flexible constraints are not present.
-     */
-    li->nc = con;
-
-    li->ncc = li->blnr[con];
    if (cr->dd == NULL)
    {
-        /* Since the matrix is static, we can free some memory */
-        ncc_alloc = li->ncc;
-        srenew(li->blbnb, ncc_alloc);
+        /* Since the matrix is static, we should free some memory */
+        li->ncc_alloc = li->ncc;
+        srenew(li->blbnb, li->ncc_alloc);
    }
-    if (ncc_alloc > li->ncc_alloc)
+    if (li->ncc_alloc > ncc_alloc_old)
    {
-        li->ncc_alloc = ncc_alloc;
        srenew(li->blmf, li->ncc_alloc);
        srenew(li->blmf1, li->ncc_alloc);
        srenew(li->tmpncc, li->ncc_alloc);
    }
-    if (debug)
+    if (DOMAINDECOMP(cr) && dd_constraints_nlocalatoms(cr->dd) != NULL)
+    {
+        int *nlocat_dd;
+
+        nlocat_dd = dd_constraints_nlocalatoms(cr->dd);
+
+        /* Convert nlocat from local topology to LINCS constraint indexing.
+         * NOTE(review): con_index[con] is -1 for flexible constraints that
+         * were skipped above (no dynamics), which would index nlocat[-1];
+         * presumably DD runs reaching this branch always do dynamics --
+         * confirm, or guard on con_index[con] >= 0.
+         */
+        for (con = 0; con < ncon_tot; con++)
+        {
+            li->nlocat[li->con_index[con]] = nlocat_dd[con];
+        }
+    }
+    else
    {
-        fprintf(debug, "Number of constraints is %d, couplings %d\n",
-                li->nc, li->ncc);
+        li->nlocat = NULL;
    }
-    if (li->nth == 1)
+    if (debug)
    {
-        li->th[0].b0 = 0;
-        li->th[0].b1 = li->nc;
+        fprintf(debug, "Number of constraints is %d, padded %d, couplings %d\n",
+                li->nc_real, li->nc, li->ncc);
    }
-    else
+
+    if (li->ntask > 1)
    {
        lincs_thread_setup(li, md->nr);
    }
 }
 }
-static void cconerr(gmx_domdec_t *dd,
-                    int ncons, int *bla, real *bllen, rvec *x, t_pbc *pbc,
+/* Compute constraint-deviation statistics over the real (unpadded)
+ * constraint ranges of all LINCS tasks: the (nlocat-weighted) constraint
+ * count, the sum of squared relative length deviations, the maximum
+ * relative deviation and the index of the worst constraint.
+ */
+static void cconerr(const struct gmx_lincsdata *lincsd,
+                    rvec *x, t_pbc *pbc,
                     real *ncons_loc, real *ssd, real *max, int *imax)
 {
-    real len, d, ma, ssd2, r2;
-    int *nlocat, count, b, im;
-    rvec dx;
+    const int *bla, *nlocat;
+    const real *bllen;
+    real ma, ssd2;
+    int count, im, task;
-    if (dd && dd->constraints)
-    {
-        nlocat = dd_constraints_nlocalatoms(dd);
-    }
-    else
-    {
-        nlocat = 0;
-    }
+    /* With DD, nlocat was filled in set_lincs; it is NULL otherwise */
+    bla = lincsd->bla;
+    bllen = lincsd->bllen;
+    nlocat = lincsd->nlocat;
     ma = 0;
     ssd2 = 0;
     im = 0;
     count = 0;
-    for (b = 0; b < ncons; b++)
+    for (task = 0; task < lincsd->ntask; task++)
     {
-        if (pbc)
-        {
-            pbc_dx_aiuc(pbc, x[bla[2*b]], x[bla[2*b+1]], dx);
-        }
-        else
-        {
-            rvec_sub(x[bla[2*b]], x[bla[2*b+1]], dx);
-        }
-        r2 = norm2(dx);
-        len = r2*gmx_invsqrt(r2);
-        d = fabs(len/bllen[b]-1);
-        if (d > ma && (nlocat == NULL || nlocat[b]))
-        {
-            ma = d;
-            im = b;
-        }
-        if (nlocat == NULL)
-        {
-            ssd2 += d*d;
-            count++;
-        }
-        else
+        int b;
+
+        /* [b0,b1) is the real constraint range of this task, padding excluded */
+        for (b = lincsd->task[task].b0; b < lincsd->task[task].b1; b++)
         {
-            ssd2 += nlocat[b]*d*d;
-            count += nlocat[b];
+            real len, d, r2;
+            rvec dx;
+
+            if (pbc)
+            {
+                pbc_dx_aiuc(pbc, x[bla[2*b]], x[bla[2*b+1]], dx);
+            }
+            else
+            {
+                rvec_sub(x[bla[2*b]], x[bla[2*b+1]], dx);
+            }
+            r2 = norm2(dx);
+            len = r2*gmx_invsqrt(r2);
+            /* Relative deviation of the actual from the reference length */
+            d = fabs(len/bllen[b]-1);
+            if (d > ma && (nlocat == NULL || nlocat[b]))
+            {
+                ma = d;
+                im = b;
+            }
+            if (nlocat == NULL)
+            {
+                ssd2 += d*d;
+                count++;
+            }
+            else
+            {
+                /* Weight by the local atom count, presumably to avoid
+                 * double counting constraints shared between DD
+                 * domains -- confirm against dd_constraints_nlocalatoms.
+                 */
+                ssd2 += nlocat[b]*d*d;
+                count += nlocat[b];
+            }
         }
     }
if (bLog && fplog)
{
- cconerr(cr->dd, lincsd->nc, lincsd->bla, lincsd->bllen, xprime, pbc,
+ cconerr(lincsd, xprime, pbc,
&ncons_loc, &p_ssd, &p_max, &p_imax);
}
bWarn = FALSE;
/* The OpenMP parallel region of constrain_lincs for coords */
-#pragma omp parallel num_threads(lincsd->nth)
+#pragma omp parallel num_threads(lincsd->ntask)
{
int th = gmx_omp_get_thread_num();
- clear_mat(lincsd->th[th].vir_r_m_dr);
+ clear_mat(lincsd->task[th].vir_r_m_dr);
do_lincs(x, xprime, box, pbc, lincsd, th,
md->invmass, cr,
bCalcDHDL,
ir->LincsWarnAngle, &bWarn,
invdt, v, bCalcVir,
- th == 0 ? vir_r_m_dr : lincsd->th[th].vir_r_m_dr);
+ th == 0 ? vir_r_m_dr : lincsd->task[th].vir_r_m_dr);
}
if (bLog && fplog && lincsd->nc > 0)
}
if (bLog || bEner)
{
- cconerr(cr->dd, lincsd->nc, lincsd->bla, lincsd->bllen, xprime, pbc,
+ cconerr(lincsd, xprime, pbc,
&ncons_loc, &p_ssd, &p_max, &p_imax);
/* Check if we are doing the second part of SD */
if (ir->eI == eiSD2 && v == NULL)
{
if (maxwarn >= 0)
{
- cconerr(cr->dd, lincsd->nc, lincsd->bla, lincsd->bllen, xprime, pbc,
+ cconerr(lincsd, xprime, pbc,
&ncons_loc, &p_ssd, &p_max, &p_imax);
if (MULTISIM(cr))
{
else
{
/* The OpenMP parallel region of constrain_lincs for derivatives */
-#pragma omp parallel num_threads(lincsd->nth)
+#pragma omp parallel num_threads(lincsd->ntask)
{
int th = gmx_omp_get_thread_num();
do_lincsp(x, xprime, min_proj, pbc, lincsd, th,
md->invmass, econq, bCalcDHDL,
- bCalcVir, th == 0 ? vir_r_m_dr : lincsd->th[th].vir_r_m_dr);
+ bCalcVir, th == 0 ? vir_r_m_dr : lincsd->task[th].vir_r_m_dr);
}
}
int th;
dhdlambda = 0;
- for (th = 0; th < lincsd->nth; th++)
+ for (th = 0; th < lincsd->ntask; th++)
{
- dhdlambda += lincsd->th[th].dhdlambda;
+ dhdlambda += lincsd->task[th].dhdlambda;
}
if (econqCoord)
{
*dvdlambda += dhdlambda;
}
- if (bCalcVir && lincsd->nth > 1)
+ if (bCalcVir && lincsd->ntask > 1)
{
- for (i = 1; i < lincsd->nth; i++)
+ for (i = 1; i < lincsd->ntask; i++)
{
- m_add(vir_r_m_dr, lincsd->th[i].vir_r_m_dr, vir_r_m_dr);
+ m_add(vir_r_m_dr, lincsd->task[i].vir_r_m_dr, vir_r_m_dr);
}
}
/* count assuming nit=1 */
- inc_nrnb(nrnb, eNR_LINCS, lincsd->nc);
+ inc_nrnb(nrnb, eNR_LINCS, lincsd->nc_real);
inc_nrnb(nrnb, eNR_LINCSMAT, (2+lincsd->nOrder)*lincsd->ncc);
if (lincsd->ntriangle > 0)
{
}
if (v)
{
- inc_nrnb(nrnb, eNR_CONSTR_V, lincsd->nc*2);
+ inc_nrnb(nrnb, eNR_CONSTR_V, lincsd->nc_real*2);
}
if (bCalcVir)
{
- inc_nrnb(nrnb, eNR_CONSTR_VIR, lincsd->nc);
+ inc_nrnb(nrnb, eNR_CONSTR_VIR, lincsd->nc_real);
}
return bOK;