snew(ekind->ekin_work_alloc, nthread);
snew(ekind->ekin_work, nthread);
+ snew(ekind->dekindl_work, nthread);
#pragma omp parallel for num_threads(nthread) schedule(static)
for (thread = 0; thread < nthread; thread++)
{
- /* Allocate 2 elements extra on both sides,
- * so in single precision we have 2*3*3*4=72 bytes buffer
- * on both sides to avoid cache pollution.
+#define EKIN_WORK_BUFFER_SIZE 2
+ /* Allocate 2 extra elements on both sides, so in single
+ * precision we have
+ * EKIN_WORK_BUFFER_SIZE*DIM*DIM*sizeof(real) = 72/144 bytes
+ * buffer on both sides to avoid cache pollution.
*/
- snew(ekind->ekin_work_alloc[thread], ekind->ngtc+4);
- ekind->ekin_work[thread] = ekind->ekin_work_alloc[thread] + 2;
+ snew(ekind->ekin_work_alloc[thread], ekind->ngtc+2*EKIN_WORK_BUFFER_SIZE);
+ ekind->ekin_work[thread] = ekind->ekin_work_alloc[thread] + EKIN_WORK_BUFFER_SIZE;
+ /* Nasty hack so we can have the per-thread accumulation
+ * variable for dekindl in the same thread-local cache lines
+ * as the per-thread accumulation tensors for ekin[fh],
+ * because they are accumulated in the same loop. */
+ ekind->dekindl_work[thread] = &(ekind->ekin_work[thread][ekind->ngtc][0][0]);
+#undef EKIN_WORK_BUFFER_SIZE
}
ekind->ngacc = opts->ngacc;
}
}
else
-
{
/* Calculate the full step Ekin as the average of the half steps */
for (j = 0; (j < DIM); j++)
}
if (dekindlambda)
{
- *dekindlambda = 0.5*(ekind->dekindl + ekind->dekindl_old);
+ if (bEkinAveVel)
+ {
+ *dekindlambda = ekind->dekindl;
+ }
+ else
+ {
+ *dekindlambda = 0.5*(ekind->dekindl + ekind->dekindl_old);
+ }
}
return T;
}