#ifdef HAVE__FINITE
#include <float.h>
#endif
+#include <assert.h>
+#include <limits.h>
gmx_bool gmx_numzero(real a)
{
return gmx_within_tol(a, 0.0, GMX_REAL_MIN/GMX_REAL_EPS);
}
-real
-gmx_log2(real x)
+unsigned int
+gmx_log2i(unsigned int n)
{
- const real iclog2 = 1.0/log( 2.0 );
+    assert(n != 0); /* the branches below give different results for n == 0 */
+#if defined(__INTEL_COMPILER)
+ return _bit_scan_reverse(n);
+#elif defined(__GNUC__) && UINT_MAX == 4294967295U /* also for clang */
+ return __builtin_clz(n) ^ 31U; /* xor gets optimized out */
+#elif defined(_MSC_VER) && _MSC_VER >= 1400
+ {
+ unsigned long i;
+ _BitScanReverse(&i, n);
+ return i;
+ }
+#elif defined(__xlC__)
+ return 31 - __cntlz4(n);
+#else
+ /* http://graphics.stanford.edu/~seander/bithacks.html#IntegerLogLookup */
+#define LT(n) n, n, n, n, n, n, n, n, n, n, n, n, n, n, n, n
+ static const char LogTable256[256] = {
+ -1, 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3,
+ LT(4), LT(5), LT(5), LT(6), LT(6), LT(6), LT(6),
+ LT(7), LT(7), LT(7), LT(7), LT(7), LT(7), LT(7), LT(7)
+ };
+#undef LT
+
+ unsigned int r; /* r will be lg(n) */
+ unsigned int t, tt; /* temporaries */
- return log( x ) * iclog2;
+ if ((tt = n >> 16) != 0)
+ {
+ r = ((t = tt >> 8) != 0) ? 24 + LogTable256[t] : 16 + LogTable256[tt];
+ }
+ else
+ {
+ r = ((t = n >> 8) != 0) ? 8 + LogTable256[t] : LogTable256[n];
+ }
+ return r;
+#endif
}
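All branches of gmx_log2i() compute floor(log2(n)) for n > 0, whether through a compiler bit-scan intrinsic or the byte-wise lookup table in the fallback. As a minimal sketch of how the branches can be validated against each other (the reference and harness below are hypothetical, not part of this patch):

    /* Hypothetical consistency check for gmx_log2i(); not part of this patch. */
    static unsigned int naive_log2i(unsigned int n)
    {
        unsigned int r = 0;
        while (n >>= 1) /* count how many times n can be halved */
        {
            r++;
        }
        return r; /* floor(log2()) of the original n, valid for n > 0 */
    }

    static void check_log2i(void)
    {
        unsigned int n;
        for (n = 1; n < (1U << 20); n++)
        {
            assert(gmx_log2i(n) == naive_log2i(n));
        }
    }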
gmx_bool
/*
* This file is part of the GROMACS molecular simulation package.
*
- * Copyright (c) 2012,2013, by the GROMACS development team, led by
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
#include <math.h>
#include <string.h>
+#include <assert.h>
#include "smalloc.h"
#include "macros.h"
#include "vec.h"
#include "nbnxn_consts.h"
#include "nbnxn_internal.h"
#include "nbnxn_search.h"
+#include "gromacs/utility/gmxomp.h"
#include "gmx_omp_nthreads.h"
/* Default nbnxn allocation routine, allocates NBNXN_MEM_ALIGN byte aligned */
nbnxn_alloc_t *alloc,
nbnxn_free_t *free)
{
- int i, j;
+ int i, j, nth;
real c6, c12, tol;
char *ptr;
gmx_bool simple, bCombGeom, bCombLB;
}
nbat->buffer_flags.flag = NULL;
nbat->buffer_flags.flag_nalloc = 0;
+
+ nth = gmx_omp_nthreads_get(emntNonbonded);
+
+ ptr = getenv("GMX_USE_TREEREDUCE");
+ if (ptr != NULL)
+ {
+        nbat->bUseTreeReduce = (strtol(ptr, NULL, 10) != 0);
+ }
+#if defined __MIC__
+    else if (nth > 8) /* on the CPU we currently don't benefit even at 32 threads */
+ {
+ nbat->bUseTreeReduce = 1;
+ }
+#endif
+ else
+ {
+ nbat->bUseTreeReduce = 0;
+ }
+ if (nbat->bUseTreeReduce)
+ {
+ if (fp)
+ {
+ fprintf(fp, "Using tree force reduction\n\n");
+ }
+ snew(nbat->syncStep, nth);
+ }
}
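Note that the GMX_USE_TREEREDUCE environment variable overrides the heuristic in either direction: any non-zero value forces tree reduction on, 0 forces it off, and only when the variable is unset does the thread-count check apply (currently only on MIC).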
static void copy_lj_to_nbat_lj_comb_x4(const real *ljparam_type,
}
}
+static gmx_inline unsigned char reverse_bits(unsigned char b)
+{
+ /* http://graphics.stanford.edu/~seander/bithacks.html#ReverseByteWith64BitsDiv */
+ return (b * 0x0202020202ULL & 0x010884422010ULL) % 1023;
+}
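The multiply-and-modulus expression is the 64-bit byte-reversal trick from the cited bit hacks page: the multiplication spreads five copies of the byte across a 64-bit word, the mask picks out disjoint bits of those copies, and the modulus by 2^10 - 1 folds them into the reversed byte. A few hypothetical spot checks:

    /* Hypothetical spot checks for reverse_bits(); not part of this patch. */
    assert(reverse_bits(0x01) == 0x80); /* 00000001 -> 10000000 */
    assert(reverse_bits(0xF0) == 0x0F); /* 11110000 -> 00001111 */
    assert(reverse_bits(0xB4) == 0x2D); /* 10110100 -> 00101101 */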
+
+static void nbnxn_atomdata_add_nbat_f_to_f_treereduce(const nbnxn_atomdata_t *nbat,
+ int nth)
+{
+ const nbnxn_buffer_flags_t *flags = &nbat->buffer_flags;
+
+    int next_pow2 = 1<<(gmx_log2i(nth-1)+1); /* smallest power of 2 >= nth */
+
+ assert(nbat->nout == nth); /* tree-reduce currently only works for nout==nth */
+
+ memset(nbat->syncStep, 0, sizeof(*(nbat->syncStep))*nth);
+
+#pragma omp parallel num_threads(nth)
+ {
+ int b0, b1, b;
+ int i0, i1;
+ int group_size, th;
+
+ th = gmx_omp_get_thread_num();
+
+ for (group_size = 2; group_size < 2*next_pow2; group_size *= 2)
+ {
+ int index[2], group_pos, partner_pos, wu;
+ int partner_th = th ^ (group_size/2);
+
+ if (group_size > 2)
+ {
+#ifdef TMPI_ATOMICS
+ /* wait on partner thread - replaces full barrier */
+ int sync_th, sync_group_size;
+
+            tMPI_Atomic_memory_barrier(); /* guarantee data is saved before marking work as done */
+ tMPI_Atomic_set(&(nbat->syncStep[th]), group_size/2); /* mark previous step as completed */
+
+            /* Find the thread to sync with. Equal to partner_th unless nth is not a power of two. */
+ for (sync_th = partner_th, sync_group_size = group_size; sync_th >= nth && sync_group_size > 2; sync_group_size /= 2)
+ {
+ sync_th &= ~(sync_group_size/4);
+ }
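+            /* Example with hypothetical values: nth = 6, group_size = 8 and
+             * th = 3 give partner_th = 7 >= nth; one iteration clears bit 1,
+             * so sync_th becomes 5, a thread that does exist. */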
+            if (sync_th < nth) /* otherwise there is nothing to sync: index[1] will be >= nout */
+ {
+ /* wait on the thread which computed input data in previous step */
+ while (tMPI_Atomic_get((volatile tMPI_Atomic_t*)&(nbat->syncStep[sync_th])) < group_size/2)
+ {
+ gmx_pause();
+ }
+                /* guarantee that no later load happens before the wait loop has finished */
+ tMPI_Atomic_memory_barrier();
+ }
+#else /* TMPI_ATOMICS */
+#pragma omp barrier
+#endif
+ }
+
+ /* Calculate buffers to sum (result goes into first buffer) */
+ group_pos = th % group_size;
+ index[0] = th - group_pos;
+ index[1] = index[0] + group_size/2;
+
+ /* If no second buffer, nothing to do */
+ if (index[1] >= nbat->nout && group_size > 2)
+ {
+ continue;
+ }
+
+#if NBNXN_BUFFERFLAG_MAX_THREADS > 256
+#error reverse_bits assumes max 256 threads
+#endif
+            /* The position is permuted so that one of the two vectors being
+               added was computed on the same thread in the previous step.
+               This improves locality and allows syncing with just a single
+               thread between steps (= the levels in the binary tree).
+               The permutation that achieves this is a bit reversal of the
+               group position: e.g. for group_size = 8, positions 0..7 map
+               to 0,4,2,6,1,5,3,7.
+             */
+ group_pos = reverse_bits(group_pos)/(256/group_size);
+
+ partner_pos = group_pos ^ 1;
+
+ /* loop over two work-units (own and partner) */
+ for (wu = 0; wu < 2; wu++)
+ {
+ if (wu == 1)
+ {
+ if (partner_th < nth)
+ {
+                        break; /* partner exists, so we don't have to do its work */
+ }
+ else
+ {
+ group_pos = partner_pos;
+ }
+ }
+
+ /* Calculate the cell-block range for our thread */
+ b0 = (flags->nflag* group_pos )/group_size;
+ b1 = (flags->nflag*(group_pos+1))/group_size;
+
+ for (b = b0; b < b1; b++)
+ {
+ i0 = b *NBNXN_BUFFERFLAG_SIZE*nbat->fstride;
+ i1 = (b+1)*NBNXN_BUFFERFLAG_SIZE*nbat->fstride;
+
+ if ((flags->flag[b] & (1ULL<<index[1])) || group_size > 2)
+ {
+#ifdef GMX_NBNXN_SIMD
+ nbnxn_atomdata_reduce_reals_simd
+#else
+ nbnxn_atomdata_reduce_reals
+#endif
+ (nbat->out[index[0]].f,
+ (flags->flag[b] & (1ULL<<index[0])) || group_size > 2,
+ &(nbat->out[index[1]].f), 1, i0, i1);
+ }
+ else if (!(flags->flag[b] & (1ULL<<index[0])))
+ {
+ nbnxn_atomdata_clear_reals(nbat->out[index[0]].f,
+ i0, i1);
+ }
+ }
+ }
+ }
+ }
+}
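The reduction therefore takes ceil(log2(nth)) steps, halving the number of live output buffers in each one. Ignoring the group-position permutation and the buffer flags, the buffer pairing can be traced with a small standalone sketch (hypothetical, not part of this patch):

    /* Hypothetical trace of the tree-reduction buffer pairing; not part of this patch. */
    #include <stdio.h>
    int main(void)
    {
        int nth = 4, group_size, th;
        for (group_size = 2; group_size <= nth; group_size *= 2)
        {
            for (th = 0; th < nth; th++)
            {
                int i0 = th - th % group_size; /* index[0]: first buffer of the group */
                int i1 = i0 + group_size/2;    /* index[1]: buffer summed into i0     */
                printf("step %d: thread %d reduces out[%d] += out[%d]\n",
                       group_size/2, th, i0, i1);
            }
        }
        return 0;
    }

For nth = 4 this pairs (0 += 1) and (2 += 3) in the first step and (0 += 2) in the second, leaving the complete sum in out[0].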
+
+
+static void nbnxn_atomdata_add_nbat_f_to_f_stdreduce(const nbnxn_atomdata_t *nbat,
+ int nth)
+{
+ int th;
+#pragma omp parallel for num_threads(nth) schedule(static)
+ for (th = 0; th < nth; th++)
+ {
+ const nbnxn_buffer_flags_t *flags;
+ int b0, b1, b;
+ int i0, i1;
+ int nfptr;
+ real *fptr[NBNXN_BUFFERFLAG_MAX_THREADS];
+ int out;
+
+ flags = &nbat->buffer_flags;
+
+ /* Calculate the cell-block range for our thread */
+ b0 = (flags->nflag* th )/nth;
+ b1 = (flags->nflag*(th+1))/nth;
+
+ for (b = b0; b < b1; b++)
+ {
+ i0 = b *NBNXN_BUFFERFLAG_SIZE*nbat->fstride;
+ i1 = (b+1)*NBNXN_BUFFERFLAG_SIZE*nbat->fstride;
+
+ nfptr = 0;
+ for (out = 1; out < nbat->nout; out++)
+ {
+ if (flags->flag[b] & (1U<<out))
+ {
+ fptr[nfptr++] = nbat->out[out].f;
+ }
+ }
+ if (nfptr > 0)
+ {
+#ifdef GMX_NBNXN_SIMD
+ nbnxn_atomdata_reduce_reals_simd
+#else
+ nbnxn_atomdata_reduce_reals
+#endif
+ (nbat->out[0].f,
+ flags->flag[b] & (1U<<0),
+ fptr, nfptr,
+ i0, i1);
+ }
+ else if (!(flags->flag[b] & (1U<<0)))
+ {
+ nbnxn_atomdata_clear_reals(nbat->out[0].f,
+ i0, i1);
+ }
+ }
+ }
+}
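In the standard reduction each thread instead owns a contiguous range of cell-blocks and, per block, gathers every output buffer whose flag bit is set; buffers that never touched the block are skipped entirely. A minimal sketch of just that selection logic (hypothetical helper, not part of this patch):

    /* Hypothetical sketch of the buffer-flag selection; not part of this patch.
     * Bit `out` of the flag is set when output buffer `out` wrote to the block.
     */
    static int count_contributing_buffers(unsigned int flag, int nout)
    {
        int out, nfptr = 0;
        for (out = 1; out < nout; out++)
        {
            if (flag & (1U << out))
            {
                nfptr++;
            }
        }
        return nfptr; /* e.g. flag = 0x5, nout = 4 gives 1 (only buffer 2) */
    }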
+
/* Add the force array(s) from nbnxn_atomdata_t to f */
void nbnxn_atomdata_add_nbat_f_to_f(const nbnxn_search_t nbs,
int locality,
/* Reduce the force thread output buffers into buffer 0, before adding
 * them to the differently ordered "real" force buffer.
*/
-#pragma omp parallel for num_threads(nth) schedule(static)
- for (th = 0; th < nth; th++)
+ if (nbat->bUseTreeReduce)
{
- const nbnxn_buffer_flags_t *flags;
- int b0, b1, b;
- int i0, i1;
- int nfptr;
- real *fptr[NBNXN_BUFFERFLAG_MAX_THREADS];
- int out;
-
- flags = &nbat->buffer_flags;
-
- /* Calculate the cell-block range for our thread */
- b0 = (flags->nflag* th )/nth;
- b1 = (flags->nflag*(th+1))/nth;
-
- for (b = b0; b < b1; b++)
- {
- i0 = b *NBNXN_BUFFERFLAG_SIZE*nbat->fstride;
- i1 = (b+1)*NBNXN_BUFFERFLAG_SIZE*nbat->fstride;
-
- nfptr = 0;
- for (out = 1; out < nbat->nout; out++)
- {
- if (flags->flag[b] & (1U<<out))
- {
- fptr[nfptr++] = nbat->out[out].f;
- }
- }
- if (nfptr > 0)
- {
-#ifdef GMX_NBNXN_SIMD
- nbnxn_atomdata_reduce_reals_simd
-#else
- nbnxn_atomdata_reduce_reals
-#endif
- (nbat->out[0].f,
- flags->flag[b] & (1U<<0),
- fptr, nfptr,
- i0, i1);
- }
- else if (!(flags->flag[b] & (1U<<0)))
- {
- nbnxn_atomdata_clear_reals(nbat->out[0].f,
- i0, i1);
- }
- }
+ nbnxn_atomdata_add_nbat_f_to_f_treereduce(nbat, nth);
+ }
+ else
+ {
+ nbnxn_atomdata_add_nbat_f_to_f_stdreduce(nbat, nth);
}
}
-
#pragma omp parallel for num_threads(nth) schedule(static)
for (th = 0; th < nth; th++)
{