*
* Copyright (c) 1991-2000, University of Groningen, The Netherlands.
* Copyright (c) 2001-2004, The GROMACS development team.
- * Copyright (c) 2011,2014, by the GROMACS development team, led by
+ * Copyright (c) 2011,2014,2015, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
#include "gromacs/legacyheaders/names.h"
#include "gromacs/math/vec.h"
#include "gromacs/utility/cstringutil.h"
+#include "gromacs/utility/fatalerror.h"
#include "gromacs/utility/smalloc.h"
/* these MUST correspond to the enum in hackblock.h */
return bRet;
}
-gmx_bool rbonded_atoms_exist_in_list(t_rbonded *b, t_rbonded blist[], int nlist, int natoms)
+int
+rbonded_find_atoms_in_list(t_rbonded *b, t_rbonded blist[], int nlist, int natoms)
{
int i, k;
- gmx_bool matchFound = FALSE;
+ int foundPos = -1;
gmx_bool atomsMatch;
- for (i = 0; i < nlist && !matchFound; i++)
+ for (i = 0; i < nlist && foundPos < 0; i++)
{
atomsMatch = TRUE;
for (k = 0; k < natoms && atomsMatch; k++)
atomsMatch = atomsMatch && !strcmp(b->a[k], blist[i].a[natoms-1-k]);
}
}
- matchFound = atomsMatch;
+ if (atomsMatch)
+ {
+ foundPos = i;
+ /* If all the atoms AND all the parameters match, it is likely that
+ * the user made a copy-and-paste mistake (since it would be much cheaper
+ * to just bump the force constant 2x if you really want it twice).
+ * Since we only have the unparsed string here we can only detect
+ * EXACT matches (including identical whitespace).
+ */
+ if (!strcmp(b->s, blist[i].s))
+ {
+ gmx_warning("Duplicate line found in or between hackblock and rtp entries");
+ }
+ }
}
- return matchFound;
+ return foundPos;
}
gmx_bool merge_t_bondeds(t_rbondeds s[], t_rbondeds d[], gmx_bool bMin, gmx_bool bPlus)
{
int i, j;
gmx_bool bBondsRemoved;
+ int nbHackblockStart;
+ int index;
bBondsRemoved = FALSE;
for (i = 0; i < ebtsNR; i++)
{
if (s[i].nb > 0)
{
+ /* Record how many bonds we have in the destination when we start.
+ *
+ * If an entry is present in the hackblock (destination), we will
+ * not add the one from the main rtp, since the point is for hackblocks
+ * to overwrite it. However, if there is no hackblock entry we do
+ * allow multiple main rtp entries since some forcefield insist on that.
+ *
+ * We accomplish this by checking the position we find an entry in,
+ * rather than merely checking whether it exists at all.
+ * If that index is larger than the original (hackblock) destination
+ * size, it was added from the main rtp, and then we will allow more
+ * such entries. In contrast, if the entry found has a lower index
+ * it is a hackblock entry meant to override the main rtp, and then
+ * we don't add the main rtp one.
+ */
+ nbHackblockStart = d[i].nb;
+
/* make space */
srenew(d[i].b, d[i].nb + s[i].nb);
for (j = 0; j < s[i].nb; j++)
{
/* Check if this bonded string already exists before adding.
- * We are merging from the main rtp to the hackblocks, so this
- * will mean the hackblocks overwrite the man rtp, as intended.
+ * We are merging from the main RTP to the hackblocks, so this
+ * will mean the hackblocks overwrite the man RTP, as intended.
+ */
+ index = rbonded_find_atoms_in_list(&s[i].b[j], d[i].b, d[i].nb, btsNiatoms[i]);
+ /* - If we did not find this interaction at all, the index will be -1,
+ * and then we should definitely add it to the merged hackblock and rtp.
+ *
+ * Alternatively, if it was found, index will be >=0.
+ * - In case this index is lower than the original number of entries,
+ * it is already present as a *hackblock* entry, and those should
+ * always override whatever we have listed in the RTP. Thus, we
+ * should just keep that one and not add anything from the RTP.
+ * - Finally, if it was found, but with an index higher than
+ * the original number of entries, it comes from the RTP rather
+ * than hackblock, and then we must have added it ourselves
+ * in a previous iteration. In that case it is a matter of
+ * several entries for the same sequence of atoms, and we allow
+ * that in the RTP. In this case we should simply copy all of
+ * them, including this one.
*/
- if (!rbonded_atoms_exist_in_list(&s[i].b[j], d[i].b, d[i].nb, btsNiatoms[i]))
+ if (index < 0 || index >= nbHackblockStart)
{
if (!(bMin && contains_char(&s[i].b[j], '-'))
&& !(bPlus && contains_char(&s[i].b[j], '+')))
bBondsRemoved = TRUE;
}
}
+ else
+ {
+ /* This is the common case where a hackblock entry simply
+ * overrides the RTP, so we cannot warn here.
+ */
+ }
}
}
}
These operations are issued in the local stream at the beginning of the step
and therefore always complete before the local kernel launch. The non-local
- kernel is launched after the local on the same device/context, so this is
+ kernel is launched after the local on the same device/context hence it is
inherently scheduled after the operations in the local stream (including the
- above "misc_ops").
- However, for the sake of having a future-proof implementation, we use the
- misc_ops_done event to record the point in time when the above operations
- are finished and synchronize with this event in the non-local stream.
+ above "misc_ops") on pre-GK110 devices with single hardware queue, but on later
+ devices with multiple hardware queues the dependency needs to be enforced.
+ We use the misc_ops_and_local_H2D_done event to record the point where
+ the local x+q H2D (and all preceding) tasks are complete and synchronize
+ with this event in the non-local stream before launching the non-bonded kernel.
*/
void nbnxn_gpu_launch_kernel(gmx_nbnxn_cuda_t *nb,
const nbnxn_atomdata_t *nbatom,
adat_len = adat->natoms - adat->natoms_local;
}
- /* When we get here all misc operations issues in the local stream are done,
- so we record that in the local stream and wait for it in the nonlocal one. */
- if (nb->bUseTwoStreams)
- {
- if (iloc == eintLocal)
- {
- stat = cudaEventRecord(nb->misc_ops_done, stream);
- CU_RET_ERR(stat, "cudaEventRecord on misc_ops_done failed");
- }
- else
- {
- stat = cudaStreamWaitEvent(stream, nb->misc_ops_done, 0);
- CU_RET_ERR(stat, "cudaStreamWaitEvent on misc_ops_done failed");
- }
- }
-
/* beginning of timed HtoD section */
if (bDoTime)
{
cu_copy_H2D_async(adat->xq + adat_begin, nbatom->x + adat_begin * 4,
adat_len * sizeof(*adat->xq), stream);
+ /* When we get here all misc operations issues in the local stream as well as
+ the local xq H2D are done,
+ so we record that in the local stream and wait for it in the nonlocal one. */
+ if (nb->bUseTwoStreams)
+ {
+ if (iloc == eintLocal)
+ {
+ stat = cudaEventRecord(nb->misc_ops_and_local_H2D_done, stream);
+ CU_RET_ERR(stat, "cudaEventRecord on misc_ops_and_local_H2D_done failed");
+ }
+ else
+ {
+ stat = cudaStreamWaitEvent(stream, nb->misc_ops_and_local_H2D_done, 0);
+ CU_RET_ERR(stat, "cudaStreamWaitEvent on misc_ops_and_local_H2D_done failed");
+ }
+ }
+
if (bDoTime)
{
stat = cudaEventRecord(t->stop_nb_h2d[iloc], stream);
/* init events for sychronization (timing disabled for performance reasons!) */
stat = cudaEventCreateWithFlags(&nb->nonlocal_done, cudaEventDisableTiming);
CU_RET_ERR(stat, "cudaEventCreate on nonlocal_done failed");
- stat = cudaEventCreateWithFlags(&nb->misc_ops_done, cudaEventDisableTiming);
- CU_RET_ERR(stat, "cudaEventCreate on misc_ops_one failed");
+ stat = cudaEventCreateWithFlags(&nb->misc_ops_and_local_H2D_done, cudaEventDisableTiming);
+ CU_RET_ERR(stat, "cudaEventCreate on misc_ops_and_local_H2D_done failed");
/* On GPUs with ECC enabled, cudaStreamSynchronize shows a large overhead
* (which increases with shorter time/step) caused by a known CUDA driver bug.
stat = cudaEventDestroy(nb->nonlocal_done);
CU_RET_ERR(stat, "cudaEventDestroy failed on timers->nonlocal_done");
- stat = cudaEventDestroy(nb->misc_ops_done);
- CU_RET_ERR(stat, "cudaEventDestroy failed on timers->misc_ops_done");
+ stat = cudaEventDestroy(nb->misc_ops_and_local_H2D_done);
+ CU_RET_ERR(stat, "cudaEventDestroy failed on timers->misc_ops_and_local_H2D_done");
if (nb->bDoTime)
{