In CUDA, the same shared memory buffer is used to store different
intermediate values. To avoid overwriting these values prematurely,
extra blocking synchronizations are needed in the GPU kernel.
This bug was not exposed neither by regression tests nor by unit tests
because it was affecting values rarely and the deviation were within
tolerances.
* Inverse matrix using a set of expansionOrder matrix multiplications
*/
* Inverse matrix using a set of expansionOrder matrix multiplications
*/
+ // Make sure that we don't overwrite the sm_r[..] array.
+ __syncthreads();
+
// This will use the same memory space as sm_r, which is no longer needed.
extern __shared__ float sm_rhs[];
// Save current right-hand-side vector in the shared memory
// This will use the same memory space as sm_r, which is no longer needed.
extern __shared__ float sm_rhs[];
// Save current right-hand-side vector in the shared memory
// 6 values are saved. Dummy threads will have zeroes in their virial: targetLength,
// lagrangeScaled and rc are all set to zero for them in the beginning of the kernel.
// The sm_threadVirial[..] will overlap with the sm_r[..] and sm_rhs[..], but the latter
// 6 values are saved. Dummy threads will have zeroes in their virial: targetLength,
// lagrangeScaled and rc are all set to zero for them in the beginning of the kernel.
// The sm_threadVirial[..] will overlap with the sm_r[..] and sm_rhs[..], but the latter
- // two are no longer in use.
+ // two are no longer in use, which we make sure by waiting for all threads in block.
+ __syncthreads();
extern __shared__ float sm_threadVirial[];
float mult = targetLength * lagrangeScaled;
sm_threadVirial[0 * blockDim.x + threadIdx.x] = mult * rc.x * rc.x;
extern __shared__ float sm_threadVirial[];
float mult = targetLength * lagrangeScaled;
sm_threadVirial[0 * blockDim.x + threadIdx.x] = mult * rc.x * rc.x;