#include "gpu_utils.h"
#include "../cuda_tools/cudautils.cuh"
-#include "memtestG80_core.h"
#include "gromacs/utility/cstringutil.h"
#include "gromacs/utility/smalloc.h"
-/** Amount of memory to be used in quick memtest. */
-#define QUICK_MEM 250
-/** Bit flag with type of tests to run in quick memtest. */
-#define QUICK_TESTS MOD_20_32BIT | LOGIC_4_ITER_SHMEM | RANDOM_BLOCKS
-/** Number of iterations in quick memtest. */
-#define QUICK_ITER 3
-
-/** Bitflag with all test set on for full memetest. */
-#define FULL_TESTS 0x3FFF
-/** Number of iterations in full memtest. */
-#define FULL_ITER 25
-
-/** Bit flag with type of tests to run in time constrained memtest. */
-#define TIMED_TESTS MOD_20_32BIT | LOGIC_4_ITER_SHMEM | RANDOM_BLOCKS
-
/*! \brief
* Max number of devices supported by CUDA (for consistency checking).
*
}
-/** Bit-flags which refer to memtestG80 test types and are used in do_memtest
- * to specify which tests to run. */
-enum memtest_G80_test_types {
- MOVING_INVERSIONS_10 = 0x1,
- MOVING_INVERSIONS_RAND = 0x2,
- WALKING_8BIT_M86 = 0x4,
- WALKING_0_8BIT = 0x8,
- WALKING_1_8BIT = 0x10,
- WALKING_0_32BIT = 0x20,
- WALKING_1_32BIT = 0x40,
- RANDOM_BLOCKS = 0x80,
- MOD_20_32BIT = 0x100,
- LOGIC_1_ITER = 0x200,
- LOGIC_4_ITER = 0x400,
- LOGIC_1_ITER_SHMEM = 0x800,
- LOGIC_4_ITER_SHMEM = 0x1000
-};
-
-
/*!
* \brief Runs GPU sanity checks.
*
return 0;
}
-
-/*!
- * \brief Runs a set of memory tests specified by the given bit-flags.
- * Tries to allocate and do the test on \p megs Mb memory or
- * the greatest amount that can be allocated (>10Mb).
- * In case if an error is detected it stops without finishing the remaining
- * steps/iterations and returns greater then zero value.
- * In case of other errors (e.g. kernel launch errors, device querying errors)
- * -1 is returned.
- *
- * \param[in] which_tests variable with bit-flags of the requested tests
- * \param[in] megs amount of memory that will be tested in MB
- * \param[in] iter number of iterations
- * \returns 0 if no error was detected, otherwise >0
- */
-static int do_memtest(unsigned int which_tests, int megs, int iter)
-{
- memtestState tester;
- int i;
- uint err_count; //, err_iter;
-
- // no parameter check as this fn won't be called externally
-
- // let's try to allocate the mem
- while (!tester.allocate(megs) && (megs - 10 > 0))
- {
- megs -= 10; tester.deallocate();
- }
-
- if (megs <= 10)
- {
- fprintf(stderr, "Unable to allocate GPU memory!\n");
- return -1;
- }
-
- // clear the first 18 bits
- which_tests &= 0x3FFF;
- for (i = 0; i < iter; i++)
- {
- // Moving Inversions (ones and zeros)
- if ((MOVING_INVERSIONS_10 & which_tests) == MOVING_INVERSIONS_10)
- {
- tester.gpuMovingInversionsOnesZeros(err_count);
- if (err_count > 0)
- {
- return MOVING_INVERSIONS_10;
- }
- }
- // Moving Inversions (random)
- if ((MOVING_INVERSIONS_RAND & which_tests) == MOVING_INVERSIONS_RAND)
- {
- tester.gpuMovingInversionsRandom(err_count);
- if (err_count > 0)
- {
- return MOVING_INVERSIONS_RAND;
- }
- }
- // Memtest86 Walking 8-bit
- if ((WALKING_8BIT_M86 & which_tests) == WALKING_8BIT_M86)
- {
- for (uint shift = 0; shift < 8; shift++)
- {
- tester.gpuWalking8BitM86(err_count, shift);
- if (err_count > 0)
- {
- return WALKING_8BIT_M86;
- }
- }
- }
- // True Walking zeros (8-bit)
- if ((WALKING_0_8BIT & which_tests) == WALKING_0_8BIT)
- {
- for (uint shift = 0; shift < 8; shift++)
- {
- tester.gpuWalking8Bit(err_count, false, shift);
- if (err_count > 0)
- {
- return WALKING_0_8BIT;
- }
- }
- }
- // True Walking ones (8-bit)
- if ((WALKING_1_8BIT & which_tests) == WALKING_1_8BIT)
- {
- for (uint shift = 0; shift < 8; shift++)
- {
- tester.gpuWalking8Bit(err_count, true, shift);
- if (err_count > 0)
- {
- return WALKING_1_8BIT;
- }
- }
- }
- // Memtest86 Walking zeros (32-bit)
- if ((WALKING_0_32BIT & which_tests) == WALKING_0_32BIT)
- {
- for (uint shift = 0; shift < 32; shift++)
- {
- tester.gpuWalking32Bit(err_count, false, shift);
- if (err_count > 0)
- {
- return WALKING_0_32BIT;
- }
- }
- }
- // Memtest86 Walking ones (32-bit)
- if ((WALKING_1_32BIT & which_tests) == WALKING_1_32BIT)
- {
- for (uint shift = 0; shift < 32; shift++)
- {
- tester.gpuWalking32Bit(err_count, true, shift);
- if (err_count > 0)
- {
- return WALKING_1_32BIT;
- }
- }
- }
- // Random blocks
- if ((RANDOM_BLOCKS & which_tests) == RANDOM_BLOCKS)
- {
- tester.gpuRandomBlocks(err_count, rand());
- if (err_count > 0)
- {
- return RANDOM_BLOCKS;
- }
-
- }
-
- // Memtest86 Modulo-20
- if ((MOD_20_32BIT & which_tests) == MOD_20_32BIT)
- {
- for (uint shift = 0; shift < 20; shift++)
- {
- tester.gpuModuloX(err_count, shift, rand(), 20, 2);
- if (err_count > 0)
- {
- return MOD_20_32BIT;
- }
- }
- }
- // Logic (one iteration)
- if ((LOGIC_1_ITER & which_tests) == LOGIC_1_ITER)
- {
- tester.gpuShortLCG0(err_count, 1);
- if (err_count > 0)
- {
- return LOGIC_1_ITER;
- }
- }
- // Logic (4 iterations)
- if ((LOGIC_4_ITER & which_tests) == LOGIC_4_ITER)
- {
- tester.gpuShortLCG0(err_count, 4);
- if (err_count > 0)
- {
- return LOGIC_4_ITER;
- }
-
- }
- // Logic (shared memory, one iteration)
- if ((LOGIC_1_ITER_SHMEM & which_tests) == LOGIC_1_ITER_SHMEM)
- {
- tester.gpuShortLCG0Shmem(err_count, 1);
- if (err_count > 0)
- {
- return LOGIC_1_ITER_SHMEM;
- }
- }
- // Logic (shared-memory, 4 iterations)
- if ((LOGIC_4_ITER_SHMEM & which_tests) == LOGIC_4_ITER_SHMEM)
- {
- tester.gpuShortLCG0Shmem(err_count, 4);
- if (err_count > 0)
- {
- return LOGIC_4_ITER_SHMEM;
- }
- }
- }
-
- tester.deallocate();
- return err_count;
-}
-
-/*! \brief Runs a quick memory test and returns 0 in case if no error is detected.
- * If an error is detected it stops before completing the test and returns a
- * value greater then 0. In case of other errors (e.g. kernel launch errors,
- * device querying errors) -1 is returned.
- *
- * \param[in] dev_id the device id of the GPU or -1 if the device has already been selected
- * \returns 0 if no error was detected, otherwise >0
- */
-int do_quick_memtest(int dev_id)
-{
- cudaDeviceProp dev_prop;
- int devmem, res, time = 0;
-
- if (debug)
- {
- time = getTimeMilliseconds();
- }
-
- if (do_sanity_checks(dev_id, &dev_prop) != 0)
- {
- // something went wrong
- return -1;
- }
-
- if (debug)
- {
- devmem = dev_prop.totalGlobalMem/(1024*1024); // in MiB
- fprintf(debug, ">> Running QUICK memtests on %d MiB (out of total %d MiB), %d iterations\n",
- QUICK_MEM, devmem, QUICK_ITER);
- }
-
- res = do_memtest(QUICK_TESTS, QUICK_MEM, QUICK_ITER);
-
- if (debug)
- {
- fprintf(debug, "Q-RES = %d\n", res);
- fprintf(debug, "Q-runtime: %d ms\n", getTimeMilliseconds() - time);
- }
-
- /* destroy context only if we created it */
- if (dev_id != -1)
- {
- cudaThreadExit();
- }
- return res;
-}
-
-/*! \brief Runs a full memory test and returns 0 in case if no error is detected.
- * If an error is detected it stops before completing the test and returns a
- * value greater then 0. In case of other errors (e.g. kernel launch errors,
- * device querying errors) -1 is returned.
- *
- * \param[in] dev_id the device id of the GPU or -1 if the device has already been selected
- * \returns 0 if no error was detected, otherwise >0
- */
-
-int do_full_memtest(int dev_id)
-{
- cudaDeviceProp dev_prop;
- int devmem, res, time = 0;
-
- if (debug)
- {
- time = getTimeMilliseconds();
- }
-
- if (do_sanity_checks(dev_id, &dev_prop) != 0)
- {
- // something went wrong
- return -1;
- }
-
- devmem = dev_prop.totalGlobalMem/(1024*1024); // in MiB
-
- if (debug)
- {
- fprintf(debug, ">> Running FULL memtests on %d MiB (out of total %d MiB), %d iterations\n",
- devmem, devmem, FULL_ITER);
- }
-
- /* do all test on the entire memory */
- res = do_memtest(FULL_TESTS, devmem, FULL_ITER);
-
- if (debug)
- {
- fprintf(debug, "F-RES = %d\n", res);
- fprintf(debug, "F-runtime: %d ms\n", getTimeMilliseconds() - time);
- }
-
- /* destroy context only if we created it */
- if (dev_id != -1)
- {
- cudaThreadExit();
- }
- return res;
-}
-
-/*! \brief Runs a time constrained memory test and returns 0 in case if no error is detected.
- * If an error is detected it stops before completing the test and returns a value greater
- * than zero. In case of other errors (e.g. kernel launch errors, device querying errors) -1
- * is returned. Note, that test iterations are not interrupted therefor the total runtime of
- * the test will always be multipple of one iteration's runtime.
- *
- * \param[in] dev_id the device id of the GPU or -1 if the device has laredy been selected
- * \param[in] time_constr the time limit of the testing
- * \returns 0 if no error was detected, otherwise >0
- */
-int do_timed_memtest(int dev_id, int time_constr)
-{
- cudaDeviceProp dev_prop;
- int devmem, res = 0, time = 0, startt;
-
- if (debug)
- {
- time = getTimeMilliseconds();
- }
-
- time_constr *= 1000; /* convert to ms for convenience */
- startt = getTimeMilliseconds();
-
- if (do_sanity_checks(dev_id, &dev_prop) != 0)
- {
- // something went wrong
- return -1;
- }
-
- devmem = dev_prop.totalGlobalMem/(1024*1024); // in MiB
-
- if (debug)
- {
- fprintf(debug, ">> Running time constrained memtests on %d MiB (out of total %d MiB), time limit of %d s \n",
- devmem, devmem, time_constr);
- }
-
- /* do the TIMED_TESTS set, one step at a time on the entire memory
- that can be allocated, and stop when the given time is exceeded */
- while ( ((int)getTimeMilliseconds() - startt) < time_constr)
- {
- res = do_memtest(TIMED_TESTS, devmem, 1);
- if (res != 0)
- {
- break;
- }
- }
-
- if (debug)
- {
- fprintf(debug, "T-RES = %d\n", res);
- fprintf(debug, "T-runtime: %d ms\n", getTimeMilliseconds() - time);
- }
-
- /* destroy context only if we created it */
- if (dev_id != -1)
- {
- cudaThreadExit();
- }
- return res;
-}
-
/*! \brief Initializes the GPU with the given index.
*
* The varible \mygpu is the index of the GPU to initialize in the
+++ /dev/null
-/*
- * memtestG80_core.cu
- * MemtestG80 core memory test functions and OOP interface to tester.
- *
- * Author: Imran Haque, 2009
- * Copyright 2009, Stanford University
- *
- * This file is licensed under the terms of the LGPL. Please see
- * the COPYING file in the accompanying source distribution for
- * full license terms.
- *
- */
-
- /*
- * CUDA grid layout: Linear in blocks and threads.
- * Intended usage = 1k blocks, 512 t/blk, with N words (iterations) per thread
- * -> 2*N MiB tested per grid
- * thread address at iteration i = base + blockIdx.x * N * blockDim.x + i*blockDim.x + threadIdx.x
- *
- */
-
-// Naming convention: gpuXXX and cpuXXX functions are user-accessible; deviceXXX functions are internal
-// gpuXXX functions execute a particular test on a block of GPU memory
-// cpuXXX " " " " " " " " " CPU "
-
-#define THREAD_ADDRESS(base,N,i) (base + blockIdx.x * N * blockDim.x + i * blockDim.x + threadIdx.x)
-#define THREAD_OFFSET(N,i) (blockIdx.x * N * blockDim.x + i * blockDim.x + threadIdx.x)
-#define BITSDIFF(x,y) __popc((x) ^ (y))
-
-
-#include "memtestG80_core.h"
-
-#include <stdio.h>
-
-
-
-
-void memtestState::deallocate() {
- if (allocated) {
- cudaFree(devTestMem);
- cudaFree(devTempMem);
- free(hostTempMem);
- devTestMem = NULL;
- devTempMem = NULL;
- hostTempMem = NULL;
- allocated = false;
- }
- initTime = 0;
- }
-
-uint memtestState::allocate(uint mbToTest) {
- deallocate();
-
- initTime = getTimeMilliseconds();
-
- // Round up to nearest 2MiB
- if (mbToTest % 2) mbToTest++;
-
- megsToTest = mbToTest;
- loopIters = megsToTest/2;
-
- if (megsToTest == 0) return 0;
-
- try {
- if (cudaMalloc((void**)&devTestMem,megsToTest*1048576UL) != cudaSuccess) throw 1;
- if (cudaMalloc((void**)&devTempMem,sizeof(uint)*nBlocks) != cudaSuccess) throw 2;
- if ( (hostTempMem = (uint*)malloc(sizeof(uint)*nBlocks)) == NULL) throw 3;
- } catch (...) {
- // Clear CUDA error flag for outside world
- cudaGetLastError();
- if (devTempMem) {
- cudaFree(devTempMem);
- devTempMem = NULL;
- }
- if (devTestMem) {
- cudaFree(devTestMem);
- devTestMem = NULL;
- }
- if (hostTempMem) {
- free(hostTempMem);
- hostTempMem = NULL;
- }
- return 0;
- }
- allocated = true;
- return megsToTest;
- }
-bool memtestState::gpuMemoryBandwidth(double& bandwidth,uint mbToTest,uint iters) {
- if (!allocated || megsToTest < 2*mbToTest) return false;
- bandwidth = ::gpuMemoryBandwidth(devTestMem,devTestMem+mbToTest*1048576/4,mbToTest,iters);
- return cudaGetLastError() == cudaSuccess;
-}
-bool memtestState::gpuWriteConstant(const uint constant) const {
- if (!allocated) return false;
- ::gpuWriteConstant(nBlocks,nThreads,devTestMem,loopIters,constant);
- return cudaGetLastError() == cudaSuccess;
-}
-
-bool memtestState::gpuVerifyConstant(uint& errorCount,const uint constant) const {
- if (!allocated) return false;
- errorCount = ::gpuVerifyConstant(nBlocks,nThreads,devTestMem,loopIters,constant,devTempMem,hostTempMem);
- return ((cudaGetLastError() == cudaSuccess) && (errorCount != 0xFFFFFFFF) && (errorCount != 0xFFFFFFFE));
-}
-
-bool memtestState::gpuShortLCG0(uint& errorCount,const uint repeats) const {
- if (!allocated) return false;
- errorCount = ::gpuShortLCG0(nBlocks,nThreads,devTestMem,loopIters,repeats,lcgPeriod,devTempMem,hostTempMem);
- return ((cudaGetLastError() == cudaSuccess) && (errorCount != 0xFFFFFFFF) && (errorCount != 0xFFFFFFFE));
-}
-bool memtestState::gpuShortLCG0Shmem(uint& errorCount,const uint repeats) const {
- if (!allocated) return false;
- errorCount = ::gpuShortLCG0Shmem(nBlocks,nThreads,devTestMem,loopIters,repeats,lcgPeriod,devTempMem,hostTempMem);
- return ((cudaGetLastError() == cudaSuccess) && (errorCount != 0xFFFFFFFF) && (errorCount != 0xFFFFFFFE));
-}
-bool memtestState::gpuMovingInversionsOnesZeros(uint& errorCount) const {
- if (!allocated) return false;
- errorCount = ::gpuMovingInversionsOnesZeros(nBlocks,nThreads,devTestMem,loopIters,devTempMem,hostTempMem);
- return ((cudaGetLastError() == cudaSuccess) && (errorCount != 0xFFFFFFFF) && (errorCount != 0xFFFFFFFE));
-}
-bool memtestState::gpuWalking8BitM86(uint& errorCount,const uint shift) const {
- if (!allocated) return false;
- errorCount = ::gpuWalking8BitM86(nBlocks,nThreads,devTestMem,loopIters,shift,devTempMem,hostTempMem);
- return ((cudaGetLastError() == cudaSuccess) && (errorCount != 0xFFFFFFFF) && (errorCount != 0xFFFFFFFE));
-}
-bool memtestState::gpuWalking8Bit(uint& errorCount,const bool ones,const uint shift) const {
- if (!allocated) return false;
- errorCount = ::gpuWalking8Bit(nBlocks,nThreads,devTestMem,loopIters,ones,shift,devTempMem,hostTempMem);
- return ((cudaGetLastError() == cudaSuccess) && (errorCount != 0xFFFFFFFF) && (errorCount != 0xFFFFFFFE));
-}
-bool memtestState::gpuMovingInversionsRandom(uint& errorCount) const {
- if (!allocated) return false;
- errorCount = ::gpuMovingInversionsRandom(nBlocks,nThreads,devTestMem,loopIters,devTempMem,hostTempMem);
- return ((cudaGetLastError() == cudaSuccess) && (errorCount != 0xFFFFFFFF) && (errorCount != 0xFFFFFFFE));
-}
-bool memtestState::gpuWalking32Bit(uint& errorCount,const bool ones,const uint shift) const {
- if (!allocated) return false;
- errorCount = ::gpuWalking32Bit(nBlocks,nThreads,devTestMem,loopIters,ones,shift,devTempMem,hostTempMem);
- return ((cudaGetLastError() == cudaSuccess) && (errorCount != 0xFFFFFFFF) && (errorCount != 0xFFFFFFFE));
-}
-bool memtestState::gpuRandomBlocks(uint& errorCount,const uint seed) const {
- if (!allocated) return false;
- errorCount = ::gpuRandomBlocks(nBlocks,nThreads,devTestMem,loopIters,seed,devTempMem,hostTempMem);
- return ((cudaGetLastError() == cudaSuccess) && (errorCount != 0xFFFFFFFF) && (errorCount != 0xFFFFFFFE));
-}
-bool memtestState::gpuModuloX(uint& errorCount,const uint shift,const uint pattern,const uint modulus,const uint overwriteIters) const {
- if (!allocated) return false;
- errorCount = ::gpuModuloX(nBlocks,nThreads,devTestMem,loopIters,shift,pattern,modulus,overwriteIters,devTempMem,hostTempMem);
- return ((cudaGetLastError() == cudaSuccess) && (errorCount != 0xFFFFFFFF) && (errorCount != 0xFFFFFFFE));
-}
-
-
-
-__global__ void deviceWriteConstant(uint* base, uint N, const uint constant);
-__global__ void deviceVerifyConstant(uint* base,uint N,const uint constant,uint* blockErrorCount);
-__global__ void deviceShortLCG0(uint* base,uint N,uint repeats,const int period);
-__global__ void deviceShortLCG0Shmem(uint* base,uint N,uint repeats,const int period);
-__global__ void deviceWriteRandomBlocks(uint* base,uint N,int seed);
-__global__ void deviceVerifyRandomBlocks(uint* base,uint N,int seed,uint* blockErrorCount);
-__global__ void deviceWriteWalking32Bit(uint* base,uint N,bool ones,uint shift);
-__global__ void deviceVerifyWalking32Bit(uint* base,uint N,bool ones,uint shift,uint* blockErrorCount);
-__global__ void deviceWritePairedConstants(uint* base,uint N,uint pattern0,uint pattern1);
-__global__ void deviceVerifyPairedConstants(uint* base,uint N,uint pattern0,uint pattern1,uint* blockErrorCount);
-__global__ void deviceWritePairedModulo(uint* base,const uint N,const uint shift,const uint pattern1,const uint pattern2,const uint modulus,const uint iters);
-__global__ void deviceVerifyPairedModulo(uint* base,uint N,const uint shift,const uint pattern1,const uint modulus,uint* blockErrorCount);
-
-
-// Utility function to measure memory bandwidth
-__host__ double gpuMemoryBandwidth(uint* src,uint* dst,uint mbToTest,uint iters) {
- uint start = getTimeMilliseconds();
- for (uint i = 0; i < iters; i++) {
- cudaMemcpy(dst,src,mbToTest*1048576,cudaMemcpyDeviceToDevice);
- }
- //D-to-D memory copies are non-blocking, so sync to get correct timing
- cudaThreadSynchronize();
- //SOFTWAIT();
- uint end = getTimeMilliseconds();
-
- // Calculate bandwidth in MiB/s
- // Multiply by 2 since we are reading and writing to the same memory
- double bw = 2.0*((double)mbToTest*iters)/((end-start)/1000.0);
- return bw;
-}
-
-// Utility functions to write/verify pure constants in memory, CPU/GPU {{{
-__host__ void gpuWriteConstant(const uint nBlocks,const uint nThreads,uint* base,uint N,const uint constant) { //{{{
- deviceWriteConstant<<<nBlocks,nThreads>>>(base,N,constant);
-}
-
-__global__ void deviceWriteConstant(uint* base, uint N, const uint constant) {
- for (uint i = 0 ; i < N; i++) {
- *(THREAD_ADDRESS(base,N,i)) = constant;
- }
-}
-//}}}
-__host__ uint gpuVerifyConstant(const uint nBlocks,const uint nThreads,uint* base,uint N,const uint constant,uint* blockErrorCount,uint* errorCounts) { //{{{
- // Given device arrays base (tested memory) and blockErrorCount (nBlocks uints in length of temp space)
-
- deviceVerifyConstant<<<nBlocks,nThreads,sizeof(uint)*nThreads>>>(base,N,constant,blockErrorCount);
- CHECK_LAUNCH_ERROR();
- SOFTWAIT();
- CHECK_LAUNCH_ERROR();
-
- cudaMemcpy(errorCounts,blockErrorCount,sizeof(uint)*nBlocks,cudaMemcpyDeviceToHost);
-
- // Sum-reduce block error counts on the host - it's only order of 1k numbers.
- uint totalErrors = 0;
- for (uint i = 0; i < nBlocks; i++) {
- totalErrors += errorCounts[i];
- }
- return totalErrors;
-}
-
-__global__ void deviceVerifyConstant(uint* base,uint N,const uint constant,uint* blockErrorCount) {
- // Verifies memory at base to make sure it has a constant pattern
- // Sums number of errors found in block and stores error count into blockErrorCount[blockIdx.x]
- // Sum-reduce this array afterwards to get total error count over tested region
- // Uses 4*blockDim.x bytes of shared memory
-
- extern __shared__ uint threadErrorCount[];
- threadErrorCount[threadIdx.x] = 0;
-
- for (uint i = 0; i < N; i++) {
- //if ( *(THREAD_ADDRESS(base,N,i)) != constant ) threadErrorCount[threadIdx.x]++;
- threadErrorCount[threadIdx.x] += BITSDIFF(*(THREAD_ADDRESS(base,N,i)),constant);
- }
- // Parallel-reduce error counts over threads in block
- for (uint stride = blockDim.x>>1; stride > 0; stride >>= 1) {
- __syncthreads();
- if (threadIdx.x < stride)
- threadErrorCount[threadIdx.x] += threadErrorCount[threadIdx.x + stride];
- }
- __syncthreads();
-
- if (threadIdx.x == 0)
- blockErrorCount[blockIdx.x] = threadErrorCount[0];
-
- return;
-}
-//}}}
-
- __host__ void cpuWriteConstant(const uint nBlocks,const uint nThreads,uint* base,uint N,const uint constant) { //{{{
- dim3 blockDim(nThreads,0,0);
- dim3 threadIdx(0,0,0);
- dim3 blockIdx(0,0,0);
- for (blockIdx.x = 0; blockIdx.x < nBlocks; blockIdx.x++) {
- for (uint i = 0; i < N; i++) {
- for (threadIdx.x = 0; threadIdx.x < blockDim.x; threadIdx.x++) {
- *(THREAD_ADDRESS(base,N,i)) = constant;
- }
- }
- }
-}
-//}}}
-__host__ uint cpuVerifyConstant(const uint nBlocks,const uint nThreads,uint* base,uint N,const uint constant) { //{{{
- dim3 blockDim(nThreads,0,0);
- dim3 threadIdx(0,0,0);
- dim3 blockIdx(0,0,0);
- uint errorCount = 0;
- for (blockIdx.x = 0; blockIdx.x < nBlocks; blockIdx.x++) {
- for (uint i = 0; i < N; i++) {
- for (threadIdx.x = 0; threadIdx.x < blockDim.x; threadIdx.x++) {
- if (*(THREAD_ADDRESS(base,N,i)) != constant) errorCount++;
- }
- }
- }
- return errorCount;
-}
-//}}}
-//}}}
-
-// Logic test
-// Idea: Run a varying number of iterations (k*N) of a short-period (per=N) LCG that returns to zero (or F's) quickly {{{
-// Store only the result of the last iteration
-// Compare output to the desired constant
-// Compare results between varying k - memory error rate for a given pattern should be constant,
-// so variation should be due to logic errors in loop count
-__host__ uint gpuShortLCG0(const uint nBlocks,const uint nThreads,uint* base,uint N,const uint repeats,const int period,uint* blockErrorCounts,uint* errorCounts) { //{{{
- deviceShortLCG0<<<nBlocks,nThreads>>>(base,N,repeats,period);
- CHECK_LAUNCH_ERROR();
- SOFTWAIT();
- CHECK_LAUNCH_ERROR();
- return gpuVerifyConstant(nBlocks,nThreads,base,N,0,blockErrorCounts,errorCounts);
-} //}}}
-
-__host__ uint gpuShortLCG0Shmem(const uint nBlocks,const uint nThreads,uint* base,uint N,const uint repeats,const int period,uint* blockErrorCounts,uint* errorCounts) { //{{{
- deviceShortLCG0Shmem<<<nBlocks,nThreads,sizeof(uint)*nThreads>>>(base,N,repeats,period);
- CHECK_LAUNCH_ERROR();
- SOFTWAIT();
- CHECK_LAUNCH_ERROR();
- return gpuVerifyConstant(nBlocks,nThreads,base,N,0,blockErrorCounts,errorCounts);
-} //}}}
-
-// Put the LCG loop into a macro so we don't repeat code between versions of logic tester.
-// The paired XOR adds diversity to the instruction stream, and is not reduced to a NOT
-// as a single XOR is (verified with decuda).
-// {{{
-#if defined (LINUX) || defined(OSX)
-#define LCGLOOP(var,repeats,period,a,c) for (uint rep = 0; rep < repeats; rep++) {\
- (var) = ~(var);\
- _Pragma("unroll 1")\
- for (uint iter = 0; iter < period; iter++) {\
- (var) = ~(var);\
- (var) = (a)*(var)+(c);\
- (var) ^= 0xFFFFFFF0;\
- (var) ^= 0xF;\
- }\
- (var) = ~(var);\
-}
-#elif defined (WINDOWS) || defined (WINNV)
-#define LCGLOOP(var,repeats,period,a,c) for (uint rep = 0; rep < repeats; rep++) {\
- (var) = ~(var);\
- __pragma("unroll 1")\
- for (uint iter = 0; iter < period; iter++) {\
- (var) = ~(var);\
- (var) = (a)*(var)+(c);\
- (var) ^= 0xFFFFFFF0;\
- (var) ^= 0xF;\
- }\
- (var) = ~(var);\
-}
-#endif
-//}}}
-
-__global__ void deviceShortLCG0(uint* base,uint N,uint repeats,const int period) { //{{{
- // Pick a different block for different LCG lengths
- // Short periods are useful if LCG goes inside for i in 0..N loop
- int a,c;
- switch (period) {
- case 1024: a = 0x0fbfffff; c = 0x3bf75696; break;
- case 512: a = 0x61c8647f; c = 0x2b3e0000; break;
- case 256: a = 0x7161ac7f; c = 0x43840000; break;
- case 128: a = 0x0432b47f; c = 0x1ce80000; break;
- case 2048: a = 0x763fffff; c = 0x4769466f; break;
- default: a = 0; c = 0; break;
- }
-
- uint value = 0;
- LCGLOOP(value,repeats,period,a,c)
-
- for (uint i = 0 ; i < N; i++) {
- *(THREAD_ADDRESS(base,N,i)) = value;
- }
-} //}}}
-// _shmem version uses shared memory to store inter-iteration values
-// is more sensitive to shared memory errors from (eg) shader overclocking
-__global__ void deviceShortLCG0Shmem(uint* base,uint N,uint repeats,const int period) { //{{{
- // Pick a different block for different LCG lengths
- // Short periods are useful if LCG goes inside for i in 0..N loop
- int a,c;
- extern __shared__ uint shmem[];
- switch (period) {
- case 1024: a = 0x0fbfffff; c = 0x3bf75696; break;
- case 512: a = 0x61c8647f; c = 0x2b3e0000; break;
- case 256: a = 0x7161ac7f; c = 0x43840000; break;
- case 128: a = 0x0432b47f; c = 0x1ce80000; break;
- case 2048: a = 0x763fffff; c = 0x4769466f; break;
- default: a = 0; c = 0; break;
- }
- shmem[threadIdx.x] = 0;
- LCGLOOP(shmem[threadIdx.x],repeats,period,a,c)
-
- for (uint i = 0 ; i < N; i++) {
- *(THREAD_ADDRESS(base,N,i)) = shmem[threadIdx.x];
-
- }
-} //}}} //}}}
-
-
-// Memtest86 Test 2: tseq=0,4
-__host__ uint gpuMovingInversionsOnesZeros(const uint nBlocks,const uint nThreads,uint* base,uint N,uint* blockErrorCounts,uint* errorCounts) { //{{{
-
- uint errorCount;
- gpuWriteConstant(nBlocks,nThreads,base,N,0xFFFFFFFF);
- CHECK_LAUNCH_ERROR();
- SOFTWAIT();
- CHECK_LAUNCH_ERROR();
-
- errorCount = gpuVerifyConstant(nBlocks,nThreads,base,N,0xFFFFFFFF,blockErrorCounts,errorCounts);
- CHECK_LAUNCH_ERROR();
-
- gpuWriteConstant(nBlocks,nThreads,base,N,0x0);
- CHECK_LAUNCH_ERROR();
- SOFTWAIT();
- CHECK_LAUNCH_ERROR();
-
- errorCount += gpuVerifyConstant(nBlocks,nThreads,base,N,0x0,blockErrorCounts,errorCounts);
- CHECK_LAUNCH_ERROR();
- return errorCount;
-} //}}}
-
-// Memtest86 Test 3: tseq=1
-__host__ uint gpuWalking8BitM86(const uint nBlocks,const uint nThreads,uint* base,uint N,uint shift,uint* blockErrorCounts,uint* errorCounts) { //{{{
- // Performs the Memtest86 variation on the walking 8-bit pattern, where the same shifted pattern is
- // written into each 32-bit word in memory, verified, and its complement written and verified
- shift &= 0x7;
- uint pattern = 1 << shift;
- pattern = pattern | (pattern << 8) | (pattern << 16) | (pattern << 24);
-
- uint errorCount;
- gpuWriteConstant(nBlocks,nThreads,base,N,pattern);
- CHECK_LAUNCH_ERROR();
- SOFTWAIT();
- CHECK_LAUNCH_ERROR();
-
- errorCount = gpuVerifyConstant(nBlocks,nThreads,base,N,pattern,blockErrorCounts,errorCounts);
- CHECK_LAUNCH_ERROR();
-
- pattern = ~pattern;
- gpuWriteConstant(nBlocks,nThreads,base,N,pattern);
- CHECK_LAUNCH_ERROR();
- SOFTWAIT();
- CHECK_LAUNCH_ERROR();
-
- errorCount += gpuVerifyConstant(nBlocks,nThreads,base,N,pattern,blockErrorCounts,errorCounts);
- CHECK_LAUNCH_ERROR();
- return errorCount;
-} //}}}
-__host__ uint cpuWalking8BitM86(const uint nBlocks,const uint nThreads,uint* base,uint N,uint shift) { //{{{
- // Performs the Memtest86 variation on the walking 8-bit pattern, where the same shifted pattern is
- // written into each 32-bit word in memory, verified, and its complement written and verified
- shift &= 0x7;
- uint pattern = 1 << shift;
- pattern = pattern | (pattern << 8) | (pattern << 16) | (pattern << 24);
-
- uint errorCount;
- cpuWriteConstant(nBlocks,nThreads,base,N,pattern);
- errorCount = cpuVerifyConstant(nBlocks,nThreads,base,N,pattern);
-
- pattern = ~pattern;
- cpuWriteConstant(nBlocks,nThreads,base,N,pattern);
- errorCount += cpuVerifyConstant(nBlocks,nThreads,base,N,pattern);
-
- return errorCount;
-} //}}}
-__host__ uint gpuWalking8Bit(const uint nBlocks,const uint nThreads,uint* base,uint N,bool ones,uint shift,uint* blockErrorCount,uint* errorCounts) { //{{{
- // Implements one iteration of true walking 8-bit ones/zeros test
- uint patterns[2]={0x0,0x0};
-
- // Build the walking-ones paired pattern of 8-bits with the given shift
- shift &= 0x7;
- uint bits = 0x1 << shift;
- for (uint i = 0; i < 4; i++) {
- patterns[0] = (patterns[0] << 8) | bits;
- bits = (bits == 0x80) ? 0x01 : bits<<1;
- }
- for (uint i = 0; i < 4; i++) {
- patterns[1] = (patterns[1] << 8) | bits;
- bits = (bits == 0x80) ? 0x01 : bits<<1;
- }
-
- if (!ones) {
- patterns[0] = ~patterns[0];
- patterns[1] = ~patterns[1];
- }
-
- //printf("Host Patterns: %08x %08x\n",patterns[0],patterns[1]);
- deviceWritePairedConstants<<<nBlocks,nThreads>>>(base,N,patterns[0],patterns[1]);
- CHECK_LAUNCH_ERROR();
- SOFTWAIT();
- CHECK_LAUNCH_ERROR();
- //cudaMemcpy(errorCounts,base,sizeof(uint)*nBlocks,cudaMemcpyDeviceToHost);
- //printf("First few words in tested RAM: %08x %08x %08x %08x %08x %08x\n",errorCounts[0],errorCounts[1],errorCounts[2],errorCounts[3],errorCounts[4],errorCounts[5]);
- // Given device arrays base (tested memory) and blockErrorCount (nBlocks uints in length of temp space)
- deviceVerifyPairedConstants<<<nBlocks,nThreads,sizeof(uint)*nThreads>>>(base,N,patterns[0],patterns[1],blockErrorCount);
- CHECK_LAUNCH_ERROR();
- SOFTWAIT();
- CHECK_LAUNCH_ERROR();
- //if (cudaGetLastError() != cudaSuccess) {
- // return 0xFFFFFFFF; // -1
- //}
- //uint errorCounts[nBlocks];
- cudaMemcpy(errorCounts,blockErrorCount,sizeof(uint)*nBlocks,cudaMemcpyDeviceToHost);
-
- // Sum-reduce block error counts on the host - it's only order of 1k numbers.
- uint totalErrors = 0;
- for (uint i = 0; i < nBlocks; i++) {
- totalErrors += errorCounts[i];
- }
- return totalErrors;
-}
-
-__global__ void deviceWritePairedConstants(uint* base,uint N,uint pattern0,uint pattern1) {
- // Writes paired constants to memory, such that each offset that is X mod 2 receives patterns[X]
- // Used for true walking-ones/zeros 8-bit test
- //if (threadIdx.x == 0)
- // printf("Device Patterns Block %u: %08x %08x\n",blockIdx.x,patterns[0],patterns[1]);
- const uint pattern = (threadIdx.x & 0x1) ? pattern1 : pattern0;
- //const uint pattern = patterns[threadIdx.x & 0x1];
- for (uint i = 0 ; i < N; i++) {
- *(THREAD_ADDRESS(base,N,i)) = pattern;
- //*(base+blockIdx.x*N*blockDim.x + i*blockDim.x + threadIdx.x) = 0;
- }
-
-}
-
-__global__ void deviceVerifyPairedConstants(uint* base,uint N,uint pattern0,uint pattern1,uint* blockErrorCount) {
- // Verifies memory at base to make sure it has a correct paired-constant pattern
- // Sums number of errors found in block and stores error count into blockErrorCount[blockIdx.x]
- // Sum-reduce this array afterwards to get total error count over tested region
- // Uses 4*blockDim.x bytes of shared memory
-
- extern __shared__ uint threadErrorCount[];
- threadErrorCount[threadIdx.x] = 0;
- //const uint pattern = patterns[threadIdx.x & 0x1];
- const uint pattern = (threadIdx.x & 0x1) ? pattern1 : pattern0;
-
- for (uint i = 0; i < N; i++) {
- //if ( *(THREAD_ADDRESS(base,N,i)) != pattern ) threadErrorCount[threadIdx.x]++;
- threadErrorCount[threadIdx.x] += BITSDIFF(*(THREAD_ADDRESS(base,N,i)),pattern);
- }
- // Parallel-reduce error counts over threads in block
- for (uint stride = blockDim.x>>1; stride > 0; stride >>= 1) {
- __syncthreads();
- if (threadIdx.x < stride)
- threadErrorCount[threadIdx.x] += threadErrorCount[threadIdx.x + stride];
- }
- __syncthreads();
-
- if (threadIdx.x == 0)
- blockErrorCount[blockIdx.x] = threadErrorCount[0];
-
- return;
-}
-//}}}
-
-// Memtest86 Test 4: tseq=10
-__host__ uint gpuMovingInversionsRandom(const uint nBlocks,const uint nThreads,uint* base,uint N,uint* blockErrorCounts,uint* errorCounts) { //{{{
-
- uint errorCount;
-
- uint pattern = (uint)rand();
- gpuWriteConstant(nBlocks,nThreads,base,N,pattern);
- CHECK_LAUNCH_ERROR();
- SOFTWAIT();
- CHECK_LAUNCH_ERROR();
-
- errorCount = gpuVerifyConstant(nBlocks,nThreads,base,N,pattern,blockErrorCounts,errorCounts);
- CHECK_LAUNCH_ERROR();
-
- pattern = ~pattern;
- gpuWriteConstant(nBlocks,nThreads,base,N,pattern);
- CHECK_LAUNCH_ERROR();
- SOFTWAIT();
- CHECK_LAUNCH_ERROR();
-
- errorCount += gpuVerifyConstant(nBlocks,nThreads,base,N,pattern,blockErrorCounts,errorCounts);
- CHECK_LAUNCH_ERROR();
- return errorCount;
-} //}}}
-
-// Memtest86 Test 6: tseq=2
-__host__ uint gpuWalking32Bit(const uint nBlocks,const uint nThreads,uint* base,uint N,bool ones,uint shift,uint* blockErrorCount,uint* errorCounts) { //{{{
- // Given device arrays base (tested memory) and blockErrorCount (nBlocks uints in length of temp space)
- // Does one iteration of the walking-{ones/zeros} 32-bit test paralleling Memtest
- // With the starting pattern 1<<shift
- // NUMBER OF THREADS SHOULD BE A MULTIPLE OF 32
-
- deviceWriteWalking32Bit<<<nBlocks,nThreads>>>(base,N,ones,shift);
- CHECK_LAUNCH_ERROR();
- SOFTWAIT();
- CHECK_LAUNCH_ERROR();
-
- deviceVerifyWalking32Bit<<<nBlocks,nThreads,sizeof(uint)*nThreads>>>(base,N,ones,shift,blockErrorCount);
- CHECK_LAUNCH_ERROR();
- SOFTWAIT();
- CHECK_LAUNCH_ERROR();
-
- cudaMemcpy(errorCounts,blockErrorCount,sizeof(uint)*nBlocks,cudaMemcpyDeviceToHost);
-
- // Sum-reduce block error counts on the host - it's only order of 1k numbers.
- uint totalErrors = 0;
- for (uint i = 0; i < nBlocks; i++) {
- totalErrors += errorCounts[i];
- }
- return totalErrors;
-
-}
-
-__global__ void deviceWriteWalking32Bit(uint* base,uint N,bool ones,uint shift) {
- // Writes one iteration of the walking-{ones/zeros} 32-bit pattern to gpu memory
-
- // Want to write in a 1 << (offset from base + shift % 32)
- // Since thread indices are aligned with base, this reduces to
- // 1 << ((threadIdx.x+shift) & 0x1f)
- // With conditional inversion for walking zeros
- uint pattern = 1 << ((threadIdx.x + shift) & 0x1f);
- pattern = ones ? pattern : ~pattern;
-
- for (uint i = 0; i < N; i++) {
- *(THREAD_ADDRESS(base,N,i)) = pattern;
- }
-}
-
-__global__ void deviceVerifyWalking32Bit(uint* base,uint N,bool ones,uint shift,uint* blockErrorCount) {
- // Verifies memory at base to make sure it has a constant pattern
- // Sums number of errors found in block and stores error count into blockErrorCount[blockIdx.x]
- // Sum-reduce this array afterwards to get total error count over tested region
- // Uses 4*blockDim.x bytes of shared memory
-
- extern __shared__ uint threadErrorCount[];
- threadErrorCount[threadIdx.x] = 0;
-
- uint pattern = 1 << ((threadIdx.x + shift) & 0x1f);
- pattern = ones ? pattern : ~pattern;
-
- for (uint i = 0; i < N; i++) {
- //if ( *(THREAD_ADDRESS(base,N,i)) != pattern ) threadErrorCount[threadIdx.x]++;
- threadErrorCount[threadIdx.x] += BITSDIFF(*(THREAD_ADDRESS(base,N,i)),pattern);
- }
- // Parallel-reduce error counts over threads in block
- for (uint stride = blockDim.x>>1; stride > 0; stride >>= 1) {
- __syncthreads();
- if (threadIdx.x < stride)
- threadErrorCount[threadIdx.x] += threadErrorCount[threadIdx.x + stride];
- }
- __syncthreads();
-
- if (threadIdx.x == 0)
- blockErrorCount[blockIdx.x] = threadErrorCount[0];
-
- return;
-}
-//}}}
-
-// Memtest86 Test 7: tseq=9
-__host__ uint gpuRandomBlocks(const uint nBlocks,const uint nThreads,uint* base,uint N,uint seed,uint* blockErrorCount,uint* errorCounts) { //{{{ {{{
- // Writes random numbers into memory and verifies pattern
- //uint errorCounts[nBlocks];
-
- deviceWriteRandomBlocks<<<nBlocks,nThreads,4*nThreads>>>(base,N,seed);
- CHECK_LAUNCH_ERROR();
- SOFTWAIT();
- CHECK_LAUNCH_ERROR();
-
- //cudaMemcpy(errorCounts,base,sizeof(uint)*nBlocks,cudaMemcpyDeviceToHost);
- //printf("First few words in tested RAM: %08x %08x %08x %08x %08x %08x\n",errorCounts[0],errorCounts[1],errorCounts[2],errorCounts[3],errorCounts[4],errorCounts[5]);
-
- deviceVerifyRandomBlocks<<<nBlocks,nThreads,12*nThreads>>>(base,N,seed,blockErrorCount);
- CHECK_LAUNCH_ERROR();
- SOFTWAIT();
- CHECK_LAUNCH_ERROR();
-
-
- cudaMemcpy(errorCounts,blockErrorCount,sizeof(uint)*nBlocks,cudaMemcpyDeviceToHost);
-
- // Sum-reduce block error counts on the host - it's only order of 1k numbers.
- uint totalErrors = 0;
- for (uint i = 0; i < nBlocks; i++) {
- totalErrors += errorCounts[i];
- }
- return totalErrors;
-}
-//}}}
-//
-// Math functions modulo the Mersenne prime 2^31 -1 {{{
-__device__ void deviceMul3131 (uint v1, uint v2,uint& LO, uint& HI)
-{
- // Given v1, v2 < 2^31
- // Emulate a 31-bit integer multiply by doing instead a 32-bit multiply into LO and HI
- // And shifting bits around to make it look right.
- LO = v1*v2;
- HI = __umulhi(v1,v2);
- HI <<= 1;
- HI |= (LO & 0x80000000) >> 31;
- LO &= 0x7FFFFFFF;
-
-}
-
-__device__ uint deviceModMP31(uint LO,uint HI) {
- // Modulo a 62-bit number HI<<31 + LO, mod 2^31-1
- // Encyclopedia of Cryptography and Security By Henk C. A. van Tilborg
- // page 381, Mersenne Primes
- uint sum = LO+HI;
- if (sum >= 0x80000000) {
- // If a+b > 2^31, then high bit will be set
- return sum - 0x80000000 + 1;
- } else {
- return sum;
- }
-}
-__device__ uint deviceMulMP31(uint a,uint b) {
- // Multiplies a pair of 31-bit integers a and b mod the Mersenne prime 2^31-1
- // Takes result through a 62-bit intermediate
- uint LO,HI;
- deviceMul3131(a,b,LO,HI);
- return deviceModMP31(LO,HI);
-}
-
-__device__ uint deviceExpoModMP31(uint base,uint exponent) {
- uint result = 1;
- while (exponent > 0) {
- if (exponent & 1) {
- result = deviceMulMP31(result,base);
- }
- exponent >>= 1;
- base = deviceMulMP31(base,base);
- }
- return result;
-}
-//}}}
-// deviceRan0p: Parallelized closed-form version of NR's ran0 {{{
-__device__ uint deviceRan0p(int seed,int n) { //
- uint an = deviceExpoModMP31(16807,n+1);
- return deviceMulMP31(an,seed);
-}
-//}}}
-// deviceIrbit2: random bit generation, from NR {{{
-__device__ int deviceIrbit2(uint& seed) {
- const uint IB1 = 1;
- const uint IB2 = 2;
- const uint IB5 = 16;
- const uint IB18 = 131072;
- const uint MASK = IB1+IB2+IB5;
- if (seed & IB18) {
- seed = ((seed ^ MASK) << 1) | IB1;
- return 1;
- } else {
- seed <<= 1;
- return 0;
- }
-}
-//}}}
-__global__ void deviceWriteRandomBlocks(uint* base,uint N,int seed) { //{{{
- // Requires 4*nThreads bytes of shared memory
- extern __shared__ uint randomBlock[];
-
- // Make sure seed is not zero.
- if (seed == 0) seed = 123459876+blockIdx.x;
- uint bitSeed = deviceRan0p(seed + threadIdx.x,threadIdx.x);
-
- for (uint i=0; i < N; i++) {
- // Generate a block of random numbers in parallel using closed-form expression for ran0
- // OR in a random bit because Ran0 will never have the high bit set
- randomBlock[threadIdx.x] = deviceRan0p(seed,threadIdx.x) | (deviceIrbit2(bitSeed) << 31);
- __syncthreads();
-
- // Set the seed for the next round to the last number calculated in this round
- seed = randomBlock[blockDim.x-1];
-
- // Blit shmem block out to global memory
- *(THREAD_ADDRESS(base,N,i)) = randomBlock[threadIdx.x];
- }
-}
-//}}}
-__global__ void deviceVerifyRandomBlocks(uint* base,uint N,int seed,uint* blockErrorCount) { //{{{
- // Verifies memory at base to make sure it has a correct random pattern given the seed
- // Sums number of errors found in block and stores error count into blockErrorCount[blockIdx.x]
- // Sum-reduce this array afterwards to get total error count over tested region
- // Uses 12*blockDim.x bytes of shared memory
-
- extern __shared__ uint shmem[];
- uint* threadErrorCount = shmem;
- uint* randomBlock = shmem + blockDim.x;
- // Put these into shmem to cut register count
- uint* bitSeeds = randomBlock + blockDim.x;
-
- threadErrorCount[threadIdx.x] = 0;
-
- // Make sure seed is not zero.
- if (seed == 0) seed = 123459876+blockIdx.x;
- //uint bitSeed = deviceRan0p(seed + threadIdx.x,threadIdx.x);
- bitSeeds[threadIdx.x] = deviceRan0p(seed + threadIdx.x,threadIdx.x);
-
- for (uint i = 0; i < N; i++) {
- // Generate a block of random numbers in parallel using closed-form expression for ran0
- // OR in a random bit because Ran0 will never have the high bit set
- //randomBlock[threadIdx.x] = deviceRan0p(seed,threadIdx.x) | (deviceIrbit2(bitSeed) << 31);
- randomBlock[threadIdx.x] = deviceRan0p(seed,threadIdx.x) | (deviceIrbit2(bitSeeds[threadIdx.x]) << 31);
- __syncthreads();
-
- // Set the seed for the next round to the last number calculated in this round
- seed = randomBlock[blockDim.x-1];
-
- //if ( randomBlock[threadIdx.x] != *(THREAD_ADDRESS(base,N,i))) threadErrorCount[threadIdx.x]++;
- threadErrorCount[threadIdx.x] += BITSDIFF(*(THREAD_ADDRESS(base,N,i)),randomBlock[threadIdx.x]);
-
- }
-
- // Parallel-reduce error counts over threads in block
- for (uint stride = blockDim.x>>1; stride > 0; stride >>= 1) {
- __syncthreads();
- if (threadIdx.x < stride)
- threadErrorCount[threadIdx.x] += threadErrorCount[threadIdx.x + stride];
- }
- __syncthreads();
-
- if (threadIdx.x == 0)
- blockErrorCount[blockIdx.x] = threadErrorCount[0];
-
- return;
-}
-//}}}
-//}}}
-
-// Memtest86 Test 8: tseq=3 (M86 uses modulus = 20)
-__host__ uint gpuModuloX(const uint nBlocks,const uint nThreads,uint* base,const uint N,uint shift,uint pattern1,const uint modulus,const uint iters,
- uint* blockErrorCount,uint* errorCounts) { //{{{
- // Given device arrays base (tested memory) and blockErrorCount (nBlocks uints in length of temp space)
- // Given a shift, modulus, pattern to test and number of overwrite iterations
- // Performs Modulo-X test on memory
-
- //uint errorCounts[nBlocks];
- uint totalErrors = 0;
- shift %= modulus;
-
- // Test both the given pattern and its inverse
- for (uint i = 0; i < 2; i++, pattern1 = ~pattern1) {
- deviceWritePairedModulo<<<nBlocks,nThreads>>>(base,N,shift,pattern1,~pattern1,modulus,iters);
- CHECK_LAUNCH_ERROR();
- SOFTWAIT();
- CHECK_LAUNCH_ERROR();
-
- deviceVerifyPairedModulo<<<nBlocks,nThreads,sizeof(uint)*nThreads>>>(base,N,shift,pattern1,modulus,blockErrorCount);
- CHECK_LAUNCH_ERROR();
- SOFTWAIT();
- CHECK_LAUNCH_ERROR();
-
- cudaMemcpy(errorCounts,blockErrorCount,sizeof(uint)*nBlocks,cudaMemcpyDeviceToHost);
-
- // Sum-reduce block error counts on the host - it's only order of 1k numbers.
- for (uint i = 0; i < nBlocks; i++) {
- totalErrors += errorCounts[i];
- }
- }
- return totalErrors;
-}
-
-__global__ void deviceWritePairedModulo(uint* base,const uint N,const uint shift,const uint pattern1,const uint pattern2,const uint modulus,const uint iters) {
- // First writes pattern1 into every offset that is 0 mod modulus
- // Next (iters times) writes ~pattern1 into every other address
- uint offset;
- for (uint i = 0 ; i < N; i++) {
- offset = THREAD_OFFSET(N,i);
- if ((offset % modulus) == shift) *(base+offset) = pattern1;
- }
- __syncthreads();
- for (uint j = 0; j < iters; j++) {
- for (uint i = 0 ; i < N; i++) {
- offset = THREAD_OFFSET(N,i);
- if ((offset % modulus) != shift) *(base+offset) = pattern2;
- }
- }
-}
-__global__ void deviceVerifyPairedModulo(uint* base,uint N,const uint shift,const uint pattern1,const uint modulus,uint* blockErrorCount) {
- // Verifies that memory at each (offset mod modulus == shift) stores pattern1
- // Sums number of errors found in block and stores error count into blockErrorCount[blockIdx.x]
- // Sum-reduce this array afterwards to get total error count over tested region
- // Uses 4*blockDim.x bytes of shared memory
-
- extern __shared__ uint threadErrorCount[];
- threadErrorCount[threadIdx.x] = 0;
- uint offset;
-
- for (uint i = 0; i < N; i++) {
- offset = THREAD_OFFSET(N,i);
- //if (((offset % modulus) == shift) && (*(base+offset) != pattern1)) threadErrorCount[threadIdx.x]++;
- if ((offset % modulus) == shift) threadErrorCount[threadIdx.x] += BITSDIFF(*(base+offset),pattern1);
- }
- // Parallel-reduce error counts over threads in block
- for (uint stride = blockDim.x>>1; stride > 0; stride >>= 1) {
- __syncthreads();
- if (threadIdx.x < stride)
- threadErrorCount[threadIdx.x] += threadErrorCount[threadIdx.x + stride];
- }
- __syncthreads();
-
- if (threadIdx.x == 0)
- blockErrorCount[blockIdx.x] = threadErrorCount[0];
-
- return;
-}
-//}}}