src/gromacs/gmxlib/gpu_utils/memtestG80_core.cu

   1 /*
   2  * memtestG80_core.cu
   3  * MemtestG80 core memory test functions and OOP interface to tester.
   4  *
   5  * Author: Imran Haque, 2009
   6  * Copyright 2009, Stanford University
   7  *
   8  * This file is licensed under the terms of the LGPL. Please see
   9  * the COPYING file in the accompanying source distribution for
  10  * full license terms.
  11  *
  12  */
  13
  14  /*
  15   * CUDA grid layout: Linear in blocks and threads.
  16   * Intended usage = 1k blocks, 512 t/blk, with N words (iterations) per thread
  17   *     -> 2*N MiB tested per grid
  18   * thread address at iteration i = base + blockIdx.x * N * blockDim.x + i*blockDim.x + threadIdx.x
  19   *
  20   */
  21
  22 // Naming convention: gpuXXX and cpuXXX functions are user-accessible; deviceXXX functions are internal
  23 //                    gpuXXX functions execute a particular test on a block of GPU memory
  24 //                    cpuXXX "          "      "   "         "    " "  "    "  CPU "
  25
  26 #define THREAD_ADDRESS(base,N,i) (base + blockIdx.x * N * blockDim.x + i * blockDim.x + threadIdx.x)
  27 #define THREAD_OFFSET(N,i) (blockIdx.x * N * blockDim.x + i * blockDim.x + threadIdx.x)
  28 #define BITSDIFF(x,y) __popc((x) ^ (y))
  29
  30
  31 #include "memtestG80_core.h"
  32
  33 #include <stdio.h>
  34
  35
  36
  37
  38 void memtestState::deallocate() {
  39                 if (allocated) {
  40                         cudaFree(devTestMem);
  41                         cudaFree(devTempMem);
  42                         free(hostTempMem);
  43                         devTestMem = NULL;
  44                         devTempMem = NULL;
  45                         hostTempMem = NULL;
  46                         allocated = false;
  47                 }
  48         initTime = 0;
  49         }
  50
  51 uint memtestState::allocate(uint mbToTest) {
  52                 deallocate();
  53
  54         initTime = getTimeMilliseconds();
  55
  56         // Round up to nearest 2MiB
  57                 if (mbToTest % 2) mbToTest++;
  58
  59                 megsToTest = mbToTest;
  60                 loopIters = megsToTest/2;
  61
  62                 if (megsToTest == 0) return 0;
  63
  64                 try {
  65                         if (cudaMalloc((void**)&devTestMem,megsToTest*1048576UL) != cudaSuccess) throw 1;
  66                         if (cudaMalloc((void**)&devTempMem,sizeof(uint)*nBlocks) != cudaSuccess) throw 2;
  67                         if ( (hostTempMem = (uint*)malloc(sizeof(uint)*nBlocks)) == NULL) throw 3;
  68                 } catch (...) {
  69             // Clear CUDA error flag for outside world
  70             cudaGetLastError();
  71                         if (devTempMem) {
  72                                 cudaFree(devTempMem);
  73                                 devTempMem = NULL;
  74                         }
  75                         if (devTestMem) {
  76                                 cudaFree(devTestMem);
  77                                 devTestMem = NULL;
  78                         }
  79                         if (hostTempMem) {
  80                                 free(hostTempMem);
  81                                 hostTempMem = NULL;
  82                         }
  83                         return 0;
  84                 }
  85                 allocated = true;
  86                 return megsToTest;
  87         }
  88 bool memtestState::gpuMemoryBandwidth(double& bandwidth,uint mbToTest,uint iters) {
  89     if (!allocated || megsToTest < 2*mbToTest) return false;
  90     bandwidth = ::gpuMemoryBandwidth(devTestMem,devTestMem+mbToTest*1048576/4,mbToTest,iters);
  91     return cudaGetLastError() == cudaSuccess;
  92 }
  93 bool memtestState::gpuWriteConstant(const uint constant) const {
  94         if (!allocated) return false;
  95         ::gpuWriteConstant(nBlocks,nThreads,devTestMem,loopIters,constant);
  96         return cudaGetLastError() == cudaSuccess;
  97 }
  98
  99 bool memtestState::gpuVerifyConstant(uint& errorCount,const uint constant) const {
 100         if (!allocated) return false;
 101         errorCount = ::gpuVerifyConstant(nBlocks,nThreads,devTestMem,loopIters,constant,devTempMem,hostTempMem);
 102         return ((cudaGetLastError() == cudaSuccess) && (errorCount != 0xFFFFFFFF) && (errorCount != 0xFFFFFFFE));
 103 }
 104
 105 bool memtestState::gpuShortLCG0(uint& errorCount,const uint repeats) const {
 106         if (!allocated) return false;
 107         errorCount = ::gpuShortLCG0(nBlocks,nThreads,devTestMem,loopIters,repeats,lcgPeriod,devTempMem,hostTempMem);
 108         return ((cudaGetLastError() == cudaSuccess) && (errorCount != 0xFFFFFFFF) && (errorCount != 0xFFFFFFFE));
 109 }
 110 bool memtestState::gpuShortLCG0Shmem(uint& errorCount,const uint repeats) const {
 111         if (!allocated) return false;
 112         errorCount = ::gpuShortLCG0Shmem(nBlocks,nThreads,devTestMem,loopIters,repeats,lcgPeriod,devTempMem,hostTempMem);
 113         return ((cudaGetLastError() == cudaSuccess) && (errorCount != 0xFFFFFFFF) && (errorCount != 0xFFFFFFFE));
 114 }
 115 bool memtestState::gpuMovingInversionsOnesZeros(uint& errorCount) const {
 116         if (!allocated) return false;
 117         errorCount = ::gpuMovingInversionsOnesZeros(nBlocks,nThreads,devTestMem,loopIters,devTempMem,hostTempMem);
 118         return ((cudaGetLastError() == cudaSuccess) && (errorCount != 0xFFFFFFFF) && (errorCount != 0xFFFFFFFE));
 119 }
 120 bool memtestState::gpuWalking8BitM86(uint& errorCount,const uint shift) const {
 121         if (!allocated) return false;
 122         errorCount = ::gpuWalking8BitM86(nBlocks,nThreads,devTestMem,loopIters,shift,devTempMem,hostTempMem);
 123         return ((cudaGetLastError() == cudaSuccess) && (errorCount != 0xFFFFFFFF) && (errorCount != 0xFFFFFFFE));
 124 }
 125 bool memtestState::gpuWalking8Bit(uint& errorCount,const bool ones,const uint shift) const {
 126         if (!allocated) return false;
 127         errorCount = ::gpuWalking8Bit(nBlocks,nThreads,devTestMem,loopIters,ones,shift,devTempMem,hostTempMem);
 128         return ((cudaGetLastError() == cudaSuccess) && (errorCount != 0xFFFFFFFF) && (errorCount != 0xFFFFFFFE));
 129 }
 130 bool memtestState::gpuMovingInversionsRandom(uint& errorCount) const {
 131         if (!allocated) return false;
 132         errorCount = ::gpuMovingInversionsRandom(nBlocks,nThreads,devTestMem,loopIters,devTempMem,hostTempMem);
 133         return ((cudaGetLastError() == cudaSuccess) && (errorCount != 0xFFFFFFFF) && (errorCount != 0xFFFFFFFE));
 134 }
 135 bool memtestState::gpuWalking32Bit(uint& errorCount,const bool ones,const uint shift) const {
 136         if (!allocated) return false;
 137         errorCount = ::gpuWalking32Bit(nBlocks,nThreads,devTestMem,loopIters,ones,shift,devTempMem,hostTempMem);
 138         return ((cudaGetLastError() == cudaSuccess) && (errorCount != 0xFFFFFFFF) && (errorCount != 0xFFFFFFFE));
 139 }
 140 bool memtestState::gpuRandomBlocks(uint& errorCount,const uint seed) const {
 141         if (!allocated) return false;
 142         errorCount = ::gpuRandomBlocks(nBlocks,nThreads,devTestMem,loopIters,seed,devTempMem,hostTempMem);
 143         return ((cudaGetLastError() == cudaSuccess) && (errorCount != 0xFFFFFFFF) && (errorCount != 0xFFFFFFFE));
 144 }
 145 bool memtestState::gpuModuloX(uint& errorCount,const uint shift,const uint pattern,const uint modulus,const uint overwriteIters) const {
 146         if (!allocated) return false;
 147         errorCount = ::gpuModuloX(nBlocks,nThreads,devTestMem,loopIters,shift,pattern,modulus,overwriteIters,devTempMem,hostTempMem);
 148         return ((cudaGetLastError() == cudaSuccess) && (errorCount != 0xFFFFFFFF) && (errorCount != 0xFFFFFFFE));
 149 }
 150
 151
 152
 153 __global__ void deviceWriteConstant(uint* base, uint N, const uint constant);
 154 __global__ void deviceVerifyConstant(uint* base,uint N,const uint constant,uint* blockErrorCount);
 155 __global__ void deviceShortLCG0(uint* base,uint N,uint repeats,const int period);
 156 __global__ void deviceShortLCG0Shmem(uint* base,uint N,uint repeats,const int period);
 157 __global__ void deviceWriteRandomBlocks(uint* base,uint N,int seed);
 158 __global__ void deviceVerifyRandomBlocks(uint* base,uint N,int seed,uint* blockErrorCount);
 159 __global__ void deviceWriteWalking32Bit(uint* base,uint N,bool ones,uint shift);
 160 __global__ void deviceVerifyWalking32Bit(uint* base,uint N,bool ones,uint shift,uint* blockErrorCount);
 161 __global__ void deviceWritePairedConstants(uint* base,uint N,uint pattern0,uint pattern1);
 162 __global__ void deviceVerifyPairedConstants(uint* base,uint N,uint pattern0,uint pattern1,uint* blockErrorCount);
 163 __global__ void deviceWritePairedModulo(uint* base,const uint N,const uint shift,const uint pattern1,const uint pattern2,const uint modulus,const uint iters);
 164 __global__ void deviceVerifyPairedModulo(uint* base,uint N,const uint shift,const uint pattern1,const uint modulus,uint* blockErrorCount);
 165
 166
 167 // Utility function to measure memory bandwidth
 168 __host__ double gpuMemoryBandwidth(uint* src,uint* dst,uint mbToTest,uint iters) {
 169        uint start = getTimeMilliseconds();
 170            for (uint i = 0; i < iters; i++) {
 171            cudaMemcpy(dst,src,mbToTest*1048576,cudaMemcpyDeviceToDevice);
 172        }
 173        //D-to-D memory copies are non-blocking, so sync to get correct timing
 174        cudaThreadSynchronize();
 175        //SOFTWAIT();
 176        uint end = getTimeMilliseconds();
 177
 178        // Calculate bandwidth in MiB/s
 179            // Multiply by 2 since we are reading and writing to the same memory
 180        double bw = 2.0*((double)mbToTest*iters)/((end-start)/1000.0);
 181            return bw;
 182 }
 183
 184 // Utility functions to write/verify pure constants in memory, CPU/GPU {{{
 185 __host__ void gpuWriteConstant(const uint nBlocks,const uint nThreads,uint* base,uint N,const uint constant) { //{{{
 186     deviceWriteConstant<<<nBlocks,nThreads>>>(base,N,constant);
 187 }
 188
 189 __global__ void deviceWriteConstant(uint* base, uint N, const uint constant) {
 190     for (uint i = 0 ; i < N; i++) {
 191         *(THREAD_ADDRESS(base,N,i)) = constant;
 192     }
 193 }
 194 //}}}
 195 __host__ uint gpuVerifyConstant(const uint nBlocks,const uint nThreads,uint* base,uint N,const uint constant,uint* blockErrorCount,uint* errorCounts) { //{{{
 196     // Given device arrays base (tested memory) and blockErrorCount (nBlocks uints in length of temp space)
 197
 198         deviceVerifyConstant<<<nBlocks,nThreads,sizeof(uint)*nThreads>>>(base,N,constant,blockErrorCount);
 199         CHECK_LAUNCH_ERROR();
 200     SOFTWAIT();
 201         CHECK_LAUNCH_ERROR();
 202
 203     cudaMemcpy(errorCounts,blockErrorCount,sizeof(uint)*nBlocks,cudaMemcpyDeviceToHost);
 204
 205     // Sum-reduce block error counts on the host - it's only order of 1k numbers.
 206     uint totalErrors = 0;
 207     for (uint i = 0; i < nBlocks; i++) {
 208         totalErrors += errorCounts[i];
 209     }
 210     return totalErrors;
 211 }
 212
 213 __global__ void deviceVerifyConstant(uint* base,uint N,const uint constant,uint* blockErrorCount) {
 214     // Verifies memory at base to make sure it has a constant pattern
 215     // Sums number of errors found in block and stores error count into blockErrorCount[blockIdx.x]
 216     // Sum-reduce this array afterwards to get total error count over tested region
 217     // Uses 4*blockDim.x bytes of shared memory
 218
 219     extern __shared__ uint threadErrorCount[];
 220     threadErrorCount[threadIdx.x] = 0;
 221
 222     for (uint i = 0; i < N; i++) {
 223         //if ( *(THREAD_ADDRESS(base,N,i)) != constant ) threadErrorCount[threadIdx.x]++;
 224         threadErrorCount[threadIdx.x] += BITSDIFF(*(THREAD_ADDRESS(base,N,i)),constant);
 225     }
 226     // Parallel-reduce error counts over threads in block
 227     for (uint stride = blockDim.x>>1; stride > 0; stride >>= 1) {
 228         __syncthreads();
 229         if (threadIdx.x < stride)
 230             threadErrorCount[threadIdx.x] += threadErrorCount[threadIdx.x + stride];
 231     }
 232     __syncthreads();
 233
 234     if (threadIdx.x == 0)
 235         blockErrorCount[blockIdx.x] = threadErrorCount[0];
 236
 237     return;
 238 }
 239 //}}}
 240
 241  __host__ void cpuWriteConstant(const uint nBlocks,const uint nThreads,uint* base,uint N,const uint constant) { //{{{
 242     dim3 blockDim(nThreads,0,0);
 243     dim3 threadIdx(0,0,0);
 244     dim3 blockIdx(0,0,0);
 245     for (blockIdx.x = 0; blockIdx.x < nBlocks; blockIdx.x++) {
 246         for (uint i = 0; i < N; i++) {
 247             for (threadIdx.x = 0; threadIdx.x < blockDim.x; threadIdx.x++) {
 248                 *(THREAD_ADDRESS(base,N,i)) = constant;
 249             }
 250         }
 251     }
 252 }
 253 //}}}
 254 __host__ uint cpuVerifyConstant(const uint nBlocks,const uint nThreads,uint* base,uint N,const uint constant) { //{{{
 255     dim3 blockDim(nThreads,0,0);
 256     dim3 threadIdx(0,0,0);
 257     dim3 blockIdx(0,0,0);
 258     uint errorCount = 0;
 259     for (blockIdx.x = 0; blockIdx.x < nBlocks; blockIdx.x++) {
 260         for (uint i = 0; i < N; i++) {
 261             for (threadIdx.x = 0; threadIdx.x < blockDim.x; threadIdx.x++) {
 262                 if (*(THREAD_ADDRESS(base,N,i)) != constant) errorCount++;
 263             }
 264         }
 265     }
 266     return errorCount;
 267 }
 268 //}}}
 269 //}}}
 270
 271 // Logic test
 272 // Idea: Run a varying number of iterations (k*N) of a short-period (per=N) LCG that returns to zero (or F's) quickly {{{
 273 // Store only the result of the last iteration
 274 // Compare output to the desired constant
 275 // Compare results between varying k - memory error rate for a given pattern should be constant,
 276 //                                     so variation should be due to logic errors in loop count
 277 __host__ uint gpuShortLCG0(const uint nBlocks,const uint nThreads,uint* base,uint N,const uint repeats,const int period,uint* blockErrorCounts,uint* errorCounts) { //{{{
 278     deviceShortLCG0<<<nBlocks,nThreads>>>(base,N,repeats,period);
 279     CHECK_LAUNCH_ERROR();
 280     SOFTWAIT();
 281     CHECK_LAUNCH_ERROR();
 282     return gpuVerifyConstant(nBlocks,nThreads,base,N,0,blockErrorCounts,errorCounts);
 283 } //}}}
 284
 285 __host__ uint gpuShortLCG0Shmem(const uint nBlocks,const uint nThreads,uint* base,uint N,const uint repeats,const int period,uint* blockErrorCounts,uint* errorCounts) { //{{{
 286     deviceShortLCG0Shmem<<<nBlocks,nThreads,sizeof(uint)*nThreads>>>(base,N,repeats,period);
 287     CHECK_LAUNCH_ERROR();
 288     SOFTWAIT();
 289     CHECK_LAUNCH_ERROR();
 290     return gpuVerifyConstant(nBlocks,nThreads,base,N,0,blockErrorCounts,errorCounts);
 291 } //}}}
 292
 293 // Put the LCG loop into a macro so we don't repeat code between versions of logic tester.
 294 // The paired XOR adds diversity to the instruction stream, and is not reduced to a NOT
 295 // as a single XOR is (verified with decuda).
 296 // {{{
 297 #if defined (LINUX) || defined(OSX)
 298 #define LCGLOOP(var,repeats,period,a,c) for (uint rep = 0; rep < repeats; rep++) {\
 299     (var) = ~(var);\
 300     _Pragma("unroll 1")\
 301     for (uint iter = 0; iter < period; iter++) {\
 302         (var) = ~(var);\
 303         (var) = (a)*(var)+(c);\
 304         (var) ^= 0xFFFFFFF0;\
 305         (var) ^= 0xF;\
 306     }\
 307     (var) = ~(var);\
 308 }
 309 #elif defined (WINDOWS) || defined (WINNV)
 310 #define LCGLOOP(var,repeats,period,a,c) for (uint rep = 0; rep < repeats; rep++) {\
 311     (var) = ~(var);\
 312     __pragma("unroll 1")\
 313     for (uint iter = 0; iter < period; iter++) {\
 314         (var) = ~(var);\
 315         (var) = (a)*(var)+(c);\
 316         (var) ^= 0xFFFFFFF0;\
 317         (var) ^= 0xF;\
 318     }\
 319     (var) = ~(var);\
 320 }
 321 #endif
 322 //}}}
 323
 324 __global__ void deviceShortLCG0(uint* base,uint N,uint repeats,const int period) { //{{{
 325     // Pick a different block for different LCG lengths
 326     // Short periods are useful if LCG goes inside for i in 0..N loop
 327     int a,c;
 328     switch (period) {
 329         case 1024: a = 0x0fbfffff; c = 0x3bf75696; break;
 330         case 512:  a = 0x61c8647f; c = 0x2b3e0000; break;
 331         case 256:  a = 0x7161ac7f; c = 0x43840000; break;
 332         case 128:  a = 0x0432b47f; c = 0x1ce80000; break;
 333         case 2048: a = 0x763fffff; c = 0x4769466f; break;
 334         default:   a = 0; c = 0; break;
 335     }
 336
 337     uint value = 0;
 338     LCGLOOP(value,repeats,period,a,c)
 339
 340     for (uint i = 0 ; i < N; i++) {
 341         *(THREAD_ADDRESS(base,N,i)) = value;
 342     }
 343 } //}}}
 344 // _shmem version uses shared memory to store inter-iteration values
 345 // is more sensitive to shared memory errors from (eg) shader overclocking
 346 __global__ void deviceShortLCG0Shmem(uint* base,uint N,uint repeats,const int period) { //{{{
 347     // Pick a different block for different LCG lengths
 348     // Short periods are useful if LCG goes inside for i in 0..N loop
 349     int a,c;
 350     extern __shared__ uint shmem[];
 351     switch (period) {
 352         case 1024: a = 0x0fbfffff; c = 0x3bf75696; break;
 353         case 512:  a = 0x61c8647f; c = 0x2b3e0000; break;
 354         case 256:  a = 0x7161ac7f; c = 0x43840000; break;
 355         case 128:  a = 0x0432b47f; c = 0x1ce80000; break;
 356         case 2048: a = 0x763fffff; c = 0x4769466f; break;
 357         default:   a = 0; c = 0; break;
 358     }
 359     shmem[threadIdx.x] = 0;
 360     LCGLOOP(shmem[threadIdx.x],repeats,period,a,c)
 361
 362     for (uint i = 0 ; i < N; i++) {
 363         *(THREAD_ADDRESS(base,N,i)) = shmem[threadIdx.x];
 364
 365     }
 366 } //}}} //}}}
 367
 368
 369 // Memtest86 Test 2: tseq=0,4
 370 __host__ uint gpuMovingInversionsOnesZeros(const uint nBlocks,const uint nThreads,uint* base,uint N,uint* blockErrorCounts,uint* errorCounts) { //{{{
 371
 372     uint errorCount;
 373     gpuWriteConstant(nBlocks,nThreads,base,N,0xFFFFFFFF);
 374     CHECK_LAUNCH_ERROR();
 375     SOFTWAIT();
 376         CHECK_LAUNCH_ERROR();
 377
 378         errorCount = gpuVerifyConstant(nBlocks,nThreads,base,N,0xFFFFFFFF,blockErrorCounts,errorCounts);
 379         CHECK_LAUNCH_ERROR();
 380
 381         gpuWriteConstant(nBlocks,nThreads,base,N,0x0);
 382     CHECK_LAUNCH_ERROR();
 383     SOFTWAIT();
 384         CHECK_LAUNCH_ERROR();
 385
 386         errorCount += gpuVerifyConstant(nBlocks,nThreads,base,N,0x0,blockErrorCounts,errorCounts);
 387         CHECK_LAUNCH_ERROR();
 388     return errorCount;
 389 } //}}}
 390
 391 // Memtest86 Test 3: tseq=1
 392 __host__ uint gpuWalking8BitM86(const uint nBlocks,const uint nThreads,uint* base,uint N,uint shift,uint* blockErrorCounts,uint* errorCounts) { //{{{
 393     // Performs the Memtest86 variation on the walking 8-bit pattern, where the same shifted pattern is
 394     // written into each 32-bit word in memory, verified, and its complement written and verified
 395     shift &= 0x7;
 396     uint pattern = 1 << shift;
 397     pattern = pattern | (pattern << 8) | (pattern << 16) | (pattern << 24);
 398
 399     uint errorCount;
 400     gpuWriteConstant(nBlocks,nThreads,base,N,pattern);
 401     CHECK_LAUNCH_ERROR();
 402     SOFTWAIT();
 403         CHECK_LAUNCH_ERROR();
 404
 405         errorCount = gpuVerifyConstant(nBlocks,nThreads,base,N,pattern,blockErrorCounts,errorCounts);
 406         CHECK_LAUNCH_ERROR();
 407
 408         pattern = ~pattern;
 409     gpuWriteConstant(nBlocks,nThreads,base,N,pattern);
 410     CHECK_LAUNCH_ERROR();
 411     SOFTWAIT();
 412         CHECK_LAUNCH_ERROR();
 413
 414         errorCount += gpuVerifyConstant(nBlocks,nThreads,base,N,pattern,blockErrorCounts,errorCounts);
 415         CHECK_LAUNCH_ERROR();
 416     return errorCount;
 417 } //}}}
 418 __host__ uint cpuWalking8BitM86(const uint nBlocks,const uint nThreads,uint* base,uint N,uint shift) { //{{{
 419     // Performs the Memtest86 variation on the walking 8-bit pattern, where the same shifted pattern is
 420     // written into each 32-bit word in memory, verified, and its complement written and verified
 421     shift &= 0x7;
 422     uint pattern = 1 << shift;
 423     pattern = pattern | (pattern << 8) | (pattern << 16) | (pattern << 24);
 424
 425     uint errorCount;
 426     cpuWriteConstant(nBlocks,nThreads,base,N,pattern);
 427     errorCount = cpuVerifyConstant(nBlocks,nThreads,base,N,pattern);
 428
 429     pattern = ~pattern;
 430     cpuWriteConstant(nBlocks,nThreads,base,N,pattern);
 431     errorCount += cpuVerifyConstant(nBlocks,nThreads,base,N,pattern);
 432
 433     return errorCount;
 434 } //}}}
 435 __host__ uint gpuWalking8Bit(const uint nBlocks,const uint nThreads,uint* base,uint N,bool ones,uint shift,uint* blockErrorCount,uint* errorCounts) { //{{{
 436     // Implements one iteration of true walking 8-bit ones/zeros test
 437     uint patterns[2]={0x0,0x0};
 438
 439     // Build the walking-ones paired pattern of 8-bits with the given shift
 440     shift &= 0x7;
 441     uint bits = 0x1 << shift;
 442     for (uint i = 0; i < 4; i++) {
 443         patterns[0] = (patterns[0] << 8) | bits;
 444         bits = (bits == 0x80) ? 0x01 : bits<<1;
 445     }
 446     for (uint i = 0; i < 4; i++) {
 447         patterns[1] = (patterns[1] << 8) | bits;
 448         bits = (bits == 0x80) ? 0x01 : bits<<1;
 449     }
 450
 451     if (!ones) {
 452         patterns[0] = ~patterns[0];
 453         patterns[1] = ~patterns[1];
 454     }
 455
 456         //printf("Host Patterns: %08x %08x\n",patterns[0],patterns[1]);
 457     deviceWritePairedConstants<<<nBlocks,nThreads>>>(base,N,patterns[0],patterns[1]);
 458     CHECK_LAUNCH_ERROR();
 459     SOFTWAIT();
 460         CHECK_LAUNCH_ERROR();
 461         //cudaMemcpy(errorCounts,base,sizeof(uint)*nBlocks,cudaMemcpyDeviceToHost);
 462     //printf("First few words in tested RAM: %08x %08x %08x %08x %08x %08x\n",errorCounts[0],errorCounts[1],errorCounts[2],errorCounts[3],errorCounts[4],errorCounts[5]);
 463     // Given device arrays base (tested memory) and blockErrorCount (nBlocks uints in length of temp space)
 464     deviceVerifyPairedConstants<<<nBlocks,nThreads,sizeof(uint)*nThreads>>>(base,N,patterns[0],patterns[1],blockErrorCount);
 465     CHECK_LAUNCH_ERROR();
 466     SOFTWAIT();
 467         CHECK_LAUNCH_ERROR();
 468     //if (cudaGetLastError() != cudaSuccess) {
 469         //      return 0xFFFFFFFF; // -1
 470         //}
 471         //uint errorCounts[nBlocks];
 472     cudaMemcpy(errorCounts,blockErrorCount,sizeof(uint)*nBlocks,cudaMemcpyDeviceToHost);
 473
 474     // Sum-reduce block error counts on the host - it's only order of 1k numbers.
 475     uint totalErrors = 0;
 476     for (uint i = 0; i < nBlocks; i++) {
 477         totalErrors += errorCounts[i];
 478     }
 479     return totalErrors;
 480 }
 481
 482 __global__ void deviceWritePairedConstants(uint* base,uint N,uint pattern0,uint pattern1) {
 483     // Writes paired constants to memory, such that each offset that is X mod 2 receives patterns[X]
 484     // Used for true walking-ones/zeros 8-bit test
 485     //if (threadIdx.x == 0)
 486     //    printf("Device Patterns Block %u: %08x %08x\n",blockIdx.x,patterns[0],patterns[1]);
 487     const uint pattern = (threadIdx.x & 0x1) ? pattern1 : pattern0;
 488     //const uint pattern = patterns[threadIdx.x & 0x1];
 489     for (uint i = 0 ; i < N; i++) {
 490         *(THREAD_ADDRESS(base,N,i)) = pattern;
 491         //*(base+blockIdx.x*N*blockDim.x + i*blockDim.x + threadIdx.x) = 0;
 492     }
 493
 494 }
 495
 496 __global__ void deviceVerifyPairedConstants(uint* base,uint N,uint pattern0,uint pattern1,uint* blockErrorCount) {
 497     // Verifies memory at base to make sure it has a correct paired-constant pattern
 498     // Sums number of errors found in block and stores error count into blockErrorCount[blockIdx.x]
 499     // Sum-reduce this array afterwards to get total error count over tested region
 500     // Uses 4*blockDim.x bytes of shared memory
 501
 502     extern __shared__ uint threadErrorCount[];
 503     threadErrorCount[threadIdx.x] = 0;
 504     //const uint pattern = patterns[threadIdx.x & 0x1];
 505     const uint pattern = (threadIdx.x & 0x1) ? pattern1 : pattern0;
 506
 507     for (uint i = 0; i < N; i++) {
 508         //if ( *(THREAD_ADDRESS(base,N,i)) != pattern ) threadErrorCount[threadIdx.x]++;
 509         threadErrorCount[threadIdx.x] += BITSDIFF(*(THREAD_ADDRESS(base,N,i)),pattern);
 510     }
 511     // Parallel-reduce error counts over threads in block
 512     for (uint stride = blockDim.x>>1; stride > 0; stride >>= 1) {
 513         __syncthreads();
 514         if (threadIdx.x < stride)
 515             threadErrorCount[threadIdx.x] += threadErrorCount[threadIdx.x + stride];
 516     }
 517     __syncthreads();
 518
 519     if (threadIdx.x == 0)
 520         blockErrorCount[blockIdx.x] = threadErrorCount[0];
 521
 522     return;
 523 }
 524 //}}}
 525
 526 // Memtest86 Test 4: tseq=10
 527 __host__ uint gpuMovingInversionsRandom(const uint nBlocks,const uint nThreads,uint* base,uint N,uint* blockErrorCounts,uint* errorCounts) { //{{{
 528
 529     uint errorCount;
 530
 531     uint pattern = (uint)rand();
 532     gpuWriteConstant(nBlocks,nThreads,base,N,pattern);
 533     CHECK_LAUNCH_ERROR();
 534     SOFTWAIT();
 535         CHECK_LAUNCH_ERROR();
 536
 537         errorCount = gpuVerifyConstant(nBlocks,nThreads,base,N,pattern,blockErrorCounts,errorCounts);
 538         CHECK_LAUNCH_ERROR();
 539
 540         pattern = ~pattern;
 541     gpuWriteConstant(nBlocks,nThreads,base,N,pattern);
 542     CHECK_LAUNCH_ERROR();
 543     SOFTWAIT();
 544         CHECK_LAUNCH_ERROR();
 545
 546         errorCount += gpuVerifyConstant(nBlocks,nThreads,base,N,pattern,blockErrorCounts,errorCounts);
 547         CHECK_LAUNCH_ERROR();
 548     return errorCount;
 549 } //}}}
 550
 551 // Memtest86 Test 6: tseq=2
 552 __host__ uint gpuWalking32Bit(const uint nBlocks,const uint nThreads,uint* base,uint N,bool ones,uint shift,uint* blockErrorCount,uint* errorCounts) { //{{{
 553     // Given device arrays base (tested memory) and blockErrorCount (nBlocks uints in length of temp space)
 554     // Does one iteration of the walking-{ones/zeros} 32-bit test paralleling Memtest
 555     // With the starting pattern 1<<shift
 556     // NUMBER OF THREADS SHOULD BE A MULTIPLE OF 32
 557
 558     deviceWriteWalking32Bit<<<nBlocks,nThreads>>>(base,N,ones,shift);
 559     CHECK_LAUNCH_ERROR();
 560     SOFTWAIT();
 561         CHECK_LAUNCH_ERROR();
 562
 563         deviceVerifyWalking32Bit<<<nBlocks,nThreads,sizeof(uint)*nThreads>>>(base,N,ones,shift,blockErrorCount);
 564     CHECK_LAUNCH_ERROR();
 565     SOFTWAIT();
 566         CHECK_LAUNCH_ERROR();
 567
 568     cudaMemcpy(errorCounts,blockErrorCount,sizeof(uint)*nBlocks,cudaMemcpyDeviceToHost);
 569
 570     // Sum-reduce block error counts on the host - it's only order of 1k numbers.
 571     uint totalErrors = 0;
 572     for (uint i = 0; i < nBlocks; i++) {
 573         totalErrors += errorCounts[i];
 574     }
 575     return totalErrors;
 576
 577 }
 578
 579 __global__ void deviceWriteWalking32Bit(uint* base,uint N,bool ones,uint shift) {
 580     // Writes one iteration of the walking-{ones/zeros} 32-bit pattern to gpu memory
 581
 582     // Want to write in a 1 << (offset from base + shift % 32)
 583     // Since thread indices are aligned with base, this reduces to
 584     // 1 << ((threadIdx.x+shift) & 0x1f)
 585     // With conditional inversion for walking zeros
 586     uint pattern = 1 << ((threadIdx.x + shift) & 0x1f);
 587     pattern = ones ? pattern : ~pattern;
 588
 589     for (uint i = 0; i < N; i++) {
 590         *(THREAD_ADDRESS(base,N,i)) = pattern;
 591     }
 592 }
 593
 594 __global__ void deviceVerifyWalking32Bit(uint* base,uint N,bool ones,uint shift,uint* blockErrorCount) {
 595     // Verifies memory at base to make sure it has a constant pattern
 596     // Sums number of errors found in block and stores error count into blockErrorCount[blockIdx.x]
 597     // Sum-reduce this array afterwards to get total error count over tested region
 598     // Uses 4*blockDim.x bytes of shared memory
 599
 600     extern __shared__ uint threadErrorCount[];
 601     threadErrorCount[threadIdx.x] = 0;
 602
 603     uint pattern = 1 << ((threadIdx.x + shift) & 0x1f);
 604     pattern = ones ? pattern : ~pattern;
 605
 606     for (uint i = 0; i < N; i++) {
 607         //if ( *(THREAD_ADDRESS(base,N,i)) != pattern ) threadErrorCount[threadIdx.x]++;
 608         threadErrorCount[threadIdx.x] += BITSDIFF(*(THREAD_ADDRESS(base,N,i)),pattern);
 609     }
 610     // Parallel-reduce error counts over threads in block
 611     for (uint stride = blockDim.x>>1; stride > 0; stride >>= 1) {
 612         __syncthreads();
 613         if (threadIdx.x < stride)
 614             threadErrorCount[threadIdx.x] += threadErrorCount[threadIdx.x + stride];
 615     }
 616     __syncthreads();
 617
 618     if (threadIdx.x == 0)
 619         blockErrorCount[blockIdx.x] = threadErrorCount[0];
 620
 621     return;
 622 }
 623 //}}}
 624
 625 // Memtest86 Test 7: tseq=9
 626 __host__ uint gpuRandomBlocks(const uint nBlocks,const uint nThreads,uint* base,uint N,uint seed,uint* blockErrorCount,uint* errorCounts) { //{{{ {{{
 627     // Writes random numbers into memory and verifies pattern
 628     //uint errorCounts[nBlocks];
 629
 630     deviceWriteRandomBlocks<<<nBlocks,nThreads,4*nThreads>>>(base,N,seed);
 631     CHECK_LAUNCH_ERROR();
 632     SOFTWAIT();
 633         CHECK_LAUNCH_ERROR();
 634
 635         //cudaMemcpy(errorCounts,base,sizeof(uint)*nBlocks,cudaMemcpyDeviceToHost);
 636     //printf("First few words in tested RAM: %08x %08x %08x %08x %08x %08x\n",errorCounts[0],errorCounts[1],errorCounts[2],errorCounts[3],errorCounts[4],errorCounts[5]);
 637
 638         deviceVerifyRandomBlocks<<<nBlocks,nThreads,12*nThreads>>>(base,N,seed,blockErrorCount);
 639     CHECK_LAUNCH_ERROR();
 640     SOFTWAIT();
 641         CHECK_LAUNCH_ERROR();
 642
 643
 644     cudaMemcpy(errorCounts,blockErrorCount,sizeof(uint)*nBlocks,cudaMemcpyDeviceToHost);
 645
 646     // Sum-reduce block error counts on the host - it's only order of 1k numbers.
 647     uint totalErrors = 0;
 648     for (uint i = 0; i < nBlocks; i++) {
 649         totalErrors += errorCounts[i];
 650     }
 651     return totalErrors;
 652 }
 653 //}}}
 654 //
 655 // Math functions modulo the Mersenne prime 2^31 -1 {{{
 656 __device__ void deviceMul3131 (uint v1, uint v2,uint& LO, uint& HI)
 657 {
 658     // Given v1, v2 < 2^31
 659     // Emulate a 31-bit integer multiply by doing instead a 32-bit multiply into LO and HI
 660     // And shifting bits around to make it look right.
 661     LO = v1*v2;
 662     HI = __umulhi(v1,v2);
 663     HI <<= 1;
 664     HI |= (LO & 0x80000000) >> 31;
 665     LO &= 0x7FFFFFFF;
 666
 667 }
 668
 669 __device__ uint deviceModMP31(uint LO,uint HI) {
 670     // Modulo a 62-bit number HI<<31 + LO, mod 2^31-1
 671     // Encyclopedia of Cryptography and Security By Henk C. A. van Tilborg
 672     // page 381, Mersenne Primes
 673     uint sum = LO+HI;
 674     if (sum >= 0x80000000) {
 675         // If a+b > 2^31, then high bit will be set
 676         return sum - 0x80000000 + 1;
 677     } else {
 678         return sum;
 679     }
 680 }
 681 __device__ uint deviceMulMP31(uint a,uint b) {
 682     // Multiplies a pair of 31-bit integers a and b mod the Mersenne prime 2^31-1
 683     // Takes result through a 62-bit intermediate
 684     uint LO,HI;
 685     deviceMul3131(a,b,LO,HI);
 686     return deviceModMP31(LO,HI);
 687 }
 688
 689 __device__ uint deviceExpoModMP31(uint base,uint exponent) {
 690     uint result = 1;
 691     while (exponent > 0) {
 692         if (exponent & 1) {
 693             result = deviceMulMP31(result,base);
 694         }
 695         exponent >>= 1;
 696         base = deviceMulMP31(base,base);
 697     }
 698     return result;
 699 }
 700 //}}}
 701 // deviceRan0p: Parallelized closed-form version of NR's ran0  {{{
 702 __device__ uint deviceRan0p(int seed,int n) { //
 703     uint an = deviceExpoModMP31(16807,n+1);
 704     return deviceMulMP31(an,seed);
 705 }
 706 //}}}
 707 // deviceIrbit2: random bit generation, from NR {{{
 708 __device__ int deviceIrbit2(uint& seed) {
 709     const uint IB1  = 1;
 710     const uint IB2  = 2;
 711     const uint IB5  = 16;
 712     const uint IB18 = 131072;
 713     const uint MASK = IB1+IB2+IB5;
 714     if (seed & IB18) {
 715         seed = ((seed ^ MASK) << 1) | IB1;
 716         return 1;
 717     } else {
 718         seed <<= 1;
 719         return 0;
 720     }
 721 }
 722 //}}}
 723 __global__ void deviceWriteRandomBlocks(uint* base,uint N,int seed) { //{{{
 724     // Requires 4*nThreads bytes of shared memory
 725     extern __shared__ uint randomBlock[];
 726
 727     // Make sure seed is not zero.
 728     if (seed == 0) seed = 123459876+blockIdx.x;
 729     uint bitSeed = deviceRan0p(seed + threadIdx.x,threadIdx.x);
 730
 731     for (uint i=0; i < N; i++) {
 732         // Generate a block of random numbers in parallel using closed-form expression for ran0
 733         // OR in a random bit because Ran0 will never have the high bit set
 734         randomBlock[threadIdx.x] = deviceRan0p(seed,threadIdx.x) | (deviceIrbit2(bitSeed) << 31);
 735         __syncthreads();
 736
 737         // Set the seed for the next round to the last number calculated in this round
 738         seed = randomBlock[blockDim.x-1];
 739
 740         // Blit shmem block out to global memory
 741         *(THREAD_ADDRESS(base,N,i)) = randomBlock[threadIdx.x];
 742     }
 743 }
 744 //}}}
 745 __global__ void deviceVerifyRandomBlocks(uint* base,uint N,int seed,uint* blockErrorCount) { //{{{
 746     // Verifies memory at base to make sure it has a correct random pattern given the seed
 747     // Sums number of errors found in block and stores error count into blockErrorCount[blockIdx.x]
 748     // Sum-reduce this array afterwards to get total error count over tested region
 749     // Uses 12*blockDim.x bytes of shared memory
 750
 751     extern __shared__ uint shmem[];
 752     uint* threadErrorCount = shmem;
 753     uint* randomBlock = shmem + blockDim.x;
 754     // Put these into shmem to cut register count
 755     uint* bitSeeds = randomBlock + blockDim.x;
 756
 757     threadErrorCount[threadIdx.x] = 0;
 758
 759     // Make sure seed is not zero.
 760     if (seed == 0) seed = 123459876+blockIdx.x;
 761     //uint bitSeed = deviceRan0p(seed + threadIdx.x,threadIdx.x);
 762     bitSeeds[threadIdx.x] = deviceRan0p(seed + threadIdx.x,threadIdx.x);
 763
 764     for (uint i = 0; i < N; i++) {
 765         // Generate a block of random numbers in parallel using closed-form expression for ran0
 766         // OR in a random bit because Ran0 will never have the high bit set
 767         //randomBlock[threadIdx.x] = deviceRan0p(seed,threadIdx.x) | (deviceIrbit2(bitSeed) << 31);
 768         randomBlock[threadIdx.x] = deviceRan0p(seed,threadIdx.x) | (deviceIrbit2(bitSeeds[threadIdx.x]) << 31);
 769         __syncthreads();
 770
 771         // Set the seed for the next round to the last number calculated in this round
 772         seed = randomBlock[blockDim.x-1];
 773
 774         //if ( randomBlock[threadIdx.x] != *(THREAD_ADDRESS(base,N,i))) threadErrorCount[threadIdx.x]++;
 775         threadErrorCount[threadIdx.x] += BITSDIFF(*(THREAD_ADDRESS(base,N,i)),randomBlock[threadIdx.x]);
 776
 777     }
 778
 779     // Parallel-reduce error counts over threads in block
 780     for (uint stride = blockDim.x>>1; stride > 0; stride >>= 1) {
 781         __syncthreads();
 782         if (threadIdx.x < stride)
 783             threadErrorCount[threadIdx.x] += threadErrorCount[threadIdx.x + stride];
 784     }
 785     __syncthreads();
 786
 787     if (threadIdx.x == 0)
 788         blockErrorCount[blockIdx.x] = threadErrorCount[0];
 789
 790     return;
 791 }
 792 //}}}
 793 //}}}
 794
 795 // Memtest86 Test 8: tseq=3 (M86 uses modulus = 20)
 796 __host__ uint gpuModuloX(const uint nBlocks,const uint nThreads,uint* base,const uint N,uint shift,uint pattern1,const uint modulus,const uint iters,
 797                                                  uint* blockErrorCount,uint* errorCounts) { //{{{
 798     // Given device arrays base (tested memory) and blockErrorCount (nBlocks uints in length of temp space)
 799     // Given a shift, modulus, pattern to test and number of overwrite iterations
 800     // Performs Modulo-X test on memory
 801
 802     //uint errorCounts[nBlocks];
 803     uint totalErrors = 0;
 804     shift %= modulus;
 805
 806     // Test both the given pattern and its inverse
 807     for (uint i = 0; i < 2; i++, pattern1 = ~pattern1) {
 808         deviceWritePairedModulo<<<nBlocks,nThreads>>>(base,N,shift,pattern1,~pattern1,modulus,iters);
 809             CHECK_LAUNCH_ERROR();
 810         SOFTWAIT();
 811             CHECK_LAUNCH_ERROR();
 812
 813                 deviceVerifyPairedModulo<<<nBlocks,nThreads,sizeof(uint)*nThreads>>>(base,N,shift,pattern1,modulus,blockErrorCount);
 814                 CHECK_LAUNCH_ERROR();
 815         SOFTWAIT();
 816             CHECK_LAUNCH_ERROR();
 817
 818         cudaMemcpy(errorCounts,blockErrorCount,sizeof(uint)*nBlocks,cudaMemcpyDeviceToHost);
 819
 820         // Sum-reduce block error counts on the host - it's only order of 1k numbers.
 821         for (uint i = 0; i < nBlocks; i++) {
 822             totalErrors += errorCounts[i];
 823         }
 824     }
 825     return totalErrors;
 826 }
 827
 828 __global__ void deviceWritePairedModulo(uint* base,const uint N,const uint shift,const uint pattern1,const uint pattern2,const uint modulus,const uint iters) {
 829     // First writes pattern1 into every offset that is 0 mod modulus
 830     // Next  (iters times) writes ~pattern1 into every other address
 831     uint offset;
 832     for (uint i = 0 ; i < N; i++) {
 833         offset = THREAD_OFFSET(N,i);
 834         if ((offset % modulus) == shift) *(base+offset) = pattern1;
 835     }
 836     __syncthreads();
 837     for (uint j = 0; j < iters; j++) {
 838         for (uint i = 0 ; i < N; i++) {
 839             offset = THREAD_OFFSET(N,i);
 840             if ((offset % modulus) != shift) *(base+offset) = pattern2;
 841         }
 842     }
 843 }
 844 __global__ void deviceVerifyPairedModulo(uint* base,uint N,const uint shift,const uint pattern1,const uint modulus,uint* blockErrorCount) {
 845     // Verifies that memory at each (offset mod modulus == shift) stores pattern1
 846     // Sums number of errors found in block and stores error count into blockErrorCount[blockIdx.x]
 847     // Sum-reduce this array afterwards to get total error count over tested region
 848     // Uses 4*blockDim.x bytes of shared memory
 849
 850     extern __shared__ uint threadErrorCount[];
 851     threadErrorCount[threadIdx.x] = 0;
 852     uint offset;
 853
 854     for (uint i = 0; i < N; i++) {
 855         offset = THREAD_OFFSET(N,i);
 856         //if (((offset % modulus) == shift) && (*(base+offset) != pattern1)) threadErrorCount[threadIdx.x]++;
 857         if ((offset % modulus) == shift) threadErrorCount[threadIdx.x] += BITSDIFF(*(base+offset),pattern1);
 858     }
 859     // Parallel-reduce error counts over threads in block
 860     for (uint stride = blockDim.x>>1; stride > 0; stride >>= 1) {
 861         __syncthreads();
 862         if (threadIdx.x < stride)
 863             threadErrorCount[threadIdx.x] += threadErrorCount[threadIdx.x + stride];
 864     }
 865     __syncthreads();
 866
 867     if (threadIdx.x == 0)
 868         blockErrorCount[blockIdx.x] = threadErrorCount[0];
 869
 870     return;
 871 }
 872 //}}}