/*
 * MemtestG80 core memory test functions and OOP interface to tester.
 *
 * Author: Imran Haque, 2009
 * Copyright 2009, Stanford University
 *
 * This file is licensed under the terms of the LGPL. Please see
 * the COPYING file in the accompanying source distribution for
 * full license terms.
 *
 * CUDA grid layout: linear in blocks and threads.
 * Intended usage: 1024 blocks, 512 threads/block, with N words (iterations) per thread
 *   -> 2*N MiB tested per grid
 * Address tested by thread at iteration i:
 *   base + blockIdx.x*N*blockDim.x + i*blockDim.x + threadIdx.x
 */
// Naming convention: gpuXXX and cpuXXX functions are user-accessible; deviceXXX functions are internal.
// gpuXXX functions execute a particular test on a block of GPU memory;
// cpuXXX functions execute the same test on a block of CPU memory.
#define THREAD_ADDRESS(base,N,i) (base + blockIdx.x * N * blockDim.x + i * blockDim.x + threadIdx.x)
#define THREAD_OFFSET(N,i) (blockIdx.x * N * blockDim.x + i * blockDim.x + threadIdx.x)
#define BITSDIFF(x,y) __popc((x) ^ (y))
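// Example: with the intended 1024-block x 512-thread launch and N = 8 words
// per thread, THREAD_ADDRESS(base,8,3) evaluates for thread 5 of block 2 to
//   base + 2*8*512 + 3*512 + 5 = base + 9733,
// and the full grid covers 1024*512*8 words * 4 bytes = 16 MiB (= 2*N MiB).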
#include "memtestG80_core.h"
void memtestState::deallocate() {
    if (!allocated) return;
    cudaFree(devTestMem);
    cudaFree(devTempMem);
    free(hostTempMem);
    devTestMem = NULL;
    devTempMem = NULL;
    hostTempMem = NULL;
    allocated = false;
}
uint memtestState::allocate(uint mbToTest) {
    deallocate();
    initTime = getTimeMilliseconds();

    // Round up to nearest 2 MiB
    if (mbToTest % 2) mbToTest++;

    megsToTest = mbToTest;
    loopIters = megsToTest/2;

    if (megsToTest == 0) return 0;

    try {
        if (cudaMalloc((void**)&devTestMem,megsToTest*1048576UL) != cudaSuccess) throw 1;
        if (cudaMalloc((void**)&devTempMem,sizeof(uint)*nBlocks) != cudaSuccess) throw 2;
        if ( (hostTempMem = (uint*)malloc(sizeof(uint)*nBlocks)) == NULL) throw 3;
    } catch (int) {
        // An allocation failed: release whatever did succeed and
        // clear CUDA error flag for outside world
        deallocate();
        cudaGetLastError();
        return 0;
    }
    allocated = true;
    return megsToTest;
}
bool memtestState::gpuMemoryBandwidth(double& bandwidth,uint mbToTest,uint iters) {
    if (!allocated || megsToTest < 2*mbToTest) return false;
    bandwidth = ::gpuMemoryBandwidth(devTestMem,devTestMem+mbToTest*1048576/4,mbToTest,iters);
    return cudaGetLastError() == cudaSuccess;
}
bool memtestState::gpuWriteConstant(const uint constant) const {
    if (!allocated) return false;
    ::gpuWriteConstant(nBlocks,nThreads,devTestMem,loopIters,constant);
    return cudaGetLastError() == cudaSuccess;
}
bool memtestState::gpuVerifyConstant(uint& errorCount,const uint constant) const {
    if (!allocated) return false;
    errorCount = ::gpuVerifyConstant(nBlocks,nThreads,devTestMem,loopIters,constant,devTempMem,hostTempMem);
    return ((cudaGetLastError() == cudaSuccess) && (errorCount != 0xFFFFFFFF) && (errorCount != 0xFFFFFFFE));
}
bool memtestState::gpuShortLCG0(uint& errorCount,const uint repeats) const {
    if (!allocated) return false;
    errorCount = ::gpuShortLCG0(nBlocks,nThreads,devTestMem,loopIters,repeats,lcgPeriod,devTempMem,hostTempMem);
    return ((cudaGetLastError() == cudaSuccess) && (errorCount != 0xFFFFFFFF) && (errorCount != 0xFFFFFFFE));
}
bool memtestState::gpuShortLCG0Shmem(uint& errorCount,const uint repeats) const {
    if (!allocated) return false;
    errorCount = ::gpuShortLCG0Shmem(nBlocks,nThreads,devTestMem,loopIters,repeats,lcgPeriod,devTempMem,hostTempMem);
    return ((cudaGetLastError() == cudaSuccess) && (errorCount != 0xFFFFFFFF) && (errorCount != 0xFFFFFFFE));
}
bool memtestState::gpuMovingInversionsOnesZeros(uint& errorCount) const {
    if (!allocated) return false;
    errorCount = ::gpuMovingInversionsOnesZeros(nBlocks,nThreads,devTestMem,loopIters,devTempMem,hostTempMem);
    return ((cudaGetLastError() == cudaSuccess) && (errorCount != 0xFFFFFFFF) && (errorCount != 0xFFFFFFFE));
}
bool memtestState::gpuWalking8BitM86(uint& errorCount,const uint shift) const {
    if (!allocated) return false;
    errorCount = ::gpuWalking8BitM86(nBlocks,nThreads,devTestMem,loopIters,shift,devTempMem,hostTempMem);
    return ((cudaGetLastError() == cudaSuccess) && (errorCount != 0xFFFFFFFF) && (errorCount != 0xFFFFFFFE));
}
bool memtestState::gpuWalking8Bit(uint& errorCount,const bool ones,const uint shift) const {
    if (!allocated) return false;
    errorCount = ::gpuWalking8Bit(nBlocks,nThreads,devTestMem,loopIters,ones,shift,devTempMem,hostTempMem);
    return ((cudaGetLastError() == cudaSuccess) && (errorCount != 0xFFFFFFFF) && (errorCount != 0xFFFFFFFE));
}
bool memtestState::gpuMovingInversionsRandom(uint& errorCount) const {
    if (!allocated) return false;
    errorCount = ::gpuMovingInversionsRandom(nBlocks,nThreads,devTestMem,loopIters,devTempMem,hostTempMem);
    return ((cudaGetLastError() == cudaSuccess) && (errorCount != 0xFFFFFFFF) && (errorCount != 0xFFFFFFFE));
}
bool memtestState::gpuWalking32Bit(uint& errorCount,const bool ones,const uint shift) const {
    if (!allocated) return false;
    errorCount = ::gpuWalking32Bit(nBlocks,nThreads,devTestMem,loopIters,ones,shift,devTempMem,hostTempMem);
    return ((cudaGetLastError() == cudaSuccess) && (errorCount != 0xFFFFFFFF) && (errorCount != 0xFFFFFFFE));
}
bool memtestState::gpuRandomBlocks(uint& errorCount,const uint seed) const {
    if (!allocated) return false;
    errorCount = ::gpuRandomBlocks(nBlocks,nThreads,devTestMem,loopIters,seed,devTempMem,hostTempMem);
    return ((cudaGetLastError() == cudaSuccess) && (errorCount != 0xFFFFFFFF) && (errorCount != 0xFFFFFFFE));
}
bool memtestState::gpuModuloX(uint& errorCount,const uint shift,const uint pattern,const uint modulus,const uint overwriteIters) const {
    if (!allocated) return false;
    errorCount = ::gpuModuloX(nBlocks,nThreads,devTestMem,loopIters,shift,pattern,modulus,overwriteIters,devTempMem,hostTempMem);
    return ((cudaGetLastError() == cudaSuccess) && (errorCount != 0xFFFFFFFF) && (errorCount != 0xFFFFFFFE));
}
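// Minimal usage sketch of the OOP interface (illustrative only; a real caller
// should also select a CUDA device and check every return value):
//
//   memtestState tester;
//   if (tester.allocate(128)) {                        // reserve 128 MiB
//       uint errors;
//       if (tester.gpuMovingInversionsOnesZeros(errors))
//           printf("moving inversions: %u bit errors\n", errors);
//       tester.deallocate();
//   }
//
// Error counts 0xFFFFFFFF/0xFFFFFFFE are reserved as failure sentinels, which
// is why the wrappers above reject them.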
// Forward declarations of device test kernels
__global__ void deviceWriteConstant(uint* base, uint N, const uint constant);
__global__ void deviceVerifyConstant(uint* base,uint N,const uint constant,uint* blockErrorCount);
__global__ void deviceShortLCG0(uint* base,uint N,uint repeats,const int period);
__global__ void deviceShortLCG0Shmem(uint* base,uint N,uint repeats,const int period);
__global__ void deviceWriteRandomBlocks(uint* base,uint N,int seed);
__global__ void deviceVerifyRandomBlocks(uint* base,uint N,int seed,uint* blockErrorCount);
__global__ void deviceWriteWalking32Bit(uint* base,uint N,bool ones,uint shift);
__global__ void deviceVerifyWalking32Bit(uint* base,uint N,bool ones,uint shift,uint* blockErrorCount);
__global__ void deviceWritePairedConstants(uint* base,uint N,uint pattern0,uint pattern1);
__global__ void deviceVerifyPairedConstants(uint* base,uint N,uint pattern0,uint pattern1,uint* blockErrorCount);
__global__ void deviceWritePairedModulo(uint* base,const uint N,const uint shift,const uint pattern1,const uint pattern2,const uint modulus,const uint iters);
__global__ void deviceVerifyPairedModulo(uint* base,uint N,const uint shift,const uint pattern1,const uint modulus,uint* blockErrorCount);
// Utility function to measure memory bandwidth
__host__ double gpuMemoryBandwidth(uint* src,uint* dst,uint mbToTest,uint iters) {
    uint start = getTimeMilliseconds();
    for (uint i = 0; i < iters; i++) {
        cudaMemcpy(dst,src,mbToTest*1048576,cudaMemcpyDeviceToDevice);
    }
    // D-to-D memory copies are non-blocking, so sync to get correct timing
    cudaThreadSynchronize();
    uint end = getTimeMilliseconds();

    // Calculate bandwidth in MiB/s
    // Multiply by 2 since we are both reading and writing the same memory
    double bw = 2.0*((double)mbToTest*iters)/((end-start)/1000.0);
    return bw;
}
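// Example: copying 128 MiB twenty times in 500 ms yields
//   bw = 2.0*(128*20)/0.5 = 10240 MiB/s,
// where the factor of 2 counts each byte as both read and written.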
// Utility functions to write/verify pure constants in memory, CPU/GPU {{{
__host__ void gpuWriteConstant(const uint nBlocks,const uint nThreads,uint* base,uint N,const uint constant) { //{{{
    deviceWriteConstant<<<nBlocks,nThreads>>>(base,N,constant);
} //}}}

__global__ void deviceWriteConstant(uint* base, uint N, const uint constant) {
    for (uint i = 0; i < N; i++) {
        *(THREAD_ADDRESS(base,N,i)) = constant;
    }
}
__host__ uint gpuVerifyConstant(const uint nBlocks,const uint nThreads,uint* base,uint N,const uint constant,uint* blockErrorCount,uint* errorCounts) { //{{{
    // Given device arrays base (tested memory) and blockErrorCount (nBlocks uints in length of temp space)
    deviceVerifyConstant<<<nBlocks,nThreads,sizeof(uint)*nThreads>>>(base,N,constant,blockErrorCount);
    CHECK_LAUNCH_ERROR();
    cudaThreadSynchronize();
    CHECK_LAUNCH_ERROR();

    cudaMemcpy(errorCounts,blockErrorCount,sizeof(uint)*nBlocks,cudaMemcpyDeviceToHost);

    // Sum-reduce block error counts on the host - it's only on the order of 1k numbers.
    uint totalErrors = 0;
    for (uint i = 0; i < nBlocks; i++) {
        totalErrors += errorCounts[i];
    }
    return totalErrors;
} //}}}
__global__ void deviceVerifyConstant(uint* base,uint N,const uint constant,uint* blockErrorCount) {
    // Verifies memory at base to make sure it holds a constant pattern
    // Sums number of errors found in block and stores error count into blockErrorCount[blockIdx.x]
    // Sum-reduce this array afterwards to get total error count over tested region
    // Uses 4*blockDim.x bytes of shared memory

    extern __shared__ uint threadErrorCount[];
    threadErrorCount[threadIdx.x] = 0;

    for (uint i = 0; i < N; i++) {
        //if ( *(THREAD_ADDRESS(base,N,i)) != constant ) threadErrorCount[threadIdx.x]++;
        threadErrorCount[threadIdx.x] += BITSDIFF(*(THREAD_ADDRESS(base,N,i)),constant);
    }
    // Parallel-reduce error counts over threads in block
    for (uint stride = blockDim.x>>1; stride > 0; stride >>= 1) {
        __syncthreads();
        if (threadIdx.x < stride)
            threadErrorCount[threadIdx.x] += threadErrorCount[threadIdx.x + stride];
    }
    __syncthreads();

    if (threadIdx.x == 0)
        blockErrorCount[blockIdx.x] = threadErrorCount[0];
}
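// Example: if a word written as 0xFFFFFFFF reads back 0xFFFF7FFF, BITSDIFF
// contributes __popc(0x00008000) = 1, so the tally counts individual flipped
// bits rather than whole mismatched words.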
__host__ void cpuWriteConstant(const uint nBlocks,const uint nThreads,uint* base,uint N,const uint constant) { //{{{
    dim3 blockDim(nThreads,0,0);
    dim3 threadIdx(0,0,0);
    dim3 blockIdx(0,0,0);
    for (blockIdx.x = 0; blockIdx.x < nBlocks; blockIdx.x++) {
        for (uint i = 0; i < N; i++) {
            for (threadIdx.x = 0; threadIdx.x < blockDim.x; threadIdx.x++) {
                *(THREAD_ADDRESS(base,N,i)) = constant;
            }
        }
    }
} //}}}
__host__ uint cpuVerifyConstant(const uint nBlocks,const uint nThreads,uint* base,uint N,const uint constant) { //{{{
    dim3 blockDim(nThreads,0,0);
    dim3 threadIdx(0,0,0);
    dim3 blockIdx(0,0,0);

    uint errorCount = 0;
    for (blockIdx.x = 0; blockIdx.x < nBlocks; blockIdx.x++) {
        for (uint i = 0; i < N; i++) {
            for (threadIdx.x = 0; threadIdx.x < blockDim.x; threadIdx.x++) {
                if (*(THREAD_ADDRESS(base,N,i)) != constant) errorCount++;
            }
        }
    }
    return errorCount;
} //}}}
// }}}
// Idea: Run a varying number of iterations (k*N) of a short-period (period = N) LCG that returns to zero (or all F's) quickly.
// Store only the result of the last iteration.
// Compare the output to the desired constant.
// Compare results across varying k - the memory error rate for a given pattern should be constant,
// so any variation between runs must be due to logic errors in the loop count.
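// Illustrative sketch (commented out; assumes 32-bit wraparound arithmetic) of
// the property the constants in deviceShortLCG0 are chosen for - one full
// period of the loop body should return the state to zero:
//
//   bool lcgReturnsToZero(uint a, uint c, uint period) {
//       uint x = 0;
//       for (uint iter = 0; iter < period; iter++) {
//           x = a*x + c;       // LCG step (mod 2^32)
//           x ^= 0xFFFFFFF0;   // paired XOR, as in LCGLOOP below
//           x ^= 0x0000000F;
//       }
//       return x == 0;
//   }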
__host__ uint gpuShortLCG0(const uint nBlocks,const uint nThreads,uint* base,uint N,const uint repeats,const int period,uint* blockErrorCounts,uint* errorCounts) { //{{{
    deviceShortLCG0<<<nBlocks,nThreads>>>(base,N,repeats,period);
    CHECK_LAUNCH_ERROR();
    cudaThreadSynchronize();
    CHECK_LAUNCH_ERROR();
    return gpuVerifyConstant(nBlocks,nThreads,base,N,0,blockErrorCounts,errorCounts);
} //}}}

__host__ uint gpuShortLCG0Shmem(const uint nBlocks,const uint nThreads,uint* base,uint N,const uint repeats,const int period,uint* blockErrorCounts,uint* errorCounts) { //{{{
    deviceShortLCG0Shmem<<<nBlocks,nThreads,sizeof(uint)*nThreads>>>(base,N,repeats,period);
    CHECK_LAUNCH_ERROR();
    cudaThreadSynchronize();
    CHECK_LAUNCH_ERROR();
    return gpuVerifyConstant(nBlocks,nThreads,base,N,0,blockErrorCounts,errorCounts);
} //}}}
// Put the LCG loop into a macro so we don't repeat code between versions of the logic tester.
// The paired XOR adds diversity to the instruction stream, and is not reduced to a NOT
// the way a single XOR is (verified with decuda).
#if defined (LINUX) || defined (OSX)
#define LCGLOOP(var,repeats,period,a,c) for (uint rep = 0; rep < repeats; rep++) {\
    _Pragma("unroll 1")\
    for (uint iter = 0; iter < period; iter++) {\
        (var) = (a)*(var)+(c);\
        (var) ^= 0xFFFFFFF0;\
        (var) ^= 0x0000000F;\
    }\
}
#elif defined (WINDOWS) || defined (WINNV)
#define LCGLOOP(var,repeats,period,a,c) for (uint rep = 0; rep < repeats; rep++) {\
    __pragma("unroll 1")\
    for (uint iter = 0; iter < period; iter++) {\
        (var) = (a)*(var)+(c);\
        (var) ^= 0xFFFFFFF0;\
        (var) ^= 0x0000000F;\
    }\
}
#endif
__global__ void deviceShortLCG0(uint* base,uint N,uint repeats,const int period) { //{{{
    // Pick different LCG constants for different periods;
    // short periods are useful if the LCG goes inside the for-i-in-0..N loop
    uint a,c;
    switch (period) {
        case 1024: a = 0x0fbfffff; c = 0x3bf75696; break;
        case 512:  a = 0x61c8647f; c = 0x2b3e0000; break;
        case 256:  a = 0x7161ac7f; c = 0x43840000; break;
        case 128:  a = 0x0432b47f; c = 0x1ce80000; break;
        case 2048: a = 0x763fffff; c = 0x4769466f; break;
        default:   a = 0;          c = 0;          break;
    }

    uint value = 0;
    LCGLOOP(value,repeats,period,a,c)

    for (uint i = 0 ; i < N; i++) {
        *(THREAD_ADDRESS(base,N,i)) = value;
    }
} //}}}
// The _shmem version uses shared memory to store inter-iteration values, so it
// is more sensitive to shared memory errors from (e.g.) shader overclocking.
__global__ void deviceShortLCG0Shmem(uint* base,uint N,uint repeats,const int period) { //{{{
    // Pick different LCG constants for different periods;
    // short periods are useful if the LCG goes inside the for-i-in-0..N loop
    uint a,c;
    extern __shared__ uint shmem[];
    switch (period) {
        case 1024: a = 0x0fbfffff; c = 0x3bf75696; break;
        case 512:  a = 0x61c8647f; c = 0x2b3e0000; break;
        case 256:  a = 0x7161ac7f; c = 0x43840000; break;
        case 128:  a = 0x0432b47f; c = 0x1ce80000; break;
        case 2048: a = 0x763fffff; c = 0x4769466f; break;
        default:   a = 0;          c = 0;          break;
    }
    shmem[threadIdx.x] = 0;
    LCGLOOP(shmem[threadIdx.x],repeats,period,a,c)

    for (uint i = 0 ; i < N; i++) {
        *(THREAD_ADDRESS(base,N,i)) = shmem[threadIdx.x];
    }
} //}}}
// Memtest86 Test 2: tseq=0,4
__host__ uint gpuMovingInversionsOnesZeros(const uint nBlocks,const uint nThreads,uint* base,uint N,uint* blockErrorCounts,uint* errorCounts) { //{{{
    uint errorCount;

    gpuWriteConstant(nBlocks,nThreads,base,N,0xFFFFFFFF);
    CHECK_LAUNCH_ERROR();
    cudaThreadSynchronize();
    CHECK_LAUNCH_ERROR();

    errorCount = gpuVerifyConstant(nBlocks,nThreads,base,N,0xFFFFFFFF,blockErrorCounts,errorCounts);
    CHECK_LAUNCH_ERROR();

    gpuWriteConstant(nBlocks,nThreads,base,N,0x0);
    CHECK_LAUNCH_ERROR();
    cudaThreadSynchronize();
    CHECK_LAUNCH_ERROR();

    errorCount += gpuVerifyConstant(nBlocks,nThreads,base,N,0x0,blockErrorCounts,errorCounts);
    CHECK_LAUNCH_ERROR();
    return errorCount;
} //}}}
// Memtest86 Test 3: tseq=1
__host__ uint gpuWalking8BitM86(const uint nBlocks,const uint nThreads,uint* base,uint N,uint shift,uint* blockErrorCounts,uint* errorCounts) { //{{{
    // Performs the Memtest86 variation on the walking 8-bit pattern, where the same shifted pattern is
    // written into each 32-bit word in memory, verified, and its complement written and verified
    uint pattern = 1 << shift;
    pattern = pattern | (pattern << 8) | (pattern << 16) | (pattern << 24);
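    // Example: shift = 2 gives pattern = 0x04040404; the complement pass below
    // then writes 0xFBFBFBFB, so every byte sees both a walking one and a
    // walking zero at that bit position.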
    uint errorCount;
    gpuWriteConstant(nBlocks,nThreads,base,N,pattern);
    CHECK_LAUNCH_ERROR();
    cudaThreadSynchronize();
    CHECK_LAUNCH_ERROR();

    errorCount = gpuVerifyConstant(nBlocks,nThreads,base,N,pattern,blockErrorCounts,errorCounts);
    CHECK_LAUNCH_ERROR();

    pattern = ~pattern;
    gpuWriteConstant(nBlocks,nThreads,base,N,pattern);
    CHECK_LAUNCH_ERROR();
    cudaThreadSynchronize();
    CHECK_LAUNCH_ERROR();

    errorCount += gpuVerifyConstant(nBlocks,nThreads,base,N,pattern,blockErrorCounts,errorCounts);
    CHECK_LAUNCH_ERROR();
    return errorCount;
} //}}}
__host__ uint cpuWalking8BitM86(const uint nBlocks,const uint nThreads,uint* base,uint N,uint shift) { //{{{
    // Performs the Memtest86 variation on the walking 8-bit pattern, where the same shifted pattern is
    // written into each 32-bit word in memory, verified, and its complement written and verified
    uint pattern = 1 << shift;
    pattern = pattern | (pattern << 8) | (pattern << 16) | (pattern << 24);

    cpuWriteConstant(nBlocks,nThreads,base,N,pattern);
    uint errorCount = cpuVerifyConstant(nBlocks,nThreads,base,N,pattern);

    pattern = ~pattern;
    cpuWriteConstant(nBlocks,nThreads,base,N,pattern);
    errorCount += cpuVerifyConstant(nBlocks,nThreads,base,N,pattern);

    return errorCount;
} //}}}
__host__ uint gpuWalking8Bit(const uint nBlocks,const uint nThreads,uint* base,uint N,bool ones,uint shift,uint* blockErrorCount,uint* errorCounts) { //{{{
    // Implements one iteration of the true walking 8-bit ones/zeros test
    uint patterns[2] = {0x0,0x0};

    // Build the walking-ones paired 8-bit patterns with the given shift
    uint bits = 0x1 << shift;
    for (uint i = 0; i < 4; i++) {
        patterns[0] = (patterns[0] << 8) | bits;
        bits = (bits == 0x80) ? 0x01 : bits << 1;
    }
    for (uint i = 0; i < 4; i++) {
        patterns[1] = (patterns[1] << 8) | bits;
        bits = (bits == 0x80) ? 0x01 : bits << 1;
    }

    // Invert patterns for the walking-zeros variant
    if (!ones) {
        patterns[0] = ~patterns[0];
        patterns[1] = ~patterns[1];
    }
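    // Example: shift = 0 builds patterns[0] = 0x01020408 and patterns[1] = 0x10204080:
    // across the eight bytes of an even/odd word pair, each of the eight bit
    // positions carries the walking one exactly once (complemented for walking zeros).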
    deviceWritePairedConstants<<<nBlocks,nThreads>>>(base,N,patterns[0],patterns[1]);
    CHECK_LAUNCH_ERROR();
    cudaThreadSynchronize();
    CHECK_LAUNCH_ERROR();

    // Given device arrays base (tested memory) and blockErrorCount (nBlocks uints in length of temp space)
    deviceVerifyPairedConstants<<<nBlocks,nThreads,sizeof(uint)*nThreads>>>(base,N,patterns[0],patterns[1],blockErrorCount);
    CHECK_LAUNCH_ERROR();
    cudaThreadSynchronize();
    CHECK_LAUNCH_ERROR();

    cudaMemcpy(errorCounts,blockErrorCount,sizeof(uint)*nBlocks,cudaMemcpyDeviceToHost);

    // Sum-reduce block error counts on the host - it's only on the order of 1k numbers.
    uint totalErrors = 0;
    for (uint i = 0; i < nBlocks; i++) {
        totalErrors += errorCounts[i];
    }
    return totalErrors;
} //}}}
__global__ void deviceWritePairedConstants(uint* base,uint N,uint pattern0,uint pattern1) {
    // Writes paired constants to memory, such that each offset that is X mod 2 receives patternX
    // Used for the true walking-ones/zeros 8-bit test
    const uint pattern = (threadIdx.x & 0x1) ? pattern1 : pattern0;

    for (uint i = 0 ; i < N; i++) {
        *(THREAD_ADDRESS(base,N,i)) = pattern;
    }
}
__global__ void deviceVerifyPairedConstants(uint* base,uint N,uint pattern0,uint pattern1,uint* blockErrorCount) {
    // Verifies memory at base to make sure it holds the correct paired-constant pattern
    // Sums number of errors found in block and stores error count into blockErrorCount[blockIdx.x]
    // Sum-reduce this array afterwards to get total error count over tested region
    // Uses 4*blockDim.x bytes of shared memory

    extern __shared__ uint threadErrorCount[];
    threadErrorCount[threadIdx.x] = 0;
    const uint pattern = (threadIdx.x & 0x1) ? pattern1 : pattern0;

    for (uint i = 0; i < N; i++) {
        //if ( *(THREAD_ADDRESS(base,N,i)) != pattern ) threadErrorCount[threadIdx.x]++;
        threadErrorCount[threadIdx.x] += BITSDIFF(*(THREAD_ADDRESS(base,N,i)),pattern);
    }
    // Parallel-reduce error counts over threads in block
    for (uint stride = blockDim.x>>1; stride > 0; stride >>= 1) {
        __syncthreads();
        if (threadIdx.x < stride)
            threadErrorCount[threadIdx.x] += threadErrorCount[threadIdx.x + stride];
    }
    __syncthreads();

    if (threadIdx.x == 0)
        blockErrorCount[blockIdx.x] = threadErrorCount[0];
}
// Memtest86 Test 4: tseq=10
__host__ uint gpuMovingInversionsRandom(const uint nBlocks,const uint nThreads,uint* base,uint N,uint* blockErrorCounts,uint* errorCounts) { //{{{
    uint errorCount;

    uint pattern = (uint)rand();
    gpuWriteConstant(nBlocks,nThreads,base,N,pattern);
    CHECK_LAUNCH_ERROR();
    cudaThreadSynchronize();
    CHECK_LAUNCH_ERROR();

    errorCount = gpuVerifyConstant(nBlocks,nThreads,base,N,pattern,blockErrorCounts,errorCounts);
    CHECK_LAUNCH_ERROR();

    pattern = ~pattern;
    gpuWriteConstant(nBlocks,nThreads,base,N,pattern);
    CHECK_LAUNCH_ERROR();
    cudaThreadSynchronize();
    CHECK_LAUNCH_ERROR();

    errorCount += gpuVerifyConstant(nBlocks,nThreads,base,N,pattern,blockErrorCounts,errorCounts);
    CHECK_LAUNCH_ERROR();
    return errorCount;
} //}}}
// Memtest86 Test 6: tseq=2
__host__ uint gpuWalking32Bit(const uint nBlocks,const uint nThreads,uint* base,uint N,bool ones,uint shift,uint* blockErrorCount,uint* errorCounts) { //{{{
    // Given device arrays base (tested memory) and blockErrorCount (nBlocks uints in length of temp space),
    // does one iteration of the walking-{ones/zeros} 32-bit test, paralleling Memtest86,
    // with the starting pattern 1<<shift
    // NUMBER OF THREADS SHOULD BE A MULTIPLE OF 32

    deviceWriteWalking32Bit<<<nBlocks,nThreads>>>(base,N,ones,shift);
    CHECK_LAUNCH_ERROR();
    cudaThreadSynchronize();
    CHECK_LAUNCH_ERROR();

    deviceVerifyWalking32Bit<<<nBlocks,nThreads,sizeof(uint)*nThreads>>>(base,N,ones,shift,blockErrorCount);
    CHECK_LAUNCH_ERROR();
    cudaThreadSynchronize();
    CHECK_LAUNCH_ERROR();

    cudaMemcpy(errorCounts,blockErrorCount,sizeof(uint)*nBlocks,cudaMemcpyDeviceToHost);

    // Sum-reduce block error counts on the host - it's only on the order of 1k numbers.
    uint totalErrors = 0;
    for (uint i = 0; i < nBlocks; i++) {
        totalErrors += errorCounts[i];
    }
    return totalErrors;
} //}}}
__global__ void deviceWriteWalking32Bit(uint* base,uint N,bool ones,uint shift) {
    // Writes one iteration of the walking-{ones/zeros} 32-bit pattern to GPU memory

    // We want to write 1 << ((offset from base + shift) % 32).
    // Since thread indices are aligned with base, this reduces to
    //   1 << ((threadIdx.x + shift) & 0x1f),
    // with conditional inversion for walking zeros.
    uint pattern = 1 << ((threadIdx.x + shift) & 0x1f);
    pattern = ones ? pattern : ~pattern;
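    // Example: with shift = 3, thread 0 writes 0x00000008, thread 1 writes
    // 0x00000010, ..., wrapping every 32 threads (hence the multiple-of-32
    // thread count requirement in gpuWalking32Bit).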
    for (uint i = 0; i < N; i++) {
        *(THREAD_ADDRESS(base,N,i)) = pattern;
    }
}
__global__ void deviceVerifyWalking32Bit(uint* base,uint N,bool ones,uint shift,uint* blockErrorCount) {
    // Verifies memory at base against the expected walking 32-bit pattern
    // Sums number of errors found in block and stores error count into blockErrorCount[blockIdx.x]
    // Sum-reduce this array afterwards to get total error count over tested region
    // Uses 4*blockDim.x bytes of shared memory

    extern __shared__ uint threadErrorCount[];
    threadErrorCount[threadIdx.x] = 0;

    uint pattern = 1 << ((threadIdx.x + shift) & 0x1f);
    pattern = ones ? pattern : ~pattern;

    for (uint i = 0; i < N; i++) {
        //if ( *(THREAD_ADDRESS(base,N,i)) != pattern ) threadErrorCount[threadIdx.x]++;
        threadErrorCount[threadIdx.x] += BITSDIFF(*(THREAD_ADDRESS(base,N,i)),pattern);
    }
    // Parallel-reduce error counts over threads in block
    for (uint stride = blockDim.x>>1; stride > 0; stride >>= 1) {
        __syncthreads();
        if (threadIdx.x < stride)
            threadErrorCount[threadIdx.x] += threadErrorCount[threadIdx.x + stride];
    }
    __syncthreads();

    if (threadIdx.x == 0)
        blockErrorCount[blockIdx.x] = threadErrorCount[0];
}
// Memtest86 Test 7: tseq=9
__host__ uint gpuRandomBlocks(const uint nBlocks,const uint nThreads,uint* base,uint N,uint seed,uint* blockErrorCount,uint* errorCounts) { //{{{
    // Writes random numbers into memory and verifies the pattern
    deviceWriteRandomBlocks<<<nBlocks,nThreads,4*nThreads>>>(base,N,seed);
    CHECK_LAUNCH_ERROR();
    cudaThreadSynchronize();
    CHECK_LAUNCH_ERROR();

    deviceVerifyRandomBlocks<<<nBlocks,nThreads,12*nThreads>>>(base,N,seed,blockErrorCount);
    CHECK_LAUNCH_ERROR();
    cudaThreadSynchronize();
    CHECK_LAUNCH_ERROR();

    cudaMemcpy(errorCounts,blockErrorCount,sizeof(uint)*nBlocks,cudaMemcpyDeviceToHost);

    // Sum-reduce block error counts on the host - it's only on the order of 1k numbers.
    uint totalErrors = 0;
    for (uint i = 0; i < nBlocks; i++) {
        totalErrors += errorCounts[i];
    }
    return totalErrors;
} //}}}
// Math functions modulo the Mersenne prime 2^31-1 {{{
__device__ void deviceMul3131 (uint v1, uint v2, uint& LO, uint& HI)
{
    // Given v1, v2 < 2^31,
    // emulate a 31-bit integer multiply by doing a 32-bit multiply into LO and HI
    // and shifting bits around to make it look right:
    // carry the high bit of LO into the low bit of HI, then mask LO to 31 bits.
    LO = v1*v2;
    HI = __umulhi(v1,v2);
    HI <<= 1;
    HI |= (LO & 0x80000000) >> 31;
    LO &= 0x7FFFFFFF;
}
__device__ uint deviceModMP31(uint LO,uint HI) {
    // Reduces the 62-bit number (HI<<31) + LO modulo the Mersenne prime 2^31-1.
    // Since 2^31 == 1 (mod 2^31-1), the result is just HI + LO, folded once.
    // Reference: Encyclopedia of Cryptography and Security, Henk C. A. van Tilborg (ed.),
    // p. 381, "Mersenne Primes".
    uint sum = LO + HI;
    if (sum >= 0x80000000) {
        // If HI+LO >= 2^31, the high bit is set; subtract the modulus 2^31-1
        return sum - 0x80000000 + 1;
    } else {
        return sum;
    }
}
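// Example: LO = 0x7FFFFFFE, HI = 3 encodes x = 3*2^31 + (2^31-2) = 4p + 2
// (p = 2^31-1); sum = 0x80000001 >= 2^31, so the function returns
// 0x80000001 - 0x80000000 + 1 = 2 = x mod p.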
__device__ uint deviceMulMP31(uint a,uint b) {
    // Multiplies a pair of 31-bit integers a and b mod the Mersenne prime 2^31-1,
    // taking the result through a 62-bit intermediate
    uint LO,HI;
    deviceMul3131(a,b,LO,HI);
    return deviceModMP31(LO,HI);
}
__device__ uint deviceExpoModMP31(uint base,uint exponent) {
    // Computes base^exponent mod 2^31-1 by square-and-multiply
    uint result = 1;
    while (exponent > 0) {
        if (exponent & 1)
            result = deviceMulMP31(result,base);
        exponent >>= 1;
        base = deviceMulMP31(base,base);
    }
    return result;
}
// }}}
// deviceRan0p: Parallelized closed-form version of NR's ran0 {{{
__device__ uint deviceRan0p(int seed,int n) {
    uint an = deviceExpoModMP31(16807,n+1);
    return deviceMulMP31(an,seed);
} //}}}
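// (ran0 is the minimal standard generator x_{k+1} = 16807*x_k mod 2^31-1;
// since x_{k+n+1} = 16807^(n+1) * x_k, thread n can jump straight to its own
// value from a common seed instead of iterating serially.)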
// deviceIrbit2: random bit generation, from NR {{{
__device__ int deviceIrbit2(uint& seed) {
    // Primitive-polynomial shift-register bit generator (Numerical Recipes irbit2)
    const uint IB1 = 1;
    const uint IB2 = 2;
    const uint IB5 = 16;
    const uint IB18 = 131072;
    const uint MASK = IB1+IB2+IB5;
    if (seed & IB18) {
        seed = ((seed ^ MASK) << 1) | IB1;
        return 1;
    } else {
        seed <<= 1;
        return 0;
    }
} //}}}
__global__ void deviceWriteRandomBlocks(uint* base,uint N,int seed) { //{{{
    // Requires 4*nThreads bytes of shared memory
    extern __shared__ uint randomBlock[];

    // Make sure seed is not zero.
    if (seed == 0) seed = 123459876+blockIdx.x;
    uint bitSeed = deviceRan0p(seed + threadIdx.x,threadIdx.x);

    for (uint i = 0; i < N; i++) {
        // Generate a block of random numbers in parallel using the closed-form expression for ran0
        // OR in a random bit because ran0 will never have the high bit set
        randomBlock[threadIdx.x] = deviceRan0p(seed,threadIdx.x) | (deviceIrbit2(bitSeed) << 31);
        __syncthreads();

        // Set the seed for the next round to the last number calculated in this round
        seed = randomBlock[blockDim.x-1];

        // Blit shmem block out to global memory
        *(THREAD_ADDRESS(base,N,i)) = randomBlock[threadIdx.x];

        // Make sure all threads are done with randomBlock before the next round overwrites it
        __syncthreads();
    }
} //}}}
__global__ void deviceVerifyRandomBlocks(uint* base,uint N,int seed,uint* blockErrorCount) { //{{{
    // Verifies memory at base against the random pattern generated from seed
    // Sums number of errors found in block and stores error count into blockErrorCount[blockIdx.x]
    // Sum-reduce this array afterwards to get total error count over tested region
    // Uses 12*blockDim.x bytes of shared memory

    extern __shared__ uint shmem[];
    uint* threadErrorCount = shmem;
    uint* randomBlock = shmem + blockDim.x;
    // Put these into shmem to cut register count
    uint* bitSeeds = randomBlock + blockDim.x;

    threadErrorCount[threadIdx.x] = 0;

    // Make sure seed is not zero.
    if (seed == 0) seed = 123459876+blockIdx.x;
    //uint bitSeed = deviceRan0p(seed + threadIdx.x,threadIdx.x);
    bitSeeds[threadIdx.x] = deviceRan0p(seed + threadIdx.x,threadIdx.x);

    for (uint i = 0; i < N; i++) {
        // Generate a block of random numbers in parallel using the closed-form expression for ran0
        // OR in a random bit because ran0 will never have the high bit set
        //randomBlock[threadIdx.x] = deviceRan0p(seed,threadIdx.x) | (deviceIrbit2(bitSeed) << 31);
        randomBlock[threadIdx.x] = deviceRan0p(seed,threadIdx.x) | (deviceIrbit2(bitSeeds[threadIdx.x]) << 31);
        __syncthreads();

        // Set the seed for the next round to the last number calculated in this round
        seed = randomBlock[blockDim.x-1];

        //if ( randomBlock[threadIdx.x] != *(THREAD_ADDRESS(base,N,i))) threadErrorCount[threadIdx.x]++;
        threadErrorCount[threadIdx.x] += BITSDIFF(*(THREAD_ADDRESS(base,N,i)),randomBlock[threadIdx.x]);

        __syncthreads();
    }

    // Parallel-reduce error counts over threads in block
    for (uint stride = blockDim.x>>1; stride > 0; stride >>= 1) {
        __syncthreads();
        if (threadIdx.x < stride)
            threadErrorCount[threadIdx.x] += threadErrorCount[threadIdx.x + stride];
    }
    __syncthreads();

    if (threadIdx.x == 0)
        blockErrorCount[blockIdx.x] = threadErrorCount[0];
} //}}}
// Memtest86 Test 8: tseq=3 (M86 uses modulus = 20)
__host__ uint gpuModuloX(const uint nBlocks,const uint nThreads,uint* base,const uint N,uint shift,uint pattern1,const uint modulus,const uint iters,
                         uint* blockErrorCount,uint* errorCounts) { //{{{
    // Given device arrays base (tested memory) and blockErrorCount (nBlocks uints in length of temp space),
    // plus a shift, a modulus, a pattern to test, and a number of overwrite iterations,
    // performs the Modulo-X test on memory
    uint totalErrors = 0;
    shift %= modulus;

    // Test both the given pattern and its inverse
    for (uint i = 0; i < 2; i++, pattern1 = ~pattern1) {
        deviceWritePairedModulo<<<nBlocks,nThreads>>>(base,N,shift,pattern1,~pattern1,modulus,iters);
        CHECK_LAUNCH_ERROR();
        cudaThreadSynchronize();
        CHECK_LAUNCH_ERROR();

        deviceVerifyPairedModulo<<<nBlocks,nThreads,sizeof(uint)*nThreads>>>(base,N,shift,pattern1,modulus,blockErrorCount);
        CHECK_LAUNCH_ERROR();
        cudaThreadSynchronize();
        CHECK_LAUNCH_ERROR();

        cudaMemcpy(errorCounts,blockErrorCount,sizeof(uint)*nBlocks,cudaMemcpyDeviceToHost);

        // Sum-reduce block error counts on the host - it's only on the order of 1k numbers.
        for (uint j = 0; j < nBlocks; j++) {
            totalErrors += errorCounts[j];
        }
    }
    return totalErrors;
} //}}}
__global__ void deviceWritePairedModulo(uint* base,const uint N,const uint shift,const uint pattern1,const uint pattern2,const uint modulus,const uint iters) {
    // First writes pattern1 into every offset that is `shift` mod modulus,
    // then (iters times) writes pattern2 (= ~pattern1) into every other offset
    uint offset;
    for (uint i = 0 ; i < N; i++) {
        offset = THREAD_OFFSET(N,i);
        if ((offset % modulus) == shift) *(base+offset) = pattern1;
    }
    for (uint j = 0; j < iters; j++) {
        for (uint i = 0 ; i < N; i++) {
            offset = THREAD_OFFSET(N,i);
            if ((offset % modulus) != shift) *(base+offset) = pattern2;
        }
    }
}
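// Example: modulus = 20, shift = 3 writes pattern1 to offsets 3, 23, 43, ...
// and then overwrites every other offset with ~pattern1 `iters` times; the
// sparse pattern1 words must survive the repeated surrounding writes.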
__global__ void deviceVerifyPairedModulo(uint* base,uint N,const uint shift,const uint pattern1,const uint modulus,uint* blockErrorCount) {
    // Verifies that memory at each (offset mod modulus == shift) stores pattern1
    // Sums number of errors found in block and stores error count into blockErrorCount[blockIdx.x]
    // Sum-reduce this array afterwards to get total error count over tested region
    // Uses 4*blockDim.x bytes of shared memory

    extern __shared__ uint threadErrorCount[];
    threadErrorCount[threadIdx.x] = 0;
    uint offset;

    for (uint i = 0; i < N; i++) {
        offset = THREAD_OFFSET(N,i);
        //if (((offset % modulus) == shift) && (*(base+offset) != pattern1)) threadErrorCount[threadIdx.x]++;
        if ((offset % modulus) == shift) threadErrorCount[threadIdx.x] += BITSDIFF(*(base+offset),pattern1);
    }
    // Parallel-reduce error counts over threads in block
    for (uint stride = blockDim.x>>1; stride > 0; stride >>= 1) {
        __syncthreads();
        if (threadIdx.x < stride)
            threadErrorCount[threadIdx.x] += threadErrorCount[threadIdx.x + stride];
    }
    __syncthreads();

    if (threadIdx.x == 0)
        blockErrorCount[blockIdx.x] = threadErrorCount[0];
}