2017年01月22日

【CUDA】ループ展開とシェアードメモリでフィルタ演算高速化

以下の記事のコードに対して、ループ展開とシェアードメモリによる高速化を試みました。
【CUDA】グローバルメモリとテクスチャメモリの速度比較

以下の修士論文を参考にさせていただきました。
GPUを用いた画像処理の高速化

コードは以下の通り。

const int blockSize = 32;
const int blockSizeWithoutEdge = blockSize - 2;
const int blockSize1d = blockSize * blockSize;

// 平均値フィルタ(GlobalMemory)
__global__ void meanFilterKernel(unsigned char *out, const unsigned char *in,
	const int width, const int height, const int filterRadius, const float fscale)
{
	if (blockIdx.x - filterRadius < 0 || blockIdx.x + filterRadius >= height) return;

	for (int i = threadIdx.x; i < width; i += blockDim.x) {
		int sum = 0;
		for (int fy = -filterRadius; fy <= filterRadius; ++fy) {
			int y = blockIdx.x + fy;
			for (int fx = -filterRadius; fx <= filterRadius; ++fx) {
				sum += in[y * width + i + fx];
			}
		}
		out[blockIdx.x * width + i] = sum * fscale;
	}
}

// 平均値フィルタ（GlobalMemory, Unwind）
__global__ void meanFilterKernelUnwind(unsigned char *out, const unsigned char *in,
	const int width, const int height)
{
	if (blockIdx.x == 0 || blockIdx.x == height - 1) return;

	const float scale = 0.1111111111;
	for (int i = threadIdx.x; i < width; i += blockDim.x) {
		int sum = 0;
		const unsigned char* tmp = in + (blockIdx.x - 1) * width + i - 1;
		sum += *tmp++;
		sum += *tmp++;
		sum += *tmp;
		tmp += width - 2;
		sum += *tmp++;
		sum += *tmp++;
		sum += *tmp;
		tmp += width - 2;
		sum += *tmp++;
		sum += *tmp++;
		sum += *tmp;
		out[blockIdx.x * width + i] = sum * scale;
	}
}

// 平均値フィルタ（SharedMemory, Unwind）
__constant__ int dTopLeftIndex[8000];
__global__ void meanFilterKernelUnwindTileShared(unsigned char *out,
	const unsigned char *in, const int width)
{
	__shared__  unsigned char tile[blockSize1d];
	__shared__ int topLeftIndex;
	int indexInTile = threadIdx.y * blockDim.x + threadIdx.x;

	if (threadIdx.x == 0 && threadIdx.y == 0) topLeftIndex = dTopLeftIndex[blockIdx.x];
	__syncthreads();

	int indexInImage = topLeftIndex + threadIdx.y * width + threadIdx.x;
	tile[indexInTile] = in[indexInImage];
	__syncthreads();

	if (threadIdx.x == 0 || threadIdx.x == blockDim.x - 1 ||
		threadIdx.y == 0 || threadIdx.y == blockDim.y - 1) return;

	const float scale = 0.1111111111;
	int sum = 0;
	const unsigned char* tmp = tile + indexInTile - blockDim.x - 1;
	sum += *tmp++;
	sum += *tmp++;
	sum += *tmp;
	tmp += blockDim.x - 2;
	sum += *tmp++;
	sum += *tmp++;
	sum += *tmp;
	tmp += blockDim.x - 2;
	sum += *tmp++;
	sum += *tmp++;
	sum += *tmp;
	out[indexInImage] = sum * scale;
}

void main()
{
	cv::Mat img = cv::imread("Room_02000.bmp", 0);
	int pixelNum = img.cols * img.rows;
	int dataSize = pixelNum * sizeof(unsigned char);

	unsigned char *hOut, *dIn, *dOut;
	hOut = new unsigned char[pixelNum];
	checkCudaErrors(cudaMalloc((unsigned char**)&dIn, dataSize));
	checkCudaErrors(cudaMalloc((unsigned char**)&dOut, dataSize));
	checkCudaErrors(cudaMemcpy(dIn, img.data, dataSize, cudaMemcpyHostToDevice));

	cudaArray *cu_array;
	cudaChannelFormatDesc desc = cudaCreateChannelDesc<unsigned char>();
	checkCudaErrors(cudaMallocArray(&cu_array, &desc, img.cols, img.rows));
	checkCudaErrors(cudaMemcpyToArray(cu_array, 0, 0, img.data, dataSize,
		cudaMemcpyHostToDevice));
	checkCudaErrors(cudaBindTextureToArray(tex, cu_array));

	LARGE_INTEGER freq;
	QueryPerformanceFrequency(&freq);

	int filterRadius = 1;
	int filterDiameter = filterRadius * 2 + 1;
	float fscale = 1.0f / filterDiameter / filterDiameter;

	// CPU
	LARGE_INTEGER start, end;
	QueryPerformanceCounter(&start);
	for (int h = filterRadius; h < img.rows - filterRadius; ++h) {
		for (int w = filterRadius; w < img.cols - filterRadius; ++w) {
			int sum = 0;
			for (int fy = -filterRadius; fy <= filterRadius; ++fy) {
				int y = h + fy;
				for (int x = -filterRadius; x <= filterRadius; ++x) {
					sum += img.data[y * img.cols + w + x];
				}
			}
			hOut[h * img.cols + w] = sum * fscale;
		}
	}
	QueryPerformanceCounter(&end);
	printf("CPU:%lf[ms]\n", (double)(end.QuadPart - start.QuadPart)
		/ freq.QuadPart * 1000.0);
	//memcpy(img.data, hOut, dataSize);

	// GPU(GlobalMemory)
	dim3 block(1024);
	dim3 grid(img.rows);
	int n = 100;
	QueryPerformanceCounter(&start);
	for (int i = 0; i < n; ++i) {
		meanFilterKernel << <grid, block >> >(dOut, dIn, img.cols, img.rows, 
			filterRadius, fscale);
		checkCudaErrors(cudaDeviceSynchronize());
	}
	QueryPerformanceCounter(&end);
	printf("GPU(GlobalMemory):%lf[ms]\n", (double)(end.QuadPart - start.QuadPart)
		/ freq.QuadPart * 1000.0 / n);
	//checkCudaErrors(cudaMemcpy(img.data, dOut, dataSize, cudaMemcpyDeviceToHost));

	// GPU(GlobalMemory, Unwind)
	QueryPerformanceCounter(&start);
	for (int i = 0; i < n; ++i) {
		meanFilterKernelUnwind << <grid, block >> >(dOut, dIn, img.cols, img.rows);
		checkCudaErrors(cudaDeviceSynchronize());
	}
	QueryPerformanceCounter(&end);
	printf("GPU(GlobalMemory, Unwind):%lf[ms]\n", (double)(end.QuadPart - start.QuadPart)
		/ freq.QuadPart * 1000.0 / n);
	//checkCudaErrors(cudaMemcpy(img.data, dOut, dataSize, cudaMemcpyDeviceToHost));

	//
	// GPU(SharedMemory, Unwind)
	//

	// ブロックの左上端の座標を記録
	int blockNumX = img.cols / blockSizeWithoutEdge;
	int blockNumY = img.rows / blockSizeWithoutEdge;

	vector<int> topLeftIndex, topLeftX, topLeftY;
	for (int by = 0; by < blockNumY; ++by) {
		int y = by * blockSizeWithoutEdge;
		for (int bx = 0; bx < blockNumX; ++bx) {
			int x = bx * blockSizeWithoutEdge;
			topLeftIndex.push_back(y * img.cols + x);
		}
	}

	// 右端ブロック
	if (img.cols % blockSizeWithoutEdge != 0) {
		int x = img.cols - blockSize;
		for (int by = 0; by < blockNumY; ++by) {
			int y = by * blockSizeWithoutEdge;
			topLeftIndex.push_back(y * img.cols + x);
		}
	}
	// 下端ブロック
	if (img.rows % blockSizeWithoutEdge != 0) {
		int y = img.rows - blockSize;
		for (int bx = 0; bx < blockNumX; ++bx) {
			int x = bx * blockSizeWithoutEdge;
			topLeftIndex.push_back(y * img.cols + x);
		}
	}
	// 右下端ブロック
	if (img.cols % blockSizeWithoutEdge != 0 && img.rows % blockSizeWithoutEdge != 0) {
		int x = img.cols - blockSize;
		int y = img.rows - blockSize;
		topLeftIndex.push_back(y * img.cols + x);
	}

	checkCudaErrors(cudaMemcpyToSymbol(dTopLeftIndex, &topLeftIndex[0], sizeof(int) 
		* topLeftIndex.size()));
	
        dim3 blockTile(blockSize, blockSize);
	dim3 gridTile(topLeftIndex.size());
	QueryPerformanceCounter(&start);
	for (int i = 0; i < n; ++i) {
		meanFilterKernelUnwindTileShared << <gridTile, blockTile >> >(dOut, dIn, img.cols);
		checkCudaErrors(cudaDeviceSynchronize());
	}
	QueryPerformanceCounter(&end);
	printf("GPU(SharedMemory, Unwind):%lf[ms]\n", (double)(end.QuadPart - start.QuadPart)
		/ freq.QuadPart * 1000.0 / n);
	//checkCudaErrors(cudaMemcpy(img.data, dOut, dataSize, cudaMemcpyDeviceToHost));
}

結果は以下の通り。

ループ展開によって処理時間が約2/3になりましたが、シェアードメモリを利用しても
それ以上は速くなりませんでした。

以上です。

※追記
フィルタサイズを大きくすると、シェアードメモリが相対的に速くなりました。
3x3の場合は元々グローバルメモリへのアクセス回数が少ないので、シェアードメモリの効果が見えにくかったということですね。

2017年01月22日

【メモ】（ポインタの）後置インクリメント

void main()
{
   int A[5] = { 1, 2, 3, 4, 5 };
   int* pA = A;
   for (int i = 0; i < 2; ++i) {
       cout << *(pA++) + *(pA++) << endl;
   }
}

*(pA++)でも、*pA++でも、2 6と出力される。
つまり、後置インクリメントはその行の命令が全て実行されてから、値が更新される。

2017年01月21日

CUDAの基本概念に関する世代間の共通点と相違点

CUDAは世代間でアーキテクチャが大きく異なり、勉強していると混乱することがよくある。
そこで、基本概念に関する世代間の共通点と相違点をまとめてみた。

〇共通点
・グリッド、ブロック、スレッド
ブログラム実行に関わる論理的概念（物理的なものではない）。
スレッドは、プログラムの最小単位。
ブロックは、複数のスレッドをまとめたもの。グリッドは、複数のブロックをまとめたもの。

・WARP
32スレッドを束ねたもの（論理的概念）であり、プログラムはWARP単位で実行される。

・CUDAコア
1スレッドの処理を担当する。
ストリーミングプロセッサ（SP）とも呼ばれる。

・ストリーミングマルチプロセッサ（SM）
複数のCUDAコアをまとめたもの。1ブロックの処理を担当する。
世代によって呼び方が異なる（後述）。

〇相違点
・Fermi世代
SMを32個のCUDAコアで構成する。
GTX480は15個のSMで構成されるので、CUDAコア数は32×15=480個。
Quadro6000は14個のSMで構成されるので、CUDAコア数は32×14=448個。

・Kepler世代
SMをSMXと呼ぶ。SMXを192個のCUDAコアで構成する。
GTX780は12個のSMXで構成されるので、CUDAコア数は192×12=2304個。
QuadroK6000は15個のSMXで構成されるので、CUDAコア数は192×15=2880個。

・Maxwell世代
SMをSMMと呼ぶ。SMMを128個のCUDAコアで構成する。
GTX980は16個のSMMで構成されるので、CUDAコア数は128×16=2048個。
QuadroM6000は24個のSMMで構成されるので、CUDAコア数は128×24=3072個。

以上です。