-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgaussianBlur.cu
160 lines (123 loc) · 5.73 KB
/
gaussianBlur.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <stdio.h>
#include <algorithm>
#include <vector>
#include <cmath>
#include <numeric>
using namespace std;
// error checking
// gaussian kernel is seperable; means we can perform 2d blur by doing 1d blur twice
// one 1d horizontal blur and another 1d vertical blur on the horizontal blur above
inline cudaError_t checkCudaErrors(cudaError_t error){
if(error != cudaSuccess){
fprintf(stderr, "CUDA ERROR = %d: %s (%s:%d)\n", error, cudaGetErrorString(error), __FILE__, __LINE__);
exit(EXIT_FAILURE);
}
return error;
}
vector<float> generateGaussianKernel1D(int kernelSize, float sigma){
vector<float> kernel(kernelSize);
int kernelRadius = kernelSize/2;
float sum = 0.0f;
for(int x = -kernelRadius; x <= kernelRadius; ++x){
float gaussianValue = exp(-(x * x) / (2.0f * sigma * sigma));
kernel[x + kernelRadius] = gaussianValue;
sum += gaussianValue;
}
// normalizing kernel
for(int i=0; i < kernelSize; ++i){
kernel[i] /= sum;
}
return kernel;
}
// horizontal blur
__global__
void gaussianBlurHorizontalKernel(const unsigned char* inputImage, unsigned char* intermediateImage, int width, int height, const float* kernel1D, int kernelSize){
int row = blockIdx.y * blockDim.y + threadIdx.y;
int col = blockIdx.x * blockDim.x + threadIdx.x;
if(row < height && col < width){
float blurredPixelValue = 0.0f;
int kernelRadius = kernelSize/2;
for(int kernelColOffset = -kernelRadius; kernelColOffset <= kernelRadius; ++kernelColOffset){
int neighbourCol = col + kernelColOffset;
int clampedCol = max(0, min(neighbourCol, width - 1));
blurredPixelValue += inputImage[row * width + clampedCol] * kernel1D[kernelColOffset + kernelRadius];
}
intermediateImage[row * width + col] = (unsigned char)blurredPixelValue;
}
}
// vertical blur on result of horizontal blur
__global__
void gaussianBlurVerticalKernel(const unsigned char* intermediateImage, unsigned char* outputImage,
int width, int height, const float* kernel1D, int kernelSize) {
int row = blockIdx.y * blockDim.y + threadIdx.y;
int col = blockIdx.x * blockDim.x + threadIdx.x;
if (row < height && col < width) {
float blurredPixelValue = 0.0f;
int kernelRadius = kernelSize / 2;
for (int kernelRowOffset = -kernelRadius; kernelRowOffset <= kernelRadius; ++kernelRowOffset) {
int neighborRow = row + kernelRowOffset;
int clampedRow = max(0, min(neighborRow, height - 1)); // Clamp row index only (vertical blur)
blurredPixelValue += intermediateImage[clampedRow * width + col] * kernel1D[kernelRowOffset + kernelRadius]; // Read from intermediateImage
}
outputImage[row * width + col] = (unsigned char)blurredPixelValue; // Write to final output image
}
}
int main(){
int width = 256;
int height = 256;
unsigned char* h_inputImage = new unsigned char[width * height];
unsigned char* h_outputImage = new unsigned char[width * height];
int kernelSize = 5;
float sigma = 1.5f;
vector<float> h_kernel1D = generateGaussianKernel1D(kernelSize, sigma);
int kernelLength = h_kernel1D.size();
float* d_kernel1D;
checkCudaErrors(cudaMalloc((void**)&d_kernel1D, kernelLength * sizeof(float)));
checkCudaErrors(cudaMemcpy(d_kernel1D, h_kernel1D.data(), kernelLength * sizeof(float), cudaMemcpyHostToDevice));
unsigned char* d_intermediateImage;
checkCudaErrors(cudaMalloc((void**)&d_intermediateImage, width * height * sizeof(unsigned char)));
unsigned char* d_inputImage;
unsigned char* d_outputImage;
checkCudaErrors(cudaMalloc((void**)&d_inputImage, width * height * sizeof(unsigned char)));
checkCudaErrors(cudaMalloc((void**)&d_outputImage, width * height * sizeof(unsigned char)));
checkCudaErrors(cudaMemcpy(d_inputImage, h_inputImage, width * height * sizeof(unsigned char), cudaMemcpyHostToDevice));
dim3 blockDim(16, 16);
dim3 gridDim((width + blockDim.x - 1)/blockDim.x, (height + blockDim.y - 1)/blockDim.y);
// creating cuda events for timing
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
// recording 'start' event at 0
cudaEventRecord(start, 0);
gaussianBlurHorizontalKernel<<<gridDim, blockDim>>>(d_inputImage, d_intermediateImage, width, height, d_kernel1D, kernelSize);
checkCudaErrors(cudaGetLastError());
checkCudaErrors(cudaDeviceSynchronize());
gaussianBlurVerticalKernel<<<gridDim, blockDim>>>(d_intermediateImage, d_outputImage, width, height, d_kernel1D, kernelSize);
checkCudaErrors(cudaGetLastError());
checkCudaErrors(cudaDeviceSynchronize());
// recording 'stop' event
cudaEventRecord(stop, 0);
// wait for stop event to finish to be recorded and gpu to finish
cudaEventSynchronize(stop);
checkCudaErrors(cudaMemcpy(h_outputImage, d_outputImage, width * height * sizeof(unsigned char), cudaMemcpyDeviceToHost));
printf("first few blurred pixel values are \n");
for(int i =0; i < 10; ++i){
printf("%d ", h_outputImage[i]);
}
printf("\n");
// calculating execution time in milliseconds
float milliseconds;
cudaEventElapsedTime(&milliseconds, start, stop);
// milliseconds = milliseconds/100;
printf("\n Kernel Execution Time: %.3f milliseconds\n", milliseconds);
// freeing memory
checkCudaErrors(cudaFree(d_intermediateImage));
checkCudaErrors(cudaFree(d_kernel1D));
checkCudaErrors(cudaFree(d_inputImage));
checkCudaErrors(cudaFree(d_outputImage));
delete[] h_inputImage;
delete[] h_outputImage;
return 0;
}