udacity · duongdang · Mar 2, 2014 · Mar 2, 2014
diff --git a/.gitignore b/.gitignore
@@ -22,3 +22,11 @@
 build
 bin
 
+*png
+*jpg
+*\#*
+*\~*
+CMakeCache.txt
+CMakeFiles
+HW1/HW1
+HW2/HW2
diff --git a/HW1/Makefile b/HW1/Makefile
@@ -22,7 +22,7 @@ OPENCV_INCLUDEPATH=/usr/include
 
 OPENCV_LIBS=-lopencv_core -lopencv_imgproc -lopencv_highgui
 
-CUDA_INCLUDEPATH=/usr/local/cuda-5.0/include
+CUDA_INCLUDEPATH=/usr/local/cuda-5.5/include
 
 ######################################################
 # On Macs the default install locations are below    #

diff --git a/HW1/reference_calc.cpp b/HW1/reference_calc.cpp
@@ -1,7 +1,7 @@
 // for uchar4 struct
 #include <cuda_runtime.h>
 
-void referenceCalculation(const uchar4* const rgbaImage,
+inline void referenceCalculation(const uchar4* const rgbaImage,
                           unsigned char *const greyImage,
                           size_t numRows,
                           size_t numCols)

diff --git a/HW1/student_func.cu b/HW1/student_func.cu
@@ -31,7 +31,9 @@
 //You should fill in the kernel as well as set the block and grid sizes
 //so that the entire image is processed.
 
+#include "reference_calc.cpp"
 #include "utils.h"
+#include <stdio.h>
 
 __global__
 void rgba_to_greyscale(const uchar4* const rgbaImage,
@@ -50,17 +52,34 @@ void rgba_to_greyscale(const uchar4* const rgbaImage,
   //First create a mapping from the 2D block and grid locations
   //to an absolute 2D location in the image, then use that to
   //calculate a 1D offset
+  // Global pixel coordinates. Scaling blockIdx by blockDim (rather than a
+  // hard-coded 32) keeps the mapping valid for any launch configuration and
+  // gives neighbouring threads neighbouring pixels.
+  const unsigned x = blockIdx.x * blockDim.x + threadIdx.x;
+  const unsigned y = blockIdx.y * blockDim.y + threadIdx.y;
+  // The grid is rounded up, so edge threads may fall outside the image.
+  if (x >= numCols || y >= numRows) return;
+  const unsigned pixel_idx = y*numCols + x;
+  const uchar4 rgba = rgbaImage[pixel_idx];
+  // Weighted sum of R, G, B; converted to the output pixel type on the store.
+  greyImage[pixel_idx] = 0.299f * rgba.x + 0.587f * rgba.y + 0.114f * rgba.z;
 }
 
 void your_rgba_to_greyscale(const uchar4 * const h_rgbaImage, uchar4 * const d_rgbaImage,
                             unsigned char* const d_greyImage, size_t numRows, size_t numCols)
 {
   //You must fill in the correct sizes for the blockSize and gridSize
   //currently only one block with one thread is being launched
-  const dim3 blockSize(1, 1, 1);  //TODO
-  const dim3 gridSize( 1, 1, 1);  //TODO
+  if (numRows == 0 || numCols == 0) return;  // empty image: nothing to launch
+  // 16x16 = 256 threads per block stays inside the per-block thread limit
+  // regardless of image size.
+  const unsigned tile = 16;
+  const dim3 blockSize(tile, tile, 1);
+  // Ceiling division gives partial tiles at the right/bottom edges a block.
+  const dim3 gridSize((numCols + tile - 1) / tile,
+                      (numRows + tile - 1) / tile, 1);
   rgba_to_greyscale<<<gridSize, blockSize>>>(d_rgbaImage, d_greyImage, numRows, numCols);
-
   cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError());
+  //cudaMemcpy(&h_greyImage, &d_greyImage, bw_size, cudaMemcpyDeviceToHost); 
 
 }
diff --git a/HW2/HW2.cpp b/HW2/HW2.cpp
@@ -110,6 +110,8 @@ void postProcess(const std::string& output_file, uchar4* data_ptr) {
   cv::cvtColor(output, imageOutputBGR, CV_RGBA2BGR);
   //output the image
   cv::imwrite(output_file.c_str(), imageOutputBGR);
+  // cv::imshow("Blurr image", imageOutputBGR);
+  // cv::waitKey(0);
 }
 
 void cleanUp(void)

diff --git a/HW2/Makefile b/HW2/Makefile
@@ -16,7 +16,7 @@ OPENCV_INCLUDEPATH=/usr/include
 #OPENCV_INCLUDEPATH=/usr/local/include
 
 OPENCV_LIBS=-lopencv_core -lopencv_imgproc -lopencv_highgui
-CUDA_INCLUDEPATH=/usr/local/cuda-5.0/include
+CUDA_INCLUDEPATH=/usr/local/cuda-5.5/include
 
 ######################################################
 # On Macs the default install locations are below    #

diff --git a/HW2/student_func.cu b/HW2/student_func.cu
@@ -101,28 +101,58 @@
 //****************************************************************************
 
 #include "utils.h"
+#include "stdio.h"
+
+#define BLOCKWIDTH 32
 
 __global__
 void gaussian_blur(const unsigned char* const inputChannel,
                    unsigned char* const outputChannel,
                    int numRows, int numCols,
-                   const float* const filter, const int filterWidth)
-{
+                   const float* const filter, const int filterWidth) {
   // TODO
-
+  unsigned x = blockIdx.x*BLOCKWIDTH + threadIdx.x;
+  unsigned y = blockIdx.y*BLOCKWIDTH + threadIdx.y;
   // NOTE: Be sure to compute any intermediate results in floating point
   // before storing the final result as unsigned char.
 
   // NOTE: Be careful not to try to access memory that is outside the bounds of
   // the image. You'll want code that performs the following check before accessing
   // GPU memory:
   //
-  // if ( absolute_image_position_x >= numCols ||
-  //      absolute_image_position_y >= numRows )
-  // {
-  //     return;
-  // }
-
+  if ( x >= numCols ||
+       y >= numRows )
+  {
+      return;
+  }
+  const unsigned idx = y*numCols + x;
+  // Accumulate in float; the sum becomes an unsigned char only on the final store.
+  float output = 0.0f;
+  // Centre tap index. filterWidth/2 keeps the window symmetric about (x, y) for odd
+  // widths; (filterWidth + 1)/2 would slide the whole window one pixel off-centre.
+  const int center = filterWidth / 2;
+  for (int i = 0; i < filterWidth; ++i) {
+    for (int j = 0; j < filterWidth; ++j) {
+      // Signed arithmetic so neighbours left of / above the image go negative.
+      int curr_x = static_cast<int>(x) - (i - center);
+      int curr_y = static_cast<int>(y) - (j - center);
+      // Clamp to the nearest edge pixel when the window leaves the image.
+      if (curr_x < 0) {
+        curr_x = 0;
+      }
+      if (curr_x >= numCols) {
+        curr_x = numCols - 1;
+      }
+      if (curr_y < 0) {
+        curr_y = 0;
+      }
+      if (curr_y >= numRows) {
+        curr_y = numRows - 1;
+      }
+      output += filter[j*filterWidth + i] * inputChannel[curr_y*numCols + curr_x];
+    }
+  }
+  outputChannel[idx] = output;
   // NOTE: If a thread's absolute position 2D position is within the image, but some of
   // its neighbors are outside the image, then you will need to be extra careful. Instead
   // of trying to read such a neighbor value from GPU memory (which won't work because
@@ -152,6 +182,18 @@ void separateChannels(const uchar4* const inputImageRGBA,
   // {
   //     return;
   // }
+  // Pixel handled by this thread; BLOCKWIDTH must match the launch's block edge.
+  const unsigned col = blockIdx.x*BLOCKWIDTH + threadIdx.x;
+  const unsigned row = blockIdx.y*BLOCKWIDTH + threadIdx.y;
+  // Only threads that land inside the image do any work.
+  if (col < numCols && row < numRows) {
+    const unsigned pixel = row*numCols + col;
+    // Read the packed pixel once, then scatter x/y/z into the three planes.
+    const uchar4 rgba = inputImageRGBA[pixel];
+    redChannel[pixel]   = rgba.x;
+    greenChannel[pixel] = rgba.y;
+    blueChannel[pixel]  = rgba.z;
+  }
 }
 
 //This kernel takes in three color channels and recombines them
@@ -198,46 +240,52 @@ void allocateMemoryAndCopyToGPU(const size_t numRowsImage, const size_t numColsI
   checkCudaErrors(cudaMalloc(&d_green, sizeof(unsigned char) * numRowsImage * numColsImage));
   checkCudaErrors(cudaMalloc(&d_blue,  sizeof(unsigned char) * numRowsImage * numColsImage));
 
-  //TODO:
+  //TODO: OK
   //Allocate memory for the filter on the GPU
   //Use the pointer d_filter that we have already declared for you
   //You need to allocate memory for the filter with cudaMalloc
   //be sure to use checkCudaErrors like the above examples to
   //be able to tell if anything goes wrong
   //IMPORTANT: Notice that we pass a pointer to a pointer to cudaMalloc
-
-  //TODO:
+  checkCudaErrors(cudaMalloc(&d_filter,  sizeof(float) * filterWidth * filterWidth));
+  //TODO: OK
   //Copy the filter on the host (h_filter) to the memory you just allocated
   //on the GPU.  cudaMemcpy(dst, src, numBytes, cudaMemcpyHostToDevice);
   //Remember to use checkCudaErrors!
-
+  checkCudaErrors(cudaMemcpy(d_filter, h_filter, sizeof(float) * filterWidth * filterWidth, cudaMemcpyHostToDevice));
 }
 
 void your_gaussian_blur(const uchar4 * const h_inputImageRGBA, uchar4 * const d_inputImageRGBA,
                         uchar4* const d_outputImageRGBA, const size_t numRows, const size_t numCols,
-                        unsigned char *d_redBlurred, 
-                        unsigned char *d_greenBlurred, 
+                        unsigned char *d_redBlurred,
+                        unsigned char *d_greenBlurred,
                         unsigned char *d_blueBlurred,
                         const int filterWidth)
 {
-  //TODO: Set reasonable block size (i.e., number of threads per block)
-  const dim3 blockSize;
+  //TODO: OK Set reasonable block size (i.e., number of threads per block)
+  const dim3 blockSize(BLOCKWIDTH, BLOCKWIDTH, 1);
 
-  //TODO:
+  //TODO: OK
   //Compute correct grid size (i.e., number of blocks per kernel launch)
   //from the image size and and block size.
-  const dim3 gridSize;
-
-  //TODO: Launch a kernel for separating the RGBA image into different color channels
+  const dim3 gridSize(numCols/BLOCKWIDTH + 1, numRows/BLOCKWIDTH + 1, 1);
 
+  //TODO: OK Launch a kernel for separating the RGBA image into different color channels
   // Call cudaDeviceSynchronize(), then call checkCudaErrors() immediately after
   // launching your kernel to make sure that you didn't make any mistakes.
-  cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError());
+  separateChannels<<<gridSize, blockSize>>>(d_inputImageRGBA, numRows, numCols,
+                                            d_red, d_green, d_blue);
+  cudaDeviceSynchronize();
+  checkCudaErrors(cudaGetLastError());
 
   //TODO: Call your convolution kernel here 3 times, once for each color channel.
-
   // Again, call cudaDeviceSynchronize(), then call checkCudaErrors() immediately after
-  // launching your kernel to make sure that you didn't make any mistakes.
+  // launching your kernel to make sure that you didn't make any mistakes
+  gaussian_blur<<<gridSize, blockSize>>>(d_red, d_redBlurred, numRows, numCols, d_filter, filterWidth);
+  cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError());
+  gaussian_blur<<<gridSize, blockSize>>>(d_green, d_greenBlurred, numRows, numCols, d_filter, filterWidth);
+  cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError());
+  gaussian_blur<<<gridSize, blockSize>>>(d_blue, d_blueBlurred, numRows, numCols, d_filter, filterWidth);
   cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError());
 
   // Now we recombine your results. We take care of launching this kernel for you.
-Original file line number
+Diff line change
@@ @@ -22,3 +22,11 @@ @@
     build
     bin
+    *png
+    *jpg
+    *\#*
+    *\~*
+    CMakeCache.txt
+    CMakeFiles
+    HW1/HW1
+    HW2/HW2