Skip to content
This repository was archived by the owner on Jun 27, 2022. It is now read-only.

inline referencecalculation. #16

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -22,3 +22,11 @@
build
bin

*png
*jpg
*\#*
*\~*
CMakeCache.txt
CMakeFiles
HW1/HW1
HW2/HW2
2 changes: 1 addition & 1 deletion HW1/Makefile
Original file line number Diff line number Diff line change
@@ -22,7 +22,7 @@ OPENCV_INCLUDEPATH=/usr/include

OPENCV_LIBS=-lopencv_core -lopencv_imgproc -lopencv_highgui

CUDA_INCLUDEPATH=/usr/local/cuda-5.0/include
CUDA_INCLUDEPATH=/usr/local/cuda-5.5/include

######################################################
# On Macs the default install locations are below #
2 changes: 1 addition & 1 deletion HW1/reference_calc.cpp
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
// for uchar4 struct
#include <cuda_runtime.h>

void referenceCalculation(const uchar4* const rgbaImage,
inline void referenceCalculation(const uchar4* const rgbaImage,
unsigned char *const greyImage,
size_t numRows,
size_t numCols)
25 changes: 22 additions & 3 deletions HW1/student_func.cu
Original file line number Diff line number Diff line change
@@ -31,7 +31,9 @@
//You should fill in the kernel as well as set the block and grid sizes
//so that the entire image is processed.

#include "reference_calc.cpp"
#include "utils.h"
#include <stdio.h>

__global__
void rgba_to_greyscale(const uchar4* const rgbaImage,
@@ -50,17 +52,34 @@ void rgba_to_greyscale(const uchar4* const rgbaImage,
//First create a mapping from the 2D block and grid locations
//to an absolute 2D location in the image, then use that to
//calculate a 1D offset
// Global pixel coordinates of this thread. They are derived from blockDim
// rather than a hard-coded width, so the kernel is correct for any launch
// geometry whose grid covers the image, and threads that are adjacent in x
// touch adjacent pixels.
const unsigned x = blockIdx.x * blockDim.x + threadIdx.x;
const unsigned y = blockIdx.y * blockDim.y + threadIdx.y;

// The grid is rounded up to whole blocks, so threads past the right or
// bottom edge of the image have no pixel to process.
if (x >= numCols || y >= numRows) {
  return;
}

// Row-major 1D offset of pixel (x, y).
const size_t pixel_idx = static_cast<size_t>(y) * numCols + x;
const uchar4 rgba = rgbaImage[pixel_idx];
// Weighted sum of the .x, .y and .z components; .w is not used. The float
// result is narrowed implicitly to the element type of greyImage.
greyImage[pixel_idx] = 0.299f * rgba.x + 0.587f * rgba.y + 0.114f * rgba.z;
}

// Launches rgba_to_greyscale over the whole numRows x numCols image and waits
// for it to finish.
//
//   h_rgbaImage       host copy of the input image; not used by this function.
//   d_rgbaImage       input pixels, handed to the kernel as received.
//   d_greyImage       output buffer, handed to the kernel as received.
//   numRows, numCols  image dimensions in pixels.
//
// No allocation and no host<->device copy is performed here.
void your_rgba_to_greyscale(const uchar4 * const h_rgbaImage, uchar4 * const d_rgbaImage,
unsigned char* const d_greyImage, size_t numRows, size_t numCols)
{
  // A grid with a zero dimension is an invalid launch configuration, and an
  // empty image needs no work anyway.
  if (numRows == 0 || numCols == 0) {
    return;
  }

  // Fixed 16x16 tile: 256 threads per block regardless of image size. A block
  // sized from the image dimensions would exceed the per-block thread limit
  // on large images and make the launch fail.
  const unsigned tile = 16;
  const dim3 blockSize(tile, tile, 1);

  // Ceiling division so a partial tile at the right/bottom edge still gets a
  // block; the kernel's bounds check discards the surplus threads.
  const dim3 gridSize(static_cast<unsigned>((numCols + tile - 1) / tile),
                      static_cast<unsigned>((numRows + tile - 1) / tile),
                      1);

  rgba_to_greyscale<<<gridSize, blockSize>>>(d_rgbaImage, d_greyImage, numRows, numCols);

  // Launch-configuration errors are reported by cudaGetLastError(); faults
  // raised while the kernel runs are reported by the synchronizing call.
  checkCudaErrors(cudaGetLastError());
  checkCudaErrors(cudaDeviceSynchronize());
}
2 changes: 2 additions & 0 deletions HW2/HW2.cpp
Original file line number Diff line number Diff line change
@@ -110,6 +110,8 @@ void postProcess(const std::string& output_file, uchar4* data_ptr) {
cv::cvtColor(output, imageOutputBGR, CV_RGBA2BGR);
//output the image
cv::imwrite(output_file.c_str(), imageOutputBGR);
// cv::imshow("Blurr image", imageOutputBGR);
// cv::waitKey(0);
}

void cleanUp(void)
2 changes: 1 addition & 1 deletion HW2/Makefile
Original file line number Diff line number Diff line change
@@ -16,7 +16,7 @@ OPENCV_INCLUDEPATH=/usr/include
#OPENCV_INCLUDEPATH=/usr/local/include

OPENCV_LIBS=-lopencv_core -lopencv_imgproc -lopencv_highgui
CUDA_INCLUDEPATH=/usr/local/cuda-5.0/include
CUDA_INCLUDEPATH=/usr/local/cuda-5.5/include

######################################################
# On Macs the default install locations are below #
96 changes: 72 additions & 24 deletions HW2/student_func.cu
Original file line number Diff line number Diff line change
@@ -101,28 +101,58 @@
//****************************************************************************

#include "utils.h"
#include "stdio.h"

#define BLOCKWIDTH 32

__global__
void gaussian_blur(const unsigned char* const inputChannel,
unsigned char* const outputChannel,
int numRows, int numCols,
const float* const filter, const int filterWidth)
{
const float* const filter, const int filterWidth) {
// TODO

unsigned x = blockIdx.x*BLOCKWIDTH + threadIdx.x;
unsigned y = blockIdx.y*BLOCKWIDTH + threadIdx.y;
// NOTE: Be sure to compute any intermediate results in floating point
// before storing the final result as unsigned char.

// NOTE: Be careful not to try to access memory that is outside the bounds of
// the image. You'll want code that performs the following check before accessing
// GPU memory:
//
// if ( absolute_image_position_x >= numCols ||
// absolute_image_position_y >= numRows )
// {
// return;
// }

if ( x >= numCols ||
y >= numRows )
{
return;
}
// Row-major offset of the output pixel handled by this thread.
const unsigned idx = y*numCols + x;

// Accumulate in float; the result is narrowed to unsigned char only once,
// when it is stored.
float output = 0.0f;

// Index of the centre tap of the square filter (assumes filterWidth is odd --
// TODO confirm). Tap (i, j) is applied to the neighbour at offset
// (i - half, j - half), so the window is symmetric around (x, y).
const int half = filterWidth / 2;

// Signed copies of the coordinates so that negative neighbour positions are
// represented directly instead of relying on unsigned wraparound.
const int px = static_cast<int>(x);
const int py = static_cast<int>(y);

for (int j = 0; j < filterWidth; ++j) {
  // Neighbours above or below the image reuse the nearest edge row.
  const int curr_y = min(max(py + j - half, 0), numRows - 1);
  for (int i = 0; i < filterWidth; ++i) {
    // Neighbours left or right of the image reuse the nearest edge column.
    const int curr_x = min(max(px + i - half, 0), numCols - 1);
    output += filter[j*filterWidth + i] * inputChannel[curr_y*numCols + curr_x];
  }
}

outputChannel[idx] = static_cast<unsigned char>(output);
// NOTE: If a thread's absolute 2D position is within the image, but some of
// its neighbors are outside the image, then you will need to be extra careful. Instead
// of trying to read such a neighbor value from GPU memory (which won't work because
@@ -152,6 +182,18 @@ void separateChannels(const uchar4* const inputImageRGBA,
// {
// return;
// }
// Absolute pixel coordinates of this thread, with BLOCKWIDTH as the block
// extent in both x and y.
const unsigned x = threadIdx.x + blockIdx.x*BLOCKWIDTH;
const unsigned y = threadIdx.y + blockIdx.y*BLOCKWIDTH;

// Only threads that land on a pixel inside the image do any work.
if (x < numCols && y < numRows) {
  const unsigned idx = x + y*numCols;
  // Read the packed pixel once, then scatter its .x/.y/.z components into
  // the red/green/blue planes at the same offset.
  const uchar4 pixel = inputImageRGBA[idx];
  redChannel[idx] = pixel.x;
  greenChannel[idx] = pixel.y;
  blueChannel[idx] = pixel.z;
}

}

//This kernel takes in three color channels and recombines them
@@ -198,46 +240,52 @@ void allocateMemoryAndCopyToGPU(const size_t numRowsImage, const size_t numColsI
checkCudaErrors(cudaMalloc(&d_green, sizeof(unsigned char) * numRowsImage * numColsImage));
checkCudaErrors(cudaMalloc(&d_blue, sizeof(unsigned char) * numRowsImage * numColsImage));

//TODO:
//TODO: OK
//Allocate memory for the filter on the GPU
//Use the pointer d_filter that we have already declared for you
//You need to allocate memory for the filter with cudaMalloc
//be sure to use checkCudaErrors like the above examples to
//be able to tell if anything goes wrong
//IMPORTANT: Notice that we pass a pointer to a pointer to cudaMalloc

//TODO:
checkCudaErrors(cudaMalloc(&d_filter, sizeof(float) * filterWidth * filterWidth));
//TODO: OK
//Copy the filter on the host (h_filter) to the memory you just allocated
//on the GPU. cudaMemcpy(dst, src, numBytes, cudaMemcpyHostToDevice);
//Remember to use checkCudaErrors!

checkCudaErrors(cudaMemcpy(d_filter, h_filter, sizeof(float) * filterWidth * filterWidth, cudaMemcpyHostToDevice));
}

void your_gaussian_blur(const uchar4 * const h_inputImageRGBA, uchar4 * const d_inputImageRGBA,
uchar4* const d_outputImageRGBA, const size_t numRows, const size_t numCols,
unsigned char *d_redBlurred,
unsigned char *d_greenBlurred,
unsigned char *d_redBlurred,
unsigned char *d_greenBlurred,
unsigned char *d_blueBlurred,
const int filterWidth)
{
//TODO: Set reasonable block size (i.e., number of threads per block)
const dim3 blockSize;
//TODO: OK Set reasonable block size (i.e., number of threads per block)
const dim3 blockSize(BLOCKWIDTH, BLOCKWIDTH, 1);

//TODO:
//TODO: OK
//Compute correct grid size (i.e., number of blocks per kernel launch)
//from the image size and block size.
const dim3 gridSize;

//TODO: Launch a kernel for separating the RGBA image into different color channels
const dim3 gridSize(numCols/BLOCKWIDTH + 1, numRows/BLOCKWIDTH + 1, 1);

//TODO: OK Launch a kernel for separating the RGBA image into different color channels
// Call cudaDeviceSynchronize(), then call checkCudaErrors() immediately after
// launching your kernel to make sure that you didn't make any mistakes.
cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError());
separateChannels<<<gridSize, blockSize>>>(d_inputImageRGBA, numRows, numCols,
d_red, d_green, d_blue);
cudaDeviceSynchronize();
checkCudaErrors(cudaGetLastError());

//TODO: Call your convolution kernel here 3 times, once for each color channel.

// Again, call cudaDeviceSynchronize(), then call checkCudaErrors() immediately after
// launching your kernel to make sure that you didn't make any mistakes.
// launching your kernel to make sure that you didn't make any mistakes
gaussian_blur<<<gridSize, blockSize>>>(d_red, d_redBlurred, numRows, numCols, d_filter, filterWidth);
cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError());
gaussian_blur<<<gridSize, blockSize>>>(d_green, d_greenBlurred, numRows, numCols, d_filter, filterWidth);
cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError());
gaussian_blur<<<gridSize, blockSize>>>(d_blue, d_blueBlurred, numRows, numCols, d_filter, filterWidth);
cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError());

// Now we recombine your results. We take care of launching this kernel for you.