// kernel.cu


#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>

#include "kernel.cuh"

// Multiplies one 4-component vector by a 4x4 matrix per thread.
//
// Thread i (i = threadIdx.x + blockIdx.x * blockDim.x) reads
// vectors[4*i .. 4*i+3] and writes output[4*i .. 4*i+3] with
//     output[4*i + y] = sum over x of matrix[y*4 + x] * vectors[4*i + x]
// i.e. matrix is indexed [row * 4 + col].
//
// Preconditions:
//   - The kernel receives no element count and performs no bounds check, so
//     the launch must supply exactly one thread per vector and both
//     `vectors` and `output` must hold 4 floats for every launched thread.
//   - `matrix` must hold 16 floats.
__global__ static void vector_multiplication(float* vectors, float* matrix, float* output) {
    int i = threadIdx.x + blockIdx.x * blockDim.x;

    // Read the whole input vector before writing anything, so the result
    // stays correct even when `output` points at the same memory as `vectors`.
    const float v0 = vectors[4 * i + 0];
    const float v1 = vectors[4 * i + 1];
    const float v2 = vectors[4 * i + 2];
    const float v3 = vectors[4 * i + 3];

    for (int y = 0; y < 4; y++) {
        // Dot product of matrix row y with the input vector. The previous
        // code overwrote output[4*i+y] on every inner iteration, so only the
        // last column of the matrix contributed to the result.
        const float* row = matrix + y * 4;
        output[4 * i + y] = row[0] * v0 + row[1] * v1 + row[2] * v2 + row[3] * v3;
    }
}

// Element-wise integer addition: c[i] = a[i] + b[i].
//
// i is the global thread index. For a single-block launch blockIdx.x is 0,
// so this is identical to indexing by threadIdx.x alone; for multi-block
// launches each block now handles its own slice instead of every block
// touching elements [0, blockDim.x).
//
// No element count is passed, so there is no bounds check: the launch must
// supply exactly one thread per element of a, b and c.
__global__ static void addKernel(int *c, const int *a, const int *b)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    c[i] = a[i] + b[i];
}

// Adds two host int arrays element-wise on the GPU: c[i] = a[i] + b[i].
//
// Parameters:
//   c    - host output buffer receiving `size` ints.
//   a, b - host input buffers of `size` ints each.
//   size - number of elements; 0 is a no-op that returns cudaSuccess.
//
// Returns cudaSuccess, or the first CUDA error encountered (a message is
// also written to stderr). Device buffers are released on every path.
cudaError_t addWithCuda(int *c, const int *a, const int *b, unsigned int size)
{
    // Every local is declared before the first goto so that no jump to the
    // Error label crosses an initialization.
    int *dev_a = 0;
    int *dev_b = 0;
    int *dev_c = 0;
    int maxThreadsPerBlock = 0;
    unsigned int offset = 0;
    unsigned int chunk = 0;
    const size_t bytes = (size_t)size * sizeof(int);
    cudaError_t cudaStatus;

    // Nothing to compute; a launch with zero threads would be rejected.
    if (size == 0) {
        return cudaSuccess;
    }

    // Choose which GPU to run on, change this on a multi-GPU system.
    cudaStatus = cudaSetDevice(0);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaSetDevice failed!  Do you have a CUDA-capable GPU installed?");
        goto Error;
    }

    // The kernel is launched one block at a time (see below), so the block
    // size limit of the selected device bounds each launch.
    cudaStatus = cudaDeviceGetAttribute(&maxThreadsPerBlock, cudaDevAttrMaxThreadsPerBlock, 0);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceGetAttribute failed!");
        goto Error;
    }

    // Allocate GPU buffers for three vectors (two input, one output).
    cudaStatus = cudaMalloc((void**)&dev_c, bytes);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }

    cudaStatus = cudaMalloc((void**)&dev_a, bytes);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }

    cudaStatus = cudaMalloc((void**)&dev_b, bytes);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }

    // Copy input vectors from host memory to GPU buffers.
    cudaStatus = cudaMemcpy(dev_a, a, bytes, cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

    cudaStatus = cudaMemcpy(dev_b, b, bytes, cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

    // Launch one single-block grid per chunk, shifting the pointers by the
    // chunk offset. addKernel takes no element count, so each launch carries
    // exactly one thread per element; chunking lets `size` exceed the
    // per-block thread limit. For sizes within the limit this is the same
    // single <<<1, size>>> launch as before.
    for (offset = 0; offset < size; offset += chunk) {
        chunk = size - offset;
        if (chunk > (unsigned int)maxThreadsPerBlock) {
            chunk = (unsigned int)maxThreadsPerBlock;
        }

        addKernel<<<1, chunk>>>(dev_c + offset, dev_a + offset, dev_b + offset);

        // Check for any errors launching the kernel
        cudaStatus = cudaGetLastError();
        if (cudaStatus != cudaSuccess) {
            fprintf(stderr, "addKernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
            goto Error;
        }
    }

    // cudaDeviceSynchronize waits for the kernels to finish, and returns
    // any errors encountered during execution.
    cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching addKernel!\n", cudaStatus);
        goto Error;
    }

    // Copy output vector from GPU buffer to host memory.
    cudaStatus = cudaMemcpy(c, dev_c, bytes, cudaMemcpyDeviceToHost);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

Error:
    // cudaFree accepts null pointers, so this is safe on every path.
    cudaFree(dev_c);
    cudaFree(dev_a);
    cudaFree(dev_b);

    return cudaStatus;
}

// Multiplies a batch of 4-component vectors by one 4x4 matrix on the GPU.
//
// Parameters:
//   h_vectors     - host input; 4 consecutive floats per vector,
//                   h_vector_size * 4 floats in total (matches the device
//                   allocation size used here).
//   h_vector_size - number of vectors; 0 is a no-op that returns
//                   cudaSuccess, negative or oversized counts return
//                   cudaErrorInvalidValue.
//   h_matrix      - host input; 16 floats.
//   h_output      - host output; receives h_vector_size * 4 floats.
//
// Returns cudaSuccess, or the first CUDA error encountered (a message is
// also written to stderr). Device buffers are released on every path.
cudaError_t vectorMultiCuda(float* h_vectors, int h_vector_size, float* h_matrix, float* h_output)
{
    const int kThreadsPerBlock = 256;
    // vector_multiplication forms 4 * i in int arithmetic; keep the count
    // small enough that this cannot overflow.
    const int kMaxVectors = 0x7fffffff / 4;

    // Device pointers start out null. The previous code initialised them
    // with host new[] buffers that were leaked once cudaMalloc overwrote the
    // pointers, and were handed to cudaFree if cudaSetDevice failed.
    // Every local is declared before the first goto so that no jump to the
    // Error label crosses an initialization.
    float* d_vectors = NULL;
    float* d_matrix = NULL;
    float* d_output = NULL;
    size_t vectorBytes = 0;
    size_t tailOffset = 0;
    int fullBlocks = 0;
    int tailThreads = 0;
    cudaError_t cudaStatus;

    // Nothing to compute; a launch with zero threads would be rejected.
    if (h_vector_size == 0) {
        return cudaSuccess;
    }
    if (h_vector_size < 0 || h_vector_size > kMaxVectors) {
        fprintf(stderr, "vectorMultiCuda: invalid vector count %d\n", h_vector_size);
        return cudaErrorInvalidValue;
    }

    // Computed in size_t so the byte count cannot overflow int.
    vectorBytes = (size_t)h_vector_size * 4 * sizeof(float);

    // Choose which GPU to run on, change this on a multi-GPU system.
    cudaStatus = cudaSetDevice(0);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaSetDevice failed!  Do you have a CUDA-capable GPU installed?");
        goto Error;
    }

    // Allocate GPU buffers for the input vectors, the output vectors and the matrix.
    cudaStatus = cudaMalloc((void**)&d_vectors, vectorBytes);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }

    cudaStatus = cudaMalloc((void**)&d_output, vectorBytes);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }

    cudaStatus = cudaMalloc((void**)&d_matrix, 16 * sizeof(float));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }

    // Copy the inputs from host memory to GPU buffers. The full
    // h_vector_size * 4 floats are copied; the previous code copied only
    // h_vector_size floats. d_output is not uploaded because the kernel
    // overwrites every element of it.
    cudaStatus = cudaMemcpy(d_vectors, h_vectors, vectorBytes, cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

    cudaStatus = cudaMemcpy(d_matrix, h_matrix, 16 * sizeof(float), cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

    // vector_multiplication takes no element count and cannot bounds-check,
    // so exactly one thread per vector is launched: first the full blocks,
    // then a single partial block for the remainder with the pointers
    // advanced past the vectors already covered.
    fullBlocks = h_vector_size / kThreadsPerBlock;
    tailThreads = h_vector_size % kThreadsPerBlock;

    if (fullBlocks > 0) {
        vector_multiplication<<<fullBlocks, kThreadsPerBlock>>>(d_vectors, d_matrix, d_output);

        // Check for any errors launching the kernel
        cudaStatus = cudaGetLastError();
        if (cudaStatus != cudaSuccess) {
            fprintf(stderr, "vector_multiplication launch failed: %s\n", cudaGetErrorString(cudaStatus));
            goto Error;
        }
    }

    if (tailThreads > 0) {
        tailOffset = (size_t)fullBlocks * kThreadsPerBlock * 4;
        vector_multiplication<<<1, tailThreads>>>(d_vectors + tailOffset, d_matrix, d_output + tailOffset);

        // Check for any errors launching the kernel
        cudaStatus = cudaGetLastError();
        if (cudaStatus != cudaSuccess) {
            fprintf(stderr, "vector_multiplication launch failed: %s\n", cudaGetErrorString(cudaStatus));
            goto Error;
        }
    }

    // cudaDeviceSynchronize waits for the kernels to finish, and returns
    // any errors encountered during execution.
    cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching vector_multiplication!\n", cudaStatus);
        goto Error;
    }

    // Copy the output vectors from the GPU buffer to host memory.
    cudaStatus = cudaMemcpy(h_output, d_output, vectorBytes, cudaMemcpyDeviceToHost);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

Error:
    // cudaFree accepts null pointers, so this is safe on every path.
    cudaFree(d_output);
    cudaFree(d_vectors);
    cudaFree(d_matrix);

    return cudaStatus;
}

	#include "cuda_runtime.h"
	#include "device_launch_parameters.h"
	#include <stdio.h>

	#include "kernel.cuh"

	// Multiplies one 4-component vector by a 4x4 matrix per thread.
	//
	// NOTE(review): a kernel with this same name and signature is also defined
	// earlier in this file; one of the two copies has to be removed for the
	// translation unit to compile.
	//
	// Thread i (i = threadIdx.x + blockIdx.x * blockDim.x) reads
	// vectors[4*i .. 4*i+3] and writes output[4*i .. 4*i+3] with
	//     output[4*i + y] = sum over x of matrix[y*4 + x] * vectors[4*i + x]
	// i.e. matrix is indexed [row * 4 + col].
	//
	// Preconditions:
	//   - The kernel receives no element count and performs no bounds check, so
	//     the launch must supply exactly one thread per vector and both
	//     `vectors` and `output` must hold 4 floats for every launched thread.
	//   - `matrix` must hold 16 floats.
	__global__ static void vector_multiplication(float* vectors, float* matrix, float* output) {
		int i = threadIdx.x + blockIdx.x * blockDim.x;

		// Read the whole input vector before writing anything, so the result
		// stays correct even when `output` points at the same memory as `vectors`.
		const float v0 = vectors[4 * i + 0];
		const float v1 = vectors[4 * i + 1];
		const float v2 = vectors[4 * i + 2];
		const float v3 = vectors[4 * i + 3];

		for (int y = 0; y < 4; y++) {
			// Dot product of matrix row y with the input vector. The previous
			// code (once its "4i" index typo is read as 4*i) overwrote
			// output[4*i+y] on every inner iteration, so only the last column
			// of the matrix contributed to the result.
			const float* row = matrix + y * 4;
			output[4 * i + y] = row[0] * v0 + row[1] * v1 + row[2] * v2 + row[3] * v3;
		}
	}

	// Element-wise integer addition: c[i] = a[i] + b[i].
	//
	// NOTE(review): a kernel with this same name is also defined earlier in
	// this file; one of the two copies has to be removed for the translation
	// unit to compile.
	//
	// c and a are pointers (the `*` declarators were missing, which made the
	// subscripts below ill-formed). i is the global thread index: for a
	// single-block launch blockIdx.x is 0, so this equals threadIdx.x; for
	// multi-block launches each block handles its own slice.
	//
	// No element count is passed, so there is no bounds check: the launch must
	// supply exactly one thread per element of a, b and c.
	__global__ static void addKernel(int *c, const int *a, const int *b)
	{
		int i = blockIdx.x * blockDim.x + threadIdx.x;
		c[i] = a[i] + b[i];
	}

	// Adds two host int arrays element-wise on the GPU: c[i] = a[i] + b[i].
	//
	// NOTE(review): a function with this same name is also defined earlier in
	// this file; one of the two copies has to be removed for the translation
	// unit to compile.
	//
	// Parameters:
	//   c    - host output buffer receiving `size` ints.
	//   a, b - host input buffers of `size` ints each.
	//   size - number of elements; 0 is a no-op that returns cudaSuccess.
	//
	// Returns cudaSuccess, or the first CUDA error encountered (a message is
	// also written to stderr). Device buffers are released on every path.
	cudaError_t addWithCuda(int *c, const int *a, const int *b, unsigned int size)
	{
		// Every local is declared before the first goto so that no jump to the
		// Error label crosses an initialization.
		int *dev_a = 0;
		int *dev_b = 0;
		int *dev_c = 0;
		int maxThreadsPerBlock = 0;
		unsigned int offset = 0;
		unsigned int chunk = 0;
		const size_t bytes = (size_t)size * sizeof(int);
		cudaError_t cudaStatus;

		// Nothing to compute; a launch with zero threads would be rejected.
		if (size == 0) {
			return cudaSuccess;
		}

		// Choose which GPU to run on, change this on a multi-GPU system.
		cudaStatus = cudaSetDevice(0);
		if (cudaStatus != cudaSuccess) {
			fprintf(stderr, "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?");
			goto Error;
		}

		// The kernel is launched one block at a time (see below), so the block
		// size limit of the selected device bounds each launch.
		cudaStatus = cudaDeviceGetAttribute(&maxThreadsPerBlock, cudaDevAttrMaxThreadsPerBlock, 0);
		if (cudaStatus != cudaSuccess) {
			fprintf(stderr, "cudaDeviceGetAttribute failed!");
			goto Error;
		}

		// Allocate GPU buffers for three vectors (two input, one output).
		cudaStatus = cudaMalloc((void**)&dev_c, bytes);
		if (cudaStatus != cudaSuccess) {
			fprintf(stderr, "cudaMalloc failed!");
			goto Error;
		}

		cudaStatus = cudaMalloc((void**)&dev_a, bytes);
		if (cudaStatus != cudaSuccess) {
			fprintf(stderr, "cudaMalloc failed!");
			goto Error;
		}

		cudaStatus = cudaMalloc((void**)&dev_b, bytes);
		if (cudaStatus != cudaSuccess) {
			fprintf(stderr, "cudaMalloc failed!");
			goto Error;
		}

		// Copy input vectors from host memory to GPU buffers.
		cudaStatus = cudaMemcpy(dev_a, a, bytes, cudaMemcpyHostToDevice);
		if (cudaStatus != cudaSuccess) {
			fprintf(stderr, "cudaMemcpy failed!");
			goto Error;
		}

		cudaStatus = cudaMemcpy(dev_b, b, bytes, cudaMemcpyHostToDevice);
		if (cudaStatus != cudaSuccess) {
			fprintf(stderr, "cudaMemcpy failed!");
			goto Error;
		}

		// Launch one single-block grid per chunk, shifting the pointers by the
		// chunk offset. addKernel takes no element count, so each launch carries
		// exactly one thread per element; chunking lets `size` exceed the
		// per-block thread limit. For sizes within the limit this is a single
		// <<<1, size>>> launch.
		for (offset = 0; offset < size; offset += chunk) {
			chunk = size - offset;
			if (chunk > (unsigned int)maxThreadsPerBlock) {
				chunk = (unsigned int)maxThreadsPerBlock;
			}

			addKernel<<<1, chunk>>>(dev_c + offset, dev_a + offset, dev_b + offset);

			// Check for any errors launching the kernel
			cudaStatus = cudaGetLastError();
			if (cudaStatus != cudaSuccess) {
				fprintf(stderr, "addKernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
				goto Error;
			}
		}

		// cudaDeviceSynchronize waits for the kernels to finish, and returns
		// any errors encountered during execution.
		cudaStatus = cudaDeviceSynchronize();
		if (cudaStatus != cudaSuccess) {
			fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching addKernel!\n", cudaStatus);
			goto Error;
		}

		// Copy output vector from GPU buffer to host memory.
		cudaStatus = cudaMemcpy(c, dev_c, bytes, cudaMemcpyDeviceToHost);
		if (cudaStatus != cudaSuccess) {
			fprintf(stderr, "cudaMemcpy failed!");
			goto Error;
		}

	Error:
		// cudaFree accepts null pointers, so this is safe on every path.
		cudaFree(dev_c);
		cudaFree(dev_a);
		cudaFree(dev_b);

		return cudaStatus;
	}

	// Multiplies a batch of 4-component vectors by one 4x4 matrix on the GPU.
	//
	// NOTE(review): a function with this same name is also defined earlier in
	// this file; one of the two copies has to be removed for the translation
	// unit to compile.
	//
	// Parameters:
	//   h_vectors     - host input; 4 consecutive floats per vector,
	//                   h_vector_size * 4 floats in total (matches the device
	//                   allocation size used here).
	//   h_vector_size - number of vectors; 0 is a no-op that returns
	//                   cudaSuccess, negative or oversized counts return
	//                   cudaErrorInvalidValue.
	//   h_matrix      - host input; 16 floats.
	//   h_output      - host output; receives h_vector_size * 4 floats.
	//
	// Returns cudaSuccess, or the first CUDA error encountered (a message is
	// also written to stderr). Device buffers are released on every path.
	cudaError_t vectorMultiCuda(float* h_vectors, int h_vector_size, float* h_matrix, float* h_output)
	{
		const int kThreadsPerBlock = 256;
		// vector_multiplication forms 4 * i in int arithmetic; keep the count
		// small enough that this cannot overflow.
		const int kMaxVectors = 0x7fffffff / 4;

		// Device pointers start out null. The previous code initialised them
		// with host new[] buffers that were leaked once cudaMalloc overwrote the
		// pointers, and were handed to cudaFree if cudaSetDevice failed.
		// Every local is declared before the first goto so that no jump to the
		// Error label crosses an initialization.
		float* d_vectors = NULL;
		float* d_matrix = NULL;
		float* d_output = NULL;
		size_t vectorBytes = 0;
		size_t tailOffset = 0;
		int fullBlocks = 0;
		int tailThreads = 0;
		cudaError_t cudaStatus;

		// Nothing to compute; a launch with zero threads would be rejected.
		if (h_vector_size == 0) {
			return cudaSuccess;
		}
		if (h_vector_size < 0 || h_vector_size > kMaxVectors) {
			fprintf(stderr, "vectorMultiCuda: invalid vector count %d\n", h_vector_size);
			return cudaErrorInvalidValue;
		}

		// Computed in size_t so the byte count cannot overflow int.
		vectorBytes = (size_t)h_vector_size * 4 * sizeof(float);

		// Choose which GPU to run on, change this on a multi-GPU system.
		cudaStatus = cudaSetDevice(0);
		if (cudaStatus != cudaSuccess) {
			fprintf(stderr, "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?");
			goto Error;
		}

		// Allocate GPU buffers for the input vectors, the output vectors and the matrix.
		cudaStatus = cudaMalloc((void**)&d_vectors, vectorBytes);
		if (cudaStatus != cudaSuccess) {
			fprintf(stderr, "cudaMalloc failed!");
			goto Error;
		}

		cudaStatus = cudaMalloc((void**)&d_output, vectorBytes);
		if (cudaStatus != cudaSuccess) {
			fprintf(stderr, "cudaMalloc failed!");
			goto Error;
		}

		cudaStatus = cudaMalloc((void**)&d_matrix, 16 * sizeof(float));
		if (cudaStatus != cudaSuccess) {
			fprintf(stderr, "cudaMalloc failed!");
			goto Error;
		}

		// Copy the inputs from host memory to GPU buffers. The full
		// h_vector_size * 4 floats are copied; the previous code copied only
		// h_vector_size floats. d_output is not uploaded because the kernel
		// overwrites every element of it.
		cudaStatus = cudaMemcpy(d_vectors, h_vectors, vectorBytes, cudaMemcpyHostToDevice);
		if (cudaStatus != cudaSuccess) {
			fprintf(stderr, "cudaMemcpy failed!");
			goto Error;
		}

		cudaStatus = cudaMemcpy(d_matrix, h_matrix, 16 * sizeof(float), cudaMemcpyHostToDevice);
		if (cudaStatus != cudaSuccess) {
			fprintf(stderr, "cudaMemcpy failed!");
			goto Error;
		}

		// vector_multiplication takes no element count and cannot bounds-check,
		// so exactly one thread per vector is launched: first the full blocks,
		// then a single partial block for the remainder with the pointers
		// advanced past the vectors already covered.
		fullBlocks = h_vector_size / kThreadsPerBlock;
		tailThreads = h_vector_size % kThreadsPerBlock;

		if (fullBlocks > 0) {
			vector_multiplication<<<fullBlocks, kThreadsPerBlock>>>(d_vectors, d_matrix, d_output);

			// Check for any errors launching the kernel
			cudaStatus = cudaGetLastError();
			if (cudaStatus != cudaSuccess) {
				fprintf(stderr, "vector_multiplication launch failed: %s\n", cudaGetErrorString(cudaStatus));
				goto Error;
			}
		}

		if (tailThreads > 0) {
			tailOffset = (size_t)fullBlocks * kThreadsPerBlock * 4;
			vector_multiplication<<<1, tailThreads>>>(d_vectors + tailOffset, d_matrix, d_output + tailOffset);

			// Check for any errors launching the kernel
			cudaStatus = cudaGetLastError();
			if (cudaStatus != cudaSuccess) {
				fprintf(stderr, "vector_multiplication launch failed: %s\n", cudaGetErrorString(cudaStatus));
				goto Error;
			}
		}

		// cudaDeviceSynchronize waits for the kernels to finish, and returns
		// any errors encountered during execution.
		cudaStatus = cudaDeviceSynchronize();
		if (cudaStatus != cudaSuccess) {
			fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching vector_multiplication!\n", cudaStatus);
			goto Error;
		}

		// Copy the output vectors from the GPU buffer to host memory.
		cudaStatus = cudaMemcpy(h_output, d_output, vectorBytes, cudaMemcpyDeviceToHost);
		if (cudaStatus != cudaSuccess) {
			fprintf(stderr, "cudaMemcpy failed!");
			goto Error;
		}

	Error:
		// cudaFree accepts null pointers, so this is safe on every path.
		cudaFree(d_output);
		cudaFree(d_vectors);
		cudaFree(d_matrix);

		return cudaStatus;
	}