Implement parallel multiplication of n x n matrices on a GPU using CUDA/OpenCL, making use of
shared memory.
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#define TILE_WIDTH 2
/*matrix multiplication kernels*/
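// non-shared version: the kernel declares no __shared__ storage and reads every operand directly through its pointer arguments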
/*
 * MatrixMul - naive square matrix product: Pd = Md * Nd.
 *
 * Md, Nd : input matrices, indexed row-major as WIDTH x WIDTH ints
 *          (element (r, c) lives at r*WIDTH + c).
 * Pd     : output matrix, same layout; every in-range element is overwritten.
 * WIDTH  : number of rows/columns of each matrix.
 *
 * Launch layout: a 2D grid of 2D blocks, one thread per output element, with
 * the x dimension mapped to columns and the y dimension mapped to rows.
 * Threads that fall outside the WIDTH x WIDTH matrix return without touching
 * memory, so the grid may over-cover the matrix (ceil-div launch).
 * No shared memory is used.
 */
__global__ void
MatrixMul( int *Md , int *Nd , int *Pd , const int WIDTH )
{
    // Global column/row owned by this thread. Derived from the real block
    // shape (blockDim) rather than a compile-time tile macro, so the result
    // is correct for any launch geometry.
    const unsigned int col = blockDim.x * blockIdx.x + threadIdx.x ;
    const unsigned int row = blockDim.y * blockIdx.y + threadIdx.y ;

    // Guard the grid tail: without this, surplus threads would read and
    // write outside the matrices (or clobber another thread's element).
    if ( WIDTH <= 0 || row >= (unsigned int) WIDTH || col >= (unsigned int) WIDTH )
    {
        return ;
    }

    // Widen before multiplying so row*WIDTH cannot wrap for large matrices.
    const size_t n = (size_t) WIDTH ;

    // Accumulate in a local variable and store once, instead of doing a
    // read-modify-write on Pd in every iteration.
    int sum = 0 ;
    for ( size_t k = 0 ; k < n ; k++ )
    {
        sum += Md[row * n + k] * Nd[k * n + col] ;
    }
    Pd[row * n + col] = sum ;
}
// main routine

/* Terminates the program with a diagnostic when a CUDA runtime call fails. */
static void checkCuda( cudaError_t status , const char *what )
{
    if ( status != cudaSuccess )
    {
        fprintf ( stderr , "CUDA error (%s): %s\n" , what , cudaGetErrorString ( status ) ) ;
        exit ( EXIT_FAILURE ) ;
    }
}

/*
 * Reads width*width integers from stdin into m (row-major order).
 * Returns 1 on success, 0 if any value could not be parsed (or on EOF).
 */
static int readMatrix( int *m , int width )
{
    for ( int i = 0 ; i < width * width ; i++ )
    {
        if ( scanf ( "%d" , &m[i] ) != 1 )
        {
            return 0 ;
        }
    }
    return 1 ;
}

/* Prints title, then the width x width row-major matrix m, one row per line. */
static void printMatrix( const char *title , const int *m , int width )
{
    printf ( "%s" , title ) ;
    for ( int i = 0 ; i < width ; i++ )
    {
        for ( int j = 0 ; j < width ; j++ )
        {
            printf ( "%d " , m[i * width + j] ) ;
        }
        printf ( "\n" ) ;
    }
}

/*
 * Reads two WIDTH x WIDTH integer matrices from stdin, multiplies them with
 * the MatrixMul kernel and prints both inputs and the product.
 * Returns 0 on success; exits with a failure status on invalid input or on
 * any CUDA error.
 */
int main ()
{
    const int WIDTH = 2 ;
    int array1_h[WIDTH][WIDTH] , array2_h[WIDTH][WIDTH] , result_array_h[WIDTH][WIDTH] ;
    int *array1_d = NULL , *array2_d = NULL , *result_array_d = NULL ; // device arrays
    const size_t bytes = (size_t) WIDTH * WIDTH * sizeof (int) ;

    // The launch below uses WIDTH/TILE_WIDTH blocks per dimension; that only
    // covers the whole matrix when the division is exact, so refuse to run
    // otherwise instead of printing partially computed results.
    if ( WIDTH % TILE_WIDTH != 0 )
    {
        fprintf ( stderr , "WIDTH (%d) must be a multiple of TILE_WIDTH (%d)\n" , WIDTH , TILE_WIDTH ) ;
        return EXIT_FAILURE ;
    }

    // input in host arrays (validated before any device memory is allocated)
    printf ( "Enter matrix1\n" ) ;
    if ( !readMatrix ( &array1_h[0][0] , WIDTH ) )
    {
        fprintf ( stderr , "Invalid input for matrix1\n" ) ;
        return EXIT_FAILURE ;
    }
    printf ( "Enter matrix2\n" ) ;
    if ( !readMatrix ( &array2_h[0][0] , WIDTH ) )
    {
        fprintf ( stderr , "Invalid input for matrix2\n" ) ;
        return EXIT_FAILURE ;
    }

    // create device arrays: cudaMalloc ( (void **)&array_name , size_in_bytes )
    checkCuda ( cudaMalloc ( (void **) &array1_d , bytes ) , "cudaMalloc array1_d" ) ;
    checkCuda ( cudaMalloc ( (void **) &array2_d , bytes ) , "cudaMalloc array2_d" ) ;
    checkCuda ( cudaMalloc ( (void **) &result_array_d , bytes ) , "cudaMalloc result_array_d" ) ;

    // copy host arrays to device arrays: cudaMemcpy ( dest , source , size_in_bytes , direction )
    checkCuda ( cudaMemcpy ( array1_d , array1_h , bytes , cudaMemcpyHostToDevice ) , "copy matrix1 to device" ) ;
    checkCuda ( cudaMemcpy ( array2_d , array2_h , bytes , cudaMemcpyHostToDevice ) , "copy matrix2 to device" ) ;

    // launch the kernel: one TILE_WIDTH x TILE_WIDTH block per tile of the result
    dim3 dimGrid ( WIDTH / TILE_WIDTH , WIDTH / TILE_WIDTH , 1 ) ;
    dim3 dimBlock ( TILE_WIDTH , TILE_WIDTH , 1 ) ;
    MatrixMul <<<dimGrid,dimBlock>>> ( array1_d , array2_d , result_array_d , WIDTH ) ;
    // A launch does not return a status itself; configuration errors surface here.
    checkCuda ( cudaGetLastError () , "MatrixMul launch" ) ;

    // Blocking copy: also waits for the kernel and reports execution errors.
    checkCuda ( cudaMemcpy ( result_array_h , result_array_d , bytes , cudaMemcpyDeviceToHost ) , "copy result to host" ) ;

    // release device memory
    checkCuda ( cudaFree ( array1_d ) , "cudaFree array1_d" ) ;
    checkCuda ( cudaFree ( array2_d ) , "cudaFree array2_d" ) ;
    checkCuda ( cudaFree ( result_array_d ) , "cudaFree result_array_d" ) ;

    // print the inputs and the result
    printMatrix ( "matrix 1\n" , &array1_h[0][0] , WIDTH ) ;
    printMatrix ( "matrix 2\n" , &array2_h[0][0] , WIDTH ) ;
    printMatrix ( "matrix result\n" , &result_array_h[0][0] , WIDTH ) ;

#ifdef _WIN32
    // "pause" is a Windows shell built-in; only invoke it where it exists.
    system ( "pause" ) ;
#endif
    return 0 ;
}
/*
output:
Enter matrix1
1 1 1 1
Enter matrix2
1 1 1 1
matrix 1
1 1
1 1
matrix 2
1 1
1 1
matrix result
2 2
2 2
*/
No comments:
Post a Comment