Wednesday, 16 March 2016

Implement n×n matrix parallel multiplication on the GPU using CUDA (a shared-memory tiled version is the intended follow-up).

 

#include<stdio.h>

#include<stdlib.h>

#include<math.h>

#define TILE_WIDTH 2

/*matrix multiplication kernels*/

//non shared
// Naive (non-shared-memory) square matrix multiply: Pd = Md * Nd.
// Expects a 2-D launch where blockDim = (TILE_WIDTH, TILE_WIDTH) and the
// grid covers at least WIDTH x WIDTH threads; matrices are row-major
// WIDTH x WIDTH int arrays in device global memory.
__global__ void
MatrixMul( int *Md , int *Nd , int *Pd , const int WIDTH )
{
           // Global row/column handled by this thread. Use blockDim rather
           // than the TILE_WIDTH macro so the kernel stays correct if the
           // host changes the block shape.
           unsigned int col = blockIdx.x * blockDim.x + threadIdx.x ;

           unsigned int row = blockIdx.y * blockDim.y + threadIdx.y ;

           // Guard: when WIDTH is not a multiple of the block size, threads
           // past the matrix edge must not write out of bounds.
           if ( row >= (unsigned int)WIDTH || col >= (unsigned int)WIDTH )
                   return ;

           // Accumulate the dot product in a register instead of doing a
           // global-memory read-modify-write on every iteration.
           int sum = 0 ;

           for (int k = 0 ; k < WIDTH ; k++ )
           {
                   sum += Md[row * WIDTH + k ] * Nd[ k * WIDTH + col] ;
           }

           Pd[row*WIDTH + col] = sum ;
}


// main routine
// Host-side helper: abort with a readable message when a CUDA call fails.
static void checkCuda( cudaError_t err , const char *what )
{
   if ( err != cudaSuccess )
   {
      fprintf( stderr , "CUDA error (%s): %s\n" , what , cudaGetErrorString( err ) ) ;
      exit( EXIT_FAILURE ) ;
   }
}

// main routine: read two WIDTH x WIDTH int matrices from stdin, multiply
// them on the GPU with the MatrixMul kernel, and print both inputs and
// the result.
int main ()
{
   const int WIDTH = 2 ;
   const size_t BYTES = WIDTH * WIDTH * sizeof (int) ;
   int array1_h[WIDTH][WIDTH] , array2_h[WIDTH][WIDTH] , result_array_h[WIDTH][WIDTH] ;
   int *array1_d , *array2_d , *result_array_d ; // device arrays
   int i , j ;

   // Read both input matrices from stdin, checking that scanf actually
   // consumed an integer each time.
   printf("Enter matrix1\n");
   for ( i = 0 ; i < WIDTH ; i++ )
   {
      for ( j = 0 ; j < WIDTH ; j++ )
      {
         if ( scanf("%d",&array1_h[i][j]) != 1 )
         {
            fprintf( stderr , "invalid input for matrix1\n" ) ;
            return EXIT_FAILURE ;
         }
      }
   }
   printf("Enter matrix2\n");
   for ( i = 0 ; i < WIDTH ; i++ )
   {
      for ( j = 0 ; j < WIDTH ; j++ )
      {
         if ( scanf("%d",&array2_h[i][j]) != 1 )
         {
            fprintf( stderr , "invalid input for matrix2\n" ) ;
            return EXIT_FAILURE ;
         }
      }
   }

   // Allocate all three device buffers, checking every call.
   checkCuda( cudaMalloc((void **) &array1_d , BYTES ) , "cudaMalloc array1_d" ) ;
   checkCuda( cudaMalloc((void **) &array2_d , BYTES ) , "cudaMalloc array2_d" ) ;
   checkCuda( cudaMalloc((void **) &result_array_d , BYTES ) , "cudaMalloc result_array_d" ) ;

   // Copy the host matrices to the device.
   checkCuda( cudaMemcpy ( array1_d , array1_h , BYTES , cudaMemcpyHostToDevice ) , "H2D array1" ) ;
   checkCuda( cudaMemcpy ( array2_d , array2_h , BYTES , cudaMemcpyHostToDevice ) , "H2D array2" ) ;

   // Launch the kernel. Ceil-divide so the grid still covers the whole
   // matrix if WIDTH is ever not a multiple of TILE_WIDTH.
   dim3 dimGrid ( (WIDTH + TILE_WIDTH - 1) / TILE_WIDTH ,
                  (WIDTH + TILE_WIDTH - 1) / TILE_WIDTH , 1 ) ;
   dim3 dimBlock( TILE_WIDTH, TILE_WIDTH, 1 ) ;
   MatrixMul <<<dimGrid,dimBlock>>> ( array1_d , array2_d , result_array_d , WIDTH ) ;
   checkCuda( cudaGetLastError() , "kernel launch" ) ;

   // This blocking copy also synchronizes with the kernel on the default
   // stream, so the result is complete when it returns.
   checkCuda( cudaMemcpy( result_array_h , result_array_d , BYTES , cudaMemcpyDeviceToHost ) , "D2H result" ) ;

   // Print both inputs and the product.
   printf("matrix 1\n");
   for ( i = 0 ; i < WIDTH ; i++ )
   {
      for ( j = 0 ; j < WIDTH ; j++ )
      {
         printf ("%d   ",array1_h[i][j] ) ;
      }
      printf ("\n") ;
   }
   printf("matrix 2\n");
   for ( i = 0 ; i < WIDTH ; i++ )
   {
      for ( j = 0 ; j < WIDTH ; j++ )
      {
         printf ("%d   ",array2_h[i][j] ) ;
      }
      printf ("\n") ;
   }
   printf("matrix result\n");
   for ( i = 0 ; i < WIDTH ; i++ )
   {
      for ( j = 0 ; j < WIDTH ; j++ )
      {
         printf ("%d   ",result_array_h[i][j] ) ;
      }
      printf ("\n") ;
   }

   // Release device memory (the original leaked all three buffers).
   cudaFree( array1_d ) ;
   cudaFree( array2_d ) ;
   cudaFree( result_array_d ) ;

   system("pause") ;  // Windows-only pause; kept for the original's behavior.
   return 0 ;
}
/*
 output:
 Enter matrix1
1 1 1 1
Enter matrix2

1 1 1 1
matrix 1
1   1
1   1
matrix 2
1   1
1   1
matrix result
2   2
2   2
 */

No comments:

Post a Comment