/*
* Copyright (C) 2008, STMicroelectronics, Incorporated.
* All rights reserved.
*
* THIS CODE AND INFORMATION ARE PROVIDED "AS IS" WITHOUT
* WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING BUT
* NOT LIMITED TO THE IMPLIED WARRANTIES OF MERCHANTABILITY AND/OR
* FITNESS FOR A PARTICULAR PURPOSE.
*
* matmul, strip mine k loop, use shared memory, strip size 64, unrolled by 4
* run with mm -bin k2-64x4.bin -block (n/64) n -thread 64 -mat n
*/
extern "C" __global__ void
mmkernel( float* a, float* b, float* c,
int pitch_a, int pitch_b, int pitch_c,
int n, int m, int p )
{
int tx = threadIdx.x;
int i = blockIdx.x*64 + tx;
int j = blockIdx.y;
__shared__ float cb[64];
float sum = 0.0;
for( int ks = 0; ks < p; ks += 64 ){
cb[tx] = c[ks+tx+pitch_c*j];
__syncthreads();
for( int k = ks; k < ks+64; k += 4 ){
sum += b[i+pitch_b*k] * cb[k-ks];
sum += b[i+pitch_b*(k+1)] * cb[(k+1)-ks];
sum += b[i+pitch_b*(k+2)] * cb[(k+2)-ks];
sum += b[i+pitch_b*(k+3)] * cb[(k+3)-ks];
}
__syncthreads();
}
a[i+pitch_a*j] = sum;
}