/*
* Copyright (C) 2008, STMicroelectronics, Incorporated.
* All rights reserved.
*
* THIS CODE AND INFORMATION ARE PROVIDED "AS IS" WITHOUT
* WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING BUT
* NOT LIMITED TO THE IMPLIED WARRANTIES OF MERCHANTABILITY AND/OR
* FITNESS FOR A PARTICULAR PURPOSE.
*
* matmul, strip mine k loop, use shared memory, strip size 64, unrolled by 16
* run with mm -bin k2-64x16.bin -block (n/64) n -thread 64 -mat n
*/
extern "C" __global__ void
mmkernel( float* a, float* b, float* c,
int pitch_a, int pitch_b, int pitch_c,
int n, int m, int p )
{
int tx = threadIdx.x;
int i = blockIdx.x*64 + tx;
int j = blockIdx.y;
__shared__ float cb[64];
float sum = 0.0;
for( int ks = 0; ks < p; ks += 64 ){
cb[tx] = c[ks+tx+pitch_c*j];
__syncthreads();
for( int k = ks; k < ks+64; k += 16 ){
sum += b[i+pitch_b*k] * cb[k-ks];
sum += b[i+pitch_b*(k+1)] * cb[(k+1)-ks];
sum += b[i+pitch_b*(k+2)] * cb[(k+2)-ks];
sum += b[i+pitch_b*(k+3)] * cb[(k+3)-ks];
sum += b[i+pitch_b*(k+4)] * cb[(k+4)-ks];
sum += b[i+pitch_b*(k+5)] * cb[(k+5)-ks];
sum += b[i+pitch_b*(k+6)] * cb[(k+6)-ks];
sum += b[i+pitch_b*(k+7)] * cb[(k+7)-ks];
sum += b[i+pitch_b*(k+8)] * cb[(k+8)-ks];
sum += b[i+pitch_b*(k+9)] * cb[(k+9)-ks];
sum += b[i+pitch_b*(k+10)] * cb[(k+10)-ks];
sum += b[i+pitch_b*(k+11)] * cb[(k+11)-ks];
sum += b[i+pitch_b*(k+12)] * cb[(k+12)-ks];
sum += b[i+pitch_b*(k+13)] * cb[(k+13)-ks];
sum += b[i+pitch_b*(k+14)] * cb[(k+14)-ks];
sum += b[i+pitch_b*(k+15)] * cb[(k+15)-ks];
}
__syncthreads();
}
a[i+pitch_a*j] = sum;
}