/*
* Copyright (C) 2008, STMicroelectronics, Incorporated.
* All rights reserved.
*
* THIS CODE AND INFORMATION ARE PROVIDED "AS IS" WITHOUT
* WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING BUT
* NOT LIMITED TO THE IMPLIED WARRANTIES OF MERCHANTABILITY AND/OR
* FITNESS FOR A PARTICULAR PURPOSE.
*
* Stupid matmul
* run with mm -bin k0.bin -block (n/32) n -thread 32 -mat n
*/
extern "C" __global__ void
mmkernel( float* a, float* b, float* c,
int pitch_a, int pitch_b, int pitch_c,
int n, int m, int p )
{
int j = blockIdx.x*32 + threadIdx.x;
int i = blockIdx.y;
float sum = 0.0;
for( int k = 0; k < p; ++k )
sum += b[i+pitch_b*k] * c[k+pitch_c*j];
a[i+pitch_a*j] = sum;
}