#ifdef CHECK
#include "kernelcheck.h"
#endif

#ifdef NUR_OPTIONEN
#define OPTIMIERUNG 6
#else

/*
 * hello - scale every element of A by the sum of all elements of B.
 *
 * For each k in [0, X):  C[k] = sum over y in [0, Y) of B[y] * A[k]
 *
 * Each work-item handles 8 consecutive elements of A/C. B is streamed
 * through local memory in tiles of TPB rows of 8 floats, so every value of
 * B is fetched from global memory once per work-group.
 *
 * Parameters:
 *   A  input vector, at least X floats
 *   B  input vector, at least Y floats
 *   C  output vector, at least X floats; only C[0..X-1] is written
 *   X  number of elements in A and C
 *   Y  number of elements in B
 *
 * Launch requirements:
 *   - 1-D NDRange with at least (X+7)/8 work-items.
 *   - Work-group size must not exceed 1024 (first dimension of Blocal).
 */
__kernel void hello(__global const float *A,__global const float *B,
		    __global float *C,const int X,const int Y)
{
 const int x0=get_local_id(0);
 const int TPB=get_local_size(0); // threads per block (work-group size)
 // Same value as get_global_id(0) as long as no global work offset is used.
 const int x=get_group_id(0)*TPB+x0;
 // Tile of B in local memory: 1024 rows * 8 floats = 32 KB
 // (original note: at most 48 KB of local memory available).
 __local float Blocal[1024][8];

 const int X8=(X+7)/8; // number of 8-element groups of A, rounded up
 const int Y8=Y/8;     // number of complete 8-element rows of B

 // Private copy of this work-item's 8 elements of A. Elements beyond X are
 // zero-filled so that no uninitialised value enters the arithmetic.
 float Areg[8];
 float sum[8];
 for(int i=0;i<8;i++)
  {
   const int k=x*8+i;
   Areg[i]=(k<X)?A[k]:0.0f;
   sum[i]=0.0f;
  }

 // The loop bounds depend only on Y8 and TPB, so every work-item of the
 // group executes the same number of iterations and reaches both barriers.
 for(int ya=0;ya<Y8;ya+=TPB)
  {
   // Cooperative load: work-item x0 fetches row ya+x0 of B. Rows past the
   // last complete row are zero-filled instead of being read out of bounds;
   // the inner loop below never consumes them.
   const int row=ya+x0;
   for(int i=0;i<8;i++)
    Blocal[x0][i]=(row<Y8)?B[row*8+i]:0.0f;
   barrier(CLK_LOCAL_MEM_FENCE); // tile fully written before anyone reads it

   if(x<X8)
    for(int yb=0;yb<TPB && ya+yb<Y8;yb++)
      {
       for(int i=0;i<8;i++)
	for(int j=0;j<8;j++)
	 sum[j] += Blocal[yb][i]*Areg[j];
      }
   barrier(CLK_LOCAL_MEM_FENCE); // all reads done before the tile is reused
  }

 // No barriers follow, so surplus work-items may leave here.
 if(x>=X8) return;

 // Remaining Y%8 elements of B that do not fill a complete row.
 for(int y=Y8*8;y<Y;y++)
  for(int j=0;j<8;j++)
   sum[j] += B[y]*Areg[j];

 // Store results; the last group may be partial when X is not a multiple
 // of 8, so each element is bounds-checked against X.
 for(int i=0;i<8;i++)
  if(x*8+i<X)
   C[x*8+i] = sum[i];
}

#endif
