#ifdef CHECK
#include "../kernelcheck.h"
#endif

/* NUR_OPTIONEN ("options only"): when this macro is defined, the file
 * contributes nothing but the macro definition below; the kernel source in
 * the #else branch is skipped entirely. */
#ifdef NUR_OPTIONEN
/* OPTIMIERUNG ("optimization").
 * NOTE(review): the meaning of the value 5 is not visible in this file --
 * presumably it is consumed by whoever includes the file with NUR_OPTIONEN
 * defined; confirm against that code. */
#define OPTIMIERUNG 5
#else

/*
 * hello: for each float8 element A[x], adds up A[x] scaled by every one of
 * the Y scalar values stored in B and writes the total to C[x].
 *
 *  A  input, read as float8; indices 0 .. (X+7)/8 - 1 are used
 *  B  input, read as float8 but consumed one scalar lane at a time:
 *     Y/8 complete vectors plus the first Y%8 lanes of the next one
 *  C  output, indexed exactly like A
 *  X  bound for A/C; work-items whose index is >= (X+7)/8 write nothing
 *  Y  number of scalar values of B to consume
 *
 * Only dimension 0 of the index space is used.  B is staged through local
 * memory one vector per work-item, so the work-group size must not exceed
 * the 1024 entries of the staging buffer.  Every work-item, including those
 * outside the A/C range, takes part in the staging loads and the barriers.
 */
__kernel void hello(__global const float8 *A,__global const float8 *B,
		    __global float8 *C,const int X,const int Y)
{
 const int lid=get_local_id(0);
 const int group_size=get_local_size(0); // threads per block
 // same result as get_global_id(0)
 const int gid=get_group_id(0)*group_size+lid;
 __local float8 tile_b[1024]; // local memory (max. 48KB)
 float8 acc=(float8)(0.0f);
 const int a_count=(X+7)/8; // float8 entries of A/C, rounded up
 const int b_full=Y/8;      // complete float8 entries of B
 const int active=gid<a_count;
 int base=0;

 // Full tiles: each work-item stages one vector of B, then every active
 // work-item consumes the whole tile.  The last (possibly full) tile is
 // deliberately left for the tail section below.
 while(base<b_full-group_size)
  {
   tile_b[lid]=B[base+lid];
   barrier(CLK_LOCAL_MEM_FENCE);
   if(active)
    {
     for(int k=0;k<group_size;++k)
      {
       const float8 b=tile_b[k];
       const float8 a=A[gid];
       acc += b.s0*a;
       acc += b.s1*a;
       acc += b.s2*a;
       acc += b.s3*a;
       acc += b.s4*a;
       acc += b.s5*a;
       acc += b.s6*a;
       acc += b.s7*a;
      }
    }
   // keep the tile intact until every work-item has finished reading it
   barrier(CLK_LOCAL_MEM_FENCE);
   base+=group_size;
  }

 // Tail tile: stage whatever is left, including a partially used vector.
 if(base+lid<(Y+7)/8)
  {
   tile_b[lid]=B[base+lid];
  }
 barrier(CLK_LOCAL_MEM_FENCE);

 // No barrier follows, so work-items outside the A/C range may leave now.
 if(!active)
  {
   return;
  }

 for(int k=0;base+k<b_full;++k)
  {
   const float8 b=tile_b[k];
   const float8 a=A[gid];
   acc += b.s0*a;
   acc += b.s1*a;
   acc += b.s2*a;
   acc += b.s3*a;
   acc += b.s4*a;
   acc += b.s5*a;
   acc += b.s6*a;
   acc += b.s7*a;
  }

 // Remaining Y%8 lanes of the last, partially used vector of B; it is read
 // from global memory.  The cases fall through on purpose so that `rest`
 // lanes are added, highest lane first.
 const int rest=Y%8;
 if(rest>0)
  {
   const float8 last=B[b_full];
   const float8 a=A[gid];
   switch(rest)
    {
     case 7: acc += last.s6*a; /* fall through */
     case 6: acc += last.s5*a; /* fall through */
     case 5: acc += last.s4*a; /* fall through */
     case 4: acc += last.s3*a; /* fall through */
     case 3: acc += last.s2*a; /* fall through */
     case 2: acc += last.s1*a; /* fall through */
     case 1: acc += last.s0*a;
    }
  }
 C[gid]=acc;
}

#endif
