#include "atlas_misc.h"
#include "atlas_amm.h"
#include Mstr(Mjoin(AMM_PRE,_sum.h))

/*
 * Complex K-loop driver for one mb x nb block of C.
 *
 * The K dimension is handled as one peeled first block followed by nfkblks
 * blocks of size kb.  For every block, the A and B operands are (optionally)
 * copied into the workspaces a and b by mminf->a2blk / mminf->b2blk, and four
 * kernel calls combine the real/imaginary operand blocks into the separate
 * real (rC) and imaginary (iC) accumulation blocks.  After the last K block,
 * blk2c (if non-NULL) is called to write rC/iC out to C.
 *
 * Workspace layout, as used by this routine:
 *    a                  : imaginary block of A
 *    a + szA            : real block of A, where szA is inca when inca != 0,
 *                         else nmu*mu*max(kb,KB0)
 *    a + 2*inca         : start of the next K block's storage (== a if inca=0)
 * and likewise for b with incb, nnu and nu.
 *
 * Kernel call pattern (operands -> accumulator):
 *    first block: b0(iA,iB)->rC, b0(rA,iB)->iC, bn(rA,rB)->rC, b1(iA,rB)->iC
 *    later blocks: bn(iA,iB)->rC, b1(rA,iB)->iC, bn(rA,rB)->rC, b1(iA,rB)->iC
 * Assuming (from their names -- confirm in atlas_amm.h) that the b0/b1/bn
 * kernels compute C = A*B, C += A*B and C = A*B - C respectively, this
 * accumulates rC += rA*rB - iA*iB and iC += rA*iB + iA*rB.
 * The last three arguments of each kernel call are the operands of the call
 * that follows it; presumably they are prefetch hints.
 */
void Mjoin(PATL,ammmK)
(
   amminfo_t *mminf,
   const int mb,           /* # of rows of C to compute */
   const int nmu,          /* CEIL(mb/mu) */
   const int nb,
   const int nnu,
   ATL_CINT nfkblks, /* FLOOR(K/kb) */
   const int kb,
   const int kb0,          /* K size handed to the copy routines for 1st blk */
   const int KB0,          /* K size handed to the kernels for 1st blk */
   const TYPE *A,
   const size_t lda,
   const size_t incAk, /* 0: no need to copy A, else incK for copying A */
   const TYPE *B,
   const size_t ldb,
   const size_t incBk, /* 0: no need to copy B, else incK for copying B */
   const ablk2cmat_t blk2c,
   TYPE *C,
   const size_t ldc,
   TYPE *a,
   ATL_CINT inca,      /* size of blocks of A, or 0 to reuse space */
   TYPE *b,
   ATL_CINT incb,      /* size of blocks of B, or 0 to reuse space */
   TYPE *rC,
   TYPE *iC,
   const SCALAR alpA,
   const SCALAR alpB,
   const SCALAR alpC,
   const SCALAR beta
)
{
/*
 * Copy routines and the kernels used for all full-kb blocks
 */
   cm2am_t cpyA=mminf->a2blk, cpyB=mminf->b2blk;
   ammkern_t mm_b0=mminf->amm_b0, mm_b1=mminf->amm_b1, mm_bn=mminf->amm_bn;
/*
 * Distance between consecutive K blocks in each workspace (0 => reuse space),
 * and distance from a block's imaginary part to its real part
 */
   ATL_CINT stepA=inca+inca, stepB=incb+incb;
   ATL_INT offA = inca ? inca : nmu*mminf->mu*Mmax(kb,KB0);
   ATL_INT offB = incb ? incb : nnu*mminf->nu*Mmax(kb,KB0);
   TYPE *imA=a, *imB=b;
   TYPE *reA=imA+offA, *reB=imB+offB;
   TYPE *nxtA=imA+stepA, *nxtB=imB+stepB;
   ATL_INT k;

/*
 * First K block is peeled so that the beta=0 kernels initialize rC & iC
 */
   if (kb0 == kb)
   {
/*
 *    First block is a full block, taken from the front of A & B
 */
      if (incAk)
      {
         cpyA(kb, mb, alpA, A, lda, reA, imA);
         A += incAk;
      }
      if (incBk)
      {
         cpyB(kb, nb, alpB, B, ldb, reB, imB);
         B += incBk;
      }
      mm_b0(nmu, nnu, kb, imA, imB, rC, reA, imB, iC);
      mm_b0(nmu, nnu, kb, reA, imB, iC, reA, reB, rC);
      mm_bn(nmu, nnu, kb, reA, reB, rC, imA, reB, iC);
      mm_b1(nmu, nnu, kb, imA, reB, iC, nxtA, nxtB, rC);
   }
   else
   {
/*
 *    First block is the K remainder, which is read from *behind* the nfkblks
 *    full blocks; A & B themselves are left pointing at the first full block.
 *    Default to the K-cleanup kernels; switch back to the regular ones only
 *    if the kernel takes K at runtime, KB0 is at least kbmin, and either the
 *    kernel is K-major or kb0 is a multiple of ku.
 */
      ammkern_t peel_b0=mminf->amm_k1_b0, peel_b1=mm_b1, peel_bn=mm_bn;

      if (ATL_AMMFLG_KRUNTIME(mminf->flag) && (mminf->kbmin <= KB0))
      {
         const int ku = mminf->ku;
         #if ATL_AMM_MAXKMAJ > 1
         if (ATL_AMMFLG_KMAJOR(mminf->flag))
            peel_b0 = mminf->amm_b0;
         else
         #endif
         if (kb0 % ku == 0)
            peel_b0 = mminf->amm_b0;
      }
/*
 *    If the beta=0 kernel is a cleanup kernel, the other two must be as well
 */
      if (peel_b0 != mminf->amm_b0)
      {
         peel_b1 = mminf->amm_k1_b1;
         peel_bn = mminf->amm_k1_bn;
      }
      if (incAk)
         cpyA(kb0, mb, alpA, A+incAk*nfkblks, lda, reA, imA);
      if (incBk)
         cpyB(kb0, nb, alpB, B+incBk*nfkblks, ldb, reB, imB);

      peel_b0(nmu, nnu, KB0, imA, imB, rC, reA, imB, iC);
      peel_b0(nmu, nnu, KB0, reA, imB, iC, reA, reB, rC);
      peel_bn(nmu, nnu, KB0, reA, reB, rC, imA, reB, iC);
      peel_b1(nmu, nnu, KB0, imA, reB, iC, nxtA, nxtB, rC);
   }

/*
 * Remaining nfkblks full blocks accumulate into the already-initialized rC/iC
 */
   for (k=nfkblks; k > 0; k--)
   {
      imA = nxtA;
      imB = nxtB;
      reA = imA + offA;
      reB = imB + offB;
      nxtA += stepA;
      nxtB += stepB;
      if (incAk)
      {
         cpyA(kb, mb, alpA, A, lda, reA, imA);
         A += incAk;
      }
      if (incBk)
      {
         cpyB(kb, nb, alpB, B, ldb, reB, imB);
         B += incBk;
      }
      mm_bn(nmu, nnu, kb, imA, imB, rC, reA, imB, iC);
      mm_b1(nmu, nnu, kb, reA, imB, iC, reA, reB, rC);
      mm_bn(nmu, nnu, kb, reA, reB, rC, imA, reB, iC);
      mm_b1(nmu, nnu, kb, imA, reB, iC, nxtA, nxtB, rC);
   }
/*
 * Caller may pass a NULL blk2c to leave the result in rC/iC
 */
   if (!blk2c)
      return;
   blk2c(mb, nb, alpC, rC, iC, beta, C, ldc);
}
