#include "atlas_misc.h"
#include "atlas_amm.h"
#include Mstr(Mjoin(AMM_PRE,_sum.h))

/*
 * Blocked rank-K update of one triangle of C, built on the ammmK block driver.
 * The routine name says SYRK; when TA is AtlasConj or AtlasConjTrans the
 * imaginary part of every diagonal entry written is additionally zeroed
 * (HERK-style).
 *
 * Uplo  : AtlasLower selects the lower-triangle path; any other value takes
 *         the upper-triangle path.
 * TA    : transpose setting of A; forwarded to GetSyrkInfo and used to choose
 *         the strides for stepping through A.
 * N, K  : problem dimensions; N is split into nb-sized blocks of C, K into
 *         nb-sized blocks along the inner dimension.
 * alpha : scalar forwarded to ammmK.
 * A,lda : input matrix and its leading dimension.
 * beta  : scalar applied to C (by ammmK for off-diagonal blocks, by tradd for
 *         diagonal blocks).
 * C,ldc : matrix updated in place, and its leading dimension.
 *
 * RETURNS: 0 on success, 1 if the workspace is larger than ATL_MaxMalloc or
 *          malloc fails (C is untouched in that case).
 *
 * NOTE(review): N == 0 makes Mmax(nnblks-1,1) wrap if ATL_SZT is unsigned;
 * presumably callers filter degenerate sizes -- confirm.
 */
int Mjoin(PATL,ammm_syrk)
(
   const enum ATLAS_UPLO  Uplo,
   const enum ATLAS_TRANS TA,
   ATL_CSZT  N,
   ATL_CSZT K,
   const SCALAR alpha,
   const TYPE *A,
   ATL_CSZT lda,
   const SCALAR beta,
   TYPE *C,
   ATL_CSZT ldc
)
{
   ablk2cmat_t blk2c, blk2c_b0;
   ATL_CSZT ldcp1=(ldc+1)SHIFT;  /* stride from one diagonal elt to the next */
   ATL_SZT i, j, szA, pansz, sz, nnblks, nkblks, incCn, incAk, incAn, incAnF;
   const TYPE *B = A;            /* second operand is A itself */
   TYPE *wa, *ar, *wb, *wc, *wC, *c;
   void *vp=NULL;                /* unaligned workspace base; freed on exit */
   const int ISHERK=(TA == AtlasConj || TA == AtlasConjTrans);
   int ibet=1, ialp=1;
   int mu, nu, ku, incw, nb, nnu, nmu, nbF, nnuF, nmuF, kb0, KB0;
   #ifdef TCPLX
      const TYPE ONE[2] = {ATL_rone, ATL_rzero};
      const TYPE ZERO[2] = {ATL_rzero, ATL_rzero};
      TYPE *rC;
      int incw2, nb2;
   #else
      #define ONE ATL_rone
      #define ZERO ATL_rzero
      #define rC wc
      #define incw2 incw
      #define nb2 nb
   #endif
   amminfo_t mminf;

/*
 * Classify alpha & beta for kernel selection: 1, -1, 0 (beta only), or 2 for
 * any other value
 */
   if (SCALAR_IS_NONE(alpha))
      ialp = -1;
   else if (!SCALAR_IS_ONE(alpha))
      ialp = 2;
   if (SCALAR_IS_ZERO(beta))
      ibet = 0;
   else if (SCALAR_IS_NONE(beta))
      ibet = -1;
   else if (!SCALAR_IS_ONE(beta))
      ibet = 2;

   nb = Mjoin(PATL,GetSyrkInfo)(&mminf, ialp, TA, N, K, ibet);
   blk2c = mminf.Cblk2cm;
/*
 * NOTE(review): variable is named _b0 but is loaded from Cblk2cm_b1; it is
 * only ever used to write the wC workspace.  Confirm against GetSyrkInfo.
 */
   blk2c_b0 = mminf.Cblk2cm_b1;
   mu = mminf.mu;
   nu = mminf.nu;
   ku = mminf.ku;
   nmu = (nb+mu-1)/mu;
   nnu = (nb+nu-1)/nu;
   nnblks = (N+nb-1)/nb;
   nkblks = (K+nb-1)/nb;
   incw = nb*nb;
   #ifdef TCPLX
      nb2 = nb + nb;
      incw2 = incw + incw;
   #endif
/*
 * Workspace, in units of TYPE elements as scaled by ATL_MulBySize:
 *    wa : szA = max(nnblks-1,1) panels of nkblks nb x nb blocks
 *    wb : one such panel
 *    wc : one nb x nb block (rC = wc+incw is its second half for complex)
 *    wC : one nb x nb block, target of the diagonal-block computations
 * plus (mu+mu)*nu extra elements (presumably kernel padding -- confirm).
 */
   pansz = nkblks*incw;
   szA = Mmax(nnblks-1,1);
   szA *= pansz;
   sz = szA + pansz + incw + incw + (mu+mu)*nu;
   sz = ATL_MulBySize(sz) + ATL_Cachelen;
   if (sz <= ATL_MaxMalloc)
      vp = malloc(sz);
   if (!vp)
      return(1);
   #ifdef TCPLX
      pansz += pansz;   /* from here on pansz is a pointer increment */
   #endif
   wa = ATL_AlignPtr(vp);
   wb = wa + (szA SHIFT);
   wc = wb + pansz;
   wC = wc + incw2;

/*
 * nbF/kb0 are the sizes of the one possibly-partial block in N and K;
 * after the decrements nnblks/nkblks count only the remaining full blocks
 */
   nbF = N - (--nnblks)*nb;
   if (nbF == nb)
   {
      nmuF = nmu;
      nnuF = nnu;
   }
   else
   {
      nmuF = (nbF+mu-1)/mu;
      nnuF = (nbF+nu-1)/nu;
   }
   KB0 = kb0 = K - (--nkblks)*nb;
   #if ATL_AMM_MAXKMAJ > 1
      if (kb0 != nb && ATL_AMMFLG_KMAJOR(mminf.flag))
         KB0 = ((kb0+ku-1)/ku)*ku;  /* round partial K block up to mult of ku */
   #endif
   #ifdef TCPLX
      rC = wc + incw;
   #endif
/*
 * incAk: step between K blocks of A; incAn: step between full N blocks;
 * incAnF: step over the partial N block
 */
   if (IS_COLMAJ(TA))
   {
      incAk = nb*(lda SHIFT);
      incAn = nb2;
      incAnF = nbF SHIFT;
   }
   else
   {
      incAk = nb2;
      incAn = lda SHIFT;
      incAnF = incAn * nbF;
      incAn *= nb;
   }
   incCn = nb*ldcp1;   /* step between consecutive full diagonal blocks of C */
   if (Uplo == AtlasLower)
   {
/*
 *    Peel first column of C computation, which will handle all partial blocks
 *    First rowpan of A not reused, so set inca=0
 */
      Mjoin(PATL,ammmK)(&mminf, nbF, nmuF, nbF, nnuF, nkblks, nb, kb0, KB0,
                        A, lda, incAk, B, lda, incAk, blk2c_b0, wC, nb,
                        wa, 0, wb, incw, rC, wc, ONE, alpha, ONE, ZERO);
/*
 *    Diagonal block was formed in wC; merge only its Uplo triangle into C
 */
      Mjoin(PATL,tradd)(Uplo, nbF, wC, nb, beta, C, ldc);
      if (ISHERK)
         Mjoin(PATLU,zero)(nbF, C+1, ldcp1);  /* zero imag part of diagonal */
      c = C + (nbF SHIFT);
      A += incAnF;
      B += incAnF;
      C += nbF*ldcp1;
      ar = wa;
/*
 *    Rest of first column panel: A blocks are passed with a nonzero K stride
 *    and stored panel-by-panel at ar; B is passed with stride 0 and wb is
 *    reused from the call above
 */
      for (i=0; i < nnblks; i++, c += nb2, A += incAn, ar += pansz)
         Mjoin(PATL,ammmK)(&mminf, nb, nmu, nbF, nnuF, nkblks, nb, kb0, KB0,
                           A, lda, incAk, B, lda, 0, blk2c, c, ldc,
                           ar, incw, wb, incw, rC, wc, ONE, alpha, ONE, beta);
/*
 *    Remaining column panels: A is always passed with stride 0 and read from
 *    the panels saved in wa; only B is given a K stride, once per panel at
 *    the diagonal block
 */
      for (j=0; j < nnblks; j++, B += incAn, C += incCn, wa += pansz)
      {
         Mjoin(PATL,ammmK)(&mminf, nb, nmu, nb, nnu, nkblks, nb, kb0, KB0,
                           A, lda, 0, B, lda, incAk, blk2c_b0, wC, nb,
                           wa, incw, wb, incw, rC, wc, ONE, alpha, ONE, ZERO);
         Mjoin(PATL,tradd)(Uplo, nb, wC, nb, beta, C, ldc);
         if (ISHERK)
            Mjoin(PATLU,zero)(nb, C+1, ldcp1);
         c = C + nb2;
         ar = wa + pansz;
         for (i=j+1; i < nnblks; i++, c += nb2, ar += pansz)
            Mjoin(PATL,ammmK)(&mminf, nb, nmu, nb, nnu, nkblks, nb, kb0, KB0,
                              A, lda, 0, B, lda, 0, blk2c, c, ldc, ar, incw,
                              wb, incw, rC, wc, ONE, alpha, ONE, beta);
      }
   }
/*
 * Upper runs backwards: start from last (partial) colpan, go left.  This
 * allows col-major access on Upper, avoiding TLB problems on C access.
 * Within the panel, start at diag on bottom and go up.  This allows us to
 * use less A storage, as with lower.
 */
   else /* if (Uplo == AtlasUpper) */
   {
      C += nnblks*incCn;
      A += nnblks*incAn;
      B = A;
      Mjoin(PATL,ammmK)(&mminf, nbF, nmuF, nbF, nnuF, nkblks, nb, kb0, KB0,
                        A, lda, incAk, B, lda, incAk, blk2c_b0, wC, nb,
                        wa, 0, wb, incw, rC, wc, ONE, alpha, ONE, ZERO);
      Mjoin(PATL,tradd)(Uplo, nbF, wC, nb, beta, C, ldc);
      if (ISHERK)
         Mjoin(PATLU,zero)(nbF, C+1, ldcp1);
      c = C - nb2;
      C -= incCn;
      A -= incAn;
      B -= incAn;
      ar = wa;
/*
 *    Blocks above the last diagonal block, walking upward
 */
      for (i=0; i < nnblks; i++, c -= nb2, A -= incAn, ar += pansz)
         Mjoin(PATL,ammmK)(&mminf, nb, nmu, nbF, nnuF, nkblks, nb, kb0, KB0,
                           A, lda, incAk, B, lda, 0, blk2c, c, ldc,
                           ar, incw, wb, incw, rC, wc, ONE, alpha, ONE, beta);
/*
 *    Remaining column panels, right to left, reading A from the panels in wa
 */
      for (j=0; j < nnblks; j++, B -= incAn, C -= incCn, wa += pansz)
      {
         Mjoin(PATL,ammmK)(&mminf, nb, nmu, nb, nnu, nkblks, nb, kb0, KB0,
                           A, lda, 0, B, lda, incAk, blk2c_b0, wC, nb,
                           wa, incw, wb, incw, rC, wc, ONE, alpha, ONE, ZERO);
         Mjoin(PATL,tradd)(Uplo, nb, wC, nb, beta, C, ldc);
         if (ISHERK)
            Mjoin(PATLU,zero)(nb, C+1, ldcp1);
         c = C - nb2;
         ar = wa + pansz;
         for (i=j+1; i < nnblks; i++, c -= nb2, ar += pansz)
            Mjoin(PATL,ammmK)(&mminf, nb, nmu, nb, nnu, nkblks, nb, kb0, KB0,
                              A, lda, 0, B, lda, 0, blk2c, c, ldc, ar, incw,
                              wb, incw, rC, wc, ONE, alpha, ONE, beta);
      }
   }
/*
 * Release the workspace: wa has been advanced in the loops above, so free
 * the original malloc pointer, not wa
 */
   free(vp);
   return(0);
}
#ifndef TCPLX
   #undef ONE
   #undef ZERO
   #undef rC
   #undef incw2
   #undef nb2
#endif
