#include "atlas_misc.h"
#include "atlas_amm.h"
#include Mstr(Mjoin(AMM_PRE,_sum.h))
/*
 * Inner-product form of GEMM: M <= MAXM && N <= MAXN combined with a very
 * long K.  This shape appears in the GEMM-based SYRK, which is important
 * for Cholesky.  It is typically the worst case for ATLAS, because the
 * copies of A and B are of the same order as the computation itself.
 *
 * Workspace is kept minimal: a single malloc holds one block of A
 * (MB x KB), one block of B (KB x NB) and one block of C (MB x NB), and it
 * is freed before returning.  All copying and arithmetic is delegated to
 * ammmK; this routine only works out blocking and workspace layout.
 *
 * TA, TB      : transpose settings of A and B; they select the stride used
 *               to advance along K in each operand.
 * M, N, K     : problem dimensions.
 *               NOTE(review): K > 0 appears to be assumed (K == 0 would
 *               leave the full-block count at -1) -- confirm with callers.
 * alpha, beta : scalars; alpha is routed to exactly one of A, B or C
 *               according to the value returned by GetAmmmInfo.
 * A/lda, B/ldb, C/ldc : operands and their leading dimensions.
 *
 * RETURNS: 0 on every path that returns.  A failed workspace allocation
 *          is handed to ATL_assert instead of being reported to the caller.
 */
int Mjoin(PATL,ammm_IP)
(
   enum ATLAS_TRANS TA,
   enum ATLAS_TRANS TB,
   ATL_CSZT M,
   ATL_CSZT N,
   ATL_CSZT K,
   const SCALAR alpha,
   const TYPE *A,
   ATL_CSZT lda,
   const TYPE *B,
   ATL_CSZT ldb,
   const SCALAR beta,
   TYPE *C,
   ATL_CSZT ldc
)
{
/*
 * Every scaling site starts out as one; exactly one of them is replaced
 * by the caller's alpha below.  ZERO is declared but not referenced here.
 */
   #ifdef TCPLX
      const TYPE ONE[2] = {ATL_rone, ATL_rzero};
      const TYPE ZERO[2] = {ATL_rzero, ATL_rzero};
      const TYPE *scaleA = ONE;
      const TYPE *scaleB = ONE;
      const TYPE *scaleC = ONE;
   #else
      #define ONE ATL_rone
      TYPE scaleA = ATL_rone;
      TYPE scaleB = ATL_rone;
      TYPE scaleC = ATL_rone;
   #endif
   amminfo_t mminf;
   void *vp;
   TYPE *wA, *wB, *rC, *iC;
   size_t nelt, nbytes, strideAk, strideBk;
   ATL_INT eltsA, eltsB, eltsC;
   ATL_INT nFullK;
   int alphaSite;
   int mu, nu, ku, kb;
   int mTiles, nTiles, Mpad, Npad, Kpad;
   int kLast, kLastPad;

/*
 * Fill in the kernel description; the return value says where alpha goes:
 * 0 -> A, 1 -> B, anything else -> C.
 */
   alphaSite = Mjoin(PATL,GetAmmmInfo)(&mminf, TA, TB, M, N, K, alpha, beta);
   switch (alphaSite)
   {
   case 0:
      scaleA = alpha;
      break;
   case 1:
      scaleB = alpha;
      break;
   default:
      scaleC = alpha;
      break;
   }

/*
 * Round M and N up to whole multiples of the kernel's mu and nu
 */
   mu = mminf.mu;
   nu = mminf.nu;
   ku = mminf.ku;
   mTiles = (M+mu-1)/mu;
   nTiles = (N+nu-1)/nu;
   Mpad = mTiles * mu;
   Npad = nTiles * nu;

/*
 * Split K into nFullK blocks of kb plus one final block of kLast.  If kb
 * divides K evenly, the final block is a full kb and the full-block count
 * drops by one so that a final block always exists.
 */
   kb = mminf.kb;
   Kpad = kb;
   nFullK = K / kb;
   kLast = K - nFullK*kb;
   if (kLast)
   {
      kLastPad = kLast;
      #if ATL_AMM_MAXKMAJ > 1
/*
 *       When the K-major flag is set, the final block is rounded up to a
 *       multiple of ku, and the workspace K must cover the rounded size.
 */
         if (ATL_AMMFLG_KMAJOR(mminf.flag))
         {
            kLastPad = ((kLast+ku-1)/ku)*ku;
            Kpad = Mmax(Kpad, kLastPad);
         }
      #endif
   }
   else
   {
      kLast = kLastPad = kb;
      nFullK--;
   }

/*
 * One allocation for all three blocks, plus 2*mu*nu extra elements and
 * 4*ATL_Cachelen bytes, presumably slack for the ATL_AlignPtr calls below.
 */
   eltsA = Mpad*Kpad;
   eltsB = Kpad*Npad;
   eltsC = Mpad*Npad;
   nelt = eltsA + eltsB + eltsC + 2*mu*nu;
   nbytes = ATL_MulBySize(nelt) + 4*ATL_Cachelen;
   vp = malloc(nbytes);
   ATL_assert(vp);

/*
 * Carve the allocation into A, B and C regions, aligning each start.
 * SHIFT scales the element counts (defined by the ATLAS headers).
 */
   wA = ATL_AlignPtr(vp);
   wB = wA + (eltsA SHIFT);
   wB = ATL_AlignPtr(wB);
   iC = wB + (eltsB SHIFT);
   iC = ATL_AlignPtr(iC);

/*
 * Distance between successive K blocks in the caller's A and B
 */
   if (IS_COLMAJ(TA))
      strideAk = lda*kb;
   else
      strideAk = kb;
   if (IS_COLMAJ(TB))
      strideBk = kb;
   else
      strideBk = kb*ldb;

   #ifdef TCPLX
/*
 *    Complex builds place a second C-sized region (rC) after iC and double
 *    both K strides.
 */
      rC = iC + eltsC;
      rC = ATL_AlignPtr(rC);
      strideAk *= 2;
      strideBk *= 2;
   #else
/*
 *    Real builds use the same C region for both pointers
 */
      rC = iC;
   #endif

   Mjoin(PATL,ammmK)(&mminf, M, mTiles, N, nTiles, nFullK, kb, kLast,
                     kLastPad, A, lda, strideAk, B, ldb, strideBk,
                     mminf.Cblk2cm, C, ldc, wA, 0, wB, 0, rC, iC,
                     scaleA, scaleB, scaleC, beta);

   free(vp);
   return 0;
}
