#define ATL_NOAMM 1
#include "atlas_misc.h"
#undef ATL_NOAMM
#include Mstr(Mjoin(ATLAS_PRE,rkamm_kern.h))
#include Mstr(Mjoin(ATLAS_PRE,rkamm_blk.h))
#include Mstr(Mjoin(ATLAS_PRE,rkamm_flag.h))
#include Mstr(Mjoin(ATLAS_PRE,rkamm_perf.h))

/*
 * Fills out with the rank-K amm case whose table index is K-3: blocking
 * factors, unrolling factors, flag, the beta=0/1/-1 kernels, and the
 * A, B and C copy routines picked from TA, TB, alpha and beta.
 * When alpha is not one it is applied by the copy routine of either A or B
 * (B if M >= N, else A); the C copy routines used are always the alpha=1
 * variants, selected on beta only.
 * RETURNS: 0 if A's copy applies alpha (or alpha is one), 1 if B's copy does.
 */
int Mjoin(PATL,GetRankKInfo)
(
   amminfo_t *out,
   enum ATLAS_TRANS TA,
   enum ATLAS_TRANS TB,
   ATL_CSZT M,
   ATL_CSZT N,
   ATL_CSZT K,
   const SCALAR alpha,
   const SCALAR beta
)
{
/*
 * Picks the alpha=1 (sel_=0), alpha=-1 (sel_=1) or alpha=X (sel_=2) entry
 * of the copy-routine table family fam_ for the present case index
 */
   #define RKK_CPY(sel_, fam_) ( ((sel_) == 0) ? fam_##_a1[ik] : \
      ( ((sel_) == 1) ? fam_##_an[ik] : fam_##_aX[ik] ) )
   const int ik = K-3;
   int appAl = 0;           /* 0:A, 1:B */
   int selA = 0, selB = 0;  /* alpha variant for A's & B's copy routines */

   ATL_assert(K > 2 && K <= ATL_MAXK_RKK);
/*
 * Blocking, unrolling and kernel information come straight from the tables
 */
   out->IDX = ik;
   out->mb = ATL_AMM_MBs[ik];
   out->nb = ATL_AMM_NBs[ik];
   out->kb = ATL_AMM_KBs[ik];
   #ifdef ATL_CAMM_MAXINDX
      if (ik == ATL_CAMM_MAXINDX)
      {
         out->mb = ATL_CAMM_MAXMB;
         out->nb = ATL_CAMM_MAXNB;
         out->kb = ATL_CAMM_MAXKB;
      }
   #endif
   out->kbmin = ATL_AMM_KBMINs[ik];
   out->mu = ATL_AMM_MUs[ik];
   out->nu = ATL_AMM_NUs[ik];
   out->ku = ATL_AMM_KUs[ik];
   out->flag = ATL_AMM_KFLAG[ik];
   out->amm_b0 = ATL_AMM_KERN_RKK[ik];
   out->amm_b1 = ATL_AMM_KERN_RKK_b1[ik];
   out->amm_bn = ATL_AMM_KERN_RKK_bn[ik];
/*
 * Decide which operand's copy applies a non-unit alpha: B when M >= N,
 * otherwise A.  The other operand always uses its alpha=1 copy.
 */
   if (!SCALAR_IS_ONE(alpha))
   {
      const int sel = SCALAR_IS_NONE(alpha) ? 1 : 2;
      if (M >= N)
      {
         appAl = 1;
         selB = sel;
      }
      else
         selA = sel;
   }
/*
 * C copy depends only on beta, since alpha is handled by the A or B copy
 */
   if (SCALAR_IS_ZERO(beta))
      out->Cblk2cm = ATL_RKK_BLK2C_a1_b0[ik];
   else if (SCALAR_IS_ONE(beta))
      out->Cblk2cm = ATL_RKK_BLK2C_a1_b1[ik];
   else
      out->Cblk2cm = SCALAR_IS_NONE(beta) ?
                     ATL_RKK_BLK2C_a1_bn[ik] : ATL_RKK_BLK2C_a1_bX[ik];
/*
 * A & B copies: table family chosen by transpose setting, entry by alpha
 */
   #ifdef TCPLX
      switch(TA)
      {
      case AtlasNoTrans:
         out->a2blk = RKK_CPY(selA, ATL_RKK_AT2BLK);
         break;
      case AtlasTrans:
         out->a2blk = RKK_CPY(selA, ATL_RKK_A2BLK);
         break;
      case AtlasConjTrans:
         out->a2blk = RKK_CPY(selA, ATL_RKK_AC2BLK);
         break;
      default:
         out->a2blk = RKK_CPY(selA, ATL_RKK_AH2BLK);
         break;
      }
      switch(TB)
      {
      case AtlasNoTrans:
         out->b2blk = RKK_CPY(selB, ATL_RKK_B2BLK);
         break;
      case AtlasTrans:
         out->b2blk = RKK_CPY(selB, ATL_RKK_BT2BLK);
         break;
      case AtlasConjTrans:
         out->b2blk = RKK_CPY(selB, ATL_RKK_BH2BLK);
         break;
      default:
         out->b2blk = RKK_CPY(selB, ATL_RKK_BC2BLK);
         break;
      }
   #else
      if (TA == AtlasNoTrans)
         out->a2blk = RKK_CPY(selA, ATL_RKK_AT2BLK);
      else
         out->a2blk = RKK_CPY(selA, ATL_RKK_A2BLK);
      if (TB == AtlasNoTrans)
         out->b2blk = RKK_CPY(selB, ATL_RKK_B2BLK);
      else
         out->b2blk = RKK_CPY(selB, ATL_RKK_BT2BLK);
   #endif
   #undef RKK_CPY
   return(appAl);
}


/*
 * Fills out with everything needed to run the rank-K amm case idx on an
 * MxNxK problem: block counts and remainders, unrolling factors, leading
 * dimensions, kernel pointers, copy routines and the pointer increments
 * used to step through A and B.
 * mb/nb: blocking to use for M/N; 0 means take the case's table value.
 * alpha is always applied by B's copy routine here; A's copy and the C copy
 * use their alpha=1 variants, with the C copy selected on beta.
 * All increments are scaled by SHIFT so that they are consistent between
 * real and complex builds.
 */
static INLINE void FillInRankKInf
   (rkinfo_t *out, int idx, enum ATLAS_TRANS TA, enum ATLAS_TRANS TB,
    ATL_CSZT M, ATL_CSZT N, ATL_CSZT K, size_t lda, size_t ldb, size_t ldc,
    const SCALAR alpha, const SCALAR beta, int mb, int nb)
{
   int kb = ATL_AMM_KBs[idx];
   const int mu=ATL_AMM_MUs[idx], nu=ATL_AMM_NUs[idx], ku=ATL_AMM_KUs[idx];

   if (!mb)
      mb = ATL_AMM_MBs[idx];
   if (!nb)
      nb = ATL_AMM_NBs[idx];
/*
 * Number of full blocks along each dimension
 */
   out->nfmb = M/mb;
   out->nfnb = N/nb;
   out->nfkb = K/kb;
   out->mu = mu;
   out->nu = nu;
   out->ku = ku;
   out->lda = lda;
   out->ldb = ldb;
   out->ldc = ldc;
   out->mb = mb;
   out->nb = nb;
   out->kb = kb;
   out->nmu = mb / mu;
   out->nnu = nb / nu;
   out->amm_b0 = ATL_AMM_KERN_RKK[idx];
   out->amm_b1 = ATL_AMM_KERN_RKK_b1[idx];
   out->flag = ATL_AMM_KFLAG[idx];
/*
 * Remainders (lowercase), and remainders rounded up to a multiple of the
 * unrolling factor (uppercase)
 */
   out->mbL = M - out->nfmb*mb;
   out->nbL = N - out->nfnb*nb;
   out->kbL = K - out->nfkb*kb;
   out->nmuL = (out->mbL+mu-1)/mu;
   out->nnuL = (out->nbL+nu-1)/nu;
   out->MBL = out->nmuL*mu;
   out->NBL = out->nnuL*nu;
   out->KBL = ((out->kbL+ku-1)/ku)*ku;
/*
 * A's copy routine (always alpha=1) and increments
 */
   if (TA == AtlasNoTrans)
   {
      out->a2blk  = ATL_RKK_AT2BLK_a1[idx];
/*
 *    Scale by SHIFT like every other increment; the complex-only AtlasConj
 *    case below, which has the same storage, uses mb+mb
 */
      out->incAm = mb SHIFT;
      out->incAk = kb * (lda SHIFT);
   }
   #ifdef TCPLX
   else if (TA == AtlasConj)
   {
      out->a2blk  = ATL_RKK_AH2BLK_a1[idx];
      out->incAm = mb+mb;
      out->incAk = kb * (lda+lda);
   }
   else if (TA == AtlasConjTrans)
   {
      out->a2blk  = ATL_RKK_AC2BLK_a1[idx];
      out->incAm = mb*(lda+lda);
      out->incAk = kb+kb;
   }
   #endif
   else
   {
      out->a2blk  = ATL_RKK_A2BLK_a1[idx];
      out->incAm = mb*(lda SHIFT);
      out->incAk = kb SHIFT;
   }

/*
 * C copy routine is chosen by beta only
 */
   if (SCALAR_IS_ZERO(beta))
      out->blk2c  = ATL_RKK_BLK2C_a1_b0[idx];
   else if (SCALAR_IS_ONE(beta))
      out->blk2c  = ATL_RKK_BLK2C_a1_b1[idx];
   else
      out->blk2c  = SCALAR_IS_NONE(beta) ?
                    ATL_RKK_BLK2C_a1_bn[idx]:ATL_RKK_BLK2C_a1_bX[idx];
/*
 * B's copy routine (which applies alpha) and increments
 */
   if (TB == AtlasNoTrans)
   {
      out->incBk = kb SHIFT;
      out->incBn = nb*(ldb SHIFT);
      if (SCALAR_IS_ONE(alpha))
         out->b2blk  = ATL_RKK_B2BLK_a1[idx];
      else
         out->b2blk  = SCALAR_IS_NONE(alpha) ?
                       ATL_RKK_B2BLK_an[idx]:ATL_RKK_B2BLK_aX[idx];
   }
   #ifdef TCPLX
   else if (TB == AtlasConjTrans)
   {
      out->incBk = kb*(ldb+ldb);
      out->incBn = nb+nb;
      if (SCALAR_IS_ONE(alpha))
         out->b2blk  = ATL_RKK_BH2BLK_a1[idx];
      else if (SCALAR_IS_NONE(alpha))
         out->b2blk  = ATL_RKK_BH2BLK_an[idx];
      else /* alpha = X */
         out->b2blk  = ATL_RKK_BH2BLK_aX[idx];
   }
   else if (TB == AtlasConj)
   {
      out->incBk = kb+kb;
      out->incBn = nb*(ldb+ldb);
      if (SCALAR_IS_ONE(alpha))
         out->b2blk  = ATL_RKK_BC2BLK_a1[idx];
      else if (SCALAR_IS_NONE(alpha))
         out->b2blk  = ATL_RKK_BC2BLK_an[idx];
      else /* alpha = X */
         out->b2blk  = ATL_RKK_BC2BLK_aX[idx];
   }
   #endif
   else  /* TB == AtlasTrans */
   {
      out->incBk = kb*(ldb SHIFT);
      out->incBn = nb SHIFT;
      if (SCALAR_IS_ONE(alpha))
         out->b2blk  = ATL_RKK_BT2BLK_a1[idx];
      else if (SCALAR_IS_NONE(alpha))
         out->b2blk  = ATL_RKK_BT2BLK_an[idx];
      else /* alpha = X */
         out->b2blk  = ATL_RKK_BT2BLK_aX[idx];
   }
}

/*
 * Chooses the rank-K amm case to use for the full-KB steps of an MxNxK
 * product (described in out), and the case to use for the K remainder
 * (described in outR).
 * out->idx:  -1 if no full-KB case is used (K <= ATL_AMM_MAXKB), else the
 *            index of the case with the lowest estimated time; out is only
 *            filled in when out->idx != -1.
 * outR->idx: -2 if K is a multiple of the chosen kb (no K-cleanup),
 *            -1 if no amm case was selected for the cleanup (use ger1/2),
 *            else a case index (KR-3 for a remainder KR > 2).
 * outR's remaining fields are filled in only if a cleanup case (>= 0) is
 * in use once the search for a compatible kernel below has finished.
 */
void Mjoin(PATL,GetBestKBInfo)
(
   rkinfo_t *out,            /* what to use for full-KB blocks */
   rkinfo_t *outR,           /* what to use for K-cleanup */
   enum ATLAS_TRANS TA,
   enum ATLAS_TRANS TB,
   ATL_CSZT M,
   ATL_CSZT N,
   ATL_CSZT K,
   size_t lda,
   size_t ldb,
   size_t ldc,
   const SCALAR alpha,
   const SCALAR beta
)
{
   int idx;        /* amm index to use for full kb calls */
   int idr=(-1);   /* amm index for final colpan (-1: use ger1/2) */
   int KR;         /* K%kb */
   int i;

/*
 * Small K: no full-KB blocks, entire K is handled as the remainder
 */
   if (K <= ATL_AMM_MAXKB)
   {
      idx = -1;
      idr = (K > 2) ? K-3 : -1;
      KR = K;
   }
/*
 * Otherwise, estimate the time of every case and keep the fastest
 */
   else
   {
      int ibest=0;
/*
 *    Do the product in double so that M*N*K cannot wrap as an integer
 */
      double timB=((double)M)*N*K*ATL_rkAMM_TIME[0];

/*
 *    ibest starts at case 0, so KR must start as case 0's remainder in
 *    case no candidate beats the initial estimate
 */
      KR = K % ATL_AMM_KBs[0];
      for (i=0; i < ATL_AMM_NCASES; i++)
      {
         ATL_CINT kb=ATL_AMM_KBs[i], nkb=K/kb, kr=K-nkb*kb;
         ATL_CINT mb=ATL_AMM_MBs[i], nb=ATL_AMM_NBs[i], nmb=M/mb, nnb=N/nb;
         ATL_CINT mr = M-nmb*mb, nr = N-nnb*nb;
         double nfcblks, tkr, tpan, tim;

         nfcblks = nmb;
         nfcblks *= nnb;
/*
 *       K remainder handled by case kr-3: scale its time from its own
 *       block size to this case's block size
 */
         if (kr > 2)
         {
            const int ikr = kr-3;
            tkr = ATL_rkAMM_TIME[ikr];
            tkr /= ((double)ATL_AMM_MBs[ikr])*ATL_AMM_NBs[ikr]*ATL_AMM_KBs[ikr];
            tkr *= ((double)mb)*nb*kb;
         }
         else if (!kr)
            tkr = 0.0;
/*
 *       Strongly penalize kr==1,2, since they require double write of C
 */
         else
         {
            if (kr == 1)
               tkr = ATL_rkAMM_TIME[0] * 8.0;
            else
               tkr = ATL_rkAMM_TIME[0] * 6.0;
            tkr *= (0.015625*mb)*nb*kb;  /* b^3/4^3 */
         }
         tpan = nkb*ATL_rkAMM_TIME[i] + tkr;
         tim = nfcblks*tpan;
/*
 *       Rank-K perf not much affected by MB, so M cleanup only reduced
 *       by extra computation.
 *       NOTE(review): this scales by nmb, while the partial-M row would
 *       seem to hold nnb blocks -- confirm which count is intended.
 */
         if (mr)
            tim += nmb*tpan*mr / (double)mb;
/*
 *       Small N drastically affects rank-K perf, so estimate its speed
 *       as being equal to the minimum of rank-K or rank-nr.
 *       We ignore kr for this, shouldn't be a big deal.
 */
         if (nr)
         {
            if (nr < K)
            {
               const int kk = (nr >= 3) ? nr-3 : 0;
               double tpe = ATL_rkAMM_TIME[kk];        /* time per element */
               tpe /= ((double)ATL_AMM_MBs[kk])*ATL_AMM_NBs[kk]*ATL_AMM_KBs[kk];
               tpe *= ((double)mb)*nr*kb;
               tim += tpe*nmb;
            }
            else
               tim += tpan;
         }
         if (tim < timB)
         {
            timB = tim;
            ibest = i;
            KR = kr;
         }
      }
      idx = ibest;
      if (KR > 2)
         idr = KR-3;
      else
         idr = (KR == 0) ? -2 : -1;
   }
   out->idx = idx;
   if (idx != -1)
      FillInRankKInf(out, idx, TA, TB, M, N, K, lda, ldb, ldc,alpha,beta,0,0);
/*
 * NOTE(review): outR->idx records idr as chosen above; the search below
 * may change idr afterwards without updating outR->idx -- confirm that
 * callers expect the pre-search value.
 */
   outR->idx = idr;
   if (idr != -2)
   {
      int mu=0, nu=0;  /* only meaningful when out was filled (idx != -1) */

/*
 *    If present K-clean candidate doesn't have same C format as the full-KB
 *    kernel (or there is no candidate), search for one that does.
 *    out is only read when it has actually been filled in.
 */
      if (idx != -1)
      {
         mu = out->mu;
         nu = out->nu;
         if (idr == -1 || mu != ATL_AMM_MUs[idr] || nu != ATL_AMM_NUs[idr])
         {
            const int kbmin = ATL_AMM_KBMINs[idx];
/*
 *          See if present kernel can perform K-cleanup itself
 */
            if (ATL_AMM_KRUNTIME(idx) && kbmin <= KR)
               idr = idx;
/*
 *          Look thru all avail kerns for one that matches MU/NU & can
 *          handle KR
 */
            else
            {
               int nidr=(-1);
               for (i=0; i < ATL_AMM_NCASES; i++)
               {
                  if (ATL_AMM_MUs[i] == mu && ATL_AMM_NUs[i] == nu &&
                      ATL_AMM_KRUNTIME(i) && ATL_AMM_KBMINs[i] <= KR)
                  {
                     nidr = i;
                     if (!ATL_AMM_KMAJOR(i)) /* K-vect kerns slow for small KR */
                        break;               /* so only quit if not K-vect */
                  }
               }
/*
 *             May later want to search thru sq/ge KClean kerns for matching
 *             kerns.  This will complicate things slightly, since the idx
 *             will be for the wrong header files.  For now, call present
 *             search OK
 */
               if (nidr != -1)
                  idr = nidr;
            }
         }
      }
      if (idr >= 0)
      {
/*
 *       If we want to use this, will need to change amminstall to ensure
 *       this.  For now, simply assert it so we can assume compatible C.
 *       Any valid full-KB case (including index 0) shares its mb/nb with
 *       the cleanup case.
 */
         if (idx >= 0)
         {
            FillInRankKInf(outR, idr, TA, TB, M, N, KR, lda, ldb, ldc,
                           alpha, beta, out->mb, out->nb);
            ATL_assert(outR->mu == mu && outR->nu == nu);
         }
         else
            FillInRankKInf(outR, idr, TA, TB, M, N, KR, lda, ldb, ldc,
                           alpha, beta, 0, 0);
      }
   }
/*
 * Trace of the selected cases
 */
   if (idx >= 0)
      printf("IDX=%d, B=(%d,%d,%d), U=(%d,%d,%d)\n",
             idx, out->mb, out->nb, out->kb, out->mu, out->nu, out->ku);
   else
      printf("IDX=%d\n", idx);
   if (idr >= 0)
      printf("IDR=%d, B=(%d,%d,%d), U=(%d,%d,%d)\n",
             idr, outR->mb, outR->nb, outR->kb, outR->mu, outR->nu, outR->ku);
   else
      printf("IDR=%d\n", idr);
}

