#include "atlas_misc.h"
#include "atlas_cplxsimd.h"

/*
 * SIMD dot-product kernel for complex vectors.  Accumulates, over N complex
 * elements, the lane-wise products X*Y (in vdotR) and swapRI(X)*Y (in vdotI),
 * combines the two accumulators with ATL_vcxdotcomb, and stores one complex
 * result through dot.
 *
 * N    : number of complex elements
 * X, Y : input vectors; each complex element occupies two consecutive TYPEs
 * incx, incy : not referenced in this routine; the data is read contiguously
 * dot  : destination of the result (written by ATL_vcxust1)
 *
 * The routine may exchange the X and Y pointers to obtain an alignable or
 * already-aligned X, so the result is presumably symmetric in X and Y
 * (NOTE(review): confirm against ATL_vcxdotcomb).
 */
void ATL_UDOT(const int N, const TYPE *X, const int incx,
              const TYPE *Y, const int incy, SCALAR dot)
{
   ATL_VTYPE vdotR, vdotI, vX, vXr, vY;
   int NV=N, nr, i;

   ATL_vzero(vdotR);
   ATL_vzero(vdotI);
/*
 * Fewer elements than one complex vector: one partial load handles everything
 */
   if (N < ATL_CXVLEN)
   {
      i = N;
      ATL_vcxlduXuYR(vX, X, vY, Y, i);
      goto ONEANDDONE;
   }
/*
 * If neither ptr is alignable, go to unaligned code.  Otherwise, make X
 * be the alignable array, and peel to force X alignment
 */
   if ( ((size_t)X) & (ATL_sizeof-1) )
   {
      const TYPE *p;
      if ( ((size_t)Y) & (ATL_sizeof-1) )
         goto NOALIGN;
      p = X;  /* Y can be aligned, and X cannot */
      X = Y;  /* so set X ptr to alignable array */
      Y = p;  /* Y is unalignable in this case */
   }
/*
 * See if either array is already aligned, and let that one be X if so
 */
   i = (int)(((size_t)X) & (ATL_VLENb-1));
   if (i)
   {
      int j;
      j = (int)(((size_t)Y) & (ATL_VLENb-1));
      if (!j)
      {
         const TYPE *p=X;
         X = Y;
         Y = p;
         i = 0;
      }
   }
/*
 * If X is not aligned, but is alignable, peel just enough elements to bring
 * X up to the next vector boundary.
 */
   if (i)
   {
/*
 *    i is X's byte offset past the previous vector boundary, so the distance
 *    to the next boundary is ATL_VLENb-i bytes.  Peeling ATL_DivBySize(i)
 *    elements (the old behaviour) only lands on a boundary when the offset
 *    is exactly half a vector.  Assumes ATL_DivBySize converts bytes to
 *    complex elements, consistent with i being subtracted from N below and
 *    the pointers advancing by 2*i TYPEs.  Since N >= ATL_CXVLEN here, the
 *    peel never exceeds N.
 */
      i = ATL_DivBySize(ATL_VLENb - i);
      ATL_vcxlduXuYR(vX, X, vY, Y, i);
      X += i+i;
      Y += i+i;
   }
   else  /* otherwise, peel full vec iteration to zero dot */
   {
      i = ATL_VLEN>>1;
      ATL_vld(vX, X);
      ATL_vuld(vY, Y);  /* unknown alignment at this point! */
      X += ATL_VLEN;
      Y += ATL_VLEN;
   }
/*
 * First (peeled) contribution initializes the accumulators with a multiply;
 * i holds the number of complex elements consumed so far
 */
ONEANDDONE:
   ATL_vcxswapRI(vXr, vX);
   ATL_vmul(vdotR, vX, vY);
   ATL_vmul(vdotI, vXr, vY);
   NV = N - i;
   if (!NV)
      goto VEC_REDUCE;
   nr = NV;                /* elements still to process */
   NV >>= ATL_CXVLSH;      /* number of full vector iterations */
   nr -= NV<<ATL_CXVLSH;   /* leftover elements after the full iterations */
   if (NV)  /* have some vector its left */
   {
      if ( ((size_t)Y) & (ATL_VLENb-1) )  /* if Y not aligned */
          goto NOALIGNY;
      do
      {
         ATL_vld(vX, X);
         ATL_vcxswapRI(vXr, vX);
         ATL_vld(vY, Y);
         ATL_vmac(vdotR, vX, vY);
         ATL_vmac(vdotI, vXr, vY);
         X += ATL_VLEN;
         Y += ATL_VLEN;
      }
      while (--NV);
   }
   if (nr)  /* have a remainder */
   {
/*
 *    Y's alignment is only tested above when NV != 0, so it is not
 *    established on every path reaching here.  Use the remainder load that
 *    makes no alignment assumption for either operand (the same one used by
 *    CLEANUP below) rather than handing Y to an aligned-operand slot.
 */
      ATL_vcxlduXuYR(vX, X, vY, Y, nr);
      ATL_vcxswapRI(vXr, vX);
      ATL_vmac(vdotR, vX, vY);
      ATL_vmac(vdotI, vXr, vY);
   }
/*
 * Fold the two accumulators into the final result and store it
 */
VEC_REDUCE:
   ATL_vcxdotcomb(vdotR, vdotI);
   ATL_vcxust1(dot, vdotR);
   return;
/*
 * Code where X is aligned, but Y is not: NV >= 1 full iterations remain
 */
NOALIGNY:
   do
   {
      ATL_vld(vX, X);
      ATL_vcxswapRI(vXr, vX);
      ATL_vuld(vY, Y);
      ATL_vmac(vdotR, vX, vY);
      ATL_vmac(vdotI, vXr, vY);
      X += ATL_VLEN;
      Y += ATL_VLEN;
   }
   while (--NV);
   goto CLEANUP;
/*
 * Code where both pointers unalignable.  Nothing has been peeled on this
 * path, so NV still equals N and the accumulators are still zero.
 */
NOALIGN:
   nr = NV & ((ATL_VLEN>>1)-1);
   NV -= nr;
   for (i=0; i < NV; i += (ATL_VLEN>>1))
   {
      ATL_vuld(vX, X);
      ATL_vcxswapRI(vXr, vX);
      ATL_vuld(vY, Y);
      ATL_vmac(vdotR, vX, vY);
      ATL_vmac(vdotI, vXr, vY);
      X += ATL_VLEN;
      Y += ATL_VLEN;
   }
/*
 * Remainder handling shared by the unaligned paths
 */
CLEANUP:
   if (nr)
   {
      ATL_vcxlduXuYR(vY, Y, vX, X, nr);
      ATL_vcxswapRI(vXr, vX);
      ATL_vmac(vdotR, vX, vY);
      ATL_vmac(vdotI, vXr, vY);
   }
   goto VEC_REDUCE;
}
