/*
 * Automatically Tuned Linear Algebra Software v3.11.38
 * Copyright (C) 2010 R. Clint Whaley
 */
/*
 * Build guard: the kernel in this file is written with the SSE intrinsics
 * from <xmmintrin.h>, so compilation is stopped with an explicit message
 * unless ATL_SSE1 is defined (presumably by the ATLAS build configuration --
 * confirm against the build system).
 */
#ifndef ATL_SSE1
   #error "This routine requires SSE1!"
#endif
#include <xmmintrin.h>
#include <stdio.h>
#include "atlas_misc.h"

/*
 * Rank-1 update kernel using SSE packed-single intrinsics.  For every
 * 0 <= i < M and 0 <= j < N it performs
 *    A[i + j*lda1] += X[i] * Y[j]
 *
 * M, N : extents of the update along X and Y, respectively
 * X    : M contiguous elements (read only)
 * Y    : N contiguous elements (read only)
 * A    : updated in place; consecutive j are lda1 elements apart
 * lda1 : distance, in elements, between A[i + j*lda1] and A[i + (j+1)*lda1]
 *
 * Work is done on 4 values of j at a time (remaining j are done singly).
 * For each j, the i range is split into:
 *   (1) a scalar peel of MAp elements, up to the first 16-byte boundary of A;
 *   (2) a vector loop handling 8 elements (two __m128) per iteration;
 *   (3) a scalar loop over whatever is left.
 *
 * Preconditions (not checked here):
 *   - The vector loop uses _mm_load_ps/_mm_store_ps on X+i and on
 *     A+i+k*lda1 (k=0..3), which need 16-byte aligned addresses.  The peel
 *     count is derived from A alone, so X must sit at the same offset from a
 *     16-byte boundary as A, and lda1*sizeof(TYPE) must be a multiple of 16.
 *   - TYPE is assumed to be float, since the _ps/_ss intrinsics take float
 *     pointers.
 */
void ATL_UGERK
   (ATL_CINT M, ATL_CINT N, const TYPE *X, const TYPE *Y,
    TYPE *A, ATL_CINT lda1)
{/* BEGIN GER: nMU=1, MU=8, NU=4 */
   ATL_INT i, j;
/*
 * Number of elements between A and the first 16-byte boundary at or after A
 */
   ATL_CINT nToAlign = ( (((((size_t)A)+15)>>4)<<4) - ((size_t)A) )/sizeof(TYPE);
/*
 * Never peel more elements than exist: without this clamp, M < nToAlign
 * would make the peel loops read X and read/write A past element M-1.
 */
   ATL_CINT MAp = (nToAlign < M) ? nToAlign : M;
   ATL_CINT MA=M-MAp;                    /* elements left after the peel */
   ATL_CINT M8=((MA/8)*8)+MAp;           /* end of the 8-wide vector loop */
   ATL_CINT N4=((N/4)*4);                /* end of the 4-wide j loop */
   ATL_CINT lda2=lda1+lda1, lda3=lda2+lda1, lda4=lda3+lda1;
   __m128 x0, x4, y0, y1, y2, y3;
   __m128 a0_0, a4_0, a0_1, a4_1, a0_2, a4_2, a0_3, a4_3;

   for (j=0; j < N4; j += 4, A += lda4, Y += 4)
   {/* BEGIN N-LOOP UR=4 */
/*
 *    Broadcast the four Y values used by this group of j
 */
      y0 = _mm_load1_ps(Y);
      y1 = _mm_load1_ps(Y+1);
      y2 = _mm_load1_ps(Y+2);
      y3 = _mm_load1_ps(Y+3);

      for (i=0; i < MAp; i++)
      {/* peel to force X/A alignment */
         x0 = _mm_load_ss(X+i);
         a0_0 = _mm_load_ss(A+i);
         a0_0 = _mm_add_ss(a0_0, _mm_mul_ss(x0, y0));
         _mm_store_ss(A+i, a0_0);
         a0_1 = _mm_load_ss(A+i+lda1);
         a0_1 = _mm_add_ss(a0_1, _mm_mul_ss(x0, y1));
         _mm_store_ss(A+i+lda1, a0_1);
         a0_2 = _mm_load_ss(A+i+lda2);
         a0_2 = _mm_add_ss(a0_2, _mm_mul_ss(x0, y2));
         _mm_store_ss(A+i+lda2, a0_2);
         a0_3 = _mm_load_ss(A+i+lda3);
         a0_3 = _mm_add_ss(a0_3, _mm_mul_ss(x0, y3));
         _mm_store_ss(A+i+lda3, a0_3);
      } /* end force-align peel */

      for (i=MAp; i < M8; i += 8)
      {/* ----- BEGIN M-LOOP BODY ----- */
         x0 = _mm_load_ps(X+i);
         a0_0 = _mm_load_ps(A+i);
         a0_0 = _mm_add_ps(a0_0, _mm_mul_ps(x0, y0));
         _mm_store_ps(A+i, a0_0);
         x4 = _mm_load_ps(X+i+4);
         a4_0 = _mm_load_ps(A+i+4);
         a4_0 = _mm_add_ps(a4_0, _mm_mul_ps(x4, y0));
         _mm_store_ps(A+i+4, a4_0);
         a0_1 = _mm_load_ps(A+i+lda1);
         a0_1 = _mm_add_ps(a0_1, _mm_mul_ps(x0, y1));
         _mm_store_ps(A+i+lda1, a0_1);
         a4_1 = _mm_load_ps(A+i+4+lda1);
         a4_1 = _mm_add_ps(a4_1, _mm_mul_ps(x4, y1));
         _mm_store_ps(A+i+4+lda1, a4_1);
         a0_2 = _mm_load_ps(A+i+lda2);
         a0_2 = _mm_add_ps(a0_2, _mm_mul_ps(x0, y2));
         _mm_store_ps(A+i+lda2, a0_2);
         a4_2 = _mm_load_ps(A+i+4+lda2);
         a4_2 = _mm_add_ps(a4_2, _mm_mul_ps(x4, y2));
         _mm_store_ps(A+i+4+lda2, a4_2);
         a0_3 = _mm_load_ps(A+i+lda3);
         a0_3 = _mm_add_ps(a0_3, _mm_mul_ps(x0, y3));
         _mm_store_ps(A+i+lda3, a0_3);
         a4_3 = _mm_load_ps(A+i+4+lda3);
         a4_3 = _mm_add_ps(a4_3, _mm_mul_ps(x4, y3));
         _mm_store_ps(A+i+4+lda3, a4_3);
      }/* ----- END M-LOOP BODY ----- */

      for (i=M8; i < M; i++)
      {/* ----- BEGIN SCALAR M CLEANUP ----- */
         x0 = _mm_load_ss(X+i);
         a0_0 = _mm_load_ss(A+i);
         a0_0 = _mm_add_ss(a0_0, _mm_mul_ss(x0, y0));
         _mm_store_ss(A+i, a0_0);
         a0_1 = _mm_load_ss(A+i+lda1);
         a0_1 = _mm_add_ss(a0_1, _mm_mul_ss(x0, y1));
         _mm_store_ss(A+i+lda1, a0_1);
         a0_2 = _mm_load_ss(A+i+lda2);
         a0_2 = _mm_add_ss(a0_2, _mm_mul_ss(x0, y2));
         _mm_store_ss(A+i+lda2, a0_2);
         a0_3 = _mm_load_ss(A+i+lda3);
         a0_3 = _mm_add_ss(a0_3, _mm_mul_ss(x0, y3));
         _mm_store_ss(A+i+lda3, a0_3);
      }/* ----- END SCALAR M CLEANUP ----- */
   }/* END N-LOOP UR=4 */

   for (j=N4; j < N; j += 1, A += lda1, Y++)
   {/* BEGIN N-LOOP UR=1 */
      y0 = _mm_load1_ps(Y);

      for (i=0; i < MAp; i++)
      {/* peel to force X/A alignment */
         x0 = _mm_load_ss(X+i);
         a0_0 = _mm_load_ss(A+i);
         a0_0 = _mm_add_ss(a0_0, _mm_mul_ss(x0, y0));
         _mm_store_ss(A+i, a0_0);
      } /* end force-align peel */

      for (i=MAp; i < M8; i += 8)
      {/* ----- BEGIN M-LOOP BODY ----- */
         x0 = _mm_load_ps(X+i);
         a0_0 = _mm_load_ps(A+i);
         a0_0 = _mm_add_ps(a0_0, _mm_mul_ps(x0, y0));
         _mm_store_ps(A+i, a0_0);
         x4 = _mm_load_ps(X+i+4);
         a4_0 = _mm_load_ps(A+i+4);
         a4_0 = _mm_add_ps(a4_0, _mm_mul_ps(x4, y0));
         _mm_store_ps(A+i+4, a4_0);
      }/* ----- END M-LOOP BODY ----- */

      for (i=M8; i < M; i++)
      {/* ----- BEGIN SCALAR M CLEANUP ----- */
         x0 = _mm_load_ss(X+i);
         a0_0 = _mm_load_ss(A+i);
         a0_0 = _mm_add_ss(a0_0, _mm_mul_ss(x0, y0));
         _mm_store_ss(A+i, a0_0);
      }/* ----- END SCALAR M CLEANUP ----- */
   }/* END N-LOOP UR=1 */
}/* END GER: nMU=1, MU=8, NU=4 */
/*
 * Remove any preprocessor macros named MA or MAp so they do not leak past
 * this point.  Nothing in the visible part of this file #defines either
 * name, so these directives only have an effect if an included header (or
 * the compiler command line) defines them.
 */
#ifdef MA
   #undef MA
#endif
#ifdef MAp
   #undef MAp
#endif
