/*
 * Automatically Tuned Linear Algebra Software v3.11.38
 * Copyright (C) 2013 R. Clint Whaley
 */
#ifndef ATL_GAS_x8664
   #error "This kernel requires x86-64 assembly!"
#endif
#ifndef ATL_SSE3
   #error "This routine requires SSE3!"
#endif
#include "atlas_asm.h"

/*
 * SSE register roles.
 * rm0     : scratch; loaded with a 16-byte vector of A, multiplied by rB0,
 *           then added into one of the accumulators.
 * rB0     : the B vector used by the current unrolled K step.
 * rC0-rC11: twelve packed-single accumulators.  In the first K step each is
 *           loaded from A and multiplied by the first B vector, which is held
 *           in rC11 until rC11 itself is multiplied by the last A vector.
 */
#define rm0     %xmm0
#define rB0     %xmm1
#define rC0     %xmm2
#define rC1     %xmm3
#define rC2     %xmm4
#define rC3     %xmm5
#define rC4     %xmm6
#define rC5     %xmm7
#define rC6     %xmm8
#define rC7     %xmm9
#define rC8     %xmm10
#define rC9     %xmm11
#define rC10    %xmm12
#define rC11    %xmm13

/*
 * Integer register roles.  "API reg" means the value arrives in that register
 * as a function argument.  The visible code never reads K from %rdx; it uses
 * the compile-time constant KB instead, which is why KK stays commented out.
 */
/* #define KK      %rdx */   /* API register */
#define pA      %rcx   /* API reg; advanced by 128 bytes at entry */
#define pB      %rax   /* comes in as r8; advanced by 128 bytes at entry */
#define NMU     %rdi   /* API reg */
#define NNU     %rsi   /* API reg */
#define pC      %r9    /* API reg */
#define pfB     %r10   /* prefetch pointer, loaded from stack argument pBn */
#define pfA     %r8    /* prefetch pointer, loaded from stack argument pAn */

/*
 * Values set up once at function entry.
 */
#define NNU0    %r11   /* copy of NNU */
#define incA    %r12   /* 12*KB*4 bytes */
#define pB0     %r13   /* copy of pB, taken after the 128-byte advance */
#define r192    %r14   /* the constant 192 */
/* #define K0      %r15 */
/*
 * FSIZE: bytes of stack reserved at entry -- six 8-byte slots holding
 * rbp, rbx, r12, r13, r14 and r15.
 */
#define FSIZE 6*8
/*
 * VOP: SSE instruction selected by the beta case at compile time.
 *    BETAN1 -> subps, BETA1 -> addps, otherwise VOP is left undefined.
 * NOTE(review): VOP is not used in the part of the file visible here;
 * presumably it combines the existing C values with the computed product
 * -- confirm against the store code.
 */
#ifdef BETAN1
   #define VOP subps
#elif defined(BETA1)
   #define VOP addps
#elif defined(VOP)
   #undef VOP
#endif
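/*
 * C prototype of this kernel; above each argument is the register or stack
 * slot it arrives in.  Stack offsets are relative to %rsp at function entry,
 * i.e. before FSIZE bytes are subtracted for the register save area.
 *
 *                    rdi      rsi    rdx        rcx         r8        r9
 * void ATL_USERMM(SZT nmu, SZT nnu, SZT K, CTYPE *pA, CTYPE *pB, TYPE *pC,
 *                   8(%rsp)    16(%rsp)     24(%rsp)
 *                 CTYPE *pAn, CTYPE *pBn, CTYPE *pCn);
 */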
.text
.global ATL_asmdecor(ATL_USERMM)
ALIGN16
ATL_asmdecor(ATL_USERMM):
/*
 * Save callee-saved iregs
 */
   sub  $FSIZE, %rsp
   movq %rbp, (%rsp)
   movq %rbx, 8(%rsp)
   movq %r12, 16(%rsp)
   movq %r13, 24(%rsp)
   movq %r14, 32(%rsp)
   movq %r15, 40(%rsp)

   mov %r8, pB
   sub $-128, pA
   sub $-128, pB
   mov pB, pB0
   movq FSIZE+16(%rsp), pfB
   movq FSIZE+8(%rsp), pfA
   mov $12*KB*4, incA   /* incA = 12*K*sizeof */
   mov $192, r192
   mov NNU, NNU0

   ALIGN16
   MLOOP:
/*      NLOOP: */
         movaps -128(pB), rC11
         movaps -128(pA), rC0
         mulps rC11, rC0
         movaps -112(pA), rC1
            prefetchw (pC)
         mulps rC11, rC1
         movaps -96(pA), rC2
            prefetcht0 (pfB)
         mulps rC11, rC2
         movaps -80(pA), rC3
            prefetcht0 (pfA)
         mulps rC11, rC3
         movaps -64(pA), rC4
         mulps rC11, rC4
         movaps -48(pA), rC5
         mulps rC11, rC5
         movaps -32(pA), rC6
         mulps rC11, rC6
         movaps -16(pA), rC7
         mulps rC11, rC7
         movaps (pA), rC8
         mulps rC11, rC8
         movaps 16(pA), rC9
         mulps rC11, rC9
         movaps 32(pA), rC10
         mulps rC11, rC10
         mulps 48(pA), rC11

            add $48, pfB
            add $48, pfA
/*         KLOOP: */
         #if KB > 4
            movaps -112(pB), rB0
            movaps 64(pA), rm0
            mulps rB0, rm0
            addps rm0, rC0
            movaps 80(pA), rm0
            mulps rB0, rm0
            addps rm0, rC1
            movaps 96(pA), rm0
            mulps rB0, rm0
            addps rm0, rC2
            movaps 112(pA), rm0
            mulps rB0, rm0
            addps rm0, rC3
            movaps 128(pA), rm0
            mulps rB0, rm0
            addps rm0, rC4
            movaps 144(pA), rm0
            mulps rB0, rm0
            addps rm0, rC5
            movaps 160(pA), rm0
            mulps rB0, rm0
            addps rm0, rC6
            movaps 176(pA), rm0
            mulps rB0, rm0
            addps rm0, rC7
            movaps 192(pA), rm0
            mulps rB0, rm0
            addps rm0, rC8
            movaps  208(pA), rm0
            mulps rB0, rm0
            addps rm0, rC9
            movaps 224(pA), rm0
            mulps rB0, rm0
            addps rm0, rC10
            mulps 240(pA), rB0
            addps rB0, rC11
         #endif
         #if KB > 8
            movaps -96(pB), rB0
            movaps 256(pA), rm0
            mulps rB0, rm0
            addps rm0, rC0
            movaps 272(pA), rm0
            mulps rB0, rm0
            addps rm0, rC1
            movaps 288(pA), rm0
            mulps rB0, rm0
            addps rm0, rC2
            movaps 304(pA), rm0
            mulps rB0, rm0
            addps rm0, rC3
            movaps 320(pA), rm0
            mulps rB0, rm0
            addps rm0, rC4
            movaps 336(pA), rm0
            mulps rB0, rm0
            addps rm0, rC5
            movaps 352(pA), rm0
            mulps rB0, rm0
            addps rm0, rC6
            movaps 368(pA), rm0
            mulps rB0, rm0
            addps rm0, rC7
            movaps 384(pA), rm0
            mulps rB0, rm0
            addps rm0, rC8
            movaps  400(pA), rm0
            mulps rB0, rm0
            addps rm0, rC9
            movaps 416(pA), rm0
            mulps rB0, rm0
            addps rm0, rC10
            mulps 432(pA), rB0
            addps rB0, rC11
         #endif
         #if KB > 12
            movaps -80(pB), rB0
            movaps 448(pA), rm0
            mulps rB0, rm0
            addps rm0, rC0
            movaps 464(pA), rm0
            mulps rB0, rm0
            addps rm0, rC1
            movaps 480(pA), rm0
            mulps rB0, rm0
            addps rm0, rC2
            movaps 496(pA), rm0
            mulps rB0, rm0
            addps rm0, rC3
            movaps 512(pA), rm0
            mulps rB0, rm0
            addps rm0, rC4
            movaps 528(pA), rm0
            mulps rB0, rm0
            addps rm0, rC5
            movaps 544(pA), rm0
            mulps rB0, rm0
            addps rm0, rC6
            movaps 560(pA), rm0
            mulps rB0, rm0
            addps rm0, rC7
            movaps 576(pA), rm0
            mulps rB0, rm0
            addps rm0, rC8
            movaps  592(pA), rm0
            mulps rB0, rm0
            addps rm0, rC9
            movaps 608(pA), rm0
            mulps rB0, rm0
            addps rm0, rC10
            mulps 624(pA), rB0
            addps rB0, rC11
         #endif
         #if KB > 16
            movaps -64(pB), rB0
            movaps 640(pA), rm0
            mulps rB0, rm0
            addps rm0, rC0
            movaps 656(pA), rm0
            mulps rB0, rm0
            addps rm0, rC1
            movaps 672(pA), rm0
            mulps rB0, rm0
            addps rm0, rC2
            movaps 688(pA), rm0
            mulps rB0, rm0
            addps rm0, rC3
            movaps 704(pA), rm0
            mulps rB0, rm0
            addps rm0, rC4
            movaps 720(pA), rm0
            mulps rB0, rm0
            addps rm0, rC5
            movaps 736(pA), rm0
            mulps rB0, rm0
            addps rm0, rC6
            movaps 752(pA), rm0
            mulps rB0, rm0
            addps rm0, rC7
            movaps 768(pA), rm0
            mulps rB0, rm0
            addps rm0, rC8
            movaps  784(pA), rm0
            mulps rB0, rm0
            addps rm0, rC9
            movaps 800(pA), rm0
            mulps rB0, rm0
            addps rm0, rC10
            mulps 816(pA), rB0
            addps rB0, rC11
         #endif
         #if KB > 20
            movaps -48(pB), rB0
            movaps 832(pA), rm0
            mulps rB0, rm0
            addps rm0, rC0
            movaps 848(pA), rm0
            mulps rB0, rm0
            addps rm0, rC1
            movaps 864(pA), rm0
            mulps rB0, rm0
            addps rm0, rC2
            movaps 880(pA), rm0
            mulps rB0, rm0
            addps rm0, rC3
            movaps 896(pA), rm0
            mulps rB0, rm0
            addps rm0, rC4
            movaps 912(pA), rm0
            mulps rB0, rm0
            addps rm0, rC5
            movaps 928(pA), rm0
            mulps rB0, rm0
            addps rm0, rC6
            movaps 944(pA), rm0
            mulps rB0, rm0
            addps rm0, rC7
            movaps 960(pA), rm0
            mulps rB0, rm0
            addps rm0, rC8
            movaps  976(pA), rm0
            mulps rB0, rm0
            addps rm0, rC9
            movaps 992(pA), rm0
            mulps rB0, rm0
            addps rm0, rC10
            mulps 1008(pA), rB0
            addps rB0, rC11
         #endif
         #if KB > 24
            movaps -32(pB), rB0
            movaps 1024(pA), rm0
            mulps rB0, rm0
            addps rm0, rC0
            movaps 1040(pA), rm0
            mulps rB0, rm0
            addps rm0, rC1
            movaps 1056(pA), rm0
            mulps rB0, rm0
            addps rm0, rC2
            movaps 1072(pA), rm0
            mulps rB0, rm0
            addps rm0, rC3
            movaps 1088(pA), rm0
            mulps rB0, rm0
            addps rm0, rC4
            movaps 1104(pA), rm0
            mulps rB0, rm0
            addps rm0, rC5
            movaps 1120(pA), rm0
            mulps rB0, rm0
            addps rm0, rC6
            movaps 1136(pA), rm0
            mulps rB0, rm0
            addps rm0, rC7
            movaps 1152(pA), rm0
            mulps rB0, rm0
            addps rm0, rC8
            movaps  1168(pA), rm0
            mulps rB0, rm0
            addps rm0, rC9
            movaps 1184(pA), rm0
            mulps rB0, rm0
            addps rm0, rC10
            mulps 1200(pA), rB0
            addps rB0, rC11
         #endif
         #if KB > 28
            movaps -16(pB), rB0
            movaps 1216(pA), rm0
            mulps rB0, rm0
            addps rm0, rC0
            movaps 1232(pA), rm0
            mulps rB0, rm0
            addps rm0, rC1
            movaps 1248(pA), rm0
            mulps rB0, rm0
            addps rm0, rC2
            movaps 1264(pA), rm0
            mulps rB0, rm0
            addps rm0, rC3
            movaps 1280(pA), rm0
            mulps rB0, rm0
            addps rm0, rC4
            movaps 1296(pA), rm0
            mulps rB0, rm0
            addps rm0, rC5
            movaps 1312(pA), rm0
            mulps rB0, rm0
            addps rm0, rC6
            movaps 1328(pA), rm0
            mulps rB0, rm0
            addps rm0, rC7
            movaps 1344(pA), rm0
            mulps rB0, rm0
            addps rm0, rC8
            movaps  1360(pA), rm0
            mulps rB0, rm0
            addps rm0, rC9
            movaps 1376(pA), rm0
            mulps rB0, rm0
            addps rm0, rC10
            mulps 1392(pA), rB0
            addps rB0, rC11
         #endif
         #if KB > 32
            movaps 0(pB), rB0
            movaps 1408(pA), rm0
            mulps rB0, rm0
            addps rm0, rC0
            movaps 1424(pA), rm0
            mulps rB0, rm0
            addps rm0, rC1
            movaps 1440(pA), rm0
            mulps rB0, rm0
            addps rm0, rC2
            movaps 1456(pA), rm0
            mulps rB0, rm0
            addps rm0, rC3
            movaps 1472(pA), rm0
            mulps rB0, rm0
            addps rm0, rC4
            movaps 1488(pA), rm0
            mulps rB0, rm0
            addps rm0, rC5
            movaps 1504(pA), rm0
            mulps rB0, rm0
            addps rm0, rC6
            movaps 1520(pA), rm0
            mulps rB0, rm0
            addps rm0, rC7
            movaps 1536(pA), rm0
            mulps rB0, rm0
            addps rm0, rC8
            movaps  1552(pA), rm0
            mulps rB0, rm0
            addps rm0, rC9
            movaps 1568(pA), rm0
            mulps rB0, rm0
            addps rm0, rC10
            mulps 1584(pA), rB0
            addps rB0, rC11
         #endif
         #if KB > 36
            movaps 16(pB), rB0
            movaps 1600(pA), rm0
            mulps rB0, rm0
            addps rm0, rC0
            movaps 1616(pA), rm0
            mulps rB0, rm0
            addps rm0, rC1
            movaps 1632(pA), rm0
            mulps rB0, rm0
            addps rm0, rC2
            movaps 1648(pA), rm0
            mulps rB0, rm0
            addps rm0, rC3
            movaps 1664(pA), rm0
            mulps rB0, rm0
            addps rm0, rC4
            movaps 1680(pA), rm0
            mulps rB0, rm0
            addps rm0, rC5
            movaps 1696(pA), rm0
            mulps rB0, rm0
            addps rm0, rC6
            movaps 1712(pA), rm0
            mulps rB0, rm0
            addps rm0, rC7
            movaps 1728(pA), rm0
            mulps rB0, rm0
            addps rm0, rC8
            movaps  1744(pA), rm0
            mulps rB0, rm0
            addps rm0, rC9
            movaps 1760(pA), rm0
            mulps rB0, rm0
            addps rm0, rC10
            mulps 1776(pA), rB0
            addps rB0, rC11
         #endif
         #if KB > 40
            movaps 32(pB), rB0
            movaps 1792(pA), rm0
            mulps rB0, rm0
            addps rm0, rC0
            movaps 1808(pA), rm0
            mulps rB0, rm0
            addps rm0, rC1
            movaps 1824(pA), rm0
            mulps rB0, rm0
            addps rm0, rC2
            movaps 1840(pA), rm0
            mulps rB0, rm0
            addps rm0, rC3
            movaps 1856(pA), rm0
            mulps rB0, rm0
            addps rm0, rC4
            movaps 1872(pA), rm0
            mulps rB0, rm0
            addps rm0, rC5
            movaps 1888(pA), rm0
            mulps rB0, rm0
            addps rm0, rC6
            movaps 1904(pA), rm0
            mulps rB0, rm0
            addps rm0, rC7
            movaps 1920(pA), rm0
            mulps rB0, rm0
            addps rm0, rC8
            movaps  1936(pA), rm0
            mulps rB0, rm0
            addps rm0, rC9
            movaps 1952(pA), rm0
            mulps rB0, rm0
            addps rm0, rC10
            mulps 1968(pA), rB0
            addps rB0, rC11
         #endif
         #if KB > 44
            movaps 48(pB), rB0
            movaps 1984(pA), rm0
            mulps rB0, rm0
            addps rm0, rC0
            movaps 2000(pA), rm0
            mulps rB0, rm0
            addps rm0, rC1
            movaps 2016(pA), rm0
            mulps rB0, rm0
            addps rm0, rC2
            movaps 2032(pA), rm0
            mulps rB0, rm0
            addps rm0, rC3
            movaps 2048(pA), rm0
            mulps rB0, rm0
            addps rm0, rC4
            movaps 2064(pA), rm0
            mulps rB0, rm0
            addps rm0, rC5
            movaps 2080(pA), rm0
            mulps rB0, rm0
            addps rm0, rC6
            movaps 2096(pA), rm0
            mulps rB0, rm0
            addps rm0, rC7
            movaps 2112(pA), rm0
            mulps rB0, rm0
            addps rm0, rC8
            movaps  2128(pA), rm0
            mulps rB0, rm0
            addps rm0, rC9
            movaps 2144(pA), rm0
            mulps rB0, rm0
            addps rm0, rC10
            mulps 2160(pA), rB0
            addps rB0, rC11
         #endif
         #if KB > 48
            movaps 64(pB), rB0
            movaps 2176(pA), rm0
            mulps rB0, rm0
            addps rm0, rC0
            movaps 2192(pA), rm0
            mulps rB0, rm0
            addps rm0, rC1
            movaps 2208(pA), rm0
            mulps rB0, rm0
            addps rm0, rC2
            movaps 2224(pA), rm0
            mulps rB0, rm0
            addps rm0, rC3
            movaps 2240(pA), rm0
            mulps rB0, rm0
            addps rm0, rC4
            movaps 2256(pA), rm0
            mulps rB0, rm0
            addps rm0, rC5
            movaps 2272(pA), rm0
            mulps rB0, rm0
            addps rm0, rC6
            movaps 2288(pA), rm0
            mulps rB0, rm0
            addps rm0, rC7
            movaps 2304(pA), rm0
            mulps rB0, rm0
            addps rm0, rC8
            movaps  2320(pA), rm0
            mulps rB0, rm0
            addps rm0, rC9
            movaps 2336(pA), rm0
            mulps rB0, rm0
            addps rm0, rC10
            mulps 2352(pA), rB0
            addps rB0, rC11
         #endif
         #if KB > 52
            movaps 80(pB), rB0
            movaps 2368(pA), rm0
            mulps rB0, rm0
            addps rm0, rC0
            movaps 2384(pA), rm0
            mulps rB0, rm0
            addps rm0, rC1
            movaps 2400(pA), rm0
            mulps rB0, rm0
            addps rm0, rC2
            movaps 2416(pA), rm0
            mulps rB0, rm0
            addps rm0, rC3
            movaps 2432(pA), rm0
            mulps rB0, rm0
            addps rm0, rC4
            movaps 2448(pA), rm0
            mulps rB0, rm0
            addps rm0, rC5
            movaps 2464(pA), rm0
            mulps rB0, rm0
            addps rm0, rC6
            movaps 2480(pA), rm0
            mulps rB0, rm0
            addps rm0, rC7
            movaps 2496(pA), rm0
            mulps rB0, rm0
            addps rm0, rC8
            movaps  2512(pA), rm0
            mulps rB0, rm0
            addps rm0, rC9
            movaps 2528(pA), rm0
            mulps rB0, rm0
            addps rm0, rC10
            mulps 2544(pA), rB0
            addps rB0, rC11
         #endif
         #if KB > 56
            movaps 96(pB), rB0
            movaps 2560(pA), rm0
            mulps rB0, rm0
            addps rm0, rC0
            movaps 2576(pA), rm0
            mulps rB0, rm0
            addps rm0, rC1
            movaps 2592(pA), rm0
            mulps rB0, rm0
            addps rm0, rC2
            movaps 2608(pA), rm0
            mulps rB0, rm0
            addps rm0, rC3
            movaps 2624(pA), rm0
            mulps rB0, rm0
            addps rm0, rC4
            movaps 2640(pA), rm0
            mulps rB0, rm0
            addps rm0, rC5
            movaps 2656(pA), rm0
            mulps rB0, rm0
            addps rm0, rC6
            movaps 2672(pA), rm0
            mulps rB0, rm0
            addps rm0, rC7
            movaps 2688(pA), rm0
            mulps rB0, rm0
            addps rm0, rC8
            movaps  2704(pA), rm0
            mulps rB0, rm0
            addps rm0, rC9
            movaps 2720(pA), rm0
            mulps rB0, rm0
            addps rm0, rC10
            mulps 2736(pA), rB0
            addps rB0, rC11
         #endif
         #if KB > 60
            movaps 112(pB), rB0
            movaps 2752(pA), rm0
            mulps rB0, rm0
            addps rm0, rC0
            movaps 2768(pA), rm0
            mulps rB0, rm0
            addps rm0, rC1
            movaps 2784(pA), rm0
            mulps rB0, rm0
            addps rm0, rC2
            movaps 2800(pA), rm0
            mulps rB0, rm0
            addps rm0, rC3
            movaps 2816(pA), rm0
            mulps rB0, rm0
            addps rm0, rC4
            movaps 2832(pA), rm0
            mulps rB0, rm0
            addps rm0, rC5
            movaps 2848(pA), rm0
            mulps rB0, rm0
            addps rm0, rC6
            movaps 2864(pA), rm0
            mulps rB0, rm0
            addps rm0, rC7
            movaps 2880(pA), rm0
            mulps rB0, rm0
            addps rm0, rC8
            movaps  2896(pA), rm0
            mulps rB0, rm0
            addps rm0, rC9
            movaps 2912(pA), rm0
            mulps rB0, rm0
            addps rm0, rC10
            mulps 2928(pA), rB0
            addps rB0, rC11
         #endif
         #if KB > 64
            movaps 128(pB), rB0
            movaps 2944(pA), rm0
            mulps rB0, rm0
            addps rm0, rC0
            movaps 2960(pA), rm0
            mulps rB0, rm0
            addps rm0, rC1
            movaps 2976(pA), rm0
            mulps rB0, rm0
            addps rm0, rC2
            movaps 2992(pA), rm0
            mulps rB0, rm0
            addps rm0, rC3
            movaps 3008(pA), rm0
            mulps rB0, rm0
            addps rm0, rC4
            movaps 3024(pA), rm0
            mulps rB0, rm0
            addps rm0, rC5
            movaps 3040(pA), rm0
            mulps rB0, rm0
            addps rm0, rC6
            movaps 3056(pA), rm0
            mulps rB0, rm0
            addps rm0, rC7
            movaps 3072(pA), rm0
            mulps rB0, rm0
            addps rm0, rC8
            movaps  3088(pA), rm0
            mulps rB0, rm0
            addps rm0, rC9
            movaps 3104(pA), rm0
            mulps rB0, rm0
            addps rm0, rC10
            mulps 3120(pA), rB0
            addps rB0, rC11
         #endif
         #if KB > 68
            movaps 144(pB), rB0
            movaps 3136(pA), rm0
            mulps rB0, rm0
            addps rm0, rC0
            movaps 3152(pA), rm0
            mulps rB0, rm0
            addps rm0, rC1
            movaps 3168(pA), rm0
            mulps rB0, rm0
            addps rm0, rC2
            movaps 3184(pA), rm0
            mulps rB0, rm0
            addps rm0, rC3
            movaps 3200(pA), rm0
            mulps rB0, rm0
            addps rm0, rC4
            movaps 3216(pA), rm0
            mulps rB0, rm0
            addps rm0, rC5
            movaps 3232(pA), rm0
            mulps rB0, rm0
            addps rm0, rC6
            movaps 3248(pA), rm0
            mulps rB0, rm0
            addps rm0, rC7
            movaps 3264(pA), rm0
            mulps rB0, rm0
            addps rm0, rC8
            movaps  3280(pA), rm0
            mulps rB0, rm0
            addps rm0, rC9
            movaps 3296(pA), rm0
            mulps rB0, rm0
            addps rm0, rC10
            mulps 3312(pA), rB0
            addps rB0, rC11
         #endif
         #if KB > 72
            movaps 160(pB), rB0
            movaps 3328(pA), rm0
            mulps rB0, rm0
            addps rm0, rC0
            movaps 3344(pA), rm0
            mulps rB0, rm0
            addps rm0, rC1
            movaps 3360(pA), rm0
            mulps rB0, rm0
            addps rm0, rC2
            movaps 3376(pA), rm0
            mulps rB0, rm0
            addps rm0, rC3
            movaps 3392(pA), rm0
            mulps rB0, rm0
            addps rm0, rC4
            movaps 3408(pA), rm0
            mulps rB0, rm0
            addps rm0, rC5
            movaps 3424(pA), rm0
            mulps rB0, rm0
            addps rm0, rC6
            movaps 3440(pA), rm0
            mulps rB0, rm0
            addps rm0, rC7
            movaps 3456(pA), rm0
            mulps rB0, rm0
            addps rm0, rC8
            movaps  3472(pA), rm0
            mulps rB0, rm0
            addps rm0, rC9
            movaps 3488(pA), rm0
            mulps rB0, rm0
            addps rm0, rC10
            mulps 3504(pA), rB0
            addps rB0, rC11
         #endif
         #if KB > 76
            movaps 176(pB), rB0
            movaps 3520(pA), rm0
            mulps rB0, rm0
            addps rm0, rC0
            movaps 3536(pA), rm0
            mulps rB0, rm0
            addps rm0, rC1
            movaps 3552(pA), rm0
            mulps rB0, rm0
            addps rm0, rC2
            movaps 3568(pA), rm0
            mulps rB0, rm0
            addps rm0, rC3
            movaps 3584(pA), rm0
            mulps rB0, rm0
            addps rm0, rC4
            movaps 3600(pA), rm0
            mulps rB0, rm0
            addps rm0, rC5
            movaps 3616(pA), rm0
            mulps rB0, rm0
            addps rm0, rC6
            movaps 3632(pA), rm0
            mulps rB0, rm0
            addps rm0, rC7
            movaps 3648(pA), rm0
            mulps rB0, rm0
            addps rm0, rC8
            movaps  3664(pA), rm0
            mulps rB0, rm0
            addps rm0, rC9
            movaps 3680(pA), rm0
            mulps rB0, rm0
            addps rm0, rC10
            mulps 3696(pA), rB0
            addps rB0, rC11
         #endif
         #if KB > 80
            movaps 192(pB), rB0
            movaps 3712(pA), rm0
            mulps rB0, rm0
            addps rm0, rC0
            movaps 3728(pA), rm0
            mulps rB0, rm0
            addps rm0, rC1
            movaps 3744(pA), rm0
            mulps rB0, rm0
            addps rm0, rC2
            movaps 3760(pA), rm0
            mulps rB0, rm0
            addps rm0, rC3
            movaps 3776(pA), rm0
            mulps rB0, rm0
            addps rm0, rC4
            movaps 3792(pA), rm0
            mulps rB0, rm0
            addps rm0, rC5
            movaps 3808(pA), rm0
            mulps rB0, rm0
            addps rm0, rC6
            movaps 3824(pA), rm0
            mulps rB0, rm0
            addps rm0, rC7
            movaps 3840(pA), rm0
            mulps rB0, rm0
            addps rm0, rC8
            movaps  3856(pA), rm0
            mulps rB0, rm0
            addps rm0, rC9
            movaps 3872(pA), rm0
            mulps rB0, rm0
            addps rm0, rC10
            mulps 3888(pA), rB0
            addps rB0, rC11
         #endif
         #if KB > 84
            movaps 208(pB), rB0
            movaps 3904(pA), rm0
            mulps rB0, rm0
            addps rm0, rC0
            movaps 3920(pA), rm0
            mulps rB0, rm0
            addps rm0, rC1
            movaps 3936(pA), rm0
            mulps rB0, rm0
            addps rm0, rC2
            movaps 3952(pA), rm0
            mulps rB0, rm0
            addps rm0, rC3
            movaps 3968(pA), rm0
            mulps rB0, rm0
            addps rm0, rC4
            movaps 3984(pA), rm0
            mulps rB0, rm0
            addps rm0, rC5
            movaps 4000(pA), rm0
            mulps rB0, rm0
            addps rm0, rC6
            movaps 4016(pA), rm0
            mulps rB0, rm0
            addps rm0, rC7
            movaps 4032(pA), rm0
            mulps rB0, rm0
            addps rm0, rC8
            movaps  4048(pA), rm0
            mulps rB0, rm0
            addps rm0, rC9
            movaps 4064(pA), rm0
            mulps rB0, rm0
            addps rm0, rC10
            mulps 4080(pA), rB0
            addps rB0, rC11
         #endif
         #if KB > 88
            movaps 224(pB), rB0
            movaps 4096(pA), rm0
            mulps rB0, rm0
            addps rm0, rC0
            movaps 4112(pA), rm0
            mulps rB0, rm0
            addps rm0, rC1
            movaps 4128(pA), rm0
            mulps rB0, rm0
            addps rm0, rC2
            movaps 4144(pA), rm0
            mulps rB0, rm0
            addps rm0, rC3
            movaps 4160(pA), rm0
            mulps rB0, rm0
            addps rm0, rC4
            movaps 4176(pA), rm0
            mulps rB0, rm0
            addps rm0, rC5
            movaps 4192(pA), rm0
            mulps rB0, rm0
            addps rm0, rC6
            movaps 4208(pA), rm0
            mulps rB0, rm0
            addps rm0, rC7
            movaps 4224(pA), rm0
            mulps rB0, rm0
            addps rm0, rC8
            movaps  4240(pA), rm0
            mulps rB0, rm0
            addps rm0, rC9
            movaps 4256(pA), rm0
            mulps rB0, rm0
            addps rm0, rC10
            mulps 4272(pA), rB0
            addps rB0, rC11
         #endif
         #if KB > 92
            movaps 240(pB), rB0
            movaps 4288(pA), rm0
            mulps rB0, rm0
            addps rm0, rC0
            movaps 4304(pA), rm0
            mulps rB0, rm0
            addps rm0, rC1
            movaps 4320(pA), rm0
            mulps rB0, rm0
            addps rm0, rC2
            movaps 4336(pA), rm0
            mulps rB0, rm0
            addps rm0, rC3
            movaps 4352(pA), rm0
            mulps rB0, rm0
            addps rm0, rC4
            movaps 4368(pA), rm0
            mulps rB0, rm0
            addps rm0, rC5
            movaps 4384(pA), rm0
            mulps rB0, rm0
            addps rm0, rC6
            movaps 4400(pA), rm0
            mulps rB0, rm0
            addps rm0, rC7
            movaps 4416(pA), rm0
            mulps rB0, rm0
            addps rm0, rC8
            movaps  4432(pA), rm0
            mulps rB0, rm0
            addps rm0, rC9
            movaps 4448(pA), rm0
            mulps rB0, rm0
            addps rm0, rC10
            mulps 4464(pA), rB0
            addps rB0, rC11
         #endif
         #if KB > 96
            movaps 256(pB), rB0
            movaps 4480(pA), rm0
            mulps rB0, rm0
            addps rm0, rC0
            movaps 4496(pA), rm0
            mulps rB0, rm0
            addps rm0, rC1
            movaps 4512(pA), rm0
            mulps rB0, rm0
            addps rm0, rC2
            movaps 4528(pA), rm0
            mulps rB0, rm0
            addps rm0, rC3
            movaps 4544(pA), rm0
            mulps rB0, rm0
            addps rm0, rC4
            movaps 4560(pA), rm0
            mulps rB0, rm0
            addps rm0, rC5
            movaps 4576(pA), rm0
            mulps rB0, rm0
            addps rm0, rC6
            movaps 4592(pA), rm0
            mulps rB0, rm0
            addps rm0, rC7
            movaps 4608(pA), rm0
            mulps rB0, rm0
            addps rm0, rC8
            movaps  4624(pA), rm0
            mulps rB0, rm0
            addps rm0, rC9
            movaps 4640(pA), rm0
            mulps rB0, rm0
            addps rm0, rC10
            mulps 4656(pA), rB0
            addps rB0, rC11
         #endif
         #if KB > 100
            movaps 272(pB), rB0
            movaps 4672(pA), rm0
            mulps rB0, rm0
            addps rm0, rC0
            movaps 4688(pA), rm0
            mulps rB0, rm0
            addps rm0, rC1
            movaps 4704(pA), rm0
            mulps rB0, rm0
            addps rm0, rC2
            movaps 4720(pA), rm0
            mulps rB0, rm0
            addps rm0, rC3
            movaps 4736(pA), rm0
            mulps rB0, rm0
            addps rm0, rC4
            movaps 4752(pA), rm0
            mulps rB0, rm0
            addps rm0, rC5
            movaps 4768(pA), rm0
            mulps rB0, rm0
            addps rm0, rC6
            movaps 4784(pA), rm0
            mulps rB0, rm0
            addps rm0, rC7
            movaps 4800(pA), rm0
            mulps rB0, rm0
            addps rm0, rC8
            movaps  4816(pA), rm0
            mulps rB0, rm0
            addps rm0, rC9
            movaps 4832(pA), rm0
            mulps rB0, rm0
            addps rm0, rC10
            mulps 4848(pA), rB0
            addps rB0, rC11
         #endif
         #if KB > 104
            movaps 288(pB), rB0
            movaps 4864(pA), rm0
            mulps rB0, rm0
            addps rm0, rC0
            movaps 4880(pA), rm0
            mulps rB0, rm0
            addps rm0, rC1
            movaps 4896(pA), rm0
            mulps rB0, rm0
            addps rm0, rC2
            movaps 4912(pA), rm0
            mulps rB0, rm0
            addps rm0, rC3
            movaps 4928(pA), rm0
            mulps rB0, rm0
            addps rm0, rC4
            movaps 4944(pA), rm0
            mulps rB0, rm0
            addps rm0, rC5
            movaps 4960(pA), rm0
            mulps rB0, rm0
            addps rm0, rC6
            movaps 4976(pA), rm0
            mulps rB0, rm0
            addps rm0, rC7
            movaps 4992(pA), rm0
            mulps rB0, rm0
            addps rm0, rC8
            movaps  5008(pA), rm0
            mulps rB0, rm0
            addps rm0, rC9
            movaps 5024(pA), rm0
            mulps rB0, rm0
            addps rm0, rC10
            mulps 5040(pA), rB0
            addps rB0, rC11
         #endif
         #if KB > 108
            movaps 304(pB), rB0
            movaps 5056(pA), rm0
            mulps rB0, rm0
            addps rm0, rC0
            movaps 5072(pA), rm0
            mulps rB0, rm0
            addps rm0, rC1
            movaps 5088(pA), rm0
            mulps rB0, rm0
            addps rm0, rC2
            movaps 5104(pA), rm0
            mulps rB0, rm0
            addps rm0, rC3
            movaps 5120(pA), rm0
            mulps rB0, rm0
            addps rm0, rC4
            movaps 5136(pA), rm0
            mulps rB0, rm0
            addps rm0, rC5
            movaps 5152(pA), rm0
            mulps rB0, rm0
            addps rm0, rC6
            movaps 5168(pA), rm0
            mulps rB0, rm0
            addps rm0, rC7
            movaps 5184(pA), rm0
            mulps rB0, rm0
            addps rm0, rC8
            movaps  5200(pA), rm0
            mulps rB0, rm0
            addps rm0, rC9
            movaps 5216(pA), rm0
            mulps rB0, rm0
            addps rm0, rC10
            mulps 5232(pA), rB0
            addps rB0, rC11
         #endif
         #if KB > 112
            /*
             * Unrolled update, assembled only when KB > 112.
             * rB0 <- 16 bytes at 320(pB); then for i = 0..11:
             *    rC<i> += rB0 * (16 bytes at 5248+16*i(pA)), lane-wise on
             *    4 packed single-precision floats.
             * rm0 is scratch: it holds each A vector and then its product.
             */
            movaps 320(pB), rB0
            movaps 5248(pA), rm0
            mulps rB0, rm0
            addps rm0, rC0
            movaps 5264(pA), rm0
            mulps rB0, rm0
            addps rm0, rC1
            movaps 5280(pA), rm0
            mulps rB0, rm0
            addps rm0, rC2
            movaps 5296(pA), rm0
            mulps rB0, rm0
            addps rm0, rC3
            movaps 5312(pA), rm0
            mulps rB0, rm0
            addps rm0, rC4
            movaps 5328(pA), rm0
            mulps rB0, rm0
            addps rm0, rC5
            movaps 5344(pA), rm0
            mulps rB0, rm0
            addps rm0, rC6
            movaps 5360(pA), rm0
            mulps rB0, rm0
            addps rm0, rC7
            movaps 5376(pA), rm0
            mulps rB0, rm0
            addps rm0, rC8
            movaps  5392(pA), rm0
            mulps rB0, rm0
            addps rm0, rC9
            movaps 5408(pA), rm0
            mulps rB0, rm0
            addps rm0, rC10
            /*
             * 12th product: rB0 is not read again in this block, so the A
             * vector is multiplied straight into it (no rm0 load needed).
             */
            mulps 5424(pA), rB0
            addps rB0, rC11
         #endif
         #if KB > 116
            /*
             * Unrolled update, assembled only when KB > 116.
             * rB0 <- 16 bytes at 336(pB); then for i = 0..11:
             *    rC<i> += rB0 * (16 bytes at 5440+16*i(pA)), lane-wise on
             *    4 packed single-precision floats.
             * rm0 is scratch: it holds each A vector and then its product.
             */
            movaps 336(pB), rB0
            movaps 5440(pA), rm0
            mulps rB0, rm0
            addps rm0, rC0
            movaps 5456(pA), rm0
            mulps rB0, rm0
            addps rm0, rC1
            movaps 5472(pA), rm0
            mulps rB0, rm0
            addps rm0, rC2
            movaps 5488(pA), rm0
            mulps rB0, rm0
            addps rm0, rC3
            movaps 5504(pA), rm0
            mulps rB0, rm0
            addps rm0, rC4
            movaps 5520(pA), rm0
            mulps rB0, rm0
            addps rm0, rC5
            movaps 5536(pA), rm0
            mulps rB0, rm0
            addps rm0, rC6
            movaps 5552(pA), rm0
            mulps rB0, rm0
            addps rm0, rC7
            movaps 5568(pA), rm0
            mulps rB0, rm0
            addps rm0, rC8
            movaps  5584(pA), rm0
            mulps rB0, rm0
            addps rm0, rC9
            movaps 5600(pA), rm0
            mulps rB0, rm0
            addps rm0, rC10
            /*
             * 12th product: rB0 is not read again in this block, so the A
             * vector is multiplied straight into it (no rm0 load needed).
             */
            mulps 5616(pA), rB0
            addps rB0, rC11
         #endif
         #if KB > 120
            /*
             * Unrolled update, assembled only when KB > 120.
             * rB0 <- 16 bytes at 352(pB); then for i = 0..11:
             *    rC<i> += rB0 * (16 bytes at 5632+16*i(pA)), lane-wise on
             *    4 packed single-precision floats.
             * rm0 is scratch: it holds each A vector and then its product.
             */
            movaps 352(pB), rB0
            movaps 5632(pA), rm0
            mulps rB0, rm0
            addps rm0, rC0
            movaps 5648(pA), rm0
            mulps rB0, rm0
            addps rm0, rC1
            movaps 5664(pA), rm0
            mulps rB0, rm0
            addps rm0, rC2
            movaps 5680(pA), rm0
            mulps rB0, rm0
            addps rm0, rC3
            movaps 5696(pA), rm0
            mulps rB0, rm0
            addps rm0, rC4
            movaps 5712(pA), rm0
            mulps rB0, rm0
            addps rm0, rC5
            movaps 5728(pA), rm0
            mulps rB0, rm0
            addps rm0, rC6
            movaps 5744(pA), rm0
            mulps rB0, rm0
            addps rm0, rC7
            movaps 5760(pA), rm0
            mulps rB0, rm0
            addps rm0, rC8
            movaps  5776(pA), rm0
            mulps rB0, rm0
            addps rm0, rC9
            movaps 5792(pA), rm0
            mulps rB0, rm0
            addps rm0, rC10
            /*
             * 12th product: rB0 is not read again in this block, so the A
             * vector is multiplied straight into it (no rm0 load needed).
             */
            mulps 5808(pA), rB0
            addps rB0, rC11
         #endif
         #if KB > 124
            /*
             * Unrolled update, assembled only when KB > 124.
             * rB0 <- 16 bytes at 368(pB); then for i = 0..11:
             *    rC<i> += rB0 * (16 bytes at 5824+16*i(pA)), lane-wise on
             *    4 packed single-precision floats.
             * rm0 is scratch: it holds each A vector and then its product.
             */
            movaps 368(pB), rB0
            movaps 5824(pA), rm0
            mulps rB0, rm0
            addps rm0, rC0
            movaps 5840(pA), rm0
            mulps rB0, rm0
            addps rm0, rC1
            movaps 5856(pA), rm0
            mulps rB0, rm0
            addps rm0, rC2
            movaps 5872(pA), rm0
            mulps rB0, rm0
            addps rm0, rC3
            movaps 5888(pA), rm0
            mulps rB0, rm0
            addps rm0, rC4
            movaps 5904(pA), rm0
            mulps rB0, rm0
            addps rm0, rC5
            movaps 5920(pA), rm0
            mulps rB0, rm0
            addps rm0, rC6
            movaps 5936(pA), rm0
            mulps rB0, rm0
            addps rm0, rC7
            movaps 5952(pA), rm0
            mulps rB0, rm0
            addps rm0, rC8
            movaps  5968(pA), rm0
            mulps rB0, rm0
            addps rm0, rC9
            movaps 5984(pA), rm0
            mulps rB0, rm0
            addps rm0, rC10
            /*
             * 12th product: rB0 is not read again in this block, so the A
             * vector is multiplied straight into it (no rm0 load needed).
             */
            mulps 6000(pA), rB0
            addps rB0, rC11
         #endif
         #if KB > 128
            /*
             * Unrolled update, assembled only when KB > 128.
             * rB0 <- 16 bytes at 384(pB); then for i = 0..11:
             *    rC<i> += rB0 * (16 bytes at 6016+16*i(pA)), lane-wise on
             *    4 packed single-precision floats.
             * rm0 is scratch: it holds each A vector and then its product.
             */
            movaps 384(pB), rB0
            movaps 6016(pA), rm0
            mulps rB0, rm0
            addps rm0, rC0
            movaps 6032(pA), rm0
            mulps rB0, rm0
            addps rm0, rC1
            movaps 6048(pA), rm0
            mulps rB0, rm0
            addps rm0, rC2
            movaps 6064(pA), rm0
            mulps rB0, rm0
            addps rm0, rC3
            movaps 6080(pA), rm0
            mulps rB0, rm0
            addps rm0, rC4
            movaps 6096(pA), rm0
            mulps rB0, rm0
            addps rm0, rC5
            movaps 6112(pA), rm0
            mulps rB0, rm0
            addps rm0, rC6
            movaps 6128(pA), rm0
            mulps rB0, rm0
            addps rm0, rC7
            movaps 6144(pA), rm0
            mulps rB0, rm0
            addps rm0, rC8
            movaps  6160(pA), rm0
            mulps rB0, rm0
            addps rm0, rC9
            movaps 6176(pA), rm0
            mulps rB0, rm0
            addps rm0, rC10
            /*
             * 12th product: rB0 is not read again in this block, so the A
             * vector is multiplied straight into it (no rm0 load needed).
             */
            mulps 6192(pA), rB0
            addps rB0, rC11
         #endif
         #if KB > 132
            /*
             * Unrolled update, assembled only when KB > 132.
             * rB0 <- 16 bytes at 400(pB); then for i = 0..11:
             *    rC<i> += rB0 * (16 bytes at 6208+16*i(pA)), lane-wise on
             *    4 packed single-precision floats.
             * rm0 is scratch: it holds each A vector and then its product.
             */
            movaps 400(pB), rB0
            movaps 6208(pA), rm0
            mulps rB0, rm0
            addps rm0, rC0
            movaps 6224(pA), rm0
            mulps rB0, rm0
            addps rm0, rC1
            movaps 6240(pA), rm0
            mulps rB0, rm0
            addps rm0, rC2
            movaps 6256(pA), rm0
            mulps rB0, rm0
            addps rm0, rC3
            movaps 6272(pA), rm0
            mulps rB0, rm0
            addps rm0, rC4
            movaps 6288(pA), rm0
            mulps rB0, rm0
            addps rm0, rC5
            movaps 6304(pA), rm0
            mulps rB0, rm0
            addps rm0, rC6
            movaps 6320(pA), rm0
            mulps rB0, rm0
            addps rm0, rC7
            movaps 6336(pA), rm0
            mulps rB0, rm0
            addps rm0, rC8
            movaps  6352(pA), rm0
            mulps rB0, rm0
            addps rm0, rC9
            movaps 6368(pA), rm0
            mulps rB0, rm0
            addps rm0, rC10
            /*
             * 12th product: rB0 is not read again in this block, so the A
             * vector is multiplied straight into it (no rm0 load needed).
             */
            mulps 6384(pA), rB0
            addps rB0, rC11
         #endif
         #if KB > 136
            /*
             * Unrolled update, assembled only when KB > 136.
             * rB0 <- 16 bytes at 416(pB); then for i = 0..11:
             *    rC<i> += rB0 * (16 bytes at 6400+16*i(pA)), lane-wise on
             *    4 packed single-precision floats.
             * rm0 is scratch: it holds each A vector and then its product.
             */
            movaps 416(pB), rB0
            movaps 6400(pA), rm0
            mulps rB0, rm0
            addps rm0, rC0
            movaps 6416(pA), rm0
            mulps rB0, rm0
            addps rm0, rC1
            movaps 6432(pA), rm0
            mulps rB0, rm0
            addps rm0, rC2
            movaps 6448(pA), rm0
            mulps rB0, rm0
            addps rm0, rC3
            movaps 6464(pA), rm0
            mulps rB0, rm0
            addps rm0, rC4
            movaps 6480(pA), rm0
            mulps rB0, rm0
            addps rm0, rC5
            movaps 6496(pA), rm0
            mulps rB0, rm0
            addps rm0, rC6
            movaps 6512(pA), rm0
            mulps rB0, rm0
            addps rm0, rC7
            movaps 6528(pA), rm0
            mulps rB0, rm0
            addps rm0, rC8
            movaps  6544(pA), rm0
            mulps rB0, rm0
            addps rm0, rC9
            movaps 6560(pA), rm0
            mulps rB0, rm0
            addps rm0, rC10
            /*
             * 12th product: rB0 is not read again in this block, so the A
             * vector is multiplied straight into it (no rm0 load needed).
             */
            mulps 6576(pA), rB0
            addps rB0, rC11
         #endif
         #if KB > 140
            /*
             * Unrolled update, assembled only when KB > 140.
             * rB0 <- 16 bytes at 432(pB); then for i = 0..11:
             *    rC<i> += rB0 * (16 bytes at 6592+16*i(pA)), lane-wise on
             *    4 packed single-precision floats.
             * rm0 is scratch: it holds each A vector and then its product.
             */
            movaps 432(pB), rB0
            movaps 6592(pA), rm0
            mulps rB0, rm0
            addps rm0, rC0
            movaps 6608(pA), rm0
            mulps rB0, rm0
            addps rm0, rC1
            movaps 6624(pA), rm0
            mulps rB0, rm0
            addps rm0, rC2
            movaps 6640(pA), rm0
            mulps rB0, rm0
            addps rm0, rC3
            movaps 6656(pA), rm0
            mulps rB0, rm0
            addps rm0, rC4
            movaps 6672(pA), rm0
            mulps rB0, rm0
            addps rm0, rC5
            movaps 6688(pA), rm0
            mulps rB0, rm0
            addps rm0, rC6
            movaps 6704(pA), rm0
            mulps rB0, rm0
            addps rm0, rC7
            movaps 6720(pA), rm0
            mulps rB0, rm0
            addps rm0, rC8
            movaps  6736(pA), rm0
            mulps rB0, rm0
            addps rm0, rC9
            movaps 6752(pA), rm0
            mulps rB0, rm0
            addps rm0, rC10
            /*
             * 12th product: rB0 is not read again in this block, so the A
             * vector is multiplied straight into it (no rm0 load needed).
             */
            mulps 6768(pA), rB0
            addps rB0, rC11
         #endif
         #if KB > 144
            /*
             * Unrolled update, assembled only when KB > 144.
             * rB0 <- 16 bytes at 448(pB); then for i = 0..11:
             *    rC<i> += rB0 * (16 bytes at 6784+16*i(pA)), lane-wise on
             *    4 packed single-precision floats.
             * rm0 is scratch: it holds each A vector and then its product.
             */
            movaps 448(pB), rB0
            movaps 6784(pA), rm0
            mulps rB0, rm0
            addps rm0, rC0
            movaps 6800(pA), rm0
            mulps rB0, rm0
            addps rm0, rC1
            movaps 6816(pA), rm0
            mulps rB0, rm0
            addps rm0, rC2
            movaps 6832(pA), rm0
            mulps rB0, rm0
            addps rm0, rC3
            movaps 6848(pA), rm0
            mulps rB0, rm0
            addps rm0, rC4
            movaps 6864(pA), rm0
            mulps rB0, rm0
            addps rm0, rC5
            movaps 6880(pA), rm0
            mulps rB0, rm0
            addps rm0, rC6
            movaps 6896(pA), rm0
            mulps rB0, rm0
            addps rm0, rC7
            movaps 6912(pA), rm0
            mulps rB0, rm0
            addps rm0, rC8
            movaps  6928(pA), rm0
            mulps rB0, rm0
            addps rm0, rC9
            movaps 6944(pA), rm0
            mulps rB0, rm0
            addps rm0, rC10
            /*
             * 12th product: rB0 is not read again in this block, so the A
             * vector is multiplied straight into it (no rm0 load needed).
             */
            mulps 6960(pA), rB0
            addps rB0, rC11
         #endif
         #if KB > 148
            /*
             * Unrolled update, assembled only when KB > 148.
             * rB0 <- 16 bytes at 464(pB); then for i = 0..11:
             *    rC<i> += rB0 * (16 bytes at 6976+16*i(pA)), lane-wise on
             *    4 packed single-precision floats.
             * rm0 is scratch: it holds each A vector and then its product.
             */
            movaps 464(pB), rB0
            movaps 6976(pA), rm0
            mulps rB0, rm0
            addps rm0, rC0
            movaps 6992(pA), rm0
            mulps rB0, rm0
            addps rm0, rC1
            movaps 7008(pA), rm0
            mulps rB0, rm0
            addps rm0, rC2
            movaps 7024(pA), rm0
            mulps rB0, rm0
            addps rm0, rC3
            movaps 7040(pA), rm0
            mulps rB0, rm0
            addps rm0, rC4
            movaps 7056(pA), rm0
            mulps rB0, rm0
            addps rm0, rC5
            movaps 7072(pA), rm0
            mulps rB0, rm0
            addps rm0, rC6
            movaps 7088(pA), rm0
            mulps rB0, rm0
            addps rm0, rC7
            movaps 7104(pA), rm0
            mulps rB0, rm0
            addps rm0, rC8
            movaps  7120(pA), rm0
            mulps rB0, rm0
            addps rm0, rC9
            movaps 7136(pA), rm0
            mulps rB0, rm0
            addps rm0, rC10
            /*
             * 12th product: rB0 is not read again in this block, so the A
             * vector is multiplied straight into it (no rm0 load needed).
             */
            mulps 7152(pA), rB0
            addps rB0, rC11
         #endif
         #if KB > 152
            /*
             * Unrolled update, assembled only when KB > 152.
             * rB0 <- 16 bytes at 480(pB); then for i = 0..11:
             *    rC<i> += rB0 * (16 bytes at 7168+16*i(pA)), lane-wise on
             *    4 packed single-precision floats.
             * rm0 is scratch: it holds each A vector and then its product.
             */
            movaps 480(pB), rB0
            movaps 7168(pA), rm0
            mulps rB0, rm0
            addps rm0, rC0
            movaps 7184(pA), rm0
            mulps rB0, rm0
            addps rm0, rC1
            movaps 7200(pA), rm0
            mulps rB0, rm0
            addps rm0, rC2
            movaps 7216(pA), rm0
            mulps rB0, rm0
            addps rm0, rC3
            movaps 7232(pA), rm0
            mulps rB0, rm0
            addps rm0, rC4
            movaps 7248(pA), rm0
            mulps rB0, rm0
            addps rm0, rC5
            movaps 7264(pA), rm0
            mulps rB0, rm0
            addps rm0, rC6
            movaps 7280(pA), rm0
            mulps rB0, rm0
            addps rm0, rC7
            movaps 7296(pA), rm0
            mulps rB0, rm0
            addps rm0, rC8
            movaps  7312(pA), rm0
            mulps rB0, rm0
            addps rm0, rC9
            movaps 7328(pA), rm0
            mulps rB0, rm0
            addps rm0, rC10
            /*
             * 12th product: rB0 is not read again in this block, so the A
             * vector is multiplied straight into it (no rm0 load needed).
             */
            mulps 7344(pA), rB0
            addps rB0, rC11
         #endif
         #if KB > 156
            /*
             * Unrolled update, assembled only when KB > 156.
             * rB0 <- 16 bytes at 496(pB); then for i = 0..11:
             *    rC<i> += rB0 * (16 bytes at 7360+16*i(pA)), lane-wise on
             *    4 packed single-precision floats.
             * rm0 is scratch: it holds each A vector and then its product.
             */
            movaps 496(pB), rB0
            movaps 7360(pA), rm0
            mulps rB0, rm0
            addps rm0, rC0
            movaps 7376(pA), rm0
            mulps rB0, rm0
            addps rm0, rC1
            movaps 7392(pA), rm0
            mulps rB0, rm0
            addps rm0, rC2
            movaps 7408(pA), rm0
            mulps rB0, rm0
            addps rm0, rC3
            movaps 7424(pA), rm0
            mulps rB0, rm0
            addps rm0, rC4
            movaps 7440(pA), rm0
            mulps rB0, rm0
            addps rm0, rC5
            movaps 7456(pA), rm0
            mulps rB0, rm0
            addps rm0, rC6
            movaps 7472(pA), rm0
            mulps rB0, rm0
            addps rm0, rC7
            movaps 7488(pA), rm0
            mulps rB0, rm0
            addps rm0, rC8
            movaps  7504(pA), rm0
            mulps rB0, rm0
            addps rm0, rC9
            movaps 7520(pA), rm0
            mulps rB0, rm0
            addps rm0, rC10
            /*
             * 12th product: rB0 is not read again in this block, so the A
             * vector is multiplied straight into it (no rm0 load needed).
             */
            mulps 7536(pA), rB0
            addps rB0, rC11
         #endif
         #if KB > 160
            /*
             * Unrolled update, assembled only when KB > 160.
             * rB0 <- 16 bytes at 512(pB); then for i = 0..11:
             *    rC<i> += rB0 * (16 bytes at 7552+16*i(pA)), lane-wise on
             *    4 packed single-precision floats.
             * rm0 is scratch: it holds each A vector and then its product.
             */
            movaps 512(pB), rB0
            movaps 7552(pA), rm0
            mulps rB0, rm0
            addps rm0, rC0
            movaps 7568(pA), rm0
            mulps rB0, rm0
            addps rm0, rC1
            movaps 7584(pA), rm0
            mulps rB0, rm0
            addps rm0, rC2
            movaps 7600(pA), rm0
            mulps rB0, rm0
            addps rm0, rC3
            movaps 7616(pA), rm0
            mulps rB0, rm0
            addps rm0, rC4
            movaps 7632(pA), rm0
            mulps rB0, rm0
            addps rm0, rC5
            movaps 7648(pA), rm0
            mulps rB0, rm0
            addps rm0, rC6
            movaps 7664(pA), rm0
            mulps rB0, rm0
            addps rm0, rC7
            movaps 7680(pA), rm0
            mulps rB0, rm0
            addps rm0, rC8
            movaps  7696(pA), rm0
            mulps rB0, rm0
            addps rm0, rC9
            movaps 7712(pA), rm0
            mulps rB0, rm0
            addps rm0, rC10
            /*
             * 12th product: rB0 is not read again in this block, so the A
             * vector is multiplied straight into it (no rm0 load needed).
             */
            mulps 7728(pA), rB0
            addps rB0, rC11
         #endif
         #if KB > 164
            /*
             * Unrolled update, assembled only when KB > 164.
             * rB0 <- 16 bytes at 528(pB); then for i = 0..11:
             *    rC<i> += rB0 * (16 bytes at 7744+16*i(pA)), lane-wise on
             *    4 packed single-precision floats.
             * rm0 is scratch: it holds each A vector and then its product.
             */
            movaps 528(pB), rB0
            movaps 7744(pA), rm0
            mulps rB0, rm0
            addps rm0, rC0
            movaps 7760(pA), rm0
            mulps rB0, rm0
            addps rm0, rC1
            movaps 7776(pA), rm0
            mulps rB0, rm0
            addps rm0, rC2
            movaps 7792(pA), rm0
            mulps rB0, rm0
            addps rm0, rC3
            movaps 7808(pA), rm0
            mulps rB0, rm0
            addps rm0, rC4
            movaps 7824(pA), rm0
            mulps rB0, rm0
            addps rm0, rC5
            movaps 7840(pA), rm0
            mulps rB0, rm0
            addps rm0, rC6
            movaps 7856(pA), rm0
            mulps rB0, rm0
            addps rm0, rC7
            movaps 7872(pA), rm0
            mulps rB0, rm0
            addps rm0, rC8
            movaps  7888(pA), rm0
            mulps rB0, rm0
            addps rm0, rC9
            movaps 7904(pA), rm0
            mulps rB0, rm0
            addps rm0, rC10
            /*
             * 12th product: rB0 is not read again in this block, so the A
             * vector is multiplied straight into it (no rm0 load needed).
             */
            mulps 7920(pA), rB0
            addps rB0, rC11
         #endif
         #if KB > 168
            /*
             * Unrolled update, assembled only when KB > 168.
             * rB0 <- 16 bytes at 544(pB); then for i = 0..11:
             *    rC<i> += rB0 * (16 bytes at 7936+16*i(pA)), lane-wise on
             *    4 packed single-precision floats.
             * rm0 is scratch: it holds each A vector and then its product.
             */
            movaps 544(pB), rB0
            movaps 7936(pA), rm0
            mulps rB0, rm0
            addps rm0, rC0
            movaps 7952(pA), rm0
            mulps rB0, rm0
            addps rm0, rC1
            movaps 7968(pA), rm0
            mulps rB0, rm0
            addps rm0, rC2
            movaps 7984(pA), rm0
            mulps rB0, rm0
            addps rm0, rC3
            movaps 8000(pA), rm0
            mulps rB0, rm0
            addps rm0, rC4
            movaps 8016(pA), rm0
            mulps rB0, rm0
            addps rm0, rC5
            movaps 8032(pA), rm0
            mulps rB0, rm0
            addps rm0, rC6
            movaps 8048(pA), rm0
            mulps rB0, rm0
            addps rm0, rC7
            movaps 8064(pA), rm0
            mulps rB0, rm0
            addps rm0, rC8
            movaps  8080(pA), rm0
            mulps rB0, rm0
            addps rm0, rC9
            movaps 8096(pA), rm0
            mulps rB0, rm0
            addps rm0, rC10
            /*
             * 12th product: rB0 is not read again in this block, so the A
             * vector is multiplied straight into it (no rm0 load needed).
             */
            mulps 8112(pA), rB0
            addps rB0, rC11
         #endif
         #if KB > 172
            /*
             * Unrolled update, assembled only when KB > 172.
             * rB0 <- 16 bytes at 560(pB); then for i = 0..11:
             *    rC<i> += rB0 * (16 bytes at 8128+16*i(pA)), lane-wise on
             *    4 packed single-precision floats.
             * rm0 is scratch: it holds each A vector and then its product.
             */
            movaps 560(pB), rB0
            movaps 8128(pA), rm0
            mulps rB0, rm0
            addps rm0, rC0
            movaps 8144(pA), rm0
            mulps rB0, rm0
            addps rm0, rC1
            movaps 8160(pA), rm0
            mulps rB0, rm0
            addps rm0, rC2
            movaps 8176(pA), rm0
            mulps rB0, rm0
            addps rm0, rC3
            movaps 8192(pA), rm0
            mulps rB0, rm0
            addps rm0, rC4
            movaps 8208(pA), rm0
            mulps rB0, rm0
            addps rm0, rC5
            movaps 8224(pA), rm0
            mulps rB0, rm0
            addps rm0, rC6
            movaps 8240(pA), rm0
            mulps rB0, rm0
            addps rm0, rC7
            movaps 8256(pA), rm0
            mulps rB0, rm0
            addps rm0, rC8
            movaps  8272(pA), rm0
            mulps rB0, rm0
            addps rm0, rC9
            movaps 8288(pA), rm0
            mulps rB0, rm0
            addps rm0, rC10
            /*
             * 12th product: rB0 is not read again in this block, so the A
             * vector is multiplied straight into it (no rm0 load needed).
             */
            mulps 8304(pA), rB0
            addps rB0, rC11
         #endif
         #if KB > 176
            /*
             * Unrolled update, assembled only when KB > 176.
             * rB0 <- 16 bytes at 576(pB); then for i = 0..11:
             *    rC<i> += rB0 * (16 bytes at 8320+16*i(pA)), lane-wise on
             *    4 packed single-precision floats.
             * rm0 is scratch: it holds each A vector and then its product.
             */
            movaps 576(pB), rB0
            movaps 8320(pA), rm0
            mulps rB0, rm0
            addps rm0, rC0
            movaps 8336(pA), rm0
            mulps rB0, rm0
            addps rm0, rC1
            movaps 8352(pA), rm0
            mulps rB0, rm0
            addps rm0, rC2
            movaps 8368(pA), rm0
            mulps rB0, rm0
            addps rm0, rC3
            movaps 8384(pA), rm0
            mulps rB0, rm0
            addps rm0, rC4
            movaps 8400(pA), rm0
            mulps rB0, rm0
            addps rm0, rC5
            movaps 8416(pA), rm0
            mulps rB0, rm0
            addps rm0, rC6
            movaps 8432(pA), rm0
            mulps rB0, rm0
            addps rm0, rC7
            movaps 8448(pA), rm0
            mulps rB0, rm0
            addps rm0, rC8
            movaps  8464(pA), rm0
            mulps rB0, rm0
            addps rm0, rC9
            movaps 8480(pA), rm0
            mulps rB0, rm0
            addps rm0, rC10
            /*
             * 12th product: rB0 is not read again in this block, so the A
             * vector is multiplied straight into it (no rm0 load needed).
             */
            mulps 8496(pA), rB0
            addps rB0, rC11
         #endif
         #if KB > 180
            /*
             * Unrolled update, assembled only when KB > 180.
             * rB0 <- 16 bytes at 592(pB); then for i = 0..11:
             *    rC<i> += rB0 * (16 bytes at 8512+16*i(pA)), lane-wise on
             *    4 packed single-precision floats.
             * rm0 is scratch: it holds each A vector and then its product.
             */
            movaps 592(pB), rB0
            movaps 8512(pA), rm0
            mulps rB0, rm0
            addps rm0, rC0
            movaps 8528(pA), rm0
            mulps rB0, rm0
            addps rm0, rC1
            movaps 8544(pA), rm0
            mulps rB0, rm0
            addps rm0, rC2
            movaps 8560(pA), rm0
            mulps rB0, rm0
            addps rm0, rC3
            movaps 8576(pA), rm0
            mulps rB0, rm0
            addps rm0, rC4
            movaps 8592(pA), rm0
            mulps rB0, rm0
            addps rm0, rC5
            movaps 8608(pA), rm0
            mulps rB0, rm0
            addps rm0, rC6
            movaps 8624(pA), rm0
            mulps rB0, rm0
            addps rm0, rC7
            movaps 8640(pA), rm0
            mulps rB0, rm0
            addps rm0, rC8
            movaps  8656(pA), rm0
            mulps rB0, rm0
            addps rm0, rC9
            movaps 8672(pA), rm0
            mulps rB0, rm0
            addps rm0, rC10
            /*
             * 12th product: rB0 is not read again in this block, so the A
             * vector is multiplied straight into it (no rm0 load needed).
             */
            mulps 8688(pA), rB0
            addps rB0, rC11
         #endif
         #if KB > 184
            /*
             * Unrolled update, assembled only when KB > 184.
             * rB0 <- 16 bytes at 608(pB); then for i = 0..11:
             *    rC<i> += rB0 * (16 bytes at 8704+16*i(pA)), lane-wise on
             *    4 packed single-precision floats.
             * rm0 is scratch: it holds each A vector and then its product.
             */
            movaps 608(pB), rB0
            movaps 8704(pA), rm0
            mulps rB0, rm0
            addps rm0, rC0
            movaps 8720(pA), rm0
            mulps rB0, rm0
            addps rm0, rC1
            movaps 8736(pA), rm0
            mulps rB0, rm0
            addps rm0, rC2
            movaps 8752(pA), rm0
            mulps rB0, rm0
            addps rm0, rC3
            movaps 8768(pA), rm0
            mulps rB0, rm0
            addps rm0, rC4
            movaps 8784(pA), rm0
            mulps rB0, rm0
            addps rm0, rC5
            movaps 8800(pA), rm0
            mulps rB0, rm0
            addps rm0, rC6
            movaps 8816(pA), rm0
            mulps rB0, rm0
            addps rm0, rC7
            movaps 8832(pA), rm0
            mulps rB0, rm0
            addps rm0, rC8
            movaps  8848(pA), rm0
            mulps rB0, rm0
            addps rm0, rC9
            movaps 8864(pA), rm0
            mulps rB0, rm0
            addps rm0, rC10
            /*
             * 12th product: rB0 is not read again in this block, so the A
             * vector is multiplied straight into it (no rm0 load needed).
             */
            mulps 8880(pA), rB0
            addps rB0, rC11
         #endif
         #if KB > 188
            /*
             * Unrolled update, assembled only when KB > 188.
             * rB0 <- 16 bytes at 624(pB); then for i = 0..11:
             *    rC<i> += rB0 * (16 bytes at 8896+16*i(pA)), lane-wise on
             *    4 packed single-precision floats.
             * rm0 is scratch: it holds each A vector and then its product.
             */
            movaps 624(pB), rB0
            movaps 8896(pA), rm0
            mulps rB0, rm0
            addps rm0, rC0
            movaps 8912(pA), rm0
            mulps rB0, rm0
            addps rm0, rC1
            movaps 8928(pA), rm0
            mulps rB0, rm0
            addps rm0, rC2
            movaps 8944(pA), rm0
            mulps rB0, rm0
            addps rm0, rC3
            movaps 8960(pA), rm0
            mulps rB0, rm0
            addps rm0, rC4
            movaps 8976(pA), rm0
            mulps rB0, rm0
            addps rm0, rC5
            movaps 8992(pA), rm0
            mulps rB0, rm0
            addps rm0, rC6
            movaps 9008(pA), rm0
            mulps rB0, rm0
            addps rm0, rC7
            movaps 9024(pA), rm0
            mulps rB0, rm0
            addps rm0, rC8
            movaps  9040(pA), rm0
            mulps rB0, rm0
            addps rm0, rC9
            movaps 9056(pA), rm0
            mulps rB0, rm0
            addps rm0, rC10
            /*
             * 12th product: rB0 is not read again in this block, so the A
             * vector is multiplied straight into it (no rm0 load needed).
             */
            mulps 9072(pA), rB0
            addps rB0, rC11
         #endif
         #if KB > 192
            /*
             * Unrolled update, assembled only when KB > 192.
             * rB0 <- 16 bytes at 640(pB); then for i = 0..11:
             *    rC<i> += rB0 * (16 bytes at 9088+16*i(pA)), lane-wise on
             *    4 packed single-precision floats.
             * rm0 is scratch: it holds each A vector and then its product.
             */
            movaps 640(pB), rB0
            movaps 9088(pA), rm0
            mulps rB0, rm0
            addps rm0, rC0
            movaps 9104(pA), rm0
            mulps rB0, rm0
            addps rm0, rC1
            movaps 9120(pA), rm0
            mulps rB0, rm0
            addps rm0, rC2
            movaps 9136(pA), rm0
            mulps rB0, rm0
            addps rm0, rC3
            movaps 9152(pA), rm0
            mulps rB0, rm0
            addps rm0, rC4
            movaps 9168(pA), rm0
            mulps rB0, rm0
            addps rm0, rC5
            movaps 9184(pA), rm0
            mulps rB0, rm0
            addps rm0, rC6
            movaps 9200(pA), rm0
            mulps rB0, rm0
            addps rm0, rC7
            movaps 9216(pA), rm0
            mulps rB0, rm0
            addps rm0, rC8
            movaps  9232(pA), rm0
            mulps rB0, rm0
            addps rm0, rC9
            movaps 9248(pA), rm0
            mulps rB0, rm0
            addps rm0, rC10
            /*
             * 12th product: rB0 is not read again in this block, so the A
             * vector is multiplied straight into it (no rm0 load needed).
             */
            mulps 9264(pA), rB0
            addps rB0, rC11
         #endif
         #if KB > 196
            /*
             * Unrolled update, assembled only when KB > 196.
             * rB0 <- 16 bytes at 656(pB); then for i = 0..11:
             *    rC<i> += rB0 * (16 bytes at 9280+16*i(pA)), lane-wise on
             *    4 packed single-precision floats.
             * rm0 is scratch: it holds each A vector and then its product.
             */
            movaps 656(pB), rB0
            movaps 9280(pA), rm0
            mulps rB0, rm0
            addps rm0, rC0
            movaps 9296(pA), rm0
            mulps rB0, rm0
            addps rm0, rC1
            movaps 9312(pA), rm0
            mulps rB0, rm0
            addps rm0, rC2
            movaps 9328(pA), rm0
            mulps rB0, rm0
            addps rm0, rC3
            movaps 9344(pA), rm0
            mulps rB0, rm0
            addps rm0, rC4
            movaps 9360(pA), rm0
            mulps rB0, rm0
            addps rm0, rC5
            movaps 9376(pA), rm0
            mulps rB0, rm0
            addps rm0, rC6
            movaps 9392(pA), rm0
            mulps rB0, rm0
            addps rm0, rC7
            movaps 9408(pA), rm0
            mulps rB0, rm0
            addps rm0, rC8
            movaps  9424(pA), rm0
            mulps rB0, rm0
            addps rm0, rC9
            movaps 9440(pA), rm0
            mulps rB0, rm0
            addps rm0, rC10
            /*
             * 12th product: rB0 is not read again in this block, so the A
             * vector is multiplied straight into it (no rm0 load needed).
             */
            mulps 9456(pA), rB0
            addps rB0, rC11
         #endif
         #if KB > 200
            /*
             * Unrolled update, assembled only when KB > 200.
             * rB0 <- 16 bytes at 672(pB); then for i = 0..11:
             *    rC<i> += rB0 * (16 bytes at 9472+16*i(pA)), lane-wise on
             *    4 packed single-precision floats.
             * rm0 is scratch: it holds each A vector and then its product.
             */
            movaps 672(pB), rB0
            movaps 9472(pA), rm0
            mulps rB0, rm0
            addps rm0, rC0
            movaps 9488(pA), rm0
            mulps rB0, rm0
            addps rm0, rC1
            movaps 9504(pA), rm0
            mulps rB0, rm0
            addps rm0, rC2
            movaps 9520(pA), rm0
            mulps rB0, rm0
            addps rm0, rC3
            movaps 9536(pA), rm0
            mulps rB0, rm0
            addps rm0, rC4
            movaps 9552(pA), rm0
            mulps rB0, rm0
            addps rm0, rC5
            movaps 9568(pA), rm0
            mulps rB0, rm0
            addps rm0, rC6
            movaps 9584(pA), rm0
            mulps rB0, rm0
            addps rm0, rC7
            movaps 9600(pA), rm0
            mulps rB0, rm0
            addps rm0, rC8
            movaps  9616(pA), rm0
            mulps rB0, rm0
            addps rm0, rC9
            movaps 9632(pA), rm0
            mulps rB0, rm0
            addps rm0, rC10
            /*
             * 12th product: rB0 is not read again in this block, so the A
             * vector is multiplied straight into it (no rm0 load needed).
             */
            mulps 9648(pA), rB0
            addps rB0, rC11
         #endif
         #if KB > 204
            /*
             * Unrolled update, assembled only when KB > 204.
             * rB0 <- 16 bytes at 688(pB); then for i = 0..11:
             *    rC<i> += rB0 * (16 bytes at 9664+16*i(pA)), lane-wise on
             *    4 packed single-precision floats.
             * rm0 is scratch: it holds each A vector and then its product.
             */
            movaps 688(pB), rB0
            movaps 9664(pA), rm0
            mulps rB0, rm0
            addps rm0, rC0
            movaps 9680(pA), rm0
            mulps rB0, rm0
            addps rm0, rC1
            movaps 9696(pA), rm0
            mulps rB0, rm0
            addps rm0, rC2
            movaps 9712(pA), rm0
            mulps rB0, rm0
            addps rm0, rC3
            movaps 9728(pA), rm0
            mulps rB0, rm0
            addps rm0, rC4
            movaps 9744(pA), rm0
            mulps rB0, rm0
            addps rm0, rC5
            movaps 9760(pA), rm0
            mulps rB0, rm0
            addps rm0, rC6
            movaps 9776(pA), rm0
            mulps rB0, rm0
            addps rm0, rC7
            movaps 9792(pA), rm0
            mulps rB0, rm0
            addps rm0, rC8
            movaps  9808(pA), rm0
            mulps rB0, rm0
            addps rm0, rC9
            movaps 9824(pA), rm0
            mulps rB0, rm0
            addps rm0, rC10
            /*
             * 12th product: rB0 is not read again in this block, so the A
             * vector is multiplied straight into it (no rm0 load needed).
             */
            mulps 9840(pA), rB0
            addps rB0, rC11
         #endif
         #if KB > 208
            /*
             * Unrolled update, assembled only when KB > 208.
             * rB0 <- 16 bytes at 704(pB); then for i = 0..11:
             *    rC<i> += rB0 * (16 bytes at 9856+16*i(pA)), lane-wise on
             *    4 packed single-precision floats.
             * rm0 is scratch: it holds each A vector and then its product.
             */
            movaps 704(pB), rB0
            movaps 9856(pA), rm0
            mulps rB0, rm0
            addps rm0, rC0
            movaps 9872(pA), rm0
            mulps rB0, rm0
            addps rm0, rC1
            movaps 9888(pA), rm0
            mulps rB0, rm0
            addps rm0, rC2
            movaps 9904(pA), rm0
            mulps rB0, rm0
            addps rm0, rC3
            movaps 9920(pA), rm0
            mulps rB0, rm0
            addps rm0, rC4
            movaps 9936(pA), rm0
            mulps rB0, rm0
            addps rm0, rC5
            movaps 9952(pA), rm0
            mulps rB0, rm0
            addps rm0, rC6
            movaps 9968(pA), rm0
            mulps rB0, rm0
            addps rm0, rC7
            movaps 9984(pA), rm0
            mulps rB0, rm0
            addps rm0, rC8
            movaps  10000(pA), rm0
            mulps rB0, rm0
            addps rm0, rC9
            movaps 10016(pA), rm0
            mulps rB0, rm0
            addps rm0, rC10
            /*
             * 12th product: rB0 is not read again in this block, so the A
             * vector is multiplied straight into it (no rm0 load needed).
             */
            mulps 10032(pA), rB0
            addps rB0, rC11
         #endif
         #if KB > 212
            /*
             * Unrolled update, assembled only when KB > 212.
             * rB0 <- 16 bytes at 720(pB); then for i = 0..11:
             *    rC<i> += rB0 * (16 bytes at 10048+16*i(pA)), lane-wise on
             *    4 packed single-precision floats.
             * rm0 is scratch: it holds each A vector and then its product.
             */
            movaps 720(pB), rB0
            movaps 10048(pA), rm0
            mulps rB0, rm0
            addps rm0, rC0
            movaps 10064(pA), rm0
            mulps rB0, rm0
            addps rm0, rC1
            movaps 10080(pA), rm0
            mulps rB0, rm0
            addps rm0, rC2
            movaps 10096(pA), rm0
            mulps rB0, rm0
            addps rm0, rC3
            movaps 10112(pA), rm0
            mulps rB0, rm0
            addps rm0, rC4
            movaps 10128(pA), rm0
            mulps rB0, rm0
            addps rm0, rC5
            movaps 10144(pA), rm0
            mulps rB0, rm0
            addps rm0, rC6
            movaps 10160(pA), rm0
            mulps rB0, rm0
            addps rm0, rC7
            movaps 10176(pA), rm0
            mulps rB0, rm0
            addps rm0, rC8
            movaps  10192(pA), rm0
            mulps rB0, rm0
            addps rm0, rC9
            movaps 10208(pA), rm0
            mulps rB0, rm0
            addps rm0, rC10
            /*
             * 12th product: rB0 is not read again in this block, so the A
             * vector is multiplied straight into it (no rm0 load needed).
             */
            mulps 10224(pA), rB0
            addps rB0, rC11
         #endif
         #if KB > 216
            /*
             * Unrolled update, assembled only when KB > 216.
             * rB0 <- 16 bytes at 736(pB); then for i = 0..11:
             *    rC<i> += rB0 * (16 bytes at 10240+16*i(pA)), lane-wise on
             *    4 packed single-precision floats.
             * rm0 is scratch: it holds each A vector and then its product.
             */
            movaps 736(pB), rB0
            movaps 10240(pA), rm0
            mulps rB0, rm0
            addps rm0, rC0
            movaps 10256(pA), rm0
            mulps rB0, rm0
            addps rm0, rC1
            movaps 10272(pA), rm0
            mulps rB0, rm0
            addps rm0, rC2
            movaps 10288(pA), rm0
            mulps rB0, rm0
            addps rm0, rC3
            movaps 10304(pA), rm0
            mulps rB0, rm0
            addps rm0, rC4
            movaps 10320(pA), rm0
            mulps rB0, rm0
            addps rm0, rC5
            movaps 10336(pA), rm0
            mulps rB0, rm0
            addps rm0, rC6
            movaps 10352(pA), rm0
            mulps rB0, rm0
            addps rm0, rC7
            movaps 10368(pA), rm0
            mulps rB0, rm0
            addps rm0, rC8
            movaps  10384(pA), rm0
            mulps rB0, rm0
            addps rm0, rC9
            movaps 10400(pA), rm0
            mulps rB0, rm0
            addps rm0, rC10
            /*
             * 12th product: rB0 is not read again in this block, so the A
             * vector is multiplied straight into it (no rm0 load needed).
             */
            mulps 10416(pA), rB0
            addps rB0, rC11
         #endif
         #if KB > 220
            /*
             * Unrolled update, assembled only when KB > 220.
             * rB0 <- 16 bytes at 752(pB); then for i = 0..11:
             *    rC<i> += rB0 * (16 bytes at 10432+16*i(pA)), lane-wise on
             *    4 packed single-precision floats.
             * rm0 is scratch: it holds each A vector and then its product.
             */
            movaps 752(pB), rB0
            movaps 10432(pA), rm0
            mulps rB0, rm0
            addps rm0, rC0
            movaps 10448(pA), rm0
            mulps rB0, rm0
            addps rm0, rC1
            movaps 10464(pA), rm0
            mulps rB0, rm0
            addps rm0, rC2
            movaps 10480(pA), rm0
            mulps rB0, rm0
            addps rm0, rC3
            movaps 10496(pA), rm0
            mulps rB0, rm0
            addps rm0, rC4
            movaps 10512(pA), rm0
            mulps rB0, rm0
            addps rm0, rC5
            movaps 10528(pA), rm0
            mulps rB0, rm0
            addps rm0, rC6
            movaps 10544(pA), rm0
            mulps rB0, rm0
            addps rm0, rC7
            movaps 10560(pA), rm0
            mulps rB0, rm0
            addps rm0, rC8
            movaps  10576(pA), rm0
            mulps rB0, rm0
            addps rm0, rC9
            movaps 10592(pA), rm0
            mulps rB0, rm0
            addps rm0, rC10
            /*
             * 12th product: rB0 is not read again in this block, so the A
             * vector is multiplied straight into it (no rm0 load needed).
             */
            mulps 10608(pA), rB0
            addps rB0, rC11
         #endif
         #if KB > 224
            /*
             * Unrolled update, assembled only when KB > 224.
             * rB0 <- 16 bytes at 768(pB); then for i = 0..11:
             *    rC<i> += rB0 * (16 bytes at 10624+16*i(pA)), lane-wise on
             *    4 packed single-precision floats.
             * rm0 is scratch: it holds each A vector and then its product.
             */
            movaps 768(pB), rB0
            movaps 10624(pA), rm0
            mulps rB0, rm0
            addps rm0, rC0
            movaps 10640(pA), rm0
            mulps rB0, rm0
            addps rm0, rC1
            movaps 10656(pA), rm0
            mulps rB0, rm0
            addps rm0, rC2
            movaps 10672(pA), rm0
            mulps rB0, rm0
            addps rm0, rC3
            movaps 10688(pA), rm0
            mulps rB0, rm0
            addps rm0, rC4
            movaps 10704(pA), rm0
            mulps rB0, rm0
            addps rm0, rC5
            movaps 10720(pA), rm0
            mulps rB0, rm0
            addps rm0, rC6
            movaps 10736(pA), rm0
            mulps rB0, rm0
            addps rm0, rC7
            movaps 10752(pA), rm0
            mulps rB0, rm0
            addps rm0, rC8
            movaps  10768(pA), rm0
            mulps rB0, rm0
            addps rm0, rC9
            movaps 10784(pA), rm0
            mulps rB0, rm0
            addps rm0, rC10
            /*
             * 12th product: rB0 is not read again in this block, so the A
             * vector is multiplied straight into it (no rm0 load needed).
             */
            mulps 10800(pA), rB0
            addps rB0, rC11
         #endif
         #if KB > 228
            /*
             * Unrolled update, assembled only when KB > 228.
             * rB0 <- 16 bytes at 784(pB); then for i = 0..11:
             *    rC<i> += rB0 * (16 bytes at 10816+16*i(pA)), lane-wise on
             *    4 packed single-precision floats.
             * rm0 is scratch: it holds each A vector and then its product.
             */
            movaps 784(pB), rB0
            movaps 10816(pA), rm0
            mulps rB0, rm0
            addps rm0, rC0
            movaps 10832(pA), rm0
            mulps rB0, rm0
            addps rm0, rC1
            movaps 10848(pA), rm0
            mulps rB0, rm0
            addps rm0, rC2
            movaps 10864(pA), rm0
            mulps rB0, rm0
            addps rm0, rC3
            movaps 10880(pA), rm0
            mulps rB0, rm0
            addps rm0, rC4
            movaps 10896(pA), rm0
            mulps rB0, rm0
            addps rm0, rC5
            movaps 10912(pA), rm0
            mulps rB0, rm0
            addps rm0, rC6
            movaps 10928(pA), rm0
            mulps rB0, rm0
            addps rm0, rC7
            movaps 10944(pA), rm0
            mulps rB0, rm0
            addps rm0, rC8
            movaps  10960(pA), rm0
            mulps rB0, rm0
            addps rm0, rC9
            movaps 10976(pA), rm0
            mulps rB0, rm0
            addps rm0, rC10
            /*
             * 12th product: rB0 is not read again in this block, so the A
             * vector is multiplied straight into it (no rm0 load needed).
             */
            mulps 10992(pA), rB0
            addps rB0, rC11
         #endif
         #if KB > 232
/*
 *          Unrolled K-step 58 (k = 232..235), assembled only when KB > 232.
 *          rB0 holds 4 singles of B; each of the twelve 16-byte A vectors
 *          that follow is multiplied by it and accumulated into rC0..rC11
 *          (one accumulator per A vector).
 *          With the +128 bias applied to pA/pB at function entry,
 *          800(pB) and 11008(pA) are byte offsets 928 (= 58*16) and
 *          11136 (= 58*192) from the unbiased pointers.
 */
            movaps 800(pB), rB0
            movaps 11008(pA), rm0
            mulps rB0, rm0
            addps rm0, rC0
            movaps 11024(pA), rm0
            mulps rB0, rm0
            addps rm0, rC1
            movaps 11040(pA), rm0
            mulps rB0, rm0
            addps rm0, rC2
            movaps 11056(pA), rm0
            mulps rB0, rm0
            addps rm0, rC3
            movaps 11072(pA), rm0
            mulps rB0, rm0
            addps rm0, rC4
            movaps 11088(pA), rm0
            mulps rB0, rm0
            addps rm0, rC5
            movaps 11104(pA), rm0
            mulps rB0, rm0
            addps rm0, rC6
            movaps 11120(pA), rm0
            mulps rB0, rm0
            addps rm0, rC7
            movaps 11136(pA), rm0
            mulps rB0, rm0
            addps rm0, rC8
            movaps  11152(pA), rm0
            mulps rB0, rm0
            addps rm0, rC9
            movaps 11168(pA), rm0
            mulps rB0, rm0
            addps rm0, rC10
/*
 *          Final product of the step overwrites rB0 (its last use here).
 */
            mulps 11184(pA), rB0
            addps rB0, rC11
         #endif
         #if KB > 236
/*
 *          Unrolled K-step 59 (k = 236..239), assembled only when KB > 236.
 *          Same shape as every other K-step: load 4 singles of B into rB0,
 *          then for each of twelve consecutive 16-byte A vectors compute
 *          A*B lane-wise and add it into rC0..rC11 respectively.
 *          With the +128 bias applied to pA/pB at function entry,
 *          816(pB) and 11200(pA) are byte offsets 944 (= 59*16) and
 *          11328 (= 59*192) from the unbiased pointers.
 */
            movaps 816(pB), rB0
            movaps 11200(pA), rm0
            mulps rB0, rm0
            addps rm0, rC0
            movaps 11216(pA), rm0
            mulps rB0, rm0
            addps rm0, rC1
            movaps 11232(pA), rm0
            mulps rB0, rm0
            addps rm0, rC2
            movaps 11248(pA), rm0
            mulps rB0, rm0
            addps rm0, rC3
            movaps 11264(pA), rm0
            mulps rB0, rm0
            addps rm0, rC4
            movaps 11280(pA), rm0
            mulps rB0, rm0
            addps rm0, rC5
            movaps 11296(pA), rm0
            mulps rB0, rm0
            addps rm0, rC6
            movaps 11312(pA), rm0
            mulps rB0, rm0
            addps rm0, rC7
            movaps 11328(pA), rm0
            mulps rB0, rm0
            addps rm0, rC8
            movaps  11344(pA), rm0
            mulps rB0, rm0
            addps rm0, rC9
            movaps 11360(pA), rm0
            mulps rB0, rm0
            addps rm0, rC10
/*
 *          Twelfth product is formed in rB0 directly; rB0 is dead afterwards.
 */
            mulps 11376(pA), rB0
            addps rB0, rC11
         #endif
         #if KB > 240
/*
 *          Unrolled K-step 60 (k = 240..243), assembled only when KB > 240.
 *          Multiplies the 16-byte B vector in rB0 (4 singles) by twelve
 *          consecutive 16-byte A vectors and accumulates the lane-wise
 *          products into rC0..rC11, one accumulator per A vector.
 *          With the +128 bias applied to pA/pB at function entry,
 *          832(pB) and 11392(pA) are byte offsets 960 (= 60*16) and
 *          11520 (= 60*192) from the unbiased pointers.
 */
            movaps 832(pB), rB0
            movaps 11392(pA), rm0
            mulps rB0, rm0
            addps rm0, rC0
            movaps 11408(pA), rm0
            mulps rB0, rm0
            addps rm0, rC1
            movaps 11424(pA), rm0
            mulps rB0, rm0
            addps rm0, rC2
            movaps 11440(pA), rm0
            mulps rB0, rm0
            addps rm0, rC3
            movaps 11456(pA), rm0
            mulps rB0, rm0
            addps rm0, rC4
            movaps 11472(pA), rm0
            mulps rB0, rm0
            addps rm0, rC5
            movaps 11488(pA), rm0
            mulps rB0, rm0
            addps rm0, rC6
            movaps 11504(pA), rm0
            mulps rB0, rm0
            addps rm0, rC7
            movaps 11520(pA), rm0
            mulps rB0, rm0
            addps rm0, rC8
            movaps  11536(pA), rm0
            mulps rB0, rm0
            addps rm0, rC9
            movaps 11552(pA), rm0
            mulps rB0, rm0
            addps rm0, rC10
/*
 *          rB0 doubles as the temporary for the last product because no
 *          later instruction in this step reads the B vector.
 */
            mulps 11568(pA), rB0
            addps rB0, rC11
         #endif
         #if KB > 244
/*
 *          Unrolled K-step 61 (k = 244..247), assembled only when KB > 244.
 *          rB0 = 4 singles of B; the twelve 16-byte A vectors at
 *          11584..11760(pA) are each multiplied by rB0 and summed into
 *          rC0..rC11 in order.
 *          With the +128 bias applied to pA/pB at function entry,
 *          848(pB) and 11584(pA) are byte offsets 976 (= 61*16) and
 *          11712 (= 61*192) from the unbiased pointers.
 */
            movaps 848(pB), rB0
            movaps 11584(pA), rm0
            mulps rB0, rm0
            addps rm0, rC0
            movaps 11600(pA), rm0
            mulps rB0, rm0
            addps rm0, rC1
            movaps 11616(pA), rm0
            mulps rB0, rm0
            addps rm0, rC2
            movaps 11632(pA), rm0
            mulps rB0, rm0
            addps rm0, rC3
            movaps 11648(pA), rm0
            mulps rB0, rm0
            addps rm0, rC4
            movaps 11664(pA), rm0
            mulps rB0, rm0
            addps rm0, rC5
            movaps 11680(pA), rm0
            mulps rB0, rm0
            addps rm0, rC6
            movaps 11696(pA), rm0
            mulps rB0, rm0
            addps rm0, rC7
            movaps 11712(pA), rm0
            mulps rB0, rm0
            addps rm0, rC8
            movaps  11728(pA), rm0
            mulps rB0, rm0
            addps rm0, rC9
            movaps 11744(pA), rm0
            mulps rB0, rm0
            addps rm0, rC10
/*
 *          Last A vector: multiply in place into rB0, which is dead after
 *          the following add.
 */
            mulps 11760(pA), rB0
            addps rB0, rC11
         #endif
         #if KB > 248
/*
 *          Unrolled K-step 62 (k = 248..251), assembled only when KB > 248.
 *          Loads 4 singles of B into rB0, then forms twelve lane-wise
 *          products with consecutive 16-byte A vectors and adds them into
 *          the accumulators rC0..rC11.
 *          With the +128 bias applied to pA/pB at function entry,
 *          864(pB) and 11776(pA) are byte offsets 992 (= 62*16) and
 *          11904 (= 62*192) from the unbiased pointers.
 */
            movaps 864(pB), rB0
            movaps 11776(pA), rm0
            mulps rB0, rm0
            addps rm0, rC0
            movaps 11792(pA), rm0
            mulps rB0, rm0
            addps rm0, rC1
            movaps 11808(pA), rm0
            mulps rB0, rm0
            addps rm0, rC2
            movaps 11824(pA), rm0
            mulps rB0, rm0
            addps rm0, rC3
            movaps 11840(pA), rm0
            mulps rB0, rm0
            addps rm0, rC4
            movaps 11856(pA), rm0
            mulps rB0, rm0
            addps rm0, rC5
            movaps 11872(pA), rm0
            mulps rB0, rm0
            addps rm0, rC6
            movaps 11888(pA), rm0
            mulps rB0, rm0
            addps rm0, rC7
            movaps 11904(pA), rm0
            mulps rB0, rm0
            addps rm0, rC8
            movaps  11920(pA), rm0
            mulps rB0, rm0
            addps rm0, rC9
            movaps 11936(pA), rm0
            mulps rB0, rm0
            addps rm0, rC10
/*
 *          No further reader of rB0 in this step, so the twelfth product
 *          is computed into it directly.
 */
            mulps 11952(pA), rB0
            addps rB0, rC11
         #endif
         #if KB > 252
/*
 *          Unrolled K-step 63 (k = 252..255), assembled only when KB > 252.
 *          This is the last K-step before KDONE.  It loads 4 singles of B
 *          into rB0 and accumulates the lane-wise products with twelve
 *          consecutive 16-byte A vectors into rC0..rC11.
 *          With the +128 bias applied to pA/pB at function entry,
 *          880(pB) and 11968(pA) are byte offsets 1008 (= 63*16) and
 *          12096 (= 63*192) from the unbiased pointers.
 *          NOTE(review): there is no step for KB > 256 here, so a larger
 *          KB would presumably drop the remaining K iterations unless it
 *          is rejected elsewhere -- confirm.
 */
            movaps 880(pB), rB0
            movaps 11968(pA), rm0
            mulps rB0, rm0
            addps rm0, rC0
            movaps 11984(pA), rm0
            mulps rB0, rm0
            addps rm0, rC1
            movaps 12000(pA), rm0
            mulps rB0, rm0
            addps rm0, rC2
            movaps 12016(pA), rm0
            mulps rB0, rm0
            addps rm0, rC3
            movaps 12032(pA), rm0
            mulps rB0, rm0
            addps rm0, rC4
            movaps 12048(pA), rm0
            mulps rB0, rm0
            addps rm0, rC5
            movaps 12064(pA), rm0
            mulps rB0, rm0
            addps rm0, rC6
            movaps 12080(pA), rm0
            mulps rB0, rm0
            addps rm0, rC7
            movaps 12096(pA), rm0
            mulps rB0, rm0
            addps rm0, rC8
            movaps  12112(pA), rm0
            mulps rB0, rm0
            addps rm0, rC9
            movaps 12128(pA), rm0
            mulps rB0, rm0
            addps rm0, rC10
/*
 *          Last use of rB0: multiply the final A vector into it in place.
 */
            mulps 12144(pA), rB0
            addps rB0, rC11
         #endif
KDONE:
/*
 *       All K-steps are done.  Each accumulator rCi holds four partial
 *       sums (one per SSE lane) for result element ci.  Three haddps
 *       reduce a group of four accumulators to one vector holding the four
 *       finished dot products, which is then combined with and/or stored
 *       to C.  Lane lists below are written high lane first; "ab"/"cd"
 *       denote the sums of the low/high lane pairs.
 *
 *       VOP is addps for BETA1 and subps for BETAN1, so the value stored
 *       is (A*B + C) or (A*B - C) respectively; when VOP is undefined the
 *       old contents of C are simply overwritten.
 *       movaps is used for the stores, so pC must be 16-byte aligned.
 */
         haddps rC1, rC0        /* rC0 = {c1cd,c1ab,c0cd,c0ab} */
         haddps rC3, rC2        /* rC2 = {c3cd,c3ab,c2cd,c2ab} */
         haddps rC2, rC0        /* rC0 = {c3abcd,c2abcd,c1abcd,c0abcd} */
         #ifdef VOP
            VOP (pC), rC0
         #endif
         movaps rC0, (pC)
         haddps rC5, rC4        /* rC4 = {c5cd,c5ab,c4cd,c4ab} */
         haddps rC7, rC6        /* rC6 = {c7cd,c7ab,c6cd,c6ab} */
         haddps rC6, rC4        /* rC4 = {c7abcd,c6abcd,c5abcd,c4abcd} */
         #ifdef VOP
            VOP 16(pC), rC4
         #endif
         movaps rC4, 16(pC)
         haddps rC9, rC8        /* rC8 = {c9cd,c9ab,c8cd,c8ab} */
         haddps rC11, rC10      /* rC10 = {c11cd,c11ab,c10cd,c10ab} */
         haddps rC10, rC8       /* rC8 = {c11abcd,c10abcd,c9abcd,c8abcd} */
         #ifdef VOP
            VOP 32(pC), rC8
         #endif
         movaps rC8, 32(pC)
/*
 *       Inner (N) loop bookkeeping: step C past the 12 singles just
 *       written, step B by KB singles, and loop while NNU is non-zero.
 */
         add $48, pC
         add $KB*4, pB
         sub $1, NNU
      jnz MLOOP
/*
 *    Outer (M) loop bookkeeping: advance A by incA, rewind B to pB0 and
 *    reload the N counter from NNU0.  The mov between "sub $1, NMU" and
 *    the jnz does not modify flags, so the branch still tests NMU.
 */
      add incA, pA   /* pA += 12*K*sizeof */
      mov pB0, pB
      sub $1, NMU
      mov NNU0, NNU
   jnz MLOOP

/*
 * Epilogue: reload the six callee-saved integer registers from the same
 * frame slots the prologue stored them in, release the FSIZE-byte frame
 * and return to the caller.
 */
DONE:
   movq (%rsp), %rbp
   movq 8(%rsp), %rbx
   movq 16(%rsp), %r12
   movq 24(%rsp), %r13
   movq 32(%rsp), %r14
   movq 40(%rsp), %r15
   add  $FSIZE, %rsp
   ret
