Skip to content

Commit 0a4d6b2

Browse files
committed
Forgot files from previous check-in.
1 parent cc1b579 commit 0a4d6b2

1 file changed

Lines changed: 28 additions & 58 deletions

File tree

kernel/riscv64/sgemm_kernel_16x8_zvl256b.c

Lines changed: 28 additions & 58 deletions
Original file line number | Diff line number | Diff line change
@@ -2214,67 +2214,37 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, FLOAT* A, FLOAT* B, F
2214 2214

2215 2215
FLOAT *C2 = C;
2216 2216

2217-
vfloat32m2_t c00;
2218-
c00 = __riscv_vle32_v_f32m2(C, 16); C += ldc;
2219-
vfloat32m1_t c0 = __riscv_vget_v_f32m2_f32m1(c00, 0);
2220-
vfloat32m1_t c1 = __riscv_vget_v_f32m2_f32m1(c00, 1);
2221-
c00 = __riscv_vle32_v_f32m2(C, 16); C += ldc;
2222-
vfloat32m1_t c2 = __riscv_vget_v_f32m2_f32m1(c00, 0);
2223-
vfloat32m1_t c3 = __riscv_vget_v_f32m2_f32m1(c00, 1);
2224-
c00 = __riscv_vle32_v_f32m2(C, 16); C += ldc;
2225-
vfloat32m1_t c4 = __riscv_vget_v_f32m2_f32m1(c00, 0);
2226-
vfloat32m1_t c5 = __riscv_vget_v_f32m2_f32m1(c00, 1);
2227-
c00 = __riscv_vle32_v_f32m2(C, 16); C += ldc;
2228-
vfloat32m1_t c6 = __riscv_vget_v_f32m2_f32m1(c00, 0);
2229-
vfloat32m1_t c7 = __riscv_vget_v_f32m2_f32m1(c00, 1);
2230-
c00 = __riscv_vle32_v_f32m2(C, 16); C += ldc;
2231-
vfloat32m1_t c8 = __riscv_vget_v_f32m2_f32m1(c00, 0);
2232-
vfloat32m1_t c9 = __riscv_vget_v_f32m2_f32m1(c00, 1);
2233-
c00 = __riscv_vle32_v_f32m2(C, 16); C += ldc;
2234-
vfloat32m1_t c10 = __riscv_vget_v_f32m2_f32m1(c00, 0);
2235-
vfloat32m1_t c11 = __riscv_vget_v_f32m2_f32m1(c00, 1);
2236-
c00 = __riscv_vle32_v_f32m2(C, 16); C += ldc;
2237-
vfloat32m1_t c12 = __riscv_vget_v_f32m2_f32m1(c00, 0);
2238-
vfloat32m1_t c13 = __riscv_vget_v_f32m2_f32m1(c00, 1);
2239-
c00 = __riscv_vle32_v_f32m2(C, 16);
2240-
vfloat32m1_t c14 = __riscv_vget_v_f32m2_f32m1(c00, 0);
2241-
vfloat32m1_t c15 = __riscv_vget_v_f32m2_f32m1(c00, 1);
2242-
2243-
c0 = __riscv_vfmacc_vf_f32m1( c0, alpha, result0, 8 );
2244-
c1 = __riscv_vfmacc_vf_f32m1( c1, alpha, result1, 8 );
2245-
c2 = __riscv_vfmacc_vf_f32m1( c2, alpha, result2, 8 );
2246-
c3 = __riscv_vfmacc_vf_f32m1( c3, alpha, result3, 8 );
2247-
c4 = __riscv_vfmacc_vf_f32m1( c4, alpha, result4, 8 );
2248-
c5 = __riscv_vfmacc_vf_f32m1( c5, alpha, result5, 8 );
2249-
c6 = __riscv_vfmacc_vf_f32m1( c6, alpha, result6, 8 );
2250-
c7 = __riscv_vfmacc_vf_f32m1( c7, alpha, result7, 8 );
2251-
c8 = __riscv_vfmacc_vf_f32m1( c8, alpha, result8, 8 );
2252-
c9 = __riscv_vfmacc_vf_f32m1( c9, alpha, result9, 8 );
2253-
c10 = __riscv_vfmacc_vf_f32m1( c10, alpha, result10, 8 );
2254-
c11 = __riscv_vfmacc_vf_f32m1( c11, alpha, result11, 8 );
2255-
c12 = __riscv_vfmacc_vf_f32m1( c12, alpha, result12, 8 );
2256-
c13 = __riscv_vfmacc_vf_f32m1( c13, alpha, result13, 8 );
2257-
c14 = __riscv_vfmacc_vf_f32m1( c14, alpha, result14, 8 );
2258-
c15 = __riscv_vfmacc_vf_f32m1( c15, alpha, result15, 8 );
2217+
vfloat32m2_t c01 = __riscv_vle32_v_f32m2(C, 16); C += ldc;
2218+
vfloat32m2_t c23 = __riscv_vle32_v_f32m2(C, 16); C += ldc;
2219+
vfloat32m2_t c45 = __riscv_vle32_v_f32m2(C, 16); C += ldc;
2220+
vfloat32m2_t c67 = __riscv_vle32_v_f32m2(C, 16); C += ldc;
2221+
2222+
c01 = __riscv_vfmacc_vf_f32m2( c01, alpha, result01, 16 );
2223+
c23 = __riscv_vfmacc_vf_f32m2( c23, alpha, result23, 16 );
2224+
c45 = __riscv_vfmacc_vf_f32m2( c45, alpha, result45, 16 );
2225+
c67 = __riscv_vfmacc_vf_f32m2( c67, alpha, result67, 16 );
2226+
2227+
vfloat32m2_t c89 = __riscv_vle32_v_f32m2(C, 16); C += ldc;
2228+
vfloat32m2_t cAB = __riscv_vle32_v_f32m2(C, 16); C += ldc;
2229+
vfloat32m2_t cCD = __riscv_vle32_v_f32m2(C, 16); C += ldc;
2230+
vfloat32m2_t cEF = __riscv_vle32_v_f32m2(C, 16);
2231+
2232+
c89 = __riscv_vfmacc_vf_f32m2( c89, alpha, result89, 16 );
2233+
cAB = __riscv_vfmacc_vf_f32m2( cAB, alpha, resultAB, 16 );
2234+
cCD = __riscv_vfmacc_vf_f32m2( cCD, alpha, resultCD, 16 );
2235+
cEF = __riscv_vfmacc_vf_f32m2( cEF, alpha, resultEF, 16 );
2259 2236

2260 2237
C = C2;
2261 2238

2262-
c00 = __riscv_vcreate_v_f32m1_f32m2(c0, c1);
2263-
__riscv_vse32_v_f32m2(C, c00, 16); C += ldc;
2264-
c00 = __riscv_vcreate_v_f32m1_f32m2(c2, c3);
2265-
__riscv_vse32_v_f32m2(C, c00, 16); C += ldc;
2266-
c00 = __riscv_vcreate_v_f32m1_f32m2(c4, c5);
2267-
__riscv_vse32_v_f32m2(C, c00, 16); C += ldc;
2268-
c00 = __riscv_vcreate_v_f32m1_f32m2(c6, c7);
2269-
__riscv_vse32_v_f32m2(C, c00, 16); C += ldc;
2270-
c00 = __riscv_vcreate_v_f32m1_f32m2(c8, c9);
2271-
__riscv_vse32_v_f32m2(C, c00, 16); C += ldc;
2272-
c00 = __riscv_vcreate_v_f32m1_f32m2(c10, c11);
2273-
__riscv_vse32_v_f32m2(C, c00, 16); C += ldc;
2274-
c00 = __riscv_vcreate_v_f32m1_f32m2(c12, c13);
2275-
__riscv_vse32_v_f32m2(C, c00, 16); C += ldc;
2276-
c00 = __riscv_vcreate_v_f32m1_f32m2(c14, c15);
2277-
__riscv_vse32_v_f32m2(C, c00, 16);
2239+
__riscv_vse32_v_f32m2(C, c01, 16); C += ldc;
2240+
__riscv_vse32_v_f32m2(C, c23, 16); C += ldc;
2241+
__riscv_vse32_v_f32m2(C, c45, 16); C += ldc;
2242+
__riscv_vse32_v_f32m2(C, c67, 16); C += ldc;
2243+
__riscv_vse32_v_f32m2(C, c89, 16); C += ldc;
2244+
__riscv_vse32_v_f32m2(C, cAB, 16); C += ldc;
2245+
__riscv_vse32_v_f32m2(C, cCD, 16); C += ldc;
2246+
__riscv_vse32_v_f32m2(C, cEF, 16);
2247+
2278 2248
C = C2 + 16;
2279 2249
}
2280 2250

0 commit comments

Comments
 (0)