Commit 7a8a7cf

igzip/riscv64: Optimize isal_adler32_rvv with 4x loop unrolling and tail-agnostic (ta) policy
Signed-off-by: WenLei <lei.wen2@zte.com.cn>
1 parent 0c58317 commit 7a8a7cf
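
In outline, the change splits the strip-mined loop in two: a main body (unroll_loop_4x) that consumes four vector chunks per iteration, and a one-chunk tail loop (single) for the remainder. It also relaxes the vsetvli tail policy from tu (tail-undisturbed) to ta (tail-agnostic) wherever stale tail elements no longer need preserving. A scalar sketch of the arithmetic follows the diff.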

1 file changed: +53 −11 lines
igzip/riscv64/igzip_isal_adler32_rvv.S

@@ -34,40 +34,82 @@ adler32_rvv:
     slli        t2, a0, 48
     srli        t2, t2, 48              // t2: A = adler32 & 0xffff;
     srliw       t3, a0, 16              // t3: B = adler32 >> 16;
-    beqz        a2, 2f
+    beqz        a2, 4f
 
-    vsetvli     zero, a2, e64, m8, tu, ma
+    vsetvli     t0, a2, e64, m8, ta, ma
     vmv.v.i     v8, 0
     vmv.v.i     v16, 0
     vmv.s.x     v24, zero
     mv          t6, a2                  // t6 = length
     vsetvli     zero, zero, e32, m4, tu, ma
     vmv.s.x     v8, t2                  // v8 = adler32 & 0xffff
 
-1:
-    vsetvli     t1, a2, e8, m1, tu, ma
+    slli        t0, t0, 2               // t0 = 4*vl
+    blt         a2, t0, 1f
+
+unroll_loop_4x:
+    vsetvli     t1, a2, e8, m1, ta, ma
     vle8.v      v0, (a1)
-    vsetvli     zero, zero, e32, m4, tu, ma
+    add         a3, a1, t1
+    vle8.v      v1, (a3)
+    add         a4, a3, t1
+    vle8.v      v2, (a4)
+    add         a5, a4, t1
+    vle8.v      v3, (a5)
+    mv          t5, a2
+    slli        t4, t1, 2
+    add         a1, a1, t4
+    sub         a2, a2, t4
+
+    vsetvli     zero, t1, e32, m4, tu, ma
     vzext.vf4   v4, v0
+    vzext.vf4   v28, v1
     vid.v       v12                     // 0, 1, 2, .. vl-1
+    vrsub.vx    v12, v12, t5            // len, len-1, len-2
     vadd.vv     v8, v8, v4
-    vrsub.vx    v12, v12, a2            // len, len-1, len-2
     vwmaccu.vv  v16, v12, v4            // v16: B += weight * next
-    sub         a2, a2, t1
+    vsub.vx     v12, v12, t1            // len-vl, len-vl-1, len-vl-2
+    vadd.vv     v8, v8, v28
+    vwmaccu.vv  v16, v12, v28
+    vzext.vf4   v4, v2
+    vzext.vf4   v28, v3
+    vsub.vx     v12, v12, t1
+    vadd.vv     v8, v8, v4
+    vwmaccu.vv  v16, v12, v4
+    vsub.vx     v12, v12, t1
+    vadd.vv     v8, v8, v28
+    vwmaccu.vv  v16, v12, v28
+    bge         a2, t0, unroll_loop_4x
+
+1:
+    beqz        a2, 3f
+    mv          t5, a2
+single:
+    vsetvli     t1, a2, e8, m1, ta, ma
+    vle8.v      v0, (a1)
+    vsetvli     zero, zero, e32, m4, tu, ma
+    vzext.vf4   v4, v0
+    vid.v       v12                     // 0, 1, 2, .. vl-1
+    vadd.vv     v8, v8, v4
+    vrsub.vx    v12, v12, t5            // len, len-1, len-2
+    vwmaccu.vv  v16, v12, v4            // v16: B += weight * next
+    sub         t5, t5, t1
     add         a1, a1, t1
-    bnez        a2, 1b
+    sub         a2, a2, t1
+    bnez        a2, single
 
+3:
     vsetvli     zero, t6, e32, m4, tu, ma
     vwredsumu.vs v24, v8, v24
     mul         a7, t6, t2              // B += A(init) * len
     vsetvli     zero, t6, e64, m8, tu, ma
     vmv.s.x     v0, a7
     vredsum.vs  v0, v16, v0
-    vmv.x.s     t4, v0                  // B = t4
+    vmv.x.s     a4, v0                  // B = a4
     vmv.x.s     t2, v24                 // A = t2
-    add         t3, t4, t3
+    add         t3, a4, t3
 
-2:
+4:
     li          t0, 65521
     remu        t2, t2, t0              // A = A % ADLER_MOD
    remu        t3, t3, t0              // B = B % ADLER_MOD
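
For readers tracing the arithmetic, here is a minimal scalar C sketch of the formulation the vector kernel implements. It is not the library's scalar path; CHUNK, chunk_sums, and adler32_sketch are hypothetical names standing in for the per-vsetvli vector length and the vector loop bodies. The identity being exploited is B_final = B_init + len*A_init + sum((len - i) * d[i]) with A_final = A_init + sum(d[i]); the len*A_init term is the deferred "mul a7, t6, t2" in the assembly, and the per-byte weights len, len-1, ... come from vid.v/vrsub.vx, stepped down by vl between chunks with vsub.vx.

    #include <stddef.h>
    #include <stdint.h>

    #define ADLER_MOD 65521

    /* Hypothetical stand-in for the vector length a vsetvli would report. */
    #define CHUNK 16

    /* One chunk of n bytes: the A side gets the plain byte sum, the B side
     * gets each byte weighted by its distance from the end of the remaining
     * input (rem, rem-1, ...), matching vid.v/vrsub.vx/vwmaccu.vv. */
    static void chunk_sums(const uint8_t *p, size_t n, size_t rem,
                           uint64_t *byte_sum, uint64_t *weighted_sum)
    {
        for (size_t i = 0; i < n; i++) {
            *byte_sum += p[i];
            *weighted_sum += (uint64_t)(rem - i) * p[i];
        }
    }

    uint32_t adler32_sketch(uint32_t adler, const uint8_t *buf, size_t len)
    {
        uint64_t A = adler & 0xffff;     /* slli/srli on a0 */
        uint64_t B = adler >> 16;        /* srliw on a0 */
        uint64_t bsum = 0, wsum = 0;
        size_t total = len;              /* t6 = length */

        /* Main loop: four chunks per iteration, like unroll_loop_4x. */
        while (len >= 4 * CHUNK) {
            for (int k = 0; k < 4; k++)
                chunk_sums(buf + k * CHUNK, CHUNK, len - k * CHUNK,
                           &bsum, &wsum);
            buf += 4 * CHUNK;
            len -= 4 * CHUNK;
        }
        /* Tail: one chunk at a time, like the "single" loop. */
        while (len > 0) {
            size_t n = len < CHUNK ? len : CHUNK;
            chunk_sums(buf, n, len, &bsum, &wsum);
            buf += n;
            len -= n;
        }

        B += (uint64_t)total * A + wsum; /* mul a7, t6, t2 + vredsum.vs */
        A += bsum;                       /* vwredsumu.vs */
        return (uint32_t)(((B % ADLER_MOD) << 16) | (A % ADLER_MOD));
    }

Like the vector kernel, this sketch performs the modulo only once at the end, which assumes the caller bounds the block length so the 64-bit accumulators (the e64 reductions in the assembly) cannot overflow.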
