2 * memcpy - copy memory area
4 * Copyright (c) 2012-2020, Arm Limited.
5 * SPDX-License-Identifier: MIT
10 * ARMv8-a, AArch64, unaligned accesses.
41 /* This implementation of memcpy uses unaligned accesses and branchless
42 sequences to keep the code small, simple and improve performance.
44 Copies are split into 3 main cases: small copies of up to 32 bytes, medium
45 copies of up to 128 bytes, and large copies. The overhead of the overlap
46 check is negligible since it is only required for large copies.
48 Large copies use a software pipelined loop processing 64 bytes per iteration.
49 The destination pointer is 16-byte aligned to minimize unaligned accesses.
50 The loop tail is handled by always copying 64 bytes from the end.
/* NOTE(review): this is a sampled extract of Arm's optimized AArch64 memcpy.
   The original file's line numbers are fused onto the start of each line, and
   many interior lines are missing: the register alias #defines (A_l, B_lw,
   dst, tmp1, ...), the entry label, the cmp/branch dispatch between the size
   classes, and local labels (.Lcopy_long, .Lloop64, .Lcopy64_from_end).
   Comments below describe only what the visible instructions establish;
   anything depending on a missing line is marked as an assumption.  */
54 .type memcpy,%function
/* srcend/dstend point one past the last source/destination byte.  All the
   size classes below load/store from both ends, so odd tails never need a
   byte-by-byte loop.  */
56 add srcend, src, count
57 add dstend, dstin, count
63 /* Small copies: 0..32 bytes. */
/* Visible half of the 17..32-byte path: the last 16 bytes are copied via the
   end pointers.  NOTE(review): the matching head load/store from [src]/[dstin]
   is presumably on the missing lines — confirm against the full file.  */
67 ldp D_l, D_h, [srcend, -16]
69 stp D_l, D_h, [dstend, -16]
72 /* Copy 8-15 bytes. */
/* NOTE(review): sampling gap — the 8-15 path's 64-bit ldr/str pair is missing;
   the 32-bit load/store below (B_lw is presumably a w-register alias) looks
   like it belongs to the 4..7-byte path, covering the tail from the end.  */
86 ldr B_lw, [srcend, -4]
88 str B_lw, [dstend, -4]
91 /* Copy 0..3 bytes using a branchless sequence. */
/* First, middle and last byte are written unconditionally; for count < 3 the
   stores simply overlap.  NOTE(review): tmp1's computation is on a missing
   line — presumably tmp1 = count >> 1, and a count==0 early-out precedes
   this — confirm against the full file.  */
96 ldrb C_lw, [srcend, -1]
97 ldrb B_lw, [src, tmp1]
99 strb B_lw, [dstin, tmp1]
100 strb C_lw, [dstend, -1]
105 /* Medium copies: 33..128 bytes. */
/* 33..64 bytes: four 16-byte pairs — two anchored at the start, two at the
   end, overlapping in the middle.  NOTE(review): the load of A_l/A_h from
   [src] is on a missing line.  All loads complete before any store, so this
   is safe even when the store to dstin would clobber later source bytes of a
   forward overlap within this range.  */
108 ldp B_l, B_h, [src, 16]
109 ldp C_l, C_h, [srcend, -32]
110 ldp D_l, D_h, [srcend, -16]
113 stp A_l, A_h, [dstin]
114 stp B_l, B_h, [dstin, 16]
115 stp C_l, C_h, [dstend, -32]
116 stp D_l, D_h, [dstend, -16]
120 /* Copy 65..128 bytes. */
/* Same end-anchored scheme widened to eight pairs: A,B,E,F cover bytes 0..63
   from the start; G,H,C,D cover the last 64 bytes from the end.  G/H are
   stored before the start-anchored pairs, interleaving loads with stores.  */
122 ldp E_l, E_h, [src, 32]
123 ldp F_l, F_h, [src, 48]
126 ldp G_l, G_h, [srcend, -64]
127 ldp H_l, H_h, [srcend, -48]
128 stp G_l, G_h, [dstend, -64]
129 stp H_l, H_h, [dstend, -48]
131 stp A_l, A_h, [dstin]
132 stp B_l, B_h, [dstin, 16]
133 stp E_l, E_h, [dstin, 32]
134 stp F_l, F_h, [dstin, 48]
135 stp C_l, C_h, [dstend, -32]
136 stp D_l, D_h, [dstend, -16]
140 /* Copy more than 128 bytes. */
143 /* Copy 16 bytes and then align dst to 16-byte alignment. */
/* NOTE(review): missing lines presumably compute tmp1 from dstin's low bits
   and set dst = 16-byte-aligned destination base; D_l/D_h were presumably
   loaded from [src] before this point — confirm against the full file.
   Because count is inflated by the alignment slack, the loop below copies
   the head bytes twice harmlessly rather than branching.  */
149 add count, count, tmp1 /* Count is now 16 too large. */
/* Software-pipelined preload: A..D hold the next 64 source bytes, and the
   writeback on the last ldp advances src by 64 for the steady-state loop.  */
150 ldp A_l, A_h, [src, 16]
151 stp D_l, D_h, [dstin]
152 ldp B_l, B_h, [src, 32]
153 ldp C_l, C_h, [src, 48]
154 ldp D_l, D_h, [src, 64]!
155 subs count, count, 128 + 16 /* Test and readjust count. */
156 b.ls .Lcopy64_from_end
/* Steady-state loop: store the 64 bytes loaded last iteration while loading
   the next 64; pre/post-index writeback advances dst and src by 64.
   NOTE(review): the loop label (presumably .Lloop64) and the back-branch
   after the subs (presumably b.hi .Lloop64) are on missing lines.  */
159 stp A_l, A_h, [dst, 16]
160 ldp A_l, A_h, [src, 16]
161 stp B_l, B_h, [dst, 32]
162 ldp B_l, B_h, [src, 32]
163 stp C_l, C_h, [dst, 48]
164 ldp C_l, C_h, [src, 48]
165 stp D_l, D_h, [dst, 64]!
166 ldp D_l, D_h, [src, 64]!
167 subs count, count, 64
170 /* Write the last iteration and copy 64 bytes from the end. */
/* NOTE(review): the .Lcopy64_from_end label itself is on a missing line.
   The in-flight A..D registers are flushed while the final 64 bytes are
   re-read relative to srcend, so the tail needs no remainder loop and may
   overlap the just-written data.  E temporarily holds srcend[-64..-49]
   because A is still live until its store at dst+16.  */
172 ldp E_l, E_h, [srcend, -64]
173 stp A_l, A_h, [dst, 16]
174 ldp A_l, A_h, [srcend, -48]
175 stp B_l, B_h, [dst, 32]
176 ldp B_l, B_h, [srcend, -32]
177 stp C_l, C_h, [dst, 48]
178 ldp C_l, C_h, [srcend, -16]
179 stp D_l, D_h, [dst, 64]
180 stp E_l, E_h, [dstend, -64]
181 stp A_l, A_h, [dstend, -48]
182 stp B_l, B_h, [dstend, -32]
183 stp C_l, C_h, [dstend, -16]
/* Mark symbol size for the ELF symbol table.  NOTE(review): the ret
   instructions terminating each path are on missing lines.  */
186 .size memcpy,.-memcpy