nsz Git - musl/blob - src/string/aarch64/memcpy.S

   1 /*
   2  * memcpy - copy memory area
   3  *
   4  * Copyright (c) 2012-2020, Arm Limited.
   5  * SPDX-License-Identifier: MIT
   6  */
   7
   8 /* Assumptions:
   9  *
  10  * ARMv8-a, AArch64, unaligned accesses.
  11  *
  12  */
  13
  14 #define dstin   x0      /* Destination pointer as passed in; only ever read.  */
  15 #define src     x1      /* Source pointer; advanced by the large-copy path.  */
  16 #define count   x2      /* Number of bytes to copy.  */
  17 #define dst     x3      /* Working destination, dstin rounded down to 16 (large copies).  */
  18 #define srcend  x4      /* src + count, computed on entry.  */
  19 #define dstend  x5      /* dstin + count, computed on entry.  */
  20 #define A_l     x6      /* A..F: data registers; each X_l/X_h pair moves 16 bytes per ldp/stp.  */
  21 #define A_lw    w6      /* 32-bit view of A_l, for 4-byte and 1-byte accesses.  */
  22 #define A_h     x7
  23 #define B_l     x8
  24 #define B_lw    w8      /* 32-bit view of B_l.  */
  25 #define B_h     x9
  26 #define C_l     x10
  27 #define C_lw    w10     /* 32-bit view of C_l.  */
  28 #define C_h     x11
  29 #define D_l     x12
  30 #define D_h     x13
  31 #define E_l     x14
  32 #define E_h     x15
  33 #define F_l     x16
  34 #define F_h     x17
  35 #define G_l     count   /* G and H reuse the count/pointer registers as data registers, */
  36 #define G_h     dst     /* so loading them destroys count, dst, src and srcend.  They   */
  37 #define H_l     src     /* are loaded only on the 97..128 byte path, which afterwards   */
  38 #define H_h     srcend  /* addresses memory through dstin and dstend alone.             */
  39 #define tmp1    x14     /* Scratch; same register as E_l, never live at the same time.  */
  40
  41 /* This implementation of memcpy uses unaligned accesses and branchless
  42    sequences to keep the code small, simple and improve performance.
  43
  44    Copies are split into 3 main cases: small copies of up to 32 bytes, medium
  45    copies of up to 128 bytes, and large copies.  No overlap check is made, so
  46    the source and destination buffers are expected not to overlap.
  47
  48    Large copies use a software pipelined loop processing 64 bytes per iteration.
  49    The destination pointer is 16-byte aligned to minimize unaligned accesses.
  50    The loop tail is handled by always copying 64 bytes from the end.
  51 */
  52
  53 .global memcpy                              /* In:  x0 = dstin, x1 = src, x2 = count (bytes).  */
  54 .type memcpy,%function                      /* Out: x0 is never written, so it still holds dstin.  */
  55 memcpy:                                     /* Scratch: x1-x17 and the flags; no stack is used.  */
  56         add     srcend, src, count          /* srcend = one past the last source byte.  */
  57         add     dstend, dstin, count        /* dstend = one past the last destination byte.  */
  58         cmp     count, 128
  59         b.hi    .Lcopy_long                 /* count > 128 (unsigned compare).  */
  60         cmp     count, 32
  61         b.hi    .Lcopy32_128                /* 33 <= count <= 128.  */
  62
  63         /* Small copies: 0..32 bytes.  */
  64         cmp     count, 16
  65         b.lo    .Lcopy16                    /* count < 16.  */
  66         ldp     A_l, A_h, [src]             /* 16..32 bytes: first 16 bytes ...  */
  67         ldp     D_l, D_h, [srcend, -16]     /* ... and last 16 bytes; they overlap if count < 32.  */
  68         stp     A_l, A_h, [dstin]
  69         stp     D_l, D_h, [dstend, -16]
  70         ret
  71
  72         /* Copy 8-15 bytes as two 8-byte words that overlap when count < 16.  */
  73 .Lcopy16:
  74         tbz     count, 3, .Lcopy8           /* count < 16 here, so bit 3 clear means count < 8.  */
  75         ldr     A_l, [src]                  /* First 8 bytes.  */
  76         ldr     A_h, [srcend, -8]           /* Last 8 bytes.  */
  77         str     A_l, [dstin]
  78         str     A_h, [dstend, -8]
  79         ret
  80
  81         .p2align 3
  82         /* Copy 4-7 bytes as two 4-byte words that overlap when count < 8.  */
  83 .Lcopy8:
  84         tbz     count, 2, .Lcopy4           /* count < 8 here, so bit 2 clear means count < 4.  */
  85         ldr     A_lw, [src]                 /* First 4 bytes.  */
  86         ldr     B_lw, [srcend, -4]          /* Last 4 bytes.  */
  87         str     A_lw, [dstin]
  88         str     B_lw, [dstend, -4]
  89         ret
  90
  91         /* Copy 0..3 bytes using a branchless sequence.  */
  92 .Lcopy4:
  93         cbz     count, .Lcopy0              /* Zero bytes: nothing to do.  */
  94         lsr     tmp1, count, 1              /* tmp1 = count / 2: 0 for 1 byte, 1 for 2 or 3 bytes.  */
  95         ldrb    A_lw, [src]                 /* First byte.  */
  96         ldrb    C_lw, [srcend, -1]          /* Last byte.  */
  97         ldrb    B_lw, [src, tmp1]           /* Middle byte; coincides with first or last if count < 3.  */
  98         strb    A_lw, [dstin]
  99         strb    B_lw, [dstin, tmp1]
 100         strb    C_lw, [dstend, -1]
 101 .Lcopy0:
 102         ret
 103
 104         .p2align 4
 105         /* Medium copies: 33..128 bytes.  */
 106 .Lcopy32_128:
 107         ldp     A_l, A_h, [src]             /* A,B = first 32 source bytes.  */
 108         ldp     B_l, B_h, [src, 16]
 109         ldp     C_l, C_h, [srcend, -32]     /* C,D = last 32 source bytes.  */
 110         ldp     D_l, D_h, [srcend, -16]
 111         cmp     count, 64
 112         b.hi    .Lcopy128                   /* 65..128 bytes: more data must be loaded.  */
 113         stp     A_l, A_h, [dstin]           /* 33..64 bytes: the two 32-byte halves overlap if count < 64.  */
 114         stp     B_l, B_h, [dstin, 16]
 115         stp     C_l, C_h, [dstend, -32]
 116         stp     D_l, D_h, [dstend, -16]
 117         ret
 118
 119         .p2align 4
 120         /* Copy 65..128 bytes.  */
 121 .Lcopy128:
 122         ldp     E_l, E_h, [src, 32]         /* E,F = source bytes 32..63.  */
 123         ldp     F_l, F_h, [src, 48]
 124         cmp     count, 96
 125         b.ls    .Lcopy96                    /* 65..96 bytes: first 64 + last 32 cover the range.  */
 126         ldp     G_l, G_h, [srcend, -64]     /* 97..128 bytes: G,H = the 32 bytes before the last 32.  */
 127         ldp     H_l, H_h, [srcend, -48]     /* Overwrites src and srcend; neither is read again.  */
 128         stp     G_l, G_h, [dstend, -64]     /* First store on this path: every load is already done.  */
 129         stp     H_l, H_h, [dstend, -48]
 130 .Lcopy96:
 131         stp     A_l, A_h, [dstin]           /* First 64 bytes.  */
 132         stp     B_l, B_h, [dstin, 16]
 133         stp     E_l, E_h, [dstin, 32]
 134         stp     F_l, F_h, [dstin, 48]
 135         stp     C_l, C_h, [dstend, -32]     /* Last 32 bytes.  */
 136         stp     D_l, D_h, [dstend, -16]
 137         ret
 138
 139         .p2align 4
 140         /* Copy more than 128 bytes.  */
 141 .Lcopy_long:
 142
 143         /* Copy 16 bytes and then align dst to 16-byte alignment.  */
 144
 145         ldp     D_l, D_h, [src]             /* First 16 source bytes.  */
 146         and     tmp1, dstin, 15             /* tmp1 = misalignment of dstin (0..15).  */
 147         bic     dst, dstin, 15              /* dst = dstin rounded down to a multiple of 16.  */
 148         sub     src, src, tmp1              /* Back src up equally, so src+N pairs with dst+N.  */
 149         add     count, count, tmp1      /* Count is now 16 too large.  */
 150         ldp     A_l, A_h, [src, 16]         /* A..D = the 64 source bytes destined for dst+16..dst+79.  */
 151         stp     D_l, D_h, [dstin]           /* Store the first 16 bytes at the original dstin.  */
 152         ldp     B_l, B_h, [src, 32]
 153         ldp     C_l, C_h, [src, 48]
 154         ldp     D_l, D_h, [src, 64]!        /* Pre-index with writeback: src += 64.  */
 155         subs    count, count, 128 + 16  /* Test and readjust count.  */
 156         b.ls    .Lcopy64_from_end           /* At most 128 bytes remain past dst+16: skip the loop.  */
 157
 158 .Lloop64:
 159         stp     A_l, A_h, [dst, 16]         /* Store the 64 bytes loaded on the previous pass ...  */
 160         ldp     A_l, A_h, [src, 16]         /* ... while loading the next 64 into the same registers.  */
 161         stp     B_l, B_h, [dst, 32]
 162         ldp     B_l, B_h, [src, 32]
 163         stp     C_l, C_h, [dst, 48]
 164         ldp     C_l, C_h, [src, 48]
 165         stp     D_l, D_h, [dst, 64]!        /* Writeback: dst += 64.  */
 166         ldp     D_l, D_h, [src, 64]!        /* Writeback: src += 64.  */
 167         subs    count, count, 64
 168         b.hi    .Lloop64                    /* Repeat while more than 128 bytes remain past dst+16.  */
 169
 170         /* Write the last iteration and copy 64 bytes from the end.  */
 171 .Lcopy64_from_end:
 172         ldp     E_l, E_h, [srcend, -64]     /* Load the final 64 source bytes into E,A,B,C ...  */
 173         stp     A_l, A_h, [dst, 16]         /* ... storing each pending pair before it is reloaded.  */
 174         ldp     A_l, A_h, [srcend, -48]
 175         stp     B_l, B_h, [dst, 32]
 176         ldp     B_l, B_h, [srcend, -32]
 177         stp     C_l, C_h, [dst, 48]
 178         ldp     C_l, C_h, [srcend, -16]
 179         stp     D_l, D_h, [dst, 64]
 180         stp     E_l, E_h, [dstend, -64]     /* Final 64 bytes; may overlap bytes stored just above.  */
 181         stp     A_l, A_h, [dstend, -48]
 182         stp     B_l, B_h, [dstend, -32]
 183         stp     C_l, C_h, [dstend, -16]
 184         ret
 185
 186 .size memcpy,.-memcpy