nsz Git - musl/blob - src/string/arm/memcpy.S

   1 /*
   2  * Copyright (C) 2008 The Android Open Source Project
   3  * All rights reserved.
   4  *
   5  * Redistribution and use in source and binary forms, with or without
   6  * modification, are permitted provided that the following conditions
   7  * are met:
   8  *  * Redistributions of source code must retain the above copyright
   9  *    notice, this list of conditions and the following disclaimer.
  10  *  * Redistributions in binary form must reproduce the above copyright
  11  *    notice, this list of conditions and the following disclaimer in
  12  *    the documentation and/or other materials provided with the
  13  *    distribution.
  14  *
  15  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  16  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  17  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
  18  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
  19  * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
  20  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
  21  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
  22  * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
  23  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  24  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
  25  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  26  * SUCH DAMAGE.
  27  */
  28
  29
  30 /*
  31  * Optimized memcpy() for ARM.
  32  *
  33  * note that memcpy() always returns the destination pointer,
  34  * so we have to preserve R0.
  35   */
  36
  37 /*
  38  * This file has been modified from the original for use in musl libc.
  39  * The main changes are: addition of .type memcpy,%function to make the
  40  * code safely callable from thumb mode, adjusting the return
  41  * instructions to be compatible with pre-thumb ARM cpus, removal of
  42  * prefetch code that is not compatible with older cpus and support for
  43  * building as thumb 2 and big-endian.
  44  */
  45
  46 .syntax unified
  47
  48 .global memcpy
  49 .type memcpy,%function
  50 memcpy:
  51         /* The stack must always be 64-bits aligned to be compliant with the
  52          * ARM ABI. Since we have to save R0, we might as well save R4
  53          * which we can use for better pipelining of the reads below
              *
              * Register roles on entry, as used by the code below:
              *   r0 = destination pointer (saved here, reloaded on every return path)
              *   r1 = source pointer
              *   r2 = number of bytes to copy
              * The 12 bytes pushed here plus the 28-byte pad below give a
              * 40-byte frame, a multiple of 8, so the 64-bit alignment of sp
              * is left unchanged.
  54          */
  55         .fnstart
  56         .save       {r0, r4, lr}
  57         stmfd       sp!, {r0, r4, lr}
  58         /* Making room for r5-r11 (7 words = 28 bytes) which will be spilled later */
  59         .pad        #28
  60         sub         sp, sp, #28
  61
  62         /* it simplifies things to take care of len<4 early */
  63         cmp     r2, #4
  64         blo     copy_last_3_and_return
  65
  66         /* compute the offset to align the source
  67          * offset = (4-(src&3))&3 = -src & 3
  68          */
  69         rsb     r3, r1, #0
  70         ands    r3, r3, #3
  71         beq     src_aligned                     /* source already word-aligned */
  72
  73         /* align source to 32 bits. We need to insert 2 instructions between
  74          * a ldr[b|h] and str[b|h] because byte and half-word instructions
  75          * stall 2 cycles.
              *
              * The movs below moves bit 0 of r3 into N and bit 1 of r3 into C,
              * so the "mi" load/store pair copies one byte and the two "cs"
              * pairs copy two bytes (1 to 3 bytes in total).
  76          */
  77         movs    r12, r3, lsl #31
  78         sub     r2, r2, r3              /* we know that r3 <= r2 because r2 >= 4 */
  79         ldrbmi r3, [r1], #1
  80         ldrbcs r4, [r1], #1
  81         ldrbcs r12,[r1], #1
  82         strbmi r3, [r0], #1
  83         strbcs r4, [r0], #1
  84         strbcs r12,[r0], #1
  85
  86 src_aligned:
  87
  88         /* see if src and dst are aligned together (congruent):
              * the low two bits of r0 and r1 must be equal
              */
  89         eor     r12, r0, r1
  90         tst     r12, #3
  91         bne     non_congruent
  92
  93         /* Use post-increment mode for stm to spill r5-r11 to reserved stack
  94          * frame. Don't update sp.
  95          */
  96         stmea   sp, {r5-r11}
  97
  98         /* align the destination to a cache-line:
              * r3 = (-dst) & 0x1C is the number of bytes (a multiple of 4) up to
              * the next 32-byte boundary; if that exceeds the remaining length,
              * r3 is reduced to the whole words available in r2.
              */
  99         rsb     r3, r0, #0
 100         ands    r3, r3, #0x1C
 101         beq     congruent_aligned32
 102         cmp     r3, r2
 103         andhi   r3, r2, #0x1C
 104
 105         /* conditionally copies 0 to 7 words (length in r3).
              * movs puts bit 4 of r3 in C (16 bytes) and bit 3 in N (8 bytes);
              * bit 2 (4 bytes) is tested separately.
              */
 106         movs    r12, r3, lsl #28
 107         ldmcs   r1!, {r4, r5, r6, r7}           /* 16 bytes */
 108         ldmmi   r1!, {r8, r9}                   /*  8 bytes */
 109         stmcs   r0!, {r4, r5, r6, r7}
 110         stmmi   r0!, {r8, r9}
 111         tst     r3, #0x4
 112         ldrne   r10,[r1], #4                    /*  4 bytes */
 113         strne   r10,[r0], #4
 114         sub     r2, r2, r3
 115
 116 congruent_aligned32:
 117         /*
 118          * here the destination is aligned to 32 bytes, unless the length
              * ran out first (in which case fewer than 4 bytes remain).
 119          */
 120
 121 cached_aligned32:
 122         subs    r2, r2, #32
 123         blo     less_than_32_left
 124
 125         /*
 126          * We preload a cache-line up to 64 bytes ahead. On the 926, this will
 127          * stall only until the requested word is fetched, but the linefill
 128          * continues in the background.
 129          * While the linefill is going, we write our previous cache-line
 130          * into the write-buffer (which should have some free space).
 131          * When the linefill is done, the writebuffer will
 132          * start dumping its content into memory
 133          *
 134          * While all this is going, we then load a full cache line into
 135          * 8 registers, this cache line should be in the cache by now
 136          * (or partly in the cache).
 137          *
 138          * This code should work well regardless of the source/dest alignment.
 139          *
              * NOTE: the preload instructions themselves are commented out
              * below, so the loop is a plain 32-byte ldmia/stmia copy.
 140          */
 141
 142         /* Align the preload register to a cache-line because the cpu does
 143          * "critical word first" (the first word requested is loaded first).
 144          */
 145         @ bic           r12, r1, #0x1F
 146         @ add           r12, r12, #64
 147
 148 1:      ldmia   r1!, { r4-r11 }
 149         subs    r2, r2, #32
 150
 151         /*
 152          * NOTE: if r12 is more than 64 ahead of r1, the following ldrhi
 153          * for ARM9 preload will not be safely guarded by the preceding subs.
 154          * When it is safely guarded the only possibility to have SIGSEGV here
 155          * is because the caller overstates the length.
 156          */
 157         @ ldrhi         r3, [r12], #32      /* cheap ARM9 preload */
 158         stmia   r0!, { r4-r11 }
 159         bhs     1b                              /* flags still come from the subs above */
 160
 161         add     r2, r2, #32                     /* undo the final subs: r2 = bytes left */
 162
 163 less_than_32_left:
 164         /*
 165          * less than 32 bytes left at this point (length in r2).
              * Only the low 5 bits of r2 are examined below; when this label is
              * reached through the blo above, r2 has had 32 subtracted from it,
              * which leaves those bits unchanged.
 166          */
 167
 168         /* skip all this if there is nothing to do, which should
 169          * be a common case (if not executed the code below takes
 170          * about 16 cycles)
 171          */
 172         tst     r2, #0x1F
 173         beq     1f
 174
 175         /* conditionally copies 0 to 31 bytes.
              * First movs: bit 4 of r2 -> C (16 bytes), bit 3 -> N (8 bytes).
              * Second movs: bit 2 of r2 -> C (4 bytes), bit 1 -> N (2 bytes).
              * Bit 0 (the last byte) is tested separately.
              */
 176         movs    r12, r2, lsl #28
 177         ldmcs   r1!, {r4, r5, r6, r7}           /* 16 bytes */
 178         ldmmi   r1!, {r8, r9}                   /*  8 bytes */
 179         stmcs   r0!, {r4, r5, r6, r7}
 180         stmmi   r0!, {r8, r9}
 181         movs    r12, r2, lsl #30
 182         ldrcs   r3, [r1], #4                    /*  4 bytes */
 183         ldrhmi r4, [r1], #2                     /*  2 bytes */
 184         strcs   r3, [r0], #4
 185         strhmi r4, [r0], #2
 186         tst     r2, #0x1
 187         ldrbne r3, [r1]                         /*  last byte  */
 188         strbne r3, [r0]
 189
 190         /* we're done! restore everything and return.
              * sp was not moved when r5-r11 were spilled, so the first ldmfd both
              * reloads them and releases the 28-byte area; the second reloads the
              * original destination into r0 together with r4 and lr.
              */
 191 1:      ldmfd   sp!, {r5-r11}
 192         ldmfd   sp!, {r0, r4, lr}
 193         bx      lr
 194
 195         /********************************************************************/
 196
 197 non_congruent:
 198         /*
 199          * here source is aligned to 4 bytes
 200          * but destination is not.
 201          *
 202          * in the code below r2 counts the source bytes not yet read
 203          * (fewer bytes have been written than read at any time, because we
 204          * hold a partial word in the shift queue, r3)
 205          */
 206         cmp     r2, #4
 207         blo     copy_last_3_and_return
 208
 209         /* Use post-increment mode for stm to spill r5-r11 to reserved stack
 210          * frame. Don't update sp.
 211          */
 212         stmea   sp, {r5-r11}
 213
 214         /* compute shifts needed to align src to dest */
 215         rsb     r5, r0, #0
 216         and     r5, r5, #3                      /* r5 = # bytes in partial words */
 217         mov     r12, r5, lsl #3         /* r12 = right (shift count in bits = 8 * r5) */
 218         rsb     lr, r12, #32            /* lr = left (32 - right) */
 219
 220         /* read the first word */
 221         ldr     r3, [r1], #4
 222         sub     r2, r2, #4
 223
 224         /* write a partial word (0 to 3 bytes), such that destination
 225          * becomes aligned to 32 bits (r5 = number of bytes to copy for alignment).
              * movs puts bit 0 of r5 in N (one byte, "mi") and bit 1 in C (two
              * bytes, "cs"); r3 is shifted/rotated by one byte after each store.
 226          */
 227         movs    r5, r5, lsl #31
 228
 229 #if __ARMEB__
 230         movmi   r3, r3, ror #24
 231         strbmi  r3, [r0], #1
 232         movcs   r3, r3, ror #24
 233         strbcs  r3, [r0], #1
 234         movcs   r3, r3, ror #24
 235         strbcs  r3, [r0], #1
 236 #else
 237         strbmi r3, [r0], #1
 238         movmi   r3, r3, lsr #8
 239         strbcs r3, [r0], #1
 240         movcs   r3, r3, lsr #8
 241         strbcs r3, [r0], #1
 242         movcs   r3, r3, lsr #8
 243 #endif
 244
 245         cmp     r2, #4
 246         blo     partial_word_tail               /* no full source word left to read */
 247
 248 #if __ARMEB__
 249         mov     r3, r3, lsr r12                 /* shift right then left by r12: */
 250         mov     r3, r3, lsl r12                 /* clears the low r12 bits of r3 */
 251 #endif
 252
 253         /* Align destination to 32 bytes (cache line boundary).
              * Each pass reads one source word into r5, combines part of it with
              * the pending bytes in r3 to form one aligned output word in r4,
              * and keeps the rest of r5 in r3 for the next pass.
              */
 254 1:      tst     r0, #0x1c
 255         beq     2f
 256         ldr     r5, [r1], #4
 257         sub     r2, r2, #4
 258 #if __ARMEB__
 259         mov     r4, r5,                 lsr lr
 260         orr     r4, r4, r3
 261         mov     r3, r5,                 lsl r12
 262 #else
 263         mov     r4, r5,                 lsl lr
 264         orr     r4, r4, r3
 265         mov     r3, r5,                 lsr r12
 266 #endif
 267         str     r4, [r0], #4
 268         cmp     r2, #4
 269         bhs     1b
 270         blo     partial_word_tail
 271
 272         /* copy 32 bytes at a time */
 273 2:      subs    r2, r2, #32
 274         blo     less_than_thirtytwo
 275
 276         /* Use immediate mode for the shifts, because there is an extra cycle
 277          * for register shifts, which could account for up to 50% of
 278          * performance hit.
              *
              * Dispatch on the right-shift count in r12: 24 -> loop24,
              * 8 -> loop8, anything else falls through into loop16.
 279          */
 280
 281         cmp     r12, #24
 282         beq     loop24
 283         cmp     r12, #8
 284         beq     loop8
 285
 286 loop16:                                 /* 32 bytes per round, immediate 16-bit shifts */
 287         ldr     r12, [r1], #4           /* r12 now carries data, no longer the shift count */
 288 1:      mov     r4, r12
 289         ldmia   r1!, {   r5,r6,r7,  r8,r9,r10,r11}
 290         subs    r2, r2, #32
 291         ldrhs   r12, [r1], #4           /* another full round follows: fetch its first word now */
 292 #if __ARMEB__
 293         orr     r3, r3, r4, lsr #16     /* complete the pending word in r3 with part of r4 */
 294         mov     r4, r4, lsl #16
 295         orr     r4, r4, r5, lsr #16
 296         mov     r5, r5, lsl #16
 297         orr     r5, r5, r6, lsr #16
 298         mov     r6, r6, lsl #16
 299         orr     r6, r6, r7, lsr #16
 300         mov     r7, r7, lsl #16
 301         orr     r7, r7, r8, lsr #16
 302         mov     r8, r8, lsl #16
 303         orr     r8, r8, r9, lsr #16
 304         mov     r9, r9, lsl #16
 305         orr     r9, r9, r10, lsr #16
 306         mov     r10, r10,               lsl #16
 307         orr     r10, r10, r11, lsr #16
 308         stmia   r0!, {r3,r4,r5,r6, r7,r8,r9,r10}
 309         mov     r3, r11, lsl #16        /* rest of r11 becomes the new pending word */
 310 #else
 311         orr     r3, r3, r4, lsl #16     /* complete the pending word in r3 with part of r4 */
 312         mov     r4, r4, lsr #16
 313         orr     r4, r4, r5, lsl #16
 314         mov     r5, r5, lsr #16
 315         orr     r5, r5, r6, lsl #16
 316         mov     r6, r6, lsr #16
 317         orr     r6, r6, r7, lsl #16
 318         mov     r7, r7, lsr #16
 319         orr     r7, r7, r8, lsl #16
 320         mov     r8, r8, lsr #16
 321         orr     r8, r8, r9, lsl #16
 322         mov     r9, r9, lsr #16
 323         orr     r9, r9, r10, lsl #16
 324         mov     r10, r10,               lsr #16
 325         orr     r10, r10, r11, lsl #16
 326         stmia   r0!, {r3,r4,r5,r6, r7,r8,r9,r10}
 327         mov     r3, r11, lsr #16        /* rest of r11 becomes the new pending word */
 328 #endif
 329         bhs     1b                      /* flags are still those of the subs above */
 330         b       less_than_thirtytwo
 331
 332 loop8:                                  /* 32 bytes per round, immediate shifts for r12 == 8 */
 333         ldr     r12, [r1], #4           /* r12 now carries data, no longer the shift count */
 334 1:      mov     r4, r12
 335         ldmia   r1!, {   r5,r6,r7,  r8,r9,r10,r11}
 336         subs    r2, r2, #32
 337         ldrhs   r12, [r1], #4           /* another full round follows: fetch its first word now */
 338 #if __ARMEB__
 339         orr     r3, r3, r4, lsr #24     /* complete the pending word in r3 with part of r4 */
 340         mov     r4, r4, lsl #8
 341         orr     r4, r4, r5, lsr #24
 342         mov     r5, r5, lsl #8
 343         orr     r5, r5, r6, lsr #24
 344         mov     r6, r6,  lsl #8
 345         orr     r6, r6, r7, lsr #24
 346         mov     r7, r7,  lsl #8
 347         orr     r7, r7, r8,             lsr #24
 348         mov     r8, r8,  lsl #8
 349         orr     r8, r8, r9,             lsr #24
 350         mov     r9, r9,  lsl #8
 351         orr     r9, r9, r10,    lsr #24
 352         mov     r10, r10, lsl #8
 353         orr     r10, r10, r11,  lsr #24
 354         stmia   r0!, {r3,r4,r5,r6, r7,r8,r9,r10}
 355         mov     r3, r11, lsl #8         /* rest of r11 becomes the new pending word */
 356 #else
 357         orr     r3, r3, r4, lsl #24     /* complete the pending word in r3 with part of r4 */
 358         mov     r4, r4, lsr #8
 359         orr     r4, r4, r5, lsl #24
 360         mov     r5, r5, lsr #8
 361         orr     r5, r5, r6, lsl #24
 362         mov     r6, r6,  lsr #8
 363         orr     r6, r6, r7, lsl #24
 364         mov     r7, r7,  lsr #8
 365         orr     r7, r7, r8,             lsl #24
 366         mov     r8, r8,  lsr #8
 367         orr     r8, r8, r9,             lsl #24
 368         mov     r9, r9,  lsr #8
 369         orr     r9, r9, r10,    lsl #24
 370         mov     r10, r10, lsr #8
 371         orr     r10, r10, r11,  lsl #24
 372         stmia   r0!, {r3,r4,r5,r6, r7,r8,r9,r10}
 373         mov     r3, r11, lsr #8         /* rest of r11 becomes the new pending word */
 374 #endif
 375         bhs     1b                      /* flags are still those of the subs above */
 376         b       less_than_thirtytwo
 377
 378 loop24:                                 /* 32 bytes per round, immediate shifts for r12 == 24 */
 379         ldr     r12, [r1], #4           /* r12 now carries data, no longer the shift count */
 380 1:      mov     r4, r12
 381         ldmia   r1!, {   r5,r6,r7,  r8,r9,r10,r11}
 382         subs    r2, r2, #32
 383         ldrhs   r12, [r1], #4           /* another full round follows: fetch its first word now */
 384 #if __ARMEB__
 385         orr     r3, r3, r4, lsr #8      /* complete the pending word in r3 with part of r4 */
 386         mov     r4, r4, lsl #24
 387         orr     r4, r4, r5, lsr #8
 388         mov     r5, r5, lsl #24
 389         orr     r5, r5, r6, lsr #8
 390         mov     r6, r6, lsl #24
 391         orr     r6, r6, r7, lsr #8
 392         mov     r7, r7, lsl #24
 393         orr     r7, r7, r8, lsr #8
 394         mov     r8, r8, lsl #24
 395         orr     r8, r8, r9, lsr #8
 396         mov     r9, r9, lsl #24
 397         orr     r9, r9, r10, lsr #8
 398         mov     r10, r10, lsl #24
 399         orr     r10, r10, r11, lsr #8
 400         stmia   r0!, {r3,r4,r5,r6, r7,r8,r9,r10}
 401         mov     r3, r11, lsl #24        /* rest of r11 becomes the new pending word */
 402 #else
 403         orr     r3, r3, r4, lsl #8      /* complete the pending word in r3 with part of r4 */
 404         mov     r4, r4, lsr #24
 405         orr     r4, r4, r5, lsl #8
 406         mov     r5, r5, lsr #24
 407         orr     r5, r5, r6, lsl #8
 408         mov     r6, r6, lsr #24
 409         orr     r6, r6, r7, lsl #8
 410         mov     r7, r7, lsr #24
 411         orr     r7, r7, r8, lsl #8
 412         mov     r8, r8, lsr #24
 413         orr     r8, r8, r9, lsl #8
 414         mov     r9, r9, lsr #24
 415         orr     r9, r9, r10, lsl #8
 416         mov     r10, r10, lsr #24
 417         orr     r10, r10, r11, lsl #8
 418         stmia   r0!, {r3,r4,r5,r6, r7,r8,r9,r10}
 419         mov     r3, r11, lsr #24        /* rest of r11 becomes the new pending word */
 420 #endif
 421         bhs     1b                      /* on exit, falls through into less_than_thirtytwo */
 422
 423 less_than_thirtytwo:
 424         /* copy the last 0 to 31 bytes of the source, one word per pass,
              * leaving fewer than 4 unread source bytes plus the pending
              * partial word in r3 for partial_word_tail
              */
 425         rsb     r12, lr, #32            /* we corrupted r12, recompute it  */
 426         add     r2, r2, #32             /* undo the subs #32 that led here */
 427         cmp     r2, #4
 428         blo     partial_word_tail
 429
 430 1:      ldr     r5, [r1], #4
 431         sub     r2, r2, #4
 432 #if __ARMEB__
 433         mov     r4, r5,                 lsr lr
 434         orr     r4, r4, r3
 435         mov     r3,     r5,                     lsl r12
 436 #else
 437         mov     r4, r5,                 lsl lr
 438         orr     r4, r4, r3                      /* r4 = pending bytes + part of new word */
 439         mov     r3,     r5,                     lsr r12         /* rest of r5 stays pending */
 440 #endif
 441         str     r4, [r0], #4
 442         cmp     r2, #4
 443         bhs     1b
 444
 445 partial_word_tail:
 446         /* we have a partial word in the input buffer (held in r3).
              * lr is the "left" shift count in bits; shifting it left by 28
              * moves its bit 3 into N and its bit 4 into C, so the "mi" store
              * writes one byte and the "cs" stores write two.
              */
 447         movs    r5, lr, lsl #(31-3)
 448 #if __ARMEB__
 449         movmi   r3, r3, ror #24
 450         strbmi r3, [r0], #1
 451         movcs   r3, r3, ror #24
 452         strbcs r3, [r0], #1
 453         movcs   r3, r3, ror #24
 454         strbcs r3, [r0], #1
 455 #else
 456         strbmi r3, [r0], #1
 457         movmi   r3, r3, lsr #8
 458         strbcs r3, [r0], #1
 459         movcs   r3, r3, lsr #8
 460         strbcs r3, [r0], #1
 461 #endif
 462
 463         /* Refill spilled registers from the stack. Don't update sp;
              * the 28-byte area is released by the code that follows.
              */
 464         ldmfd   sp, {r5-r11}
 465
 466 copy_last_3_and_return:
 467         movs    r2, r2, lsl #31 /* copy remaining 0, 1, 2 or 3 bytes: bit 0 of r2 -> N, bit 1 -> C */
 468         ldrbmi r2, [r1], #1     /* r2 is reused as a data register; the flags are already set */
 469         ldrbcs r3, [r1], #1
 470         ldrbcs r12,[r1]
 471         strbmi r2, [r0], #1
 472         strbcs r3, [r0], #1
 473         strbcs r12,[r0]
 474
 475         /* we're done! restore sp and spilled registers and return.
              * The add releases the 28-byte spill area; the ldmfd reloads the
              * original destination pointer into r0 together with r4 and lr.
              */
 476         add     sp,  sp, #28
 477         ldmfd   sp!, {r0, r4, lr}
 478         bx      lr
 479