X-Git-Url: http://git.rot13.org/?a=blobdiff_plain;f=arch%2Fx86_64%2Flib%2Fmemset.S;h=2c5948116bd21731d4a85712fddd00d3943fd728;hb=da4a00f002239f72b0d7d0eeaa3b60100e2b1438;hp=2aa48f24ed1e48974680b5354ede86e826d5f0a4;hpb=1f8fc99300c6247cace008c470f31eb86e9e7802;p=powerpc.git

diff --git a/arch/x86_64/lib/memset.S b/arch/x86_64/lib/memset.S
index 2aa48f24ed..2c5948116b 100644
--- a/arch/x86_64/lib/memset.S
+++ b/arch/x86_64/lib/memset.S
@@ -1,4 +1,8 @@
 /* Copyright 2002 Andi Kleen, SuSE Labs */
+
+#include <linux/linkage.h>
+#include <asm/dwarf2.h>
+
 /*
  * ISO C memset - set a memory block to a byte value.
  *
@@ -8,24 +12,122 @@
  *
  * rax	original destination
  */
-	.globl __memset
-	.globl memset
-	.p2align 4
-memset:
-__memset:
+	ALIGN
+memset_c:
+	CFI_STARTPROC
 	movq %rdi,%r9
 	movl %edx,%r8d
-	andl $7,%r8d
+	andl $7,%r8d
 	movl %edx,%ecx
-	shrl $3,%ecx
+	shrl $3,%ecx
 	/* expand byte value */
 	movzbl %sil,%esi
 	movabs $0x0101010101010101,%rax
-	mulq %rsi		/* with rax, clobbers rdx */
-	rep
-	stosq
+	mulq %rsi		/* with rax, clobbers rdx */
+	rep stosq
 	movl %r8d,%ecx
-	rep
-	stosb
+	rep stosb
 	movq %r9,%rax
 	ret
+	CFI_ENDPROC
+ENDPROC(memset_c)
+
+ENTRY(memset)
+ENTRY(__memset)
+	CFI_STARTPROC
+	movq %rdi,%r10
+	movq %rdx,%r11
+
+	/* expand byte value */
+	movzbl %sil,%ecx
+	movabs $0x0101010101010101,%rax
+	mul %rcx		/* with rax, clobbers rdx */
+
+	/* align dst */
+	movl %edi,%r9d
+	andl $7,%r9d
+	jnz .Lbad_alignment
+	CFI_REMEMBER_STATE
+.Lafter_bad_alignment:
+
+	movl %r11d,%ecx
+	shrl $6,%ecx
+	jz .Lhandle_tail
+
+	.p2align 4
+.Lloop_64:
+	decl %ecx
+	movq %rax,(%rdi)
+	movq %rax,8(%rdi)
+	movq %rax,16(%rdi)
+	movq %rax,24(%rdi)
+	movq %rax,32(%rdi)
+	movq %rax,40(%rdi)
+	movq %rax,48(%rdi)
+	movq %rax,56(%rdi)
+	leaq 64(%rdi),%rdi
+	jnz .Lloop_64
+
+	/* Handle tail in loops. The loops should be faster than hard
+	   to predict jump tables. */
+	.p2align 4
+.Lhandle_tail:
+	movl %r11d,%ecx
+	andl $63&(~7),%ecx
+	jz .Lhandle_7
+	shrl $3,%ecx
+	.p2align 4
+.Lloop_8:
+	decl %ecx
+	movq %rax,(%rdi)
+	leaq 8(%rdi),%rdi
+	jnz .Lloop_8
+
+.Lhandle_7:
+	movl %r11d,%ecx
+	andl $7,%ecx
+	jz .Lende
+	.p2align 4
+.Lloop_1:
+	decl %ecx
+	movb %al,(%rdi)
+	leaq 1(%rdi),%rdi
+	jnz .Lloop_1
+
+.Lende:
+	movq %r10,%rax
+	ret
+
+	CFI_RESTORE_STATE
+.Lbad_alignment:
+	cmpq $7,%r11
+	jbe .Lhandle_7
+	movq %rax,(%rdi)	/* unaligned store */
+	movq $8,%r8
+	subq %r9,%r8
+	addq %r8,%rdi
+	subq %r8,%r11
+	jmp .Lafter_bad_alignment
+.Lfinal:
+	CFI_ENDPROC
+ENDPROC(memset)
+ENDPROC(__memset)
+
+	/* Some CPUs run faster using the string instructions.
+	   It is also a lot simpler. Use this when possible */
+
+#include <asm/cpufeature.h>
+
+	.section .altinstr_replacement,"ax"
+1:	.byte 0xeb				/* jmp <disp8> */
+	.byte (memset_c - memset) - (2f - 1b)	/* offset */
+2:
+	.previous
+	.section .altinstructions,"a"
+	.align 8
+	.quad memset
+	.quad 1b
+	.byte X86_FEATURE_REP_GOOD
+	.byte .Lfinal - memset
+	.byte 2b - 1b
+	.previous
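
Note on the "expand byte value" step shared by both the rep-string path and the
unrolled path: the fill byte is zero-extended and multiplied by
0x0101010101010101, which replicates it into every byte of a 64-bit word, so
each quadword store writes eight copies at once. A minimal C sketch of the same
trick (the function name is illustrative, not part of the patch):

	#include <stdint.h>

	/* Replicate the low byte of c into all eight bytes of a 64-bit word,
	   mirroring the movzbl + movabs + mul sequence above. */
	static uint64_t expand_byte(unsigned char c)
	{
		return (uint64_t)c * 0x0101010101010101ULL;
	}

	/* e.g. expand_byte(0xab) == 0xababababababababULL */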
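
The open-coded ENTRY(memset) path decomposes the fill in the usual way: one
unaligned quadword store to fix up a misaligned destination, then 64 bytes per
iteration, then 8-byte stores, then single bytes (the comment in the patch
prefers these small tail loops over a jump table because they predict well).
A C sketch of that control flow, with names that are illustrative only:

	#include <stddef.h>
	#include <stdint.h>
	#include <string.h>

	/* Sketch of the unrolled path: align dst, 64-byte blocks, 8-byte
	   blocks, byte tail -- the same order as .Lbad_alignment, .Lloop_64,
	   .Lloop_8 and .Lloop_1 in the patch. */
	static void *memset_sketch(void *dst, int c, size_t len)
	{
		unsigned char *p = dst;
		uint64_t v = (uint64_t)(unsigned char)c * 0x0101010101010101ULL;
		size_t i;

		if (len > 7 && ((uintptr_t)p & 7)) {
			size_t fix = 8 - ((uintptr_t)p & 7);
			memcpy(p, &v, 8);		/* unaligned 8-byte store */
			p += fix;
			len -= fix;
		}
		for (; len >= 64; len -= 64, p += 64)	/* .Lloop_64 */
			for (i = 0; i < 64; i += 8)
				memcpy(p + i, &v, 8);
		for (; len >= 8; len -= 8, p += 8)	/* .Lloop_8 */
			memcpy(p, &v, 8);
		while (len--)				/* .Lloop_1 */
			*p++ = (unsigned char)c;
		return dst;
	}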
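
The .altinstructions record at the end is what selects between the two
implementations: at boot, apply_alternatives() checks the CPU feature bit and,
if X86_FEATURE_REP_GOOD is set, patches the start of the unrolled memset with
the two-byte jmp to memset_c kept in .altinstr_replacement. The five fields
emitted here (.quad, .quad, .byte, .byte, .byte) line up with a record of
roughly the following shape; the struct is shown only for orientation, and its
exact name and padding in that kernel version are an assumption:

	typedef unsigned char u8;	/* kernel's u8 */

	/* Approximate layout of one .altinstructions entry as emitted above. */
	struct alt_instr {
		u8 *instr;		/* .quad memset: code to patch            */
		u8 *replacement;	/* .quad 1b: the short jmp to memset_c    */
		u8 cpuid;		/* .byte X86_FEATURE_REP_GOOD             */
		u8 instrlen;		/* .byte .Lfinal - memset: patchable size */
		u8 replacementlen;	/* .byte 2b - 1b: two bytes of jmp        */
	};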