Diffstat (limited to 'arch/x86_64/lib')
-rw-r--r--   arch/x86_64/lib/Makefile      |  2
-rw-r--r--   arch/x86_64/lib/clear_page.S  | 38
-rw-r--r--   arch/x86_64/lib/copy_page.S   | 87
-rw-r--r--   arch/x86_64/lib/copy_user.S   | 79
-rw-r--r--   arch/x86_64/lib/delay.c       |  2
-rw-r--r--   arch/x86_64/lib/iomap_copy.S  | 26
-rw-r--r--   arch/x86_64/lib/memcpy.S      | 93
-rw-r--r--   arch/x86_64/lib/memset.S      | 94
-rw-r--r--   arch/x86_64/lib/usercopy.c    | 12
9 files changed, 389 insertions, 44 deletions
diff --git a/arch/x86_64/lib/Makefile b/arch/x86_64/lib/Makefile index bba5db6..ccef6ae 100644 --- a/arch/x86_64/lib/Makefile +++ b/arch/x86_64/lib/Makefile @@ -4,7 +4,7 @@ CFLAGS_csum-partial.o := -funroll-loops -obj-y := io.o +obj-y := io.o iomap_copy.o lib-y := csum-partial.o csum-copy.o csum-wrappers.o delay.o \ usercopy.o getuser.o putuser.o \ diff --git a/arch/x86_64/lib/clear_page.S b/arch/x86_64/lib/clear_page.S index 43d9fa1..1f81b79 100644 --- a/arch/x86_64/lib/clear_page.S +++ b/arch/x86_64/lib/clear_page.S @@ -5,8 +5,46 @@ .globl clear_page .p2align 4 clear_page: + xorl %eax,%eax + movl $4096/64,%ecx + .p2align 4 +.Lloop: + decl %ecx +#define PUT(x) movq %rax,x*8(%rdi) + movq %rax,(%rdi) + PUT(1) + PUT(2) + PUT(3) + PUT(4) + PUT(5) + PUT(6) + PUT(7) + leaq 64(%rdi),%rdi + jnz .Lloop + nop + ret +clear_page_end: + + /* Some CPUs run faster using the string instructions. + It is also a lot simpler. Use this when possible */ + +#include <asm/cpufeature.h> + + .section .altinstructions,"a" + .align 8 + .quad clear_page + .quad clear_page_c + .byte X86_FEATURE_REP_GOOD + .byte clear_page_end-clear_page + .byte clear_page_c_end-clear_page_c + .previous + + .section .altinstr_replacement,"ax" +clear_page_c: movl $4096/8,%ecx xorl %eax,%eax rep stosq ret +clear_page_c_end: + .previous diff --git a/arch/x86_64/lib/copy_page.S b/arch/x86_64/lib/copy_page.S index 621a197..8fa19d9 100644 --- a/arch/x86_64/lib/copy_page.S +++ b/arch/x86_64/lib/copy_page.S @@ -8,7 +8,94 @@ .globl copy_page .p2align 4 copy_page: + subq $3*8,%rsp + movq %rbx,(%rsp) + movq %r12,1*8(%rsp) + movq %r13,2*8(%rsp) + + movl $(4096/64)-5,%ecx + .p2align 4 +.Loop64: + dec %rcx + + movq (%rsi), %rax + movq 8 (%rsi), %rbx + movq 16 (%rsi), %rdx + movq 24 (%rsi), %r8 + movq 32 (%rsi), %r9 + movq 40 (%rsi), %r10 + movq 48 (%rsi), %r11 + movq 56 (%rsi), %r12 + + prefetcht0 5*64(%rsi) + + movq %rax, (%rdi) + movq %rbx, 8 (%rdi) + movq %rdx, 16 (%rdi) + movq %r8, 24 (%rdi) + movq %r9, 32 (%rdi) + movq %r10, 40 (%rdi) + movq %r11, 48 (%rdi) + movq %r12, 56 (%rdi) + + leaq 64 (%rsi), %rsi + leaq 64 (%rdi), %rdi + + jnz .Loop64 + + movl $5,%ecx + .p2align 4 +.Loop2: + decl %ecx + + movq (%rsi), %rax + movq 8 (%rsi), %rbx + movq 16 (%rsi), %rdx + movq 24 (%rsi), %r8 + movq 32 (%rsi), %r9 + movq 40 (%rsi), %r10 + movq 48 (%rsi), %r11 + movq 56 (%rsi), %r12 + + movq %rax, (%rdi) + movq %rbx, 8 (%rdi) + movq %rdx, 16 (%rdi) + movq %r8, 24 (%rdi) + movq %r9, 32 (%rdi) + movq %r10, 40 (%rdi) + movq %r11, 48 (%rdi) + movq %r12, 56 (%rdi) + + leaq 64(%rdi),%rdi + leaq 64(%rsi),%rsi + + jnz .Loop2 + + movq (%rsp),%rbx + movq 1*8(%rsp),%r12 + movq 2*8(%rsp),%r13 + addq $3*8,%rsp + ret + + /* Some CPUs run faster using the string copy instructions. + It is also a lot simpler. 
Use this when possible */ + +#include <asm/cpufeature.h> + + .section .altinstructions,"a" + .align 8 + .quad copy_page + .quad copy_page_c + .byte X86_FEATURE_REP_GOOD + .byte copy_page_c_end-copy_page_c + .byte copy_page_c_end-copy_page_c + .previous + + .section .altinstr_replacement,"ax" +copy_page_c: movl $4096/8,%ecx rep movsq ret +copy_page_c_end: + .previous diff --git a/arch/x86_64/lib/copy_user.S b/arch/x86_64/lib/copy_user.S index dfa358b..f64569b 100644 --- a/arch/x86_64/lib/copy_user.S +++ b/arch/x86_64/lib/copy_user.S @@ -5,7 +5,7 @@ */ #define FIX_ALIGNMENT 1 - + #include <asm/current.h> #include <asm/asm-offsets.h> #include <asm/thread_info.h> @@ -21,7 +21,7 @@ copy_to_user: jc bad_to_user cmpq threadinfo_addr_limit(%rax),%rcx jae bad_to_user -2: +2: .byte 0xe9 /* 32bit jump */ .long .Lcug-1f 1: @@ -34,7 +34,7 @@ copy_to_user: .align 8 .quad 2b .quad 3b - .byte X86_FEATURE_K8_C + .byte X86_FEATURE_REP_GOOD .byte 5 .byte 5 .previous @@ -75,12 +75,12 @@ bad_to_user: * Output: * eax uncopied bytes or 0 if successful. */ - .globl copy_user_generic + .globl copy_user_generic .p2align 4 -copy_user_generic: - .byte 0x66,0x66,0x90 /* 5 byte nop for replacement jump */ +copy_user_generic: + .byte 0x66,0x66,0x90 /* 5 byte nop for replacement jump */ .byte 0x66,0x90 -1: +1: .section .altinstr_replacement,"ax" 2: .byte 0xe9 /* near jump with 32bit immediate */ .long copy_user_generic_c-1b /* offset */ @@ -89,11 +89,11 @@ copy_user_generic: .align 8 .quad copy_user_generic .quad 2b - .byte X86_FEATURE_K8_C + .byte X86_FEATURE_REP_GOOD .byte 5 .byte 5 .previous -.Lcug: +.Lcug: pushq %rbx xorl %eax,%eax /*zero for the exception handler */ @@ -107,11 +107,11 @@ copy_user_generic: movq %rdx,%rcx - movl $64,%ebx + movl $64,%ebx shrq $6,%rdx decq %rdx js .Lhandle_tail - + .p2align 4 .Lloop: .Ls1: movq (%rsi),%r11 @@ -122,7 +122,7 @@ copy_user_generic: .Ld2: movq %r8,1*8(%rdi) .Ld3: movq %r9,2*8(%rdi) .Ld4: movq %r10,3*8(%rdi) - + .Ls5: movq 4*8(%rsi),%r11 .Ls6: movq 5*8(%rsi),%r8 .Ls7: movq 6*8(%rsi),%r9 @@ -131,12 +131,12 @@ copy_user_generic: .Ld6: movq %r8,5*8(%rdi) .Ld7: movq %r9,6*8(%rdi) .Ld8: movq %r10,7*8(%rdi) - + decq %rdx leaq 64(%rsi),%rsi leaq 64(%rdi),%rdi - + jns .Lloop .p2align 4 @@ -154,9 +154,9 @@ copy_user_generic: leaq 8(%rdi),%rdi leaq 8(%rsi),%rsi jnz .Lloop_8 - -.Lhandle_7: - movl %edx,%ecx + +.Lhandle_7: + movl %edx,%ecx andl $7,%ecx jz .Lende .p2align 4 @@ -167,12 +167,12 @@ copy_user_generic: incq %rsi decl %ecx jnz .Lloop_1 - + .Lende: popq %rbx - ret + ret -#ifdef FIX_ALIGNMENT +#ifdef FIX_ALIGNMENT /* align destination */ .p2align 4 .Lbad_alignment: @@ -182,7 +182,7 @@ copy_user_generic: cmpq %r9,%rdx jz .Lhandle_7 js .Lhandle_7 -.Lalign_1: +.Lalign_1: .Ls11: movb (%rsi),%bl .Ld11: movb %bl,(%rdi) incq %rsi @@ -192,14 +192,14 @@ copy_user_generic: subq %r9,%rdx jmp .Lafter_bad_alignment #endif - - /* table sorted by exception address */ + + /* table sorted by exception address */ .section __ex_table,"a" .align 8 .quad .Ls1,.Ls1e .quad .Ls2,.Ls2e .quad .Ls3,.Ls3e - .quad .Ls4,.Ls4e + .quad .Ls4,.Ls4e .quad .Ld1,.Ls1e .quad .Ld2,.Ls2e .quad .Ld3,.Ls3e @@ -207,7 +207,7 @@ copy_user_generic: .quad .Ls5,.Ls5e .quad .Ls6,.Ls6e .quad .Ls7,.Ls7e - .quad .Ls8,.Ls8e + .quad .Ls8,.Ls8e .quad .Ld5,.Ls5e .quad .Ld6,.Ls6e .quad .Ld7,.Ls7e @@ -216,16 +216,16 @@ copy_user_generic: .quad .Ld9,.Le_quad .quad .Ls10,.Le_byte .quad .Ld10,.Le_byte -#ifdef FIX_ALIGNMENT +#ifdef FIX_ALIGNMENT .quad .Ls11,.Lzero_rest .quad .Ld11,.Lzero_rest #endif .quad .Le5,.Le_zero .previous - /* 
compute 64-offset for main loop. 8 bytes accuracy with error on the - pessimistic side. this is gross. it would be better to fix the - interface. */ + /* compute 64-offset for main loop. 8 bytes accuracy with error on the + pessimistic side. this is gross. it would be better to fix the + interface. */ /* eax: zero, ebx: 64 */ .Ls1e: addl $8,%eax .Ls2e: addl $8,%eax @@ -255,25 +255,32 @@ copy_user_generic: movq %rdx,%rcx .Le_byte: xorl %eax,%eax -.Le5: rep +.Le5: rep stosb /* when there is another exception while zeroing the rest just return */ -.Le_zero: +.Le_zero: movq %rdx,%rax jmp .Lende - /* C stepping K8 run faster using the string copy instructions. + /* Some CPUs run faster using the string copy instructions. This is also a lot simpler. Use them when possible. Patch in jmps to this code instead of copying it fully to avoid unwanted aliasing in the exception tables. */ - + /* rdi destination * rsi source * rdx count * - * Output: + * Output: * eax uncopied bytes or 0 if successfull. - */ + * + * Only 4GB of copy is supported. This shouldn't be a problem + * because the kernel normally only writes from/to page sized chunks + * even if user space passed a longer buffer. + * And more would be dangerous because both Intel and AMD have + * errata with rep movsq > 4GB. If someone feels the need to fix + * this please consider this. + */ copy_user_generic_c: movl %edx,%ecx shrl $3,%ecx @@ -287,7 +294,7 @@ copy_user_generic_c: ret 3: lea (%rdx,%rcx,8),%rax ret - + .section __ex_table,"a" .quad 1b,3b .quad 2b,4b diff --git a/arch/x86_64/lib/delay.c b/arch/x86_64/lib/delay.c index 841bd73..03c460c 100644 --- a/arch/x86_64/lib/delay.c +++ b/arch/x86_64/lib/delay.c @@ -39,7 +39,7 @@ void __delay(unsigned long loops) inline void __const_udelay(unsigned long xloops) { - __delay(((xloops * cpu_data[raw_smp_processor_id()].loops_per_jiffy) >> 32) * HZ); + __delay((xloops * HZ * cpu_data[raw_smp_processor_id()].loops_per_jiffy) >> 32); } void __udelay(unsigned long usecs) diff --git a/arch/x86_64/lib/iomap_copy.S b/arch/x86_64/lib/iomap_copy.S new file mode 100644 index 0000000..8bbade5 --- /dev/null +++ b/arch/x86_64/lib/iomap_copy.S @@ -0,0 +1,26 @@ +/* + * Copyright 2006 PathScale, Inc. All Rights Reserved. + * + * This file is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. 
+ */ + +/* + * override generic version in lib/iomap_copy.c + */ + .globl __iowrite32_copy + .p2align 4 +__iowrite32_copy: + movl %edx,%ecx + rep movsd + ret diff --git a/arch/x86_64/lib/memcpy.S b/arch/x86_64/lib/memcpy.S index 92dd805..5554948 100644 --- a/arch/x86_64/lib/memcpy.S +++ b/arch/x86_64/lib/memcpy.S @@ -11,8 +11,6 @@ * * Output: * rax original destination - * - * TODO: check best memcpy for PSC */ .globl __memcpy @@ -20,6 +18,95 @@ .p2align 4 __memcpy: memcpy: + pushq %rbx + movq %rdi,%rax + + movl %edx,%ecx + shrl $6,%ecx + jz .Lhandle_tail + + .p2align 4 +.Lloop_64: + decl %ecx + + movq (%rsi),%r11 + movq 8(%rsi),%r8 + + movq %r11,(%rdi) + movq %r8,1*8(%rdi) + + movq 2*8(%rsi),%r9 + movq 3*8(%rsi),%r10 + + movq %r9,2*8(%rdi) + movq %r10,3*8(%rdi) + + movq 4*8(%rsi),%r11 + movq 5*8(%rsi),%r8 + + movq %r11,4*8(%rdi) + movq %r8,5*8(%rdi) + + movq 6*8(%rsi),%r9 + movq 7*8(%rsi),%r10 + + movq %r9,6*8(%rdi) + movq %r10,7*8(%rdi) + + leaq 64(%rsi),%rsi + leaq 64(%rdi),%rdi + jnz .Lloop_64 + +.Lhandle_tail: + movl %edx,%ecx + andl $63,%ecx + shrl $3,%ecx + jz .Lhandle_7 + .p2align 4 +.Lloop_8: + decl %ecx + movq (%rsi),%r8 + movq %r8,(%rdi) + leaq 8(%rdi),%rdi + leaq 8(%rsi),%rsi + jnz .Lloop_8 + +.Lhandle_7: + movl %edx,%ecx + andl $7,%ecx + jz .Lende + .p2align 4 +.Lloop_1: + movb (%rsi),%r8b + movb %r8b,(%rdi) + incq %rdi + incq %rsi + decl %ecx + jnz .Lloop_1 + +.Lende: + popq %rbx + ret +.Lfinal: + + /* Some CPUs run faster using the string copy instructions. + It is also a lot simpler. Use this when possible */ + + .section .altinstructions,"a" + .align 8 + .quad memcpy + .quad memcpy_c + .byte X86_FEATURE_REP_GOOD + .byte .Lfinal-memcpy + .byte memcpy_c_end-memcpy_c + .previous + + .section .altinstr_replacement,"ax" + /* rdi destination + * rsi source + * rdx count + */ +memcpy_c: movq %rdi,%rax movl %edx,%ecx shrl $3,%ecx @@ -30,3 +117,5 @@ memcpy: rep movsb ret +memcpy_c_end: + .previous diff --git a/arch/x86_64/lib/memset.S b/arch/x86_64/lib/memset.S index 2aa48f2..ad397f2 100644 --- a/arch/x86_64/lib/memset.S +++ b/arch/x86_64/lib/memset.S @@ -13,6 +13,98 @@ .p2align 4 memset: __memset: + movq %rdi,%r10 + movq %rdx,%r11 + + /* expand byte value */ + movzbl %sil,%ecx + movabs $0x0101010101010101,%rax + mul %rcx /* with rax, clobbers rdx */ + + /* align dst */ + movl %edi,%r9d + andl $7,%r9d + jnz .Lbad_alignment +.Lafter_bad_alignment: + + movl %r11d,%ecx + shrl $6,%ecx + jz .Lhandle_tail + + .p2align 4 +.Lloop_64: + decl %ecx + movq %rax,(%rdi) + movq %rax,8(%rdi) + movq %rax,16(%rdi) + movq %rax,24(%rdi) + movq %rax,32(%rdi) + movq %rax,40(%rdi) + movq %rax,48(%rdi) + movq %rax,56(%rdi) + leaq 64(%rdi),%rdi + jnz .Lloop_64 + + /* Handle tail in loops. The loops should be faster than hard + to predict jump tables. */ + .p2align 4 +.Lhandle_tail: + movl %r11d,%ecx + andl $63&(~7),%ecx + jz .Lhandle_7 + shrl $3,%ecx + .p2align 4 +.Lloop_8: + decl %ecx + movq %rax,(%rdi) + leaq 8(%rdi),%rdi + jnz .Lloop_8 + +.Lhandle_7: + movl %r11d,%ecx + andl $7,%ecx + jz .Lende + .p2align 4 +.Lloop_1: + decl %ecx + movb %al,(%rdi) + leaq 1(%rdi),%rdi + jnz .Lloop_1 + +.Lende: + movq %r10,%rax + ret + +.Lbad_alignment: + cmpq $7,%r11 + jbe .Lhandle_7 + movq %rax,(%rdi) /* unaligned store */ + movq $8,%r8 + subq %r9,%r8 + addq %r8,%rdi + subq %r8,%r11 + jmp .Lafter_bad_alignment + + /* Some CPUs run faster using the string instructions. + It is also a lot simpler. 
Use this when possible */ + +#include <asm/cpufeature.h> + + .section .altinstructions,"a" + .align 8 + .quad memset + .quad memset_c + .byte X86_FEATURE_REP_GOOD + .byte memset_c_end-memset_c + .byte memset_c_end-memset_c + .previous + + .section .altinstr_replacement,"ax" + /* rdi destination + * rsi value + * rdx count + */ +memset_c: movq %rdi,%r9 movl %edx,%r8d andl $7,%r8d @@ -29,3 +121,5 @@ __memset: stosb movq %r9,%rax ret +memset_c_end: + .previous diff --git a/arch/x86_64/lib/usercopy.c b/arch/x86_64/lib/usercopy.c index db8abba..9bc2c29 100644 --- a/arch/x86_64/lib/usercopy.c +++ b/arch/x86_64/lib/usercopy.c @@ -109,14 +109,11 @@ unsigned long clear_user(void __user *to, unsigned long n) * Return 0 on exception, a value greater than N if too long */ -long strnlen_user(const char __user *s, long n) +long __strnlen_user(const char __user *s, long n) { long res = 0; char c; - if (!access_ok(VERIFY_READ, s, n)) - return 0; - while (1) { if (res>n) return n+1; @@ -129,6 +126,13 @@ long strnlen_user(const char __user *s, long n) } } +long strnlen_user(const char __user *s, long n) +{ + if (!access_ok(VERIFY_READ, s, n)) + return 0; + return __strnlen_user(s, n); +} + long strlen_user(const char __user *s) { long res = 0; |
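
Note on the recurring pattern above: each string routine now ships an unrolled default plus a short rep-string variant, and a .altinstructions record ties the two together so the replacement is patched in at boot on CPUs that set X86_FEATURE_REP_GOOD. The C below is a minimal, hypothetical sketch of that patching step, not the kernel's actual apply_alternatives() (which, among other things, pads with optimal multi-byte NOPs rather than single 0x90 bytes); the struct fields simply mirror the five values each .altinstructions record emits, and cpu_has_feature() is a made-up stand-in for the kernel's feature test.

#include <string.h>

/* Field layout mirrors one .altinstructions record above:
 * original address, replacement address, required CPU feature bit,
 * original length, replacement length.
 */
struct alt_instr {
	unsigned char *instr;          /* original (unrolled) code        */
	unsigned char *replacement;    /* rep-string variant              */
	unsigned char  cpuid;          /* e.g. X86_FEATURE_REP_GOOD       */
	unsigned char  instrlen;       /* clear_page_end - clear_page     */
	unsigned char  replacementlen; /* clear_page_c_end - clear_page_c */
};

/* Hypothetical feature test standing in for the kernel's own check. */
extern int cpu_has_feature(unsigned char bit);

static void apply_alternatives_sketch(struct alt_instr *start,
				      struct alt_instr *end)
{
	struct alt_instr *a;

	for (a = start; a < end; a++) {
		if (!cpu_has_feature(a->cpuid))
			continue;	/* keep the unrolled default */

		/* Copy the short rep-string body over the original ... */
		memcpy(a->instr, a->replacement, a->replacementlen);
		/* ... and pad the remainder with NOPs (the real code
		 * chooses multi-byte NOP sequences, not plain 0x90s). */
		memset(a->instr + a->replacementlen, 0x90,
		       a->instrlen - a->replacementlen);
	}
}

copy_user.S is the one exception to this shape: it keeps the same table entry but patches in a 5-byte jump to copy_user_generic_c instead of copying the body over, so the existing __ex_table entries keep pointing at valid fault sites, as the comment in that file explains.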
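
The one-line delay.c change reorders a fixed-point computation: multiplying by HZ before the >> 32 keeps fractional bits that the old expression truncated away, so short delays are no longer rounded down by up to roughly a jiffy-fraction's worth of loops. A standalone illustration, with made-up but plausible lpj/HZ values (not taken from the patch):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	/* Example values only: xloops for udelay(1) is roughly 2^32/10^6. */
	uint64_t xloops = 0x10c7;    /* ~1 us in 32.32 fixed point */
	uint64_t lpj    = 2500000;   /* loops_per_jiffy            */
	uint64_t hz     = 1000;

	uint64_t loops_old = ((xloops * lpj) >> 32) * hz; /* old: shift, then scale */
	uint64_t loops_new = (xloops * hz * lpj) >> 32;   /* new: scale, then shift */

	printf("old: %llu loops, new: %llu loops\n",
	       (unsigned long long)loops_old,
	       (unsigned long long)loops_new);
	/* Prints "old: 2000 loops, new: 2500 loops": truncating before the
	 * HZ multiply drops the fractional part, which then gets scaled
	 * up by HZ as a missing chunk of the delay. */
	return 0;
}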