Diffstat (limited to 'arch/x86_64/lib')
-rw-r--r--   arch/x86_64/lib/Makefile      |  2
-rw-r--r--   arch/x86_64/lib/clear_page.S  | 38
-rw-r--r--   arch/x86_64/lib/copy_page.S   | 87
-rw-r--r--   arch/x86_64/lib/copy_user.S   | 79
-rw-r--r--   arch/x86_64/lib/delay.c       |  2
-rw-r--r--   arch/x86_64/lib/iomap_copy.S  | 26
-rw-r--r--   arch/x86_64/lib/memcpy.S      | 93
-rw-r--r--   arch/x86_64/lib/memset.S      | 94
-rw-r--r--   arch/x86_64/lib/usercopy.c    | 12
9 files changed, 389 insertions, 44 deletions
diff --git a/arch/x86_64/lib/Makefile b/arch/x86_64/lib/Makefile index bba5db6..ccef6ae 100644 --- a/arch/x86_64/lib/Makefile +++ b/arch/x86_64/lib/Makefile @@ -4,7 +4,7 @@ CFLAGS_csum-partial.o := -funroll-loops -obj-y := io.o +obj-y := io.o iomap_copy.o lib-y := csum-partial.o csum-copy.o csum-wrappers.o delay.o \ usercopy.o getuser.o putuser.o \ diff --git a/arch/x86_64/lib/clear_page.S b/arch/x86_64/lib/clear_page.S index 43d9fa1..1f81b79 100644 --- a/arch/x86_64/lib/clear_page.S +++ b/arch/x86_64/lib/clear_page.S @@ -5,8 +5,46 @@ .globl clear_page .p2align 4 clear_page: + xorl %eax,%eax + movl $4096/64,%ecx + .p2align 4 +.Lloop: + decl %ecx +#define PUT(x) movq %rax,x*8(%rdi) + movq %rax,(%rdi) + PUT(1) + PUT(2) + PUT(3) + PUT(4) + PUT(5) + PUT(6) + PUT(7) + leaq 64(%rdi),%rdi + jnz .Lloop + nop + ret +clear_page_end: + + /* Some CPUs run faster using the string instructions. + It is also a lot simpler. Use this when possible */ + +#include <asm/cpufeature.h> + + .section .altinstructions,"a" + .align 8 + .quad clear_page + .quad clear_page_c + .byte X86_FEATURE_REP_GOOD + .byte clear_page_end-clear_page + .byte clear_page_c_end-clear_page_c + .previous + + .section .altinstr_replacement,"ax" +clear_page_c: movl $4096/8,%ecx xorl %eax,%eax rep stosq ret +clear_page_c_end: + .previous diff --git a/arch/x86_64/lib/copy_page.S b/arch/x86_64/lib/copy_page.S index 621a197..8fa19d9 100644 --- a/arch/x86_64/lib/copy_page.S +++ b/arch/x86_64/lib/copy_page.S @@ -8,7 +8,94 @@ .globl copy_page .p2align 4 copy_page: + subq $3*8,%rsp + movq %rbx,(%rsp) + movq %r12,1*8(%rsp) + movq %r13,2*8(%rsp) + + movl $(4096/64)-5,%ecx + .p2align 4 +.Loop64: + dec %rcx + + movq (%rsi), %rax + movq 8 (%rsi), %rbx + movq 16 (%rsi), %rdx + movq 24 (%rsi), %r8 + movq 32 (%rsi), %r9 + movq 40 (%rsi), %r10 + movq 48 (%rsi), %r11 + movq 56 (%rsi), %r12 + + prefetcht0 5*64(%rsi) + + movq %rax, (%rdi) + movq %rbx, 8 (%rdi) + movq %rdx, 16 (%rdi) + movq %r8, 24 (%rdi) + movq %r9, 32 (%rdi) + movq %r10, 40 (%rdi) + movq %r11, 48 (%rdi) + movq %r12, 56 (%rdi) + + leaq 64 (%rsi), %rsi + leaq 64 (%rdi), %rdi + + jnz .Loop64 + + movl $5,%ecx + .p2align 4 +.Loop2: + decl %ecx + + movq (%rsi), %rax + movq 8 (%rsi), %rbx + movq 16 (%rsi), %rdx + movq 24 (%rsi), %r8 + movq 32 (%rsi), %r9 + movq 40 (%rsi), %r10 + movq 48 (%rsi), %r11 + movq 56 (%rsi), %r12 + + movq %rax, (%rdi) + movq %rbx, 8 (%rdi) + movq %rdx, 16 (%rdi) + movq %r8, 24 (%rdi) + movq %r9, 32 (%rdi) + movq %r10, 40 (%rdi) + movq %r11, 48 (%rdi) + movq %r12, 56 (%rdi) + + leaq 64(%rdi),%rdi + leaq 64(%rsi),%rsi + + jnz .Loop2 + + movq (%rsp),%rbx + movq 1*8(%rsp),%r12 + movq 2*8(%rsp),%r13 + addq $3*8,%rsp + ret + + /* Some CPUs run faster using the string copy instructions. + It is also a lot simpler. 
Use this when possible */ + +#include <asm/cpufeature.h> + + .section .altinstructions,"a" + .align 8 + .quad copy_page + .quad copy_page_c + .byte X86_FEATURE_REP_GOOD + .byte copy_page_c_end-copy_page_c + .byte copy_page_c_end-copy_page_c + .previous + + .section .altinstr_replacement,"ax" +copy_page_c: movl $4096/8,%ecx rep movsq ret +copy_page_c_end: + .previous diff --git a/arch/x86_64/lib/copy_user.S b/arch/x86_64/lib/copy_user.S index dfa358b..f64569b 100644 --- a/arch/x86_64/lib/copy_user.S +++ b/arch/x86_64/lib/copy_user.S @@ -5,7 +5,7 @@ */ #define FIX_ALIGNMENT 1 - + #include <asm/current.h> #include <asm/asm-offsets.h> #include <asm/thread_info.h> @@ -21,7 +21,7 @@ copy_to_user: jc bad_to_user cmpq threadinfo_addr_limit(%rax),%rcx jae bad_to_user -2: +2: .byte 0xe9 /* 32bit jump */ .long .Lcug-1f 1: @@ -34,7 +34,7 @@ copy_to_user: .align 8 .quad 2b .quad 3b - .byte X86_FEATURE_K8_C + .byte X86_FEATURE_REP_GOOD .byte 5 .byte 5 .previous @@ -75,12 +75,12 @@ bad_to_user: * Output: * eax uncopied bytes or 0 if successful. */ - .globl copy_user_generic + .globl copy_user_generic .p2align 4 -copy_user_generic: - .byte 0x66,0x66,0x90 /* 5 byte nop for replacement jump */ +copy_user_generic: + .byte 0x66,0x66,0x90 /* 5 byte nop for replacement jump */ .byte 0x66,0x90 -1: +1: .section .altinstr_replacement,"ax" 2: .byte 0xe9 /* near jump with 32bit immediate */ .long copy_user_generic_c-1b /* offset */ @@ -89,11 +89,11 @@ copy_user_generic: .align 8 .quad copy_user_generic .quad 2b - .byte X86_FEATURE_K8_C + .byte X86_FEATURE_REP_GOOD .byte 5 .byte 5 .previous -.Lcug: +.Lcug: pushq %rbx xorl %eax,%eax /*zero for the exception handler */ @@ -107,11 +107,11 @@ copy_user_generic: movq %rdx,%rcx - movl $64,%ebx + movl $64,%ebx shrq $6,%rdx decq %rdx js .Lhandle_tail - + .p2align 4 .Lloop: .Ls1: movq (%rsi),%r11 @@ -122,7 +122,7 @@ copy_user_generic: .Ld2: movq %r8,1*8(%rdi) .Ld3: movq %r9,2*8(%rdi) .Ld4: movq %r10,3*8(%rdi) - + .Ls5: movq 4*8(%rsi),%r11 .Ls6: movq 5*8(%rsi),%r8 .Ls7: movq 6*8(%rsi),%r9 @@ -131,12 +131,12 @@ copy_user_generic: .Ld6: movq %r8,5*8(%rdi) .Ld7: movq %r9,6*8(%rdi) .Ld8: movq %r10,7*8(%rdi) - + decq %rdx leaq 64(%rsi),%rsi leaq 64(%rdi),%rdi - + jns .Lloop .p2align 4 @@ -154,9 +154,9 @@ copy_user_generic: leaq 8(%rdi),%rdi leaq 8(%rsi),%rsi jnz .Lloop_8 - -.Lhandle_7: - movl %edx,%ecx + +.Lhandle_7: + movl %edx,%ecx andl $7,%ecx jz .Lende .p2align 4 @@ -167,12 +167,12 @@ copy_user_generic: incq %rsi decl %ecx jnz .Lloop_1 - + .Lende: popq %rbx - ret + ret -#ifdef FIX_ALIGNMENT +#ifdef FIX_ALIGNMENT /* align destination */ .p2align 4 .Lbad_alignment: @@ -182,7 +182,7 @@ copy_user_generic: cmpq %r9,%rdx jz .Lhandle_7 js .Lhandle_7 -.Lalign_1: +.Lalign_1: .Ls11: movb (%rsi),%bl .Ld11: movb %bl,(%rdi) incq %rsi @@ -192,14 +192,14 @@ copy_user_generic: subq %r9,%rdx jmp .Lafter_bad_alignment #endif - - /* table sorted by exception address */ + + /* table sorted by exception address */ .section __ex_table,"a" .align 8 .quad .Ls1,.Ls1e .quad .Ls2,.Ls2e .quad .Ls3,.Ls3e - .quad .Ls4,.Ls4e + .quad .Ls4,.Ls4e .quad .Ld1,.Ls1e .quad .Ld2,.Ls2e .quad .Ld3,.Ls3e @@ -207,7 +207,7 @@ copy_user_generic: .quad .Ls5,.Ls5e .quad .Ls6,.Ls6e .quad .Ls7,.Ls7e - .quad .Ls8,.Ls8e + .quad .Ls8,.Ls8e .quad .Ld5,.Ls5e .quad .Ld6,.Ls6e .quad .Ld7,.Ls7e @@ -216,16 +216,16 @@ copy_user_generic: .quad .Ld9,.Le_quad .quad .Ls10,.Le_byte .quad .Ld10,.Le_byte -#ifdef FIX_ALIGNMENT +#ifdef FIX_ALIGNMENT .quad .Ls11,.Lzero_rest .quad .Ld11,.Lzero_rest #endif .quad .Le5,.Le_zero .previous - /* 
compute 64-offset for main loop. 8 bytes accuracy with error on the - pessimistic side. this is gross. it would be better to fix the - interface. */ + /* compute 64-offset for main loop. 8 bytes accuracy with error on the + pessimistic side. this is gross. it would be better to fix the + interface. */ /* eax: zero, ebx: 64 */ .Ls1e: addl $8,%eax .Ls2e: addl $8,%eax @@ -255,25 +255,32 @@ copy_user_generic: movq %rdx,%rcx .Le_byte: xorl %eax,%eax -.Le5: rep +.Le5: rep stosb /* when there is another exception while zeroing the rest just return */ -.Le_zero: +.Le_zero: movq %rdx,%rax jmp .Lende - /* C stepping K8 run faster using the string copy instructions. + /* Some CPUs run faster using the string copy instructions. This is also a lot simpler. Use them when possible. Patch in jmps to this code instead of copying it fully to avoid unwanted aliasing in the exception tables. */ - + /* rdi destination * rsi source * rdx count * - * Output: + * Output: * eax uncopied bytes or 0 if successfull. - */ + * + * Only 4GB of copy is supported. This shouldn't be a problem + * because the kernel normally only writes from/to page sized chunks + * even if user space passed a longer buffer. + * And more would be dangerous because both Intel and AMD have + * errata with rep movsq > 4GB. If someone feels the need to fix + * this please consider this. + */ copy_user_generic_c: movl %edx,%ecx shrl $3,%ecx @@ -287,7 +294,7 @@ copy_user_generic_c: ret 3: lea (%rdx,%rcx,8),%rax ret - + .section __ex_table,"a" .quad 1b,3b .quad 2b,4b diff --git a/arch/x86_64/lib/delay.c b/arch/x86_64/lib/delay.c index 841bd73..03c460c 100644 --- a/arch/x86_64/lib/delay.c +++ b/arch/x86_64/lib/delay.c @@ -39,7 +39,7 @@ void __delay(unsigned long loops) inline void __const_udelay(unsigned long xloops) { - __delay(((xloops * cpu_data[raw_smp_processor_id()].loops_per_jiffy) >> 32) * HZ); + __delay((xloops * HZ * cpu_data[raw_smp_processor_id()].loops_per_jiffy) >> 32); } void __udelay(unsigned long usecs) diff --git a/arch/x86_64/lib/iomap_copy.S b/arch/x86_64/lib/iomap_copy.S new file mode 100644 index 0000000..8bbade5 --- /dev/null +++ b/arch/x86_64/lib/iomap_copy.S @@ -0,0 +1,26 @@ +/* + * Copyright 2006 PathScale, Inc. All Rights Reserved. + * + * This file is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. 
+ */ + +/* + * override generic version in lib/iomap_copy.c + */ + .globl __iowrite32_copy + .p2align 4 +__iowrite32_copy: + movl %edx,%ecx + rep movsd + ret diff --git a/arch/x86_64/lib/memcpy.S b/arch/x86_64/lib/memcpy.S index 92dd805..5554948 100644 --- a/arch/x86_64/lib/memcpy.S +++ b/arch/x86_64/lib/memcpy.S @@ -11,8 +11,6 @@ * * Output: * rax original destination - * - * TODO: check best memcpy for PSC */ .globl __memcpy @@ -20,6 +18,95 @@ .p2align 4 __memcpy: memcpy: + pushq %rbx + movq %rdi,%rax + + movl %edx,%ecx + shrl $6,%ecx + jz .Lhandle_tail + + .p2align 4 +.Lloop_64: + decl %ecx + + movq (%rsi),%r11 + movq 8(%rsi),%r8 + + movq %r11,(%rdi) + movq %r8,1*8(%rdi) + + movq 2*8(%rsi),%r9 + movq 3*8(%rsi),%r10 + + movq %r9,2*8(%rdi) + movq %r10,3*8(%rdi) + + movq 4*8(%rsi),%r11 + movq 5*8(%rsi),%r8 + + movq %r11,4*8(%rdi) + movq %r8,5*8(%rdi) + + movq 6*8(%rsi),%r9 + movq 7*8(%rsi),%r10 + + movq %r9,6*8(%rdi) + movq %r10,7*8(%rdi) + + leaq 64(%rsi),%rsi + leaq 64(%rdi),%rdi + jnz .Lloop_64 + +.Lhandle_tail: + movl %edx,%ecx + andl $63,%ecx + shrl $3,%ecx + jz .Lhandle_7 + .p2align 4 +.Lloop_8: + decl %ecx + movq (%rsi),%r8 + movq %r8,(%rdi) + leaq 8(%rdi),%rdi + leaq 8(%rsi),%rsi + jnz .Lloop_8 + +.Lhandle_7: + movl %edx,%ecx + andl $7,%ecx + jz .Lende + .p2align 4 +.Lloop_1: + movb (%rsi),%r8b + movb %r8b,(%rdi) + incq %rdi + incq %rsi + decl %ecx + jnz .Lloop_1 + +.Lende: + popq %rbx + ret +.Lfinal: + + /* Some CPUs run faster using the string copy instructions. + It is also a lot simpler. Use this when possible */ + + .section .altinstructions,"a" + .align 8 + .quad memcpy + .quad memcpy_c + .byte X86_FEATURE_REP_GOOD + .byte .Lfinal-memcpy + .byte memcpy_c_end-memcpy_c + .previous + + .section .altinstr_replacement,"ax" + /* rdi destination + * rsi source + * rdx count + */ +memcpy_c: movq %rdi,%rax movl %edx,%ecx shrl $3,%ecx @@ -30,3 +117,5 @@ memcpy: rep movsb ret +memcpy_c_end: + .previous diff --git a/arch/x86_64/lib/memset.S b/arch/x86_64/lib/memset.S index 2aa48f2..ad397f2 100644 --- a/arch/x86_64/lib/memset.S +++ b/arch/x86_64/lib/memset.S @@ -13,6 +13,98 @@ .p2align 4 memset: __memset: + movq %rdi,%r10 + movq %rdx,%r11 + + /* expand byte value */ + movzbl %sil,%ecx + movabs $0x0101010101010101,%rax + mul %rcx /* with rax, clobbers rdx */ + + /* align dst */ + movl %edi,%r9d + andl $7,%r9d + jnz .Lbad_alignment +.Lafter_bad_alignment: + + movl %r11d,%ecx + shrl $6,%ecx + jz .Lhandle_tail + + .p2align 4 +.Lloop_64: + decl %ecx + movq %rax,(%rdi) + movq %rax,8(%rdi) + movq %rax,16(%rdi) + movq %rax,24(%rdi) + movq %rax,32(%rdi) + movq %rax,40(%rdi) + movq %rax,48(%rdi) + movq %rax,56(%rdi) + leaq 64(%rdi),%rdi + jnz .Lloop_64 + + /* Handle tail in loops. The loops should be faster than hard + to predict jump tables. */ + .p2align 4 +.Lhandle_tail: + movl %r11d,%ecx + andl $63&(~7),%ecx + jz .Lhandle_7 + shrl $3,%ecx + .p2align 4 +.Lloop_8: + decl %ecx + movq %rax,(%rdi) + leaq 8(%rdi),%rdi + jnz .Lloop_8 + +.Lhandle_7: + movl %r11d,%ecx + andl $7,%ecx + jz .Lende + .p2align 4 +.Lloop_1: + decl %ecx + movb %al,(%rdi) + leaq 1(%rdi),%rdi + jnz .Lloop_1 + +.Lende: + movq %r10,%rax + ret + +.Lbad_alignment: + cmpq $7,%r11 + jbe .Lhandle_7 + movq %rax,(%rdi) /* unaligned store */ + movq $8,%r8 + subq %r9,%r8 + addq %r8,%rdi + subq %r8,%r11 + jmp .Lafter_bad_alignment + + /* Some CPUs run faster using the string instructions. + It is also a lot simpler. 
Use this when possible */ + +#include <asm/cpufeature.h> + + .section .altinstructions,"a" + .align 8 + .quad memset + .quad memset_c + .byte X86_FEATURE_REP_GOOD + .byte memset_c_end-memset_c + .byte memset_c_end-memset_c + .previous + + .section .altinstr_replacement,"ax" + /* rdi destination + * rsi value + * rdx count + */ +memset_c: movq %rdi,%r9 movl %edx,%r8d andl $7,%r8d @@ -29,3 +121,5 @@ __memset: stosb movq %r9,%rax ret +memset_c_end: + .previous diff --git a/arch/x86_64/lib/usercopy.c b/arch/x86_64/lib/usercopy.c index db8abba..9bc2c29 100644 --- a/arch/x86_64/lib/usercopy.c +++ b/arch/x86_64/lib/usercopy.c @@ -109,14 +109,11 @@ unsigned long clear_user(void __user *to, unsigned long n) * Return 0 on exception, a value greater than N if too long */ -long strnlen_user(const char __user *s, long n) +long __strnlen_user(const char __user *s, long n) { long res = 0; char c; - if (!access_ok(VERIFY_READ, s, n)) - return 0; - while (1) { if (res>n) return n+1; @@ -129,6 +126,13 @@ long strnlen_user(const char __user *s, long n) } } +long strnlen_user(const char __user *s, long n) +{ + if (!access_ok(VERIFY_READ, s, n)) + return 0; + return __strnlen_user(s, n); +} + long strlen_user(const char __user *s) { long res = 0; |
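
Note on the recurring pattern above: each string routine now ships an unrolled default plus a short rep-string variant, and a .altinstructions record ties the two together so the replacement is patched in at boot on CPUs that set X86_FEATURE_REP_GOOD. The C below is a minimal, hypothetical sketch of that patching step, not the kernel's actual apply_alternatives() (which, among other things, pads with optimal multi-byte NOPs rather than single 0x90 bytes); the struct fields simply mirror the five values each .altinstructions record emits, and cpu_has_feature() is a made-up stand-in for the kernel's feature test.

#include <string.h>

/* Field layout mirrors one .altinstructions record above:
 * original address, replacement address, required CPU feature bit,
 * original length, replacement length.
 */
struct alt_instr {
	unsigned char *instr;          /* original (unrolled) code        */
	unsigned char *replacement;    /* rep-string variant              */
	unsigned char  cpuid;          /* e.g. X86_FEATURE_REP_GOOD       */
	unsigned char  instrlen;       /* clear_page_end - clear_page     */
	unsigned char  replacementlen; /* clear_page_c_end - clear_page_c */
};

/* Hypothetical feature test standing in for the kernel's own check. */
extern int cpu_has_feature(unsigned char bit);

static void apply_alternatives_sketch(struct alt_instr *start,
				      struct alt_instr *end)
{
	struct alt_instr *a;

	for (a = start; a < end; a++) {
		if (!cpu_has_feature(a->cpuid))
			continue;	/* keep the unrolled default */

		/* Copy the short rep-string body over the original ... */
		memcpy(a->instr, a->replacement, a->replacementlen);
		/* ... and pad the remainder with NOPs (the real code
		 * chooses multi-byte NOP sequences, not plain 0x90s). */
		memset(a->instr + a->replacementlen, 0x90,
		       a->instrlen - a->replacementlen);
	}
}

copy_user.S is the one exception to this shape: it keeps the same table entry but patches in a 5-byte jump to copy_user_generic_c instead of copying the body over, so the existing __ex_table entries keep pointing at valid fault sites, as the comment in that file explains.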
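
The one-line delay.c change reorders a fixed-point computation: multiplying by HZ before the >> 32 keeps fractional bits that the old expression truncated away, so short delays are no longer rounded down by up to roughly a jiffy-fraction's worth of loops. A standalone illustration, with made-up but plausible lpj/HZ values (not taken from the patch):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	/* Example values only: xloops for udelay(1) is roughly 2^32/10^6. */
	uint64_t xloops = 0x10c7;    /* ~1 us in 32.32 fixed point */
	uint64_t lpj    = 2500000;   /* loops_per_jiffy            */
	uint64_t hz     = 1000;

	uint64_t loops_old = ((xloops * lpj) >> 32) * hz; /* old: shift, then scale */
	uint64_t loops_new = (xloops * hz * lpj) >> 32;   /* new: scale, then shift */

	printf("old: %llu loops, new: %llu loops\n",
	       (unsigned long long)loops_old,
	       (unsigned long long)loops_new);
	/* Prints "old: 2000 loops, new: 2500 loops": truncating before the
	 * HZ multiply drops the fractional part, which then gets scaled
	 * up by HZ as a missing chunk of the delay. */
	return 0;
}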