From e7b52ffd45a6d834473f43b349e7d86593d763c7 Mon Sep 17 00:00:00 2001
From: Alex Shi <alex.shi@intel.com>
Date: Thu, 28 Jun 2012 09:02:17 +0800
Subject: x86/flush_tlb: try flush_tlb_single one by one in flush_tlb_range

x86 has no flush_tlb_range support in instruction level. Currently the
flush_tlb_range just implemented by flushing all page table. That is not
the best solution for all scenarios. In fact, if we just use 'invlpg' to
flush few lines from TLB, we can get the performance gain from later
remain TLB lines accessing.

But the 'invlpg' instruction costs much of time. Its execution time can
compete with cr3 rewriting, and even a bit more on SNB CPU.

So, on a 512 4KB TLB entries CPU, the balance points is at:
	(512 - X) * 100ns(assumed TLB refill cost) =
		X(TLB flush entries) * 100ns(assumed invlpg cost)

Here, X is 256, that is 1/2 of 512 entries.

But with the mysterious CPU pre-fetcher and page miss handler Unit, the
assumed TLB refill cost is far lower then 100ns in sequential access. And
2 HT siblings in one core makes the memory access more faster if they are
accessing the same memory. So, in the patch, I just do the change when
the target entries is less than 1/16 of whole active tlb entries.
Actually, I have no data support for the percentage '1/16', so any
suggestions are welcomed.

As to hugetlb, guess due to smaller page table, and smaller active TLB
entries, I didn't see benefit via my benchmark, so no optimizing now.

My micro benchmark show in ideal scenarios, the performance improves 70
percent in reading. And in worst scenario, the reading/writing
performance is similar with unpatched 3.4-rc4 kernel.

Here is the reading data on my 2P * 4cores *HT NHM EP machine, with THP
'always':

multi thread testing, '-t' paramter is thread number:
	       	        with patch   unpatched 3.4-rc4
./mprotect -t 1           14ns		24ns
./mprotect -t 2           13ns		22ns
./mprotect -t 4           12ns		19ns
./mprotect -t 8           14ns		16ns
./mprotect -t 16          28ns		26ns
./mprotect -t 32          54ns		51ns
./mprotect -t 128         200ns		199ns

Single process with sequencial flushing and memory accessing:

		       	with patch   unpatched 3.4-rc4
./mprotect		    7ns			11ns
./mprotect -p 4096  -l 8 -n 10240
			    21ns		21ns

[ hpa: http://lkml.kernel.org/r/1B4B44D9196EFF41AE41FDA404FC0A100BFF94@SHSMSX101.ccr.corp.intel.com
  has additional performance numbers. ]

Signed-off-by: Alex Shi <alex.shi@intel.com>
Link: http://lkml.kernel.org/r/1340845344-27557-3-git-send-email-alex.shi@intel.com
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/xen/mmu.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'arch/x86/xen/mmu.c')

diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 3a73785..39ed567 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -1244,7 +1244,8 @@ static void xen_flush_tlb_single(unsigned long addr)
 }
 
 static void xen_flush_tlb_others(const struct cpumask *cpus,
-				 struct mm_struct *mm, unsigned long va)
+				 struct mm_struct *mm, unsigned long start,
+				 unsigned long end)
 {
 	struct {
 		struct mmuext_op op;
@@ -1256,7 +1257,7 @@ static void xen_flush_tlb_others(const struct cpumask *cpus,
 	} *args;
 	struct multicall_space mcs;
 
-	trace_xen_mmu_flush_tlb_others(cpus, mm, va);
+	trace_xen_mmu_flush_tlb_others(cpus, mm, start, end);
 
 	if (cpumask_empty(cpus))
 		return;		/* nothing to do */
@@ -1269,11 +1270,10 @@ static void xen_flush_tlb_others(const struct cpumask *cpus,
 	cpumask_and(to_cpumask(args->mask), cpus, cpu_online_mask);
 	cpumask_clear_cpu(smp_processor_id(), to_cpumask(args->mask));
 
-	if (va == TLB_FLUSH_ALL) {
-		args->op.cmd = MMUEXT_TLB_FLUSH_MULTI;
-	} else {
+	args->op.cmd = MMUEXT_TLB_FLUSH_MULTI;
+	if (start != TLB_FLUSH_ALL && (end - start) <= PAGE_SIZE) {
 		args->op.cmd = MMUEXT_INVLPG_MULTI;
-		args->op.arg1.linear_addr = va;
+		args->op.arg1.linear_addr = start;
 	}
 
 	MULTI_mmuext_op(mcs.mc, &args->op, 1, NULL, DOMID_SELF);
-- 
cgit v1.1


From d095d43e78dd811d5c02c25e207c3364019b5a77 Mon Sep 17 00:00:00 2001
From: David Vrabel <david.vrabel@citrix.com>
Date: Mon, 9 Jul 2012 11:39:05 +0100
Subject: xen/mm: do direct hypercall in xen_set_pte() if batching is
 unavailable

In xen_set_pte() if batching is unavailable (because the caller is in
an interrupt context such as handling a page fault) it would fall back
to using native_set_pte() and trapping and emulating the PTE write.

On 32-bit guests this requires two traps for each PTE write (one for
each dword of the PTE).  Instead, do one mmu_update hypercall
directly.

During construction of the initial page tables, continue to use
native_set_pte() because most of the PTEs being set are in writable
and unpinned pages (see phys_pmd_init() in arch/x86/mm/init_64.c) and
using a hypercall for this is very expensive.

This significantly improves page fault performance in 32-bit PV
guests.

lmbench3 test  Before    After     Improvement
----------------------------------------------
lat_pagefault  3.18 us   2.32 us   27%
lat_proc fork  356 us    313.3 us  11%

Signed-off-by: David Vrabel <david.vrabel@citrix.com>
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
---
 arch/x86/xen/mmu.c | 30 +++++++++++++++++++++++++-----
 1 file changed, 25 insertions(+), 5 deletions(-)

(limited to 'arch/x86/xen/mmu.c')

diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 3a73785..3f1783a 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -308,8 +308,20 @@ static bool xen_batched_set_pte(pte_t *ptep, pte_t pteval)
 
 static inline void __xen_set_pte(pte_t *ptep, pte_t pteval)
 {
-	if (!xen_batched_set_pte(ptep, pteval))
-		native_set_pte(ptep, pteval);
+	if (!xen_batched_set_pte(ptep, pteval)) {
+		/*
+		 * Could call native_set_pte() here and trap and
+		 * emulate the PTE write but with 32-bit guests this
+		 * needs two traps (one for each of the two 32-bit
+		 * words in the PTE) so do one hypercall directly
+		 * instead.
+		 */
+		struct mmu_update u;
+
+		u.ptr = virt_to_machine(ptep).maddr | MMU_NORMAL_PT_UPDATE;
+		u.val = pte_val_ma(pteval);
+		HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF);
+	}
 }
 
 static void xen_set_pte(pte_t *ptep, pte_t pteval)
@@ -1416,13 +1428,21 @@ static pte_t __init mask_rw_pte(pte_t *ptep, pte_t pte)
 }
 #endif /* CONFIG_X86_64 */
 
-/* Init-time set_pte while constructing initial pagetables, which
-   doesn't allow RO pagetable pages to be remapped RW */
+/*
+ * Init-time set_pte while constructing initial pagetables, which
+ * doesn't allow RO page table pages to be remapped RW.
+ *
+ * Many of these PTE updates are done on unpinned and writable pages
+ * and doing a hypercall for these is unnecessary and expensive.  At
+ * this point it is not possible to tell if a page is pinned or not,
+ * so always write the PTE directly and rely on Xen trapping and
+ * emulating any updates as necessary.
+ */
 static void __init xen_set_pte_init(pte_t *ptep, pte_t pte)
 {
 	pte = mask_rw_pte(ptep, pte);
 
-	xen_set_pte(ptep, pte);
+	native_set_pte(ptep, pte);
 }
 
 static void pin_pagetable_pfn(unsigned cmd, unsigned long pfn)
-- 
cgit v1.1


From 66a27dde9ae96e35278983f2e59bea04eb714cd0 Mon Sep 17 00:00:00 2001
From: David Vrabel <david.vrabel@citrix.com>
Date: Mon, 9 Jul 2012 11:39:06 +0100
Subject: xen/mm: zero PTEs for non-present MFNs in the initial page table

When constructing the initial page tables, if the MFN for a usable PFN
is missing in the p2m then that frame is initially ballooned out.  In
this case, zero the PTE (as in decrease_reservation() in
drivers/xen/balloon.c).

This is obviously safe instead of having an valid PTE with an MFN of
INVALID_P2M_ENTRY (~0).

Signed-off-by: David Vrabel <david.vrabel@citrix.com>
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
---
 arch/x86/xen/mmu.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

(limited to 'arch/x86/xen/mmu.c')

diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 3f1783a..27336df 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -1432,6 +1432,10 @@ static pte_t __init mask_rw_pte(pte_t *ptep, pte_t pte)
  * Init-time set_pte while constructing initial pagetables, which
  * doesn't allow RO page table pages to be remapped RW.
  *
+ * If there is no MFN for this PFN then this page is initially
+ * ballooned out so clear the PTE (as in decrease_reservation() in
+ * drivers/xen/balloon.c).
+ *
  * Many of these PTE updates are done on unpinned and writable pages
  * and doing a hypercall for these is unnecessary and expensive.  At
  * this point it is not possible to tell if a page is pinned or not,
@@ -1440,7 +1444,10 @@ static pte_t __init mask_rw_pte(pte_t *ptep, pte_t pte)
  */
 static void __init xen_set_pte_init(pte_t *ptep, pte_t pte)
 {
-	pte = mask_rw_pte(ptep, pte);
+	if (pte_mfn(pte) != INVALID_P2M_ENTRY)
+		pte = mask_rw_pte(ptep, pte);
+	else
+		pte = __pte_ma(0);
 
 	native_set_pte(ptep, pte);
 }
-- 
cgit v1.1


From ce7184bdbd38d920fb515266fbbdc585ad2e5493 Mon Sep 17 00:00:00 2001
From: Alex Shi <alex.shi@intel.com>
Date: Fri, 24 Aug 2012 08:55:13 +0000
Subject: xen: fix logical error in tlb flushing

While TLB_FLUSH_ALL gets passed as 'end' argument to
flush_tlb_others(), the Xen code was made to check its 'start'
parameter. That may give a incorrect op.cmd to MMUEXT_INVLPG_MULTI
instead of MMUEXT_TLB_FLUSH_MULTI. Then it causes some page can not
be flushed from TLB.

This patch fixed this issue.

Reported-by: Jan Beulich <jbeulich@suse.com>
Signed-off-by: Alex Shi <alex.shi@intel.com>
Acked-by: Jan Beulich <jbeulich@suse.com>
Tested-by: Yongjie Ren <yongjie.ren@intel.com>
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
---
 arch/x86/xen/mmu.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86/xen/mmu.c')

diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index b65a761..5141d80 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -1283,7 +1283,7 @@ static void xen_flush_tlb_others(const struct cpumask *cpus,
 	cpumask_clear_cpu(smp_processor_id(), to_cpumask(args->mask));
 
 	args->op.cmd = MMUEXT_TLB_FLUSH_MULTI;
-	if (start != TLB_FLUSH_ALL && (end - start) <= PAGE_SIZE) {
+	if (end != TLB_FLUSH_ALL && (end - start) <= PAGE_SIZE) {
 		args->op.cmd = MMUEXT_INVLPG_MULTI;
 		args->op.arg1.linear_addr = start;
 	}
-- 
cgit v1.1


From 73090f8993a40a2f67fed1ab866a928c68cd3765 Mon Sep 17 00:00:00 2001
From: Attilio Rao <attilio.rao@citrix.com>
Date: Tue, 21 Aug 2012 21:22:37 +0100
Subject: x86: Remove base argument from x86_init.paging.pagetable_setup_start

We either use swapper_pg_dir or the argument is unused. Preparatory
patch to simplify platform pagetable setup further.

Signed-off-by: Attilio Rao <attilio.rao@citrix.com>
Ackedb-by: <konrad.wilk@oracle.com>
Cc: <Ian.Campbell@citrix.com>
Cc: <Stefano.Stabellini@eu.citrix.com>
Cc: <xen-devel@lists.xensource.com>
Link: http://lkml.kernel.org/r/1345580561-8506-2-git-send-email-attilio.rao@citrix.com
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/xen/mmu.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86/xen/mmu.c')

diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 5141d80..32e66c8 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -1174,7 +1174,7 @@ static void xen_exit_mmap(struct mm_struct *mm)
 	spin_unlock(&mm->page_table_lock);
 }
 
-static void __init xen_pagetable_setup_start(pgd_t *base)
+static void __init xen_pagetable_setup_start(void)
 {
 }
 
-- 
cgit v1.1


From 7737b215ad0f94d20a87d98315da9f6cadaf35c9 Mon Sep 17 00:00:00 2001
From: Attilio Rao <attilio.rao@citrix.com>
Date: Tue, 21 Aug 2012 21:22:38 +0100
Subject: x86: Rename pagetable_setup_start() to pagetable_init()

In preparation for unifying the pagetable_setup_start() and
pagetable_setup_done() setup functions, rename appropriately all the
infrastructure related to pagetable_setup_start().

Signed-off-by: Attilio Rao <attilio.rao@citrix.com>
Ackedd-by: <konrad.wilk@oracle.com>
Cc: <Ian.Campbell@citrix.com>
Cc: <Stefano.Stabellini@eu.citrix.com>
Cc: <xen-devel@lists.xensource.com>
Link: http://lkml.kernel.org/r/1345580561-8506-3-git-send-email-attilio.rao@citrix.com
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/xen/mmu.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch/x86/xen/mmu.c')

diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 32e66c8..624efbe 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -1174,7 +1174,7 @@ static void xen_exit_mmap(struct mm_struct *mm)
 	spin_unlock(&mm->page_table_lock);
 }
 
-static void __init xen_pagetable_setup_start(void)
+static void __init xen_pagetable_init(void)
 {
 }
 
@@ -2068,7 +2068,7 @@ static const struct pv_mmu_ops xen_mmu_ops __initconst = {
 void __init xen_init_mmu_ops(void)
 {
 	x86_init.mapping.pagetable_reserve = xen_mapping_pagetable_reserve;
-	x86_init.paging.pagetable_setup_start = xen_pagetable_setup_start;
+	x86_init.paging.pagetable_init = xen_pagetable_init;
 	x86_init.paging.pagetable_setup_done = xen_pagetable_setup_done;
 	pv_mmu_ops = xen_mmu_ops;
 
-- 
cgit v1.1


From 843b8ed2ec598aae5e3516b21957ede62a070e36 Mon Sep 17 00:00:00 2001
From: Attilio Rao <attilio.rao@citrix.com>
Date: Tue, 21 Aug 2012 21:22:39 +0100
Subject: x86: Move paging_init() call to x86_init.paging.pagetable_init()

Move the paging_init() call to the platform specific pagetable_init()
function, so we can get rid of the extra pagetable_setup_done()
function pointer.

Signed-off-by: Attilio Rao <attilio.rao@citrix.com>
Acked-by: <konrad.wilk@oracle.com>
Cc: <Ian.Campbell@citrix.com>
Cc: <Stefano.Stabellini@eu.citrix.com>
Cc: <xen-devel@lists.xensource.com>
Link: http://lkml.kernel.org/r/1345580561-8506-4-git-send-email-attilio.rao@citrix.com
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/xen/mmu.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'arch/x86/xen/mmu.c')

diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 624efbe..c2ff7ea 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -1176,6 +1176,7 @@ static void xen_exit_mmap(struct mm_struct *mm)
 
 static void __init xen_pagetable_init(void)
 {
+	paging_init();
 }
 
 static __init void xen_mapping_pagetable_reserve(u64 start, u64 end)
-- 
cgit v1.1


From c711288727a62f74d48032e56e51333dd104bf58 Mon Sep 17 00:00:00 2001
From: Attilio Rao <attilio.rao@citrix.com>
Date: Tue, 21 Aug 2012 21:22:40 +0100
Subject: x86: xen: Cleanup and remove x86_init.paging.pagetable_setup_done()

At this stage x86_init.paging.pagetable_setup_done is only used in the
XEN case. Move its content in the x86_init.paging.pagetable_init setup
function and remove the now unused x86_init.paging.pagetable_setup_done
remaining infrastructure.

Signed-off-by: Attilio Rao <attilio.rao@citrix.com>
Acked-by: <konrad.wilk@oracle.com>
Cc: <Ian.Campbell@citrix.com>
Cc: <Stefano.Stabellini@eu.citrix.com>
Cc: <xen-devel@lists.xensource.com>
Link: http://lkml.kernel.org/r/1345580561-8506-5-git-send-email-attilio.rao@citrix.com
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/xen/mmu.c | 13 ++++---------
 1 file changed, 4 insertions(+), 9 deletions(-)

(limited to 'arch/x86/xen/mmu.c')

diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index c2ff7ea..7a769b7 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -1174,9 +1174,13 @@ static void xen_exit_mmap(struct mm_struct *mm)
 	spin_unlock(&mm->page_table_lock);
 }
 
+static void xen_post_allocator_init(void);
+
 static void __init xen_pagetable_init(void)
 {
 	paging_init();
+	xen_setup_shared_info();
+	xen_post_allocator_init();
 }
 
 static __init void xen_mapping_pagetable_reserve(u64 start, u64 end)
@@ -1193,14 +1197,6 @@ static __init void xen_mapping_pagetable_reserve(u64 start, u64 end)
 	}
 }
 
-static void xen_post_allocator_init(void);
-
-static void __init xen_pagetable_setup_done(pgd_t *base)
-{
-	xen_setup_shared_info();
-	xen_post_allocator_init();
-}
-
 static void xen_write_cr2(unsigned long cr2)
 {
 	this_cpu_read(xen_vcpu)->arch.cr2 = cr2;
@@ -2070,7 +2066,6 @@ void __init xen_init_mmu_ops(void)
 {
 	x86_init.mapping.pagetable_reserve = xen_mapping_pagetable_reserve;
 	x86_init.paging.pagetable_init = xen_pagetable_init;
-	x86_init.paging.pagetable_setup_done = xen_pagetable_setup_done;
 	pv_mmu_ops = xen_mmu_ops;
 
 	memset(dummy_mapping, 0xff, PAGE_SIZE);
-- 
cgit v1.1