From e7b52ffd45a6d834473f43b349e7d86593d763c7 Mon Sep 17 00:00:00 2001 From: Alex Shi Date: Thu, 28 Jun 2012 09:02:17 +0800 Subject: x86/flush_tlb: try flush_tlb_single one by one in flush_tlb_range x86 has no flush_tlb_range support in instruction level. Currently the flush_tlb_range just implemented by flushing all page table. That is not the best solution for all scenarios. In fact, if we just use 'invlpg' to flush few lines from TLB, we can get the performance gain from later remain TLB lines accessing. But the 'invlpg' instruction costs much of time. Its execution time can compete with cr3 rewriting, and even a bit more on SNB CPU. So, on a 512 4KB TLB entries CPU, the balance points is at: (512 - X) * 100ns(assumed TLB refill cost) = X(TLB flush entries) * 100ns(assumed invlpg cost) Here, X is 256, that is 1/2 of 512 entries. But with the mysterious CPU pre-fetcher and page miss handler Unit, the assumed TLB refill cost is far lower then 100ns in sequential access. And 2 HT siblings in one core makes the memory access more faster if they are accessing the same memory. So, in the patch, I just do the change when the target entries is less than 1/16 of whole active tlb entries. Actually, I have no data support for the percentage '1/16', so any suggestions are welcomed. As to hugetlb, guess due to smaller page table, and smaller active TLB entries, I didn't see benefit via my benchmark, so no optimizing now. My micro benchmark show in ideal scenarios, the performance improves 70 percent in reading. And in worst scenario, the reading/writing performance is similar with unpatched 3.4-rc4 kernel. Here is the reading data on my 2P * 4cores *HT NHM EP machine, with THP 'always': multi thread testing, '-t' paramter is thread number: with patch unpatched 3.4-rc4 ./mprotect -t 1 14ns 24ns ./mprotect -t 2 13ns 22ns ./mprotect -t 4 12ns 19ns ./mprotect -t 8 14ns 16ns ./mprotect -t 16 28ns 26ns ./mprotect -t 32 54ns 51ns ./mprotect -t 128 200ns 199ns Single process with sequencial flushing and memory accessing: with patch unpatched 3.4-rc4 ./mprotect 7ns 11ns ./mprotect -p 4096 -l 8 -n 10240 21ns 21ns [ hpa: http://lkml.kernel.org/r/1B4B44D9196EFF41AE41FDA404FC0A100BFF94@SHSMSX101.ccr.corp.intel.com has additional performance numbers. ] Signed-off-by: Alex Shi Link: http://lkml.kernel.org/r/1340845344-27557-3-git-send-email-alex.shi@intel.com Signed-off-by: H. Peter Anvin --- arch/x86/xen/mmu.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'arch/x86/xen/mmu.c') diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 3a73785..39ed567 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -1244,7 +1244,8 @@ static void xen_flush_tlb_single(unsigned long addr) } static void xen_flush_tlb_others(const struct cpumask *cpus, - struct mm_struct *mm, unsigned long va) + struct mm_struct *mm, unsigned long start, + unsigned long end) { struct { struct mmuext_op op; @@ -1256,7 +1257,7 @@ static void xen_flush_tlb_others(const struct cpumask *cpus, } *args; struct multicall_space mcs; - trace_xen_mmu_flush_tlb_others(cpus, mm, va); + trace_xen_mmu_flush_tlb_others(cpus, mm, start, end); if (cpumask_empty(cpus)) return; /* nothing to do */ @@ -1269,11 +1270,10 @@ static void xen_flush_tlb_others(const struct cpumask *cpus, cpumask_and(to_cpumask(args->mask), cpus, cpu_online_mask); cpumask_clear_cpu(smp_processor_id(), to_cpumask(args->mask)); - if (va == TLB_FLUSH_ALL) { - args->op.cmd = MMUEXT_TLB_FLUSH_MULTI; - } else { + args->op.cmd = MMUEXT_TLB_FLUSH_MULTI; + if (start != TLB_FLUSH_ALL && (end - start) <= PAGE_SIZE) { args->op.cmd = MMUEXT_INVLPG_MULTI; - args->op.arg1.linear_addr = va; + args->op.arg1.linear_addr = start; } MULTI_mmuext_op(mcs.mc, &args->op, 1, NULL, DOMID_SELF); -- cgit v1.1 From d095d43e78dd811d5c02c25e207c3364019b5a77 Mon Sep 17 00:00:00 2001 From: David Vrabel Date: Mon, 9 Jul 2012 11:39:05 +0100 Subject: xen/mm: do direct hypercall in xen_set_pte() if batching is unavailable In xen_set_pte() if batching is unavailable (because the caller is in an interrupt context such as handling a page fault) it would fall back to using native_set_pte() and trapping and emulating the PTE write. On 32-bit guests this requires two traps for each PTE write (one for each dword of the PTE). Instead, do one mmu_update hypercall directly. During construction of the initial page tables, continue to use native_set_pte() because most of the PTEs being set are in writable and unpinned pages (see phys_pmd_init() in arch/x86/mm/init_64.c) and using a hypercall for this is very expensive. This significantly improves page fault performance in 32-bit PV guests. lmbench3 test Before After Improvement ---------------------------------------------- lat_pagefault 3.18 us 2.32 us 27% lat_proc fork 356 us 313.3 us 11% Signed-off-by: David Vrabel Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/mmu.c | 30 +++++++++++++++++++++++++----- 1 file changed, 25 insertions(+), 5 deletions(-) (limited to 'arch/x86/xen/mmu.c') diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 3a73785..3f1783a 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -308,8 +308,20 @@ static bool xen_batched_set_pte(pte_t *ptep, pte_t pteval) static inline void __xen_set_pte(pte_t *ptep, pte_t pteval) { - if (!xen_batched_set_pte(ptep, pteval)) - native_set_pte(ptep, pteval); + if (!xen_batched_set_pte(ptep, pteval)) { + /* + * Could call native_set_pte() here and trap and + * emulate the PTE write but with 32-bit guests this + * needs two traps (one for each of the two 32-bit + * words in the PTE) so do one hypercall directly + * instead. + */ + struct mmu_update u; + + u.ptr = virt_to_machine(ptep).maddr | MMU_NORMAL_PT_UPDATE; + u.val = pte_val_ma(pteval); + HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF); + } } static void xen_set_pte(pte_t *ptep, pte_t pteval) @@ -1416,13 +1428,21 @@ static pte_t __init mask_rw_pte(pte_t *ptep, pte_t pte) } #endif /* CONFIG_X86_64 */ -/* Init-time set_pte while constructing initial pagetables, which - doesn't allow RO pagetable pages to be remapped RW */ +/* + * Init-time set_pte while constructing initial pagetables, which + * doesn't allow RO page table pages to be remapped RW. + * + * Many of these PTE updates are done on unpinned and writable pages + * and doing a hypercall for these is unnecessary and expensive. At + * this point it is not possible to tell if a page is pinned or not, + * so always write the PTE directly and rely on Xen trapping and + * emulating any updates as necessary. + */ static void __init xen_set_pte_init(pte_t *ptep, pte_t pte) { pte = mask_rw_pte(ptep, pte); - xen_set_pte(ptep, pte); + native_set_pte(ptep, pte); } static void pin_pagetable_pfn(unsigned cmd, unsigned long pfn) -- cgit v1.1 From 66a27dde9ae96e35278983f2e59bea04eb714cd0 Mon Sep 17 00:00:00 2001 From: David Vrabel Date: Mon, 9 Jul 2012 11:39:06 +0100 Subject: xen/mm: zero PTEs for non-present MFNs in the initial page table When constructing the initial page tables, if the MFN for a usable PFN is missing in the p2m then that frame is initially ballooned out. In this case, zero the PTE (as in decrease_reservation() in drivers/xen/balloon.c). This is obviously safe instead of having an valid PTE with an MFN of INVALID_P2M_ENTRY (~0). Signed-off-by: David Vrabel Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/mmu.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'arch/x86/xen/mmu.c') diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 3f1783a..27336df 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -1432,6 +1432,10 @@ static pte_t __init mask_rw_pte(pte_t *ptep, pte_t pte) * Init-time set_pte while constructing initial pagetables, which * doesn't allow RO page table pages to be remapped RW. * + * If there is no MFN for this PFN then this page is initially + * ballooned out so clear the PTE (as in decrease_reservation() in + * drivers/xen/balloon.c). + * * Many of these PTE updates are done on unpinned and writable pages * and doing a hypercall for these is unnecessary and expensive. At * this point it is not possible to tell if a page is pinned or not, @@ -1440,7 +1444,10 @@ static pte_t __init mask_rw_pte(pte_t *ptep, pte_t pte) */ static void __init xen_set_pte_init(pte_t *ptep, pte_t pte) { - pte = mask_rw_pte(ptep, pte); + if (pte_mfn(pte) != INVALID_P2M_ENTRY) + pte = mask_rw_pte(ptep, pte); + else + pte = __pte_ma(0); native_set_pte(ptep, pte); } -- cgit v1.1 From ce7184bdbd38d920fb515266fbbdc585ad2e5493 Mon Sep 17 00:00:00 2001 From: Alex Shi Date: Fri, 24 Aug 2012 08:55:13 +0000 Subject: xen: fix logical error in tlb flushing While TLB_FLUSH_ALL gets passed as 'end' argument to flush_tlb_others(), the Xen code was made to check its 'start' parameter. That may give a incorrect op.cmd to MMUEXT_INVLPG_MULTI instead of MMUEXT_TLB_FLUSH_MULTI. Then it causes some page can not be flushed from TLB. This patch fixed this issue. Reported-by: Jan Beulich Signed-off-by: Alex Shi Acked-by: Jan Beulich Tested-by: Yongjie Ren Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/mmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/xen/mmu.c') diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index b65a761..5141d80 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -1283,7 +1283,7 @@ static void xen_flush_tlb_others(const struct cpumask *cpus, cpumask_clear_cpu(smp_processor_id(), to_cpumask(args->mask)); args->op.cmd = MMUEXT_TLB_FLUSH_MULTI; - if (start != TLB_FLUSH_ALL && (end - start) <= PAGE_SIZE) { + if (end != TLB_FLUSH_ALL && (end - start) <= PAGE_SIZE) { args->op.cmd = MMUEXT_INVLPG_MULTI; args->op.arg1.linear_addr = start; } -- cgit v1.1 From 73090f8993a40a2f67fed1ab866a928c68cd3765 Mon Sep 17 00:00:00 2001 From: Attilio Rao Date: Tue, 21 Aug 2012 21:22:37 +0100 Subject: x86: Remove base argument from x86_init.paging.pagetable_setup_start We either use swapper_pg_dir or the argument is unused. Preparatory patch to simplify platform pagetable setup further. Signed-off-by: Attilio Rao Ackedb-by: Cc: Cc: Cc: Link: http://lkml.kernel.org/r/1345580561-8506-2-git-send-email-attilio.rao@citrix.com Signed-off-by: Thomas Gleixner --- arch/x86/xen/mmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/xen/mmu.c') diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 5141d80..32e66c8 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -1174,7 +1174,7 @@ static void xen_exit_mmap(struct mm_struct *mm) spin_unlock(&mm->page_table_lock); } -static void __init xen_pagetable_setup_start(pgd_t *base) +static void __init xen_pagetable_setup_start(void) { } -- cgit v1.1 From 7737b215ad0f94d20a87d98315da9f6cadaf35c9 Mon Sep 17 00:00:00 2001 From: Attilio Rao Date: Tue, 21 Aug 2012 21:22:38 +0100 Subject: x86: Rename pagetable_setup_start() to pagetable_init() In preparation for unifying the pagetable_setup_start() and pagetable_setup_done() setup functions, rename appropriately all the infrastructure related to pagetable_setup_start(). Signed-off-by: Attilio Rao Ackedd-by: Cc: Cc: Cc: Link: http://lkml.kernel.org/r/1345580561-8506-3-git-send-email-attilio.rao@citrix.com Signed-off-by: Thomas Gleixner --- arch/x86/xen/mmu.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86/xen/mmu.c') diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 32e66c8..624efbe 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -1174,7 +1174,7 @@ static void xen_exit_mmap(struct mm_struct *mm) spin_unlock(&mm->page_table_lock); } -static void __init xen_pagetable_setup_start(void) +static void __init xen_pagetable_init(void) { } @@ -2068,7 +2068,7 @@ static const struct pv_mmu_ops xen_mmu_ops __initconst = { void __init xen_init_mmu_ops(void) { x86_init.mapping.pagetable_reserve = xen_mapping_pagetable_reserve; - x86_init.paging.pagetable_setup_start = xen_pagetable_setup_start; + x86_init.paging.pagetable_init = xen_pagetable_init; x86_init.paging.pagetable_setup_done = xen_pagetable_setup_done; pv_mmu_ops = xen_mmu_ops; -- cgit v1.1 From 843b8ed2ec598aae5e3516b21957ede62a070e36 Mon Sep 17 00:00:00 2001 From: Attilio Rao Date: Tue, 21 Aug 2012 21:22:39 +0100 Subject: x86: Move paging_init() call to x86_init.paging.pagetable_init() Move the paging_init() call to the platform specific pagetable_init() function, so we can get rid of the extra pagetable_setup_done() function pointer. Signed-off-by: Attilio Rao Acked-by: Cc: Cc: Cc: Link: http://lkml.kernel.org/r/1345580561-8506-4-git-send-email-attilio.rao@citrix.com Signed-off-by: Thomas Gleixner --- arch/x86/xen/mmu.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86/xen/mmu.c') diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 624efbe..c2ff7ea 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -1176,6 +1176,7 @@ static void xen_exit_mmap(struct mm_struct *mm) static void __init xen_pagetable_init(void) { + paging_init(); } static __init void xen_mapping_pagetable_reserve(u64 start, u64 end) -- cgit v1.1 From c711288727a62f74d48032e56e51333dd104bf58 Mon Sep 17 00:00:00 2001 From: Attilio Rao Date: Tue, 21 Aug 2012 21:22:40 +0100 Subject: x86: xen: Cleanup and remove x86_init.paging.pagetable_setup_done() At this stage x86_init.paging.pagetable_setup_done is only used in the XEN case. Move its content in the x86_init.paging.pagetable_init setup function and remove the now unused x86_init.paging.pagetable_setup_done remaining infrastructure. Signed-off-by: Attilio Rao Acked-by: Cc: Cc: Cc: Link: http://lkml.kernel.org/r/1345580561-8506-5-git-send-email-attilio.rao@citrix.com Signed-off-by: Thomas Gleixner --- arch/x86/xen/mmu.c | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) (limited to 'arch/x86/xen/mmu.c') diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index c2ff7ea..7a769b7 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -1174,9 +1174,13 @@ static void xen_exit_mmap(struct mm_struct *mm) spin_unlock(&mm->page_table_lock); } +static void xen_post_allocator_init(void); + static void __init xen_pagetable_init(void) { paging_init(); + xen_setup_shared_info(); + xen_post_allocator_init(); } static __init void xen_mapping_pagetable_reserve(u64 start, u64 end) @@ -1193,14 +1197,6 @@ static __init void xen_mapping_pagetable_reserve(u64 start, u64 end) } } -static void xen_post_allocator_init(void); - -static void __init xen_pagetable_setup_done(pgd_t *base) -{ - xen_setup_shared_info(); - xen_post_allocator_init(); -} - static void xen_write_cr2(unsigned long cr2) { this_cpu_read(xen_vcpu)->arch.cr2 = cr2; @@ -2070,7 +2066,6 @@ void __init xen_init_mmu_ops(void) { x86_init.mapping.pagetable_reserve = xen_mapping_pagetable_reserve; x86_init.paging.pagetable_init = xen_pagetable_init; - x86_init.paging.pagetable_setup_done = xen_pagetable_setup_done; pv_mmu_ops = xen_mmu_ops; memset(dummy_mapping, 0xff, PAGE_SIZE); -- cgit v1.1