diff options
Diffstat (limited to 'arch/x86/xen')
-rw-r--r-- | arch/x86/xen/Makefile | 3 | ||||
-rw-r--r-- | arch/x86/xen/enlighten.c | 71 | ||||
-rw-r--r-- | arch/x86/xen/irq.c | 2 | ||||
-rw-r--r-- | arch/x86/xen/mmu.c | 454 | ||||
-rw-r--r-- | arch/x86/xen/multicalls.h | 2 | ||||
-rw-r--r-- | arch/x86/xen/p2m.c | 522 | ||||
-rw-r--r-- | arch/x86/xen/platform-pci-unplug.c | 2 | ||||
-rw-r--r-- | arch/x86/xen/setup.c | 79 | ||||
-rw-r--r-- | arch/x86/xen/spinlock.c | 8 | ||||
-rw-r--r-- | arch/x86/xen/suspend.c | 1 | ||||
-rw-r--r-- | arch/x86/xen/time.c | 10 | ||||
-rw-r--r-- | arch/x86/xen/xen-ops.h | 2 |
12 files changed, 687 insertions, 469 deletions
diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile index 7793851..17c565d 100644 --- a/arch/x86/xen/Makefile +++ b/arch/x86/xen/Makefile @@ -12,7 +12,8 @@ CFLAGS_mmu.o := $(nostackp) obj-y := enlighten.o setup.o multicalls.o mmu.o irq.o \ time.o xen-asm.o xen-asm_$(BITS).o \ - grant-table.o suspend.o platform-pci-unplug.o + grant-table.o suspend.o platform-pci-unplug.o \ + p2m.o obj-$(CONFIG_SMP) += smp.o obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= spinlock.o diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 235c0f4..50542ef 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -75,6 +75,11 @@ DEFINE_PER_CPU(struct vcpu_info, xen_vcpu_info); enum xen_domain_type xen_domain_type = XEN_NATIVE; EXPORT_SYMBOL_GPL(xen_domain_type); +unsigned long *machine_to_phys_mapping = (void *)MACH2PHYS_VIRT_START; +EXPORT_SYMBOL(machine_to_phys_mapping); +unsigned int machine_to_phys_order; +EXPORT_SYMBOL(machine_to_phys_order); + struct start_info *xen_start_info; EXPORT_SYMBOL_GPL(xen_start_info); @@ -569,8 +574,8 @@ static void xen_write_idt_entry(gate_desc *dt, int entrynum, const gate_desc *g) preempt_disable(); - start = __get_cpu_var(idt_desc).address; - end = start + __get_cpu_var(idt_desc).size + 1; + start = __this_cpu_read(idt_desc.address); + end = start + __this_cpu_read(idt_desc.size) + 1; xen_mc_flush(); @@ -1016,10 +1021,6 @@ static void xen_reboot(int reason) { struct sched_shutdown r = { .reason = reason }; -#ifdef CONFIG_SMP - stop_other_cpus(); -#endif - if (HYPERVISOR_sched_op(SCHEDOP_shutdown, &r)) BUG(); } @@ -1090,6 +1091,8 @@ static void __init xen_setup_stackprotector(void) /* First C function to be called on Xen boot */ asmlinkage void __init xen_start_kernel(void) { + struct physdev_set_iopl set_iopl; + int rc; pgd_t *pgd; if (!xen_start_info) @@ -1097,6 +1100,8 @@ asmlinkage void __init xen_start_kernel(void) xen_domain_type = XEN_PV_DOMAIN; + xen_setup_machphys_mapping(); + /* Install Xen paravirt ops */ pv_info = 
xen_info; pv_init_ops = xen_init_ops; @@ -1169,6 +1174,15 @@ asmlinkage void __init xen_start_kernel(void) xen_smp_init(); +#ifdef CONFIG_ACPI_NUMA + /* + * The pages we get from Xen are not related to machine pages, so + * any NUMA information the kernel tries to get from ACPI will + * be meaningless. Prevent it from trying. + */ + acpi_numa = -1; +#endif + pgd = (pgd_t *)xen_start_info->pt_base; if (!xen_initial_domain()) @@ -1180,7 +1194,7 @@ asmlinkage void __init xen_start_kernel(void) per_cpu(xen_vcpu, 0) = &HYPERVISOR_shared_info->vcpu_info[0]; local_irq_disable(); - early_boot_irqs_off(); + early_boot_irqs_disabled = true; memblock_init(); @@ -1191,8 +1205,6 @@ asmlinkage void __init xen_start_kernel(void) /* Allocate and initialize top and mid mfn levels for p2m structure */ xen_build_mfn_list_list(); - init_mm.pgd = pgd; - /* keep using Xen gdt for now; no urgent need to change it */ #ifdef CONFIG_X86_32 @@ -1202,10 +1214,18 @@ asmlinkage void __init xen_start_kernel(void) #else pv_info.kernel_rpl = 0; #endif - /* set the limit of our address space */ xen_reserve_top(); + /* We used to do this in xen_arch_setup, but that is too late on AMD + * where early_cpu_init (run before ->arch_setup()) calls early_amd_init + * which pokes 0xcf8 port. 
+ */ + set_iopl.iopl = 1; + rc = HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl); + if (rc != 0) + xen_raw_printk("physdev_op failed %d\n", rc); + #ifdef CONFIG_X86_32 /* set up basic CPUID stuff */ cpu_detect(&new_cpu_data); @@ -1245,25 +1265,6 @@ asmlinkage void __init xen_start_kernel(void) #endif } -static uint32_t xen_cpuid_base(void) -{ - uint32_t base, eax, ebx, ecx, edx; - char signature[13]; - - for (base = 0x40000000; base < 0x40010000; base += 0x100) { - cpuid(base, &eax, &ebx, &ecx, &edx); - *(uint32_t *)(signature + 0) = ebx; - *(uint32_t *)(signature + 4) = ecx; - *(uint32_t *)(signature + 8) = edx; - signature[12] = 0; - - if (!strcmp("XenVMMXenVMM", signature) && ((eax - base) >= 2)) - return base; - } - - return 0; -} - static int init_hvm_pv_info(int *major, int *minor) { uint32_t eax, ebx, ecx, edx, pages, msr, base; @@ -1373,6 +1374,18 @@ static bool __init xen_hvm_platform(void) return true; } +bool xen_hvm_need_lapic(void) +{ + if (xen_pv_domain()) + return false; + if (!xen_hvm_domain()) + return false; + if (xen_feature(XENFEAT_hvm_pirqs) && xen_have_vector_callback) + return false; + return true; +} +EXPORT_SYMBOL_GPL(xen_hvm_need_lapic); + const __refconst struct hypervisor_x86 x86_hyper_xen_hvm = { .name = "Xen HVM", .detect = xen_hvm_platform, diff --git a/arch/x86/xen/irq.c b/arch/x86/xen/irq.c index 9d30105..6a6fe89 100644 --- a/arch/x86/xen/irq.c +++ b/arch/x86/xen/irq.c @@ -126,7 +126,7 @@ static const struct pv_irq_ops xen_irq_ops __initdata = { #endif }; -void __init xen_init_irq_ops() +void __init xen_init_irq_ops(void) { pv_irq_ops = xen_irq_ops; x86_init.irqs.intr_init = xen_init_IRQ; diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index c237b81..5e92b61 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -173,371 +173,6 @@ DEFINE_PER_CPU(unsigned long, xen_current_cr3); /* actual vcpu cr3 */ */ #define USER_LIMIT ((STACK_TOP_MAX + PGDIR_SIZE - 1) & PGDIR_MASK) -/* - * Xen leaves the responsibility for 
maintaining p2m mappings to the - * guests themselves, but it must also access and update the p2m array - * during suspend/resume when all the pages are reallocated. - * - * The p2m table is logically a flat array, but we implement it as a - * three-level tree to allow the address space to be sparse. - * - * Xen - * | - * p2m_top p2m_top_mfn - * / \ / \ - * p2m_mid p2m_mid p2m_mid_mfn p2m_mid_mfn - * / \ / \ / / - * p2m p2m p2m p2m p2m p2m p2m ... - * - * The p2m_mid_mfn pages are mapped by p2m_top_mfn_p. - * - * The p2m_top and p2m_top_mfn levels are limited to 1 page, so the - * maximum representable pseudo-physical address space is: - * P2M_TOP_PER_PAGE * P2M_MID_PER_PAGE * P2M_PER_PAGE pages - * - * P2M_PER_PAGE depends on the architecture, as a mfn is always - * unsigned long (8 bytes on 64-bit, 4 bytes on 32), leading to - * 512 and 1024 entries respectively. - */ - -unsigned long xen_max_p2m_pfn __read_mostly; - -#define P2M_PER_PAGE (PAGE_SIZE / sizeof(unsigned long)) -#define P2M_MID_PER_PAGE (PAGE_SIZE / sizeof(unsigned long *)) -#define P2M_TOP_PER_PAGE (PAGE_SIZE / sizeof(unsigned long **)) - -#define MAX_P2M_PFN (P2M_TOP_PER_PAGE * P2M_MID_PER_PAGE * P2M_PER_PAGE) - -/* Placeholders for holes in the address space */ -static RESERVE_BRK_ARRAY(unsigned long, p2m_missing, P2M_PER_PAGE); -static RESERVE_BRK_ARRAY(unsigned long *, p2m_mid_missing, P2M_MID_PER_PAGE); -static RESERVE_BRK_ARRAY(unsigned long, p2m_mid_missing_mfn, P2M_MID_PER_PAGE); - -static RESERVE_BRK_ARRAY(unsigned long **, p2m_top, P2M_TOP_PER_PAGE); -static RESERVE_BRK_ARRAY(unsigned long, p2m_top_mfn, P2M_TOP_PER_PAGE); -static RESERVE_BRK_ARRAY(unsigned long *, p2m_top_mfn_p, P2M_TOP_PER_PAGE); - -RESERVE_BRK(p2m_mid, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE))); -RESERVE_BRK(p2m_mid_mfn, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE))); - -static inline unsigned p2m_top_index(unsigned long pfn) -{ - BUG_ON(pfn >= MAX_P2M_PFN); - return pfn / 
(P2M_MID_PER_PAGE * P2M_PER_PAGE); -} - -static inline unsigned p2m_mid_index(unsigned long pfn) -{ - return (pfn / P2M_PER_PAGE) % P2M_MID_PER_PAGE; -} - -static inline unsigned p2m_index(unsigned long pfn) -{ - return pfn % P2M_PER_PAGE; -} - -static void p2m_top_init(unsigned long ***top) -{ - unsigned i; - - for (i = 0; i < P2M_TOP_PER_PAGE; i++) - top[i] = p2m_mid_missing; -} - -static void p2m_top_mfn_init(unsigned long *top) -{ - unsigned i; - - for (i = 0; i < P2M_TOP_PER_PAGE; i++) - top[i] = virt_to_mfn(p2m_mid_missing_mfn); -} - -static void p2m_top_mfn_p_init(unsigned long **top) -{ - unsigned i; - - for (i = 0; i < P2M_TOP_PER_PAGE; i++) - top[i] = p2m_mid_missing_mfn; -} - -static void p2m_mid_init(unsigned long **mid) -{ - unsigned i; - - for (i = 0; i < P2M_MID_PER_PAGE; i++) - mid[i] = p2m_missing; -} - -static void p2m_mid_mfn_init(unsigned long *mid) -{ - unsigned i; - - for (i = 0; i < P2M_MID_PER_PAGE; i++) - mid[i] = virt_to_mfn(p2m_missing); -} - -static void p2m_init(unsigned long *p2m) -{ - unsigned i; - - for (i = 0; i < P2M_MID_PER_PAGE; i++) - p2m[i] = INVALID_P2M_ENTRY; -} - -/* - * Build the parallel p2m_top_mfn and p2m_mid_mfn structures - * - * This is called both at boot time, and after resuming from suspend: - * - At boot time we're called very early, and must use extend_brk() - * to allocate memory. - * - * - After resume we're called from within stop_machine, but the mfn - * tree should alreay be completely allocated. 
- */ -void xen_build_mfn_list_list(void) -{ - unsigned long pfn; - - /* Pre-initialize p2m_top_mfn to be completely missing */ - if (p2m_top_mfn == NULL) { - p2m_mid_missing_mfn = extend_brk(PAGE_SIZE, PAGE_SIZE); - p2m_mid_mfn_init(p2m_mid_missing_mfn); - - p2m_top_mfn_p = extend_brk(PAGE_SIZE, PAGE_SIZE); - p2m_top_mfn_p_init(p2m_top_mfn_p); - - p2m_top_mfn = extend_brk(PAGE_SIZE, PAGE_SIZE); - p2m_top_mfn_init(p2m_top_mfn); - } else { - /* Reinitialise, mfn's all change after migration */ - p2m_mid_mfn_init(p2m_mid_missing_mfn); - } - - for (pfn = 0; pfn < xen_max_p2m_pfn; pfn += P2M_PER_PAGE) { - unsigned topidx = p2m_top_index(pfn); - unsigned mididx = p2m_mid_index(pfn); - unsigned long **mid; - unsigned long *mid_mfn_p; - - mid = p2m_top[topidx]; - mid_mfn_p = p2m_top_mfn_p[topidx]; - - /* Don't bother allocating any mfn mid levels if - * they're just missing, just update the stored mfn, - * since all could have changed over a migrate. - */ - if (mid == p2m_mid_missing) { - BUG_ON(mididx); - BUG_ON(mid_mfn_p != p2m_mid_missing_mfn); - p2m_top_mfn[topidx] = virt_to_mfn(p2m_mid_missing_mfn); - pfn += (P2M_MID_PER_PAGE - 1) * P2M_PER_PAGE; - continue; - } - - if (mid_mfn_p == p2m_mid_missing_mfn) { - /* - * XXX boot-time only! We should never find - * missing parts of the mfn tree after - * runtime. extend_brk() will BUG if we call - * it too late. 
- */ - mid_mfn_p = extend_brk(PAGE_SIZE, PAGE_SIZE); - p2m_mid_mfn_init(mid_mfn_p); - - p2m_top_mfn_p[topidx] = mid_mfn_p; - } - - p2m_top_mfn[topidx] = virt_to_mfn(mid_mfn_p); - mid_mfn_p[mididx] = virt_to_mfn(mid[mididx]); - } -} - -void xen_setup_mfn_list_list(void) -{ - BUG_ON(HYPERVISOR_shared_info == &xen_dummy_shared_info); - - HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list = - virt_to_mfn(p2m_top_mfn); - HYPERVISOR_shared_info->arch.max_pfn = xen_max_p2m_pfn; -} - -/* Set up p2m_top to point to the domain-builder provided p2m pages */ -void __init xen_build_dynamic_phys_to_machine(void) -{ - unsigned long *mfn_list = (unsigned long *)xen_start_info->mfn_list; - unsigned long max_pfn = min(MAX_DOMAIN_PAGES, xen_start_info->nr_pages); - unsigned long pfn; - - xen_max_p2m_pfn = max_pfn; - - p2m_missing = extend_brk(PAGE_SIZE, PAGE_SIZE); - p2m_init(p2m_missing); - - p2m_mid_missing = extend_brk(PAGE_SIZE, PAGE_SIZE); - p2m_mid_init(p2m_mid_missing); - - p2m_top = extend_brk(PAGE_SIZE, PAGE_SIZE); - p2m_top_init(p2m_top); - - /* - * The domain builder gives us a pre-constructed p2m array in - * mfn_list for all the pages initially given to us, so we just - * need to graft that into our tree structure. 
- */ - for (pfn = 0; pfn < max_pfn; pfn += P2M_PER_PAGE) { - unsigned topidx = p2m_top_index(pfn); - unsigned mididx = p2m_mid_index(pfn); - - if (p2m_top[topidx] == p2m_mid_missing) { - unsigned long **mid = extend_brk(PAGE_SIZE, PAGE_SIZE); - p2m_mid_init(mid); - - p2m_top[topidx] = mid; - } - - p2m_top[topidx][mididx] = &mfn_list[pfn]; - } -} - -unsigned long get_phys_to_machine(unsigned long pfn) -{ - unsigned topidx, mididx, idx; - - if (unlikely(pfn >= MAX_P2M_PFN)) - return INVALID_P2M_ENTRY; - - topidx = p2m_top_index(pfn); - mididx = p2m_mid_index(pfn); - idx = p2m_index(pfn); - - return p2m_top[topidx][mididx][idx]; -} -EXPORT_SYMBOL_GPL(get_phys_to_machine); - -static void *alloc_p2m_page(void) -{ - return (void *)__get_free_page(GFP_KERNEL | __GFP_REPEAT); -} - -static void free_p2m_page(void *p) -{ - free_page((unsigned long)p); -} - -/* - * Fully allocate the p2m structure for a given pfn. We need to check - * that both the top and mid levels are allocated, and make sure the - * parallel mfn tree is kept in sync. We may race with other cpus, so - * the new pages are installed with cmpxchg; if we lose the race then - * simply free the page we allocated and use the one that's there. 
- */ -static bool alloc_p2m(unsigned long pfn) -{ - unsigned topidx, mididx; - unsigned long ***top_p, **mid; - unsigned long *top_mfn_p, *mid_mfn; - - topidx = p2m_top_index(pfn); - mididx = p2m_mid_index(pfn); - - top_p = &p2m_top[topidx]; - mid = *top_p; - - if (mid == p2m_mid_missing) { - /* Mid level is missing, allocate a new one */ - mid = alloc_p2m_page(); - if (!mid) - return false; - - p2m_mid_init(mid); - - if (cmpxchg(top_p, p2m_mid_missing, mid) != p2m_mid_missing) - free_p2m_page(mid); - } - - top_mfn_p = &p2m_top_mfn[topidx]; - mid_mfn = p2m_top_mfn_p[topidx]; - - BUG_ON(virt_to_mfn(mid_mfn) != *top_mfn_p); - - if (mid_mfn == p2m_mid_missing_mfn) { - /* Separately check the mid mfn level */ - unsigned long missing_mfn; - unsigned long mid_mfn_mfn; - - mid_mfn = alloc_p2m_page(); - if (!mid_mfn) - return false; - - p2m_mid_mfn_init(mid_mfn); - - missing_mfn = virt_to_mfn(p2m_mid_missing_mfn); - mid_mfn_mfn = virt_to_mfn(mid_mfn); - if (cmpxchg(top_mfn_p, missing_mfn, mid_mfn_mfn) != missing_mfn) - free_p2m_page(mid_mfn); - else - p2m_top_mfn_p[topidx] = mid_mfn; - } - - if (p2m_top[topidx][mididx] == p2m_missing) { - /* p2m leaf page is missing */ - unsigned long *p2m; - - p2m = alloc_p2m_page(); - if (!p2m) - return false; - - p2m_init(p2m); - - if (cmpxchg(&mid[mididx], p2m_missing, p2m) != p2m_missing) - free_p2m_page(p2m); - else - mid_mfn[mididx] = virt_to_mfn(p2m); - } - - return true; -} - -/* Try to install p2m mapping; fail if intermediate bits missing */ -bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn) -{ - unsigned topidx, mididx, idx; - - if (unlikely(pfn >= MAX_P2M_PFN)) { - BUG_ON(mfn != INVALID_P2M_ENTRY); - return true; - } - - topidx = p2m_top_index(pfn); - mididx = p2m_mid_index(pfn); - idx = p2m_index(pfn); - - if (p2m_top[topidx][mididx] == p2m_missing) - return mfn == INVALID_P2M_ENTRY; - - p2m_top[topidx][mididx][idx] = mfn; - - return true; -} - -bool set_phys_to_machine(unsigned long pfn, unsigned long mfn) -{ 
- if (unlikely(xen_feature(XENFEAT_auto_translated_physmap))) { - BUG_ON(pfn != mfn && mfn != INVALID_P2M_ENTRY); - return true; - } - - if (unlikely(!__set_phys_to_machine(pfn, mfn))) { - if (!alloc_p2m(pfn)) - return false; - - if (!__set_phys_to_machine(pfn, mfn)) - return false; - } - - return true; -} - unsigned long arbitrary_virt_to_mfn(void *vaddr) { xmaddr_t maddr = arbitrary_virt_to_machine(vaddr); @@ -566,6 +201,7 @@ xmaddr_t arbitrary_virt_to_machine(void *vaddr) offset = address & ~PAGE_MASK; return XMADDR(((phys_addr_t)pte_mfn(*pte) << PAGE_SHIFT) + offset); } +EXPORT_SYMBOL_GPL(arbitrary_virt_to_machine); void make_lowmem_page_readonly(void *vaddr) { @@ -2034,6 +1670,20 @@ static __init void xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn) set_page_prot(pmd, PAGE_KERNEL_RO); } +void __init xen_setup_machphys_mapping(void) +{ + struct xen_machphys_mapping mapping; + unsigned long machine_to_phys_nr_ents; + + if (HYPERVISOR_memory_op(XENMEM_machphys_mapping, &mapping) == 0) { + machine_to_phys_mapping = (unsigned long *)mapping.v_start; + machine_to_phys_nr_ents = mapping.max_mfn + 1; + } else { + machine_to_phys_nr_ents = MACH2PHYS_NR_ENTRIES; + } + machine_to_phys_order = fls(machine_to_phys_nr_ents - 1); +} + #ifdef CONFIG_X86_64 static void convert_pfn_mfn(void *v) { @@ -2119,44 +1769,83 @@ __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, return pgd; } #else /* !CONFIG_X86_64 */ -static RESERVE_BRK_ARRAY(pmd_t, level2_kernel_pgt, PTRS_PER_PMD); +static RESERVE_BRK_ARRAY(pmd_t, initial_kernel_pmd, PTRS_PER_PMD); +static RESERVE_BRK_ARRAY(pmd_t, swapper_kernel_pmd, PTRS_PER_PMD); + +static __init void xen_write_cr3_init(unsigned long cr3) +{ + unsigned long pfn = PFN_DOWN(__pa(swapper_pg_dir)); + + BUG_ON(read_cr3() != __pa(initial_page_table)); + BUG_ON(cr3 != __pa(swapper_pg_dir)); + + /* + * We are switching to swapper_pg_dir for the first time (from + * initial_page_table) and therefore need to mark that page + * read-only and then 
pin it. + * + * Xen disallows sharing of kernel PMDs for PAE + * guests. Therefore we must copy the kernel PMD from + * initial_page_table into a new kernel PMD to be used in + * swapper_pg_dir. + */ + swapper_kernel_pmd = + extend_brk(sizeof(pmd_t) * PTRS_PER_PMD, PAGE_SIZE); + memcpy(swapper_kernel_pmd, initial_kernel_pmd, + sizeof(pmd_t) * PTRS_PER_PMD); + swapper_pg_dir[KERNEL_PGD_BOUNDARY] = + __pgd(__pa(swapper_kernel_pmd) | _PAGE_PRESENT); + set_page_prot(swapper_kernel_pmd, PAGE_KERNEL_RO); + + set_page_prot(swapper_pg_dir, PAGE_KERNEL_RO); + xen_write_cr3(cr3); + pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, pfn); + + pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, + PFN_DOWN(__pa(initial_page_table))); + set_page_prot(initial_page_table, PAGE_KERNEL); + set_page_prot(initial_kernel_pmd, PAGE_KERNEL); + + pv_mmu_ops.write_cr3 = &xen_write_cr3; +} __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn) { pmd_t *kernel_pmd; - level2_kernel_pgt = extend_brk(sizeof(pmd_t *) * PTRS_PER_PMD, PAGE_SIZE); + initial_kernel_pmd = + extend_brk(sizeof(pmd_t) * PTRS_PER_PMD, PAGE_SIZE); max_pfn_mapped = PFN_DOWN(__pa(xen_start_info->pt_base) + xen_start_info->nr_pt_frames * PAGE_SIZE + 512*1024); kernel_pmd = m2v(pgd[KERNEL_PGD_BOUNDARY].pgd); - memcpy(level2_kernel_pgt, kernel_pmd, sizeof(pmd_t) * PTRS_PER_PMD); + memcpy(initial_kernel_pmd, kernel_pmd, sizeof(pmd_t) * PTRS_PER_PMD); - xen_map_identity_early(level2_kernel_pgt, max_pfn); + xen_map_identity_early(initial_kernel_pmd, max_pfn); - memcpy(swapper_pg_dir, pgd, sizeof(pgd_t) * PTRS_PER_PGD); - set_pgd(&swapper_pg_dir[KERNEL_PGD_BOUNDARY], - __pgd(__pa(level2_kernel_pgt) | _PAGE_PRESENT)); + memcpy(initial_page_table, pgd, sizeof(pgd_t) * PTRS_PER_PGD); + initial_page_table[KERNEL_PGD_BOUNDARY] = + __pgd(__pa(initial_kernel_pmd) | _PAGE_PRESENT); - set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO); - set_page_prot(swapper_pg_dir, PAGE_KERNEL_RO); + set_page_prot(initial_kernel_pmd, PAGE_KERNEL_RO); + 
set_page_prot(initial_page_table, PAGE_KERNEL_RO); set_page_prot(empty_zero_page, PAGE_KERNEL_RO); pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd))); - xen_write_cr3(__pa(swapper_pg_dir)); - - pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(swapper_pg_dir))); + pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, + PFN_DOWN(__pa(initial_page_table))); + xen_write_cr3(__pa(initial_page_table)); memblock_x86_reserve_range(__pa(xen_start_info->pt_base), __pa(xen_start_info->pt_base + xen_start_info->nr_pt_frames * PAGE_SIZE), "XEN PAGETABLES"); - return swapper_pg_dir; + return initial_page_table; } #endif /* CONFIG_X86_64 */ @@ -2290,7 +1979,11 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = { .write_cr2 = xen_write_cr2, .read_cr3 = xen_read_cr3, +#ifdef CONFIG_X86_32 + .write_cr3 = xen_write_cr3_init, +#else .write_cr3 = xen_write_cr3, +#endif .flush_tlb_user = xen_flush_tlb, .flush_tlb_kernel = xen_flush_tlb, @@ -2358,8 +2051,6 @@ void __init xen_init_mmu_ops(void) x86_init.paging.pagetable_setup_done = xen_pagetable_setup_done; pv_mmu_ops = xen_mmu_ops; - vmap_lazy_unmap = false; - memset(dummy_mapping, 0xff, PAGE_SIZE); } @@ -2627,7 +2318,8 @@ int xen_remap_domain_mfn_range(struct vm_area_struct *vma, prot = __pgprot(pgprot_val(prot) | _PAGE_IOMAP); - vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP; + BUG_ON(!((vma->vm_flags & (VM_PFNMAP | VM_RESERVED | VM_IO)) == + (VM_PFNMAP | VM_RESERVED | VM_IO))); rmd.mfn = mfn; rmd.prot = prot; diff --git a/arch/x86/xen/multicalls.h b/arch/x86/xen/multicalls.h index 9e565da..4ec8035 100644 --- a/arch/x86/xen/multicalls.h +++ b/arch/x86/xen/multicalls.h @@ -22,7 +22,7 @@ static inline void xen_mc_batch(void) unsigned long flags; /* need to disable interrupts until this entry is complete */ local_irq_save(flags); - __get_cpu_var(xen_mc_irq_flags) = flags; + __this_cpu_write(xen_mc_irq_flags, flags); } static inline struct multicall_space xen_mc_entry(size_t args) diff --git a/arch/x86/xen/p2m.c 
b/arch/x86/xen/p2m.c new file mode 100644 index 0000000..fd12d7c --- /dev/null +++ b/arch/x86/xen/p2m.c @@ -0,0 +1,522 @@ +/* + * Xen leaves the responsibility for maintaining p2m mappings to the + * guests themselves, but it must also access and update the p2m array + * during suspend/resume when all the pages are reallocated. + * + * The p2m table is logically a flat array, but we implement it as a + * three-level tree to allow the address space to be sparse. + * + * Xen + * | + * p2m_top p2m_top_mfn + * / \ / \ + * p2m_mid p2m_mid p2m_mid_mfn p2m_mid_mfn + * / \ / \ / / + * p2m p2m p2m p2m p2m p2m p2m ... + * + * The p2m_mid_mfn pages are mapped by p2m_top_mfn_p. + * + * The p2m_top and p2m_top_mfn levels are limited to 1 page, so the + * maximum representable pseudo-physical address space is: + * P2M_TOP_PER_PAGE * P2M_MID_PER_PAGE * P2M_PER_PAGE pages + * + * P2M_PER_PAGE depends on the architecture, as a mfn is always + * unsigned long (8 bytes on 64-bit, 4 bytes on 32), leading to + * 512 and 1024 entries respectively. 
+ */ + +#include <linux/init.h> +#include <linux/module.h> +#include <linux/list.h> +#include <linux/hash.h> +#include <linux/sched.h> + +#include <asm/cache.h> +#include <asm/setup.h> + +#include <asm/xen/page.h> +#include <asm/xen/hypercall.h> +#include <asm/xen/hypervisor.h> + +#include "xen-ops.h" + +static void __init m2p_override_init(void); + +unsigned long xen_max_p2m_pfn __read_mostly; + +#define P2M_PER_PAGE (PAGE_SIZE / sizeof(unsigned long)) +#define P2M_MID_PER_PAGE (PAGE_SIZE / sizeof(unsigned long *)) +#define P2M_TOP_PER_PAGE (PAGE_SIZE / sizeof(unsigned long **)) + +#define MAX_P2M_PFN (P2M_TOP_PER_PAGE * P2M_MID_PER_PAGE * P2M_PER_PAGE) + +/* Placeholders for holes in the address space */ +static RESERVE_BRK_ARRAY(unsigned long, p2m_missing, P2M_PER_PAGE); +static RESERVE_BRK_ARRAY(unsigned long *, p2m_mid_missing, P2M_MID_PER_PAGE); +static RESERVE_BRK_ARRAY(unsigned long, p2m_mid_missing_mfn, P2M_MID_PER_PAGE); + +static RESERVE_BRK_ARRAY(unsigned long **, p2m_top, P2M_TOP_PER_PAGE); +static RESERVE_BRK_ARRAY(unsigned long, p2m_top_mfn, P2M_TOP_PER_PAGE); +static RESERVE_BRK_ARRAY(unsigned long *, p2m_top_mfn_p, P2M_TOP_PER_PAGE); + +RESERVE_BRK(p2m_mid, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE))); +RESERVE_BRK(p2m_mid_mfn, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE))); + +static inline unsigned p2m_top_index(unsigned long pfn) +{ + BUG_ON(pfn >= MAX_P2M_PFN); + return pfn / (P2M_MID_PER_PAGE * P2M_PER_PAGE); +} + +static inline unsigned p2m_mid_index(unsigned long pfn) +{ + return (pfn / P2M_PER_PAGE) % P2M_MID_PER_PAGE; +} + +static inline unsigned p2m_index(unsigned long pfn) +{ + return pfn % P2M_PER_PAGE; +} + +static void p2m_top_init(unsigned long ***top) +{ + unsigned i; + + for (i = 0; i < P2M_TOP_PER_PAGE; i++) + top[i] = p2m_mid_missing; +} + +static void p2m_top_mfn_init(unsigned long *top) +{ + unsigned i; + + for (i = 0; i < P2M_TOP_PER_PAGE; i++) + top[i] = 
virt_to_mfn(p2m_mid_missing_mfn); +} + +static void p2m_top_mfn_p_init(unsigned long **top) +{ + unsigned i; + + for (i = 0; i < P2M_TOP_PER_PAGE; i++) + top[i] = p2m_mid_missing_mfn; +} + +static void p2m_mid_init(unsigned long **mid) +{ + unsigned i; + + for (i = 0; i < P2M_MID_PER_PAGE; i++) + mid[i] = p2m_missing; +} + +static void p2m_mid_mfn_init(unsigned long *mid) +{ + unsigned i; + + for (i = 0; i < P2M_MID_PER_PAGE; i++) + mid[i] = virt_to_mfn(p2m_missing); +} + +static void p2m_init(unsigned long *p2m) +{ + unsigned i; + + for (i = 0; i < P2M_MID_PER_PAGE; i++) + p2m[i] = INVALID_P2M_ENTRY; +} + +/* + * Build the parallel p2m_top_mfn and p2m_mid_mfn structures + * + * This is called both at boot time, and after resuming from suspend: + * - At boot time we're called very early, and must use extend_brk() + * to allocate memory. + * + * - After resume we're called from within stop_machine, but the mfn + * tree should already be completely allocated. + */ +void xen_build_mfn_list_list(void) +{ + unsigned long pfn; + + /* Pre-initialize p2m_top_mfn to be completely missing */ + if (p2m_top_mfn == NULL) { + p2m_mid_missing_mfn = extend_brk(PAGE_SIZE, PAGE_SIZE); + p2m_mid_mfn_init(p2m_mid_missing_mfn); + + p2m_top_mfn_p = extend_brk(PAGE_SIZE, PAGE_SIZE); + p2m_top_mfn_p_init(p2m_top_mfn_p); + + p2m_top_mfn = extend_brk(PAGE_SIZE, PAGE_SIZE); + p2m_top_mfn_init(p2m_top_mfn); + } else { + /* Reinitialise, mfn's all change after migration */ + p2m_mid_mfn_init(p2m_mid_missing_mfn); + } + + for (pfn = 0; pfn < xen_max_p2m_pfn; pfn += P2M_PER_PAGE) { + unsigned topidx = p2m_top_index(pfn); + unsigned mididx = p2m_mid_index(pfn); + unsigned long **mid; + unsigned long *mid_mfn_p; + + mid = p2m_top[topidx]; + mid_mfn_p = p2m_top_mfn_p[topidx]; + + /* Don't bother allocating any mfn mid levels if + * they're just missing, just update the stored mfn, + * since all could have changed over a migrate. 
+ */ + if (mid == p2m_mid_missing) { + BUG_ON(mididx); + BUG_ON(mid_mfn_p != p2m_mid_missing_mfn); + p2m_top_mfn[topidx] = virt_to_mfn(p2m_mid_missing_mfn); + pfn += (P2M_MID_PER_PAGE - 1) * P2M_PER_PAGE; + continue; + } + + if (mid_mfn_p == p2m_mid_missing_mfn) { + /* + * XXX boot-time only! We should never find + * missing parts of the mfn tree after + * runtime. extend_brk() will BUG if we call + * it too late. + */ + mid_mfn_p = extend_brk(PAGE_SIZE, PAGE_SIZE); + p2m_mid_mfn_init(mid_mfn_p); + + p2m_top_mfn_p[topidx] = mid_mfn_p; + } + + p2m_top_mfn[topidx] = virt_to_mfn(mid_mfn_p); + mid_mfn_p[mididx] = virt_to_mfn(mid[mididx]); + } +} + +void xen_setup_mfn_list_list(void) +{ + BUG_ON(HYPERVISOR_shared_info == &xen_dummy_shared_info); + + HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list = + virt_to_mfn(p2m_top_mfn); + HYPERVISOR_shared_info->arch.max_pfn = xen_max_p2m_pfn; +} + +/* Set up p2m_top to point to the domain-builder provided p2m pages */ +void __init xen_build_dynamic_phys_to_machine(void) +{ + unsigned long *mfn_list = (unsigned long *)xen_start_info->mfn_list; + unsigned long max_pfn = min(MAX_DOMAIN_PAGES, xen_start_info->nr_pages); + unsigned long pfn; + + xen_max_p2m_pfn = max_pfn; + + p2m_missing = extend_brk(PAGE_SIZE, PAGE_SIZE); + p2m_init(p2m_missing); + + p2m_mid_missing = extend_brk(PAGE_SIZE, PAGE_SIZE); + p2m_mid_init(p2m_mid_missing); + + p2m_top = extend_brk(PAGE_SIZE, PAGE_SIZE); + p2m_top_init(p2m_top); + + /* + * The domain builder gives us a pre-constructed p2m array in + * mfn_list for all the pages initially given to us, so we just + * need to graft that into our tree structure. 
+ */ + for (pfn = 0; pfn < max_pfn; pfn += P2M_PER_PAGE) { + unsigned topidx = p2m_top_index(pfn); + unsigned mididx = p2m_mid_index(pfn); + + if (p2m_top[topidx] == p2m_mid_missing) { + unsigned long **mid = extend_brk(PAGE_SIZE, PAGE_SIZE); + p2m_mid_init(mid); + + p2m_top[topidx] = mid; + } + + /* + * As long as the mfn_list has enough entries to completely + * fill a p2m page, pointing into the array is ok. But if + * not the entries beyond the last pfn will be undefined. + */ + if (unlikely(pfn + P2M_PER_PAGE > max_pfn)) { + unsigned long p2midx; + + p2midx = max_pfn % P2M_PER_PAGE; + for ( ; p2midx < P2M_PER_PAGE; p2midx++) + mfn_list[pfn + p2midx] = INVALID_P2M_ENTRY; + } + p2m_top[topidx][mididx] = &mfn_list[pfn]; + } + + m2p_override_init(); +} + +unsigned long get_phys_to_machine(unsigned long pfn) +{ + unsigned topidx, mididx, idx; + + if (unlikely(pfn >= MAX_P2M_PFN)) + return INVALID_P2M_ENTRY; + + topidx = p2m_top_index(pfn); + mididx = p2m_mid_index(pfn); + idx = p2m_index(pfn); + + return p2m_top[topidx][mididx][idx]; +} +EXPORT_SYMBOL_GPL(get_phys_to_machine); + +static void *alloc_p2m_page(void) +{ + return (void *)__get_free_page(GFP_KERNEL | __GFP_REPEAT); +} + +static void free_p2m_page(void *p) +{ + free_page((unsigned long)p); +} + +/* + * Fully allocate the p2m structure for a given pfn. We need to check + * that both the top and mid levels are allocated, and make sure the + * parallel mfn tree is kept in sync. We may race with other cpus, so + * the new pages are installed with cmpxchg; if we lose the race then + * simply free the page we allocated and use the one that's there. 
+ */ +static bool alloc_p2m(unsigned long pfn) +{ + unsigned topidx, mididx; + unsigned long ***top_p, **mid; + unsigned long *top_mfn_p, *mid_mfn; + + topidx = p2m_top_index(pfn); + mididx = p2m_mid_index(pfn); + + top_p = &p2m_top[topidx]; + mid = *top_p; + + if (mid == p2m_mid_missing) { + /* Mid level is missing, allocate a new one */ + mid = alloc_p2m_page(); + if (!mid) + return false; + + p2m_mid_init(mid); + + if (cmpxchg(top_p, p2m_mid_missing, mid) != p2m_mid_missing) + free_p2m_page(mid); + } + + top_mfn_p = &p2m_top_mfn[topidx]; + mid_mfn = p2m_top_mfn_p[topidx]; + + BUG_ON(virt_to_mfn(mid_mfn) != *top_mfn_p); + + if (mid_mfn == p2m_mid_missing_mfn) { + /* Separately check the mid mfn level */ + unsigned long missing_mfn; + unsigned long mid_mfn_mfn; + + mid_mfn = alloc_p2m_page(); + if (!mid_mfn) + return false; + + p2m_mid_mfn_init(mid_mfn); + + missing_mfn = virt_to_mfn(p2m_mid_missing_mfn); + mid_mfn_mfn = virt_to_mfn(mid_mfn); + if (cmpxchg(top_mfn_p, missing_mfn, mid_mfn_mfn) != missing_mfn) + free_p2m_page(mid_mfn); + else + p2m_top_mfn_p[topidx] = mid_mfn; + } + + if (p2m_top[topidx][mididx] == p2m_missing) { + /* p2m leaf page is missing */ + unsigned long *p2m; + + p2m = alloc_p2m_page(); + if (!p2m) + return false; + + p2m_init(p2m); + + if (cmpxchg(&mid[mididx], p2m_missing, p2m) != p2m_missing) + free_p2m_page(p2m); + else + mid_mfn[mididx] = virt_to_mfn(p2m); + } + + return true; +} + +/* Try to install p2m mapping; fail if intermediate bits missing */ +bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn) +{ + unsigned topidx, mididx, idx; + + if (unlikely(pfn >= MAX_P2M_PFN)) { + BUG_ON(mfn != INVALID_P2M_ENTRY); + return true; + } + + topidx = p2m_top_index(pfn); + mididx = p2m_mid_index(pfn); + idx = p2m_index(pfn); + + if (p2m_top[topidx][mididx] == p2m_missing) + return mfn == INVALID_P2M_ENTRY; + + p2m_top[topidx][mididx][idx] = mfn; + + return true; +} + +bool set_phys_to_machine(unsigned long pfn, unsigned long mfn) +{ 
+ if (unlikely(xen_feature(XENFEAT_auto_translated_physmap))) { + BUG_ON(pfn != mfn && mfn != INVALID_P2M_ENTRY); + return true; + } + + if (unlikely(!__set_phys_to_machine(pfn, mfn))) { + if (!alloc_p2m(pfn)) + return false; + + if (!__set_phys_to_machine(pfn, mfn)) + return false; + } + + return true; +} + +#define M2P_OVERRIDE_HASH_SHIFT 10 +#define M2P_OVERRIDE_HASH (1 << M2P_OVERRIDE_HASH_SHIFT) + +static RESERVE_BRK_ARRAY(struct list_head, m2p_overrides, M2P_OVERRIDE_HASH); +static DEFINE_SPINLOCK(m2p_override_lock); + +static void __init m2p_override_init(void) +{ + unsigned i; + + m2p_overrides = extend_brk(sizeof(*m2p_overrides) * M2P_OVERRIDE_HASH, + sizeof(unsigned long)); + + for (i = 0; i < M2P_OVERRIDE_HASH; i++) + INIT_LIST_HEAD(&m2p_overrides[i]); +} + +static unsigned long mfn_hash(unsigned long mfn) +{ + return hash_long(mfn, M2P_OVERRIDE_HASH_SHIFT); +} + +/* Add an MFN override for a particular page */ +int m2p_add_override(unsigned long mfn, struct page *page) +{ + unsigned long flags; + unsigned long pfn; + unsigned long address; + unsigned level; + pte_t *ptep = NULL; + + pfn = page_to_pfn(page); + if (!PageHighMem(page)) { + address = (unsigned long)__va(pfn << PAGE_SHIFT); + ptep = lookup_address(address, &level); + + if (WARN(ptep == NULL || level != PG_LEVEL_4K, + "m2p_add_override: pfn %lx not mapped", pfn)) + return -EINVAL; + } + + page->private = mfn; + page->index = pfn_to_mfn(pfn); + + __set_phys_to_machine(pfn, FOREIGN_FRAME(mfn)); + if (!PageHighMem(page)) + /* Just zap old mapping for now */ + pte_clear(&init_mm, address, ptep); + + spin_lock_irqsave(&m2p_override_lock, flags); + list_add(&page->lru, &m2p_overrides[mfn_hash(mfn)]); + spin_unlock_irqrestore(&m2p_override_lock, flags); + + return 0; +} + +int m2p_remove_override(struct page *page) +{ + unsigned long flags; + unsigned long mfn; + unsigned long pfn; + unsigned long address; + unsigned level; + pte_t *ptep = NULL; + + pfn = page_to_pfn(page); + mfn = 
get_phys_to_machine(pfn); + if (mfn == INVALID_P2M_ENTRY || !(mfn & FOREIGN_FRAME_BIT)) + return -EINVAL; + + if (!PageHighMem(page)) { + address = (unsigned long)__va(pfn << PAGE_SHIFT); + ptep = lookup_address(address, &level); + + if (WARN(ptep == NULL || level != PG_LEVEL_4K, + "m2p_remove_override: pfn %lx not mapped", pfn)) + return -EINVAL; + } + + spin_lock_irqsave(&m2p_override_lock, flags); + list_del(&page->lru); + spin_unlock_irqrestore(&m2p_override_lock, flags); + __set_phys_to_machine(pfn, page->index); + + if (!PageHighMem(page)) + set_pte_at(&init_mm, address, ptep, + pfn_pte(pfn, PAGE_KERNEL)); + /* No tlb flush necessary because the caller already + * left the pte unmapped. */ + + return 0; +} + +struct page *m2p_find_override(unsigned long mfn) +{ + unsigned long flags; + struct list_head *bucket = &m2p_overrides[mfn_hash(mfn)]; + struct page *p, *ret; + + ret = NULL; + + spin_lock_irqsave(&m2p_override_lock, flags); + + list_for_each_entry(p, bucket, lru) { + if (p->private == mfn) { + ret = p; + break; + } + } + + spin_unlock_irqrestore(&m2p_override_lock, flags); + + return ret; +} + +unsigned long m2p_find_override_pfn(unsigned long mfn, unsigned long pfn) +{ + struct page *p = m2p_find_override(mfn); + unsigned long ret = pfn; + + if (p) + ret = page_to_pfn(p); + + return ret; +} +EXPORT_SYMBOL_GPL(m2p_find_override_pfn); diff --git a/arch/x86/xen/platform-pci-unplug.c b/arch/x86/xen/platform-pci-unplug.c index 0f45638..25c52f9 100644 --- a/arch/x86/xen/platform-pci-unplug.c +++ b/arch/x86/xen/platform-pci-unplug.c @@ -68,7 +68,7 @@ static int __init check_platform_magic(void) return 0; } -void __init xen_unplug_emulated_devices(void) +void xen_unplug_emulated_devices(void) { int r; diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index b1dbdaa..a8a66a5 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -23,7 +23,6 @@ #include <xen/interface/callback.h> #include <xen/interface/memory.h> #include 
<xen/interface/physdev.h> -#include <xen/interface/memory.h> #include <xen/features.h> #include "xen-ops.h" @@ -118,16 +117,18 @@ static unsigned long __init xen_return_unused_memory(unsigned long max_pfn, const struct e820map *e820) { phys_addr_t max_addr = PFN_PHYS(max_pfn); - phys_addr_t last_end = 0; + phys_addr_t last_end = ISA_END_ADDRESS; unsigned long released = 0; int i; + /* Free any unused memory above the low 1Mbyte. */ for (i = 0; i < e820->nr_map && last_end < max_addr; i++) { phys_addr_t end = e820->map[i].addr; end = min(max_addr, end); - released += xen_release_chunk(last_end, end); - last_end = e820->map[i].addr + e820->map[i].size; + if (last_end < end) + released += xen_release_chunk(last_end, end); + last_end = max(last_end, e820->map[i].addr + e820->map[i].size); } if (last_end < max_addr) @@ -164,6 +165,7 @@ char * __init xen_memory_setup(void) XENMEM_memory_map; rc = HYPERVISOR_memory_op(op, &memmap); if (rc == -ENOSYS) { + BUG_ON(xen_initial_domain()); memmap.nr_entries = 1; map[0].addr = 0ULL; map[0].size = mem_end; @@ -177,36 +179,39 @@ char * __init xen_memory_setup(void) e820.nr_map = 0; xen_extra_mem_start = mem_end; for (i = 0; i < memmap.nr_entries; i++) { - unsigned long long end = map[i].addr + map[i].size; + unsigned long long end; + + /* Guard against non-page aligned E820 entries. */ + if (map[i].type == E820_RAM) + map[i].size -= (map[i].size + map[i].addr) % PAGE_SIZE; - if (map[i].type == E820_RAM) { - if (map[i].addr < mem_end && end > mem_end) { - /* Truncate region to max_mem. 
*/ - u64 delta = end - mem_end; + end = map[i].addr + map[i].size; + if (map[i].type == E820_RAM && end > mem_end) { + /* RAM off the end - may be partially included */ + u64 delta = min(map[i].size, end - mem_end); - map[i].size -= delta; - extra_pages += PFN_DOWN(delta); + map[i].size -= delta; + end -= delta; - end = mem_end; - } + extra_pages += PFN_DOWN(delta); } - if (end > xen_extra_mem_start) + if (map[i].size > 0 && end > xen_extra_mem_start) xen_extra_mem_start = end; - /* If region is non-RAM or below mem_end, add what remains */ - if ((map[i].type != E820_RAM || map[i].addr < mem_end) && - map[i].size > 0) + /* Add region if any remains */ + if (map[i].size > 0) e820_add_region(map[i].addr, map[i].size, map[i].type); } /* - * Even though this is normal, usable memory under Xen, reserve - * ISA memory anyway because too many things think they can poke + * In domU, the ISA region is normal, usable memory, but we + * reserve ISA memory anyway because too many things poke * about in there. * - * In a dom0 kernel, this region is identity mapped with the - * hardware ISA area, so it really is out of bounds. + * In Dom0, the host E820 information can leave gaps in the + * ISA range, which would cause us to release those pages. To + * avoid this, we unconditionally reserve them here. */ e820_add_region(ISA_START_ADDRESS, ISA_END_ADDRESS - ISA_START_ADDRESS, E820_RESERVED); @@ -244,26 +249,11 @@ char * __init xen_memory_setup(void) else extra_pages = 0; - if (!xen_initial_domain()) - xen_add_extra_mem(extra_pages); + xen_add_extra_mem(extra_pages); return "Xen"; } -static void xen_idle(void) -{ - local_irq_disable(); - - if (need_resched()) - local_irq_enable(); - else { - current_thread_info()->status &= ~TS_POLLING; - smp_mb__after_clear_bit(); - safe_halt(); - current_thread_info()->status |= TS_POLLING; - } -} - /* * Set the bit indicating "nosegneg" library variants should be used. 
* We only need to bother in pure 32-bit mode; compat 32-bit processes @@ -333,9 +323,6 @@ void __cpuinit xen_enable_syscall(void) void __init xen_arch_setup(void) { - struct physdev_set_iopl set_iopl; - int rc; - xen_panic_handler_init(); HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_4gb_segments); @@ -352,11 +339,6 @@ void __init xen_arch_setup(void) xen_enable_sysenter(); xen_enable_syscall(); - set_iopl.iopl = 1; - rc = HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl); - if (rc != 0) - printk(KERN_INFO "physdev_op failed %d\n", rc); - #ifdef CONFIG_ACPI if (!(xen_start_info->flags & SIF_INITDOMAIN)) { printk(KERN_INFO "ACPI in unprivileged domain disabled\n"); @@ -368,7 +350,12 @@ void __init xen_arch_setup(void) MAX_GUEST_CMDLINE > COMMAND_LINE_SIZE ? COMMAND_LINE_SIZE : MAX_GUEST_CMDLINE); - pm_idle = xen_idle; + /* Set up idle, making sure it calls safe_halt() pvop */ +#ifdef CONFIG_X86_32 + boot_cpu_data.hlt_works_ok = 1; +#endif + pm_idle = default_idle; + boot_option_idle_override = IDLE_HALT; fiddle_vdso(); } diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c index 23e061b..cc9b1e1 100644 --- a/arch/x86/xen/spinlock.c +++ b/arch/x86/xen/spinlock.c @@ -159,8 +159,8 @@ static inline struct xen_spinlock *spinning_lock(struct xen_spinlock *xl) { struct xen_spinlock *prev; - prev = __get_cpu_var(lock_spinners); - __get_cpu_var(lock_spinners) = xl; + prev = __this_cpu_read(lock_spinners); + __this_cpu_write(lock_spinners, xl); wmb(); /* set lock of interest before count */ @@ -179,14 +179,14 @@ static inline void unspinning_lock(struct xen_spinlock *xl, struct xen_spinlock asm(LOCK_PREFIX " decw %0" : "+m" (xl->spinners) : : "memory"); wmb(); /* decrement count before restoring lock */ - __get_cpu_var(lock_spinners) = prev; + __this_cpu_write(lock_spinners, prev); } static noinline int xen_spin_lock_slow(struct arch_spinlock *lock, bool irq_enable) { struct xen_spinlock *xl = (struct xen_spinlock *)lock; struct xen_spinlock *prev; - int 
irq = __get_cpu_var(lock_kicker_irq); + int irq = __this_cpu_read(lock_kicker_irq); int ret; u64 start; diff --git a/arch/x86/xen/suspend.c b/arch/x86/xen/suspend.c index 1d789d5..9bbd63a 100644 --- a/arch/x86/xen/suspend.c +++ b/arch/x86/xen/suspend.c @@ -31,6 +31,7 @@ void xen_hvm_post_suspend(int suspend_cancelled) int cpu; xen_hvm_init_shared_info(); xen_callback_vector(); + xen_unplug_emulated_devices(); if (xen_feature(XENFEAT_hvm_safe_pvclock)) { for_each_online_cpu(cpu) { xen_setup_runstate_info(cpu); diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c index b2bb5aa..067759e 100644 --- a/arch/x86/xen/time.c +++ b/arch/x86/xen/time.c @@ -135,24 +135,24 @@ static void do_stolen_accounting(void) /* Add the appropriate number of ticks of stolen time, including any left-overs from last time. */ - stolen = runnable + offline + __get_cpu_var(xen_residual_stolen); + stolen = runnable + offline + __this_cpu_read(xen_residual_stolen); if (stolen < 0) stolen = 0; ticks = iter_div_u64_rem(stolen, NS_PER_TICK, &stolen); - __get_cpu_var(xen_residual_stolen) = stolen; + __this_cpu_write(xen_residual_stolen, stolen); account_steal_ticks(ticks); /* Add the appropriate number of ticks of blocked time, including any left-overs from last time. 
*/ - blocked += __get_cpu_var(xen_residual_blocked); + blocked += __this_cpu_read(xen_residual_blocked); if (blocked < 0) blocked = 0; ticks = iter_div_u64_rem(blocked, NS_PER_TICK, &blocked); - __get_cpu_var(xen_residual_blocked) = blocked; + __this_cpu_write(xen_residual_blocked, blocked); account_idle_ticks(ticks); } @@ -426,6 +426,8 @@ void xen_timer_resume(void) { int cpu; + pvclock_resume(); + if (xen_clockevent != &xen_vcpuop_clockevent) return; diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index 6404474..9d41bf9 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -43,7 +43,7 @@ void xen_vcpu_restore(void); void xen_callback_vector(void); void xen_hvm_init_shared_info(void); -void __init xen_unplug_emulated_devices(void); +void xen_unplug_emulated_devices(void); void __init xen_build_dynamic_phys_to_machine(void); |