From 7567cae105e435b53e5a3e778546dd3ec53e3204 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Tue, 9 Mar 2010 12:01:10 +0200 Subject: KVM: take srcu lock before call to complete_pio() complete_pio() may use slot table which is protected by srcu. Signed-off-by: Gleb Natapov Cc: stable@kernel.org Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 24cd0ee..2eb999d 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4483,7 +4483,9 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) kvm_set_cr8(vcpu, kvm_run->cr8); if (vcpu->arch.pio.cur_count) { + vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); r = complete_pio(vcpu); + srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); if (r) goto out; } -- cgit v1.1 From b7af40433870aa0636932ad39b0c48a0cb319057 Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Tue, 9 Mar 2010 14:55:19 +0900 Subject: KVM: SVM: Fix memory leaks that happen when svm_create_vcpu() fails svm_create_vcpu() does not free the pages allocated during the creation when it fails to complete the allocations. This patch fixes it. Signed-off-by: Takuya Yoshikawa Signed-off-by: Avi Kivity --- arch/x86/kvm/svm.c | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 445c594..2ba5820 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -706,29 +706,28 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) if (err) goto free_svm; + err = -ENOMEM; page = alloc_page(GFP_KERNEL); - if (!page) { - err = -ENOMEM; + if (!page) goto uninit; - } - err = -ENOMEM; msrpm_pages = alloc_pages(GFP_KERNEL, MSRPM_ALLOC_ORDER); if (!msrpm_pages) - goto uninit; + goto free_page1; nested_msrpm_pages = alloc_pages(GFP_KERNEL, MSRPM_ALLOC_ORDER); if (!nested_msrpm_pages) - goto uninit; - - svm->msrpm = page_address(msrpm_pages); - svm_vcpu_init_msrpm(svm->msrpm); + goto free_page2; hsave_page = alloc_page(GFP_KERNEL); if (!hsave_page) - goto uninit; + goto free_page3; + svm->nested.hsave = page_address(hsave_page); + svm->msrpm = page_address(msrpm_pages); + svm_vcpu_init_msrpm(svm->msrpm); + svm->nested.msrpm = page_address(nested_msrpm_pages); svm->vmcb = page_address(page); @@ -744,6 +743,12 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) return &svm->vcpu; +free_page3: + __free_pages(nested_msrpm_pages, MSRPM_ALLOC_ORDER); +free_page2: + __free_pages(msrpm_pages, MSRPM_ALLOC_ORDER); +free_page1: + __free_page(page); uninit: kvm_vcpu_uninit(&svm->vcpu); free_svm: -- cgit v1.1 From d6a23895aa82353788a1cc5a1d9a1c963465463e Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 11 Mar 2010 12:20:03 +0200 Subject: KVM: Don't spam kernel log when injecting exceptions due to bad cr writes These are guest-triggerable. Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 27 --------------------------- 1 file changed, 27 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 2eb999d..8f9b08d 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -433,8 +433,6 @@ void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) #ifdef CONFIG_X86_64 if (cr0 & 0xffffffff00000000UL) { - printk(KERN_DEBUG "set_cr0: 0x%lx #GP, reserved bits 0x%lx\n", - cr0, kvm_read_cr0(vcpu)); kvm_inject_gp(vcpu, 0); return; } @@ -443,14 +441,11 @@ void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) cr0 &= ~CR0_RESERVED_BITS; if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD)) { - printk(KERN_DEBUG "set_cr0: #GP, CD == 0 && NW == 1\n"); kvm_inject_gp(vcpu, 0); return; } if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE)) { - printk(KERN_DEBUG "set_cr0: #GP, set PG flag " - "and a clear PE flag\n"); kvm_inject_gp(vcpu, 0); return; } @@ -461,15 +456,11 @@ void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) int cs_db, cs_l; if (!is_pae(vcpu)) { - printk(KERN_DEBUG "set_cr0: #GP, start paging " - "in long mode while PAE is disabled\n"); kvm_inject_gp(vcpu, 0); return; } kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); if (cs_l) { - printk(KERN_DEBUG "set_cr0: #GP, start paging " - "in long mode while CS.L == 1\n"); kvm_inject_gp(vcpu, 0); return; @@ -477,8 +468,6 @@ void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) } else #endif if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.cr3)) { - printk(KERN_DEBUG "set_cr0: #GP, pdptrs " - "reserved bits\n"); kvm_inject_gp(vcpu, 0); return; } @@ -505,28 +494,23 @@ void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE; if (cr4 & CR4_RESERVED_BITS) { - printk(KERN_DEBUG "set_cr4: #GP, reserved bits\n"); kvm_inject_gp(vcpu, 0); return; } if (is_long_mode(vcpu)) { if (!(cr4 & X86_CR4_PAE)) { - printk(KERN_DEBUG "set_cr4: #GP, clearing PAE while " - "in long mode\n"); kvm_inject_gp(vcpu, 0); return; } } else if (is_paging(vcpu) && (cr4 & X86_CR4_PAE) && ((cr4 ^ old_cr4) & pdptr_bits) && !load_pdptrs(vcpu, vcpu->arch.cr3)) { - printk(KERN_DEBUG "set_cr4: #GP, pdptrs reserved bits\n"); kvm_inject_gp(vcpu, 0); return; } if (cr4 & X86_CR4_VMXE) { - printk(KERN_DEBUG "set_cr4: #GP, setting VMXE\n"); kvm_inject_gp(vcpu, 0); return; } @@ -547,21 +531,16 @@ void kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) if (is_long_mode(vcpu)) { if (cr3 & CR3_L_MODE_RESERVED_BITS) { - printk(KERN_DEBUG "set_cr3: #GP, reserved bits\n"); kvm_inject_gp(vcpu, 0); return; } } else { if (is_pae(vcpu)) { if (cr3 & CR3_PAE_RESERVED_BITS) { - printk(KERN_DEBUG - "set_cr3: #GP, reserved bits\n"); kvm_inject_gp(vcpu, 0); return; } if (is_paging(vcpu) && !load_pdptrs(vcpu, cr3)) { - printk(KERN_DEBUG "set_cr3: #GP, pdptrs " - "reserved bits\n"); kvm_inject_gp(vcpu, 0); return; } @@ -593,7 +572,6 @@ EXPORT_SYMBOL_GPL(kvm_set_cr3); void kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8) { if (cr8 & CR8_RESERVED_BITS) { - printk(KERN_DEBUG "set_cr8: #GP, reserved bits 0x%lx\n", cr8); kvm_inject_gp(vcpu, 0); return; } @@ -649,15 +627,12 @@ static u32 emulated_msrs[] = { static void set_efer(struct kvm_vcpu *vcpu, u64 efer) { if (efer & efer_reserved_bits) { - printk(KERN_DEBUG "set_efer: 0x%llx #GP, reserved bits\n", - efer); kvm_inject_gp(vcpu, 0); return; } if (is_paging(vcpu) && (vcpu->arch.efer & EFER_LME) != (efer & EFER_LME)) { - printk(KERN_DEBUG "set_efer: #GP, change LME while paging\n"); kvm_inject_gp(vcpu, 0); return; } @@ -667,7 +642,6 @@ static void set_efer(struct kvm_vcpu *vcpu, u64 efer) feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0); if (!feat || !(feat->edx & bit(X86_FEATURE_FXSR_OPT))) { - printk(KERN_DEBUG "set_efer: #GP, enable FFXSR w/o CPUID capability\n"); kvm_inject_gp(vcpu, 0); return; } @@ -678,7 +652,6 @@ static void set_efer(struct kvm_vcpu *vcpu, u64 efer) feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0); if (!feat || !(feat->ecx & bit(X86_FEATURE_SVM))) { - printk(KERN_DEBUG "set_efer: #GP, enable SVM w/o SVM\n"); kvm_inject_gp(vcpu, 0); return; } -- cgit v1.1 From 114be429c8cd44e57f312af2bbd6734e5a185b0d Mon Sep 17 00:00:00 2001 From: Andre Przywara Date: Wed, 24 Mar 2010 17:46:42 +0100 Subject: KVM: allow bit 10 to be cleared in MSR_IA32_MC4_CTL There is a quirk for AMD K8 CPUs in many Linux kernels (see arch/x86/kernel/cpu/mcheck/mce.c:__mcheck_cpu_apply_quirks()) that clears bit 10 in that MCE related MSR. KVM can only cope with all zeros or all ones, so it will inject a #GP into the guest, which will let it panic. So lets add a quirk to the quirk and ignore this single cleared bit. This fixes -cpu kvm64 on all machines and -cpu host on K8 machines with some guest Linux kernels. Signed-off-by: Andre Przywara Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 8f9b08d..9ad3d06 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -940,9 +940,13 @@ static int set_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 data) if (msr >= MSR_IA32_MC0_CTL && msr < MSR_IA32_MC0_CTL + 4 * bank_num) { u32 offset = msr - MSR_IA32_MC0_CTL; - /* only 0 or all 1s can be written to IA32_MCi_CTL */ + /* only 0 or all 1s can be written to IA32_MCi_CTL + * some Linux kernels though clear bit 10 in bank 4 to + * workaround a BIOS/GART TBL issue on AMD K8s, ignore + * this to avoid an uncatched #GP in the guest + */ if ((offset & 0x3) == 0 && - data != 0 && data != ~(u64)0) + data != 0 && (data | (1 << 10)) != ~(u64)0) return -1; vcpu->arch.mce_banks[offset] = data; break; -- cgit v1.1 From 78ac8b47c566dd6177a3b9b291b756ccb70670b7 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 8 Apr 2010 18:19:35 +0300 Subject: KVM: VMX: Save/restore rflags.vm correctly in real mode Currently we set eflags.vm unconditionally when entering real mode emulation through virtual-8086 mode, and clear it unconditionally when we enter protected mode. The means that the following sequence KVM_SET_REGS (rflags.vm=1) KVM_SET_SREGS (cr0.pe=1) Ends up with rflags.vm clear due to KVM_SET_SREGS triggering enter_pmode(). Fix by shadowing rflags.vm (and rflags.iopl) correctly while in real mode: reads and writes to those bits access a shadow register instead of the actual register. Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/vmx.c | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 686492e..bc933cf 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -77,6 +77,8 @@ module_param(emulate_invalid_guest_state, bool, S_IRUGO); #define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE) #define KVM_RMODE_VM_CR4_ALWAYS_ON (X86_CR4_VME | X86_CR4_PAE | X86_CR4_VMXE) +#define RMODE_GUEST_OWNED_EFLAGS_BITS (~(X86_EFLAGS_IOPL | X86_EFLAGS_VM)) + /* * These 2 parameters are used to config the controls for Pause-Loop Exiting: * ple_gap: upper bound on the amount of time between two successive @@ -131,7 +133,7 @@ struct vcpu_vmx { } host_state; struct { int vm86_active; - u8 save_iopl; + ulong save_rflags; struct kvm_save_segment { u16 selector; unsigned long base; @@ -818,18 +820,23 @@ static void vmx_fpu_deactivate(struct kvm_vcpu *vcpu) static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu) { - unsigned long rflags; + unsigned long rflags, save_rflags; rflags = vmcs_readl(GUEST_RFLAGS); - if (to_vmx(vcpu)->rmode.vm86_active) - rflags &= ~(unsigned long)(X86_EFLAGS_IOPL | X86_EFLAGS_VM); + if (to_vmx(vcpu)->rmode.vm86_active) { + rflags &= RMODE_GUEST_OWNED_EFLAGS_BITS; + save_rflags = to_vmx(vcpu)->rmode.save_rflags; + rflags |= save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS; + } return rflags; } static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) { - if (to_vmx(vcpu)->rmode.vm86_active) + if (to_vmx(vcpu)->rmode.vm86_active) { + to_vmx(vcpu)->rmode.save_rflags = rflags; rflags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM; + } vmcs_writel(GUEST_RFLAGS, rflags); } @@ -1483,8 +1490,8 @@ static void enter_pmode(struct kvm_vcpu *vcpu) vmcs_write32(GUEST_TR_AR_BYTES, vmx->rmode.tr.ar); flags = vmcs_readl(GUEST_RFLAGS); - flags &= ~(X86_EFLAGS_IOPL | X86_EFLAGS_VM); - flags |= (vmx->rmode.save_iopl << IOPL_SHIFT); + flags &= RMODE_GUEST_OWNED_EFLAGS_BITS; + flags |= vmx->rmode.save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS; vmcs_writel(GUEST_RFLAGS, flags); vmcs_writel(GUEST_CR4, (vmcs_readl(GUEST_CR4) & ~X86_CR4_VME) | @@ -1557,8 +1564,7 @@ static void enter_rmode(struct kvm_vcpu *vcpu) vmcs_write32(GUEST_TR_AR_BYTES, 0x008b); flags = vmcs_readl(GUEST_RFLAGS); - vmx->rmode.save_iopl - = (flags & X86_EFLAGS_IOPL) >> IOPL_SHIFT; + vmx->rmode.save_rflags = flags; flags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM; -- cgit v1.1 From 77662e0028c7c63e34257fda03ff9625c59d939d Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Fri, 16 Apr 2010 16:34:42 +0800 Subject: KVM: MMU: fix kvm_mmu_zap_page() and its calling path This patch fix: - calculate zapped page number properly in mmu_zap_unsync_children() - calculate freeed page number properly kvm_mmu_change_mmu_pages() - if zapped children page it shoud restart hlist walking KVM-Stable-Tag. Signed-off-by: Xiao Guangrong Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/mmu.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 48aeee8..19a8906 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1490,8 +1490,8 @@ static int mmu_zap_unsync_children(struct kvm *kvm, for_each_sp(pages, sp, parents, i) { kvm_mmu_zap_page(kvm, sp); mmu_pages_clear_parents(&parents); + zapped++; } - zapped += pages.nr; kvm_mmu_pages_init(parent, &parents, &pages); } @@ -1542,14 +1542,16 @@ void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages) */ if (used_pages > kvm_nr_mmu_pages) { - while (used_pages > kvm_nr_mmu_pages) { + while (used_pages > kvm_nr_mmu_pages && + !list_empty(&kvm->arch.active_mmu_pages)) { struct kvm_mmu_page *page; page = container_of(kvm->arch.active_mmu_pages.prev, struct kvm_mmu_page, link); - kvm_mmu_zap_page(kvm, page); + used_pages -= kvm_mmu_zap_page(kvm, page); used_pages--; } + kvm_nr_mmu_pages = used_pages; kvm->arch.n_free_mmu_pages = 0; } else @@ -1596,7 +1598,8 @@ static void mmu_unshadow(struct kvm *kvm, gfn_t gfn) && !sp->role.invalid) { pgprintk("%s: zap %lx %x\n", __func__, gfn, sp->role.word); - kvm_mmu_zap_page(kvm, sp); + if (kvm_mmu_zap_page(kvm, sp)) + nn = bucket->first; } } } -- cgit v1.1 From 87bf6e7de1134f48681fd2ce4b7c1ec45458cb6d Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Mon, 12 Apr 2010 19:35:35 +0900 Subject: KVM: fix the handling of dirty bitmaps to avoid overflows Int is not long enough to store the size of a dirty bitmap. This patch fixes this problem with the introduction of a wrapper function to calculate the sizes of dirty bitmaps. Note: in mark_page_dirty(), we have to consider the fact that __set_bit() takes the offset as int, not long. Signed-off-by: Takuya Yoshikawa Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/x86.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 9ad3d06..45aa90f 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2612,8 +2612,9 @@ static int kvm_vm_ioctl_reinject(struct kvm *kvm, int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log) { - int r, n, i; + int r, i; struct kvm_memory_slot *memslot; + unsigned long n; unsigned long is_dirty = 0; unsigned long *dirty_bitmap = NULL; @@ -2628,7 +2629,7 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, if (!memslot->dirty_bitmap) goto out; - n = ALIGN(memslot->npages, BITS_PER_LONG) / 8; + n = kvm_dirty_bitmap_bytes(memslot); r = -ENOMEM; dirty_bitmap = vmalloc(n); -- cgit v1.1 From e8861cfe2c75bdce36655b64d7ce02c2b31b604d Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Wed, 14 Apr 2010 16:57:11 +0200 Subject: KVM: x86: Fix TSS size check for 16-bit tasks A 16-bit TSS is only 44 bytes long. So make sure to test for the correct size on task switch. Signed-off-by: Jan Kiszka Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 45aa90f..3c4ca98 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -5126,6 +5126,7 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason) int ret = 0; u32 old_tss_base = get_segment_base(vcpu, VCPU_SREG_TR); u16 old_tss_sel = get_segment_selector(vcpu, VCPU_SREG_TR); + u32 desc_limit; old_tss_base = kvm_mmu_gva_to_gpa_write(vcpu, old_tss_base, NULL); @@ -5148,7 +5149,10 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason) } } - if (!nseg_desc.p || get_desc_limit(&nseg_desc) < 0x67) { + desc_limit = get_desc_limit(&nseg_desc); + if (!nseg_desc.p || + ((desc_limit < 0x67 && (nseg_desc.type & 8)) || + desc_limit < 0x2b)) { kvm_queue_exception_e(vcpu, TS_VECTOR, tss_selector & 0xfffc); return 1; } -- cgit v1.1