aboutsummaryrefslogtreecommitdiffstats
path: root/virt
diff options
context:
space:
mode:
Diffstat (limited to 'virt')
-rw-r--r--virt/kvm/assigned-dev.c18
-rw-r--r--virt/kvm/eventfd.c9
-rw-r--r--virt/kvm/ioapic.c2
-rw-r--r--virt/kvm/kvm_main.c223
4 files changed, 152 insertions, 100 deletions
diff --git a/virt/kvm/assigned-dev.c b/virt/kvm/assigned-dev.c
index ae72ae6..6cc4b97 100644
--- a/virt/kvm/assigned-dev.c
+++ b/virt/kvm/assigned-dev.c
@@ -197,8 +197,13 @@ static void kvm_free_assigned_device(struct kvm *kvm,
{
kvm_free_assigned_irq(kvm, assigned_dev);
- __pci_reset_function(assigned_dev->dev);
- pci_restore_state(assigned_dev->dev);
+ pci_reset_function(assigned_dev->dev);
+ if (pci_load_and_free_saved_state(assigned_dev->dev,
+ &assigned_dev->pci_saved_state))
+ printk(KERN_INFO "%s: Couldn't reload %s saved state\n",
+ __func__, dev_name(&assigned_dev->dev->dev));
+ else
+ pci_restore_state(assigned_dev->dev);
pci_release_regions(assigned_dev->dev);
pci_disable_device(assigned_dev->dev);
@@ -516,7 +521,10 @@ static int kvm_vm_ioctl_assign_device(struct kvm *kvm,
pci_reset_function(dev);
pci_save_state(dev);
-
+ match->pci_saved_state = pci_store_saved_state(dev);
+ if (!match->pci_saved_state)
+ printk(KERN_DEBUG "%s: Couldn't store %s saved state\n",
+ __func__, dev_name(&dev->dev));
match->assigned_dev_id = assigned_dev->assigned_dev_id;
match->host_segnr = assigned_dev->segnr;
match->host_busnr = assigned_dev->busnr;
@@ -546,7 +554,9 @@ out:
mutex_unlock(&kvm->lock);
return r;
out_list_del:
- pci_restore_state(dev);
+ if (pci_load_and_free_saved_state(dev, &match->pci_saved_state))
+ printk(KERN_INFO "%s: Couldn't reload %s saved state\n",
+ __func__, dev_name(&dev->dev));
list_del(&match->list);
pci_release_regions(dev);
out_disable:
diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
index 2ca4535..73358d2 100644
--- a/virt/kvm/eventfd.c
+++ b/virt/kvm/eventfd.c
@@ -90,7 +90,7 @@ irqfd_shutdown(struct work_struct *work)
* We know no new events will be scheduled at this point, so block
* until all previously outstanding events have completed
*/
- flush_work(&irqfd->inject);
+ flush_work_sync(&irqfd->inject);
/*
* It is now safe to release the object's resources
@@ -313,8 +313,9 @@ kvm_irqfd_deassign(struct kvm *kvm, int fd, int gsi)
if (irqfd->eventfd == eventfd && irqfd->gsi == gsi) {
/*
* This rcu_assign_pointer is needed for when
- * another thread calls kvm_irqfd_update before
- * we flush workqueue below.
+ * another thread calls kvm_irq_routing_update before
+ * we flush workqueue below (we synchronize with
+ * kvm_irq_routing_update using irqfds.lock).
* It is paired with synchronize_rcu done by caller
* of that function.
*/
@@ -577,7 +578,7 @@ kvm_assign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
mutex_lock(&kvm->slots_lock);
- /* Verify that there isnt a match already */
+ /* Verify that there isn't a match already */
if (ioeventfd_check_collision(kvm, p)) {
ret = -EEXIST;
goto unlock_fail;
diff --git a/virt/kvm/ioapic.c b/virt/kvm/ioapic.c
index 0b9df83..8df1ca1 100644
--- a/virt/kvm/ioapic.c
+++ b/virt/kvm/ioapic.c
@@ -167,7 +167,7 @@ static int ioapic_deliver(struct kvm_ioapic *ioapic, int irq)
ioapic_debug("dest=%x dest_mode=%x delivery_mode=%x "
"vector=%x trig_mode=%x\n",
- entry->fields.dest, entry->fields.dest_mode,
+ entry->fields.dest_id, entry->fields.dest_mode,
entry->fields.delivery_mode, entry->fields.vector,
entry->fields.trig_mode);
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index f29abeb..22cdb96 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -30,7 +30,7 @@
#include <linux/debugfs.h>
#include <linux/highmem.h>
#include <linux/file.h>
-#include <linux/sysdev.h>
+#include <linux/syscore_ops.h>
#include <linux/cpu.h>
#include <linux/sched.h>
#include <linux/cpumask.h>
@@ -52,7 +52,6 @@
#include <asm/io.h>
#include <asm/uaccess.h>
#include <asm/pgtable.h>
-#include <asm-generic/bitops/le.h>
#include "coalesced_mmio.h"
#include "async_pf.h"
@@ -69,7 +68,7 @@ MODULE_LICENSE("GPL");
* kvm->lock --> kvm->slots_lock --> kvm->irq_lock
*/
-DEFINE_SPINLOCK(kvm_lock);
+DEFINE_RAW_SPINLOCK(kvm_lock);
LIST_HEAD(vm_list);
static cpumask_var_t cpus_hardware_enabled;
@@ -137,6 +136,14 @@ void vcpu_load(struct kvm_vcpu *vcpu)
int cpu;
mutex_lock(&vcpu->mutex);
+ if (unlikely(vcpu->pid != current->pids[PIDTYPE_PID].pid)) {
+ /* The thread running this VCPU changed. */
+ struct pid *oldpid = vcpu->pid;
+ struct pid *newpid = get_task_pid(current, PIDTYPE_PID);
+ rcu_assign_pointer(vcpu->pid, newpid);
+ synchronize_rcu();
+ put_pid(oldpid);
+ }
cpu = get_cpu();
preempt_notifier_register(&vcpu->preempt_notifier);
kvm_arch_vcpu_load(vcpu, cpu);
@@ -165,13 +172,16 @@ static bool make_all_cpus_request(struct kvm *kvm, unsigned int req)
zalloc_cpumask_var(&cpus, GFP_ATOMIC);
- raw_spin_lock(&kvm->requests_lock);
- me = smp_processor_id();
+ me = get_cpu();
kvm_for_each_vcpu(i, vcpu, kvm) {
- if (kvm_make_check_request(req, vcpu))
- continue;
+ kvm_make_request(req, vcpu);
cpu = vcpu->cpu;
- if (cpus != NULL && cpu != -1 && cpu != me)
+
+ /* Set ->requests bit before we read ->mode */
+ smp_mb();
+
+ if (cpus != NULL && cpu != -1 && cpu != me &&
+ kvm_vcpu_exiting_guest_mode(vcpu) != OUTSIDE_GUEST_MODE)
cpumask_set_cpu(cpu, cpus);
}
if (unlikely(cpus == NULL))
@@ -180,7 +190,7 @@ static bool make_all_cpus_request(struct kvm *kvm, unsigned int req)
smp_call_function_many(cpus, ack_flush, NULL, 1);
else
called = false;
- raw_spin_unlock(&kvm->requests_lock);
+ put_cpu();
free_cpumask_var(cpus);
return called;
}
@@ -209,6 +219,7 @@ int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
vcpu->cpu = -1;
vcpu->kvm = kvm;
vcpu->vcpu_id = id;
+ vcpu->pid = NULL;
init_waitqueue_head(&vcpu->wq);
kvm_async_pf_vcpu_init(vcpu);
@@ -233,6 +244,7 @@ EXPORT_SYMBOL_GPL(kvm_vcpu_init);
void kvm_vcpu_uninit(struct kvm_vcpu *vcpu)
{
+ put_pid(vcpu->pid);
kvm_arch_vcpu_uninit(vcpu);
free_page((unsigned long)vcpu->run);
}
@@ -455,6 +467,7 @@ static struct kvm *kvm_create_vm(void)
if (!kvm->buses[i])
goto out_err;
}
+ spin_lock_init(&kvm->mmu_lock);
r = kvm_init_mmu_notifier(kvm);
if (r)
@@ -462,16 +475,14 @@ static struct kvm *kvm_create_vm(void)
kvm->mm = current->mm;
atomic_inc(&kvm->mm->mm_count);
- spin_lock_init(&kvm->mmu_lock);
- raw_spin_lock_init(&kvm->requests_lock);
kvm_eventfd_init(kvm);
mutex_init(&kvm->lock);
mutex_init(&kvm->irq_lock);
mutex_init(&kvm->slots_lock);
atomic_set(&kvm->users_count, 1);
- spin_lock(&kvm_lock);
+ raw_spin_lock(&kvm_lock);
list_add(&kvm->vm_list, &vm_list);
- spin_unlock(&kvm_lock);
+ raw_spin_unlock(&kvm_lock);
return kvm;
@@ -544,9 +555,9 @@ static void kvm_destroy_vm(struct kvm *kvm)
struct mm_struct *mm = kvm->mm;
kvm_arch_sync_events(kvm);
- spin_lock(&kvm_lock);
+ raw_spin_lock(&kvm_lock);
list_del(&kvm->vm_list);
- spin_unlock(&kvm_lock);
+ raw_spin_unlock(&kvm_lock);
kvm_free_irq_routing(kvm);
for (i = 0; i < KVM_NR_BUSES; i++)
kvm_io_bus_destroy(kvm->buses[i]);
@@ -588,6 +599,7 @@ static int kvm_vm_release(struct inode *inode, struct file *filp)
return 0;
}
+#ifndef CONFIG_S390
/*
* Allocation size is twice as large as the actual dirty bitmap size.
* This makes it possible to do double buffering: see x86's
@@ -608,6 +620,7 @@ static int kvm_create_dirty_bitmap(struct kvm_memory_slot *memslot)
memslot->dirty_bitmap_head = memslot->dirty_bitmap;
return 0;
}
+#endif /* !CONFIG_S390 */
/*
* Allocate some memory and give it an address in the guest physical address
@@ -621,7 +634,7 @@ int __kvm_set_memory_region(struct kvm *kvm,
struct kvm_userspace_memory_region *mem,
int user_alloc)
{
- int r, flush_shadow = 0;
+ int r;
gfn_t base_gfn;
unsigned long npages;
unsigned long i;
@@ -635,7 +648,10 @@ int __kvm_set_memory_region(struct kvm *kvm,
goto out;
if (mem->guest_phys_addr & (PAGE_SIZE - 1))
goto out;
- if (user_alloc && (mem->userspace_addr & (PAGE_SIZE - 1)))
+ /* We can read the guest memory with __xxx_user() later on. */
+ if (user_alloc &&
+ ((mem->userspace_addr & (PAGE_SIZE - 1)) ||
+ !access_ok(VERIFY_WRITE, mem->userspace_addr, mem->memory_size)))
goto out;
if (mem->slot >= KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS)
goto out;
@@ -741,8 +757,6 @@ skip_lpage:
if (kvm_create_dirty_bitmap(&new) < 0)
goto out_free;
/* destroy any largepage mappings for dirty tracking */
- if (old.npages)
- flush_shadow = 1;
}
#else /* not defined CONFIG_S390 */
new.user_alloc = user_alloc;
@@ -813,9 +827,6 @@ skip_lpage:
kvm_free_physmem_slot(&old, &new);
kfree(old_memslots);
- if (flush_shadow)
- kvm_arch_flush_shadow(kvm);
-
return 0;
out_free:
@@ -988,23 +999,6 @@ out:
return size;
}
-int memslot_id(struct kvm *kvm, gfn_t gfn)
-{
- int i;
- struct kvm_memslots *slots = kvm_memslots(kvm);
- struct kvm_memory_slot *memslot = NULL;
-
- for (i = 0; i < slots->nmemslots; ++i) {
- memslot = &slots->memslots[i];
-
- if (gfn >= memslot->base_gfn
- && gfn < memslot->base_gfn + memslot->npages)
- break;
- }
-
- return memslot - slots->memslots;
-}
-
static unsigned long gfn_to_hva_many(struct kvm_memory_slot *slot, gfn_t gfn,
gfn_t *nr_pages)
{
@@ -1029,6 +1023,26 @@ static pfn_t get_fault_pfn(void)
return fault_pfn;
}
+int get_user_page_nowait(struct task_struct *tsk, struct mm_struct *mm,
+ unsigned long start, int write, struct page **page)
+{
+ int flags = FOLL_TOUCH | FOLL_NOWAIT | FOLL_HWPOISON | FOLL_GET;
+
+ if (write)
+ flags |= FOLL_WRITE;
+
+ return __get_user_pages(tsk, mm, start, 1, flags, page, NULL, NULL);
+}
+
+static inline int check_user_page_hwpoison(unsigned long addr)
+{
+ int rc, flags = FOLL_TOUCH | FOLL_HWPOISON | FOLL_WRITE;
+
+ rc = __get_user_pages(current, current->mm, addr, 1,
+ flags, NULL, NULL, NULL);
+ return rc == -EHWPOISON;
+}
+
static pfn_t hva_to_pfn(struct kvm *kvm, unsigned long addr, bool atomic,
bool *async, bool write_fault, bool *writable)
{
@@ -1053,7 +1067,14 @@ static pfn_t hva_to_pfn(struct kvm *kvm, unsigned long addr, bool atomic,
if (writable)
*writable = write_fault;
- npages = get_user_pages_fast(addr, 1, write_fault, page);
+ if (async) {
+ down_read(&current->mm->mmap_sem);
+ npages = get_user_page_nowait(current, current->mm,
+ addr, write_fault, page);
+ up_read(&current->mm->mmap_sem);
+ } else
+ npages = get_user_pages_fast(addr, 1, write_fault,
+ page);
/* map read fault as writable if possible */
if (unlikely(!write_fault) && npages == 1) {
@@ -1076,7 +1097,8 @@ static pfn_t hva_to_pfn(struct kvm *kvm, unsigned long addr, bool atomic,
return get_fault_pfn();
down_read(&current->mm->mmap_sem);
- if (is_hwpoison_address(addr)) {
+ if (npages == -EHWPOISON ||
+ (!async && check_user_page_hwpoison(addr))) {
up_read(&current->mm->mmap_sem);
get_page(hwpoison_page);
return page_to_pfn(hwpoison_page);
@@ -1264,7 +1286,7 @@ int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset,
addr = gfn_to_hva(kvm, gfn);
if (kvm_is_error_hva(addr))
return -EFAULT;
- r = copy_from_user(data, (void __user *)addr + offset, len);
+ r = __copy_from_user(data, (void __user *)addr + offset, len);
if (r)
return -EFAULT;
return 0;
@@ -1421,7 +1443,7 @@ void mark_page_dirty_in_slot(struct kvm *kvm, struct kvm_memory_slot *memslot,
if (memslot && memslot->dirty_bitmap) {
unsigned long rel_gfn = gfn - memslot->base_gfn;
- generic___set_le_bit(rel_gfn, memslot->dirty_bitmap);
+ __set_bit_le(rel_gfn, memslot->dirty_bitmap);
}
}
@@ -1466,18 +1488,55 @@ void kvm_resched(struct kvm_vcpu *vcpu)
}
EXPORT_SYMBOL_GPL(kvm_resched);
-void kvm_vcpu_on_spin(struct kvm_vcpu *vcpu)
+void kvm_vcpu_on_spin(struct kvm_vcpu *me)
{
- ktime_t expires;
- DEFINE_WAIT(wait);
-
- prepare_to_wait(&vcpu->wq, &wait, TASK_INTERRUPTIBLE);
-
- /* Sleep for 100 us, and hope lock-holder got scheduled */
- expires = ktime_add_ns(ktime_get(), 100000UL);
- schedule_hrtimeout(&expires, HRTIMER_MODE_ABS);
+ struct kvm *kvm = me->kvm;
+ struct kvm_vcpu *vcpu;
+ int last_boosted_vcpu = me->kvm->last_boosted_vcpu;
+ int yielded = 0;
+ int pass;
+ int i;
- finish_wait(&vcpu->wq, &wait);
+ /*
+ * We boost the priority of a VCPU that is runnable but not
+ * currently running, because it got preempted by something
+ * else and called schedule in __vcpu_run. Hopefully that
+ * VCPU is holding the lock that we need and will release it.
+ * We approximate round-robin by starting at the last boosted VCPU.
+ */
+ for (pass = 0; pass < 2 && !yielded; pass++) {
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ struct task_struct *task = NULL;
+ struct pid *pid;
+ if (!pass && i < last_boosted_vcpu) {
+ i = last_boosted_vcpu;
+ continue;
+ } else if (pass && i > last_boosted_vcpu)
+ break;
+ if (vcpu == me)
+ continue;
+ if (waitqueue_active(&vcpu->wq))
+ continue;
+ rcu_read_lock();
+ pid = rcu_dereference(vcpu->pid);
+ if (pid)
+ task = get_pid_task(vcpu->pid, PIDTYPE_PID);
+ rcu_read_unlock();
+ if (!task)
+ continue;
+ if (task->flags & PF_VCPU) {
+ put_task_struct(task);
+ continue;
+ }
+ if (yield_to(task, 1)) {
+ put_task_struct(task);
+ kvm->last_boosted_vcpu = i;
+ yielded = 1;
+ break;
+ }
+ put_task_struct(task);
+ }
+ }
}
EXPORT_SYMBOL_GPL(kvm_vcpu_on_spin);
@@ -2122,9 +2181,9 @@ static void hardware_enable_nolock(void *junk)
static void hardware_enable(void *junk)
{
- spin_lock(&kvm_lock);
+ raw_spin_lock(&kvm_lock);
hardware_enable_nolock(junk);
- spin_unlock(&kvm_lock);
+ raw_spin_unlock(&kvm_lock);
}
static void hardware_disable_nolock(void *junk)
@@ -2139,9 +2198,9 @@ static void hardware_disable_nolock(void *junk)
static void hardware_disable(void *junk)
{
- spin_lock(&kvm_lock);
+ raw_spin_lock(&kvm_lock);
hardware_disable_nolock(junk);
- spin_unlock(&kvm_lock);
+ raw_spin_unlock(&kvm_lock);
}
static void hardware_disable_all_nolock(void)
@@ -2155,16 +2214,16 @@ static void hardware_disable_all_nolock(void)
static void hardware_disable_all(void)
{
- spin_lock(&kvm_lock);
+ raw_spin_lock(&kvm_lock);
hardware_disable_all_nolock();
- spin_unlock(&kvm_lock);
+ raw_spin_unlock(&kvm_lock);
}
static int hardware_enable_all(void)
{
int r = 0;
- spin_lock(&kvm_lock);
+ raw_spin_lock(&kvm_lock);
kvm_usage_count++;
if (kvm_usage_count == 1) {
@@ -2177,7 +2236,7 @@ static int hardware_enable_all(void)
}
}
- spin_unlock(&kvm_lock);
+ raw_spin_unlock(&kvm_lock);
return r;
}
@@ -2339,10 +2398,10 @@ static int vm_stat_get(void *_offset, u64 *val)
struct kvm *kvm;
*val = 0;
- spin_lock(&kvm_lock);
+ raw_spin_lock(&kvm_lock);
list_for_each_entry(kvm, &vm_list, vm_list)
*val += *(u32 *)((void *)kvm + offset);
- spin_unlock(&kvm_lock);
+ raw_spin_unlock(&kvm_lock);
return 0;
}
@@ -2356,12 +2415,12 @@ static int vcpu_stat_get(void *_offset, u64 *val)
int i;
*val = 0;
- spin_lock(&kvm_lock);
+ raw_spin_lock(&kvm_lock);
list_for_each_entry(kvm, &vm_list, vm_list)
kvm_for_each_vcpu(i, vcpu, kvm)
*val += *(u32 *)((void *)vcpu + offset);
- spin_unlock(&kvm_lock);
+ raw_spin_unlock(&kvm_lock);
return 0;
}
@@ -2392,33 +2451,26 @@ static void kvm_exit_debug(void)
debugfs_remove(kvm_debugfs_dir);
}
-static int kvm_suspend(struct sys_device *dev, pm_message_t state)
+static int kvm_suspend(void)
{
if (kvm_usage_count)
hardware_disable_nolock(NULL);
return 0;
}
-static int kvm_resume(struct sys_device *dev)
+static void kvm_resume(void)
{
if (kvm_usage_count) {
- WARN_ON(spin_is_locked(&kvm_lock));
+ WARN_ON(raw_spin_is_locked(&kvm_lock));
hardware_enable_nolock(NULL);
}
- return 0;
}
-static struct sysdev_class kvm_sysdev_class = {
- .name = "kvm",
+static struct syscore_ops kvm_syscore_ops = {
.suspend = kvm_suspend,
.resume = kvm_resume,
};
-static struct sys_device kvm_sysdev = {
- .id = 0,
- .cls = &kvm_sysdev_class,
-};
-
struct page *bad_page;
pfn_t bad_pfn;
@@ -2502,14 +2554,6 @@ int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align,
goto out_free_2;
register_reboot_notifier(&kvm_reboot_notifier);
- r = sysdev_class_register(&kvm_sysdev_class);
- if (r)
- goto out_free_3;
-
- r = sysdev_register(&kvm_sysdev);
- if (r)
- goto out_free_4;
-
/* A kmem cache lets us meet the alignment requirements of fx_save. */
if (!vcpu_align)
vcpu_align = __alignof__(struct kvm_vcpu);
@@ -2517,7 +2561,7 @@ int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align,
0, NULL);
if (!kvm_vcpu_cache) {
r = -ENOMEM;
- goto out_free_5;
+ goto out_free_3;
}
r = kvm_async_pf_init();
@@ -2534,6 +2578,8 @@ int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align,
goto out_unreg;
}
+ register_syscore_ops(&kvm_syscore_ops);
+
kvm_preempt_ops.sched_in = kvm_sched_in;
kvm_preempt_ops.sched_out = kvm_sched_out;
@@ -2545,10 +2591,6 @@ out_unreg:
kvm_async_pf_deinit();
out_free:
kmem_cache_destroy(kvm_vcpu_cache);
-out_free_5:
- sysdev_unregister(&kvm_sysdev);
-out_free_4:
- sysdev_class_unregister(&kvm_sysdev_class);
out_free_3:
unregister_reboot_notifier(&kvm_reboot_notifier);
unregister_cpu_notifier(&kvm_cpu_notifier);
@@ -2576,8 +2618,7 @@ void kvm_exit(void)
misc_deregister(&kvm_dev);
kmem_cache_destroy(kvm_vcpu_cache);
kvm_async_pf_deinit();
- sysdev_unregister(&kvm_sysdev);
- sysdev_class_unregister(&kvm_sysdev_class);
+ unregister_syscore_ops(&kvm_syscore_ops);
unregister_reboot_notifier(&kvm_reboot_notifier);
unregister_cpu_notifier(&kvm_cpu_notifier);
on_each_cpu(hardware_disable_nolock, NULL, 1);