aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--Documentation/kvm/api.txt208
-rw-r--r--Documentation/kvm/cpuid.txt42
-rw-r--r--Documentation/kvm/mmu.txt304
-rw-r--r--arch/ia64/kvm/kvm-ia64.c8
-rw-r--r--arch/ia64/kvm/vmm.c2
-rw-r--r--arch/powerpc/include/asm/asm-compat.h2
-rw-r--r--arch/powerpc/include/asm/kvm.h10
-rw-r--r--arch/powerpc/include/asm/kvm_asm.h2
-rw-r--r--arch/powerpc/include/asm/kvm_book3s.h157
-rw-r--r--arch/powerpc/include/asm/kvm_book3s_32.h42
-rw-r--r--arch/powerpc/include/asm/kvm_book3s_64.h28
-rw-r--r--arch/powerpc/include/asm/kvm_book3s_asm.h (renamed from arch/powerpc/include/asm/kvm_book3s_64_asm.h)25
-rw-r--r--arch/powerpc/include/asm/kvm_booke.h96
-rw-r--r--arch/powerpc/include/asm/kvm_fpu.h85
-rw-r--r--arch/powerpc/include/asm/kvm_host.h38
-rw-r--r--arch/powerpc/include/asm/kvm_ppc.h97
-rw-r--r--arch/powerpc/include/asm/mmu_context.h2
-rw-r--r--arch/powerpc/include/asm/paca.h10
-rw-r--r--arch/powerpc/include/asm/processor.h3
-rw-r--r--arch/powerpc/include/asm/reg.h10
-rw-r--r--arch/powerpc/kernel/asm-offsets.c102
-rw-r--r--arch/powerpc/kernel/head_32.S14
-rw-r--r--arch/powerpc/kernel/head_64.S4
-rw-r--r--arch/powerpc/kernel/ppc_ksyms.c4
-rw-r--r--arch/powerpc/kvm/44x.c2
-rw-r--r--arch/powerpc/kvm/Kconfig24
-rw-r--r--arch/powerpc/kvm/Makefile20
-rw-r--r--arch/powerpc/kvm/book3s.c503
-rw-r--r--arch/powerpc/kvm/book3s_32_mmu.c54
-rw-r--r--arch/powerpc/kvm/book3s_32_mmu_host.c483
-rw-r--r--arch/powerpc/kvm/book3s_32_sr.S143
-rw-r--r--arch/powerpc/kvm/book3s_64_mmu.c36
-rw-r--r--arch/powerpc/kvm/book3s_64_mmu_host.c102
-rw-r--r--arch/powerpc/kvm/book3s_64_slb.S183
-rw-r--r--arch/powerpc/kvm/book3s_emulate.c (renamed from arch/powerpc/kvm/book3s_64_emulate.c)245
-rw-r--r--arch/powerpc/kvm/book3s_exports.c (renamed from arch/powerpc/kvm/book3s_64_exports.c)0
-rw-r--r--arch/powerpc/kvm/book3s_interrupts.S (renamed from arch/powerpc/kvm/book3s_64_interrupts.S)204
-rw-r--r--arch/powerpc/kvm/book3s_paired_singles.c1289
-rw-r--r--arch/powerpc/kvm/book3s_rmhandlers.S (renamed from arch/powerpc/kvm/book3s_64_rmhandlers.S)135
-rw-r--r--arch/powerpc/kvm/book3s_segment.S259
-rw-r--r--arch/powerpc/kvm/booke.c21
-rw-r--r--arch/powerpc/kvm/e500.c2
-rw-r--r--arch/powerpc/kvm/emulate.c55
-rw-r--r--arch/powerpc/kvm/fpu.S273
-rw-r--r--arch/powerpc/kvm/powerpc.c110
-rw-r--r--arch/powerpc/mm/mmu_context_hash32.c29
-rw-r--r--arch/s390/kvm/kvm-s390.c6
-rw-r--r--arch/s390/kvm/kvm-s390.h2
-rw-r--r--arch/x86/include/asm/kvm.h17
-rw-r--r--arch/x86/include/asm/kvm_emulate.h46
-rw-r--r--arch/x86/include/asm/kvm_host.h80
-rw-r--r--arch/x86/include/asm/kvm_para.h13
-rw-r--r--arch/x86/include/asm/msr-index.h5
-rw-r--r--arch/x86/include/asm/pvclock-abi.h4
-rw-r--r--arch/x86/include/asm/pvclock.h1
-rw-r--r--arch/x86/include/asm/svm.h9
-rw-r--r--arch/x86/include/asm/vmx.h12
-rw-r--r--arch/x86/kernel/kvmclock.c56
-rw-r--r--arch/x86/kernel/pvclock.c37
-rw-r--r--arch/x86/kernel/tboot.c1
-rw-r--r--arch/x86/kvm/emulate.c1247
-rw-r--r--arch/x86/kvm/i8259.c53
-rw-r--r--arch/x86/kvm/irq.h1
-rw-r--r--arch/x86/kvm/kvm_timer.h4
-rw-r--r--arch/x86/kvm/mmu.c225
-rw-r--r--arch/x86/kvm/mmutrace.h84
-rw-r--r--arch/x86/kvm/paging_tmpl.h46
-rw-r--r--arch/x86/kvm/svm.c944
-rw-r--r--arch/x86/kvm/timer.c3
-rw-r--r--arch/x86/kvm/trace.h165
-rw-r--r--arch/x86/kvm/vmx.c378
-rw-r--r--arch/x86/kvm/x86.c1599
-rw-r--r--arch/x86/kvm/x86.h7
-rw-r--r--include/linux/kvm.h26
-rw-r--r--include/linux/kvm_host.h16
-rw-r--r--include/linux/tboot.h1
-rw-r--r--include/trace/events/kvm.h1
-rw-r--r--virt/kvm/assigned-dev.c8
-rw-r--r--virt/kvm/coalesced_mmio.c6
-rw-r--r--virt/kvm/iommu.c4
-rw-r--r--virt/kvm/kvm_main.c63
81 files changed, 7826 insertions, 2811 deletions
diff --git a/Documentation/kvm/api.txt b/Documentation/kvm/api.txt
index c6416a3..a237518 100644
--- a/Documentation/kvm/api.txt
+++ b/Documentation/kvm/api.txt
@@ -656,6 +656,7 @@ struct kvm_clock_data {
4.29 KVM_GET_VCPU_EVENTS
Capability: KVM_CAP_VCPU_EVENTS
+Extended by: KVM_CAP_INTR_SHADOW
Architectures: x86
Type: vm ioctl
Parameters: struct kvm_vcpu_event (out)
@@ -676,7 +677,7 @@ struct kvm_vcpu_events {
__u8 injected;
__u8 nr;
__u8 soft;
- __u8 pad;
+ __u8 shadow;
} interrupt;
struct {
__u8 injected;
@@ -688,9 +689,13 @@ struct kvm_vcpu_events {
__u32 flags;
};
+KVM_VCPUEVENT_VALID_SHADOW may be set in the flags field to signal that
+interrupt.shadow contains a valid state. Otherwise, this field is undefined.
+
4.30 KVM_SET_VCPU_EVENTS
Capability: KVM_CAP_VCPU_EVENTS
+Extended by: KVM_CAP_INTR_SHADOW
Architectures: x86
Type: vm ioctl
Parameters: struct kvm_vcpu_event (in)
@@ -709,6 +714,183 @@ current in-kernel state. The bits are:
KVM_VCPUEVENT_VALID_NMI_PENDING - transfer nmi.pending to the kernel
KVM_VCPUEVENT_VALID_SIPI_VECTOR - transfer sipi_vector
+If KVM_CAP_INTR_SHADOW is available, KVM_VCPUEVENT_VALID_SHADOW can be set in
+the flags field to signal that interrupt.shadow contains a valid state and
+shall be written into the VCPU.
+
+4.32 KVM_GET_DEBUGREGS
+
+Capability: KVM_CAP_DEBUGREGS
+Architectures: x86
+Type: vm ioctl
+Parameters: struct kvm_debugregs (out)
+Returns: 0 on success, -1 on error
+
+Reads debug registers from the vcpu.
+
+struct kvm_debugregs {
+ __u64 db[4];
+ __u64 dr6;
+ __u64 dr7;
+ __u64 flags;
+ __u64 reserved[9];
+};
+
+4.33 KVM_SET_DEBUGREGS
+
+Capability: KVM_CAP_DEBUGREGS
+Architectures: x86
+Type: vm ioctl
+Parameters: struct kvm_debugregs (in)
+Returns: 0 on success, -1 on error
+
+Writes debug registers into the vcpu.
+
+See KVM_GET_DEBUGREGS for the data structure. The flags field is unused
+yet and must be cleared on entry.
+
+4.34 KVM_SET_USER_MEMORY_REGION
+
+Capability: KVM_CAP_USER_MEM
+Architectures: all
+Type: vm ioctl
+Parameters: struct kvm_userspace_memory_region (in)
+Returns: 0 on success, -1 on error
+
+struct kvm_userspace_memory_region {
+ __u32 slot;
+ __u32 flags;
+ __u64 guest_phys_addr;
+ __u64 memory_size; /* bytes */
+ __u64 userspace_addr; /* start of the userspace allocated memory */
+};
+
+/* for kvm_memory_region::flags */
+#define KVM_MEM_LOG_DIRTY_PAGES 1UL
+
+This ioctl allows the user to create or modify a guest physical memory
+slot. When changing an existing slot, it may be moved in the guest
+physical memory space, or its flags may be modified. It may not be
+resized. Slots may not overlap in guest physical address space.
+
+Memory for the region is taken starting at the address denoted by the
+field userspace_addr, which must point at user addressable memory for
+the entire memory slot size. Any object may back this memory, including
+anonymous memory, ordinary files, and hugetlbfs.
+
+It is recommended that the lower 21 bits of guest_phys_addr and userspace_addr
+be identical. This allows large pages in the guest to be backed by large
+pages in the host.
+
+The flags field supports just one flag, KVM_MEM_LOG_DIRTY_PAGES, which
+instructs kvm to keep track of writes to memory within the slot. See
+the KVM_GET_DIRTY_LOG ioctl.
+
+When the KVM_CAP_SYNC_MMU capability, changes in the backing of the memory
+region are automatically reflected into the guest. For example, an mmap()
+that affects the region will be made visible immediately. Another example
+is madvise(MADV_DROP).
+
+It is recommended to use this API instead of the KVM_SET_MEMORY_REGION ioctl.
+The KVM_SET_MEMORY_REGION does not allow fine grained control over memory
+allocation and is deprecated.
+
+4.35 KVM_SET_TSS_ADDR
+
+Capability: KVM_CAP_SET_TSS_ADDR
+Architectures: x86
+Type: vm ioctl
+Parameters: unsigned long tss_address (in)
+Returns: 0 on success, -1 on error
+
+This ioctl defines the physical address of a three-page region in the guest
+physical address space. The region must be within the first 4GB of the
+guest physical address space and must not conflict with any memory slot
+or any mmio address. The guest may malfunction if it accesses this memory
+region.
+
+This ioctl is required on Intel-based hosts. This is needed on Intel hardware
+because of a quirk in the virtualization implementation (see the internals
+documentation when it pops into existence).
+
+4.36 KVM_ENABLE_CAP
+
+Capability: KVM_CAP_ENABLE_CAP
+Architectures: ppc
+Type: vcpu ioctl
+Parameters: struct kvm_enable_cap (in)
+Returns: 0 on success; -1 on error
+
++Not all extensions are enabled by default. Using this ioctl the application
+can enable an extension, making it available to the guest.
+
+On systems that do not support this ioctl, it always fails. On systems that
+do support it, it only works for extensions that are supported for enablement.
+
+To check if a capability can be enabled, the KVM_CHECK_EXTENSION ioctl should
+be used.
+
+struct kvm_enable_cap {
+ /* in */
+ __u32 cap;
+
+The capability that is supposed to get enabled.
+
+ __u32 flags;
+
+A bitfield indicating future enhancements. Has to be 0 for now.
+
+ __u64 args[4];
+
+Arguments for enabling a feature. If a feature needs initial values to
+function properly, this is the place to put them.
+
+ __u8 pad[64];
+};
+
+4.37 KVM_GET_MP_STATE
+
+Capability: KVM_CAP_MP_STATE
+Architectures: x86, ia64
+Type: vcpu ioctl
+Parameters: struct kvm_mp_state (out)
+Returns: 0 on success; -1 on error
+
+struct kvm_mp_state {
+ __u32 mp_state;
+};
+
+Returns the vcpu's current "multiprocessing state" (though also valid on
+uniprocessor guests).
+
+Possible values are:
+
+ - KVM_MP_STATE_RUNNABLE: the vcpu is currently running
+ - KVM_MP_STATE_UNINITIALIZED: the vcpu is an application processor (AP)
+ which has not yet received an INIT signal
+ - KVM_MP_STATE_INIT_RECEIVED: the vcpu has received an INIT signal, and is
+ now ready for a SIPI
+ - KVM_MP_STATE_HALTED: the vcpu has executed a HLT instruction and
+ is waiting for an interrupt
+ - KVM_MP_STATE_SIPI_RECEIVED: the vcpu has just received a SIPI (vector
+ accesible via KVM_GET_VCPU_EVENTS)
+
+This ioctl is only useful after KVM_CREATE_IRQCHIP. Without an in-kernel
+irqchip, the multiprocessing state must be maintained by userspace.
+
+4.38 KVM_SET_MP_STATE
+
+Capability: KVM_CAP_MP_STATE
+Architectures: x86, ia64
+Type: vcpu ioctl
+Parameters: struct kvm_mp_state (in)
+Returns: 0 on success; -1 on error
+
+Sets the vcpu's current "multiprocessing state"; see KVM_GET_MP_STATE for
+arguments.
+
+This ioctl is only useful after KVM_CREATE_IRQCHIP. Without an in-kernel
+irqchip, the multiprocessing state must be maintained by userspace.
5. The kvm_run structure
@@ -820,6 +1002,13 @@ executed a memory-mapped I/O instruction which could not be satisfied
by kvm. The 'data' member contains the written data if 'is_write' is
true, and should be filled by application code otherwise.
+NOTE: For KVM_EXIT_IO, KVM_EXIT_MMIO and KVM_EXIT_OSI, the corresponding
+operations are complete (and guest state is consistent) only after userspace
+has re-entered the kernel with KVM_RUN. The kernel side will first finish
+incomplete operations and then check for pending signals. Userspace
+can re-enter the guest with an unmasked signal pending to complete
+pending operations.
+
/* KVM_EXIT_HYPERCALL */
struct {
__u64 nr;
@@ -829,7 +1018,9 @@ true, and should be filled by application code otherwise.
__u32 pad;
} hypercall;
-Unused.
+Unused. This was once used for 'hypercall to userspace'. To implement
+such functionality, use KVM_EXIT_IO (x86) or KVM_EXIT_MMIO (all except s390).
+Note KVM_EXIT_IO is significantly faster than KVM_EXIT_MMIO.
/* KVM_EXIT_TPR_ACCESS */
struct {
@@ -870,6 +1061,19 @@ s390 specific.
powerpc specific.
+ /* KVM_EXIT_OSI */
+ struct {
+ __u64 gprs[32];
+ } osi;
+
+MOL uses a special hypercall interface it calls 'OSI'. To enable it, we catch
+hypercalls and exit with this exit struct that contains all the guest gprs.
+
+If exit_reason is KVM_EXIT_OSI, then the vcpu has triggered such a hypercall.
+Userspace can now handle the hypercall and when it's done modify the gprs as
+necessary. Upon guest entry all guest GPRs will then be replaced by the values
+in this struct.
+
/* Fix the size of the union. */
char padding[256];
};
diff --git a/Documentation/kvm/cpuid.txt b/Documentation/kvm/cpuid.txt
new file mode 100644
index 0000000..14a12ea
--- /dev/null
+++ b/Documentation/kvm/cpuid.txt
@@ -0,0 +1,42 @@
+KVM CPUID bits
+Glauber Costa <glommer@redhat.com>, Red Hat Inc, 2010
+=====================================================
+
+A guest running on a kvm host, can check some of its features using
+cpuid. This is not always guaranteed to work, since userspace can
+mask-out some, or even all KVM-related cpuid features before launching
+a guest.
+
+KVM cpuid functions are:
+
+function: KVM_CPUID_SIGNATURE (0x40000000)
+returns : eax = 0,
+ ebx = 0x4b4d564b,
+ ecx = 0x564b4d56,
+ edx = 0x4d.
+Note that this value in ebx, ecx and edx corresponds to the string "KVMKVMKVM".
+This function queries the presence of KVM cpuid leafs.
+
+
+function: define KVM_CPUID_FEATURES (0x40000001)
+returns : ebx, ecx, edx = 0
+ eax = and OR'ed group of (1 << flag), where each flags is:
+
+
+flag || value || meaning
+=============================================================================
+KVM_FEATURE_CLOCKSOURCE || 0 || kvmclock available at msrs
+ || || 0x11 and 0x12.
+------------------------------------------------------------------------------
+KVM_FEATURE_NOP_IO_DELAY || 1 || not necessary to perform delays
+ || || on PIO operations.
+------------------------------------------------------------------------------
+KVM_FEATURE_MMU_OP || 2 || deprecated.
+------------------------------------------------------------------------------
+KVM_FEATURE_CLOCKSOURCE2 || 3 || kvmclock available at msrs
+ || || 0x4b564d00 and 0x4b564d01
+------------------------------------------------------------------------------
+KVM_FEATURE_CLOCKSOURCE_STABLE_BIT || 24 || host will warn if no guest-side
+ || || per-cpu warps are expected in
+ || || kvmclock.
+------------------------------------------------------------------------------
diff --git a/Documentation/kvm/mmu.txt b/Documentation/kvm/mmu.txt
new file mode 100644
index 0000000..aaed6ab
--- /dev/null
+++ b/Documentation/kvm/mmu.txt
@@ -0,0 +1,304 @@
+The x86 kvm shadow mmu
+======================
+
+The mmu (in arch/x86/kvm, files mmu.[ch] and paging_tmpl.h) is responsible
+for presenting a standard x86 mmu to the guest, while translating guest
+physical addresses to host physical addresses.
+
+The mmu code attempts to satisfy the following requirements:
+
+- correctness: the guest should not be able to determine that it is running
+ on an emulated mmu except for timing (we attempt to comply
+ with the specification, not emulate the characteristics of
+ a particular implementation such as tlb size)
+- security: the guest must not be able to touch host memory not assigned
+ to it
+- performance: minimize the performance penalty imposed by the mmu
+- scaling: need to scale to large memory and large vcpu guests
+- hardware: support the full range of x86 virtualization hardware
+- integration: Linux memory management code must be in control of guest memory
+ so that swapping, page migration, page merging, transparent
+ hugepages, and similar features work without change
+- dirty tracking: report writes to guest memory to enable live migration
+ and framebuffer-based displays
+- footprint: keep the amount of pinned kernel memory low (most memory
+ should be shrinkable)
+- reliablity: avoid multipage or GFP_ATOMIC allocations
+
+Acronyms
+========
+
+pfn host page frame number
+hpa host physical address
+hva host virtual address
+gfn guest frame number
+gpa guest physical address
+gva guest virtual address
+ngpa nested guest physical address
+ngva nested guest virtual address
+pte page table entry (used also to refer generically to paging structure
+ entries)
+gpte guest pte (referring to gfns)
+spte shadow pte (referring to pfns)
+tdp two dimensional paging (vendor neutral term for NPT and EPT)
+
+Virtual and real hardware supported
+===================================
+
+The mmu supports first-generation mmu hardware, which allows an atomic switch
+of the current paging mode and cr3 during guest entry, as well as
+two-dimensional paging (AMD's NPT and Intel's EPT). The emulated hardware
+it exposes is the traditional 2/3/4 level x86 mmu, with support for global
+pages, pae, pse, pse36, cr0.wp, and 1GB pages. Work is in progress to support
+exposing NPT capable hardware on NPT capable hosts.
+
+Translation
+===========
+
+The primary job of the mmu is to program the processor's mmu to translate
+addresses for the guest. Different translations are required at different
+times:
+
+- when guest paging is disabled, we translate guest physical addresses to
+ host physical addresses (gpa->hpa)
+- when guest paging is enabled, we translate guest virtual addresses, to
+ guest physical addresses, to host physical addresses (gva->gpa->hpa)
+- when the guest launches a guest of its own, we translate nested guest
+ virtual addresses, to nested guest physical addresses, to guest physical
+ addresses, to host physical addresses (ngva->ngpa->gpa->hpa)
+
+The primary challenge is to encode between 1 and 3 translations into hardware
+that support only 1 (traditional) and 2 (tdp) translations. When the
+number of required translations matches the hardware, the mmu operates in
+direct mode; otherwise it operates in shadow mode (see below).
+
+Memory
+======
+
+Guest memory (gpa) is part of the user address space of the process that is
+using kvm. Userspace defines the translation between guest addresses and user
+addresses (gpa->hva); note that two gpas may alias to the same gva, but not
+vice versa.
+
+These gvas may be backed using any method available to the host: anonymous
+memory, file backed memory, and device memory. Memory might be paged by the
+host at any time.
+
+Events
+======
+
+The mmu is driven by events, some from the guest, some from the host.
+
+Guest generated events:
+- writes to control registers (especially cr3)
+- invlpg/invlpga instruction execution
+- access to missing or protected translations
+
+Host generated events:
+- changes in the gpa->hpa translation (either through gpa->hva changes or
+ through hva->hpa changes)
+- memory pressure (the shrinker)
+
+Shadow pages
+============
+
+The principal data structure is the shadow page, 'struct kvm_mmu_page'. A
+shadow page contains 512 sptes, which can be either leaf or nonleaf sptes. A
+shadow page may contain a mix of leaf and nonleaf sptes.
+
+A nonleaf spte allows the hardware mmu to reach the leaf pages and
+is not related to a translation directly. It points to other shadow pages.
+
+A leaf spte corresponds to either one or two translations encoded into
+one paging structure entry. These are always the lowest level of the
+translation stack, with optional higher level translations left to NPT/EPT.
+Leaf ptes point at guest pages.
+
+The following table shows translations encoded by leaf ptes, with higher-level
+translations in parentheses:
+
+ Non-nested guests:
+ nonpaging: gpa->hpa
+ paging: gva->gpa->hpa
+ paging, tdp: (gva->)gpa->hpa
+ Nested guests:
+ non-tdp: ngva->gpa->hpa (*)
+ tdp: (ngva->)ngpa->gpa->hpa
+
+(*) the guest hypervisor will encode the ngva->gpa translation into its page
+ tables if npt is not present
+
+Shadow pages contain the following information:
+ role.level:
+ The level in the shadow paging hierarchy that this shadow page belongs to.
+ 1=4k sptes, 2=2M sptes, 3=1G sptes, etc.
+ role.direct:
+ If set, leaf sptes reachable from this page are for a linear range.
+ Examples include real mode translation, large guest pages backed by small
+ host pages, and gpa->hpa translations when NPT or EPT is active.
+ The linear range starts at (gfn << PAGE_SHIFT) and its size is determined
+ by role.level (2MB for first level, 1GB for second level, 0.5TB for third
+ level, 256TB for fourth level)
+ If clear, this page corresponds to a guest page table denoted by the gfn
+ field.
+ role.quadrant:
+ When role.cr4_pae=0, the guest uses 32-bit gptes while the host uses 64-bit
+ sptes. That means a guest page table contains more ptes than the host,
+ so multiple shadow pages are needed to shadow one guest page.
+ For first-level shadow pages, role.quadrant can be 0 or 1 and denotes the
+ first or second 512-gpte block in the guest page table. For second-level
+ page tables, each 32-bit gpte is converted to two 64-bit sptes
+ (since each first-level guest page is shadowed by two first-level
+ shadow pages) so role.quadrant takes values in the range 0..3. Each
+ quadrant maps 1GB virtual address space.
+ role.access:
+ Inherited guest access permissions in the form uwx. Note execute
+ permission is positive, not negative.
+ role.invalid:
+ The page is invalid and should not be used. It is a root page that is
+ currently pinned (by a cpu hardware register pointing to it); once it is
+ unpinned it will be destroyed.
+ role.cr4_pae:
+ Contains the value of cr4.pae for which the page is valid (e.g. whether
+ 32-bit or 64-bit gptes are in use).
+ role.cr4_nxe:
+ Contains the value of efer.nxe for which the page is valid.
+ role.cr0_wp:
+ Contains the value of cr0.wp for which the page is valid.
+ gfn:
+ Either the guest page table containing the translations shadowed by this
+ page, or the base page frame for linear translations. See role.direct.
+ spt:
+ A pageful of 64-bit sptes containing the translations for this page.
+ Accessed by both kvm and hardware.
+ The page pointed to by spt will have its page->private pointing back
+ at the shadow page structure.
+ sptes in spt point either at guest pages, or at lower-level shadow pages.
+ Specifically, if sp1 and sp2 are shadow pages, then sp1->spt[n] may point
+ at __pa(sp2->spt). sp2 will point back at sp1 through parent_pte.
+ The spt array forms a DAG structure with the shadow page as a node, and
+ guest pages as leaves.
+ gfns:
+ An array of 512 guest frame numbers, one for each present pte. Used to
+ perform a reverse map from a pte to a gfn.
+ slot_bitmap:
+ A bitmap containing one bit per memory slot. If the page contains a pte
+ mapping a page from memory slot n, then bit n of slot_bitmap will be set
+ (if a page is aliased among several slots, then it is not guaranteed that
+ all slots will be marked).
+ Used during dirty logging to avoid scanning a shadow page if none if its
+ pages need tracking.
+ root_count:
+ A counter keeping track of how many hardware registers (guest cr3 or
+ pdptrs) are now pointing at the page. While this counter is nonzero, the
+ page cannot be destroyed. See role.invalid.
+ multimapped:
+ Whether there exist multiple sptes pointing at this page.
+ parent_pte/parent_ptes:
+ If multimapped is zero, parent_pte points at the single spte that points at
+ this page's spt. Otherwise, parent_ptes points at a data structure
+ with a list of parent_ptes.
+ unsync:
+ If true, then the translations in this page may not match the guest's
+ translation. This is equivalent to the state of the tlb when a pte is
+ changed but before the tlb entry is flushed. Accordingly, unsync ptes
+ are synchronized when the guest executes invlpg or flushes its tlb by
+ other means. Valid for leaf pages.
+ unsync_children:
+ How many sptes in the page point at pages that are unsync (or have
+ unsynchronized children).
+ unsync_child_bitmap:
+ A bitmap indicating which sptes in spt point (directly or indirectly) at
+ pages that may be unsynchronized. Used to quickly locate all unsychronized
+ pages reachable from a given page.
+
+Reverse map
+===========
+
+The mmu maintains a reverse mapping whereby all ptes mapping a page can be
+reached given its gfn. This is used, for example, when swapping out a page.
+
+Synchronized and unsynchronized pages
+=====================================
+
+The guest uses two events to synchronize its tlb and page tables: tlb flushes
+and page invalidations (invlpg).
+
+A tlb flush means that we need to synchronize all sptes reachable from the
+guest's cr3. This is expensive, so we keep all guest page tables write
+protected, and synchronize sptes to gptes when a gpte is written.
+
+A special case is when a guest page table is reachable from the current
+guest cr3. In this case, the guest is obliged to issue an invlpg instruction
+before using the translation. We take advantage of that by removing write
+protection from the guest page, and allowing the guest to modify it freely.
+We synchronize modified gptes when the guest invokes invlpg. This reduces
+the amount of emulation we have to do when the guest modifies multiple gptes,
+or when the a guest page is no longer used as a page table and is used for
+random guest data.
+
+As a side effect we have to resynchronize all reachable unsynchronized shadow
+pages on a tlb flush.
+
+
+Reaction to events
+==================
+
+- guest page fault (or npt page fault, or ept violation)
+
+This is the most complicated event. The cause of a page fault can be:
+
+ - a true guest fault (the guest translation won't allow the access) (*)
+ - access to a missing translation
+ - access to a protected translation
+ - when logging dirty pages, memory is write protected
+ - synchronized shadow pages are write protected (*)
+ - access to untranslatable memory (mmio)
+
+ (*) not applicable in direct mode
+
+Handling a page fault is performed as follows:
+
+ - if needed, walk the guest page tables to determine the guest translation
+ (gva->gpa or ngpa->gpa)
+ - if permissions are insufficient, reflect the fault back to the guest
+ - determine the host page
+ - if this is an mmio request, there is no host page; call the emulator
+ to emulate the instruction instead
+ - walk the shadow page table to find the spte for the translation,
+ instantiating missing intermediate page tables as necessary
+ - try to unsynchronize the page
+ - if successful, we can let the guest continue and modify the gpte
+ - emulate the instruction
+ - if failed, unshadow the page and let the guest continue
+ - update any translations that were modified by the instruction
+
+invlpg handling:
+
+ - walk the shadow page hierarchy and drop affected translations
+ - try to reinstantiate the indicated translation in the hope that the
+ guest will use it in the near future
+
+Guest control register updates:
+
+- mov to cr3
+ - look up new shadow roots
+ - synchronize newly reachable shadow pages
+
+- mov to cr0/cr4/efer
+ - set up mmu context for new paging mode
+ - look up new shadow roots
+ - synchronize newly reachable shadow pages
+
+Host translation updates:
+
+ - mmu notifier called with updated hva
+ - look up affected sptes through reverse map
+ - drop (or update) translations
+
+Further reading
+===============
+
+- NPT presentation from KVM Forum 2008
+ http://www.linux-kvm.org/wiki/images/c/c8/KvmForum2008%24kdf2008_21.pdf
+
diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c
index 7f3c0a2..d5f4e91 100644
--- a/arch/ia64/kvm/kvm-ia64.c
+++ b/arch/ia64/kvm/kvm-ia64.c
@@ -979,11 +979,13 @@ long kvm_arch_vm_ioctl(struct file *filp,
r = -EFAULT;
if (copy_from_user(&irq_event, argp, sizeof irq_event))
goto out;
+ r = -ENXIO;
if (irqchip_in_kernel(kvm)) {
__s32 status;
status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID,
irq_event.irq, irq_event.level);
if (ioctl == KVM_IRQ_LINE_STATUS) {
+ r = -EFAULT;
irq_event.status = status;
if (copy_to_user(argp, &irq_event,
sizeof irq_event))
@@ -1379,7 +1381,7 @@ static void kvm_release_vm_pages(struct kvm *kvm)
int i, j;
unsigned long base_gfn;
- slots = rcu_dereference(kvm->memslots);
+ slots = kvm_memslots(kvm);
for (i = 0; i < slots->nmemslots; i++) {
memslot = &slots->memslots[i];
base_gfn = memslot->base_gfn;
@@ -1535,8 +1537,10 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
goto out;
if (copy_to_user(user_stack, stack,
- sizeof(struct kvm_ia64_vcpu_stack)))
+ sizeof(struct kvm_ia64_vcpu_stack))) {
+ r = -EFAULT;
goto out;
+ }
break;
}
diff --git a/arch/ia64/kvm/vmm.c b/arch/ia64/kvm/vmm.c
index 7a62f75..f0b9cac 100644
--- a/arch/ia64/kvm/vmm.c
+++ b/arch/ia64/kvm/vmm.c
@@ -51,7 +51,7 @@ static int __init kvm_vmm_init(void)
vmm_fpswa_interface = fpswa_interface;
/*Register vmm data to kvm side*/
- return kvm_init(&vmm_info, 1024, THIS_MODULE);
+ return kvm_init(&vmm_info, 1024, 0, THIS_MODULE);
}
static void __exit kvm_vmm_exit(void)
diff --git a/arch/powerpc/include/asm/asm-compat.h b/arch/powerpc/include/asm/asm-compat.h
index a9b91ed..2048a6a 100644
--- a/arch/powerpc/include/asm/asm-compat.h
+++ b/arch/powerpc/include/asm/asm-compat.h
@@ -21,6 +21,7 @@
/* operations for longs and pointers */
#define PPC_LL stringify_in_c(ld)
#define PPC_STL stringify_in_c(std)
+#define PPC_STLU stringify_in_c(stdu)
#define PPC_LCMPI stringify_in_c(cmpdi)
#define PPC_LONG stringify_in_c(.llong)
#define PPC_LONG_ALIGN stringify_in_c(.balign 8)
@@ -44,6 +45,7 @@
/* operations for longs and pointers */
#define PPC_LL stringify_in_c(lwz)
#define PPC_STL stringify_in_c(stw)
+#define PPC_STLU stringify_in_c(stwu)
#define PPC_LCMPI stringify_in_c(cmpwi)
#define PPC_LONG stringify_in_c(.long)
#define PPC_LONG_ALIGN stringify_in_c(.balign 4)
diff --git a/arch/powerpc/include/asm/kvm.h b/arch/powerpc/include/asm/kvm.h
index 81f3b0b..6c5547d 100644
--- a/arch/powerpc/include/asm/kvm.h
+++ b/arch/powerpc/include/asm/kvm.h
@@ -77,4 +77,14 @@ struct kvm_debug_exit_arch {
struct kvm_guest_debug_arch {
};
+#define KVM_REG_MASK 0x001f
+#define KVM_REG_EXT_MASK 0xffe0
+#define KVM_REG_GPR 0x0000
+#define KVM_REG_FPR 0x0020
+#define KVM_REG_QPR 0x0040
+#define KVM_REG_FQPR 0x0060
+
+#define KVM_INTERRUPT_SET -1U
+#define KVM_INTERRUPT_UNSET -2U
+
#endif /* __LINUX_KVM_POWERPC_H */
diff --git a/arch/powerpc/include/asm/kvm_asm.h b/arch/powerpc/include/asm/kvm_asm.h
index aadf2dd..c5ea4cd 100644
--- a/arch/powerpc/include/asm/kvm_asm.h
+++ b/arch/powerpc/include/asm/kvm_asm.h
@@ -88,6 +88,8 @@
#define BOOK3S_HFLAG_DCBZ32 0x1
#define BOOK3S_HFLAG_SLB 0x2
+#define BOOK3S_HFLAG_PAIRED_SINGLE 0x4
+#define BOOK3S_HFLAG_NATIVE_PS 0x8
#define RESUME_FLAG_NV (1<<0) /* Reload guest nonvolatile state? */
#define RESUME_FLAG_HOST (1<<1) /* Resume host? */
diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h
index db7db0a..6f74d93 100644
--- a/arch/powerpc/include/asm/kvm_book3s.h
+++ b/arch/powerpc/include/asm/kvm_book3s.h
@@ -22,46 +22,47 @@
#include <linux/types.h>
#include <linux/kvm_host.h>
-#include <asm/kvm_book3s_64_asm.h>
+#include <asm/kvm_book3s_asm.h>
struct kvmppc_slb {
u64 esid;
u64 vsid;
u64 orige;
u64 origv;
- bool valid;
- bool Ks;
- bool Kp;
- bool nx;
- bool large; /* PTEs are 16MB */
- bool tb; /* 1TB segment */
- bool class;
+ bool valid : 1;
+ bool Ks : 1;
+ bool Kp : 1;
+ bool nx : 1;
+ bool large : 1; /* PTEs are 16MB */
+ bool tb : 1; /* 1TB segment */
+ bool class : 1;
};
struct kvmppc_sr {
u32 raw;
u32 vsid;
- bool Ks;
- bool Kp;
- bool nx;
+ bool Ks : 1;
+ bool Kp : 1;
+ bool nx : 1;
+ bool valid : 1;
};
struct kvmppc_bat {
u64 raw;
u32 bepi;
u32 bepi_mask;
- bool vs;
- bool vp;
u32 brpn;
u8 wimg;
u8 pp;
+ bool vs : 1;
+ bool vp : 1;
};
struct kvmppc_sid_map {
u64 guest_vsid;
u64 guest_esid;
u64 host_vsid;
- bool valid;
+ bool valid : 1;
};
#define SID_MAP_BITS 9
@@ -70,7 +71,7 @@ struct kvmppc_sid_map {
struct kvmppc_vcpu_book3s {
struct kvm_vcpu vcpu;
- struct kvmppc_book3s_shadow_vcpu shadow_vcpu;
+ struct kvmppc_book3s_shadow_vcpu *shadow_vcpu;
struct kvmppc_sid_map sid_map[SID_MAP_NUM];
struct kvmppc_slb slb[64];
struct {
@@ -82,9 +83,10 @@ struct kvmppc_vcpu_book3s {
struct kvmppc_bat ibat[8];
struct kvmppc_bat dbat[8];
u64 hid[6];
+ u64 gqr[8];
int slb_nr;
+ u32 dsisr;
u64 sdr1;
- u64 dsisr;
u64 hior;
u64 msr_mask;
u64 vsid_first;
@@ -98,15 +100,15 @@ struct kvmppc_vcpu_book3s {
#define CONTEXT_GUEST 1
#define CONTEXT_GUEST_END 2
-#define VSID_REAL 0xfffffffffff00000
-#define VSID_REAL_DR 0xffffffffffe00000
-#define VSID_REAL_IR 0xffffffffffd00000
-#define VSID_BAT 0xffffffffffc00000
-#define VSID_PR 0x8000000000000000
+#define VSID_REAL 0x1fffffffffc00000ULL
+#define VSID_BAT 0x1fffffffffb00000ULL
+#define VSID_REAL_DR 0x2000000000000000ULL
+#define VSID_REAL_IR 0x4000000000000000ULL
+#define VSID_PR 0x8000000000000000ULL
-extern void kvmppc_mmu_pte_flush(struct kvm_vcpu *vcpu, u64 ea, u64 ea_mask);
+extern void kvmppc_mmu_pte_flush(struct kvm_vcpu *vcpu, ulong ea, ulong ea_mask);
extern void kvmppc_mmu_pte_vflush(struct kvm_vcpu *vcpu, u64 vp, u64 vp_mask);
-extern void kvmppc_mmu_pte_pflush(struct kvm_vcpu *vcpu, u64 pa_start, u64 pa_end);
+extern void kvmppc_mmu_pte_pflush(struct kvm_vcpu *vcpu, ulong pa_start, ulong pa_end);
extern void kvmppc_set_msr(struct kvm_vcpu *vcpu, u64 new_msr);
extern void kvmppc_mmu_book3s_64_init(struct kvm_vcpu *vcpu);
extern void kvmppc_mmu_book3s_32_init(struct kvm_vcpu *vcpu);
@@ -114,11 +116,13 @@ extern int kvmppc_mmu_map_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *pte);
extern int kvmppc_mmu_map_segment(struct kvm_vcpu *vcpu, ulong eaddr);
extern void kvmppc_mmu_flush_segments(struct kvm_vcpu *vcpu);
extern struct kvmppc_pte *kvmppc_mmu_find_pte(struct kvm_vcpu *vcpu, u64 ea, bool data);
-extern int kvmppc_ld(struct kvm_vcpu *vcpu, ulong eaddr, int size, void *ptr, bool data);
-extern int kvmppc_st(struct kvm_vcpu *vcpu, ulong eaddr, int size, void *ptr);
+extern int kvmppc_ld(struct kvm_vcpu *vcpu, ulong *eaddr, int size, void *ptr, bool data);
+extern int kvmppc_st(struct kvm_vcpu *vcpu, ulong *eaddr, int size, void *ptr, bool data);
extern void kvmppc_book3s_queue_irqprio(struct kvm_vcpu *vcpu, unsigned int vec);
extern void kvmppc_set_bat(struct kvm_vcpu *vcpu, struct kvmppc_bat *bat,
bool upper, u32 val);
+extern void kvmppc_giveup_ext(struct kvm_vcpu *vcpu, ulong msr);
+extern int kvmppc_emulate_paired_single(struct kvm_run *run, struct kvm_vcpu *vcpu);
extern u32 kvmppc_trampoline_lowmem;
extern u32 kvmppc_trampoline_enter;
@@ -126,6 +130,8 @@ extern void kvmppc_rmcall(ulong srr0, ulong srr1);
extern void kvmppc_load_up_fpu(void);
extern void kvmppc_load_up_altivec(void);
extern void kvmppc_load_up_vsx(void);
+extern u32 kvmppc_alignment_dsisr(struct kvm_vcpu *vcpu, unsigned int inst);
+extern ulong kvmppc_alignment_dar(struct kvm_vcpu *vcpu, unsigned int inst);
static inline struct kvmppc_vcpu_book3s *to_book3s(struct kvm_vcpu *vcpu)
{
@@ -140,7 +146,108 @@ static inline ulong dsisr(void)
}
extern void kvm_return_point(void);
+static inline struct kvmppc_book3s_shadow_vcpu *to_svcpu(struct kvm_vcpu *vcpu);
+
+static inline void kvmppc_set_gpr(struct kvm_vcpu *vcpu, int num, ulong val)
+{
+ if ( num < 14 ) {
+ to_svcpu(vcpu)->gpr[num] = val;
+ to_book3s(vcpu)->shadow_vcpu->gpr[num] = val;
+ } else
+ vcpu->arch.gpr[num] = val;
+}
+
+static inline ulong kvmppc_get_gpr(struct kvm_vcpu *vcpu, int num)
+{
+ if ( num < 14 )
+ return to_svcpu(vcpu)->gpr[num];
+ else
+ return vcpu->arch.gpr[num];
+}
+
+static inline void kvmppc_set_cr(struct kvm_vcpu *vcpu, u32 val)
+{
+ to_svcpu(vcpu)->cr = val;
+ to_book3s(vcpu)->shadow_vcpu->cr = val;
+}
+
+static inline u32 kvmppc_get_cr(struct kvm_vcpu *vcpu)
+{
+ return to_svcpu(vcpu)->cr;
+}
+
+static inline void kvmppc_set_xer(struct kvm_vcpu *vcpu, u32 val)
+{
+ to_svcpu(vcpu)->xer = val;
+ to_book3s(vcpu)->shadow_vcpu->xer = val;
+}
+
+static inline u32 kvmppc_get_xer(struct kvm_vcpu *vcpu)
+{
+ return to_svcpu(vcpu)->xer;
+}
+
+static inline void kvmppc_set_ctr(struct kvm_vcpu *vcpu, ulong val)
+{
+ to_svcpu(vcpu)->ctr = val;
+}
+
+static inline ulong kvmppc_get_ctr(struct kvm_vcpu *vcpu)
+{
+ return to_svcpu(vcpu)->ctr;
+}
+
+static inline void kvmppc_set_lr(struct kvm_vcpu *vcpu, ulong val)
+{
+ to_svcpu(vcpu)->lr = val;
+}
+
+static inline ulong kvmppc_get_lr(struct kvm_vcpu *vcpu)
+{
+ return to_svcpu(vcpu)->lr;
+}
+
+static inline void kvmppc_set_pc(struct kvm_vcpu *vcpu, ulong val)
+{
+ to_svcpu(vcpu)->pc = val;
+}
+
+static inline ulong kvmppc_get_pc(struct kvm_vcpu *vcpu)
+{
+ return to_svcpu(vcpu)->pc;
+}
+
+static inline u32 kvmppc_get_last_inst(struct kvm_vcpu *vcpu)
+{
+ ulong pc = kvmppc_get_pc(vcpu);
+ struct kvmppc_book3s_shadow_vcpu *svcpu = to_svcpu(vcpu);
+
+ /* Load the instruction manually if it failed to do so in the
+ * exit path */
+ if (svcpu->last_inst == KVM_INST_FETCH_FAILED)
+ kvmppc_ld(vcpu, &pc, sizeof(u32), &svcpu->last_inst, false);
+
+ return svcpu->last_inst;
+}
+
+static inline ulong kvmppc_get_fault_dar(struct kvm_vcpu *vcpu)
+{
+ return to_svcpu(vcpu)->fault_dar;
+}
+
+/* Magic register values loaded into r3 and r4 before the 'sc' assembly
+ * instruction for the OSI hypercalls */
+#define OSI_SC_MAGIC_R3 0x113724FA
+#define OSI_SC_MAGIC_R4 0x77810F9B
#define INS_DCBZ 0x7c0007ec
+/* Also add subarch specific defines */
+
+#ifdef CONFIG_PPC_BOOK3S_32
+#include <asm/kvm_book3s_32.h>
+#else
+#include <asm/kvm_book3s_64.h>
+#endif
+
#endif /* __ASM_KVM_BOOK3S_H__ */
diff --git a/arch/powerpc/include/asm/kvm_book3s_32.h b/arch/powerpc/include/asm/kvm_book3s_32.h
new file mode 100644
index 0000000..de604db
--- /dev/null
+++ b/arch/powerpc/include/asm/kvm_book3s_32.h
@@ -0,0 +1,42 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ *
+ * Copyright SUSE Linux Products GmbH 2010
+ *
+ * Authors: Alexander Graf <agraf@suse.de>
+ */
+
+#ifndef __ASM_KVM_BOOK3S_32_H__
+#define __ASM_KVM_BOOK3S_32_H__
+
+static inline struct kvmppc_book3s_shadow_vcpu *to_svcpu(struct kvm_vcpu *vcpu)
+{
+ return to_book3s(vcpu)->shadow_vcpu;
+}
+
+#define PTE_SIZE 12
+#define VSID_ALL 0
+#define SR_INVALID 0x00000001 /* VSID 1 should always be unused */
+#define SR_KP 0x20000000
+#define PTE_V 0x80000000
+#define PTE_SEC 0x00000040
+#define PTE_M 0x00000010
+#define PTE_R 0x00000100
+#define PTE_C 0x00000080
+
+#define SID_SHIFT 28
+#define ESID_MASK 0xf0000000
+#define VSID_MASK 0x00fffffff0000000ULL
+
+#endif /* __ASM_KVM_BOOK3S_32_H__ */
diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h
new file mode 100644
index 0000000..4cadd61
--- /dev/null
+++ b/arch/powerpc/include/asm/kvm_book3s_64.h
@@ -0,0 +1,28 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ *
+ * Copyright SUSE Linux Products GmbH 2010
+ *
+ * Authors: Alexander Graf <agraf@suse.de>
+ */
+
+#ifndef __ASM_KVM_BOOK3S_64_H__
+#define __ASM_KVM_BOOK3S_64_H__
+
+static inline struct kvmppc_book3s_shadow_vcpu *to_svcpu(struct kvm_vcpu *vcpu)
+{
+ return &get_paca()->shadow_vcpu;
+}
+
+#endif /* __ASM_KVM_BOOK3S_64_H__ */
diff --git a/arch/powerpc/include/asm/kvm_book3s_64_asm.h b/arch/powerpc/include/asm/kvm_book3s_asm.h
index 183461b..36fdb3a 100644
--- a/arch/powerpc/include/asm/kvm_book3s_64_asm.h
+++ b/arch/powerpc/include/asm/kvm_book3s_asm.h
@@ -22,7 +22,7 @@
#ifdef __ASSEMBLY__
-#ifdef CONFIG_KVM_BOOK3S_64_HANDLER
+#ifdef CONFIG_KVM_BOOK3S_HANDLER
#include <asm/kvm_asm.h>
@@ -55,7 +55,7 @@ kvmppc_resume_\intno:
.macro DO_KVM intno
.endm
-#endif /* CONFIG_KVM_BOOK3S_64_HANDLER */
+#endif /* CONFIG_KVM_BOOK3S_HANDLER */
#else /*__ASSEMBLY__ */
@@ -63,12 +63,33 @@ struct kvmppc_book3s_shadow_vcpu {
ulong gpr[14];
u32 cr;
u32 xer;
+
+ u32 fault_dsisr;
+ u32 last_inst;
+ ulong ctr;
+ ulong lr;
+ ulong pc;
+ ulong shadow_srr1;
+ ulong fault_dar;
+
ulong host_r1;
ulong host_r2;
ulong handler;
ulong scratch0;
ulong scratch1;
ulong vmhandler;
+ u8 in_guest;
+
+#ifdef CONFIG_PPC_BOOK3S_32
+ u32 sr[16]; /* Guest SRs */
+#endif
+#ifdef CONFIG_PPC_BOOK3S_64
+ u8 slb_max; /* highest used guest slb entry */
+ struct {
+ u64 esid;
+ u64 vsid;
+ } slb[64]; /* guest SLB */
+#endif
};
#endif /*__ASSEMBLY__ */
diff --git a/arch/powerpc/include/asm/kvm_booke.h b/arch/powerpc/include/asm/kvm_booke.h
new file mode 100644
index 0000000..9c9ba3d
--- /dev/null
+++ b/arch/powerpc/include/asm/kvm_booke.h
@@ -0,0 +1,96 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ *
+ * Copyright SUSE Linux Products GmbH 2010
+ *
+ * Authors: Alexander Graf <agraf@suse.de>
+ */
+
+#ifndef __ASM_KVM_BOOKE_H__
+#define __ASM_KVM_BOOKE_H__
+
+#include <linux/types.h>
+#include <linux/kvm_host.h>
+
+static inline void kvmppc_set_gpr(struct kvm_vcpu *vcpu, int num, ulong val)
+{
+ vcpu->arch.gpr[num] = val;
+}
+
+static inline ulong kvmppc_get_gpr(struct kvm_vcpu *vcpu, int num)
+{
+ return vcpu->arch.gpr[num];
+}
+
+static inline void kvmppc_set_cr(struct kvm_vcpu *vcpu, u32 val)
+{
+ vcpu->arch.cr = val;
+}
+
+static inline u32 kvmppc_get_cr(struct kvm_vcpu *vcpu)
+{
+ return vcpu->arch.cr;
+}
+
+static inline void kvmppc_set_xer(struct kvm_vcpu *vcpu, u32 val)
+{
+ vcpu->arch.xer = val;
+}
+
+static inline u32 kvmppc_get_xer(struct kvm_vcpu *vcpu)
+{
+ return vcpu->arch.xer;
+}
+
+static inline u32 kvmppc_get_last_inst(struct kvm_vcpu *vcpu)
+{
+ return vcpu->arch.last_inst;
+}
+
+static inline void kvmppc_set_ctr(struct kvm_vcpu *vcpu, ulong val)
+{
+ vcpu->arch.ctr = val;
+}
+
+static inline ulong kvmppc_get_ctr(struct kvm_vcpu *vcpu)
+{
+ return vcpu->arch.ctr;
+}
+
+static inline void kvmppc_set_lr(struct kvm_vcpu *vcpu, ulong val)
+{
+ vcpu->arch.lr = val;
+}
+
+static inline ulong kvmppc_get_lr(struct kvm_vcpu *vcpu)
+{
+ return vcpu->arch.lr;
+}
+
+static inline void kvmppc_set_pc(struct kvm_vcpu *vcpu, ulong val)
+{
+ vcpu->arch.pc = val;
+}
+
+static inline ulong kvmppc_get_pc(struct kvm_vcpu *vcpu)
+{
+ return vcpu->arch.pc;
+}
+
+static inline ulong kvmppc_get_fault_dar(struct kvm_vcpu *vcpu)
+{
+ return vcpu->arch.fault_dear;
+}
+
+#endif /* __ASM_KVM_BOOKE_H__ */
diff --git a/arch/powerpc/include/asm/kvm_fpu.h b/arch/powerpc/include/asm/kvm_fpu.h
new file mode 100644
index 0000000..94f05de
--- /dev/null
+++ b/arch/powerpc/include/asm/kvm_fpu.h
@@ -0,0 +1,85 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ *
+ * Copyright Novell Inc. 2010
+ *
+ * Authors: Alexander Graf <agraf@suse.de>
+ */
+
+#ifndef __ASM_KVM_FPU_H__
+#define __ASM_KVM_FPU_H__
+
+#include <linux/types.h>
+
+extern void fps_fres(struct thread_struct *t, u32 *dst, u32 *src1);
+extern void fps_frsqrte(struct thread_struct *t, u32 *dst, u32 *src1);
+extern void fps_fsqrts(struct thread_struct *t, u32 *dst, u32 *src1);
+
+extern void fps_fadds(struct thread_struct *t, u32 *dst, u32 *src1, u32 *src2);
+extern void fps_fdivs(struct thread_struct *t, u32 *dst, u32 *src1, u32 *src2);
+extern void fps_fmuls(struct thread_struct *t, u32 *dst, u32 *src1, u32 *src2);
+extern void fps_fsubs(struct thread_struct *t, u32 *dst, u32 *src1, u32 *src2);
+
+extern void fps_fmadds(struct thread_struct *t, u32 *dst, u32 *src1, u32 *src2,
+ u32 *src3);
+extern void fps_fmsubs(struct thread_struct *t, u32 *dst, u32 *src1, u32 *src2,
+ u32 *src3);
+extern void fps_fnmadds(struct thread_struct *t, u32 *dst, u32 *src1, u32 *src2,
+ u32 *src3);
+extern void fps_fnmsubs(struct thread_struct *t, u32 *dst, u32 *src1, u32 *src2,
+ u32 *src3);
+extern void fps_fsel(struct thread_struct *t, u32 *dst, u32 *src1, u32 *src2,
+ u32 *src3);
+
+#define FPD_ONE_IN(name) extern void fpd_ ## name(u64 *fpscr, u32 *cr, \
+ u64 *dst, u64 *src1);
+#define FPD_TWO_IN(name) extern void fpd_ ## name(u64 *fpscr, u32 *cr, \
+ u64 *dst, u64 *src1, u64 *src2);
+#define FPD_THREE_IN(name) extern void fpd_ ## name(u64 *fpscr, u32 *cr, \
+ u64 *dst, u64 *src1, u64 *src2, u64 *src3);
+
+extern void fpd_fcmpu(u64 *fpscr, u32 *cr, u64 *src1, u64 *src2);
+extern void fpd_fcmpo(u64 *fpscr, u32 *cr, u64 *src1, u64 *src2);
+
+FPD_ONE_IN(fsqrts)
+FPD_ONE_IN(frsqrtes)
+FPD_ONE_IN(fres)
+FPD_ONE_IN(frsp)
+FPD_ONE_IN(fctiw)
+FPD_ONE_IN(fctiwz)
+FPD_ONE_IN(fsqrt)
+FPD_ONE_IN(fre)
+FPD_ONE_IN(frsqrte)
+FPD_ONE_IN(fneg)
+FPD_ONE_IN(fabs)
+FPD_TWO_IN(fadds)
+FPD_TWO_IN(fsubs)
+FPD_TWO_IN(fdivs)
+FPD_TWO_IN(fmuls)
+FPD_TWO_IN(fcpsgn)
+FPD_TWO_IN(fdiv)
+FPD_TWO_IN(fadd)
+FPD_TWO_IN(fmul)
+FPD_TWO_IN(fsub)
+FPD_THREE_IN(fmsubs)
+FPD_THREE_IN(fmadds)
+FPD_THREE_IN(fnmsubs)
+FPD_THREE_IN(fnmadds)
+FPD_THREE_IN(fsel)
+FPD_THREE_IN(fmsub)
+FPD_THREE_IN(fmadd)
+FPD_THREE_IN(fnmsub)
+FPD_THREE_IN(fnmadd)
+
+#endif
diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index 5e5bae7..0c9ad86 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -66,7 +66,7 @@ struct kvm_vcpu_stat {
u32 dec_exits;
u32 ext_intr_exits;
u32 halt_wakeup;
-#ifdef CONFIG_PPC64
+#ifdef CONFIG_PPC_BOOK3S
u32 pf_storage;
u32 pf_instruc;
u32 sp_storage;
@@ -124,12 +124,12 @@ struct kvm_arch {
};
struct kvmppc_pte {
- u64 eaddr;
+ ulong eaddr;
u64 vpage;
- u64 raddr;
- bool may_read;
- bool may_write;
- bool may_execute;
+ ulong raddr;
+ bool may_read : 1;
+ bool may_write : 1;
+ bool may_execute : 1;
};
struct kvmppc_mmu {
@@ -145,7 +145,7 @@ struct kvmppc_mmu {
int (*xlate)(struct kvm_vcpu *vcpu, gva_t eaddr, struct kvmppc_pte *pte, bool data);
void (*reset_msr)(struct kvm_vcpu *vcpu);
void (*tlbie)(struct kvm_vcpu *vcpu, ulong addr, bool large);
- int (*esid_to_vsid)(struct kvm_vcpu *vcpu, u64 esid, u64 *vsid);
+ int (*esid_to_vsid)(struct kvm_vcpu *vcpu, ulong esid, u64 *vsid);
u64 (*ea_to_vp)(struct kvm_vcpu *vcpu, gva_t eaddr, bool data);
bool (*is_dcbz32)(struct kvm_vcpu *vcpu);
};
@@ -160,7 +160,7 @@ struct hpte_cache {
struct kvm_vcpu_arch {
ulong host_stack;
u32 host_pid;
-#ifdef CONFIG_PPC64
+#ifdef CONFIG_PPC_BOOK3S
ulong host_msr;
ulong host_r2;
void *host_retip;
@@ -175,7 +175,7 @@ struct kvm_vcpu_arch {
ulong gpr[32];
u64 fpr[32];
- u32 fpscr;
+ u64 fpscr;
#ifdef CONFIG_ALTIVEC
vector128 vr[32];
@@ -186,19 +186,23 @@ struct kvm_vcpu_arch {
u64 vsr[32];
#endif
+#ifdef CONFIG_PPC_BOOK3S
+ /* For Gekko paired singles */
+ u32 qpr[32];
+#endif
+
+#ifdef CONFIG_BOOKE
ulong pc;
ulong ctr;
ulong lr;
-#ifdef CONFIG_BOOKE
ulong xer;
u32 cr;
#endif
ulong msr;
-#ifdef CONFIG_PPC64
+#ifdef CONFIG_PPC_BOOK3S
ulong shadow_msr;
- ulong shadow_srr1;
ulong hflags;
ulong guest_owned_ext;
#endif
@@ -253,20 +257,22 @@ struct kvm_vcpu_arch {
struct dentry *debugfs_exit_timing;
#endif
+#ifdef CONFIG_BOOKE
u32 last_inst;
-#ifdef CONFIG_PPC64
- ulong fault_dsisr;
-#endif
ulong fault_dear;
ulong fault_esr;
ulong queued_dear;
ulong queued_esr;
+#endif
gpa_t paddr_accessed;
u8 io_gpr; /* GPR used as IO source/target */
u8 mmio_is_bigendian;
+ u8 mmio_sign_extend;
u8 dcr_needed;
u8 dcr_is_write;
+ u8 osi_needed;
+ u8 osi_enabled;
u32 cpr0_cfgaddr; /* holds the last set cpr0_cfgaddr */
@@ -275,7 +281,7 @@ struct kvm_vcpu_arch {
u64 dec_jiffies;
unsigned long pending_exceptions;
-#ifdef CONFIG_PPC64
+#ifdef CONFIG_PPC_BOOK3S
struct hpte_cache hpte_cache[HPTEG_CACHE_NUM];
int hpte_cache_offset;
#endif
diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h
index e264282..18d139e 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -30,6 +30,8 @@
#include <linux/kvm_host.h>
#ifdef CONFIG_PPC_BOOK3S
#include <asm/kvm_book3s.h>
+#else
+#include <asm/kvm_booke.h>
#endif
enum emulation_result {
@@ -37,6 +39,7 @@ enum emulation_result {
EMULATE_DO_MMIO, /* kvm_run filled with MMIO request */
EMULATE_DO_DCR, /* kvm_run filled with DCR request */
EMULATE_FAIL, /* can't emulate this instruction */
+ EMULATE_AGAIN, /* something went wrong. go again */
};
extern int __kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu);
@@ -48,8 +51,11 @@ extern void kvmppc_dump_vcpu(struct kvm_vcpu *vcpu);
extern int kvmppc_handle_load(struct kvm_run *run, struct kvm_vcpu *vcpu,
unsigned int rt, unsigned int bytes,
int is_bigendian);
+extern int kvmppc_handle_loads(struct kvm_run *run, struct kvm_vcpu *vcpu,
+ unsigned int rt, unsigned int bytes,
+ int is_bigendian);
extern int kvmppc_handle_store(struct kvm_run *run, struct kvm_vcpu *vcpu,
- u32 val, unsigned int bytes, int is_bigendian);
+ u64 val, unsigned int bytes, int is_bigendian);
extern int kvmppc_emulate_instruction(struct kvm_run *run,
struct kvm_vcpu *vcpu);
@@ -63,6 +69,7 @@ extern void kvmppc_mmu_map(struct kvm_vcpu *vcpu, u64 gvaddr, gpa_t gpaddr,
extern void kvmppc_mmu_priv_switch(struct kvm_vcpu *vcpu, int usermode);
extern void kvmppc_mmu_switch_pid(struct kvm_vcpu *vcpu, u32 pid);
extern void kvmppc_mmu_destroy(struct kvm_vcpu *vcpu);
+extern int kvmppc_mmu_init(struct kvm_vcpu *vcpu);
extern int kvmppc_mmu_dtlb_index(struct kvm_vcpu *vcpu, gva_t eaddr);
extern int kvmppc_mmu_itlb_index(struct kvm_vcpu *vcpu, gva_t eaddr);
extern gpa_t kvmppc_mmu_xlate(struct kvm_vcpu *vcpu, unsigned int gtlb_index,
@@ -88,6 +95,8 @@ extern void kvmppc_core_queue_dec(struct kvm_vcpu *vcpu);
extern void kvmppc_core_dequeue_dec(struct kvm_vcpu *vcpu);
extern void kvmppc_core_queue_external(struct kvm_vcpu *vcpu,
struct kvm_interrupt *irq);
+extern void kvmppc_core_dequeue_external(struct kvm_vcpu *vcpu,
+ struct kvm_interrupt *irq);
extern int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,
unsigned int op, int *advance);
@@ -99,81 +108,37 @@ extern void kvmppc_booke_exit(void);
extern void kvmppc_core_destroy_mmu(struct kvm_vcpu *vcpu);
-#ifdef CONFIG_PPC_BOOK3S
-
-/* We assume we're always acting on the current vcpu */
-
-static inline void kvmppc_set_gpr(struct kvm_vcpu *vcpu, int num, ulong val)
-{
- if ( num < 14 ) {
- get_paca()->shadow_vcpu.gpr[num] = val;
- to_book3s(vcpu)->shadow_vcpu.gpr[num] = val;
- } else
- vcpu->arch.gpr[num] = val;
-}
-
-static inline ulong kvmppc_get_gpr(struct kvm_vcpu *vcpu, int num)
-{
- if ( num < 14 )
- return get_paca()->shadow_vcpu.gpr[num];
- else
- return vcpu->arch.gpr[num];
-}
-
-static inline void kvmppc_set_cr(struct kvm_vcpu *vcpu, u32 val)
-{
- get_paca()->shadow_vcpu.cr = val;
- to_book3s(vcpu)->shadow_vcpu.cr = val;
-}
-
-static inline u32 kvmppc_get_cr(struct kvm_vcpu *vcpu)
-{
- return get_paca()->shadow_vcpu.cr;
-}
-
-static inline void kvmppc_set_xer(struct kvm_vcpu *vcpu, u32 val)
-{
- get_paca()->shadow_vcpu.xer = val;
- to_book3s(vcpu)->shadow_vcpu.xer = val;
-}
-
-static inline u32 kvmppc_get_xer(struct kvm_vcpu *vcpu)
+/*
+ * Cuts out inst bits with ordering according to spec.
+ * That means the leftmost bit is zero. All given bits are included.
+ */
+static inline u32 kvmppc_get_field(u64 inst, int msb, int lsb)
{
- return get_paca()->shadow_vcpu.xer;
-}
+ u32 r;
+ u32 mask;
-#else
+ BUG_ON(msb > lsb);
-static inline void kvmppc_set_gpr(struct kvm_vcpu *vcpu, int num, ulong val)
-{
- vcpu->arch.gpr[num] = val;
-}
+ mask = (1 << (lsb - msb + 1)) - 1;
+ r = (inst >> (63 - lsb)) & mask;
-static inline ulong kvmppc_get_gpr(struct kvm_vcpu *vcpu, int num)
-{
- return vcpu->arch.gpr[num];
+ return r;
}
-static inline void kvmppc_set_cr(struct kvm_vcpu *vcpu, u32 val)
+/*
+ * Replaces inst bits with ordering according to spec.
+ */
+static inline u32 kvmppc_set_field(u64 inst, int msb, int lsb, int value)
{
- vcpu->arch.cr = val;
-}
+ u32 r;
+ u32 mask;
-static inline u32 kvmppc_get_cr(struct kvm_vcpu *vcpu)
-{
- return vcpu->arch.cr;
-}
+ BUG_ON(msb > lsb);
-static inline void kvmppc_set_xer(struct kvm_vcpu *vcpu, u32 val)
-{
- vcpu->arch.xer = val;
-}
+ mask = ((1 << (lsb - msb + 1)) - 1) << (63 - lsb);
+ r = (inst & ~mask) | ((value << (63 - lsb)) & mask);
-static inline u32 kvmppc_get_xer(struct kvm_vcpu *vcpu)
-{
- return vcpu->arch.xer;
+ return r;
}
-#endif
-
#endif /* __POWERPC_KVM_PPC_H__ */
diff --git a/arch/powerpc/include/asm/mmu_context.h b/arch/powerpc/include/asm/mmu_context.h
index 26383e0..81fb412 100644
--- a/arch/powerpc/include/asm/mmu_context.h
+++ b/arch/powerpc/include/asm/mmu_context.h
@@ -27,6 +27,8 @@ extern int __init_new_context(void);
extern void __destroy_context(int context_id);
static inline void mmu_context_init(void) { }
#else
+extern unsigned long __init_new_context(void);
+extern void __destroy_context(unsigned long context_id);
extern void mmu_context_init(void);
#endif
diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h
index 971dfa4..8ce7963 100644
--- a/arch/powerpc/include/asm/paca.h
+++ b/arch/powerpc/include/asm/paca.h
@@ -23,7 +23,7 @@
#include <asm/page.h>
#include <asm/exception-64e.h>
#ifdef CONFIG_KVM_BOOK3S_64_HANDLER
-#include <asm/kvm_book3s_64_asm.h>
+#include <asm/kvm_book3s_asm.h>
#endif
register struct paca_struct *local_paca asm("r13");
@@ -137,15 +137,9 @@ struct paca_struct {
u64 startpurr; /* PURR/TB value snapshot */
u64 startspurr; /* SPURR value snapshot */
-#ifdef CONFIG_KVM_BOOK3S_64_HANDLER
- struct {
- u64 esid;
- u64 vsid;
- } kvm_slb[64]; /* guest SLB */
+#ifdef CONFIG_KVM_BOOK3S_HANDLER
/* We use this to store guest state in */
struct kvmppc_book3s_shadow_vcpu shadow_vcpu;
- u8 kvm_slb_max; /* highest used guest slb entry */
- u8 kvm_in_guest; /* are we inside the guest? */
#endif
};
diff --git a/arch/powerpc/include/asm/processor.h b/arch/powerpc/include/asm/processor.h
index 221ba62..7492fe8 100644
--- a/arch/powerpc/include/asm/processor.h
+++ b/arch/powerpc/include/asm/processor.h
@@ -229,6 +229,9 @@ struct thread_struct {
unsigned long spefscr; /* SPE & eFP status */
int used_spe; /* set if process has used spe */
#endif /* CONFIG_SPE */
+#ifdef CONFIG_KVM_BOOK3S_32_HANDLER
+ void* kvm_shadow_vcpu; /* KVM internal data */
+#endif /* CONFIG_KVM_BOOK3S_32_HANDLER */
};
#define ARCH_MIN_TASKALIGN 16
diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h
index b68f025..d62fdf4 100644
--- a/arch/powerpc/include/asm/reg.h
+++ b/arch/powerpc/include/asm/reg.h
@@ -293,10 +293,12 @@
#define HID1_ABE (1<<10) /* 7450 Address Broadcast Enable */
#define HID1_PS (1<<16) /* 750FX PLL selection */
#define SPRN_HID2 0x3F8 /* Hardware Implementation Register 2 */
+#define SPRN_HID2_GEKKO 0x398 /* Gekko HID2 Register */
#define SPRN_IABR 0x3F2 /* Instruction Address Breakpoint Register */
#define SPRN_IABR2 0x3FA /* 83xx */
#define SPRN_IBCR 0x135 /* 83xx Insn Breakpoint Control Reg */
#define SPRN_HID4 0x3F4 /* 970 HID4 */
+#define SPRN_HID4_GEKKO 0x3F3 /* Gekko HID4 */
#define SPRN_HID5 0x3F6 /* 970 HID5 */
#define SPRN_HID6 0x3F9 /* BE HID 6 */
#define HID6_LB (0x0F<<12) /* Concurrent Large Page Modes */
@@ -465,6 +467,14 @@
#define SPRN_VRSAVE 0x100 /* Vector Register Save Register */
#define SPRN_XER 0x001 /* Fixed Point Exception Register */
+#define SPRN_MMCR0_GEKKO 0x3B8 /* Gekko Monitor Mode Control Register 0 */
+#define SPRN_MMCR1_GEKKO 0x3BC /* Gekko Monitor Mode Control Register 1 */
+#define SPRN_PMC1_GEKKO 0x3B9 /* Gekko Performance Monitor Control 1 */
+#define SPRN_PMC2_GEKKO 0x3BA /* Gekko Performance Monitor Control 2 */
+#define SPRN_PMC3_GEKKO 0x3BD /* Gekko Performance Monitor Control 3 */
+#define SPRN_PMC4_GEKKO 0x3BE /* Gekko Performance Monitor Control 4 */
+#define SPRN_WPAR_GEKKO 0x399 /* Gekko Write Pipe Address Register */
+
#define SPRN_SCOMC 0x114 /* SCOM Access Control */
#define SPRN_SCOMD 0x115 /* SCOM Access DATA */
diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
index 28a686f..496cc5b 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -50,6 +50,9 @@
#endif
#ifdef CONFIG_KVM
#include <linux/kvm_host.h>
+#ifndef CONFIG_BOOKE
+#include <asm/kvm_book3s.h>
+#endif
#endif
#ifdef CONFIG_PPC32
@@ -105,6 +108,9 @@ int main(void)
DEFINE(THREAD_USED_SPE, offsetof(struct thread_struct, used_spe));
#endif /* CONFIG_SPE */
#endif /* CONFIG_PPC64 */
+#ifdef CONFIG_KVM_BOOK3S_32_HANDLER
+ DEFINE(THREAD_KVM_SVCPU, offsetof(struct thread_struct, kvm_shadow_vcpu));
+#endif
DEFINE(TI_FLAGS, offsetof(struct thread_info, flags));
DEFINE(TI_LOCAL_FLAGS, offsetof(struct thread_info, local_flags));
@@ -191,33 +197,9 @@ int main(void)
DEFINE(PACA_DATA_OFFSET, offsetof(struct paca_struct, data_offset));
DEFINE(PACA_TRAP_SAVE, offsetof(struct paca_struct, trap_save));
#ifdef CONFIG_KVM_BOOK3S_64_HANDLER
- DEFINE(PACA_KVM_IN_GUEST, offsetof(struct paca_struct, kvm_in_guest));
- DEFINE(PACA_KVM_SLB, offsetof(struct paca_struct, kvm_slb));
- DEFINE(PACA_KVM_SLB_MAX, offsetof(struct paca_struct, kvm_slb_max));
- DEFINE(PACA_KVM_CR, offsetof(struct paca_struct, shadow_vcpu.cr));
- DEFINE(PACA_KVM_XER, offsetof(struct paca_struct, shadow_vcpu.xer));
- DEFINE(PACA_KVM_R0, offsetof(struct paca_struct, shadow_vcpu.gpr[0]));
- DEFINE(PACA_KVM_R1, offsetof(struct paca_struct, shadow_vcpu.gpr[1]));
- DEFINE(PACA_KVM_R2, offsetof(struct paca_struct, shadow_vcpu.gpr[2]));
- DEFINE(PACA_KVM_R3, offsetof(struct paca_struct, shadow_vcpu.gpr[3]));
- DEFINE(PACA_KVM_R4, offsetof(struct paca_struct, shadow_vcpu.gpr[4]));
- DEFINE(PACA_KVM_R5, offsetof(struct paca_struct, shadow_vcpu.gpr[5]));
- DEFINE(PACA_KVM_R6, offsetof(struct paca_struct, shadow_vcpu.gpr[6]));
- DEFINE(PACA_KVM_R7, offsetof(struct paca_struct, shadow_vcpu.gpr[7]));
- DEFINE(PACA_KVM_R8, offsetof(struct paca_struct, shadow_vcpu.gpr[8]));
- DEFINE(PACA_KVM_R9, offsetof(struct paca_struct, shadow_vcpu.gpr[9]));
- DEFINE(PACA_KVM_R10, offsetof(struct paca_struct, shadow_vcpu.gpr[10]));
- DEFINE(PACA_KVM_R11, offsetof(struct paca_struct, shadow_vcpu.gpr[11]));
- DEFINE(PACA_KVM_R12, offsetof(struct paca_struct, shadow_vcpu.gpr[12]));
- DEFINE(PACA_KVM_R13, offsetof(struct paca_struct, shadow_vcpu.gpr[13]));
- DEFINE(PACA_KVM_HOST_R1, offsetof(struct paca_struct, shadow_vcpu.host_r1));
- DEFINE(PACA_KVM_HOST_R2, offsetof(struct paca_struct, shadow_vcpu.host_r2));
- DEFINE(PACA_KVM_VMHANDLER, offsetof(struct paca_struct,
- shadow_vcpu.vmhandler));
- DEFINE(PACA_KVM_SCRATCH0, offsetof(struct paca_struct,
- shadow_vcpu.scratch0));
- DEFINE(PACA_KVM_SCRATCH1, offsetof(struct paca_struct,
- shadow_vcpu.scratch1));
+ DEFINE(PACA_KVM_SVCPU, offsetof(struct paca_struct, shadow_vcpu));
+ DEFINE(SVCPU_SLB, offsetof(struct kvmppc_book3s_shadow_vcpu, slb));
+ DEFINE(SVCPU_SLB_MAX, offsetof(struct kvmppc_book3s_shadow_vcpu, slb_max));
#endif
#endif /* CONFIG_PPC64 */
@@ -228,8 +210,8 @@ int main(void)
/* Interrupt register frame */
DEFINE(STACK_FRAME_OVERHEAD, STACK_FRAME_OVERHEAD);
DEFINE(INT_FRAME_SIZE, STACK_INT_FRAME_SIZE);
-#ifdef CONFIG_PPC64
DEFINE(SWITCH_FRAME_SIZE, STACK_FRAME_OVERHEAD + sizeof(struct pt_regs));
+#ifdef CONFIG_PPC64
/* Create extra stack space for SRR0 and SRR1 when calling prom/rtas. */
DEFINE(PROM_FRAME_SIZE, STACK_FRAME_OVERHEAD + sizeof(struct pt_regs) + 16);
DEFINE(RTAS_FRAME_SIZE, STACK_FRAME_OVERHEAD + sizeof(struct pt_regs) + 16);
@@ -412,9 +394,6 @@ int main(void)
DEFINE(VCPU_HOST_STACK, offsetof(struct kvm_vcpu, arch.host_stack));
DEFINE(VCPU_HOST_PID, offsetof(struct kvm_vcpu, arch.host_pid));
DEFINE(VCPU_GPRS, offsetof(struct kvm_vcpu, arch.gpr));
- DEFINE(VCPU_LR, offsetof(struct kvm_vcpu, arch.lr));
- DEFINE(VCPU_CTR, offsetof(struct kvm_vcpu, arch.ctr));
- DEFINE(VCPU_PC, offsetof(struct kvm_vcpu, arch.pc));
DEFINE(VCPU_MSR, offsetof(struct kvm_vcpu, arch.msr));
DEFINE(VCPU_SPRG4, offsetof(struct kvm_vcpu, arch.sprg4));
DEFINE(VCPU_SPRG5, offsetof(struct kvm_vcpu, arch.sprg5));
@@ -422,27 +401,68 @@ int main(void)
DEFINE(VCPU_SPRG7, offsetof(struct kvm_vcpu, arch.sprg7));
DEFINE(VCPU_SHADOW_PID, offsetof(struct kvm_vcpu, arch.shadow_pid));
- DEFINE(VCPU_LAST_INST, offsetof(struct kvm_vcpu, arch.last_inst));
- DEFINE(VCPU_FAULT_DEAR, offsetof(struct kvm_vcpu, arch.fault_dear));
- DEFINE(VCPU_FAULT_ESR, offsetof(struct kvm_vcpu, arch.fault_esr));
-
- /* book3s_64 */
-#ifdef CONFIG_PPC64
- DEFINE(VCPU_FAULT_DSISR, offsetof(struct kvm_vcpu, arch.fault_dsisr));
+ /* book3s */
+#ifdef CONFIG_PPC_BOOK3S
DEFINE(VCPU_HOST_RETIP, offsetof(struct kvm_vcpu, arch.host_retip));
- DEFINE(VCPU_HOST_R2, offsetof(struct kvm_vcpu, arch.host_r2));
DEFINE(VCPU_HOST_MSR, offsetof(struct kvm_vcpu, arch.host_msr));
DEFINE(VCPU_SHADOW_MSR, offsetof(struct kvm_vcpu, arch.shadow_msr));
- DEFINE(VCPU_SHADOW_SRR1, offsetof(struct kvm_vcpu, arch.shadow_srr1));
DEFINE(VCPU_TRAMPOLINE_LOWMEM, offsetof(struct kvm_vcpu, arch.trampoline_lowmem));
DEFINE(VCPU_TRAMPOLINE_ENTER, offsetof(struct kvm_vcpu, arch.trampoline_enter));
DEFINE(VCPU_HIGHMEM_HANDLER, offsetof(struct kvm_vcpu, arch.highmem_handler));
DEFINE(VCPU_RMCALL, offsetof(struct kvm_vcpu, arch.rmcall));
DEFINE(VCPU_HFLAGS, offsetof(struct kvm_vcpu, arch.hflags));
+ DEFINE(VCPU_SVCPU, offsetof(struct kvmppc_vcpu_book3s, shadow_vcpu) -
+ offsetof(struct kvmppc_vcpu_book3s, vcpu));
+ DEFINE(SVCPU_CR, offsetof(struct kvmppc_book3s_shadow_vcpu, cr));
+ DEFINE(SVCPU_XER, offsetof(struct kvmppc_book3s_shadow_vcpu, xer));
+ DEFINE(SVCPU_CTR, offsetof(struct kvmppc_book3s_shadow_vcpu, ctr));
+ DEFINE(SVCPU_LR, offsetof(struct kvmppc_book3s_shadow_vcpu, lr));
+ DEFINE(SVCPU_PC, offsetof(struct kvmppc_book3s_shadow_vcpu, pc));
+ DEFINE(SVCPU_R0, offsetof(struct kvmppc_book3s_shadow_vcpu, gpr[0]));
+ DEFINE(SVCPU_R1, offsetof(struct kvmppc_book3s_shadow_vcpu, gpr[1]));
+ DEFINE(SVCPU_R2, offsetof(struct kvmppc_book3s_shadow_vcpu, gpr[2]));
+ DEFINE(SVCPU_R3, offsetof(struct kvmppc_book3s_shadow_vcpu, gpr[3]));
+ DEFINE(SVCPU_R4, offsetof(struct kvmppc_book3s_shadow_vcpu, gpr[4]));
+ DEFINE(SVCPU_R5, offsetof(struct kvmppc_book3s_shadow_vcpu, gpr[5]));
+ DEFINE(SVCPU_R6, offsetof(struct kvmppc_book3s_shadow_vcpu, gpr[6]));
+ DEFINE(SVCPU_R7, offsetof(struct kvmppc_book3s_shadow_vcpu, gpr[7]));
+ DEFINE(SVCPU_R8, offsetof(struct kvmppc_book3s_shadow_vcpu, gpr[8]));
+ DEFINE(SVCPU_R9, offsetof(struct kvmppc_book3s_shadow_vcpu, gpr[9]));
+ DEFINE(SVCPU_R10, offsetof(struct kvmppc_book3s_shadow_vcpu, gpr[10]));
+ DEFINE(SVCPU_R11, offsetof(struct kvmppc_book3s_shadow_vcpu, gpr[11]));
+ DEFINE(SVCPU_R12, offsetof(struct kvmppc_book3s_shadow_vcpu, gpr[12]));
+ DEFINE(SVCPU_R13, offsetof(struct kvmppc_book3s_shadow_vcpu, gpr[13]));
+ DEFINE(SVCPU_HOST_R1, offsetof(struct kvmppc_book3s_shadow_vcpu, host_r1));
+ DEFINE(SVCPU_HOST_R2, offsetof(struct kvmppc_book3s_shadow_vcpu, host_r2));
+ DEFINE(SVCPU_VMHANDLER, offsetof(struct kvmppc_book3s_shadow_vcpu,
+ vmhandler));
+ DEFINE(SVCPU_SCRATCH0, offsetof(struct kvmppc_book3s_shadow_vcpu,
+ scratch0));
+ DEFINE(SVCPU_SCRATCH1, offsetof(struct kvmppc_book3s_shadow_vcpu,
+ scratch1));
+ DEFINE(SVCPU_IN_GUEST, offsetof(struct kvmppc_book3s_shadow_vcpu,
+ in_guest));
+ DEFINE(SVCPU_FAULT_DSISR, offsetof(struct kvmppc_book3s_shadow_vcpu,
+ fault_dsisr));
+ DEFINE(SVCPU_FAULT_DAR, offsetof(struct kvmppc_book3s_shadow_vcpu,
+ fault_dar));
+ DEFINE(SVCPU_LAST_INST, offsetof(struct kvmppc_book3s_shadow_vcpu,
+ last_inst));
+ DEFINE(SVCPU_SHADOW_SRR1, offsetof(struct kvmppc_book3s_shadow_vcpu,
+ shadow_srr1));
+#ifdef CONFIG_PPC_BOOK3S_32
+ DEFINE(SVCPU_SR, offsetof(struct kvmppc_book3s_shadow_vcpu, sr));
+#endif
#else
DEFINE(VCPU_CR, offsetof(struct kvm_vcpu, arch.cr));
DEFINE(VCPU_XER, offsetof(struct kvm_vcpu, arch.xer));
-#endif /* CONFIG_PPC64 */
+ DEFINE(VCPU_LR, offsetof(struct kvm_vcpu, arch.lr));
+ DEFINE(VCPU_CTR, offsetof(struct kvm_vcpu, arch.ctr));
+ DEFINE(VCPU_PC, offsetof(struct kvm_vcpu, arch.pc));
+ DEFINE(VCPU_LAST_INST, offsetof(struct kvm_vcpu, arch.last_inst));
+ DEFINE(VCPU_FAULT_DEAR, offsetof(struct kvm_vcpu, arch.fault_dear));
+ DEFINE(VCPU_FAULT_ESR, offsetof(struct kvm_vcpu, arch.fault_esr));
+#endif /* CONFIG_PPC_BOOK3S */
#endif
#ifdef CONFIG_44x
DEFINE(PGD_T_LOG2, PGD_T_LOG2);
diff --git a/arch/powerpc/kernel/head_32.S b/arch/powerpc/kernel/head_32.S
index e025e89..98c4b29 100644
--- a/arch/powerpc/kernel/head_32.S
+++ b/arch/powerpc/kernel/head_32.S
@@ -33,6 +33,7 @@
#include <asm/asm-offsets.h>
#include <asm/ptrace.h>
#include <asm/bug.h>
+#include <asm/kvm_book3s_asm.h>
/* 601 only have IBAT; cr0.eq is set on 601 when using this macro */
#define LOAD_BAT(n, reg, RA, RB) \
@@ -303,6 +304,7 @@ __secondary_hold_acknowledge:
*/
#define EXCEPTION(n, label, hdlr, xfer) \
. = n; \
+ DO_KVM n; \
label: \
EXCEPTION_PROLOG; \
addi r3,r1,STACK_FRAME_OVERHEAD; \
@@ -358,6 +360,7 @@ i##n: \
* -- paulus.
*/
. = 0x200
+ DO_KVM 0x200
mtspr SPRN_SPRG_SCRATCH0,r10
mtspr SPRN_SPRG_SCRATCH1,r11
mfcr r10
@@ -381,6 +384,7 @@ i##n: \
/* Data access exception. */
. = 0x300
+ DO_KVM 0x300
DataAccess:
EXCEPTION_PROLOG
mfspr r10,SPRN_DSISR
@@ -397,6 +401,7 @@ DataAccess:
/* Instruction access exception. */
. = 0x400
+ DO_KVM 0x400
InstructionAccess:
EXCEPTION_PROLOG
andis. r0,r9,0x4000 /* no pte found? */
@@ -413,6 +418,7 @@ InstructionAccess:
/* Alignment exception */
. = 0x600
+ DO_KVM 0x600
Alignment:
EXCEPTION_PROLOG
mfspr r4,SPRN_DAR
@@ -427,6 +433,7 @@ Alignment:
/* Floating-point unavailable */
. = 0x800
+ DO_KVM 0x800
FPUnavailable:
BEGIN_FTR_SECTION
/*
@@ -450,6 +457,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_FPU_UNAVAILABLE)
/* System call */
. = 0xc00
+ DO_KVM 0xc00
SystemCall:
EXCEPTION_PROLOG
EXC_XFER_EE_LITE(0xc00, DoSyscall)
@@ -467,9 +475,11 @@ SystemCall:
* by executing an altivec instruction.
*/
. = 0xf00
+ DO_KVM 0xf00
b PerformanceMonitor
. = 0xf20
+ DO_KVM 0xf20
b AltiVecUnavailable
/*
@@ -882,6 +892,10 @@ __secondary_start:
RFI
#endif /* CONFIG_SMP */
+#ifdef CONFIG_KVM_BOOK3S_HANDLER
+#include "../kvm/book3s_rmhandlers.S"
+#endif
+
/*
* Those generic dummy functions are kept for CPUs not
* included in CONFIG_6xx
diff --git a/arch/powerpc/kernel/head_64.S b/arch/powerpc/kernel/head_64.S
index bed9a29..844a44b 100644
--- a/arch/powerpc/kernel/head_64.S
+++ b/arch/powerpc/kernel/head_64.S
@@ -37,7 +37,7 @@
#include <asm/firmware.h>
#include <asm/page_64.h>
#include <asm/irqflags.h>
-#include <asm/kvm_book3s_64_asm.h>
+#include <asm/kvm_book3s_asm.h>
/* The physical memory is layed out such that the secondary processor
* spin code sits at 0x0000...0x00ff. On server, the vectors follow
@@ -169,7 +169,7 @@ exception_marker:
/* KVM trampoline code needs to be close to the interrupt handlers */
#ifdef CONFIG_KVM_BOOK3S_64_HANDLER
-#include "../kvm/book3s_64_rmhandlers.S"
+#include "../kvm/book3s_rmhandlers.S"
#endif
_GLOBAL(generic_secondary_thread_init)
diff --git a/arch/powerpc/kernel/ppc_ksyms.c b/arch/powerpc/kernel/ppc_ksyms.c
index ab3e392..bc9f39d 100644
--- a/arch/powerpc/kernel/ppc_ksyms.c
+++ b/arch/powerpc/kernel/ppc_ksyms.c
@@ -101,6 +101,10 @@ EXPORT_SYMBOL(pci_dram_offset);
EXPORT_SYMBOL(start_thread);
EXPORT_SYMBOL(kernel_thread);
+#ifndef CONFIG_BOOKE
+EXPORT_SYMBOL_GPL(cvt_df);
+EXPORT_SYMBOL_GPL(cvt_fd);
+#endif
EXPORT_SYMBOL(giveup_fpu);
#ifdef CONFIG_ALTIVEC
EXPORT_SYMBOL(giveup_altivec);
diff --git a/arch/powerpc/kvm/44x.c b/arch/powerpc/kvm/44x.c
index 689a57c..73c0a3f 100644
--- a/arch/powerpc/kvm/44x.c
+++ b/arch/powerpc/kvm/44x.c
@@ -147,7 +147,7 @@ static int __init kvmppc_44x_init(void)
if (r)
return r;
- return kvm_init(NULL, sizeof(struct kvmppc_vcpu_44x), THIS_MODULE);
+ return kvm_init(NULL, sizeof(struct kvmppc_vcpu_44x), 0, THIS_MODULE);
}
static void __exit kvmppc_44x_exit(void)
diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig
index 60624cc..b7baff7 100644
--- a/arch/powerpc/kvm/Kconfig
+++ b/arch/powerpc/kvm/Kconfig
@@ -22,12 +22,34 @@ config KVM
select ANON_INODES
select KVM_MMIO
+config KVM_BOOK3S_HANDLER
+ bool
+
+config KVM_BOOK3S_32_HANDLER
+ bool
+ select KVM_BOOK3S_HANDLER
+
config KVM_BOOK3S_64_HANDLER
bool
+ select KVM_BOOK3S_HANDLER
+
+config KVM_BOOK3S_32
+ tristate "KVM support for PowerPC book3s_32 processors"
+ depends on EXPERIMENTAL && PPC_BOOK3S_32 && !SMP && !PTE_64BIT
+ select KVM
+ select KVM_BOOK3S_32_HANDLER
+ ---help---
+ Support running unmodified book3s_32 guest kernels
+ in virtual machines on book3s_32 host processors.
+
+ This module provides access to the hardware capabilities through
+ a character device node named /dev/kvm.
+
+ If unsure, say N.
config KVM_BOOK3S_64
tristate "KVM support for PowerPC book3s_64 processors"
- depends on EXPERIMENTAL && PPC64
+ depends on EXPERIMENTAL && PPC_BOOK3S_64
select KVM
select KVM_BOOK3S_64_HANDLER
---help---
diff --git a/arch/powerpc/kvm/Makefile b/arch/powerpc/kvm/Makefile
index 56484d6..ff43606 100644
--- a/arch/powerpc/kvm/Makefile
+++ b/arch/powerpc/kvm/Makefile
@@ -14,7 +14,7 @@ CFLAGS_emulate.o := -I.
common-objs-y += powerpc.o emulate.o
obj-$(CONFIG_KVM_EXIT_TIMING) += timing.o
-obj-$(CONFIG_KVM_BOOK3S_64_HANDLER) += book3s_64_exports.o
+obj-$(CONFIG_KVM_BOOK3S_HANDLER) += book3s_exports.o
AFLAGS_booke_interrupts.o := -I$(obj)
@@ -40,17 +40,31 @@ kvm-objs-$(CONFIG_KVM_E500) := $(kvm-e500-objs)
kvm-book3s_64-objs := \
$(common-objs-y) \
+ fpu.o \
+ book3s_paired_singles.o \
book3s.o \
- book3s_64_emulate.o \
- book3s_64_interrupts.o \
+ book3s_emulate.o \
+ book3s_interrupts.o \
book3s_64_mmu_host.o \
book3s_64_mmu.o \
book3s_32_mmu.o
kvm-objs-$(CONFIG_KVM_BOOK3S_64) := $(kvm-book3s_64-objs)
+kvm-book3s_32-objs := \
+ $(common-objs-y) \
+ fpu.o \
+ book3s_paired_singles.o \
+ book3s.o \
+ book3s_emulate.o \
+ book3s_interrupts.o \
+ book3s_32_mmu_host.o \
+ book3s_32_mmu.o
+kvm-objs-$(CONFIG_KVM_BOOK3S_32) := $(kvm-book3s_32-objs)
+
kvm-objs := $(kvm-objs-m) $(kvm-objs-y)
obj-$(CONFIG_KVM_440) += kvm.o
obj-$(CONFIG_KVM_E500) += kvm.o
obj-$(CONFIG_KVM_BOOK3S_64) += kvm.o
+obj-$(CONFIG_KVM_BOOK3S_32) += kvm.o
diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c
index 604af29..b998abf 100644
--- a/arch/powerpc/kvm/book3s.c
+++ b/arch/powerpc/kvm/book3s.c
@@ -16,6 +16,7 @@
#include <linux/kvm_host.h>
#include <linux/err.h>
+#include <linux/slab.h>
#include <asm/reg.h>
#include <asm/cputable.h>
@@ -29,6 +30,7 @@
#include <linux/gfp.h>
#include <linux/sched.h>
#include <linux/vmalloc.h>
+#include <linux/highmem.h>
#define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU
@@ -36,7 +38,15 @@
/* #define EXIT_DEBUG_SIMPLE */
/* #define DEBUG_EXT */
-static void kvmppc_giveup_ext(struct kvm_vcpu *vcpu, ulong msr);
+static int kvmppc_handle_ext(struct kvm_vcpu *vcpu, unsigned int exit_nr,
+ ulong msr);
+
+/* Some compatibility defines */
+#ifdef CONFIG_PPC_BOOK3S_32
+#define MSR_USER32 MSR_USER
+#define MSR_USER64 MSR_USER
+#define HW_PAGE_SIZE PAGE_SIZE
+#endif
struct kvm_stats_debugfs_item debugfs_entries[] = {
{ "exits", VCPU_STAT(sum_exits) },
@@ -69,18 +79,26 @@ void kvmppc_core_load_guest_debugstate(struct kvm_vcpu *vcpu)
void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
{
- memcpy(get_paca()->kvm_slb, to_book3s(vcpu)->slb_shadow, sizeof(get_paca()->kvm_slb));
- memcpy(&get_paca()->shadow_vcpu, &to_book3s(vcpu)->shadow_vcpu,
+#ifdef CONFIG_PPC_BOOK3S_64
+ memcpy(to_svcpu(vcpu)->slb, to_book3s(vcpu)->slb_shadow, sizeof(to_svcpu(vcpu)->slb));
+ memcpy(&get_paca()->shadow_vcpu, to_book3s(vcpu)->shadow_vcpu,
sizeof(get_paca()->shadow_vcpu));
- get_paca()->kvm_slb_max = to_book3s(vcpu)->slb_shadow_max;
+ to_svcpu(vcpu)->slb_max = to_book3s(vcpu)->slb_shadow_max;
+#endif
+
+#ifdef CONFIG_PPC_BOOK3S_32
+ current->thread.kvm_shadow_vcpu = to_book3s(vcpu)->shadow_vcpu;
+#endif
}
void kvmppc_core_vcpu_put(struct kvm_vcpu *vcpu)
{
- memcpy(to_book3s(vcpu)->slb_shadow, get_paca()->kvm_slb, sizeof(get_paca()->kvm_slb));
- memcpy(&to_book3s(vcpu)->shadow_vcpu, &get_paca()->shadow_vcpu,
+#ifdef CONFIG_PPC_BOOK3S_64
+ memcpy(to_book3s(vcpu)->slb_shadow, to_svcpu(vcpu)->slb, sizeof(to_svcpu(vcpu)->slb));
+ memcpy(to_book3s(vcpu)->shadow_vcpu, &get_paca()->shadow_vcpu,
sizeof(get_paca()->shadow_vcpu));
- to_book3s(vcpu)->slb_shadow_max = get_paca()->kvm_slb_max;
+ to_book3s(vcpu)->slb_shadow_max = to_svcpu(vcpu)->slb_max;
+#endif
kvmppc_giveup_ext(vcpu, MSR_FP);
kvmppc_giveup_ext(vcpu, MSR_VEC);
@@ -131,18 +149,22 @@ void kvmppc_set_msr(struct kvm_vcpu *vcpu, u64 msr)
}
}
- if (((vcpu->arch.msr & (MSR_IR|MSR_DR)) != (old_msr & (MSR_IR|MSR_DR))) ||
- (vcpu->arch.msr & MSR_PR) != (old_msr & MSR_PR)) {
+ if ((vcpu->arch.msr & (MSR_PR|MSR_IR|MSR_DR)) !=
+ (old_msr & (MSR_PR|MSR_IR|MSR_DR))) {
kvmppc_mmu_flush_segments(vcpu);
- kvmppc_mmu_map_segment(vcpu, vcpu->arch.pc);
+ kvmppc_mmu_map_segment(vcpu, kvmppc_get_pc(vcpu));
}
+
+ /* Preload FPU if it's enabled */
+ if (vcpu->arch.msr & MSR_FP)
+ kvmppc_handle_ext(vcpu, BOOK3S_INTERRUPT_FP_UNAVAIL, MSR_FP);
}
void kvmppc_inject_interrupt(struct kvm_vcpu *vcpu, int vec, u64 flags)
{
- vcpu->arch.srr0 = vcpu->arch.pc;
+ vcpu->arch.srr0 = kvmppc_get_pc(vcpu);
vcpu->arch.srr1 = vcpu->arch.msr | flags;
- vcpu->arch.pc = to_book3s(vcpu)->hior + vec;
+ kvmppc_set_pc(vcpu, to_book3s(vcpu)->hior + vec);
vcpu->arch.mmu.reset_msr(vcpu);
}
@@ -218,6 +240,12 @@ void kvmppc_core_queue_external(struct kvm_vcpu *vcpu,
kvmppc_book3s_queue_irqprio(vcpu, BOOK3S_INTERRUPT_EXTERNAL);
}
+void kvmppc_core_dequeue_external(struct kvm_vcpu *vcpu,
+ struct kvm_interrupt *irq)
+{
+ kvmppc_book3s_dequeue_irqprio(vcpu, BOOK3S_INTERRUPT_EXTERNAL);
+}
+
int kvmppc_book3s_irqprio_deliver(struct kvm_vcpu *vcpu, unsigned int priority)
{
int deliver = 1;
@@ -302,7 +330,7 @@ void kvmppc_core_deliver_interrupts(struct kvm_vcpu *vcpu)
printk(KERN_EMERG "KVM: Check pending: %lx\n", vcpu->arch.pending_exceptions);
#endif
priority = __ffs(*pending);
- while (priority <= (sizeof(unsigned int) * 8)) {
+ while (priority < BOOK3S_IRQPRIO_MAX) {
if (kvmppc_book3s_irqprio_deliver(vcpu, priority) &&
(priority != BOOK3S_IRQPRIO_DECREMENTER)) {
/* DEC interrupts get cleared by mtdec */
@@ -318,13 +346,18 @@ void kvmppc_core_deliver_interrupts(struct kvm_vcpu *vcpu)
void kvmppc_set_pvr(struct kvm_vcpu *vcpu, u32 pvr)
{
+ u32 host_pvr;
+
vcpu->arch.hflags &= ~BOOK3S_HFLAG_SLB;
vcpu->arch.pvr = pvr;
+#ifdef CONFIG_PPC_BOOK3S_64
if ((pvr >= 0x330000) && (pvr < 0x70330000)) {
kvmppc_mmu_book3s_64_init(vcpu);
to_book3s(vcpu)->hior = 0xfff00000;
to_book3s(vcpu)->msr_mask = 0xffffffffffffffffULL;
- } else {
+ } else
+#endif
+ {
kvmppc_mmu_book3s_32_init(vcpu);
to_book3s(vcpu)->hior = 0;
to_book3s(vcpu)->msr_mask = 0xffffffffULL;
@@ -337,6 +370,32 @@ void kvmppc_set_pvr(struct kvm_vcpu *vcpu, u32 pvr)
!strcmp(cur_cpu_spec->platform, "ppc970"))
vcpu->arch.hflags |= BOOK3S_HFLAG_DCBZ32;
+ /* Cell performs badly if MSR_FEx are set. So let's hope nobody
+ really needs them in a VM on Cell and force disable them. */
+ if (!strcmp(cur_cpu_spec->platform, "ppc-cell-be"))
+ to_book3s(vcpu)->msr_mask &= ~(MSR_FE0 | MSR_FE1);
+
+#ifdef CONFIG_PPC_BOOK3S_32
+ /* 32 bit Book3S always has 32 byte dcbz */
+ vcpu->arch.hflags |= BOOK3S_HFLAG_DCBZ32;
+#endif
+
+ /* On some CPUs we can execute paired single operations natively */
+ asm ( "mfpvr %0" : "=r"(host_pvr));
+ switch (host_pvr) {
+ case 0x00080200: /* lonestar 2.0 */
+ case 0x00088202: /* lonestar 2.2 */
+ case 0x70000100: /* gekko 1.0 */
+ case 0x00080100: /* gekko 2.0 */
+ case 0x00083203: /* gekko 2.3a */
+ case 0x00083213: /* gekko 2.3b */
+ case 0x00083204: /* gekko 2.4 */
+ case 0x00083214: /* gekko 2.4e (8SE) - retail HW2 */
+ case 0x00087200: /* broadway */
+ vcpu->arch.hflags |= BOOK3S_HFLAG_NATIVE_PS;
+ /* Enable HID2.PSE - in case we need it later */
+ mtspr(SPRN_HID2_GEKKO, mfspr(SPRN_HID2_GEKKO) | (1 << 29));
+ }
}
/* Book3s_32 CPUs always have 32 bytes cache line size, which Linux assumes. To
@@ -350,34 +409,29 @@ void kvmppc_set_pvr(struct kvm_vcpu *vcpu, u32 pvr)
*/
static void kvmppc_patch_dcbz(struct kvm_vcpu *vcpu, struct kvmppc_pte *pte)
{
- bool touched = false;
- hva_t hpage;
+ struct page *hpage;
+ u64 hpage_offset;
u32 *page;
int i;
- hpage = gfn_to_hva(vcpu->kvm, pte->raddr >> PAGE_SHIFT);
- if (kvm_is_error_hva(hpage))
+ hpage = gfn_to_page(vcpu->kvm, pte->raddr >> PAGE_SHIFT);
+ if (is_error_page(hpage))
return;
- hpage |= pte->raddr & ~PAGE_MASK;
- hpage &= ~0xFFFULL;
-
- page = vmalloc(HW_PAGE_SIZE);
-
- if (copy_from_user(page, (void __user *)hpage, HW_PAGE_SIZE))
- goto out;
+ hpage_offset = pte->raddr & ~PAGE_MASK;
+ hpage_offset &= ~0xFFFULL;
+ hpage_offset /= 4;
- for (i=0; i < HW_PAGE_SIZE / 4; i++)
- if ((page[i] & 0xff0007ff) == INS_DCBZ) {
- page[i] &= 0xfffffff7; // reserved instruction, so we trap
- touched = true;
- }
+ get_page(hpage);
+ page = kmap_atomic(hpage, KM_USER0);
- if (touched)
- copy_to_user((void __user *)hpage, page, HW_PAGE_SIZE);
+ /* patch dcbz into reserved instruction, so we trap */
+ for (i=hpage_offset; i < hpage_offset + (HW_PAGE_SIZE / 4); i++)
+ if ((page[i] & 0xff0007ff) == INS_DCBZ)
+ page[i] &= 0xfffffff7;
-out:
- vfree(page);
+ kunmap_atomic(page, KM_USER0);
+ put_page(hpage);
}
static int kvmppc_xlate(struct kvm_vcpu *vcpu, ulong eaddr, bool data,
@@ -391,15 +445,7 @@ static int kvmppc_xlate(struct kvm_vcpu *vcpu, ulong eaddr, bool data,
} else {
pte->eaddr = eaddr;
pte->raddr = eaddr & 0xffffffff;
- pte->vpage = eaddr >> 12;
- switch (vcpu->arch.msr & (MSR_DR|MSR_IR)) {
- case 0:
- pte->vpage |= VSID_REAL;
- case MSR_DR:
- pte->vpage |= VSID_REAL_DR;
- case MSR_IR:
- pte->vpage |= VSID_REAL_IR;
- }
+ pte->vpage = VSID_REAL | eaddr >> 12;
pte->may_read = true;
pte->may_write = true;
pte->may_execute = true;
@@ -434,55 +480,55 @@ err:
return kvmppc_bad_hva();
}
-int kvmppc_st(struct kvm_vcpu *vcpu, ulong eaddr, int size, void *ptr)
+int kvmppc_st(struct kvm_vcpu *vcpu, ulong *eaddr, int size, void *ptr,
+ bool data)
{
struct kvmppc_pte pte;
- hva_t hva = eaddr;
vcpu->stat.st++;
- if (kvmppc_xlate(vcpu, eaddr, false, &pte))
- goto err;
+ if (kvmppc_xlate(vcpu, *eaddr, data, &pte))
+ return -ENOENT;
- hva = kvmppc_pte_to_hva(vcpu, &pte, false);
- if (kvm_is_error_hva(hva))
- goto err;
+ *eaddr = pte.raddr;
- if (copy_to_user((void __user *)hva, ptr, size)) {
- printk(KERN_INFO "kvmppc_st at 0x%lx failed\n", hva);
- goto err;
- }
+ if (!pte.may_write)
+ return -EPERM;
- return 0;
+ if (kvm_write_guest(vcpu->kvm, pte.raddr, ptr, size))
+ return EMULATE_DO_MMIO;
-err:
- return -ENOENT;
+ return EMULATE_DONE;
}
-int kvmppc_ld(struct kvm_vcpu *vcpu, ulong eaddr, int size, void *ptr,
+int kvmppc_ld(struct kvm_vcpu *vcpu, ulong *eaddr, int size, void *ptr,
bool data)
{
struct kvmppc_pte pte;
- hva_t hva = eaddr;
+ hva_t hva = *eaddr;
vcpu->stat.ld++;
- if (kvmppc_xlate(vcpu, eaddr, data, &pte))
- goto err;
+ if (kvmppc_xlate(vcpu, *eaddr, data, &pte))
+ goto nopte;
+
+ *eaddr = pte.raddr;
hva = kvmppc_pte_to_hva(vcpu, &pte, true);
if (kvm_is_error_hva(hva))
- goto err;
+ goto mmio;
if (copy_from_user(ptr, (void __user *)hva, size)) {
printk(KERN_INFO "kvmppc_ld at 0x%lx failed\n", hva);
- goto err;
+ goto mmio;
}
- return 0;
+ return EMULATE_DONE;
-err:
+nopte:
return -ENOENT;
+mmio:
+ return EMULATE_DO_MMIO;
}
static int kvmppc_visible_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
@@ -499,12 +545,11 @@ int kvmppc_handle_pagefault(struct kvm_run *run, struct kvm_vcpu *vcpu,
int page_found = 0;
struct kvmppc_pte pte;
bool is_mmio = false;
+ bool dr = (vcpu->arch.msr & MSR_DR) ? true : false;
+ bool ir = (vcpu->arch.msr & MSR_IR) ? true : false;
+ u64 vsid;
- if ( vec == BOOK3S_INTERRUPT_DATA_STORAGE ) {
- relocated = (vcpu->arch.msr & MSR_DR);
- } else {
- relocated = (vcpu->arch.msr & MSR_IR);
- }
+ relocated = data ? dr : ir;
/* Resolve real address if translation turned on */
if (relocated) {
@@ -516,14 +561,25 @@ int kvmppc_handle_pagefault(struct kvm_run *run, struct kvm_vcpu *vcpu,
pte.raddr = eaddr & 0xffffffff;
pte.eaddr = eaddr;
pte.vpage = eaddr >> 12;
- switch (vcpu->arch.msr & (MSR_DR|MSR_IR)) {
- case 0:
- pte.vpage |= VSID_REAL;
- case MSR_DR:
- pte.vpage |= VSID_REAL_DR;
- case MSR_IR:
- pte.vpage |= VSID_REAL_IR;
- }
+ }
+
+ switch (vcpu->arch.msr & (MSR_DR|MSR_IR)) {
+ case 0:
+ pte.vpage |= ((u64)VSID_REAL << (SID_SHIFT - 12));
+ break;
+ case MSR_DR:
+ case MSR_IR:
+ vcpu->arch.mmu.esid_to_vsid(vcpu, eaddr >> SID_SHIFT, &vsid);
+
+ if ((vcpu->arch.msr & (MSR_DR|MSR_IR)) == MSR_DR)
+ pte.vpage |= ((u64)VSID_REAL_DR << (SID_SHIFT - 12));
+ else
+ pte.vpage |= ((u64)VSID_REAL_IR << (SID_SHIFT - 12));
+ pte.vpage |= vsid;
+
+ if (vsid == -1)
+ page_found = -EINVAL;
+ break;
}
if (vcpu->arch.mmu.is_dcbz32(vcpu) &&
@@ -538,20 +594,20 @@ int kvmppc_handle_pagefault(struct kvm_run *run, struct kvm_vcpu *vcpu,
if (page_found == -ENOENT) {
/* Page not found in guest PTE entries */
- vcpu->arch.dear = vcpu->arch.fault_dear;
- to_book3s(vcpu)->dsisr = vcpu->arch.fault_dsisr;
- vcpu->arch.msr |= (vcpu->arch.shadow_srr1 & 0x00000000f8000000ULL);
+ vcpu->arch.dear = kvmppc_get_fault_dar(vcpu);
+ to_book3s(vcpu)->dsisr = to_svcpu(vcpu)->fault_dsisr;
+ vcpu->arch.msr |= (to_svcpu(vcpu)->shadow_srr1 & 0x00000000f8000000ULL);
kvmppc_book3s_queue_irqprio(vcpu, vec);
} else if (page_found == -EPERM) {
/* Storage protection */
- vcpu->arch.dear = vcpu->arch.fault_dear;
- to_book3s(vcpu)->dsisr = vcpu->arch.fault_dsisr & ~DSISR_NOHPTE;
+ vcpu->arch.dear = kvmppc_get_fault_dar(vcpu);
+ to_book3s(vcpu)->dsisr = to_svcpu(vcpu)->fault_dsisr & ~DSISR_NOHPTE;
to_book3s(vcpu)->dsisr |= DSISR_PROTFAULT;
- vcpu->arch.msr |= (vcpu->arch.shadow_srr1 & 0x00000000f8000000ULL);
+ vcpu->arch.msr |= (to_svcpu(vcpu)->shadow_srr1 & 0x00000000f8000000ULL);
kvmppc_book3s_queue_irqprio(vcpu, vec);
} else if (page_found == -EINVAL) {
/* Page not found in guest SLB */
- vcpu->arch.dear = vcpu->arch.fault_dear;
+ vcpu->arch.dear = kvmppc_get_fault_dar(vcpu);
kvmppc_book3s_queue_irqprio(vcpu, vec + 0x80);
} else if (!is_mmio &&
kvmppc_visible_gfn(vcpu, pte.raddr >> PAGE_SHIFT)) {
@@ -583,11 +639,13 @@ static inline int get_fpr_index(int i)
}
/* Give up external provider (FPU, Altivec, VSX) */
-static void kvmppc_giveup_ext(struct kvm_vcpu *vcpu, ulong msr)
+void kvmppc_giveup_ext(struct kvm_vcpu *vcpu, ulong msr)
{
struct thread_struct *t = &current->thread;
u64 *vcpu_fpr = vcpu->arch.fpr;
+#ifdef CONFIG_VSX
u64 *vcpu_vsx = vcpu->arch.vsr;
+#endif
u64 *thread_fpr = (u64*)t->fpr;
int i;
@@ -629,21 +687,65 @@ static void kvmppc_giveup_ext(struct kvm_vcpu *vcpu, ulong msr)
kvmppc_recalc_shadow_msr(vcpu);
}
+static int kvmppc_read_inst(struct kvm_vcpu *vcpu)
+{
+ ulong srr0 = kvmppc_get_pc(vcpu);
+ u32 last_inst = kvmppc_get_last_inst(vcpu);
+ int ret;
+
+ ret = kvmppc_ld(vcpu, &srr0, sizeof(u32), &last_inst, false);
+ if (ret == -ENOENT) {
+ vcpu->arch.msr = kvmppc_set_field(vcpu->arch.msr, 33, 33, 1);
+ vcpu->arch.msr = kvmppc_set_field(vcpu->arch.msr, 34, 36, 0);
+ vcpu->arch.msr = kvmppc_set_field(vcpu->arch.msr, 42, 47, 0);
+ kvmppc_book3s_queue_irqprio(vcpu, BOOK3S_INTERRUPT_INST_STORAGE);
+ return EMULATE_AGAIN;
+ }
+
+ return EMULATE_DONE;
+}
+
+static int kvmppc_check_ext(struct kvm_vcpu *vcpu, unsigned int exit_nr)
+{
+
+ /* Need to do paired single emulation? */
+ if (!(vcpu->arch.hflags & BOOK3S_HFLAG_PAIRED_SINGLE))
+ return EMULATE_DONE;
+
+ /* Read out the instruction */
+ if (kvmppc_read_inst(vcpu) == EMULATE_DONE)
+ /* Need to emulate */
+ return EMULATE_FAIL;
+
+ return EMULATE_AGAIN;
+}
+
/* Handle external providers (FPU, Altivec, VSX) */
static int kvmppc_handle_ext(struct kvm_vcpu *vcpu, unsigned int exit_nr,
ulong msr)
{
struct thread_struct *t = &current->thread;
u64 *vcpu_fpr = vcpu->arch.fpr;
+#ifdef CONFIG_VSX
u64 *vcpu_vsx = vcpu->arch.vsr;
+#endif
u64 *thread_fpr = (u64*)t->fpr;
int i;
+ /* When we have paired singles, we emulate in software */
+ if (vcpu->arch.hflags & BOOK3S_HFLAG_PAIRED_SINGLE)
+ return RESUME_GUEST;
+
if (!(vcpu->arch.msr & msr)) {
kvmppc_book3s_queue_irqprio(vcpu, exit_nr);
return RESUME_GUEST;
}
+ /* We already own the ext */
+ if (vcpu->arch.guest_owned_ext & msr) {
+ return RESUME_GUEST;
+ }
+
#ifdef DEBUG_EXT
printk(KERN_INFO "Loading up ext 0x%lx\n", msr);
#endif
@@ -696,21 +798,33 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
run->ready_for_interrupt_injection = 1;
#ifdef EXIT_DEBUG
printk(KERN_EMERG "exit_nr=0x%x | pc=0x%lx | dar=0x%lx | dec=0x%x | msr=0x%lx\n",
- exit_nr, vcpu->arch.pc, vcpu->arch.fault_dear,
- kvmppc_get_dec(vcpu), vcpu->arch.msr);
+ exit_nr, kvmppc_get_pc(vcpu), kvmppc_get_fault_dar(vcpu),
+ kvmppc_get_dec(vcpu), to_svcpu(vcpu)->shadow_srr1);
#elif defined (EXIT_DEBUG_SIMPLE)
if ((exit_nr != 0x900) && (exit_nr != 0x500))
printk(KERN_EMERG "exit_nr=0x%x | pc=0x%lx | dar=0x%lx | msr=0x%lx\n",
- exit_nr, vcpu->arch.pc, vcpu->arch.fault_dear,
+ exit_nr, kvmppc_get_pc(vcpu), kvmppc_get_fault_dar(vcpu),
vcpu->arch.msr);
#endif
kvm_resched(vcpu);
switch (exit_nr) {
case BOOK3S_INTERRUPT_INST_STORAGE:
vcpu->stat.pf_instruc++;
+
+#ifdef CONFIG_PPC_BOOK3S_32
+ /* We set segments as unused segments when invalidating them. So
+ * treat the respective fault as segment fault. */
+ if (to_svcpu(vcpu)->sr[kvmppc_get_pc(vcpu) >> SID_SHIFT]
+ == SR_INVALID) {
+ kvmppc_mmu_map_segment(vcpu, kvmppc_get_pc(vcpu));
+ r = RESUME_GUEST;
+ break;
+ }
+#endif
+
/* only care about PTEG not found errors, but leave NX alone */
- if (vcpu->arch.shadow_srr1 & 0x40000000) {
- r = kvmppc_handle_pagefault(run, vcpu, vcpu->arch.pc, exit_nr);
+ if (to_svcpu(vcpu)->shadow_srr1 & 0x40000000) {
+ r = kvmppc_handle_pagefault(run, vcpu, kvmppc_get_pc(vcpu), exit_nr);
vcpu->stat.sp_instruc++;
} else if (vcpu->arch.mmu.is_dcbz32(vcpu) &&
(!(vcpu->arch.hflags & BOOK3S_HFLAG_DCBZ32))) {
@@ -719,37 +833,52 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
* so we can't use the NX bit inside the guest. Let's cross our fingers,
* that no guest that needs the dcbz hack does NX.
*/
- kvmppc_mmu_pte_flush(vcpu, vcpu->arch.pc, ~0xFFFULL);
+ kvmppc_mmu_pte_flush(vcpu, kvmppc_get_pc(vcpu), ~0xFFFUL);
+ r = RESUME_GUEST;
} else {
- vcpu->arch.msr |= vcpu->arch.shadow_srr1 & 0x58000000;
+ vcpu->arch.msr |= to_svcpu(vcpu)->shadow_srr1 & 0x58000000;
kvmppc_book3s_queue_irqprio(vcpu, exit_nr);
- kvmppc_mmu_pte_flush(vcpu, vcpu->arch.pc, ~0xFFFULL);
+ kvmppc_mmu_pte_flush(vcpu, kvmppc_get_pc(vcpu), ~0xFFFUL);
r = RESUME_GUEST;
}
break;
case BOOK3S_INTERRUPT_DATA_STORAGE:
+ {
+ ulong dar = kvmppc_get_fault_dar(vcpu);
vcpu->stat.pf_storage++;
+
+#ifdef CONFIG_PPC_BOOK3S_32
+ /* We set segments as unused segments when invalidating them. So
+ * treat the respective fault as segment fault. */
+ if ((to_svcpu(vcpu)->sr[dar >> SID_SHIFT]) == SR_INVALID) {
+ kvmppc_mmu_map_segment(vcpu, dar);
+ r = RESUME_GUEST;
+ break;
+ }
+#endif
+
/* The only case we need to handle is missing shadow PTEs */
- if (vcpu->arch.fault_dsisr & DSISR_NOHPTE) {
- r = kvmppc_handle_pagefault(run, vcpu, vcpu->arch.fault_dear, exit_nr);
+ if (to_svcpu(vcpu)->fault_dsisr & DSISR_NOHPTE) {
+ r = kvmppc_handle_pagefault(run, vcpu, dar, exit_nr);
} else {
- vcpu->arch.dear = vcpu->arch.fault_dear;
- to_book3s(vcpu)->dsisr = vcpu->arch.fault_dsisr;
+ vcpu->arch.dear = dar;
+ to_book3s(vcpu)->dsisr = to_svcpu(vcpu)->fault_dsisr;
kvmppc_book3s_queue_irqprio(vcpu, exit_nr);
- kvmppc_mmu_pte_flush(vcpu, vcpu->arch.dear, ~0xFFFULL);
+ kvmppc_mmu_pte_flush(vcpu, vcpu->arch.dear, ~0xFFFUL);
r = RESUME_GUEST;
}
break;
+ }
case BOOK3S_INTERRUPT_DATA_SEGMENT:
- if (kvmppc_mmu_map_segment(vcpu, vcpu->arch.fault_dear) < 0) {
- vcpu->arch.dear = vcpu->arch.fault_dear;
+ if (kvmppc_mmu_map_segment(vcpu, kvmppc_get_fault_dar(vcpu)) < 0) {
+ vcpu->arch.dear = kvmppc_get_fault_dar(vcpu);
kvmppc_book3s_queue_irqprio(vcpu,
BOOK3S_INTERRUPT_DATA_SEGMENT);
}
r = RESUME_GUEST;
break;
case BOOK3S_INTERRUPT_INST_SEGMENT:
- if (kvmppc_mmu_map_segment(vcpu, vcpu->arch.pc) < 0) {
+ if (kvmppc_mmu_map_segment(vcpu, kvmppc_get_pc(vcpu)) < 0) {
kvmppc_book3s_queue_irqprio(vcpu,
BOOK3S_INTERRUPT_INST_SEGMENT);
}
@@ -764,18 +893,22 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
vcpu->stat.ext_intr_exits++;
r = RESUME_GUEST;
break;
+ case BOOK3S_INTERRUPT_PERFMON:
+ r = RESUME_GUEST;
+ break;
case BOOK3S_INTERRUPT_PROGRAM:
{
enum emulation_result er;
ulong flags;
- flags = vcpu->arch.shadow_srr1 & 0x1f0000ull;
+program_interrupt:
+ flags = to_svcpu(vcpu)->shadow_srr1 & 0x1f0000ull;
if (vcpu->arch.msr & MSR_PR) {
#ifdef EXIT_DEBUG
- printk(KERN_INFO "Userspace triggered 0x700 exception at 0x%lx (0x%x)\n", vcpu->arch.pc, vcpu->arch.last_inst);
+ printk(KERN_INFO "Userspace triggered 0x700 exception at 0x%lx (0x%x)\n", kvmppc_get_pc(vcpu), kvmppc_get_last_inst(vcpu));
#endif
- if ((vcpu->arch.last_inst & 0xff0007ff) !=
+ if ((kvmppc_get_last_inst(vcpu) & 0xff0007ff) !=
(INS_DCBZ & 0xfffffff7)) {
kvmppc_core_queue_program(vcpu, flags);
r = RESUME_GUEST;
@@ -789,33 +922,80 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
case EMULATE_DONE:
r = RESUME_GUEST_NV;
break;
+ case EMULATE_AGAIN:
+ r = RESUME_GUEST;
+ break;
case EMULATE_FAIL:
printk(KERN_CRIT "%s: emulation at %lx failed (%08x)\n",
- __func__, vcpu->arch.pc, vcpu->arch.last_inst);
+ __func__, kvmppc_get_pc(vcpu), kvmppc_get_last_inst(vcpu));
kvmppc_core_queue_program(vcpu, flags);
r = RESUME_GUEST;
break;
+ case EMULATE_DO_MMIO:
+ run->exit_reason = KVM_EXIT_MMIO;
+ r = RESUME_HOST_NV;
+ break;
default:
BUG();
}
break;
}
case BOOK3S_INTERRUPT_SYSCALL:
-#ifdef EXIT_DEBUG
- printk(KERN_INFO "Syscall Nr %d\n", (int)kvmppc_get_gpr(vcpu, 0));
-#endif
- vcpu->stat.syscall_exits++;
- kvmppc_book3s_queue_irqprio(vcpu, exit_nr);
- r = RESUME_GUEST;
+ // XXX make user settable
+ if (vcpu->arch.osi_enabled &&
+ (((u32)kvmppc_get_gpr(vcpu, 3)) == OSI_SC_MAGIC_R3) &&
+ (((u32)kvmppc_get_gpr(vcpu, 4)) == OSI_SC_MAGIC_R4)) {
+ u64 *gprs = run->osi.gprs;
+ int i;
+
+ run->exit_reason = KVM_EXIT_OSI;
+ for (i = 0; i < 32; i++)
+ gprs[i] = kvmppc_get_gpr(vcpu, i);
+ vcpu->arch.osi_needed = 1;
+ r = RESUME_HOST_NV;
+
+ } else {
+ vcpu->stat.syscall_exits++;
+ kvmppc_book3s_queue_irqprio(vcpu, exit_nr);
+ r = RESUME_GUEST;
+ }
break;
case BOOK3S_INTERRUPT_FP_UNAVAIL:
- r = kvmppc_handle_ext(vcpu, exit_nr, MSR_FP);
- break;
case BOOK3S_INTERRUPT_ALTIVEC:
- r = kvmppc_handle_ext(vcpu, exit_nr, MSR_VEC);
- break;
case BOOK3S_INTERRUPT_VSX:
- r = kvmppc_handle_ext(vcpu, exit_nr, MSR_VSX);
+ {
+ int ext_msr = 0;
+
+ switch (exit_nr) {
+ case BOOK3S_INTERRUPT_FP_UNAVAIL: ext_msr = MSR_FP; break;
+ case BOOK3S_INTERRUPT_ALTIVEC: ext_msr = MSR_VEC; break;
+ case BOOK3S_INTERRUPT_VSX: ext_msr = MSR_VSX; break;
+ }
+
+ switch (kvmppc_check_ext(vcpu, exit_nr)) {
+ case EMULATE_DONE:
+ /* everything ok - let's enable the ext */
+ r = kvmppc_handle_ext(vcpu, exit_nr, ext_msr);
+ break;
+ case EMULATE_FAIL:
+ /* we need to emulate this instruction */
+ goto program_interrupt;
+ break;
+ default:
+ /* nothing to worry about - go again */
+ break;
+ }
+ break;
+ }
+ case BOOK3S_INTERRUPT_ALIGNMENT:
+ if (kvmppc_read_inst(vcpu) == EMULATE_DONE) {
+ to_book3s(vcpu)->dsisr = kvmppc_alignment_dsisr(vcpu,
+ kvmppc_get_last_inst(vcpu));
+ vcpu->arch.dear = kvmppc_alignment_dar(vcpu,
+ kvmppc_get_last_inst(vcpu));
+ kvmppc_book3s_queue_irqprio(vcpu, exit_nr);
+ }
+ r = RESUME_GUEST;
break;
case BOOK3S_INTERRUPT_MACHINE_CHECK:
case BOOK3S_INTERRUPT_TRACE:
@@ -825,7 +1005,7 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
default:
/* Ugh - bork here! What did we get? */
printk(KERN_EMERG "exit_nr=0x%x | pc=0x%lx | msr=0x%lx\n",
- exit_nr, vcpu->arch.pc, vcpu->arch.shadow_srr1);
+ exit_nr, kvmppc_get_pc(vcpu), to_svcpu(vcpu)->shadow_srr1);
r = RESUME_HOST;
BUG();
break;
@@ -852,7 +1032,7 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
}
#ifdef EXIT_DEBUG
- printk(KERN_EMERG "KVM exit: vcpu=0x%p pc=0x%lx r=0x%x\n", vcpu, vcpu->arch.pc, r);
+ printk(KERN_EMERG "KVM exit: vcpu=0x%p pc=0x%lx r=0x%x\n", vcpu, kvmppc_get_pc(vcpu), r);
#endif
return r;
@@ -867,10 +1047,12 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
{
int i;
- regs->pc = vcpu->arch.pc;
+ vcpu_load(vcpu);
+
+ regs->pc = kvmppc_get_pc(vcpu);
regs->cr = kvmppc_get_cr(vcpu);
- regs->ctr = vcpu->arch.ctr;
- regs->lr = vcpu->arch.lr;
+ regs->ctr = kvmppc_get_ctr(vcpu);
+ regs->lr = kvmppc_get_lr(vcpu);
regs->xer = kvmppc_get_xer(vcpu);
regs->msr = vcpu->arch.msr;
regs->srr0 = vcpu->arch.srr0;
@@ -887,6 +1069,8 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
for (i = 0; i < ARRAY_SIZE(regs->gpr); i++)
regs->gpr[i] = kvmppc_get_gpr(vcpu, i);
+ vcpu_put(vcpu);
+
return 0;
}
@@ -894,10 +1078,12 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
{
int i;
- vcpu->arch.pc = regs->pc;
+ vcpu_load(vcpu);
+
+ kvmppc_set_pc(vcpu, regs->pc);
kvmppc_set_cr(vcpu, regs->cr);
- vcpu->arch.ctr = regs->ctr;
- vcpu->arch.lr = regs->lr;
+ kvmppc_set_ctr(vcpu, regs->ctr);
+ kvmppc_set_lr(vcpu, regs->lr);
kvmppc_set_xer(vcpu, regs->xer);
kvmppc_set_msr(vcpu, regs->msr);
vcpu->arch.srr0 = regs->srr0;
@@ -913,6 +1099,8 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
for (i = 0; i < ARRAY_SIZE(regs->gpr); i++)
kvmppc_set_gpr(vcpu, i, regs->gpr[i]);
+ vcpu_put(vcpu);
+
return 0;
}
@@ -922,6 +1110,8 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu);
int i;
+ vcpu_load(vcpu);
+
sregs->pvr = vcpu->arch.pvr;
sregs->u.s.sdr1 = to_book3s(vcpu)->sdr1;
@@ -940,6 +1130,9 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
sregs->u.s.ppc32.dbat[i] = vcpu3s->dbat[i].raw;
}
}
+
+ vcpu_put(vcpu);
+
return 0;
}
@@ -949,6 +1142,8 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu);
int i;
+ vcpu_load(vcpu);
+
kvmppc_set_pvr(vcpu, sregs->pvr);
vcpu3s->sdr1 = sregs->u.s.sdr1;
@@ -975,6 +1170,9 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
/* Flush the MMU after messing with the segments */
kvmppc_mmu_pte_flush(vcpu, 0, 0);
+
+ vcpu_put(vcpu);
+
return 0;
}
@@ -1042,24 +1240,33 @@ struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id)
{
struct kvmppc_vcpu_book3s *vcpu_book3s;
struct kvm_vcpu *vcpu;
- int err;
+ int err = -ENOMEM;
- vcpu_book3s = (struct kvmppc_vcpu_book3s *)__get_free_pages( GFP_KERNEL | __GFP_ZERO,
- get_order(sizeof(struct kvmppc_vcpu_book3s)));
- if (!vcpu_book3s) {
- err = -ENOMEM;
+ vcpu_book3s = vmalloc(sizeof(struct kvmppc_vcpu_book3s));
+ if (!vcpu_book3s)
goto out;
- }
+
+ memset(vcpu_book3s, 0, sizeof(struct kvmppc_vcpu_book3s));
+
+ vcpu_book3s->shadow_vcpu = (struct kvmppc_book3s_shadow_vcpu *)
+ kzalloc(sizeof(*vcpu_book3s->shadow_vcpu), GFP_KERNEL);
+ if (!vcpu_book3s->shadow_vcpu)
+ goto free_vcpu;
vcpu = &vcpu_book3s->vcpu;
err = kvm_vcpu_init(vcpu, kvm, id);
if (err)
- goto free_vcpu;
+ goto free_shadow_vcpu;
vcpu->arch.host_retip = kvm_return_point;
vcpu->arch.host_msr = mfmsr();
+#ifdef CONFIG_PPC_BOOK3S_64
/* default to book3s_64 (970fx) */
vcpu->arch.pvr = 0x3C0301;
+#else
+ /* default to book3s_32 (750) */
+ vcpu->arch.pvr = 0x84202;
+#endif
kvmppc_set_pvr(vcpu, vcpu->arch.pvr);
vcpu_book3s->slb_nr = 64;
@@ -1067,23 +1274,24 @@ struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id)
vcpu->arch.trampoline_lowmem = kvmppc_trampoline_lowmem;
vcpu->arch.trampoline_enter = kvmppc_trampoline_enter;
vcpu->arch.highmem_handler = (ulong)kvmppc_handler_highmem;
+#ifdef CONFIG_PPC_BOOK3S_64
vcpu->arch.rmcall = *(ulong*)kvmppc_rmcall;
+#else
+ vcpu->arch.rmcall = (ulong)kvmppc_rmcall;
+#endif
vcpu->arch.shadow_msr = MSR_USER64;
- err = __init_new_context();
+ err = kvmppc_mmu_init(vcpu);
if (err < 0)
- goto free_vcpu;
- vcpu_book3s->context_id = err;
-
- vcpu_book3s->vsid_max = ((vcpu_book3s->context_id + 1) << USER_ESID_BITS) - 1;
- vcpu_book3s->vsid_first = vcpu_book3s->context_id << USER_ESID_BITS;
- vcpu_book3s->vsid_next = vcpu_book3s->vsid_first;
+ goto free_shadow_vcpu;
return vcpu;
+free_shadow_vcpu:
+ kfree(vcpu_book3s->shadow_vcpu);
free_vcpu:
- free_pages((long)vcpu_book3s, get_order(sizeof(struct kvmppc_vcpu_book3s)));
+ vfree(vcpu_book3s);
out:
return ERR_PTR(err);
}
@@ -1092,9 +1300,9 @@ void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu)
{
struct kvmppc_vcpu_book3s *vcpu_book3s = to_book3s(vcpu);
- __destroy_context(vcpu_book3s->context_id);
kvm_vcpu_uninit(vcpu);
- free_pages((long)vcpu_book3s, get_order(sizeof(struct kvmppc_vcpu_book3s)));
+ kfree(vcpu_book3s->shadow_vcpu);
+ vfree(vcpu_book3s);
}
extern int __kvmppc_vcpu_entry(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu);
@@ -1102,8 +1310,12 @@ int __kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
{
int ret;
struct thread_struct ext_bkp;
+#ifdef CONFIG_ALTIVEC
bool save_vec = current->thread.used_vr;
+#endif
+#ifdef CONFIG_VSX
bool save_vsx = current->thread.used_vsr;
+#endif
ulong ext_msr;
/* No need to go into the guest when all we do is going out */
@@ -1144,6 +1356,10 @@ int __kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
/* XXX we get called with irq disabled - change that! */
local_irq_enable();
+ /* Preload FPU if it's enabled */
+ if (vcpu->arch.msr & MSR_FP)
+ kvmppc_handle_ext(vcpu, BOOK3S_INTERRUPT_FP_UNAVAIL, MSR_FP);
+
ret = __kvmppc_vcpu_entry(kvm_run, vcpu);
local_irq_disable();
@@ -1179,7 +1395,8 @@ int __kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
static int kvmppc_book3s_init(void)
{
- return kvm_init(NULL, sizeof(struct kvmppc_vcpu_book3s), THIS_MODULE);
+ return kvm_init(NULL, sizeof(struct kvmppc_vcpu_book3s), 0,
+ THIS_MODULE);
}
static void kvmppc_book3s_exit(void)
diff --git a/arch/powerpc/kvm/book3s_32_mmu.c b/arch/powerpc/kvm/book3s_32_mmu.c
index faf99f2..0b10503 100644
--- a/arch/powerpc/kvm/book3s_32_mmu.c
+++ b/arch/powerpc/kvm/book3s_32_mmu.c
@@ -37,7 +37,7 @@
#define dprintk(X...) do { } while(0)
#endif
-#ifdef DEBUG_PTE
+#ifdef DEBUG_MMU_PTE
#define dprintk_pte(X...) printk(KERN_INFO X)
#else
#define dprintk_pte(X...) do { } while(0)
@@ -45,6 +45,9 @@
#define PTEG_FLAG_ACCESSED 0x00000100
#define PTEG_FLAG_DIRTY 0x00000080
+#ifndef SID_SHIFT
+#define SID_SHIFT 28
+#endif
static inline bool check_debug_ip(struct kvm_vcpu *vcpu)
{
@@ -57,6 +60,8 @@ static inline bool check_debug_ip(struct kvm_vcpu *vcpu)
static int kvmppc_mmu_book3s_32_xlate_bat(struct kvm_vcpu *vcpu, gva_t eaddr,
struct kvmppc_pte *pte, bool data);
+static int kvmppc_mmu_book3s_32_esid_to_vsid(struct kvm_vcpu *vcpu, ulong esid,
+ u64 *vsid);
static struct kvmppc_sr *find_sr(struct kvmppc_vcpu_book3s *vcpu_book3s, gva_t eaddr)
{
@@ -66,13 +71,14 @@ static struct kvmppc_sr *find_sr(struct kvmppc_vcpu_book3s *vcpu_book3s, gva_t e
static u64 kvmppc_mmu_book3s_32_ea_to_vp(struct kvm_vcpu *vcpu, gva_t eaddr,
bool data)
{
- struct kvmppc_sr *sre = find_sr(to_book3s(vcpu), eaddr);
+ u64 vsid;
struct kvmppc_pte pte;
if (!kvmppc_mmu_book3s_32_xlate_bat(vcpu, eaddr, &pte, data))
return pte.vpage;
- return (((u64)eaddr >> 12) & 0xffff) | (((u64)sre->vsid) << 16);
+ kvmppc_mmu_book3s_32_esid_to_vsid(vcpu, eaddr >> SID_SHIFT, &vsid);
+ return (((u64)eaddr >> 12) & 0xffff) | (vsid << 16);
}
static void kvmppc_mmu_book3s_32_reset_msr(struct kvm_vcpu *vcpu)
@@ -142,8 +148,13 @@ static int kvmppc_mmu_book3s_32_xlate_bat(struct kvm_vcpu *vcpu, gva_t eaddr,
bat->bepi_mask);
}
if ((eaddr & bat->bepi_mask) == bat->bepi) {
+ u64 vsid;
+ kvmppc_mmu_book3s_32_esid_to_vsid(vcpu,
+ eaddr >> SID_SHIFT, &vsid);
+ vsid <<= 16;
+ pte->vpage = (((u64)eaddr >> 12) & 0xffff) | vsid;
+
pte->raddr = bat->brpn | (eaddr & ~bat->bepi_mask);
- pte->vpage = (eaddr >> 12) | VSID_BAT;
pte->may_read = bat->pp;
pte->may_write = bat->pp > 1;
pte->may_execute = true;
@@ -172,7 +183,7 @@ static int kvmppc_mmu_book3s_32_xlate_pte(struct kvm_vcpu *vcpu, gva_t eaddr,
struct kvmppc_sr *sre;
hva_t ptegp;
u32 pteg[16];
- u64 ptem = 0;
+ u32 ptem = 0;
int i;
int found = 0;
@@ -302,6 +313,7 @@ static void kvmppc_mmu_book3s_32_mtsrin(struct kvm_vcpu *vcpu, u32 srnum,
/* And then put in the new SR */
sre->raw = value;
sre->vsid = (value & 0x0fffffff);
+ sre->valid = (value & 0x80000000) ? false : true;
sre->Ks = (value & 0x40000000) ? true : false;
sre->Kp = (value & 0x20000000) ? true : false;
sre->nx = (value & 0x10000000) ? true : false;
@@ -312,36 +324,48 @@ static void kvmppc_mmu_book3s_32_mtsrin(struct kvm_vcpu *vcpu, u32 srnum,
static void kvmppc_mmu_book3s_32_tlbie(struct kvm_vcpu *vcpu, ulong ea, bool large)
{
- kvmppc_mmu_pte_flush(vcpu, ea, ~0xFFFULL);
+ kvmppc_mmu_pte_flush(vcpu, ea, 0x0FFFF000);
}
-static int kvmppc_mmu_book3s_32_esid_to_vsid(struct kvm_vcpu *vcpu, u64 esid,
+static int kvmppc_mmu_book3s_32_esid_to_vsid(struct kvm_vcpu *vcpu, ulong esid,
u64 *vsid)
{
+ ulong ea = esid << SID_SHIFT;
+ struct kvmppc_sr *sr;
+ u64 gvsid = esid;
+
+ if (vcpu->arch.msr & (MSR_DR|MSR_IR)) {
+ sr = find_sr(to_book3s(vcpu), ea);
+ if (sr->valid)
+ gvsid = sr->vsid;
+ }
+
/* In case we only have one of MSR_IR or MSR_DR set, let's put
that in the real-mode context (and hope RM doesn't access
high memory) */
switch (vcpu->arch.msr & (MSR_DR|MSR_IR)) {
case 0:
- *vsid = (VSID_REAL >> 16) | esid;
+ *vsid = VSID_REAL | esid;
break;
case MSR_IR:
- *vsid = (VSID_REAL_IR >> 16) | esid;
+ *vsid = VSID_REAL_IR | gvsid;
break;
case MSR_DR:
- *vsid = (VSID_REAL_DR >> 16) | esid;
+ *vsid = VSID_REAL_DR | gvsid;
break;
case MSR_DR|MSR_IR:
- {
- ulong ea;
- ea = esid << SID_SHIFT;
- *vsid = find_sr(to_book3s(vcpu), ea)->vsid;
+ if (!sr->valid)
+ return -1;
+
+ *vsid = sr->vsid;
break;
- }
default:
BUG();
}
+ if (vcpu->arch.msr & MSR_PR)
+ *vsid |= VSID_PR;
+
return 0;
}
diff --git a/arch/powerpc/kvm/book3s_32_mmu_host.c b/arch/powerpc/kvm/book3s_32_mmu_host.c
new file mode 100644
index 0000000..0bb6600
--- /dev/null
+++ b/arch/powerpc/kvm/book3s_32_mmu_host.c
@@ -0,0 +1,483 @@
+/*
+ * Copyright (C) 2010 SUSE Linux Products GmbH. All rights reserved.
+ *
+ * Authors:
+ * Alexander Graf <agraf@suse.de>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+#include <linux/kvm_host.h>
+
+#include <asm/kvm_ppc.h>
+#include <asm/kvm_book3s.h>
+#include <asm/mmu-hash32.h>
+#include <asm/machdep.h>
+#include <asm/mmu_context.h>
+#include <asm/hw_irq.h>
+
+/* #define DEBUG_MMU */
+/* #define DEBUG_SR */
+
+#ifdef DEBUG_MMU
+#define dprintk_mmu(a, ...) printk(KERN_INFO a, __VA_ARGS__)
+#else
+#define dprintk_mmu(a, ...) do { } while(0)
+#endif
+
+#ifdef DEBUG_SR
+#define dprintk_sr(a, ...) printk(KERN_INFO a, __VA_ARGS__)
+#else
+#define dprintk_sr(a, ...) do { } while(0)
+#endif
+
+#if PAGE_SHIFT != 12
+#error Unknown page size
+#endif
+
+#ifdef CONFIG_SMP
+#error XXX need to grab mmu_hash_lock
+#endif
+
+#ifdef CONFIG_PTE_64BIT
+#error Only 32 bit pages are supported for now
+#endif
+
+static ulong htab;
+static u32 htabmask;
+
+static void invalidate_pte(struct kvm_vcpu *vcpu, struct hpte_cache *pte)
+{
+ volatile u32 *pteg;
+
+ dprintk_mmu("KVM: Flushing SPTE: 0x%llx (0x%llx) -> 0x%llx\n",
+ pte->pte.eaddr, pte->pte.vpage, pte->host_va);
+
+ pteg = (u32*)pte->slot;
+
+ pteg[0] = 0;
+ asm volatile ("sync");
+ asm volatile ("tlbie %0" : : "r" (pte->pte.eaddr) : "memory");
+ asm volatile ("sync");
+ asm volatile ("tlbsync");
+
+ pte->host_va = 0;
+
+ if (pte->pte.may_write)
+ kvm_release_pfn_dirty(pte->pfn);
+ else
+ kvm_release_pfn_clean(pte->pfn);
+}
+
+void kvmppc_mmu_pte_flush(struct kvm_vcpu *vcpu, ulong guest_ea, ulong ea_mask)
+{
+ int i;
+
+ dprintk_mmu("KVM: Flushing %d Shadow PTEs: 0x%x & 0x%x\n",
+ vcpu->arch.hpte_cache_offset, guest_ea, ea_mask);
+ BUG_ON(vcpu->arch.hpte_cache_offset > HPTEG_CACHE_NUM);
+
+ guest_ea &= ea_mask;
+ for (i = 0; i < vcpu->arch.hpte_cache_offset; i++) {
+ struct hpte_cache *pte;
+
+ pte = &vcpu->arch.hpte_cache[i];
+ if (!pte->host_va)
+ continue;
+
+ if ((pte->pte.eaddr & ea_mask) == guest_ea) {
+ invalidate_pte(vcpu, pte);
+ }
+ }
+
+ /* Doing a complete flush -> start from scratch */
+ if (!ea_mask)
+ vcpu->arch.hpte_cache_offset = 0;
+}
+
+void kvmppc_mmu_pte_vflush(struct kvm_vcpu *vcpu, u64 guest_vp, u64 vp_mask)
+{
+ int i;
+
+ dprintk_mmu("KVM: Flushing %d Shadow vPTEs: 0x%llx & 0x%llx\n",
+ vcpu->arch.hpte_cache_offset, guest_vp, vp_mask);
+ BUG_ON(vcpu->arch.hpte_cache_offset > HPTEG_CACHE_NUM);
+
+ guest_vp &= vp_mask;
+ for (i = 0; i < vcpu->arch.hpte_cache_offset; i++) {
+ struct hpte_cache *pte;
+
+ pte = &vcpu->arch.hpte_cache[i];
+ if (!pte->host_va)
+ continue;
+
+ if ((pte->pte.vpage & vp_mask) == guest_vp) {
+ invalidate_pte(vcpu, pte);
+ }
+ }
+}
+
+void kvmppc_mmu_pte_pflush(struct kvm_vcpu *vcpu, ulong pa_start, ulong pa_end)
+{
+ int i;
+
+ dprintk_mmu("KVM: Flushing %d Shadow pPTEs: 0x%llx & 0x%llx\n",
+ vcpu->arch.hpte_cache_offset, pa_start, pa_end);
+ BUG_ON(vcpu->arch.hpte_cache_offset > HPTEG_CACHE_NUM);
+
+ for (i = 0; i < vcpu->arch.hpte_cache_offset; i++) {
+ struct hpte_cache *pte;
+
+ pte = &vcpu->arch.hpte_cache[i];
+ if (!pte->host_va)
+ continue;
+
+ if ((pte->pte.raddr >= pa_start) &&
+ (pte->pte.raddr < pa_end)) {
+ invalidate_pte(vcpu, pte);
+ }
+ }
+}
+
+struct kvmppc_pte *kvmppc_mmu_find_pte(struct kvm_vcpu *vcpu, u64 ea, bool data)
+{
+ int i;
+ u64 guest_vp;
+
+ guest_vp = vcpu->arch.mmu.ea_to_vp(vcpu, ea, false);
+ for (i=0; i<vcpu->arch.hpte_cache_offset; i++) {
+ struct hpte_cache *pte;
+
+ pte = &vcpu->arch.hpte_cache[i];
+ if (!pte->host_va)
+ continue;
+
+ if (pte->pte.vpage == guest_vp)
+ return &pte->pte;
+ }
+
+ return NULL;
+}
+
+static int kvmppc_mmu_hpte_cache_next(struct kvm_vcpu *vcpu)
+{
+ if (vcpu->arch.hpte_cache_offset == HPTEG_CACHE_NUM)
+ kvmppc_mmu_pte_flush(vcpu, 0, 0);
+
+ return vcpu->arch.hpte_cache_offset++;
+}
+
+/* We keep 512 gvsid->hvsid entries, mapping the guest ones to the array using
+ * a hash, so we don't waste cycles on looping */
+static u16 kvmppc_sid_hash(struct kvm_vcpu *vcpu, u64 gvsid)
+{
+ return (u16)(((gvsid >> (SID_MAP_BITS * 7)) & SID_MAP_MASK) ^
+ ((gvsid >> (SID_MAP_BITS * 6)) & SID_MAP_MASK) ^
+ ((gvsid >> (SID_MAP_BITS * 5)) & SID_MAP_MASK) ^
+ ((gvsid >> (SID_MAP_BITS * 4)) & SID_MAP_MASK) ^
+ ((gvsid >> (SID_MAP_BITS * 3)) & SID_MAP_MASK) ^
+ ((gvsid >> (SID_MAP_BITS * 2)) & SID_MAP_MASK) ^
+ ((gvsid >> (SID_MAP_BITS * 1)) & SID_MAP_MASK) ^
+ ((gvsid >> (SID_MAP_BITS * 0)) & SID_MAP_MASK));
+}
+
+
+static struct kvmppc_sid_map *find_sid_vsid(struct kvm_vcpu *vcpu, u64 gvsid)
+{
+ struct kvmppc_sid_map *map;
+ u16 sid_map_mask;
+
+ if (vcpu->arch.msr & MSR_PR)
+ gvsid |= VSID_PR;
+
+ sid_map_mask = kvmppc_sid_hash(vcpu, gvsid);
+ map = &to_book3s(vcpu)->sid_map[sid_map_mask];
+ if (map->guest_vsid == gvsid) {
+ dprintk_sr("SR: Searching 0x%llx -> 0x%llx\n",
+ gvsid, map->host_vsid);
+ return map;
+ }
+
+ map = &to_book3s(vcpu)->sid_map[SID_MAP_MASK - sid_map_mask];
+ if (map->guest_vsid == gvsid) {
+ dprintk_sr("SR: Searching 0x%llx -> 0x%llx\n",
+ gvsid, map->host_vsid);
+ return map;
+ }
+
+ dprintk_sr("SR: Searching 0x%llx -> not found\n", gvsid);
+ return NULL;
+}
+
+static u32 *kvmppc_mmu_get_pteg(struct kvm_vcpu *vcpu, u32 vsid, u32 eaddr,
+ bool primary)
+{
+ u32 page, hash;
+ ulong pteg = htab;
+
+ page = (eaddr & ~ESID_MASK) >> 12;
+
+ hash = ((vsid ^ page) << 6);
+ if (!primary)
+ hash = ~hash;
+
+ hash &= htabmask;
+
+ pteg |= hash;
+
+ dprintk_mmu("htab: %lx | hash: %x | htabmask: %x | pteg: %lx\n",
+ htab, hash, htabmask, pteg);
+
+ return (u32*)pteg;
+}
+
+extern char etext[];
+
+int kvmppc_mmu_map_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *orig_pte)
+{
+ pfn_t hpaddr;
+ u64 va;
+ u64 vsid;
+ struct kvmppc_sid_map *map;
+ volatile u32 *pteg;
+ u32 eaddr = orig_pte->eaddr;
+ u32 pteg0, pteg1;
+ register int rr = 0;
+ bool primary = false;
+ bool evict = false;
+ int hpte_id;
+ struct hpte_cache *pte;
+
+ /* Get host physical address for gpa */
+ hpaddr = gfn_to_pfn(vcpu->kvm, orig_pte->raddr >> PAGE_SHIFT);
+ if (kvm_is_error_hva(hpaddr)) {
+ printk(KERN_INFO "Couldn't get guest page for gfn %lx!\n",
+ orig_pte->eaddr);
+ return -EINVAL;
+ }
+ hpaddr <<= PAGE_SHIFT;
+
+ /* and write the mapping ea -> hpa into the pt */
+ vcpu->arch.mmu.esid_to_vsid(vcpu, orig_pte->eaddr >> SID_SHIFT, &vsid);
+ map = find_sid_vsid(vcpu, vsid);
+ if (!map) {
+ kvmppc_mmu_map_segment(vcpu, eaddr);
+ map = find_sid_vsid(vcpu, vsid);
+ }
+ BUG_ON(!map);
+
+ vsid = map->host_vsid;
+ va = (vsid << SID_SHIFT) | (eaddr & ~ESID_MASK);
+
+next_pteg:
+ if (rr == 16) {
+ primary = !primary;
+ evict = true;
+ rr = 0;
+ }
+
+ pteg = kvmppc_mmu_get_pteg(vcpu, vsid, eaddr, primary);
+
+ /* not evicting yet */
+ if (!evict && (pteg[rr] & PTE_V)) {
+ rr += 2;
+ goto next_pteg;
+ }
+
+ dprintk_mmu("KVM: old PTEG: %p (%d)\n", pteg, rr);
+ dprintk_mmu("KVM: %08x - %08x\n", pteg[0], pteg[1]);
+ dprintk_mmu("KVM: %08x - %08x\n", pteg[2], pteg[3]);
+ dprintk_mmu("KVM: %08x - %08x\n", pteg[4], pteg[5]);
+ dprintk_mmu("KVM: %08x - %08x\n", pteg[6], pteg[7]);
+ dprintk_mmu("KVM: %08x - %08x\n", pteg[8], pteg[9]);
+ dprintk_mmu("KVM: %08x - %08x\n", pteg[10], pteg[11]);
+ dprintk_mmu("KVM: %08x - %08x\n", pteg[12], pteg[13]);
+ dprintk_mmu("KVM: %08x - %08x\n", pteg[14], pteg[15]);
+
+ pteg0 = ((eaddr & 0x0fffffff) >> 22) | (vsid << 7) | PTE_V |
+ (primary ? 0 : PTE_SEC);
+ pteg1 = hpaddr | PTE_M | PTE_R | PTE_C;
+
+ if (orig_pte->may_write) {
+ pteg1 |= PP_RWRW;
+ mark_page_dirty(vcpu->kvm, orig_pte->raddr >> PAGE_SHIFT);
+ } else {
+ pteg1 |= PP_RWRX;
+ }
+
+ local_irq_disable();
+
+ if (pteg[rr]) {
+ pteg[rr] = 0;
+ asm volatile ("sync");
+ }
+ pteg[rr + 1] = pteg1;
+ pteg[rr] = pteg0;
+ asm volatile ("sync");
+
+ local_irq_enable();
+
+ dprintk_mmu("KVM: new PTEG: %p\n", pteg);
+ dprintk_mmu("KVM: %08x - %08x\n", pteg[0], pteg[1]);
+ dprintk_mmu("KVM: %08x - %08x\n", pteg[2], pteg[3]);
+ dprintk_mmu("KVM: %08x - %08x\n", pteg[4], pteg[5]);
+ dprintk_mmu("KVM: %08x - %08x\n", pteg[6], pteg[7]);
+ dprintk_mmu("KVM: %08x - %08x\n", pteg[8], pteg[9]);
+ dprintk_mmu("KVM: %08x - %08x\n", pteg[10], pteg[11]);
+ dprintk_mmu("KVM: %08x - %08x\n", pteg[12], pteg[13]);
+ dprintk_mmu("KVM: %08x - %08x\n", pteg[14], pteg[15]);
+
+
+ /* Now tell our Shadow PTE code about the new page */
+
+ hpte_id = kvmppc_mmu_hpte_cache_next(vcpu);
+ pte = &vcpu->arch.hpte_cache[hpte_id];
+
+ dprintk_mmu("KVM: %c%c Map 0x%llx: [%lx] 0x%llx (0x%llx) -> %lx\n",
+ orig_pte->may_write ? 'w' : '-',
+ orig_pte->may_execute ? 'x' : '-',
+ orig_pte->eaddr, (ulong)pteg, va,
+ orig_pte->vpage, hpaddr);
+
+ pte->slot = (ulong)&pteg[rr];
+ pte->host_va = va;
+ pte->pte = *orig_pte;
+ pte->pfn = hpaddr >> PAGE_SHIFT;
+
+ return 0;
+}
+
+static struct kvmppc_sid_map *create_sid_map(struct kvm_vcpu *vcpu, u64 gvsid)
+{
+ struct kvmppc_sid_map *map;
+ struct kvmppc_vcpu_book3s *vcpu_book3s = to_book3s(vcpu);
+ u16 sid_map_mask;
+ static int backwards_map = 0;
+
+ if (vcpu->arch.msr & MSR_PR)
+ gvsid |= VSID_PR;
+
+ /* We might get collisions that trap in preceding order, so let's
+ map them differently */
+
+ sid_map_mask = kvmppc_sid_hash(vcpu, gvsid);
+ if (backwards_map)
+ sid_map_mask = SID_MAP_MASK - sid_map_mask;
+
+ map = &to_book3s(vcpu)->sid_map[sid_map_mask];
+
+ /* Make sure we're taking the other map next time */
+ backwards_map = !backwards_map;
+
+ /* Uh-oh ... out of mappings. Let's flush! */
+ if (vcpu_book3s->vsid_next >= vcpu_book3s->vsid_max) {
+ vcpu_book3s->vsid_next = vcpu_book3s->vsid_first;
+ memset(vcpu_book3s->sid_map, 0,
+ sizeof(struct kvmppc_sid_map) * SID_MAP_NUM);
+ kvmppc_mmu_pte_flush(vcpu, 0, 0);
+ kvmppc_mmu_flush_segments(vcpu);
+ }
+ map->host_vsid = vcpu_book3s->vsid_next;
+
+ /* Would have to be 111 to be completely aligned with the rest of
+ Linux, but that is just way too little space! */
+ vcpu_book3s->vsid_next+=1;
+
+ map->guest_vsid = gvsid;
+ map->valid = true;
+
+ return map;
+}
+
+int kvmppc_mmu_map_segment(struct kvm_vcpu *vcpu, ulong eaddr)
+{
+ u32 esid = eaddr >> SID_SHIFT;
+ u64 gvsid;
+ u32 sr;
+ struct kvmppc_sid_map *map;
+ struct kvmppc_book3s_shadow_vcpu *svcpu = to_svcpu(vcpu);
+
+ if (vcpu->arch.mmu.esid_to_vsid(vcpu, esid, &gvsid)) {
+ /* Invalidate an entry */
+ svcpu->sr[esid] = SR_INVALID;
+ return -ENOENT;
+ }
+
+ map = find_sid_vsid(vcpu, gvsid);
+ if (!map)
+ map = create_sid_map(vcpu, gvsid);
+
+ map->guest_esid = esid;
+ sr = map->host_vsid | SR_KP;
+ svcpu->sr[esid] = sr;
+
+ dprintk_sr("MMU: mtsr %d, 0x%x\n", esid, sr);
+
+ return 0;
+}
+
+void kvmppc_mmu_flush_segments(struct kvm_vcpu *vcpu)
+{
+ int i;
+ struct kvmppc_book3s_shadow_vcpu *svcpu = to_svcpu(vcpu);
+
+ dprintk_sr("MMU: flushing all segments (%d)\n", ARRAY_SIZE(svcpu->sr));
+ for (i = 0; i < ARRAY_SIZE(svcpu->sr); i++)
+ svcpu->sr[i] = SR_INVALID;
+}
+
+void kvmppc_mmu_destroy(struct kvm_vcpu *vcpu)
+{
+ kvmppc_mmu_pte_flush(vcpu, 0, 0);
+ preempt_disable();
+ __destroy_context(to_book3s(vcpu)->context_id);
+ preempt_enable();
+}
+
+/* From mm/mmu_context_hash32.c */
+#define CTX_TO_VSID(ctx) (((ctx) * (897 * 16)) & 0xffffff)
+
+int kvmppc_mmu_init(struct kvm_vcpu *vcpu)
+{
+ struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu);
+ int err;
+ ulong sdr1;
+
+ err = __init_new_context();
+ if (err < 0)
+ return -1;
+ vcpu3s->context_id = err;
+
+ vcpu3s->vsid_max = CTX_TO_VSID(vcpu3s->context_id + 1) - 1;
+ vcpu3s->vsid_first = CTX_TO_VSID(vcpu3s->context_id);
+
+#if 0 /* XXX still doesn't guarantee uniqueness */
+ /* We could collide with the Linux vsid space because the vsid
+ * wraps around at 24 bits. We're safe if we do our own space
+ * though, so let's always set the highest bit. */
+
+ vcpu3s->vsid_max |= 0x00800000;
+ vcpu3s->vsid_first |= 0x00800000;
+#endif
+ BUG_ON(vcpu3s->vsid_max < vcpu3s->vsid_first);
+
+ vcpu3s->vsid_next = vcpu3s->vsid_first;
+
+ /* Remember where the HTAB is */
+ asm ( "mfsdr1 %0" : "=r"(sdr1) );
+ htabmask = ((sdr1 & 0x1FF) << 16) | 0xFFC0;
+ htab = (ulong)__va(sdr1 & 0xffff0000);
+
+ return 0;
+}
diff --git a/arch/powerpc/kvm/book3s_32_sr.S b/arch/powerpc/kvm/book3s_32_sr.S
new file mode 100644
index 0000000..3608471
--- /dev/null
+++ b/arch/powerpc/kvm/book3s_32_sr.S
@@ -0,0 +1,143 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ *
+ * Copyright SUSE Linux Products GmbH 2009
+ *
+ * Authors: Alexander Graf <agraf@suse.de>
+ */
+
+/******************************************************************************
+ * *
+ * Entry code *
+ * *
+ *****************************************************************************/
+
+.macro LOAD_GUEST_SEGMENTS
+
+ /* Required state:
+ *
+ * MSR = ~IR|DR
+ * R1 = host R1
+ * R2 = host R2
+ * R3 = shadow vcpu
+ * all other volatile GPRS = free
+ * SVCPU[CR] = guest CR
+ * SVCPU[XER] = guest XER
+ * SVCPU[CTR] = guest CTR
+ * SVCPU[LR] = guest LR
+ */
+
+#define XCHG_SR(n) lwz r9, (SVCPU_SR+(n*4))(r3); \
+ mtsr n, r9
+
+ XCHG_SR(0)
+ XCHG_SR(1)
+ XCHG_SR(2)
+ XCHG_SR(3)
+ XCHG_SR(4)
+ XCHG_SR(5)
+ XCHG_SR(6)
+ XCHG_SR(7)
+ XCHG_SR(8)
+ XCHG_SR(9)
+ XCHG_SR(10)
+ XCHG_SR(11)
+ XCHG_SR(12)
+ XCHG_SR(13)
+ XCHG_SR(14)
+ XCHG_SR(15)
+
+ /* Clear BATs. */
+
+#define KVM_KILL_BAT(n, reg) \
+ mtspr SPRN_IBAT##n##U,reg; \
+ mtspr SPRN_IBAT##n##L,reg; \
+ mtspr SPRN_DBAT##n##U,reg; \
+ mtspr SPRN_DBAT##n##L,reg; \
+
+ li r9, 0
+ KVM_KILL_BAT(0, r9)
+ KVM_KILL_BAT(1, r9)
+ KVM_KILL_BAT(2, r9)
+ KVM_KILL_BAT(3, r9)
+
+.endm
+
+/******************************************************************************
+ * *
+ * Exit code *
+ * *
+ *****************************************************************************/
+
+.macro LOAD_HOST_SEGMENTS
+
+ /* Register usage at this point:
+ *
+ * R1 = host R1
+ * R2 = host R2
+ * R12 = exit handler id
+ * R13 = shadow vcpu - SHADOW_VCPU_OFF
+ * SVCPU.* = guest *
+ * SVCPU[CR] = guest CR
+ * SVCPU[XER] = guest XER
+ * SVCPU[CTR] = guest CTR
+ * SVCPU[LR] = guest LR
+ *
+ */
+
+ /* Restore BATs */
+
+ /* We only overwrite the upper part, so we only restoree
+ the upper part. */
+#define KVM_LOAD_BAT(n, reg, RA, RB) \
+ lwz RA,(n*16)+0(reg); \
+ lwz RB,(n*16)+4(reg); \
+ mtspr SPRN_IBAT##n##U,RA; \
+ mtspr SPRN_IBAT##n##L,RB; \
+ lwz RA,(n*16)+8(reg); \
+ lwz RB,(n*16)+12(reg); \
+ mtspr SPRN_DBAT##n##U,RA; \
+ mtspr SPRN_DBAT##n##L,RB; \
+
+ lis r9, BATS@ha
+ addi r9, r9, BATS@l
+ tophys(r9, r9)
+ KVM_LOAD_BAT(0, r9, r10, r11)
+ KVM_LOAD_BAT(1, r9, r10, r11)
+ KVM_LOAD_BAT(2, r9, r10, r11)
+ KVM_LOAD_BAT(3, r9, r10, r11)
+
+ /* Restore Segment Registers */
+
+ /* 0xc - 0xf */
+
+ li r0, 4
+ mtctr r0
+ LOAD_REG_IMMEDIATE(r3, 0x20000000 | (0x111 * 0xc))
+ lis r4, 0xc000
+3: mtsrin r3, r4
+ addi r3, r3, 0x111 /* increment VSID */
+ addis r4, r4, 0x1000 /* address of next segment */
+ bdnz 3b
+
+ /* 0x0 - 0xb */
+
+ /* 'current->mm' needs to be in r4 */
+ tophys(r4, r2)
+ lwz r4, MM(r4)
+ tophys(r4, r4)
+ /* This only clobbers r0, r3, r4 and r5 */
+ bl switch_mmu_context
+
+.endm
diff --git a/arch/powerpc/kvm/book3s_64_mmu.c b/arch/powerpc/kvm/book3s_64_mmu.c
index 512dcff..4025ea2 100644
--- a/arch/powerpc/kvm/book3s_64_mmu.c
+++ b/arch/powerpc/kvm/book3s_64_mmu.c
@@ -232,7 +232,7 @@ do_second:
}
dprintk("KVM MMU: Translated 0x%lx [0x%llx] -> 0x%llx "
- "-> 0x%llx\n",
+ "-> 0x%lx\n",
eaddr, avpn, gpte->vpage, gpte->raddr);
found = true;
break;
@@ -383,7 +383,7 @@ static void kvmppc_mmu_book3s_64_slbia(struct kvm_vcpu *vcpu)
if (vcpu->arch.msr & MSR_IR) {
kvmppc_mmu_flush_segments(vcpu);
- kvmppc_mmu_map_segment(vcpu, vcpu->arch.pc);
+ kvmppc_mmu_map_segment(vcpu, kvmppc_get_pc(vcpu));
}
}
@@ -439,37 +439,43 @@ static void kvmppc_mmu_book3s_64_tlbie(struct kvm_vcpu *vcpu, ulong va,
kvmppc_mmu_pte_vflush(vcpu, va >> 12, mask);
}
-static int kvmppc_mmu_book3s_64_esid_to_vsid(struct kvm_vcpu *vcpu, u64 esid,
+static int kvmppc_mmu_book3s_64_esid_to_vsid(struct kvm_vcpu *vcpu, ulong esid,
u64 *vsid)
{
+ ulong ea = esid << SID_SHIFT;
+ struct kvmppc_slb *slb;
+ u64 gvsid = esid;
+
+ if (vcpu->arch.msr & (MSR_DR|MSR_IR)) {
+ slb = kvmppc_mmu_book3s_64_find_slbe(to_book3s(vcpu), ea);
+ if (slb)
+ gvsid = slb->vsid;
+ }
+
switch (vcpu->arch.msr & (MSR_DR|MSR_IR)) {
case 0:
- *vsid = (VSID_REAL >> 16) | esid;
+ *vsid = VSID_REAL | esid;
break;
case MSR_IR:
- *vsid = (VSID_REAL_IR >> 16) | esid;
+ *vsid = VSID_REAL_IR | gvsid;
break;
case MSR_DR:
- *vsid = (VSID_REAL_DR >> 16) | esid;
+ *vsid = VSID_REAL_DR | gvsid;
break;
case MSR_DR|MSR_IR:
- {
- ulong ea;
- struct kvmppc_slb *slb;
- ea = esid << SID_SHIFT;
- slb = kvmppc_mmu_book3s_64_find_slbe(to_book3s(vcpu), ea);
- if (slb)
- *vsid = slb->vsid;
- else
+ if (!slb)
return -ENOENT;
+ *vsid = gvsid;
break;
- }
default:
BUG();
break;
}
+ if (vcpu->arch.msr & MSR_PR)
+ *vsid |= VSID_PR;
+
return 0;
}
diff --git a/arch/powerpc/kvm/book3s_64_mmu_host.c b/arch/powerpc/kvm/book3s_64_mmu_host.c
index f2899b2..e4b5744 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_host.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_host.c
@@ -48,21 +48,25 @@
static void invalidate_pte(struct hpte_cache *pte)
{
- dprintk_mmu("KVM: Flushing SPT %d: 0x%llx (0x%llx) -> 0x%llx\n",
- i, pte->pte.eaddr, pte->pte.vpage, pte->host_va);
+ dprintk_mmu("KVM: Flushing SPT: 0x%lx (0x%llx) -> 0x%llx\n",
+ pte->pte.eaddr, pte->pte.vpage, pte->host_va);
ppc_md.hpte_invalidate(pte->slot, pte->host_va,
MMU_PAGE_4K, MMU_SEGSIZE_256M,
false);
pte->host_va = 0;
- kvm_release_pfn_dirty(pte->pfn);
+
+ if (pte->pte.may_write)
+ kvm_release_pfn_dirty(pte->pfn);
+ else
+ kvm_release_pfn_clean(pte->pfn);
}
-void kvmppc_mmu_pte_flush(struct kvm_vcpu *vcpu, u64 guest_ea, u64 ea_mask)
+void kvmppc_mmu_pte_flush(struct kvm_vcpu *vcpu, ulong guest_ea, ulong ea_mask)
{
int i;
- dprintk_mmu("KVM: Flushing %d Shadow PTEs: 0x%llx & 0x%llx\n",
+ dprintk_mmu("KVM: Flushing %d Shadow PTEs: 0x%lx & 0x%lx\n",
vcpu->arch.hpte_cache_offset, guest_ea, ea_mask);
BUG_ON(vcpu->arch.hpte_cache_offset > HPTEG_CACHE_NUM);
@@ -106,12 +110,12 @@ void kvmppc_mmu_pte_vflush(struct kvm_vcpu *vcpu, u64 guest_vp, u64 vp_mask)
}
}
-void kvmppc_mmu_pte_pflush(struct kvm_vcpu *vcpu, u64 pa_start, u64 pa_end)
+void kvmppc_mmu_pte_pflush(struct kvm_vcpu *vcpu, ulong pa_start, ulong pa_end)
{
int i;
- dprintk_mmu("KVM: Flushing %d Shadow pPTEs: 0x%llx & 0x%llx\n",
- vcpu->arch.hpte_cache_offset, guest_pa, pa_mask);
+ dprintk_mmu("KVM: Flushing %d Shadow pPTEs: 0x%lx & 0x%lx\n",
+ vcpu->arch.hpte_cache_offset, pa_start, pa_end);
BUG_ON(vcpu->arch.hpte_cache_offset > HPTEG_CACHE_NUM);
for (i = 0; i < vcpu->arch.hpte_cache_offset; i++) {
@@ -182,7 +186,7 @@ static struct kvmppc_sid_map *find_sid_vsid(struct kvm_vcpu *vcpu, u64 gvsid)
sid_map_mask = kvmppc_sid_hash(vcpu, gvsid);
map = &to_book3s(vcpu)->sid_map[sid_map_mask];
if (map->guest_vsid == gvsid) {
- dprintk_slb("SLB: Searching 0x%llx -> 0x%llx\n",
+ dprintk_slb("SLB: Searching: 0x%llx -> 0x%llx\n",
gvsid, map->host_vsid);
return map;
}
@@ -194,7 +198,8 @@ static struct kvmppc_sid_map *find_sid_vsid(struct kvm_vcpu *vcpu, u64 gvsid)
return map;
}
- dprintk_slb("SLB: Searching 0x%llx -> not found\n", gvsid);
+ dprintk_slb("SLB: Searching %d/%d: 0x%llx -> not found\n",
+ sid_map_mask, SID_MAP_MASK - sid_map_mask, gvsid);
return NULL;
}
@@ -212,7 +217,7 @@ int kvmppc_mmu_map_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *orig_pte)
/* Get host physical address for gpa */
hpaddr = gfn_to_pfn(vcpu->kvm, orig_pte->raddr >> PAGE_SHIFT);
if (kvm_is_error_hva(hpaddr)) {
- printk(KERN_INFO "Couldn't get guest page for gfn %llx!\n", orig_pte->eaddr);
+ printk(KERN_INFO "Couldn't get guest page for gfn %lx!\n", orig_pte->eaddr);
return -EINVAL;
}
hpaddr <<= PAGE_SHIFT;
@@ -227,10 +232,16 @@ int kvmppc_mmu_map_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *orig_pte)
vcpu->arch.mmu.esid_to_vsid(vcpu, orig_pte->eaddr >> SID_SHIFT, &vsid);
map = find_sid_vsid(vcpu, vsid);
if (!map) {
- kvmppc_mmu_map_segment(vcpu, orig_pte->eaddr);
+ ret = kvmppc_mmu_map_segment(vcpu, orig_pte->eaddr);
+ WARN_ON(ret < 0);
map = find_sid_vsid(vcpu, vsid);
}
- BUG_ON(!map);
+ if (!map) {
+ printk(KERN_ERR "KVM: Segment map for 0x%llx (0x%lx) failed\n",
+ vsid, orig_pte->eaddr);
+ WARN_ON(true);
+ return -EINVAL;
+ }
vsid = map->host_vsid;
va = hpt_va(orig_pte->eaddr, vsid, MMU_SEGSIZE_256M);
@@ -257,26 +268,26 @@ map_again:
if (ret < 0) {
/* If we couldn't map a primary PTE, try a secondary */
-#ifdef USE_SECONDARY
hash = ~hash;
+ vflags ^= HPTE_V_SECONDARY;
attempt++;
- if (attempt % 2)
- vflags = HPTE_V_SECONDARY;
- else
- vflags = 0;
-#else
- attempt = 2;
-#endif
goto map_again;
} else {
int hpte_id = kvmppc_mmu_hpte_cache_next(vcpu);
struct hpte_cache *pte = &vcpu->arch.hpte_cache[hpte_id];
- dprintk_mmu("KVM: %c%c Map 0x%llx: [%lx] 0x%lx (0x%llx) -> %lx\n",
+ dprintk_mmu("KVM: %c%c Map 0x%lx: [%lx] 0x%lx (0x%llx) -> %lx\n",
((rflags & HPTE_R_PP) == 3) ? '-' : 'w',
(rflags & HPTE_R_N) ? '-' : 'x',
orig_pte->eaddr, hpteg, va, orig_pte->vpage, hpaddr);
+ /* The ppc_md code may give us a secondary entry even though we
+ asked for a primary. Fix up. */
+ if ((ret & _PTEIDX_SECONDARY) && !(vflags & HPTE_V_SECONDARY)) {
+ hash = ~hash;
+ hpteg = ((hash & htab_hash_mask) * HPTES_PER_GROUP);
+ }
+
pte->slot = hpteg + (ret & 7);
pte->host_va = va;
pte->pte = *orig_pte;
@@ -321,6 +332,9 @@ static struct kvmppc_sid_map *create_sid_map(struct kvm_vcpu *vcpu, u64 gvsid)
map->guest_vsid = gvsid;
map->valid = true;
+ dprintk_slb("SLB: New mapping at %d: 0x%llx -> 0x%llx\n",
+ sid_map_mask, gvsid, map->host_vsid);
+
return map;
}
@@ -331,14 +345,14 @@ static int kvmppc_mmu_next_segment(struct kvm_vcpu *vcpu, ulong esid)
int found_inval = -1;
int r;
- if (!get_paca()->kvm_slb_max)
- get_paca()->kvm_slb_max = 1;
+ if (!to_svcpu(vcpu)->slb_max)
+ to_svcpu(vcpu)->slb_max = 1;
/* Are we overwriting? */
- for (i = 1; i < get_paca()->kvm_slb_max; i++) {
- if (!(get_paca()->kvm_slb[i].esid & SLB_ESID_V))
+ for (i = 1; i < to_svcpu(vcpu)->slb_max; i++) {
+ if (!(to_svcpu(vcpu)->slb[i].esid & SLB_ESID_V))
found_inval = i;
- else if ((get_paca()->kvm_slb[i].esid & ESID_MASK) == esid)
+ else if ((to_svcpu(vcpu)->slb[i].esid & ESID_MASK) == esid)
return i;
}
@@ -352,11 +366,11 @@ static int kvmppc_mmu_next_segment(struct kvm_vcpu *vcpu, ulong esid)
max_slb_size = mmu_slb_size;
/* Overflowing -> purge */
- if ((get_paca()->kvm_slb_max) == max_slb_size)
+ if ((to_svcpu(vcpu)->slb_max) == max_slb_size)
kvmppc_mmu_flush_segments(vcpu);
- r = get_paca()->kvm_slb_max;
- get_paca()->kvm_slb_max++;
+ r = to_svcpu(vcpu)->slb_max;
+ to_svcpu(vcpu)->slb_max++;
return r;
}
@@ -374,7 +388,7 @@ int kvmppc_mmu_map_segment(struct kvm_vcpu *vcpu, ulong eaddr)
if (vcpu->arch.mmu.esid_to_vsid(vcpu, esid, &gvsid)) {
/* Invalidate an entry */
- get_paca()->kvm_slb[slb_index].esid = 0;
+ to_svcpu(vcpu)->slb[slb_index].esid = 0;
return -ENOENT;
}
@@ -388,8 +402,8 @@ int kvmppc_mmu_map_segment(struct kvm_vcpu *vcpu, ulong eaddr)
slb_vsid &= ~SLB_VSID_KP;
slb_esid |= slb_index;
- get_paca()->kvm_slb[slb_index].esid = slb_esid;
- get_paca()->kvm_slb[slb_index].vsid = slb_vsid;
+ to_svcpu(vcpu)->slb[slb_index].esid = slb_esid;
+ to_svcpu(vcpu)->slb[slb_index].vsid = slb_vsid;
dprintk_slb("slbmte %#llx, %#llx\n", slb_vsid, slb_esid);
@@ -398,11 +412,29 @@ int kvmppc_mmu_map_segment(struct kvm_vcpu *vcpu, ulong eaddr)
void kvmppc_mmu_flush_segments(struct kvm_vcpu *vcpu)
{
- get_paca()->kvm_slb_max = 1;
- get_paca()->kvm_slb[0].esid = 0;
+ to_svcpu(vcpu)->slb_max = 1;
+ to_svcpu(vcpu)->slb[0].esid = 0;
}
void kvmppc_mmu_destroy(struct kvm_vcpu *vcpu)
{
kvmppc_mmu_pte_flush(vcpu, 0, 0);
+ __destroy_context(to_book3s(vcpu)->context_id);
+}
+
+int kvmppc_mmu_init(struct kvm_vcpu *vcpu)
+{
+ struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu);
+ int err;
+
+ err = __init_new_context();
+ if (err < 0)
+ return -1;
+ vcpu3s->context_id = err;
+
+ vcpu3s->vsid_max = ((vcpu3s->context_id + 1) << USER_ESID_BITS) - 1;
+ vcpu3s->vsid_first = vcpu3s->context_id << USER_ESID_BITS;
+ vcpu3s->vsid_next = vcpu3s->vsid_first;
+
+ return 0;
}
diff --git a/arch/powerpc/kvm/book3s_64_slb.S b/arch/powerpc/kvm/book3s_64_slb.S
index 35b76272..04e7d3b 100644
--- a/arch/powerpc/kvm/book3s_64_slb.S
+++ b/arch/powerpc/kvm/book3s_64_slb.S
@@ -44,8 +44,7 @@ slb_exit_skip_ ## num:
* *
*****************************************************************************/
-.global kvmppc_handler_trampoline_enter
-kvmppc_handler_trampoline_enter:
+.macro LOAD_GUEST_SEGMENTS
/* Required state:
*
@@ -53,20 +52,14 @@ kvmppc_handler_trampoline_enter:
* R13 = PACA
* R1 = host R1
* R2 = host R2
- * R9 = guest IP
- * R10 = guest MSR
- * all other GPRS = free
- * PACA[KVM_CR] = guest CR
- * PACA[KVM_XER] = guest XER
+ * R3 = shadow vcpu
+ * all other volatile GPRS = free
+ * SVCPU[CR] = guest CR
+ * SVCPU[XER] = guest XER
+ * SVCPU[CTR] = guest CTR
+ * SVCPU[LR] = guest LR
*/
- mtsrr0 r9
- mtsrr1 r10
-
- /* Activate guest mode, so faults get handled by KVM */
- li r11, KVM_GUEST_MODE_GUEST
- stb r11, PACA_KVM_IN_GUEST(r13)
-
/* Remove LPAR shadow entries */
#if SLB_NUM_BOLTED == 3
@@ -101,14 +94,14 @@ kvmppc_handler_trampoline_enter:
/* Fill SLB with our shadow */
- lbz r12, PACA_KVM_SLB_MAX(r13)
+ lbz r12, SVCPU_SLB_MAX(r3)
mulli r12, r12, 16
- addi r12, r12, PACA_KVM_SLB
- add r12, r12, r13
+ addi r12, r12, SVCPU_SLB
+ add r12, r12, r3
/* for (r11 = kvm_slb; r11 < kvm_slb + kvm_slb_size; r11+=slb_entry) */
- li r11, PACA_KVM_SLB
- add r11, r11, r13
+ li r11, SVCPU_SLB
+ add r11, r11, r3
slb_loop_enter:
@@ -127,34 +120,7 @@ slb_loop_enter_skip:
slb_do_enter:
- /* Enter guest */
-
- ld r0, (PACA_KVM_R0)(r13)
- ld r1, (PACA_KVM_R1)(r13)
- ld r2, (PACA_KVM_R2)(r13)
- ld r3, (PACA_KVM_R3)(r13)
- ld r4, (PACA_KVM_R4)(r13)
- ld r5, (PACA_KVM_R5)(r13)
- ld r6, (PACA_KVM_R6)(r13)
- ld r7, (PACA_KVM_R7)(r13)
- ld r8, (PACA_KVM_R8)(r13)
- ld r9, (PACA_KVM_R9)(r13)
- ld r10, (PACA_KVM_R10)(r13)
- ld r12, (PACA_KVM_R12)(r13)
-
- lwz r11, (PACA_KVM_CR)(r13)
- mtcr r11
-
- ld r11, (PACA_KVM_XER)(r13)
- mtxer r11
-
- ld r11, (PACA_KVM_R11)(r13)
- ld r13, (PACA_KVM_R13)(r13)
-
- RFI
-kvmppc_handler_trampoline_enter_end:
-
-
+.endm
/******************************************************************************
* *
@@ -162,99 +128,22 @@ kvmppc_handler_trampoline_enter_end:
* *
*****************************************************************************/
-.global kvmppc_handler_trampoline_exit
-kvmppc_handler_trampoline_exit:
+.macro LOAD_HOST_SEGMENTS
/* Register usage at this point:
*
- * SPRG_SCRATCH0 = guest R13
- * R12 = exit handler id
- * R13 = PACA
- * PACA.KVM.SCRATCH0 = guest R12
- * PACA.KVM.SCRATCH1 = guest CR
+ * R1 = host R1
+ * R2 = host R2
+ * R12 = exit handler id
+ * R13 = shadow vcpu - SHADOW_VCPU_OFF [=PACA on PPC64]
+ * SVCPU.* = guest *
+ * SVCPU[CR] = guest CR
+ * SVCPU[XER] = guest XER
+ * SVCPU[CTR] = guest CTR
+ * SVCPU[LR] = guest LR
*
*/
- /* Save registers */
-
- std r0, PACA_KVM_R0(r13)
- std r1, PACA_KVM_R1(r13)
- std r2, PACA_KVM_R2(r13)
- std r3, PACA_KVM_R3(r13)
- std r4, PACA_KVM_R4(r13)
- std r5, PACA_KVM_R5(r13)
- std r6, PACA_KVM_R6(r13)
- std r7, PACA_KVM_R7(r13)
- std r8, PACA_KVM_R8(r13)
- std r9, PACA_KVM_R9(r13)
- std r10, PACA_KVM_R10(r13)
- std r11, PACA_KVM_R11(r13)
-
- /* Restore R1/R2 so we can handle faults */
- ld r1, PACA_KVM_HOST_R1(r13)
- ld r2, PACA_KVM_HOST_R2(r13)
-
- /* Save guest PC and MSR in GPRs */
- mfsrr0 r3
- mfsrr1 r4
-
- /* Get scratch'ed off registers */
- mfspr r9, SPRN_SPRG_SCRATCH0
- std r9, PACA_KVM_R13(r13)
-
- ld r8, PACA_KVM_SCRATCH0(r13)
- std r8, PACA_KVM_R12(r13)
-
- lwz r7, PACA_KVM_SCRATCH1(r13)
- stw r7, PACA_KVM_CR(r13)
-
- /* Save more register state */
-
- mfxer r6
- stw r6, PACA_KVM_XER(r13)
-
- mfdar r5
- mfdsisr r6
-
- /*
- * In order for us to easily get the last instruction,
- * we got the #vmexit at, we exploit the fact that the
- * virtual layout is still the same here, so we can just
- * ld from the guest's PC address
- */
-
- /* We only load the last instruction when it's safe */
- cmpwi r12, BOOK3S_INTERRUPT_DATA_STORAGE
- beq ld_last_inst
- cmpwi r12, BOOK3S_INTERRUPT_PROGRAM
- beq ld_last_inst
-
- b no_ld_last_inst
-
-ld_last_inst:
- /* Save off the guest instruction we're at */
-
- /* Set guest mode to 'jump over instruction' so if lwz faults
- * we'll just continue at the next IP. */
- li r9, KVM_GUEST_MODE_SKIP
- stb r9, PACA_KVM_IN_GUEST(r13)
-
- /* 1) enable paging for data */
- mfmsr r9
- ori r11, r9, MSR_DR /* Enable paging for data */
- mtmsr r11
- /* 2) fetch the instruction */
- li r0, KVM_INST_FETCH_FAILED /* In case lwz faults */
- lwz r0, 0(r3)
- /* 3) disable paging again */
- mtmsr r9
-
-no_ld_last_inst:
-
- /* Unset guest mode */
- li r9, KVM_GUEST_MODE_NONE
- stb r9, PACA_KVM_IN_GUEST(r13)
-
/* Restore bolted entries from the shadow and fix it along the way */
/* We don't store anything in entry 0, so we don't need to take care of it */
@@ -275,28 +164,4 @@ no_ld_last_inst:
slb_do_exit:
- /* Register usage at this point:
- *
- * R0 = guest last inst
- * R1 = host R1
- * R2 = host R2
- * R3 = guest PC
- * R4 = guest MSR
- * R5 = guest DAR
- * R6 = guest DSISR
- * R12 = exit handler id
- * R13 = PACA
- * PACA.KVM.* = guest *
- *
- */
-
- /* RFI into the highmem handler */
- mfmsr r7
- ori r7, r7, MSR_IR|MSR_DR|MSR_RI /* Enable paging */
- mtsrr1 r7
- ld r8, PACA_KVM_VMHANDLER(r13) /* Highmem handler address */
- mtsrr0 r8
-
- RFI
-kvmppc_handler_trampoline_exit_end:
-
+.endm
diff --git a/arch/powerpc/kvm/book3s_64_emulate.c b/arch/powerpc/kvm/book3s_emulate.c
index 2b0ee7e..c85f906 100644
--- a/arch/powerpc/kvm/book3s_64_emulate.c
+++ b/arch/powerpc/kvm/book3s_emulate.c
@@ -28,13 +28,16 @@
#define OP_31_XOP_MFMSR 83
#define OP_31_XOP_MTMSR 146
#define OP_31_XOP_MTMSRD 178
+#define OP_31_XOP_MTSR 210
#define OP_31_XOP_MTSRIN 242
#define OP_31_XOP_TLBIEL 274
#define OP_31_XOP_TLBIE 306
#define OP_31_XOP_SLBMTE 402
#define OP_31_XOP_SLBIE 434
#define OP_31_XOP_SLBIA 498
+#define OP_31_XOP_MFSR 595
#define OP_31_XOP_MFSRIN 659
+#define OP_31_XOP_DCBA 758
#define OP_31_XOP_SLBMFEV 851
#define OP_31_XOP_EIOIO 854
#define OP_31_XOP_SLBMFEE 915
@@ -42,6 +45,24 @@
/* DCBZ is actually 1014, but we patch it to 1010 so we get a trap */
#define OP_31_XOP_DCBZ 1010
+#define OP_LFS 48
+#define OP_LFD 50
+#define OP_STFS 52
+#define OP_STFD 54
+
+#define SPRN_GQR0 912
+#define SPRN_GQR1 913
+#define SPRN_GQR2 914
+#define SPRN_GQR3 915
+#define SPRN_GQR4 916
+#define SPRN_GQR5 917
+#define SPRN_GQR6 918
+#define SPRN_GQR7 919
+
+/* Book3S_32 defines mfsrin(v) - but that messes up our abstract
+ * function pointers, so let's just disable the define. */
+#undef mfsrin
+
int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,
unsigned int inst, int *advance)
{
@@ -52,7 +73,7 @@ int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,
switch (get_xop(inst)) {
case OP_19_XOP_RFID:
case OP_19_XOP_RFI:
- vcpu->arch.pc = vcpu->arch.srr0;
+ kvmppc_set_pc(vcpu, vcpu->arch.srr0);
kvmppc_set_msr(vcpu, vcpu->arch.srr1);
*advance = 0;
break;
@@ -80,6 +101,18 @@ int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,
case OP_31_XOP_MTMSR:
kvmppc_set_msr(vcpu, kvmppc_get_gpr(vcpu, get_rs(inst)));
break;
+ case OP_31_XOP_MFSR:
+ {
+ int srnum;
+
+ srnum = kvmppc_get_field(inst, 12 + 32, 15 + 32);
+ if (vcpu->arch.mmu.mfsrin) {
+ u32 sr;
+ sr = vcpu->arch.mmu.mfsrin(vcpu, srnum);
+ kvmppc_set_gpr(vcpu, get_rt(inst), sr);
+ }
+ break;
+ }
case OP_31_XOP_MFSRIN:
{
int srnum;
@@ -92,6 +125,11 @@ int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,
}
break;
}
+ case OP_31_XOP_MTSR:
+ vcpu->arch.mmu.mtsrin(vcpu,
+ (inst >> 16) & 0xf,
+ kvmppc_get_gpr(vcpu, get_rs(inst)));
+ break;
case OP_31_XOP_MTSRIN:
vcpu->arch.mmu.mtsrin(vcpu,
(kvmppc_get_gpr(vcpu, get_rb(inst)) >> 28) & 0xf,
@@ -150,12 +188,17 @@ int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,
kvmppc_set_gpr(vcpu, get_rt(inst), t);
}
break;
+ case OP_31_XOP_DCBA:
+ /* Gets treated as NOP */
+ break;
case OP_31_XOP_DCBZ:
{
ulong rb = kvmppc_get_gpr(vcpu, get_rb(inst));
ulong ra = 0;
- ulong addr;
+ ulong addr, vaddr;
u32 zeros[8] = { 0, 0, 0, 0, 0, 0, 0, 0 };
+ u32 dsisr;
+ int r;
if (get_ra(inst))
ra = kvmppc_get_gpr(vcpu, get_ra(inst));
@@ -163,15 +206,25 @@ int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,
addr = (ra + rb) & ~31ULL;
if (!(vcpu->arch.msr & MSR_SF))
addr &= 0xffffffff;
+ vaddr = addr;
+
+ r = kvmppc_st(vcpu, &addr, 32, zeros, true);
+ if ((r == -ENOENT) || (r == -EPERM)) {
+ *advance = 0;
+ vcpu->arch.dear = vaddr;
+ to_svcpu(vcpu)->fault_dar = vaddr;
+
+ dsisr = DSISR_ISSTORE;
+ if (r == -ENOENT)
+ dsisr |= DSISR_NOHPTE;
+ else if (r == -EPERM)
+ dsisr |= DSISR_PROTFAULT;
+
+ to_book3s(vcpu)->dsisr = dsisr;
+ to_svcpu(vcpu)->fault_dsisr = dsisr;
- if (kvmppc_st(vcpu, addr, 32, zeros)) {
- vcpu->arch.dear = addr;
- vcpu->arch.fault_dear = addr;
- to_book3s(vcpu)->dsisr = DSISR_PROTFAULT |
- DSISR_ISSTORE;
kvmppc_book3s_queue_irqprio(vcpu,
BOOK3S_INTERRUPT_DATA_STORAGE);
- kvmppc_mmu_pte_flush(vcpu, addr, ~0xFFFULL);
}
break;
@@ -184,6 +237,9 @@ int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,
emulated = EMULATE_FAIL;
}
+ if (emulated == EMULATE_FAIL)
+ emulated = kvmppc_emulate_paired_single(run, vcpu);
+
return emulated;
}
@@ -207,6 +263,34 @@ void kvmppc_set_bat(struct kvm_vcpu *vcpu, struct kvmppc_bat *bat, bool upper,
}
}
+static u32 kvmppc_read_bat(struct kvm_vcpu *vcpu, int sprn)
+{
+ struct kvmppc_vcpu_book3s *vcpu_book3s = to_book3s(vcpu);
+ struct kvmppc_bat *bat;
+
+ switch (sprn) {
+ case SPRN_IBAT0U ... SPRN_IBAT3L:
+ bat = &vcpu_book3s->ibat[(sprn - SPRN_IBAT0U) / 2];
+ break;
+ case SPRN_IBAT4U ... SPRN_IBAT7L:
+ bat = &vcpu_book3s->ibat[4 + ((sprn - SPRN_IBAT4U) / 2)];
+ break;
+ case SPRN_DBAT0U ... SPRN_DBAT3L:
+ bat = &vcpu_book3s->dbat[(sprn - SPRN_DBAT0U) / 2];
+ break;
+ case SPRN_DBAT4U ... SPRN_DBAT7L:
+ bat = &vcpu_book3s->dbat[4 + ((sprn - SPRN_DBAT4U) / 2)];
+ break;
+ default:
+ BUG();
+ }
+
+ if (sprn % 2)
+ return bat->raw >> 32;
+ else
+ return bat->raw;
+}
+
static void kvmppc_write_bat(struct kvm_vcpu *vcpu, int sprn, u32 val)
{
struct kvmppc_vcpu_book3s *vcpu_book3s = to_book3s(vcpu);
@@ -217,13 +301,13 @@ static void kvmppc_write_bat(struct kvm_vcpu *vcpu, int sprn, u32 val)
bat = &vcpu_book3s->ibat[(sprn - SPRN_IBAT0U) / 2];
break;
case SPRN_IBAT4U ... SPRN_IBAT7L:
- bat = &vcpu_book3s->ibat[(sprn - SPRN_IBAT4U) / 2];
+ bat = &vcpu_book3s->ibat[4 + ((sprn - SPRN_IBAT4U) / 2)];
break;
case SPRN_DBAT0U ... SPRN_DBAT3L:
bat = &vcpu_book3s->dbat[(sprn - SPRN_DBAT0U) / 2];
break;
case SPRN_DBAT4U ... SPRN_DBAT7L:
- bat = &vcpu_book3s->dbat[(sprn - SPRN_DBAT4U) / 2];
+ bat = &vcpu_book3s->dbat[4 + ((sprn - SPRN_DBAT4U) / 2)];
break;
default:
BUG();
@@ -258,6 +342,7 @@ int kvmppc_core_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs)
/* BAT writes happen so rarely that we're ok to flush
* everything here */
kvmppc_mmu_pte_flush(vcpu, 0, 0);
+ kvmppc_mmu_flush_segments(vcpu);
break;
case SPRN_HID0:
to_book3s(vcpu)->hid[0] = spr_val;
@@ -268,7 +353,32 @@ int kvmppc_core_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs)
case SPRN_HID2:
to_book3s(vcpu)->hid[2] = spr_val;
break;
+ case SPRN_HID2_GEKKO:
+ to_book3s(vcpu)->hid[2] = spr_val;
+ /* HID2.PSE controls paired single on gekko */
+ switch (vcpu->arch.pvr) {
+ case 0x00080200: /* lonestar 2.0 */
+ case 0x00088202: /* lonestar 2.2 */
+ case 0x70000100: /* gekko 1.0 */
+ case 0x00080100: /* gekko 2.0 */
+ case 0x00083203: /* gekko 2.3a */
+ case 0x00083213: /* gekko 2.3b */
+ case 0x00083204: /* gekko 2.4 */
+ case 0x00083214: /* gekko 2.4e (8SE) - retail HW2 */
+ case 0x00087200: /* broadway */
+ if (vcpu->arch.hflags & BOOK3S_HFLAG_NATIVE_PS) {
+ /* Native paired singles */
+ } else if (spr_val & (1 << 29)) { /* HID2.PSE */
+ vcpu->arch.hflags |= BOOK3S_HFLAG_PAIRED_SINGLE;
+ kvmppc_giveup_ext(vcpu, MSR_FP);
+ } else {
+ vcpu->arch.hflags &= ~BOOK3S_HFLAG_PAIRED_SINGLE;
+ }
+ break;
+ }
+ break;
case SPRN_HID4:
+ case SPRN_HID4_GEKKO:
to_book3s(vcpu)->hid[4] = spr_val;
break;
case SPRN_HID5:
@@ -278,12 +388,30 @@ int kvmppc_core_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs)
(mfmsr() & MSR_HV))
vcpu->arch.hflags |= BOOK3S_HFLAG_DCBZ32;
break;
+ case SPRN_GQR0:
+ case SPRN_GQR1:
+ case SPRN_GQR2:
+ case SPRN_GQR3:
+ case SPRN_GQR4:
+ case SPRN_GQR5:
+ case SPRN_GQR6:
+ case SPRN_GQR7:
+ to_book3s(vcpu)->gqr[sprn - SPRN_GQR0] = spr_val;
+ break;
case SPRN_ICTC:
case SPRN_THRM1:
case SPRN_THRM2:
case SPRN_THRM3:
case SPRN_CTRLF:
case SPRN_CTRLT:
+ case SPRN_L2CR:
+ case SPRN_MMCR0_GEKKO:
+ case SPRN_MMCR1_GEKKO:
+ case SPRN_PMC1_GEKKO:
+ case SPRN_PMC2_GEKKO:
+ case SPRN_PMC3_GEKKO:
+ case SPRN_PMC4_GEKKO:
+ case SPRN_WPAR_GEKKO:
break;
default:
printk(KERN_INFO "KVM: invalid SPR write: %d\n", sprn);
@@ -301,6 +429,12 @@ int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, int rt)
int emulated = EMULATE_DONE;
switch (sprn) {
+ case SPRN_IBAT0U ... SPRN_IBAT3L:
+ case SPRN_IBAT4U ... SPRN_IBAT7L:
+ case SPRN_DBAT0U ... SPRN_DBAT3L:
+ case SPRN_DBAT4U ... SPRN_DBAT7L:
+ kvmppc_set_gpr(vcpu, rt, kvmppc_read_bat(vcpu, sprn));
+ break;
case SPRN_SDR1:
kvmppc_set_gpr(vcpu, rt, to_book3s(vcpu)->sdr1);
break;
@@ -320,19 +454,40 @@ int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, int rt)
kvmppc_set_gpr(vcpu, rt, to_book3s(vcpu)->hid[1]);
break;
case SPRN_HID2:
+ case SPRN_HID2_GEKKO:
kvmppc_set_gpr(vcpu, rt, to_book3s(vcpu)->hid[2]);
break;
case SPRN_HID4:
+ case SPRN_HID4_GEKKO:
kvmppc_set_gpr(vcpu, rt, to_book3s(vcpu)->hid[4]);
break;
case SPRN_HID5:
kvmppc_set_gpr(vcpu, rt, to_book3s(vcpu)->hid[5]);
break;
+ case SPRN_GQR0:
+ case SPRN_GQR1:
+ case SPRN_GQR2:
+ case SPRN_GQR3:
+ case SPRN_GQR4:
+ case SPRN_GQR5:
+ case SPRN_GQR6:
+ case SPRN_GQR7:
+ kvmppc_set_gpr(vcpu, rt,
+ to_book3s(vcpu)->gqr[sprn - SPRN_GQR0]);
+ break;
case SPRN_THRM1:
case SPRN_THRM2:
case SPRN_THRM3:
case SPRN_CTRLF:
case SPRN_CTRLT:
+ case SPRN_L2CR:
+ case SPRN_MMCR0_GEKKO:
+ case SPRN_MMCR1_GEKKO:
+ case SPRN_PMC1_GEKKO:
+ case SPRN_PMC2_GEKKO:
+ case SPRN_PMC3_GEKKO:
+ case SPRN_PMC4_GEKKO:
+ case SPRN_WPAR_GEKKO:
kvmppc_set_gpr(vcpu, rt, 0);
break;
default:
@@ -346,3 +501,73 @@ int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, int rt)
return emulated;
}
+u32 kvmppc_alignment_dsisr(struct kvm_vcpu *vcpu, unsigned int inst)
+{
+ u32 dsisr = 0;
+
+ /*
+ * This is what the spec says about DSISR bits (not mentioned = 0):
+ *
+ * 12:13 [DS] Set to bits 30:31
+ * 15:16 [X] Set to bits 29:30
+ * 17 [X] Set to bit 25
+ * [D/DS] Set to bit 5
+ * 18:21 [X] Set to bits 21:24
+ * [D/DS] Set to bits 1:4
+ * 22:26 Set to bits 6:10 (RT/RS/FRT/FRS)
+ * 27:31 Set to bits 11:15 (RA)
+ */
+
+ switch (get_op(inst)) {
+ /* D-form */
+ case OP_LFS:
+ case OP_LFD:
+ case OP_STFD:
+ case OP_STFS:
+ dsisr |= (inst >> 12) & 0x4000; /* bit 17 */
+ dsisr |= (inst >> 17) & 0x3c00; /* bits 18:21 */
+ break;
+ /* X-form */
+ case 31:
+ dsisr |= (inst << 14) & 0x18000; /* bits 15:16 */
+ dsisr |= (inst << 8) & 0x04000; /* bit 17 */
+ dsisr |= (inst << 3) & 0x03c00; /* bits 18:21 */
+ break;
+ default:
+ printk(KERN_INFO "KVM: Unaligned instruction 0x%x\n", inst);
+ break;
+ }
+
+ dsisr |= (inst >> 16) & 0x03ff; /* bits 22:31 */
+
+ return dsisr;
+}
+
+ulong kvmppc_alignment_dar(struct kvm_vcpu *vcpu, unsigned int inst)
+{
+ ulong dar = 0;
+ ulong ra;
+
+ switch (get_op(inst)) {
+ case OP_LFS:
+ case OP_LFD:
+ case OP_STFD:
+ case OP_STFS:
+ ra = get_ra(inst);
+ if (ra)
+ dar = kvmppc_get_gpr(vcpu, ra);
+ dar += (s32)((s16)inst);
+ break;
+ case 31:
+ ra = get_ra(inst);
+ if (ra)
+ dar = kvmppc_get_gpr(vcpu, ra);
+ dar += kvmppc_get_gpr(vcpu, get_rb(inst));
+ break;
+ default:
+ printk(KERN_INFO "KVM: Unaligned instruction 0x%x\n", inst);
+ break;
+ }
+
+ return dar;
+}
diff --git a/arch/powerpc/kvm/book3s_64_exports.c b/arch/powerpc/kvm/book3s_exports.c
index 1dd5a1d..1dd5a1d 100644
--- a/arch/powerpc/kvm/book3s_64_exports.c
+++ b/arch/powerpc/kvm/book3s_exports.c
diff --git a/arch/powerpc/kvm/book3s_64_interrupts.S b/arch/powerpc/kvm/book3s_interrupts.S
index c1584d0..2f0bc92 100644
--- a/arch/powerpc/kvm/book3s_64_interrupts.S
+++ b/arch/powerpc/kvm/book3s_interrupts.S
@@ -24,36 +24,56 @@
#include <asm/asm-offsets.h>
#include <asm/exception-64s.h>
-#define KVMPPC_HANDLE_EXIT .kvmppc_handle_exit
-#define ULONG_SIZE 8
-#define VCPU_GPR(n) (VCPU_GPRS + (n * ULONG_SIZE))
+#if defined(CONFIG_PPC_BOOK3S_64)
-.macro DISABLE_INTERRUPTS
- mfmsr r0
- rldicl r0,r0,48,1
- rotldi r0,r0,16
- mtmsrd r0,1
-.endm
+#define ULONG_SIZE 8
+#define FUNC(name) GLUE(.,name)
+#define GET_SHADOW_VCPU(reg) \
+ addi reg, r13, PACA_KVM_SVCPU
+
+#define DISABLE_INTERRUPTS \
+ mfmsr r0; \
+ rldicl r0,r0,48,1; \
+ rotldi r0,r0,16; \
+ mtmsrd r0,1; \
+
+#elif defined(CONFIG_PPC_BOOK3S_32)
+
+#define ULONG_SIZE 4
+#define FUNC(name) name
+
+#define GET_SHADOW_VCPU(reg) \
+ lwz reg, (THREAD + THREAD_KVM_SVCPU)(r2)
+
+#define DISABLE_INTERRUPTS \
+ mfmsr r0; \
+ rlwinm r0,r0,0,17,15; \
+ mtmsr r0; \
+
+#endif /* CONFIG_PPC_BOOK3S_XX */
+
+
+#define VCPU_GPR(n) (VCPU_GPRS + (n * ULONG_SIZE))
#define VCPU_LOAD_NVGPRS(vcpu) \
- ld r14, VCPU_GPR(r14)(vcpu); \
- ld r15, VCPU_GPR(r15)(vcpu); \
- ld r16, VCPU_GPR(r16)(vcpu); \
- ld r17, VCPU_GPR(r17)(vcpu); \
- ld r18, VCPU_GPR(r18)(vcpu); \
- ld r19, VCPU_GPR(r19)(vcpu); \
- ld r20, VCPU_GPR(r20)(vcpu); \
- ld r21, VCPU_GPR(r21)(vcpu); \
- ld r22, VCPU_GPR(r22)(vcpu); \
- ld r23, VCPU_GPR(r23)(vcpu); \
- ld r24, VCPU_GPR(r24)(vcpu); \
- ld r25, VCPU_GPR(r25)(vcpu); \
- ld r26, VCPU_GPR(r26)(vcpu); \
- ld r27, VCPU_GPR(r27)(vcpu); \
- ld r28, VCPU_GPR(r28)(vcpu); \
- ld r29, VCPU_GPR(r29)(vcpu); \
- ld r30, VCPU_GPR(r30)(vcpu); \
- ld r31, VCPU_GPR(r31)(vcpu); \
+ PPC_LL r14, VCPU_GPR(r14)(vcpu); \
+ PPC_LL r15, VCPU_GPR(r15)(vcpu); \
+ PPC_LL r16, VCPU_GPR(r16)(vcpu); \
+ PPC_LL r17, VCPU_GPR(r17)(vcpu); \
+ PPC_LL r18, VCPU_GPR(r18)(vcpu); \
+ PPC_LL r19, VCPU_GPR(r19)(vcpu); \
+ PPC_LL r20, VCPU_GPR(r20)(vcpu); \
+ PPC_LL r21, VCPU_GPR(r21)(vcpu); \
+ PPC_LL r22, VCPU_GPR(r22)(vcpu); \
+ PPC_LL r23, VCPU_GPR(r23)(vcpu); \
+ PPC_LL r24, VCPU_GPR(r24)(vcpu); \
+ PPC_LL r25, VCPU_GPR(r25)(vcpu); \
+ PPC_LL r26, VCPU_GPR(r26)(vcpu); \
+ PPC_LL r27, VCPU_GPR(r27)(vcpu); \
+ PPC_LL r28, VCPU_GPR(r28)(vcpu); \
+ PPC_LL r29, VCPU_GPR(r29)(vcpu); \
+ PPC_LL r30, VCPU_GPR(r30)(vcpu); \
+ PPC_LL r31, VCPU_GPR(r31)(vcpu); \
/*****************************************************************************
* *
@@ -69,11 +89,11 @@ _GLOBAL(__kvmppc_vcpu_entry)
kvm_start_entry:
/* Write correct stack frame */
- mflr r0
- std r0,16(r1)
+ mflr r0
+ PPC_STL r0,PPC_LR_STKOFF(r1)
/* Save host state to the stack */
- stdu r1, -SWITCH_FRAME_SIZE(r1)
+ PPC_STLU r1, -SWITCH_FRAME_SIZE(r1)
/* Save r3 (kvm_run) and r4 (vcpu) */
SAVE_2GPRS(3, r1)
@@ -82,33 +102,28 @@ kvm_start_entry:
SAVE_NVGPRS(r1)
/* Save LR */
- std r0, _LINK(r1)
+ PPC_STL r0, _LINK(r1)
/* Load non-volatile guest state from the vcpu */
VCPU_LOAD_NVGPRS(r4)
+ GET_SHADOW_VCPU(r5)
+
/* Save R1/R2 in the PACA */
- std r1, PACA_KVM_HOST_R1(r13)
- std r2, PACA_KVM_HOST_R2(r13)
+ PPC_STL r1, SVCPU_HOST_R1(r5)
+ PPC_STL r2, SVCPU_HOST_R2(r5)
/* XXX swap in/out on load? */
- ld r3, VCPU_HIGHMEM_HANDLER(r4)
- std r3, PACA_KVM_VMHANDLER(r13)
+ PPC_LL r3, VCPU_HIGHMEM_HANDLER(r4)
+ PPC_STL r3, SVCPU_VMHANDLER(r5)
kvm_start_lightweight:
- ld r9, VCPU_PC(r4) /* r9 = vcpu->arch.pc */
- ld r10, VCPU_SHADOW_MSR(r4) /* r10 = vcpu->arch.shadow_msr */
-
- /* Load some guest state in the respective registers */
- ld r5, VCPU_CTR(r4) /* r5 = vcpu->arch.ctr */
- /* will be swapped in by rmcall */
-
- ld r3, VCPU_LR(r4) /* r3 = vcpu->arch.lr */
- mtlr r3 /* LR = r3 */
+ PPC_LL r10, VCPU_SHADOW_MSR(r4) /* r10 = vcpu->arch.shadow_msr */
DISABLE_INTERRUPTS
+#ifdef CONFIG_PPC_BOOK3S_64
/* Some guests may need to have dcbz set to 32 byte length.
*
* Usually we ensure that by patching the guest's instructions
@@ -118,7 +133,7 @@ kvm_start_lightweight:
* because that's a lot faster.
*/
- ld r3, VCPU_HFLAGS(r4)
+ PPC_LL r3, VCPU_HFLAGS(r4)
rldicl. r3, r3, 0, 63 /* CR = ((r3 & 1) == 0) */
beq no_dcbz32_on
@@ -128,13 +143,15 @@ kvm_start_lightweight:
no_dcbz32_on:
- ld r6, VCPU_RMCALL(r4)
+#endif /* CONFIG_PPC_BOOK3S_64 */
+
+ PPC_LL r6, VCPU_RMCALL(r4)
mtctr r6
- ld r3, VCPU_TRAMPOLINE_ENTER(r4)
+ PPC_LL r3, VCPU_TRAMPOLINE_ENTER(r4)
LOAD_REG_IMMEDIATE(r4, MSR_KERNEL & ~(MSR_IR | MSR_DR))
- /* Jump to SLB patching handlder and into our guest */
+ /* Jump to segment patching handler and into our guest */
bctr
/*
@@ -149,31 +166,20 @@ kvmppc_handler_highmem:
/*
* Register usage at this point:
*
- * R0 = guest last inst
- * R1 = host R1
- * R2 = host R2
- * R3 = guest PC
- * R4 = guest MSR
- * R5 = guest DAR
- * R6 = guest DSISR
- * R13 = PACA
- * PACA.KVM.* = guest *
+ * R1 = host R1
+ * R2 = host R2
+ * R12 = exit handler id
+ * R13 = PACA
+ * SVCPU.* = guest *
*
*/
/* R7 = vcpu */
- ld r7, GPR4(r1)
+ PPC_LL r7, GPR4(r1)
- /* Now save the guest state */
+#ifdef CONFIG_PPC_BOOK3S_64
- stw r0, VCPU_LAST_INST(r7)
-
- std r3, VCPU_PC(r7)
- std r4, VCPU_SHADOW_SRR1(r7)
- std r5, VCPU_FAULT_DEAR(r7)
- std r6, VCPU_FAULT_DSISR(r7)
-
- ld r5, VCPU_HFLAGS(r7)
+ PPC_LL r5, VCPU_HFLAGS(r7)
rldicl. r5, r5, 0, 63 /* CR = ((r5 & 1) == 0) */
beq no_dcbz32_off
@@ -184,35 +190,29 @@ kvmppc_handler_highmem:
no_dcbz32_off:
- std r14, VCPU_GPR(r14)(r7)
- std r15, VCPU_GPR(r15)(r7)
- std r16, VCPU_GPR(r16)(r7)
- std r17, VCPU_GPR(r17)(r7)
- std r18, VCPU_GPR(r18)(r7)
- std r19, VCPU_GPR(r19)(r7)
- std r20, VCPU_GPR(r20)(r7)
- std r21, VCPU_GPR(r21)(r7)
- std r22, VCPU_GPR(r22)(r7)
- std r23, VCPU_GPR(r23)(r7)
- std r24, VCPU_GPR(r24)(r7)
- std r25, VCPU_GPR(r25)(r7)
- std r26, VCPU_GPR(r26)(r7)
- std r27, VCPU_GPR(r27)(r7)
- std r28, VCPU_GPR(r28)(r7)
- std r29, VCPU_GPR(r29)(r7)
- std r30, VCPU_GPR(r30)(r7)
- std r31, VCPU_GPR(r31)(r7)
-
- /* Save guest CTR */
- mfctr r5
- std r5, VCPU_CTR(r7)
-
- /* Save guest LR */
- mflr r5
- std r5, VCPU_LR(r7)
+#endif /* CONFIG_PPC_BOOK3S_64 */
+
+ PPC_STL r14, VCPU_GPR(r14)(r7)
+ PPC_STL r15, VCPU_GPR(r15)(r7)
+ PPC_STL r16, VCPU_GPR(r16)(r7)
+ PPC_STL r17, VCPU_GPR(r17)(r7)
+ PPC_STL r18, VCPU_GPR(r18)(r7)
+ PPC_STL r19, VCPU_GPR(r19)(r7)
+ PPC_STL r20, VCPU_GPR(r20)(r7)
+ PPC_STL r21, VCPU_GPR(r21)(r7)
+ PPC_STL r22, VCPU_GPR(r22)(r7)
+ PPC_STL r23, VCPU_GPR(r23)(r7)
+ PPC_STL r24, VCPU_GPR(r24)(r7)
+ PPC_STL r25, VCPU_GPR(r25)(r7)
+ PPC_STL r26, VCPU_GPR(r26)(r7)
+ PPC_STL r27, VCPU_GPR(r27)(r7)
+ PPC_STL r28, VCPU_GPR(r28)(r7)
+ PPC_STL r29, VCPU_GPR(r29)(r7)
+ PPC_STL r30, VCPU_GPR(r30)(r7)
+ PPC_STL r31, VCPU_GPR(r31)(r7)
/* Restore host msr -> SRR1 */
- ld r6, VCPU_HOST_MSR(r7)
+ PPC_LL r6, VCPU_HOST_MSR(r7)
/*
* For some interrupts, we need to call the real Linux
@@ -228,9 +228,12 @@ no_dcbz32_off:
beq call_linux_handler
cmpwi r12, BOOK3S_INTERRUPT_DECREMENTER
beq call_linux_handler
+ cmpwi r12, BOOK3S_INTERRUPT_PERFMON
+ beq call_linux_handler
/* Back to EE=1 */
mtmsr r6
+ sync
b kvm_return_point
call_linux_handler:
@@ -249,14 +252,14 @@ call_linux_handler:
*/
/* Restore host IP -> SRR0 */
- ld r5, VCPU_HOST_RETIP(r7)
+ PPC_LL r5, VCPU_HOST_RETIP(r7)
/* XXX Better move to a safe function?
* What if we get an HTAB flush in between mtsrr0 and mtsrr1? */
mtlr r12
- ld r4, VCPU_TRAMPOLINE_LOWMEM(r7)
+ PPC_LL r4, VCPU_TRAMPOLINE_LOWMEM(r7)
mtsrr0 r4
LOAD_REG_IMMEDIATE(r3, MSR_KERNEL & ~(MSR_IR | MSR_DR))
mtsrr1 r3
@@ -274,7 +277,7 @@ kvm_return_point:
/* Restore r3 (kvm_run) and r4 (vcpu) */
REST_2GPRS(3, r1)
- bl KVMPPC_HANDLE_EXIT
+ bl FUNC(kvmppc_handle_exit)
/* If RESUME_GUEST, get back in the loop */
cmpwi r3, RESUME_GUEST
@@ -285,7 +288,7 @@ kvm_return_point:
kvm_exit_loop:
- ld r4, _LINK(r1)
+ PPC_LL r4, _LINK(r1)
mtlr r4
/* Restore non-volatile host registers (r14 - r31) */
@@ -296,8 +299,8 @@ kvm_exit_loop:
kvm_loop_heavyweight:
- ld r4, _LINK(r1)
- std r4, (16 + SWITCH_FRAME_SIZE)(r1)
+ PPC_LL r4, _LINK(r1)
+ PPC_STL r4, (PPC_LR_STKOFF + SWITCH_FRAME_SIZE)(r1)
/* Load vcpu and cpu_run */
REST_2GPRS(3, r1)
@@ -315,4 +318,3 @@ kvm_loop_lightweight:
/* Jump back into the beginning of this function */
b kvm_start_lightweight
-
diff --git a/arch/powerpc/kvm/book3s_paired_singles.c b/arch/powerpc/kvm/book3s_paired_singles.c
new file mode 100644
index 0000000..a9f66ab
--- /dev/null
+++ b/arch/powerpc/kvm/book3s_paired_singles.c
@@ -0,0 +1,1289 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ *
+ * Copyright Novell Inc 2010
+ *
+ * Authors: Alexander Graf <agraf@suse.de>
+ */
+
+#include <asm/kvm.h>
+#include <asm/kvm_ppc.h>
+#include <asm/disassemble.h>
+#include <asm/kvm_book3s.h>
+#include <asm/kvm_fpu.h>
+#include <asm/reg.h>
+#include <asm/cacheflush.h>
+#include <linux/vmalloc.h>
+
+/* #define DEBUG */
+
+#ifdef DEBUG
+#define dprintk printk
+#else
+#define dprintk(...) do { } while(0);
+#endif
+
+#define OP_LFS 48
+#define OP_LFSU 49
+#define OP_LFD 50
+#define OP_LFDU 51
+#define OP_STFS 52
+#define OP_STFSU 53
+#define OP_STFD 54
+#define OP_STFDU 55
+#define OP_PSQ_L 56
+#define OP_PSQ_LU 57
+#define OP_PSQ_ST 60
+#define OP_PSQ_STU 61
+
+#define OP_31_LFSX 535
+#define OP_31_LFSUX 567
+#define OP_31_LFDX 599
+#define OP_31_LFDUX 631
+#define OP_31_STFSX 663
+#define OP_31_STFSUX 695
+#define OP_31_STFX 727
+#define OP_31_STFUX 759
+#define OP_31_LWIZX 887
+#define OP_31_STFIWX 983
+
+#define OP_59_FADDS 21
+#define OP_59_FSUBS 20
+#define OP_59_FSQRTS 22
+#define OP_59_FDIVS 18
+#define OP_59_FRES 24
+#define OP_59_FMULS 25
+#define OP_59_FRSQRTES 26
+#define OP_59_FMSUBS 28
+#define OP_59_FMADDS 29
+#define OP_59_FNMSUBS 30
+#define OP_59_FNMADDS 31
+
+#define OP_63_FCMPU 0
+#define OP_63_FCPSGN 8
+#define OP_63_FRSP 12
+#define OP_63_FCTIW 14
+#define OP_63_FCTIWZ 15
+#define OP_63_FDIV 18
+#define OP_63_FADD 21
+#define OP_63_FSQRT 22
+#define OP_63_FSEL 23
+#define OP_63_FRE 24
+#define OP_63_FMUL 25
+#define OP_63_FRSQRTE 26
+#define OP_63_FMSUB 28
+#define OP_63_FMADD 29
+#define OP_63_FNMSUB 30
+#define OP_63_FNMADD 31
+#define OP_63_FCMPO 32
+#define OP_63_MTFSB1 38 // XXX
+#define OP_63_FSUB 20
+#define OP_63_FNEG 40
+#define OP_63_MCRFS 64
+#define OP_63_MTFSB0 70
+#define OP_63_FMR 72
+#define OP_63_MTFSFI 134
+#define OP_63_FABS 264
+#define OP_63_MFFS 583
+#define OP_63_MTFSF 711
+
+#define OP_4X_PS_CMPU0 0
+#define OP_4X_PSQ_LX 6
+#define OP_4XW_PSQ_STX 7
+#define OP_4A_PS_SUM0 10
+#define OP_4A_PS_SUM1 11
+#define OP_4A_PS_MULS0 12
+#define OP_4A_PS_MULS1 13
+#define OP_4A_PS_MADDS0 14
+#define OP_4A_PS_MADDS1 15
+#define OP_4A_PS_DIV 18
+#define OP_4A_PS_SUB 20
+#define OP_4A_PS_ADD 21
+#define OP_4A_PS_SEL 23
+#define OP_4A_PS_RES 24
+#define OP_4A_PS_MUL 25
+#define OP_4A_PS_RSQRTE 26
+#define OP_4A_PS_MSUB 28
+#define OP_4A_PS_MADD 29
+#define OP_4A_PS_NMSUB 30
+#define OP_4A_PS_NMADD 31
+#define OP_4X_PS_CMPO0 32
+#define OP_4X_PSQ_LUX 38
+#define OP_4XW_PSQ_STUX 39
+#define OP_4X_PS_NEG 40
+#define OP_4X_PS_CMPU1 64
+#define OP_4X_PS_MR 72
+#define OP_4X_PS_CMPO1 96
+#define OP_4X_PS_NABS 136
+#define OP_4X_PS_ABS 264
+#define OP_4X_PS_MERGE00 528
+#define OP_4X_PS_MERGE01 560
+#define OP_4X_PS_MERGE10 592
+#define OP_4X_PS_MERGE11 624
+
+#define SCALAR_NONE 0
+#define SCALAR_HIGH (1 << 0)
+#define SCALAR_LOW (1 << 1)
+#define SCALAR_NO_PS0 (1 << 2)
+#define SCALAR_NO_PS1 (1 << 3)
+
+#define GQR_ST_TYPE_MASK 0x00000007
+#define GQR_ST_TYPE_SHIFT 0
+#define GQR_ST_SCALE_MASK 0x00003f00
+#define GQR_ST_SCALE_SHIFT 8
+#define GQR_LD_TYPE_MASK 0x00070000
+#define GQR_LD_TYPE_SHIFT 16
+#define GQR_LD_SCALE_MASK 0x3f000000
+#define GQR_LD_SCALE_SHIFT 24
+
+#define GQR_QUANTIZE_FLOAT 0
+#define GQR_QUANTIZE_U8 4
+#define GQR_QUANTIZE_U16 5
+#define GQR_QUANTIZE_S8 6
+#define GQR_QUANTIZE_S16 7
+
+#define FPU_LS_SINGLE 0
+#define FPU_LS_DOUBLE 1
+#define FPU_LS_SINGLE_LOW 2
+
+static inline void kvmppc_sync_qpr(struct kvm_vcpu *vcpu, int rt)
+{
+ struct thread_struct t;
+
+ t.fpscr.val = vcpu->arch.fpscr;
+ cvt_df((double*)&vcpu->arch.fpr[rt], (float*)&vcpu->arch.qpr[rt], &t);
+}
+
+static void kvmppc_inject_pf(struct kvm_vcpu *vcpu, ulong eaddr, bool is_store)
+{
+ u64 dsisr;
+
+ vcpu->arch.msr = kvmppc_set_field(vcpu->arch.msr, 33, 36, 0);
+ vcpu->arch.msr = kvmppc_set_field(vcpu->arch.msr, 42, 47, 0);
+ vcpu->arch.dear = eaddr;
+ /* Page Fault */
+ dsisr = kvmppc_set_field(0, 33, 33, 1);
+ if (is_store)
+ to_book3s(vcpu)->dsisr = kvmppc_set_field(dsisr, 38, 38, 1);
+ kvmppc_book3s_queue_irqprio(vcpu, BOOK3S_INTERRUPT_DATA_STORAGE);
+}
+
+static int kvmppc_emulate_fpr_load(struct kvm_run *run, struct kvm_vcpu *vcpu,
+ int rs, ulong addr, int ls_type)
+{
+ int emulated = EMULATE_FAIL;
+ struct thread_struct t;
+ int r;
+ char tmp[8];
+ int len = sizeof(u32);
+
+ if (ls_type == FPU_LS_DOUBLE)
+ len = sizeof(u64);
+
+ t.fpscr.val = vcpu->arch.fpscr;
+
+ /* read from memory */
+ r = kvmppc_ld(vcpu, &addr, len, tmp, true);
+ vcpu->arch.paddr_accessed = addr;
+
+ if (r < 0) {
+ kvmppc_inject_pf(vcpu, addr, false);
+ goto done_load;
+ } else if (r == EMULATE_DO_MMIO) {
+ emulated = kvmppc_handle_load(run, vcpu, KVM_REG_FPR | rs, len, 1);
+ goto done_load;
+ }
+
+ emulated = EMULATE_DONE;
+
+ /* put in registers */
+ switch (ls_type) {
+ case FPU_LS_SINGLE:
+ cvt_fd((float*)tmp, (double*)&vcpu->arch.fpr[rs], &t);
+ vcpu->arch.qpr[rs] = *((u32*)tmp);
+ break;
+ case FPU_LS_DOUBLE:
+ vcpu->arch.fpr[rs] = *((u64*)tmp);
+ break;
+ }
+
+ dprintk(KERN_INFO "KVM: FPR_LD [0x%llx] at 0x%lx (%d)\n", *(u64*)tmp,
+ addr, len);
+
+done_load:
+ return emulated;
+}
+
+static int kvmppc_emulate_fpr_store(struct kvm_run *run, struct kvm_vcpu *vcpu,
+ int rs, ulong addr, int ls_type)
+{
+ int emulated = EMULATE_FAIL;
+ struct thread_struct t;
+ int r;
+ char tmp[8];
+ u64 val;
+ int len;
+
+ t.fpscr.val = vcpu->arch.fpscr;
+
+ switch (ls_type) {
+ case FPU_LS_SINGLE:
+ cvt_df((double*)&vcpu->arch.fpr[rs], (float*)tmp, &t);
+ val = *((u32*)tmp);
+ len = sizeof(u32);
+ break;
+ case FPU_LS_SINGLE_LOW:
+ *((u32*)tmp) = vcpu->arch.fpr[rs];
+ val = vcpu->arch.fpr[rs] & 0xffffffff;
+ len = sizeof(u32);
+ break;
+ case FPU_LS_DOUBLE:
+ *((u64*)tmp) = vcpu->arch.fpr[rs];
+ val = vcpu->arch.fpr[rs];
+ len = sizeof(u64);
+ break;
+ default:
+ val = 0;
+ len = 0;
+ }
+
+ r = kvmppc_st(vcpu, &addr, len, tmp, true);
+ vcpu->arch.paddr_accessed = addr;
+ if (r < 0) {
+ kvmppc_inject_pf(vcpu, addr, true);
+ } else if (r == EMULATE_DO_MMIO) {
+ emulated = kvmppc_handle_store(run, vcpu, val, len, 1);
+ } else {
+ emulated = EMULATE_DONE;
+ }
+
+ dprintk(KERN_INFO "KVM: FPR_ST [0x%llx] at 0x%lx (%d)\n",
+ val, addr, len);
+
+ return emulated;
+}
+
+static int kvmppc_emulate_psq_load(struct kvm_run *run, struct kvm_vcpu *vcpu,
+ int rs, ulong addr, bool w, int i)
+{
+ int emulated = EMULATE_FAIL;
+ struct thread_struct t;
+ int r;
+ float one = 1.0;
+ u32 tmp[2];
+
+ t.fpscr.val = vcpu->arch.fpscr;
+
+ /* read from memory */
+ if (w) {
+ r = kvmppc_ld(vcpu, &addr, sizeof(u32), tmp, true);
+ memcpy(&tmp[1], &one, sizeof(u32));
+ } else {
+ r = kvmppc_ld(vcpu, &addr, sizeof(u32) * 2, tmp, true);
+ }
+ vcpu->arch.paddr_accessed = addr;
+ if (r < 0) {
+ kvmppc_inject_pf(vcpu, addr, false);
+ goto done_load;
+ } else if ((r == EMULATE_DO_MMIO) && w) {
+ emulated = kvmppc_handle_load(run, vcpu, KVM_REG_FPR | rs, 4, 1);
+ vcpu->arch.qpr[rs] = tmp[1];
+ goto done_load;
+ } else if (r == EMULATE_DO_MMIO) {
+ emulated = kvmppc_handle_load(run, vcpu, KVM_REG_FQPR | rs, 8, 1);
+ goto done_load;
+ }
+
+ emulated = EMULATE_DONE;
+
+ /* put in registers */
+ cvt_fd((float*)&tmp[0], (double*)&vcpu->arch.fpr[rs], &t);
+ vcpu->arch.qpr[rs] = tmp[1];
+
+ dprintk(KERN_INFO "KVM: PSQ_LD [0x%x, 0x%x] at 0x%lx (%d)\n", tmp[0],
+ tmp[1], addr, w ? 4 : 8);
+
+done_load:
+ return emulated;
+}
+
+static int kvmppc_emulate_psq_store(struct kvm_run *run, struct kvm_vcpu *vcpu,
+ int rs, ulong addr, bool w, int i)
+{
+ int emulated = EMULATE_FAIL;
+ struct thread_struct t;
+ int r;
+ u32 tmp[2];
+ int len = w ? sizeof(u32) : sizeof(u64);
+
+ t.fpscr.val = vcpu->arch.fpscr;
+
+ cvt_df((double*)&vcpu->arch.fpr[rs], (float*)&tmp[0], &t);
+ tmp[1] = vcpu->arch.qpr[rs];
+
+ r = kvmppc_st(vcpu, &addr, len, tmp, true);
+ vcpu->arch.paddr_accessed = addr;
+ if (r < 0) {
+ kvmppc_inject_pf(vcpu, addr, true);
+ } else if ((r == EMULATE_DO_MMIO) && w) {
+ emulated = kvmppc_handle_store(run, vcpu, tmp[0], 4, 1);
+ } else if (r == EMULATE_DO_MMIO) {
+ u64 val = ((u64)tmp[0] << 32) | tmp[1];
+ emulated = kvmppc_handle_store(run, vcpu, val, 8, 1);
+ } else {
+ emulated = EMULATE_DONE;
+ }
+
+ dprintk(KERN_INFO "KVM: PSQ_ST [0x%x, 0x%x] at 0x%lx (%d)\n",
+ tmp[0], tmp[1], addr, len);
+
+ return emulated;
+}
+
+/*
+ * Cuts out inst bits with ordering according to spec.
+ * That means the leftmost bit is zero. All given bits are included.
+ */
+static inline u32 inst_get_field(u32 inst, int msb, int lsb)
+{
+ return kvmppc_get_field(inst, msb + 32, lsb + 32);
+}
+
+/*
+ * Replaces inst bits with ordering according to spec.
+ */
+static inline u32 inst_set_field(u32 inst, int msb, int lsb, int value)
+{
+ return kvmppc_set_field(inst, msb + 32, lsb + 32, value);
+}
+
+bool kvmppc_inst_is_paired_single(struct kvm_vcpu *vcpu, u32 inst)
+{
+ if (!(vcpu->arch.hflags & BOOK3S_HFLAG_PAIRED_SINGLE))
+ return false;
+
+ switch (get_op(inst)) {
+ case OP_PSQ_L:
+ case OP_PSQ_LU:
+ case OP_PSQ_ST:
+ case OP_PSQ_STU:
+ case OP_LFS:
+ case OP_LFSU:
+ case OP_LFD:
+ case OP_LFDU:
+ case OP_STFS:
+ case OP_STFSU:
+ case OP_STFD:
+ case OP_STFDU:
+ return true;
+ case 4:
+ /* X form */
+ switch (inst_get_field(inst, 21, 30)) {
+ case OP_4X_PS_CMPU0:
+ case OP_4X_PSQ_LX:
+ case OP_4X_PS_CMPO0:
+ case OP_4X_PSQ_LUX:
+ case OP_4X_PS_NEG:
+ case OP_4X_PS_CMPU1:
+ case OP_4X_PS_MR:
+ case OP_4X_PS_CMPO1:
+ case OP_4X_PS_NABS:
+ case OP_4X_PS_ABS:
+ case OP_4X_PS_MERGE00:
+ case OP_4X_PS_MERGE01:
+ case OP_4X_PS_MERGE10:
+ case OP_4X_PS_MERGE11:
+ return true;
+ }
+ /* XW form */
+ switch (inst_get_field(inst, 25, 30)) {
+ case OP_4XW_PSQ_STX:
+ case OP_4XW_PSQ_STUX:
+ return true;
+ }
+ /* A form */
+ switch (inst_get_field(inst, 26, 30)) {
+ case OP_4A_PS_SUM1:
+ case OP_4A_PS_SUM0:
+ case OP_4A_PS_MULS0:
+ case OP_4A_PS_MULS1:
+ case OP_4A_PS_MADDS0:
+ case OP_4A_PS_MADDS1:
+ case OP_4A_PS_DIV:
+ case OP_4A_PS_SUB:
+ case OP_4A_PS_ADD:
+ case OP_4A_PS_SEL:
+ case OP_4A_PS_RES:
+ case OP_4A_PS_MUL:
+ case OP_4A_PS_RSQRTE:
+ case OP_4A_PS_MSUB:
+ case OP_4A_PS_MADD:
+ case OP_4A_PS_NMSUB:
+ case OP_4A_PS_NMADD:
+ return true;
+ }
+ break;
+ case 59:
+ switch (inst_get_field(inst, 21, 30)) {
+ case OP_59_FADDS:
+ case OP_59_FSUBS:
+ case OP_59_FDIVS:
+ case OP_59_FRES:
+ case OP_59_FRSQRTES:
+ return true;
+ }
+ switch (inst_get_field(inst, 26, 30)) {
+ case OP_59_FMULS:
+ case OP_59_FMSUBS:
+ case OP_59_FMADDS:
+ case OP_59_FNMSUBS:
+ case OP_59_FNMADDS:
+ return true;
+ }
+ break;
+ case 63:
+ switch (inst_get_field(inst, 21, 30)) {
+ case OP_63_MTFSB0:
+ case OP_63_MTFSB1:
+ case OP_63_MTFSF:
+ case OP_63_MTFSFI:
+ case OP_63_MCRFS:
+ case OP_63_MFFS:
+ case OP_63_FCMPU:
+ case OP_63_FCMPO:
+ case OP_63_FNEG:
+ case OP_63_FMR:
+ case OP_63_FABS:
+ case OP_63_FRSP:
+ case OP_63_FDIV:
+ case OP_63_FADD:
+ case OP_63_FSUB:
+ case OP_63_FCTIW:
+ case OP_63_FCTIWZ:
+ case OP_63_FRSQRTE:
+ case OP_63_FCPSGN:
+ return true;
+ }
+ switch (inst_get_field(inst, 26, 30)) {
+ case OP_63_FMUL:
+ case OP_63_FSEL:
+ case OP_63_FMSUB:
+ case OP_63_FMADD:
+ case OP_63_FNMSUB:
+ case OP_63_FNMADD:
+ return true;
+ }
+ break;
+ case 31:
+ switch (inst_get_field(inst, 21, 30)) {
+ case OP_31_LFSX:
+ case OP_31_LFSUX:
+ case OP_31_LFDX:
+ case OP_31_LFDUX:
+ case OP_31_STFSX:
+ case OP_31_STFSUX:
+ case OP_31_STFX:
+ case OP_31_STFUX:
+ case OP_31_STFIWX:
+ return true;
+ }
+ break;
+ }
+
+ return false;
+}
+
+static int get_d_signext(u32 inst)
+{
+ int d = inst & 0x8ff;
+
+ if (d & 0x800)
+ return -(d & 0x7ff);
+
+ return (d & 0x7ff);
+}
+
+static int kvmppc_ps_three_in(struct kvm_vcpu *vcpu, bool rc,
+ int reg_out, int reg_in1, int reg_in2,
+ int reg_in3, int scalar,
+ void (*func)(struct thread_struct *t,
+ u32 *dst, u32 *src1,
+ u32 *src2, u32 *src3))
+{
+ u32 *qpr = vcpu->arch.qpr;
+ u64 *fpr = vcpu->arch.fpr;
+ u32 ps0_out;
+ u32 ps0_in1, ps0_in2, ps0_in3;
+ u32 ps1_in1, ps1_in2, ps1_in3;
+ struct thread_struct t;
+ t.fpscr.val = vcpu->arch.fpscr;
+
+ /* RC */
+ WARN_ON(rc);
+
+ /* PS0 */
+ cvt_df((double*)&fpr[reg_in1], (float*)&ps0_in1, &t);
+ cvt_df((double*)&fpr[reg_in2], (float*)&ps0_in2, &t);
+ cvt_df((double*)&fpr[reg_in3], (float*)&ps0_in3, &t);
+
+ if (scalar & SCALAR_LOW)
+ ps0_in2 = qpr[reg_in2];
+
+ func(&t, &ps0_out, &ps0_in1, &ps0_in2, &ps0_in3);
+
+ dprintk(KERN_INFO "PS3 ps0 -> f(0x%x, 0x%x, 0x%x) = 0x%x\n",
+ ps0_in1, ps0_in2, ps0_in3, ps0_out);
+
+ if (!(scalar & SCALAR_NO_PS0))
+ cvt_fd((float*)&ps0_out, (double*)&fpr[reg_out], &t);
+
+ /* PS1 */
+ ps1_in1 = qpr[reg_in1];
+ ps1_in2 = qpr[reg_in2];
+ ps1_in3 = qpr[reg_in3];
+
+ if (scalar & SCALAR_HIGH)
+ ps1_in2 = ps0_in2;
+
+ if (!(scalar & SCALAR_NO_PS1))
+ func(&t, &qpr[reg_out], &ps1_in1, &ps1_in2, &ps1_in3);
+
+ dprintk(KERN_INFO "PS3 ps1 -> f(0x%x, 0x%x, 0x%x) = 0x%x\n",
+ ps1_in1, ps1_in2, ps1_in3, qpr[reg_out]);
+
+ return EMULATE_DONE;
+}
+
+static int kvmppc_ps_two_in(struct kvm_vcpu *vcpu, bool rc,
+ int reg_out, int reg_in1, int reg_in2,
+ int scalar,
+ void (*func)(struct thread_struct *t,
+ u32 *dst, u32 *src1,
+ u32 *src2))
+{
+ u32 *qpr = vcpu->arch.qpr;
+ u64 *fpr = vcpu->arch.fpr;
+ u32 ps0_out;
+ u32 ps0_in1, ps0_in2;
+ u32 ps1_out;
+ u32 ps1_in1, ps1_in2;
+ struct thread_struct t;
+ t.fpscr.val = vcpu->arch.fpscr;
+
+ /* RC */
+ WARN_ON(rc);
+
+ /* PS0 */
+ cvt_df((double*)&fpr[reg_in1], (float*)&ps0_in1, &t);
+
+ if (scalar & SCALAR_LOW)
+ ps0_in2 = qpr[reg_in2];
+ else
+ cvt_df((double*)&fpr[reg_in2], (float*)&ps0_in2, &t);
+
+ func(&t, &ps0_out, &ps0_in1, &ps0_in2);
+
+ if (!(scalar & SCALAR_NO_PS0)) {
+ dprintk(KERN_INFO "PS2 ps0 -> f(0x%x, 0x%x) = 0x%x\n",
+ ps0_in1, ps0_in2, ps0_out);
+
+ cvt_fd((float*)&ps0_out, (double*)&fpr[reg_out], &t);
+ }
+
+ /* PS1 */
+ ps1_in1 = qpr[reg_in1];
+ ps1_in2 = qpr[reg_in2];
+
+ if (scalar & SCALAR_HIGH)
+ ps1_in2 = ps0_in2;
+
+ func(&t, &ps1_out, &ps1_in1, &ps1_in2);
+
+ if (!(scalar & SCALAR_NO_PS1)) {
+ qpr[reg_out] = ps1_out;
+
+ dprintk(KERN_INFO "PS2 ps1 -> f(0x%x, 0x%x) = 0x%x\n",
+ ps1_in1, ps1_in2, qpr[reg_out]);
+ }
+
+ return EMULATE_DONE;
+}
+
+static int kvmppc_ps_one_in(struct kvm_vcpu *vcpu, bool rc,
+ int reg_out, int reg_in,
+ void (*func)(struct thread_struct *t,
+ u32 *dst, u32 *src1))
+{
+ u32 *qpr = vcpu->arch.qpr;
+ u64 *fpr = vcpu->arch.fpr;
+ u32 ps0_out, ps0_in;
+ u32 ps1_in;
+ struct thread_struct t;
+ t.fpscr.val = vcpu->arch.fpscr;
+
+ /* RC */
+ WARN_ON(rc);
+
+ /* PS0 */
+ cvt_df((double*)&fpr[reg_in], (float*)&ps0_in, &t);
+ func(&t, &ps0_out, &ps0_in);
+
+ dprintk(KERN_INFO "PS1 ps0 -> f(0x%x) = 0x%x\n",
+ ps0_in, ps0_out);
+
+ cvt_fd((float*)&ps0_out, (double*)&fpr[reg_out], &t);
+
+ /* PS1 */
+ ps1_in = qpr[reg_in];
+ func(&t, &qpr[reg_out], &ps1_in);
+
+ dprintk(KERN_INFO "PS1 ps1 -> f(0x%x) = 0x%x\n",
+ ps1_in, qpr[reg_out]);
+
+ return EMULATE_DONE;
+}
+
+int kvmppc_emulate_paired_single(struct kvm_run *run, struct kvm_vcpu *vcpu)
+{
+ u32 inst = kvmppc_get_last_inst(vcpu);
+ enum emulation_result emulated = EMULATE_DONE;
+
+ int ax_rd = inst_get_field(inst, 6, 10);
+ int ax_ra = inst_get_field(inst, 11, 15);
+ int ax_rb = inst_get_field(inst, 16, 20);
+ int ax_rc = inst_get_field(inst, 21, 25);
+ short full_d = inst_get_field(inst, 16, 31);
+
+ u64 *fpr_d = &vcpu->arch.fpr[ax_rd];
+ u64 *fpr_a = &vcpu->arch.fpr[ax_ra];
+ u64 *fpr_b = &vcpu->arch.fpr[ax_rb];
+ u64 *fpr_c = &vcpu->arch.fpr[ax_rc];
+
+ bool rcomp = (inst & 1) ? true : false;
+ u32 cr = kvmppc_get_cr(vcpu);
+ struct thread_struct t;
+#ifdef DEBUG
+ int i;
+#endif
+
+ t.fpscr.val = vcpu->arch.fpscr;
+
+ if (!kvmppc_inst_is_paired_single(vcpu, inst))
+ return EMULATE_FAIL;
+
+ if (!(vcpu->arch.msr & MSR_FP)) {
+ kvmppc_book3s_queue_irqprio(vcpu, BOOK3S_INTERRUPT_FP_UNAVAIL);
+ return EMULATE_AGAIN;
+ }
+
+ kvmppc_giveup_ext(vcpu, MSR_FP);
+ preempt_disable();
+ enable_kernel_fp();
+ /* Do we need to clear FE0 / FE1 here? Don't think so. */
+
+#ifdef DEBUG
+ for (i = 0; i < ARRAY_SIZE(vcpu->arch.fpr); i++) {
+ u32 f;
+ cvt_df((double*)&vcpu->arch.fpr[i], (float*)&f, &t);
+ dprintk(KERN_INFO "FPR[%d] = 0x%x / 0x%llx QPR[%d] = 0x%x\n",
+ i, f, vcpu->arch.fpr[i], i, vcpu->arch.qpr[i]);
+ }
+#endif
+
+ switch (get_op(inst)) {
+ case OP_PSQ_L:
+ {
+ ulong addr = ax_ra ? kvmppc_get_gpr(vcpu, ax_ra) : 0;
+ bool w = inst_get_field(inst, 16, 16) ? true : false;
+ int i = inst_get_field(inst, 17, 19);
+
+ addr += get_d_signext(inst);
+ emulated = kvmppc_emulate_psq_load(run, vcpu, ax_rd, addr, w, i);
+ break;
+ }
+ case OP_PSQ_LU:
+ {
+ ulong addr = kvmppc_get_gpr(vcpu, ax_ra);
+ bool w = inst_get_field(inst, 16, 16) ? true : false;
+ int i = inst_get_field(inst, 17, 19);
+
+ addr += get_d_signext(inst);
+ emulated = kvmppc_emulate_psq_load(run, vcpu, ax_rd, addr, w, i);
+
+ if (emulated == EMULATE_DONE)
+ kvmppc_set_gpr(vcpu, ax_ra, addr);
+ break;
+ }
+ case OP_PSQ_ST:
+ {
+ ulong addr = ax_ra ? kvmppc_get_gpr(vcpu, ax_ra) : 0;
+ bool w = inst_get_field(inst, 16, 16) ? true : false;
+ int i = inst_get_field(inst, 17, 19);
+
+ addr += get_d_signext(inst);
+ emulated = kvmppc_emulate_psq_store(run, vcpu, ax_rd, addr, w, i);
+ break;
+ }
+ case OP_PSQ_STU:
+ {
+ ulong addr = kvmppc_get_gpr(vcpu, ax_ra);
+ bool w = inst_get_field(inst, 16, 16) ? true : false;
+ int i = inst_get_field(inst, 17, 19);
+
+ addr += get_d_signext(inst);
+ emulated = kvmppc_emulate_psq_store(run, vcpu, ax_rd, addr, w, i);
+
+ if (emulated == EMULATE_DONE)
+ kvmppc_set_gpr(vcpu, ax_ra, addr);
+ break;
+ }
+ case 4:
+ /* X form */
+ switch (inst_get_field(inst, 21, 30)) {
+ case OP_4X_PS_CMPU0:
+ /* XXX */
+ emulated = EMULATE_FAIL;
+ break;
+ case OP_4X_PSQ_LX:
+ {
+ ulong addr = ax_ra ? kvmppc_get_gpr(vcpu, ax_ra) : 0;
+ bool w = inst_get_field(inst, 21, 21) ? true : false;
+ int i = inst_get_field(inst, 22, 24);
+
+ addr += kvmppc_get_gpr(vcpu, ax_rb);
+ emulated = kvmppc_emulate_psq_load(run, vcpu, ax_rd, addr, w, i);
+ break;
+ }
+ case OP_4X_PS_CMPO0:
+ /* XXX */
+ emulated = EMULATE_FAIL;
+ break;
+ case OP_4X_PSQ_LUX:
+ {
+ ulong addr = kvmppc_get_gpr(vcpu, ax_ra);
+ bool w = inst_get_field(inst, 21, 21) ? true : false;
+ int i = inst_get_field(inst, 22, 24);
+
+ addr += kvmppc_get_gpr(vcpu, ax_rb);
+ emulated = kvmppc_emulate_psq_load(run, vcpu, ax_rd, addr, w, i);
+
+ if (emulated == EMULATE_DONE)
+ kvmppc_set_gpr(vcpu, ax_ra, addr);
+ break;
+ }
+ case OP_4X_PS_NEG:
+ vcpu->arch.fpr[ax_rd] = vcpu->arch.fpr[ax_rb];
+ vcpu->arch.fpr[ax_rd] ^= 0x8000000000000000ULL;
+ vcpu->arch.qpr[ax_rd] = vcpu->arch.qpr[ax_rb];
+ vcpu->arch.qpr[ax_rd] ^= 0x80000000;
+ break;
+ case OP_4X_PS_CMPU1:
+ /* XXX */
+ emulated = EMULATE_FAIL;
+ break;
+ case OP_4X_PS_MR:
+ WARN_ON(rcomp);
+ vcpu->arch.fpr[ax_rd] = vcpu->arch.fpr[ax_rb];
+ vcpu->arch.qpr[ax_rd] = vcpu->arch.qpr[ax_rb];
+ break;
+ case OP_4X_PS_CMPO1:
+ /* XXX */
+ emulated = EMULATE_FAIL;
+ break;
+ case OP_4X_PS_NABS:
+ WARN_ON(rcomp);
+ vcpu->arch.fpr[ax_rd] = vcpu->arch.fpr[ax_rb];
+ vcpu->arch.fpr[ax_rd] |= 0x8000000000000000ULL;
+ vcpu->arch.qpr[ax_rd] = vcpu->arch.qpr[ax_rb];
+ vcpu->arch.qpr[ax_rd] |= 0x80000000;
+ break;
+ case OP_4X_PS_ABS:
+ WARN_ON(rcomp);
+ vcpu->arch.fpr[ax_rd] = vcpu->arch.fpr[ax_rb];
+ vcpu->arch.fpr[ax_rd] &= ~0x8000000000000000ULL;
+ vcpu->arch.qpr[ax_rd] = vcpu->arch.qpr[ax_rb];
+ vcpu->arch.qpr[ax_rd] &= ~0x80000000;
+ break;
+ case OP_4X_PS_MERGE00:
+ WARN_ON(rcomp);
+ vcpu->arch.fpr[ax_rd] = vcpu->arch.fpr[ax_ra];
+ /* vcpu->arch.qpr[ax_rd] = vcpu->arch.fpr[ax_rb]; */
+ cvt_df((double*)&vcpu->arch.fpr[ax_rb],
+ (float*)&vcpu->arch.qpr[ax_rd], &t);
+ break;
+ case OP_4X_PS_MERGE01:
+ WARN_ON(rcomp);
+ vcpu->arch.fpr[ax_rd] = vcpu->arch.fpr[ax_ra];
+ vcpu->arch.qpr[ax_rd] = vcpu->arch.qpr[ax_rb];
+ break;
+ case OP_4X_PS_MERGE10:
+ WARN_ON(rcomp);
+ /* vcpu->arch.fpr[ax_rd] = vcpu->arch.qpr[ax_ra]; */
+ cvt_fd((float*)&vcpu->arch.qpr[ax_ra],
+ (double*)&vcpu->arch.fpr[ax_rd], &t);
+ /* vcpu->arch.qpr[ax_rd] = vcpu->arch.fpr[ax_rb]; */
+ cvt_df((double*)&vcpu->arch.fpr[ax_rb],
+ (float*)&vcpu->arch.qpr[ax_rd], &t);
+ break;
+ case OP_4X_PS_MERGE11:
+ WARN_ON(rcomp);
+ /* vcpu->arch.fpr[ax_rd] = vcpu->arch.qpr[ax_ra]; */
+ cvt_fd((float*)&vcpu->arch.qpr[ax_ra],
+ (double*)&vcpu->arch.fpr[ax_rd], &t);
+ vcpu->arch.qpr[ax_rd] = vcpu->arch.qpr[ax_rb];
+ break;
+ }
+ /* XW form */
+ switch (inst_get_field(inst, 25, 30)) {
+ case OP_4XW_PSQ_STX:
+ {
+ ulong addr = ax_ra ? kvmppc_get_gpr(vcpu, ax_ra) : 0;
+ bool w = inst_get_field(inst, 21, 21) ? true : false;
+ int i = inst_get_field(inst, 22, 24);
+
+ addr += kvmppc_get_gpr(vcpu, ax_rb);
+ emulated = kvmppc_emulate_psq_store(run, vcpu, ax_rd, addr, w, i);
+ break;
+ }
+ case OP_4XW_PSQ_STUX:
+ {
+ ulong addr = kvmppc_get_gpr(vcpu, ax_ra);
+ bool w = inst_get_field(inst, 21, 21) ? true : false;
+ int i = inst_get_field(inst, 22, 24);
+
+ addr += kvmppc_get_gpr(vcpu, ax_rb);
+ emulated = kvmppc_emulate_psq_store(run, vcpu, ax_rd, addr, w, i);
+
+ if (emulated == EMULATE_DONE)
+ kvmppc_set_gpr(vcpu, ax_ra, addr);
+ break;
+ }
+ }
+ /* A form */
+ switch (inst_get_field(inst, 26, 30)) {
+ case OP_4A_PS_SUM1:
+ emulated = kvmppc_ps_two_in(vcpu, rcomp, ax_rd,
+ ax_rb, ax_ra, SCALAR_NO_PS0 | SCALAR_HIGH, fps_fadds);
+ vcpu->arch.fpr[ax_rd] = vcpu->arch.fpr[ax_rc];
+ break;
+ case OP_4A_PS_SUM0:
+ emulated = kvmppc_ps_two_in(vcpu, rcomp, ax_rd,
+ ax_ra, ax_rb, SCALAR_NO_PS1 | SCALAR_LOW, fps_fadds);
+ vcpu->arch.qpr[ax_rd] = vcpu->arch.qpr[ax_rc];
+ break;
+ case OP_4A_PS_MULS0:
+ emulated = kvmppc_ps_two_in(vcpu, rcomp, ax_rd,
+ ax_ra, ax_rc, SCALAR_HIGH, fps_fmuls);
+ break;
+ case OP_4A_PS_MULS1:
+ emulated = kvmppc_ps_two_in(vcpu, rcomp, ax_rd,
+ ax_ra, ax_rc, SCALAR_LOW, fps_fmuls);
+ break;
+ case OP_4A_PS_MADDS0:
+ emulated = kvmppc_ps_three_in(vcpu, rcomp, ax_rd,
+ ax_ra, ax_rc, ax_rb, SCALAR_HIGH, fps_fmadds);
+ break;
+ case OP_4A_PS_MADDS1:
+ emulated = kvmppc_ps_three_in(vcpu, rcomp, ax_rd,
+ ax_ra, ax_rc, ax_rb, SCALAR_LOW, fps_fmadds);
+ break;
+ case OP_4A_PS_DIV:
+ emulated = kvmppc_ps_two_in(vcpu, rcomp, ax_rd,
+ ax_ra, ax_rb, SCALAR_NONE, fps_fdivs);
+ break;
+ case OP_4A_PS_SUB:
+ emulated = kvmppc_ps_two_in(vcpu, rcomp, ax_rd,
+ ax_ra, ax_rb, SCALAR_NONE, fps_fsubs);
+ break;
+ case OP_4A_PS_ADD:
+ emulated = kvmppc_ps_two_in(vcpu, rcomp, ax_rd,
+ ax_ra, ax_rb, SCALAR_NONE, fps_fadds);
+ break;
+ case OP_4A_PS_SEL:
+ emulated = kvmppc_ps_three_in(vcpu, rcomp, ax_rd,
+ ax_ra, ax_rc, ax_rb, SCALAR_NONE, fps_fsel);
+ break;
+ case OP_4A_PS_RES:
+ emulated = kvmppc_ps_one_in(vcpu, rcomp, ax_rd,
+ ax_rb, fps_fres);
+ break;
+ case OP_4A_PS_MUL:
+ emulated = kvmppc_ps_two_in(vcpu, rcomp, ax_rd,
+ ax_ra, ax_rc, SCALAR_NONE, fps_fmuls);
+ break;
+ case OP_4A_PS_RSQRTE:
+ emulated = kvmppc_ps_one_in(vcpu, rcomp, ax_rd,
+ ax_rb, fps_frsqrte);
+ break;
+ case OP_4A_PS_MSUB:
+ emulated = kvmppc_ps_three_in(vcpu, rcomp, ax_rd,
+ ax_ra, ax_rc, ax_rb, SCALAR_NONE, fps_fmsubs);
+ break;
+ case OP_4A_PS_MADD:
+ emulated = kvmppc_ps_three_in(vcpu, rcomp, ax_rd,
+ ax_ra, ax_rc, ax_rb, SCALAR_NONE, fps_fmadds);
+ break;
+ case OP_4A_PS_NMSUB:
+ emulated = kvmppc_ps_three_in(vcpu, rcomp, ax_rd,
+ ax_ra, ax_rc, ax_rb, SCALAR_NONE, fps_fnmsubs);
+ break;
+ case OP_4A_PS_NMADD:
+ emulated = kvmppc_ps_three_in(vcpu, rcomp, ax_rd,
+ ax_ra, ax_rc, ax_rb, SCALAR_NONE, fps_fnmadds);
+ break;
+ }
+ break;
+
+ /* Real FPU operations */
+
+ case OP_LFS:
+ {
+ ulong addr = (ax_ra ? kvmppc_get_gpr(vcpu, ax_ra) : 0) + full_d;
+
+ emulated = kvmppc_emulate_fpr_load(run, vcpu, ax_rd, addr,
+ FPU_LS_SINGLE);
+ break;
+ }
+ case OP_LFSU:
+ {
+ ulong addr = kvmppc_get_gpr(vcpu, ax_ra) + full_d;
+
+ emulated = kvmppc_emulate_fpr_load(run, vcpu, ax_rd, addr,
+ FPU_LS_SINGLE);
+
+ if (emulated == EMULATE_DONE)
+ kvmppc_set_gpr(vcpu, ax_ra, addr);
+ break;
+ }
+ case OP_LFD:
+ {
+ ulong addr = (ax_ra ? kvmppc_get_gpr(vcpu, ax_ra) : 0) + full_d;
+
+ emulated = kvmppc_emulate_fpr_load(run, vcpu, ax_rd, addr,
+ FPU_LS_DOUBLE);
+ break;
+ }
+ case OP_LFDU:
+ {
+ ulong addr = kvmppc_get_gpr(vcpu, ax_ra) + full_d;
+
+ emulated = kvmppc_emulate_fpr_load(run, vcpu, ax_rd, addr,
+ FPU_LS_DOUBLE);
+
+ if (emulated == EMULATE_DONE)
+ kvmppc_set_gpr(vcpu, ax_ra, addr);
+ break;
+ }
+ case OP_STFS:
+ {
+ ulong addr = (ax_ra ? kvmppc_get_gpr(vcpu, ax_ra) : 0) + full_d;
+
+ emulated = kvmppc_emulate_fpr_store(run, vcpu, ax_rd, addr,
+ FPU_LS_SINGLE);
+ break;
+ }
+ case OP_STFSU:
+ {
+ ulong addr = kvmppc_get_gpr(vcpu, ax_ra) + full_d;
+
+ emulated = kvmppc_emulate_fpr_store(run, vcpu, ax_rd, addr,
+ FPU_LS_SINGLE);
+
+ if (emulated == EMULATE_DONE)
+ kvmppc_set_gpr(vcpu, ax_ra, addr);
+ break;
+ }
+ case OP_STFD:
+ {
+ ulong addr = (ax_ra ? kvmppc_get_gpr(vcpu, ax_ra) : 0) + full_d;
+
+ emulated = kvmppc_emulate_fpr_store(run, vcpu, ax_rd, addr,
+ FPU_LS_DOUBLE);
+ break;
+ }
+ case OP_STFDU:
+ {
+ ulong addr = kvmppc_get_gpr(vcpu, ax_ra) + full_d;
+
+ emulated = kvmppc_emulate_fpr_store(run, vcpu, ax_rd, addr,
+ FPU_LS_DOUBLE);
+
+ if (emulated == EMULATE_DONE)
+ kvmppc_set_gpr(vcpu, ax_ra, addr);
+ break;
+ }
+ case 31:
+ switch (inst_get_field(inst, 21, 30)) {
+ case OP_31_LFSX:
+ {
+ ulong addr = ax_ra ? kvmppc_get_gpr(vcpu, ax_ra) : 0;
+
+ addr += kvmppc_get_gpr(vcpu, ax_rb);
+ emulated = kvmppc_emulate_fpr_load(run, vcpu, ax_rd,
+ addr, FPU_LS_SINGLE);
+ break;
+ }
+ case OP_31_LFSUX:
+ {
+ ulong addr = kvmppc_get_gpr(vcpu, ax_ra) +
+ kvmppc_get_gpr(vcpu, ax_rb);
+
+ emulated = kvmppc_emulate_fpr_load(run, vcpu, ax_rd,
+ addr, FPU_LS_SINGLE);
+
+ if (emulated == EMULATE_DONE)
+ kvmppc_set_gpr(vcpu, ax_ra, addr);
+ break;
+ }
+ case OP_31_LFDX:
+ {
+ ulong addr = (ax_ra ? kvmppc_get_gpr(vcpu, ax_ra) : 0) +
+ kvmppc_get_gpr(vcpu, ax_rb);
+
+ emulated = kvmppc_emulate_fpr_load(run, vcpu, ax_rd,
+ addr, FPU_LS_DOUBLE);
+ break;
+ }
+ case OP_31_LFDUX:
+ {
+ ulong addr = kvmppc_get_gpr(vcpu, ax_ra) +
+ kvmppc_get_gpr(vcpu, ax_rb);
+
+ emulated = kvmppc_emulate_fpr_load(run, vcpu, ax_rd,
+ addr, FPU_LS_DOUBLE);
+
+ if (emulated == EMULATE_DONE)
+ kvmppc_set_gpr(vcpu, ax_ra, addr);
+ break;
+ }
+ case OP_31_STFSX:
+ {
+ ulong addr = (ax_ra ? kvmppc_get_gpr(vcpu, ax_ra) : 0) +
+ kvmppc_get_gpr(vcpu, ax_rb);
+
+ emulated = kvmppc_emulate_fpr_store(run, vcpu, ax_rd,
+ addr, FPU_LS_SINGLE);
+ break;
+ }
+ case OP_31_STFSUX:
+ {
+ ulong addr = kvmppc_get_gpr(vcpu, ax_ra) +
+ kvmppc_get_gpr(vcpu, ax_rb);
+
+ emulated = kvmppc_emulate_fpr_store(run, vcpu, ax_rd,
+ addr, FPU_LS_SINGLE);
+
+ if (emulated == EMULATE_DONE)
+ kvmppc_set_gpr(vcpu, ax_ra, addr);
+ break;
+ }
+ case OP_31_STFX:
+ {
+ ulong addr = (ax_ra ? kvmppc_get_gpr(vcpu, ax_ra) : 0) +
+ kvmppc_get_gpr(vcpu, ax_rb);
+
+ emulated = kvmppc_emulate_fpr_store(run, vcpu, ax_rd,
+ addr, FPU_LS_DOUBLE);
+ break;
+ }
+ case OP_31_STFUX:
+ {
+ ulong addr = kvmppc_get_gpr(vcpu, ax_ra) +
+ kvmppc_get_gpr(vcpu, ax_rb);
+
+ emulated = kvmppc_emulate_fpr_store(run, vcpu, ax_rd,
+ addr, FPU_LS_DOUBLE);
+
+ if (emulated == EMULATE_DONE)
+ kvmppc_set_gpr(vcpu, ax_ra, addr);
+ break;
+ }
+ case OP_31_STFIWX:
+ {
+ ulong addr = (ax_ra ? kvmppc_get_gpr(vcpu, ax_ra) : 0) +
+ kvmppc_get_gpr(vcpu, ax_rb);
+
+ emulated = kvmppc_emulate_fpr_store(run, vcpu, ax_rd,
+ addr,
+ FPU_LS_SINGLE_LOW);
+ break;
+ }
+ break;
+ }
+ break;
+ case 59:
+ switch (inst_get_field(inst, 21, 30)) {
+ case OP_59_FADDS:
+ fpd_fadds(&vcpu->arch.fpscr, &cr, fpr_d, fpr_a, fpr_b);
+ kvmppc_sync_qpr(vcpu, ax_rd);
+ break;
+ case OP_59_FSUBS:
+ fpd_fsubs(&vcpu->arch.fpscr, &cr, fpr_d, fpr_a, fpr_b);
+ kvmppc_sync_qpr(vcpu, ax_rd);
+ break;
+ case OP_59_FDIVS:
+ fpd_fdivs(&vcpu->arch.fpscr, &cr, fpr_d, fpr_a, fpr_b);
+ kvmppc_sync_qpr(vcpu, ax_rd);
+ break;
+ case OP_59_FRES:
+ fpd_fres(&vcpu->arch.fpscr, &cr, fpr_d, fpr_b);
+ kvmppc_sync_qpr(vcpu, ax_rd);
+ break;
+ case OP_59_FRSQRTES:
+ fpd_frsqrtes(&vcpu->arch.fpscr, &cr, fpr_d, fpr_b);
+ kvmppc_sync_qpr(vcpu, ax_rd);
+ break;
+ }
+ switch (inst_get_field(inst, 26, 30)) {
+ case OP_59_FMULS:
+ fpd_fmuls(&vcpu->arch.fpscr, &cr, fpr_d, fpr_a, fpr_c);
+ kvmppc_sync_qpr(vcpu, ax_rd);
+ break;
+ case OP_59_FMSUBS:
+ fpd_fmsubs(&vcpu->arch.fpscr, &cr, fpr_d, fpr_a, fpr_c, fpr_b);
+ kvmppc_sync_qpr(vcpu, ax_rd);
+ break;
+ case OP_59_FMADDS:
+ fpd_fmadds(&vcpu->arch.fpscr, &cr, fpr_d, fpr_a, fpr_c, fpr_b);
+ kvmppc_sync_qpr(vcpu, ax_rd);
+ break;
+ case OP_59_FNMSUBS:
+ fpd_fnmsubs(&vcpu->arch.fpscr, &cr, fpr_d, fpr_a, fpr_c, fpr_b);
+ kvmppc_sync_qpr(vcpu, ax_rd);
+ break;
+ case OP_59_FNMADDS:
+ fpd_fnmadds(&vcpu->arch.fpscr, &cr, fpr_d, fpr_a, fpr_c, fpr_b);
+ kvmppc_sync_qpr(vcpu, ax_rd);
+ break;
+ }
+ break;
+ case 63:
+ switch (inst_get_field(inst, 21, 30)) {
+ case OP_63_MTFSB0:
+ case OP_63_MTFSB1:
+ case OP_63_MCRFS:
+ case OP_63_MTFSFI:
+ /* XXX need to implement */
+ break;
+ case OP_63_MFFS:
+ /* XXX missing CR */
+ *fpr_d = vcpu->arch.fpscr;
+ break;
+ case OP_63_MTFSF:
+ /* XXX missing fm bits */
+ /* XXX missing CR */
+ vcpu->arch.fpscr = *fpr_b;
+ break;
+ case OP_63_FCMPU:
+ {
+ u32 tmp_cr;
+ u32 cr0_mask = 0xf0000000;
+ u32 cr_shift = inst_get_field(inst, 6, 8) * 4;
+
+ fpd_fcmpu(&vcpu->arch.fpscr, &tmp_cr, fpr_a, fpr_b);
+ cr &= ~(cr0_mask >> cr_shift);
+ cr |= (cr & cr0_mask) >> cr_shift;
+ break;
+ }
+ case OP_63_FCMPO:
+ {
+ u32 tmp_cr;
+ u32 cr0_mask = 0xf0000000;
+ u32 cr_shift = inst_get_field(inst, 6, 8) * 4;
+
+ fpd_fcmpo(&vcpu->arch.fpscr, &tmp_cr, fpr_a, fpr_b);
+ cr &= ~(cr0_mask >> cr_shift);
+ cr |= (cr & cr0_mask) >> cr_shift;
+ break;
+ }
+ case OP_63_FNEG:
+ fpd_fneg(&vcpu->arch.fpscr, &cr, fpr_d, fpr_b);
+ break;
+ case OP_63_FMR:
+ *fpr_d = *fpr_b;
+ break;
+ case OP_63_FABS:
+ fpd_fabs(&vcpu->arch.fpscr, &cr, fpr_d, fpr_b);
+ break;
+ case OP_63_FCPSGN:
+ fpd_fcpsgn(&vcpu->arch.fpscr, &cr, fpr_d, fpr_a, fpr_b);
+ break;
+ case OP_63_FDIV:
+ fpd_fdiv(&vcpu->arch.fpscr, &cr, fpr_d, fpr_a, fpr_b);
+ break;
+ case OP_63_FADD:
+ fpd_fadd(&vcpu->arch.fpscr, &cr, fpr_d, fpr_a, fpr_b);
+ break;
+ case OP_63_FSUB:
+ fpd_fsub(&vcpu->arch.fpscr, &cr, fpr_d, fpr_a, fpr_b);
+ break;
+ case OP_63_FCTIW:
+ fpd_fctiw(&vcpu->arch.fpscr, &cr, fpr_d, fpr_b);
+ break;
+ case OP_63_FCTIWZ:
+ fpd_fctiwz(&vcpu->arch.fpscr, &cr, fpr_d, fpr_b);
+ break;
+ case OP_63_FRSP:
+ fpd_frsp(&vcpu->arch.fpscr, &cr, fpr_d, fpr_b);
+ kvmppc_sync_qpr(vcpu, ax_rd);
+ break;
+ case OP_63_FRSQRTE:
+ {
+ double one = 1.0f;
+
+ /* fD = sqrt(fB) */
+ fpd_fsqrt(&vcpu->arch.fpscr, &cr, fpr_d, fpr_b);
+ /* fD = 1.0f / fD */
+ fpd_fdiv(&vcpu->arch.fpscr, &cr, fpr_d, (u64*)&one, fpr_d);
+ break;
+ }
+ }
+ switch (inst_get_field(inst, 26, 30)) {
+ case OP_63_FMUL:
+ fpd_fmul(&vcpu->arch.fpscr, &cr, fpr_d, fpr_a, fpr_c);
+ break;
+ case OP_63_FSEL:
+ fpd_fsel(&vcpu->arch.fpscr, &cr, fpr_d, fpr_a, fpr_c, fpr_b);
+ break;
+ case OP_63_FMSUB:
+ fpd_fmsub(&vcpu->arch.fpscr, &cr, fpr_d, fpr_a, fpr_c, fpr_b);
+ break;
+ case OP_63_FMADD:
+ fpd_fmadd(&vcpu->arch.fpscr, &cr, fpr_d, fpr_a, fpr_c, fpr_b);
+ break;
+ case OP_63_FNMSUB:
+ fpd_fnmsub(&vcpu->arch.fpscr, &cr, fpr_d, fpr_a, fpr_c, fpr_b);
+ break;
+ case OP_63_FNMADD:
+ fpd_fnmadd(&vcpu->arch.fpscr, &cr, fpr_d, fpr_a, fpr_c, fpr_b);
+ break;
+ }
+ break;
+ }
+
+#ifdef DEBUG
+ for (i = 0; i < ARRAY_SIZE(vcpu->arch.fpr); i++) {
+ u32 f;
+ cvt_df((double*)&vcpu->arch.fpr[i], (float*)&f, &t);
+ dprintk(KERN_INFO "FPR[%d] = 0x%x\n", i, f);
+ }
+#endif
+
+ if (rcomp)
+ kvmppc_set_cr(vcpu, cr);
+
+ preempt_enable();
+
+ return emulated;
+}
diff --git a/arch/powerpc/kvm/book3s_64_rmhandlers.S b/arch/powerpc/kvm/book3s_rmhandlers.S
index c83c60a..506d5c3 100644
--- a/arch/powerpc/kvm/book3s_64_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_rmhandlers.S
@@ -22,7 +22,10 @@
#include <asm/reg.h>
#include <asm/page.h>
#include <asm/asm-offsets.h>
+
+#ifdef CONFIG_PPC_BOOK3S_64
#include <asm/exception-64s.h>
+#endif
/*****************************************************************************
* *
@@ -30,6 +33,39 @@
* *
****************************************************************************/
+#if defined(CONFIG_PPC_BOOK3S_64)
+
+#define LOAD_SHADOW_VCPU(reg) \
+ mfspr reg, SPRN_SPRG_PACA
+
+#define SHADOW_VCPU_OFF PACA_KVM_SVCPU
+#define MSR_NOIRQ MSR_KERNEL & ~(MSR_IR | MSR_DR)
+#define FUNC(name) GLUE(.,name)
+
+#elif defined(CONFIG_PPC_BOOK3S_32)
+
+#define LOAD_SHADOW_VCPU(reg) \
+ mfspr reg, SPRN_SPRG_THREAD; \
+ lwz reg, THREAD_KVM_SVCPU(reg); \
+ /* PPC32 can have a NULL pointer - let's check for that */ \
+ mtspr SPRN_SPRG_SCRATCH1, r12; /* Save r12 */ \
+ mfcr r12; \
+ cmpwi reg, 0; \
+ bne 1f; \
+ mfspr reg, SPRN_SPRG_SCRATCH0; \
+ mtcr r12; \
+ mfspr r12, SPRN_SPRG_SCRATCH1; \
+ b kvmppc_resume_\intno; \
+1:; \
+ mtcr r12; \
+ mfspr r12, SPRN_SPRG_SCRATCH1; \
+ tophys(reg, reg)
+
+#define SHADOW_VCPU_OFF 0
+#define MSR_NOIRQ MSR_KERNEL
+#define FUNC(name) name
+
+#endif
.macro INTERRUPT_TRAMPOLINE intno
@@ -42,19 +78,19 @@ kvmppc_trampoline_\intno:
* First thing to do is to find out if we're coming
* from a KVM guest or a Linux process.
*
- * To distinguish, we check a magic byte in the PACA
+ * To distinguish, we check a magic byte in the PACA/current
*/
- mfspr r13, SPRN_SPRG_PACA /* r13 = PACA */
- std r12, PACA_KVM_SCRATCH0(r13)
+ LOAD_SHADOW_VCPU(r13)
+ PPC_STL r12, (SHADOW_VCPU_OFF + SVCPU_SCRATCH0)(r13)
mfcr r12
- stw r12, PACA_KVM_SCRATCH1(r13)
- lbz r12, PACA_KVM_IN_GUEST(r13)
+ stw r12, (SHADOW_VCPU_OFF + SVCPU_SCRATCH1)(r13)
+ lbz r12, (SHADOW_VCPU_OFF + SVCPU_IN_GUEST)(r13)
cmpwi r12, KVM_GUEST_MODE_NONE
bne ..kvmppc_handler_hasmagic_\intno
/* No KVM guest? Then jump back to the Linux handler! */
- lwz r12, PACA_KVM_SCRATCH1(r13)
+ lwz r12, (SHADOW_VCPU_OFF + SVCPU_SCRATCH1)(r13)
mtcr r12
- ld r12, PACA_KVM_SCRATCH0(r13)
+ PPC_LL r12, (SHADOW_VCPU_OFF + SVCPU_SCRATCH0)(r13)
mfspr r13, SPRN_SPRG_SCRATCH0 /* r13 = original r13 */
b kvmppc_resume_\intno /* Get back original handler */
@@ -76,9 +112,7 @@ kvmppc_trampoline_\intno:
INTERRUPT_TRAMPOLINE BOOK3S_INTERRUPT_SYSTEM_RESET
INTERRUPT_TRAMPOLINE BOOK3S_INTERRUPT_MACHINE_CHECK
INTERRUPT_TRAMPOLINE BOOK3S_INTERRUPT_DATA_STORAGE
-INTERRUPT_TRAMPOLINE BOOK3S_INTERRUPT_DATA_SEGMENT
INTERRUPT_TRAMPOLINE BOOK3S_INTERRUPT_INST_STORAGE
-INTERRUPT_TRAMPOLINE BOOK3S_INTERRUPT_INST_SEGMENT
INTERRUPT_TRAMPOLINE BOOK3S_INTERRUPT_EXTERNAL
INTERRUPT_TRAMPOLINE BOOK3S_INTERRUPT_ALIGNMENT
INTERRUPT_TRAMPOLINE BOOK3S_INTERRUPT_PROGRAM
@@ -88,7 +122,14 @@ INTERRUPT_TRAMPOLINE BOOK3S_INTERRUPT_SYSCALL
INTERRUPT_TRAMPOLINE BOOK3S_INTERRUPT_TRACE
INTERRUPT_TRAMPOLINE BOOK3S_INTERRUPT_PERFMON
INTERRUPT_TRAMPOLINE BOOK3S_INTERRUPT_ALTIVEC
+
+/* Those are only available on 64 bit machines */
+
+#ifdef CONFIG_PPC_BOOK3S_64
+INTERRUPT_TRAMPOLINE BOOK3S_INTERRUPT_DATA_SEGMENT
+INTERRUPT_TRAMPOLINE BOOK3S_INTERRUPT_INST_SEGMENT
INTERRUPT_TRAMPOLINE BOOK3S_INTERRUPT_VSX
+#endif
/*
* Bring us back to the faulting code, but skip the
@@ -99,11 +140,11 @@ INTERRUPT_TRAMPOLINE BOOK3S_INTERRUPT_VSX
*
* Input Registers:
*
- * R12 = free
- * R13 = PACA
- * PACA.KVM.SCRATCH0 = guest R12
- * PACA.KVM.SCRATCH1 = guest CR
- * SPRG_SCRATCH0 = guest R13
+ * R12 = free
+ * R13 = Shadow VCPU (PACA)
+ * SVCPU.SCRATCH0 = guest R12
+ * SVCPU.SCRATCH1 = guest CR
+ * SPRG_SCRATCH0 = guest R13
*
*/
kvmppc_handler_skip_ins:
@@ -114,9 +155,9 @@ kvmppc_handler_skip_ins:
mtsrr0 r12
/* Clean up all state */
- lwz r12, PACA_KVM_SCRATCH1(r13)
+ lwz r12, (SHADOW_VCPU_OFF + SVCPU_SCRATCH1)(r13)
mtcr r12
- ld r12, PACA_KVM_SCRATCH0(r13)
+ PPC_LL r12, (SHADOW_VCPU_OFF + SVCPU_SCRATCH0)(r13)
mfspr r13, SPRN_SPRG_SCRATCH0
/* And get back into the code */
@@ -147,41 +188,48 @@ kvmppc_handler_lowmem_trampoline_end:
*
* R3 = function
* R4 = MSR
- * R5 = CTR
+ * R5 = scratch register
*
*/
_GLOBAL(kvmppc_rmcall)
- mtmsr r4 /* Disable relocation, so mtsrr
+ LOAD_REG_IMMEDIATE(r5, MSR_NOIRQ)
+ mtmsr r5 /* Disable relocation and interrupts, so mtsrr
doesn't get interrupted */
- mtctr r5
+ sync
mtsrr0 r3
mtsrr1 r4
RFI
+#if defined(CONFIG_PPC_BOOK3S_32)
+#define STACK_LR INT_FRAME_SIZE+4
+#elif defined(CONFIG_PPC_BOOK3S_64)
+#define STACK_LR _LINK
+#endif
+
/*
* Activate current's external feature (FPU/Altivec/VSX)
*/
-#define define_load_up(what) \
- \
-_GLOBAL(kvmppc_load_up_ ## what); \
- subi r1, r1, INT_FRAME_SIZE; \
- mflr r3; \
- std r3, _LINK(r1); \
- mfmsr r4; \
- std r31, GPR3(r1); \
- mr r31, r4; \
- li r5, MSR_DR; \
- oris r5, r5, MSR_EE@h; \
- andc r4, r4, r5; \
- mtmsr r4; \
- \
- bl .load_up_ ## what; \
- \
- mtmsr r31; \
- ld r3, _LINK(r1); \
- ld r31, GPR3(r1); \
- addi r1, r1, INT_FRAME_SIZE; \
- mtlr r3; \
+#define define_load_up(what) \
+ \
+_GLOBAL(kvmppc_load_up_ ## what); \
+ PPC_STLU r1, -INT_FRAME_SIZE(r1); \
+ mflr r3; \
+ PPC_STL r3, STACK_LR(r1); \
+ PPC_STL r20, _NIP(r1); \
+ mfmsr r20; \
+ LOAD_REG_IMMEDIATE(r3, MSR_DR|MSR_EE); \
+ andc r3,r20,r3; /* Disable DR,EE */ \
+ mtmsr r3; \
+ sync; \
+ \
+ bl FUNC(load_up_ ## what); \
+ \
+ mtmsr r20; /* Enable DR,EE */ \
+ sync; \
+ PPC_LL r3, STACK_LR(r1); \
+ PPC_LL r20, _NIP(r1); \
+ mtlr r3; \
+ addi r1, r1, INT_FRAME_SIZE; \
blr
define_load_up(fpu)
@@ -194,11 +242,10 @@ define_load_up(vsx)
.global kvmppc_trampoline_lowmem
kvmppc_trampoline_lowmem:
- .long kvmppc_handler_lowmem_trampoline - _stext
+ .long kvmppc_handler_lowmem_trampoline - CONFIG_KERNEL_START
.global kvmppc_trampoline_enter
kvmppc_trampoline_enter:
- .long kvmppc_handler_trampoline_enter - _stext
-
-#include "book3s_64_slb.S"
+ .long kvmppc_handler_trampoline_enter - CONFIG_KERNEL_START
+#include "book3s_segment.S"
diff --git a/arch/powerpc/kvm/book3s_segment.S b/arch/powerpc/kvm/book3s_segment.S
new file mode 100644
index 0000000..7c52ed0
--- /dev/null
+++ b/arch/powerpc/kvm/book3s_segment.S
@@ -0,0 +1,259 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ *
+ * Copyright SUSE Linux Products GmbH 2010
+ *
+ * Authors: Alexander Graf <agraf@suse.de>
+ */
+
+/* Real mode helpers */
+
+#if defined(CONFIG_PPC_BOOK3S_64)
+
+#define GET_SHADOW_VCPU(reg) \
+ addi reg, r13, PACA_KVM_SVCPU
+
+#elif defined(CONFIG_PPC_BOOK3S_32)
+
+#define GET_SHADOW_VCPU(reg) \
+ tophys(reg, r2); \
+ lwz reg, (THREAD + THREAD_KVM_SVCPU)(reg); \
+ tophys(reg, reg)
+
+#endif
+
+/* Disable for nested KVM */
+#define USE_QUICK_LAST_INST
+
+
+/* Get helper functions for subarch specific functionality */
+
+#if defined(CONFIG_PPC_BOOK3S_64)
+#include "book3s_64_slb.S"
+#elif defined(CONFIG_PPC_BOOK3S_32)
+#include "book3s_32_sr.S"
+#endif
+
+/******************************************************************************
+ * *
+ * Entry code *
+ * *
+ *****************************************************************************/
+
+.global kvmppc_handler_trampoline_enter
+kvmppc_handler_trampoline_enter:
+
+ /* Required state:
+ *
+ * MSR = ~IR|DR
+ * R13 = PACA
+ * R1 = host R1
+ * R2 = host R2
+ * R10 = guest MSR
+ * all other volatile GPRS = free
+ * SVCPU[CR] = guest CR
+ * SVCPU[XER] = guest XER
+ * SVCPU[CTR] = guest CTR
+ * SVCPU[LR] = guest LR
+ */
+
+ /* r3 = shadow vcpu */
+ GET_SHADOW_VCPU(r3)
+
+ /* Move SRR0 and SRR1 into the respective regs */
+ PPC_LL r9, SVCPU_PC(r3)
+ mtsrr0 r9
+ mtsrr1 r10
+
+ /* Activate guest mode, so faults get handled by KVM */
+ li r11, KVM_GUEST_MODE_GUEST
+ stb r11, SVCPU_IN_GUEST(r3)
+
+ /* Switch to guest segment. This is subarch specific. */
+ LOAD_GUEST_SEGMENTS
+
+ /* Enter guest */
+
+ PPC_LL r4, (SVCPU_CTR)(r3)
+ PPC_LL r5, (SVCPU_LR)(r3)
+ lwz r6, (SVCPU_CR)(r3)
+ lwz r7, (SVCPU_XER)(r3)
+
+ mtctr r4
+ mtlr r5
+ mtcr r6
+ mtxer r7
+
+ PPC_LL r0, (SVCPU_R0)(r3)
+ PPC_LL r1, (SVCPU_R1)(r3)
+ PPC_LL r2, (SVCPU_R2)(r3)
+ PPC_LL r4, (SVCPU_R4)(r3)
+ PPC_LL r5, (SVCPU_R5)(r3)
+ PPC_LL r6, (SVCPU_R6)(r3)
+ PPC_LL r7, (SVCPU_R7)(r3)
+ PPC_LL r8, (SVCPU_R8)(r3)
+ PPC_LL r9, (SVCPU_R9)(r3)
+ PPC_LL r10, (SVCPU_R10)(r3)
+ PPC_LL r11, (SVCPU_R11)(r3)
+ PPC_LL r12, (SVCPU_R12)(r3)
+ PPC_LL r13, (SVCPU_R13)(r3)
+
+ PPC_LL r3, (SVCPU_R3)(r3)
+
+ RFI
+kvmppc_handler_trampoline_enter_end:
+
+
+
+/******************************************************************************
+ * *
+ * Exit code *
+ * *
+ *****************************************************************************/
+
+.global kvmppc_handler_trampoline_exit
+kvmppc_handler_trampoline_exit:
+
+ /* Register usage at this point:
+ *
+ * SPRG_SCRATCH0 = guest R13
+ * R12 = exit handler id
+ * R13 = shadow vcpu - SHADOW_VCPU_OFF [=PACA on PPC64]
+ * SVCPU.SCRATCH0 = guest R12
+ * SVCPU.SCRATCH1 = guest CR
+ *
+ */
+
+ /* Save registers */
+
+ PPC_STL r0, (SHADOW_VCPU_OFF + SVCPU_R0)(r13)
+ PPC_STL r1, (SHADOW_VCPU_OFF + SVCPU_R1)(r13)
+ PPC_STL r2, (SHADOW_VCPU_OFF + SVCPU_R2)(r13)
+ PPC_STL r3, (SHADOW_VCPU_OFF + SVCPU_R3)(r13)
+ PPC_STL r4, (SHADOW_VCPU_OFF + SVCPU_R4)(r13)
+ PPC_STL r5, (SHADOW_VCPU_OFF + SVCPU_R5)(r13)
+ PPC_STL r6, (SHADOW_VCPU_OFF + SVCPU_R6)(r13)
+ PPC_STL r7, (SHADOW_VCPU_OFF + SVCPU_R7)(r13)
+ PPC_STL r8, (SHADOW_VCPU_OFF + SVCPU_R8)(r13)
+ PPC_STL r9, (SHADOW_VCPU_OFF + SVCPU_R9)(r13)
+ PPC_STL r10, (SHADOW_VCPU_OFF + SVCPU_R10)(r13)
+ PPC_STL r11, (SHADOW_VCPU_OFF + SVCPU_R11)(r13)
+
+ /* Restore R1/R2 so we can handle faults */
+ PPC_LL r1, (SHADOW_VCPU_OFF + SVCPU_HOST_R1)(r13)
+ PPC_LL r2, (SHADOW_VCPU_OFF + SVCPU_HOST_R2)(r13)
+
+ /* Save guest PC and MSR */
+ mfsrr0 r3
+ mfsrr1 r4
+
+ PPC_STL r3, (SHADOW_VCPU_OFF + SVCPU_PC)(r13)
+ PPC_STL r4, (SHADOW_VCPU_OFF + SVCPU_SHADOW_SRR1)(r13)
+
+ /* Get scratch'ed off registers */
+ mfspr r9, SPRN_SPRG_SCRATCH0
+ PPC_LL r8, (SHADOW_VCPU_OFF + SVCPU_SCRATCH0)(r13)
+ lwz r7, (SHADOW_VCPU_OFF + SVCPU_SCRATCH1)(r13)
+
+ PPC_STL r9, (SHADOW_VCPU_OFF + SVCPU_R13)(r13)
+ PPC_STL r8, (SHADOW_VCPU_OFF + SVCPU_R12)(r13)
+ stw r7, (SHADOW_VCPU_OFF + SVCPU_CR)(r13)
+
+ /* Save more register state */
+
+ mfxer r5
+ mfdar r6
+ mfdsisr r7
+ mfctr r8
+ mflr r9
+
+ stw r5, (SHADOW_VCPU_OFF + SVCPU_XER)(r13)
+ PPC_STL r6, (SHADOW_VCPU_OFF + SVCPU_FAULT_DAR)(r13)
+ stw r7, (SHADOW_VCPU_OFF + SVCPU_FAULT_DSISR)(r13)
+ PPC_STL r8, (SHADOW_VCPU_OFF + SVCPU_CTR)(r13)
+ PPC_STL r9, (SHADOW_VCPU_OFF + SVCPU_LR)(r13)
+
+ /*
+ * In order for us to easily get the last instruction,
+ * we got the #vmexit at, we exploit the fact that the
+ * virtual layout is still the same here, so we can just
+ * ld from the guest's PC address
+ */
+
+ /* We only load the last instruction when it's safe */
+ cmpwi r12, BOOK3S_INTERRUPT_DATA_STORAGE
+ beq ld_last_inst
+ cmpwi r12, BOOK3S_INTERRUPT_PROGRAM
+ beq ld_last_inst
+ cmpwi r12, BOOK3S_INTERRUPT_ALIGNMENT
+ beq- ld_last_inst
+
+ b no_ld_last_inst
+
+ld_last_inst:
+ /* Save off the guest instruction we're at */
+
+ /* In case lwz faults */
+ li r0, KVM_INST_FETCH_FAILED
+
+#ifdef USE_QUICK_LAST_INST
+
+ /* Set guest mode to 'jump over instruction' so if lwz faults
+ * we'll just continue at the next IP. */
+ li r9, KVM_GUEST_MODE_SKIP
+ stb r9, (SHADOW_VCPU_OFF + SVCPU_IN_GUEST)(r13)
+
+ /* 1) enable paging for data */
+ mfmsr r9
+ ori r11, r9, MSR_DR /* Enable paging for data */
+ mtmsr r11
+ sync
+ /* 2) fetch the instruction */
+ lwz r0, 0(r3)
+ /* 3) disable paging again */
+ mtmsr r9
+ sync
+
+#endif
+ stw r0, (SHADOW_VCPU_OFF + SVCPU_LAST_INST)(r13)
+
+no_ld_last_inst:
+
+ /* Unset guest mode */
+ li r9, KVM_GUEST_MODE_NONE
+ stb r9, (SHADOW_VCPU_OFF + SVCPU_IN_GUEST)(r13)
+
+ /* Switch back to host MMU */
+ LOAD_HOST_SEGMENTS
+
+ /* Register usage at this point:
+ *
+ * R1 = host R1
+ * R2 = host R2
+ * R12 = exit handler id
+ * R13 = shadow vcpu - SHADOW_VCPU_OFF [=PACA on PPC64]
+ * SVCPU.* = guest *
+ *
+ */
+
+ /* RFI into the highmem handler */
+ mfmsr r7
+ ori r7, r7, MSR_IR|MSR_DR|MSR_RI|MSR_ME /* Enable paging */
+ mtsrr1 r7
+ /* Load highmem handler address */
+ PPC_LL r8, (SHADOW_VCPU_OFF + SVCPU_VMHANDLER)(r13)
+ mtsrr0 r8
+
+ RFI
+kvmppc_handler_trampoline_exit_end:
diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c
index 2a3a195..a33ab8c 100644
--- a/arch/powerpc/kvm/booke.c
+++ b/arch/powerpc/kvm/booke.c
@@ -133,6 +133,12 @@ void kvmppc_core_queue_external(struct kvm_vcpu *vcpu,
kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_EXTERNAL);
}
+void kvmppc_core_dequeue_external(struct kvm_vcpu *vcpu,
+ struct kvm_interrupt *irq)
+{
+ clear_bit(BOOKE_IRQPRIO_EXTERNAL, &vcpu->arch.pending_exceptions);
+}
+
/* Deliver the interrupt of the corresponding priority, if possible. */
static int kvmppc_booke_irqprio_deliver(struct kvm_vcpu *vcpu,
unsigned int priority)
@@ -479,6 +485,8 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
{
int i;
+ vcpu_load(vcpu);
+
regs->pc = vcpu->arch.pc;
regs->cr = kvmppc_get_cr(vcpu);
regs->ctr = vcpu->arch.ctr;
@@ -499,6 +507,8 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
for (i = 0; i < ARRAY_SIZE(regs->gpr); i++)
regs->gpr[i] = kvmppc_get_gpr(vcpu, i);
+ vcpu_put(vcpu);
+
return 0;
}
@@ -506,6 +516,8 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
{
int i;
+ vcpu_load(vcpu);
+
vcpu->arch.pc = regs->pc;
kvmppc_set_cr(vcpu, regs->cr);
vcpu->arch.ctr = regs->ctr;
@@ -525,6 +537,8 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
for (i = 0; i < ARRAY_SIZE(regs->gpr); i++)
kvmppc_set_gpr(vcpu, i, regs->gpr[i]);
+ vcpu_put(vcpu);
+
return 0;
}
@@ -553,7 +567,12 @@ int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
struct kvm_translation *tr)
{
- return kvmppc_core_vcpu_translate(vcpu, tr);
+ int r;
+
+ vcpu_load(vcpu);
+ r = kvmppc_core_vcpu_translate(vcpu, tr);
+ vcpu_put(vcpu);
+ return r;
}
int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log)
diff --git a/arch/powerpc/kvm/e500.c b/arch/powerpc/kvm/e500.c
index 669a5c5..bc2b400 100644
--- a/arch/powerpc/kvm/e500.c
+++ b/arch/powerpc/kvm/e500.c
@@ -161,7 +161,7 @@ static int __init kvmppc_e500_init(void)
flush_icache_range(kvmppc_booke_handlers,
kvmppc_booke_handlers + max_ivor + kvmppc_handler_len);
- return kvm_init(NULL, sizeof(struct kvmppc_vcpu_e500), THIS_MODULE);
+ return kvm_init(NULL, sizeof(struct kvmppc_vcpu_e500), 0, THIS_MODULE);
}
static void __init kvmppc_e500_exit(void)
diff --git a/arch/powerpc/kvm/emulate.c b/arch/powerpc/kvm/emulate.c
index cb72a65..4568ec3 100644
--- a/arch/powerpc/kvm/emulate.c
+++ b/arch/powerpc/kvm/emulate.c
@@ -38,10 +38,12 @@
#define OP_31_XOP_LBZX 87
#define OP_31_XOP_STWX 151
#define OP_31_XOP_STBX 215
+#define OP_31_XOP_LBZUX 119
#define OP_31_XOP_STBUX 247
#define OP_31_XOP_LHZX 279
#define OP_31_XOP_LHZUX 311
#define OP_31_XOP_MFSPR 339
+#define OP_31_XOP_LHAX 343
#define OP_31_XOP_STHX 407
#define OP_31_XOP_STHUX 439
#define OP_31_XOP_MTSPR 467
@@ -62,10 +64,12 @@
#define OP_STBU 39
#define OP_LHZ 40
#define OP_LHZU 41
+#define OP_LHA 42
+#define OP_LHAU 43
#define OP_STH 44
#define OP_STHU 45
-#ifdef CONFIG_PPC64
+#ifdef CONFIG_PPC_BOOK3S
static int kvmppc_dec_enabled(struct kvm_vcpu *vcpu)
{
return 1;
@@ -82,7 +86,7 @@ void kvmppc_emulate_dec(struct kvm_vcpu *vcpu)
unsigned long dec_nsec;
pr_debug("mtDEC: %x\n", vcpu->arch.dec);
-#ifdef CONFIG_PPC64
+#ifdef CONFIG_PPC_BOOK3S
/* mtdec lowers the interrupt line when positive. */
kvmppc_core_dequeue_dec(vcpu);
@@ -128,7 +132,7 @@ void kvmppc_emulate_dec(struct kvm_vcpu *vcpu)
* from opcode tables in the future. */
int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu)
{
- u32 inst = vcpu->arch.last_inst;
+ u32 inst = kvmppc_get_last_inst(vcpu);
u32 ea;
int ra;
int rb;
@@ -143,13 +147,9 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu)
pr_debug(KERN_INFO "Emulating opcode %d / %d\n", get_op(inst), get_xop(inst));
- /* Try again next time */
- if (inst == KVM_INST_FETCH_FAILED)
- return EMULATE_DONE;
-
switch (get_op(inst)) {
case OP_TRAP:
-#ifdef CONFIG_PPC64
+#ifdef CONFIG_PPC_BOOK3S
case OP_TRAP_64:
kvmppc_core_queue_program(vcpu, SRR1_PROGTRAP);
#else
@@ -171,6 +171,19 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu)
emulated = kvmppc_handle_load(run, vcpu, rt, 1, 1);
break;
+ case OP_31_XOP_LBZUX:
+ rt = get_rt(inst);
+ ra = get_ra(inst);
+ rb = get_rb(inst);
+
+ ea = kvmppc_get_gpr(vcpu, rb);
+ if (ra)
+ ea += kvmppc_get_gpr(vcpu, ra);
+
+ emulated = kvmppc_handle_load(run, vcpu, rt, 1, 1);
+ kvmppc_set_gpr(vcpu, ra, ea);
+ break;
+
case OP_31_XOP_STWX:
rs = get_rs(inst);
emulated = kvmppc_handle_store(run, vcpu,
@@ -200,6 +213,11 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu)
kvmppc_set_gpr(vcpu, rs, ea);
break;
+ case OP_31_XOP_LHAX:
+ rt = get_rt(inst);
+ emulated = kvmppc_handle_loads(run, vcpu, rt, 2, 1);
+ break;
+
case OP_31_XOP_LHZX:
rt = get_rt(inst);
emulated = kvmppc_handle_load(run, vcpu, rt, 2, 1);
@@ -450,6 +468,18 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu)
kvmppc_set_gpr(vcpu, ra, vcpu->arch.paddr_accessed);
break;
+ case OP_LHA:
+ rt = get_rt(inst);
+ emulated = kvmppc_handle_loads(run, vcpu, rt, 2, 1);
+ break;
+
+ case OP_LHAU:
+ ra = get_ra(inst);
+ rt = get_rt(inst);
+ emulated = kvmppc_handle_loads(run, vcpu, rt, 2, 1);
+ kvmppc_set_gpr(vcpu, ra, vcpu->arch.paddr_accessed);
+ break;
+
case OP_STH:
rs = get_rs(inst);
emulated = kvmppc_handle_store(run, vcpu,
@@ -472,7 +502,9 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu)
if (emulated == EMULATE_FAIL) {
emulated = kvmppc_core_emulate_op(run, vcpu, inst, &advance);
- if (emulated == EMULATE_FAIL) {
+ if (emulated == EMULATE_AGAIN) {
+ advance = 0;
+ } else if (emulated == EMULATE_FAIL) {
advance = 0;
printk(KERN_ERR "Couldn't emulate instruction 0x%08x "
"(op %d xop %d)\n", inst, get_op(inst), get_xop(inst));
@@ -480,10 +512,11 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu)
}
}
- trace_kvm_ppc_instr(inst, vcpu->arch.pc, emulated);
+ trace_kvm_ppc_instr(inst, kvmppc_get_pc(vcpu), emulated);
+ /* Advance past emulated instruction. */
if (advance)
- vcpu->arch.pc += 4; /* Advance past emulated instruction. */
+ kvmppc_set_pc(vcpu, kvmppc_get_pc(vcpu) + 4);
return emulated;
}
diff --git a/arch/powerpc/kvm/fpu.S b/arch/powerpc/kvm/fpu.S
new file mode 100644
index 0000000..2b340a3
--- /dev/null
+++ b/arch/powerpc/kvm/fpu.S
@@ -0,0 +1,273 @@
+/*
+ * FPU helper code to use FPU operations from inside the kernel
+ *
+ * Copyright (C) 2010 Alexander Graf (agraf@suse.de)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ */
+
+#include <asm/reg.h>
+#include <asm/page.h>
+#include <asm/mmu.h>
+#include <asm/pgtable.h>
+#include <asm/cputable.h>
+#include <asm/cache.h>
+#include <asm/thread_info.h>
+#include <asm/ppc_asm.h>
+#include <asm/asm-offsets.h>
+
+/* Instructions operating on single parameters */
+
+/*
+ * Single operation with one input operand
+ *
+ * R3 = (double*)&fpscr
+ * R4 = (short*)&result
+ * R5 = (short*)&param1
+ */
+#define FPS_ONE_IN(name) \
+_GLOBAL(fps_ ## name); \
+ lfd 0,0(r3); /* load up fpscr value */ \
+ MTFSF_L(0); \
+ lfs 0,0(r5); \
+ \
+ name 0,0; \
+ \
+ stfs 0,0(r4); \
+ mffs 0; \
+ stfd 0,0(r3); /* save new fpscr value */ \
+ blr
+
+/*
+ * Single operation with two input operands
+ *
+ * R3 = (double*)&fpscr
+ * R4 = (short*)&result
+ * R5 = (short*)&param1
+ * R6 = (short*)&param2
+ */
+#define FPS_TWO_IN(name) \
+_GLOBAL(fps_ ## name); \
+ lfd 0,0(r3); /* load up fpscr value */ \
+ MTFSF_L(0); \
+ lfs 0,0(r5); \
+ lfs 1,0(r6); \
+ \
+ name 0,0,1; \
+ \
+ stfs 0,0(r4); \
+ mffs 0; \
+ stfd 0,0(r3); /* save new fpscr value */ \
+ blr
+
+/*
+ * Single operation with three input operands
+ *
+ * R3 = (double*)&fpscr
+ * R4 = (short*)&result
+ * R5 = (short*)&param1
+ * R6 = (short*)&param2
+ * R7 = (short*)&param3
+ */
+#define FPS_THREE_IN(name) \
+_GLOBAL(fps_ ## name); \
+ lfd 0,0(r3); /* load up fpscr value */ \
+ MTFSF_L(0); \
+ lfs 0,0(r5); \
+ lfs 1,0(r6); \
+ lfs 2,0(r7); \
+ \
+ name 0,0,1,2; \
+ \
+ stfs 0,0(r4); \
+ mffs 0; \
+ stfd 0,0(r3); /* save new fpscr value */ \
+ blr
+
+FPS_ONE_IN(fres)
+FPS_ONE_IN(frsqrte)
+FPS_ONE_IN(fsqrts)
+FPS_TWO_IN(fadds)
+FPS_TWO_IN(fdivs)
+FPS_TWO_IN(fmuls)
+FPS_TWO_IN(fsubs)
+FPS_THREE_IN(fmadds)
+FPS_THREE_IN(fmsubs)
+FPS_THREE_IN(fnmadds)
+FPS_THREE_IN(fnmsubs)
+FPS_THREE_IN(fsel)
+
+
+/* Instructions operating on double parameters */
+
+/*
+ * Beginning of double instruction processing
+ *
+ * R3 = (double*)&fpscr
+ * R4 = (u32*)&cr
+ * R5 = (double*)&result
+ * R6 = (double*)&param1
+ * R7 = (double*)&param2 [load_two]
+ * R8 = (double*)&param3 [load_three]
+ * LR = instruction call function
+ */
+fpd_load_three:
+ lfd 2,0(r8) /* load param3 */
+fpd_load_two:
+ lfd 1,0(r7) /* load param2 */
+fpd_load_one:
+ lfd 0,0(r6) /* load param1 */
+fpd_load_none:
+ lfd 3,0(r3) /* load up fpscr value */
+ MTFSF_L(3)
+ lwz r6, 0(r4) /* load cr */
+ mtcr r6
+ blr
+
+/*
+ * End of double instruction processing
+ *
+ * R3 = (double*)&fpscr
+ * R4 = (u32*)&cr
+ * R5 = (double*)&result
+ * LR = caller of instruction call function
+ */
+fpd_return:
+ mfcr r6
+ stfd 0,0(r5) /* save result */
+ mffs 0
+ stfd 0,0(r3) /* save new fpscr value */
+ stw r6,0(r4) /* save new cr value */
+ blr
+
+/*
+ * Double operation with no input operand
+ *
+ * R3 = (double*)&fpscr
+ * R4 = (u32*)&cr
+ * R5 = (double*)&result
+ */
+#define FPD_NONE_IN(name) \
+_GLOBAL(fpd_ ## name); \
+ mflr r12; \
+ bl fpd_load_none; \
+ mtlr r12; \
+ \
+ name. 0; /* call instruction */ \
+ b fpd_return
+
+/*
+ * Double operation with one input operand
+ *
+ * R3 = (double*)&fpscr
+ * R4 = (u32*)&cr
+ * R5 = (double*)&result
+ * R6 = (double*)&param1
+ */
+#define FPD_ONE_IN(name) \
+_GLOBAL(fpd_ ## name); \
+ mflr r12; \
+ bl fpd_load_one; \
+ mtlr r12; \
+ \
+ name. 0,0; /* call instruction */ \
+ b fpd_return
+
+/*
+ * Double operation with two input operands
+ *
+ * R3 = (double*)&fpscr
+ * R4 = (u32*)&cr
+ * R5 = (double*)&result
+ * R6 = (double*)&param1
+ * R7 = (double*)&param2
+ * R8 = (double*)&param3
+ */
+#define FPD_TWO_IN(name) \
+_GLOBAL(fpd_ ## name); \
+ mflr r12; \
+ bl fpd_load_two; \
+ mtlr r12; \
+ \
+ name. 0,0,1; /* call instruction */ \
+ b fpd_return
+
+/*
+ * CR Double operation with two input operands
+ *
+ * R3 = (double*)&fpscr
+ * R4 = (u32*)&cr
+ * R5 = (double*)&param1
+ * R6 = (double*)&param2
+ * R7 = (double*)&param3
+ */
+#define FPD_TWO_IN_CR(name) \
+_GLOBAL(fpd_ ## name); \
+ lfd 1,0(r6); /* load param2 */ \
+ lfd 0,0(r5); /* load param1 */ \
+ lfd 3,0(r3); /* load up fpscr value */ \
+ MTFSF_L(3); \
+ lwz r6, 0(r4); /* load cr */ \
+ mtcr r6; \
+ \
+ name 0,0,1; /* call instruction */ \
+ mfcr r6; \
+ mffs 0; \
+ stfd 0,0(r3); /* save new fpscr value */ \
+ stw r6,0(r4); /* save new cr value */ \
+ blr
+
+/*
+ * Double operation with three input operands
+ *
+ * R3 = (double*)&fpscr
+ * R4 = (u32*)&cr
+ * R5 = (double*)&result
+ * R6 = (double*)&param1
+ * R7 = (double*)&param2
+ * R8 = (double*)&param3
+ */
+#define FPD_THREE_IN(name) \
+_GLOBAL(fpd_ ## name); \
+ mflr r12; \
+ bl fpd_load_three; \
+ mtlr r12; \
+ \
+ name. 0,0,1,2; /* call instruction */ \
+ b fpd_return
+
+FPD_ONE_IN(fsqrts)
+FPD_ONE_IN(frsqrtes)
+FPD_ONE_IN(fres)
+FPD_ONE_IN(frsp)
+FPD_ONE_IN(fctiw)
+FPD_ONE_IN(fctiwz)
+FPD_ONE_IN(fsqrt)
+FPD_ONE_IN(fre)
+FPD_ONE_IN(frsqrte)
+FPD_ONE_IN(fneg)
+FPD_ONE_IN(fabs)
+FPD_TWO_IN(fadds)
+FPD_TWO_IN(fsubs)
+FPD_TWO_IN(fdivs)
+FPD_TWO_IN(fmuls)
+FPD_TWO_IN_CR(fcmpu)
+FPD_TWO_IN(fcpsgn)
+FPD_TWO_IN(fdiv)
+FPD_TWO_IN(fadd)
+FPD_TWO_IN(fmul)
+FPD_TWO_IN_CR(fcmpo)
+FPD_TWO_IN(fsub)
+FPD_THREE_IN(fmsubs)
+FPD_THREE_IN(fmadds)
+FPD_THREE_IN(fnmsubs)
+FPD_THREE_IN(fnmadds)
+FPD_THREE_IN(fsel)
+FPD_THREE_IN(fmsub)
+FPD_THREE_IN(fmadd)
+FPD_THREE_IN(fnmsub)
+FPD_THREE_IN(fnmadd)
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index 297fcd2..9b8683f 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -70,7 +70,7 @@ int kvmppc_emulate_mmio(struct kvm_run *run, struct kvm_vcpu *vcpu)
case EMULATE_FAIL:
/* XXX Deliver Program interrupt to guest. */
printk(KERN_EMERG "%s: emulation failed (%08x)\n", __func__,
- vcpu->arch.last_inst);
+ kvmppc_get_last_inst(vcpu));
r = RESUME_HOST;
break;
default:
@@ -148,6 +148,10 @@ int kvm_dev_ioctl_check_extension(long ext)
switch (ext) {
case KVM_CAP_PPC_SEGSTATE:
+ case KVM_CAP_PPC_PAIRED_SINGLES:
+ case KVM_CAP_PPC_UNSET_IRQ:
+ case KVM_CAP_ENABLE_CAP:
+ case KVM_CAP_PPC_OSI:
r = 1;
break;
case KVM_CAP_COALESCED_MMIO:
@@ -193,12 +197,17 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id)
{
struct kvm_vcpu *vcpu;
vcpu = kvmppc_core_vcpu_create(kvm, id);
- kvmppc_create_vcpu_debugfs(vcpu, id);
+ if (!IS_ERR(vcpu))
+ kvmppc_create_vcpu_debugfs(vcpu, id);
return vcpu;
}
void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu)
{
+ /* Make sure we're not using the vcpu anymore */
+ hrtimer_cancel(&vcpu->arch.dec_timer);
+ tasklet_kill(&vcpu->arch.tasklet);
+
kvmppc_remove_vcpu_debugfs(vcpu);
kvmppc_core_vcpu_free(vcpu);
}
@@ -278,7 +287,7 @@ static void kvmppc_complete_dcr_load(struct kvm_vcpu *vcpu,
static void kvmppc_complete_mmio_load(struct kvm_vcpu *vcpu,
struct kvm_run *run)
{
- ulong gpr;
+ u64 gpr;
if (run->mmio.len > sizeof(gpr)) {
printk(KERN_ERR "bad MMIO length: %d\n", run->mmio.len);
@@ -287,6 +296,7 @@ static void kvmppc_complete_mmio_load(struct kvm_vcpu *vcpu,
if (vcpu->arch.mmio_is_bigendian) {
switch (run->mmio.len) {
+ case 8: gpr = *(u64 *)run->mmio.data; break;
case 4: gpr = *(u32 *)run->mmio.data; break;
case 2: gpr = *(u16 *)run->mmio.data; break;
case 1: gpr = *(u8 *)run->mmio.data; break;
@@ -300,7 +310,43 @@ static void kvmppc_complete_mmio_load(struct kvm_vcpu *vcpu,
}
}
+ if (vcpu->arch.mmio_sign_extend) {
+ switch (run->mmio.len) {
+#ifdef CONFIG_PPC64
+ case 4:
+ gpr = (s64)(s32)gpr;
+ break;
+#endif
+ case 2:
+ gpr = (s64)(s16)gpr;
+ break;
+ case 1:
+ gpr = (s64)(s8)gpr;
+ break;
+ }
+ }
+
kvmppc_set_gpr(vcpu, vcpu->arch.io_gpr, gpr);
+
+ switch (vcpu->arch.io_gpr & KVM_REG_EXT_MASK) {
+ case KVM_REG_GPR:
+ kvmppc_set_gpr(vcpu, vcpu->arch.io_gpr, gpr);
+ break;
+ case KVM_REG_FPR:
+ vcpu->arch.fpr[vcpu->arch.io_gpr & KVM_REG_MASK] = gpr;
+ break;
+#ifdef CONFIG_PPC_BOOK3S
+ case KVM_REG_QPR:
+ vcpu->arch.qpr[vcpu->arch.io_gpr & KVM_REG_MASK] = gpr;
+ break;
+ case KVM_REG_FQPR:
+ vcpu->arch.fpr[vcpu->arch.io_gpr & KVM_REG_MASK] = gpr;
+ vcpu->arch.qpr[vcpu->arch.io_gpr & KVM_REG_MASK] = gpr;
+ break;
+#endif
+ default:
+ BUG();
+ }
}
int kvmppc_handle_load(struct kvm_run *run, struct kvm_vcpu *vcpu,
@@ -319,12 +365,25 @@ int kvmppc_handle_load(struct kvm_run *run, struct kvm_vcpu *vcpu,
vcpu->arch.mmio_is_bigendian = is_bigendian;
vcpu->mmio_needed = 1;
vcpu->mmio_is_write = 0;
+ vcpu->arch.mmio_sign_extend = 0;
return EMULATE_DO_MMIO;
}
+/* Same as above, but sign extends */
+int kvmppc_handle_loads(struct kvm_run *run, struct kvm_vcpu *vcpu,
+ unsigned int rt, unsigned int bytes, int is_bigendian)
+{
+ int r;
+
+ r = kvmppc_handle_load(run, vcpu, rt, bytes, is_bigendian);
+ vcpu->arch.mmio_sign_extend = 1;
+
+ return r;
+}
+
int kvmppc_handle_store(struct kvm_run *run, struct kvm_vcpu *vcpu,
- u32 val, unsigned int bytes, int is_bigendian)
+ u64 val, unsigned int bytes, int is_bigendian)
{
void *data = run->mmio.data;
@@ -342,6 +401,7 @@ int kvmppc_handle_store(struct kvm_run *run, struct kvm_vcpu *vcpu,
/* Store the value at the lowest bytes in 'data'. */
if (is_bigendian) {
switch (bytes) {
+ case 8: *(u64 *)data = val; break;
case 4: *(u32 *)data = val; break;
case 2: *(u16 *)data = val; break;
case 1: *(u8 *)data = val; break;
@@ -376,6 +436,13 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
if (!vcpu->arch.dcr_is_write)
kvmppc_complete_dcr_load(vcpu, run);
vcpu->arch.dcr_needed = 0;
+ } else if (vcpu->arch.osi_needed) {
+ u64 *gprs = run->osi.gprs;
+ int i;
+
+ for (i = 0; i < 32; i++)
+ kvmppc_set_gpr(vcpu, i, gprs[i]);
+ vcpu->arch.osi_needed = 0;
}
kvmppc_core_deliver_interrupts(vcpu);
@@ -396,7 +463,10 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, struct kvm_interrupt *irq)
{
- kvmppc_core_queue_external(vcpu, irq);
+ if (irq->irq == KVM_INTERRUPT_UNSET)
+ kvmppc_core_dequeue_external(vcpu, irq);
+ else
+ kvmppc_core_queue_external(vcpu, irq);
if (waitqueue_active(&vcpu->wq)) {
wake_up_interruptible(&vcpu->wq);
@@ -406,6 +476,27 @@ int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, struct kvm_interrupt *irq)
return 0;
}
+static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu,
+ struct kvm_enable_cap *cap)
+{
+ int r;
+
+ if (cap->flags)
+ return -EINVAL;
+
+ switch (cap->cap) {
+ case KVM_CAP_PPC_OSI:
+ r = 0;
+ vcpu->arch.osi_enabled = true;
+ break;
+ default:
+ r = -EINVAL;
+ break;
+ }
+
+ return r;
+}
+
int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
struct kvm_mp_state *mp_state)
{
@@ -434,6 +525,15 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
r = kvm_vcpu_ioctl_interrupt(vcpu, &irq);
break;
}
+ case KVM_ENABLE_CAP:
+ {
+ struct kvm_enable_cap cap;
+ r = -EFAULT;
+ if (copy_from_user(&cap, argp, sizeof(cap)))
+ goto out;
+ r = kvm_vcpu_ioctl_enable_cap(vcpu, &cap);
+ break;
+ }
default:
r = -EINVAL;
}
diff --git a/arch/powerpc/mm/mmu_context_hash32.c b/arch/powerpc/mm/mmu_context_hash32.c
index 0dfba2b..d0ee554 100644
--- a/arch/powerpc/mm/mmu_context_hash32.c
+++ b/arch/powerpc/mm/mmu_context_hash32.c
@@ -60,11 +60,7 @@
static unsigned long next_mmu_context;
static unsigned long context_map[LAST_CONTEXT / BITS_PER_LONG + 1];
-
-/*
- * Set up the context for a new address space.
- */
-int init_new_context(struct task_struct *t, struct mm_struct *mm)
+unsigned long __init_new_context(void)
{
unsigned long ctx = next_mmu_context;
@@ -74,19 +70,38 @@ int init_new_context(struct task_struct *t, struct mm_struct *mm)
ctx = 0;
}
next_mmu_context = (ctx + 1) & LAST_CONTEXT;
- mm->context.id = ctx;
+
+ return ctx;
+}
+EXPORT_SYMBOL_GPL(__init_new_context);
+
+/*
+ * Set up the context for a new address space.
+ */
+int init_new_context(struct task_struct *t, struct mm_struct *mm)
+{
+ mm->context.id = __init_new_context();
return 0;
}
/*
+ * Free a context ID. Make sure to call this with preempt disabled!
+ */
+void __destroy_context(unsigned long ctx)
+{
+ clear_bit(ctx, context_map);
+}
+EXPORT_SYMBOL_GPL(__destroy_context);
+
+/*
* We're finished using the context for an address space.
*/
void destroy_context(struct mm_struct *mm)
{
preempt_disable();
if (mm->context.id != NO_CONTEXT) {
- clear_bit(mm->context.id, context_map);
+ __destroy_context(mm->context.id);
mm->context.id = NO_CONTEXT;
}
preempt_enable();
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index 4929286..8093e6f 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -341,11 +341,13 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm,
rc = kvm_vcpu_init(vcpu, kvm, id);
if (rc)
- goto out_free_cpu;
+ goto out_free_sie_block;
VM_EVENT(kvm, 3, "create cpu %d at %p, sie block at %p", id, vcpu,
vcpu->arch.sie_block);
return vcpu;
+out_free_sie_block:
+ free_page((unsigned long)(vcpu->arch.sie_block));
out_free_cpu:
kfree(vcpu);
out_nomem:
@@ -750,7 +752,7 @@ gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn)
static int __init kvm_s390_init(void)
{
int ret;
- ret = kvm_init(NULL, sizeof(struct kvm_vcpu), THIS_MODULE);
+ ret = kvm_init(NULL, sizeof(struct kvm_vcpu), 0, THIS_MODULE);
if (ret)
return ret;
diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h
index 60f09ab..cfa9d17 100644
--- a/arch/s390/kvm/kvm-s390.h
+++ b/arch/s390/kvm/kvm-s390.h
@@ -72,7 +72,7 @@ static inline void kvm_s390_vcpu_set_mem(struct kvm_vcpu *vcpu)
struct kvm_memslots *memslots;
idx = srcu_read_lock(&vcpu->kvm->srcu);
- memslots = rcu_dereference(vcpu->kvm->memslots);
+ memslots = kvm_memslots(vcpu->kvm);
mem = &memslots->memslots[0];
diff --git a/arch/x86/include/asm/kvm.h b/arch/x86/include/asm/kvm.h
index f46b79f..ff90055 100644
--- a/arch/x86/include/asm/kvm.h
+++ b/arch/x86/include/asm/kvm.h
@@ -21,6 +21,7 @@
#define __KVM_HAVE_PIT_STATE2
#define __KVM_HAVE_XEN_HVM
#define __KVM_HAVE_VCPU_EVENTS
+#define __KVM_HAVE_DEBUGREGS
/* Architectural interrupt line count. */
#define KVM_NR_INTERRUPTS 256
@@ -257,6 +258,11 @@ struct kvm_reinject_control {
/* When set in flags, include corresponding fields on KVM_SET_VCPU_EVENTS */
#define KVM_VCPUEVENT_VALID_NMI_PENDING 0x00000001
#define KVM_VCPUEVENT_VALID_SIPI_VECTOR 0x00000002
+#define KVM_VCPUEVENT_VALID_SHADOW 0x00000004
+
+/* Interrupt shadow states */
+#define KVM_X86_SHADOW_INT_MOV_SS 0x01
+#define KVM_X86_SHADOW_INT_STI 0x02
/* for KVM_GET/SET_VCPU_EVENTS */
struct kvm_vcpu_events {
@@ -271,7 +277,7 @@ struct kvm_vcpu_events {
__u8 injected;
__u8 nr;
__u8 soft;
- __u8 pad;
+ __u8 shadow;
} interrupt;
struct {
__u8 injected;
@@ -284,4 +290,13 @@ struct kvm_vcpu_events {
__u32 reserved[10];
};
+/* for KVM_GET/SET_DEBUGREGS */
+struct kvm_debugregs {
+ __u64 db[4];
+ __u64 dr6;
+ __u64 dr7;
+ __u64 flags;
+ __u64 reserved[9];
+};
+
#endif /* _ASM_X86_KVM_H */
diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h
index 7a6f54f..0b2729b 100644
--- a/arch/x86/include/asm/kvm_emulate.h
+++ b/arch/x86/include/asm/kvm_emulate.h
@@ -11,6 +11,8 @@
#ifndef _ASM_X86_KVM_X86_EMULATE_H
#define _ASM_X86_KVM_X86_EMULATE_H
+#include <asm/desc_defs.h>
+
struct x86_emulate_ctxt;
/*
@@ -63,6 +65,15 @@ struct x86_emulate_ops {
unsigned int bytes, struct kvm_vcpu *vcpu, u32 *error);
/*
+ * write_std: Write bytes of standard (non-emulated/special) memory.
+ * Used for descriptor writing.
+ * @addr: [IN ] Linear address to which to write.
+ * @val: [OUT] Value write to memory, zero-extended to 'u_long'.
+ * @bytes: [IN ] Number of bytes to write to memory.
+ */
+ int (*write_std)(unsigned long addr, void *val,
+ unsigned int bytes, struct kvm_vcpu *vcpu, u32 *error);
+ /*
* fetch: Read bytes of standard (non-emulated/special) memory.
* Used for instruction fetch.
* @addr: [IN ] Linear address from which to read.
@@ -109,6 +120,23 @@ struct x86_emulate_ops {
unsigned int bytes,
struct kvm_vcpu *vcpu);
+ int (*pio_in_emulated)(int size, unsigned short port, void *val,
+ unsigned int count, struct kvm_vcpu *vcpu);
+
+ int (*pio_out_emulated)(int size, unsigned short port, const void *val,
+ unsigned int count, struct kvm_vcpu *vcpu);
+
+ bool (*get_cached_descriptor)(struct desc_struct *desc,
+ int seg, struct kvm_vcpu *vcpu);
+ void (*set_cached_descriptor)(struct desc_struct *desc,
+ int seg, struct kvm_vcpu *vcpu);
+ u16 (*get_segment_selector)(int seg, struct kvm_vcpu *vcpu);
+ void (*set_segment_selector)(u16 sel, int seg, struct kvm_vcpu *vcpu);
+ void (*get_gdt)(struct desc_ptr *dt, struct kvm_vcpu *vcpu);
+ ulong (*get_cr)(int cr, struct kvm_vcpu *vcpu);
+ void (*set_cr)(int cr, ulong val, struct kvm_vcpu *vcpu);
+ int (*cpl)(struct kvm_vcpu *vcpu);
+ void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags);
};
/* Type, address-of, and value of an instruction's operand. */
@@ -124,6 +152,12 @@ struct fetch_cache {
unsigned long end;
};
+struct read_cache {
+ u8 data[1024];
+ unsigned long pos;
+ unsigned long end;
+};
+
struct decode_cache {
u8 twobyte;
u8 b;
@@ -139,7 +173,7 @@ struct decode_cache {
u8 seg_override;
unsigned int d;
unsigned long regs[NR_VCPU_REGS];
- unsigned long eip, eip_orig;
+ unsigned long eip;
/* modrm */
u8 modrm;
u8 modrm_mod;
@@ -151,16 +185,15 @@ struct decode_cache {
void *modrm_ptr;
unsigned long modrm_val;
struct fetch_cache fetch;
+ struct read_cache io_read;
};
-#define X86_SHADOW_INT_MOV_SS 1
-#define X86_SHADOW_INT_STI 2
-
struct x86_emulate_ctxt {
/* Register state before/after emulation. */
struct kvm_vcpu *vcpu;
unsigned long eflags;
+ unsigned long eip; /* eip before instruction emulation */
/* Emulated execution mode, represented by an X86EMUL_MODE value. */
int mode;
u32 cs_base;
@@ -168,6 +201,7 @@ struct x86_emulate_ctxt {
/* interruptibility state, as a result of execution of STI or MOV SS */
int interruptibility;
+ bool restart; /* restart string instruction after writeback */
/* decode cache */
struct decode_cache decode;
};
@@ -194,5 +228,9 @@ int x86_decode_insn(struct x86_emulate_ctxt *ctxt,
struct x86_emulate_ops *ops);
int x86_emulate_insn(struct x86_emulate_ctxt *ctxt,
struct x86_emulate_ops *ops);
+int emulator_task_switch(struct x86_emulate_ctxt *ctxt,
+ struct x86_emulate_ops *ops,
+ u16 tss_selector, int reason,
+ bool has_error_code, u32 error_code);
#endif /* _ASM_X86_KVM_X86_EMULATE_H */
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 06d9e79..76f5483 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -171,15 +171,15 @@ struct kvm_pte_chain {
union kvm_mmu_page_role {
unsigned word;
struct {
- unsigned glevels:4;
unsigned level:4;
+ unsigned cr4_pae:1;
unsigned quadrant:2;
unsigned pad_for_nice_hex_output:6;
unsigned direct:1;
unsigned access:3;
unsigned invalid:1;
- unsigned cr4_pge:1;
unsigned nxe:1;
+ unsigned cr0_wp:1;
};
};
@@ -187,8 +187,6 @@ struct kvm_mmu_page {
struct list_head link;
struct hlist_node hash_link;
- struct list_head oos_link;
-
/*
* The following two entries are used to key the shadow page in the
* hash table.
@@ -204,9 +202,9 @@ struct kvm_mmu_page {
* in this shadow page.
*/
DECLARE_BITMAP(slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS);
- int multimapped; /* More than one parent_pte? */
- int root_count; /* Currently serving as active root */
+ bool multimapped; /* More than one parent_pte? */
bool unsync;
+ int root_count; /* Currently serving as active root */
unsigned int unsync_children;
union {
u64 *parent_pte; /* !multimapped */
@@ -224,14 +222,9 @@ struct kvm_pv_mmu_op_buffer {
struct kvm_pio_request {
unsigned long count;
- int cur_count;
- gva_t guest_gva;
int in;
int port;
int size;
- int string;
- int down;
- int rep;
};
/*
@@ -320,6 +313,7 @@ struct kvm_vcpu_arch {
struct kvm_queued_exception {
bool pending;
bool has_error_code;
+ bool reinject;
u8 nr;
u32 error_code;
} exception;
@@ -362,8 +356,8 @@ struct kvm_vcpu_arch {
u64 *mce_banks;
/* used for guest single stepping over the given code position */
- u16 singlestep_cs;
unsigned long singlestep_rip;
+
/* fields used by HYPER-V emulation */
u64 hv_vapic;
};
@@ -389,6 +383,7 @@ struct kvm_arch {
unsigned int n_free_mmu_pages;
unsigned int n_requested_mmu_pages;
unsigned int n_alloc_mmu_pages;
+ atomic_t invlpg_counter;
struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES];
/*
* Hash table of struct kvm_mmu_page.
@@ -461,11 +456,6 @@ struct kvm_vcpu_stat {
u32 nmi_injections;
};
-struct descriptor_table {
- u16 limit;
- unsigned long base;
-} __attribute__((packed));
-
struct kvm_x86_ops {
int (*cpu_has_kvm_support)(void); /* __init */
int (*disabled_by_bios)(void); /* __init */
@@ -503,12 +493,11 @@ struct kvm_x86_ops {
void (*set_cr3)(struct kvm_vcpu *vcpu, unsigned long cr3);
void (*set_cr4)(struct kvm_vcpu *vcpu, unsigned long cr4);
void (*set_efer)(struct kvm_vcpu *vcpu, u64 efer);
- void (*get_idt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt);
- void (*set_idt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt);
- void (*get_gdt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt);
- void (*set_gdt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt);
- int (*get_dr)(struct kvm_vcpu *vcpu, int dr, unsigned long *dest);
- int (*set_dr)(struct kvm_vcpu *vcpu, int dr, unsigned long value);
+ void (*get_idt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt);
+ void (*set_idt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt);
+ void (*get_gdt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt);
+ void (*set_gdt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt);
+ void (*set_dr7)(struct kvm_vcpu *vcpu, unsigned long value);
void (*cache_reg)(struct kvm_vcpu *vcpu, enum kvm_reg reg);
unsigned long (*get_rflags)(struct kvm_vcpu *vcpu);
void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags);
@@ -527,7 +516,8 @@ struct kvm_x86_ops {
void (*set_irq)(struct kvm_vcpu *vcpu);
void (*set_nmi)(struct kvm_vcpu *vcpu);
void (*queue_exception)(struct kvm_vcpu *vcpu, unsigned nr,
- bool has_error_code, u32 error_code);
+ bool has_error_code, u32 error_code,
+ bool reinject);
int (*interrupt_allowed)(struct kvm_vcpu *vcpu);
int (*nmi_allowed)(struct kvm_vcpu *vcpu);
bool (*get_nmi_mask)(struct kvm_vcpu *vcpu);
@@ -541,6 +531,8 @@ struct kvm_x86_ops {
int (*get_lpage_level)(void);
bool (*rdtscp_supported)(void);
+ void (*set_supported_cpuid)(u32 func, struct kvm_cpuid_entry2 *entry);
+
const struct trace_print_flags *exit_reasons_str;
};
@@ -587,23 +579,14 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
void kvm_report_emulation_failure(struct kvm_vcpu *cvpu, const char *context);
void realmode_lgdt(struct kvm_vcpu *vcpu, u16 size, unsigned long address);
void realmode_lidt(struct kvm_vcpu *vcpu, u16 size, unsigned long address);
-void realmode_lmsw(struct kvm_vcpu *vcpu, unsigned long msw,
- unsigned long *rflags);
-unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr);
-void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long value,
- unsigned long *rflags);
void kvm_enable_efer_bits(u64);
int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *data);
int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data);
struct x86_emulate_ctxt;
-int kvm_emulate_pio(struct kvm_vcpu *vcpu, int in,
- int size, unsigned port);
-int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, int in,
- int size, unsigned long count, int down,
- gva_t address, int rep, unsigned port);
+int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size, unsigned short port);
void kvm_emulate_cpuid(struct kvm_vcpu *vcpu);
int kvm_emulate_halt(struct kvm_vcpu *vcpu);
int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address);
@@ -616,12 +599,15 @@ int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr,
void kvm_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg);
int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, int seg);
-int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason);
+int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason,
+ bool has_error_code, u32 error_code);
void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0);
void kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3);
void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4);
void kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8);
+int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val);
+int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val);
unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu);
void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw);
void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l);
@@ -634,6 +620,8 @@ void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags);
void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr);
void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code);
+void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned nr);
+void kvm_requeue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code);
void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long cr2,
u32 error_code);
bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl);
@@ -649,8 +637,6 @@ int emulator_write_emulated(unsigned long addr,
unsigned int bytes,
struct kvm_vcpu *vcpu);
-unsigned long segment_base(u16 selector);
-
void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu);
void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
const u8 *new, int bytes,
@@ -675,7 +661,6 @@ void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva);
void kvm_enable_tdp(void);
void kvm_disable_tdp(void);
-int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3);
int complete_pio(struct kvm_vcpu *vcpu);
bool kvm_check_iopl(struct kvm_vcpu *vcpu);
@@ -724,23 +709,6 @@ static inline void kvm_load_ldt(u16 sel)
asm("lldt %0" : : "rm"(sel));
}
-static inline void kvm_get_idt(struct descriptor_table *table)
-{
- asm("sidt %0" : "=m"(*table));
-}
-
-static inline void kvm_get_gdt(struct descriptor_table *table)
-{
- asm("sgdt %0" : "=m"(*table));
-}
-
-static inline unsigned long kvm_read_tr_base(void)
-{
- u16 tr;
- asm("str %0" : "=g"(tr));
- return segment_base(tr);
-}
-
#ifdef CONFIG_X86_64
static inline unsigned long read_msr(unsigned long msr)
{
@@ -826,4 +794,6 @@ int kvm_cpu_get_interrupt(struct kvm_vcpu *v);
void kvm_define_shared_msr(unsigned index, u32 msr);
void kvm_set_shared_msr(unsigned index, u64 val, u64 mask);
+bool kvm_is_linear_rip(struct kvm_vcpu *vcpu, unsigned long linear_rip);
+
#endif /* _ASM_X86_KVM_HOST_H */
diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h
index ffae142..05eba5e 100644
--- a/arch/x86/include/asm/kvm_para.h
+++ b/arch/x86/include/asm/kvm_para.h
@@ -16,10 +16,23 @@
#define KVM_FEATURE_CLOCKSOURCE 0
#define KVM_FEATURE_NOP_IO_DELAY 1
#define KVM_FEATURE_MMU_OP 2
+/* This indicates that the new set of kvmclock msrs
+ * are available. The use of 0x11 and 0x12 is deprecated
+ */
+#define KVM_FEATURE_CLOCKSOURCE2 3
+
+/* The last 8 bits are used to indicate how to interpret the flags field
+ * in pvclock structure. If no bits are set, all flags are ignored.
+ */
+#define KVM_FEATURE_CLOCKSOURCE_STABLE_BIT 24
#define MSR_KVM_WALL_CLOCK 0x11
#define MSR_KVM_SYSTEM_TIME 0x12
+/* Custom MSRs falls in the range 0x4b564d00-0x4b564dff */
+#define MSR_KVM_WALL_CLOCK_NEW 0x4b564d00
+#define MSR_KVM_SYSTEM_TIME_NEW 0x4b564d01
+
#define KVM_MAX_MMU_OP_BATCH 32
/* Operations for KVM_HC_MMU_OP */
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index bc473ac..f932485 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -202,8 +202,9 @@
#define MSR_IA32_EBL_CR_POWERON 0x0000002a
#define MSR_IA32_FEATURE_CONTROL 0x0000003a
-#define FEATURE_CONTROL_LOCKED (1<<0)
-#define FEATURE_CONTROL_VMXON_ENABLED (1<<2)
+#define FEATURE_CONTROL_LOCKED (1<<0)
+#define FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX (1<<1)
+#define FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX (1<<2)
#define MSR_IA32_APICBASE 0x0000001b
#define MSR_IA32_APICBASE_BSP (1<<8)
diff --git a/arch/x86/include/asm/pvclock-abi.h b/arch/x86/include/asm/pvclock-abi.h
index 6d93508..35f2d19 100644
--- a/arch/x86/include/asm/pvclock-abi.h
+++ b/arch/x86/include/asm/pvclock-abi.h
@@ -29,7 +29,8 @@ struct pvclock_vcpu_time_info {
u64 system_time;
u32 tsc_to_system_mul;
s8 tsc_shift;
- u8 pad[3];
+ u8 flags;
+ u8 pad[2];
} __attribute__((__packed__)); /* 32 bytes */
struct pvclock_wall_clock {
@@ -38,5 +39,6 @@ struct pvclock_wall_clock {
u32 nsec;
} __attribute__((__packed__));
+#define PVCLOCK_TSC_STABLE_BIT (1 << 0)
#endif /* __ASSEMBLY__ */
#endif /* _ASM_X86_PVCLOCK_ABI_H */
diff --git a/arch/x86/include/asm/pvclock.h b/arch/x86/include/asm/pvclock.h
index 53235fd..cd02f32 100644
--- a/arch/x86/include/asm/pvclock.h
+++ b/arch/x86/include/asm/pvclock.h
@@ -6,6 +6,7 @@
/* some helper functions for xen and kvm pv clock sources */
cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src);
+void pvclock_set_flags(u8 flags);
unsigned long pvclock_tsc_khz(struct pvclock_vcpu_time_info *src);
void pvclock_read_wallclock(struct pvclock_wall_clock *wall,
struct pvclock_vcpu_time_info *vcpu,
diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h
index 38638cd..0e83105 100644
--- a/arch/x86/include/asm/svm.h
+++ b/arch/x86/include/asm/svm.h
@@ -81,7 +81,9 @@ struct __attribute__ ((__packed__)) vmcb_control_area {
u32 event_inj_err;
u64 nested_cr3;
u64 lbr_ctl;
- u8 reserved_5[832];
+ u64 reserved_5;
+ u64 next_rip;
+ u8 reserved_6[816];
};
@@ -115,6 +117,10 @@ struct __attribute__ ((__packed__)) vmcb_control_area {
#define SVM_IOIO_SIZE_MASK (7 << SVM_IOIO_SIZE_SHIFT)
#define SVM_IOIO_ASIZE_MASK (7 << SVM_IOIO_ASIZE_SHIFT)
+#define SVM_VM_CR_VALID_MASK 0x001fULL
+#define SVM_VM_CR_SVM_LOCK_MASK 0x0008ULL
+#define SVM_VM_CR_SVM_DIS_MASK 0x0010ULL
+
struct __attribute__ ((__packed__)) vmcb_seg {
u16 selector;
u16 attrib;
@@ -238,6 +244,7 @@ struct __attribute__ ((__packed__)) vmcb {
#define SVM_EXITINFOSHIFT_TS_REASON_IRET 36
#define SVM_EXITINFOSHIFT_TS_REASON_JMP 38
+#define SVM_EXITINFOSHIFT_TS_HAS_ERROR_CODE 44
#define SVM_EXIT_READ_CR0 0x000
#define SVM_EXIT_READ_CR3 0x003
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index fb9a080..9e6779f 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -25,6 +25,8 @@
*
*/
+#include <linux/types.h>
+
/*
* Definitions of Primary Processor-Based VM-Execution Controls.
*/
@@ -120,6 +122,8 @@ enum vmcs_field {
GUEST_IA32_DEBUGCTL_HIGH = 0x00002803,
GUEST_IA32_PAT = 0x00002804,
GUEST_IA32_PAT_HIGH = 0x00002805,
+ GUEST_IA32_EFER = 0x00002806,
+ GUEST_IA32_EFER_HIGH = 0x00002807,
GUEST_PDPTR0 = 0x0000280a,
GUEST_PDPTR0_HIGH = 0x0000280b,
GUEST_PDPTR1 = 0x0000280c,
@@ -130,6 +134,8 @@ enum vmcs_field {
GUEST_PDPTR3_HIGH = 0x00002811,
HOST_IA32_PAT = 0x00002c00,
HOST_IA32_PAT_HIGH = 0x00002c01,
+ HOST_IA32_EFER = 0x00002c02,
+ HOST_IA32_EFER_HIGH = 0x00002c03,
PIN_BASED_VM_EXEC_CONTROL = 0x00004000,
CPU_BASED_VM_EXEC_CONTROL = 0x00004002,
EXCEPTION_BITMAP = 0x00004004,
@@ -394,6 +400,10 @@ enum vmcs_field {
#define ASM_VMX_INVEPT ".byte 0x66, 0x0f, 0x38, 0x80, 0x08"
#define ASM_VMX_INVVPID ".byte 0x66, 0x0f, 0x38, 0x81, 0x08"
-
+struct vmx_msr_entry {
+ u32 index;
+ u32 reserved;
+ u64 value;
+} __aligned(16);
#endif
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index feaeb0d..eb9b76c 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -29,6 +29,8 @@
#define KVM_SCALE 22
static int kvmclock = 1;
+static int msr_kvm_system_time = MSR_KVM_SYSTEM_TIME;
+static int msr_kvm_wall_clock = MSR_KVM_WALL_CLOCK;
static int parse_no_kvmclock(char *arg)
{
@@ -54,7 +56,8 @@ static unsigned long kvm_get_wallclock(void)
low = (int)__pa_symbol(&wall_clock);
high = ((u64)__pa_symbol(&wall_clock) >> 32);
- native_write_msr(MSR_KVM_WALL_CLOCK, low, high);
+
+ native_write_msr(msr_kvm_wall_clock, low, high);
vcpu_time = &get_cpu_var(hv_clock);
pvclock_read_wallclock(&wall_clock, vcpu_time, &ts);
@@ -130,7 +133,8 @@ static int kvm_register_clock(char *txt)
high = ((u64)__pa(&per_cpu(hv_clock, cpu)) >> 32);
printk(KERN_INFO "kvm-clock: cpu %d, msr %x:%x, %s\n",
cpu, high, low, txt);
- return native_write_msr_safe(MSR_KVM_SYSTEM_TIME, low, high);
+
+ return native_write_msr_safe(msr_kvm_system_time, low, high);
}
#ifdef CONFIG_X86_LOCAL_APIC
@@ -165,14 +169,14 @@ static void __init kvm_smp_prepare_boot_cpu(void)
#ifdef CONFIG_KEXEC
static void kvm_crash_shutdown(struct pt_regs *regs)
{
- native_write_msr_safe(MSR_KVM_SYSTEM_TIME, 0, 0);
+ native_write_msr(msr_kvm_system_time, 0, 0);
native_machine_crash_shutdown(regs);
}
#endif
static void kvm_shutdown(void)
{
- native_write_msr_safe(MSR_KVM_SYSTEM_TIME, 0, 0);
+ native_write_msr(msr_kvm_system_time, 0, 0);
native_machine_shutdown();
}
@@ -181,27 +185,37 @@ void __init kvmclock_init(void)
if (!kvm_para_available())
return;
- if (kvmclock && kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE)) {
- if (kvm_register_clock("boot clock"))
- return;
- pv_time_ops.sched_clock = kvm_clock_read;
- x86_platform.calibrate_tsc = kvm_get_tsc_khz;
- x86_platform.get_wallclock = kvm_get_wallclock;
- x86_platform.set_wallclock = kvm_set_wallclock;
+ if (kvmclock && kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE2)) {
+ msr_kvm_system_time = MSR_KVM_SYSTEM_TIME_NEW;
+ msr_kvm_wall_clock = MSR_KVM_WALL_CLOCK_NEW;
+ } else if (!(kvmclock && kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE)))
+ return;
+
+ printk(KERN_INFO "kvm-clock: Using msrs %x and %x",
+ msr_kvm_system_time, msr_kvm_wall_clock);
+
+ if (kvm_register_clock("boot clock"))
+ return;
+ pv_time_ops.sched_clock = kvm_clock_read;
+ x86_platform.calibrate_tsc = kvm_get_tsc_khz;
+ x86_platform.get_wallclock = kvm_get_wallclock;
+ x86_platform.set_wallclock = kvm_set_wallclock;
#ifdef CONFIG_X86_LOCAL_APIC
- x86_cpuinit.setup_percpu_clockev =
- kvm_setup_secondary_clock;
+ x86_cpuinit.setup_percpu_clockev =
+ kvm_setup_secondary_clock;
#endif
#ifdef CONFIG_SMP
- smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu;
+ smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu;
#endif
- machine_ops.shutdown = kvm_shutdown;
+ machine_ops.shutdown = kvm_shutdown;
#ifdef CONFIG_KEXEC
- machine_ops.crash_shutdown = kvm_crash_shutdown;
+ machine_ops.crash_shutdown = kvm_crash_shutdown;
#endif
- kvm_get_preset_lpj();
- clocksource_register(&kvm_clock);
- pv_info.paravirt_enabled = 1;
- pv_info.name = "KVM";
- }
+ kvm_get_preset_lpj();
+ clocksource_register(&kvm_clock);
+ pv_info.paravirt_enabled = 1;
+ pv_info.name = "KVM";
+
+ if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE_STABLE_BIT))
+ pvclock_set_flags(PVCLOCK_TSC_STABLE_BIT);
}
diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c
index 03801f2..239427c 100644
--- a/arch/x86/kernel/pvclock.c
+++ b/arch/x86/kernel/pvclock.c
@@ -31,8 +31,16 @@ struct pvclock_shadow_time {
u32 tsc_to_nsec_mul;
int tsc_shift;
u32 version;
+ u8 flags;
};
+static u8 valid_flags __read_mostly = 0;
+
+void pvclock_set_flags(u8 flags)
+{
+ valid_flags = flags;
+}
+
/*
* Scale a 64-bit delta by scaling and multiplying by a 32-bit fraction,
* yielding a 64-bit result.
@@ -91,6 +99,7 @@ static unsigned pvclock_get_time_values(struct pvclock_shadow_time *dst,
dst->system_timestamp = src->system_time;
dst->tsc_to_nsec_mul = src->tsc_to_system_mul;
dst->tsc_shift = src->tsc_shift;
+ dst->flags = src->flags;
rmb(); /* test version after fetching data */
} while ((src->version & 1) || (dst->version != src->version));
@@ -109,11 +118,14 @@ unsigned long pvclock_tsc_khz(struct pvclock_vcpu_time_info *src)
return pv_tsc_khz;
}
+static atomic64_t last_value = ATOMIC64_INIT(0);
+
cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src)
{
struct pvclock_shadow_time shadow;
unsigned version;
cycle_t ret, offset;
+ u64 last;
do {
version = pvclock_get_time_values(&shadow, src);
@@ -123,6 +135,31 @@ cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src)
barrier();
} while (version != src->version);
+ if ((valid_flags & PVCLOCK_TSC_STABLE_BIT) &&
+ (shadow.flags & PVCLOCK_TSC_STABLE_BIT))
+ return ret;
+
+ /*
+ * Assumption here is that last_value, a global accumulator, always goes
+ * forward. If we are less than that, we should not be much smaller.
+ * We assume there is an error marging we're inside, and then the correction
+ * does not sacrifice accuracy.
+ *
+ * For reads: global may have changed between test and return,
+ * but this means someone else updated poked the clock at a later time.
+ * We just need to make sure we are not seeing a backwards event.
+ *
+ * For updates: last_value = ret is not enough, since two vcpus could be
+ * updating at the same time, and one of them could be slightly behind,
+ * making the assumption that last_value always go forward fail to hold.
+ */
+ last = atomic64_read(&last_value);
+ do {
+ if (ret < last)
+ return last;
+ last = atomic64_cmpxchg(&last_value, last, ret);
+ } while (unlikely(last != ret));
+
return ret;
}
diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c
index cc2c604..c2f1b26 100644
--- a/arch/x86/kernel/tboot.c
+++ b/arch/x86/kernel/tboot.c
@@ -46,6 +46,7 @@
/* Global pointer to shared data; NULL means no measured launch. */
struct tboot *tboot __read_mostly;
+EXPORT_SYMBOL(tboot);
/* timeout for APs (in secs) to enter wait-for-SIPI state during shutdown */
#define AP_WAIT_TIMEOUT 1
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 4dade6a..5ac0bb4 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -33,6 +33,7 @@
#include <asm/kvm_emulate.h>
#include "x86.h"
+#include "tss.h"
/*
* Opcode effective-address decode tables.
@@ -50,6 +51,8 @@
#define DstReg (2<<1) /* Register operand. */
#define DstMem (3<<1) /* Memory operand. */
#define DstAcc (4<<1) /* Destination Accumulator */
+#define DstDI (5<<1) /* Destination is in ES:(E)DI */
+#define DstMem64 (6<<1) /* 64bit memory operand */
#define DstMask (7<<1)
/* Source operand type. */
#define SrcNone (0<<4) /* No source operand. */
@@ -63,6 +66,7 @@
#define SrcOne (7<<4) /* Implied '1' */
#define SrcImmUByte (8<<4) /* 8-bit unsigned immediate operand. */
#define SrcImmU (9<<4) /* Immediate operand, unsigned */
+#define SrcSI (0xa<<4) /* Source is in the DS:RSI */
#define SrcMask (0xf<<4)
/* Generic ModRM decode. */
#define ModRM (1<<8)
@@ -85,6 +89,9 @@
#define Src2ImmByte (2<<29)
#define Src2One (3<<29)
#define Src2Imm16 (4<<29)
+#define Src2Mem16 (5<<29) /* Used for Ep encoding. First argument has to be
+ in memory and second argument is located
+ immediately after the first one in memory. */
#define Src2Mask (7<<29)
enum {
@@ -147,8 +154,8 @@ static u32 opcode_table[256] = {
0, 0, 0, 0,
/* 0x68 - 0x6F */
SrcImm | Mov | Stack, 0, SrcImmByte | Mov | Stack, 0,
- SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, /* insb, insw/insd */
- SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, /* outsb, outsw/outsd */
+ DstDI | ByteOp | Mov | String, DstDI | Mov | String, /* insb, insw/insd */
+ SrcSI | ByteOp | ImplicitOps | String, SrcSI | ImplicitOps | String, /* outsb, outsw/outsd */
/* 0x70 - 0x77 */
SrcImmByte, SrcImmByte, SrcImmByte, SrcImmByte,
SrcImmByte, SrcImmByte, SrcImmByte, SrcImmByte,
@@ -173,12 +180,12 @@ static u32 opcode_table[256] = {
/* 0xA0 - 0xA7 */
ByteOp | DstReg | SrcMem | Mov | MemAbs, DstReg | SrcMem | Mov | MemAbs,
ByteOp | DstMem | SrcReg | Mov | MemAbs, DstMem | SrcReg | Mov | MemAbs,
- ByteOp | ImplicitOps | Mov | String, ImplicitOps | Mov | String,
- ByteOp | ImplicitOps | String, ImplicitOps | String,
+ ByteOp | SrcSI | DstDI | Mov | String, SrcSI | DstDI | Mov | String,
+ ByteOp | SrcSI | DstDI | String, SrcSI | DstDI | String,
/* 0xA8 - 0xAF */
- 0, 0, ByteOp | ImplicitOps | Mov | String, ImplicitOps | Mov | String,
- ByteOp | ImplicitOps | Mov | String, ImplicitOps | Mov | String,
- ByteOp | ImplicitOps | String, ImplicitOps | String,
+ 0, 0, ByteOp | DstDI | Mov | String, DstDI | Mov | String,
+ ByteOp | SrcSI | DstAcc | Mov | String, SrcSI | DstAcc | Mov | String,
+ ByteOp | DstDI | String, DstDI | String,
/* 0xB0 - 0xB7 */
ByteOp | DstReg | SrcImm | Mov, ByteOp | DstReg | SrcImm | Mov,
ByteOp | DstReg | SrcImm | Mov, ByteOp | DstReg | SrcImm | Mov,
@@ -204,13 +211,13 @@ static u32 opcode_table[256] = {
0, 0, 0, 0, 0, 0, 0, 0,
/* 0xE0 - 0xE7 */
0, 0, 0, 0,
- ByteOp | SrcImmUByte, SrcImmUByte,
- ByteOp | SrcImmUByte, SrcImmUByte,
+ ByteOp | SrcImmUByte | DstAcc, SrcImmUByte | DstAcc,
+ ByteOp | SrcImmUByte | DstAcc, SrcImmUByte | DstAcc,
/* 0xE8 - 0xEF */
SrcImm | Stack, SrcImm | ImplicitOps,
SrcImmU | Src2Imm16 | No64, SrcImmByte | ImplicitOps,
- SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps,
- SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps,
+ SrcNone | ByteOp | DstAcc, SrcNone | DstAcc,
+ SrcNone | ByteOp | DstAcc, SrcNone | DstAcc,
/* 0xF0 - 0xF7 */
0, 0, 0, 0,
ImplicitOps | Priv, ImplicitOps, Group | Group3_Byte, Group | Group3,
@@ -343,7 +350,8 @@ static u32 group_table[] = {
[Group5*8] =
DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM,
SrcMem | ModRM | Stack, 0,
- SrcMem | ModRM | Stack, 0, SrcMem | ModRM | Stack, 0,
+ SrcMem | ModRM | Stack, SrcMem | ModRM | Src2Mem16 | ImplicitOps,
+ SrcMem | ModRM | Stack, 0,
[Group7*8] =
0, 0, ModRM | SrcMem | Priv, ModRM | SrcMem | Priv,
SrcNone | ModRM | DstMem | Mov, 0,
@@ -353,14 +361,14 @@ static u32 group_table[] = {
DstMem | SrcImmByte | ModRM, DstMem | SrcImmByte | ModRM | Lock,
DstMem | SrcImmByte | ModRM | Lock, DstMem | SrcImmByte | ModRM | Lock,
[Group9*8] =
- 0, ImplicitOps | ModRM | Lock, 0, 0, 0, 0, 0, 0,
+ 0, DstMem64 | ModRM | Lock, 0, 0, 0, 0, 0, 0,
};
static u32 group2_table[] = {
[Group7*8] =
- SrcNone | ModRM | Priv, 0, 0, SrcNone | ModRM,
+ SrcNone | ModRM | Priv, 0, 0, SrcNone | ModRM | Priv,
SrcNone | ModRM | DstMem | Mov, 0,
- SrcMem16 | ModRM | Mov, 0,
+ SrcMem16 | ModRM | Mov | Priv, 0,
[Group9*8] =
0, 0, 0, 0, 0, 0, 0, 0,
};
@@ -562,7 +570,7 @@ static u32 group2_table[] = {
#define insn_fetch(_type, _size, _eip) \
({ unsigned long _x; \
rc = do_insn_fetch(ctxt, ops, (_eip), &_x, (_size)); \
- if (rc != 0) \
+ if (rc != X86EMUL_CONTINUE) \
goto done; \
(_eip) += (_size); \
(_type)_x; \
@@ -638,40 +646,40 @@ static unsigned long ss_base(struct x86_emulate_ctxt *ctxt)
static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt,
struct x86_emulate_ops *ops,
- unsigned long linear, u8 *dest)
+ unsigned long eip, u8 *dest)
{
struct fetch_cache *fc = &ctxt->decode.fetch;
int rc;
- int size;
+ int size, cur_size;
- if (linear < fc->start || linear >= fc->end) {
- size = min(15UL, PAGE_SIZE - offset_in_page(linear));
- rc = ops->fetch(linear, fc->data, size, ctxt->vcpu, NULL);
- if (rc)
+ if (eip == fc->end) {
+ cur_size = fc->end - fc->start;
+ size = min(15UL - cur_size, PAGE_SIZE - offset_in_page(eip));
+ rc = ops->fetch(ctxt->cs_base + eip, fc->data + cur_size,
+ size, ctxt->vcpu, NULL);
+ if (rc != X86EMUL_CONTINUE)
return rc;
- fc->start = linear;
- fc->end = linear + size;
+ fc->end += size;
}
- *dest = fc->data[linear - fc->start];
- return 0;
+ *dest = fc->data[eip - fc->start];
+ return X86EMUL_CONTINUE;
}
static int do_insn_fetch(struct x86_emulate_ctxt *ctxt,
struct x86_emulate_ops *ops,
unsigned long eip, void *dest, unsigned size)
{
- int rc = 0;
+ int rc;
/* x86 instructions are limited to 15 bytes. */
- if (eip + size - ctxt->decode.eip_orig > 15)
+ if (eip + size - ctxt->eip > 15)
return X86EMUL_UNHANDLEABLE;
- eip += ctxt->cs_base;
while (size--) {
rc = do_fetch_insn_byte(ctxt, ops, eip++, dest++);
- if (rc)
+ if (rc != X86EMUL_CONTINUE)
return rc;
}
- return 0;
+ return X86EMUL_CONTINUE;
}
/*
@@ -702,7 +710,7 @@ static int read_descriptor(struct x86_emulate_ctxt *ctxt,
*address = 0;
rc = ops->read_std((unsigned long)ptr, (unsigned long *)size, 2,
ctxt->vcpu, NULL);
- if (rc)
+ if (rc != X86EMUL_CONTINUE)
return rc;
rc = ops->read_std((unsigned long)ptr + 2, address, op_bytes,
ctxt->vcpu, NULL);
@@ -782,7 +790,7 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt,
struct decode_cache *c = &ctxt->decode;
u8 sib;
int index_reg = 0, base_reg = 0, scale;
- int rc = 0;
+ int rc = X86EMUL_CONTINUE;
if (c->rex_prefix) {
c->modrm_reg = (c->rex_prefix & 4) << 1; /* REX.R */
@@ -895,7 +903,7 @@ static int decode_abs(struct x86_emulate_ctxt *ctxt,
struct x86_emulate_ops *ops)
{
struct decode_cache *c = &ctxt->decode;
- int rc = 0;
+ int rc = X86EMUL_CONTINUE;
switch (c->ad_bytes) {
case 2:
@@ -916,14 +924,18 @@ int
x86_decode_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
{
struct decode_cache *c = &ctxt->decode;
- int rc = 0;
+ int rc = X86EMUL_CONTINUE;
int mode = ctxt->mode;
int def_op_bytes, def_ad_bytes, group;
- /* Shadow copy of register state. Committed on successful emulation. */
+ /* we cannot decode insn before we complete previous rep insn */
+ WARN_ON(ctxt->restart);
+
+ /* Shadow copy of register state. Committed on successful emulation. */
memset(c, 0, sizeof(struct decode_cache));
- c->eip = c->eip_orig = kvm_rip_read(ctxt->vcpu);
+ c->eip = ctxt->eip;
+ c->fetch.start = c->fetch.end = c->eip;
ctxt->cs_base = seg_base(ctxt, VCPU_SREG_CS);
memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs);
@@ -1015,11 +1027,6 @@ done_prefixes:
}
}
- if (mode == X86EMUL_MODE_PROT64 && (c->d & No64)) {
- kvm_report_emulation_failure(ctxt->vcpu, "invalid x86/64 instruction");
- return -1;
- }
-
if (c->d & Group) {
group = c->d & GroupMask;
c->modrm = insn_fetch(u8, 1, c->eip);
@@ -1046,7 +1053,7 @@ done_prefixes:
rc = decode_modrm(ctxt, ops);
else if (c->d & MemAbs)
rc = decode_abs(ctxt, ops);
- if (rc)
+ if (rc != X86EMUL_CONTINUE)
goto done;
if (!c->has_seg_override)
@@ -1057,6 +1064,10 @@ done_prefixes:
if (c->ad_bytes != 8)
c->modrm_ea = (u32)c->modrm_ea;
+
+ if (c->rip_relative)
+ c->modrm_ea += c->eip;
+
/*
* Decode and fetch the source operand: register, memory
* or immediate.
@@ -1091,6 +1102,8 @@ done_prefixes:
break;
}
c->src.type = OP_MEM;
+ c->src.ptr = (unsigned long *)c->modrm_ea;
+ c->src.val = 0;
break;
case SrcImm:
case SrcImmU:
@@ -1139,6 +1152,14 @@ done_prefixes:
c->src.bytes = 1;
c->src.val = 1;
break;
+ case SrcSI:
+ c->src.type = OP_MEM;
+ c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
+ c->src.ptr = (unsigned long *)
+ register_address(c, seg_override_base(ctxt, c),
+ c->regs[VCPU_REGS_RSI]);
+ c->src.val = 0;
+ break;
}
/*
@@ -1168,6 +1189,12 @@ done_prefixes:
c->src2.bytes = 1;
c->src2.val = 1;
break;
+ case Src2Mem16:
+ c->src2.type = OP_MEM;
+ c->src2.bytes = 2;
+ c->src2.ptr = (unsigned long *)(c->modrm_ea + c->src.bytes);
+ c->src2.val = 0;
+ break;
}
/* Decode and fetch the destination operand: register or memory. */
@@ -1180,6 +1207,7 @@ done_prefixes:
c->twobyte && (c->b == 0xb6 || c->b == 0xb7));
break;
case DstMem:
+ case DstMem64:
if ((c->d & ModRM) && c->modrm_mod == 3) {
c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
c->dst.type = OP_REG;
@@ -1188,12 +1216,24 @@ done_prefixes:
break;
}
c->dst.type = OP_MEM;
+ c->dst.ptr = (unsigned long *)c->modrm_ea;
+ if ((c->d & DstMask) == DstMem64)
+ c->dst.bytes = 8;
+ else
+ c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
+ c->dst.val = 0;
+ if (c->d & BitOp) {
+ unsigned long mask = ~(c->dst.bytes * 8 - 1);
+
+ c->dst.ptr = (void *)c->dst.ptr +
+ (c->src.val & mask) / 8;
+ }
break;
case DstAcc:
c->dst.type = OP_REG;
- c->dst.bytes = c->op_bytes;
+ c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
c->dst.ptr = &c->regs[VCPU_REGS_RAX];
- switch (c->op_bytes) {
+ switch (c->dst.bytes) {
case 1:
c->dst.val = *(u8 *)c->dst.ptr;
break;
@@ -1203,18 +1243,248 @@ done_prefixes:
case 4:
c->dst.val = *(u32 *)c->dst.ptr;
break;
+ case 8:
+ c->dst.val = *(u64 *)c->dst.ptr;
+ break;
}
c->dst.orig_val = c->dst.val;
break;
+ case DstDI:
+ c->dst.type = OP_MEM;
+ c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
+ c->dst.ptr = (unsigned long *)
+ register_address(c, es_base(ctxt),
+ c->regs[VCPU_REGS_RDI]);
+ c->dst.val = 0;
+ break;
}
- if (c->rip_relative)
- c->modrm_ea += c->eip;
-
done:
return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0;
}
+static int pio_in_emulated(struct x86_emulate_ctxt *ctxt,
+ struct x86_emulate_ops *ops,
+ unsigned int size, unsigned short port,
+ void *dest)
+{
+ struct read_cache *rc = &ctxt->decode.io_read;
+
+ if (rc->pos == rc->end) { /* refill pio read ahead */
+ struct decode_cache *c = &ctxt->decode;
+ unsigned int in_page, n;
+ unsigned int count = c->rep_prefix ?
+ address_mask(c, c->regs[VCPU_REGS_RCX]) : 1;
+ in_page = (ctxt->eflags & EFLG_DF) ?
+ offset_in_page(c->regs[VCPU_REGS_RDI]) :
+ PAGE_SIZE - offset_in_page(c->regs[VCPU_REGS_RDI]);
+ n = min(min(in_page, (unsigned int)sizeof(rc->data)) / size,
+ count);
+ if (n == 0)
+ n = 1;
+ rc->pos = rc->end = 0;
+ if (!ops->pio_in_emulated(size, port, rc->data, n, ctxt->vcpu))
+ return 0;
+ rc->end = n * size;
+ }
+
+ memcpy(dest, rc->data + rc->pos, size);
+ rc->pos += size;
+ return 1;
+}
+
+static u32 desc_limit_scaled(struct desc_struct *desc)
+{
+ u32 limit = get_desc_limit(desc);
+
+ return desc->g ? (limit << 12) | 0xfff : limit;
+}
+
+static void get_descriptor_table_ptr(struct x86_emulate_ctxt *ctxt,
+ struct x86_emulate_ops *ops,
+ u16 selector, struct desc_ptr *dt)
+{
+ if (selector & 1 << 2) {
+ struct desc_struct desc;
+ memset (dt, 0, sizeof *dt);
+ if (!ops->get_cached_descriptor(&desc, VCPU_SREG_LDTR, ctxt->vcpu))
+ return;
+
+ dt->size = desc_limit_scaled(&desc); /* what if limit > 65535? */
+ dt->address = get_desc_base(&desc);
+ } else
+ ops->get_gdt(dt, ctxt->vcpu);
+}
+
+/* allowed just for 8 bytes segments */
+static int read_segment_descriptor(struct x86_emulate_ctxt *ctxt,
+ struct x86_emulate_ops *ops,
+ u16 selector, struct desc_struct *desc)
+{
+ struct desc_ptr dt;
+ u16 index = selector >> 3;
+ int ret;
+ u32 err;
+ ulong addr;
+
+ get_descriptor_table_ptr(ctxt, ops, selector, &dt);
+
+ if (dt.size < index * 8 + 7) {
+ kvm_inject_gp(ctxt->vcpu, selector & 0xfffc);
+ return X86EMUL_PROPAGATE_FAULT;
+ }
+ addr = dt.address + index * 8;
+ ret = ops->read_std(addr, desc, sizeof *desc, ctxt->vcpu, &err);
+ if (ret == X86EMUL_PROPAGATE_FAULT)
+ kvm_inject_page_fault(ctxt->vcpu, addr, err);
+
+ return ret;
+}
+
+/* allowed just for 8 bytes segments */
+static int write_segment_descriptor(struct x86_emulate_ctxt *ctxt,
+ struct x86_emulate_ops *ops,
+ u16 selector, struct desc_struct *desc)
+{
+ struct desc_ptr dt;
+ u16 index = selector >> 3;
+ u32 err;
+ ulong addr;
+ int ret;
+
+ get_descriptor_table_ptr(ctxt, ops, selector, &dt);
+
+ if (dt.size < index * 8 + 7) {
+ kvm_inject_gp(ctxt->vcpu, selector & 0xfffc);
+ return X86EMUL_PROPAGATE_FAULT;
+ }
+
+ addr = dt.address + index * 8;
+ ret = ops->write_std(addr, desc, sizeof *desc, ctxt->vcpu, &err);
+ if (ret == X86EMUL_PROPAGATE_FAULT)
+ kvm_inject_page_fault(ctxt->vcpu, addr, err);
+
+ return ret;
+}
+
+static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
+ struct x86_emulate_ops *ops,
+ u16 selector, int seg)
+{
+ struct desc_struct seg_desc;
+ u8 dpl, rpl, cpl;
+ unsigned err_vec = GP_VECTOR;
+ u32 err_code = 0;
+ bool null_selector = !(selector & ~0x3); /* 0000-0003 are null */
+ int ret;
+
+ memset(&seg_desc, 0, sizeof seg_desc);
+
+ if ((seg <= VCPU_SREG_GS && ctxt->mode == X86EMUL_MODE_VM86)
+ || ctxt->mode == X86EMUL_MODE_REAL) {
+ /* set real mode segment descriptor */
+ set_desc_base(&seg_desc, selector << 4);
+ set_desc_limit(&seg_desc, 0xffff);
+ seg_desc.type = 3;
+ seg_desc.p = 1;
+ seg_desc.s = 1;
+ goto load;
+ }
+
+ /* NULL selector is not valid for TR, CS and SS */
+ if ((seg == VCPU_SREG_CS || seg == VCPU_SREG_SS || seg == VCPU_SREG_TR)
+ && null_selector)
+ goto exception;
+
+ /* TR should be in GDT only */
+ if (seg == VCPU_SREG_TR && (selector & (1 << 2)))
+ goto exception;
+
+ if (null_selector) /* for NULL selector skip all following checks */
+ goto load;
+
+ ret = read_segment_descriptor(ctxt, ops, selector, &seg_desc);
+ if (ret != X86EMUL_CONTINUE)
+ return ret;
+
+ err_code = selector & 0xfffc;
+ err_vec = GP_VECTOR;
+
+ /* can't load system descriptor into segment selecor */
+ if (seg <= VCPU_SREG_GS && !seg_desc.s)
+ goto exception;
+
+ if (!seg_desc.p) {
+ err_vec = (seg == VCPU_SREG_SS) ? SS_VECTOR : NP_VECTOR;
+ goto exception;
+ }
+
+ rpl = selector & 3;
+ dpl = seg_desc.dpl;
+ cpl = ops->cpl(ctxt->vcpu);
+
+ switch (seg) {
+ case VCPU_SREG_SS:
+ /*
+ * segment is not a writable data segment or segment
+ * selector's RPL != CPL or segment selector's RPL != CPL
+ */
+ if (rpl != cpl || (seg_desc.type & 0xa) != 0x2 || dpl != cpl)
+ goto exception;
+ break;
+ case VCPU_SREG_CS:
+ if (!(seg_desc.type & 8))
+ goto exception;
+
+ if (seg_desc.type & 4) {
+ /* conforming */
+ if (dpl > cpl)
+ goto exception;
+ } else {
+ /* nonconforming */
+ if (rpl > cpl || dpl != cpl)
+ goto exception;
+ }
+ /* CS(RPL) <- CPL */
+ selector = (selector & 0xfffc) | cpl;
+ break;
+ case VCPU_SREG_TR:
+ if (seg_desc.s || (seg_desc.type != 1 && seg_desc.type != 9))
+ goto exception;
+ break;
+ case VCPU_SREG_LDTR:
+ if (seg_desc.s || seg_desc.type != 2)
+ goto exception;
+ break;
+ default: /* DS, ES, FS, or GS */
+ /*
+ * segment is not a data or readable code segment or
+ * ((segment is a data or nonconforming code segment)
+ * and (both RPL and CPL > DPL))
+ */
+ if ((seg_desc.type & 0xa) == 0x8 ||
+ (((seg_desc.type & 0xc) != 0xc) &&
+ (rpl > dpl && cpl > dpl)))
+ goto exception;
+ break;
+ }
+
+ if (seg_desc.s) {
+ /* mark segment as accessed */
+ seg_desc.type |= 1;
+ ret = write_segment_descriptor(ctxt, ops, selector, &seg_desc);
+ if (ret != X86EMUL_CONTINUE)
+ return ret;
+ }
+load:
+ ops->set_segment_selector(selector, seg, ctxt->vcpu);
+ ops->set_cached_descriptor(&seg_desc, seg, ctxt->vcpu);
+ return X86EMUL_CONTINUE;
+exception:
+ kvm_queue_exception_e(ctxt->vcpu, err_vec, err_code);
+ return X86EMUL_PROPAGATE_FAULT;
+}
+
static inline void emulate_push(struct x86_emulate_ctxt *ctxt)
{
struct decode_cache *c = &ctxt->decode;
@@ -1251,7 +1521,7 @@ static int emulate_popf(struct x86_emulate_ctxt *ctxt,
int rc;
unsigned long val, change_mask;
int iopl = (ctxt->eflags & X86_EFLAGS_IOPL) >> IOPL_SHIFT;
- int cpl = kvm_x86_ops->get_cpl(ctxt->vcpu);
+ int cpl = ops->cpl(ctxt->vcpu);
rc = emulate_pop(ctxt, ops, &val, len);
if (rc != X86EMUL_CONTINUE)
@@ -1306,10 +1576,10 @@ static int emulate_pop_sreg(struct x86_emulate_ctxt *ctxt,
int rc;
rc = emulate_pop(ctxt, ops, &selector, c->op_bytes);
- if (rc != 0)
+ if (rc != X86EMUL_CONTINUE)
return rc;
- rc = kvm_load_segment_descriptor(ctxt->vcpu, (u16)selector, seg);
+ rc = load_segment_descriptor(ctxt, ops, (u16)selector, seg);
return rc;
}
@@ -1332,7 +1602,7 @@ static int emulate_popa(struct x86_emulate_ctxt *ctxt,
struct x86_emulate_ops *ops)
{
struct decode_cache *c = &ctxt->decode;
- int rc = 0;
+ int rc = X86EMUL_CONTINUE;
int reg = VCPU_REGS_RDI;
while (reg >= VCPU_REGS_RAX) {
@@ -1343,7 +1613,7 @@ static int emulate_popa(struct x86_emulate_ctxt *ctxt,
}
rc = emulate_pop(ctxt, ops, &c->regs[reg], c->op_bytes);
- if (rc != 0)
+ if (rc != X86EMUL_CONTINUE)
break;
--reg;
}
@@ -1354,12 +1624,8 @@ static inline int emulate_grp1a(struct x86_emulate_ctxt *ctxt,
struct x86_emulate_ops *ops)
{
struct decode_cache *c = &ctxt->decode;
- int rc;
- rc = emulate_pop(ctxt, ops, &c->dst.val, c->dst.bytes);
- if (rc != 0)
- return rc;
- return 0;
+ return emulate_pop(ctxt, ops, &c->dst.val, c->dst.bytes);
}
static inline void emulate_grp2(struct x86_emulate_ctxt *ctxt)
@@ -1395,7 +1661,6 @@ static inline int emulate_grp3(struct x86_emulate_ctxt *ctxt,
struct x86_emulate_ops *ops)
{
struct decode_cache *c = &ctxt->decode;
- int rc = 0;
switch (c->modrm_reg) {
case 0 ... 1: /* test */
@@ -1408,11 +1673,9 @@ static inline int emulate_grp3(struct x86_emulate_ctxt *ctxt,
emulate_1op("neg", c->dst, ctxt->eflags);
break;
default:
- DPRINTF("Cannot emulate %02x\n", c->b);
- rc = X86EMUL_UNHANDLEABLE;
- break;
+ return 0;
}
- return rc;
+ return 1;
}
static inline int emulate_grp45(struct x86_emulate_ctxt *ctxt,
@@ -1442,20 +1705,14 @@ static inline int emulate_grp45(struct x86_emulate_ctxt *ctxt,
emulate_push(ctxt);
break;
}
- return 0;
+ return X86EMUL_CONTINUE;
}
static inline int emulate_grp9(struct x86_emulate_ctxt *ctxt,
- struct x86_emulate_ops *ops,
- unsigned long memop)
+ struct x86_emulate_ops *ops)
{
struct decode_cache *c = &ctxt->decode;
- u64 old, new;
- int rc;
-
- rc = ops->read_emulated(memop, &old, 8, ctxt->vcpu);
- if (rc != X86EMUL_CONTINUE)
- return rc;
+ u64 old = c->dst.orig_val;
if (((u32) (old >> 0) != (u32) c->regs[VCPU_REGS_RAX]) ||
((u32) (old >> 32) != (u32) c->regs[VCPU_REGS_RDX])) {
@@ -1463,17 +1720,13 @@ static inline int emulate_grp9(struct x86_emulate_ctxt *ctxt,
c->regs[VCPU_REGS_RAX] = (u32) (old >> 0);
c->regs[VCPU_REGS_RDX] = (u32) (old >> 32);
ctxt->eflags &= ~EFLG_ZF;
-
} else {
- new = ((u64)c->regs[VCPU_REGS_RCX] << 32) |
+ c->dst.val = ((u64)c->regs[VCPU_REGS_RCX] << 32) |
(u32) c->regs[VCPU_REGS_RBX];
- rc = ops->cmpxchg_emulated(memop, &old, &new, 8, ctxt->vcpu);
- if (rc != X86EMUL_CONTINUE)
- return rc;
ctxt->eflags |= EFLG_ZF;
}
- return 0;
+ return X86EMUL_CONTINUE;
}
static int emulate_ret_far(struct x86_emulate_ctxt *ctxt,
@@ -1484,14 +1737,14 @@ static int emulate_ret_far(struct x86_emulate_ctxt *ctxt,
unsigned long cs;
rc = emulate_pop(ctxt, ops, &c->eip, c->op_bytes);
- if (rc)
+ if (rc != X86EMUL_CONTINUE)
return rc;
if (c->op_bytes == 4)
c->eip = (u32)c->eip;
rc = emulate_pop(ctxt, ops, &cs, c->op_bytes);
- if (rc)
+ if (rc != X86EMUL_CONTINUE)
return rc;
- rc = kvm_load_segment_descriptor(ctxt->vcpu, (u16)cs, VCPU_SREG_CS);
+ rc = load_segment_descriptor(ctxt, ops, (u16)cs, VCPU_SREG_CS);
return rc;
}
@@ -1544,7 +1797,7 @@ static inline int writeback(struct x86_emulate_ctxt *ctxt,
default:
break;
}
- return 0;
+ return X86EMUL_CONTINUE;
}
static void toggle_interruptibility(struct x86_emulate_ctxt *ctxt, u32 mask)
@@ -1598,8 +1851,11 @@ emulate_syscall(struct x86_emulate_ctxt *ctxt)
u64 msr_data;
/* syscall is not available in real mode */
- if (ctxt->mode == X86EMUL_MODE_REAL || ctxt->mode == X86EMUL_MODE_VM86)
- return X86EMUL_UNHANDLEABLE;
+ if (ctxt->mode == X86EMUL_MODE_REAL ||
+ ctxt->mode == X86EMUL_MODE_VM86) {
+ kvm_queue_exception(ctxt->vcpu, UD_VECTOR);
+ return X86EMUL_PROPAGATE_FAULT;
+ }
setup_syscalls_segments(ctxt, &cs, &ss);
@@ -1649,14 +1905,16 @@ emulate_sysenter(struct x86_emulate_ctxt *ctxt)
/* inject #GP if in real mode */
if (ctxt->mode == X86EMUL_MODE_REAL) {
kvm_inject_gp(ctxt->vcpu, 0);
- return X86EMUL_UNHANDLEABLE;
+ return X86EMUL_PROPAGATE_FAULT;
}
/* XXX sysenter/sysexit have not been tested in 64bit mode.
* Therefore, we inject an #UD.
*/
- if (ctxt->mode == X86EMUL_MODE_PROT64)
- return X86EMUL_UNHANDLEABLE;
+ if (ctxt->mode == X86EMUL_MODE_PROT64) {
+ kvm_queue_exception(ctxt->vcpu, UD_VECTOR);
+ return X86EMUL_PROPAGATE_FAULT;
+ }
setup_syscalls_segments(ctxt, &cs, &ss);
@@ -1711,7 +1969,7 @@ emulate_sysexit(struct x86_emulate_ctxt *ctxt)
if (ctxt->mode == X86EMUL_MODE_REAL ||
ctxt->mode == X86EMUL_MODE_VM86) {
kvm_inject_gp(ctxt->vcpu, 0);
- return X86EMUL_UNHANDLEABLE;
+ return X86EMUL_PROPAGATE_FAULT;
}
setup_syscalls_segments(ctxt, &cs, &ss);
@@ -1756,7 +2014,8 @@ emulate_sysexit(struct x86_emulate_ctxt *ctxt)
return X86EMUL_CONTINUE;
}
-static bool emulator_bad_iopl(struct x86_emulate_ctxt *ctxt)
+static bool emulator_bad_iopl(struct x86_emulate_ctxt *ctxt,
+ struct x86_emulate_ops *ops)
{
int iopl;
if (ctxt->mode == X86EMUL_MODE_REAL)
@@ -1764,7 +2023,7 @@ static bool emulator_bad_iopl(struct x86_emulate_ctxt *ctxt)
if (ctxt->mode == X86EMUL_MODE_VM86)
return true;
iopl = (ctxt->eflags & X86_EFLAGS_IOPL) >> IOPL_SHIFT;
- return kvm_x86_ops->get_cpl(ctxt->vcpu) > iopl;
+ return ops->cpl(ctxt->vcpu) > iopl;
}
static bool emulator_io_port_access_allowed(struct x86_emulate_ctxt *ctxt,
@@ -1801,22 +2060,419 @@ static bool emulator_io_permited(struct x86_emulate_ctxt *ctxt,
struct x86_emulate_ops *ops,
u16 port, u16 len)
{
- if (emulator_bad_iopl(ctxt))
+ if (emulator_bad_iopl(ctxt, ops))
if (!emulator_io_port_access_allowed(ctxt, ops, port, len))
return false;
return true;
}
+static u32 get_cached_descriptor_base(struct x86_emulate_ctxt *ctxt,
+ struct x86_emulate_ops *ops,
+ int seg)
+{
+ struct desc_struct desc;
+ if (ops->get_cached_descriptor(&desc, seg, ctxt->vcpu))
+ return get_desc_base(&desc);
+ else
+ return ~0;
+}
+
+static void save_state_to_tss16(struct x86_emulate_ctxt *ctxt,
+ struct x86_emulate_ops *ops,
+ struct tss_segment_16 *tss)
+{
+ struct decode_cache *c = &ctxt->decode;
+
+ tss->ip = c->eip;
+ tss->flag = ctxt->eflags;
+ tss->ax = c->regs[VCPU_REGS_RAX];
+ tss->cx = c->regs[VCPU_REGS_RCX];
+ tss->dx = c->regs[VCPU_REGS_RDX];
+ tss->bx = c->regs[VCPU_REGS_RBX];
+ tss->sp = c->regs[VCPU_REGS_RSP];
+ tss->bp = c->regs[VCPU_REGS_RBP];
+ tss->si = c->regs[VCPU_REGS_RSI];
+ tss->di = c->regs[VCPU_REGS_RDI];
+
+ tss->es = ops->get_segment_selector(VCPU_SREG_ES, ctxt->vcpu);
+ tss->cs = ops->get_segment_selector(VCPU_SREG_CS, ctxt->vcpu);
+ tss->ss = ops->get_segment_selector(VCPU_SREG_SS, ctxt->vcpu);
+ tss->ds = ops->get_segment_selector(VCPU_SREG_DS, ctxt->vcpu);
+ tss->ldt = ops->get_segment_selector(VCPU_SREG_LDTR, ctxt->vcpu);
+}
+
+static int load_state_from_tss16(struct x86_emulate_ctxt *ctxt,
+ struct x86_emulate_ops *ops,
+ struct tss_segment_16 *tss)
+{
+ struct decode_cache *c = &ctxt->decode;
+ int ret;
+
+ c->eip = tss->ip;
+ ctxt->eflags = tss->flag | 2;
+ c->regs[VCPU_REGS_RAX] = tss->ax;
+ c->regs[VCPU_REGS_RCX] = tss->cx;
+ c->regs[VCPU_REGS_RDX] = tss->dx;
+ c->regs[VCPU_REGS_RBX] = tss->bx;
+ c->regs[VCPU_REGS_RSP] = tss->sp;
+ c->regs[VCPU_REGS_RBP] = tss->bp;
+ c->regs[VCPU_REGS_RSI] = tss->si;
+ c->regs[VCPU_REGS_RDI] = tss->di;
+
+ /*
+ * SDM says that segment selectors are loaded before segment
+ * descriptors
+ */
+ ops->set_segment_selector(tss->ldt, VCPU_SREG_LDTR, ctxt->vcpu);
+ ops->set_segment_selector(tss->es, VCPU_SREG_ES, ctxt->vcpu);
+ ops->set_segment_selector(tss->cs, VCPU_SREG_CS, ctxt->vcpu);
+ ops->set_segment_selector(tss->ss, VCPU_SREG_SS, ctxt->vcpu);
+ ops->set_segment_selector(tss->ds, VCPU_SREG_DS, ctxt->vcpu);
+
+ /*
+ * Now load segment descriptors. If fault happenes at this stage
+ * it is handled in a context of new task
+ */
+ ret = load_segment_descriptor(ctxt, ops, tss->ldt, VCPU_SREG_LDTR);
+ if (ret != X86EMUL_CONTINUE)
+ return ret;
+ ret = load_segment_descriptor(ctxt, ops, tss->es, VCPU_SREG_ES);
+ if (ret != X86EMUL_CONTINUE)
+ return ret;
+ ret = load_segment_descriptor(ctxt, ops, tss->cs, VCPU_SREG_CS);
+ if (ret != X86EMUL_CONTINUE)
+ return ret;
+ ret = load_segment_descriptor(ctxt, ops, tss->ss, VCPU_SREG_SS);
+ if (ret != X86EMUL_CONTINUE)
+ return ret;
+ ret = load_segment_descriptor(ctxt, ops, tss->ds, VCPU_SREG_DS);
+ if (ret != X86EMUL_CONTINUE)
+ return ret;
+
+ return X86EMUL_CONTINUE;
+}
+
+static int task_switch_16(struct x86_emulate_ctxt *ctxt,
+ struct x86_emulate_ops *ops,
+ u16 tss_selector, u16 old_tss_sel,
+ ulong old_tss_base, struct desc_struct *new_desc)
+{
+ struct tss_segment_16 tss_seg;
+ int ret;
+ u32 err, new_tss_base = get_desc_base(new_desc);
+
+ ret = ops->read_std(old_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu,
+ &err);
+ if (ret == X86EMUL_PROPAGATE_FAULT) {
+ /* FIXME: need to provide precise fault address */
+ kvm_inject_page_fault(ctxt->vcpu, old_tss_base, err);
+ return ret;
+ }
+
+ save_state_to_tss16(ctxt, ops, &tss_seg);
+
+ ret = ops->write_std(old_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu,
+ &err);
+ if (ret == X86EMUL_PROPAGATE_FAULT) {
+ /* FIXME: need to provide precise fault address */
+ kvm_inject_page_fault(ctxt->vcpu, old_tss_base, err);
+ return ret;
+ }
+
+ ret = ops->read_std(new_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu,
+ &err);
+ if (ret == X86EMUL_PROPAGATE_FAULT) {
+ /* FIXME: need to provide precise fault address */
+ kvm_inject_page_fault(ctxt->vcpu, new_tss_base, err);
+ return ret;
+ }
+
+ if (old_tss_sel != 0xffff) {
+ tss_seg.prev_task_link = old_tss_sel;
+
+ ret = ops->write_std(new_tss_base,
+ &tss_seg.prev_task_link,
+ sizeof tss_seg.prev_task_link,
+ ctxt->vcpu, &err);
+ if (ret == X86EMUL_PROPAGATE_FAULT) {
+ /* FIXME: need to provide precise fault address */
+ kvm_inject_page_fault(ctxt->vcpu, new_tss_base, err);
+ return ret;
+ }
+ }
+
+ return load_state_from_tss16(ctxt, ops, &tss_seg);
+}
+
+static void save_state_to_tss32(struct x86_emulate_ctxt *ctxt,
+ struct x86_emulate_ops *ops,
+ struct tss_segment_32 *tss)
+{
+ struct decode_cache *c = &ctxt->decode;
+
+ tss->cr3 = ops->get_cr(3, ctxt->vcpu);
+ tss->eip = c->eip;
+ tss->eflags = ctxt->eflags;
+ tss->eax = c->regs[VCPU_REGS_RAX];
+ tss->ecx = c->regs[VCPU_REGS_RCX];
+ tss->edx = c->regs[VCPU_REGS_RDX];
+ tss->ebx = c->regs[VCPU_REGS_RBX];
+ tss->esp = c->regs[VCPU_REGS_RSP];
+ tss->ebp = c->regs[VCPU_REGS_RBP];
+ tss->esi = c->regs[VCPU_REGS_RSI];
+ tss->edi = c->regs[VCPU_REGS_RDI];
+
+ tss->es = ops->get_segment_selector(VCPU_SREG_ES, ctxt->vcpu);
+ tss->cs = ops->get_segment_selector(VCPU_SREG_CS, ctxt->vcpu);
+ tss->ss = ops->get_segment_selector(VCPU_SREG_SS, ctxt->vcpu);
+ tss->ds = ops->get_segment_selector(VCPU_SREG_DS, ctxt->vcpu);
+ tss->fs = ops->get_segment_selector(VCPU_SREG_FS, ctxt->vcpu);
+ tss->gs = ops->get_segment_selector(VCPU_SREG_GS, ctxt->vcpu);
+ tss->ldt_selector = ops->get_segment_selector(VCPU_SREG_LDTR, ctxt->vcpu);
+}
+
+static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt,
+ struct x86_emulate_ops *ops,
+ struct tss_segment_32 *tss)
+{
+ struct decode_cache *c = &ctxt->decode;
+ int ret;
+
+ ops->set_cr(3, tss->cr3, ctxt->vcpu);
+ c->eip = tss->eip;
+ ctxt->eflags = tss->eflags | 2;
+ c->regs[VCPU_REGS_RAX] = tss->eax;
+ c->regs[VCPU_REGS_RCX] = tss->ecx;
+ c->regs[VCPU_REGS_RDX] = tss->edx;
+ c->regs[VCPU_REGS_RBX] = tss->ebx;
+ c->regs[VCPU_REGS_RSP] = tss->esp;
+ c->regs[VCPU_REGS_RBP] = tss->ebp;
+ c->regs[VCPU_REGS_RSI] = tss->esi;
+ c->regs[VCPU_REGS_RDI] = tss->edi;
+
+ /*
+ * SDM says that segment selectors are loaded before segment
+ * descriptors
+ */
+ ops->set_segment_selector(tss->ldt_selector, VCPU_SREG_LDTR, ctxt->vcpu);
+ ops->set_segment_selector(tss->es, VCPU_SREG_ES, ctxt->vcpu);
+ ops->set_segment_selector(tss->cs, VCPU_SREG_CS, ctxt->vcpu);
+ ops->set_segment_selector(tss->ss, VCPU_SREG_SS, ctxt->vcpu);
+ ops->set_segment_selector(tss->ds, VCPU_SREG_DS, ctxt->vcpu);
+ ops->set_segment_selector(tss->fs, VCPU_SREG_FS, ctxt->vcpu);
+ ops->set_segment_selector(tss->gs, VCPU_SREG_GS, ctxt->vcpu);
+
+ /*
+ * Now load segment descriptors. If fault happenes at this stage
+ * it is handled in a context of new task
+ */
+ ret = load_segment_descriptor(ctxt, ops, tss->ldt_selector, VCPU_SREG_LDTR);
+ if (ret != X86EMUL_CONTINUE)
+ return ret;
+ ret = load_segment_descriptor(ctxt, ops, tss->es, VCPU_SREG_ES);
+ if (ret != X86EMUL_CONTINUE)
+ return ret;
+ ret = load_segment_descriptor(ctxt, ops, tss->cs, VCPU_SREG_CS);
+ if (ret != X86EMUL_CONTINUE)
+ return ret;
+ ret = load_segment_descriptor(ctxt, ops, tss->ss, VCPU_SREG_SS);
+ if (ret != X86EMUL_CONTINUE)
+ return ret;
+ ret = load_segment_descriptor(ctxt, ops, tss->ds, VCPU_SREG_DS);
+ if (ret != X86EMUL_CONTINUE)
+ return ret;
+ ret = load_segment_descriptor(ctxt, ops, tss->fs, VCPU_SREG_FS);
+ if (ret != X86EMUL_CONTINUE)
+ return ret;
+ ret = load_segment_descriptor(ctxt, ops, tss->gs, VCPU_SREG_GS);
+ if (ret != X86EMUL_CONTINUE)
+ return ret;
+
+ return X86EMUL_CONTINUE;
+}
+
+static int task_switch_32(struct x86_emulate_ctxt *ctxt,
+ struct x86_emulate_ops *ops,
+ u16 tss_selector, u16 old_tss_sel,
+ ulong old_tss_base, struct desc_struct *new_desc)
+{
+ struct tss_segment_32 tss_seg;
+ int ret;
+ u32 err, new_tss_base = get_desc_base(new_desc);
+
+ ret = ops->read_std(old_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu,
+ &err);
+ if (ret == X86EMUL_PROPAGATE_FAULT) {
+ /* FIXME: need to provide precise fault address */
+ kvm_inject_page_fault(ctxt->vcpu, old_tss_base, err);
+ return ret;
+ }
+
+ save_state_to_tss32(ctxt, ops, &tss_seg);
+
+ ret = ops->write_std(old_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu,
+ &err);
+ if (ret == X86EMUL_PROPAGATE_FAULT) {
+ /* FIXME: need to provide precise fault address */
+ kvm_inject_page_fault(ctxt->vcpu, old_tss_base, err);
+ return ret;
+ }
+
+ ret = ops->read_std(new_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu,
+ &err);
+ if (ret == X86EMUL_PROPAGATE_FAULT) {
+ /* FIXME: need to provide precise fault address */
+ kvm_inject_page_fault(ctxt->vcpu, new_tss_base, err);
+ return ret;
+ }
+
+ if (old_tss_sel != 0xffff) {
+ tss_seg.prev_task_link = old_tss_sel;
+
+ ret = ops->write_std(new_tss_base,
+ &tss_seg.prev_task_link,
+ sizeof tss_seg.prev_task_link,
+ ctxt->vcpu, &err);
+ if (ret == X86EMUL_PROPAGATE_FAULT) {
+ /* FIXME: need to provide precise fault address */
+ kvm_inject_page_fault(ctxt->vcpu, new_tss_base, err);
+ return ret;
+ }
+ }
+
+ return load_state_from_tss32(ctxt, ops, &tss_seg);
+}
+
+static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
+ struct x86_emulate_ops *ops,
+ u16 tss_selector, int reason,
+ bool has_error_code, u32 error_code)
+{
+ struct desc_struct curr_tss_desc, next_tss_desc;
+ int ret;
+ u16 old_tss_sel = ops->get_segment_selector(VCPU_SREG_TR, ctxt->vcpu);
+ ulong old_tss_base =
+ get_cached_descriptor_base(ctxt, ops, VCPU_SREG_TR);
+ u32 desc_limit;
+
+ /* FIXME: old_tss_base == ~0 ? */
+
+ ret = read_segment_descriptor(ctxt, ops, tss_selector, &next_tss_desc);
+ if (ret != X86EMUL_CONTINUE)
+ return ret;
+ ret = read_segment_descriptor(ctxt, ops, old_tss_sel, &curr_tss_desc);
+ if (ret != X86EMUL_CONTINUE)
+ return ret;
+
+ /* FIXME: check that next_tss_desc is tss */
+
+ if (reason != TASK_SWITCH_IRET) {
+ if ((tss_selector & 3) > next_tss_desc.dpl ||
+ ops->cpl(ctxt->vcpu) > next_tss_desc.dpl) {
+ kvm_inject_gp(ctxt->vcpu, 0);
+ return X86EMUL_PROPAGATE_FAULT;
+ }
+ }
+
+ desc_limit = desc_limit_scaled(&next_tss_desc);
+ if (!next_tss_desc.p ||
+ ((desc_limit < 0x67 && (next_tss_desc.type & 8)) ||
+ desc_limit < 0x2b)) {
+ kvm_queue_exception_e(ctxt->vcpu, TS_VECTOR,
+ tss_selector & 0xfffc);
+ return X86EMUL_PROPAGATE_FAULT;
+ }
+
+ if (reason == TASK_SWITCH_IRET || reason == TASK_SWITCH_JMP) {
+ curr_tss_desc.type &= ~(1 << 1); /* clear busy flag */
+ write_segment_descriptor(ctxt, ops, old_tss_sel,
+ &curr_tss_desc);
+ }
+
+ if (reason == TASK_SWITCH_IRET)
+ ctxt->eflags = ctxt->eflags & ~X86_EFLAGS_NT;
+
+ /* set back link to prev task only if NT bit is set in eflags
+ note that old_tss_sel is not used afetr this point */
+ if (reason != TASK_SWITCH_CALL && reason != TASK_SWITCH_GATE)
+ old_tss_sel = 0xffff;
+
+ if (next_tss_desc.type & 8)
+ ret = task_switch_32(ctxt, ops, tss_selector, old_tss_sel,
+ old_tss_base, &next_tss_desc);
+ else
+ ret = task_switch_16(ctxt, ops, tss_selector, old_tss_sel,
+ old_tss_base, &next_tss_desc);
+ if (ret != X86EMUL_CONTINUE)
+ return ret;
+
+ if (reason == TASK_SWITCH_CALL || reason == TASK_SWITCH_GATE)
+ ctxt->eflags = ctxt->eflags | X86_EFLAGS_NT;
+
+ if (reason != TASK_SWITCH_IRET) {
+ next_tss_desc.type |= (1 << 1); /* set busy flag */
+ write_segment_descriptor(ctxt, ops, tss_selector,
+ &next_tss_desc);
+ }
+
+ ops->set_cr(0, ops->get_cr(0, ctxt->vcpu) | X86_CR0_TS, ctxt->vcpu);
+ ops->set_cached_descriptor(&next_tss_desc, VCPU_SREG_TR, ctxt->vcpu);
+ ops->set_segment_selector(tss_selector, VCPU_SREG_TR, ctxt->vcpu);
+
+ if (has_error_code) {
+ struct decode_cache *c = &ctxt->decode;
+
+ c->op_bytes = c->ad_bytes = (next_tss_desc.type & 8) ? 4 : 2;
+ c->lock_prefix = 0;
+ c->src.val = (unsigned long) error_code;
+ emulate_push(ctxt);
+ }
+
+ return ret;
+}
+
+int emulator_task_switch(struct x86_emulate_ctxt *ctxt,
+ struct x86_emulate_ops *ops,
+ u16 tss_selector, int reason,
+ bool has_error_code, u32 error_code)
+{
+ struct decode_cache *c = &ctxt->decode;
+ int rc;
+
+ memset(c, 0, sizeof(struct decode_cache));
+ c->eip = ctxt->eip;
+ memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs);
+ c->dst.type = OP_NONE;
+
+ rc = emulator_do_task_switch(ctxt, ops, tss_selector, reason,
+ has_error_code, error_code);
+
+ if (rc == X86EMUL_CONTINUE) {
+ memcpy(ctxt->vcpu->arch.regs, c->regs, sizeof c->regs);
+ kvm_rip_write(ctxt->vcpu, c->eip);
+ rc = writeback(ctxt, ops);
+ }
+
+ return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0;
+}
+
+static void string_addr_inc(struct x86_emulate_ctxt *ctxt, unsigned long base,
+ int reg, struct operand *op)
+{
+ struct decode_cache *c = &ctxt->decode;
+ int df = (ctxt->eflags & EFLG_DF) ? -1 : 1;
+
+ register_address_increment(c, &c->regs[reg], df * op->bytes);
+ op->ptr = (unsigned long *)register_address(c, base, c->regs[reg]);
+}
+
int
x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
{
- unsigned long memop = 0;
u64 msr_data;
- unsigned long saved_eip = 0;
struct decode_cache *c = &ctxt->decode;
- unsigned int port;
- int io_dir_in;
- int rc = 0;
+ int rc = X86EMUL_CONTINUE;
+ int saved_dst_type = c->dst.type;
ctxt->interruptibility = 0;
@@ -1826,26 +2482,30 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
*/
memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs);
- saved_eip = c->eip;
+
+ if (ctxt->mode == X86EMUL_MODE_PROT64 && (c->d & No64)) {
+ kvm_queue_exception(ctxt->vcpu, UD_VECTOR);
+ goto done;
+ }
/* LOCK prefix is allowed only with some instructions */
- if (c->lock_prefix && !(c->d & Lock)) {
+ if (c->lock_prefix && (!(c->d & Lock) || c->dst.type != OP_MEM)) {
kvm_queue_exception(ctxt->vcpu, UD_VECTOR);
goto done;
}
/* Privileged instruction can be executed only in CPL=0 */
- if ((c->d & Priv) && kvm_x86_ops->get_cpl(ctxt->vcpu)) {
+ if ((c->d & Priv) && ops->cpl(ctxt->vcpu)) {
kvm_inject_gp(ctxt->vcpu, 0);
goto done;
}
- if (((c->d & ModRM) && (c->modrm_mod != 3)) || (c->d & MemAbs))
- memop = c->modrm_ea;
-
if (c->rep_prefix && (c->d & String)) {
+ ctxt->restart = true;
/* All REP prefixes have the same first termination condition */
- if (c->regs[VCPU_REGS_RCX] == 0) {
+ if (address_mask(c, c->regs[VCPU_REGS_RCX]) == 0) {
+ string_done:
+ ctxt->restart = false;
kvm_rip_write(ctxt->vcpu, c->eip);
goto done;
}
@@ -1857,25 +2517,18 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
* - if REPNE/REPNZ and ZF = 1 then done
*/
if ((c->b == 0xa6) || (c->b == 0xa7) ||
- (c->b == 0xae) || (c->b == 0xaf)) {
+ (c->b == 0xae) || (c->b == 0xaf)) {
if ((c->rep_prefix == REPE_PREFIX) &&
- ((ctxt->eflags & EFLG_ZF) == 0)) {
- kvm_rip_write(ctxt->vcpu, c->eip);
- goto done;
- }
+ ((ctxt->eflags & EFLG_ZF) == 0))
+ goto string_done;
if ((c->rep_prefix == REPNE_PREFIX) &&
- ((ctxt->eflags & EFLG_ZF) == EFLG_ZF)) {
- kvm_rip_write(ctxt->vcpu, c->eip);
- goto done;
- }
+ ((ctxt->eflags & EFLG_ZF) == EFLG_ZF))
+ goto string_done;
}
- c->regs[VCPU_REGS_RCX]--;
- c->eip = kvm_rip_read(ctxt->vcpu);
+ c->eip = ctxt->eip;
}
if (c->src.type == OP_MEM) {
- c->src.ptr = (unsigned long *)memop;
- c->src.val = 0;
rc = ops->read_emulated((unsigned long)c->src.ptr,
&c->src.val,
c->src.bytes,
@@ -1885,29 +2538,25 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
c->src.orig_val = c->src.val;
}
+ if (c->src2.type == OP_MEM) {
+ rc = ops->read_emulated((unsigned long)c->src2.ptr,
+ &c->src2.val,
+ c->src2.bytes,
+ ctxt->vcpu);
+ if (rc != X86EMUL_CONTINUE)
+ goto done;
+ }
+
if ((c->d & DstMask) == ImplicitOps)
goto special_insn;
- if (c->dst.type == OP_MEM) {
- c->dst.ptr = (unsigned long *)memop;
- c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
- c->dst.val = 0;
- if (c->d & BitOp) {
- unsigned long mask = ~(c->dst.bytes * 8 - 1);
-
- c->dst.ptr = (void *)c->dst.ptr +
- (c->src.val & mask) / 8;
- }
- if (!(c->d & Mov)) {
- /* optimisation - avoid slow emulated read */
- rc = ops->read_emulated((unsigned long)c->dst.ptr,
- &c->dst.val,
- c->dst.bytes,
- ctxt->vcpu);
- if (rc != X86EMUL_CONTINUE)
- goto done;
- }
+ if ((c->dst.type == OP_MEM) && !(c->d & Mov)) {
+ /* optimisation - avoid slow emulated read if Mov */
+ rc = ops->read_emulated((unsigned long)c->dst.ptr, &c->dst.val,
+ c->dst.bytes, ctxt->vcpu);
+ if (rc != X86EMUL_CONTINUE)
+ goto done;
}
c->dst.orig_val = c->dst.val;
@@ -1926,7 +2575,7 @@ special_insn:
break;
case 0x07: /* pop es */
rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_ES);
- if (rc != 0)
+ if (rc != X86EMUL_CONTINUE)
goto done;
break;
case 0x08 ... 0x0d:
@@ -1945,7 +2594,7 @@ special_insn:
break;
case 0x17: /* pop ss */
rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_SS);
- if (rc != 0)
+ if (rc != X86EMUL_CONTINUE)
goto done;
break;
case 0x18 ... 0x1d:
@@ -1957,7 +2606,7 @@ special_insn:
break;
case 0x1f: /* pop ds */
rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_DS);
- if (rc != 0)
+ if (rc != X86EMUL_CONTINUE)
goto done;
break;
case 0x20 ... 0x25:
@@ -1988,7 +2637,7 @@ special_insn:
case 0x58 ... 0x5f: /* pop reg */
pop_instruction:
rc = emulate_pop(ctxt, ops, &c->dst.val, c->op_bytes);
- if (rc != 0)
+ if (rc != X86EMUL_CONTINUE)
goto done;
break;
case 0x60: /* pusha */
@@ -1996,7 +2645,7 @@ special_insn:
break;
case 0x61: /* popa */
rc = emulate_popa(ctxt, ops);
- if (rc != 0)
+ if (rc != X86EMUL_CONTINUE)
goto done;
break;
case 0x63: /* movsxd */
@@ -2010,47 +2659,29 @@ special_insn:
break;
case 0x6c: /* insb */
case 0x6d: /* insw/insd */
+ c->dst.bytes = min(c->dst.bytes, 4u);
if (!emulator_io_permited(ctxt, ops, c->regs[VCPU_REGS_RDX],
- (c->d & ByteOp) ? 1 : c->op_bytes)) {
+ c->dst.bytes)) {
kvm_inject_gp(ctxt->vcpu, 0);
goto done;
}
- if (kvm_emulate_pio_string(ctxt->vcpu,
- 1,
- (c->d & ByteOp) ? 1 : c->op_bytes,
- c->rep_prefix ?
- address_mask(c, c->regs[VCPU_REGS_RCX]) : 1,
- (ctxt->eflags & EFLG_DF),
- register_address(c, es_base(ctxt),
- c->regs[VCPU_REGS_RDI]),
- c->rep_prefix,
- c->regs[VCPU_REGS_RDX]) == 0) {
- c->eip = saved_eip;
- return -1;
- }
- return 0;
+ if (!pio_in_emulated(ctxt, ops, c->dst.bytes,
+ c->regs[VCPU_REGS_RDX], &c->dst.val))
+ goto done; /* IO is needed, skip writeback */
+ break;
case 0x6e: /* outsb */
case 0x6f: /* outsw/outsd */
+ c->src.bytes = min(c->src.bytes, 4u);
if (!emulator_io_permited(ctxt, ops, c->regs[VCPU_REGS_RDX],
- (c->d & ByteOp) ? 1 : c->op_bytes)) {
+ c->src.bytes)) {
kvm_inject_gp(ctxt->vcpu, 0);
goto done;
}
- if (kvm_emulate_pio_string(ctxt->vcpu,
- 0,
- (c->d & ByteOp) ? 1 : c->op_bytes,
- c->rep_prefix ?
- address_mask(c, c->regs[VCPU_REGS_RCX]) : 1,
- (ctxt->eflags & EFLG_DF),
- register_address(c,
- seg_override_base(ctxt, c),
- c->regs[VCPU_REGS_RSI]),
- c->rep_prefix,
- c->regs[VCPU_REGS_RDX]) == 0) {
- c->eip = saved_eip;
- return -1;
- }
- return 0;
+ ops->pio_out_emulated(c->src.bytes, c->regs[VCPU_REGS_RDX],
+ &c->src.val, 1, ctxt->vcpu);
+
+ c->dst.type = OP_NONE; /* nothing to writeback */
+ break;
case 0x70 ... 0x7f: /* jcc (short) */
if (test_cc(c->b, ctxt->eflags))
jmp_rel(c, c->src.val);
@@ -2107,12 +2738,11 @@ special_insn:
case 0x8c: { /* mov r/m, sreg */
struct kvm_segment segreg;
- if (c->modrm_reg <= 5)
+ if (c->modrm_reg <= VCPU_SREG_GS)
kvm_get_segment(ctxt->vcpu, &segreg, c->modrm_reg);
else {
- printk(KERN_INFO "0x8c: Invalid segreg in modrm byte 0x%02x\n",
- c->modrm);
- goto cannot_emulate;
+ kvm_queue_exception(ctxt->vcpu, UD_VECTOR);
+ goto done;
}
c->dst.val = segreg.selector;
break;
@@ -2132,16 +2762,16 @@ special_insn:
}
if (c->modrm_reg == VCPU_SREG_SS)
- toggle_interruptibility(ctxt, X86_SHADOW_INT_MOV_SS);
+ toggle_interruptibility(ctxt, KVM_X86_SHADOW_INT_MOV_SS);
- rc = kvm_load_segment_descriptor(ctxt->vcpu, sel, c->modrm_reg);
+ rc = load_segment_descriptor(ctxt, ops, sel, c->modrm_reg);
c->dst.type = OP_NONE; /* Disable writeback. */
break;
}
case 0x8f: /* pop (sole member of Grp1a) */
rc = emulate_grp1a(ctxt, ops);
- if (rc != 0)
+ if (rc != X86EMUL_CONTINUE)
goto done;
break;
case 0x90: /* nop / xchg r8,rax */
@@ -2175,89 +2805,16 @@ special_insn:
c->dst.val = (unsigned long)c->regs[VCPU_REGS_RAX];
break;
case 0xa4 ... 0xa5: /* movs */
- c->dst.type = OP_MEM;
- c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
- c->dst.ptr = (unsigned long *)register_address(c,
- es_base(ctxt),
- c->regs[VCPU_REGS_RDI]);
- rc = ops->read_emulated(register_address(c,
- seg_override_base(ctxt, c),
- c->regs[VCPU_REGS_RSI]),
- &c->dst.val,
- c->dst.bytes, ctxt->vcpu);
- if (rc != X86EMUL_CONTINUE)
- goto done;
- register_address_increment(c, &c->regs[VCPU_REGS_RSI],
- (ctxt->eflags & EFLG_DF) ? -c->dst.bytes
- : c->dst.bytes);
- register_address_increment(c, &c->regs[VCPU_REGS_RDI],
- (ctxt->eflags & EFLG_DF) ? -c->dst.bytes
- : c->dst.bytes);
- break;
+ goto mov;
case 0xa6 ... 0xa7: /* cmps */
- c->src.type = OP_NONE; /* Disable writeback. */
- c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
- c->src.ptr = (unsigned long *)register_address(c,
- seg_override_base(ctxt, c),
- c->regs[VCPU_REGS_RSI]);
- rc = ops->read_emulated((unsigned long)c->src.ptr,
- &c->src.val,
- c->src.bytes,
- ctxt->vcpu);
- if (rc != X86EMUL_CONTINUE)
- goto done;
-
c->dst.type = OP_NONE; /* Disable writeback. */
- c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
- c->dst.ptr = (unsigned long *)register_address(c,
- es_base(ctxt),
- c->regs[VCPU_REGS_RDI]);
- rc = ops->read_emulated((unsigned long)c->dst.ptr,
- &c->dst.val,
- c->dst.bytes,
- ctxt->vcpu);
- if (rc != X86EMUL_CONTINUE)
- goto done;
-
DPRINTF("cmps: mem1=0x%p mem2=0x%p\n", c->src.ptr, c->dst.ptr);
-
- emulate_2op_SrcV("cmp", c->src, c->dst, ctxt->eflags);
-
- register_address_increment(c, &c->regs[VCPU_REGS_RSI],
- (ctxt->eflags & EFLG_DF) ? -c->src.bytes
- : c->src.bytes);
- register_address_increment(c, &c->regs[VCPU_REGS_RDI],
- (ctxt->eflags & EFLG_DF) ? -c->dst.bytes
- : c->dst.bytes);
-
- break;
+ goto cmp;
case 0xaa ... 0xab: /* stos */
- c->dst.type = OP_MEM;
- c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
- c->dst.ptr = (unsigned long *)register_address(c,
- es_base(ctxt),
- c->regs[VCPU_REGS_RDI]);
c->dst.val = c->regs[VCPU_REGS_RAX];
- register_address_increment(c, &c->regs[VCPU_REGS_RDI],
- (ctxt->eflags & EFLG_DF) ? -c->dst.bytes
- : c->dst.bytes);
break;
case 0xac ... 0xad: /* lods */
- c->dst.type = OP_REG;
- c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
- c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX];
- rc = ops->read_emulated(register_address(c,
- seg_override_base(ctxt, c),
- c->regs[VCPU_REGS_RSI]),
- &c->dst.val,
- c->dst.bytes,
- ctxt->vcpu);
- if (rc != X86EMUL_CONTINUE)
- goto done;
- register_address_increment(c, &c->regs[VCPU_REGS_RSI],
- (ctxt->eflags & EFLG_DF) ? -c->dst.bytes
- : c->dst.bytes);
- break;
+ goto mov;
case 0xae ... 0xaf: /* scas */
DPRINTF("Urk! I don't handle SCAS.\n");
goto cannot_emulate;
@@ -2277,7 +2834,7 @@ special_insn:
break;
case 0xcb: /* ret far */
rc = emulate_ret_far(ctxt, ops);
- if (rc)
+ if (rc != X86EMUL_CONTINUE)
goto done;
break;
case 0xd0 ... 0xd1: /* Grp2 */
@@ -2290,14 +2847,10 @@ special_insn:
break;
case 0xe4: /* inb */
case 0xe5: /* in */
- port = c->src.val;
- io_dir_in = 1;
- goto do_io;
+ goto do_io_in;
case 0xe6: /* outb */
case 0xe7: /* out */
- port = c->src.val;
- io_dir_in = 0;
- goto do_io;
+ goto do_io_out;
case 0xe8: /* call (near) */ {
long int rel = c->src.val;
c->src.val = (unsigned long) c->eip;
@@ -2308,8 +2861,9 @@ special_insn:
case 0xe9: /* jmp rel */
goto jmp;
case 0xea: /* jmp far */
- if (kvm_load_segment_descriptor(ctxt->vcpu, c->src2.val,
- VCPU_SREG_CS))
+ jump_far:
+ if (load_segment_descriptor(ctxt, ops, c->src2.val,
+ VCPU_SREG_CS))
goto done;
c->eip = c->src.val;
@@ -2321,25 +2875,29 @@ special_insn:
break;
case 0xec: /* in al,dx */
case 0xed: /* in (e/r)ax,dx */
- port = c->regs[VCPU_REGS_RDX];
- io_dir_in = 1;
- goto do_io;
+ c->src.val = c->regs[VCPU_REGS_RDX];
+ do_io_in:
+ c->dst.bytes = min(c->dst.bytes, 4u);
+ if (!emulator_io_permited(ctxt, ops, c->src.val, c->dst.bytes)) {
+ kvm_inject_gp(ctxt->vcpu, 0);
+ goto done;
+ }
+ if (!pio_in_emulated(ctxt, ops, c->dst.bytes, c->src.val,
+ &c->dst.val))
+ goto done; /* IO is needed */
+ break;
case 0xee: /* out al,dx */
case 0xef: /* out (e/r)ax,dx */
- port = c->regs[VCPU_REGS_RDX];
- io_dir_in = 0;
- do_io:
- if (!emulator_io_permited(ctxt, ops, port,
- (c->d & ByteOp) ? 1 : c->op_bytes)) {
+ c->src.val = c->regs[VCPU_REGS_RDX];
+ do_io_out:
+ c->dst.bytes = min(c->dst.bytes, 4u);
+ if (!emulator_io_permited(ctxt, ops, c->src.val, c->dst.bytes)) {
kvm_inject_gp(ctxt->vcpu, 0);
goto done;
}
- if (kvm_emulate_pio(ctxt->vcpu, io_dir_in,
- (c->d & ByteOp) ? 1 : c->op_bytes,
- port) != 0) {
- c->eip = saved_eip;
- goto cannot_emulate;
- }
+ ops->pio_out_emulated(c->dst.bytes, c->src.val, &c->dst.val, 1,
+ ctxt->vcpu);
+ c->dst.type = OP_NONE; /* Disable writeback. */
break;
case 0xf4: /* hlt */
ctxt->vcpu->arch.halt_request = 1;
@@ -2350,16 +2908,15 @@ special_insn:
c->dst.type = OP_NONE; /* Disable writeback. */
break;
case 0xf6 ... 0xf7: /* Grp3 */
- rc = emulate_grp3(ctxt, ops);
- if (rc != 0)
- goto done;
+ if (!emulate_grp3(ctxt, ops))
+ goto cannot_emulate;
break;
case 0xf8: /* clc */
ctxt->eflags &= ~EFLG_CF;
c->dst.type = OP_NONE; /* Disable writeback. */
break;
case 0xfa: /* cli */
- if (emulator_bad_iopl(ctxt))
+ if (emulator_bad_iopl(ctxt, ops))
kvm_inject_gp(ctxt->vcpu, 0);
else {
ctxt->eflags &= ~X86_EFLAGS_IF;
@@ -2367,10 +2924,10 @@ special_insn:
}
break;
case 0xfb: /* sti */
- if (emulator_bad_iopl(ctxt))
+ if (emulator_bad_iopl(ctxt, ops))
kvm_inject_gp(ctxt->vcpu, 0);
else {
- toggle_interruptibility(ctxt, X86_SHADOW_INT_STI);
+ toggle_interruptibility(ctxt, KVM_X86_SHADOW_INT_STI);
ctxt->eflags |= X86_EFLAGS_IF;
c->dst.type = OP_NONE; /* Disable writeback. */
}
@@ -2383,28 +2940,55 @@ special_insn:
ctxt->eflags |= EFLG_DF;
c->dst.type = OP_NONE; /* Disable writeback. */
break;
- case 0xfe ... 0xff: /* Grp4/Grp5 */
+ case 0xfe: /* Grp4 */
+ grp45:
rc = emulate_grp45(ctxt, ops);
- if (rc != 0)
+ if (rc != X86EMUL_CONTINUE)
goto done;
break;
+ case 0xff: /* Grp5 */
+ if (c->modrm_reg == 5)
+ goto jump_far;
+ goto grp45;
}
writeback:
rc = writeback(ctxt, ops);
- if (rc != 0)
+ if (rc != X86EMUL_CONTINUE)
goto done;
+ /*
+ * restore dst type in case the decoding will be reused
+ * (happens for string instruction )
+ */
+ c->dst.type = saved_dst_type;
+
+ if ((c->d & SrcMask) == SrcSI)
+ string_addr_inc(ctxt, seg_override_base(ctxt, c), VCPU_REGS_RSI,
+ &c->src);
+
+ if ((c->d & DstMask) == DstDI)
+ string_addr_inc(ctxt, es_base(ctxt), VCPU_REGS_RDI, &c->dst);
+
+ if (c->rep_prefix && (c->d & String)) {
+ struct read_cache *rc = &ctxt->decode.io_read;
+ register_address_increment(c, &c->regs[VCPU_REGS_RCX], -1);
+ /*
+ * Re-enter guest when pio read ahead buffer is empty or,
+ * if it is not used, after each 1024 iteration.
+ */
+ if ((rc->end == 0 && !(c->regs[VCPU_REGS_RCX] & 0x3ff)) ||
+ (rc->end != 0 && rc->end == rc->pos))
+ ctxt->restart = false;
+ }
+
/* Commit shadow register state. */
memcpy(ctxt->vcpu->arch.regs, c->regs, sizeof c->regs);
kvm_rip_write(ctxt->vcpu, c->eip);
+ ops->set_rflags(ctxt->vcpu, ctxt->eflags);
done:
- if (rc == X86EMUL_UNHANDLEABLE) {
- c->eip = saved_eip;
- return -1;
- }
- return 0;
+ return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0;
twobyte_insn:
switch (c->b) {
@@ -2418,18 +3002,18 @@ twobyte_insn:
goto cannot_emulate;
rc = kvm_fix_hypercall(ctxt->vcpu);
- if (rc)
+ if (rc != X86EMUL_CONTINUE)
goto done;
/* Let the processor re-execute the fixed hypercall */
- c->eip = kvm_rip_read(ctxt->vcpu);
+ c->eip = ctxt->eip;
/* Disable writeback. */
c->dst.type = OP_NONE;
break;
case 2: /* lgdt */
rc = read_descriptor(ctxt, ops, c->src.ptr,
&size, &address, c->op_bytes);
- if (rc)
+ if (rc != X86EMUL_CONTINUE)
goto done;
realmode_lgdt(ctxt->vcpu, size, address);
/* Disable writeback. */
@@ -2440,7 +3024,7 @@ twobyte_insn:
switch (c->modrm_rm) {
case 1:
rc = kvm_fix_hypercall(ctxt->vcpu);
- if (rc)
+ if (rc != X86EMUL_CONTINUE)
goto done;
break;
default:
@@ -2450,7 +3034,7 @@ twobyte_insn:
rc = read_descriptor(ctxt, ops, c->src.ptr,
&size, &address,
c->op_bytes);
- if (rc)
+ if (rc != X86EMUL_CONTINUE)
goto done;
realmode_lidt(ctxt->vcpu, size, address);
}
@@ -2459,15 +3043,18 @@ twobyte_insn:
break;
case 4: /* smsw */
c->dst.bytes = 2;
- c->dst.val = realmode_get_cr(ctxt->vcpu, 0);
+ c->dst.val = ops->get_cr(0, ctxt->vcpu);
break;
case 6: /* lmsw */
- realmode_lmsw(ctxt->vcpu, (u16)c->src.val,
- &ctxt->eflags);
+ ops->set_cr(0, (ops->get_cr(0, ctxt->vcpu) & ~0x0ful) |
+ (c->src.val & 0x0f), ctxt->vcpu);
c->dst.type = OP_NONE;
break;
+ case 5: /* not defined */
+ kvm_queue_exception(ctxt->vcpu, UD_VECTOR);
+ goto done;
case 7: /* invlpg*/
- emulate_invlpg(ctxt->vcpu, memop);
+ emulate_invlpg(ctxt->vcpu, c->modrm_ea);
/* Disable writeback. */
c->dst.type = OP_NONE;
break;
@@ -2493,54 +3080,54 @@ twobyte_insn:
c->dst.type = OP_NONE;
break;
case 0x20: /* mov cr, reg */
- if (c->modrm_mod != 3)
- goto cannot_emulate;
- c->regs[c->modrm_rm] =
- realmode_get_cr(ctxt->vcpu, c->modrm_reg);
+ switch (c->modrm_reg) {
+ case 1:
+ case 5 ... 7:
+ case 9 ... 15:
+ kvm_queue_exception(ctxt->vcpu, UD_VECTOR);
+ goto done;
+ }
+ c->regs[c->modrm_rm] = ops->get_cr(c->modrm_reg, ctxt->vcpu);
c->dst.type = OP_NONE; /* no writeback */
break;
case 0x21: /* mov from dr to reg */
- if (c->modrm_mod != 3)
- goto cannot_emulate;
- rc = emulator_get_dr(ctxt, c->modrm_reg, &c->regs[c->modrm_rm]);
- if (rc)
- goto cannot_emulate;
+ if ((ops->get_cr(4, ctxt->vcpu) & X86_CR4_DE) &&
+ (c->modrm_reg == 4 || c->modrm_reg == 5)) {
+ kvm_queue_exception(ctxt->vcpu, UD_VECTOR);
+ goto done;
+ }
+ emulator_get_dr(ctxt, c->modrm_reg, &c->regs[c->modrm_rm]);
c->dst.type = OP_NONE; /* no writeback */
break;
case 0x22: /* mov reg, cr */
- if (c->modrm_mod != 3)
- goto cannot_emulate;
- realmode_set_cr(ctxt->vcpu,
- c->modrm_reg, c->modrm_val, &ctxt->eflags);
+ ops->set_cr(c->modrm_reg, c->modrm_val, ctxt->vcpu);
c->dst.type = OP_NONE;
break;
case 0x23: /* mov from reg to dr */
- if (c->modrm_mod != 3)
- goto cannot_emulate;
- rc = emulator_set_dr(ctxt, c->modrm_reg,
- c->regs[c->modrm_rm]);
- if (rc)
- goto cannot_emulate;
+ if ((ops->get_cr(4, ctxt->vcpu) & X86_CR4_DE) &&
+ (c->modrm_reg == 4 || c->modrm_reg == 5)) {
+ kvm_queue_exception(ctxt->vcpu, UD_VECTOR);
+ goto done;
+ }
+ emulator_set_dr(ctxt, c->modrm_reg, c->regs[c->modrm_rm]);
c->dst.type = OP_NONE; /* no writeback */
break;
case 0x30:
/* wrmsr */
msr_data = (u32)c->regs[VCPU_REGS_RAX]
| ((u64)c->regs[VCPU_REGS_RDX] << 32);
- rc = kvm_set_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], msr_data);
- if (rc) {
+ if (kvm_set_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], msr_data)) {
kvm_inject_gp(ctxt->vcpu, 0);
- c->eip = kvm_rip_read(ctxt->vcpu);
+ goto done;
}
rc = X86EMUL_CONTINUE;
c->dst.type = OP_NONE;
break;
case 0x32:
/* rdmsr */
- rc = kvm_get_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], &msr_data);
- if (rc) {
+ if (kvm_get_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], &msr_data)) {
kvm_inject_gp(ctxt->vcpu, 0);
- c->eip = kvm_rip_read(ctxt->vcpu);
+ goto done;
} else {
c->regs[VCPU_REGS_RAX] = (u32)msr_data;
c->regs[VCPU_REGS_RDX] = msr_data >> 32;
@@ -2577,7 +3164,7 @@ twobyte_insn:
break;
case 0xa1: /* pop fs */
rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_FS);
- if (rc != 0)
+ if (rc != X86EMUL_CONTINUE)
goto done;
break;
case 0xa3:
@@ -2596,7 +3183,7 @@ twobyte_insn:
break;
case 0xa9: /* pop gs */
rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_GS);
- if (rc != 0)
+ if (rc != X86EMUL_CONTINUE)
goto done;
break;
case 0xab:
@@ -2668,16 +3255,14 @@ twobyte_insn:
(u64) c->src.val;
break;
case 0xc7: /* Grp9 (cmpxchg8b) */
- rc = emulate_grp9(ctxt, ops, memop);
- if (rc != 0)
+ rc = emulate_grp9(ctxt, ops);
+ if (rc != X86EMUL_CONTINUE)
goto done;
- c->dst.type = OP_NONE;
break;
}
goto writeback;
cannot_emulate:
DPRINTF("Cannot emulate %02x\n", c->b);
- c->eip = saved_eip;
return -1;
}
diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
index a790fa1..93825ff 100644
--- a/arch/x86/kvm/i8259.c
+++ b/arch/x86/kvm/i8259.c
@@ -33,6 +33,29 @@
#include <linux/kvm_host.h>
#include "trace.h"
+static void pic_lock(struct kvm_pic *s)
+ __acquires(&s->lock)
+{
+ raw_spin_lock(&s->lock);
+}
+
+static void pic_unlock(struct kvm_pic *s)
+ __releases(&s->lock)
+{
+ bool wakeup = s->wakeup_needed;
+ struct kvm_vcpu *vcpu;
+
+ s->wakeup_needed = false;
+
+ raw_spin_unlock(&s->lock);
+
+ if (wakeup) {
+ vcpu = s->kvm->bsp_vcpu;
+ if (vcpu)
+ kvm_vcpu_kick(vcpu);
+ }
+}
+
static void pic_clear_isr(struct kvm_kpic_state *s, int irq)
{
s->isr &= ~(1 << irq);
@@ -45,19 +68,19 @@ static void pic_clear_isr(struct kvm_kpic_state *s, int irq)
* Other interrupt may be delivered to PIC while lock is dropped but
* it should be safe since PIC state is already updated at this stage.
*/
- raw_spin_unlock(&s->pics_state->lock);
+ pic_unlock(s->pics_state);
kvm_notify_acked_irq(s->pics_state->kvm, SELECT_PIC(irq), irq);
- raw_spin_lock(&s->pics_state->lock);
+ pic_lock(s->pics_state);
}
void kvm_pic_clear_isr_ack(struct kvm *kvm)
{
struct kvm_pic *s = pic_irqchip(kvm);
- raw_spin_lock(&s->lock);
+ pic_lock(s);
s->pics[0].isr_ack = 0xff;
s->pics[1].isr_ack = 0xff;
- raw_spin_unlock(&s->lock);
+ pic_unlock(s);
}
/*
@@ -158,9 +181,9 @@ static void pic_update_irq(struct kvm_pic *s)
void kvm_pic_update_irq(struct kvm_pic *s)
{
- raw_spin_lock(&s->lock);
+ pic_lock(s);
pic_update_irq(s);
- raw_spin_unlock(&s->lock);
+ pic_unlock(s);
}
int kvm_pic_set_irq(void *opaque, int irq, int level)
@@ -168,14 +191,14 @@ int kvm_pic_set_irq(void *opaque, int irq, int level)
struct kvm_pic *s = opaque;
int ret = -1;
- raw_spin_lock(&s->lock);
+ pic_lock(s);
if (irq >= 0 && irq < PIC_NUM_PINS) {
ret = pic_set_irq1(&s->pics[irq >> 3], irq & 7, level);
pic_update_irq(s);
trace_kvm_pic_set_irq(irq >> 3, irq & 7, s->pics[irq >> 3].elcr,
s->pics[irq >> 3].imr, ret == 0);
}
- raw_spin_unlock(&s->lock);
+ pic_unlock(s);
return ret;
}
@@ -205,7 +228,7 @@ int kvm_pic_read_irq(struct kvm *kvm)
int irq, irq2, intno;
struct kvm_pic *s = pic_irqchip(kvm);
- raw_spin_lock(&s->lock);
+ pic_lock(s);
irq = pic_get_irq(&s->pics[0]);
if (irq >= 0) {
pic_intack(&s->pics[0], irq);
@@ -230,7 +253,7 @@ int kvm_pic_read_irq(struct kvm *kvm)
intno = s->pics[0].irq_base + irq;
}
pic_update_irq(s);
- raw_spin_unlock(&s->lock);
+ pic_unlock(s);
return intno;
}
@@ -444,7 +467,7 @@ static int picdev_write(struct kvm_io_device *this,
printk(KERN_ERR "PIC: non byte write\n");
return 0;
}
- raw_spin_lock(&s->lock);
+ pic_lock(s);
switch (addr) {
case 0x20:
case 0x21:
@@ -457,7 +480,7 @@ static int picdev_write(struct kvm_io_device *this,
elcr_ioport_write(&s->pics[addr & 1], addr, data);
break;
}
- raw_spin_unlock(&s->lock);
+ pic_unlock(s);
return 0;
}
@@ -474,7 +497,7 @@ static int picdev_read(struct kvm_io_device *this,
printk(KERN_ERR "PIC: non byte read\n");
return 0;
}
- raw_spin_lock(&s->lock);
+ pic_lock(s);
switch (addr) {
case 0x20:
case 0x21:
@@ -488,7 +511,7 @@ static int picdev_read(struct kvm_io_device *this,
break;
}
*(unsigned char *)val = data;
- raw_spin_unlock(&s->lock);
+ pic_unlock(s);
return 0;
}
@@ -505,7 +528,7 @@ static void pic_irq_request(void *opaque, int level)
s->output = level;
if (vcpu && level && (s->pics[0].isr_ack & (1 << irq))) {
s->pics[0].isr_ack &= ~(1 << irq);
- kvm_vcpu_kick(vcpu);
+ s->wakeup_needed = true;
}
}
diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h
index 34b15915..cd1f362 100644
--- a/arch/x86/kvm/irq.h
+++ b/arch/x86/kvm/irq.h
@@ -63,6 +63,7 @@ struct kvm_kpic_state {
struct kvm_pic {
raw_spinlock_t lock;
+ bool wakeup_needed;
unsigned pending_acks;
struct kvm *kvm;
struct kvm_kpic_state pics[2]; /* 0 is master pic, 1 is slave pic */
diff --git a/arch/x86/kvm/kvm_timer.h b/arch/x86/kvm/kvm_timer.h
index 55c7524..64bc6ea 100644
--- a/arch/x86/kvm/kvm_timer.h
+++ b/arch/x86/kvm/kvm_timer.h
@@ -10,9 +10,7 @@ struct kvm_timer {
};
struct kvm_timer_ops {
- bool (*is_periodic)(struct kvm_timer *);
+ bool (*is_periodic)(struct kvm_timer *);
};
-
enum hrtimer_restart kvm_timer_fn(struct hrtimer *data);
-
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 19a8906..81563e7 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -148,7 +148,6 @@ module_param(oos_shadow, bool, 0644);
#include <trace/events/kvm.h>
-#undef TRACE_INCLUDE_FILE
#define CREATE_TRACE_POINTS
#include "mmutrace.h"
@@ -174,12 +173,7 @@ struct kvm_shadow_walk_iterator {
shadow_walk_okay(&(_walker)); \
shadow_walk_next(&(_walker)))
-
-struct kvm_unsync_walk {
- int (*entry) (struct kvm_mmu_page *sp, struct kvm_unsync_walk *walk);
-};
-
-typedef int (*mmu_parent_walk_fn) (struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp);
+typedef int (*mmu_parent_walk_fn) (struct kvm_mmu_page *sp);
static struct kmem_cache *pte_chain_cache;
static struct kmem_cache *rmap_desc_cache;
@@ -223,7 +217,7 @@ void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
}
EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes);
-static int is_write_protection(struct kvm_vcpu *vcpu)
+static bool is_write_protection(struct kvm_vcpu *vcpu)
{
return kvm_read_cr0_bits(vcpu, X86_CR0_WP);
}
@@ -327,7 +321,6 @@ static int mmu_topup_memory_cache_page(struct kvm_mmu_memory_cache *cache,
page = alloc_page(GFP_KERNEL);
if (!page)
return -ENOMEM;
- set_page_private(page, 0);
cache->objects[cache->nobjs++] = page_address(page);
}
return 0;
@@ -438,9 +431,9 @@ static void unaccount_shadowed(struct kvm *kvm, gfn_t gfn)
int i;
gfn = unalias_gfn(kvm, gfn);
+ slot = gfn_to_memslot_unaliased(kvm, gfn);
for (i = PT_DIRECTORY_LEVEL;
i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
- slot = gfn_to_memslot_unaliased(kvm, gfn);
write_count = slot_largepage_idx(gfn, slot, i);
*write_count -= 1;
WARN_ON(*write_count < 0);
@@ -654,7 +647,6 @@ static void rmap_remove(struct kvm *kvm, u64 *spte)
static u64 *rmap_next(struct kvm *kvm, unsigned long *rmapp, u64 *spte)
{
struct kvm_rmap_desc *desc;
- struct kvm_rmap_desc *prev_desc;
u64 *prev_spte;
int i;
@@ -666,7 +658,6 @@ static u64 *rmap_next(struct kvm *kvm, unsigned long *rmapp, u64 *spte)
return NULL;
}
desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
- prev_desc = NULL;
prev_spte = NULL;
while (desc) {
for (i = 0; i < RMAP_EXT && desc->sptes[i]; ++i) {
@@ -794,7 +785,7 @@ static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
int retval = 0;
struct kvm_memslots *slots;
- slots = rcu_dereference(kvm->memslots);
+ slots = kvm_memslots(kvm);
for (i = 0; i < slots->nmemslots; i++) {
struct kvm_memory_slot *memslot = &slots->memslots[i];
@@ -925,7 +916,6 @@ static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,
sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE);
set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages);
- INIT_LIST_HEAD(&sp->oos_link);
bitmap_zero(sp->slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS);
sp->multimapped = 0;
sp->parent_pte = parent_pte;
@@ -1009,8 +999,7 @@ static void mmu_page_remove_parent_pte(struct kvm_mmu_page *sp,
}
-static void mmu_parent_walk(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
- mmu_parent_walk_fn fn)
+static void mmu_parent_walk(struct kvm_mmu_page *sp, mmu_parent_walk_fn fn)
{
struct kvm_pte_chain *pte_chain;
struct hlist_node *node;
@@ -1019,8 +1008,8 @@ static void mmu_parent_walk(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
if (!sp->multimapped && sp->parent_pte) {
parent_sp = page_header(__pa(sp->parent_pte));
- fn(vcpu, parent_sp);
- mmu_parent_walk(vcpu, parent_sp, fn);
+ fn(parent_sp);
+ mmu_parent_walk(parent_sp, fn);
return;
}
hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link)
@@ -1028,8 +1017,8 @@ static void mmu_parent_walk(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
if (!pte_chain->parent_ptes[i])
break;
parent_sp = page_header(__pa(pte_chain->parent_ptes[i]));
- fn(vcpu, parent_sp);
- mmu_parent_walk(vcpu, parent_sp, fn);
+ fn(parent_sp);
+ mmu_parent_walk(parent_sp, fn);
}
}
@@ -1066,16 +1055,15 @@ static void kvm_mmu_update_parents_unsync(struct kvm_mmu_page *sp)
}
}
-static int unsync_walk_fn(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
+static int unsync_walk_fn(struct kvm_mmu_page *sp)
{
kvm_mmu_update_parents_unsync(sp);
return 1;
}
-static void kvm_mmu_mark_parents_unsync(struct kvm_vcpu *vcpu,
- struct kvm_mmu_page *sp)
+static void kvm_mmu_mark_parents_unsync(struct kvm_mmu_page *sp)
{
- mmu_parent_walk(vcpu, sp, unsync_walk_fn);
+ mmu_parent_walk(sp, unsync_walk_fn);
kvm_mmu_update_parents_unsync(sp);
}
@@ -1201,6 +1189,7 @@ static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn)
static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp)
{
WARN_ON(!sp->unsync);
+ trace_kvm_mmu_sync_page(sp);
sp->unsync = 0;
--kvm->stat.mmu_unsync;
}
@@ -1209,12 +1198,11 @@ static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp);
static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
{
- if (sp->role.glevels != vcpu->arch.mmu.root_level) {
+ if (sp->role.cr4_pae != !!is_pae(vcpu)) {
kvm_mmu_zap_page(vcpu->kvm, sp);
return 1;
}
- trace_kvm_mmu_sync_page(sp);
if (rmap_write_protect(vcpu->kvm, sp->gfn))
kvm_flush_remote_tlbs(vcpu->kvm);
kvm_unlink_unsync_page(vcpu->kvm, sp);
@@ -1331,6 +1319,8 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
role = vcpu->arch.mmu.base_role;
role.level = level;
role.direct = direct;
+ if (role.direct)
+ role.cr4_pae = 0;
role.access = access;
if (vcpu->arch.mmu.root_level <= PT32_ROOT_LEVEL) {
quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level));
@@ -1351,7 +1341,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
mmu_page_add_parent_pte(vcpu, sp, parent_pte);
if (sp->unsync_children) {
set_bit(KVM_REQ_MMU_SYNC, &vcpu->requests);
- kvm_mmu_mark_parents_unsync(vcpu, sp);
+ kvm_mmu_mark_parents_unsync(sp);
}
trace_kvm_mmu_get_page(sp, false);
return sp;
@@ -1573,13 +1563,14 @@ static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
r = 0;
index = kvm_page_table_hashfn(gfn);
bucket = &kvm->arch.mmu_page_hash[index];
+restart:
hlist_for_each_entry_safe(sp, node, n, bucket, hash_link)
if (sp->gfn == gfn && !sp->role.direct) {
pgprintk("%s: gfn %lx role %x\n", __func__, gfn,
sp->role.word);
r = 1;
if (kvm_mmu_zap_page(kvm, sp))
- n = bucket->first;
+ goto restart;
}
return r;
}
@@ -1593,13 +1584,14 @@ static void mmu_unshadow(struct kvm *kvm, gfn_t gfn)
index = kvm_page_table_hashfn(gfn);
bucket = &kvm->arch.mmu_page_hash[index];
+restart:
hlist_for_each_entry_safe(sp, node, nn, bucket, hash_link) {
if (sp->gfn == gfn && !sp->role.direct
&& !sp->role.invalid) {
pgprintk("%s: zap %lx %x\n",
__func__, gfn, sp->role.word);
if (kvm_mmu_zap_page(kvm, sp))
- nn = bucket->first;
+ goto restart;
}
}
}
@@ -1626,20 +1618,6 @@ static void mmu_convert_notrap(struct kvm_mmu_page *sp)
}
}
-struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva)
-{
- struct page *page;
-
- gpa_t gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, NULL);
-
- if (gpa == UNMAPPED_GVA)
- return NULL;
-
- page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
-
- return page;
-}
-
/*
* The function is based on mtrr_type_lookup() in
* arch/x86/kernel/cpu/mtrr/generic.c
@@ -1752,7 +1730,6 @@ static int kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
struct kvm_mmu_page *s;
struct hlist_node *node, *n;
- trace_kvm_mmu_unsync_page(sp);
index = kvm_page_table_hashfn(sp->gfn);
bucket = &vcpu->kvm->arch.mmu_page_hash[index];
/* don't unsync if pagetable is shadowed with multiple roles */
@@ -1762,10 +1739,11 @@ static int kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
if (s->role.word != sp->role.word)
return 1;
}
+ trace_kvm_mmu_unsync_page(sp);
++vcpu->kvm->stat.mmu_unsync;
sp->unsync = 1;
- kvm_mmu_mark_parents_unsync(vcpu, sp);
+ kvm_mmu_mark_parents_unsync(sp);
mmu_convert_notrap(sp);
return 0;
@@ -2081,21 +2059,23 @@ static int mmu_alloc_roots(struct kvm_vcpu *vcpu)
hpa_t root = vcpu->arch.mmu.root_hpa;
ASSERT(!VALID_PAGE(root));
- if (tdp_enabled)
- direct = 1;
if (mmu_check_root(vcpu, root_gfn))
return 1;
+ if (tdp_enabled) {
+ direct = 1;
+ root_gfn = 0;
+ }
+ spin_lock(&vcpu->kvm->mmu_lock);
sp = kvm_mmu_get_page(vcpu, root_gfn, 0,
PT64_ROOT_LEVEL, direct,
ACC_ALL, NULL);
root = __pa(sp->spt);
++sp->root_count;
+ spin_unlock(&vcpu->kvm->mmu_lock);
vcpu->arch.mmu.root_hpa = root;
return 0;
}
direct = !is_paging(vcpu);
- if (tdp_enabled)
- direct = 1;
for (i = 0; i < 4; ++i) {
hpa_t root = vcpu->arch.mmu.pae_root[i];
@@ -2111,11 +2091,18 @@ static int mmu_alloc_roots(struct kvm_vcpu *vcpu)
root_gfn = 0;
if (mmu_check_root(vcpu, root_gfn))
return 1;
+ if (tdp_enabled) {
+ direct = 1;
+ root_gfn = i << 30;
+ }
+ spin_lock(&vcpu->kvm->mmu_lock);
sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30,
PT32_ROOT_LEVEL, direct,
ACC_ALL, NULL);
root = __pa(sp->spt);
++sp->root_count;
+ spin_unlock(&vcpu->kvm->mmu_lock);
+
vcpu->arch.mmu.pae_root[i] = root | PT_PRESENT_MASK;
}
vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root);
@@ -2299,13 +2286,19 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, int level)
/* no rsvd bits for 2 level 4K page table entries */
context->rsvd_bits_mask[0][1] = 0;
context->rsvd_bits_mask[0][0] = 0;
+ context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[0][0];
+
+ if (!is_pse(vcpu)) {
+ context->rsvd_bits_mask[1][1] = 0;
+ break;
+ }
+
if (is_cpuid_PSE36())
/* 36bits PSE 4MB page */
context->rsvd_bits_mask[1][1] = rsvd_bits(17, 21);
else
/* 32 bits PSE 4MB page */
context->rsvd_bits_mask[1][1] = rsvd_bits(13, 21);
- context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[1][0];
break;
case PT32E_ROOT_LEVEL:
context->rsvd_bits_mask[0][2] =
@@ -2318,7 +2311,7 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, int level)
context->rsvd_bits_mask[1][1] = exb_bit_rsvd |
rsvd_bits(maxphyaddr, 62) |
rsvd_bits(13, 20); /* large page */
- context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[1][0];
+ context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[0][0];
break;
case PT64_ROOT_LEVEL:
context->rsvd_bits_mask[0][3] = exb_bit_rsvd |
@@ -2336,7 +2329,7 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, int level)
context->rsvd_bits_mask[1][1] = exb_bit_rsvd |
rsvd_bits(maxphyaddr, 51) |
rsvd_bits(13, 20); /* large page */
- context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[1][0];
+ context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[0][0];
break;
}
}
@@ -2438,7 +2431,8 @@ static int init_kvm_softmmu(struct kvm_vcpu *vcpu)
else
r = paging32_init_context(vcpu);
- vcpu->arch.mmu.base_role.glevels = vcpu->arch.mmu.root_level;
+ vcpu->arch.mmu.base_role.cr4_pae = !!is_pae(vcpu);
+ vcpu->arch.mmu.base_role.cr0_wp = is_write_protection(vcpu);
return r;
}
@@ -2478,7 +2472,9 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu)
goto out;
spin_lock(&vcpu->kvm->mmu_lock);
kvm_mmu_free_some_pages(vcpu);
+ spin_unlock(&vcpu->kvm->mmu_lock);
r = mmu_alloc_roots(vcpu);
+ spin_lock(&vcpu->kvm->mmu_lock);
mmu_sync_roots(vcpu);
spin_unlock(&vcpu->kvm->mmu_lock);
if (r)
@@ -2527,7 +2523,7 @@ static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
}
++vcpu->kvm->stat.mmu_pte_updated;
- if (sp->role.glevels == PT32_ROOT_LEVEL)
+ if (!sp->role.cr4_pae)
paging32_update_pte(vcpu, sp, spte, new);
else
paging64_update_pte(vcpu, sp, spte, new);
@@ -2562,36 +2558,11 @@ static bool last_updated_pte_accessed(struct kvm_vcpu *vcpu)
}
static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
- const u8 *new, int bytes)
+ u64 gpte)
{
gfn_t gfn;
- int r;
- u64 gpte = 0;
pfn_t pfn;
- if (bytes != 4 && bytes != 8)
- return;
-
- /*
- * Assume that the pte write on a page table of the same type
- * as the current vcpu paging mode. This is nearly always true
- * (might be false while changing modes). Note it is verified later
- * by update_pte().
- */
- if (is_pae(vcpu)) {
- /* Handle a 32-bit guest writing two halves of a 64-bit gpte */
- if ((bytes == 4) && (gpa % 4 == 0)) {
- r = kvm_read_guest(vcpu->kvm, gpa & ~(u64)7, &gpte, 8);
- if (r)
- return;
- memcpy((void *)&gpte + (gpa % 8), new, 4);
- } else if ((bytes == 8) && (gpa % 8 == 0)) {
- memcpy((void *)&gpte, new, 8);
- }
- } else {
- if ((bytes == 4) && (gpa % 4 == 0))
- memcpy((void *)&gpte, new, 4);
- }
if (!is_present_gpte(gpte))
return;
gfn = (gpte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
@@ -2640,10 +2611,46 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
int flooded = 0;
int npte;
int r;
+ int invlpg_counter;
pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes);
- mmu_guess_page_from_pte_write(vcpu, gpa, new, bytes);
+
+ invlpg_counter = atomic_read(&vcpu->kvm->arch.invlpg_counter);
+
+ /*
+ * Assume that the pte write on a page table of the same type
+ * as the current vcpu paging mode. This is nearly always true
+ * (might be false while changing modes). Note it is verified later
+ * by update_pte().
+ */
+ if ((is_pae(vcpu) && bytes == 4) || !new) {
+ /* Handle a 32-bit guest writing two halves of a 64-bit gpte */
+ if (is_pae(vcpu)) {
+ gpa &= ~(gpa_t)7;
+ bytes = 8;
+ }
+ r = kvm_read_guest(vcpu->kvm, gpa, &gentry, min(bytes, 8));
+ if (r)
+ gentry = 0;
+ new = (const u8 *)&gentry;
+ }
+
+ switch (bytes) {
+ case 4:
+ gentry = *(const u32 *)new;
+ break;
+ case 8:
+ gentry = *(const u64 *)new;
+ break;
+ default:
+ gentry = 0;
+ break;
+ }
+
+ mmu_guess_page_from_pte_write(vcpu, gpa, gentry);
spin_lock(&vcpu->kvm->mmu_lock);
+ if (atomic_read(&vcpu->kvm->arch.invlpg_counter) != invlpg_counter)
+ gentry = 0;
kvm_mmu_access_page(vcpu, gfn);
kvm_mmu_free_some_pages(vcpu);
++vcpu->kvm->stat.mmu_pte_write;
@@ -2662,10 +2669,12 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
}
index = kvm_page_table_hashfn(gfn);
bucket = &vcpu->kvm->arch.mmu_page_hash[index];
+
+restart:
hlist_for_each_entry_safe(sp, node, n, bucket, hash_link) {
if (sp->gfn != gfn || sp->role.direct || sp->role.invalid)
continue;
- pte_size = sp->role.glevels == PT32_ROOT_LEVEL ? 4 : 8;
+ pte_size = sp->role.cr4_pae ? 8 : 4;
misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1);
misaligned |= bytes < 4;
if (misaligned || flooded) {
@@ -2682,14 +2691,14 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
pgprintk("misaligned: gpa %llx bytes %d role %x\n",
gpa, bytes, sp->role.word);
if (kvm_mmu_zap_page(vcpu->kvm, sp))
- n = bucket->first;
+ goto restart;
++vcpu->kvm->stat.mmu_flooded;
continue;
}
page_offset = offset;
level = sp->role.level;
npte = 1;
- if (sp->role.glevels == PT32_ROOT_LEVEL) {
+ if (!sp->role.cr4_pae) {
page_offset <<= 1; /* 32->64 */
/*
* A 32-bit pde maps 4MB while the shadow pdes map
@@ -2707,20 +2716,11 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
continue;
}
spte = &sp->spt[page_offset / sizeof(*spte)];
- if ((gpa & (pte_size - 1)) || (bytes < pte_size)) {
- gentry = 0;
- r = kvm_read_guest_atomic(vcpu->kvm,
- gpa & ~(u64)(pte_size - 1),
- &gentry, pte_size);
- new = (const void *)&gentry;
- if (r < 0)
- new = NULL;
- }
while (npte--) {
entry = *spte;
mmu_pte_write_zap_pte(vcpu, sp, spte);
- if (new)
- mmu_pte_write_new_pte(vcpu, sp, spte, new);
+ if (gentry)
+ mmu_pte_write_new_pte(vcpu, sp, spte, &gentry);
mmu_pte_write_flush_tlb(vcpu, entry, *spte);
++spte;
}
@@ -2900,22 +2900,23 @@ void kvm_mmu_zap_all(struct kvm *kvm)
struct kvm_mmu_page *sp, *node;
spin_lock(&kvm->mmu_lock);
+restart:
list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link)
if (kvm_mmu_zap_page(kvm, sp))
- node = container_of(kvm->arch.active_mmu_pages.next,
- struct kvm_mmu_page, link);
+ goto restart;
+
spin_unlock(&kvm->mmu_lock);
kvm_flush_remote_tlbs(kvm);
}
-static void kvm_mmu_remove_one_alloc_mmu_page(struct kvm *kvm)
+static int kvm_mmu_remove_some_alloc_mmu_pages(struct kvm *kvm)
{
struct kvm_mmu_page *page;
page = container_of(kvm->arch.active_mmu_pages.prev,
struct kvm_mmu_page, link);
- kvm_mmu_zap_page(kvm, page);
+ return kvm_mmu_zap_page(kvm, page) + 1;
}
static int mmu_shrink(int nr_to_scan, gfp_t gfp_mask)
@@ -2927,7 +2928,7 @@ static int mmu_shrink(int nr_to_scan, gfp_t gfp_mask)
spin_lock(&kvm_lock);
list_for_each_entry(kvm, &vm_list, vm_list) {
- int npages, idx;
+ int npages, idx, freed_pages;
idx = srcu_read_lock(&kvm->srcu);
spin_lock(&kvm->mmu_lock);
@@ -2935,8 +2936,8 @@ static int mmu_shrink(int nr_to_scan, gfp_t gfp_mask)
kvm->arch.n_free_mmu_pages;
cache_count += npages;
if (!kvm_freed && nr_to_scan > 0 && npages > 0) {
- kvm_mmu_remove_one_alloc_mmu_page(kvm);
- cache_count--;
+ freed_pages = kvm_mmu_remove_some_alloc_mmu_pages(kvm);
+ cache_count -= freed_pages;
kvm_freed = kvm;
}
nr_to_scan--;
@@ -3011,7 +3012,8 @@ unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm)
unsigned int nr_pages = 0;
struct kvm_memslots *slots;
- slots = rcu_dereference(kvm->memslots);
+ slots = kvm_memslots(kvm);
+
for (i = 0; i < slots->nmemslots; i++)
nr_pages += slots->memslots[i].npages;
@@ -3174,8 +3176,7 @@ static gva_t canonicalize(gva_t gva)
}
-typedef void (*inspect_spte_fn) (struct kvm *kvm, struct kvm_mmu_page *sp,
- u64 *sptep);
+typedef void (*inspect_spte_fn) (struct kvm *kvm, u64 *sptep);
static void __mmu_spte_walk(struct kvm *kvm, struct kvm_mmu_page *sp,
inspect_spte_fn fn)
@@ -3191,7 +3192,7 @@ static void __mmu_spte_walk(struct kvm *kvm, struct kvm_mmu_page *sp,
child = page_header(ent & PT64_BASE_ADDR_MASK);
__mmu_spte_walk(kvm, child, fn);
} else
- fn(kvm, sp, &sp->spt[i]);
+ fn(kvm, &sp->spt[i]);
}
}
}
@@ -3282,11 +3283,13 @@ static void audit_mappings(struct kvm_vcpu *vcpu)
static int count_rmaps(struct kvm_vcpu *vcpu)
{
+ struct kvm *kvm = vcpu->kvm;
+ struct kvm_memslots *slots;
int nmaps = 0;
int i, j, k, idx;
idx = srcu_read_lock(&kvm->srcu);
- slots = rcu_dereference(kvm->memslots);
+ slots = kvm_memslots(kvm);
for (i = 0; i < KVM_MEMORY_SLOTS; ++i) {
struct kvm_memory_slot *m = &slots->memslots[i];
struct kvm_rmap_desc *d;
@@ -3315,7 +3318,7 @@ static int count_rmaps(struct kvm_vcpu *vcpu)
return nmaps;
}
-void inspect_spte_has_rmap(struct kvm *kvm, struct kvm_mmu_page *sp, u64 *sptep)
+void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep)
{
unsigned long *rmapp;
struct kvm_mmu_page *rev_sp;
@@ -3331,14 +3334,14 @@ void inspect_spte_has_rmap(struct kvm *kvm, struct kvm_mmu_page *sp, u64 *sptep)
printk(KERN_ERR "%s: no memslot for gfn %ld\n",
audit_msg, gfn);
printk(KERN_ERR "%s: index %ld of sp (gfn=%lx)\n",
- audit_msg, sptep - rev_sp->spt,
+ audit_msg, (long int)(sptep - rev_sp->spt),
rev_sp->gfn);
dump_stack();
return;
}
rmapp = gfn_to_rmap(kvm, rev_sp->gfns[sptep - rev_sp->spt],
- is_large_pte(*sptep));
+ rev_sp->role.level);
if (!*rmapp) {
if (!printk_ratelimit())
return;
@@ -3373,7 +3376,7 @@ static void check_writable_mappings_rmap(struct kvm_vcpu *vcpu)
continue;
if (!(ent & PT_WRITABLE_MASK))
continue;
- inspect_spte_has_rmap(vcpu->kvm, sp, &pt[i]);
+ inspect_spte_has_rmap(vcpu->kvm, &pt[i]);
}
}
return;
diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h
index 3e4a5c6..42f07b1 100644
--- a/arch/x86/kvm/mmutrace.h
+++ b/arch/x86/kvm/mmutrace.h
@@ -6,14 +6,12 @@
#undef TRACE_SYSTEM
#define TRACE_SYSTEM kvmmmu
-#define TRACE_INCLUDE_PATH .
-#define TRACE_INCLUDE_FILE mmutrace
#define KVM_MMU_PAGE_FIELDS \
__field(__u64, gfn) \
__field(__u32, role) \
__field(__u32, root_count) \
- __field(__u32, unsync)
+ __field(bool, unsync)
#define KVM_MMU_PAGE_ASSIGN(sp) \
__entry->gfn = sp->gfn; \
@@ -30,14 +28,14 @@
\
role.word = __entry->role; \
\
- trace_seq_printf(p, "sp gfn %llx %u/%u q%u%s %s%s %spge" \
+ trace_seq_printf(p, "sp gfn %llx %u%s q%u%s %s%s" \
" %snxe root %u %s%c", \
- __entry->gfn, role.level, role.glevels, \
+ __entry->gfn, role.level, \
+ role.cr4_pae ? " pae" : "", \
role.quadrant, \
role.direct ? " direct" : "", \
access_str[role.access], \
role.invalid ? " invalid" : "", \
- role.cr4_pge ? "" : "!", \
role.nxe ? "" : "!", \
__entry->root_count, \
__entry->unsync ? "unsync" : "sync", 0); \
@@ -94,15 +92,15 @@ TRACE_EVENT(
TP_printk("pte %llx level %u", __entry->pte, __entry->level)
);
-/* We set a pte accessed bit */
-TRACE_EVENT(
- kvm_mmu_set_accessed_bit,
+DECLARE_EVENT_CLASS(kvm_mmu_set_bit_class,
+
TP_PROTO(unsigned long table_gfn, unsigned index, unsigned size),
+
TP_ARGS(table_gfn, index, size),
TP_STRUCT__entry(
__field(__u64, gpa)
- ),
+ ),
TP_fast_assign(
__entry->gpa = ((u64)table_gfn << PAGE_SHIFT)
@@ -112,22 +110,20 @@ TRACE_EVENT(
TP_printk("gpa %llx", __entry->gpa)
);
-/* We set a pte dirty bit */
-TRACE_EVENT(
- kvm_mmu_set_dirty_bit,
+/* We set a pte accessed bit */
+DEFINE_EVENT(kvm_mmu_set_bit_class, kvm_mmu_set_accessed_bit,
+
TP_PROTO(unsigned long table_gfn, unsigned index, unsigned size),
- TP_ARGS(table_gfn, index, size),
- TP_STRUCT__entry(
- __field(__u64, gpa)
- ),
+ TP_ARGS(table_gfn, index, size)
+);
- TP_fast_assign(
- __entry->gpa = ((u64)table_gfn << PAGE_SHIFT)
- + index * size;
- ),
+/* We set a pte dirty bit */
+DEFINE_EVENT(kvm_mmu_set_bit_class, kvm_mmu_set_dirty_bit,
- TP_printk("gpa %llx", __entry->gpa)
+ TP_PROTO(unsigned long table_gfn, unsigned index, unsigned size),
+
+ TP_ARGS(table_gfn, index, size)
);
TRACE_EVENT(
@@ -166,55 +162,45 @@ TRACE_EVENT(
__entry->created ? "new" : "existing")
);
-TRACE_EVENT(
- kvm_mmu_sync_page,
+DECLARE_EVENT_CLASS(kvm_mmu_page_class,
+
TP_PROTO(struct kvm_mmu_page *sp),
TP_ARGS(sp),
TP_STRUCT__entry(
KVM_MMU_PAGE_FIELDS
- ),
+ ),
TP_fast_assign(
KVM_MMU_PAGE_ASSIGN(sp)
- ),
+ ),
TP_printk("%s", KVM_MMU_PAGE_PRINTK())
);
-TRACE_EVENT(
- kvm_mmu_unsync_page,
+DEFINE_EVENT(kvm_mmu_page_class, kvm_mmu_sync_page,
TP_PROTO(struct kvm_mmu_page *sp),
- TP_ARGS(sp),
-
- TP_STRUCT__entry(
- KVM_MMU_PAGE_FIELDS
- ),
- TP_fast_assign(
- KVM_MMU_PAGE_ASSIGN(sp)
- ),
-
- TP_printk("%s", KVM_MMU_PAGE_PRINTK())
+ TP_ARGS(sp)
);
-TRACE_EVENT(
- kvm_mmu_zap_page,
+DEFINE_EVENT(kvm_mmu_page_class, kvm_mmu_unsync_page,
TP_PROTO(struct kvm_mmu_page *sp),
- TP_ARGS(sp),
- TP_STRUCT__entry(
- KVM_MMU_PAGE_FIELDS
- ),
+ TP_ARGS(sp)
+);
- TP_fast_assign(
- KVM_MMU_PAGE_ASSIGN(sp)
- ),
+DEFINE_EVENT(kvm_mmu_page_class, kvm_mmu_zap_page,
+ TP_PROTO(struct kvm_mmu_page *sp),
- TP_printk("%s", KVM_MMU_PAGE_PRINTK())
+ TP_ARGS(sp)
);
-
#endif /* _TRACE_KVMMMU_H */
+#undef TRACE_INCLUDE_PATH
+#define TRACE_INCLUDE_PATH .
+#undef TRACE_INCLUDE_FILE
+#define TRACE_INCLUDE_FILE mmutrace
+
/* This part must be outside protection */
#include <trace/define_trace.h>
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 81eab9a..89d66ca 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -170,7 +170,7 @@ walk:
goto access_error;
#if PTTYPE == 64
- if (fetch_fault && is_nx(vcpu) && (pte & PT64_NX_MASK))
+ if (fetch_fault && (pte & PT64_NX_MASK))
goto access_error;
#endif
@@ -190,10 +190,10 @@ walk:
if ((walker->level == PT_PAGE_TABLE_LEVEL) ||
((walker->level == PT_DIRECTORY_LEVEL) &&
- (pte & PT_PAGE_SIZE_MASK) &&
+ is_large_pte(pte) &&
(PTTYPE == 64 || is_pse(vcpu))) ||
((walker->level == PT_PDPE_LEVEL) &&
- (pte & PT_PAGE_SIZE_MASK) &&
+ is_large_pte(pte) &&
is_long_mode(vcpu))) {
int lvl = walker->level;
@@ -258,11 +258,17 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page,
pt_element_t gpte;
unsigned pte_access;
pfn_t pfn;
+ u64 new_spte;
gpte = *(const pt_element_t *)pte;
if (~gpte & (PT_PRESENT_MASK | PT_ACCESSED_MASK)) {
- if (!is_present_gpte(gpte))
- __set_spte(spte, shadow_notrap_nonpresent_pte);
+ if (!is_present_gpte(gpte)) {
+ if (page->unsync)
+ new_spte = shadow_trap_nonpresent_pte;
+ else
+ new_spte = shadow_notrap_nonpresent_pte;
+ __set_spte(spte, new_spte);
+ }
return;
}
pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte);
@@ -457,6 +463,7 @@ out_unlock:
static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
{
struct kvm_shadow_walk_iterator iterator;
+ gpa_t pte_gpa = -1;
int level;
u64 *sptep;
int need_flush = 0;
@@ -467,9 +474,16 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
level = iterator.level;
sptep = iterator.sptep;
- if (level == PT_PAGE_TABLE_LEVEL ||
- ((level == PT_DIRECTORY_LEVEL && is_large_pte(*sptep))) ||
- ((level == PT_PDPE_LEVEL && is_large_pte(*sptep)))) {
+ if (is_last_spte(*sptep, level)) {
+ struct kvm_mmu_page *sp = page_header(__pa(sptep));
+ int offset, shift;
+
+ shift = PAGE_SHIFT -
+ (PT_LEVEL_BITS - PT64_LEVEL_BITS) * level;
+ offset = sp->role.quadrant << shift;
+
+ pte_gpa = (sp->gfn << PAGE_SHIFT) + offset;
+ pte_gpa += (sptep - sp->spt) * sizeof(pt_element_t);
if (is_shadow_present_pte(*sptep)) {
rmap_remove(vcpu->kvm, sptep);
@@ -487,7 +501,17 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
if (need_flush)
kvm_flush_remote_tlbs(vcpu->kvm);
+
+ atomic_inc(&vcpu->kvm->arch.invlpg_counter);
+
spin_unlock(&vcpu->kvm->mmu_lock);
+
+ if (pte_gpa == -1)
+ return;
+
+ if (mmu_topup_memory_caches(vcpu))
+ return;
+ kvm_mmu_pte_write(vcpu, pte_gpa, NULL, sizeof(pt_element_t), 0);
}
static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr, u32 access,
@@ -551,12 +575,15 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
{
int i, offset, nr_present;
bool reset_host_protection;
+ gpa_t first_pte_gpa;
offset = nr_present = 0;
if (PTTYPE == 32)
offset = sp->role.quadrant << PT64_LEVEL_BITS;
+ first_pte_gpa = gfn_to_gpa(sp->gfn) + offset * sizeof(pt_element_t);
+
for (i = 0; i < PT64_ENT_PER_PAGE; i++) {
unsigned pte_access;
pt_element_t gpte;
@@ -566,8 +593,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
if (!is_shadow_present_pte(sp->spt[i]))
continue;
- pte_gpa = gfn_to_gpa(sp->gfn);
- pte_gpa += (i+offset) * sizeof(pt_element_t);
+ pte_gpa = first_pte_gpa + i * sizeof(pt_element_t);
if (kvm_read_guest_atomic(vcpu->kvm, pte_gpa, &gpte,
sizeof(pt_element_t)))
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 737361f..96dc232 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -44,10 +44,11 @@ MODULE_LICENSE("GPL");
#define SEG_TYPE_LDT 2
#define SEG_TYPE_BUSY_TSS16 3
-#define SVM_FEATURE_NPT (1 << 0)
-#define SVM_FEATURE_LBRV (1 << 1)
-#define SVM_FEATURE_SVML (1 << 2)
-#define SVM_FEATURE_PAUSE_FILTER (1 << 10)
+#define SVM_FEATURE_NPT (1 << 0)
+#define SVM_FEATURE_LBRV (1 << 1)
+#define SVM_FEATURE_SVML (1 << 2)
+#define SVM_FEATURE_NRIP (1 << 3)
+#define SVM_FEATURE_PAUSE_FILTER (1 << 10)
#define NESTED_EXIT_HOST 0 /* Exit handled on host level */
#define NESTED_EXIT_DONE 1 /* Exit caused nested vmexit */
@@ -70,6 +71,7 @@ struct kvm_vcpu;
struct nested_state {
struct vmcb *hsave;
u64 hsave_msr;
+ u64 vm_cr_msr;
u64 vmcb;
/* These are the merged vectors */
@@ -77,6 +79,7 @@ struct nested_state {
/* gpa pointers to the real vectors */
u64 vmcb_msrpm;
+ u64 vmcb_iopm;
/* A VMEXIT is required but not yet emulated */
bool exit_required;
@@ -91,6 +94,9 @@ struct nested_state {
};
+#define MSRPM_OFFSETS 16
+static u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly;
+
struct vcpu_svm {
struct kvm_vcpu vcpu;
struct vmcb *vmcb;
@@ -110,13 +116,39 @@ struct vcpu_svm {
struct nested_state nested;
bool nmi_singlestep;
+
+ unsigned int3_injected;
+ unsigned long int3_rip;
+};
+
+#define MSR_INVALID 0xffffffffU
+
+static struct svm_direct_access_msrs {
+ u32 index; /* Index of the MSR */
+ bool always; /* True if intercept is always on */
+} direct_access_msrs[] = {
+ { .index = MSR_K6_STAR, .always = true },
+ { .index = MSR_IA32_SYSENTER_CS, .always = true },
+#ifdef CONFIG_X86_64
+ { .index = MSR_GS_BASE, .always = true },
+ { .index = MSR_FS_BASE, .always = true },
+ { .index = MSR_KERNEL_GS_BASE, .always = true },
+ { .index = MSR_LSTAR, .always = true },
+ { .index = MSR_CSTAR, .always = true },
+ { .index = MSR_SYSCALL_MASK, .always = true },
+#endif
+ { .index = MSR_IA32_LASTBRANCHFROMIP, .always = false },
+ { .index = MSR_IA32_LASTBRANCHTOIP, .always = false },
+ { .index = MSR_IA32_LASTINTFROMIP, .always = false },
+ { .index = MSR_IA32_LASTINTTOIP, .always = false },
+ { .index = MSR_INVALID, .always = false },
};
/* enable NPT for AMD64 and X86 with PAE */
#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
static bool npt_enabled = true;
#else
-static bool npt_enabled = false;
+static bool npt_enabled;
#endif
static int npt = 1;
@@ -129,6 +161,7 @@ static void svm_flush_tlb(struct kvm_vcpu *vcpu);
static void svm_complete_interrupts(struct vcpu_svm *svm);
static int nested_svm_exit_handled(struct vcpu_svm *svm);
+static int nested_svm_intercept(struct vcpu_svm *svm);
static int nested_svm_vmexit(struct vcpu_svm *svm);
static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr,
bool has_error_code, u32 error_code);
@@ -163,8 +196,8 @@ static unsigned long iopm_base;
struct kvm_ldttss_desc {
u16 limit0;
u16 base0;
- unsigned base1 : 8, type : 5, dpl : 2, p : 1;
- unsigned limit1 : 4, zero0 : 3, g : 1, base2 : 8;
+ unsigned base1:8, type:5, dpl:2, p:1;
+ unsigned limit1:4, zero0:3, g:1, base2:8;
u32 base3;
u32 zero1;
} __attribute__((packed));
@@ -194,6 +227,27 @@ static u32 msrpm_ranges[] = {0, 0xc0000000, 0xc0010000};
#define MSRS_RANGE_SIZE 2048
#define MSRS_IN_RANGE (MSRS_RANGE_SIZE * 8 / 2)
+static u32 svm_msrpm_offset(u32 msr)
+{
+ u32 offset;
+ int i;
+
+ for (i = 0; i < NUM_MSR_MAPS; i++) {
+ if (msr < msrpm_ranges[i] ||
+ msr >= msrpm_ranges[i] + MSRS_IN_RANGE)
+ continue;
+
+ offset = (msr - msrpm_ranges[i]) / 4; /* 4 msrs per u8 */
+ offset += (i * MSRS_RANGE_SIZE); /* add range offset */
+
+ /* Now we have the u8 offset - but need the u32 offset */
+ return offset / 4;
+ }
+
+ /* MSR not in any range */
+ return MSR_INVALID;
+}
+
#define MAX_INST_SIZE 15
static inline u32 svm_has(u32 feat)
@@ -213,7 +267,7 @@ static inline void stgi(void)
static inline void invlpga(unsigned long addr, u32 asid)
{
- asm volatile (__ex(SVM_INVLPGA) :: "a"(addr), "c"(asid));
+ asm volatile (__ex(SVM_INVLPGA) : : "a"(addr), "c"(asid));
}
static inline void force_new_asid(struct kvm_vcpu *vcpu)
@@ -235,23 +289,6 @@ static void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
vcpu->arch.efer = efer;
}
-static void svm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
- bool has_error_code, u32 error_code)
-{
- struct vcpu_svm *svm = to_svm(vcpu);
-
- /* If we are within a nested VM we'd better #VMEXIT and let the
- guest handle the exception */
- if (nested_svm_check_exception(svm, nr, has_error_code, error_code))
- return;
-
- svm->vmcb->control.event_inj = nr
- | SVM_EVTINJ_VALID
- | (has_error_code ? SVM_EVTINJ_VALID_ERR : 0)
- | SVM_EVTINJ_TYPE_EXEPT;
- svm->vmcb->control.event_inj_err = error_code;
-}
-
static int is_external_interrupt(u32 info)
{
info &= SVM_EVTINJ_TYPE_MASK | SVM_EVTINJ_VALID;
@@ -264,7 +301,7 @@ static u32 svm_get_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
u32 ret = 0;
if (svm->vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK)
- ret |= X86_SHADOW_INT_STI | X86_SHADOW_INT_MOV_SS;
+ ret |= KVM_X86_SHADOW_INT_STI | KVM_X86_SHADOW_INT_MOV_SS;
return ret & mask;
}
@@ -283,6 +320,9 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
+ if (svm->vmcb->control.next_rip != 0)
+ svm->next_rip = svm->vmcb->control.next_rip;
+
if (!svm->next_rip) {
if (emulate_instruction(vcpu, 0, 0, EMULTYPE_SKIP) !=
EMULATE_DONE)
@@ -297,6 +337,43 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
svm_set_interrupt_shadow(vcpu, 0);
}
+static void svm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
+ bool has_error_code, u32 error_code,
+ bool reinject)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ /*
+ * If we are within a nested VM we'd better #VMEXIT and let the guest
+ * handle the exception
+ */
+ if (!reinject &&
+ nested_svm_check_exception(svm, nr, has_error_code, error_code))
+ return;
+
+ if (nr == BP_VECTOR && !svm_has(SVM_FEATURE_NRIP)) {
+ unsigned long rip, old_rip = kvm_rip_read(&svm->vcpu);
+
+ /*
+ * For guest debugging where we have to reinject #BP if some
+ * INT3 is guest-owned:
+ * Emulate nRIP by moving RIP forward. Will fail if injection
+ * raises a fault that is not intercepted. Still better than
+ * failing in all cases.
+ */
+ skip_emulated_instruction(&svm->vcpu);
+ rip = kvm_rip_read(&svm->vcpu);
+ svm->int3_rip = rip + svm->vmcb->save.cs.base;
+ svm->int3_injected = rip - old_rip;
+ }
+
+ svm->vmcb->control.event_inj = nr
+ | SVM_EVTINJ_VALID
+ | (has_error_code ? SVM_EVTINJ_VALID_ERR : 0)
+ | SVM_EVTINJ_TYPE_EXEPT;
+ svm->vmcb->control.event_inj_err = error_code;
+}
+
static int has_svm(void)
{
const char *msg;
@@ -319,7 +396,7 @@ static int svm_hardware_enable(void *garbage)
struct svm_cpu_data *sd;
uint64_t efer;
- struct descriptor_table gdt_descr;
+ struct desc_ptr gdt_descr;
struct desc_struct *gdt;
int me = raw_smp_processor_id();
@@ -344,8 +421,8 @@ static int svm_hardware_enable(void *garbage)
sd->max_asid = cpuid_ebx(SVM_CPUID_FUNC) - 1;
sd->next_asid = sd->max_asid + 1;
- kvm_get_gdt(&gdt_descr);
- gdt = (struct desc_struct *)gdt_descr.base;
+ native_store_gdt(&gdt_descr);
+ gdt = (struct desc_struct *)gdt_descr.address;
sd->tss_desc = (struct kvm_ldttss_desc *)(gdt + GDT_ENTRY_TSS);
wrmsrl(MSR_EFER, efer | EFER_SVME);
@@ -391,42 +468,98 @@ err_1:
}
+static bool valid_msr_intercept(u32 index)
+{
+ int i;
+
+ for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++)
+ if (direct_access_msrs[i].index == index)
+ return true;
+
+ return false;
+}
+
static void set_msr_interception(u32 *msrpm, unsigned msr,
int read, int write)
{
+ u8 bit_read, bit_write;
+ unsigned long tmp;
+ u32 offset;
+
+ /*
+ * If this warning triggers extend the direct_access_msrs list at the
+ * beginning of the file
+ */
+ WARN_ON(!valid_msr_intercept(msr));
+
+ offset = svm_msrpm_offset(msr);
+ bit_read = 2 * (msr & 0x0f);
+ bit_write = 2 * (msr & 0x0f) + 1;
+ tmp = msrpm[offset];
+
+ BUG_ON(offset == MSR_INVALID);
+
+ read ? clear_bit(bit_read, &tmp) : set_bit(bit_read, &tmp);
+ write ? clear_bit(bit_write, &tmp) : set_bit(bit_write, &tmp);
+
+ msrpm[offset] = tmp;
+}
+
+static void svm_vcpu_init_msrpm(u32 *msrpm)
+{
int i;
- for (i = 0; i < NUM_MSR_MAPS; i++) {
- if (msr >= msrpm_ranges[i] &&
- msr < msrpm_ranges[i] + MSRS_IN_RANGE) {
- u32 msr_offset = (i * MSRS_IN_RANGE + msr -
- msrpm_ranges[i]) * 2;
-
- u32 *base = msrpm + (msr_offset / 32);
- u32 msr_shift = msr_offset % 32;
- u32 mask = ((write) ? 0 : 2) | ((read) ? 0 : 1);
- *base = (*base & ~(0x3 << msr_shift)) |
- (mask << msr_shift);
+ memset(msrpm, 0xff, PAGE_SIZE * (1 << MSRPM_ALLOC_ORDER));
+
+ for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) {
+ if (!direct_access_msrs[i].always)
+ continue;
+
+ set_msr_interception(msrpm, direct_access_msrs[i].index, 1, 1);
+ }
+}
+
+static void add_msr_offset(u32 offset)
+{
+ int i;
+
+ for (i = 0; i < MSRPM_OFFSETS; ++i) {
+
+ /* Offset already in list? */
+ if (msrpm_offsets[i] == offset)
return;
- }
+
+ /* Slot used by another offset? */
+ if (msrpm_offsets[i] != MSR_INVALID)
+ continue;
+
+ /* Add offset to list */
+ msrpm_offsets[i] = offset;
+
+ return;
}
+
+ /*
+ * If this BUG triggers the msrpm_offsets table has an overflow. Just
+ * increase MSRPM_OFFSETS in this case.
+ */
BUG();
}
-static void svm_vcpu_init_msrpm(u32 *msrpm)
+static void init_msrpm_offsets(void)
{
- memset(msrpm, 0xff, PAGE_SIZE * (1 << MSRPM_ALLOC_ORDER));
+ int i;
-#ifdef CONFIG_X86_64
- set_msr_interception(msrpm, MSR_GS_BASE, 1, 1);
- set_msr_interception(msrpm, MSR_FS_BASE, 1, 1);
- set_msr_interception(msrpm, MSR_KERNEL_GS_BASE, 1, 1);
- set_msr_interception(msrpm, MSR_LSTAR, 1, 1);
- set_msr_interception(msrpm, MSR_CSTAR, 1, 1);
- set_msr_interception(msrpm, MSR_SYSCALL_MASK, 1, 1);
-#endif
- set_msr_interception(msrpm, MSR_K6_STAR, 1, 1);
- set_msr_interception(msrpm, MSR_IA32_SYSENTER_CS, 1, 1);
+ memset(msrpm_offsets, 0xff, sizeof(msrpm_offsets));
+
+ for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) {
+ u32 offset;
+
+ offset = svm_msrpm_offset(direct_access_msrs[i].index);
+ BUG_ON(offset == MSR_INVALID);
+
+ add_msr_offset(offset);
+ }
}
static void svm_enable_lbrv(struct vcpu_svm *svm)
@@ -467,6 +600,8 @@ static __init int svm_hardware_setup(void)
memset(iopm_va, 0xff, PAGE_SIZE * (1 << IOPM_ALLOC_ORDER));
iopm_base = page_to_pfn(iopm_pages) << PAGE_SHIFT;
+ init_msrpm_offsets();
+
if (boot_cpu_has(X86_FEATURE_NX))
kvm_enable_efer_bits(EFER_NX);
@@ -523,7 +658,7 @@ static void init_seg(struct vmcb_seg *seg)
{
seg->selector = 0;
seg->attrib = SVM_SELECTOR_P_MASK | SVM_SELECTOR_S_MASK |
- SVM_SELECTOR_WRITE_MASK; /* Read/Write Data Segment */
+ SVM_SELECTOR_WRITE_MASK; /* Read/Write Data Segment */
seg->limit = 0xffff;
seg->base = 0;
}
@@ -543,16 +678,16 @@ static void init_vmcb(struct vcpu_svm *svm)
svm->vcpu.fpu_active = 1;
- control->intercept_cr_read = INTERCEPT_CR0_MASK |
+ control->intercept_cr_read = INTERCEPT_CR0_MASK |
INTERCEPT_CR3_MASK |
INTERCEPT_CR4_MASK;
- control->intercept_cr_write = INTERCEPT_CR0_MASK |
+ control->intercept_cr_write = INTERCEPT_CR0_MASK |
INTERCEPT_CR3_MASK |
INTERCEPT_CR4_MASK |
INTERCEPT_CR8_MASK;
- control->intercept_dr_read = INTERCEPT_DR0_MASK |
+ control->intercept_dr_read = INTERCEPT_DR0_MASK |
INTERCEPT_DR1_MASK |
INTERCEPT_DR2_MASK |
INTERCEPT_DR3_MASK |
@@ -561,7 +696,7 @@ static void init_vmcb(struct vcpu_svm *svm)
INTERCEPT_DR6_MASK |
INTERCEPT_DR7_MASK;
- control->intercept_dr_write = INTERCEPT_DR0_MASK |
+ control->intercept_dr_write = INTERCEPT_DR0_MASK |
INTERCEPT_DR1_MASK |
INTERCEPT_DR2_MASK |
INTERCEPT_DR3_MASK |
@@ -575,7 +710,7 @@ static void init_vmcb(struct vcpu_svm *svm)
(1 << MC_VECTOR);
- control->intercept = (1ULL << INTERCEPT_INTR) |
+ control->intercept = (1ULL << INTERCEPT_INTR) |
(1ULL << INTERCEPT_NMI) |
(1ULL << INTERCEPT_SMI) |
(1ULL << INTERCEPT_SELECTIVE_CR0) |
@@ -636,7 +771,8 @@ static void init_vmcb(struct vcpu_svm *svm)
save->rip = 0x0000fff0;
svm->vcpu.arch.regs[VCPU_REGS_RIP] = save->rip;
- /* This is the guest-visible cr0 value.
+ /*
+ * This is the guest-visible cr0 value.
* svm_set_cr0() sets PG and WP and clears NW and CD on save->cr0.
*/
svm->vcpu.arch.cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET;
@@ -729,6 +865,7 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
svm_vcpu_init_msrpm(svm->msrpm);
svm->nested.msrpm = page_address(nested_msrpm_pages);
+ svm_vcpu_init_msrpm(svm->nested.msrpm);
svm->vmcb = page_address(page);
clear_page(svm->vmcb);
@@ -882,7 +1019,8 @@ static void svm_get_segment(struct kvm_vcpu *vcpu,
var->db = (s->attrib >> SVM_SELECTOR_DB_SHIFT) & 1;
var->g = (s->attrib >> SVM_SELECTOR_G_SHIFT) & 1;
- /* AMD's VMCB does not have an explicit unusable field, so emulate it
+ /*
+ * AMD's VMCB does not have an explicit unusable field, so emulate it
* for cross vendor migration purposes by "not present"
*/
var->unusable = !var->present || (var->type == 0);
@@ -918,7 +1056,8 @@ static void svm_get_segment(struct kvm_vcpu *vcpu,
var->type |= 0x1;
break;
case VCPU_SREG_SS:
- /* On AMD CPUs sometimes the DB bit in the segment
+ /*
+ * On AMD CPUs sometimes the DB bit in the segment
* descriptor is left as 1, although the whole segment has
* been made unusable. Clear it here to pass an Intel VMX
* entry check when cross vendor migrating.
@@ -936,36 +1075,36 @@ static int svm_get_cpl(struct kvm_vcpu *vcpu)
return save->cpl;
}
-static void svm_get_idt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
+static void svm_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
{
struct vcpu_svm *svm = to_svm(vcpu);
- dt->limit = svm->vmcb->save.idtr.limit;
- dt->base = svm->vmcb->save.idtr.base;
+ dt->size = svm->vmcb->save.idtr.limit;
+ dt->address = svm->vmcb->save.idtr.base;
}
-static void svm_set_idt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
+static void svm_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
{
struct vcpu_svm *svm = to_svm(vcpu);
- svm->vmcb->save.idtr.limit = dt->limit;
- svm->vmcb->save.idtr.base = dt->base ;
+ svm->vmcb->save.idtr.limit = dt->size;
+ svm->vmcb->save.idtr.base = dt->address ;
}
-static void svm_get_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
+static void svm_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
{
struct vcpu_svm *svm = to_svm(vcpu);
- dt->limit = svm->vmcb->save.gdtr.limit;
- dt->base = svm->vmcb->save.gdtr.base;
+ dt->size = svm->vmcb->save.gdtr.limit;
+ dt->address = svm->vmcb->save.gdtr.base;
}
-static void svm_set_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
+static void svm_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
{
struct vcpu_svm *svm = to_svm(vcpu);
- svm->vmcb->save.gdtr.limit = dt->limit;
- svm->vmcb->save.gdtr.base = dt->base ;
+ svm->vmcb->save.gdtr.limit = dt->size;
+ svm->vmcb->save.gdtr.base = dt->address ;
}
static void svm_decache_cr0_guest_bits(struct kvm_vcpu *vcpu)
@@ -978,6 +1117,7 @@ static void svm_decache_cr4_guest_bits(struct kvm_vcpu *vcpu)
static void update_cr0_intercept(struct vcpu_svm *svm)
{
+ struct vmcb *vmcb = svm->vmcb;
ulong gcr0 = svm->vcpu.arch.cr0;
u64 *hcr0 = &svm->vmcb->save.cr0;
@@ -989,11 +1129,25 @@ static void update_cr0_intercept(struct vcpu_svm *svm)
if (gcr0 == *hcr0 && svm->vcpu.fpu_active) {
- svm->vmcb->control.intercept_cr_read &= ~INTERCEPT_CR0_MASK;
- svm->vmcb->control.intercept_cr_write &= ~INTERCEPT_CR0_MASK;
+ vmcb->control.intercept_cr_read &= ~INTERCEPT_CR0_MASK;
+ vmcb->control.intercept_cr_write &= ~INTERCEPT_CR0_MASK;
+ if (is_nested(svm)) {
+ struct vmcb *hsave = svm->nested.hsave;
+
+ hsave->control.intercept_cr_read &= ~INTERCEPT_CR0_MASK;
+ hsave->control.intercept_cr_write &= ~INTERCEPT_CR0_MASK;
+ vmcb->control.intercept_cr_read |= svm->nested.intercept_cr_read;
+ vmcb->control.intercept_cr_write |= svm->nested.intercept_cr_write;
+ }
} else {
svm->vmcb->control.intercept_cr_read |= INTERCEPT_CR0_MASK;
svm->vmcb->control.intercept_cr_write |= INTERCEPT_CR0_MASK;
+ if (is_nested(svm)) {
+ struct vmcb *hsave = svm->nested.hsave;
+
+ hsave->control.intercept_cr_read |= INTERCEPT_CR0_MASK;
+ hsave->control.intercept_cr_write |= INTERCEPT_CR0_MASK;
+ }
}
}
@@ -1001,6 +1155,27 @@ static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
{
struct vcpu_svm *svm = to_svm(vcpu);
+ if (is_nested(svm)) {
+ /*
+ * We are here because we run in nested mode, the host kvm
+ * intercepts cr0 writes but the l1 hypervisor does not.
+ * But the L1 hypervisor may intercept selective cr0 writes.
+ * This needs to be checked here.
+ */
+ unsigned long old, new;
+
+ /* Remove bits that would trigger a real cr0 write intercept */
+ old = vcpu->arch.cr0 & SVM_CR0_SELECTIVE_MASK;
+ new = cr0 & SVM_CR0_SELECTIVE_MASK;
+
+ if (old == new) {
+ /* cr0 write with ts and mp unchanged */
+ svm->vmcb->control.exit_code = SVM_EXIT_CR0_SEL_WRITE;
+ if (nested_svm_exit_handled(svm) == NESTED_EXIT_DONE)
+ return;
+ }
+ }
+
#ifdef CONFIG_X86_64
if (vcpu->arch.efer & EFER_LME) {
if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
@@ -1134,70 +1309,11 @@ static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *sd)
svm->vmcb->control.asid = sd->next_asid++;
}
-static int svm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *dest)
+static void svm_set_dr7(struct kvm_vcpu *vcpu, unsigned long value)
{
struct vcpu_svm *svm = to_svm(vcpu);
- switch (dr) {
- case 0 ... 3:
- *dest = vcpu->arch.db[dr];
- break;
- case 4:
- if (kvm_read_cr4_bits(vcpu, X86_CR4_DE))
- return EMULATE_FAIL; /* will re-inject UD */
- /* fall through */
- case 6:
- if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
- *dest = vcpu->arch.dr6;
- else
- *dest = svm->vmcb->save.dr6;
- break;
- case 5:
- if (kvm_read_cr4_bits(vcpu, X86_CR4_DE))
- return EMULATE_FAIL; /* will re-inject UD */
- /* fall through */
- case 7:
- if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
- *dest = vcpu->arch.dr7;
- else
- *dest = svm->vmcb->save.dr7;
- break;
- }
-
- return EMULATE_DONE;
-}
-
-static int svm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long value)
-{
- struct vcpu_svm *svm = to_svm(vcpu);
-
- switch (dr) {
- case 0 ... 3:
- vcpu->arch.db[dr] = value;
- if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP))
- vcpu->arch.eff_db[dr] = value;
- break;
- case 4:
- if (kvm_read_cr4_bits(vcpu, X86_CR4_DE))
- return EMULATE_FAIL; /* will re-inject UD */
- /* fall through */
- case 6:
- vcpu->arch.dr6 = (value & DR6_VOLATILE) | DR6_FIXED_1;
- break;
- case 5:
- if (kvm_read_cr4_bits(vcpu, X86_CR4_DE))
- return EMULATE_FAIL; /* will re-inject UD */
- /* fall through */
- case 7:
- vcpu->arch.dr7 = (value & DR7_VOLATILE) | DR7_FIXED_1;
- if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) {
- svm->vmcb->save.dr7 = vcpu->arch.dr7;
- vcpu->arch.switch_db_regs = (value & DR7_BP_EN_MASK);
- }
- break;
- }
-
- return EMULATE_DONE;
+ svm->vmcb->save.dr7 = value;
}
static int pf_interception(struct vcpu_svm *svm)
@@ -1234,7 +1350,7 @@ static int db_interception(struct vcpu_svm *svm)
}
if (svm->vcpu.guest_debug &
- (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)){
+ (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) {
kvm_run->exit_reason = KVM_EXIT_DEBUG;
kvm_run->debug.arch.pc =
svm->vmcb->save.cs.base + svm->vmcb->save.rip;
@@ -1268,7 +1384,22 @@ static int ud_interception(struct vcpu_svm *svm)
static void svm_fpu_activate(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
- svm->vmcb->control.intercept_exceptions &= ~(1 << NM_VECTOR);
+ u32 excp;
+
+ if (is_nested(svm)) {
+ u32 h_excp, n_excp;
+
+ h_excp = svm->nested.hsave->control.intercept_exceptions;
+ n_excp = svm->nested.intercept_exceptions;
+ h_excp &= ~(1 << NM_VECTOR);
+ excp = h_excp | n_excp;
+ } else {
+ excp = svm->vmcb->control.intercept_exceptions;
+ excp &= ~(1 << NM_VECTOR);
+ }
+
+ svm->vmcb->control.intercept_exceptions = excp;
+
svm->vcpu.fpu_active = 1;
update_cr0_intercept(svm);
}
@@ -1309,29 +1440,23 @@ static int shutdown_interception(struct vcpu_svm *svm)
static int io_interception(struct vcpu_svm *svm)
{
+ struct kvm_vcpu *vcpu = &svm->vcpu;
u32 io_info = svm->vmcb->control.exit_info_1; /* address size bug? */
int size, in, string;
unsigned port;
++svm->vcpu.stat.io_exits;
-
- svm->next_rip = svm->vmcb->control.exit_info_2;
-
string = (io_info & SVM_IOIO_STR_MASK) != 0;
-
- if (string) {
- if (emulate_instruction(&svm->vcpu,
- 0, 0, 0) == EMULATE_DO_MMIO)
- return 0;
- return 1;
- }
-
in = (io_info & SVM_IOIO_TYPE_MASK) != 0;
+ if (string || in)
+ return !(emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DO_MMIO);
+
port = io_info >> 16;
size = (io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT;
-
+ svm->next_rip = svm->vmcb->control.exit_info_2;
skip_emulated_instruction(&svm->vcpu);
- return kvm_emulate_pio(&svm->vcpu, in, size, port);
+
+ return kvm_fast_pio_out(vcpu, size, port);
}
static int nmi_interception(struct vcpu_svm *svm)
@@ -1384,6 +1509,8 @@ static int nested_svm_check_permissions(struct vcpu_svm *svm)
static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr,
bool has_error_code, u32 error_code)
{
+ int vmexit;
+
if (!is_nested(svm))
return 0;
@@ -1392,21 +1519,28 @@ static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr,
svm->vmcb->control.exit_info_1 = error_code;
svm->vmcb->control.exit_info_2 = svm->vcpu.arch.cr2;
- return nested_svm_exit_handled(svm);
+ vmexit = nested_svm_intercept(svm);
+ if (vmexit == NESTED_EXIT_DONE)
+ svm->nested.exit_required = true;
+
+ return vmexit;
}
-static inline int nested_svm_intr(struct vcpu_svm *svm)
+/* This function returns true if it is save to enable the irq window */
+static inline bool nested_svm_intr(struct vcpu_svm *svm)
{
if (!is_nested(svm))
- return 0;
+ return true;
if (!(svm->vcpu.arch.hflags & HF_VINTR_MASK))
- return 0;
+ return true;
if (!(svm->vcpu.arch.hflags & HF_HIF_MASK))
- return 0;
+ return false;
- svm->vmcb->control.exit_code = SVM_EXIT_INTR;
+ svm->vmcb->control.exit_code = SVM_EXIT_INTR;
+ svm->vmcb->control.exit_info_1 = 0;
+ svm->vmcb->control.exit_info_2 = 0;
if (svm->nested.intercept & 1ULL) {
/*
@@ -1417,21 +1551,40 @@ static inline int nested_svm_intr(struct vcpu_svm *svm)
*/
svm->nested.exit_required = true;
trace_kvm_nested_intr_vmexit(svm->vmcb->save.rip);
- return 1;
+ return false;
}
- return 0;
+ return true;
+}
+
+/* This function returns true if it is save to enable the nmi window */
+static inline bool nested_svm_nmi(struct vcpu_svm *svm)
+{
+ if (!is_nested(svm))
+ return true;
+
+ if (!(svm->nested.intercept & (1ULL << INTERCEPT_NMI)))
+ return true;
+
+ svm->vmcb->control.exit_code = SVM_EXIT_NMI;
+ svm->nested.exit_required = true;
+
+ return false;
}
-static void *nested_svm_map(struct vcpu_svm *svm, u64 gpa, enum km_type idx)
+static void *nested_svm_map(struct vcpu_svm *svm, u64 gpa, struct page **_page)
{
struct page *page;
+ might_sleep();
+
page = gfn_to_page(svm->vcpu.kvm, gpa >> PAGE_SHIFT);
if (is_error_page(page))
goto error;
- return kmap_atomic(page, idx);
+ *_page = page;
+
+ return kmap(page);
error:
kvm_release_page_clean(page);
@@ -1440,61 +1593,55 @@ error:
return NULL;
}
-static void nested_svm_unmap(void *addr, enum km_type idx)
+static void nested_svm_unmap(struct page *page)
{
- struct page *page;
+ kunmap(page);
+ kvm_release_page_dirty(page);
+}
- if (!addr)
- return;
+static int nested_svm_intercept_ioio(struct vcpu_svm *svm)
+{
+ unsigned port;
+ u8 val, bit;
+ u64 gpa;
- page = kmap_atomic_to_page(addr);
+ if (!(svm->nested.intercept & (1ULL << INTERCEPT_IOIO_PROT)))
+ return NESTED_EXIT_HOST;
- kunmap_atomic(addr, idx);
- kvm_release_page_dirty(page);
+ port = svm->vmcb->control.exit_info_1 >> 16;
+ gpa = svm->nested.vmcb_iopm + (port / 8);
+ bit = port % 8;
+ val = 0;
+
+ if (kvm_read_guest(svm->vcpu.kvm, gpa, &val, 1))
+ val &= (1 << bit);
+
+ return val ? NESTED_EXIT_DONE : NESTED_EXIT_HOST;
}
-static bool nested_svm_exit_handled_msr(struct vcpu_svm *svm)
+static int nested_svm_exit_handled_msr(struct vcpu_svm *svm)
{
- u32 param = svm->vmcb->control.exit_info_1 & 1;
- u32 msr = svm->vcpu.arch.regs[VCPU_REGS_RCX];
- bool ret = false;
- u32 t0, t1;
- u8 *msrpm;
+ u32 offset, msr, value;
+ int write, mask;
if (!(svm->nested.intercept & (1ULL << INTERCEPT_MSR_PROT)))
- return false;
+ return NESTED_EXIT_HOST;
- msrpm = nested_svm_map(svm, svm->nested.vmcb_msrpm, KM_USER0);
+ msr = svm->vcpu.arch.regs[VCPU_REGS_RCX];
+ offset = svm_msrpm_offset(msr);
+ write = svm->vmcb->control.exit_info_1 & 1;
+ mask = 1 << ((2 * (msr & 0xf)) + write);
- if (!msrpm)
- goto out;
+ if (offset == MSR_INVALID)
+ return NESTED_EXIT_DONE;
- switch (msr) {
- case 0 ... 0x1fff:
- t0 = (msr * 2) % 8;
- t1 = msr / 8;
- break;
- case 0xc0000000 ... 0xc0001fff:
- t0 = (8192 + msr - 0xc0000000) * 2;
- t1 = (t0 / 8);
- t0 %= 8;
- break;
- case 0xc0010000 ... 0xc0011fff:
- t0 = (16384 + msr - 0xc0010000) * 2;
- t1 = (t0 / 8);
- t0 %= 8;
- break;
- default:
- ret = true;
- goto out;
- }
+ /* Offset is in 32 bit units but need in 8 bit units */
+ offset *= 4;
- ret = msrpm[t1] & ((1 << param) << t0);
-
-out:
- nested_svm_unmap(msrpm, KM_USER0);
+ if (kvm_read_guest(svm->vcpu.kvm, svm->nested.vmcb_msrpm + offset, &value, 4))
+ return NESTED_EXIT_DONE;
- return ret;
+ return (value & mask) ? NESTED_EXIT_DONE : NESTED_EXIT_HOST;
}
static int nested_svm_exit_special(struct vcpu_svm *svm)
@@ -1504,17 +1651,21 @@ static int nested_svm_exit_special(struct vcpu_svm *svm)
switch (exit_code) {
case SVM_EXIT_INTR:
case SVM_EXIT_NMI:
+ case SVM_EXIT_EXCP_BASE + MC_VECTOR:
return NESTED_EXIT_HOST;
- /* For now we are always handling NPFs when using them */
case SVM_EXIT_NPF:
+ /* For now we are always handling NPFs when using them */
if (npt_enabled)
return NESTED_EXIT_HOST;
break;
- /* When we're shadowing, trap PFs */
case SVM_EXIT_EXCP_BASE + PF_VECTOR:
+ /* When we're shadowing, trap PFs */
if (!npt_enabled)
return NESTED_EXIT_HOST;
break;
+ case SVM_EXIT_EXCP_BASE + NM_VECTOR:
+ nm_interception(svm);
+ break;
default:
break;
}
@@ -1525,7 +1676,7 @@ static int nested_svm_exit_special(struct vcpu_svm *svm)
/*
* If this function returns true, this #vmexit was already handled
*/
-static int nested_svm_exit_handled(struct vcpu_svm *svm)
+static int nested_svm_intercept(struct vcpu_svm *svm)
{
u32 exit_code = svm->vmcb->control.exit_code;
int vmexit = NESTED_EXIT_HOST;
@@ -1534,6 +1685,9 @@ static int nested_svm_exit_handled(struct vcpu_svm *svm)
case SVM_EXIT_MSR:
vmexit = nested_svm_exit_handled_msr(svm);
break;
+ case SVM_EXIT_IOIO:
+ vmexit = nested_svm_intercept_ioio(svm);
+ break;
case SVM_EXIT_READ_CR0 ... SVM_EXIT_READ_CR8: {
u32 cr_bits = 1 << (exit_code - SVM_EXIT_READ_CR0);
if (svm->nested.intercept_cr_read & cr_bits)
@@ -1564,6 +1718,10 @@ static int nested_svm_exit_handled(struct vcpu_svm *svm)
vmexit = NESTED_EXIT_DONE;
break;
}
+ case SVM_EXIT_ERR: {
+ vmexit = NESTED_EXIT_DONE;
+ break;
+ }
default: {
u64 exit_bits = 1ULL << (exit_code - SVM_EXIT_INTR);
if (svm->nested.intercept & exit_bits)
@@ -1571,9 +1729,17 @@ static int nested_svm_exit_handled(struct vcpu_svm *svm)
}
}
- if (vmexit == NESTED_EXIT_DONE) {
+ return vmexit;
+}
+
+static int nested_svm_exit_handled(struct vcpu_svm *svm)
+{
+ int vmexit;
+
+ vmexit = nested_svm_intercept(svm);
+
+ if (vmexit == NESTED_EXIT_DONE)
nested_svm_vmexit(svm);
- }
return vmexit;
}
@@ -1615,6 +1781,7 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
struct vmcb *nested_vmcb;
struct vmcb *hsave = svm->nested.hsave;
struct vmcb *vmcb = svm->vmcb;
+ struct page *page;
trace_kvm_nested_vmexit_inject(vmcb->control.exit_code,
vmcb->control.exit_info_1,
@@ -1622,10 +1789,13 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
vmcb->control.exit_int_info,
vmcb->control.exit_int_info_err);
- nested_vmcb = nested_svm_map(svm, svm->nested.vmcb, KM_USER0);
+ nested_vmcb = nested_svm_map(svm, svm->nested.vmcb, &page);
if (!nested_vmcb)
return 1;
+ /* Exit nested SVM mode */
+ svm->nested.vmcb = 0;
+
/* Give the current vmcb to the guest */
disable_gif(svm);
@@ -1635,9 +1805,10 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
nested_vmcb->save.ds = vmcb->save.ds;
nested_vmcb->save.gdtr = vmcb->save.gdtr;
nested_vmcb->save.idtr = vmcb->save.idtr;
- if (npt_enabled)
- nested_vmcb->save.cr3 = vmcb->save.cr3;
+ nested_vmcb->save.cr0 = kvm_read_cr0(&svm->vcpu);
+ nested_vmcb->save.cr3 = svm->vcpu.arch.cr3;
nested_vmcb->save.cr2 = vmcb->save.cr2;
+ nested_vmcb->save.cr4 = svm->vcpu.arch.cr4;
nested_vmcb->save.rflags = vmcb->save.rflags;
nested_vmcb->save.rip = vmcb->save.rip;
nested_vmcb->save.rsp = vmcb->save.rsp;
@@ -1709,10 +1880,7 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
svm->vmcb->save.cpl = 0;
svm->vmcb->control.exit_int_info = 0;
- /* Exit nested SVM mode */
- svm->nested.vmcb = 0;
-
- nested_svm_unmap(nested_vmcb, KM_USER0);
+ nested_svm_unmap(page);
kvm_mmu_reset_context(&svm->vcpu);
kvm_mmu_load(&svm->vcpu);
@@ -1722,19 +1890,33 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm)
{
- u32 *nested_msrpm;
+ /*
+ * This function merges the msr permission bitmaps of kvm and the
+ * nested vmcb. It is omptimized in that it only merges the parts where
+ * the kvm msr permission bitmap may contain zero bits
+ */
int i;
- nested_msrpm = nested_svm_map(svm, svm->nested.vmcb_msrpm, KM_USER0);
- if (!nested_msrpm)
- return false;
+ if (!(svm->nested.intercept & (1ULL << INTERCEPT_MSR_PROT)))
+ return true;
- for (i=0; i< PAGE_SIZE * (1 << MSRPM_ALLOC_ORDER) / 4; i++)
- svm->nested.msrpm[i] = svm->msrpm[i] | nested_msrpm[i];
+ for (i = 0; i < MSRPM_OFFSETS; i++) {
+ u32 value, p;
+ u64 offset;
- svm->vmcb->control.msrpm_base_pa = __pa(svm->nested.msrpm);
+ if (msrpm_offsets[i] == 0xffffffff)
+ break;
+
+ p = msrpm_offsets[i];
+ offset = svm->nested.vmcb_msrpm + (p * 4);
+
+ if (kvm_read_guest(svm->vcpu.kvm, offset, &value, 4))
+ return false;
+
+ svm->nested.msrpm[p] = svm->msrpm[p] | value;
+ }
- nested_svm_unmap(nested_msrpm, KM_USER0);
+ svm->vmcb->control.msrpm_base_pa = __pa(svm->nested.msrpm);
return true;
}
@@ -1744,26 +1926,34 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
struct vmcb *nested_vmcb;
struct vmcb *hsave = svm->nested.hsave;
struct vmcb *vmcb = svm->vmcb;
+ struct page *page;
+ u64 vmcb_gpa;
- nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, KM_USER0);
+ vmcb_gpa = svm->vmcb->save.rax;
+
+ nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, &page);
if (!nested_vmcb)
return false;
- /* nested_vmcb is our indicator if nested SVM is activated */
- svm->nested.vmcb = svm->vmcb->save.rax;
-
- trace_kvm_nested_vmrun(svm->vmcb->save.rip - 3, svm->nested.vmcb,
+ trace_kvm_nested_vmrun(svm->vmcb->save.rip - 3, vmcb_gpa,
nested_vmcb->save.rip,
nested_vmcb->control.int_ctl,
nested_vmcb->control.event_inj,
nested_vmcb->control.nested_ctl);
+ trace_kvm_nested_intercepts(nested_vmcb->control.intercept_cr_read,
+ nested_vmcb->control.intercept_cr_write,
+ nested_vmcb->control.intercept_exceptions,
+ nested_vmcb->control.intercept);
+
/* Clear internal status */
kvm_clear_exception_queue(&svm->vcpu);
kvm_clear_interrupt_queue(&svm->vcpu);
- /* Save the old vmcb, so we don't need to pick what we save, but
- can restore everything when a VMEXIT occurs */
+ /*
+ * Save the old vmcb, so we don't need to pick what we save, but can
+ * restore everything when a VMEXIT occurs
+ */
hsave->save.es = vmcb->save.es;
hsave->save.cs = vmcb->save.cs;
hsave->save.ss = vmcb->save.ss;
@@ -1803,14 +1993,17 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
if (npt_enabled) {
svm->vmcb->save.cr3 = nested_vmcb->save.cr3;
svm->vcpu.arch.cr3 = nested_vmcb->save.cr3;
- } else {
+ } else
kvm_set_cr3(&svm->vcpu, nested_vmcb->save.cr3);
- kvm_mmu_reset_context(&svm->vcpu);
- }
+
+ /* Guest paging mode is active - reset mmu */
+ kvm_mmu_reset_context(&svm->vcpu);
+
svm->vmcb->save.cr2 = svm->vcpu.arch.cr2 = nested_vmcb->save.cr2;
kvm_register_write(&svm->vcpu, VCPU_REGS_RAX, nested_vmcb->save.rax);
kvm_register_write(&svm->vcpu, VCPU_REGS_RSP, nested_vmcb->save.rsp);
kvm_register_write(&svm->vcpu, VCPU_REGS_RIP, nested_vmcb->save.rip);
+
/* In case we don't even reach vcpu_run, the fields are not updated */
svm->vmcb->save.rax = nested_vmcb->save.rax;
svm->vmcb->save.rsp = nested_vmcb->save.rsp;
@@ -1819,22 +2012,8 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
svm->vmcb->save.dr6 = nested_vmcb->save.dr6;
svm->vmcb->save.cpl = nested_vmcb->save.cpl;
- /* We don't want a nested guest to be more powerful than the guest,
- so all intercepts are ORed */
- svm->vmcb->control.intercept_cr_read |=
- nested_vmcb->control.intercept_cr_read;
- svm->vmcb->control.intercept_cr_write |=
- nested_vmcb->control.intercept_cr_write;
- svm->vmcb->control.intercept_dr_read |=
- nested_vmcb->control.intercept_dr_read;
- svm->vmcb->control.intercept_dr_write |=
- nested_vmcb->control.intercept_dr_write;
- svm->vmcb->control.intercept_exceptions |=
- nested_vmcb->control.intercept_exceptions;
-
- svm->vmcb->control.intercept |= nested_vmcb->control.intercept;
-
- svm->nested.vmcb_msrpm = nested_vmcb->control.msrpm_base_pa;
+ svm->nested.vmcb_msrpm = nested_vmcb->control.msrpm_base_pa & ~0x0fffULL;
+ svm->nested.vmcb_iopm = nested_vmcb->control.iopm_base_pa & ~0x0fffULL;
/* cache intercepts */
svm->nested.intercept_cr_read = nested_vmcb->control.intercept_cr_read;
@@ -1851,13 +2030,43 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
else
svm->vcpu.arch.hflags &= ~HF_VINTR_MASK;
+ if (svm->vcpu.arch.hflags & HF_VINTR_MASK) {
+ /* We only want the cr8 intercept bits of the guest */
+ svm->vmcb->control.intercept_cr_read &= ~INTERCEPT_CR8_MASK;
+ svm->vmcb->control.intercept_cr_write &= ~INTERCEPT_CR8_MASK;
+ }
+
+ /* We don't want to see VMMCALLs from a nested guest */
+ svm->vmcb->control.intercept &= ~(1ULL << INTERCEPT_VMMCALL);
+
+ /*
+ * We don't want a nested guest to be more powerful than the guest, so
+ * all intercepts are ORed
+ */
+ svm->vmcb->control.intercept_cr_read |=
+ nested_vmcb->control.intercept_cr_read;
+ svm->vmcb->control.intercept_cr_write |=
+ nested_vmcb->control.intercept_cr_write;
+ svm->vmcb->control.intercept_dr_read |=
+ nested_vmcb->control.intercept_dr_read;
+ svm->vmcb->control.intercept_dr_write |=
+ nested_vmcb->control.intercept_dr_write;
+ svm->vmcb->control.intercept_exceptions |=
+ nested_vmcb->control.intercept_exceptions;
+
+ svm->vmcb->control.intercept |= nested_vmcb->control.intercept;
+
+ svm->vmcb->control.lbr_ctl = nested_vmcb->control.lbr_ctl;
svm->vmcb->control.int_vector = nested_vmcb->control.int_vector;
svm->vmcb->control.int_state = nested_vmcb->control.int_state;
svm->vmcb->control.tsc_offset += nested_vmcb->control.tsc_offset;
svm->vmcb->control.event_inj = nested_vmcb->control.event_inj;
svm->vmcb->control.event_inj_err = nested_vmcb->control.event_inj_err;
- nested_svm_unmap(nested_vmcb, KM_USER0);
+ nested_svm_unmap(page);
+
+ /* nested_vmcb is our indicator if nested SVM is activated */
+ svm->nested.vmcb = vmcb_gpa;
enable_gif(svm);
@@ -1883,6 +2092,7 @@ static void nested_svm_vmloadsave(struct vmcb *from_vmcb, struct vmcb *to_vmcb)
static int vmload_interception(struct vcpu_svm *svm)
{
struct vmcb *nested_vmcb;
+ struct page *page;
if (nested_svm_check_permissions(svm))
return 1;
@@ -1890,12 +2100,12 @@ static int vmload_interception(struct vcpu_svm *svm)
svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
skip_emulated_instruction(&svm->vcpu);
- nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, KM_USER0);
+ nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, &page);
if (!nested_vmcb)
return 1;
nested_svm_vmloadsave(nested_vmcb, svm->vmcb);
- nested_svm_unmap(nested_vmcb, KM_USER0);
+ nested_svm_unmap(page);
return 1;
}
@@ -1903,6 +2113,7 @@ static int vmload_interception(struct vcpu_svm *svm)
static int vmsave_interception(struct vcpu_svm *svm)
{
struct vmcb *nested_vmcb;
+ struct page *page;
if (nested_svm_check_permissions(svm))
return 1;
@@ -1910,12 +2121,12 @@ static int vmsave_interception(struct vcpu_svm *svm)
svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
skip_emulated_instruction(&svm->vcpu);
- nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, KM_USER0);
+ nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, &page);
if (!nested_vmcb)
return 1;
nested_svm_vmloadsave(svm->vmcb, nested_vmcb);
- nested_svm_unmap(nested_vmcb, KM_USER0);
+ nested_svm_unmap(page);
return 1;
}
@@ -2018,6 +2229,8 @@ static int task_switch_interception(struct vcpu_svm *svm)
svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_TYPE_MASK;
uint32_t idt_v =
svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_VALID;
+ bool has_error_code = false;
+ u32 error_code = 0;
tss_selector = (u16)svm->vmcb->control.exit_info_1;
@@ -2038,6 +2251,12 @@ static int task_switch_interception(struct vcpu_svm *svm)
svm->vcpu.arch.nmi_injected = false;
break;
case SVM_EXITINTINFO_TYPE_EXEPT:
+ if (svm->vmcb->control.exit_info_2 &
+ (1ULL << SVM_EXITINFOSHIFT_TS_HAS_ERROR_CODE)) {
+ has_error_code = true;
+ error_code =
+ (u32)svm->vmcb->control.exit_info_2;
+ }
kvm_clear_exception_queue(&svm->vcpu);
break;
case SVM_EXITINTINFO_TYPE_INTR:
@@ -2054,7 +2273,14 @@ static int task_switch_interception(struct vcpu_svm *svm)
(int_vec == OF_VECTOR || int_vec == BP_VECTOR)))
skip_emulated_instruction(&svm->vcpu);
- return kvm_task_switch(&svm->vcpu, tss_selector, reason);
+ if (kvm_task_switch(&svm->vcpu, tss_selector, reason,
+ has_error_code, error_code) == EMULATE_FAIL) {
+ svm->vcpu.run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+ svm->vcpu.run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
+ svm->vcpu.run->internal.ndata = 0;
+ return 0;
+ }
+ return 1;
}
static int cpuid_interception(struct vcpu_svm *svm)
@@ -2145,9 +2371,11 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data)
case MSR_IA32_SYSENTER_ESP:
*data = svm->sysenter_esp;
break;
- /* Nobody will change the following 5 values in the VMCB so
- we can safely return them on rdmsr. They will always be 0
- until LBRV is implemented. */
+ /*
+ * Nobody will change the following 5 values in the VMCB so we can
+ * safely return them on rdmsr. They will always be 0 until LBRV is
+ * implemented.
+ */
case MSR_IA32_DEBUGCTLMSR:
*data = svm->vmcb->save.dbgctl;
break;
@@ -2167,7 +2395,7 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data)
*data = svm->nested.hsave_msr;
break;
case MSR_VM_CR:
- *data = 0;
+ *data = svm->nested.vm_cr_msr;
break;
case MSR_IA32_UCODE_REV:
*data = 0x01000065;
@@ -2197,6 +2425,31 @@ static int rdmsr_interception(struct vcpu_svm *svm)
return 1;
}
+static int svm_set_vm_cr(struct kvm_vcpu *vcpu, u64 data)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ int svm_dis, chg_mask;
+
+ if (data & ~SVM_VM_CR_VALID_MASK)
+ return 1;
+
+ chg_mask = SVM_VM_CR_VALID_MASK;
+
+ if (svm->nested.vm_cr_msr & SVM_VM_CR_SVM_DIS_MASK)
+ chg_mask &= ~(SVM_VM_CR_SVM_LOCK_MASK | SVM_VM_CR_SVM_DIS_MASK);
+
+ svm->nested.vm_cr_msr &= ~chg_mask;
+ svm->nested.vm_cr_msr |= (data & chg_mask);
+
+ svm_dis = svm->nested.vm_cr_msr & SVM_VM_CR_SVM_DIS_MASK;
+
+ /* check for svm_disable while efer.svme is set */
+ if (svm_dis && (vcpu->arch.efer & EFER_SVME))
+ return 1;
+
+ return 0;
+}
+
static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data)
{
struct vcpu_svm *svm = to_svm(vcpu);
@@ -2263,6 +2516,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data)
svm->nested.hsave_msr = data;
break;
case MSR_VM_CR:
+ return svm_set_vm_cr(vcpu, data);
case MSR_VM_IGNNE:
pr_unimpl(vcpu, "unimplemented wrmsr: 0x%x data 0x%llx\n", ecx, data);
break;
@@ -2326,16 +2580,16 @@ static int pause_interception(struct vcpu_svm *svm)
}
static int (*svm_exit_handlers[])(struct vcpu_svm *svm) = {
- [SVM_EXIT_READ_CR0] = emulate_on_interception,
- [SVM_EXIT_READ_CR3] = emulate_on_interception,
- [SVM_EXIT_READ_CR4] = emulate_on_interception,
- [SVM_EXIT_READ_CR8] = emulate_on_interception,
+ [SVM_EXIT_READ_CR0] = emulate_on_interception,
+ [SVM_EXIT_READ_CR3] = emulate_on_interception,
+ [SVM_EXIT_READ_CR4] = emulate_on_interception,
+ [SVM_EXIT_READ_CR8] = emulate_on_interception,
[SVM_EXIT_CR0_SEL_WRITE] = emulate_on_interception,
- [SVM_EXIT_WRITE_CR0] = emulate_on_interception,
- [SVM_EXIT_WRITE_CR3] = emulate_on_interception,
- [SVM_EXIT_WRITE_CR4] = emulate_on_interception,
- [SVM_EXIT_WRITE_CR8] = cr8_write_interception,
- [SVM_EXIT_READ_DR0] = emulate_on_interception,
+ [SVM_EXIT_WRITE_CR0] = emulate_on_interception,
+ [SVM_EXIT_WRITE_CR3] = emulate_on_interception,
+ [SVM_EXIT_WRITE_CR4] = emulate_on_interception,
+ [SVM_EXIT_WRITE_CR8] = cr8_write_interception,
+ [SVM_EXIT_READ_DR0] = emulate_on_interception,
[SVM_EXIT_READ_DR1] = emulate_on_interception,
[SVM_EXIT_READ_DR2] = emulate_on_interception,
[SVM_EXIT_READ_DR3] = emulate_on_interception,
@@ -2354,15 +2608,14 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm) = {
[SVM_EXIT_EXCP_BASE + DB_VECTOR] = db_interception,
[SVM_EXIT_EXCP_BASE + BP_VECTOR] = bp_interception,
[SVM_EXIT_EXCP_BASE + UD_VECTOR] = ud_interception,
- [SVM_EXIT_EXCP_BASE + PF_VECTOR] = pf_interception,
- [SVM_EXIT_EXCP_BASE + NM_VECTOR] = nm_interception,
- [SVM_EXIT_EXCP_BASE + MC_VECTOR] = mc_interception,
- [SVM_EXIT_INTR] = intr_interception,
+ [SVM_EXIT_EXCP_BASE + PF_VECTOR] = pf_interception,
+ [SVM_EXIT_EXCP_BASE + NM_VECTOR] = nm_interception,
+ [SVM_EXIT_EXCP_BASE + MC_VECTOR] = mc_interception,
+ [SVM_EXIT_INTR] = intr_interception,
[SVM_EXIT_NMI] = nmi_interception,
[SVM_EXIT_SMI] = nop_on_interception,
[SVM_EXIT_INIT] = nop_on_interception,
[SVM_EXIT_VINTR] = interrupt_window_interception,
- /* [SVM_EXIT_CR0_SEL_WRITE] = emulate_on_interception, */
[SVM_EXIT_CPUID] = cpuid_interception,
[SVM_EXIT_IRET] = iret_interception,
[SVM_EXIT_INVD] = emulate_on_interception,
@@ -2370,7 +2623,7 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm) = {
[SVM_EXIT_HLT] = halt_interception,
[SVM_EXIT_INVLPG] = invlpg_interception,
[SVM_EXIT_INVLPGA] = invlpga_interception,
- [SVM_EXIT_IOIO] = io_interception,
+ [SVM_EXIT_IOIO] = io_interception,
[SVM_EXIT_MSR] = msr_interception,
[SVM_EXIT_TASK_SWITCH] = task_switch_interception,
[SVM_EXIT_SHUTDOWN] = shutdown_interception,
@@ -2393,7 +2646,12 @@ static int handle_exit(struct kvm_vcpu *vcpu)
struct kvm_run *kvm_run = vcpu->run;
u32 exit_code = svm->vmcb->control.exit_code;
- trace_kvm_exit(exit_code, svm->vmcb->save.rip);
+ trace_kvm_exit(exit_code, vcpu);
+
+ if (!(svm->vmcb->control.intercept_cr_write & INTERCEPT_CR0_MASK))
+ vcpu->arch.cr0 = svm->vmcb->save.cr0;
+ if (npt_enabled)
+ vcpu->arch.cr3 = svm->vmcb->save.cr3;
if (unlikely(svm->nested.exit_required)) {
nested_svm_vmexit(svm);
@@ -2422,11 +2680,6 @@ static int handle_exit(struct kvm_vcpu *vcpu)
svm_complete_interrupts(svm);
- if (!(svm->vmcb->control.intercept_cr_write & INTERCEPT_CR0_MASK))
- vcpu->arch.cr0 = svm->vmcb->save.cr0;
- if (npt_enabled)
- vcpu->arch.cr3 = svm->vmcb->save.cr3;
-
if (svm->vmcb->control.exit_code == SVM_EXIT_ERR) {
kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY;
kvm_run->fail_entry.hardware_entry_failure_reason
@@ -2511,6 +2764,9 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
{
struct vcpu_svm *svm = to_svm(vcpu);
+ if (is_nested(svm) && (vcpu->arch.hflags & HF_VINTR_MASK))
+ return;
+
if (irr == -1)
return;
@@ -2522,8 +2778,12 @@ static int svm_nmi_allowed(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
struct vmcb *vmcb = svm->vmcb;
- return !(vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) &&
- !(svm->vcpu.arch.hflags & HF_NMI_MASK);
+ int ret;
+ ret = !(vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) &&
+ !(svm->vcpu.arch.hflags & HF_NMI_MASK);
+ ret = ret && gif_set(svm) && nested_svm_nmi(svm);
+
+ return ret;
}
static bool svm_get_nmi_mask(struct kvm_vcpu *vcpu)
@@ -2568,13 +2828,13 @@ static void enable_irq_window(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
- nested_svm_intr(svm);
-
- /* In case GIF=0 we can't rely on the CPU to tell us when
- * GIF becomes 1, because that's a separate STGI/VMRUN intercept.
- * The next time we get that intercept, this function will be
- * called again though and we'll get the vintr intercept. */
- if (gif_set(svm)) {
+ /*
+ * In case GIF=0 we can't rely on the CPU to tell us when GIF becomes
+ * 1, because that's a separate STGI/VMRUN intercept. The next time we
+ * get that intercept, this function will be called again though and
+ * we'll get the vintr intercept.
+ */
+ if (gif_set(svm) && nested_svm_intr(svm)) {
svm_set_vintr(svm);
svm_inject_irq(svm, 0x0);
}
@@ -2588,9 +2848,10 @@ static void enable_nmi_window(struct kvm_vcpu *vcpu)
== HF_NMI_MASK)
return; /* IRET will cause a vm exit */
- /* Something prevents NMI from been injected. Single step over
- possible problem (IRET or exception injection or interrupt
- shadow) */
+ /*
+ * Something prevents NMI from been injected. Single step over possible
+ * problem (IRET or exception injection or interrupt shadow)
+ */
svm->nmi_singlestep = true;
svm->vmcb->save.rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF);
update_db_intercept(vcpu);
@@ -2614,6 +2875,9 @@ static inline void sync_cr8_to_lapic(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
+ if (is_nested(svm) && (vcpu->arch.hflags & HF_VINTR_MASK))
+ return;
+
if (!(svm->vmcb->control.intercept_cr_write & INTERCEPT_CR8_MASK)) {
int cr8 = svm->vmcb->control.int_ctl & V_TPR_MASK;
kvm_set_cr8(vcpu, cr8);
@@ -2625,6 +2889,9 @@ static inline void sync_lapic_to_cr8(struct kvm_vcpu *vcpu)
struct vcpu_svm *svm = to_svm(vcpu);
u64 cr8;
+ if (is_nested(svm) && (vcpu->arch.hflags & HF_VINTR_MASK))
+ return;
+
cr8 = kvm_get_cr8(vcpu);
svm->vmcb->control.int_ctl &= ~V_TPR_MASK;
svm->vmcb->control.int_ctl |= cr8 & V_TPR_MASK;
@@ -2635,6 +2902,9 @@ static void svm_complete_interrupts(struct vcpu_svm *svm)
u8 vector;
int type;
u32 exitintinfo = svm->vmcb->control.exit_int_info;
+ unsigned int3_injected = svm->int3_injected;
+
+ svm->int3_injected = 0;
if (svm->vcpu.arch.hflags & HF_IRET_MASK)
svm->vcpu.arch.hflags &= ~(HF_NMI_MASK | HF_IRET_MASK);
@@ -2654,18 +2924,25 @@ static void svm_complete_interrupts(struct vcpu_svm *svm)
svm->vcpu.arch.nmi_injected = true;
break;
case SVM_EXITINTINFO_TYPE_EXEPT:
- /* In case of software exception do not reinject an exception
- vector, but re-execute and instruction instead */
- if (is_nested(svm))
- break;
- if (kvm_exception_is_soft(vector))
+ /*
+ * In case of software exceptions, do not reinject the vector,
+ * but re-execute the instruction instead. Rewind RIP first
+ * if we emulated INT3 before.
+ */
+ if (kvm_exception_is_soft(vector)) {
+ if (vector == BP_VECTOR && int3_injected &&
+ kvm_is_linear_rip(&svm->vcpu, svm->int3_rip))
+ kvm_rip_write(&svm->vcpu,
+ kvm_rip_read(&svm->vcpu) -
+ int3_injected);
break;
+ }
if (exitintinfo & SVM_EXITINTINFO_VALID_ERR) {
u32 err = svm->vmcb->control.exit_int_info_err;
- kvm_queue_exception_e(&svm->vcpu, vector, err);
+ kvm_requeue_exception_e(&svm->vcpu, vector, err);
} else
- kvm_queue_exception(&svm->vcpu, vector);
+ kvm_requeue_exception(&svm->vcpu, vector);
break;
case SVM_EXITINTINFO_TYPE_INTR:
kvm_queue_interrupt(&svm->vcpu, vector, false);
@@ -2688,6 +2965,10 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
u16 gs_selector;
u16 ldt_selector;
+ svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX];
+ svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP];
+ svm->vmcb->save.rip = vcpu->arch.regs[VCPU_REGS_RIP];
+
/*
* A vmexit emulation is required before the vcpu can be executed
* again.
@@ -2695,10 +2976,6 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
if (unlikely(svm->nested.exit_required))
return;
- svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX];
- svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP];
- svm->vmcb->save.rip = vcpu->arch.regs[VCPU_REGS_RIP];
-
pre_svm_run(svm);
sync_lapic_to_cr8(vcpu);
@@ -2879,25 +3156,39 @@ static void svm_cpuid_update(struct kvm_vcpu *vcpu)
{
}
+static void svm_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)
+{
+ switch (func) {
+ case 0x8000000A:
+ entry->eax = 1; /* SVM revision 1 */
+ entry->ebx = 8; /* Lets support 8 ASIDs in case we add proper
+ ASID emulation to nested SVM */
+ entry->ecx = 0; /* Reserved */
+ entry->edx = 0; /* Do not support any additional features */
+
+ break;
+ }
+}
+
static const struct trace_print_flags svm_exit_reasons_str[] = {
- { SVM_EXIT_READ_CR0, "read_cr0" },
- { SVM_EXIT_READ_CR3, "read_cr3" },
- { SVM_EXIT_READ_CR4, "read_cr4" },
- { SVM_EXIT_READ_CR8, "read_cr8" },
- { SVM_EXIT_WRITE_CR0, "write_cr0" },
- { SVM_EXIT_WRITE_CR3, "write_cr3" },
- { SVM_EXIT_WRITE_CR4, "write_cr4" },
- { SVM_EXIT_WRITE_CR8, "write_cr8" },
- { SVM_EXIT_READ_DR0, "read_dr0" },
- { SVM_EXIT_READ_DR1, "read_dr1" },
- { SVM_EXIT_READ_DR2, "read_dr2" },
- { SVM_EXIT_READ_DR3, "read_dr3" },
- { SVM_EXIT_WRITE_DR0, "write_dr0" },
- { SVM_EXIT_WRITE_DR1, "write_dr1" },
- { SVM_EXIT_WRITE_DR2, "write_dr2" },
- { SVM_EXIT_WRITE_DR3, "write_dr3" },
- { SVM_EXIT_WRITE_DR5, "write_dr5" },
- { SVM_EXIT_WRITE_DR7, "write_dr7" },
+ { SVM_EXIT_READ_CR0, "read_cr0" },
+ { SVM_EXIT_READ_CR3, "read_cr3" },
+ { SVM_EXIT_READ_CR4, "read_cr4" },
+ { SVM_EXIT_READ_CR8, "read_cr8" },
+ { SVM_EXIT_WRITE_CR0, "write_cr0" },
+ { SVM_EXIT_WRITE_CR3, "write_cr3" },
+ { SVM_EXIT_WRITE_CR4, "write_cr4" },
+ { SVM_EXIT_WRITE_CR8, "write_cr8" },
+ { SVM_EXIT_READ_DR0, "read_dr0" },
+ { SVM_EXIT_READ_DR1, "read_dr1" },
+ { SVM_EXIT_READ_DR2, "read_dr2" },
+ { SVM_EXIT_READ_DR3, "read_dr3" },
+ { SVM_EXIT_WRITE_DR0, "write_dr0" },
+ { SVM_EXIT_WRITE_DR1, "write_dr1" },
+ { SVM_EXIT_WRITE_DR2, "write_dr2" },
+ { SVM_EXIT_WRITE_DR3, "write_dr3" },
+ { SVM_EXIT_WRITE_DR5, "write_dr5" },
+ { SVM_EXIT_WRITE_DR7, "write_dr7" },
{ SVM_EXIT_EXCP_BASE + DB_VECTOR, "DB excp" },
{ SVM_EXIT_EXCP_BASE + BP_VECTOR, "BP excp" },
{ SVM_EXIT_EXCP_BASE + UD_VECTOR, "UD excp" },
@@ -2946,8 +3237,10 @@ static void svm_fpu_deactivate(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
- update_cr0_intercept(svm);
svm->vmcb->control.intercept_exceptions |= 1 << NM_VECTOR;
+ if (is_nested(svm))
+ svm->nested.hsave->control.intercept_exceptions |= 1 << NM_VECTOR;
+ update_cr0_intercept(svm);
}
static struct kvm_x86_ops svm_x86_ops = {
@@ -2986,8 +3279,7 @@ static struct kvm_x86_ops svm_x86_ops = {
.set_idt = svm_set_idt,
.get_gdt = svm_get_gdt,
.set_gdt = svm_set_gdt,
- .get_dr = svm_get_dr,
- .set_dr = svm_set_dr,
+ .set_dr7 = svm_set_dr7,
.cache_reg = svm_cache_reg,
.get_rflags = svm_get_rflags,
.set_rflags = svm_set_rflags,
@@ -3023,12 +3315,14 @@ static struct kvm_x86_ops svm_x86_ops = {
.cpuid_update = svm_cpuid_update,
.rdtscp_supported = svm_rdtscp_supported,
+
+ .set_supported_cpuid = svm_set_supported_cpuid,
};
static int __init svm_init(void)
{
return kvm_init(&svm_x86_ops, sizeof(struct vcpu_svm),
- THIS_MODULE);
+ __alignof__(struct vcpu_svm), THIS_MODULE);
}
static void __exit svm_exit(void)
diff --git a/arch/x86/kvm/timer.c b/arch/x86/kvm/timer.c
index eea4043..4ddadb1 100644
--- a/arch/x86/kvm/timer.c
+++ b/arch/x86/kvm/timer.c
@@ -12,7 +12,8 @@ static int __kvm_timer_fn(struct kvm_vcpu *vcpu, struct kvm_timer *ktimer)
/*
* There is a race window between reading and incrementing, but we do
* not care about potentially loosing timer events in the !reinject
- * case anyway.
+ * case anyway. Note: KVM_REQ_PENDING_TIMER is implicitly checked
+ * in vcpu_enter_guest.
*/
if (ktimer->reinject || !atomic_read(&ktimer->pending)) {
atomic_inc(&ktimer->pending);
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index 6ad30a2..a6544b8 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -5,8 +5,6 @@
#undef TRACE_SYSTEM
#define TRACE_SYSTEM kvm
-#define TRACE_INCLUDE_PATH arch/x86/kvm
-#define TRACE_INCLUDE_FILE trace
/*
* Tracepoint for guest mode entry.
@@ -184,8 +182,8 @@ TRACE_EVENT(kvm_apic,
* Tracepoint for kvm guest exit:
*/
TRACE_EVENT(kvm_exit,
- TP_PROTO(unsigned int exit_reason, unsigned long guest_rip),
- TP_ARGS(exit_reason, guest_rip),
+ TP_PROTO(unsigned int exit_reason, struct kvm_vcpu *vcpu),
+ TP_ARGS(exit_reason, vcpu),
TP_STRUCT__entry(
__field( unsigned int, exit_reason )
@@ -194,7 +192,7 @@ TRACE_EVENT(kvm_exit,
TP_fast_assign(
__entry->exit_reason = exit_reason;
- __entry->guest_rip = guest_rip;
+ __entry->guest_rip = kvm_rip_read(vcpu);
),
TP_printk("reason %s rip 0x%lx",
@@ -221,6 +219,38 @@ TRACE_EVENT(kvm_inj_virq,
TP_printk("irq %u", __entry->irq)
);
+#define EXS(x) { x##_VECTOR, "#" #x }
+
+#define kvm_trace_sym_exc \
+ EXS(DE), EXS(DB), EXS(BP), EXS(OF), EXS(BR), EXS(UD), EXS(NM), \
+ EXS(DF), EXS(TS), EXS(NP), EXS(SS), EXS(GP), EXS(PF), \
+ EXS(MF), EXS(MC)
+
+/*
+ * Tracepoint for kvm interrupt injection:
+ */
+TRACE_EVENT(kvm_inj_exception,
+ TP_PROTO(unsigned exception, bool has_error, unsigned error_code),
+ TP_ARGS(exception, has_error, error_code),
+
+ TP_STRUCT__entry(
+ __field( u8, exception )
+ __field( u8, has_error )
+ __field( u32, error_code )
+ ),
+
+ TP_fast_assign(
+ __entry->exception = exception;
+ __entry->has_error = has_error;
+ __entry->error_code = error_code;
+ ),
+
+ TP_printk("%s (0x%x)",
+ __print_symbolic(__entry->exception, kvm_trace_sym_exc),
+ /* FIXME: don't print error_code if not present */
+ __entry->has_error ? __entry->error_code : 0)
+);
+
/*
* Tracepoint for page fault.
*/
@@ -413,12 +443,34 @@ TRACE_EVENT(kvm_nested_vmrun,
),
TP_printk("rip: 0x%016llx vmcb: 0x%016llx nrip: 0x%016llx int_ctl: 0x%08x "
- "event_inj: 0x%08x npt: %s\n",
+ "event_inj: 0x%08x npt: %s",
__entry->rip, __entry->vmcb, __entry->nested_rip,
__entry->int_ctl, __entry->event_inj,
__entry->npt ? "on" : "off")
);
+TRACE_EVENT(kvm_nested_intercepts,
+ TP_PROTO(__u16 cr_read, __u16 cr_write, __u32 exceptions, __u64 intercept),
+ TP_ARGS(cr_read, cr_write, exceptions, intercept),
+
+ TP_STRUCT__entry(
+ __field( __u16, cr_read )
+ __field( __u16, cr_write )
+ __field( __u32, exceptions )
+ __field( __u64, intercept )
+ ),
+
+ TP_fast_assign(
+ __entry->cr_read = cr_read;
+ __entry->cr_write = cr_write;
+ __entry->exceptions = exceptions;
+ __entry->intercept = intercept;
+ ),
+
+ TP_printk("cr_read: %04x cr_write: %04x excp: %08x intercept: %016llx",
+ __entry->cr_read, __entry->cr_write, __entry->exceptions,
+ __entry->intercept)
+);
/*
* Tracepoint for #VMEXIT while nested
*/
@@ -447,7 +499,7 @@ TRACE_EVENT(kvm_nested_vmexit,
__entry->exit_int_info_err = exit_int_info_err;
),
TP_printk("rip: 0x%016llx reason: %s ext_inf1: 0x%016llx "
- "ext_inf2: 0x%016llx ext_int: 0x%08x ext_int_err: 0x%08x\n",
+ "ext_inf2: 0x%016llx ext_int: 0x%08x ext_int_err: 0x%08x",
__entry->rip,
ftrace_print_symbols_seq(p, __entry->exit_code,
kvm_x86_ops->exit_reasons_str),
@@ -482,7 +534,7 @@ TRACE_EVENT(kvm_nested_vmexit_inject,
),
TP_printk("reason: %s ext_inf1: 0x%016llx "
- "ext_inf2: 0x%016llx ext_int: 0x%08x ext_int_err: 0x%08x\n",
+ "ext_inf2: 0x%016llx ext_int: 0x%08x ext_int_err: 0x%08x",
ftrace_print_symbols_seq(p, __entry->exit_code,
kvm_x86_ops->exit_reasons_str),
__entry->exit_info1, __entry->exit_info2,
@@ -504,7 +556,7 @@ TRACE_EVENT(kvm_nested_intr_vmexit,
__entry->rip = rip
),
- TP_printk("rip: 0x%016llx\n", __entry->rip)
+ TP_printk("rip: 0x%016llx", __entry->rip)
);
/*
@@ -526,7 +578,7 @@ TRACE_EVENT(kvm_invlpga,
__entry->address = address;
),
- TP_printk("rip: 0x%016llx asid: %d address: 0x%016llx\n",
+ TP_printk("rip: 0x%016llx asid: %d address: 0x%016llx",
__entry->rip, __entry->asid, __entry->address)
);
@@ -547,11 +599,102 @@ TRACE_EVENT(kvm_skinit,
__entry->slb = slb;
),
- TP_printk("rip: 0x%016llx slb: 0x%08x\n",
+ TP_printk("rip: 0x%016llx slb: 0x%08x",
__entry->rip, __entry->slb)
);
+#define __print_insn(insn, ilen) ({ \
+ int i; \
+ const char *ret = p->buffer + p->len; \
+ \
+ for (i = 0; i < ilen; ++i) \
+ trace_seq_printf(p, " %02x", insn[i]); \
+ trace_seq_printf(p, "%c", 0); \
+ ret; \
+ })
+
+#define KVM_EMUL_INSN_F_CR0_PE (1 << 0)
+#define KVM_EMUL_INSN_F_EFL_VM (1 << 1)
+#define KVM_EMUL_INSN_F_CS_D (1 << 2)
+#define KVM_EMUL_INSN_F_CS_L (1 << 3)
+
+#define kvm_trace_symbol_emul_flags \
+ { 0, "real" }, \
+ { KVM_EMUL_INSN_F_CR0_PE \
+ | KVM_EMUL_INSN_F_EFL_VM, "vm16" }, \
+ { KVM_EMUL_INSN_F_CR0_PE, "prot16" }, \
+ { KVM_EMUL_INSN_F_CR0_PE \
+ | KVM_EMUL_INSN_F_CS_D, "prot32" }, \
+ { KVM_EMUL_INSN_F_CR0_PE \
+ | KVM_EMUL_INSN_F_CS_L, "prot64" }
+
+#define kei_decode_mode(mode) ({ \
+ u8 flags = 0xff; \
+ switch (mode) { \
+ case X86EMUL_MODE_REAL: \
+ flags = 0; \
+ break; \
+ case X86EMUL_MODE_VM86: \
+ flags = KVM_EMUL_INSN_F_EFL_VM; \
+ break; \
+ case X86EMUL_MODE_PROT16: \
+ flags = KVM_EMUL_INSN_F_CR0_PE; \
+ break; \
+ case X86EMUL_MODE_PROT32: \
+ flags = KVM_EMUL_INSN_F_CR0_PE \
+ | KVM_EMUL_INSN_F_CS_D; \
+ break; \
+ case X86EMUL_MODE_PROT64: \
+ flags = KVM_EMUL_INSN_F_CR0_PE \
+ | KVM_EMUL_INSN_F_CS_L; \
+ break; \
+ } \
+ flags; \
+ })
+
+TRACE_EVENT(kvm_emulate_insn,
+ TP_PROTO(struct kvm_vcpu *vcpu, __u8 failed),
+ TP_ARGS(vcpu, failed),
+
+ TP_STRUCT__entry(
+ __field( __u64, rip )
+ __field( __u32, csbase )
+ __field( __u8, len )
+ __array( __u8, insn, 15 )
+ __field( __u8, flags )
+ __field( __u8, failed )
+ ),
+
+ TP_fast_assign(
+ __entry->rip = vcpu->arch.emulate_ctxt.decode.fetch.start;
+ __entry->csbase = kvm_x86_ops->get_segment_base(vcpu, VCPU_SREG_CS);
+ __entry->len = vcpu->arch.emulate_ctxt.decode.eip
+ - vcpu->arch.emulate_ctxt.decode.fetch.start;
+ memcpy(__entry->insn,
+ vcpu->arch.emulate_ctxt.decode.fetch.data,
+ 15);
+ __entry->flags = kei_decode_mode(vcpu->arch.emulate_ctxt.mode);
+ __entry->failed = failed;
+ ),
+
+ TP_printk("%x:%llx:%s (%s)%s",
+ __entry->csbase, __entry->rip,
+ __print_insn(__entry->insn, __entry->len),
+ __print_symbolic(__entry->flags,
+ kvm_trace_symbol_emul_flags),
+ __entry->failed ? " failed" : ""
+ )
+ );
+
+#define trace_kvm_emulate_insn_start(vcpu) trace_kvm_emulate_insn(vcpu, 0)
+#define trace_kvm_emulate_insn_failed(vcpu) trace_kvm_emulate_insn(vcpu, 1)
+
#endif /* _TRACE_KVM_H */
+#undef TRACE_INCLUDE_PATH
+#define TRACE_INCLUDE_PATH arch/x86/kvm
+#undef TRACE_INCLUDE_FILE
+#define TRACE_INCLUDE_FILE trace
+
/* This part must be outside protection */
#include <trace/define_trace.h>
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index edca080..859a01a 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -27,6 +27,7 @@
#include <linux/moduleparam.h>
#include <linux/ftrace_event.h>
#include <linux/slab.h>
+#include <linux/tboot.h>
#include "kvm_cache_regs.h"
#include "x86.h"
@@ -98,6 +99,8 @@ module_param(ple_gap, int, S_IRUGO);
static int ple_window = KVM_VMX_DEFAULT_PLE_WINDOW;
module_param(ple_window, int, S_IRUGO);
+#define NR_AUTOLOAD_MSRS 1
+
struct vmcs {
u32 revision_id;
u32 abort;
@@ -125,6 +128,11 @@ struct vcpu_vmx {
u64 msr_guest_kernel_gs_base;
#endif
struct vmcs *vmcs;
+ struct msr_autoload {
+ unsigned nr;
+ struct vmx_msr_entry guest[NR_AUTOLOAD_MSRS];
+ struct vmx_msr_entry host[NR_AUTOLOAD_MSRS];
+ } msr_autoload;
struct {
int loaded;
u16 fs_sel, gs_sel, ldt_sel;
@@ -234,56 +242,56 @@ static const u32 vmx_msr_index[] = {
};
#define NR_VMX_MSR ARRAY_SIZE(vmx_msr_index)
-static inline int is_page_fault(u32 intr_info)
+static inline bool is_page_fault(u32 intr_info)
{
return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
INTR_INFO_VALID_MASK)) ==
(INTR_TYPE_HARD_EXCEPTION | PF_VECTOR | INTR_INFO_VALID_MASK);
}
-static inline int is_no_device(u32 intr_info)
+static inline bool is_no_device(u32 intr_info)
{
return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
INTR_INFO_VALID_MASK)) ==
(INTR_TYPE_HARD_EXCEPTION | NM_VECTOR | INTR_INFO_VALID_MASK);
}
-static inline int is_invalid_opcode(u32 intr_info)
+static inline bool is_invalid_opcode(u32 intr_info)
{
return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
INTR_INFO_VALID_MASK)) ==
(INTR_TYPE_HARD_EXCEPTION | UD_VECTOR | INTR_INFO_VALID_MASK);
}
-static inline int is_external_interrupt(u32 intr_info)
+static inline bool is_external_interrupt(u32 intr_info)
{
return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
== (INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK);
}
-static inline int is_machine_check(u32 intr_info)
+static inline bool is_machine_check(u32 intr_info)
{
return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
INTR_INFO_VALID_MASK)) ==
(INTR_TYPE_HARD_EXCEPTION | MC_VECTOR | INTR_INFO_VALID_MASK);
}
-static inline int cpu_has_vmx_msr_bitmap(void)
+static inline bool cpu_has_vmx_msr_bitmap(void)
{
return vmcs_config.cpu_based_exec_ctrl & CPU_BASED_USE_MSR_BITMAPS;
}
-static inline int cpu_has_vmx_tpr_shadow(void)
+static inline bool cpu_has_vmx_tpr_shadow(void)
{
return vmcs_config.cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW;
}
-static inline int vm_need_tpr_shadow(struct kvm *kvm)
+static inline bool vm_need_tpr_shadow(struct kvm *kvm)
{
return (cpu_has_vmx_tpr_shadow()) && (irqchip_in_kernel(kvm));
}
-static inline int cpu_has_secondary_exec_ctrls(void)
+static inline bool cpu_has_secondary_exec_ctrls(void)
{
return vmcs_config.cpu_based_exec_ctrl &
CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
@@ -303,80 +311,80 @@ static inline bool cpu_has_vmx_flexpriority(void)
static inline bool cpu_has_vmx_ept_execute_only(void)
{
- return !!(vmx_capability.ept & VMX_EPT_EXECUTE_ONLY_BIT);
+ return vmx_capability.ept & VMX_EPT_EXECUTE_ONLY_BIT;
}
static inline bool cpu_has_vmx_eptp_uncacheable(void)
{
- return !!(vmx_capability.ept & VMX_EPTP_UC_BIT);
+ return vmx_capability.ept & VMX_EPTP_UC_BIT;
}
static inline bool cpu_has_vmx_eptp_writeback(void)
{
- return !!(vmx_capability.ept & VMX_EPTP_WB_BIT);
+ return vmx_capability.ept & VMX_EPTP_WB_BIT;
}
static inline bool cpu_has_vmx_ept_2m_page(void)
{
- return !!(vmx_capability.ept & VMX_EPT_2MB_PAGE_BIT);
+ return vmx_capability.ept & VMX_EPT_2MB_PAGE_BIT;
}
static inline bool cpu_has_vmx_ept_1g_page(void)
{
- return !!(vmx_capability.ept & VMX_EPT_1GB_PAGE_BIT);
+ return vmx_capability.ept & VMX_EPT_1GB_PAGE_BIT;
}
-static inline int cpu_has_vmx_invept_individual_addr(void)
+static inline bool cpu_has_vmx_invept_individual_addr(void)
{
- return !!(vmx_capability.ept & VMX_EPT_EXTENT_INDIVIDUAL_BIT);
+ return vmx_capability.ept & VMX_EPT_EXTENT_INDIVIDUAL_BIT;
}
-static inline int cpu_has_vmx_invept_context(void)
+static inline bool cpu_has_vmx_invept_context(void)
{
- return !!(vmx_capability.ept & VMX_EPT_EXTENT_CONTEXT_BIT);
+ return vmx_capability.ept & VMX_EPT_EXTENT_CONTEXT_BIT;
}
-static inline int cpu_has_vmx_invept_global(void)
+static inline bool cpu_has_vmx_invept_global(void)
{
- return !!(vmx_capability.ept & VMX_EPT_EXTENT_GLOBAL_BIT);
+ return vmx_capability.ept & VMX_EPT_EXTENT_GLOBAL_BIT;
}
-static inline int cpu_has_vmx_ept(void)
+static inline bool cpu_has_vmx_ept(void)
{
return vmcs_config.cpu_based_2nd_exec_ctrl &
SECONDARY_EXEC_ENABLE_EPT;
}
-static inline int cpu_has_vmx_unrestricted_guest(void)
+static inline bool cpu_has_vmx_unrestricted_guest(void)
{
return vmcs_config.cpu_based_2nd_exec_ctrl &
SECONDARY_EXEC_UNRESTRICTED_GUEST;
}
-static inline int cpu_has_vmx_ple(void)
+static inline bool cpu_has_vmx_ple(void)
{
return vmcs_config.cpu_based_2nd_exec_ctrl &
SECONDARY_EXEC_PAUSE_LOOP_EXITING;
}
-static inline int vm_need_virtualize_apic_accesses(struct kvm *kvm)
+static inline bool vm_need_virtualize_apic_accesses(struct kvm *kvm)
{
return flexpriority_enabled && irqchip_in_kernel(kvm);
}
-static inline int cpu_has_vmx_vpid(void)
+static inline bool cpu_has_vmx_vpid(void)
{
return vmcs_config.cpu_based_2nd_exec_ctrl &
SECONDARY_EXEC_ENABLE_VPID;
}
-static inline int cpu_has_vmx_rdtscp(void)
+static inline bool cpu_has_vmx_rdtscp(void)
{
return vmcs_config.cpu_based_2nd_exec_ctrl &
SECONDARY_EXEC_RDTSCP;
}
-static inline int cpu_has_virtual_nmis(void)
+static inline bool cpu_has_virtual_nmis(void)
{
return vmcs_config.pin_based_exec_ctrl & PIN_BASED_VIRTUAL_NMIS;
}
@@ -595,16 +603,56 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu)
vmcs_write32(EXCEPTION_BITMAP, eb);
}
+static void clear_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr)
+{
+ unsigned i;
+ struct msr_autoload *m = &vmx->msr_autoload;
+
+ for (i = 0; i < m->nr; ++i)
+ if (m->guest[i].index == msr)
+ break;
+
+ if (i == m->nr)
+ return;
+ --m->nr;
+ m->guest[i] = m->guest[m->nr];
+ m->host[i] = m->host[m->nr];
+ vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->nr);
+ vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->nr);
+}
+
+static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr,
+ u64 guest_val, u64 host_val)
+{
+ unsigned i;
+ struct msr_autoload *m = &vmx->msr_autoload;
+
+ for (i = 0; i < m->nr; ++i)
+ if (m->guest[i].index == msr)
+ break;
+
+ if (i == m->nr) {
+ ++m->nr;
+ vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->nr);
+ vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->nr);
+ }
+
+ m->guest[i].index = msr;
+ m->guest[i].value = guest_val;
+ m->host[i].index = msr;
+ m->host[i].value = host_val;
+}
+
static void reload_tss(void)
{
/*
* VT restores TR but not its size. Useless.
*/
- struct descriptor_table gdt;
+ struct desc_ptr gdt;
struct desc_struct *descs;
- kvm_get_gdt(&gdt);
- descs = (void *)gdt.base;
+ native_store_gdt(&gdt);
+ descs = (void *)gdt.address;
descs[GDT_ENTRY_TSS].type = 9; /* available TSS */
load_TR_desc();
}
@@ -631,9 +679,57 @@ static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset)
guest_efer |= host_efer & ignore_bits;
vmx->guest_msrs[efer_offset].data = guest_efer;
vmx->guest_msrs[efer_offset].mask = ~ignore_bits;
+
+ clear_atomic_switch_msr(vmx, MSR_EFER);
+ /* On ept, can't emulate nx, and must switch nx atomically */
+ if (enable_ept && ((vmx->vcpu.arch.efer ^ host_efer) & EFER_NX)) {
+ guest_efer = vmx->vcpu.arch.efer;
+ if (!(guest_efer & EFER_LMA))
+ guest_efer &= ~EFER_LME;
+ add_atomic_switch_msr(vmx, MSR_EFER, guest_efer, host_efer);
+ return false;
+ }
+
return true;
}
+static unsigned long segment_base(u16 selector)
+{
+ struct desc_ptr gdt;
+ struct desc_struct *d;
+ unsigned long table_base;
+ unsigned long v;
+
+ if (!(selector & ~3))
+ return 0;
+
+ native_store_gdt(&gdt);
+ table_base = gdt.address;
+
+ if (selector & 4) { /* from ldt */
+ u16 ldt_selector = kvm_read_ldt();
+
+ if (!(ldt_selector & ~3))
+ return 0;
+
+ table_base = segment_base(ldt_selector);
+ }
+ d = (struct desc_struct *)(table_base + (selector & ~7));
+ v = get_desc_base(d);
+#ifdef CONFIG_X86_64
+ if (d->s == 0 && (d->type == 2 || d->type == 9 || d->type == 11))
+ v |= ((unsigned long)((struct ldttss_desc64 *)d)->base3) << 32;
+#endif
+ return v;
+}
+
+static inline unsigned long kvm_read_tr_base(void)
+{
+ u16 tr;
+ asm("str %0" : "=g"(tr));
+ return segment_base(tr);
+}
+
static void vmx_save_host_state(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -758,7 +854,7 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
}
if (vcpu->cpu != cpu) {
- struct descriptor_table dt;
+ struct desc_ptr dt;
unsigned long sysenter_esp;
vcpu->cpu = cpu;
@@ -767,8 +863,8 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
* processors.
*/
vmcs_writel(HOST_TR_BASE, kvm_read_tr_base()); /* 22.2.4 */
- kvm_get_gdt(&dt);
- vmcs_writel(HOST_GDTR_BASE, dt.base); /* 22.2.4 */
+ native_store_gdt(&dt);
+ vmcs_writel(HOST_GDTR_BASE, dt.address); /* 22.2.4 */
rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp);
vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */
@@ -846,9 +942,9 @@ static u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
int ret = 0;
if (interruptibility & GUEST_INTR_STATE_STI)
- ret |= X86_SHADOW_INT_STI;
+ ret |= KVM_X86_SHADOW_INT_STI;
if (interruptibility & GUEST_INTR_STATE_MOV_SS)
- ret |= X86_SHADOW_INT_MOV_SS;
+ ret |= KVM_X86_SHADOW_INT_MOV_SS;
return ret & mask;
}
@@ -860,9 +956,9 @@ static void vmx_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
interruptibility &= ~(GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS);
- if (mask & X86_SHADOW_INT_MOV_SS)
+ if (mask & KVM_X86_SHADOW_INT_MOV_SS)
interruptibility |= GUEST_INTR_STATE_MOV_SS;
- if (mask & X86_SHADOW_INT_STI)
+ else if (mask & KVM_X86_SHADOW_INT_STI)
interruptibility |= GUEST_INTR_STATE_STI;
if ((interruptibility != interruptibility_old))
@@ -882,7 +978,8 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
}
static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
- bool has_error_code, u32 error_code)
+ bool has_error_code, u32 error_code,
+ bool reinject)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
u32 intr_info = nr | INTR_INFO_VALID_MASK;
@@ -1176,9 +1273,16 @@ static __init int vmx_disabled_by_bios(void)
u64 msr;
rdmsrl(MSR_IA32_FEATURE_CONTROL, msr);
- return (msr & (FEATURE_CONTROL_LOCKED |
- FEATURE_CONTROL_VMXON_ENABLED))
- == FEATURE_CONTROL_LOCKED;
+ if (msr & FEATURE_CONTROL_LOCKED) {
+ if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX)
+ && tboot_enabled())
+ return 1;
+ if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX)
+ && !tboot_enabled())
+ return 1;
+ }
+
+ return 0;
/* locked but not enabled */
}
@@ -1186,21 +1290,23 @@ static int hardware_enable(void *garbage)
{
int cpu = raw_smp_processor_id();
u64 phys_addr = __pa(per_cpu(vmxarea, cpu));
- u64 old;
+ u64 old, test_bits;
if (read_cr4() & X86_CR4_VMXE)
return -EBUSY;
INIT_LIST_HEAD(&per_cpu(vcpus_on_cpu, cpu));
rdmsrl(MSR_IA32_FEATURE_CONTROL, old);
- if ((old & (FEATURE_CONTROL_LOCKED |
- FEATURE_CONTROL_VMXON_ENABLED))
- != (FEATURE_CONTROL_LOCKED |
- FEATURE_CONTROL_VMXON_ENABLED))
+
+ test_bits = FEATURE_CONTROL_LOCKED;
+ test_bits |= FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX;
+ if (tboot_enabled())
+ test_bits |= FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX;
+
+ if ((old & test_bits) != test_bits) {
/* enable and lock */
- wrmsrl(MSR_IA32_FEATURE_CONTROL, old |
- FEATURE_CONTROL_LOCKED |
- FEATURE_CONTROL_VMXON_ENABLED);
+ wrmsrl(MSR_IA32_FEATURE_CONTROL, old | test_bits);
+ }
write_cr4(read_cr4() | X86_CR4_VMXE); /* FIXME: not cpu hotplug safe */
asm volatile (ASM_VMX_VMXON_RAX
: : "a"(&phys_addr), "m"(phys_addr)
@@ -1521,7 +1627,7 @@ static gva_t rmode_tss_base(struct kvm *kvm)
struct kvm_memslots *slots;
gfn_t base_gfn;
- slots = rcu_dereference(kvm->memslots);
+ slots = kvm_memslots(kvm);
base_gfn = kvm->memslots->memslots[0].base_gfn +
kvm->memslots->memslots[0].npages - 3;
return base_gfn << PAGE_SHIFT;
@@ -1649,6 +1755,7 @@ static void exit_lmode(struct kvm_vcpu *vcpu)
vmcs_write32(VM_ENTRY_CONTROLS,
vmcs_read32(VM_ENTRY_CONTROLS)
& ~VM_ENTRY_IA32E_MODE);
+ vmx_set_efer(vcpu, vcpu->arch.efer);
}
#endif
@@ -1934,28 +2041,28 @@ static void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
*l = (ar >> 13) & 1;
}
-static void vmx_get_idt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
+static void vmx_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
{
- dt->limit = vmcs_read32(GUEST_IDTR_LIMIT);
- dt->base = vmcs_readl(GUEST_IDTR_BASE);
+ dt->size = vmcs_read32(GUEST_IDTR_LIMIT);
+ dt->address = vmcs_readl(GUEST_IDTR_BASE);
}
-static void vmx_set_idt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
+static void vmx_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
{
- vmcs_write32(GUEST_IDTR_LIMIT, dt->limit);
- vmcs_writel(GUEST_IDTR_BASE, dt->base);
+ vmcs_write32(GUEST_IDTR_LIMIT, dt->size);
+ vmcs_writel(GUEST_IDTR_BASE, dt->address);
}
-static void vmx_get_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
+static void vmx_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
{
- dt->limit = vmcs_read32(GUEST_GDTR_LIMIT);
- dt->base = vmcs_readl(GUEST_GDTR_BASE);
+ dt->size = vmcs_read32(GUEST_GDTR_LIMIT);
+ dt->address = vmcs_readl(GUEST_GDTR_BASE);
}
-static void vmx_set_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
+static void vmx_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
{
- vmcs_write32(GUEST_GDTR_LIMIT, dt->limit);
- vmcs_writel(GUEST_GDTR_BASE, dt->base);
+ vmcs_write32(GUEST_GDTR_LIMIT, dt->size);
+ vmcs_writel(GUEST_GDTR_BASE, dt->address);
}
static bool rmode_segment_valid(struct kvm_vcpu *vcpu, int seg)
@@ -2296,6 +2403,16 @@ static void allocate_vpid(struct vcpu_vmx *vmx)
spin_unlock(&vmx_vpid_lock);
}
+static void free_vpid(struct vcpu_vmx *vmx)
+{
+ if (!enable_vpid)
+ return;
+ spin_lock(&vmx_vpid_lock);
+ if (vmx->vpid != 0)
+ __clear_bit(vmx->vpid, vmx_vpid_bitmap);
+ spin_unlock(&vmx_vpid_lock);
+}
+
static void __vmx_disable_intercept_for_msr(unsigned long *msr_bitmap, u32 msr)
{
int f = sizeof(unsigned long);
@@ -2334,7 +2451,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
u32 junk;
u64 host_pat, tsc_this, tsc_base;
unsigned long a;
- struct descriptor_table dt;
+ struct desc_ptr dt;
int i;
unsigned long kvm_vmx_return;
u32 exec_control;
@@ -2415,14 +2532,16 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8); /* 22.2.4 */
- kvm_get_idt(&dt);
- vmcs_writel(HOST_IDTR_BASE, dt.base); /* 22.2.4 */
+ native_store_idt(&dt);
+ vmcs_writel(HOST_IDTR_BASE, dt.address); /* 22.2.4 */
asm("mov $.Lkvm_vmx_return, %0" : "=r"(kvm_vmx_return));
vmcs_writel(HOST_RIP, kvm_vmx_return); /* 22.2.5 */
vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0);
vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0);
+ vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host));
vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0);
+ vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest));
rdmsr(MSR_IA32_SYSENTER_CS, host_sysenter_cs, junk);
vmcs_write32(HOST_IA32_SYSENTER_CS, host_sysenter_cs);
@@ -2947,22 +3066,20 @@ static int handle_io(struct kvm_vcpu *vcpu)
int size, in, string;
unsigned port;
- ++vcpu->stat.io_exits;
exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
string = (exit_qualification & 16) != 0;
+ in = (exit_qualification & 8) != 0;
- if (string) {
- if (emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DO_MMIO)
- return 0;
- return 1;
- }
+ ++vcpu->stat.io_exits;
- size = (exit_qualification & 7) + 1;
- in = (exit_qualification & 8) != 0;
- port = exit_qualification >> 16;
+ if (string || in)
+ return !(emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DO_MMIO);
+ port = exit_qualification >> 16;
+ size = (exit_qualification & 7) + 1;
skip_emulated_instruction(vcpu);
- return kvm_emulate_pio(vcpu, in, size, port);
+
+ return kvm_fast_pio_out(vcpu, size, port);
}
static void
@@ -3053,19 +3170,9 @@ static int handle_cr(struct kvm_vcpu *vcpu)
return 0;
}
-static int check_dr_alias(struct kvm_vcpu *vcpu)
-{
- if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) {
- kvm_queue_exception(vcpu, UD_VECTOR);
- return -1;
- }
- return 0;
-}
-
static int handle_dr(struct kvm_vcpu *vcpu)
{
unsigned long exit_qualification;
- unsigned long val;
int dr, reg;
/* Do not handle if the CPL > 0, will trigger GP on re-entry */
@@ -3100,67 +3207,20 @@ static int handle_dr(struct kvm_vcpu *vcpu)
dr = exit_qualification & DEBUG_REG_ACCESS_NUM;
reg = DEBUG_REG_ACCESS_REG(exit_qualification);
if (exit_qualification & TYPE_MOV_FROM_DR) {
- switch (dr) {
- case 0 ... 3:
- val = vcpu->arch.db[dr];
- break;
- case 4:
- if (check_dr_alias(vcpu) < 0)
- return 1;
- /* fall through */
- case 6:
- val = vcpu->arch.dr6;
- break;
- case 5:
- if (check_dr_alias(vcpu) < 0)
- return 1;
- /* fall through */
- default: /* 7 */
- val = vcpu->arch.dr7;
- break;
- }
- kvm_register_write(vcpu, reg, val);
- } else {
- val = vcpu->arch.regs[reg];
- switch (dr) {
- case 0 ... 3:
- vcpu->arch.db[dr] = val;
- if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP))
- vcpu->arch.eff_db[dr] = val;
- break;
- case 4:
- if (check_dr_alias(vcpu) < 0)
- return 1;
- /* fall through */
- case 6:
- if (val & 0xffffffff00000000ULL) {
- kvm_inject_gp(vcpu, 0);
- return 1;
- }
- vcpu->arch.dr6 = (val & DR6_VOLATILE) | DR6_FIXED_1;
- break;
- case 5:
- if (check_dr_alias(vcpu) < 0)
- return 1;
- /* fall through */
- default: /* 7 */
- if (val & 0xffffffff00000000ULL) {
- kvm_inject_gp(vcpu, 0);
- return 1;
- }
- vcpu->arch.dr7 = (val & DR7_VOLATILE) | DR7_FIXED_1;
- if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) {
- vmcs_writel(GUEST_DR7, vcpu->arch.dr7);
- vcpu->arch.switch_db_regs =
- (val & DR7_BP_EN_MASK);
- }
- break;
- }
- }
+ unsigned long val;
+ if (!kvm_get_dr(vcpu, dr, &val))
+ kvm_register_write(vcpu, reg, val);
+ } else
+ kvm_set_dr(vcpu, dr, vcpu->arch.regs[reg]);
skip_emulated_instruction(vcpu);
return 1;
}
+static void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val)
+{
+ vmcs_writel(GUEST_DR7, val);
+}
+
static int handle_cpuid(struct kvm_vcpu *vcpu)
{
kvm_emulate_cpuid(vcpu);
@@ -3292,6 +3352,8 @@ static int handle_task_switch(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
unsigned long exit_qualification;
+ bool has_error_code = false;
+ u32 error_code = 0;
u16 tss_selector;
int reason, type, idt_v;
@@ -3314,6 +3376,13 @@ static int handle_task_switch(struct kvm_vcpu *vcpu)
kvm_clear_interrupt_queue(vcpu);
break;
case INTR_TYPE_HARD_EXCEPTION:
+ if (vmx->idt_vectoring_info &
+ VECTORING_INFO_DELIVER_CODE_MASK) {
+ has_error_code = true;
+ error_code =
+ vmcs_read32(IDT_VECTORING_ERROR_CODE);
+ }
+ /* fall through */
case INTR_TYPE_SOFT_EXCEPTION:
kvm_clear_exception_queue(vcpu);
break;
@@ -3328,8 +3397,13 @@ static int handle_task_switch(struct kvm_vcpu *vcpu)
type != INTR_TYPE_NMI_INTR))
skip_emulated_instruction(vcpu);
- if (!kvm_task_switch(vcpu, tss_selector, reason))
+ if (kvm_task_switch(vcpu, tss_selector, reason,
+ has_error_code, error_code) == EMULATE_FAIL) {
+ vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+ vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
+ vcpu->run->internal.ndata = 0;
return 0;
+ }
/* clear all local breakpoint enable flags */
vmcs_writel(GUEST_DR7, vmcs_readl(GUEST_DR7) & ~55);
@@ -3574,7 +3648,7 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
u32 exit_reason = vmx->exit_reason;
u32 vectoring_info = vmx->idt_vectoring_info;
- trace_kvm_exit(exit_reason, kvm_rip_read(vcpu));
+ trace_kvm_exit(exit_reason, vcpu);
/* If guest state is invalid, start emulating */
if (vmx->emulation_required && emulate_invalid_guest_state)
@@ -3923,10 +3997,7 @@ static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
- spin_lock(&vmx_vpid_lock);
- if (vmx->vpid != 0)
- __clear_bit(vmx->vpid, vmx_vpid_bitmap);
- spin_unlock(&vmx_vpid_lock);
+ free_vpid(vmx);
vmx_free_vmcs(vcpu);
kfree(vmx->guest_msrs);
kvm_vcpu_uninit(vcpu);
@@ -3988,6 +4059,7 @@ free_msrs:
uninit_vcpu:
kvm_vcpu_uninit(&vmx->vcpu);
free_vcpu:
+ free_vpid(vmx);
kmem_cache_free(kvm_vcpu_cache, vmx);
return ERR_PTR(err);
}
@@ -4118,6 +4190,10 @@ static void vmx_cpuid_update(struct kvm_vcpu *vcpu)
}
}
+static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)
+{
+}
+
static struct kvm_x86_ops vmx_x86_ops = {
.cpu_has_kvm_support = cpu_has_kvm_support,
.disabled_by_bios = vmx_disabled_by_bios,
@@ -4154,6 +4230,7 @@ static struct kvm_x86_ops vmx_x86_ops = {
.set_idt = vmx_set_idt,
.get_gdt = vmx_get_gdt,
.set_gdt = vmx_set_gdt,
+ .set_dr7 = vmx_set_dr7,
.cache_reg = vmx_cache_reg,
.get_rflags = vmx_get_rflags,
.set_rflags = vmx_set_rflags,
@@ -4189,6 +4266,8 @@ static struct kvm_x86_ops vmx_x86_ops = {
.cpuid_update = vmx_cpuid_update,
.rdtscp_supported = vmx_rdtscp_supported,
+
+ .set_supported_cpuid = vmx_set_supported_cpuid,
};
static int __init vmx_init(void)
@@ -4236,7 +4315,8 @@ static int __init vmx_init(void)
set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */
- r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx), THIS_MODULE);
+ r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx),
+ __alignof__(struct vcpu_vmx), THIS_MODULE);
if (r)
goto out3;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index dd9bc8f..05d571f 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -42,7 +42,7 @@
#include <linux/slab.h>
#include <linux/perf_event.h>
#include <trace/events/kvm.h>
-#undef TRACE_INCLUDE_FILE
+
#define CREATE_TRACE_POINTS
#include "trace.h"
@@ -224,34 +224,6 @@ static void drop_user_return_notifiers(void *ignore)
kvm_on_user_return(&smsr->urn);
}
-unsigned long segment_base(u16 selector)
-{
- struct descriptor_table gdt;
- struct desc_struct *d;
- unsigned long table_base;
- unsigned long v;
-
- if (selector == 0)
- return 0;
-
- kvm_get_gdt(&gdt);
- table_base = gdt.base;
-
- if (selector & 4) { /* from ldt */
- u16 ldt_selector = kvm_read_ldt();
-
- table_base = segment_base(ldt_selector);
- }
- d = (struct desc_struct *)(table_base + (selector & ~7));
- v = get_desc_base(d);
-#ifdef CONFIG_X86_64
- if (d->s == 0 && (d->type == 2 || d->type == 9 || d->type == 11))
- v |= ((unsigned long)((struct ldttss_desc64 *)d)->base3) << 32;
-#endif
- return v;
-}
-EXPORT_SYMBOL_GPL(segment_base);
-
u64 kvm_get_apic_base(struct kvm_vcpu *vcpu)
{
if (irqchip_in_kernel(vcpu->kvm))
@@ -293,7 +265,8 @@ static int exception_class(int vector)
}
static void kvm_multiple_exception(struct kvm_vcpu *vcpu,
- unsigned nr, bool has_error, u32 error_code)
+ unsigned nr, bool has_error, u32 error_code,
+ bool reinject)
{
u32 prev_nr;
int class1, class2;
@@ -304,6 +277,7 @@ static void kvm_multiple_exception(struct kvm_vcpu *vcpu,
vcpu->arch.exception.has_error_code = has_error;
vcpu->arch.exception.nr = nr;
vcpu->arch.exception.error_code = error_code;
+ vcpu->arch.exception.reinject = reinject;
return;
}
@@ -332,10 +306,16 @@ static void kvm_multiple_exception(struct kvm_vcpu *vcpu,
void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr)
{
- kvm_multiple_exception(vcpu, nr, false, 0);
+ kvm_multiple_exception(vcpu, nr, false, 0, false);
}
EXPORT_SYMBOL_GPL(kvm_queue_exception);
+void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned nr)
+{
+ kvm_multiple_exception(vcpu, nr, false, 0, true);
+}
+EXPORT_SYMBOL_GPL(kvm_requeue_exception);
+
void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long addr,
u32 error_code)
{
@@ -352,10 +332,16 @@ EXPORT_SYMBOL_GPL(kvm_inject_nmi);
void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code)
{
- kvm_multiple_exception(vcpu, nr, true, error_code);
+ kvm_multiple_exception(vcpu, nr, true, error_code, false);
}
EXPORT_SYMBOL_GPL(kvm_queue_exception_e);
+void kvm_requeue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code)
+{
+ kvm_multiple_exception(vcpu, nr, true, error_code, true);
+}
+EXPORT_SYMBOL_GPL(kvm_requeue_exception_e);
+
/*
* Checks if cpl <= required_cpl; if true, return true. Otherwise queue
* a #GP and return false.
@@ -476,7 +462,6 @@ void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
}
kvm_x86_ops->set_cr0(vcpu, cr0);
- vcpu->arch.cr0 = cr0;
kvm_mmu_reset_context(vcpu);
return;
@@ -485,7 +470,7 @@ EXPORT_SYMBOL_GPL(kvm_set_cr0);
void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw)
{
- kvm_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~0x0ful) | (msw & 0x0f));
+ kvm_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~0x0eul) | (msw & 0x0f));
}
EXPORT_SYMBOL_GPL(kvm_lmsw);
@@ -517,7 +502,6 @@ void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
}
kvm_x86_ops->set_cr4(vcpu, cr4);
vcpu->arch.cr4 = cr4;
- vcpu->arch.mmu.base_role.cr4_pge = (cr4 & X86_CR4_PGE) && !tdp_enabled;
kvm_mmu_reset_context(vcpu);
}
EXPORT_SYMBOL_GPL(kvm_set_cr4);
@@ -592,6 +576,80 @@ unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu)
}
EXPORT_SYMBOL_GPL(kvm_get_cr8);
+int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
+{
+ switch (dr) {
+ case 0 ... 3:
+ vcpu->arch.db[dr] = val;
+ if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP))
+ vcpu->arch.eff_db[dr] = val;
+ break;
+ case 4:
+ if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) {
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ return 1;
+ }
+ /* fall through */
+ case 6:
+ if (val & 0xffffffff00000000ULL) {
+ kvm_inject_gp(vcpu, 0);
+ return 1;
+ }
+ vcpu->arch.dr6 = (val & DR6_VOLATILE) | DR6_FIXED_1;
+ break;
+ case 5:
+ if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) {
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ return 1;
+ }
+ /* fall through */
+ default: /* 7 */
+ if (val & 0xffffffff00000000ULL) {
+ kvm_inject_gp(vcpu, 0);
+ return 1;
+ }
+ vcpu->arch.dr7 = (val & DR7_VOLATILE) | DR7_FIXED_1;
+ if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) {
+ kvm_x86_ops->set_dr7(vcpu, vcpu->arch.dr7);
+ vcpu->arch.switch_db_regs = (val & DR7_BP_EN_MASK);
+ }
+ break;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(kvm_set_dr);
+
+int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val)
+{
+ switch (dr) {
+ case 0 ... 3:
+ *val = vcpu->arch.db[dr];
+ break;
+ case 4:
+ if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) {
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ return 1;
+ }
+ /* fall through */
+ case 6:
+ *val = vcpu->arch.dr6;
+ break;
+ case 5:
+ if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) {
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ return 1;
+ }
+ /* fall through */
+ default: /* 7 */
+ *val = vcpu->arch.dr7;
+ break;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(kvm_get_dr);
+
static inline u32 bit(int bitno)
{
return 1 << (bitno & 31);
@@ -606,9 +664,10 @@ static inline u32 bit(int bitno)
* kvm-specific. Those are put in the beginning of the list.
*/
-#define KVM_SAVE_MSRS_BEGIN 5
+#define KVM_SAVE_MSRS_BEGIN 7
static u32 msrs_to_save[] = {
MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
+ MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW,
HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL,
HV_X64_MSR_APIC_ASSIST_PAGE,
MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
@@ -625,48 +684,42 @@ static u32 emulated_msrs[] = {
MSR_IA32_MISC_ENABLE,
};
-static void set_efer(struct kvm_vcpu *vcpu, u64 efer)
+static int set_efer(struct kvm_vcpu *vcpu, u64 efer)
{
- if (efer & efer_reserved_bits) {
- kvm_inject_gp(vcpu, 0);
- return;
- }
+ if (efer & efer_reserved_bits)
+ return 1;
if (is_paging(vcpu)
- && (vcpu->arch.efer & EFER_LME) != (efer & EFER_LME)) {
- kvm_inject_gp(vcpu, 0);
- return;
- }
+ && (vcpu->arch.efer & EFER_LME) != (efer & EFER_LME))
+ return 1;
if (efer & EFER_FFXSR) {
struct kvm_cpuid_entry2 *feat;
feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0);
- if (!feat || !(feat->edx & bit(X86_FEATURE_FXSR_OPT))) {
- kvm_inject_gp(vcpu, 0);
- return;
- }
+ if (!feat || !(feat->edx & bit(X86_FEATURE_FXSR_OPT)))
+ return 1;
}
if (efer & EFER_SVME) {
struct kvm_cpuid_entry2 *feat;
feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0);
- if (!feat || !(feat->ecx & bit(X86_FEATURE_SVM))) {
- kvm_inject_gp(vcpu, 0);
- return;
- }
+ if (!feat || !(feat->ecx & bit(X86_FEATURE_SVM)))
+ return 1;
}
- kvm_x86_ops->set_efer(vcpu, efer);
-
efer &= ~EFER_LMA;
efer |= vcpu->arch.efer & EFER_LMA;
+ kvm_x86_ops->set_efer(vcpu, efer);
+
vcpu->arch.efer = efer;
vcpu->arch.mmu.base_role.nxe = (efer & EFER_NX) && !tdp_enabled;
kvm_mmu_reset_context(vcpu);
+
+ return 0;
}
void kvm_enable_efer_bits(u64 mask)
@@ -696,14 +749,22 @@ static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
{
- static int version;
+ int version;
+ int r;
struct pvclock_wall_clock wc;
struct timespec boot;
if (!wall_clock)
return;
- version++;
+ r = kvm_read_guest(kvm, wall_clock, &version, sizeof(version));
+ if (r)
+ return;
+
+ if (version & 1)
+ ++version; /* first time write, random junk */
+
+ ++version;
kvm_write_guest(kvm, wall_clock, &version, sizeof(version));
@@ -796,6 +857,8 @@ static void kvm_write_guest_time(struct kvm_vcpu *v)
vcpu->hv_clock.system_time = ts.tv_nsec +
(NSEC_PER_SEC * (u64)ts.tv_sec) + v->kvm->arch.kvmclock_offset;
+ vcpu->hv_clock.flags = 0;
+
/*
* The interface expects us to write an even number signaling that the
* update is finished. Since the guest won't see the intermediate
@@ -1087,10 +1150,10 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
{
switch (msr) {
case MSR_EFER:
- set_efer(vcpu, data);
- break;
+ return set_efer(vcpu, data);
case MSR_K7_HWCR:
data &= ~(u64)0x40; /* ignore flush filter disable */
+ data &= ~(u64)0x100; /* ignore ignne emulation enable */
if (data != 0) {
pr_unimpl(vcpu, "unimplemented HWCR wrmsr: 0x%llx\n",
data);
@@ -1133,10 +1196,12 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
case MSR_IA32_MISC_ENABLE:
vcpu->arch.ia32_misc_enable_msr = data;
break;
+ case MSR_KVM_WALL_CLOCK_NEW:
case MSR_KVM_WALL_CLOCK:
vcpu->kvm->arch.wall_clock = data;
kvm_write_wall_clock(vcpu->kvm, data);
break;
+ case MSR_KVM_SYSTEM_TIME_NEW:
case MSR_KVM_SYSTEM_TIME: {
if (vcpu->arch.time_page) {
kvm_release_page_dirty(vcpu->arch.time_page);
@@ -1408,9 +1473,11 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
data = vcpu->arch.efer;
break;
case MSR_KVM_WALL_CLOCK:
+ case MSR_KVM_WALL_CLOCK_NEW:
data = vcpu->kvm->arch.wall_clock;
break;
case MSR_KVM_SYSTEM_TIME:
+ case MSR_KVM_SYSTEM_TIME_NEW:
data = vcpu->arch.time;
break;
case MSR_IA32_P5_MC_ADDR:
@@ -1549,6 +1616,7 @@ int kvm_dev_ioctl_check_extension(long ext)
case KVM_CAP_HYPERV_VAPIC:
case KVM_CAP_HYPERV_SPIN:
case KVM_CAP_PCI_SEGMENT:
+ case KVM_CAP_DEBUGREGS:
case KVM_CAP_X86_ROBUST_SINGLESTEP:
r = 1;
break;
@@ -1769,6 +1837,7 @@ static int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu,
{
int r;
+ vcpu_load(vcpu);
r = -E2BIG;
if (cpuid->nent < vcpu->arch.cpuid_nent)
goto out;
@@ -1780,6 +1849,7 @@ static int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu,
out:
cpuid->nent = vcpu->arch.cpuid_nent;
+ vcpu_put(vcpu);
return r;
}
@@ -1910,6 +1980,24 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
}
break;
}
+ case KVM_CPUID_SIGNATURE: {
+ char signature[12] = "KVMKVMKVM\0\0";
+ u32 *sigptr = (u32 *)signature;
+ entry->eax = 0;
+ entry->ebx = sigptr[0];
+ entry->ecx = sigptr[1];
+ entry->edx = sigptr[2];
+ break;
+ }
+ case KVM_CPUID_FEATURES:
+ entry->eax = (1 << KVM_FEATURE_CLOCKSOURCE) |
+ (1 << KVM_FEATURE_NOP_IO_DELAY) |
+ (1 << KVM_FEATURE_CLOCKSOURCE2) |
+ (1 << KVM_FEATURE_CLOCKSOURCE_STABLE_BIT);
+ entry->ebx = 0;
+ entry->ecx = 0;
+ entry->edx = 0;
+ break;
case 0x80000000:
entry->eax = min(entry->eax, 0x8000001a);
break;
@@ -1918,6 +2006,9 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
entry->ecx &= kvm_supported_word6_x86_features;
break;
}
+
+ kvm_x86_ops->set_supported_cpuid(function, entry);
+
put_cpu();
}
@@ -1953,6 +2044,23 @@ static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid,
for (func = 0x80000001; func <= limit && nent < cpuid->nent; ++func)
do_cpuid_ent(&cpuid_entries[nent], func, 0,
&nent, cpuid->nent);
+
+
+
+ r = -E2BIG;
+ if (nent >= cpuid->nent)
+ goto out_free;
+
+ do_cpuid_ent(&cpuid_entries[nent], KVM_CPUID_SIGNATURE, 0, &nent,
+ cpuid->nent);
+
+ r = -E2BIG;
+ if (nent >= cpuid->nent)
+ goto out_free;
+
+ do_cpuid_ent(&cpuid_entries[nent], KVM_CPUID_FEATURES, 0, &nent,
+ cpuid->nent);
+
r = -E2BIG;
if (nent >= cpuid->nent)
goto out_free;
@@ -2032,6 +2140,7 @@ static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu,
int r;
unsigned bank_num = mcg_cap & 0xff, bank;
+ vcpu_load(vcpu);
r = -EINVAL;
if (!bank_num || bank_num >= KVM_MAX_MCE_BANKS)
goto out;
@@ -2046,6 +2155,7 @@ static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu,
for (bank = 0; bank < bank_num; bank++)
vcpu->arch.mce_banks[bank*4] = ~(u64)0;
out:
+ vcpu_put(vcpu);
return r;
}
@@ -2105,14 +2215,20 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
{
vcpu_load(vcpu);
- events->exception.injected = vcpu->arch.exception.pending;
+ events->exception.injected =
+ vcpu->arch.exception.pending &&
+ !kvm_exception_is_soft(vcpu->arch.exception.nr);
events->exception.nr = vcpu->arch.exception.nr;
events->exception.has_error_code = vcpu->arch.exception.has_error_code;
events->exception.error_code = vcpu->arch.exception.error_code;
- events->interrupt.injected = vcpu->arch.interrupt.pending;
+ events->interrupt.injected =
+ vcpu->arch.interrupt.pending && !vcpu->arch.interrupt.soft;
events->interrupt.nr = vcpu->arch.interrupt.nr;
- events->interrupt.soft = vcpu->arch.interrupt.soft;
+ events->interrupt.soft = 0;
+ events->interrupt.shadow =
+ kvm_x86_ops->get_interrupt_shadow(vcpu,
+ KVM_X86_SHADOW_INT_MOV_SS | KVM_X86_SHADOW_INT_STI);
events->nmi.injected = vcpu->arch.nmi_injected;
events->nmi.pending = vcpu->arch.nmi_pending;
@@ -2121,7 +2237,8 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
events->sipi_vector = vcpu->arch.sipi_vector;
events->flags = (KVM_VCPUEVENT_VALID_NMI_PENDING
- | KVM_VCPUEVENT_VALID_SIPI_VECTOR);
+ | KVM_VCPUEVENT_VALID_SIPI_VECTOR
+ | KVM_VCPUEVENT_VALID_SHADOW);
vcpu_put(vcpu);
}
@@ -2130,7 +2247,8 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
struct kvm_vcpu_events *events)
{
if (events->flags & ~(KVM_VCPUEVENT_VALID_NMI_PENDING
- | KVM_VCPUEVENT_VALID_SIPI_VECTOR))
+ | KVM_VCPUEVENT_VALID_SIPI_VECTOR
+ | KVM_VCPUEVENT_VALID_SHADOW))
return -EINVAL;
vcpu_load(vcpu);
@@ -2145,6 +2263,9 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
vcpu->arch.interrupt.soft = events->interrupt.soft;
if (vcpu->arch.interrupt.pending && irqchip_in_kernel(vcpu->kvm))
kvm_pic_clear_isr_ack(vcpu->kvm);
+ if (events->flags & KVM_VCPUEVENT_VALID_SHADOW)
+ kvm_x86_ops->set_interrupt_shadow(vcpu,
+ events->interrupt.shadow);
vcpu->arch.nmi_injected = events->nmi.injected;
if (events->flags & KVM_VCPUEVENT_VALID_NMI_PENDING)
@@ -2159,6 +2280,36 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
return 0;
}
+static void kvm_vcpu_ioctl_x86_get_debugregs(struct kvm_vcpu *vcpu,
+ struct kvm_debugregs *dbgregs)
+{
+ vcpu_load(vcpu);
+
+ memcpy(dbgregs->db, vcpu->arch.db, sizeof(vcpu->arch.db));
+ dbgregs->dr6 = vcpu->arch.dr6;
+ dbgregs->dr7 = vcpu->arch.dr7;
+ dbgregs->flags = 0;
+
+ vcpu_put(vcpu);
+}
+
+static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu,
+ struct kvm_debugregs *dbgregs)
+{
+ if (dbgregs->flags)
+ return -EINVAL;
+
+ vcpu_load(vcpu);
+
+ memcpy(vcpu->arch.db, dbgregs->db, sizeof(vcpu->arch.db));
+ vcpu->arch.dr6 = dbgregs->dr6;
+ vcpu->arch.dr7 = dbgregs->dr7;
+
+ vcpu_put(vcpu);
+
+ return 0;
+}
+
long kvm_arch_vcpu_ioctl(struct file *filp,
unsigned int ioctl, unsigned long arg)
{
@@ -2313,7 +2464,9 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
r = -EFAULT;
if (copy_from_user(&mce, argp, sizeof mce))
goto out;
+ vcpu_load(vcpu);
r = kvm_vcpu_ioctl_x86_set_mce(vcpu, &mce);
+ vcpu_put(vcpu);
break;
}
case KVM_GET_VCPU_EVENTS: {
@@ -2337,6 +2490,29 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
r = kvm_vcpu_ioctl_x86_set_vcpu_events(vcpu, &events);
break;
}
+ case KVM_GET_DEBUGREGS: {
+ struct kvm_debugregs dbgregs;
+
+ kvm_vcpu_ioctl_x86_get_debugregs(vcpu, &dbgregs);
+
+ r = -EFAULT;
+ if (copy_to_user(argp, &dbgregs,
+ sizeof(struct kvm_debugregs)))
+ break;
+ r = 0;
+ break;
+ }
+ case KVM_SET_DEBUGREGS: {
+ struct kvm_debugregs dbgregs;
+
+ r = -EFAULT;
+ if (copy_from_user(&dbgregs, argp,
+ sizeof(struct kvm_debugregs)))
+ break;
+
+ r = kvm_vcpu_ioctl_x86_set_debugregs(vcpu, &dbgregs);
+ break;
+ }
default:
r = -EINVAL;
}
@@ -2390,7 +2566,7 @@ gfn_t unalias_gfn_instantiation(struct kvm *kvm, gfn_t gfn)
struct kvm_mem_alias *alias;
struct kvm_mem_aliases *aliases;
- aliases = rcu_dereference(kvm->arch.aliases);
+ aliases = kvm_aliases(kvm);
for (i = 0; i < aliases->naliases; ++i) {
alias = &aliases->aliases[i];
@@ -2409,7 +2585,7 @@ gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn)
struct kvm_mem_alias *alias;
struct kvm_mem_aliases *aliases;
- aliases = rcu_dereference(kvm->arch.aliases);
+ aliases = kvm_aliases(kvm);
for (i = 0; i < aliases->naliases; ++i) {
alias = &aliases->aliases[i];
@@ -2804,11 +2980,13 @@ long kvm_arch_vm_ioctl(struct file *filp,
r = -EFAULT;
if (copy_from_user(&irq_event, argp, sizeof irq_event))
goto out;
+ r = -ENXIO;
if (irqchip_in_kernel(kvm)) {
__s32 status;
status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID,
irq_event.irq, irq_event.level);
if (ioctl == KVM_IRQ_LINE_STATUS) {
+ r = -EFAULT;
irq_event.status = status;
if (copy_to_user(argp, &irq_event,
sizeof irq_event))
@@ -3024,6 +3202,18 @@ static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v)
return kvm_io_bus_read(vcpu->kvm, KVM_MMIO_BUS, addr, len, v);
}
+static void kvm_set_segment(struct kvm_vcpu *vcpu,
+ struct kvm_segment *var, int seg)
+{
+ kvm_x86_ops->set_segment(vcpu, var, seg);
+}
+
+void kvm_get_segment(struct kvm_vcpu *vcpu,
+ struct kvm_segment *var, int seg)
+{
+ kvm_x86_ops->get_segment(vcpu, var, seg);
+}
+
gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva, u32 *error)
{
u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
@@ -3104,14 +3294,17 @@ static int kvm_read_guest_virt_system(gva_t addr, void *val, unsigned int bytes,
return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, 0, error);
}
-static int kvm_write_guest_virt(gva_t addr, void *val, unsigned int bytes,
- struct kvm_vcpu *vcpu, u32 *error)
+static int kvm_write_guest_virt_system(gva_t addr, void *val,
+ unsigned int bytes,
+ struct kvm_vcpu *vcpu,
+ u32 *error)
{
void *data = val;
int r = X86EMUL_CONTINUE;
while (bytes) {
- gpa_t gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, error);
+ gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr,
+ PFERR_WRITE_MASK, error);
unsigned offset = addr & (PAGE_SIZE-1);
unsigned towrite = min(bytes, (unsigned)PAGE_SIZE - offset);
int ret;
@@ -3134,7 +3327,6 @@ out:
return r;
}
-
static int emulator_read_emulated(unsigned long addr,
void *val,
unsigned int bytes,
@@ -3237,9 +3429,9 @@ mmio:
}
int emulator_write_emulated(unsigned long addr,
- const void *val,
- unsigned int bytes,
- struct kvm_vcpu *vcpu)
+ const void *val,
+ unsigned int bytes,
+ struct kvm_vcpu *vcpu)
{
/* Crossing a page boundary? */
if (((addr + bytes - 1) ^ addr) & PAGE_MASK) {
@@ -3257,45 +3449,150 @@ int emulator_write_emulated(unsigned long addr,
}
EXPORT_SYMBOL_GPL(emulator_write_emulated);
+#define CMPXCHG_TYPE(t, ptr, old, new) \
+ (cmpxchg((t *)(ptr), *(t *)(old), *(t *)(new)) == *(t *)(old))
+
+#ifdef CONFIG_X86_64
+# define CMPXCHG64(ptr, old, new) CMPXCHG_TYPE(u64, ptr, old, new)
+#else
+# define CMPXCHG64(ptr, old, new) \
+ (cmpxchg64((u64 *)(ptr), *(u64 *)(old), *(u64 *)(new)) == *(u64 *)(old))
+#endif
+
static int emulator_cmpxchg_emulated(unsigned long addr,
const void *old,
const void *new,
unsigned int bytes,
struct kvm_vcpu *vcpu)
{
- printk_once(KERN_WARNING "kvm: emulating exchange as write\n");
-#ifndef CONFIG_X86_64
- /* guests cmpxchg8b have to be emulated atomically */
- if (bytes == 8) {
- gpa_t gpa;
- struct page *page;
- char *kaddr;
- u64 val;
+ gpa_t gpa;
+ struct page *page;
+ char *kaddr;
+ bool exchanged;
- gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, NULL);
+ /* guests cmpxchg8b have to be emulated atomically */
+ if (bytes > 8 || (bytes & (bytes - 1)))
+ goto emul_write;
- if (gpa == UNMAPPED_GVA ||
- (gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
- goto emul_write;
+ gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, NULL);
- if (((gpa + bytes - 1) & PAGE_MASK) != (gpa & PAGE_MASK))
- goto emul_write;
+ if (gpa == UNMAPPED_GVA ||
+ (gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
+ goto emul_write;
- val = *(u64 *)new;
+ if (((gpa + bytes - 1) & PAGE_MASK) != (gpa & PAGE_MASK))
+ goto emul_write;
- page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
+ page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
- kaddr = kmap_atomic(page, KM_USER0);
- set_64bit((u64 *)(kaddr + offset_in_page(gpa)), val);
- kunmap_atomic(kaddr, KM_USER0);
- kvm_release_page_dirty(page);
+ kaddr = kmap_atomic(page, KM_USER0);
+ kaddr += offset_in_page(gpa);
+ switch (bytes) {
+ case 1:
+ exchanged = CMPXCHG_TYPE(u8, kaddr, old, new);
+ break;
+ case 2:
+ exchanged = CMPXCHG_TYPE(u16, kaddr, old, new);
+ break;
+ case 4:
+ exchanged = CMPXCHG_TYPE(u32, kaddr, old, new);
+ break;
+ case 8:
+ exchanged = CMPXCHG64(kaddr, old, new);
+ break;
+ default:
+ BUG();
}
+ kunmap_atomic(kaddr, KM_USER0);
+ kvm_release_page_dirty(page);
+
+ if (!exchanged)
+ return X86EMUL_CMPXCHG_FAILED;
+
+ kvm_mmu_pte_write(vcpu, gpa, new, bytes, 1);
+
+ return X86EMUL_CONTINUE;
+
emul_write:
-#endif
+ printk_once(KERN_WARNING "kvm: emulating exchange as write\n");
return emulator_write_emulated(addr, new, bytes, vcpu);
}
+static int kernel_pio(struct kvm_vcpu *vcpu, void *pd)
+{
+ /* TODO: String I/O for in kernel device */
+ int r;
+
+ if (vcpu->arch.pio.in)
+ r = kvm_io_bus_read(vcpu->kvm, KVM_PIO_BUS, vcpu->arch.pio.port,
+ vcpu->arch.pio.size, pd);
+ else
+ r = kvm_io_bus_write(vcpu->kvm, KVM_PIO_BUS,
+ vcpu->arch.pio.port, vcpu->arch.pio.size,
+ pd);
+ return r;
+}
+
+
+static int emulator_pio_in_emulated(int size, unsigned short port, void *val,
+ unsigned int count, struct kvm_vcpu *vcpu)
+{
+ if (vcpu->arch.pio.count)
+ goto data_avail;
+
+ trace_kvm_pio(1, port, size, 1);
+
+ vcpu->arch.pio.port = port;
+ vcpu->arch.pio.in = 1;
+ vcpu->arch.pio.count = count;
+ vcpu->arch.pio.size = size;
+
+ if (!kernel_pio(vcpu, vcpu->arch.pio_data)) {
+ data_avail:
+ memcpy(val, vcpu->arch.pio_data, size * count);
+ vcpu->arch.pio.count = 0;
+ return 1;
+ }
+
+ vcpu->run->exit_reason = KVM_EXIT_IO;
+ vcpu->run->io.direction = KVM_EXIT_IO_IN;
+ vcpu->run->io.size = size;
+ vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE;
+ vcpu->run->io.count = count;
+ vcpu->run->io.port = port;
+
+ return 0;
+}
+
+static int emulator_pio_out_emulated(int size, unsigned short port,
+ const void *val, unsigned int count,
+ struct kvm_vcpu *vcpu)
+{
+ trace_kvm_pio(0, port, size, 1);
+
+ vcpu->arch.pio.port = port;
+ vcpu->arch.pio.in = 0;
+ vcpu->arch.pio.count = count;
+ vcpu->arch.pio.size = size;
+
+ memcpy(vcpu->arch.pio_data, val, size * count);
+
+ if (!kernel_pio(vcpu, vcpu->arch.pio_data)) {
+ vcpu->arch.pio.count = 0;
+ return 1;
+ }
+
+ vcpu->run->exit_reason = KVM_EXIT_IO;
+ vcpu->run->io.direction = KVM_EXIT_IO_OUT;
+ vcpu->run->io.size = size;
+ vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE;
+ vcpu->run->io.count = count;
+ vcpu->run->io.port = port;
+
+ return 0;
+}
+
static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg)
{
return kvm_x86_ops->get_segment_base(vcpu, seg);
@@ -3316,14 +3613,14 @@ int emulate_clts(struct kvm_vcpu *vcpu)
int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long *dest)
{
- return kvm_x86_ops->get_dr(ctxt->vcpu, dr, dest);
+ return kvm_get_dr(ctxt->vcpu, dr, dest);
}
int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value)
{
unsigned long mask = (ctxt->mode == X86EMUL_MODE_PROT64) ? ~0ULL : ~0U;
- return kvm_x86_ops->set_dr(ctxt->vcpu, dr, value & mask);
+ return kvm_set_dr(ctxt->vcpu, dr, value & mask);
}
void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context)
@@ -3344,12 +3641,167 @@ void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context)
}
EXPORT_SYMBOL_GPL(kvm_report_emulation_failure);
+static u64 mk_cr_64(u64 curr_cr, u32 new_val)
+{
+ return (curr_cr & ~((1ULL << 32) - 1)) | new_val;
+}
+
+static unsigned long emulator_get_cr(int cr, struct kvm_vcpu *vcpu)
+{
+ unsigned long value;
+
+ switch (cr) {
+ case 0:
+ value = kvm_read_cr0(vcpu);
+ break;
+ case 2:
+ value = vcpu->arch.cr2;
+ break;
+ case 3:
+ value = vcpu->arch.cr3;
+ break;
+ case 4:
+ value = kvm_read_cr4(vcpu);
+ break;
+ case 8:
+ value = kvm_get_cr8(vcpu);
+ break;
+ default:
+ vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr);
+ return 0;
+ }
+
+ return value;
+}
+
+static void emulator_set_cr(int cr, unsigned long val, struct kvm_vcpu *vcpu)
+{
+ switch (cr) {
+ case 0:
+ kvm_set_cr0(vcpu, mk_cr_64(kvm_read_cr0(vcpu), val));
+ break;
+ case 2:
+ vcpu->arch.cr2 = val;
+ break;
+ case 3:
+ kvm_set_cr3(vcpu, val);
+ break;
+ case 4:
+ kvm_set_cr4(vcpu, mk_cr_64(kvm_read_cr4(vcpu), val));
+ break;
+ case 8:
+ kvm_set_cr8(vcpu, val & 0xfUL);
+ break;
+ default:
+ vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr);
+ }
+}
+
+static int emulator_get_cpl(struct kvm_vcpu *vcpu)
+{
+ return kvm_x86_ops->get_cpl(vcpu);
+}
+
+static void emulator_get_gdt(struct desc_ptr *dt, struct kvm_vcpu *vcpu)
+{
+ kvm_x86_ops->get_gdt(vcpu, dt);
+}
+
+static bool emulator_get_cached_descriptor(struct desc_struct *desc, int seg,
+ struct kvm_vcpu *vcpu)
+{
+ struct kvm_segment var;
+
+ kvm_get_segment(vcpu, &var, seg);
+
+ if (var.unusable)
+ return false;
+
+ if (var.g)
+ var.limit >>= 12;
+ set_desc_limit(desc, var.limit);
+ set_desc_base(desc, (unsigned long)var.base);
+ desc->type = var.type;
+ desc->s = var.s;
+ desc->dpl = var.dpl;
+ desc->p = var.present;
+ desc->avl = var.avl;
+ desc->l = var.l;
+ desc->d = var.db;
+ desc->g = var.g;
+
+ return true;
+}
+
+static void emulator_set_cached_descriptor(struct desc_struct *desc, int seg,
+ struct kvm_vcpu *vcpu)
+{
+ struct kvm_segment var;
+
+ /* needed to preserve selector */
+ kvm_get_segment(vcpu, &var, seg);
+
+ var.base = get_desc_base(desc);
+ var.limit = get_desc_limit(desc);
+ if (desc->g)
+ var.limit = (var.limit << 12) | 0xfff;
+ var.type = desc->type;
+ var.present = desc->p;
+ var.dpl = desc->dpl;
+ var.db = desc->d;
+ var.s = desc->s;
+ var.l = desc->l;
+ var.g = desc->g;
+ var.avl = desc->avl;
+ var.present = desc->p;
+ var.unusable = !var.present;
+ var.padding = 0;
+
+ kvm_set_segment(vcpu, &var, seg);
+ return;
+}
+
+static u16 emulator_get_segment_selector(int seg, struct kvm_vcpu *vcpu)
+{
+ struct kvm_segment kvm_seg;
+
+ kvm_get_segment(vcpu, &kvm_seg, seg);
+ return kvm_seg.selector;
+}
+
+static void emulator_set_segment_selector(u16 sel, int seg,
+ struct kvm_vcpu *vcpu)
+{
+ struct kvm_segment kvm_seg;
+
+ kvm_get_segment(vcpu, &kvm_seg, seg);
+ kvm_seg.selector = sel;
+ kvm_set_segment(vcpu, &kvm_seg, seg);
+}
+
+static void emulator_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
+{
+ kvm_x86_ops->set_rflags(vcpu, rflags);
+}
+
static struct x86_emulate_ops emulate_ops = {
.read_std = kvm_read_guest_virt_system,
+ .write_std = kvm_write_guest_virt_system,
.fetch = kvm_fetch_guest_virt,
.read_emulated = emulator_read_emulated,
.write_emulated = emulator_write_emulated,
.cmpxchg_emulated = emulator_cmpxchg_emulated,
+ .pio_in_emulated = emulator_pio_in_emulated,
+ .pio_out_emulated = emulator_pio_out_emulated,
+ .get_cached_descriptor = emulator_get_cached_descriptor,
+ .set_cached_descriptor = emulator_set_cached_descriptor,
+ .get_segment_selector = emulator_get_segment_selector,
+ .set_segment_selector = emulator_set_segment_selector,
+ .get_gdt = emulator_get_gdt,
+ .get_cr = emulator_get_cr,
+ .set_cr = emulator_set_cr,
+ .cpl = emulator_get_cpl,
+ .set_rflags = emulator_set_rflags,
};
static void cache_all_regs(struct kvm_vcpu *vcpu)
@@ -3380,14 +3832,14 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
cache_all_regs(vcpu);
vcpu->mmio_is_write = 0;
- vcpu->arch.pio.string = 0;
if (!(emulation_type & EMULTYPE_NO_DECODE)) {
int cs_db, cs_l;
kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
vcpu->arch.emulate_ctxt.vcpu = vcpu;
- vcpu->arch.emulate_ctxt.eflags = kvm_get_rflags(vcpu);
+ vcpu->arch.emulate_ctxt.eflags = kvm_x86_ops->get_rflags(vcpu);
+ vcpu->arch.emulate_ctxt.eip = kvm_rip_read(vcpu);
vcpu->arch.emulate_ctxt.mode =
(!is_protmode(vcpu)) ? X86EMUL_MODE_REAL :
(vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM)
@@ -3396,6 +3848,7 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;
r = x86_decode_insn(&vcpu->arch.emulate_ctxt, &emulate_ops);
+ trace_kvm_emulate_insn_start(vcpu);
/* Only allow emulation of specific instructions on #UD
* (namely VMMCALL, sysenter, sysexit, syscall)*/
@@ -3428,6 +3881,7 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
++vcpu->stat.insn_emulation;
if (r) {
++vcpu->stat.insn_emulation_fail;
+ trace_kvm_emulate_insn_failed(vcpu);
if (kvm_mmu_unprotect_page_virt(vcpu, cr2))
return EMULATE_DONE;
return EMULATE_FAIL;
@@ -3439,16 +3893,20 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
return EMULATE_DONE;
}
+restart:
r = x86_emulate_insn(&vcpu->arch.emulate_ctxt, &emulate_ops);
shadow_mask = vcpu->arch.emulate_ctxt.interruptibility;
if (r == 0)
kvm_x86_ops->set_interrupt_shadow(vcpu, shadow_mask);
- if (vcpu->arch.pio.string)
+ if (vcpu->arch.pio.count) {
+ if (!vcpu->arch.pio.in)
+ vcpu->arch.pio.count = 0;
return EMULATE_DO_MMIO;
+ }
- if ((r || vcpu->mmio_is_write) && run) {
+ if (r || vcpu->mmio_is_write) {
run->exit_reason = KVM_EXIT_MMIO;
run->mmio.phys_addr = vcpu->mmio_phys_addr;
memcpy(run->mmio.data, vcpu->mmio_data, 8);
@@ -3458,222 +3916,41 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
if (r) {
if (kvm_mmu_unprotect_page_virt(vcpu, cr2))
- return EMULATE_DONE;
+ goto done;
if (!vcpu->mmio_needed) {
+ ++vcpu->stat.insn_emulation_fail;
+ trace_kvm_emulate_insn_failed(vcpu);
kvm_report_emulation_failure(vcpu, "mmio");
return EMULATE_FAIL;
}
return EMULATE_DO_MMIO;
}
- kvm_set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags);
-
if (vcpu->mmio_is_write) {
vcpu->mmio_needed = 0;
return EMULATE_DO_MMIO;
}
- return EMULATE_DONE;
-}
-EXPORT_SYMBOL_GPL(emulate_instruction);
-
-static int pio_copy_data(struct kvm_vcpu *vcpu)
-{
- void *p = vcpu->arch.pio_data;
- gva_t q = vcpu->arch.pio.guest_gva;
- unsigned bytes;
- int ret;
- u32 error_code;
-
- bytes = vcpu->arch.pio.size * vcpu->arch.pio.cur_count;
- if (vcpu->arch.pio.in)
- ret = kvm_write_guest_virt(q, p, bytes, vcpu, &error_code);
- else
- ret = kvm_read_guest_virt(q, p, bytes, vcpu, &error_code);
-
- if (ret == X86EMUL_PROPAGATE_FAULT)
- kvm_inject_page_fault(vcpu, q, error_code);
-
- return ret;
-}
-
-int complete_pio(struct kvm_vcpu *vcpu)
-{
- struct kvm_pio_request *io = &vcpu->arch.pio;
- long delta;
- int r;
- unsigned long val;
-
- if (!io->string) {
- if (io->in) {
- val = kvm_register_read(vcpu, VCPU_REGS_RAX);
- memcpy(&val, vcpu->arch.pio_data, io->size);
- kvm_register_write(vcpu, VCPU_REGS_RAX, val);
- }
- } else {
- if (io->in) {
- r = pio_copy_data(vcpu);
- if (r)
- goto out;
- }
-
- delta = 1;
- if (io->rep) {
- delta *= io->cur_count;
- /*
- * The size of the register should really depend on
- * current address size.
- */
- val = kvm_register_read(vcpu, VCPU_REGS_RCX);
- val -= delta;
- kvm_register_write(vcpu, VCPU_REGS_RCX, val);
- }
- if (io->down)
- delta = -delta;
- delta *= io->size;
- if (io->in) {
- val = kvm_register_read(vcpu, VCPU_REGS_RDI);
- val += delta;
- kvm_register_write(vcpu, VCPU_REGS_RDI, val);
- } else {
- val = kvm_register_read(vcpu, VCPU_REGS_RSI);
- val += delta;
- kvm_register_write(vcpu, VCPU_REGS_RSI, val);
- }
- }
-out:
- io->count -= io->cur_count;
- io->cur_count = 0;
-
- return 0;
-}
-
-static int kernel_pio(struct kvm_vcpu *vcpu, void *pd)
-{
- /* TODO: String I/O for in kernel device */
- int r;
-
- if (vcpu->arch.pio.in)
- r = kvm_io_bus_read(vcpu->kvm, KVM_PIO_BUS, vcpu->arch.pio.port,
- vcpu->arch.pio.size, pd);
- else
- r = kvm_io_bus_write(vcpu->kvm, KVM_PIO_BUS,
- vcpu->arch.pio.port, vcpu->arch.pio.size,
- pd);
- return r;
-}
+done:
+ if (vcpu->arch.exception.pending)
+ vcpu->arch.emulate_ctxt.restart = false;
-static int pio_string_write(struct kvm_vcpu *vcpu)
-{
- struct kvm_pio_request *io = &vcpu->arch.pio;
- void *pd = vcpu->arch.pio_data;
- int i, r = 0;
+ if (vcpu->arch.emulate_ctxt.restart)
+ goto restart;
- for (i = 0; i < io->cur_count; i++) {
- if (kvm_io_bus_write(vcpu->kvm, KVM_PIO_BUS,
- io->port, io->size, pd)) {
- r = -EOPNOTSUPP;
- break;
- }
- pd += io->size;
- }
- return r;
-}
-
-int kvm_emulate_pio(struct kvm_vcpu *vcpu, int in, int size, unsigned port)
-{
- unsigned long val;
-
- trace_kvm_pio(!in, port, size, 1);
-
- vcpu->run->exit_reason = KVM_EXIT_IO;
- vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
- vcpu->run->io.size = vcpu->arch.pio.size = size;
- vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE;
- vcpu->run->io.count = vcpu->arch.pio.count = vcpu->arch.pio.cur_count = 1;
- vcpu->run->io.port = vcpu->arch.pio.port = port;
- vcpu->arch.pio.in = in;
- vcpu->arch.pio.string = 0;
- vcpu->arch.pio.down = 0;
- vcpu->arch.pio.rep = 0;
-
- if (!vcpu->arch.pio.in) {
- val = kvm_register_read(vcpu, VCPU_REGS_RAX);
- memcpy(vcpu->arch.pio_data, &val, 4);
- }
-
- if (!kernel_pio(vcpu, vcpu->arch.pio_data)) {
- complete_pio(vcpu);
- return 1;
- }
- return 0;
+ return EMULATE_DONE;
}
-EXPORT_SYMBOL_GPL(kvm_emulate_pio);
+EXPORT_SYMBOL_GPL(emulate_instruction);
-int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, int in,
- int size, unsigned long count, int down,
- gva_t address, int rep, unsigned port)
+int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size, unsigned short port)
{
- unsigned now, in_page;
- int ret = 0;
-
- trace_kvm_pio(!in, port, size, count);
-
- vcpu->run->exit_reason = KVM_EXIT_IO;
- vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
- vcpu->run->io.size = vcpu->arch.pio.size = size;
- vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE;
- vcpu->run->io.count = vcpu->arch.pio.count = vcpu->arch.pio.cur_count = count;
- vcpu->run->io.port = vcpu->arch.pio.port = port;
- vcpu->arch.pio.in = in;
- vcpu->arch.pio.string = 1;
- vcpu->arch.pio.down = down;
- vcpu->arch.pio.rep = rep;
-
- if (!count) {
- kvm_x86_ops->skip_emulated_instruction(vcpu);
- return 1;
- }
-
- if (!down)
- in_page = PAGE_SIZE - offset_in_page(address);
- else
- in_page = offset_in_page(address) + size;
- now = min(count, (unsigned long)in_page / size);
- if (!now)
- now = 1;
- if (down) {
- /*
- * String I/O in reverse. Yuck. Kill the guest, fix later.
- */
- pr_unimpl(vcpu, "guest string pio down\n");
- kvm_inject_gp(vcpu, 0);
- return 1;
- }
- vcpu->run->io.count = now;
- vcpu->arch.pio.cur_count = now;
-
- if (vcpu->arch.pio.cur_count == vcpu->arch.pio.count)
- kvm_x86_ops->skip_emulated_instruction(vcpu);
-
- vcpu->arch.pio.guest_gva = address;
-
- if (!vcpu->arch.pio.in) {
- /* string PIO write */
- ret = pio_copy_data(vcpu);
- if (ret == X86EMUL_PROPAGATE_FAULT)
- return 1;
- if (ret == 0 && !pio_string_write(vcpu)) {
- complete_pio(vcpu);
- if (vcpu->arch.pio.count == 0)
- ret = 1;
- }
- }
- /* no string PIO read support yet */
-
+ unsigned long val = kvm_register_read(vcpu, VCPU_REGS_RAX);
+ int ret = emulator_pio_out_emulated(size, port, &val, 1, vcpu);
+ /* do not return to emulator after return from userspace */
+ vcpu->arch.pio.count = 0;
return ret;
}
-EXPORT_SYMBOL_GPL(kvm_emulate_pio_string);
+EXPORT_SYMBOL_GPL(kvm_fast_pio_out);
static void bounce_off(void *info)
{
@@ -3996,85 +4273,20 @@ int kvm_fix_hypercall(struct kvm_vcpu *vcpu)
return emulator_write_emulated(rip, instruction, 3, vcpu);
}
-static u64 mk_cr_64(u64 curr_cr, u32 new_val)
-{
- return (curr_cr & ~((1ULL << 32) - 1)) | new_val;
-}
-
void realmode_lgdt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base)
{
- struct descriptor_table dt = { limit, base };
+ struct desc_ptr dt = { limit, base };
kvm_x86_ops->set_gdt(vcpu, &dt);
}
void realmode_lidt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base)
{
- struct descriptor_table dt = { limit, base };
+ struct desc_ptr dt = { limit, base };
kvm_x86_ops->set_idt(vcpu, &dt);
}
-void realmode_lmsw(struct kvm_vcpu *vcpu, unsigned long msw,
- unsigned long *rflags)
-{
- kvm_lmsw(vcpu, msw);
- *rflags = kvm_get_rflags(vcpu);
-}
-
-unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr)
-{
- unsigned long value;
-
- switch (cr) {
- case 0:
- value = kvm_read_cr0(vcpu);
- break;
- case 2:
- value = vcpu->arch.cr2;
- break;
- case 3:
- value = vcpu->arch.cr3;
- break;
- case 4:
- value = kvm_read_cr4(vcpu);
- break;
- case 8:
- value = kvm_get_cr8(vcpu);
- break;
- default:
- vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr);
- return 0;
- }
-
- return value;
-}
-
-void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long val,
- unsigned long *rflags)
-{
- switch (cr) {
- case 0:
- kvm_set_cr0(vcpu, mk_cr_64(kvm_read_cr0(vcpu), val));
- *rflags = kvm_get_rflags(vcpu);
- break;
- case 2:
- vcpu->arch.cr2 = val;
- break;
- case 3:
- kvm_set_cr3(vcpu, val);
- break;
- case 4:
- kvm_set_cr4(vcpu, mk_cr_64(kvm_read_cr4(vcpu), val));
- break;
- case 8:
- kvm_set_cr8(vcpu, val & 0xfUL);
- break;
- default:
- vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr);
- }
-}
-
static int move_to_next_stateful_cpuid_entry(struct kvm_vcpu *vcpu, int i)
{
struct kvm_cpuid_entry2 *e = &vcpu->arch.cpuid_entries[i];
@@ -4138,9 +4350,13 @@ int cpuid_maxphyaddr(struct kvm_vcpu *vcpu)
{
struct kvm_cpuid_entry2 *best;
+ best = kvm_find_cpuid_entry(vcpu, 0x80000000, 0);
+ if (!best || best->eax < 0x80000008)
+ goto not_found;
best = kvm_find_cpuid_entry(vcpu, 0x80000008, 0);
if (best)
return best->eax & 0xff;
+not_found:
return 36;
}
@@ -4254,9 +4470,13 @@ static void inject_pending_event(struct kvm_vcpu *vcpu)
{
/* try to reinject previous events if any */
if (vcpu->arch.exception.pending) {
+ trace_kvm_inj_exception(vcpu->arch.exception.nr,
+ vcpu->arch.exception.has_error_code,
+ vcpu->arch.exception.error_code);
kvm_x86_ops->queue_exception(vcpu, vcpu->arch.exception.nr,
vcpu->arch.exception.has_error_code,
- vcpu->arch.exception.error_code);
+ vcpu->arch.exception.error_code,
+ vcpu->arch.exception.reinject);
return;
}
@@ -4486,7 +4706,6 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
}
srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
- post_kvm_run_save(vcpu);
vapic_exit(vcpu);
@@ -4514,26 +4733,17 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
if (!irqchip_in_kernel(vcpu->kvm))
kvm_set_cr8(vcpu, kvm_run->cr8);
- if (vcpu->arch.pio.cur_count) {
- vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
- r = complete_pio(vcpu);
- srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
- if (r)
- goto out;
- }
- if (vcpu->mmio_needed) {
- memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8);
- vcpu->mmio_read_completed = 1;
- vcpu->mmio_needed = 0;
-
+ if (vcpu->arch.pio.count || vcpu->mmio_needed ||
+ vcpu->arch.emulate_ctxt.restart) {
+ if (vcpu->mmio_needed) {
+ memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8);
+ vcpu->mmio_read_completed = 1;
+ vcpu->mmio_needed = 0;
+ }
vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
- r = emulate_instruction(vcpu, vcpu->arch.mmio_fault_cr2, 0,
- EMULTYPE_NO_DECODE);
+ r = emulate_instruction(vcpu, 0, 0, EMULTYPE_NO_DECODE);
srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
if (r == EMULATE_DO_MMIO) {
- /*
- * Read-modify-write. Back to userspace.
- */
r = 0;
goto out;
}
@@ -4545,6 +4755,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
r = __vcpu_run(vcpu);
out:
+ post_kvm_run_save(vcpu);
if (vcpu->sigset_active)
sigprocmask(SIG_SETMASK, &sigsaved, NULL);
@@ -4616,12 +4827,6 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
return 0;
}
-void kvm_get_segment(struct kvm_vcpu *vcpu,
- struct kvm_segment *var, int seg)
-{
- kvm_x86_ops->get_segment(vcpu, var, seg);
-}
-
void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
{
struct kvm_segment cs;
@@ -4635,7 +4840,7 @@ EXPORT_SYMBOL_GPL(kvm_get_cs_db_l_bits);
int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
struct kvm_sregs *sregs)
{
- struct descriptor_table dt;
+ struct desc_ptr dt;
vcpu_load(vcpu);
@@ -4650,11 +4855,11 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
kvm_get_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
kvm_x86_ops->get_idt(vcpu, &dt);
- sregs->idt.limit = dt.limit;
- sregs->idt.base = dt.base;
+ sregs->idt.limit = dt.size;
+ sregs->idt.base = dt.address;
kvm_x86_ops->get_gdt(vcpu, &dt);
- sregs->gdt.limit = dt.limit;
- sregs->gdt.base = dt.base;
+ sregs->gdt.limit = dt.size;
+ sregs->gdt.base = dt.address;
sregs->cr0 = kvm_read_cr0(vcpu);
sregs->cr2 = vcpu->arch.cr2;
@@ -4693,563 +4898,33 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
return 0;
}
-static void kvm_set_segment(struct kvm_vcpu *vcpu,
- struct kvm_segment *var, int seg)
-{
- kvm_x86_ops->set_segment(vcpu, var, seg);
-}
-
-static void seg_desct_to_kvm_desct(struct desc_struct *seg_desc, u16 selector,
- struct kvm_segment *kvm_desct)
-{
- kvm_desct->base = get_desc_base(seg_desc);
- kvm_desct->limit = get_desc_limit(seg_desc);
- if (seg_desc->g) {
- kvm_desct->limit <<= 12;
- kvm_desct->limit |= 0xfff;
- }
- kvm_desct->selector = selector;
- kvm_desct->type = seg_desc->type;
- kvm_desct->present = seg_desc->p;
- kvm_desct->dpl = seg_desc->dpl;
- kvm_desct->db = seg_desc->d;
- kvm_desct->s = seg_desc->s;
- kvm_desct->l = seg_desc->l;
- kvm_desct->g = seg_desc->g;
- kvm_desct->avl = seg_desc->avl;
- if (!selector)
- kvm_desct->unusable = 1;
- else
- kvm_desct->unusable = 0;
- kvm_desct->padding = 0;
-}
-
-static void get_segment_descriptor_dtable(struct kvm_vcpu *vcpu,
- u16 selector,
- struct descriptor_table *dtable)
-{
- if (selector & 1 << 2) {
- struct kvm_segment kvm_seg;
-
- kvm_get_segment(vcpu, &kvm_seg, VCPU_SREG_LDTR);
-
- if (kvm_seg.unusable)
- dtable->limit = 0;
- else
- dtable->limit = kvm_seg.limit;
- dtable->base = kvm_seg.base;
- }
- else
- kvm_x86_ops->get_gdt(vcpu, dtable);
-}
-
-/* allowed just for 8 bytes segments */
-static int load_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
- struct desc_struct *seg_desc)
-{
- struct descriptor_table dtable;
- u16 index = selector >> 3;
- int ret;
- u32 err;
- gva_t addr;
-
- get_segment_descriptor_dtable(vcpu, selector, &dtable);
-
- if (dtable.limit < index * 8 + 7) {
- kvm_queue_exception_e(vcpu, GP_VECTOR, selector & 0xfffc);
- return X86EMUL_PROPAGATE_FAULT;
- }
- addr = dtable.base + index * 8;
- ret = kvm_read_guest_virt_system(addr, seg_desc, sizeof(*seg_desc),
- vcpu, &err);
- if (ret == X86EMUL_PROPAGATE_FAULT)
- kvm_inject_page_fault(vcpu, addr, err);
-
- return ret;
-}
-
-/* allowed just for 8 bytes segments */
-static int save_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
- struct desc_struct *seg_desc)
-{
- struct descriptor_table dtable;
- u16 index = selector >> 3;
-
- get_segment_descriptor_dtable(vcpu, selector, &dtable);
-
- if (dtable.limit < index * 8 + 7)
- return 1;
- return kvm_write_guest_virt(dtable.base + index*8, seg_desc, sizeof(*seg_desc), vcpu, NULL);
-}
-
-static gpa_t get_tss_base_addr_write(struct kvm_vcpu *vcpu,
- struct desc_struct *seg_desc)
-{
- u32 base_addr = get_desc_base(seg_desc);
-
- return kvm_mmu_gva_to_gpa_write(vcpu, base_addr, NULL);
-}
-
-static gpa_t get_tss_base_addr_read(struct kvm_vcpu *vcpu,
- struct desc_struct *seg_desc)
-{
- u32 base_addr = get_desc_base(seg_desc);
-
- return kvm_mmu_gva_to_gpa_read(vcpu, base_addr, NULL);
-}
-
-static u16 get_segment_selector(struct kvm_vcpu *vcpu, int seg)
-{
- struct kvm_segment kvm_seg;
-
- kvm_get_segment(vcpu, &kvm_seg, seg);
- return kvm_seg.selector;
-}
-
-static int kvm_load_realmode_segment(struct kvm_vcpu *vcpu, u16 selector, int seg)
-{
- struct kvm_segment segvar = {
- .base = selector << 4,
- .limit = 0xffff,
- .selector = selector,
- .type = 3,
- .present = 1,
- .dpl = 3,
- .db = 0,
- .s = 1,
- .l = 0,
- .g = 0,
- .avl = 0,
- .unusable = 0,
- };
- kvm_x86_ops->set_segment(vcpu, &segvar, seg);
- return X86EMUL_CONTINUE;
-}
-
-static int is_vm86_segment(struct kvm_vcpu *vcpu, int seg)
+int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason,
+ bool has_error_code, u32 error_code)
{
- return (seg != VCPU_SREG_LDTR) &&
- (seg != VCPU_SREG_TR) &&
- (kvm_get_rflags(vcpu) & X86_EFLAGS_VM);
-}
-
-int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, int seg)
-{
- struct kvm_segment kvm_seg;
- struct desc_struct seg_desc;
- u8 dpl, rpl, cpl;
- unsigned err_vec = GP_VECTOR;
- u32 err_code = 0;
- bool null_selector = !(selector & ~0x3); /* 0000-0003 are null */
- int ret;
+ int cs_db, cs_l, ret;
+ cache_all_regs(vcpu);
- if (is_vm86_segment(vcpu, seg) || !is_protmode(vcpu))
- return kvm_load_realmode_segment(vcpu, selector, seg);
+ kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
- /* NULL selector is not valid for TR, CS and SS */
- if ((seg == VCPU_SREG_CS || seg == VCPU_SREG_SS || seg == VCPU_SREG_TR)
- && null_selector)
- goto exception;
+ vcpu->arch.emulate_ctxt.vcpu = vcpu;
+ vcpu->arch.emulate_ctxt.eflags = kvm_x86_ops->get_rflags(vcpu);
+ vcpu->arch.emulate_ctxt.eip = kvm_rip_read(vcpu);
+ vcpu->arch.emulate_ctxt.mode =
+ (!is_protmode(vcpu)) ? X86EMUL_MODE_REAL :
+ (vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM)
+ ? X86EMUL_MODE_VM86 : cs_l
+ ? X86EMUL_MODE_PROT64 : cs_db
+ ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;
- /* TR should be in GDT only */
- if (seg == VCPU_SREG_TR && (selector & (1 << 2)))
- goto exception;
+ ret = emulator_task_switch(&vcpu->arch.emulate_ctxt, &emulate_ops,
+ tss_selector, reason, has_error_code,
+ error_code);
- ret = load_guest_segment_descriptor(vcpu, selector, &seg_desc);
if (ret)
- return ret;
-
- seg_desct_to_kvm_desct(&seg_desc, selector, &kvm_seg);
-
- if (null_selector) { /* for NULL selector skip all following checks */
- kvm_seg.unusable = 1;
- goto load;
- }
-
- err_code = selector & 0xfffc;
- err_vec = GP_VECTOR;
-
- /* can't load system descriptor into segment selecor */
- if (seg <= VCPU_SREG_GS && !kvm_seg.s)
- goto exception;
-
- if (!kvm_seg.present) {
- err_vec = (seg == VCPU_SREG_SS) ? SS_VECTOR : NP_VECTOR;
- goto exception;
- }
-
- rpl = selector & 3;
- dpl = kvm_seg.dpl;
- cpl = kvm_x86_ops->get_cpl(vcpu);
-
- switch (seg) {
- case VCPU_SREG_SS:
- /*
- * segment is not a writable data segment or segment
- * selector's RPL != CPL or segment selector's RPL != CPL
- */
- if (rpl != cpl || (kvm_seg.type & 0xa) != 0x2 || dpl != cpl)
- goto exception;
- break;
- case VCPU_SREG_CS:
- if (!(kvm_seg.type & 8))
- goto exception;
-
- if (kvm_seg.type & 4) {
- /* conforming */
- if (dpl > cpl)
- goto exception;
- } else {
- /* nonconforming */
- if (rpl > cpl || dpl != cpl)
- goto exception;
- }
- /* CS(RPL) <- CPL */
- selector = (selector & 0xfffc) | cpl;
- break;
- case VCPU_SREG_TR:
- if (kvm_seg.s || (kvm_seg.type != 1 && kvm_seg.type != 9))
- goto exception;
- break;
- case VCPU_SREG_LDTR:
- if (kvm_seg.s || kvm_seg.type != 2)
- goto exception;
- break;
- default: /* DS, ES, FS, or GS */
- /*
- * segment is not a data or readable code segment or
- * ((segment is a data or nonconforming code segment)
- * and (both RPL and CPL > DPL))
- */
- if ((kvm_seg.type & 0xa) == 0x8 ||
- (((kvm_seg.type & 0xc) != 0xc) && (rpl > dpl && cpl > dpl)))
- goto exception;
- break;
- }
-
- if (!kvm_seg.unusable && kvm_seg.s) {
- /* mark segment as accessed */
- kvm_seg.type |= 1;
- seg_desc.type |= 1;
- save_guest_segment_descriptor(vcpu, selector, &seg_desc);
- }
-load:
- kvm_set_segment(vcpu, &kvm_seg, seg);
- return X86EMUL_CONTINUE;
-exception:
- kvm_queue_exception_e(vcpu, err_vec, err_code);
- return X86EMUL_PROPAGATE_FAULT;
-}
-
-static void save_state_to_tss32(struct kvm_vcpu *vcpu,
- struct tss_segment_32 *tss)
-{
- tss->cr3 = vcpu->arch.cr3;
- tss->eip = kvm_rip_read(vcpu);
- tss->eflags = kvm_get_rflags(vcpu);
- tss->eax = kvm_register_read(vcpu, VCPU_REGS_RAX);
- tss->ecx = kvm_register_read(vcpu, VCPU_REGS_RCX);
- tss->edx = kvm_register_read(vcpu, VCPU_REGS_RDX);
- tss->ebx = kvm_register_read(vcpu, VCPU_REGS_RBX);
- tss->esp = kvm_register_read(vcpu, VCPU_REGS_RSP);
- tss->ebp = kvm_register_read(vcpu, VCPU_REGS_RBP);
- tss->esi = kvm_register_read(vcpu, VCPU_REGS_RSI);
- tss->edi = kvm_register_read(vcpu, VCPU_REGS_RDI);
- tss->es = get_segment_selector(vcpu, VCPU_SREG_ES);
- tss->cs = get_segment_selector(vcpu, VCPU_SREG_CS);
- tss->ss = get_segment_selector(vcpu, VCPU_SREG_SS);
- tss->ds = get_segment_selector(vcpu, VCPU_SREG_DS);
- tss->fs = get_segment_selector(vcpu, VCPU_SREG_FS);
- tss->gs = get_segment_selector(vcpu, VCPU_SREG_GS);
- tss->ldt_selector = get_segment_selector(vcpu, VCPU_SREG_LDTR);
-}
-
-static void kvm_load_segment_selector(struct kvm_vcpu *vcpu, u16 sel, int seg)
-{
- struct kvm_segment kvm_seg;
- kvm_get_segment(vcpu, &kvm_seg, seg);
- kvm_seg.selector = sel;
- kvm_set_segment(vcpu, &kvm_seg, seg);
-}
-
-static int load_state_from_tss32(struct kvm_vcpu *vcpu,
- struct tss_segment_32 *tss)
-{
- kvm_set_cr3(vcpu, tss->cr3);
-
- kvm_rip_write(vcpu, tss->eip);
- kvm_set_rflags(vcpu, tss->eflags | 2);
-
- kvm_register_write(vcpu, VCPU_REGS_RAX, tss->eax);
- kvm_register_write(vcpu, VCPU_REGS_RCX, tss->ecx);
- kvm_register_write(vcpu, VCPU_REGS_RDX, tss->edx);
- kvm_register_write(vcpu, VCPU_REGS_RBX, tss->ebx);
- kvm_register_write(vcpu, VCPU_REGS_RSP, tss->esp);
- kvm_register_write(vcpu, VCPU_REGS_RBP, tss->ebp);
- kvm_register_write(vcpu, VCPU_REGS_RSI, tss->esi);
- kvm_register_write(vcpu, VCPU_REGS_RDI, tss->edi);
-
- /*
- * SDM says that segment selectors are loaded before segment
- * descriptors
- */
- kvm_load_segment_selector(vcpu, tss->ldt_selector, VCPU_SREG_LDTR);
- kvm_load_segment_selector(vcpu, tss->es, VCPU_SREG_ES);
- kvm_load_segment_selector(vcpu, tss->cs, VCPU_SREG_CS);
- kvm_load_segment_selector(vcpu, tss->ss, VCPU_SREG_SS);
- kvm_load_segment_selector(vcpu, tss->ds, VCPU_SREG_DS);
- kvm_load_segment_selector(vcpu, tss->fs, VCPU_SREG_FS);
- kvm_load_segment_selector(vcpu, tss->gs, VCPU_SREG_GS);
-
- /*
- * Now load segment descriptors. If fault happenes at this stage
- * it is handled in a context of new task
- */
- if (kvm_load_segment_descriptor(vcpu, tss->ldt_selector, VCPU_SREG_LDTR))
- return 1;
-
- if (kvm_load_segment_descriptor(vcpu, tss->es, VCPU_SREG_ES))
- return 1;
+ return EMULATE_FAIL;
- if (kvm_load_segment_descriptor(vcpu, tss->cs, VCPU_SREG_CS))
- return 1;
-
- if (kvm_load_segment_descriptor(vcpu, tss->ss, VCPU_SREG_SS))
- return 1;
-
- if (kvm_load_segment_descriptor(vcpu, tss->ds, VCPU_SREG_DS))
- return 1;
-
- if (kvm_load_segment_descriptor(vcpu, tss->fs, VCPU_SREG_FS))
- return 1;
-
- if (kvm_load_segment_descriptor(vcpu, tss->gs, VCPU_SREG_GS))
- return 1;
- return 0;
-}
-
-static void save_state_to_tss16(struct kvm_vcpu *vcpu,
- struct tss_segment_16 *tss)
-{
- tss->ip = kvm_rip_read(vcpu);
- tss->flag = kvm_get_rflags(vcpu);
- tss->ax = kvm_register_read(vcpu, VCPU_REGS_RAX);
- tss->cx = kvm_register_read(vcpu, VCPU_REGS_RCX);
- tss->dx = kvm_register_read(vcpu, VCPU_REGS_RDX);
- tss->bx = kvm_register_read(vcpu, VCPU_REGS_RBX);
- tss->sp = kvm_register_read(vcpu, VCPU_REGS_RSP);
- tss->bp = kvm_register_read(vcpu, VCPU_REGS_RBP);
- tss->si = kvm_register_read(vcpu, VCPU_REGS_RSI);
- tss->di = kvm_register_read(vcpu, VCPU_REGS_RDI);
-
- tss->es = get_segment_selector(vcpu, VCPU_SREG_ES);
- tss->cs = get_segment_selector(vcpu, VCPU_SREG_CS);
- tss->ss = get_segment_selector(vcpu, VCPU_SREG_SS);
- tss->ds = get_segment_selector(vcpu, VCPU_SREG_DS);
- tss->ldt = get_segment_selector(vcpu, VCPU_SREG_LDTR);
-}
-
-static int load_state_from_tss16(struct kvm_vcpu *vcpu,
- struct tss_segment_16 *tss)
-{
- kvm_rip_write(vcpu, tss->ip);
- kvm_set_rflags(vcpu, tss->flag | 2);
- kvm_register_write(vcpu, VCPU_REGS_RAX, tss->ax);
- kvm_register_write(vcpu, VCPU_REGS_RCX, tss->cx);
- kvm_register_write(vcpu, VCPU_REGS_RDX, tss->dx);
- kvm_register_write(vcpu, VCPU_REGS_RBX, tss->bx);
- kvm_register_write(vcpu, VCPU_REGS_RSP, tss->sp);
- kvm_register_write(vcpu, VCPU_REGS_RBP, tss->bp);
- kvm_register_write(vcpu, VCPU_REGS_RSI, tss->si);
- kvm_register_write(vcpu, VCPU_REGS_RDI, tss->di);
-
- /*
- * SDM says that segment selectors are loaded before segment
- * descriptors
- */
- kvm_load_segment_selector(vcpu, tss->ldt, VCPU_SREG_LDTR);
- kvm_load_segment_selector(vcpu, tss->es, VCPU_SREG_ES);
- kvm_load_segment_selector(vcpu, tss->cs, VCPU_SREG_CS);
- kvm_load_segment_selector(vcpu, tss->ss, VCPU_SREG_SS);
- kvm_load_segment_selector(vcpu, tss->ds, VCPU_SREG_DS);
-
- /*
- * Now load segment descriptors. If fault happenes at this stage
- * it is handled in a context of new task
- */
- if (kvm_load_segment_descriptor(vcpu, tss->ldt, VCPU_SREG_LDTR))
- return 1;
-
- if (kvm_load_segment_descriptor(vcpu, tss->es, VCPU_SREG_ES))
- return 1;
-
- if (kvm_load_segment_descriptor(vcpu, tss->cs, VCPU_SREG_CS))
- return 1;
-
- if (kvm_load_segment_descriptor(vcpu, tss->ss, VCPU_SREG_SS))
- return 1;
-
- if (kvm_load_segment_descriptor(vcpu, tss->ds, VCPU_SREG_DS))
- return 1;
- return 0;
-}
-
-static int kvm_task_switch_16(struct kvm_vcpu *vcpu, u16 tss_selector,
- u16 old_tss_sel, u32 old_tss_base,
- struct desc_struct *nseg_desc)
-{
- struct tss_segment_16 tss_segment_16;
- int ret = 0;
-
- if (kvm_read_guest(vcpu->kvm, old_tss_base, &tss_segment_16,
- sizeof tss_segment_16))
- goto out;
-
- save_state_to_tss16(vcpu, &tss_segment_16);
-
- if (kvm_write_guest(vcpu->kvm, old_tss_base, &tss_segment_16,
- sizeof tss_segment_16))
- goto out;
-
- if (kvm_read_guest(vcpu->kvm, get_tss_base_addr_read(vcpu, nseg_desc),
- &tss_segment_16, sizeof tss_segment_16))
- goto out;
-
- if (old_tss_sel != 0xffff) {
- tss_segment_16.prev_task_link = old_tss_sel;
-
- if (kvm_write_guest(vcpu->kvm,
- get_tss_base_addr_write(vcpu, nseg_desc),
- &tss_segment_16.prev_task_link,
- sizeof tss_segment_16.prev_task_link))
- goto out;
- }
-
- if (load_state_from_tss16(vcpu, &tss_segment_16))
- goto out;
-
- ret = 1;
-out:
- return ret;
-}
-
-static int kvm_task_switch_32(struct kvm_vcpu *vcpu, u16 tss_selector,
- u16 old_tss_sel, u32 old_tss_base,
- struct desc_struct *nseg_desc)
-{
- struct tss_segment_32 tss_segment_32;
- int ret = 0;
-
- if (kvm_read_guest(vcpu->kvm, old_tss_base, &tss_segment_32,
- sizeof tss_segment_32))
- goto out;
-
- save_state_to_tss32(vcpu, &tss_segment_32);
-
- if (kvm_write_guest(vcpu->kvm, old_tss_base, &tss_segment_32,
- sizeof tss_segment_32))
- goto out;
-
- if (kvm_read_guest(vcpu->kvm, get_tss_base_addr_read(vcpu, nseg_desc),
- &tss_segment_32, sizeof tss_segment_32))
- goto out;
-
- if (old_tss_sel != 0xffff) {
- tss_segment_32.prev_task_link = old_tss_sel;
-
- if (kvm_write_guest(vcpu->kvm,
- get_tss_base_addr_write(vcpu, nseg_desc),
- &tss_segment_32.prev_task_link,
- sizeof tss_segment_32.prev_task_link))
- goto out;
- }
-
- if (load_state_from_tss32(vcpu, &tss_segment_32))
- goto out;
-
- ret = 1;
-out:
- return ret;
-}
-
-int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason)
-{
- struct kvm_segment tr_seg;
- struct desc_struct cseg_desc;
- struct desc_struct nseg_desc;
- int ret = 0;
- u32 old_tss_base = get_segment_base(vcpu, VCPU_SREG_TR);
- u16 old_tss_sel = get_segment_selector(vcpu, VCPU_SREG_TR);
- u32 desc_limit;
-
- old_tss_base = kvm_mmu_gva_to_gpa_write(vcpu, old_tss_base, NULL);
-
- /* FIXME: Handle errors. Failure to read either TSS or their
- * descriptors should generate a pagefault.
- */
- if (load_guest_segment_descriptor(vcpu, tss_selector, &nseg_desc))
- goto out;
-
- if (load_guest_segment_descriptor(vcpu, old_tss_sel, &cseg_desc))
- goto out;
-
- if (reason != TASK_SWITCH_IRET) {
- int cpl;
-
- cpl = kvm_x86_ops->get_cpl(vcpu);
- if ((tss_selector & 3) > nseg_desc.dpl || cpl > nseg_desc.dpl) {
- kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
- return 1;
- }
- }
-
- desc_limit = get_desc_limit(&nseg_desc);
- if (!nseg_desc.p ||
- ((desc_limit < 0x67 && (nseg_desc.type & 8)) ||
- desc_limit < 0x2b)) {
- kvm_queue_exception_e(vcpu, TS_VECTOR, tss_selector & 0xfffc);
- return 1;
- }
-
- if (reason == TASK_SWITCH_IRET || reason == TASK_SWITCH_JMP) {
- cseg_desc.type &= ~(1 << 1); //clear the B flag
- save_guest_segment_descriptor(vcpu, old_tss_sel, &cseg_desc);
- }
-
- if (reason == TASK_SWITCH_IRET) {
- u32 eflags = kvm_get_rflags(vcpu);
- kvm_set_rflags(vcpu, eflags & ~X86_EFLAGS_NT);
- }
-
- /* set back link to prev task only if NT bit is set in eflags
- note that old_tss_sel is not used afetr this point */
- if (reason != TASK_SWITCH_CALL && reason != TASK_SWITCH_GATE)
- old_tss_sel = 0xffff;
-
- if (nseg_desc.type & 8)
- ret = kvm_task_switch_32(vcpu, tss_selector, old_tss_sel,
- old_tss_base, &nseg_desc);
- else
- ret = kvm_task_switch_16(vcpu, tss_selector, old_tss_sel,
- old_tss_base, &nseg_desc);
-
- if (reason == TASK_SWITCH_CALL || reason == TASK_SWITCH_GATE) {
- u32 eflags = kvm_get_rflags(vcpu);
- kvm_set_rflags(vcpu, eflags | X86_EFLAGS_NT);
- }
-
- if (reason != TASK_SWITCH_IRET) {
- nseg_desc.type |= (1 << 1);
- save_guest_segment_descriptor(vcpu, tss_selector,
- &nseg_desc);
- }
-
- kvm_x86_ops->set_cr0(vcpu, kvm_read_cr0(vcpu) | X86_CR0_TS);
- seg_desct_to_kvm_desct(&nseg_desc, tss_selector, &tr_seg);
- tr_seg.type = 11;
- kvm_set_segment(vcpu, &tr_seg, VCPU_SREG_TR);
-out:
- return ret;
+ kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags);
+ return EMULATE_DONE;
}
EXPORT_SYMBOL_GPL(kvm_task_switch);
@@ -5258,15 +4933,15 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
{
int mmu_reset_needed = 0;
int pending_vec, max_bits;
- struct descriptor_table dt;
+ struct desc_ptr dt;
vcpu_load(vcpu);
- dt.limit = sregs->idt.limit;
- dt.base = sregs->idt.base;
+ dt.size = sregs->idt.limit;
+ dt.address = sregs->idt.base;
kvm_x86_ops->set_idt(vcpu, &dt);
- dt.limit = sregs->gdt.limit;
- dt.base = sregs->gdt.base;
+ dt.size = sregs->gdt.limit;
+ dt.address = sregs->gdt.base;
kvm_x86_ops->set_gdt(vcpu, &dt);
vcpu->arch.cr2 = sregs->cr2;
@@ -5365,11 +5040,9 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
vcpu->arch.switch_db_regs = (vcpu->arch.dr7 & DR7_BP_EN_MASK);
}
- if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) {
- vcpu->arch.singlestep_cs =
- get_segment_selector(vcpu, VCPU_SREG_CS);
- vcpu->arch.singlestep_rip = kvm_rip_read(vcpu);
- }
+ if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
+ vcpu->arch.singlestep_rip = kvm_rip_read(vcpu) +
+ get_segment_base(vcpu, VCPU_SREG_CS);
/*
* Trigger an rflags update that will inject or remove the trace
@@ -5860,13 +5533,22 @@ int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu)
return kvm_x86_ops->interrupt_allowed(vcpu);
}
+bool kvm_is_linear_rip(struct kvm_vcpu *vcpu, unsigned long linear_rip)
+{
+ unsigned long current_rip = kvm_rip_read(vcpu) +
+ get_segment_base(vcpu, VCPU_SREG_CS);
+
+ return current_rip == linear_rip;
+}
+EXPORT_SYMBOL_GPL(kvm_is_linear_rip);
+
unsigned long kvm_get_rflags(struct kvm_vcpu *vcpu)
{
unsigned long rflags;
rflags = kvm_x86_ops->get_rflags(vcpu);
if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
- rflags &= ~(unsigned long)(X86_EFLAGS_TF | X86_EFLAGS_RF);
+ rflags &= ~X86_EFLAGS_TF;
return rflags;
}
EXPORT_SYMBOL_GPL(kvm_get_rflags);
@@ -5874,10 +5556,8 @@ EXPORT_SYMBOL_GPL(kvm_get_rflags);
void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
{
if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP &&
- vcpu->arch.singlestep_cs ==
- get_segment_selector(vcpu, VCPU_SREG_CS) &&
- vcpu->arch.singlestep_rip == kvm_rip_read(vcpu))
- rflags |= X86_EFLAGS_TF | X86_EFLAGS_RF;
+ kvm_is_linear_rip(vcpu, vcpu->arch.singlestep_rip))
+ rflags |= X86_EFLAGS_TF;
kvm_x86_ops->set_rflags(vcpu, rflags);
}
EXPORT_SYMBOL_GPL(kvm_set_rflags);
@@ -5893,3 +5573,4 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmexit_inject);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intr_vmexit);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_invlpga);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_skinit);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intercepts);
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index b7a4047..f4b5445 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -65,6 +65,13 @@ static inline int is_paging(struct kvm_vcpu *vcpu)
return kvm_read_cr0_bits(vcpu, X86_CR0_PG);
}
+static inline struct kvm_mem_aliases *kvm_aliases(struct kvm *kvm)
+{
+ return rcu_dereference_check(kvm->arch.aliases,
+ srcu_read_lock_held(&kvm->srcu)
+ || lockdep_is_held(&kvm->slots_lock));
+}
+
void kvm_before_handle_nmi(struct kvm_vcpu *vcpu);
void kvm_after_handle_nmi(struct kvm_vcpu *vcpu);
diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index 60df9c8..23ea022 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -160,6 +160,7 @@ struct kvm_pit_config {
#define KVM_EXIT_DCR 15
#define KVM_EXIT_NMI 16
#define KVM_EXIT_INTERNAL_ERROR 17
+#define KVM_EXIT_OSI 18
/* For KVM_EXIT_INTERNAL_ERROR */
#define KVM_INTERNAL_ERROR_EMULATION 1
@@ -259,6 +260,10 @@ struct kvm_run {
__u32 ndata;
__u64 data[16];
} internal;
+ /* KVM_EXIT_OSI */
+ struct {
+ __u64 gprs[32];
+ } osi;
/* Fix the size of the union. */
char padding[256];
};
@@ -400,6 +405,15 @@ struct kvm_ioeventfd {
__u8 pad[36];
};
+/* for KVM_ENABLE_CAP */
+struct kvm_enable_cap {
+ /* in */
+ __u32 cap;
+ __u32 flags;
+ __u64 args[4];
+ __u8 pad[64];
+};
+
#define KVMIO 0xAE
/*
@@ -501,7 +515,15 @@ struct kvm_ioeventfd {
#define KVM_CAP_HYPERV_VAPIC 45
#define KVM_CAP_HYPERV_SPIN 46
#define KVM_CAP_PCI_SEGMENT 47
+#define KVM_CAP_PPC_PAIRED_SINGLES 48
+#define KVM_CAP_INTR_SHADOW 49
+#ifdef __KVM_HAVE_DEBUGREGS
+#define KVM_CAP_DEBUGREGS 50
+#endif
#define KVM_CAP_X86_ROBUST_SINGLESTEP 51
+#define KVM_CAP_PPC_OSI 52
+#define KVM_CAP_PPC_UNSET_IRQ 53
+#define KVM_CAP_ENABLE_CAP 54
#ifdef KVM_CAP_IRQ_ROUTING
@@ -688,6 +710,10 @@ struct kvm_clock_data {
/* Available with KVM_CAP_VCPU_EVENTS */
#define KVM_GET_VCPU_EVENTS _IOR(KVMIO, 0x9f, struct kvm_vcpu_events)
#define KVM_SET_VCPU_EVENTS _IOW(KVMIO, 0xa0, struct kvm_vcpu_events)
+/* Available with KVM_CAP_DEBUGREGS */
+#define KVM_GET_DEBUGREGS _IOR(KVMIO, 0xa1, struct kvm_debugregs)
+#define KVM_SET_DEBUGREGS _IOW(KVMIO, 0xa2, struct kvm_debugregs)
+#define KVM_ENABLE_CAP _IOW(KVMIO, 0xa3, struct kvm_enable_cap)
#define KVM_DEV_ASSIGN_ENABLE_IOMMU (1 << 0)
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 169d077..7cb116a 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -105,6 +105,12 @@ struct kvm_vcpu {
struct kvm_vcpu_arch arch;
};
+/*
+ * Some of the bitops functions do not support too long bitmaps.
+ * This number must be determined not to exceed such limits.
+ */
+#define KVM_MEM_MAX_NR_PAGES ((1UL << 31) - 1)
+
struct kvm_memory_slot {
gfn_t base_gfn;
unsigned long npages;
@@ -237,17 +243,23 @@ void kvm_vcpu_uninit(struct kvm_vcpu *vcpu);
void vcpu_load(struct kvm_vcpu *vcpu);
void vcpu_put(struct kvm_vcpu *vcpu);
-int kvm_init(void *opaque, unsigned int vcpu_size,
+int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align,
struct module *module);
void kvm_exit(void);
void kvm_get_kvm(struct kvm *kvm);
void kvm_put_kvm(struct kvm *kvm);
+static inline struct kvm_memslots *kvm_memslots(struct kvm *kvm)
+{
+ return rcu_dereference_check(kvm->memslots,
+ srcu_read_lock_held(&kvm->srcu)
+ || lockdep_is_held(&kvm->slots_lock));
+}
+
#define HPA_MSB ((sizeof(hpa_t) * 8) - 1)
#define HPA_ERR_MASK ((hpa_t)1 << HPA_MSB)
static inline int is_error_hpa(hpa_t hpa) { return hpa >> HPA_MSB; }
-struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva);
extern struct page *bad_page;
extern pfn_t bad_pfn;
diff --git a/include/linux/tboot.h b/include/linux/tboot.h
index bf2a0c7..1dba6ee 100644
--- a/include/linux/tboot.h
+++ b/include/linux/tboot.h
@@ -150,6 +150,7 @@ extern int tboot_force_iommu(void);
#else
+#define tboot_enabled() 0
#define tboot_probe() do { } while (0)
#define tboot_shutdown(shutdown_type) do { } while (0)
#define tboot_sleep(sleep_state, pm1a_control, pm1b_control) \
diff --git a/include/trace/events/kvm.h b/include/trace/events/kvm.h
index b17d49d..6dd3a51 100644
--- a/include/trace/events/kvm.h
+++ b/include/trace/events/kvm.h
@@ -5,7 +5,6 @@
#undef TRACE_SYSTEM
#define TRACE_SYSTEM kvm
-#define TRACE_INCLUDE_FILE kvm
#if defined(__KVM_HAVE_IOAPIC)
TRACE_EVENT(kvm_set_irq,
diff --git a/virt/kvm/assigned-dev.c b/virt/kvm/assigned-dev.c
index 02ff2b1..4d10b1e 100644
--- a/virt/kvm/assigned-dev.c
+++ b/virt/kvm/assigned-dev.c
@@ -316,12 +316,16 @@ static int assigned_device_enable_host_msix(struct kvm *kvm,
kvm_assigned_dev_intr, 0,
"kvm_assigned_msix_device",
(void *)dev);
- /* FIXME: free requested_irq's on failure */
if (r)
- return r;
+ goto err;
}
return 0;
+err:
+ for (i -= 1; i >= 0; i--)
+ free_irq(dev->host_msix_entries[i].vector, (void *)dev);
+ pci_disable_msix(dev->dev);
+ return r;
}
#endif
diff --git a/virt/kvm/coalesced_mmio.c b/virt/kvm/coalesced_mmio.c
index 36e2580..5385017 100644
--- a/virt/kvm/coalesced_mmio.c
+++ b/virt/kvm/coalesced_mmio.c
@@ -120,8 +120,10 @@ int kvm_coalesced_mmio_init(struct kvm *kvm)
return ret;
out_free_dev:
+ kvm->coalesced_mmio_dev = NULL;
kfree(dev);
out_free_page:
+ kvm->coalesced_mmio_ring = NULL;
__free_page(page);
out_err:
return ret;
@@ -139,7 +141,7 @@ int kvm_vm_ioctl_register_coalesced_mmio(struct kvm *kvm,
struct kvm_coalesced_mmio_dev *dev = kvm->coalesced_mmio_dev;
if (dev == NULL)
- return -EINVAL;
+ return -ENXIO;
mutex_lock(&kvm->slots_lock);
if (dev->nb_zones >= KVM_COALESCED_MMIO_ZONE_MAX) {
@@ -162,7 +164,7 @@ int kvm_vm_ioctl_unregister_coalesced_mmio(struct kvm *kvm,
struct kvm_coalesced_mmio_zone *z;
if (dev == NULL)
- return -EINVAL;
+ return -ENXIO;
mutex_lock(&kvm->slots_lock);
diff --git a/virt/kvm/iommu.c b/virt/kvm/iommu.c
index 11692b9..d2f06be 100644
--- a/virt/kvm/iommu.c
+++ b/virt/kvm/iommu.c
@@ -127,7 +127,7 @@ static int kvm_iommu_map_memslots(struct kvm *kvm)
int i, r = 0;
struct kvm_memslots *slots;
- slots = rcu_dereference(kvm->memslots);
+ slots = kvm_memslots(kvm);
for (i = 0; i < slots->nmemslots; i++) {
r = kvm_iommu_map_pages(kvm, &slots->memslots[i]);
@@ -286,7 +286,7 @@ static int kvm_iommu_unmap_memslots(struct kvm *kvm)
int i;
struct kvm_memslots *slots;
- slots = rcu_dereference(kvm->memslots);
+ slots = kvm_memslots(kvm);
for (i = 0; i < slots->nmemslots; i++) {
kvm_iommu_put_pages(kvm, slots->memslots[i].base_gfn,
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index c82ae24..f032806 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -422,9 +422,6 @@ static struct kvm *kvm_create_vm(void)
spin_lock(&kvm_lock);
list_add(&kvm->vm_list, &vm_list);
spin_unlock(&kvm_lock);
-#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
- kvm_coalesced_mmio_init(kvm);
-#endif
out:
return kvm;
@@ -560,6 +557,10 @@ int __kvm_set_memory_region(struct kvm *kvm,
base_gfn = mem->guest_phys_addr >> PAGE_SHIFT;
npages = mem->memory_size >> PAGE_SHIFT;
+ r = -EINVAL;
+ if (npages > KVM_MEM_MAX_NR_PAGES)
+ goto out;
+
if (!npages)
mem->flags &= ~KVM_MEM_LOG_DIRTY_PAGES;
@@ -833,7 +834,7 @@ EXPORT_SYMBOL_GPL(kvm_is_error_hva);
struct kvm_memory_slot *gfn_to_memslot_unaliased(struct kvm *kvm, gfn_t gfn)
{
int i;
- struct kvm_memslots *slots = rcu_dereference(kvm->memslots);
+ struct kvm_memslots *slots = kvm_memslots(kvm);
for (i = 0; i < slots->nmemslots; ++i) {
struct kvm_memory_slot *memslot = &slots->memslots[i];
@@ -855,7 +856,7 @@ struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
int kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn)
{
int i;
- struct kvm_memslots *slots = rcu_dereference(kvm->memslots);
+ struct kvm_memslots *slots = kvm_memslots(kvm);
gfn = unalias_gfn_instantiation(kvm, gfn);
for (i = 0; i < KVM_MEMORY_SLOTS; ++i) {
@@ -899,7 +900,7 @@ out:
int memslot_id(struct kvm *kvm, gfn_t gfn)
{
int i;
- struct kvm_memslots *slots = rcu_dereference(kvm->memslots);
+ struct kvm_memslots *slots = kvm_memslots(kvm);
struct kvm_memory_slot *memslot = NULL;
gfn = unalias_gfn(kvm, gfn);
@@ -914,6 +915,11 @@ int memslot_id(struct kvm *kvm, gfn_t gfn)
return memslot - slots->memslots;
}
+static unsigned long gfn_to_hva_memslot(struct kvm_memory_slot *slot, gfn_t gfn)
+{
+ return slot->userspace_addr + (gfn - slot->base_gfn) * PAGE_SIZE;
+}
+
unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn)
{
struct kvm_memory_slot *slot;
@@ -922,7 +928,7 @@ unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn)
slot = gfn_to_memslot_unaliased(kvm, gfn);
if (!slot || slot->flags & KVM_MEMSLOT_INVALID)
return bad_hva();
- return (slot->userspace_addr + (gfn - slot->base_gfn) * PAGE_SIZE);
+ return gfn_to_hva_memslot(slot, gfn);
}
EXPORT_SYMBOL_GPL(gfn_to_hva);
@@ -972,11 +978,6 @@ pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn)
}
EXPORT_SYMBOL_GPL(gfn_to_pfn);
-static unsigned long gfn_to_hva_memslot(struct kvm_memory_slot *slot, gfn_t gfn)
-{
- return (slot->userspace_addr + (gfn - slot->base_gfn) * PAGE_SIZE);
-}
-
pfn_t gfn_to_pfn_memslot(struct kvm *kvm,
struct kvm_memory_slot *slot, gfn_t gfn)
{
@@ -1190,13 +1191,8 @@ void mark_page_dirty(struct kvm *kvm, gfn_t gfn)
memslot = gfn_to_memslot_unaliased(kvm, gfn);
if (memslot && memslot->dirty_bitmap) {
unsigned long rel_gfn = gfn - memslot->base_gfn;
- unsigned long *p = memslot->dirty_bitmap +
- rel_gfn / BITS_PER_LONG;
- int offset = rel_gfn % BITS_PER_LONG;
- /* avoid RMW */
- if (!generic_test_le_bit(offset, p))
- generic___set_le_bit(offset, p);
+ generic___set_le_bit(rel_gfn, memslot->dirty_bitmap);
}
}
@@ -1609,7 +1605,6 @@ static long kvm_vm_ioctl(struct file *filp,
r = -EFAULT;
if (copy_from_user(&zone, argp, sizeof zone))
goto out;
- r = -ENXIO;
r = kvm_vm_ioctl_register_coalesced_mmio(kvm, &zone);
if (r)
goto out;
@@ -1621,7 +1616,6 @@ static long kvm_vm_ioctl(struct file *filp,
r = -EFAULT;
if (copy_from_user(&zone, argp, sizeof zone))
goto out;
- r = -ENXIO;
r = kvm_vm_ioctl_unregister_coalesced_mmio(kvm, &zone);
if (r)
goto out;
@@ -1755,12 +1749,19 @@ static struct file_operations kvm_vm_fops = {
static int kvm_dev_ioctl_create_vm(void)
{
- int fd;
+ int fd, r;
struct kvm *kvm;
kvm = kvm_create_vm();
if (IS_ERR(kvm))
return PTR_ERR(kvm);
+#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
+ r = kvm_coalesced_mmio_init(kvm);
+ if (r < 0) {
+ kvm_put_kvm(kvm);
+ return r;
+ }
+#endif
fd = anon_inode_getfd("kvm-vm", &kvm_vm_fops, kvm, O_RDWR);
if (fd < 0)
kvm_put_kvm(kvm);
@@ -1928,11 +1929,6 @@ static int kvm_cpu_hotplug(struct notifier_block *notifier, unsigned long val,
cpu);
hardware_disable(NULL);
break;
- case CPU_UP_CANCELED:
- printk(KERN_INFO "kvm: disabling virtualization on CPU%d\n",
- cpu);
- smp_call_function_single(cpu, hardware_disable, NULL, 1);
- break;
case CPU_ONLINE:
printk(KERN_INFO "kvm: enabling virtualization on CPU%d\n",
cpu);
@@ -1991,7 +1987,9 @@ int kvm_io_bus_write(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
int len, const void *val)
{
int i;
- struct kvm_io_bus *bus = rcu_dereference(kvm->buses[bus_idx]);
+ struct kvm_io_bus *bus;
+
+ bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu);
for (i = 0; i < bus->dev_count; i++)
if (!kvm_iodevice_write(bus->devs[i], addr, len, val))
return 0;
@@ -2003,8 +2001,9 @@ int kvm_io_bus_read(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
int len, void *val)
{
int i;
- struct kvm_io_bus *bus = rcu_dereference(kvm->buses[bus_idx]);
+ struct kvm_io_bus *bus;
+ bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu);
for (i = 0; i < bus->dev_count; i++)
if (!kvm_iodevice_read(bus->devs[i], addr, len, val))
return 0;
@@ -2179,7 +2178,7 @@ static void kvm_sched_out(struct preempt_notifier *pn,
kvm_arch_vcpu_put(vcpu);
}
-int kvm_init(void *opaque, unsigned int vcpu_size,
+int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align,
struct module *module)
{
int r;
@@ -2229,8 +2228,9 @@ int kvm_init(void *opaque, unsigned int vcpu_size,
goto out_free_4;
/* A kmem cache lets us meet the alignment requirements of fx_save. */
- kvm_vcpu_cache = kmem_cache_create("kvm_vcpu", vcpu_size,
- __alignof__(struct kvm_vcpu),
+ if (!vcpu_align)
+ vcpu_align = __alignof__(struct kvm_vcpu);
+ kvm_vcpu_cache = kmem_cache_create("kvm_vcpu", vcpu_size, vcpu_align,
0, NULL);
if (!kvm_vcpu_cache) {
r = -ENOMEM;
@@ -2279,7 +2279,6 @@ EXPORT_SYMBOL_GPL(kvm_init);
void kvm_exit(void)
{
- tracepoint_synchronize_unregister();
kvm_exit_debug();
misc_deregister(&kvm_dev);
kmem_cache_destroy(kvm_vcpu_cache);